Grammalecte  Diff

Differences From Artifact [1ea2b6d97a]:

To Artifact [a5c1ea137d]:


1
2
3
4
5
6

7
8
9
10
11
12
13
1
2
3
4
5
6
7
8
9
10
11
12
13
14






+








import re
import traceback
import json

import compile_rules_js_convert as jsconv
import compile_rules_graph as crg


dDEF = {}
lFUNCTIONS = []

aRULESET = set()     # set of rule-ids to check if there is several rules with the same id
nRULEWITHOUTNAME = 0
25
26
27
28
29
30
31
32
33


34
35

36
37
38
39
40
41
42
43
44
45
46



47
48
49
50
51
52
53
54
55
56






57
58
59
60
61
62
63
26
27
28
29
30
31
32


33
34
35

36
37
38
39
40
41
42
43
44



45
46
47
48
49
50
51






52
53
54
55
56
57
58
59
60
61
62
63
64







-
-
+
+

-
+








-
-
-
+
+
+




-
-
-
-
-
-
+
+
+
+
+
+







    s = re.sub(r"isRealStart *\(\)", 'before("^ *$")', s)
    s = re.sub(r"isStart0 *\(\)", 'before0("^ *$|, *$")', s)
    s = re.sub(r"isRealStart0 *\(\)", 'before0("^ *$")', s)
    s = re.sub(r"isEnd *\(\)", 'after("^ *$|^,")', s)
    s = re.sub(r"isRealEnd *\(\)", 'after("^ *$")', s)
    s = re.sub(r"isEnd0 *\(\)", 'after0("^ *$|^,")', s)
    s = re.sub(r"isRealEnd0 *\(\)", 'after0("^ *$")', s)
    s = re.sub(r"(select|exclude)[(][\\](\d+)", '\\1(dDA, m.start(\\2), m.group(\\2)', s)
    s = re.sub(r"define[(][\\](\d+)", 'define(dDA, m.start(\\1)', s)
    s = re.sub(r"(select|exclude)[(][\\](\d+)", '\\1(dTokenPos, m.start(\\2), m.group(\\2)', s)
    s = re.sub(r"define[(][\\](\d+)", 'define(dTokenPos, m.start(\\1)', s)
    s = re.sub(r"(morph|morphex|displayInfo)[(][\\](\d+)", '\\1((m.start(\\2), m.group(\\2))', s)
    s = re.sub(r"(morph|morphex|displayInfo)[(]", '\\1(dDA, ', s)
    s = re.sub(r"(morph|morphex|displayInfo)[(]", '\\1(dTokenPos, ', s)
    s = re.sub(r"(sugg\w+|switch\w+)\(@", '\\1(m.group(i[4])', s)
    s = re.sub(r"word\(\s*1\b", 'nextword1(s, m.end()', s)                                  # word(1)
    s = re.sub(r"word\(\s*-1\b", 'prevword1(s, m.start()', s)                               # word(-1)
    s = re.sub(r"word\(\s*(\d)", 'nextword(s, m.end(), \\1', s)                             # word(n)
    s = re.sub(r"word\(\s*-(\d)", 'prevword(s, m.start(), \\1', s)                          # word(-n)
    s = re.sub(r"before\(\s*", 'look(s[:m.start()], ', s)                                   # before(s)
    s = re.sub(r"after\(\s*", 'look(s[m.end():], ', s)                                      # after(s)
    s = re.sub(r"textarea\(\s*", 'look(s, ', s)                                             # textarea(s)
    s = re.sub(r"before_chk1\(\s*", 'look_chk1(dDA, s[:m.start()], 0, ', s)                 # before_chk1(s)
    s = re.sub(r"after_chk1\(\s*", 'look_chk1(dDA, s[m.end():], m.end(), ', s)              # after_chk1(s)
    s = re.sub(r"textarea_chk1\(\s*", 'look_chk1(dDA, s, 0, ', s)                           # textarea_chk1(s)
    s = re.sub(r"before_chk1\(\s*", 'look_chk1(dTokenPos, s[:m.start()], 0, ', s)           # before_chk1(s)
    s = re.sub(r"after_chk1\(\s*", 'look_chk1(dTokenPos, s[m.end():], m.end(), ', s)        # after_chk1(s)
    s = re.sub(r"textarea_chk1\(\s*", 'look_chk1(dTokenPos, s, 0, ', s)                     # textarea_chk1(s)
    s = re.sub(r"/0", 'sx[m.start():m.end()]', s)                                           # /0
    s = re.sub(r"before0\(\s*", 'look(sx[:m.start()], ', s)                                 # before0(s)
    s = re.sub(r"after0\(\s*", 'look(sx[m.end():], ', s)                                    # after0(s)
    s = re.sub(r"textarea0\(\s*", 'look(sx, ', s)                                           # textarea0(s)
    s = re.sub(r"before0_chk1\(\s*", 'look_chk1(dDA, sx[:m.start()], 0, ', s)               # before0_chk1(s)
    s = re.sub(r"after0_chk1\(\s*", 'look_chk1(dDA, sx[m.end():], m.end(), ', s)            # after0_chk1(s)
    s = re.sub(r"textarea0_chk1\(\s*", 'look_chk1(dDA, sx, 0, ', s)                         # textarea0_chk1(s)
    s = re.sub(r"isEndOfNG\(\s*\)", 'isEndOfNG(dDA, s[m.end():], m.end())', s)              # isEndOfNG(s)
    s = re.sub(r"isNextNotCOD\(\s*\)", 'isNextNotCOD(dDA, s[m.end():], m.end())', s)        # isNextNotCOD(s)
    s = re.sub(r"isNextVerb\(\s*\)", 'isNextVerb(dDA, s[m.end():], m.end())', s)            # isNextVerb(s)
    s = re.sub(r"before0_chk1\(\s*", 'look_chk1(dTokenPos, sx[:m.start()], 0, ', s)         # before0_chk1(s)
    s = re.sub(r"after0_chk1\(\s*", 'look_chk1(dTokenPos, sx[m.end():], m.end(), ', s)      # after0_chk1(s)
    s = re.sub(r"textarea0_chk1\(\s*", 'look_chk1(dTokenPos, sx, 0, ', s)                   # textarea0_chk1(s)
    s = re.sub(r"isEndOfNG\(\s*\)", 'isEndOfNG(dTokenPos, s[m.end():], m.end())', s)        # isEndOfNG(s)
    s = re.sub(r"isNextNotCOD\(\s*\)", 'isNextNotCOD(dTokenPos, s[m.end():], m.end())', s)  # isNextNotCOD(s)
    s = re.sub(r"isNextVerb\(\s*\)", 'isNextVerb(dTokenPos, s[m.end():], m.end())', s)      # isNextVerb(s)
    s = re.sub(r"\bspell *[(]", '_oSpellChecker.isValid(', s)
    s = re.sub(r"[\\](\d+)", 'm.group(\\1)', s)
    return s


def uppercase (s, sLang):
    "(flag i is not enough): converts regex to uppercase regex: 'foo' becomes '[Ff][Oo][Oo]', but 'Bar' becomes 'B[Aa][Rr]'."
108
109
110
111
112
113
114
115
116
117









118
119
120
121
122
123
124
109
110
111
112
113
114
115

116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133







-


+
+
+
+
+
+
+
+
+









def createRule (s, nIdLine, sLang, bParagraph, dOptPriority):
    "returns rule as list [option name, regex, bCaseInsensitive, identifier, list of actions]"
    global dJSREGEXES
    global nRULEWITHOUTNAME

    #### OPTIONS
    sLineId = str(nIdLine) + ("p" if bParagraph else "s")
    sRuleId = sLineId

    #### GRAPH CALL
    if s.startswith("@@@@"):
        if bParagraph:
            print("Error. Graph call can’t be made only after the first pass (sentence by sentence)")
            exit()
        return ["@@@@", s[4:], sLineId]

    #### OPTIONS
    sOption = False         # False or [a-z0-9]+ name
    nPriority = 4           # Default is 4, value must be between 0 and 9
    tGroups = None          # code for groups positioning (only useful for JavaScript)
    cCaseMode = 'i'         # i: case insensitive,  s: case sensitive,  u: uppercasing allowed
    cWordLimitLeft = '['    # [: word limit, <: no specific limit
    cWordLimitRight = ']'   # ]: word limit, >: no specific limit
    m = re.match("^__(?P<borders_and_case>[[<]\\w[]>])(?P<option>/[a-zA-Z0-9]+|)(?P<ruleid>\\(\\w+\\)|)(?P<priority>![0-9]|)__ *", s)
341
342
343
344
345
346
347

348
349


350
351
352
353
354
355
356
350
351
352
353
354
355
356
357


358
359
360
361
362
363
364
365
366







+
-
-
+
+







        print("# Unknown action at line " + sIdAction)
        return None


def _calcRulesStats (lRules):
    d = {'=':0, '~': 0, '-': 0, '>': 0}
    for aRule in lRules:
        if aRule[0] != "@@@@":
        for aAction in aRule[6]:
            d[aAction[1]] = d[aAction[1]] + 1
            for aAction in aRule[6]:
                d[aAction[1]] = d[aAction[1]] + 1
    return (d, len(lRules))


def displayStats (lParagraphRules, lSentenceRules):
    print("  {:>18} {:>18} {:>18} {:>18}".format("DISAMBIGUATOR", "TEXT PROCESSOR", "GRAMMAR CHECKING", "REGEX"))
    d, nRule = _calcRulesStats(lParagraphRules)
    print("§ {:>10} actions {:>10} actions {:>10} actions  in {:>8} rules".format(d['='], d['~'], d['-'], nRule))
430
431
432
433
434
435
436
437
438
439
440
441
442


443
444
445

446
447
448

449
450

451
452

453
454
455
456
457
458
459
460
461
462

463
464

465
466

467
468
469
470
471


472
473
474





















475

476

477

478
479
480
481
482
483
484
440
441
442
443
444
445
446

447
448
449


450
451
452
453
454
455
456
457
458
459
460

461


462



463
464
465
466
467
468
469
470
471
472
473
474
475
476
477


478

479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506

507
508
509
510
511
512
513
514
515
516







-



-
-
+
+



+



+

-
+
-
-
+
-
-
-







+


+


+

-
-

-
+
+



+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+

+
-
+

+







    except:
        print("Error. Rules file in project [" + sLang + "] not found.")
        exit()

    # removing comments, zeroing empty lines, creating definitions, storing tests, merging rule lines
    print("  parsing rules...")
    global dDEF
    lLine = []
    lRuleLine = []
    lTest = []
    lOpt = []
    zBookmark = re.compile("^!!+")
    zGraphLink = re.compile(r"^@@@@GRAPHLINK>(\w+)@@@@")
    bGraph = False
    lGraphRule = []

    for i, sLine in enumerate(lRules, 1):
        if sLine.startswith('#END'):
            # arbitrary end
            printBookmark(0, "BREAK BY #END", i)
            break
        elif sLine.startswith("#"):
            # comment
            pass
        elif sLine.startswith("@@@@"):
        elif sLine.startswith("DEF:"):
            m = re.match(r"^@@@@GRAPHLINK>(\w+)@@@@", sLine.strip())
            if m:
            # definition
                #lRuleLine.append(["@GRAPHLINK", m.group(1)])
                printBookmark(1, "@GRAPHLINK: " + m.group(1), i)
        elif sLine.startswith("DEF:"):
            m = re.match("DEF: +([a-zA-Z_][a-zA-Z_0-9]*) +(.+)$", sLine.strip())
            if m:
                dDEF["{"+m.group(1)+"}"] = m.group(2)
            else:
                print("Error in definition: ", end="")
                print(sLine.strip())
        elif sLine.startswith("TEST:"):
            # test
            lTest.append("{:<8}".format(i) + "  " + sLine[5:].strip())
        elif sLine.startswith("TODO:"):
            # todo
            pass
        elif sLine.startswith(("OPTGROUP/", "OPTSOFTWARE:", "OPT/", "OPTLANG/", "OPTDEFAULTUILANG:", "OPTLABEL/", "OPTPRIORITY/")):
            # options
            lOpt.append(sLine)
        elif re.match("[  \t]*$", sLine):
            pass
        elif sLine.startswith("!!"):
            m = zBookmark.search(sLine)
            # bookmark
            m = re.match("!!+", sLine)
            nExMk = len(m.group(0))
            if sLine[nExMk:].strip():
                printBookmark(nExMk-2, sLine[nExMk:].strip(), i)
        # Graph rules
        elif sLine.startswith("@@@@GRAPH:"):
            # rules graph call
            m = re.match(r"@@@@GRAPH: *(\w+)", sLine.strip())
            if m:
                printBookmark(1, "@GRAPH: " + m.group(1), i)
                lRuleLine.append([i, "@@@@"+m.group(1)])
                bGraph = True
            lGraphRule.append([i, sLine])
            bGraph = True
        elif sLine.startswith("@@@@END_GRAPH"):
            lGraphRule.append([i, sLine])
            bGraph = False
        elif re.match("@@@@ *$", sLine):
            pass
        elif bGraph:
            lGraphRule.append([i, sLine])
        # Regex rules
        elif re.match("[  \t]*$", sLine):
            # empty line
            pass
        elif sLine.startswith(("    ", "\t")):
            # rule (continuation)
            lRuleLine[len(lRuleLine)-1][1] += " " + sLine.strip()
            lRuleLine[-1][1] += " " + sLine.strip()
        else:
            # new rule
            lRuleLine.append([i, sLine.strip()])

    # generating options files
    print("  parsing options...")
    try:
        dOptions, dOptPriority = prepareOptions(lOpt)
    except:
515
516
517
518
519
520
521
522

523
524
525
526
527
528
529
530

531
532
533
534
535
536
537
547
548
549
550
551
552
553

554
555
556
557
558
559
560
561

562
563
564
565
566
567
568
569







-
+







-
+







    # creating file with all functions callable by rules
    print("  creating callables...")
    sPyCallables = "# generated code, do not edit\n"
    sJSCallables = "// generated code, do not edit\nconst oEvalFunc = {\n"
    for sFuncName, sReturn in lFUNCTIONS:
        cType = sFuncName[0:1]
        if cType == "c": # condition
            sParams = "s, sx, m, dDA, sCountry, bCondMemo"
            sParams = "s, sx, m, dTokenPos, sCountry, bCondMemo"
        elif cType == "m": # message
            sParams = "s, m"
        elif cType == "s": # suggestion
            sParams = "s, m"
        elif cType == "p": # preprocessor
            sParams = "s, m"
        elif cType == "d": # disambiguator
            sParams = "s, m, dDA"
            sParams = "s, m, dTokenPos"
        else:
            print("# Unknown function type in [" + sFuncName + "]")
            continue
        sPyCallables += "def {} ({}):\n".format(sFuncName, sParams)
        sPyCallables += "    return " + sReturn + "\n"
        sJSCallables += "    {}: function ({})".format(sFuncName, sParams) + " {\n"
        sJSCallables += "        return " + jsconv.py2js(sReturn) + ";\n"
547
548
549
550
551
552
553
554




555
579
580
581
582
583
584
585
586
587
588
589
590
591








+
+
+
+

          "gctests": sGCTests,
          "gctestsJS": sGCTestsJS,
          "paragraph_rules": mergeRulesByOption(lParagraphRules),
          "sentence_rules": mergeRulesByOption(lSentenceRules),
          "paragraph_rules_JS": jsconv.writeRulesToJSArray(mergeRulesByOption(lParagraphRulesJS)),
          "sentence_rules_JS": jsconv.writeRulesToJSArray(mergeRulesByOption(lSentenceRulesJS)) }
    d.update(dOptions)

    # compile graph rules
    d2 = crg.make(lGraphRule, sLang, bJavaScript)
    d.update(d2)

    return d