Grammalecte  Check-in [1184e8ba6d]

Overview
Comment:[build][core] regex rules now use tokens for disambiguation
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | core | build | rg
Files: files | file ages | folders
SHA3-256: 1184e8ba6d44299b56d10d9e778ca03ef1e154e05c43363c23109a761f9c4d99
User & Date: olr on 2018-06-13 09:26:32
Original Comment: [build][core] merge regex rules now use tokens for disambiguation
Other Links: branch diff | manifest | tags
Context
2018-06-13
17:10
[fr] conversion regex rules -> graph rules check-in: edf852434a user: olr tags: fr, rg
09:26
[build][core] regex rules now use tokens for disambiguation check-in: 1184e8ba6d user: olr tags: core, build, rg
07:57
[core] gc engine: dictionary of tokens position for disambiguation check-in: ff58bafc4d user: olr tags: core, rg
Changes

Modified compile_rules.py from [856372255b] to [a5c1ea137d].

26
27
28
29
30
31
32
33
34


35
36

37
38
39
40
41
42
43
44
45
46
47



48
49
50
51
52
53
54
55
56
57






58
59
60
61
62
63
64
26
27
28
29
30
31
32


33
34
35

36
37
38
39
40
41
42
43
44



45
46
47
48
49
50
51






52
53
54
55
56
57
58
59
60
61
62
63
64







-
-
+
+

-
+








-
-
-
+
+
+




-
-
-
-
-
-
+
+
+
+
+
+







    s = re.sub(r"isRealStart *\(\)", 'before("^ *$")', s)
    s = re.sub(r"isStart0 *\(\)", 'before0("^ *$|, *$")', s)
    s = re.sub(r"isRealStart0 *\(\)", 'before0("^ *$")', s)
    s = re.sub(r"isEnd *\(\)", 'after("^ *$|^,")', s)
    s = re.sub(r"isRealEnd *\(\)", 'after("^ *$")', s)
    s = re.sub(r"isEnd0 *\(\)", 'after0("^ *$|^,")', s)
    s = re.sub(r"isRealEnd0 *\(\)", 'after0("^ *$")', s)
    s = re.sub(r"(select|exclude)[(][\\](\d+)", '\\1(dDA, m.start(\\2), m.group(\\2)', s)
    s = re.sub(r"define[(][\\](\d+)", 'define(dDA, m.start(\\1)', s)
    s = re.sub(r"(select|exclude)[(][\\](\d+)", '\\1(dTokenPos, m.start(\\2), m.group(\\2)', s)
    s = re.sub(r"define[(][\\](\d+)", 'define(dTokenPos, m.start(\\1)', s)
    s = re.sub(r"(morph|morphex|displayInfo)[(][\\](\d+)", '\\1((m.start(\\2), m.group(\\2))', s)
    s = re.sub(r"(morph|morphex|displayInfo)[(]", '\\1(dDA, ', s)
    s = re.sub(r"(morph|morphex|displayInfo)[(]", '\\1(dTokenPos, ', s)
    s = re.sub(r"(sugg\w+|switch\w+)\(@", '\\1(m.group(i[4])', s)
    s = re.sub(r"word\(\s*1\b", 'nextword1(s, m.end()', s)                                  # word(1)
    s = re.sub(r"word\(\s*-1\b", 'prevword1(s, m.start()', s)                               # word(-1)
    s = re.sub(r"word\(\s*(\d)", 'nextword(s, m.end(), \\1', s)                             # word(n)
    s = re.sub(r"word\(\s*-(\d)", 'prevword(s, m.start(), \\1', s)                          # word(-n)
    s = re.sub(r"before\(\s*", 'look(s[:m.start()], ', s)                                   # before(s)
    s = re.sub(r"after\(\s*", 'look(s[m.end():], ', s)                                      # after(s)
    s = re.sub(r"textarea\(\s*", 'look(s, ', s)                                             # textarea(s)
    s = re.sub(r"before_chk1\(\s*", 'look_chk1(dDA, s[:m.start()], 0, ', s)                 # before_chk1(s)
    s = re.sub(r"after_chk1\(\s*", 'look_chk1(dDA, s[m.end():], m.end(), ', s)              # after_chk1(s)
    s = re.sub(r"textarea_chk1\(\s*", 'look_chk1(dDA, s, 0, ', s)                           # textarea_chk1(s)
    s = re.sub(r"before_chk1\(\s*", 'look_chk1(dTokenPos, s[:m.start()], 0, ', s)           # before_chk1(s)
    s = re.sub(r"after_chk1\(\s*", 'look_chk1(dTokenPos, s[m.end():], m.end(), ', s)        # after_chk1(s)
    s = re.sub(r"textarea_chk1\(\s*", 'look_chk1(dTokenPos, s, 0, ', s)                     # textarea_chk1(s)
    s = re.sub(r"/0", 'sx[m.start():m.end()]', s)                                           # /0
    s = re.sub(r"before0\(\s*", 'look(sx[:m.start()], ', s)                                 # before0(s)
    s = re.sub(r"after0\(\s*", 'look(sx[m.end():], ', s)                                    # after0(s)
    s = re.sub(r"textarea0\(\s*", 'look(sx, ', s)                                           # textarea0(s)
    s = re.sub(r"before0_chk1\(\s*", 'look_chk1(dDA, sx[:m.start()], 0, ', s)               # before0_chk1(s)
    s = re.sub(r"after0_chk1\(\s*", 'look_chk1(dDA, sx[m.end():], m.end(), ', s)            # after0_chk1(s)
    s = re.sub(r"textarea0_chk1\(\s*", 'look_chk1(dDA, sx, 0, ', s)                         # textarea0_chk1(s)
    s = re.sub(r"isEndOfNG\(\s*\)", 'isEndOfNG(dDA, s[m.end():], m.end())', s)              # isEndOfNG(s)
    s = re.sub(r"isNextNotCOD\(\s*\)", 'isNextNotCOD(dDA, s[m.end():], m.end())', s)        # isNextNotCOD(s)
    s = re.sub(r"isNextVerb\(\s*\)", 'isNextVerb(dDA, s[m.end():], m.end())', s)            # isNextVerb(s)
    s = re.sub(r"before0_chk1\(\s*", 'look_chk1(dTokenPos, sx[:m.start()], 0, ', s)         # before0_chk1(s)
    s = re.sub(r"after0_chk1\(\s*", 'look_chk1(dTokenPos, sx[m.end():], m.end(), ', s)      # after0_chk1(s)
    s = re.sub(r"textarea0_chk1\(\s*", 'look_chk1(dTokenPos, sx, 0, ', s)                   # textarea0_chk1(s)
    s = re.sub(r"isEndOfNG\(\s*\)", 'isEndOfNG(dTokenPos, s[m.end():], m.end())', s)        # isEndOfNG(s)
    s = re.sub(r"isNextNotCOD\(\s*\)", 'isNextNotCOD(dTokenPos, s[m.end():], m.end())', s)  # isNextNotCOD(s)
    s = re.sub(r"isNextVerb\(\s*\)", 'isNextVerb(dTokenPos, s[m.end():], m.end())', s)      # isNextVerb(s)
    s = re.sub(r"\bspell *[(]", '_oSpellChecker.isValid(', s)
    s = re.sub(r"[\\](\d+)", 'm.group(\\1)', s)
    return s


def uppercase (s, sLang):
    "(flag i is not enough): converts regex to uppercase regex: 'foo' becomes '[Ff][Oo][Oo]', but 'Bar' becomes 'B[Aa][Rr]'."
547
548
549
550
551
552
553
554

555
556
557
558
559
560
561
562

563
564
565
566
567
568
569
547
548
549
550
551
552
553

554
555
556
557
558
559
560
561

562
563
564
565
566
567
568
569







-
+







-
+







    # creating file with all functions callable by rules
    print("  creating callables...")
    sPyCallables = "# generated code, do not edit\n"
    sJSCallables = "// generated code, do not edit\nconst oEvalFunc = {\n"
    for sFuncName, sReturn in lFUNCTIONS:
        cType = sFuncName[0:1]
        if cType == "c": # condition
            sParams = "s, sx, m, dDA, sCountry, bCondMemo"
            sParams = "s, sx, m, dTokenPos, sCountry, bCondMemo"
        elif cType == "m": # message
            sParams = "s, m"
        elif cType == "s": # suggestion
            sParams = "s, m"
        elif cType == "p": # preprocessor
            sParams = "s, m"
        elif cType == "d": # disambiguator
            sParams = "s, m, dDA"
            sParams = "s, m, dTokenPos"
        else:
            print("# Unknown function type in [" + sFuncName + "]")
            continue
        sPyCallables += "def {} ({}):\n".format(sFuncName, sParams)
        sPyCallables += "    return " + sReturn + "\n"
        sJSCallables += "    {}: function ({})".format(sFuncName, sParams) + " {\n"
        sJSCallables += "        return " + jsconv.py2js(sReturn) + ";\n"

Modified gc_core/py/lang_core/gc_engine.py from [f48195d439] to [8eb4241441].

104
105
106
107
108
109
110
111
112
113
114
115
116
117
118

119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140

141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158

159
160
161

162
163
164
165
166
167
168
104
105
106
107
108
109
110

111
112
113
114
115
116

117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135

136
137

138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155

156
157
158
159
160
161
162
163
164
165
166
167







-






-
+


















-


-
+

















-
+



+







#### Parsing

def parse (sText, sCountry="${country_default}", bDebug=False, dOptions=None, bContext=False):
    "analyses the paragraph sText and returns list of errors"
    #sText = unicodedata.normalize("NFC", sText)
    aErrors = None
    sRealText = sText
    dDA = {}        # Disambiguisator. Key = position; value = list of morphologies
    dPriority = {}  # Key = position; value = priority
    dOpt = _dOptions  if not dOptions  else dOptions
    bShowRuleId = option('idrule')

    # parse paragraph
    try:
        sNew, aErrors = _proofread(None, sText, sRealText, 0, True, dDA, dPriority, sCountry, dOpt, bShowRuleId, bDebug, bContext)
        sNew, aErrors = _proofread(None, sText, sRealText, 0, True, dPriority, sCountry, dOpt, bShowRuleId, bDebug, bContext)
        if sNew:
            sText = sNew
    except:
        raise

    # cleanup
    if " " in sText:
        sText = sText.replace(" ", ' ') # nbsp
    if " " in sText:
        sText = sText.replace(" ", ' ') # nnbsp
    if "'" in sText:
        sText = sText.replace("'", "’")
    if "‑" in sText:
        sText = sText.replace("‑", "-") # nobreakdash

    # parse sentences
    for iStart, iEnd in _getSentenceBoundaries(sText):
        if 4 < (iEnd - iStart) < 2000:
            dDA.clear()
            try:
                oSentence = TokenSentence(sText[iStart:iEnd], sRealText[iStart:iEnd], iStart)
                _, errs = _proofread(oSentence, sText[iStart:iEnd], sRealText[iStart:iEnd], iStart, False, dDA, dPriority, sCountry, dOpt, bShowRuleId, bDebug, bContext)
                _, errs = _proofread(oSentence, sText[iStart:iEnd], sRealText[iStart:iEnd], iStart, False, dPriority, sCountry, dOpt, bShowRuleId, bDebug, bContext)
                aErrors.update(errs)
            except:
                raise
    return aErrors.values() # this is a view (iterable)


_zEndOfSentence = re.compile(r'([.?!:;…][ .?!… »”")]*|.$)')
_zBeginOfParagraph = re.compile(r"^\W*")
_zEndOfParagraph = re.compile(r"\W*$")

def _getSentenceBoundaries (sText):
    iStart = _zBeginOfParagraph.match(sText).end()
    for m in _zEndOfSentence.finditer(sText):
        yield (iStart, m.end())
        iStart = m.end()


def _proofread (oSentence, s, sx, nOffset, bParagraph, dDA, dPriority, sCountry, dOptions, bShowRuleId, bDebug, bContext):
def _proofread (oSentence, s, sx, nOffset, bParagraph, dPriority, sCountry, dOptions, bShowRuleId, bDebug, bContext):
    dErrs = {}
    bParagraphChange = False
    bSentenceChange = False
    dTokenPos = oSentence.dTokenPos if oSentence else {}
    for sOption, lRuleGroup in _getRules(bParagraph):
        if sOption == "@@@@":
            # graph rules
            if not bParagraph and bSentenceChange:
                oSentence.update(s)
                bSentenceChange = False
            for sGraphName, sLineId in lRuleGroup:
179
180
181
182
183
184
185
186

187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202

203
204
205



206
207
208
209
210
211
212
178
179
180
181
182
183
184

185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202



203
204
205
206
207
208
209
210
211
212







-
+
















+
-
-
-
+
+
+







            for zRegex, bUppercase, sLineId, sRuleId, nPriority, lActions in lRuleGroup:
                if sRuleId not in _aIgnoredRules:
                    for m in zRegex.finditer(s):
                        bCondMemo = None
                        for sFuncCond, cActionType, sWhat, *eAct in lActions:
                            # action in lActions: [ condition, action type, replacement/suggestion/action[, iGroup[, message, URL]] ]
                            try:
                                bCondMemo = not sFuncCond or globals()[sFuncCond](s, sx, m, dDA, sCountry, bCondMemo)
                                bCondMemo = not sFuncCond or globals()[sFuncCond](s, sx, m, dTokenPos, sCountry, bCondMemo)
                                if bCondMemo:
                                    if cActionType == "-":
                                        # grammar error
                                        nErrorStart = nOffset + m.start(eAct[0])
                                        if nErrorStart not in dErrs or nPriority > dPriority[nErrorStart]:
                                            dErrs[nErrorStart] = _createRegexError(s, sx, sWhat, nOffset, m, eAct[0], sLineId, sRuleId, bUppercase, eAct[1], eAct[2], bShowRuleId, sOption, bContext)
                                            dPriority[nErrorStart] = nPriority
                                    elif cActionType == "~":
                                        # text processor
                                        s = _rewrite(s, sWhat, eAct[0], m, bUppercase)
                                        bParagraphChange = True
                                        bSentenceChange = True
                                        if bDebug:
                                            echo("~ " + s + "  -- " + m.group(eAct[0]) + "  # " + sLineId)
                                    elif cActionType == "=":
                                        # disambiguation
                                        if not bParagraph:
                                        globals()[sWhat](s, m, dDA)
                                        if bDebug:
                                            echo("= " + m.group(0) + "  # " + sLineId + "\nDA: " + str(dDA))
                                            globals()[sWhat](s, m, dTokenPos)
                                            if bDebug:
                                                echo("= " + m.group(0) + "  # " + sLineId)
                                    elif cActionType == ">":
                                        # we do nothing, this test is just a condition to apply all following actions
                                        pass
                                    else:
                                        echo("# error: unknown action at " + sLineId)
                                elif cActionType == ">":
                                    break
395
396
397
398
399
400
401
402

403
404
405
406
407
408
409
410
411
412


413
414
415
416
417

418
419
420
421

422
423
424
425
426
427
428
429
430

431
432
433
434

435
436
437
438
439
440
441
395
396
397
398
399
400
401

402
403
404
405
406
407
408
409
410


411
412
413
414
415
416

417
418
419
420

421
422
423
424
425
426
427
428
429

430
431
432
433

434
435
436
437
438
439
440
441







-
+








-
-
+
+




-
+



-
+








-
+



-
+







#### common functions

def option (sOpt):
    "return True if option sOpt is active"
    return _dOptions.get(sOpt, False)


def displayInfo (dDA, tWord):
def displayInfo (dTokenPos, tWord):
    "for debugging: retrieve info of word"
    if not tWord:
        echo("> nothing to find")
        return True
    lMorph = _oSpellChecker.getMorph(tWord[1])
    if not lMorph:
        echo("> not in dictionary")
        return True
    if tWord[0] in dDA:
        echo("DA: " + str(dDA[tWord[0]]))
    if tWord[0] in dTokenPos and "lMorph" in dTokenPos[tWord[0]]:
        echo("DA: " + str(dTokenPos[tWord[0]]["lMorph"]))
    echo("FSA: " + str(lMorph))
    return True


def morph (dDA, tWord, sPattern, bStrict=True, bNoWord=False):
def morph (dTokenPos, tWord, sPattern, bStrict=True, bNoWord=False):
    "analyse a tuple (position, word), return True if sPattern in morphologies (disambiguation on)"
    if not tWord:
        return bNoWord
    lMorph = dDA[tWord[0]]  if tWord[0] in dDA  else _oSpellChecker.getMorph(tWord[1])
    lMorph = dTokenPos[tWord[0]]["lMorph"]  if tWord[0] in dTokenPos and "lMorph" in dTokenPos[tWord[0]]  else _oSpellChecker.getMorph(tWord[1])
    if not lMorph:
        return False
    p = re.compile(sPattern)
    if bStrict:
        return all(p.search(s)  for s in lMorph)
    return any(p.search(s)  for s in lMorph)


def morphex (dDA, tWord, sPattern, sNegPattern, bNoWord=False):
def morphex (dTokenPos, tWord, sPattern, sNegPattern, bNoWord=False):
    "analyse a tuple (position, word), returns True if not sNegPattern in word morphologies and sPattern in word morphologies (disambiguation on)"
    if not tWord:
        return bNoWord
    lMorph = dDA[tWord[0]]  if tWord[0] in dDA  else _oSpellChecker.getMorph(tWord[1])
    lMorph = dTokenPos[tWord[0]]["lMorph"]  if tWord[0] in dTokenPos and "lMorph" in dTokenPos[tWord[0]]  else _oSpellChecker.getMorph(tWord[1])
    if not lMorph:
        return False
    # check negative condition
    np = re.compile(sNegPattern)
    if any(np.search(s)  for s in lMorph):
        return False
    # search sPattern
513
514
515
516
517
518
519
520

521
522
523
524
525
526
527
528
529
530
531
532


533
534
535
536
537

538
539
540


541
542
543
544
545
546
547
548

549
550

551
552
553
554

555
556
557


558
559
560
561
562
563
564
565

566
567

568
569
570
571
572





573
574
575
576
577
578
579
513
514
515
516
517
518
519

520
521
522
523
524
525
526
527
528
529
530


531
532
533
534
535
536

537
538
539

540
541
542
543
544
545
546
547
548

549
550

551
552
553
554

555
556
557

558
559
560
561
562
563
564
565
566

567
568

569
570
571
572


573
574
575
576
577
578
579
580
581
582
583
584







-
+










-
-
+
+




-
+


-
+
+







-
+

-
+



-
+


-
+
+







-
+

-
+



-
-
+
+
+
+
+







    if sNegPattern and re.search(sNegPattern, s):
        return False
    if re.search(sPattern, s):
        return True
    return False


def look_chk1 (dDA, s, nOffset, sPattern, sPatternGroup1, sNegPatternGroup1=None):
def look_chk1 (dTokenPos, s, nOffset, sPattern, sPatternGroup1, sNegPatternGroup1=None):
    "returns True if s has pattern sPattern and m.group(1) has pattern sPatternGroup1"
    m = re.search(sPattern, s)
    if not m:
        return False
    try:
        sWord = m.group(1)
        nPos = m.start(1) + nOffset
    except:
        return False
    if sNegPatternGroup1:
        return morphex(dDA, (nPos, sWord), sPatternGroup1, sNegPatternGroup1)
    return morph(dDA, (nPos, sWord), sPatternGroup1, False)
        return morphex(dTokenPos, (nPos, sWord), sPatternGroup1, sNegPatternGroup1)
    return morph(dTokenPos, (nPos, sWord), sPatternGroup1, False)


#### Disambiguator

def select (dDA, nPos, sWord, sPattern, lDefault=None):
def select (dTokenPos, nPos, sWord, sPattern, lDefault=None):
    if not sWord:
        return True
    if nPos in dDA:
    if nPos not in dTokenPos:
        print("Error. There should be a token at this position: ", nPos)
        return True
    lMorph = _oSpellChecker.getMorph(sWord)
    if not lMorph or len(lMorph) == 1:
        return True
    lSelect = [ sMorph  for sMorph in lMorph  if re.search(sPattern, sMorph) ]
    if lSelect:
        if len(lSelect) != len(lMorph):
            dDA[nPos] = lSelect
            dTokenPos[nPos]["lMorph"] = lSelect
    elif lDefault:
        dDA[nPos] = lDefault
        dTokenPos[nPos]["lMorph"] = lDefault
    return True


def exclude (dDA, nPos, sWord, sPattern, lDefault=None):
def exclude (dTokenPos, nPos, sWord, sPattern, lDefault=None):
    if not sWord:
        return True
    if nPos in dDA:
    if nPos not in dTokenPos:
        print("Error. There should be a token at this position: ", nPos)
        return True
    lMorph = _oSpellChecker.getMorph(sWord)
    if not lMorph or len(lMorph) == 1:
        return True
    lSelect = [ sMorph  for sMorph in lMorph  if not re.search(sPattern, sMorph) ]
    if lSelect:
        if len(lSelect) != len(lMorph):
            dDA[nPos] = lSelect
            dTokenPos[nPos]["lMorph"] = lSelect
    elif lDefault:
        dDA[nPos] = lDefault
        dTokenPos[nPos]["lMorph"] = lDefault
    return True


def define (dDA, nPos, lMorph):
    dDA[nPos] = lMorph
def define (dTokenPos, nPos, lMorph):
    if nPos not in dTokenPos:
        print("Error. There should be a token at this position: ", nPos)
        return True
    dTokenPos[nPos]["lMorph"] = lMorph
    return True


#### GRAMMAR CHECKER PLUGINS

${plugins}

Modified gc_lang/fr/rules.grx from [8fec34a927] to [413f581a35].

388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
388
389
390
391
392
393
394

395
396
397
398
399
400
401







-







# URL
__<i>(p_URL)__
    https?://[\w./?&!%=+*"'@$#-]+ <<- ~>> *
__<i](p_URL2)__
    ((?:{w_1}[.])*)({w_2})([.](?:com|net|org|info|fr|ca|be|ch|i[ot]|co[.]uk|tk|es|jp|zh|ru|us|nl|xyz)) @@0,**,$
    <<- ~1>> *
    <<- ~2>> =\2.capitalize()
    <<- =>> define(\2, [":MP:e:i"])
    <<- ~3>> *

# Numéro de chapitre
__<i>(p_chapitre)__
    ^\d+[.][\d.-]* <<- ~>> *

# Numéro suivi de plusieurs espaces, considéré comme une numérotation de chapitre
12396
12397
12398
12399
12400
12401
12402


12403
12404


12405
12406
12407
12408
12409
12410
12411
12395
12396
12397
12398
12399
12400
12401
12402
12403


12404
12405
12406
12407
12408
12409
12410
12411
12412







+
+
-
-
+
+







TEST: dès qu’il le {{voie}}
TEST: donnant à entendre qu’il avait l’intention de violer Laura dès qu’il en aurait l’occasion


# verbe que + subjonctif
__vmode_qqch_que_subjonctif1__
    [>afin|>avant|>pour|>quoi|>permettre|>falloir|>vouloir|>ordonner|>exiger|>désirer|>préférer|>suffire]  [que|qu’|qu]  @:(?:Os|M)  @:I¬:[GYS]
        <<- /vmode/ -4>> =suggVerbMode(\4, ":S", \3)                                                # Après « \1 que », ce verbe devrait être au subjonctif.

    >douter                                                                                                [que|qu’|qu]  @:(?:Os|M)  @:I¬:(?:[GYSK]|If)
        <<- -4>> =suggVerbMode(\4, ":S", \3)                                                         # Après « \1 que », ce verbe devrait être au subjonctif.
    >douter  [que|qu’|qu]  @:(?:Os|M)  @:I¬:(?:[GYSK]|If)
        <<- /vmode/ morph(\1, ":V", ":N") -4>> =suggVerbMode(\4, ":S", \3)                          # Après « \1 que », ce verbe devrait être au subjonctif.

TEST: Il suffit qu’il {{court}} plus
TEST: Je veux qu’il {{finit}} son repas.
TEST: quoi qu’il en {{conclut}}
TEST: Je ne veux pas que tu {{es}} des ennuis
TEST: Avant que tu {{pars}}, je voudrais qu’on discute.
TEST: Nul doute qu’elle nourrira à brève échéance la haine de demain à notre égard.