Grammalecte  Diff

Differences From Artifact [16b07dde1f]:

To Artifact [09d4d7dd58]:


156
157
158
159
160
161
162
163

164
165
166
167
168
169

170
171
172
173
174
175
176
177
178
179
180


181
182
183
184
185
186
187
156
157
158
159
160
161
162

163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190







-
+






+











+
+







    bParagraphChange = False
    bSentenceChange = False
    dTokenPos = oSentence.dTokenPos if oSentence else {}
    for sOption, lRuleGroup in _getRules(bParagraph):
        if sOption == "@@@@":
            # graph rules
            if not bParagraph and bSentenceChange:
                oSentence.update(s)
                oSentence.update(s, bDebug)
                bSentenceChange = False
            for sGraphName, sLineId in lRuleGroup:
                if bDebug:
                    print("\n>>>> GRAPH:", sGraphName, sLineId)
                bParagraphChange, s = oSentence.parse(dAllGraph[sGraphName], dPriority, sCountry, dOptions, bShowRuleId, bDebug, bContext)
                dErrs.update(oSentence.dError)
                dTokenPos = oSentence.dTokenPos
        elif not sOption or dOptions.get(sOption, False):
            # regex rules
            for zRegex, bUppercase, sLineId, sRuleId, nPriority, lActions in lRuleGroup:
                if sRuleId not in _aIgnoredRules:
                    for m in zRegex.finditer(s):
                        bCondMemo = None
                        for sFuncCond, cActionType, sWhat, *eAct in lActions:
                            # action in lActions: [ condition, action type, replacement/suggestion/action[, iGroup[, message, URL]] ]
                            try:
                                bCondMemo = not sFuncCond or globals()[sFuncCond](s, sx, m, dTokenPos, sCountry, bCondMemo)
                                if bCondMemo:
                                    if bDebug:
                                        print("RULE:", sLineId)
                                    if cActionType == "-":
                                        # grammar error
                                        nErrorStart = nOffset + m.start(eAct[0])
                                        if nErrorStart not in dErrs or nPriority > dPriority.get(nErrorStart, -1):
                                            dErrs[nErrorStart] = _createError(s, sx, sWhat, nOffset, m, eAct[0], sLineId, sRuleId, bUppercase, eAct[1], eAct[2], bShowRuleId, sOption, bContext)
                                            dPriority[nErrorStart] = nPriority
                                    elif cActionType == "~":
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
410
411
412
413
414
415
416

417
418
419




420
421
422
423
424
425
426







-



-
-
-
-







    return any(zPattern.search(s)  for s in lMorph)


def morphex (dTokenPos, tWord, sPattern, sNegPattern, bNoWord=False):
    "analyse a tuple (position, word), returns True if not sNegPattern in word morphologies and sPattern in word morphologies (disambiguation on)"
    if not tWord:
        return bNoWord

    lMorph = dTokenPos[tWord[0]]["lMorph"]  if tWord[0] in dTokenPos and "lMorph" in dTokenPos[tWord[0]]  else _oSpellChecker.getMorph(tWord[1])
    if not lMorph:
        return False
    if (tWord[1].startswith("noir")):
        print(tWord)
        print(dTokenPos)
        print(lMorph)
    # check negative condition
    zNegPattern = re.compile(sNegPattern)
    if any(zNegPattern.search(s)  for s in lMorph):
        return False
    # search sPattern
    zPattern = re.compile(sPattern)
    return any(zPattern.search(s)  for s in lMorph)
574
575
576
577
578
579
580
581

582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597

598
599
600









601
602
603
604
605
606
607
572
573
574
575
576
577
578

579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594

595
596
597

598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613







-
+















-
+


-
+
+
+
+
+
+
+
+
+







    "Text parser"

    def __init__ (self, sSentence, sSentence0, nOffset):
        self.sSentence = sSentence
        self.sSentence0 = sSentence0
        self.nOffsetWithinParagraph = nOffset
        self.lToken = list(_oTokenizer.genTokens(sSentence, True))
        self.dTokenPos = { dToken["nStart"]: dToken  for dToken in self.lToken }
        self.dTokenPos = { dToken["nStart"]: dToken  for dToken in self.lToken  if dToken["sType"] != "INFO" }
        self.dTags = {}
        self.dError = {}

    def __str__ (self):
        s = "sentence: " + self.sSentence0 + "\n"
        s += "now:      " + self.sSentence  + "\n"
        for dToken in self.lToken:
            s += f'{dToken["nStart"]}\t{dToken["nEnd"]}\t{dToken["sValue"]}'
            if "lMorph" in dToken:
                s += "\t" + str(dToken["lMorph"])
            s += "\n"
        for nPos, dToken in self.dTokenPos.items():
            s += f"{nPos}\t{dToken}\n"
        return s

    def update (self, sSentence):
    def update (self, sSentence, bDebug=False):
        "update <sSentence> and retokenize"
        self.sSentence = sSentence
        self.lToken = list(_oTokenizer.genTokens(sSentence, True))
        lNewToken = list(_oTokenizer.genTokens(sSentence, True))
        for dToken in lNewToken:
            if "lMorph" in self.dTokenPos.get(dToken["nStart"], {}):
                dToken["lMorph"] = self.dTokenPos[dToken["nStart"]]["lMorph"]
        self.lToken = lNewToken
        self.dTokenPos = { dToken["nStart"]: dToken  for dToken in self.lToken  if dToken["sType"] != "INFO" }
        if bDebug:
            print("UPDATE:")
            print(self)

    def _getNextMatchingNodes (self, dToken, dGraph, dNode, bDebug=False):
        "generator: return nodes where <dToken> “values” match <dNode> arcs"
        # token value
        if dToken["sValue"] in dNode:
            if bDebug:
                print("  MATCH:", dToken["sValue"])
911
912
913
914
915
916
917

918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940























941
942
943
944
945
946
947
948
949
950
951
952
953
954

955





956
957
958
959
960
961
962
917
918
919
920
921
922
923
924























925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962

963
964
965
966
967
968
969
970
971
972
973
974







+
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+














+
-
+
+
+
+
+







        if bDebug:
            print("REWRITE")
        lNewToken = []
        nMergeUntil = 0
        dTokenMerger = None
        for dToken in self.lToken:
            bKeepToken = True
            if dToken["sType"] != "INFO":
            if "bImmune" in dToken:
                nErrorStart = self.nOffsetWithinParagraph + dToken["nStart"]
                if nErrorStart in self.dError:
                    if bDebug:
                        print("immunity -> error removed:", self.dError[nErrorStart])
                    del self.dError[nErrorStart]
            if nMergeUntil and dToken["i"] <= nMergeUntil:
                dTokenMerger["sValue"] += " " * (dToken["nStart"] - dTokenMerger["nEnd"]) + dToken["sValue"]
                dTokenMerger["nEnd"] = dToken["nEnd"]
                if bDebug:
                    print("  MERGED TOKEN:", dTokenMerger["sValue"])
                bKeepToken = False
            if "nMergeUntil" in dToken:
                if dToken["i"] > nMergeUntil: # this token is not already merged with a previous token
                    dTokenMerger = dToken
                if dToken["nMergeUntil"] > nMergeUntil:
                    nMergeUntil = dToken["nMergeUntil"]
                del dToken["nMergeUntil"]
            elif "bToRemove" in dToken:
                if bDebug:
                    print("  REMOVED:", dToken["sValue"])
                self.sSentence = self.sSentence[:dToken["nStart"]] + " " * (dToken["nEnd"] - dToken["nStart"]) + self.sSentence[dToken["nEnd"]:]
                bKeepToken = False
                if "bImmune" in dToken:
                    nErrorStart = self.nOffsetWithinParagraph + dToken["nStart"]
                    if nErrorStart in self.dError:
                        if bDebug:
                            print("immunity -> error removed:", self.dError[nErrorStart])
                        del self.dError[nErrorStart]
                if nMergeUntil and dToken["i"] <= nMergeUntil:
                    dTokenMerger["sValue"] += " " * (dToken["nStart"] - dTokenMerger["nEnd"]) + dToken["sValue"]
                    dTokenMerger["nEnd"] = dToken["nEnd"]
                    if bDebug:
                        print("  MERGED TOKEN:", dTokenMerger["sValue"])
                    bKeepToken = False
                if "nMergeUntil" in dToken:
                    if dToken["i"] > nMergeUntil: # this token is not already merged with a previous token
                        dTokenMerger = dToken
                    if dToken["nMergeUntil"] > nMergeUntil:
                        nMergeUntil = dToken["nMergeUntil"]
                    del dToken["nMergeUntil"]
                elif "bToRemove" in dToken:
                    if bDebug:
                        print("  REMOVED:", dToken["sValue"])
                    self.sSentence = self.sSentence[:dToken["nStart"]] + " " * (dToken["nEnd"] - dToken["nStart"]) + self.sSentence[dToken["nEnd"]:]
                    bKeepToken = False
            #
            if bKeepToken:
                lNewToken.append(dToken)
                if "sNewValue" in dToken:
                    # rewrite token and sentence
                    if bDebug:
                        print(dToken["sValue"], "->", dToken["sNewValue"])
                    dToken["sRealValue"] = dToken["sValue"]
                    dToken["sValue"] = dToken["sNewValue"]
                    nDiffLen = len(dToken["sRealValue"]) - len(dToken["sNewValue"])
                    sNewRepl = (dToken["sNewValue"] + " " * nDiffLen)  if nDiffLen >= 0  else dToken["sNewValue"][:len(dToken["sRealValue"])]
                    self.sSentence = self.sSentence[:dToken["nStart"]] + sNewRepl + self.sSentence[dToken["nEnd"]:]
                    del dToken["sNewValue"]
            else:
                try:
                del self.dTokenPos[dToken["nStart"]]
                    del self.dTokenPos[dToken["nStart"]]
                except:
                    print(self)
                    print(dToken)
                    exit()
        if bDebug:
            print("  REWRITED:", self.sSentence)
        self.lToken.clear()
        self.lToken = lNewToken