Overview
Comment: | [core] fix intertwined bugs concerning switches between regex rules and graph rules (the transition is a mess) |
---|---|
Downloads: | Tarball | ZIP archive | SQL archive |
Timelines: | family | ancestors | descendants | both | core | rg |
Files: | files | file ages | folders |
SHA3-256: |
c6acf0a935480c9849275a9867929c7a |
User & Date: | olr on 2018-06-29 15:32:17 |
Other Links: | branch diff | manifest | tags |
Context
2018-06-29
| ||
17:14 | [fr] conversion: regex rules -> graph rules check-in: 2c2013067a user: olr tags: fr, rg | |
15:32 | [core] fix intertwined bugs concerning switches between regex rules and graph rules (the transition is a mess) check-in: c6acf0a935 user: olr tags: core, rg | |
15:24 | [fr] fix few bugs check-in: ad0bbbe88a user: olr tags: fr, rg | |
Changes
Modified gc_core/py/lang_core/gc_engine.py from [16b07dde1f] to [09d4d7dd58].
︙ | ︙ | |||
156 157 158 159 160 161 162 | bParagraphChange = False bSentenceChange = False dTokenPos = oSentence.dTokenPos if oSentence else {} for sOption, lRuleGroup in _getRules(bParagraph): if sOption == "@@@@": # graph rules if not bParagraph and bSentenceChange: | | > > > | 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 | bParagraphChange = False bSentenceChange = False dTokenPos = oSentence.dTokenPos if oSentence else {} for sOption, lRuleGroup in _getRules(bParagraph): if sOption == "@@@@": # graph rules if not bParagraph and bSentenceChange: oSentence.update(s, bDebug) bSentenceChange = False for sGraphName, sLineId in lRuleGroup: if bDebug: print("\n>>>> GRAPH:", sGraphName, sLineId) bParagraphChange, s = oSentence.parse(dAllGraph[sGraphName], dPriority, sCountry, dOptions, bShowRuleId, bDebug, bContext) dErrs.update(oSentence.dError) dTokenPos = oSentence.dTokenPos elif not sOption or dOptions.get(sOption, False): # regex rules for zRegex, bUppercase, sLineId, sRuleId, nPriority, lActions in lRuleGroup: if sRuleId not in _aIgnoredRules: for m in zRegex.finditer(s): bCondMemo = None for sFuncCond, cActionType, sWhat, *eAct in lActions: # action in lActions: [ condition, action type, replacement/suggestion/action[, iGroup[, message, URL]] ] try: bCondMemo = not sFuncCond or globals()[sFuncCond](s, sx, m, dTokenPos, sCountry, bCondMemo) if bCondMemo: if bDebug: print("RULE:", sLineId) if cActionType == "-": # grammar error nErrorStart = nOffset + m.start(eAct[0]) if nErrorStart not in dErrs or nPriority > dPriority.get(nErrorStart, -1): dErrs[nErrorStart] = _createError(s, sx, sWhat, nOffset, m, eAct[0], sLineId, sRuleId, bUppercase, eAct[1], eAct[2], bShowRuleId, sOption, bContext) dPriority[nErrorStart] = nPriority elif cActionType == "~": |
︙ | ︙ | |||
407 408 409 410 411 412 413 | return any(zPattern.search(s) for s in lMorph) def morphex (dTokenPos, tWord, sPattern, sNegPattern, bNoWord=False): "analyse a tuple (position, word), returns True if not sNegPattern in word morphologies and sPattern in word morphologies (disambiguation on)" if not tWord: return bNoWord | < < < < < | 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 | return any(zPattern.search(s) for s in lMorph) def morphex (dTokenPos, tWord, sPattern, sNegPattern, bNoWord=False): "analyse a tuple (position, word), returns True if not sNegPattern in word morphologies and sPattern in word morphologies (disambiguation on)" if not tWord: return bNoWord lMorph = dTokenPos[tWord[0]]["lMorph"] if tWord[0] in dTokenPos and "lMorph" in dTokenPos[tWord[0]] else _oSpellChecker.getMorph(tWord[1]) if not lMorph: return False # check negative condition zNegPattern = re.compile(sNegPattern) if any(zNegPattern.search(s) for s in lMorph): return False # search sPattern zPattern = re.compile(sPattern) return any(zPattern.search(s) for s in lMorph) |
︙ | ︙ | |||
574 575 576 577 578 579 580 | "Text parser" def __init__ (self, sSentence, sSentence0, nOffset): self.sSentence = sSentence self.sSentence0 = sSentence0 self.nOffsetWithinParagraph = nOffset self.lToken = list(_oTokenizer.genTokens(sSentence, True)) | | | | > > > > > > > > | 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 | "Text parser" def __init__ (self, sSentence, sSentence0, nOffset): self.sSentence = sSentence self.sSentence0 = sSentence0 self.nOffsetWithinParagraph = nOffset self.lToken = list(_oTokenizer.genTokens(sSentence, True)) self.dTokenPos = { dToken["nStart"]: dToken for dToken in self.lToken if dToken["sType"] != "INFO" } self.dTags = {} self.dError = {} def __str__ (self): s = "sentence: " + self.sSentence0 + "\n" s += "now: " + self.sSentence + "\n" for dToken in self.lToken: s += f'{dToken["nStart"]}\t{dToken["nEnd"]}\t{dToken["sValue"]}' if "lMorph" in dToken: s += "\t" + str(dToken["lMorph"]) s += "\n" for nPos, dToken in self.dTokenPos.items(): s += f"{nPos}\t{dToken}\n" return s def update (self, sSentence, bDebug=False): "update <sSentence> and retokenize" self.sSentence = sSentence lNewToken = list(_oTokenizer.genTokens(sSentence, True)) for dToken in lNewToken: if "lMorph" in self.dTokenPos.get(dToken["nStart"], {}): dToken["lMorph"] = self.dTokenPos[dToken["nStart"]]["lMorph"] self.lToken = lNewToken self.dTokenPos = { dToken["nStart"]: dToken for dToken in self.lToken if dToken["sType"] != "INFO" } if bDebug: print("UPDATE:") print(self) def _getNextMatchingNodes (self, dToken, dGraph, dNode, bDebug=False): "generator: return nodes where <dToken> “values” match <dNode> arcs" # token value if dToken["sValue"] in dNode: if bDebug: print(" MATCH:", dToken["sValue"]) |
︙ | ︙ | |||
911 912 913 914 915 916 917 | if bDebug: print("REWRITE") lNewToken = [] nMergeUntil = 0 dTokenMerger = None for dToken in self.lToken: bKeepToken = True | > | | | | | | | | | | | | | | | | | | | | | | | > | > > > > | 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 | if bDebug: print("REWRITE") lNewToken = [] nMergeUntil = 0 dTokenMerger = None for dToken in self.lToken: bKeepToken = True if dToken["sType"] != "INFO": if "bImmune" in dToken: nErrorStart = self.nOffsetWithinParagraph + dToken["nStart"] if nErrorStart in self.dError: if bDebug: print("immunity -> error removed:", self.dError[nErrorStart]) del self.dError[nErrorStart] if nMergeUntil and dToken["i"] <= nMergeUntil: dTokenMerger["sValue"] += " " * (dToken["nStart"] - dTokenMerger["nEnd"]) + dToken["sValue"] dTokenMerger["nEnd"] = dToken["nEnd"] if bDebug: print(" MERGED TOKEN:", dTokenMerger["sValue"]) bKeepToken = False if "nMergeUntil" in dToken: if dToken["i"] > nMergeUntil: # this token is not already merged with a previous token dTokenMerger = dToken if dToken["nMergeUntil"] > nMergeUntil: nMergeUntil = dToken["nMergeUntil"] del dToken["nMergeUntil"] elif "bToRemove" in dToken: if bDebug: print(" REMOVED:", dToken["sValue"]) self.sSentence = self.sSentence[:dToken["nStart"]] + " " * (dToken["nEnd"] - dToken["nStart"]) + self.sSentence[dToken["nEnd"]:] bKeepToken = False # if bKeepToken: lNewToken.append(dToken) if "sNewValue" in dToken: # rewrite token and sentence if bDebug: print(dToken["sValue"], "->", dToken["sNewValue"]) dToken["sRealValue"] = dToken["sValue"] dToken["sValue"] = dToken["sNewValue"] nDiffLen = len(dToken["sRealValue"]) - len(dToken["sNewValue"]) sNewRepl = (dToken["sNewValue"] + " " * nDiffLen) if nDiffLen >= 0 else dToken["sNewValue"][:len(dToken["sRealValue"])] self.sSentence = self.sSentence[:dToken["nStart"]] + sNewRepl + self.sSentence[dToken["nEnd"]:] del dToken["sNewValue"] else: try: del self.dTokenPos[dToken["nStart"]] except: print(self) print(dToken) exit() if bDebug: print(" REWRITED:", self.sSentence) self.lToken.clear() self.lToken = lNewToken |
︙ | ︙ |