@@ -398,71 +398,55 @@ self.dTokenPos = { dToken["nStart"]: dToken for dToken in self.lTokens if dToken["sType"] != "INFO" } if bDebug: echo("UPDATE:") echo(self) - def _getNextPointers (self, dToken, dGraph, dPointer, bDebug=False): - "generator: return nodes where “values” match arcs" - dNode = dGraph[dPointer["iNode"]] - iToken1 = dPointer["iToken1"] + def _getMatches (self, dGraph, dToken, dNode, bKeep=False): + "generator: return matches where “values” match arcs" bTokenFound = False # token value if dToken["sValue"] in dNode: - if bDebug: - echo(" MATCH: " + dToken["sValue"]) - yield { "iToken1": iToken1, "iNode": dNode[dToken["sValue"]] } + yield (" ", dToken["sValue"], dNode[dToken["sValue"]]) bTokenFound = True if dToken["sValue"][0:2].istitle(): # we test only 2 first chars, to make valid words such as "Laissez-les", "Passe-partout". sValue = dToken["sValue"].lower() if sValue in dNode: - if bDebug: - echo(" MATCH: " + sValue) - yield { "iToken1": iToken1, "iNode": dNode[sValue] } + yield (" ", sValue, dNode[sValue]) bTokenFound = True elif dToken["sValue"].isupper(): sValue = dToken["sValue"].lower() if sValue in dNode: - if bDebug: - echo(" MATCH: " + sValue) - yield { "iToken1": iToken1, "iNode": dNode[sValue] } + yield (" ", sValue, dNode[sValue]) bTokenFound = True sValue = dToken["sValue"].capitalize() if sValue in dNode: - if bDebug: - echo(" MATCH: " + sValue) - yield { "iToken1": iToken1, "iNode": dNode[sValue] } + yield (" ", sValue, dNode[sValue]) bTokenFound = True # regex value arcs if dToken["sType"] not in frozenset(["INFO", "PUNC", "SIGN"]): if "" in dNode: for sRegex in dNode[""]: if "¬" not in sRegex: # no anti-pattern if re.search(sRegex, dToken["sValue"]): - if bDebug: - echo(" MATCH: ~" + sRegex) - yield { "iToken1": iToken1, "iNode": dNode[""][sRegex] } + yield ("~", sRegex, dNode[""][sRegex]) bTokenFound = True else: # there is an anti-pattern sPattern, sNegPattern = sRegex.split("¬", 1) if sNegPattern and re.search(sNegPattern, dToken["sValue"]): continue if not sPattern or re.search(sPattern, dToken["sValue"]): - if bDebug: - echo(" MATCH: ~" + sRegex) - yield { "iToken1": iToken1, "iNode": dNode[""][sRegex] } + yield ("~", sRegex, dNode[""][sRegex]) bTokenFound = True # analysable tokens if dToken["sType"][0:4] == "WORD": # token lemmas if "" in dNode: for sLemma in _oSpellChecker.getLemma(dToken["sValue"]): if sLemma in dNode[""]: - if bDebug: - echo(" MATCH: >" + sLemma) - yield { "iToken1": iToken1, "iNode": dNode[""][sLemma] } + yield (">", sLemma, dNode[""][sLemma]) bTokenFound = True # phonetic similarity if "" in dNode: for sPhonet in dNode[""]: if sPhonet.endswith("!"): @@ -473,106 +457,85 @@ if dToken["sValue"].lower() == sPhon: continue if dToken["sValue"].isupper() and dToken["sValue"].capitalize() == sPhon: continue if phonet.isSimilAs(dToken["sValue"], sPhonet.rstrip("!")): - if bDebug: - echo(" MATCH: %" + sPhonet) - yield { "iToken1": iToken1, "iNode": dNode[""][sPhonet] } + yield ("#", sPhonet, dNode[""][sPhonet]) bTokenFound = True # morph arcs if "" in dNode: lMorph = dToken.get("lMorph", _oSpellChecker.getMorph(dToken["sValue"])) if lMorph: for sSearch in dNode[""]: if "¬" not in sSearch: # no anti-pattern if any(sSearch in sMorph for sMorph in lMorph): - if bDebug: - echo(" MATCH: $" + sSearch) - yield { "iToken1": iToken1, "iNode": dNode[""][sSearch] } + yield ("$", sSearch, dNode[""][sSearch]) bTokenFound = True else: # there is an anti-pattern sPattern, sNegPattern = sSearch.split("¬", 1) if sNegPattern == "*": # all morphologies must match with if sPattern: if all(sPattern in sMorph for sMorph in lMorph): - if bDebug: - echo(" MATCH: $" + sSearch) - yield { "iToken1": iToken1, "iNode": dNode[""][sSearch] } + yield ("$", sSearch, dNode[""][sSearch]) bTokenFound = True else: if sNegPattern and any(sNegPattern in sMorph for sMorph in lMorph): continue if not sPattern or any(sPattern in sMorph for sMorph in lMorph): - if bDebug: - echo(" MATCH: $" + sSearch) - yield { "iToken1": iToken1, "iNode": dNode[""][sSearch] } + yield ("$", sSearch, dNode[""][sSearch]) bTokenFound = True # regex morph arcs if "" in dNode: lMorph = dToken.get("lMorph", _oSpellChecker.getMorph(dToken["sValue"])) if lMorph: for sRegex in dNode[""]: if "¬" not in sRegex: # no anti-pattern if any(re.search(sRegex, sMorph) for sMorph in lMorph): - if bDebug: - echo(" MATCH: @" + sRegex) - yield { "iToken1": iToken1, "iNode": dNode[""][sRegex] } + yield ("@", sRegex, dNode[""][sRegex]) bTokenFound = True else: # there is an anti-pattern sPattern, sNegPattern = sRegex.split("¬", 1) if sNegPattern == "*": # all morphologies must match with if sPattern: if all(re.search(sPattern, sMorph) for sMorph in lMorph): - if bDebug: - echo(" MATCH: @" + sRegex) - yield { "iToken1": iToken1, "iNode": dNode[""][sRegex] } + yield ("@", sRegex, dNode[""][sRegex]) bTokenFound = True else: if sNegPattern and any(re.search(sNegPattern, sMorph) for sMorph in lMorph): continue if not sPattern or any(re.search(sPattern, sMorph) for sMorph in lMorph): - if bDebug: - echo(" MATCH: @" + sRegex) - yield { "iToken1": iToken1, "iNode": dNode[""][sRegex] } + yield ("@", sRegex, dNode[""][sRegex]) bTokenFound = True # token tags if "aTags" in dToken and "" in dNode: for sTag in dToken["aTags"]: if sTag in dNode[""]: - if bDebug: - echo(" MATCH: /" + sTag) - yield { "iToken1": iToken1, "iNode": dNode[""][sTag] } + yield ("/", sTag, dNode[""][sTag]) bTokenFound = True # meta arc (for token type) if "" in dNode: for sMeta in dNode[""]: # no regex here, we just search if exists within if sMeta == "*" or dToken["sType"] == sMeta: - if bDebug: - echo(" MATCH: *" + sMeta) - yield { "iToken1": iToken1, "iNode": dNode[""][sMeta] } + yield ("*", sMeta, dNode[""][sMeta]) bTokenFound = True elif "¬" in sMeta: if dToken["sType"] not in sMeta: - if bDebug: - echo(" MATCH: *" + sMeta) - yield { "iToken1": iToken1, "iNode": dNode[""][sMeta] } + yield ("*", sMeta, dNode[""][sMeta]) bTokenFound = True - if not bTokenFound and "bKeep" in dPointer: - yield dPointer + if not bTokenFound and bKeep: + yield (None, "", -1) # JUMP - # Warning! Recurssion! + # Warning! Recursion! if "<>" in dNode: - dPointer2 = { "iToken1": iToken1, "iNode": dNode["<>"], "bKeep": True } - yield from self._getNextPointers(dToken, dGraph, dPointer2, bDebug) + yield from self._getMatches(dGraph, dToken, dGraph[dNode["<>"]], bKeep=True) def parseGraph (self, dGraph, sCountry="${country_default}", dOptions=None, bShowRuleId=False, bDebug=False, bContext=False): "parse graph with tokens from the text and execute actions encountered" lPointer = [] bTagAndRewrite = False @@ -580,14 +543,25 @@ if bDebug: echo("TOKEN: " + dToken["sValue"]) # check arcs for each existing pointer lNextPointer = [] for dPointer in lPointer: - lNextPointer.extend(self._getNextPointers(dToken, dGraph, dPointer, bDebug)) + for cActionType, sMatch, iNode in self._getMatches(dGraph, dToken, dGraph[dPointer["iNode"]]): + if cActionType is None: + lNextPointer.append(dPointer) + continue + if bDebug: + echo(" MATCH: " + cActionType + sMatch) + lNextPointer.append({ "iToken1": dPointer["iToken1"], "iNode": iNode }) lPointer = lNextPointer # check arcs of first nodes - lPointer.extend(self._getNextPointers(dToken, dGraph, { "iToken1": iToken, "iNode": 0 }, bDebug)) + for cActionType, sMatch, iNode in self._getMatches(dGraph, dToken, dGraph[0]): + if cActionType is None: + continue + if bDebug: + echo(" MATCH: " + cActionType + sMatch) + lPointer.append({ "iToken1": iToken, "iNode": iNode }) # check if there is rules to check for each pointer for dPointer in lPointer: #if bDebug: # echo("+", dPointer) if "" in dGraph[dPointer["iNode"]]: