Index: gc_core/py/lang_core/gc_engine.py ================================================================== --- gc_core/py/lang_core/gc_engine.py +++ gc_core/py/lang_core/gc_engine.py @@ -100,28 +100,11 @@ except: echo("Bad regular expression in # " + str(aRule[2])) aRule[0] = "(?i)" -#### Parsing - -_zEndOfSentence = re.compile(r'([.?!:;…][ .?!… »”")]*|.$)') -_zBeginOfParagraph = re.compile(r"^\W*") -_zEndOfParagraph = re.compile(r"\W*$") - -def _getSentenceBoundaries (sText): - iStart = _zBeginOfParagraph.match(sText).end() - for m in _zEndOfSentence.finditer(sText): - yield (iStart, m.end()) - iStart = m.end() - - -def parse (sText, sCountry="${country_default}", bDebug=False, dOptions=None, bContext=False): - "init point to analyze a text" - oText = TextParser(sText) - return oText.parse(sCountry, bDebug, dOptions, bContext) - +#### Rules and options def ignoreRule (sRuleId): "disable rule " _aIgnoredRules.add(sRuleId) @@ -202,10 +185,28 @@ def getSpellChecker (): "return the spellchecker object" return _oSpellChecker +#### Parsing + +_zEndOfSentence = re.compile(r'([.?!:;…][ .?!… »”")]*|.$)') +_zBeginOfParagraph = re.compile(r"^\W*") +_zEndOfParagraph = re.compile(r"\W*$") + +def _getSentenceBoundaries (sText): + iStart = _zBeginOfParagraph.match(sText).end() + for m in _zEndOfSentence.finditer(sText): + yield (iStart, m.end()) + iStart = m.end() + + +def parse (sText, sCountry="${country_default}", bDebug=False, dOptions=None, bContext=False): + "init point to analyze a text" + oText = TextParser(sText) + return oText.parse(sCountry, bDebug, dOptions, bContext) + #### TEXT PARSER class TextParser: "Text parser" @@ -283,11 +284,11 @@ self.update(sText, bDebug) bChange = False for sGraphName, sLineId in lRuleGroup: if sGraphName not in dOptions or dOptions[sGraphName]: if bDebug: - echo("\n>>>> GRAPH:", sGraphName, sLineId) + echo("\n>>>> GRAPH: " + sGraphName + " " + sLineId) sText = self.parseGraph(_rules_graph.dAllGraph[sGraphName], sCountry, dOptions, 
bShowRuleId, bDebug, bContext) elif not sOption or dOptions.get(sOption, False): # regex rules for zRegex, bUppercase, sLineId, sRuleId, nPriority, lActions in lRuleGroup: if sRuleId not in _aIgnoredRules: @@ -297,11 +298,11 @@ # action in lActions: [ condition, action type, replacement/suggestion/action[, iGroup[, message, URL]] ] try: bCondMemo = not sFuncCond or globals()[sFuncCond](sText, sText0, m, self.dTokenPos, sCountry, bCondMemo) if bCondMemo: if bDebug: - echo("RULE:", sLineId) + echo("RULE: " + sLineId) if cActionType == "-": # grammar error nErrorStart = nOffset + m.start(eAct[0]) if nErrorStart not in self.dError or nPriority > self.dErrorPriority.get(nErrorStart, -1): self.dError[nErrorStart] = self._createErrorFromRegex(sText, sText0, sWhat, nOffset, m, eAct[0], sLineId, sRuleId, bUppercase, eAct[1], eAct[2], bShowRuleId, sOption, bContext) @@ -352,31 +353,31 @@ iNode1 = dPointer["iNode1"] bTokenFound = False # token value if dToken["sValue"] in dNode: if bDebug: - echo(" MATCH:", dToken["sValue"]) + echo(" MATCH: " + dToken["sValue"]) yield { "iNode1": iNode1, "dNode": dGraph[dNode[dToken["sValue"]]] } bTokenFound = True if dToken["sValue"][0:2].istitle(): # we test only 2 first chars, to make valid words such as "Laissez-les", "Passe-partout". 
sValue = dToken["sValue"].lower() if sValue in dNode: if bDebug: - echo(" MATCH:", sValue) + echo(" MATCH: " + sValue) yield { "iNode1": iNode1, "dNode": dGraph[dNode[sValue]] } bTokenFound = True elif dToken["sValue"].isupper(): sValue = dToken["sValue"].lower() if sValue in dNode: if bDebug: - echo(" MATCH:", sValue) + echo(" MATCH: " + sValue) yield { "iNode1": iNode1, "dNode": dGraph[dNode[sValue]] } bTokenFound = True sValue = dToken["sValue"].capitalize() if sValue in dNode: if bDebug: - echo(" MATCH:", sValue) + echo(" MATCH: " + sValue) yield { "iNode1": iNode1, "dNode": dGraph[dNode[sValue]] } bTokenFound = True # regex value arcs if dToken["sType"] not in frozenset(["INFO", "PUNC", "SIGN"]): if "" in dNode: @@ -476,11 +477,11 @@ dOpt = _dOptions if not dOptions else dOptions lPointer = [] bTagAndRewrite = False for iToken, dToken in enumerate(self.lToken): if bDebug: - echo("TOKEN:", dToken["sValue"]) + echo("TOKEN: " + dToken["sValue"]) # check arcs for each existing pointer lNextPointer = [] for dPointer in lPointer: lNextPointer.extend(self._getNextPointers(dToken, dGraph, dPointer, bDebug)) lPointer = lNextPointer @@ -506,11 +507,11 @@ for sLineId, nextNodeKey in dNode.items(): bCondMemo = None for sRuleId in dGraph[nextNodeKey]: try: if bDebug: - echo(" >TRY:", sRuleId) + echo(" >TRY: " + sRuleId) sOption, sFuncCond, cActionType, sWhat, *eAct = _rules_graph.dRule[sRuleId] # Suggestion [ option, condition, "-", replacement/suggestion/action, iTokenStart, iTokenEnd, cStartLimit, cEndLimit, bCaseSvty, nPriority, sMessage, sURL ] # TextProcessor [ option, condition, "~", replacement/suggestion/action, iTokenStart, iTokenEnd, bCaseSvty ] # Disambiguator [ option, condition, "=", replacement/suggestion/action ] # Tag [ option, condition, "/", replacement/suggestion/action, iTokenStart, iTokenEnd ] @@ -529,29 +530,29 @@ nErrorEnd = self.nOffsetWithinParagraph + (self.lToken[nTokenErrorEnd]["nEnd"] if cEndLimit == ">" else 
self.lToken[nTokenErrorEnd]["nStart"]) if nErrorStart not in self.dError or nPriority > self.dErrorPriority.get(nErrorStart, -1): self.dError[nErrorStart] = self._createErrorFromTokens(sWhat, nTokenOffset, nLastToken, nTokenErrorStart, nErrorStart, nErrorEnd, sLineId, sRuleId, bCaseSvty, sMessage, sURL, bShowRuleId, sOption, bContext) self.dErrorPriority[nErrorStart] = nPriority if bDebug: - echo(" NEW_ERROR: ", sRuleId, sLineId, ": ", self.dError[nErrorStart]) + echo(" NEW_ERROR: {} {}: {}".format(sRuleId, sLineId, self.dError[nErrorStart])) elif cActionType == "~": # text processor nTokenStart = nTokenOffset + eAct[0] if eAct[0] > 0 else nLastToken + eAct[0] nTokenEnd = nTokenOffset + eAct[1] if eAct[1] > 0 else nLastToken + eAct[1] self._tagAndPrepareTokenForRewriting(sWhat, nTokenStart, nTokenEnd, nTokenOffset, nLastToken, eAct[2], bDebug) bChange = True if bDebug: - echo(" TEXT_PROCESSOR: ", sRuleId, sLineId) - echo(" ", self.lToken[nTokenStart]["sValue"], ":", self.lToken[nTokenEnd]["sValue"], " >", sWhat) + echo(" TEXT_PROCESSOR: " + sRuleId + " " + sLineId) + echo(" " + self.lToken[nTokenStart]["sValue"] + " : " + self.lToken[nTokenEnd]["sValue"] + " > " + sWhat) elif cActionType == "=": # disambiguation globals()[sWhat](self.lToken, nTokenOffset, nLastToken) if bDebug: - echo(" DISAMBIGUATOR: ", sRuleId, sLineId, "("+sWhat+")", self.lToken[nTokenOffset+1]["sValue"], ":", self.lToken[nLastToken]["sValue"]) + echo(" DISAMBIGUATOR: {} {} ({}) {}:{}".format(sRuleId, sLineId, sWhat, self.lToken[nTokenOffset+1]["sValue"], self.lToken[nLastToken]["sValue"])) elif cActionType == ">": # we do nothing, this test is just a condition to apply all following actions if bDebug: - echo(" COND_OK: ", sRuleId, sLineId) + echo(" COND_OK: " + sRuleId + " " + sLineId) pass elif cActionType == "/": # Tag nTokenStart = nTokenOffset + eAct[0] if eAct[0] > 0 else nLastToken + eAct[0] nTokenEnd = nTokenOffset + eAct[1] if eAct[1] > 0 else nLastToken + eAct[1] @@ -559,21 +560,21 @@ 
if "tags" in self.lToken[i]: self.lToken[i]["tags"].update(sWhat.split("|")) else: self.lToken[i]["tags"] = set(sWhat.split("|")) if bDebug: - echo(" TAG: ", sRuleId, sLineId) - echo(" ", sWhat, " >", self.lToken[nTokenStart]["sValue"], ":", self.lToken[nTokenEnd]["sValue"]) + echo(" TAG: " + sRuleId + " " + sLineId) + echo(" " + sWhat + " > " + self.lToken[nTokenStart]["sValue"] + " : " + self.lToken[nTokenEnd]["sValue"]) if sWhat not in self.dTags: self.dTags[sWhat] = [nTokenStart, nTokenStart] else: self.dTags[sWhat][0] = min(nTokenStart, self.dTags[sWhat][0]) self.dTags[sWhat][1] = max(nTokenEnd, self.dTags[sWhat][1]) elif cActionType == "%": # immunity if bDebug: - echo(" IMMUNITY:\n ", _rules_graph.dRule[sRuleId]) + echo(" IMMUNITY:\n " + _rules_graph.dRule[sRuleId]) nTokenStart = nTokenOffset + eAct[0] if eAct[0] > 0 else nLastToken + eAct[0] nTokenEnd = nTokenOffset + eAct[1] if eAct[1] > 0 else nLastToken + eAct[1] if nTokenEnd - nTokenStart == 0: self.lToken[nTokenStart]["bImmune"] = True nErrorStart = self.nOffsetWithinParagraph + self.lToken[nTokenStart]["nStart"] @@ -587,11 +588,11 @@ del self.dError[nErrorStart] else: echo("# error: unknown action at " + sLineId) elif cActionType == ">": if bDebug: - echo(" COND_BREAK: ", sRuleId, sLineId) + echo(" COND_BREAK: " + sRuleId + " " + sLineId) break except Exception as e: raise Exception(str(e), sLineId, sRuleId, self.sSentence) return bChange @@ -673,17 +674,15 @@ dErr['sBefore'] = self.sSentence0[max(0,nStart-80):nStart] dErr['sAfter'] = self.sSentence0[nEnd:nEnd+80] return dErr def _expand (self, sText, nTokenOffset, nLastToken): - #echo("*", sText) for m in re.finditer(r"\\(-?[0-9]+)", sText): if m.group(1)[0:1] == "-": sText = sText.replace(m.group(0), self.lToken[nLastToken+int(m.group(1))+1]["sValue"]) else: sText = sText.replace(m.group(0), self.lToken[nTokenOffset+int(m.group(1))]["sValue"]) - #echo(">", sText) return sText def rewriteText (self, sText, sRepl, iGroup, m, bUppercase): "text 
processor: write in at position" nLen = m.end(iGroup) - m.start(iGroup) @@ -702,11 +701,11 @@ return sText[0:m.start(iGroup)] + sNew + sText[m.end(iGroup):] def _tagAndPrepareTokenForRewriting (self, sWhat, nTokenRewriteStart, nTokenRewriteEnd, nTokenOffset, nLastToken, bCaseSvty, bDebug): "text processor: rewrite tokens between and position" if bDebug: - echo(" START:", nTokenRewriteStart, "END:", nTokenRewriteEnd) + echo(" START: {} - END: {} ".format(nTokenRewriteStart, nTokenRewriteEnd)) if sWhat == "*": # purge text if nTokenRewriteEnd - nTokenRewriteStart == 0: self.lToken[nTokenRewriteStart]["bToRemove"] = True else: @@ -759,30 +758,30 @@ if dToken["sType"] != "INFO": if nMergeUntil and iToken <= nMergeUntil: dTokenMerger["sValue"] += " " * (dToken["nStart"] - dTokenMerger["nEnd"]) + dToken["sValue"] dTokenMerger["nEnd"] = dToken["nEnd"] if bDebug: - echo(" MERGED TOKEN:", dTokenMerger["sValue"]) + echo(" MERGED TOKEN: " + dTokenMerger["sValue"]) bKeepToken = False if "nMergeUntil" in dToken: if iToken > nMergeUntil: # this token is not already merged with a previous token dTokenMerger = dToken if dToken["nMergeUntil"] > nMergeUntil: nMergeUntil = dToken["nMergeUntil"] del dToken["nMergeUntil"] elif "bToRemove" in dToken: if bDebug: - echo(" REMOVED:", dToken["sValue"]) + echo(" REMOVED: " + dToken["sValue"]) self.sSentence = self.sSentence[:dToken["nStart"]] + " " * (dToken["nEnd"] - dToken["nStart"]) + self.sSentence[dToken["nEnd"]:] bKeepToken = False # if bKeepToken: lNewToken.append(dToken) if "sNewValue" in dToken: # rewrite token and sentence if bDebug: - echo(dToken["sValue"], "->", dToken["sNewValue"]) + echo(dToken["sValue"] + " -> " + dToken["sNewValue"]) dToken["sRealValue"] = dToken["sValue"] dToken["sValue"] = dToken["sNewValue"] nDiffLen = len(dToken["sRealValue"]) - len(dToken["sNewValue"]) sNewRepl = (dToken["sNewValue"] + " " * nDiffLen) if nDiffLen >= 0 else dToken["sNewValue"][:len(dToken["sRealValue"])] self.sSentence = 
self.sSentence[:dToken["nStart"]] + sNewRepl + self.sSentence[dToken["nEnd"]:] @@ -793,11 +792,11 @@ except: echo(self) echo(dToken) exit() if bDebug: - echo(" TEXT REWRITED:", self.sSentence) + echo(" TEXT REWRITED: " + self.sSentence) self.lToken.clear() self.lToken = lNewToken #### common functions