Overview
Comment: [core] text processor: communication between regex rules and graph rules + [graphspell][bug] tokenizer: set i variable to 0, if sentence is empty
SHA3-256: cca3887aad8ca5967ef8b660679838f1
User & Date: olr on 2018-06-12 11:24:50
Original Comment: [graphspell][bug] tokenizer: set i variable to 0, if sentence is empty
Context
2018-06-12
12:10 | [fr] graph tests (draft) | check-in: 2f0f044a66 | user: olr | tags: fr, rg
11:24 | [core] text processor: communication between regex rules and graph rules + [graphspell][bug] tokenizer: set i variable to 0, if sentence is empty | check-in: cca3887aad | user: olr | tags: core, graphspell, rg
10:46 | [misc] SublimeText syntax rules update | check-in: eb14e38462 | user: olr | tags: misc, rg
Changes
Modified gc_core/py/lang_core/gc_engine.py from [f8ff5d45bc] to [112316530e].
︙

     for m in _zEndOfSentence.finditer(sText):
         yield (iStart, m.end())
         iStart = m.end()


 def _proofread (oSentence, s, sx, nOffset, bParagraph, dDA, dPriority, sCountry, dOptions, bShowRuleId, bDebug, bContext):
     dErrs = {}
+    bParagraphChange = False
+    bSentenceChange = False
     for sOption, lRuleGroup in _getRules(bParagraph):
         if sOption == "@@@@":
             # graph rules
+            if not bParagraph and bSentenceChange:
+                oSentence.update(s)
+                bSentenceChange = False
             for sGraphName, sLineId in lRuleGroup:
                 if bDebug:
                     print(sGraphName, sLineId)
+                bParagraphChange, errs = oSentence.parse(dAllGraph[sGraphName], dPriority, sCountry, dOptions, bShowRuleId, bDebug, bContext)
+                dErrs.update(errs)
+                if bParagraphChange:
+                    s = oSentence.rewrite()
+                    if bDebug:
+                        print("~", oSentence.sSentence)
         elif not sOption or dOptions.get(sOption, False):
             # regex rules
             for zRegex, bUppercase, sLineId, sRuleId, nPriority, lActions in lRuleGroup:
                 if sRuleId not in _aIgnoredRules:
                     for m in zRegex.finditer(s):
︙

                                         nErrorStart = nOffset + m.start(eAct[0])
                                         if nErrorStart not in dErrs or nPriority > dPriority[nErrorStart]:
                                             dErrs[nErrorStart] = _createRegexError(s, sx, sWhat, nOffset, m, eAct[0], sLineId, sRuleId, bUppercase, eAct[1], eAct[2], bShowRuleId, sOption, bContext)
                                             dPriority[nErrorStart] = nPriority
                                     elif cActionType == "~":
                                         # text processor
                                         s = _rewrite(s, sWhat, eAct[0], m, bUppercase)
+                                        bParagraphChange = True
+                                        bSentenceChange = True
                                         if bDebug:
                                             echo("~ " + s + " -- " + m.group(eAct[0]) + " # " + sLineId)
                                     elif cActionType == "=":
                                         # disambiguation
                                         globals()[sWhat](s, m, dDA)
                                         if bDebug:
                                             echo("= " + m.group(0) + " # " + sLineId + "\nDA: " + str(dDA))
                                     elif cActionType == ">":
                                         # we do nothing, this test is just a condition to apply all following actions
                                         pass
                                     else:
                                         echo("# error: unknown action at " + sLineId)
                                 elif cActionType == ">":
                                     break
                             except Exception as e:
                                 raise Exception(str(e), "# " + sLineId + " # " + sRuleId)
+    if bParagraphChange:
+        return (s, dErrs)
+    return (False, dErrs)


 def _createRegexWriterError (s, sx, sRepl, nOffset, m, iGroup, sLineId, sRuleId, bUppercase, sMsg, sURL, bShowRuleId, sOption, bContext):
     "error for Writer (LO/OO)"
     xErr = SingleProofreadingError()
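Taken together, the two flags implement a simple hand-off between the two rule families: when a regex text-processor action rewrites the working string s, bSentenceChange marks the token list held by oSentence as stale, and the next graph-rule group re-tokenizes through oSentence.update(s) before matching. A minimal sketch of the pattern, using hypothetical names (MiniSentence, proofread) rather than the real Grammalecte API:

import re

class MiniSentence:
    "a sentence plus its token list, re-synchronized on demand"
    def __init__ (self, sSentence):
        self.update(sSentence)

    def update (self, sSentence):
        "re-tokenize after the text processor rewrote the sentence"
        self.sSentence = sSentence
        self.lToken = sSentence.split()

def proofread (sText):
    oSentence = MiniSentence(sText)
    s = sText
    bSentenceChange = False
    # regex pass: a text-processor rule normalizes the string
    sNew = re.sub(r"\bteh\b", "the", s)
    if sNew != s:
        s = sNew
        bSentenceChange = True              # token list is now stale
    # graph pass: re-tokenize only if a regex rule touched the string
    if bSentenceChange:
        oSentence.update(s)
    return oSentence.lToken

print(proofread("teh cat sleeps"))          # ['the', 'cat', 'sleeps']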
︙

     def __init__ (self, sSentence, sSentence0, nOffset):
         self.sSentence = sSentence
         self.sSentence0 = sSentence0
         self.nOffset = nOffset
         self.lToken = list(_oTokenizer.genTokens(sSentence, True))
         self.createError = self._createWriterError if _bWriterError else self._createDictError

+    def update (self, sSentence):
+        self.sSentence = sSentence
+        self.lToken = list(_oTokenizer.genTokens(sSentence, True))
+
     def _getNextMatchingNodes (self, dToken, dGraph, dNode):
         "generator: return nodes where <dToken> “values” match <dNode> arcs"
         # token value
         if dToken["sValue"] in dNode:
             #print("value found: ", dToken["sValue"])
             yield dGraph[dNode[dToken["sValue"]]]
         # token lemmas
︙

                 dToken["sValue"] = dToken["sNewValue"]
                 nDiffLen = len(dToken["sRealValue"]) - len(dToken["sNewValue"])
                 sNewRepl = (dToken["sNewValue"] + " " * nDiffLen) if nDiffLen >= 0 else dToken["sNewValue"][:len(dToken["sRealValue"])]
                 self.sSentence = self.sSentence[:self.nOffset+dToken["nStart"]] + sNewRepl + self.sSentence[self.nOffset+dToken["nEnd"]:]
                 del dToken["sNewValue"]
         self.lToken.clear()
         self.lToken = lNewToken
+        return self.sSentence


 #### Analyse tokens

 def g_morph (dToken, sPattern, sNegPattern=""):
     "analyse a token, return True if <sNegPattern> not in morphologies and <sPattern> in morphologies"
︙
Modified graphspell/tokenizer.py from [b723a02695] to [30951f1c9c].
︙

     def __init__ (self, sLang):
         self.sLang = sLang
         if sLang not in _PATTERNS:
             self.sLang = "default"
         self.zToken = re.compile( "(?i)" + '|'.join(sRegex for sRegex in _PATTERNS[sLang]) )

     def genTokens (self, sText, bStartEndToken=False):
+        i = 0
         if bStartEndToken:
             yield { "i": 0, "sType": "INFO", "sValue": "<start>", "nStart": 0, "nEnd": 0 }
         for i, m in enumerate(self.zToken.finditer(sText), 1):
             yield { "i": i, "sType": m.lastgroup, "sValue": m.group(), "nStart": m.start(), "nEnd": m.end() }
         if bStartEndToken:
             iEnd = len(sText)
             yield { "i": i+1, "sType": "INFO", "sValue": "<end>", "nStart": iEnd, "nEnd": iEnd }
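The added i = 0 is the bug fix named in the check-in comment: with bStartEndToken=True and a sentence in which finditer() matches nothing (an empty string, for instance), the for loop never binds i, so building the <end> token raised UnboundLocalError at i+1. A minimal reproduction outside the Tokenizer class:

import re

def genTokensBuggy (sText):
    yield { "i": 0, "sValue": "<start>" }
    for i, m in enumerate(re.finditer(r"\w+", sText), 1):
        yield { "i": i, "sValue": m.group() }
    yield { "i": i+1, "sValue": "<end>" }   # i unbound when sText has no token

def genTokensFixed (sText):
    i = 0                                   # the one-line fix
    yield { "i": 0, "sValue": "<start>" }
    for i, m in enumerate(re.finditer(r"\w+", sText), 1):
        yield { "i": i, "sValue": m.group() }
    yield { "i": i+1, "sValue": "<end>" }

try:
    list(genTokensBuggy(""))
except UnboundLocalError as e:
    print("before the fix:", e)
print("after the fix:", list(genTokensFixed("")))
# after the fix: [{'i': 0, 'sValue': '<start>'}, {'i': 1, 'sValue': '<end>'}]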