Grammalecte  Check-in [cca3887aad]

Overview
Comment:[core] text processor: communication between regex rules and graph rules + [graphspell][bug] tokenizer: set i variable to 0, if sentence is empty
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | core | graphspell | rg
Files: files | file ages | folders
SHA3-256: cca3887aad8ca5967ef8b660679838f16c6997f3f388b034c01dfd9f9295216c
User & Date: olr on 2018-06-12 11:24:50
Original Comment: [graphspell][bug] tokenizer: set i variable to 0, if sentence is empty
Other Links: branch diff | manifest | tags
Context
2018-06-12
12:10
[fr] graph tests (draft) check-in: 2f0f044a66 user: olr tags: fr, rg
11:24
[core] text processor: communication between regex rules and graph rules + [graphspell][bug] tokenizer: set i variable to 0, if sentence is empty check-in: cca3887aad user: olr tags: core, graphspell, rg
10:46
[misc] SublimeText syntax rules update check-in: eb14e38462 user: olr tags: misc, rg
Changes

Modified gc_core/py/lang_core/gc_engine.py from [f8ff5d45bc] to [112316530e].

153
154
155
156
157
158
159
160


161
162
163



164
165
166
167

168
169
170


171
172
173
174
175
176
177
153
154
155
156
157
158
159

160
161
162
163
164
165
166
167
168
169
170

171
172


173
174
175
176
177
178
179
180
181







-
+
+



+
+
+



-
+

-
-
+
+







    for m in _zEndOfSentence.finditer(sText):
        yield (iStart, m.end())
        iStart = m.end()


def _proofread (oSentence, s, sx, nOffset, bParagraph, dDA, dPriority, sCountry, dOptions, bShowRuleId, bDebug, bContext):
    dErrs = {}
    bChange = False
    bParagraphChange = False
    bSentenceChange = False
    for sOption, lRuleGroup in _getRules(bParagraph):
        if sOption == "@@@@":
            # graph rules
            if not bParagraph and bSentenceChange:
                oSentence.update(s)
                bSentenceChange = False
            for sGraphName, sLineId in lRuleGroup:
                if bDebug:
                    print(sGraphName, sLineId)
                bChange, errs = oSentence.parse(dAllGraph[sGraphName], dPriority, sCountry, dOptions, bShowRuleId, bDebug, bContext)
                bParagraphChange, errs = oSentence.parse(dAllGraph[sGraphName], dPriority, sCountry, dOptions, bShowRuleId, bDebug, bContext)
                dErrs.update(errs)
                if bChange:
                    oSentence.rewrite()
                if bParagraphChange:
                    s = oSentence.rewrite()
                    if bDebug:
                        print("~", oSentence.sSentence)
        elif not sOption or dOptions.get(sOption, False):
            # regex rules
            for zRegex, bUppercase, sLineId, sRuleId, nPriority, lActions in lRuleGroup:
                if sRuleId not in _aIgnoredRules:
                    for m in zRegex.finditer(s):
186
187
188
189
190
191
192
193


194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210

211
212
213
214
215
216
217
190
191
192
193
194
195
196

197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214

215
216
217
218
219
220
221
222







-
+
+
















-
+







                                        nErrorStart = nOffset + m.start(eAct[0])
                                        if nErrorStart not in dErrs or nPriority > dPriority[nErrorStart]:
                                            dErrs[nErrorStart] = _createRegexError(s, sx, sWhat, nOffset, m, eAct[0], sLineId, sRuleId, bUppercase, eAct[1], eAct[2], bShowRuleId, sOption, bContext)
                                            dPriority[nErrorStart] = nPriority
                                    elif cActionType == "~":
                                        # text processor
                                        s = _rewrite(s, sWhat, eAct[0], m, bUppercase)
                                        bChange = True
                                        bParagraphChange = True
                                        bSentenceChange = True
                                        if bDebug:
                                            echo("~ " + s + "  -- " + m.group(eAct[0]) + "  # " + sLineId)
                                    elif cActionType == "=":
                                        # disambiguation
                                        globals()[sWhat](s, m, dDA)
                                        if bDebug:
                                            echo("= " + m.group(0) + "  # " + sLineId + "\nDA: " + str(dDA))
                                    elif cActionType == ">":
                                        # we do nothing, this test is just a condition to apply all following actions
                                        pass
                                    else:
                                        echo("# error: unknown action at " + sLineId)
                                elif cActionType == ">":
                                    break
                            except Exception as e:
                                raise Exception(str(e), "# " + sLineId + " # " + sRuleId)
    if bChange:
    if bParagraphChange:
        return (s, dErrs)
    return (False, dErrs)


def _createRegexWriterError (s, sx, sRepl, nOffset, m, iGroup, sLineId, sRuleId, bUppercase, sMsg, sURL, bShowRuleId, sOption, bContext):
    "error for Writer (LO/OO)"
    xErr = SingleProofreadingError()
581
582
583
584
585
586
587




588
589
590
591
592
593
594
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603







+
+
+
+







    def __init__ (self, sSentence, sSentence0, nOffset):
        self.sSentence = sSentence
        self.sSentence0 = sSentence0
        self.nOffset = nOffset
        self.lToken = list(_oTokenizer.genTokens(sSentence, True))
        self.createError = self._createWriterError  if _bWriterError  else self._createDictError

    def update (self, sSentence):
        self.sSentence = sSentence
        self.lToken = list(_oTokenizer.genTokens(sSentence, True))

    def _getNextMatchingNodes (self, dToken, dGraph, dNode):
        "generator: return nodes where <dToken> “values” match <dNode> arcs"
        # token value
        if dToken["sValue"] in dNode:
            #print("value found: ", dToken["sValue"])
            yield dGraph[dNode[dToken["sValue"]]]
        # token lemmas
833
834
835
836
837
838
839

840
841
842
843
844
845
846
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856







+







                    dToken["sValue"] = dToken["sNewValue"]
                    nDiffLen = len(dToken["sRealValue"]) - len(dToken["sNewValue"])
                    sNewRepl = (dToken["sNewValue"] + " " * nDiffLen)  if nDiffLen >= 0  else dToken["sNewValue"][:len(dToken["sRealValue"])]
                    self.sSentence = self.sSentence[:self.nOffset+dToken["nStart"]] + sNewRepl + self.sSentence[self.nOffset+dToken["nEnd"]:]
                    del dToken["sNewValue"]
        self.lToken.clear()
        self.lToken = lNewToken
        return self.sSentence



#### Analyse tokens

def g_morph (dToken, sPattern, sNegPattern=""):
    "analyse a token, return True if <sNegPattern> not in morphologies and <sPattern> in morphologies"

Modified graphspell/tokenizer.py from [b723a02695] to [30951f1c9c].

41
42
43
44
45
46
47

48
49
50
51
52
53
54
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55







+







    def __init__ (self, sLang):
        self.sLang = sLang
        if sLang not in _PATTERNS:
            self.sLang = "default"
        self.zToken = re.compile( "(?i)" + '|'.join(sRegex for sRegex in _PATTERNS[sLang]) )

    def genTokens (self, sText, bStartEndToken=False):
        i = 0
        if bStartEndToken:
            yield { "i": 0, "sType": "INFO", "sValue": "<start>", "nStart": 0, "nEnd": 0 }
        for i, m in enumerate(self.zToken.finditer(sText), 1):
            yield { "i": i, "sType": m.lastgroup, "sValue": m.group(), "nStart": m.start(), "nEnd": m.end() }
        if bStartEndToken:
            iEnd = len(sText)
            yield { "i": i+1, "sType": "INFO", "sValue": "<end>", "nStart": iEnd, "nEnd": iEnd }