Grammalecte: Check-in [cca3887aad]

Overview
Comment: [core] text processor: communication between regex rules and graph rules + [graphspell][bug] tokenizer: set i variable to 0, if sentence is empty
SHA3-256: cca3887aad8ca5967ef8b660679838f16c6997f3f388b034c01dfd9f9295216c
User & Date: olr on 2018-06-12 11:24:50
Original Comment: [graphspell][bug] tokenizer: set i variable to 0, if sentence is empty
Context
2018-06-12
12:10  [fr] graph tests (draft)  check-in: 2f0f044a66  user: olr  tags: fr, rg
11:24  [core] text processor: communication between regex rules and graph rules + [graphspell][bug] tokenizer: set i variable to 0, if sentence is empty  check-in: cca3887aad  user: olr  tags: core, graphspell, rg
10:46  [misc] SublimeText syntax rules update  check-in: eb14e38462  user: olr  tags: misc, rg
Changes

Modified gc_core/py/lang_core/gc_engine.py from [f8ff5d45bc] to [112316530e].

@@ -153,25 +153,29 @@
     for m in _zEndOfSentence.finditer(sText):
         yield (iStart, m.end())
         iStart = m.end()
 
 
 def _proofread (oSentence, s, sx, nOffset, bParagraph, dDA, dPriority, sCountry, dOptions, bShowRuleId, bDebug, bContext):
     dErrs = {}
-    bChange = False
+    bParagraphChange = False
+    bSentenceChange = False
     for sOption, lRuleGroup in _getRules(bParagraph):
         if sOption == "@@@@":
             # graph rules
+            if not bParagraph and bSentenceChange:
+                oSentence.update(s)
+                bSentenceChange = False
             for sGraphName, sLineId in lRuleGroup:
                 if bDebug:
                     print(sGraphName, sLineId)
-                bChange, errs = oSentence.parse(dAllGraph[sGraphName], dPriority, sCountry, dOptions, bShowRuleId, bDebug, bContext)
+                bParagraphChange, errs = oSentence.parse(dAllGraph[sGraphName], dPriority, sCountry, dOptions, bShowRuleId, bDebug, bContext)
                 dErrs.update(errs)
-                if bChange:
-                    oSentence.rewrite()
+                if bParagraphChange:
+                    s = oSentence.rewrite()
                     if bDebug:
                         print("~", oSentence.sSentence)
         elif not sOption or dOptions.get(sOption, False):
             # regex rules
             for zRegex, bUppercase, sLineId, sRuleId, nPriority, lActions in lRuleGroup:
                 if sRuleId not in _aIgnoredRules:
                     for m in zRegex.finditer(s):
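
The split of the old bChange flag into bParagraphChange and bSentenceChange is the "communication between regex rules and graph rules" part of this check-in: regex text-processor ("~") actions rewrite the working string s, while the graph rules parse a tokenised view of the sentence that would otherwise go stale. bSentenceChange records a pending regex rewrite so that oSentence.update(s) can re-tokenise just before the graph rules run. A minimal sketch of that handshake, with a hypothetical SentenceView class standing in for Grammalecte's TokenSentence (whose real tokeniser is _oTokenizer.genTokens, not str.split):

# Hypothetical reduced model of the regex-rules / graph-rules handshake.
class SentenceView:
    def __init__ (self, sSentence):
        self.sSentence = sSentence
        self.lToken = sSentence.split()      # stand-in tokeniser

    def update (self, sSentence):
        "resync the token view after the text processor rewrote the string"
        self.sSentence = sSentence
        self.lToken = sSentence.split()

s = "This  is  a  test"
oView = SentenceView(s)
bSentenceChange = False

s = " ".join(s.split())                      # a regex "~" rewrite of <s>
bSentenceChange = True

if bSentenceChange:                          # just before graph rules run
    oView.update(s)
    bSentenceChange = False
assert oView.sSentence == "This is a test"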
@@ -186,32 +190,33 @@
                                         nErrorStart = nOffset + m.start(eAct[0])
                                         if nErrorStart not in dErrs or nPriority > dPriority[nErrorStart]:
                                             dErrs[nErrorStart] = _createRegexError(s, sx, sWhat, nOffset, m, eAct[0], sLineId, sRuleId, bUppercase, eAct[1], eAct[2], bShowRuleId, sOption, bContext)
                                             dPriority[nErrorStart] = nPriority
                                     elif cActionType == "~":
                                         # text processor
                                         s = _rewrite(s, sWhat, eAct[0], m, bUppercase)
-                                        bChange = True
+                                        bParagraphChange = True
+                                        bSentenceChange = True
                                         if bDebug:
                                             echo("~ " + s + "  -- " + m.group(eAct[0]) + "  # " + sLineId)
                                     elif cActionType == "=":
                                         # disambiguation
                                         globals()[sWhat](s, m, dDA)
                                         if bDebug:
                                             echo("= " + m.group(0) + "  # " + sLineId + "\nDA: " + str(dDA))
                                     elif cActionType == ">":
                                         # we do nothing, this test is just a condition to apply all following actions
                                         pass
                                     else:
                                         echo("# error: unknown action at " + sLineId)
                                 elif cActionType == ">":
                                     break
                             except Exception as e:
                                 raise Exception(str(e), "# " + sLineId + " # " + sRuleId)
-    if bChange:
+    if bParagraphChange:
         return (s, dErrs)
     return (False, dErrs)
 
 
 def _createRegexWriterError (s, sx, sRepl, nOffset, m, iGroup, sLineId, sRuleId, bUppercase, sMsg, sURL, bShowRuleId, sOption, bContext):
     "error for Writer (LO/OO)"
     xErr = SingleProofreadingError()
@@ -581,14 +586,18 @@
     def __init__ (self, sSentence, sSentence0, nOffset):
         self.sSentence = sSentence
         self.sSentence0 = sSentence0
         self.nOffset = nOffset
         self.lToken = list(_oTokenizer.genTokens(sSentence, True))
         self.createError = self._createWriterError  if _bWriterError  else self._createDictError
 
+    def update (self, sSentence):
+        self.sSentence = sSentence
+        self.lToken = list(_oTokenizer.genTokens(sSentence, True))
+
     def _getNextMatchingNodes (self, dToken, dGraph, dNode):
         "generator: return nodes where <dToken> “values” match <dNode> arcs"
         # token value
         if dToken["sValue"] in dNode:
             #print("value found: ", dToken["sValue"])
             yield dGraph[dNode[dToken["sValue"]]]
         # token lemmas
@@ -833,14 +842,15 @@
                     dToken["sValue"] = dToken["sNewValue"]
                     nDiffLen = len(dToken["sRealValue"]) - len(dToken["sNewValue"])
                     sNewRepl = (dToken["sNewValue"] + " " * nDiffLen)  if nDiffLen >= 0  else dToken["sNewValue"][:len(dToken["sRealValue"])]
                     self.sSentence = self.sSentence[:self.nOffset+dToken["nStart"]] + sNewRepl + self.sSentence[self.nOffset+dToken["nEnd"]:]
                     del dToken["sNewValue"]
         self.lToken.clear()
         self.lToken = lNewToken
+        return self.sSentence
 
 
 
 #### Analyse tokens
 
 def g_morph (dToken, sPattern, sNegPattern=""):
     "analyse a token, return True if <sNegPattern> not in morphologies and <sPattern> in morphologies"

Modified graphspell/tokenizer.py from [b723a02695] to [30951f1c9c].

@@ -41,14 +41,15 @@
     def __init__ (self, sLang):
         self.sLang = sLang
         if sLang not in _PATTERNS:
             self.sLang = "default"
         self.zToken = re.compile( "(?i)" + '|'.join(sRegex for sRegex in _PATTERNS[sLang]) )
 
     def genTokens (self, sText, bStartEndToken=False):
+        i = 0
         if bStartEndToken:
             yield { "i": 0, "sType": "INFO", "sValue": "<start>", "nStart": 0, "nEnd": 0 }
         for i, m in enumerate(self.zToken.finditer(sText), 1):
             yield { "i": i, "sType": m.lastgroup, "sValue": m.group(), "nStart": m.start(), "nEnd": m.end() }
         if bStartEndToken:
             iEnd = len(sText)
             yield { "i": i+1, "sType": "INFO", "sValue": "<end>", "nStart": iEnd, "nEnd": iEnd }