Grammalecte  Check-in [391d4d820a]

Overview
Comment:[core][py] gc engine: full analysis, results with sentences, tokens and errors
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | trunk | core
Files: files | file ages | folders
SHA3-256: 391d4d820a095ef313eb7e7d1c594a8a6b3dd991e229bdc6268d76e9a44eac9f
User & Date: olr on 2019-05-25 10:08:51
Other Links: manifest | tags
Context
2019-05-25
10:20
[cli] new command: @@ for full analysis check-in: 4e38c87cf8 user: olr tags: trunk, cli
10:08
[core][py] gc engine: full analysis, results with sentences, tokens and errors check-in: 391d4d820a user: olr tags: trunk, core
09:53
[core][fr][js] update lexicographer check-in: 4aef5c025d user: olr tags: trunk, fr, core
Changes

Modified gc_core/py/lang_core/gc_engine.py from [6b982ab34c] to [4f89d6d445].

190
191
192
193
194
195
196
197
198


199
200

201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218





219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236


237
238
239
240
241
242
243
244
245
246

247
248
249
250
251
252
253
254



255
256
257
258
259

260
261
262
263
264
265
266
267
268

269
270

271
272
273
274
275
276
277
278
279
280

281
282
283
284
285
286
287
288
289
290
291








292
293
294
295
296
297
298
190
191
192
193
194
195
196


197
198
199

200
201
202
203
204
205
206
207
208
209
210
211
212
213
214




215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235


236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260




261









262


263










264











265
266
267
268
269
270
271
272
273
274
275
276
277
278
279







-
-
+
+

-
+














-
-
-
-
+
+
+
+
+
















-
-
+
+










+








+
+
+

-
-
-
-
+
-
-
-
-
-
-
-
-
-
+
-
-
+
-
-
-
-
-
-
-
-
-
-
+
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+







    "set options to default values"
    global _dOptions
    _dOptions = getDefaultOptions()


#### Parsing

def parse (sText, sCountry="${country_default}", bDebug=False, dOptions=None, bContext=False, bFullInfo=False):
    """Entry point to analyse <sText>.

    Builds a TextParser for <sText> and delegates to its parse() method,
    forwarding every argument unchanged.

    Returns an iterable of errors or, with option <bFullInfo>, a list of
    sentences with tokens and errors.
    """
    oText = TextParser(sText)
    # <bFullInfo> must be forwarded, otherwise the full analysis is never requested
    return oText.parse(sCountry, bDebug, dOptions, bContext, bFullInfo)


#### TEXT PARSER

class TextParser:
    "Text parser"

    def __init__ (self, sText):
        self.sText = sText
        self.sText0 = sText
        self.sSentence = ""
        self.sSentence0 = ""
        self.nOffsetWithinParagraph = 0
        self.lToken = []
        self.dTokenPos = {}
        self.dTags = {}
        self.dError = {}
        self.dErrorPriority = {}  # Key = position; value = priority
        self.dTokenPos = {}         # {position: token}
        self.dTags = {}             # {position: tags}
        self.dError = {}            # {position: error}
        self.dSentenceError = {}    # {position: error} (for the current sentence only)
        self.dErrorPriority = {}    # {position: priority of the current error}

    def __str__ (self):
        s = "===== TEXT =====\n"
        s += "sentence: " + self.sSentence0 + "\n"
        s += "now:      " + self.sSentence  + "\n"
        for dToken in self.lToken:
            s += '#{i}\t{nStart}:{nEnd}\t{sValue}\t{sType}'.format(**dToken)
            if "lMorph" in dToken:
                s += "\t" + str(dToken["lMorph"])
            if "aTags" in dToken:
                s += "\t" + str(dToken["aTags"])
            s += "\n"
        #for nPos, dToken in self.dTokenPos.items():
        #    s += "{}\t{}\n".format(nPos, dToken)
        return s

    def parse (self, sCountry="${country_default}", bDebug=False, dOptions=None, bContext=False, bFullInfo=False):
        """Analyse <sText>, first as a whole paragraph, then sentence by sentence.

        Parameters:
            sCountry   -- forwarded unchanged to parseText()
            bDebug     -- forwarded unchanged to parseText()
            dOptions   -- options to use; when falsy, the module-level <_dOptions> is used
            bContext   -- forwarded unchanged to parseText()
            bFullInfo  -- selects the kind of result (see below)

        Returns:
            without <bFullInfo>: the view self.dError.values() (an iterable of errors);
            with <bFullInfo>: a list of dictionaries, one per analysed sentence, with keys
            "nStart", "nEnd", "sSentence", "lToken" and "aGrammarErrors".

        Only sentences whose length is strictly between 4 and 2000 characters are analysed.
        Exceptions raised during the analysis propagate to the caller.
        """
        #sText = unicodedata.normalize("NFC", sText)
        dOpt = dOptions or _dOptions
        bShowRuleId = option('idrule')
        # parse paragraph
        self.parseText(self.sText, self.sText0, True, 0, sCountry, dOpt, bShowRuleId, bDebug, bContext)
        # parse sentences
        sText = self._getCleanText()
        lSentences = []
        for iStart, iEnd in text.getSentenceBoundaries(sText):
            if not (4 < (iEnd - iStart) < 2000):
                continue
            self.sSentence = sText[iStart:iEnd]
            self.sSentence0 = self.sText0[iStart:iEnd]
            self.nOffsetWithinParagraph = iStart
            self.lToken = list(_oTokenizer.genTokens(self.sSentence, True))
            self.dTokenPos = { dToken["nStart"]: dToken  for dToken in self.lToken  if dToken["sType"] != "INFO" }
            if bFullInfo:
                # the list of tokens is duplicated, to keep all tokens from being deleted during the analysis
                dSentence = { "nStart": iStart, "nEnd": iEnd, "sSentence": self.sSentence, "lToken": list(self.lToken) }
            self.parseText(self.sSentence, self.sSentence0, False, iStart, sCountry, dOpt, bShowRuleId, bDebug, bContext)
            if bFullInfo:
                # snapshot the errors recorded in self.dSentenceError, then reset it for the next sentence
                dSentence["aGrammarErrors"] = list(self.dSentenceError.values())
                lSentences.append(dSentence)
                self.dSentenceError.clear()
        if bFullInfo:
            # Grammar checking and sentence analysis
            return lSentences
        # Grammar checking only
        return self.dError.values() # this is a view (iterable)

    def _getCleanText (self):
        sText = self.sText
        if " " in sText:
            sText = sText.replace(" ", ' ') # nbsp
        if " " in sText:
            sText = sText.replace(" ", ' ') # nnbsp
333
334
335
336
337
338
339

340
341
342
343
344
345
346
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328







+







                                            echo("RULE: " + sLineId)
                                        if cActionType == "-":
                                            # grammar error
                                            nErrorStart = nOffset + m.start(eAct[0])
                                            if nErrorStart not in self.dError or nPriority > self.dErrorPriority.get(nErrorStart, -1):
                                                self.dError[nErrorStart] = self._createErrorFromRegex(sText, sText0, sWhat, nOffset, m, eAct[0], sLineId, sRuleId, bUppercase, eAct[1], eAct[2], bShowRuleId, sOption, bContext)
                                                self.dErrorPriority[nErrorStart] = nPriority
                                                self.dSentenceError[nErrorStart] = self.dError[nErrorStart]
                                        elif cActionType == "~":
                                            # text processor
                                            sText = self.rewriteText(sText, sWhat, eAct[0], m, bUppercase)
                                            bChange = True
                                            if bDebug:
                                                echo("~ " + sText + "  -- " + m.group(eAct[0]) + "  # " + sLineId)
                                        elif cActionType == "=":
556
557
558
559
560
561
562

563
564
565
566
567
568
569
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552







+







                                if "bImmune" not in self.lToken[nTokenErrorStart]:
                                    nTokenErrorEnd = nTokenOffset + iTokenEnd  if iTokenEnd > 0  else nLastToken + iTokenEnd
                                    nErrorStart = self.nOffsetWithinParagraph + (self.lToken[nTokenErrorStart]["nStart"] if cStartLimit == "<"  else self.lToken[nTokenErrorStart]["nEnd"])
                                    nErrorEnd = self.nOffsetWithinParagraph + (self.lToken[nTokenErrorEnd]["nEnd"] if cEndLimit == ">"  else self.lToken[nTokenErrorEnd]["nStart"])
                                    if nErrorStart not in self.dError or nPriority > self.dErrorPriority.get(nErrorStart, -1):
                                        self.dError[nErrorStart] = self._createErrorFromTokens(sWhat, nTokenOffset, nLastToken, nTokenErrorStart, nErrorStart, nErrorEnd, sLineId, sRuleId, bCaseSvty, sMessage, sURL, bShowRuleId, sOption, bContext)
                                        self.dErrorPriority[nErrorStart] = nPriority
                                        self.dSentenceError[nErrorStart] = self.dError[nErrorStart]
                                        if bDebug:
                                            echo("    NEW_ERROR: {}".format(self.dError[nErrorStart]))
                            elif cActionType == "~":
                                # text processor
                                nTokenStart = nTokenOffset + eAct[0]  if eAct[0] > 0  else nLastToken + eAct[0]
                                nTokenEnd = nTokenOffset + eAct[1]  if eAct[1] > 0  else nLastToken + eAct[1]
                                self._tagAndPrepareTokenForRewriting(sWhat, nTokenStart, nTokenEnd, nTokenOffset, nLastToken, eAct[2], bDebug)