Grammalecte  Check-in [ed53270796]

Overview
Comment:[core][cli][fx] grammar analysis update (keeping tokens from being deleted)
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | cli | core | fx | salxg
Files: files | file ages | folders
SHA3-256: ed532707969ae823b44e3e90389efe77de32479c411fb32538b53be4da872f1c
User & Date: olr on 2020-08-26 12:21:03
Other Links: branch diff | manifest | tags
Context
2020-08-28
11:00
[graphspell] lexicographer: update tags check-in: df1367e6e2 user: olr tags: graphspell, salxg
2020-08-26
12:21
[core][cli][fx] grammar analysis update (keeping tokens from being deleted) check-in: ed53270796 user: olr tags: cli, core, fx, salxg
12:19
[graphspell][js] useless return check-in: be32ffb142 user: olr tags: graphspell, salxg
Changes

Modified gc_core/js/lang_core/gc_engine.js from [c1f3aa973c] to [bd80498a18].

218
219
220
221
222
223
224
225

226
227
228
229
230
231
232
218
219
220
221
222
223
224

225
226
227
228
229
230
231
232







-
+







                for (let dToken of this.lTokens) {
                    if (dToken["sType"] != "INFO") {
                        this.dTokenPos.set(dToken["nStart"], dToken);
                    }
                }
                if (bFullInfo) {
                    this.lTokens0 = Array.from(this.lTokens);
                    // the list of tokens is duplicated, to keep all tokens from being deleted when analysis
                    // the list of tokens is duplicated, to keep tokens from being deleted when analysis
                }
                this.parseText(this.sSentence, this.sSentence0, false, iStart, sCountry, dOpt, bShowRuleId, bDebug, bContext);
                if (bFullInfo) {
                    for (let oToken of this.lTokens0) {
                        if (oToken["sType"] == "WORD") {
                            oToken["bValidToken"] = gc_engine.oSpellChecker.isValidToken(oToken["sValue"]);
                        }
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990

991
992
993
994
995
996
997
998
999
971
972
973
974
975
976
977

978
979
980
981

982
983
984
985
986
987
988
989
990

991
992
993
994
995
996
997







-




-







+

-








    rewriteFromTags (bDebug=false) {
        // rewrite the sentence, modify tokens, purge the token list
        if (bDebug) {
            console.log("REWRITE");
        }
        let lNewToken = [];
        let lNewTokens0 = [];
        let nMergeUntil = 0;
        let oMergingToken = null;
        for (let [iToken, oToken] of this.lTokens.entries()) {
            let bKeepToken = true;
            let bKeepToken0 = true;
            if (oToken["sType"] != "INFO") {
                if (nMergeUntil && iToken <= nMergeUntil) {
                    oMergingToken["sValue"] += " ".repeat(oToken["nStart"] - oMergingToken["nEnd"]) + oToken["sValue"];
                    oMergingToken["nEnd"] = oToken["nEnd"];
                    if (bDebug) {
                        console.log("  MERGED TOKEN: " + oMergingToken["sValue"]);
                    }
                    oToken["bMerged"] = true;
                    bKeepToken = false;
                    bKeepToken0 = false;
                }
                if (oToken.hasOwnProperty("nMergeUntil")) {
                    if (iToken > nMergeUntil) { // this token is not already merged with a previous token
                        oMergingToken = oToken;
                    }
                    if (oToken["nMergeUntil"] > nMergeUntil) {
                        nMergeUntil = oToken["nMergeUntil"];
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1027
1028
1029
1030
1031
1032
1033



1034
1035
1036
1037
1038
1039




1040
1041
1042
1043
1044
1045
1046







-
-
-






-
-
-
-







                    this.dTokenPos.delete(oToken["nStart"]);
                }
                catch (e) {
                    console.log(this.asString());
                    console.log(oToken);
                }
            }
            if (this.lTokens0 !== null && bKeepToken0) {
                lNewTokens0.push(oToken);
            }
        }
        if (bDebug) {
            console.log("  TEXT REWRITED: " + this.sSentence);
        }
        this.lTokens.length = 0;
        this.lTokens = lNewToken;
        if (this.lTokens0 !== null) {
            this.lTokens0.length = 0;
            this.lTokens0 = lNewTokens0;
        }
    }
};


if (typeof(exports) !== 'undefined') {
    exports.lang = gc_engine.lang;
    exports.locales = gc_engine.locales;

Modified gc_core/py/lang_core/gc_engine.py from [95514cb21c] to [a0179d3d20].

280
281
282
283
284
285
286
287

288
289
290
291
292
293
294
280
281
282
283
284
285
286

287
288
289
290
291
292
293
294







-
+







                try:
                    self.sSentence = sText[iStart:iEnd]
                    self.sSentence0 = self.sText0[iStart:iEnd]
                    self.nOffsetWithinParagraph = iStart
                    self.lTokens = list(_oTokenizer.genTokens(self.sSentence, True))
                    self.dTokenPos = { dToken["nStart"]: dToken  for dToken in self.lTokens  if dToken["sType"] != "INFO" }
                    if bFullInfo:
                        self.lTokens0 = list(self.lTokens)  # the list of tokens is duplicated, to keep all tokens from being deleted when analysis
                        self.lTokens0 = list(self.lTokens)  # the list of tokens is duplicated, to keep tokens from being deleted when analysis
                    self.parseText(self.sSentence, self.sSentence0, False, iStart, sCountry, dOpt, bShowRuleId, bDebug, bContext)
                    if bFullInfo:
                        for dToken in self.lTokens0:
                            if dToken["sType"] == "WORD":
                                dToken["bValidToken"] = _oSpellChecker.isValidToken(dToken["sValue"])
                            if "lMorph" not in dToken:
                                dToken["lMorph"] = _oSpellChecker.getMorph(dToken["sValue"])
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854


855
856
857
858
859
860
861
838
839
840
841
842
843
844

845
846
847
848
849
850
851


852
853
854
855
856
857
858
859
860







-







-
-
+
+







            echo("REWRITE")
        lNewTokens = []
        lNewTokens0 = []
        nMergeUntil = 0
        dTokenMerger = {}
        for iToken, dToken in enumerate(self.lTokens):
            bKeepToken = True
            bKeepToken0 = True
            if dToken["sType"] != "INFO":
                if nMergeUntil and iToken <= nMergeUntil:
                    # token to merge
                    dTokenMerger["sValue"] += " " * (dToken["nStart"] - dTokenMerger["nEnd"]) + dToken["sValue"]
                    dTokenMerger["nEnd"] = dToken["nEnd"]
                    if bDebug:
                        echo("  MERGED TOKEN: " + dTokenMerger["sValue"])
                    bKeepToken = False
                    bKeepToken0 = False
                    dToken["bMerged"] = True
                    bKeepToken = False
                if "nMergeUntil" in dToken:
                    # first token to be merge with
                    if iToken > nMergeUntil: # this token is not to be merged with a previous token
                        dTokenMerger = dToken
                    if dToken["nMergeUntil"] > nMergeUntil:
                        nMergeUntil = dToken["nMergeUntil"]
                    del dToken["nMergeUntil"]
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
879
880
881
882
883
884
885


886
887
888
889










-
-




-
-
-
                    del dToken["sNewValue"]
            else:
                try:
                    del self.dTokenPos[dToken["nStart"]]
                except KeyError:
                    echo(self)
                    echo(dToken)
            if self.lTokens0 is not None and bKeepToken0:
                lNewTokens0.append(dToken)
        if bDebug:
            echo("  TEXT REWRITED: " + self.sSentence)
        self.lTokens.clear()
        self.lTokens = lNewTokens
        if self.lTokens0 is not None:
            self.lTokens0.clear()
            self.lTokens0 = lNewTokens0

Modified gc_lang/fr/webext/content_scripts/panel_gc.js from [c04a1ec88f] to [b1e5b16e34].

578
579
580
581
582
583
584
585

586
587
588
589
590
591
592
578
579
580
581
582
583
584

585
586
587
588
589
590
591
592







-
+







                this.nLxgCount += 1;
                if (oSentence.sSentence.trim() !== "") {
                    let xSentenceBlock = oGrammalecte.createNode("div", {className: "grammalecte_lxg_paragraph_sentence_block"});
                    xSentenceBlock.appendChild(oGrammalecte.createNode("div", {className: "grammalecte_lxg_list_num", textContent: this.nLxgCount}));
                    xSentenceBlock.appendChild(oGrammalecte.createNode("p", {className: "grammalecte_lxg_paragraph_sentence", textContent: oSentence.sSentence}));
                    let xTokenList = oGrammalecte.createNode("div", {className: "grammalecte_lxg_list_of_tokens"});
                    for (let oToken of oSentence.lTokens) {
                        if (oToken["sType"] != "INFO") {
                        if (oToken["sType"] != "INFO" && !oToken.hasOwnProperty("bMerged")) {
                            xTokenList.appendChild(this._createTokenBlock2(oToken));
                        }
                    }
                    xSentenceBlock.appendChild(xTokenList);
                    this.xLxgResultZone.appendChild(xSentenceBlock);
                }
            }

Modified grammalecte-cli.py from [906ad28942] to [65f33ab4b5].

342
343
344
345
346
347
348
349

350
351
352
353
354
355
356
342
343
344
345
346
347
348

349
350
351
352
353
354
355
356







-
+







                    if xArgs.textformatter:
                        sParagraph = oTextFormatter.formatText(sParagraph)
                    lParagraphErrors, lSentences = oGrammarChecker.gce.parse(sParagraph, bDebug=xArgs.debug, bFullInfo=True)
                    #echo(txt.getReadableErrors(lParagraphErrors, xArgs.width))
                    for dSentence in lSentences:
                        echo("{nStart}:{nEnd}  <{sSentence}>".format(**dSentence))
                        for dToken in dSentence["lTokens"]:
                            if dToken["sType"] == "INFO":
                            if dToken["sType"] == "INFO" or "bMerged" in dToken:
                                continue
                            echo("  {0[nStart]:>3}:{0[nEnd]:<3} {1} {0[sType]:<14} {2} {0[sValue]:<16} {3}".format(dToken, \
                                                                                                        "×" if dToken.get("bToRemove", False) else " ",
                                                                                                        "!" if dToken["sType"] == "WORD" and not dToken.get("bValidToken", False) else " ",
                                                                                                        " ".join(dToken.get("aTags", "")) ) )
                            if "lMorph" in dToken:
                                for sMorph, sLabel in zip(dToken["lMorph"], dToken["aLabels"]):