Overview
Comment:      [core][cli][fx] grammar analysis update (keeping tokens from being deleted)
Downloads:    Tarball | ZIP archive | SQL archive
Timelines:    family | ancestors | descendants | both | cli | core | fx | salxg
Files:        files | file ages | folders
SHA3-256:     ed532707969ae823b44e3e90389efe77
User & Date:  olr on 2020-08-26 12:21:03
Other Links:  branch diff | manifest | tags
Context
2020-08-28
  11:00  [graphspell] lexicographer: update tags
         check-in: df1367e6e2  user: olr  tags: graphspell, salxg

2020-08-26
  12:21  [core][cli][fx] grammar analysis update (keeping tokens from being deleted)
         check-in: ed53270796  user: olr  tags: cli, core, fx, salxg
  12:19  [graphspell][js] useless return
         check-in: be32ffb142  user: olr  tags: graphspell, salxg
Changes
Modified gc_core/js/lang_core/gc_engine.js from [c1f3aa973c] to [bd80498a18].
︙

    for (let dToken of this.lTokens) {
        if (dToken["sType"] != "INFO") {
            this.dTokenPos.set(dToken["nStart"], dToken);
        }
    }
    if (bFullInfo) {
        this.lTokens0 = Array.from(this.lTokens);  // the list of tokens is duplicated, to keep tokens from being deleted during analysis
    }
    this.parseText(this.sSentence, this.sSentence0, false, iStart, sCountry, dOpt, bShowRuleId, bDebug, bContext);
    if (bFullInfo) {
        for (let oToken of this.lTokens0) {
            if (oToken["sType"] == "WORD") {
                oToken["bValidToken"] = gc_engine.oSpellChecker.isValidToken(oToken["sValue"]);
            }
︙

    rewriteFromTags (bDebug=false) {
        // rewrite the sentence, modify tokens, purge the token list
        if (bDebug) {
            console.log("REWRITE");
        }
        let lNewToken = [];
        let nMergeUntil = 0;
        let oMergingToken = null;
        for (let [iToken, oToken] of this.lTokens.entries()) {
            let bKeepToken = true;
            if (oToken["sType"] != "INFO") {
                if (nMergeUntil && iToken <= nMergeUntil) {
                    oMergingToken["sValue"] += " ".repeat(oToken["nStart"] - oMergingToken["nEnd"]) + oToken["sValue"];
                    oMergingToken["nEnd"] = oToken["nEnd"];
                    if (bDebug) {
                        console.log("  MERGED TOKEN: " + oMergingToken["sValue"]);
                    }
                    oToken["bMerged"] = true;
                    bKeepToken = false;
                }
                if (oToken.hasOwnProperty("nMergeUntil")) {
                    if (iToken > nMergeUntil) {  // this token is not already merged with a previous token
                        oMergingToken = oToken;
                    }
                    if (oToken["nMergeUntil"] > nMergeUntil) {
                        nMergeUntil = oToken["nMergeUntil"];
︙

                    this.dTokenPos.delete(oToken["nStart"]);
                } catch (e) {
                    console.log(this.asString());
                    console.log(oToken);
                }
            }
        }
        if (bDebug) {
            console.log("  TEXT REWRITED: " + this.sSentence);
        }
        this.lTokens.length = 0;
        this.lTokens = lNewToken;
    }
};

if (typeof(exports) !== 'undefined') {
    exports.lang = gc_engine.lang;
    exports.locales = gc_engine.locales;
︙
Modified gc_core/py/lang_core/gc_engine.py from [95514cb21c] to [a0179d3d20].
︙

        try:
            self.sSentence = sText[iStart:iEnd]
            self.sSentence0 = self.sText0[iStart:iEnd]
            self.nOffsetWithinParagraph = iStart
            self.lTokens = list(_oTokenizer.genTokens(self.sSentence, True))
            self.dTokenPos = { dToken["nStart"]: dToken  for dToken in self.lTokens  if dToken["sType"] != "INFO" }
            if bFullInfo:
                self.lTokens0 = list(self.lTokens)  # the list of tokens is duplicated, to keep tokens from being deleted during analysis
            self.parseText(self.sSentence, self.sSentence0, False, iStart, sCountry, dOpt, bShowRuleId, bDebug, bContext)
            if bFullInfo:
                for dToken in self.lTokens0:
                    if dToken["sType"] == "WORD":
                        dToken["bValidToken"] = _oSpellChecker.isValidToken(dToken["sValue"])
                        if "lMorph" not in dToken:
                            dToken["lMorph"] = _oSpellChecker.getMorph(dToken["sValue"])
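The list(self.lTokens) call above is the point of the fix: it builds a shallow copy, so lTokens0 is a distinct list sharing the same token dicts. Clearing or replacing self.lTokens during the rewrite can no longer empty lTokens0, while annotations written into the shared dicts (bValidToken, lMorph) remain visible from both lists. A minimal sketch of these semantics in plain Python (not Grammalecte code):

    # list(tokens) returns a new list object that shares the token dicts
    lTokens = [{"sValue": "New"}, {"sValue": "York"}]

    lAlias = lTokens         # alias: the very same list object
    lCopy = list(lTokens)    # shallow copy: a new list, same dicts

    lTokens.clear()          # what a rewrite pass does to the working list

    print(lAlias)            # [] : an alias loses every token
    print(lCopy)             # both token dicts survive in the copy

    lCopy[0]["bValidToken"] = True    # annotating a shared dict still works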
︙

            echo("REWRITE")
        lNewTokens = []
        lNewTokens0 = []
        nMergeUntil = 0
        dTokenMerger = {}
        for iToken, dToken in enumerate(self.lTokens):
            bKeepToken = True
            if dToken["sType"] != "INFO":
                if nMergeUntil and iToken <= nMergeUntil:
                    # token to merge
                    dTokenMerger["sValue"] += " " * (dToken["nStart"] - dTokenMerger["nEnd"]) + dToken["sValue"]
                    dTokenMerger["nEnd"] = dToken["nEnd"]
                    if bDebug:
                        echo("  MERGED TOKEN: " + dTokenMerger["sValue"])
                    dToken["bMerged"] = True
                    bKeepToken = False
                if "nMergeUntil" in dToken:
                    # first token to be merged with
                    if iToken > nMergeUntil:  # this token is not to be merged with a previous token
                        dTokenMerger = dToken
                    if dToken["nMergeUntil"] > nMergeUntil:
                        nMergeUntil = dToken["nMergeUntil"]
                    del dToken["nMergeUntil"]
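Here a token absorbed by dTokenMerger is flagged with bMerged and kept, instead of being dropped on the spot; the run of spaces restores the gap that separated the two tokens in the sentence. A simplified sketch of one merge step, using two hypothetical tokens:

    # one merge step on hypothetical tokens (not the engine's real data)
    dTokenMerger = {"sValue": "New", "nStart": 0, "nEnd": 3}
    dToken = {"sValue": "York", "nStart": 4, "nEnd": 8}

    # restore the whitespace between the tokens, then absorb the second value
    dTokenMerger["sValue"] += " " * (dToken["nStart"] - dTokenMerger["nEnd"]) + dToken["sValue"]
    dTokenMerger["nEnd"] = dToken["nEnd"]
    dToken["bMerged"] = True    # flagged and kept, not deleted

    print(dTokenMerger["sValue"])    # New York
    print(dToken)                    # {'sValue': 'York', 'nStart': 4, 'nEnd': 8, 'bMerged': True}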
︙

                        del dToken["sNewValue"]
            else:
                try:
                    del self.dTokenPos[dToken["nStart"]]
                except KeyError:
                    echo(self)
                    echo(dToken)
        if bDebug:
            echo("  TEXT REWRITED: " + self.sSentence)
        self.lTokens.clear()
        self.lTokens = lNewTokens
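The purge that follows rebuilds the working list from the kept tokens only; a list duplicated beforehand, such as lTokens0, still references the merged tokens. A self-contained sketch of the net effect, again with hypothetical tokens:

    # the purge drops merged tokens from the working list only
    lTokens = [
        {"sValue": "New York", "nStart": 0, "nEnd": 8},
        {"sValue": "York", "nStart": 4, "nEnd": 8, "bMerged": True},
    ]
    lTokens0 = list(lTokens)                                # duplicated before the purge
    lTokens = [dToken for dToken in lTokens if "bMerged" not in dToken]

    print(len(lTokens))     # 1 : the merged token left the working list
    print(len(lTokens0))    # 2 : it is still reachable through the duplicate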
Modified gc_lang/fr/webext/content_scripts/panel_gc.js from [c04a1ec88f] to [b1e5b16e34].
︙

        this.nLxgCount += 1;
        if (oSentence.sSentence.trim() !== "") {
            let xSentenceBlock = oGrammalecte.createNode("div", {className: "grammalecte_lxg_paragraph_sentence_block"});
            xSentenceBlock.appendChild(oGrammalecte.createNode("div", {className: "grammalecte_lxg_list_num", textContent: this.nLxgCount}));
            xSentenceBlock.appendChild(oGrammalecte.createNode("p", {className: "grammalecte_lxg_paragraph_sentence", textContent: oSentence.sSentence}));
            let xTokenList = oGrammalecte.createNode("div", {className: "grammalecte_lxg_list_of_tokens"});
            for (let oToken of oSentence.lTokens) {
                if (oToken["sType"] != "INFO" && !oToken.hasOwnProperty("bMerged")) {
                    xTokenList.appendChild(this._createTokenBlock2(oToken));
                }
            }
            xSentenceBlock.appendChild(xTokenList);
            this.xLxgResultZone.appendChild(xSentenceBlock);
        }
    }
︙
Modified grammalecte-cli.py from [906ad28942] to [65f33ab4b5].
︙

            if xArgs.textformatter:
                sParagraph = oTextFormatter.formatText(sParagraph)
            lParagraphErrors, lSentences = oGrammarChecker.gce.parse(sParagraph, bDebug=xArgs.debug, bFullInfo=True)
            #echo(txt.getReadableErrors(lParagraphErrors, xArgs.width))
            for dSentence in lSentences:
                echo("{nStart}:{nEnd} <{sSentence}>".format(**dSentence))
                for dToken in dSentence["lTokens"]:
                    if dToken["sType"] == "INFO" or "bMerged" in dToken:
                        continue
                    echo("  {0[nStart]:>3}:{0[nEnd]:<3} {1} {0[sType]:<14} {2} {0[sValue]:<16} {3}".format(dToken, \
                        "×" if dToken.get("bToRemove", False) else " ",
                        "!" if dToken["sType"] == "WORD" and not dToken.get("bValidToken", False) else " ",
                        " ".join(dToken.get("aTags", "")) ) )
                    if "lMorph" in dToken:
                        for sMorph, sLabel in zip(dToken["lMorph"], dToken["aLabels"]):
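The CLI implements the consumer side of the same contract: INFO tokens and tokens flagged with bMerged are skipped when printing. A condensed sketch of that loop with hypothetical tokens (the real output has more columns):

    # skip rule applied to hypothetical tokens
    lTokens = [
        {"sType": "INFO", "sValue": "<start>", "nStart": 0, "nEnd": 0},
        {"sType": "WORD", "sValue": "New York", "nStart": 0, "nEnd": 8},
        {"sType": "WORD", "sValue": "York", "nStart": 4, "nEnd": 8, "bMerged": True},
    ]
    for dToken in lTokens:
        if dToken["sType"] == "INFO" or "bMerged" in dToken:
            continue
        print("{0[nStart]:>3}:{0[nEnd]:<3} {0[sType]:<6} {0[sValue]}".format(dToken))
    # prints a single line:   0:8   WORD   New York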
︙