Overview
| Comment: | [core][cli][fx] grammar analysis update (keeping tokens from being deleted) |
|---|---|
| Downloads: | Tarball, ZIP archive, SQL archive |
| Timelines: | family, ancestors, descendants, both, cli, core, fx, salxg |
| Files: | files, file ages, folders |
| SHA3-256: | ed532707969ae823b44e3e90389efe77 |
| User & Date: | olr on 2020-08-26 12:21:03 |
| Other Links: | branch diff, manifest, tags |
Context

2020-08-28

| 11:00 | [graphspell] lexicographer: update tags | check-in: df1367e6e2 | user: olr | tags: graphspell, salxg |

2020-08-26

| 12:21 | [core][cli][fx] grammar analysis update (keeping tokens from being deleted) | check-in: ed53270796 | user: olr | tags: cli, core, fx, salxg |
| 12:19 | [graphspell][js] useless return | check-in: be32ffb142 | user: olr | tags: graphspell, salxg |
Changes
Modified gc_core/js/lang_core/gc_engine.js from [c1f3aa973c] to [bd80498a18].
```javascript
        for (let dToken of this.lTokens) {
            if (dToken["sType"] != "INFO") {
                this.dTokenPos.set(dToken["nStart"], dToken);
            }
        }
        if (bFullInfo) {
            this.lTokens0 = Array.from(this.lTokens);
            // the token list is duplicated, to keep tokens from being deleted during analysis
        }
        this.parseText(this.sSentence, this.sSentence0, false, iStart, sCountry, dOpt, bShowRuleId, bDebug, bContext);
        if (bFullInfo) {
            for (let oToken of this.lTokens0) {
                if (oToken["sType"] == "WORD") {
                    oToken["bValidToken"] = gc_engine.oSpellChecker.isValidToken(oToken["sValue"]);
                }
```
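The `Array.from` call is the heart of this check-in: `lTokens0` keeps a shallow copy of the working token list, so the later rewrite step can purge `this.lTokens` without losing tokens the full-info report still needs. A minimal Python sketch of the idea, with invented token dicts rather than Grammalecte's real API:

```python
# Sketch only: a shallow copy made before analysis still references every
# original token dict, even those the analysis later drops from the
# working list, and it sees annotations made in place on those dicts.
lTokens = [
    {"nStart": 0, "nEnd": 3, "sType": "WORD", "sValue": "Les"},
    {"nStart": 4, "nEnd": 8, "sType": "WORD", "sValue": "mots"},
]
lTokens0 = list(lTokens)        # shallow copy: new list, same dicts

# the analysis purges the working list...
lTokens = [dToken for dToken in lTokens if dToken["sValue"] != "mots"]

# ...but the copy still holds both tokens, including in-place annotations
lTokens0[1]["bValidToken"] = True
print(len(lTokens), len(lTokens0))   # 1 2
```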
```javascript
    rewriteFromTags (bDebug=false) {
        // rewrite the sentence, modify tokens, purge the token list
        if (bDebug) {
            console.log("REWRITE");
        }
        let lNewToken = [];
        let nMergeUntil = 0;
        let oMergingToken = null;
        for (let [iToken, oToken] of this.lTokens.entries()) {
            let bKeepToken = true;
            if (oToken["sType"] != "INFO") {
                if (nMergeUntil && iToken <= nMergeUntil) {
                    oMergingToken["sValue"] += " ".repeat(oToken["nStart"] - oMergingToken["nEnd"]) + oToken["sValue"];
                    oMergingToken["nEnd"] = oToken["nEnd"];
                    if (bDebug) {
                        console.log("  MERGED TOKEN: " + oMergingToken["sValue"]);
                    }
                    oToken["bMerged"] = true;
                    bKeepToken = false;
                }
                if (oToken.hasOwnProperty("nMergeUntil")) {
                    if (iToken > nMergeUntil) { // this token is not already merged with a previous token
                        oMergingToken = oToken;
                    }
                    if (oToken["nMergeUntil"] > nMergeUntil) {
                        nMergeUntil = oToken["nMergeUntil"];
```
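The loop above folds a run of tokens into the token carrying `nMergeUntil`: absorbed tokens are appended to the merger's `sValue`, flagged `bMerged`, and left out of the new list rather than destroyed, so a previously made copy of the list still sees them. A standalone Python sketch of the same bookkeeping, with invented token data (index 0 is the INFO pseudo-token, as in the engine):

```python
def merge_tokens(lTokens):
    "Fold each token into the preceding token that carries nMergeUntil."
    lNewTokens = []
    nMergeUntil = 0
    dTokenMerger = None
    for iToken, dToken in enumerate(lTokens):
        bKeepToken = True
        if dToken["sType"] != "INFO":
            if nMergeUntil and iToken <= nMergeUntil:
                # absorb this token, refilling the gap with spaces
                dTokenMerger["sValue"] += " " * (dToken["nStart"] - dTokenMerger["nEnd"]) + dToken["sValue"]
                dTokenMerger["nEnd"] = dToken["nEnd"]
                dToken["bMerged"] = True
                bKeepToken = False
            if "nMergeUntil" in dToken:
                if iToken > nMergeUntil:   # not itself absorbed: it becomes the merger
                    dTokenMerger = dToken
                nMergeUntil = max(nMergeUntil, dToken["nMergeUntil"])
                del dToken["nMergeUntil"]
        if bKeepToken:
            lNewTokens.append(dToken)
    return lNewTokens

lTokens = [
    {"sType": "INFO", "nStart": 0, "nEnd": 0, "sValue": ""},
    {"sType": "WORD", "nStart": 0, "nEnd": 5, "sValue": "avant", "nMergeUntil": 3},
    {"sType": "WORD", "nStart": 6, "nEnd": 10, "sValue": "hier"},
    {"sType": "WORD", "nStart": 11, "nEnd": 15, "sValue": "soir"},
]
print([d["sValue"] for d in merge_tokens(lTokens) if d["sType"] != "INFO"])
# ['avant hier soir']
```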
```javascript
                    this.dTokenPos.delete(oToken["nStart"]);
                }
                catch (e) {
                    console.log(this.asString());
                    console.log(oToken);
                }
            }
        }
        if (bDebug) {
            console.log("  TEXT REWRITED: " + this.sSentence);
        }
        this.lTokens.length = 0;
        this.lTokens = lNewToken;
    }
};

if (typeof(exports) !== 'undefined') {
    exports.lang = gc_engine.lang;
    exports.locales = gc_engine.locales;
```
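On the other branch, when a token is not kept, the engine also deletes its entry from `dTokenPos`, the index of tokens by start offset, guarding the lookup because the token may no longer be indexed. An illustrative Python sketch (data invented):

```python
# Illustrative only: dropping a token also removes its start-offset entry
# from the position index; the guard mirrors the try/catch in the engine.
dTokenPos = {
    0: {"nStart": 0, "sValue": "Les"},
    4: {"nStart": 4, "sValue": "mots"},
}
for dDropped in ({"nStart": 4, "sValue": "mots"}, {"nStart": 9, "sValue": "ici"}):
    try:
        del dTokenPos[dDropped["nStart"]]
    except KeyError:
        print("not indexed:", dDropped["sValue"])   # the engine logs the token instead
print(sorted(dTokenPos))   # [0]
```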
Modified gc_core/py/lang_core/gc_engine.py from [95514cb21c] to [a0179d3d20].
```python
        try:
            self.sSentence = sText[iStart:iEnd]
            self.sSentence0 = self.sText0[iStart:iEnd]
            self.nOffsetWithinParagraph = iStart
            self.lTokens = list(_oTokenizer.genTokens(self.sSentence, True))
            self.dTokenPos = { dToken["nStart"]: dToken  for dToken in self.lTokens  if dToken["sType"] != "INFO" }
            if bFullInfo:
                self.lTokens0 = list(self.lTokens)  # the token list is duplicated, to keep tokens from being deleted during analysis
            self.parseText(self.sSentence, self.sSentence0, False, iStart, sCountry, dOpt, bShowRuleId, bDebug, bContext)
            if bFullInfo:
                for dToken in self.lTokens0:
                    if dToken["sType"] == "WORD":
                        dToken["bValidToken"] = _oSpellChecker.isValidToken(dToken["sValue"])
                        if "lMorph" not in dToken:
                            dToken["lMorph"] = _oSpellChecker.getMorph(dToken["sValue"])
```
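After `parseText` has run, the duplicated list `lTokens0` is enriched in place: every WORD token gets a `bValidToken` flag and, if missing, its morphologies. A sketch with a stand-in spellchecker (`is_valid` and `get_morph` are invented placeholders, not Grammalecte's `_oSpellChecker` API):

```python
def is_valid(sWord):
    # stand-in for a real spellchecker lookup
    return sWord.isalpha()

def get_morph(sWord):
    # stand-in morphology lookup, returns placeholder tags
    return [":N:m:s"]

lTokens0 = [
    {"sType": "WORD", "sValue": "chat"},
    {"sType": "SIGN", "sValue": "."},
]
for dToken in lTokens0:
    if dToken["sType"] == "WORD":
        dToken["bValidToken"] = is_valid(dToken["sValue"])
        if "lMorph" not in dToken:
            dToken["lMorph"] = get_morph(dToken["sValue"])
print(lTokens0[0]["bValidToken"], lTokens0[0]["lMorph"])   # True [':N:m:s']
```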
```python
            echo("REWRITE")
        lNewTokens = []
        lNewTokens0 = []
        nMergeUntil = 0
        dTokenMerger = {}
        for iToken, dToken in enumerate(self.lTokens):
            bKeepToken = True
            if dToken["sType"] != "INFO":
                if nMergeUntil and iToken <= nMergeUntil:
                    # token to merge
                    dTokenMerger["sValue"] += " " * (dToken["nStart"] - dTokenMerger["nEnd"]) + dToken["sValue"]
                    dTokenMerger["nEnd"] = dToken["nEnd"]
                    if bDebug:
                        echo("  MERGED TOKEN: " + dTokenMerger["sValue"])
                    dToken["bMerged"] = True
                    bKeepToken = False
                if "nMergeUntil" in dToken:
                    # first token of the group to be merged
                    if iToken > nMergeUntil:  # this token is not to be merged with a previous token
                        dTokenMerger = dToken
                    if dToken["nMergeUntil"] > nMergeUntil:
                        nMergeUntil = dToken["nMergeUntil"]
                    del dToken["nMergeUntil"]
```
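The space padding `" " * (dToken["nStart"] - dTokenMerger["nEnd"])` deserves a second look: it refills the gap between the merger's end offset and the absorbed token's start offset, so the merged `sValue` reproduces the original spacing of the source text. A worked example with invented offsets:

```python
# "avant" ends at offset 5, "hier" starts at offset 6: one space to refill.
dTokenMerger = {"sValue": "avant", "nEnd": 5}
dToken = {"sValue": "hier", "nStart": 6, "nEnd": 10}
dTokenMerger["sValue"] += " " * (dToken["nStart"] - dTokenMerger["nEnd"]) + dToken["sValue"]
dTokenMerger["nEnd"] = dToken["nEnd"]
print(repr(dTokenMerger["sValue"]))   # 'avant hier'
```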
```python
                    del dToken["sNewValue"]
            else:
                try:
                    del self.dTokenPos[dToken["nStart"]]
                except KeyError:
                    echo(self)
                    echo(dToken)
        if bDebug:
            echo("  TEXT REWRITED: " + self.sSentence)
        self.lTokens.clear()
        self.lTokens = lNewTokens
```
Modified gc_lang/fr/webext/content_scripts/panel_gc.js from [c04a1ec88f] to [b1e5b16e34].
```javascript
        this.nLxgCount += 1;
        if (oSentence.sSentence.trim() !== "") {
            let xSentenceBlock = oGrammalecte.createNode("div", {className: "grammalecte_lxg_paragraph_sentence_block"});
            xSentenceBlock.appendChild(oGrammalecte.createNode("div", {className: "grammalecte_lxg_list_num", textContent: this.nLxgCount}));
            xSentenceBlock.appendChild(oGrammalecte.createNode("p", {className: "grammalecte_lxg_paragraph_sentence", textContent: oSentence.sSentence}));
            let xTokenList = oGrammalecte.createNode("div", {className: "grammalecte_lxg_list_of_tokens"});
            for (let oToken of oSentence.lTokens) {
                if (oToken["sType"] != "INFO" && !oToken.hasOwnProperty("bMerged")) {
                    xTokenList.appendChild(this._createTokenBlock2(oToken));
                }
            }
            xSentenceBlock.appendChild(xTokenList);
            this.xLxgResultZone.appendChild(xSentenceBlock);
        }
    }
```
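The only change here is the `bMerged` test: tokens absorbed into a predecessor remain in `oSentence.lTokens` but are skipped when building the panel, so each merged group is rendered once. The same filter, as a Python sketch over invented token data:

```python
# Sketch of the display-side filter: INFO pseudo-tokens and absorbed
# tokens are hidden; the merger token (no bMerged flag) is shown.
lTokens = [
    {"sType": "INFO", "sValue": ""},
    {"sType": "WORD", "sValue": "avant hier"},             # merger: kept
    {"sType": "WORD", "sValue": "hier", "bMerged": True},  # absorbed: hidden
]
lDisplayed = [dToken for dToken in lTokens
              if dToken["sType"] != "INFO" and "bMerged" not in dToken]
print([dToken["sValue"] for dToken in lDisplayed])   # ['avant hier']
```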
Modified grammalecte-cli.py from [906ad28942] to [65f33ab4b5].
```python
                if xArgs.textformatter:
                    sParagraph = oTextFormatter.formatText(sParagraph)
                lParagraphErrors, lSentences = oGrammarChecker.gce.parse(sParagraph, bDebug=xArgs.debug, bFullInfo=True)
                #echo(txt.getReadableErrors(lParagraphErrors, xArgs.width))
                for dSentence in lSentences:
                    echo("{nStart}:{nEnd} <{sSentence}>".format(**dSentence))
                    for dToken in dSentence["lTokens"]:
                        if dToken["sType"] == "INFO" or "bMerged" in dToken:
                            continue
                        echo("  {0[nStart]:>3}:{0[nEnd]:<3} {1} {0[sType]:<14} {2} {0[sValue]:<16} {3}".format(dToken, \
                            "×" if dToken.get("bToRemove", False) else " ",
                            "!" if dToken["sType"] == "WORD" and not dToken.get("bValidToken", False) else " ",
                            " ".join(dToken.get("aTags", "")) ) )
                        if "lMorph" in dToken:
                            for sMorph, sLabel in zip(dToken["lMorph"], dToken["aLabels"]):
```
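The new `bMerged` test sits next to the existing INFO filter, so the CLI listing prints each merged group once. For reference, here is the same format string applied to a hypothetical token dict (values invented): `×` marks tokens slated for removal, `!` marks words unknown to the spellchecker.

```python
# Hypothetical token dict, formatted with the CLI's own format string.
dToken = {"nStart": 4, "nEnd": 9, "sType": "WORD", "sValue": "mots",
          "bValidToken": True, "aTags": []}
print("  {0[nStart]:>3}:{0[nEnd]:<3} {1} {0[sType]:<14} {2} {0[sValue]:<16} {3}".format(
    dToken,
    "×" if dToken.get("bToRemove", False) else " ",
    "!" if dToken["sType"] == "WORD" and not dToken.get("bValidToken", False) else " ",
    " ".join(dToken.get("aTags", []))))
#     4:9     WORD             mots
```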