Index: gc_core/js/lang_core/gc_engine.js
==================================================================
--- gc_core/js/lang_core/gc_engine.js
+++ gc_core/js/lang_core/gc_engine.js
@@ -195,10 +195,12 @@
            this.parseText(this.sText, this.sText0, true, 0, sCountry, dOpt, bShowRuleId, bDebug, bContext);
        }
        catch (e) {
            console.error(e);
        }
+        this.lTokens = null;
+        this.lTokens0 = null;
        let lParagraphErrors = null;
        if (bFullInfo) {
            lParagraphErrors = Array.from(this.dError.values());
            this.dSentenceError.clear();
        }
@@ -209,30 +211,33 @@
        for (let [iStart, iEnd] of text.getSentenceBoundaries(sText)) {
            try {
                this.sSentence = sText.slice(iStart, iEnd);
                this.sSentence0 = this.sText0.slice(iStart, iEnd);
                this.nOffsetWithinParagraph = iStart;
-                this.lToken = Array.from(gc_engine.oTokenizer.genTokens(this.sSentence, true));
+                this.lTokens = Array.from(gc_engine.oTokenizer.genTokens(this.sSentence, true));
                this.dTokenPos.clear();
-                for (let dToken of this.lToken) {
+                for (let dToken of this.lTokens) {
                    if (dToken["sType"] != "INFO") {
                        this.dTokenPos.set(dToken["nStart"], dToken);
                    }
                }
                if (bFullInfo) {
-                    oSentence = { "nStart": iStart, "nEnd": iEnd, "sSentence": this.sSentence, "lToken": Array.from(this.lToken) };
-                    for (let oToken of oSentence["lToken"]) {
-                        if (oToken["sType"] == "WORD") {
-                            oToken["bValidToken"] = gc_engine.oSpellChecker.isValidToken(oToken["sValue"]);
-                        }
-                    }
-                    // the list of tokens is duplicated, to keep all tokens from being deleted when analysis
+                    this.lTokens0 = Array.from(this.lTokens);
+                    // the list of tokens is duplicated, to keep tokens from being deleted during analysis
                }
                this.parseText(this.sSentence, this.sSentence0, false, iStart, sCountry, dOpt, bShowRuleId, bDebug, bContext);
                if (bFullInfo) {
-                    oSentence["lGrammarErrors"] = Array.from(this.dSentenceError.values());
-                    lSentences.push(oSentence);
+                    for (let oToken of this.lTokens0) {
+                        gc_engine.oSpellChecker.setLabelsOnToken(oToken);
+                    }
+                    lSentences.push({
+                        "nStart": iStart,
+                        "nEnd": iEnd,
+                        "sSentence": this.sSentence0,
+                        "lTokens": this.lTokens0,
+                        "lGrammarErrors": Array.from(this.dSentenceError.values())
+                    });
                    this.dSentenceError.clear();
                }
            }
            catch (e) {
                console.error(e);
@@ -373,13 +378,13 @@
            }
            if (this.dTokenPos.gl_get(oToken["nStart"], {}).hasOwnProperty("aTags")) {
                oToken["aTags"] = this.dTokenPos.get(oToken["nStart"])["aTags"];
            }
        }
-        this.lToken = lNewToken;
+        this.lTokens = lNewToken;
        this.dTokenPos.clear();
-        for (let oToken of this.lToken) {
+        for (let oToken of this.lTokens) {
            if (oToken["sType"] != "INFO") {
                this.dTokenPos.set(oToken["nStart"], oToken);
            }
        }
        if (bDebug) {
@@ -615,11 +620,11 @@
    parseGraph (oGraph, sCountry="${country_default}", dOptions=null, bShowRuleId=false, bDebug=false, bContext=false) {
        // parse graph with tokens from the text and execute actions encountered
        let lPointer = [];
        let bTagAndRewrite = false;
        try {
-            for (let [iToken, oToken] of this.lToken.entries()) {
+            for (let [iToken, oToken] of this.lTokens.entries()) {
                if (bDebug) {
                    console.log("TOKEN: " + oToken["sValue"]);
                }
                // check arcs for each existing pointer
                let lNextPointer = [];
@@ -667,21 +672,21 @@
                    // Disambiguator [ option, condition, "=", replacement/suggestion/action ]
                    // Tag [ option, condition, "/", replacement/suggestion/action, iTokenStart, iTokenEnd ]
                    // Immunity [ option, condition, "!", "", iTokenStart, iTokenEnd ]
                    // Test [ option, condition, ">", "" ]
                    if (!sOption || dOptions.gl_get(sOption, false)) {
-                        bCondMemo = !sFuncCond || gc_functions[sFuncCond](this.lToken, nTokenOffset, nLastToken, sCountry, bCondMemo, this.dTags, this.sSentence, this.sSentence0);
-                        //bCondMemo = !sFuncCond || oEvalFunc[sFuncCond](this.lToken, nTokenOffset, nLastToken, sCountry, bCondMemo, this.dTags, this.sSentence, this.sSentence0);
+                        bCondMemo = !sFuncCond || gc_functions[sFuncCond](this.lTokens, nTokenOffset, nLastToken, sCountry, bCondMemo, this.dTags, this.sSentence, this.sSentence0);
+                        //bCondMemo = !sFuncCond || oEvalFunc[sFuncCond](this.lTokens, nTokenOffset, nLastToken, sCountry, bCondMemo, this.dTags, this.sSentence, this.sSentence0);
                        if (bCondMemo) {
                            if (cActionType == "-") {
                                // grammar error
                                let [iTokenStart, iTokenEnd, cStartLimit, cEndLimit, bCaseSvty, nPriority, sMessage, iURL] = eAct;
                                let nTokenErrorStart = (iTokenStart > 0) ? nTokenOffset + iTokenStart : nLastToken + iTokenStart;
-                                if (!this.lToken[nTokenErrorStart].hasOwnProperty("bImmune")) {
+                                if (!this.lTokens[nTokenErrorStart].hasOwnProperty("bImmune")) {
                                    let nTokenErrorEnd = (iTokenEnd > 0) ? nTokenOffset + iTokenEnd : nLastToken + iTokenEnd;
-                                    let nErrorStart = this.nOffsetWithinParagraph + ((cStartLimit == "<") ? this.lToken[nTokenErrorStart]["nStart"] : this.lToken[nTokenErrorStart]["nEnd"]);
-                                    let nErrorEnd = this.nOffsetWithinParagraph + ((cEndLimit == ">") ? this.lToken[nTokenErrorEnd]["nEnd"] : this.lToken[nTokenErrorEnd]["nStart"]);
+                                    let nErrorStart = this.nOffsetWithinParagraph + ((cStartLimit == "<") ? this.lTokens[nTokenErrorStart]["nStart"] : this.lTokens[nTokenErrorStart]["nEnd"]);
+                                    let nErrorEnd = this.nOffsetWithinParagraph + ((cEndLimit == ">") ? this.lTokens[nTokenErrorEnd]["nEnd"] : this.lTokens[nTokenErrorEnd]["nStart"]);
                                    if (!this.dError.has(nErrorStart) || nPriority > this.dErrorPriority.gl_get(nErrorStart, -1)) {
                                        this.dError.set(nErrorStart, this._createErrorFromTokens(sWhat, nTokenOffset, nLastToken, nTokenErrorStart, nErrorStart, nErrorEnd, sLineId, sRuleId, bCaseSvty, sMessage, gc_rules_graph.dURL[iURL], bShowRuleId, sOption, bContext));
                                        this.dErrorPriority.set(nErrorStart, nPriority);
                                        this.dSentenceError.set(nErrorStart, this.dError.get(nErrorStart));
@@ -696,19 +701,19 @@
                                let nTokenStart = (eAct[0] > 0) ? nTokenOffset + eAct[0] : nLastToken + eAct[0];
                                let nTokenEnd = (eAct[1] > 0) ? nTokenOffset + eAct[1] : nLastToken + eAct[1];
                                this._tagAndPrepareTokenForRewriting(sWhat, nTokenStart, nTokenEnd, nTokenOffset, nLastToken, eAct[2], bDebug);
                                bChange = true;
                                if (bDebug) {
-                                    console.log(`    TEXT_PROCESSOR: [${this.lToken[nTokenStart]["sValue"]}:${this.lToken[nTokenEnd]["sValue"]}]  > ${sWhat}`);
+                                    console.log(`    TEXT_PROCESSOR: [${this.lTokens[nTokenStart]["sValue"]}:${this.lTokens[nTokenEnd]["sValue"]}]  > ${sWhat}`);
                                }
                            }
                            else if (cActionType == "=") {
                                // disambiguation
-                                gc_functions[sWhat](this.lToken, nTokenOffset, nLastToken);
-                                //oEvalFunc[sWhat](this.lToken, nTokenOffset, nLastToken);
+                                gc_functions[sWhat](this.lTokens, nTokenOffset, nLastToken);
+                                //oEvalFunc[sWhat](this.lTokens, nTokenOffset, nLastToken);
                                if (bDebug) {
-                                    console.log(`    DISAMBIGUATOR: (${sWhat})  [${this.lToken[nTokenOffset+1]["sValue"]}:${this.lToken[nLastToken]["sValue"]}]`);
+                                    console.log(`    DISAMBIGUATOR: (${sWhat})  [${this.lTokens[nTokenOffset+1]["sValue"]}:${this.lTokens[nLastToken]["sValue"]}]`);
                                }
                            }
                            else if (cActionType == ">") {
                                // we do nothing, this test is just a condition to apply all following actions
                                if (bDebug) {
@@ -718,18 +723,18 @@
                            else if (cActionType == "/") {
                                // Tag
                                let nTokenStart = (eAct[0] > 0) ? nTokenOffset + eAct[0] : nLastToken + eAct[0];
                                let nTokenEnd = (eAct[1] > 0) ? nTokenOffset + eAct[1] : nLastToken + eAct[1];
                                for (let i = nTokenStart; i <= nTokenEnd; i++) {
-                                    if (this.lToken[i].hasOwnProperty("aTags")) {
-                                        this.lToken[i]["aTags"].add(...sWhat.split("|"))
+                                    if (this.lTokens[i].hasOwnProperty("aTags")) {
+                                        this.lTokens[i]["aTags"].add(...sWhat.split("|"))
                                    }
                                    else {
-                                        this.lToken[i]["aTags"] = new Set(sWhat.split("|"));
+                                        this.lTokens[i]["aTags"] = new Set(sWhat.split("|"));
                                    }
                                }
                                if (bDebug) {
-                                    console.log(`    TAG: ${sWhat} > [${this.lToken[nTokenStart]["sValue"]}:${this.lToken[nTokenEnd]["sValue"]}]`);
+                                    console.log(`    TAG: ${sWhat} > [${this.lTokens[nTokenStart]["sValue"]}:${this.lTokens[nTokenEnd]["sValue"]}]`);
                                }
                                for (let sTag of sWhat.split("|")) {
                                    if (!this.dTags.has(sTag)) {
                                        this.dTags.set(sTag, [nTokenStart, nTokenEnd]);
                                    } else {
@@ -743,19 +748,19 @@
                                    console.log("    IMMUNITY: " + sLineId + " / " + sRuleId);
                                }
                                let nTokenStart = (eAct[0] > 0) ? nTokenOffset + eAct[0] : nLastToken + eAct[0];
                                let nTokenEnd = (eAct[1] > 0) ? nTokenOffset + eAct[1] : nLastToken + eAct[1];
                                if (nTokenEnd - nTokenStart == 0) {
-                                    this.lToken[nTokenStart]["bImmune"] = true;
-                                    let nErrorStart = this.nOffsetWithinParagraph + this.lToken[nTokenStart]["nStart"];
+                                    this.lTokens[nTokenStart]["bImmune"] = true;
+                                    let nErrorStart = this.nOffsetWithinParagraph + this.lTokens[nTokenStart]["nStart"];
                                    if (this.dError.has(nErrorStart)) {
                                        this.dError.delete(nErrorStart);
                                    }
                                } else {
                                    for (let i = nTokenStart; i <= nTokenEnd; i++) {
-                                        this.lToken[i]["bImmune"] = true;
-                                        let nErrorStart = this.nOffsetWithinParagraph + this.lToken[i]["nStart"];
+                                        this.lTokens[i]["bImmune"] = true;
+                                        let nErrorStart = this.nOffsetWithinParagraph + this.lTokens[i]["nStart"];
                                        if (this.dError.has(nErrorStart)) {
                                            this.dError.delete(nErrorStart);
                                        }
                                    }
                                }
@@ -809,24 +814,24 @@
    _createErrorFromTokens (sSugg, nTokenOffset, nLastToken, iFirstToken, nStart, nEnd, sLineId, sRuleId, bCaseSvty, sMsg, sURL, bShowRuleId, sOption, bContext) {
        // suggestions
        let lSugg = [];
        if (sSugg.startsWith("=")) {
-            sSugg = gc_functions[sSugg.slice(1)](this.lToken, nTokenOffset, nLastToken);
-            //sSugg = oEvalFunc[sSugg.slice(1)](this.lToken, nTokenOffset, nLastToken);
+            sSugg = gc_functions[sSugg.slice(1)](this.lTokens, nTokenOffset, nLastToken);
+            //sSugg = oEvalFunc[sSugg.slice(1)](this.lTokens, nTokenOffset, nLastToken);
            lSugg = (sSugg) ? sSugg.split("|") : [];
        } else if (sSugg == "_") {
            lSugg = [];
        } else {
            lSugg = this._expand(sSugg, nTokenOffset, nLastToken).split("|");
        }
-        if (bCaseSvty && lSugg.length > 0 && this.lToken[iFirstToken]["sValue"].slice(0,1).gl_isUpperCase()) {
+        if (bCaseSvty && lSugg.length > 0 && this.lTokens[iFirstToken]["sValue"].slice(0,1).gl_isUpperCase()) {
            lSugg = (this.sSentence.slice(nStart, nEnd).gl_isUpperCase()) ? lSugg.map((s) => s.toUpperCase()) : capitalizeArray(lSugg);
        }
        // Message
-        let sMessage = (sMsg.startsWith("=")) ? gc_functions[sMsg.slice(1)](this.lToken, nTokenOffset, nLastToken) : this._expand(sMsg, nTokenOffset, nLastToken);
-        //let sMessage = (sMsg.startsWith("=")) ? oEvalFunc[sMsg.slice(1)](this.lToken, nTokenOffset, nLastToken) : this._expand(sMsg, nTokenOffset, nLastToken);
+        let sMessage = (sMsg.startsWith("=")) ? gc_functions[sMsg.slice(1)](this.lTokens, nTokenOffset, nLastToken) : this._expand(sMsg, nTokenOffset, nLastToken);
+        //let sMessage = (sMsg.startsWith("=")) ? oEvalFunc[sMsg.slice(1)](this.lTokens, nTokenOffset, nLastToken) : this._expand(sMsg, nTokenOffset, nLastToken);
        if (bShowRuleId) {
            sMessage += " #" + sLineId + " / " + sRuleId;
        }
        //
        return this._createError(nStart, nEnd, sLineId, sRuleId, sOption, sMessage, lSugg, sURL, bContext);
@@ -854,13 +859,13 @@
    _expand (sText, nTokenOffset, nLastToken) {
        let m;
        while ((m = /\\(-?[0-9]+)/.exec(sText)) !== null) {
            if (m[1].slice(0,1) == "-") {
-                sText = sText.replace(m[0], this.lToken[nLastToken+parseInt(m[1],10)+1]["sValue"]);
+                sText = sText.replace(m[0], this.lTokens[nLastToken+parseInt(m[1],10)+1]["sValue"]);
            } else {
-                sText = sText.replace(m[0], this.lToken[nTokenOffset+parseInt(m[1],10)]["sValue"]);
+                sText = sText.replace(m[0], this.lTokens[nTokenOffset+parseInt(m[1],10)]["sValue"]);
            }
        }
        return sText;
    }
@@ -895,45 +900,45 @@
    _tagAndPrepareTokenForRewriting (sWhat, nTokenRewriteStart, nTokenRewriteEnd, nTokenOffset, nLastToken, bCaseSvty, bDebug) {
        // text processor: rewrite tokens between <nTokenRewriteStart> and <nTokenRewriteEnd> position
        if (sWhat === "*") {
            // purge text
            if (nTokenRewriteEnd - nTokenRewriteStart == 0) {
-                this.lToken[nTokenRewriteStart]["bToRemove"] = true;
+                this.lTokens[nTokenRewriteStart]["bToRemove"] = true;
            } else {
                for (let i = nTokenRewriteStart; i <= nTokenRewriteEnd; i++) {
-                    this.lToken[i]["bToRemove"] = true;
+                    this.lTokens[i]["bToRemove"] = true;
                }
            }
        }
        else if (sWhat === "␣") {
            // merge tokens
-            this.lToken[nTokenRewriteStart]["nMergeUntil"] = nTokenRewriteEnd;
+            this.lTokens[nTokenRewriteStart]["nMergeUntil"] = nTokenRewriteEnd;
        }
        else if (sWhat === "_") {
            // neutralized token
            if (nTokenRewriteEnd - nTokenRewriteStart == 0) {
-                this.lToken[nTokenRewriteStart]["sNewValue"] = "_";
+                this.lTokens[nTokenRewriteStart]["sNewValue"] = "_";
            } else {
                for (let i = nTokenRewriteStart; i <= nTokenRewriteEnd; i++) {
-                    this.lToken[i]["sNewValue"] = "_";
+                    this.lTokens[i]["sNewValue"] = "_";
                }
            }
        }
        else {
            if (sWhat.startsWith("=")) {
-                sWhat = gc_functions[sWhat.slice(1)](this.lToken, nTokenOffset, nLastToken);
-                //sWhat = oEvalFunc[sWhat.slice(1)](this.lToken, nTokenOffset, nLastToken);
+                sWhat = gc_functions[sWhat.slice(1)](this.lTokens, nTokenOffset, nLastToken);
+                //sWhat = oEvalFunc[sWhat.slice(1)](this.lTokens, nTokenOffset, nLastToken);
            } else {
                sWhat = this._expand(sWhat, nTokenOffset, nLastToken);
            }
-            let bUppercase = bCaseSvty && this.lToken[nTokenRewriteStart]["sValue"].slice(0,1).gl_isUpperCase();
+            let bUppercase = bCaseSvty && this.lTokens[nTokenRewriteStart]["sValue"].slice(0,1).gl_isUpperCase();
            if (nTokenRewriteEnd - nTokenRewriteStart == 0) {
                // one token
                if (bUppercase) {
                    sWhat = sWhat.gl_toCapitalize();
                }
-                this.lToken[nTokenRewriteStart]["sNewValue"] = sWhat;
+                this.lTokens[nTokenRewriteStart]["sNewValue"] = sWhat;
            }
            else {
                // several tokens
                let lTokenValue = sWhat.split("|");
                if (lTokenValue.length != (nTokenRewriteEnd - nTokenRewriteStart + 1)) {
@@ -944,16 +949,16 @@
                }
                let j = 0;
                for (let i = nTokenRewriteStart; i <= nTokenRewriteEnd; i++) {
                    let sValue = lTokenValue[j];
                    if (!sValue || sValue === "*") {
-                        this.lToken[i]["bToRemove"] = true;
+                        this.lTokens[i]["bToRemove"] = true;
                    } else {
                        if (bUppercase) {
                            sValue = sValue.gl_toCapitalize();
                        }
-                        this.lToken[i]["sNewValue"] = sValue;
+                        this.lTokens[i]["sNewValue"] = sValue;
                    }
                    j++;
                }
            }
        }
@@ -965,19 +970,20 @@
            console.log("REWRITE");
        }
        let lNewToken = [];
        let nMergeUntil = 0;
        let oMergingToken = null;
-        for (let [iToken, oToken] of this.lToken.entries()) {
+        for (let [iToken, oToken] of this.lTokens.entries()) {
            let bKeepToken = true;
            if (oToken["sType"] != "INFO") {
                if (nMergeUntil && iToken <= nMergeUntil) {
                    oMergingToken["sValue"] += " ".repeat(oToken["nStart"] - oMergingToken["nEnd"]) + oToken["sValue"];
                    oMergingToken["nEnd"] = oToken["nEnd"];
                    if (bDebug) {
                        console.log("  MERGED TOKEN: " + oMergingToken["sValue"]);
                    }
+                    oToken["bMerged"] = true;
                    bKeepToken = false;
                }
                if (oToken.hasOwnProperty("nMergeUntil")) {
                    if (iToken > nMergeUntil) {
                        // this token is not already merged with a previous token
                        oMergingToken = oToken;
@@ -1013,12 +1019,12 @@
            }
        }
        if (bDebug) {
            console.log("  TEXT REWRITTEN: " + this.sSentence);
        }
-        this.lToken.length = 0;
-        this.lToken = lNewToken;
+        this.lTokens.length = 0;
+        this.lTokens = lNewToken;
    }
};

if (typeof(exports) !== 'undefined') {
Index: gc_core/py/lang_core/gc_engine.py
==================================================================
--- gc_core/py/lang_core/gc_engine.py
+++ gc_core/py/lang_core/gc_engine.py
@@ -233,11 +233,11 @@
        self.sText = sText
        self.sText0 = sText
        self.sSentence = ""
        self.sSentence0 = ""
        self.nOffsetWithinParagraph = 0
-        self.lToken = []
+        self.lTokens = []
        self.dTokenPos = {}         # {position: token}
        self.dTags = {}             # {position: tags}
        self.dError = {}            # {position: error}
        self.dSentenceError = {}    # {position: error} (for the current sentence only)
        self.dErrorPriority = {}    # {position: priority of the current error}
@@ -244,11 +244,11 @@
    def __str__ (self):
        s = "===== TEXT =====\n"
        s += "sentence: " + self.sSentence0 + "\n"
        s += "now:      " + self.sSentence + "\n"
-        for dToken in self.lToken:
+        for dToken in self.lTokens:
            s += '#{i}\t{nStart}:{nEnd}\t{sValue}\t{sType}'.format(**dToken)
            if "lMorph" in dToken:
                s += "\t" + str(dToken["lMorph"])
            if "aTags" in dToken:
                s += "\t" + str(dToken["aTags"])
@@ -265,10 +265,12 @@
        # parse paragraph
        try:
            self.parseText(self.sText, self.sText0, True, 0, sCountry, dOpt, bShowRuleId, bDebug, bContext)
        except:
            raise
+        self.lTokens = None
+        self.lTokens0 = None
        if bFullInfo:
            lParagraphErrors = list(self.dError.values())
            lSentences = []
            self.dSentenceError.clear()
        # parse sentences
@@ -277,22 +279,25 @@
            if 4 < (iEnd - iStart) < 2000:
                try:
                    self.sSentence = sText[iStart:iEnd]
                    self.sSentence0 = self.sText0[iStart:iEnd]
                    self.nOffsetWithinParagraph = iStart
-                    self.lToken = list(_oTokenizer.genTokens(self.sSentence, True))
-                    self.dTokenPos = { dToken["nStart"]: dToken  for dToken in self.lToken  if dToken["sType"] != "INFO" }
+                    self.lTokens = list(_oTokenizer.genTokens(self.sSentence, True))
+                    self.dTokenPos = { dToken["nStart"]: dToken  for dToken in self.lTokens  if dToken["sType"] != "INFO" }
                    if bFullInfo:
-                        dSentence = { "nStart": iStart, "nEnd": iEnd, "sSentence": self.sSentence, "lToken": list(self.lToken) }
-                        for dToken in dSentence["lToken"]:
-                            if dToken["sType"] == "WORD":
-                                dToken["bValidToken"] = _oSpellChecker.isValidToken(dToken["sValue"])
-                        # the list of tokens is duplicated, to keep all tokens from being deleted when analysis
+                        self.lTokens0 = list(self.lTokens)  # the list of tokens is duplicated, to keep tokens from being deleted during analysis
                    self.parseText(self.sSentence, self.sSentence0, False, iStart, sCountry, dOpt, bShowRuleId, bDebug, bContext)
                    if bFullInfo:
-                        dSentence["lGrammarErrors"] = list(self.dSentenceError.values())
-                        lSentences.append(dSentence)
+                        for dToken in self.lTokens0:
+                            _oSpellChecker.setLabelsOnToken(dToken)
+                        lSentences.append({
+                            "nStart": iStart,
+                            "nEnd": iEnd,
+                            "sSentence": self.sSentence0,
+                            "lTokens": self.lTokens0,
+                            "lGrammarErrors": list(self.dSentenceError.values())
+                        })
                        self.dSentenceError.clear()
                except:
                    raise
        if bFullInfo:
            # Grammar checking and sentence analysis
@@ -380,18 +385,18 @@
        self.sSentence = sText

    def update (self, sSentence, bDebug=False):
        "update <sSentence> and retokenize"
        self.sSentence = sSentence
-        lNewToken = list(_oTokenizer.genTokens(sSentence, True))
-        for dToken in lNewToken:
+        lNewTokens = list(_oTokenizer.genTokens(sSentence, True))
+        for dToken in lNewTokens:
            if "lMorph" in self.dTokenPos.get(dToken["nStart"], {}):
                dToken["lMorph"] = self.dTokenPos[dToken["nStart"]]["lMorph"]
            if "aTags" in self.dTokenPos.get(dToken["nStart"], {}):
                dToken["aTags"] = self.dTokenPos[dToken["nStart"]]["aTags"]
-        self.lToken = lNewToken
-        self.dTokenPos = { dToken["nStart"]: dToken  for dToken in self.lToken  if dToken["sType"] != "INFO" }
+        self.lTokens = lNewTokens
+        self.dTokenPos = { dToken["nStart"]: dToken  for dToken in self.lTokens  if dToken["sType"] != "INFO" }
        if bDebug:
            echo("UPDATE:")
            echo(self)

    def _getNextPointers (self, dToken, dGraph, dPointer, bDebug=False):
@@ -551,11 +556,11 @@
    def parseGraph (self, dGraph, sCountry="${country_default}", dOptions=None, bShowRuleId=False, bDebug=False, bContext=False):
        "parse graph with tokens from the text and execute actions encountered"
        lPointer = []
        bTagAndRewrite = False
-        for iToken, dToken in enumerate(self.lToken):
+        for iToken, dToken in enumerate(self.lTokens):
            if bDebug:
                echo("TOKEN: " + dToken["sValue"])
            # check arcs for each existing pointer
            lNextPointer = []
            for dPointer in lPointer:
@@ -592,20 +597,20 @@
                # Disambiguator [ option, condition, "=", replacement/suggestion/action ]
                # Tag [ option, condition, "/", replacement/suggestion/action, iTokenStart, iTokenEnd ]
                # Immunity [ option, condition, "!", "", iTokenStart, iTokenEnd ]
                # Test [ option, condition, ">", "" ]
                if not sOption or dOptions.get(sOption, False):
-                    bCondMemo = not sFuncCond or getattr(gc_functions, sFuncCond)(self.lToken, nTokenOffset, nLastToken, sCountry, bCondMemo, self.dTags, self.sSentence, self.sSentence0)
+                    bCondMemo = not sFuncCond or getattr(gc_functions, sFuncCond)(self.lTokens, nTokenOffset, nLastToken, sCountry, bCondMemo, self.dTags, self.sSentence, self.sSentence0)
                    if bCondMemo:
                        if cActionType == "-":
                            # grammar error
                            iTokenStart, iTokenEnd, cStartLimit, cEndLimit, bCaseSvty, nPriority, sMessage, iURL = eAct
                            nTokenErrorStart = nTokenOffset + iTokenStart  if iTokenStart > 0  else nLastToken + iTokenStart
-                            if "bImmune" not in self.lToken[nTokenErrorStart]:
+                            if "bImmune" not in self.lTokens[nTokenErrorStart]:
                                nTokenErrorEnd = nTokenOffset + iTokenEnd  if iTokenEnd > 0  else nLastToken + iTokenEnd
-                                nErrorStart = self.nOffsetWithinParagraph + (self.lToken[nTokenErrorStart]["nStart"] if cStartLimit == "<"  else self.lToken[nTokenErrorStart]["nEnd"])
-                                nErrorEnd = self.nOffsetWithinParagraph + (self.lToken[nTokenErrorEnd]["nEnd"] if cEndLimit == ">"  else self.lToken[nTokenErrorEnd]["nStart"])
+                                nErrorStart = self.nOffsetWithinParagraph + (self.lTokens[nTokenErrorStart]["nStart"] if cStartLimit == "<"  else self.lTokens[nTokenErrorStart]["nEnd"])
+                                nErrorEnd = self.nOffsetWithinParagraph + (self.lTokens[nTokenErrorEnd]["nEnd"] if cEndLimit == ">"  else self.lTokens[nTokenErrorEnd]["nStart"])
                                if nErrorStart not in self.dError or nPriority > self.dErrorPriority.get(nErrorStart, -1):
                                    self.dError[nErrorStart] = self._createErrorFromTokens(sWhat, nTokenOffset, nLastToken, nTokenErrorStart, nErrorStart, nErrorEnd, sLineId, sRuleId, bCaseSvty, \
                                                                                           sMessage, _rules_graph.dURL.get(iURL, ""), bShowRuleId, sOption, bContext)
                                    self.dErrorPriority[nErrorStart] = nPriority
                                    self.dSentenceError[nErrorStart] = self.dError[nErrorStart]
@@ -616,31 +621,31 @@
                            nTokenStart = nTokenOffset + eAct[0]  if eAct[0] > 0  else nLastToken + eAct[0]
                            nTokenEnd = nTokenOffset + eAct[1]  if eAct[1] > 0  else nLastToken + eAct[1]
                            self._tagAndPrepareTokenForRewriting(sWhat, nTokenStart, nTokenEnd, nTokenOffset, nLastToken, eAct[2], bDebug)
                            bChange = True
                            if bDebug:
-                                echo("    TEXT_PROCESSOR: [{}:{}]  > {}".format(self.lToken[nTokenStart]["sValue"], self.lToken[nTokenEnd]["sValue"], sWhat))
+                                echo("    TEXT_PROCESSOR: [{}:{}]  > {}".format(self.lTokens[nTokenStart]["sValue"], self.lTokens[nTokenEnd]["sValue"], sWhat))
                        elif cActionType == "=":
                            # disambiguation
-                            getattr(gc_functions, sWhat)(self.lToken, nTokenOffset, nLastToken)
+                            getattr(gc_functions, sWhat)(self.lTokens, nTokenOffset, nLastToken)
                            if bDebug:
-                                echo("    DISAMBIGUATOR: ({})  [{}:{}]".format(sWhat, self.lToken[nTokenOffset+1]["sValue"], self.lToken[nLastToken]["sValue"]))
+                                echo("    DISAMBIGUATOR: ({})  [{}:{}]".format(sWhat, self.lTokens[nTokenOffset+1]["sValue"], self.lTokens[nLastToken]["sValue"]))
                        elif cActionType == ">":
                            # we do nothing, this test is just a condition to apply all following actions
                            if bDebug:
                                echo("    COND_OK")
                        elif cActionType == "/":
                            # Tag
                            nTokenStart = nTokenOffset + eAct[0]  if eAct[0] > 0  else nLastToken + eAct[0]
                            nTokenEnd = nTokenOffset + eAct[1]  if eAct[1] > 0  else nLastToken + eAct[1]
                            for i in range(nTokenStart, nTokenEnd+1):
-                                if "aTags" in self.lToken[i]:
-                                    self.lToken[i]["aTags"].update(sWhat.split("|"))
+                                if "aTags" in self.lTokens[i]:
+                                    self.lTokens[i]["aTags"].update(sWhat.split("|"))
                                else:
-                                    self.lToken[i]["aTags"] = set(sWhat.split("|"))
+                                    self.lTokens[i]["aTags"] = set(sWhat.split("|"))
                            if bDebug:
-                                echo("    TAG: {} >  [{}:{}]".format(sWhat, self.lToken[nTokenStart]["sValue"], self.lToken[nTokenEnd]["sValue"]))
+                                echo("    TAG: {} >  [{}:{}]".format(sWhat, self.lTokens[nTokenStart]["sValue"], self.lTokens[nTokenEnd]["sValue"]))
                            for sTag in sWhat.split("|"):
                                if sTag not in self.dTags:
                                    self.dTags[sTag] = [nTokenStart, nTokenEnd]
                                else:
                                    self.dTags[sTag][0] = min(nTokenStart, self.dTags[sTag][0])
@@ -650,18 +655,18 @@
                            if bDebug:
                                echo("    IMMUNITY: " + sLineId + " / " + sRuleId)
                            nTokenStart = nTokenOffset + eAct[0]  if eAct[0] > 0  else nLastToken + eAct[0]
                            nTokenEnd = nTokenOffset + eAct[1]  if eAct[1] > 0  else nLastToken + eAct[1]
                            if nTokenEnd - nTokenStart == 0:
-                                self.lToken[nTokenStart]["bImmune"] = True
-                                nErrorStart = self.nOffsetWithinParagraph + self.lToken[nTokenStart]["nStart"]
+                                self.lTokens[nTokenStart]["bImmune"] = True
+                                nErrorStart = self.nOffsetWithinParagraph + self.lTokens[nTokenStart]["nStart"]
                                if nErrorStart in self.dError:
                                    del self.dError[nErrorStart]
                            else:
                                for i in range(nTokenStart, nTokenEnd+1):
-                                    self.lToken[i]["bImmune"] = True
-                                    nErrorStart = self.nOffsetWithinParagraph + self.lToken[i]["nStart"]
+                                    self.lTokens[i]["bImmune"] = True
+                                    nErrorStart = self.nOffsetWithinParagraph + self.lTokens[i]["nStart"]
                                    if nErrorStart in self.dError:
                                        del self.dError[nErrorStart]
                        else:
                            echo("# error: unknown action at " + sLineId)
                    elif cActionType == ">":
@@ -695,20 +700,20 @@
            return self._createErrorAsDict(nStart, nEnd, sLineId, sRuleId, sOption, sMessage, lSugg, sURL, bContext)

    def _createErrorFromTokens (self, sSugg, nTokenOffset, nLastToken, iFirstToken, nStart, nEnd, sLineId, sRuleId, bCaseSvty, sMsg, sURL, bShowRuleId, sOption, bContext):
        # suggestions
        if sSugg[0:1] == "=":
-            sSugg = getattr(gc_functions, sSugg[1:])(self.lToken, nTokenOffset, nLastToken)
+            sSugg = getattr(gc_functions, sSugg[1:])(self.lTokens, nTokenOffset, nLastToken)
            lSugg = sSugg.split("|")  if sSugg  else []
        elif sSugg == "_":
            lSugg = []
        else:
            lSugg = self._expand(sSugg, nTokenOffset, nLastToken).split("|")
-        if bCaseSvty and lSugg and self.lToken[iFirstToken]["sValue"][0:1].isupper():
+        if bCaseSvty and lSugg and self.lTokens[iFirstToken]["sValue"][0:1].isupper():
            lSugg = list(map(lambda s: s.upper(), lSugg))  if self.sSentence[nStart:nEnd].isupper()  else list(map(lambda s: s[0:1].upper()+s[1:], lSugg))
        # Message
-        sMessage = getattr(gc_functions, sMsg[1:])(self.lToken, nTokenOffset, nLastToken)  if sMsg[0:1] == "="  else self._expand(sMsg, nTokenOffset, nLastToken)
+        sMessage = getattr(gc_functions, sMsg[1:])(self.lTokens, nTokenOffset, nLastToken)  if sMsg[0:1] == "="  else self._expand(sMsg, nTokenOffset, nLastToken)
        if bShowRuleId:
            sMessage += "  #" + sLineId + " / " + sRuleId
        #
        if _bWriterError:
            return self._createErrorForWriter(nStart, nEnd - nStart, sRuleId, sOption, sMessage, lSugg, sURL)
@@ -753,13 +758,13 @@
        return dErr

    def _expand (self, sText, nTokenOffset, nLastToken):
        for m in re.finditer(r"\\(-?[0-9]+)", sText):
            if m.group(1)[0:1] == "-":
-                sText = sText.replace(m.group(0), self.lToken[nLastToken+int(m.group(1))+1]["sValue"])
+                sText = sText.replace(m.group(0), self.lTokens[nLastToken+int(m.group(1))+1]["sValue"])
            else:
-                sText = sText.replace(m.group(0), self.lToken[nTokenOffset+int(m.group(1))]["sValue"])
+                sText = sText.replace(m.group(0), self.lTokens[nTokenOffset+int(m.group(1))]["sValue"])
        return sText

    def rewriteText (self, sText, sRepl, iGroup, m, bUppercase):
        "text processor: write <sRepl> in <sText> at <iGroup> position"
        nLen = m.end(iGroup) - m.start(iGroup)
@@ -782,80 +787,85 @@

    def _tagAndPrepareTokenForRewriting (self, sWhat, nTokenRewriteStart, nTokenRewriteEnd, nTokenOffset, nLastToken, bCaseSvty, bDebug):
        "text processor: rewrite tokens between <nTokenRewriteStart> and <nTokenRewriteEnd> position"
        if sWhat == "*":
            # purge text
            if nTokenRewriteEnd - nTokenRewriteStart == 0:
-                self.lToken[nTokenRewriteStart]["bToRemove"] = True
+                self.lTokens[nTokenRewriteStart]["bToRemove"] = True
            else:
                for i in range(nTokenRewriteStart, nTokenRewriteEnd+1):
-                    self.lToken[i]["bToRemove"] = True
+                    self.lTokens[i]["bToRemove"] = True
        elif sWhat == "␣":
            # merge tokens
-            self.lToken[nTokenRewriteStart]["nMergeUntil"] = nTokenRewriteEnd
+            self.lTokens[nTokenRewriteStart]["nMergeUntil"] = nTokenRewriteEnd
        elif sWhat == "_":
            # neutralized token
            if nTokenRewriteEnd - nTokenRewriteStart == 0:
-                self.lToken[nTokenRewriteStart]["sNewValue"] = "_"
+                self.lTokens[nTokenRewriteStart]["sNewValue"] = "_"
            else:
                for i in range(nTokenRewriteStart, nTokenRewriteEnd+1):
-                    self.lToken[i]["sNewValue"] = "_"
+                    self.lTokens[i]["sNewValue"] = "_"
        else:
            if sWhat.startswith("="):
-                sWhat = getattr(gc_functions, sWhat[1:])(self.lToken, nTokenOffset, nLastToken)
+                sWhat = getattr(gc_functions, sWhat[1:])(self.lTokens, nTokenOffset, nLastToken)
            else:
                sWhat = self._expand(sWhat, nTokenOffset, nLastToken)
-            bUppercase = bCaseSvty and self.lToken[nTokenRewriteStart]["sValue"][0:1].isupper()
+            bUppercase = bCaseSvty and self.lTokens[nTokenRewriteStart]["sValue"][0:1].isupper()
            if nTokenRewriteEnd - nTokenRewriteStart == 0:
                # one token
                if bUppercase:
                    sWhat = sWhat[0:1].upper() + sWhat[1:]
-                self.lToken[nTokenRewriteStart]["sNewValue"] = sWhat
+                self.lTokens[nTokenRewriteStart]["sNewValue"] = sWhat
            else:
                # several tokens
                lTokenValue = sWhat.split("|")
                if len(lTokenValue) != (nTokenRewriteEnd - nTokenRewriteStart + 1):
                    if (bDebug):
                        echo("Error. Text processor: number of replacements != number of tokens.")
                    return
                for i, sValue in zip(range(nTokenRewriteStart, nTokenRewriteEnd+1), lTokenValue):
                    if not sValue or sValue == "*":
-                        self.lToken[i]["bToRemove"] = True
+                        self.lTokens[i]["bToRemove"] = True
                    else:
                        if bUppercase:
                            sValue = sValue[0:1].upper() + sValue[1:]
-                        self.lToken[i]["sNewValue"] = sValue
+                        self.lTokens[i]["sNewValue"] = sValue

    def rewriteFromTags (self, bDebug=False):
        "rewrite the sentence, modify tokens, purge the token list"
        if bDebug:
            echo("REWRITE")
-        lNewToken = []
+        lNewTokens = []
+        lNewTokens0 = []
        nMergeUntil = 0
        dTokenMerger = {}
-        for iToken, dToken in enumerate(self.lToken):
+        for iToken, dToken in enumerate(self.lTokens):
            bKeepToken = True
            if dToken["sType"] != "INFO":
                if nMergeUntil and iToken <= nMergeUntil:
+                    # token to merge
                    dTokenMerger["sValue"] += " " * (dToken["nStart"] - dTokenMerger["nEnd"]) + dToken["sValue"]
                    dTokenMerger["nEnd"] = dToken["nEnd"]
                    if bDebug:
                        echo("  MERGED TOKEN: " + dTokenMerger["sValue"])
+                    dToken["bMerged"] = True
                    bKeepToken = False
                if "nMergeUntil" in dToken:
-                    if iToken > nMergeUntil: # this token is not already merged with a previous token
+                    # first token to be merged with
+                    if iToken > nMergeUntil: # this token is not to be merged with a previous token
                        dTokenMerger = dToken
                    if dToken["nMergeUntil"] > nMergeUntil:
                        nMergeUntil = dToken["nMergeUntil"]
                    del dToken["nMergeUntil"]
                elif "bToRemove" in dToken:
+                    # deletion required
                    if bDebug:
                        echo("  REMOVED: " + dToken["sValue"])
                    self.sSentence = self.sSentence[:dToken["nStart"]] + " " * (dToken["nEnd"] - dToken["nStart"]) + self.sSentence[dToken["nEnd"]:]
                    bKeepToken = False
            #
            if bKeepToken:
-                lNewToken.append(dToken)
+                lNewTokens.append(dToken)
                if "sNewValue" in dToken:
                    # rewrite token and sentence
                    if bDebug:
                        echo(dToken["sValue"] + " -> " + dToken["sNewValue"])
                    dToken["sRealValue"] = dToken["sValue"]
@@ -864,7 +874,7 @@
                    sNewRepl = (dToken["sNewValue"] + " " * nDiffLen)  if nDiffLen >= 0  else dToken["sNewValue"][:len(dToken["sRealValue"])]
                    self.sSentence = self.sSentence[:dToken["nStart"]] + sNewRepl + self.sSentence[dToken["nEnd"]:]
                    del dToken["sNewValue"]
        if bDebug:
            echo("  TEXT REWRITTEN: " + self.sSentence)
-        self.lToken.clear()
-        self.lToken = lNewToken
+        self.lTokens.clear()
+        self.lTokens = lNewTokens
+420,22 @@ if (oInfo.sDestination == "__GrammalectePanel__") { oGrammalecte.oGCPanel.refreshParagraph(oInfo.sParagraphId, result); } break; case "parseFull": - // TODO + if (oInfo.sDestination == "__GrammalectePanel__") { + oGrammalecte.oGCPanel.showParagraphAnalysis(result); + } break; case "getListOfTokens": - if (!bEnd) { - oGrammalecte.oGCPanel.addListOfTokens(result); - } else { - oGrammalecte.oGCPanel.stopWaitIcon(); - oGrammalecte.oGCPanel.endTimer(); + if (oInfo.sDestination == "__GrammalectePanel__") { + if (!bEnd) { + oGrammalecte.oGCPanel.addListOfTokens(result); + } else { + oGrammalecte.oGCPanel.stopWaitIcon(); + oGrammalecte.oGCPanel.endTimer(); + } } break; case "getSpellSuggestions": if (oInfo.sDestination == "__GrammalectePanel__") { oGrammalecte.oGCPanel.oTooltip.setSpellSuggestionsFor(result.sWord, result.aSugg, result.iSuggBlock, oInfo.sErrorId); Index: gc_lang/fr/webext/content_scripts/panel_gc.css ================================================================== --- gc_lang/fr/webext/content_scripts/panel_gc.css +++ gc_lang/fr/webext/content_scripts/panel_gc.css @@ -9,25 +9,23 @@ overflow: auto; } div.grammalecte_paragraph_block { margin: 5px 5px 0 5px; + background-color: hsl(0, 0%, 96%); + border-radius: 2px; } - p.grammalecte_paragraph { margin: 0; padding: 12px; - background-color: hsl(0, 0%, 96%); - border-radius: 2px; line-height: 1.3; text-align: left; font-size: 14px; font-family: "Courier New", Courier, "Lucida Sans Typewriter", "Lucida Typewriter", monospace; color: hsl(0, 0%, 0%); hyphens: none; } - /* Action buttons */ div.grammalecte_paragraph_actions { @@ -45,10 +43,17 @@ font-size: 14px; color: hsl(0, 0%, 96%); border-radius: 2px; } +div.grammalecte_paragraph_actions .grammalecte_blue { + color: hsl(0, 0%, 80%); +} +div.grammalecte_paragraph_actions .grammalecte_blue:hover { + background-color: hsl(210, 50%, 40%); + color: hsl(0, 0%, 100%); +} div.grammalecte_paragraph_actions .grammalecte_green { color: hsl(0, 0%, 80%); } div.grammalecte_paragraph_actions .grammalecte_green:hover { background-color: hsl(120, 50%, 40%); Index: gc_lang/fr/webext/content_scripts/panel_gc.js ================================================================== --- gc_lang/fr/webext/content_scripts/panel_gc.js +++ gc_lang/fr/webext/content_scripts/panel_gc.js @@ -3,19 +3,22 @@ /* jshint esversion:6, -W097 */ /* jslint esversion:6 */ /* global GrammalectePanel, oGrammalecte, oGrammalecteBackgroundPort, showError, window, document, console */ "use strict"; + function onGrammalecteGCPanelClick (xEvent) { try { let xElem = xEvent.target; if (xElem.id) { if (xElem.id.startsWith("grammalecte_sugg")) { oGrammalecte.oGCPanel.applySuggestion(xElem.id); } else if (xElem.id === "grammalecte_tooltip_ignore") { oGrammalecte.oGCPanel.ignoreError(xElem.id); + } else if (xElem.id.startsWith("grammalecte_analysis")) { + oGrammalecte.oGCPanel.sendParagraphToGrammaticalAnalysis(parseInt(xElem.dataset.para_num, 10)); } else if (xElem.id.startsWith("grammalecte_check")) { oGrammalecte.oGCPanel.recheckParagraph(parseInt(xElem.dataset.para_num, 10)); } else if (xElem.id.startsWith("grammalecte_hide")) { xElem.parentNode.parentNode.style.display = "none"; } else if (xElem.id.startsWith("grammalecte_err") @@ -64,10 +67,22 @@ this.iLastEditedParagraph = -1; this.nParagraph = 0; // Lexicographer this.nLxgCount = 0; this.xLxgPanelContent = oGrammalecte.createNode("div", {id: "grammalecte_lxg_panel_content"}); + this.xLxgInputBlock = oGrammalecte.createNode("div", {id: "grammalecte_lxg_input_block"}); + 
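[Annotation, not part of the patch: the new message routing sketched end-to-end. The panel now passes a destination, and the listener above only dispatches results whose oInfo.sDestination matches, so several consumers of the same worker no longer intercept each other's answers. The sample text is illustrative.]

    // Sketch: assumes this runs in the content script, where oGrammalecteBackgroundPort exists.
    oGrammalecteBackgroundPort.parseFull("Quand nous partîmes, il pleuvait.", "__GrammalectePanel__");
    // The worker answers with { lParagraphErrors, lSentences }; because
    // oInfo.sDestination == "__GrammalectePanel__", the listener above calls:
    //     oGrammalecte.oGCPanel.showParagraphAnalysis(result);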
Index: gc_lang/fr/webext/content_scripts/panel_gc.css
==================================================================
--- gc_lang/fr/webext/content_scripts/panel_gc.css
+++ gc_lang/fr/webext/content_scripts/panel_gc.css
@@ -9,25 +9,23 @@
    overflow: auto;
}

div.grammalecte_paragraph_block {
    margin: 5px 5px 0 5px;
+    background-color: hsl(0, 0%, 96%);
+    border-radius: 2px;
}
-
p.grammalecte_paragraph {
    margin: 0;
    padding: 12px;
-    background-color: hsl(0, 0%, 96%);
-    border-radius: 2px;
    line-height: 1.3;
    text-align: left;
    font-size: 14px;
    font-family: "Courier New", Courier, "Lucida Sans Typewriter", "Lucida Typewriter", monospace;
    color: hsl(0, 0%, 0%);
    hyphens: none;
}
-

/* Action buttons */

div.grammalecte_paragraph_actions {
@@ -45,10 +43,17 @@
    font-size: 14px;
    color: hsl(0, 0%, 96%);
    border-radius: 2px;
}

+div.grammalecte_paragraph_actions .grammalecte_blue {
+    color: hsl(0, 0%, 80%);
+}
+div.grammalecte_paragraph_actions .grammalecte_blue:hover {
+    background-color: hsl(210, 50%, 40%);
+    color: hsl(0, 0%, 100%);
+}

div.grammalecte_paragraph_actions .grammalecte_green {
    color: hsl(0, 0%, 80%);
}

div.grammalecte_paragraph_actions .grammalecte_green:hover {
    background-color: hsl(120, 50%, 40%);

Index: gc_lang/fr/webext/content_scripts/panel_gc.js
==================================================================
--- gc_lang/fr/webext/content_scripts/panel_gc.js
+++ gc_lang/fr/webext/content_scripts/panel_gc.js
@@ -3,19 +3,22 @@
/* jshint esversion:6, -W097 */
/* jslint esversion:6 */
/* global GrammalectePanel, oGrammalecte, oGrammalecteBackgroundPort, showError, window, document, console */

"use strict";

+
function onGrammalecteGCPanelClick (xEvent) {
    try {
        let xElem = xEvent.target;
        if (xElem.id) {
            if (xElem.id.startsWith("grammalecte_sugg")) {
                oGrammalecte.oGCPanel.applySuggestion(xElem.id);
            } else if (xElem.id === "grammalecte_tooltip_ignore") {
                oGrammalecte.oGCPanel.ignoreError(xElem.id);
+            } else if (xElem.id.startsWith("grammalecte_analysis")) {
+                oGrammalecte.oGCPanel.sendParagraphToGrammaticalAnalysis(parseInt(xElem.dataset.para_num, 10));
            } else if (xElem.id.startsWith("grammalecte_check")) {
                oGrammalecte.oGCPanel.recheckParagraph(parseInt(xElem.dataset.para_num, 10));
            } else if (xElem.id.startsWith("grammalecte_hide")) {
                xElem.parentNode.parentNode.style.display = "none";
            } else if (xElem.id.startsWith("grammalecte_err")
@@ -64,10 +67,22 @@
        this.iLastEditedParagraph = -1;
        this.nParagraph = 0;
        // Lexicographer
        this.nLxgCount = 0;
        this.xLxgPanelContent = oGrammalecte.createNode("div", {id: "grammalecte_lxg_panel_content"});
+        this.xLxgInputBlock = oGrammalecte.createNode("div", {id: "grammalecte_lxg_input_block"});
+        this.xLxgInput = oGrammalecte.createNode("div", {id: "grammalecte_lxg_input", lang: "fr", contentEditable: "true"});
+        this.xLxgInputButton = oGrammalecte.createNode("div", {id: "grammalecte_lxg_input_button", textContent: "Analyse grammaticale"});
+        this.xLxgInputButton.addEventListener("click", () => { this.grammaticalAnalysis(); }, false);
+        this.xLxgInputButton2 = oGrammalecte.createNode("div", {id: "grammalecte_lxg_input_button", textContent: "Analyse lexicale"});
+        this.xLxgInputButton2.addEventListener("click", () => { this.getListOfTokens(); }, false);
+        this.xLxgInputBlock.appendChild(this.xLxgInput);
+        this.xLxgInputBlock.appendChild(this.xLxgInputButton);
+        this.xLxgInputBlock.appendChild(this.xLxgInputButton2);
+        this.xLxgPanelContent.appendChild(this.xLxgInputBlock);
+        this.xLxgResultZone = oGrammalecte.createNode("div", {id: "grammalecte_lxg_result_zone"});
+        this.xLxgPanelContent.appendChild(this.xLxgResultZone);
        this.xPanelContent.appendChild(this.xLxgPanelContent);
        // Conjugueur
        this.xConjPanelContent = oGrammalecte.createNode("div", {id: "grammalecte_conj_panel_content"});
        this.xConjPanelContent.innerHTML = sGrammalecteConjugueurHTML;  // @Reviewers: sGrammalecteConjugueurHTML is a const value defined in
        this.xPanelContent.appendChild(this.xConjPanelContent);
@@ -112,14 +127,10 @@
            this.setAutoRefreshButton();
        }
        this.xLxgButton.onclick = () => {
            if (!this.bWorking) {
                this.showLexicographer();
-                this.clearLexicographer();
-                this.startWaitIcon();
-                oGrammalecteBackgroundPort.getListOfTokens(this.oTextControl.getText());
-                //oGrammalecteBackgroundPort.parseFull(this.oTextControl.getText())
            }
        };
        this.xConjButton.onclick = () => {
            if (!this.bWorking) {
                this.showConjugueur();
@@ -238,14 +249,14 @@

    addParagraphResult (oResult) {
        try {
            this.resetTimer();
            if (oResult && (oResult.sParagraph.trim() !== "" || oResult.aGrammErr.length > 0 || oResult.aSpellErr.length > 0)) {
-                let xNodeDiv = oGrammalecte.createNode("div", {className: "grammalecte_paragraph_block"});
                // actions
                let xActionsBar = oGrammalecte.createNode("div", {className: "grammalecte_paragraph_actions"});
                xActionsBar.appendChild(oGrammalecte.createNode("div", {id: "grammalecte_check" + oResult.iParaNum, className: "grammalecte_paragraph_button grammalecte_green", textContent: "↻", title: "Réanalyser…"}, {para_num: oResult.iParaNum}));
+                xActionsBar.appendChild(oGrammalecte.createNode("div", {id: "grammalecte_analysis" + oResult.iParaNum, className: "grammalecte_paragraph_button grammalecte_blue", textContent: "»", title: "Analyse grammaticale…"}, {para_num: oResult.iParaNum}));
                xActionsBar.appendChild(oGrammalecte.createNode("div", {id: "grammalecte_hide" + oResult.iParaNum, className: "grammalecte_paragraph_button grammalecte_red", textContent: "×", title: "Cacher", style: "font-weight: bold;"}));
                // paragraph
                let xParagraph = oGrammalecte.createNode("p", {id: "grammalecte_paragraph"+oResult.iParaNum, className: "grammalecte_paragraph", lang: "fr", contentEditable: "true"}, {para_num: oResult.iParaNum});
                xParagraph.setAttribute("spellcheck", "false"); // doesn’t seem possible to use “spellcheck” as a common attribute.
                xParagraph.dataset.timer_id = "0";
@@ -260,13 +271,14 @@
                    this.oTextControl.setParagraph(parseInt(xEvent.target.dataset.para_num, 10), xEvent.target.textContent);
                }.bind(this)
                , true);
                this._tagParagraph(xParagraph, oResult.sParagraph, oResult.iParaNum, oResult.aGrammErr, oResult.aSpellErr);
                // creation
-                xNodeDiv.appendChild(xActionsBar);
-                xNodeDiv.appendChild(xParagraph);
-                this.xParagraphList.appendChild(xNodeDiv);
+                let xParagraphBlock = oGrammalecte.createNode("div", {className: "grammalecte_paragraph_block"});
+                xParagraphBlock.appendChild(xActionsBar);
+                xParagraphBlock.appendChild(xParagraph);
+                this.xParagraphList.appendChild(xParagraphBlock);
                this.nParagraph += 1;
            }
        }
        catch (e) {
            showError(e);
@@ -530,86 +542,144 @@

    // Lexicographer

    clearLexicographer () {
        this.nLxgCount = 0;
-        while (this.xLxgPanelContent.firstChild) {
-            this.xLxgPanelContent.removeChild(this.xLxgPanelContent.firstChild);
-        }
-    }
-
-    addLxgSeparator (sText) {
-        if (this.xLxgPanelContent.textContent !== "") {
-            this.xLxgPanelContent.appendChild(oGrammalecte.createNode("div", {className: "grammalecte_lxg_separator", textContent: sText}));
-        }
-    }
-
-    addMessageToLxgPanel (sMessage) {
-        let xNode = oGrammalecte.createNode("div", {className: "grammalecte_panel_flow_message", textContent: sMessage});
-        this.xLxgPanelContent.appendChild(xNode);
-    }
-
-    addListOfTokens (lToken) {
+        while (this.xLxgResultZone.firstChild) {
+            this.xLxgResultZone.removeChild(this.xLxgResultZone.firstChild);
+        }
+    }
+
+    // Grammatical analysis
+
+    sendParagraphToGrammaticalAnalysis (iParaNum) {
+        let xParagraph = this.xParent.getElementById("grammalecte_paragraph" + iParaNum);
+        this.xLxgInput.textContent = xParagraph.textContent;
+        this.grammaticalAnalysis();
+        this.showLexicographer();
+    }
+
+    grammaticalAnalysis (iParaNum) {
+        if (!this.bOpened || this.bWorking) {
+            return;
+        }
+        this.startWaitIcon();
+        this.clearLexicographer();
+        let sText = this.xLxgInput.innerText.replace(/\n/g, " ");
+        console.log(sText);
+        oGrammalecteBackgroundPort.parseFull(sText, "__GrammalectePanel__");
+    }
+
+    showParagraphAnalysis (oResult) {
+        if (!this.bOpened || oResult === null) {
+            return;
+        }
+        try {
+            for (let oSentence of oResult.lSentences) {
+                this.nLxgCount += 1;
+                if (oSentence.sSentence.trim() !== "") {
+                    let xSentenceBlock = oGrammalecte.createNode("div", {className: "grammalecte_lxg_paragraph_sentence_block"});
+                    xSentenceBlock.appendChild(oGrammalecte.createNode("div", {className: "grammalecte_lxg_list_num", textContent: this.nLxgCount}));
+                    xSentenceBlock.appendChild(oGrammalecte.createNode("p", {className: "grammalecte_lxg_paragraph_sentence", textContent: oSentence.sSentence}));
+                    let xTokenList = oGrammalecte.createNode("div", {className: "grammalecte_lxg_list_of_tokens"});
+                    for (let oToken of oSentence.lTokens) {
+                        if (oToken["sType"] != "INFO" && !oToken.hasOwnProperty("bMerged")) {
+                            if (oToken["sType"] == "WORD" && !oToken["bValidToken"]) {
+                                oToken["sType"] = "UNKNOWN_WORD";
+                            }
+                            xTokenList.appendChild(this._createTokenBlock(oToken));
+                        }
+                    }
+                    xSentenceBlock.appendChild(xTokenList);
+                    this.xLxgResultZone.appendChild(xSentenceBlock);
+                }
+            }
+        }
+        catch (e) {
+            showError(e);
+        }
+        this.stopWaitIcon();
+    }
+
+
+    // Lexical analysis
+
+    getListOfTokens () {
+        if (!this.bOpened || this.bWorking) {
+            return;
+        }
+        this.startWaitIcon();
+        this.clearLexicographer();
+        let sText = this.xLxgInput.innerText; // to get carriage return (\n)
+        console.log(sText);
+        oGrammalecteBackgroundPort.getListOfTokens(sText, "__GrammalectePanel__");
+    }
+
+    addListOfTokens (oResult) {
        try {
-            if (lToken) {
+            if (oResult && oResult.sParagraph != "") {
                this.nLxgCount += 1;
+                let xSentenceBlock = oGrammalecte.createNode("div", {className: "grammalecte_lxg_paragraph_sentence_block"});
+                xSentenceBlock.appendChild(oGrammalecte.createNode("div", {className: "grammalecte_lxg_list_num", textContent: this.nLxgCount}));
+                xSentenceBlock.appendChild(oGrammalecte.createNode("p", {className: "grammalecte_lxg_paragraph_sentence", textContent: oResult.sParagraph}));
                let xTokenList = oGrammalecte.createNode("div", {className: "grammalecte_lxg_list_of_tokens"});
-                xTokenList.appendChild(oGrammalecte.createNode("div", {className: "grammalecte_lxg_list_num", textContent: this.nLxgCount}));
-                for (let oToken of lToken) {
+                for (let oToken of oResult.lTokens) {
                    xTokenList.appendChild(this._createTokenBlock(oToken));
                }
-                this.xLxgPanelContent.appendChild(xTokenList);
+                xSentenceBlock.appendChild(xTokenList);
+                this.xLxgResultZone.appendChild(xSentenceBlock);
            }
        }
        catch (e) {
            showError(e);
        }
    }

    _createTokenBlock (oToken) {
        let xTokenBlock = oGrammalecte.createNode("div", {className: "grammalecte_lxg_token_block"});
+        // token description
        xTokenBlock.appendChild(this._createTokenDescr(oToken));
-        if (oToken.aSubElem) {
+        // subtokens
+        if (oToken.hasOwnProperty("lSubTokens")) {
            let xSubBlock = oGrammalecte.createNode("div", {className: "grammalecte_lxg_token_subblock"});
-            for (let oSubElem of oToken.aSubElem) {
-                xSubBlock.appendChild(this._createTokenDescr(oSubElem));
+            for (let oSubToken of oToken["lSubTokens"]) {
+                if (oSubToken["sValue"] != "") {
+                    xSubBlock.appendChild(this._createTokenDescr(oSubToken));
+                }
            }
            xTokenBlock.appendChild(xSubBlock);
        }
        return xTokenBlock;
    }

    _createTokenDescr (oToken) {
        try {
            let xTokenDescr = oGrammalecte.createNode("div", {className: "grammalecte_lxg_token_descr"});
-            if (oToken.sType == "LOCP") {
-                xTokenDescr.appendChild(oGrammalecte.createNode("div", {className: "grammalecte_lxg_token_also", textContent: "possiblement › "}));
-            }
            xTokenDescr.appendChild(oGrammalecte.createNode("div", {className: "grammalecte_lxg_token grammalecte_lxg_token_" + oToken.sType, textContent: oToken.sValue}));
            xTokenDescr.appendChild(oGrammalecte.createNode("div", {className: "grammalecte_lxg_token_colon", textContent: ":"}));
-            if (oToken.aLabel.length === 1) {
-                xTokenDescr.appendChild(oGrammalecte.createNode("div", {className: "grammalecte_lxg_morph_elem_inline", textContent: oToken.aLabel[0]}));
+            if (oToken.aLabels) {
+                if (oToken.aLabels.length < 2) {
+                    // one morphology only
+                    xTokenDescr.appendChild(oGrammalecte.createNode("div", {className: "grammalecte_lxg_morph_elem_inline", textContent: oToken.aLabels[0]}));
+                } else {
+                    // several morphologies
+                    let xMorphList = oGrammalecte.createNode("div", {className: "grammalecte_lxg_morph_list"});
+                    for (let sLabel of oToken.aLabels) {
+                        xMorphList.appendChild(oGrammalecte.createNode("div", {className: "grammalecte_lxg_morph_elem", textContent: "• " + sLabel}));
+                    }
+                    xTokenDescr.appendChild(xMorphList);
+                }
            } else {
-                let xMorphList = oGrammalecte.createNode("div", {className: "grammalecte_lxg_morph_list"});
-                for (let sLabel of oToken.aLabel) {
-                    xMorphList.appendChild(oGrammalecte.createNode("div", {className: "grammalecte_lxg_morph_elem", textContent: "• " + sLabel}));
-                }
-                xTokenDescr.appendChild(xMorphList);
+                xTokenDescr.appendChild(oGrammalecte.createNode("div", {className: "grammalecte_lxg_morph_elem_inline", textContent: "étiquettes non décrites : [" + oToken.lMorph + "]" }));
            }
            return xTokenDescr;
        }
        catch (e) {
            showError(e);
        }
    }

-    setHidden (sClass, bHidden) {
-        let xPanelContent = this.xParent.getElementById('grammalecte_panel_content');
-        for (let xNode of xPanelContent.getElementsByClassName(sClass)) {
-            xNode.hidden = bHidden;
-        }
-    }

    // Conjugueur

    listenConj () {
        if (!this.bListenConj) {
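[Annotation, not part of the patch: the token shape that _createTokenBlock() and _createTokenDescr() now expect, reconstructed from the fields read above. The values are illustrative, not taken from real output.]

    // Sketch of an input token (hypothetical values, field names from the code above).
    let oToken = {
        "sType": "WORD",
        "sValue": "montre-le-moi",
        "aLabels": ["verbe, impératif, 2ᵉ pers. sing."],   // set by oSpellChecker.setLabelsOnToken()
        "lSubTokens": [                                     // only present for split compound forms
            { "sType": "WORD", "sValue": "montre", "aLabels": ["verbe (1ᵉʳ gr.), impératif"] },
            { "sType": "WORD", "sValue": "-le-moi", "aLabels": ["COD, masc. sing. + COI (à moi), sing."] }
        ]
    };
    // When aLabels is missing, _createTokenDescr() falls back to printing
    // oToken.lMorph as “étiquettes non décrites”.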
Index: gc_lang/fr/webext/content_scripts/panel_lxg.css
==================================================================
--- gc_lang/fr/webext/content_scripts/panel_lxg.css
+++ gc_lang/fr/webext/content_scripts/panel_lxg.css
@@ -6,21 +6,66 @@
    position: absolute;
    height: 100%;
    width: 100%;
    font-size: 13px;
}
+
+div#grammalecte_lxg_input_block {
+    padding: 10px;
+    /*background-color: hsl(210, 50%, 95%);*/
+    /*border-bottom: solid 1px hsl(210, 0%, 90%);*/
+    text-align: right;
+}
+
+div#grammalecte_lxg_result_zone {
+
+}
+
+div#grammalecte_lxg_input {
+    min-height: 100px;
+    padding: 10px;
+    background-color: hsl(210, 0%, 100%);
+    border: solid 1px hsl(210, 20%, 80%);
+    border-radius: 3px;
+    font-family: "Courier New", Courier, "Lucida Sans Typewriter", "Lucida Typewriter", monospace;
+    text-align: left;
+}
+
+div#grammalecte_lxg_input_button {
+    display: inline-block;
+    margin: 0 10px 0 0;
+    padding: 3px 10px;
+    background-color: hsl(210, 50%, 50%);
+    color: hsl(210, 50%, 98%);
+    text-align: center;
+    cursor: pointer;
+    border-radius: 0 0 3px 3px;
+}
+
+div.grammalecte_lxg_paragraph_sentence_block {
+    margin: 5px 0px 20px 0px;
+    background-color: hsl(210, 50%, 95%);
+    border-radius: 3px;
+    border-top: solid 1px hsl(210, 50%, 90%);
+    border-bottom: solid 1px hsl(210, 50%, 90%);
+    hyphens: none;
+}
+p.grammalecte_lxg_paragraph_sentence {
+    padding: 3px 10px;
+    font-weight: bold;
+    color: hsl(210, 50%, 40%);
+}

div.grammalecte_lxg_list_of_tokens {
-    margin: 10px 5px 0 5px;
    padding: 10px;
-    background-color: hsla(0, 0%, 95%, 1);
+    background-color: hsl(210, 50%, 99%);
    border-radius: 5px;
}

div.grammalecte_lxg_list_num {
    float: right;
-    margin: -12px 0 5px 10px;
+    margin: -2px 5px 5px 10px;
    padding: 5px 10px;
    font-family: "Trebuchet MS", "Fira Sans", "Ubuntu Condensed", "Liberation Sans", sans-serif;
    font-size: 14px;
    font-weight: bold;
    border-radius: 0 0 4px 4px;
@@ -42,13 +87,13 @@
    margin: 4px 0;
}

div.grammalecte_lxg_token_subblock {
    margin: 2px 0 2px 20px;
    padding: 5px;
-    border-left: 4px solid hsl(150, 30%, 70%);
-    background-color: hsl(210, 10%, 90%);
-    border-radius: 2px;
+    border-left: 4px solid hsl(210, 50%, 80%);
+    background-color: hsl(210, 50%, 94%);
+    border-radius: 3px;
}

div.grammalecte_lxg_token_descr {
    margin: 1px;
    padding: 1px;
}
@@ -85,10 +130,16 @@
}

div.grammalecte_lxg_morph_elem {
    font-family: "Trebuchet MS", "Fira Sans", "Ubuntu Condensed", "Liberation Sans", sans-serif;
    color: hsl(0, 0%, 0%);
}
+
+div.grammalecte_lxg_other_tags {
+    padding-left: 20px;
+    font-family: "Trebuchet MS", "Fira Sans", "Ubuntu Condensed", "Liberation Sans", sans-serif;
+    color: hsl(0, 0%, 50%);
+}

div.grammalecte_lxg_token_LOC {
    background-color: hsla(150, 50%, 30%, 1);
}

div.grammalecte_lxg_token_WORD {
    background-color: hsla(150, 50%, 50%, 1);

Index: gc_lang/fr/webext/gce_worker.js
==================================================================
--- gc_lang/fr/webext/gce_worker.js
+++ gc_lang/fr/webext/gce_worker.js
@@ -229,29 +229,28 @@
    let aGrammErr = gc_engine.parse(sParagraph, sCountry, bDebug, null, bContext);
    let aSpellErr = oSpellChecker.parseParagraph(sParagraph);
    postMessage(createResponse("parseAndSpellcheck1", {sParagraph: sParagraph, aGrammErr: aGrammErr, aSpellErr: aSpellErr}, oInfo, true));
}

-function parseFull (sText, sCountry, bDebug, bContext, oInfo={}) {
-    let i = 0;
-    sText = sText.replace(/­/g, "").normalize("NFC");
-    for (let sParagraph of text.getParagraph(sText)) {
-        let lSentence = gc_engine.parse(sParagraph, sCountry, bDebug, null, bContext, true);
-        console.log("*", lSentence);
-        postMessage(createResponse("parseFull", {sParagraph: sParagraph, iParaNum: i, lSentence: lSentence}, oInfo, false));
-        i += 1;
-    }
-    postMessage(createResponse("parseFull", null, oInfo, true));
+function parseFull (sParagraph, sCountry, bDebug, bContext, oInfo={}) {
+    sParagraph = sParagraph.replace(/­/g, "").normalize("NFC");
+    let [lParagraphErrors, lSentences] = gc_engine.parse(sParagraph, sCountry, bDebug, null, bContext, true);
+    //console.log(lSentences);
+    postMessage(createResponse("parseFull", { lParagraphErrors: lParagraphErrors, lSentences: lSentences }, oInfo, true));
}

function getListOfTokens (sText, oInfo={}) {
    // lexicographer
    try {
        sText = sText.replace(/­/g, "").normalize("NFC");
        for (let sParagraph of text.getParagraph(sText)) {
            if (sParagraph.trim() !== "") {
-                postMessage(createResponse("getListOfTokens", lexgraph_fr.getListOfTokensReduc(sParagraph, true), oInfo, false));
+                let lTokens = [ ...oTokenizer.genTokens(sParagraph) ];
+                for (let oToken of lTokens) {
+                    oSpellChecker.setLabelsOnToken(oToken);
+                }
+                postMessage(createResponse("getListOfTokens", { sParagraph: sParagraph, lTokens: lTokens }, oInfo, false));
            }
        }
        postMessage(createResponse("getListOfTokens", null, oInfo, true));
    }
    catch (e) {
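[Annotation, not part of the patch: what the reworked worker functions post, per the code above. parseFull() now answers once with the whole paragraph analysis, while getListOfTokens() still streams one response per paragraph followed by an end message. The sample inputs are illustrative.]

    // Sketch: assumes execution inside the worker, with oInfo relayed from the caller.
    parseFull("Quand nous partîmes, il pleuvait.", "FR", false, false, { sDestination: "__GrammalectePanel__" });
    // -> one message: { lParagraphErrors: […], lSentences: […] }, with bEnd = true
    getListOfTokens("Premier paragraphe.\nSecond paragraphe.", { sDestination: "__GrammalectePanel__" });
    // -> one message per paragraph: { sParagraph: "…", lTokens: […] }, then a final null message with bEnd = true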
if dToken["sType"] == "WORD" and not dToken.get("bValidToken", False) else " ", - " ".join(dToken.get("lMorph", "")), \ - "·".join(dToken.get("aTags", "")) ) ) - echo(txt.getReadableErrors(dSentence["lGrammarErrors"], xArgs.width)) + echo("{nStart}:{nEnd} <{sSentence}>".format(**dSentence)) + for dToken in dSentence["lTokens"]: + if dToken["sType"] == "INFO" or "bMerged" in dToken: + continue + echo(" {0[nStart]:>3}:{0[nEnd]:<3} {1} {0[sType]:<14} {2} {0[sValue]:<16} {3}".format(dToken, \ + "×" if dToken.get("bToRemove", False) else " ", + "!" if dToken["sType"] == "WORD" and not dToken.get("bValidToken", False) else " ", + " ".join(dToken.get("aTags", "")) ) ) + if "lMorph" in dToken: + for sMorph, sLabel in zip(dToken["lMorph"], dToken["aLabels"]): + echo(" {0:40} {1}".format(sMorph, sLabel)) + if "lSubTokens" in dToken: + for dSubToken in dToken["lSubTokens"]: + if dSubToken["sValue"]: + echo(" · {0:20}".format(dSubToken["sValue"])) + for sMorph, sLabel in zip(dSubToken["lMorph"], dSubToken["aLabels"]): + echo(" {0:40} {1}".format(sMorph, sLabel)) + #echo(txt.getReadableErrors(dSentence["lGrammarErrors"], xArgs.width)) else: for sParagraph in txt.getParagraph(sText): if xArgs.textformatter: sParagraph = oTextFormatter.formatText(sParagraph) sRes, _ = oGrammarChecker.getParagraphWithErrors(sParagraph, bEmptyIfNoErrors=xArgs.only_when_errors, nWidth=xArgs.width, bDebug=xArgs.debug) Index: graphspell-js/ibdawg.js ================================================================== --- graphspell-js/ibdawg.js +++ graphspell-js/ibdawg.js @@ -311,10 +311,13 @@ return Boolean(this._convBytesToInteger(this.byDic.slice(iAddr, iAddr+this.nBytesArc)) & this._finalNodeMask); } getMorph (sWord) { // retrieves morphologies list, different casing allowed + if (!sWord) { + return []; + } sWord = str_transform.spellingNormalization(sWord); let l = this.morph(sWord); if (sWord[0].gl_isUpperCase()) { l.push(...this.morph(sWord.toLowerCase())); if (sWord.gl_isUpperCase() && sWord.length > 1) { Index: graphspell-js/lexgraph_fr.js ================================================================== --- graphspell-js/lexgraph_fr.js +++ graphspell-js/lexgraph_fr.js @@ -164,13 +164,16 @@ [':f', [" féminin", "féminin"]], [':s', [" singulier", "singulier"]], [':p', [" pluriel", "pluriel"]], [':i', [" invariable", "invariable"]], - [':V1', [" verbe (1ᵉʳ gr.),", "Verbe du 1ᵉʳ groupe"]], - [':V2', [" verbe (2ᵉ gr.),", "Verbe du 2ᵉ groupe"]], - [':V3', [" verbe (3ᵉ gr.),", "Verbe du 3ᵉ groupe"]], + [':V1_', [" verbe (1ᵉʳ gr.),", "Verbe du 1ᵉʳ groupe"]], + [':V2_', [" verbe (2ᵉ gr.),", "Verbe du 2ᵉ groupe"]], + [':V3_', [" verbe (3ᵉ gr.),", "Verbe du 3ᵉ groupe"]], + [':V1e', [" verbe (1ᵉʳ gr.),", "Verbe du 1ᵉʳ groupe"]], + [':V2e', [" verbe (2ᵉ gr.),", "Verbe du 2ᵉ groupe"]], + [':V3e', [" verbe (3ᵉ gr.),", "Verbe du 3ᵉ groupe"]], [':V0e', [" verbe,", "Verbe auxiliaire être"]], [':V0a', [" verbe,", "Verbe auxiliaire avoir"]], [':Y', [" infinitif,", "infinitif"]], [':P', [" participe présent,", "participe présent"]], @@ -211,26 +214,26 @@ [':Ot', [" pronom interrogatif,", "Pronom interrogatif"]], [':Or', [" pronom relatif,", "Pronom relatif"]], [':Ow', [" pronom adverbial,", "Pronom adverbial"]], [':Os', [" pronom personnel sujet,", "Pronom personnel sujet"]], [':Oo', [" pronom personnel objet,", "Pronom personnel objet"]], - [':Ov', [" préverbe,", "Préverbe (pronom personnel objet, +ne)"]], + [':Ov', [" préverbe,", "Préverbe"]], [':O1', [" 1ʳᵉ pers.,", "Pronom : 1ʳᵉ personne"]], [':O2', [" 2ᵉ pers.,", "Pronom : 2ᵉ 
personne"]], [':O3', [" 3ᵉ pers.,", "Pronom : 3ᵉ personne"]], [':C', [" conjonction,", "Conjonction"]], - [':Ĉ', [" conjonction (él.),", "Conjonction (élément)"]], [':Cc', [" conjonction de coordination,", "Conjonction de coordination"]], [':Cs', [" conjonction de subordination,", "Conjonction de subordination"]], - [':Ĉs', [" conjonction de subordination (él.),", "Conjonction de subordination (élément)"]], - - [':Ñ', [" locution nominale (él.),", "Locution nominale (élément)"]], - [':Â', [" locution adjectivale (él.),", "Locution adjectivale (élément)"]], - [':Ṽ', [" locution verbale (él.),", "Locution verbale (élément)"]], - [':Ŵ', [" locution adverbiale (él.),", "Locution adverbiale (élément)"]], - [':Ŕ', [" locution prépositive (él.),", "Locution prépositive (élément)"]], - [':Ĵ', [" locution interjective (él.),", "Locution interjective (élément)"]], + + [':ÉC', [" élément de conjonction,", "Élément de conjonction"]], + [':ÉCs', [" élément de conjonction de subordination,", "Élément de conjonction de subordination"]], + [':ÉN', [" élément de locution nominale,", "Élément de locution nominale"]], + [':ÉA', [" élément de locution adjectivale,", "Élément de locution adjectivale"]], + [':ÉV', [" élément de locution verbale,", "Élément de locution verbale"]], + [':ÉW', [" élément de locution adverbiale,", "Élément de locution adverbiale"]], + [':ÉR', [" élément de locution prépositive,", "Élément de locution prépositive"]], + [':ÉJ', [" élément de locution interjective,", "Élément de locution interjective"]], [':Zp', [" préfixe,", "Préfixe"]], [':Zs', [" suffixe,", "Suffixe"]], [':H', ["", ""]], @@ -277,64 +280,87 @@ ['t', " transitive directe"], ['p', " pronominale"], ['m', " impersonnelle"], ]), - dElidedPrefix: new Map([ - ['d', "(de), déterminant épicène invariable"], - ['l', "(le/la), déterminant masculin/féminin singulier"], - ['j', "(je), pronom personnel sujet, 1ʳᵉ pers., épicène singulier"], - ['m', "(me), pronom personnel objet, 1ʳᵉ pers., épicène singulier"], - ['t', "(te), pronom personnel objet, 2ᵉ pers., épicène singulier"], - ['s', "(se), pronom personnel objet, 3ᵉ pers., épicène singulier/pluriel"], - ['n', "(ne), adverbe de négation"], - ['c', "(ce), pronom démonstratif, masculin singulier/pluriel"], - ['ç', "(ça), pronom démonstratif, masculin singulier"], + dValues: new Map([ + ['d’', "(de), préposition ou déterminant épicène invariable"], + ['l’', "(le/la), déterminant ou pronom personnel objet, masculin/féminin singulier"], + ['j’', "(je), pronom personnel sujet, 1ʳᵉ pers., épicène singulier"], + ['m’', "(me), pronom personnel objet, 1ʳᵉ pers., épicène singulier"], + ['t’', "(te), pronom personnel objet, 2ᵉ pers., épicène singulier"], + ['s’', "(se), pronom personnel objet, 3ᵉ pers., épicène singulier/pluriel"], + ['n’', "(ne), adverbe de négation"], + ['c’', "(ce), pronom démonstratif, masculin singulier/pluriel"], + ['ç’', "(ça), pronom démonstratif, masculin singulier"], ['qu', "(que), conjonction de subordination"], - ['lorsqu', "(lorsque), conjonction de subordination"], - ['puisqu', "(lorsque), conjonction de subordination"], - ['quoiqu', "(quoique), conjonction de subordination"], - ['jusqu', "(jusque), préposition"] - ]), - - dPronoms: new Map([ - ['je', " pronom personnel sujet, 1ʳᵉ pers. sing."], - ['tu', " pronom personnel sujet, 2ᵉ pers. sing."], - ['il', " pronom personnel sujet, 3ᵉ pers. masc. sing."], - ['on', " pronom personnel sujet, 3ᵉ pers. sing. ou plur."], - ['elle', " pronom personnel sujet, 3ᵉ pers. fém. 
sing."], - ['nous', " pronom personnel sujet/objet, 1ʳᵉ pers. plur."], - ['vous', " pronom personnel sujet/objet, 2ᵉ pers. plur."], - ['ils', " pronom personnel sujet, 3ᵉ pers. masc. plur."], - ['elles', " pronom personnel sujet, 3ᵉ pers. masc. plur."], - - ["là", " particule démonstrative"], - ["ci", " particule démonstrative"], - - ['le', " COD, masc. sing."], - ['la', " COD, fém. sing."], - ['les', " COD, plur."], - - ['moi', " COI (à moi), sing."], - ['toi', " COI (à toi), sing."], - ['lui', " COI (à lui ou à elle), sing."], - ['nous2', " COI (à nous), plur."], - ['vous2', " COI (à vous), plur."], - ['leur', " COI (à eux ou à elles), plur."], - - ['y', " pronom adverbial"], - ["m'y", " (me) pronom personnel objet + (y) pronom adverbial"], - ["t'y", " (te) pronom personnel objet + (y) pronom adverbial"], - ["s'y", " (se) pronom personnel objet + (y) pronom adverbial"], - - ['en', " pronom adverbial"], - ["m'en", " (me) pronom personnel objet + (en) pronom adverbial"], - ["t'en", " (te) pronom personnel objet + (en) pronom adverbial"], - ["s'en", " (se) pronom personnel objet + (en) pronom adverbial"] - ]), - - dChar: new Map([ + ['lorsqu’', "(lorsque), conjonction de subordination"], + ['puisqu’', "(lorsque), conjonction de subordination"], + ['quoiqu’', "(quoique), conjonction de subordination"], + ['jusqu’', "(jusque), préposition"], + + ['-je', " pronom personnel sujet, 1ʳᵉ pers. sing."], + ['-tu', " pronom personnel sujet, 2ᵉ pers. sing."], + ['-il', " pronom personnel sujet, 3ᵉ pers. masc. sing."], + ['-iel', " pronom personnel sujet, 3ᵉ pers. sing."], + ['-on', " pronom personnel sujet, 3ᵉ pers. sing. ou plur."], + ['-elle', " pronom personnel sujet, 3ᵉ pers. fém. sing."], + ['-t-il', " “t” euphonique + pronom personnel sujet, 3ᵉ pers. masc. sing."], + ['-t-on', " “t” euphonique + pronom personnel sujet, 3ᵉ pers. sing. ou plur."], + ['-t-elle', " “t” euphonique + pronom personnel sujet, 3ᵉ pers. fém. sing."], + ['-t-iel', " “t” euphonique + pronom personnel sujet, 3ᵉ pers. sing."], + ['-nous', " pronom personnel sujet/objet, 1ʳᵉ pers. plur. ou COI (à nous), plur."], + ['-vous', " pronom personnel sujet/objet, 2ᵉ pers. plur. ou COI (à vous), plur."], + ['-ils', " pronom personnel sujet, 3ᵉ pers. masc. plur."], + ['-elles', " pronom personnel sujet, 3ᵉ pers. masc. plur."], + ['-iels', " pronom personnel sujet, 3ᵉ pers. plur."], + + ["-là", " particule démonstrative (là)"], + ["-ci", " particule démonstrative (ci)"], + + ['-le', " COD, masc. sing."], + ['-la', " COD, fém. sing."], + ['-les', " COD, plur."], + + ['-moi', " COI (à moi), sing."], + ['-toi', " COI (à toi), sing."], + ['-lui', " COI (à lui ou à elle), sing."], + ['-nous2', " COI (à nous), plur."], + ['-vous2', " COI (à vous), plur."], + ['-leur', " COI (à eux ou à elles), plur."], + + ['-le-moi', " COD, masc. sing. + COI (à moi), sing."], + ['-le-toi', " COD, masc. sing. + COI (à toi), sing."], + ['-le-lui', " COD, masc. sing. + COI (à lui ou à elle), sing."], + ['-le-nous', " COD, masc. sing. + COI (à nous), plur."], + ['-le-vous', " COD, masc. sing. + COI (à vous), plur."], + ['-le-leur', " COD, masc. sing. + COI (à eux ou à elles), plur."], + + ['-la-moi', " COD, fém. sing. + COI (à moi), sing."], + ['-la-toi', " COD, fém. sing. + COI (à toi), sing."], + ['-la-lui', " COD, fém. sing. + COI (à lui ou à elle), sing."], + ['-la-nous', " COD, fém. sing. + COI (à nous), plur."], + ['-la-vous', " COD, fém. sing. + COI (à vous), plur."], + ['-la-leur', " COD, fém. sing. 
+ + ['-les-moi', " COD, plur. + COI (à moi), sing."], + ['-les-toi', " COD, plur. + COI (à toi), sing."], + ['-les-lui', " COD, plur. + COI (à lui ou à elle), sing."], + ['-les-nous', " COD, plur. + COI (à nous), plur."], + ['-les-vous', " COD, plur. + COI (à vous), plur."], + ['-les-leur', " COD, plur. + COI (à eux ou à elles), plur."], + + ['-y', " pronom adverbial"], + ["-m’y", " (me) pronom personnel objet + (y) pronom adverbial"], + ["-t’y", " (te) pronom personnel objet + (y) pronom adverbial"], + ["-s’y", " (se) pronom personnel objet + (y) pronom adverbial"], + + ['-en', " pronom adverbial"], + ["-m’en", " (me) pronom personnel objet + (en) pronom adverbial"], + ["-t’en", " (te) pronom personnel objet + (en) pronom adverbial"], + ["-s’en", " (se) pronom personnel objet + (en) pronom adverbial"], + ['.', "point"], ['·', "point médian"], ['…', "points de suspension"], [':', "deux-points"], [';', "point-virgule"], @@ -367,393 +393,166 @@ ['⩾', "supérieur ou égal à"], ['%', "signe de pourcentage"], ['‰', "signe pour mille"], ]), - oSpellChecker: null, - oTokenizer: null, - oLocGraph: null, - _zPartDemForm: new RegExp("([a-zA-Zà-ö0-9À-Öø-ÿØ-ßĀ-ʯ]+)-(là|ci)$", "i"), _aPartDemExceptList: new Set(["celui", "celle", "ceux", "celles", "de", "jusque", "par", "marie-couche-toi"]), - _zInterroVerb: new RegExp("([a-zA-Zà-ö0-9À-Öø-ÿØ-ßĀ-ʯ]+)-(t-(?:il|elle|on)|je|tu|ils?|elles?|on|[nv]ous)$", "i"), - _zImperatifVerb: new RegExp("([a-zA-Zà-ö0-9À-Öø-ÿØ-ßĀ-ʯ]+)-((?:les?|la)-(?:moi|toi|lui|[nv]ous|leur)|y|en|[mts][’'](?:y|en)|les?|la|[mt]oi|leur|lui)$", "i"), + _zInterroVerb: new RegExp("([a-zA-Zà-ö0-9À-Öø-ÿØ-ßĀ-ʯ]+)(-(?:t-(?:ie?l|elle|on)|je|tu|ie?ls?|elles?|on|[nv]ous))$", "i"), + _zImperatifVerb: new RegExp("([a-zA-Zà-ö0-9À-Öø-ÿØ-ßĀ-ʯ]+)(-(?:l(?:es?|a)-(?:moi|toi|lui|[nv]ous|leur)|y|en|[mts]['’ʼ‘‛´`′‵՚ꞌꞋ](?:y|en)|les?|la|[mt]oi|leur|lui))$", "i"), _zTag: new RegExp("[:;/][a-zA-Z0-9ÑÂĴĈŔÔṼŴ!][^:;/]*", "g"), - - load: function (oSpellChecker, oTokenizer, oLocGraph) { - this.oSpellChecker = oSpellChecker; - this.oTokenizer = oTokenizer; - this.oLocGraph = JSON.parse(oLocGraph); - }, - split: function (sWord) { // returns an array of strings (prefix, trimmed_word, suffix) let sPrefix = ""; let sSuffix = ""; // préfixe élidé - let m = /^([ldmtsnjcç]|lorsqu|presqu|jusqu|puisqu|quoiqu|quelqu|qu)[’'‘`ʼ]([a-zA-Zà-öÀ-Ö0-9_ø-ÿØ-ßĀ-ʯﬁ-ﬆ-]+)/i.exec(sWord); + let m = /^([ldmtsnjcç]|lorsqu|presqu|jusqu|puisqu|quoiqu|quelqu|qu)['’ʼ‘‛´`′‵՚ꞌꞋ]([a-zA-Zà-öÀ-Ö0-9_ø-ÿØ-ßĀ-ʯﬁ-ﬆ-]+)/i.exec(sWord); if (m) { sPrefix = m[1] + "’"; sWord = m[2]; } // mots composés - m = /^([a-zA-Zà-öÀ-Ö0-9_ø-ÿØ-ßĀ-ʯﬁ-ﬆ-]+)(-(?:(?:les?|la)-(?:moi|toi|lui|[nv]ous|leur)|t-(?:il|elle|on)|y|en|[mts]’(?:y|en)|les?|l[aà]|[mt]oi|leur|lui|je|tu|ils?|elles?|on|[nv]ous|ce))$/i.exec(sWord); + m = /^([a-zA-Zà-öÀ-Ö0-9_ø-ÿØ-ßĀ-ʯﬁ-ﬆ]+)(-(?:(?:les?|la)-(?:moi|toi|lui|[nv]ous|leur)|t-(?:il|elle|on)|y|en|[mts]’(?:y|en)|les?|l[aà]|[mt]oi|leur|lui|je|tu|ils?|elles?|on|[nv]ous|ce))$/i.exec(sWord); if (m) { sWord = m[1]; sSuffix = m[2]; } // split word in 3 parts: prefix, root, suffix return [sPrefix, sWord, sSuffix]; }, - getInfoForToken: function (oToken) { - // Token: .sType, .sValue, .nStart, .nEnd - // return a object {sType, sValue, aLabel} + analyze: function (sWord) { + // return the meaning of sWord if found, else an empty string + sWord = sWord.toLowerCase(); + if (this.dValues.has(sWord)) { + return this.dValues.get(sWord); + } + return ""; + }, + + readableMorph: function (sMorph) { + if (!sMorph) { + return " mot inconnu";
+ } + let sRes = ""; + sMorph = sMorph.replace(/:V([0-3][ea_])[itpqnmr_eaxz]+/, ":V$1"); + let m; + while ((m = this._zTag.exec(sMorph)) !== null) { + if (this.dTag.has(m[0])) { + sRes += this.dTag.get(m[0])[0]; + } else { + sRes += " [" + m[0] + "]?"; + } + } + if (sRes.startsWith(" verbe") && !sRes.includes("infinitif")) { + sRes += " [" + sMorph.slice(1, sMorph.indexOf("/")) + "]"; + } + if (!sRes) { + return " [" + sMorph + "]: étiquettes inconnues"; + } + return sRes.gl_trimRight(","); + }, + + setLabelsOnToken (oToken) { + // Token: .sType, .sValue, .nStart, .nEnd, .lMorph let m = null; try { switch (oToken.sType) { case 'PUNC': case 'SIGN': - return { - sType: oToken.sType, - sValue: oToken.sValue, - aLabel: [this.dChar.gl_get(oToken.sValue, "caractère indéterminé")] - }; + oToken["aLabels"] = [this.dValues.gl_get(oToken["sValue"], "signe de ponctuation divers")]; break; case 'NUM': - return { - sType: oToken.sType, - sValue: oToken.sValue, - aLabel: ["nombre"] - }; + oToken["aLabels"] = ["nombre"]; break; case 'LINK': - return { - sType: oToken.sType, - sValue: oToken.sValue.slice(0, 40) + "…", - aLabel: ["hyperlien"] - }; + oToken["aLabels"] = ["hyperlien"]; break; case 'TAG': - return { - sType: oToken.sType, - sValue: oToken.sValue, - aLabel: ["étiquette (hashtag)"] - }; + oToken["aLabels"] = ["étiquette (hashtag)"]; break; case 'HTML': - return { - sType: oToken.sType, - sValue: oToken.sValue.slice(0, 40) + "…", - aLabel: ["balise HTML"] - }; + oToken["aLabels"] = ["balise HTML"]; break; case 'PSEUDOHTML': - return { - sType: oToken.sType, - sValue: oToken.sValue, - aLabel: ["balise pseudo-HTML"] - }; + oToken["aLabels"] = ["balise pseudo-HTML"]; break; case 'HTMLENTITY': - return { - sType: oToken.sType, - sValue: oToken.sValue, - aLabel: ["entité caractère XML/HTML"] - }; + oToken["aLabels"] = ["entité caractère XML/HTML"]; break; case 'HOUR': - return { - sType: oToken.sType, - sValue: oToken.sValue, - aLabel: ["heure"] - }; + oToken["aLabels"] = ["heure"]; break; case 'WORD_ELIDED': - let sTemp = oToken.sValue.replace("’", "").replace("'", "").replace("`", "").toLowerCase(); - return { - sType: oToken.sType, - sValue: oToken.sValue, - aLabel: [this.dElidedPrefix.gl_get(sTemp, "préfixe élidé inconnu")] - }; + oToken["aLabels"] = [this.dValues.gl_get(oToken["sValue"].toLowerCase(), "préfixe élidé inconnu")]; break; case 'WORD_ORDINAL': - return { - sType: oToken.sType, - sValue: oToken.sValue, - aLabel: ["nombre ordinal"] - }; + oToken["aLabels"] = ["nombre ordinal"]; break; case 'FOLDERUNIX': - return { - sType: oToken.sType, - sValue: oToken.sValue.slice(0, 40) + "…", - aLabel: ["dossier UNIX (et dérivés)"] - }; + oToken["aLabels"] = ["dossier UNIX (et dérivés)"]; break; case 'FOLDERWIN': - return { - sType: oToken.sType, - sValue: oToken.sValue.slice(0, 40) + "…", - aLabel: ["dossier Windows"] - }; + oToken["aLabels"] = ["dossier Windows"]; break; case 'WORD_ACRONYM': - return { - sType: oToken.sType, - sValue: oToken.sValue, - aLabel: ["Sigle ou acronyme"] - }; + oToken["aLabels"] = ["sigle ou acronyme"]; break; case 'WORD': - if (oToken.sValue.gl_count("-") > 4) { - return { - sType: "COMPLEX", - sValue: oToken.sValue, - aLabel: ["élément complexe indéterminé"] - }; - } else if (m = this._zPartDemForm.exec(oToken.sValue)) { - // mots avec particules démonstratives - if (this._aPartDemExceptList.has(m[1].toLowerCase())) { - return { - sType: "WORD", - sValue: oToken.sValue, - aLabel: this._getMorph(oToken.sValue) - }; - } - return { - sType: oToken.sType, - sValue: 
oToken.sValue, - aLabel: ["mot avec particule démonstrative"], - aSubElem: [ - { sType: oToken.sType, sValue: m[1], aLabel: this._getMorph(m[1]) }, - { sType: oToken.sType, sValue: "-" + m[2], aLabel: [this._formatSuffix(m[2].toLowerCase())] } - ] - }; - } else if (m = this._zImperatifVerb.exec(oToken.sValue)) { - // formes interrogatives - return { - sType: oToken.sType, - sValue: oToken.sValue, - aLabel: ["forme verbale impérative"], - aSubElem: [ - { sType: oToken.sType, sValue: m[1], aLabel: this._getMorph(m[1]) }, - { sType: oToken.sType, sValue: "-" + m[2], aLabel: [this._formatSuffix(m[2].toLowerCase())] } - ] - }; - } else if (m = this._zInterroVerb.exec(oToken.sValue)) { - // formes interrogatives - return { - sType: oToken.sType, - sValue: oToken.sValue, - aLabel: ["forme verbale interrogative"], - aSubElem: [ - { sType: oToken.sType, sValue: m[1], aLabel: this._getMorph(m[1]) }, - { sType: oToken.sType, sValue: "-" + m[2], aLabel: [this._formatSuffix(m[2].toLowerCase())] } - ] - }; - } else if (this.oSpellChecker.isValidToken(oToken.sValue)) { - return { - sType: oToken.sType, - sValue: oToken.sValue, - aLabel: this._getMorph(oToken.sValue) - }; - } else { - return { - sType: "UNKNOWN_WORD", - sValue: oToken.sValue, - aLabel: ["mot inconnu du dictionnaire"] - }; + if (oToken.hasOwnProperty("lMorph") && oToken["lMorph"].length > 0) { + // with morphology + oToken["aLabels"] = []; + for (let sMorph of oToken["lMorph"]) { + oToken["aLabels"].push(this.readableMorph(sMorph)); + } + } else { + // no morphology, guessing + if (oToken["sValue"].gl_count("-") > 4) { + oToken["aLabels"] = ["élément complexe indéterminé"]; + } + else if (m = this._zPartDemForm.exec(oToken["sValue"])) { + // mots avec particules démonstratives + oToken["aLabels"] = ["mot avec particule démonstrative"]; + } + else if (m = this._zImperatifVerb.exec(oToken["sValue"])) { + // formes impératives + oToken["aLabels"] = ["forme verbale impérative"]; + } + else if (m = this._zInterroVerb.exec(oToken["sValue"])) { + // formes interrogatives + oToken["aLabels"] = ["forme verbale interrogative"]; + } + else { + oToken["aLabels"] = ["mot inconnu du dictionnaire"]; + } + } + if (oToken.hasOwnProperty("lSubTokens")) { + for (let oSubToken of oToken["lSubTokens"]) { + if (oSubToken["sValue"]) { + if (this.dValues.has(oSubToken["sValue"])) { + oSubToken["lMorph"] = [ "" ]; + oSubToken["aLabels"] = [ this.dValues.get(oSubToken["sValue"]) ]; + } + else { + oSubToken["aLabels"] = oSubToken["lMorph"].map((sMorph) => this.readableMorph(sMorph)); + } + } + } } break; default: - return { - sType: oToken.sType, - sValue: oToken.sValue, - aLabel: ["token inconnu"] - } + oToken["aLabels"] = ["token de nature inconnue"]; } } catch (e) { console.error(e); } - return null; - }, - - _getMorph (sWord) { - let aElem = []; - for (let s of this.oSpellChecker.getMorph(sWord)) { - if (s.includes(":")) aElem.push(this._formatTags(s)); - } - if (aElem.length == 0) { - aElem.push("mot inconnu du dictionnaire"); - } - return aElem; - }, - - _formatTags (sTags) { - let sRes = ""; - sTags = sTags.replace(/V([0-3][ea]?)[itpqnmr_eaxz]+/, "V$1"); - let m; - while ((m = this._zTag.exec(sTags)) !== null) { - sRes += this.dTag.get(m[0])[0]; - } - if (sRes.startsWith(" verbe") && !sRes.includes("infinitif")) { - sRes += " [" + sTags.slice(1, sTags.indexOf("/")) + "]"; - } - if (!sRes) { - return "#Erreur. 
Étiquette inconnue : [" + sTags + "]"; - } - return sRes.gl_trimRight(","); - }, - - _formatTagsLoc (sTags) { - let sRes = ""; - let m; - while ((m = this._zTag.exec(sTags)) !== null) { - if (m[0].startsWith(":LV")) { - sRes += this.dLocTag.get(":LV"); - for (let c of m[0].slice(3)) { - sRes += this.dLocVerb.get(c); - } - } else { - sRes += this.dLocTag.get(m[0]); - } - } - if (!sRes) { - return "#Erreur. Étiquette inconnue : [" + sTags + "]"; - } - return sRes.gl_trimRight(","); - }, - - _formatSuffix (s) { - if (s.startsWith("t-")) { - return "“t” euphonique +" + this.dPronoms.get(s.slice(2)); - } - if (!s.includes("-")) { - return this.dPronoms.get(s.replace("’", "'")); - } - if (s.endsWith("ous")) { - s += '2'; - } - let nPos = s.indexOf("-"); - return this.dPronoms.get(s.slice(0, nPos)) + " +" + this.dPronoms.get(s.slice(nPos + 1)); - }, - - getListOfTokens (sText, bInfo=true) { - let aElem = []; - if (sText !== "") { - for (let oToken of this.oTokenizer.genTokens(sText)) { - if (bInfo) { - let aRes = this.getInfoForToken(oToken); - if (aRes) { - aElem.push(aRes); - } - } else if (oToken.sType !== "SPACE") { - aElem.push(oToken); - } - } - } - return aElem; - }, - - * generateInfoForTokenList (lToken) { - for (let oToken of lToken) { - let aRes = this.getInfoForToken(oToken); - if (aRes) { - yield aRes; - } - } - }, - - getListOfTokensReduc (sText, bInfo=true) { - let lToken = this.getListOfTokens(sText.replace("'", "’").trim(), false); - let iToken = 0; - let aElem = []; - if (lToken.length == 0) { - return aElem; - } - do { - let oToken = lToken[iToken]; - let sMorphLoc = ''; - let aTokenTempList = [oToken]; - if (oToken.sType == "WORD" || oToken.sType == "WORD_ELIDED"){ - let iLocEnd = iToken + 1; - let oLocNode = this.oLocGraph[oToken.sValue.toLowerCase()]; - while (oLocNode) { - let oTokenNext = lToken[iLocEnd]; - iLocEnd++; - if (oTokenNext) { - oLocNode = oLocNode[oTokenNext.sValue.toLowerCase()]; - } - if (oLocNode && iLocEnd <= lToken.length) { - sMorphLoc = oLocNode["_:_"]; - aTokenTempList.push(oTokenNext); - } else { - break; - } - } - } - - if (sMorphLoc) { - // we have a locution - let sValue = ''; - for (let oTokenWord of aTokenTempList) { - sValue += oTokenWord.sValue+' '; - } - let oTokenLocution = { - 'nStart': aTokenTempList[0].nStart, - 'nEnd': aTokenTempList[aTokenTempList.length-1].nEnd, - 'sType': "LOC", - 'sValue': sValue.replace('’ ','’').trim(), - 'aSubToken': aTokenTempList - }; - if (bInfo) { - let aSubElem = null; - if (sMorphLoc.startsWith("*|")) { - // cette suite de tokens n’est une locution que dans certains cas minoritaires - oTokenLocution.sType = "LOCP"; - for (let oElem of this.generateInfoForTokenList(aTokenTempList)) { - aElem.push(oElem); - } - sMorphLoc = sMorphLoc.slice(2); - } else { - aSubElem = [...this.generateInfoForTokenList(aTokenTempList)]; - } - // cette suite de tokens est la plupart du temps une locution - let aFormatedTag = []; - for (let sTagLoc of sMorphLoc.split('|') ){ - aFormatedTag.push(this._formatTagsLoc(sTagLoc)); - } - aElem.push({ - sType: oTokenLocution.sType, - sValue: oTokenLocution.sValue, - aLabel: aFormatedTag, - aSubElem: aSubElem - }); - } else { - aElem.push(oTokenLocution); - } - iToken = iToken + aTokenTempList.length; - } - else { - // No locution, we just add information - if (bInfo) { - let aRes = this.getInfoForToken(oToken); - if (aRes) { - aElem.push(aRes); - } - } else { - aElem.push(oToken); - } - iToken++; - } - } while (iToken < lToken.length); - return aElem; - }, - - // Other functions + }, + + + 
// Other functions + filterSugg: function (aSugg) { return aSugg.filter((sSugg) => { return !sSugg.endsWith("è") && !sSugg.endsWith("È"); }); } } Index: graphspell-js/spellchecker.js ================================================================== --- graphspell-js/spellchecker.js +++ graphspell-js/spellchecker.js @@ -132,13 +132,69 @@ loadLexicographer (sLangCode) { // load default suggestion module for <sLangCode> if (typeof(process) !== 'undefined') { this.lexicographer = require(`./lexgraph_${sLangCode}.js`); } - else if (typeof(require) !== 'undefined') { - this.lexicographer = require(`resource://grammalecte/graphspell/lexgraph_${sLangCode}.js`); + else if (self && self.hasOwnProperty("lexgraph_"+sLangCode)) { // self is the Worker + this.lexicographer = self["lexgraph_"+sLangCode]; + } + } + + analyze (sWord) { + // returns a list of words and their morphologies + if (!this.lexicographer) { + return []; + } + let lWordAndMorph = []; + for (let sElem of this.lexicographer.split(sWord)) { + if (sElem) { + let lMorph = this.getMorph(sElem); + let sLex = this.lexicographer.analyze(sElem); + let aRes = []; + if (sLex) { + aRes = [ [lMorph.join(" | "), sLex] ]; + } else { + for (let sMorph of lMorph) { + aRes.push([sMorph, this.lexicographer.readableMorph(sMorph)]); + } + } + if (aRes.length > 0) { + lWordAndMorph.push([sElem, aRes]); + } + } + } + return lWordAndMorph; + } + + readableMorph (sMorph) { + // returns a string with the meaning of tags + if (!this.lexicographer) { + return ""; + } + return this.lexicographer.readableMorph(sMorph); + } + + setLabelsOnToken (oToken) { + // adds readable information on token + if (!this.lexicographer) { + return; + } + if (!oToken.hasOwnProperty("lMorph")) { + oToken["lMorph"] = this.getMorph(oToken["sValue"]); + } + if (oToken["sType"] == "WORD") { + oToken["bValidToken"] = this.isValidToken(oToken["sValue"]); + let [sPrefix, sStem, sSuffix] = this.lexicographer.split(oToken["sValue"]); + if (sStem != oToken["sValue"]) { + oToken["lSubTokens"] = [ + { "sType": "WORD", "sValue": sPrefix, "lMorph": this.getMorph(sPrefix) }, + { "sType": "WORD", "sValue": sStem, "lMorph": this.getMorph(sStem) }, + { "sType": "WORD", "sValue": sSuffix, "lMorph": this.getMorph(sSuffix) } + ]; + } } + this.lexicographer.setLabelsOnToken(oToken); } // Storage Index: graphspell/ibdawg.py ================================================================== --- graphspell/ibdawg.py +++ graphspell/ibdawg.py @@ -299,10 +299,12 @@ return False return bool(int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask) def getMorph (self, sWord): "retrieves morphologies list, different casing allowed" + if not sWord: + return [] sWord = st.spellingNormalization(sWord) l = self.morph(sWord) if sWord[0:1].isupper(): l.extend(self.morph(sWord.lower())) if sWord.isupper() and len(sWord) > 1:
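Note on the subtoken mechanism introduced above: setLabelsOnToken() in both spellcheckers splits a compound WORD token into prefix/stem/suffix subtokens via lexicographer.split(), attaches their morphologies, and leaves the labelling to the language module. A minimal, self-contained Python sketch of that flow; _split, _getMorph and the morphology string are stubs invented for illustration (the real code queries the DAWG through getMorph()):

    def _getMorph (sWord):
        # hypothetical stub standing in for SpellChecker.getMorph()
        dToy = { "arrive": [">arriver/:V1_:Ip:3s"] }   # illustrative morphology only
        return dToy.get(sWord, [])

    def _split (sWord):
        # stand-in for lexicographer.split(): (prefix, stem, suffix)
        return ["qu’", "arrive", "-t-il"] if sWord == "qu’arrive-t-il" else ["", sWord, ""]

    def setLabelsOnToken (dToken):
        # mirrors the structure of SpellChecker.setLabelsOnToken() above
        sPrefix, sStem, sSuffix = _split(dToken["sValue"])
        if sStem != dToken["sValue"]:
            dToken["lSubTokens"] = [
                { "sType": "WORD", "sValue": sPrefix, "lMorph": _getMorph(sPrefix) },
                { "sType": "WORD", "sValue": sStem,   "lMorph": _getMorph(sStem) },
                { "sType": "WORD", "sValue": sSuffix, "lMorph": _getMorph(sSuffix) }
            ]

    dToken = { "sType": "WORD", "sValue": "qu’arrive-t-il" }
    setLabelsOnToken(dToken)
    print([d["sValue"] for d in dToken["lSubTokens"]])   # ['qu’', 'arrive', '-t-il']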
Index: graphspell/lexgraph_fr.py ================================================================== --- graphspell/lexgraph_fr.py +++ graphspell/lexgraph_fr.py @@ -7,11 +7,12 @@ # <dSugg> : a dictionary for default suggestions. # <bLexicographer> : a boolean False -# if the boolean is True, 4 functions are required: +# if the boolean is True, 5 functions are required: # split(sWord) -> returns a list of strings (that will be analyzed) # analyze(sWord) -> returns a string with the meaning of word -# formatTags(sTags) -> returns a string with the meaning of tags +# readableMorph(sMorph) -> returns a string with the meaning of tags +# setLabelsOnToken(dToken) -> adds readable information on token # filterSugg(aWord) -> returns a filtered list of suggestions import re @@ -170,13 +171,16 @@ ':f': (" féminin", "féminin"), ':s': (" singulier", "singulier"), ':p': (" pluriel", "pluriel"), ':i': (" invariable", "invariable"), - ':V1': (" verbe (1ᵉʳ gr.),", "Verbe du 1ᵉʳ groupe"), - ':V2': (" verbe (2ᵉ gr.),", "Verbe du 2ᵉ groupe"), - ':V3': (" verbe (3ᵉ gr.),", "Verbe du 3ᵉ groupe"), + ':V1_': (" verbe (1ᵉʳ gr.),", "Verbe du 1ᵉʳ groupe"), + ':V2_': (" verbe (2ᵉ gr.),", "Verbe du 2ᵉ groupe"), + ':V3_': (" verbe (3ᵉ gr.),", "Verbe du 3ᵉ groupe"), + ':V1e': (" verbe (1ᵉʳ gr.),", "Verbe du 1ᵉʳ groupe"), + ':V2e': (" verbe (2ᵉ gr.),", "Verbe du 2ᵉ groupe"), + ':V3e': (" verbe (3ᵉ gr.),", "Verbe du 3ᵉ groupe"), ':V0e': (" verbe,", "Verbe auxiliaire être"), ':V0a': (" verbe,", "Verbe auxiliaire avoir"), ':Y': (" infinitif,", "infinitif"), ':P': (" participe présent,", "participe présent"), @@ -217,26 +221,26 @@ ':Ot': (" pronom interrogatif,", "Pronom interrogatif"), ':Or': (" pronom relatif,", "Pronom relatif"), ':Ow': (" pronom adverbial,", "Pronom adverbial"), ':Os': (" pronom personnel sujet,", "Pronom personnel sujet"), ':Oo': (" pronom personnel objet,", "Pronom personnel objet"), - ':Ov': (" préverbe,", "Préverbe (pronom personnel objet, +ne)"), + ':Ov': (" préverbe,", "Préverbe"), ':O1': (" 1ʳᵉ pers.,", "Pronom : 1ʳᵉ personne"), ':O2': (" 2ᵉ pers.,", "Pronom : 2ᵉ personne"), ':O3': (" 3ᵉ pers.,", "Pronom : 3ᵉ personne"), ':C': (" conjonction,", "Conjonction"), - ':Ĉ': (" conjonction (él.),", "Conjonction (élément)"), ':Cc': (" conjonction de coordination,", "Conjonction de coordination"), ':Cs': (" conjonction de subordination,", "Conjonction de subordination"), - ':Ĉs': (" conjonction de subordination (él.),", "Conjonction de subordination (élément)"), - - ':ÉN': (" locution nominale (él.),", "Locution nominale (élément)"), - ':ÉA': (" locution adjectivale (él.),", "Locution adjectivale (élément)"), - ':ÉV': (" locution verbale (él.),", "Locution verbale (élément)"), - ':ÉW': (" locution adverbiale (él.),", "Locution adverbiale (élément)"), - ':ÉR': (" locution prépositive (él.),", "Locution prépositive (élément)"), - ':ÉJ': (" locution interjective (él.),", "Locution interjective (élément)"), + + ':ÉC': (" élément de conjonction,", "Élément de conjonction"), + ':ÉCs': (" élément de conjonction de subordination,", "Élément de conjonction de subordination"), + ':ÉN': (" élément de locution nominale,", "Élément de locution nominale"), + ':ÉA': (" élément de locution adjectivale,", "Élément de locution adjectivale"), + ':ÉV': (" élément de locution verbale,", "Élément de locution verbale"), + ':ÉW': (" élément de locution adverbiale,", "Élément de locution adverbiale"), + ':ÉR': (" élément de locution prépositive,", "Élément de locution prépositive"), + ':ÉJ': (" élément de locution interjective,", "Élément de locution interjective"), ':Zp': (" préfixe,", "Préfixe"), ':Zs': (" suffixe,", "Suffixe"), ':H': ("", ""),
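The renamed readableMorph() consumes these tag tables: it first trims the verb-group flags, then walks the morphology string tag by tag through _dTAGS, falling back to a bracketed "[tag]?" for unknown entries. A runnable reduction of that logic with a toy subset of the table (the wording of the entries is copied from above; the input morphologies are illustrative, and the verb-lemma suffix handling is omitted):

    import re

    # toy subset of _dTAGS, for illustration only
    dTAGS = {
        ":N": " nom,",
        ":V1_": " verbe (1ᵉʳ gr.),",
        ":Ip": " présent,",
        ":3s": " 3ᵉ pers. sing.,",
        ":m": " masculin",
        ":s": " singulier",
    }
    zTag = re.compile("[:;/][a-zA-Z0-9ÑÂĴĈŔÔṼŴ!][^:;/]*")

    def readableMorph (sMorph):
        sRes = ""
        # strip the transitivity/pronominality flags after the verb group
        sMorph = re.sub("(?<=V[0123][ea_])[itpqnmr_eaxz]+", "", sMorph)
        for m in zTag.finditer(sMorph):
            sRes += dTAGS.get(m.group(0), " [" + m.group(0) + "]?")
        return sRes.rstrip(",")

    print(readableMorph(":N:m:s"))            # " nom, masculin singulier"
    print(readableMorph(":V1_itpqzz:Ip:3s"))  # " verbe (1ᵉʳ gr.), présent, 3ᵉ pers. sing."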
sing.", '-il': " pronom personnel sujet, 3ᵉ pers. masc. sing.", + '-iel': " pronom personnel sujet, 3ᵉ pers. sing.", '-on': " pronom personnel sujet, 3ᵉ pers. sing. ou plur.", '-elle': " pronom personnel sujet, 3ᵉ pers. fém. sing.", '-t-il': " “t” euphonique + pronom personnel sujet, 3ᵉ pers. masc. sing.", '-t-on': " “t” euphonique + pronom personnel sujet, 3ᵉ pers. sing. ou plur.", '-t-elle': " “t” euphonique + pronom personnel sujet, 3ᵉ pers. fém. sing.", + '-t-iel': " “t” euphonique + pronom personnel sujet, 3ᵉ pers. sing.", '-nous': " pronom personnel sujet/objet, 1ʳᵉ pers. plur. ou COI (à nous), plur.", '-vous': " pronom personnel sujet/objet, 2ᵉ pers. plur. ou COI (à vous), plur.", '-ils': " pronom personnel sujet, 3ᵉ pers. masc. plur.", '-elles': " pronom personnel sujet, 3ᵉ pers. masc. plur.", + '-iels': " pronom personnel sujet, 3ᵉ pers. plur.", - "-là": " particule démonstrative", - "-ci": " particule démonstrative", + "-là": " particule démonstrative (là)", + "-ci": " particule démonstrative (ci)", '-le': " COD, masc. sing.", '-la': " COD, fém. sing.", '-les': " COD, plur.", @@ -324,10 +331,45 @@ '-en': " pronom adverbial", "-m’en": " (me) pronom personnel objet + (en) pronom adverbial", "-t’en": " (te) pronom personnel objet + (en) pronom adverbial", "-s’en": " (se) pronom personnel objet + (en) pronom adverbial", + + '.': "point", + '·': "point médian", + '…': "points de suspension", + ':': "deux-points", + ';': "point-virgule", + ',': "virgule", + '?': "point d’interrogation", + '!': "point d’exclamation", + '(': "parenthèse ouvrante", + ')': "parenthèse fermante", + '[': "crochet ouvrant", + ']': "crochet fermant", + '{': "accolade ouvrante", + '}': "accolade fermante", + '-': "tiret", + '—': "tiret cadratin", + '–': "tiret demi-cadratin", + '«': "guillemet ouvrant (chevrons)", + '»': "guillemet fermant (chevrons)", + '“': "guillemet ouvrant double", + '”': "guillemet fermant double", + '‘': "guillemet ouvrant", + '’': "guillemet fermant", + '"': "guillemets droits (déconseillé en typographie)", + '/': "signe de la division", + '+': "signe de l’addition", + '*': "signe de la multiplication", + '=': "signe de l’égalité", + '<': "inférieur à", + '>': "supérieur à", + '⩽': "inférieur ou égal à", + '⩾': "supérieur ou égal à", + '%': "signe de pourcentage", + '‰': "signe pour mille" } _zElidedPrefix = re.compile("(?i)^([ldmtsnjcç]|lorsqu|presqu|jusqu|puisqu|quoiqu|quelqu|qu)[’'‘`ʼ]([\\w-]+)") _zCompoundWord = re.compile("(?i)(\\w+)(-(?:(?:les?|la)-(?:moi|toi|lui|[nv]ous|leur)|t-(?:il|elle|on)|y|en|[mts]’(?:y|en)|les?|l[aà]|[mt]oi|leur|lui|je|tu|ils?|elles?|on|[nv]ous|ce))$") @@ -356,22 +398,96 @@ if sWord in _dValues: return _dValues[sWord] return "" -def formatTags (sTags): +def readableMorph (sMorph): "returns string: readable tags" + if not sMorph: + return "mot inconnu" sRes = "" - sTags = re.sub("(?<=V[1-3])[itpqnmr_eaxz]+", "", sTags) - sTags = re.sub("(?<=V0[ea])[itpqnmr_eaxz]+", "", sTags) - for m in _zTag.finditer(sTags): - sRes += _dTAGS.get(m.group(0), " [{}]".format(m.group(0)))[0] + sMorph = re.sub("(?<=V[0123][ea_])[itpqnmr_eaxz]+", "", sMorph) + for m in _zTag.finditer(sMorph): + if m.group(0) in _dTAGS: + sRes += _dTAGS[m.group(0)][0] + else: + sRes += " [" + m.group(0) + "]?" 
+ + _zPartDemForm = re.compile("([\\w]+)-(là|ci)$") +_zInterroVerb = re.compile("([\\w]+)(-(?:t-(?:ie?l|elle|on)|je|tu|ie?ls?|elles?|on|[nv]ous))$") +_zImperatifVerb = re.compile("([\\w]+)(-(?:l(?:es?|a)-(?:moi|toi|lui|[nv]ous|leur)|y|en|[mts]['’ʼ‘‛´`′‵՚ꞌꞋ](?:y|en)|les?|la|[mt]oi|leur|lui))$") + +def setLabelsOnToken (dToken): + "adds readable information on token" + # Token: .sType, .sValue, .nStart, .nEnd, .lMorph + try: + if dToken["sType"] == "PUNC" or dToken["sType"] == "SIGN": + dToken["aLabels"] = [_dValues.get(dToken["sValue"], "signe de ponctuation divers")] + elif dToken["sType"] == 'NUM': + dToken["aLabels"] = ["nombre"] + elif dToken["sType"] == 'LINK': + dToken["aLabels"] = ["hyperlien"] + elif dToken["sType"] == 'TAG': + dToken["aLabels"] = ["étiquette (hashtag)"] + elif dToken["sType"] == 'HTML': + dToken["aLabels"] = ["balise HTML"] + elif dToken["sType"] == 'PSEUDOHTML': + dToken["aLabels"] = ["balise pseudo-HTML"] + elif dToken["sType"] == 'HTMLENTITY': + dToken["aLabels"] = ["entité caractère XML/HTML"] + elif dToken["sType"] == 'HOUR': + dToken["aLabels"] = ["heure"] + elif dToken["sType"] == 'WORD_ELIDED': + dToken["aLabels"] = [_dValues.get(dToken["sValue"].lower(), "préfixe élidé inconnu")] + elif dToken["sType"] == 'WORD_ORDINAL': + dToken["aLabels"] = ["nombre ordinal"] + elif dToken["sType"] == 'FOLDERUNIX': + dToken["aLabels"] = ["dossier UNIX (et dérivés)"] + elif dToken["sType"] == 'FOLDERWIN': + dToken["aLabels"] = ["dossier Windows"] + elif dToken["sType"] == 'WORD_ACRONYM': + dToken["aLabels"] = ["sigle ou acronyme"] + elif dToken["sType"] == 'WORD': + if "lMorph" in dToken and dToken["lMorph"]: + # with morphology + dToken["aLabels"] = [] + for sMorph in dToken["lMorph"]: + dToken["aLabels"].append(readableMorph(sMorph)) + else: + # no morphology, guessing + if dToken["sValue"].count("-") > 4: + dToken["aLabels"] = ["élément complexe indéterminé"] + elif _zPartDemForm.search(dToken["sValue"]): + # mots avec particules démonstratives + dToken["aLabels"] = ["mot avec particule démonstrative"] + elif _zImperatifVerb.search(dToken["sValue"]): + # formes impératives + dToken["aLabels"] = ["forme verbale impérative"] + elif _zInterroVerb.search(dToken["sValue"]): + # formes interrogatives + dToken["aLabels"] = ["forme verbale interrogative"] + else: + dToken["aLabels"] = ["mot inconnu du dictionnaire"] + if "lSubTokens" in dToken: + for dSubToken in dToken["lSubTokens"]: + if dSubToken["sValue"]: + if dSubToken["sValue"] in _dValues: + dSubToken["lMorph"] = [ "" ] + dSubToken["aLabels"] = [ _dValues[dSubToken["sValue"]] ] + else: + dSubToken["aLabels"] = [ readableMorph(sMorph) for sMorph in dSubToken["lMorph"] ] + else: + dToken["aLabels"] = ["token de nature inconnue"] + except: + return + # Other functions def filterSugg (aSugg): "exclude suggestions" return filter(lambda sSugg: not sSugg.endswith(("è", "È")), aSugg)
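When a WORD token reaches setLabelsOnToken() with no morphology at all, the fallback branch above guesses a label from the token's shape alone, using the three regexes just defined. A self-contained sketch of that decision chain (regexes copied from the hunk above; the sample words are illustrative):

    import re

    zPartDemForm = re.compile("([\\w]+)-(là|ci)$")
    zInterroVerb = re.compile("([\\w]+)(-(?:t-(?:ie?l|elle|on)|je|tu|ie?ls?|elles?|on|[nv]ous))$")
    zImperatifVerb = re.compile("([\\w]+)(-(?:l(?:es?|a)-(?:moi|toi|lui|[nv]ous|leur)|y|en|[mts]['’ʼ‘‛´`′‵՚ꞌꞋ](?:y|en)|les?|la|[mt]oi|leur|lui))$")

    def guessLabel (sValue):
        "same decision order as the fallback branch of setLabelsOnToken()"
        if sValue.count("-") > 4:
            return "élément complexe indéterminé"
        if zPartDemForm.search(sValue):
            return "mot avec particule démonstrative"
        if zImperatifVerb.search(sValue):
            return "forme verbale impérative"
        if zInterroVerb.search(sValue):
            return "forme verbale interrogative"
        return "mot inconnu du dictionnaire"

    print(guessLabel("village-ci"))      # mot avec particule démonstrative
    print(guessLabel("donne-m’en"))      # forme verbale impérative
    print(guessLabel("viendra-t-elle"))  # forme verbale interrogative

Note that the imperative pattern is deliberately tried before the interrogative one, since suffixes such as "-nous" would otherwise be claimed by the interrogative regex.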
Index: graphspell/spellchecker.py ================================================================== --- graphspell/spellchecker.py +++ graphspell/spellchecker.py @@ -98,11 +98,11 @@ def deactivatePersonalDictionary (self): "deactivate personal dictionary" self.bPersonalDic = False - # Default suggestions + # Lexicographer def loadLexicographer (self, sLangCode): "load default suggestion module for <sLangCode>" try: self.lexicographer = importlib.import_module(".lexgraph_"+sLangCode, "grammalecte.graphspell") @@ -120,15 +120,38 @@ lMorph = self.getMorph(sElem) sLex = self.lexicographer.analyze(sElem) if sLex: aRes = [ (" | ".join(lMorph), sLex) ] else: - aRes = [ (sMorph, self.lexicographer.formatTags(sMorph)) for sMorph in lMorph ] + aRes = [ (sMorph, self.lexicographer.readableMorph(sMorph)) for sMorph in lMorph ] if aRes: lWordAndMorph.append((sElem, aRes)) return lWordAndMorph + def readableMorph (self, sMorph): + "returns a string with the meaning of tags" + if not self.lexicographer: + return "" + return self.lexicographer.readableMorph(sMorph) + + def setLabelsOnToken (self, dToken): + "adds readable information on token" + if not self.lexicographer: + return + if "lMorph" not in dToken: + dToken["lMorph"] = self.getMorph(dToken["sValue"]) + if dToken["sType"] == "WORD": + dToken["bValidToken"] = self.isValidToken(dToken["sValue"]) + sPrefix, sStem, sSuffix = self.lexicographer.split(dToken["sValue"]) + if sStem != dToken["sValue"]: + dToken["lSubTokens"] = [ + { "sType": "WORD", "sValue": sPrefix, "lMorph": self.getMorph(sPrefix) }, + { "sType": "WORD", "sValue": sStem, "lMorph": self.getMorph(sStem) }, + { "sType": "WORD", "sValue": sSuffix, "lMorph": self.getMorph(sSuffix) } + ] + self.lexicographer.setLabelsOnToken(dToken) + # Storage def activateStorage (self): "store all lemmas and morphologies retrieved from the word graph" @@ -233,11 +254,11 @@ return self._dLemmas[sWord] return { s[1:s.find("/")] for s in self.getMorph(sWord) } def suggest (self, sWord, nSuggLimit=10): "generator: returns 1, 2 or 3 lists of suggestions" - if self.lexicographer.dSugg: + if self.lexicographer: if sWord in self.lexicographer.dSugg: yield self.lexicographer.dSugg[sWord].split("|") elif sWord.istitle() and sWord.lower() in self.lexicographer.dSugg: lRes = self.lexicographer.dSugg[sWord.lower()].split("|") yield list(map(lambda sSugg: sSugg[0:1].upper()+sSugg[1:], lRes))
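The guard change in suggest() (testing for the lexicographer itself rather than for a non-empty dSugg) keeps the fast path for hand-written suggestions, including the title-case fix-up for capitalized words. A minimal sketch of that path; the dSugg entries below are invented for illustration:

    # toy stand-in for lexicographer.dSugg; multiple suggestions are "|"-separated
    dSugg = { "bcp": "beaucoup", "pcq": "parce que|puisque" }

    def suggest (sWord):
        if sWord in dSugg:
            # exact match: return the hand-written suggestions as-is
            yield dSugg[sWord].split("|")
        elif sWord.istitle() and sWord.lower() in dSugg:
            # title-cased word: look up its lowercase form and re-capitalize
            lRes = dSugg[sWord.lower()].split("|")
            yield list(map(lambda sSugg: sSugg[0:1].upper()+sSugg[1:], lRes))

    print(list(suggest("Bcp")))   # [['Beaucoup']]

Falling back to the lowercase entry means the table only needs one spelling per shortcut, while capitalized occurrences at sentence starts still get correctly cased suggestions.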