Index: gc_core/js/lang_core/gc_engine.js
==================================================================
--- gc_core/js/lang_core/gc_engine.js
+++ gc_core/js/lang_core/gc_engine.js
@@ -225,16 +225,10 @@
                 // the list of tokens is duplicated, to keep tokens from being deleted when analysis
             }
             this.parseText(this.sSentence, this.sSentence0, false, iStart, sCountry, dOpt, bShowRuleId, bDebug, bContext);
             if (bFullInfo) {
                 for (let oToken of this.lTokens0) {
-                    if (oToken["sType"] == "WORD") {
-                        oToken["bValidToken"] = gc_engine.oSpellChecker.isValidToken(oToken["sValue"]);
-                    }
-                    if (!oToken.hasOwnProperty("lMorph")) {
-                        oToken["lMorph"] = gc_engine.oSpellChecker.getMorph(oToken["sValue"]);
-                    }
                     gc_engine.oSpellChecker.setLabelsOnToken(oToken);
                 }
                 lSentences.push({
                     "nStart": iStart,
                     "nEnd": iEnd,

Index: gc_core/py/lang_core/gc_engine.py
==================================================================
--- gc_core/py/lang_core/gc_engine.py
+++ gc_core/py/lang_core/gc_engine.py
@@ -286,14 +286,10 @@
         if bFullInfo:
             self.lTokens0 = list(self.lTokens)  # the list of tokens is duplicated, to keep tokens from being deleted when analysis
         self.parseText(self.sSentence, self.sSentence0, False, iStart, sCountry, dOpt, bShowRuleId, bDebug, bContext)
         if bFullInfo:
             for dToken in self.lTokens0:
-                if dToken["sType"] == "WORD":
-                    dToken["bValidToken"] = _oSpellChecker.isValidToken(dToken["sValue"])
-                if "lMorph" not in dToken:
-                    dToken["lMorph"] = _oSpellChecker.getMorph(dToken["sValue"])
                 _oSpellChecker.setLabelsOnToken(dToken)
             lSentences.append({
                 "nStart": iStart,
                 "nEnd": iEnd,
                 "sSentence": self.sSentence0,
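
Both engines previously filled "bValidToken" and "lMorph" themselves before labelling each token; that work now happens inside SpellChecker.setLabelsOnToken() (see graphspell/spellchecker.py and graphspell-js/spellchecker.js below), so each engine reduces to a single call per token. A minimal Python sketch of the resulting contract; the helper name is invented, the field names come from this changeset:

    def annotate_tokens(lTokens, oSpellChecker):
        """Annotate tokens in place for the bFullInfo output (illustrative helper)."""
        for dToken in lTokens:
            # setLabelsOnToken() is now responsible for "lMorph", "bValidToken",
            # "aLabels" and, for splittable words, "lSubTokens".
            oSpellChecker.setLabelsOnToken(dToken)
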
Index: gc_lang/fr/webext/content_scripts/panel_gc.js
==================================================================
--- gc_lang/fr/webext/content_scripts/panel_gc.js
+++ gc_lang/fr/webext/content_scripts/panel_gc.js
@@ -584,11 +584,11 @@
                     for (let oToken of oSentence.lTokens) {
                         if (oToken["sType"] != "INFO" && !oToken.hasOwnProperty("bMerged")) {
                             if (oToken["sType"] == "WORD" && !oToken["bValidToken"]) {
                                 oToken["sType"] = "UNKNOWN_WORD";
                             }
-                            xTokenList.appendChild(this._createTokenBlock2(oToken));
+                            xTokenList.appendChild(this._createTokenBlock(oToken));
                         }
                     }
                     xSentenceBlock.appendChild(xTokenList);
                     this.xLxgResultZone.appendChild(xSentenceBlock);
                 }
@@ -598,51 +598,10 @@
             showError(e);
         }
         this.stopWaitIcon();
     }

-    _createTokenBlock2 (oToken) {
-        let xTokenBlock = oGrammalecte.createNode("div", {className: "grammalecte_lxg_token_block"});
-        // token description
-        xTokenBlock.appendChild(this._createTokenDescr2(oToken));
-        return xTokenBlock;
-    }
-
-    _createTokenDescr2 (oToken) {
-        try {
-            let xTokenDescr = oGrammalecte.createNode("div", {className: "grammalecte_lxg_token_descr"});
-            xTokenDescr.appendChild(oGrammalecte.createNode("div", {className: "grammalecte_lxg_token grammalecte_lxg_token_" + oToken.sType, textContent: oToken.sValue}));
-            xTokenDescr.appendChild(oGrammalecte.createNode("div", {className: "grammalecte_lxg_token_colon", textContent: ":"}));
-            if (oToken.aLabels) {
-                if (oToken.aLabels.length < 2) {
-                    // one morphology only
-                    xTokenDescr.appendChild(oGrammalecte.createNode("div", {className: "grammalecte_lxg_morph_elem_inline", textContent: oToken.aLabels[0]}));
-                } else {
-                    // several morphology
-                    let xMorphList = oGrammalecte.createNode("div", {className: "grammalecte_lxg_morph_list"});
-                    for (let sLabel of oToken.aLabels) {
-                        xMorphList.appendChild(oGrammalecte.createNode("div", {className: "grammalecte_lxg_morph_elem", textContent: "• " + sLabel}));
-                    }
-                    xTokenDescr.appendChild(xMorphList);
-                }
-            } else {
-                xTokenDescr.appendChild(oGrammalecte.createNode("div", {className: "grammalecte_lxg_morph_elem_inline", textContent: "étiquettes non décrites : [" + oToken.lMorph + "]" }));
-            }
-            // other labels description
-            if (oToken.aOtherLabels) {
-                let xSubBlock = oGrammalecte.createNode("div", {className: "grammalecte_lxg_token_subblock"});
-                for (let sLabel of oToken.aOtherLabels) {
-                    xSubBlock.appendChild(oGrammalecte.createNode("div", {className: "grammalecte_lxg_other_tags", textContent: "• " + sLabel}));
-                }
-                xTokenDescr.appendChild(xSubBlock);
-            }
-            return xTokenDescr;
-        }
-        catch (e) {
-            showError(e);
-        }
-    }

     // Lexical analysis

     getListOfTokens () {
         if (!this.bOpened || this.bWorking) {
@@ -675,37 +634,44 @@
         }
     }

     _createTokenBlock (oToken) {
         let xTokenBlock = oGrammalecte.createNode("div", {className: "grammalecte_lxg_token_block"});
+        // token description
         xTokenBlock.appendChild(this._createTokenDescr(oToken));
-        if (oToken.aSubElem) {
+        // subtokens
+        if (oToken.hasOwnProperty("lSubTokens")) {
             let xSubBlock = oGrammalecte.createNode("div", {className: "grammalecte_lxg_token_subblock"});
-            for (let oSubElem of oToken.aSubElem) {
-                xSubBlock.appendChild(this._createTokenDescr(oSubElem));
+            for (let oSubToken of oToken["lSubTokens"]) {
+                if (oSubToken["sValue"] != "") {
+                    xSubBlock.appendChild(this._createTokenDescr(oSubToken));
+                }
             }
             xTokenBlock.appendChild(xSubBlock);
         }
         return xTokenBlock;
     }

     _createTokenDescr (oToken) {
         try {
             let xTokenDescr = oGrammalecte.createNode("div", {className: "grammalecte_lxg_token_descr"});
-            if (oToken.sType == "LOCP") {
-                xTokenDescr.appendChild(oGrammalecte.createNode("div", {className: "grammalecte_lxg_token_also", textContent: "possiblement › "}));
-            }
             xTokenDescr.appendChild(oGrammalecte.createNode("div", {className: "grammalecte_lxg_token grammalecte_lxg_token_" + oToken.sType, textContent: oToken.sValue}));
             xTokenDescr.appendChild(oGrammalecte.createNode("div", {className: "grammalecte_lxg_token_colon", textContent: ":"}));
-            if (oToken.aLabel.length === 1) {
-                xTokenDescr.appendChild(oGrammalecte.createNode("div", {className: "grammalecte_lxg_morph_elem_inline", textContent: oToken.aLabel[0]}));
+            if (oToken.aLabels) {
+                if (oToken.aLabels.length < 2) {
+                    // one morphology only
+                    xTokenDescr.appendChild(oGrammalecte.createNode("div", {className: "grammalecte_lxg_morph_elem_inline", textContent: oToken.aLabels[0]}));
+                } else {
+                    // several morphology
+                    let xMorphList = oGrammalecte.createNode("div", {className: "grammalecte_lxg_morph_list"});
+                    for (let sLabel of oToken.aLabels) {
+                        xMorphList.appendChild(oGrammalecte.createNode("div", {className: "grammalecte_lxg_morph_elem", textContent: "• " + sLabel}));
+                    }
+                    xTokenDescr.appendChild(xMorphList);
+                }
             } else {
-                let xMorphList = oGrammalecte.createNode("div", {className: "grammalecte_lxg_morph_list"});
-                for (let sLabel of oToken.aLabel) {
-                    xMorphList.appendChild(oGrammalecte.createNode("div", {className: "grammalecte_lxg_morph_elem", textContent: "• " + sLabel}));
-                }
-                xTokenDescr.appendChild(xMorphList);
+                xTokenDescr.appendChild(oGrammalecte.createNode("div", {className: "grammalecte_lxg_morph_elem_inline", textContent: "étiquettes non décrites : [" + oToken.lMorph + "]" }));
             }
             return xTokenDescr;
         }
         catch (e) {
             showError(e);
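
With _createTokenBlock2/_createTokenDescr2 removed, a single _createTokenBlock/_createTokenDescr pair renders both panels, reading aLabels and lSubTokens. The token dict it consumes now looks roughly like this (field names are from this changeset; the concrete values are invented for illustration):

    dToken = {
        "sType": "WORD",
        "sValue": "dis-moi",                       # hypothetical sample word
        "bValidToken": True,
        "lMorph": ["…"],                           # raw morphologies
        "aLabels": ["forme verbale impérative"],   # readable labels
        "lSubTokens": [                            # only present for split words
            {"sType": "WORD", "sValue": "dis", "aLabels": ["…"]},
            {"sType": "WORD", "sValue": "-moi", "aLabels": ["…"]},
        ],
    }

Sub-tokens with an empty sValue are skipped at render time, since splitting can yield an empty prefix or suffix.
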
Index: gc_lang/fr/webext/content_scripts/panel_lxg.css
==================================================================
--- gc_lang/fr/webext/content_scripts/panel_lxg.css
+++ gc_lang/fr/webext/content_scripts/panel_lxg.css
@@ -87,13 +87,13 @@
     margin: 4px 0;
 }
 div.grammalecte_lxg_token_subblock {
     margin: 2px 0 2px 20px;
     padding: 5px;
-    border-left: 4px solid hsl(150, 30%, 70%);
-    background-color: hsl(210, 10%, 90%);
-    border-radius: 2px;
+    border-left: 4px solid hsl(210, 50%, 80%);
+    background-color: hsl(210, 50%, 94%);
+    border-radius: 3px;
 }
 div.grammalecte_lxg_token_descr {
     margin: 1px;
     padding: 1px;
 }

Index: grammalecte-cli.py
==================================================================
--- grammalecte-cli.py
+++ grammalecte-cli.py
@@ -353,10 +353,16 @@
                                "!" if dToken["sType"] == "WORD" and not dToken.get("bValidToken", False) else " ",
                                " ".join(dToken.get("aTags", ""))
                            )
                        )
                        if "lMorph" in dToken:
                            for sMorph, sLabel in zip(dToken["lMorph"], dToken["aLabels"]):
                                echo("       {0:40}  {1}".format(sMorph, sLabel))
+                        if "lSubTokens" in dToken:
+                            for dSubToken in dToken["lSubTokens"]:
+                                if dSubToken["sValue"]:
+                                    echo("        · {0:20}".format(dSubToken["sValue"]))
+                                    for sMorph, sLabel in zip(dSubToken["lMorph"], dSubToken["aLabels"]):
+                                        echo("          {0:40}  {1}".format(sMorph, sLabel))
                    #echo(txt.getReadableErrors(dSentence["lGrammarErrors"], xArgs.width))
            else:
                for sParagraph in txt.getParagraph(sText):
                    if xArgs.textformatter:
                        sParagraph = oTextFormatter.formatText(sParagraph)

Index: graphspell-js/ibdawg.js
==================================================================
--- graphspell-js/ibdawg.js
+++ graphspell-js/ibdawg.js
@@ -313,10 +313,13 @@
         return Boolean(this._convBytesToInteger(this.byDic.slice(iAddr, iAddr+this.nBytesArc)) & this._finalNodeMask);
     }

     getMorph (sWord) {
         // retrieves morphologies list, different casing allowed
+        if (!sWord) {
+            return [];
+        }
         sWord = str_transform.spellingNormalization(sWord);
         let l = this.morph(sWord);
         if (sWord[0].gl_isUpperCase()) {
             l.push(...this.morph(sWord.toLowerCase()));
             if (sWord.gl_isUpperCase() && sWord.length > 1) {
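
The empty-string guard in getMorph() matters once sub-tokens exist: the lexicographer's split() can return an empty prefix or suffix, and the JS version would otherwise evaluate sWord[0].gl_isUpperCase() on undefined. A quick check of the intended behaviour on the Python side (the dictionary file name is illustrative and assumes a built French dictionary is available):

    from graphspell.ibdawg import IBDAWG

    oDic = IBDAWG("fr-allvars.bdic")    # assumption: default French dictionary
    assert oDic.getMorph("") == []      # empty sub-token value: no lookup, no crash
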
Index: graphspell-js/lexgraph_fr.js
==================================================================
--- graphspell-js/lexgraph_fr.js
+++ graphspell-js/lexgraph_fr.js
@@ -421,11 +421,11 @@
         if (m) {
             sPrefix = m[1] + "’";
             sWord = m[2];
         }
         // mots composés
-        m = /^([a-zA-Zà-öÀ-Ö0-9_ø-ÿØ-ßĀ-ʯfi-st-]+)(-(?:(?:les?|la)-(?:moi|toi|lui|[nv]ous|leur)|t-(?:il|elle|on)|y|en|[mts]’(?:y|en)|les?|l[aà]|[mt]oi|leur|lui|je|tu|ils?|elles?|on|[nv]ous|ce))$/i.exec(sWord);
+        m = /^([a-zA-Zà-öÀ-Ö0-9_ø-ÿØ-ßĀ-ʯfi-st]+)(-(?:(?:les?|la)-(?:moi|toi|lui|[nv]ous|leur)|t-(?:il|elle|on)|y|en|[mts]’(?:y|en)|les?|l[aà]|[mt]oi|leur|lui|je|tu|ils?|elles?|on|[nv]ous|ce))$/i.exec(sWord);
         if (m) {
             sWord = m[1];
             sSuffix = m[2];
         }
         // split word in 3 parts: prefix, root, suffix
@@ -440,10 +440,13 @@
         }
         return "";
     },

     readableMorph: function (sMorph) {
+        if (!sMorph) {
+            return " mot inconnu";
+        }
         let sRes = "";
         sMorph = sMorph.replace(/:V([0-3][ea_])[itpqnmr_eaxz]+/, ":V$1");
         let m;
         while ((m = this._zTag.exec(sMorph)) !== null) {
             if (this.dTag.has(m[0])) {
@@ -511,21 +514,10 @@
                     // with morphology
                     oToken["aLabels"] = [];
                     for (let sMorph of oToken["lMorph"]) {
                         oToken["aLabels"].push(this.readableMorph(sMorph));
                     }
-                    if (oToken.hasOwnProperty("sTags")) {
-                        let aTags = [];
-                        for (let sTag of oToken["sTags"]) {
-                            if (this.dValues.has(sTag)) {
-                                aTags.push(this.dValues.get(sTag))
-                            }
-                        }
-                        if (aTags.length > 0) {
-                            oToken["aOtherLabels"] = aTags;
-                        }
-                    }
                 } else {
                     // no morphology, guessing
                     if (oToken["sValue"].gl_count("-") > 4) {
                         oToken["aLabels"] = ["élément complexe indéterminé"];
                     }
@@ -542,10 +534,23 @@
                         oToken["aLabels"] = ["forme verbale interrogative"];
                     }
                     else {
                         oToken["aLabels"] = ["mot inconnu du dictionnaire"];
                     }
+                }
+                if (oToken.hasOwnProperty("lSubTokens")) {
+                    for (let oSubToken of oToken["lSubTokens"]) {
+                        if (oSubToken["sValue"]) {
+                            if (this.dValues.has(oSubToken["sValue"])) {
+                                oSubToken["lMorph"] = [ "" ];
+                                oSubToken["aLabels"] = [ this.dValues.get(oSubToken["sValue"]) ];
+                            }
+                            else {
+                                oSubToken["aLabels"] = oSubToken["lMorph"].map((sMorph) => this.readableMorph(sMorph));
+                            }
+                        }
+                    }
                 }
                 break;
             default:
                 oToken["aLabels"] = ["token de nature inconnue"];
         }
@@ -554,172 +559,172 @@
         }
     },

     getInfoForToken: function (oToken) {
         // Token: .sType, .sValue, .nStart, .nEnd
-        // return a object {sType, sValue, aLabel}
+        // return a object {sType, sValue, aLabels}
         let m = null;
         try {
             switch (oToken.sType) {
                 case 'PUNC':
                 case 'SIGN':
                     return {
                         sType: oToken.sType,
                         sValue: oToken.sValue,
-                        aLabel: [this.dValues.gl_get(oToken.sValue, "caractère indéterminé")]
+                        aLabels: [this.dValues.gl_get(oToken.sValue, "caractère indéterminé")]
                     };
                     break;
                 case 'NUM':
                     return {
                         sType: oToken.sType,
                         sValue: oToken.sValue,
-                        aLabel: ["nombre"]
+                        aLabels: ["nombre"]
                     };
                     break;
                 case 'LINK':
                     return {
                         sType: oToken.sType,
                         sValue: oToken.sValue.slice(0, 40) + "…",
-                        aLabel: ["hyperlien"]
+                        aLabels: ["hyperlien"]
                     };
                     break;
                 case 'TAG':
                     return {
                         sType: oToken.sType,
                         sValue: oToken.sValue,
-                        aLabel: ["étiquette (hashtag)"]
+                        aLabels: ["étiquette (hashtag)"]
                     };
                     break;
                 case 'HTML':
                     return {
                         sType: oToken.sType,
                         sValue: oToken.sValue.slice(0, 40) + "…",
-                        aLabel: ["balise HTML"]
+                        aLabels: ["balise HTML"]
                     };
                     break;
                 case 'PSEUDOHTML':
                     return {
                         sType: oToken.sType,
                         sValue: oToken.sValue,
-                        aLabel: ["balise pseudo-HTML"]
+                        aLabels: ["balise pseudo-HTML"]
                     };
                     break;
                 case 'HTMLENTITY':
                     return {
                         sType: oToken.sType,
                         sValue: oToken.sValue,
-                        aLabel: ["entité caractère XML/HTML"]
+                        aLabels: ["entité caractère XML/HTML"]
                     };
                     break;
                 case 'HOUR':
                     return {
                         sType: oToken.sType,
                         sValue: oToken.sValue,
-                        aLabel: ["heure"]
+                        aLabels: ["heure"]
                     };
                     break;
                 case 'WORD_ELIDED':
                     return {
                         sType: oToken.sType,
                         sValue: oToken.sValue,
-                        aLabel: [this.dValues.gl_get(oToken.sValue.toLowerCase(), "préfixe élidé inconnu")]
+                        aLabels: [this.dValues.gl_get(oToken.sValue.toLowerCase(), "préfixe élidé inconnu")]
                     };
                     break;
                 case 'WORD_ORDINAL':
                     return {
                         sType: oToken.sType,
                         sValue: oToken.sValue,
-                        aLabel: ["nombre ordinal"]
+                        aLabels: ["nombre ordinal"]
                     };
                     break;
                 case 'FOLDERUNIX':
                     return {
                         sType: oToken.sType,
                         sValue: oToken.sValue.slice(0, 40) + "…",
-                        aLabel: ["dossier UNIX (et dérivés)"]
+                        aLabels: ["dossier UNIX (et dérivés)"]
                     };
                     break;
                 case 'FOLDERWIN':
                     return {
                         sType: oToken.sType,
                         sValue: oToken.sValue.slice(0, 40) + "…",
-                        aLabel: ["dossier Windows"]
+                        aLabels: ["dossier Windows"]
                     };
                     break;
                 case 'WORD_ACRONYM':
                     return {
                         sType: oToken.sType,
                         sValue: oToken.sValue,
-                        aLabel: ["Sigle ou acronyme"]
+                        aLabels: ["Sigle ou acronyme"]
                     };
                     break;
                 case 'WORD':
                     if (oToken.sValue.gl_count("-") > 4) {
                         return {
                             sType: "COMPLEX",
                             sValue: oToken.sValue,
-                            aLabel: ["élément complexe indéterminé"]
+                            aLabels: ["élément complexe indéterminé"]
                         };
                     }
                     else if (m = this._zPartDemForm.exec(oToken.sValue)) {
                         // mots avec particules démonstratives
                         if (this._aPartDemExceptList.has(m[1].toLowerCase())) {
                             return {
                                 sType: "WORD",
                                 sValue: oToken.sValue,
-                                aLabel: this._getMorph(oToken.sValue)
+                                aLabels: this._getMorph(oToken.sValue)
                             };
                         }
                         return {
                             sType: oToken.sType,
                             sValue: oToken.sValue,
-                            aLabel: ["mot avec particule démonstrative"],
-                            aSubElem: [
-                                { sType: oToken.sType, sValue: m[1], aLabel: this._getMorph(m[1]) },
-                                { sType: oToken.sType, sValue: m[2], aLabel: [ this._formatSuffix(m[2]) ] }
+                            aLabels: ["mot avec particule démonstrative"],
+                            lSubTokens: [
+                                { sType: oToken.sType, sValue: m[1], aLabels: this._getMorph(m[1]) },
+                                { sType: oToken.sType, sValue: m[2], aLabels: [ this._formatSuffix(m[2]) ] }
                             ]
                         };
                     }
                     else if (m = this._zImperatifVerb.exec(oToken.sValue)) {
                         // formes interrogatives
                         return {
                             sType: oToken.sType,
                             sValue: oToken.sValue,
-                            aLabel: ["forme verbale impérative"],
-                            aSubElem: [
-                                { sType: oToken.sType, sValue: m[1], aLabel: this._getMorph(m[1]) },
-                                { sType: oToken.sType, sValue: m[2], aLabel: [ this._formatSuffix(m[2]) ] }
+                            aLabels: ["forme verbale impérative"],
+                            lSubTokens: [
+                                { sType: oToken.sType, sValue: m[1], aLabels: this._getMorph(m[1]) },
+                                { sType: oToken.sType, sValue: m[2], aLabels: [ this._formatSuffix(m[2]) ] }
                             ]
                         };
                     }
                     else if (m = this._zInterroVerb.exec(oToken.sValue)) {
                         // formes interrogatives
                         return {
                             sType: oToken.sType,
                             sValue: oToken.sValue,
-                            aLabel: ["forme verbale interrogative"],
-                            aSubElem: [
-                                { sType: oToken.sType, sValue: m[1], aLabel: this._getMorph(m[1]) },
-                                { sType: oToken.sType, sValue: m[2], aLabel: [ this._formatSuffix(m[2]) ] }
+                            aLabels: ["forme verbale interrogative"],
+                            lSubTokens: [
+                                { sType: oToken.sType, sValue: m[1], aLabels: this._getMorph(m[1]) },
+                                { sType: oToken.sType, sValue: m[2], aLabels: [ this._formatSuffix(m[2]) ] }
                             ]
                         };
                     }
                     else if (this.oSpellChecker.isValidToken(oToken.sValue)) {
                         return {
                             sType: oToken.sType,
                             sValue: oToken.sValue,
-                            aLabel: this._getMorph(oToken.sValue)
+                            aLabels: this._getMorph(oToken.sValue)
                         };
                     }
                     else {
                         return {
                             sType: "UNKNOWN_WORD",
                             sValue: oToken.sValue,
-                            aLabel: ["mot inconnu du dictionnaire"]
+                            aLabels: ["mot inconnu du dictionnaire"]
                         };
                     }
                     break;
                 default:
                     return {
                         sType: oToken.sType,
                         sValue: oToken.sValue,
-                        aLabel: ["token inconnu"]
+                        aLabels: ["token inconnu"]
                     }
             }
         }
         catch (e) {
             console.error(e);
         }
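
Sub-token labelling follows a two-step lookup: a sub-token whose raw value is a known particle (present in dValues / _dValues) gets that single description and an empty morphology, otherwise each of its morphologies is run through readableMorph(). The Python mirror of this logic (the same branch as in the graphspell/lexgraph_fr.py hunk below, shown standalone for clarity):

    def label_subtokens(dToken, dValues, readableMorph):
        """Fill "aLabels" on each sub-token of dToken, in place."""
        for dSubToken in dToken.get("lSubTokens", []):
            if not dSubToken["sValue"]:
                continue                    # empty prefix/suffix: nothing to label
            if dSubToken["sValue"] in dValues:
                dSubToken["lMorph"] = [""]  # known particle: fixed description
                dSubToken["aLabels"] = [dValues[dSubToken["sValue"]]]
            else:
                dSubToken["aLabels"] = [readableMorph(sMorph) for sMorph in dSubToken["lMorph"]]
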
["mot avec particule démonstrative"], + lSubTokens: [ + { sType: oToken.sType, sValue: m[1], aLabels: this._getMorph(m[1]) }, + { sType: oToken.sType, sValue: m[2], aLabels: [ this._formatSuffix(m[2]) ] } ] }; } else if (m = this._zImperatifVerb.exec(oToken.sValue)) { // formes interrogatives return { sType: oToken.sType, sValue: oToken.sValue, - aLabel: ["forme verbale impérative"], - aSubElem: [ - { sType: oToken.sType, sValue: m[1], aLabel: this._getMorph(m[1]) }, - { sType: oToken.sType, sValue: m[2], aLabel: [ this._formatSuffix(m[2]) ] } + aLabels: ["forme verbale impérative"], + lSubTokens: [ + { sType: oToken.sType, sValue: m[1], aLabels: this._getMorph(m[1]) }, + { sType: oToken.sType, sValue: m[2], aLabels: [ this._formatSuffix(m[2]) ] } ] }; } else if (m = this._zInterroVerb.exec(oToken.sValue)) { // formes interrogatives return { sType: oToken.sType, sValue: oToken.sValue, - aLabel: ["forme verbale interrogative"], - aSubElem: [ - { sType: oToken.sType, sValue: m[1], aLabel: this._getMorph(m[1]) }, - { sType: oToken.sType, sValue: m[2], aLabel: [ this._formatSuffix(m[2]) ] } + aLabels: ["forme verbale interrogative"], + lSubTokens: [ + { sType: oToken.sType, sValue: m[1], aLabels: this._getMorph(m[1]) }, + { sType: oToken.sType, sValue: m[2], aLabels: [ this._formatSuffix(m[2]) ] } ] }; } else if (this.oSpellChecker.isValidToken(oToken.sValue)) { return { sType: oToken.sType, sValue: oToken.sValue, - aLabel: this._getMorph(oToken.sValue) + aLabels: this._getMorph(oToken.sValue) }; } else { return { sType: "UNKNOWN_WORD", sValue: oToken.sValue, - aLabel: ["mot inconnu du dictionnaire"] + aLabels: ["mot inconnu du dictionnaire"] }; } break; default: return { sType: oToken.sType, sValue: oToken.sValue, - aLabel: ["token inconnu"] + aLabels: ["token inconnu"] } } } catch (e) { console.error(e); } Index: graphspell-js/spellchecker.js ================================================================== --- graphspell-js/spellchecker.js +++ graphspell-js/spellchecker.js @@ -173,10 +173,24 @@ } setLabelsOnToken (oToken) { if (!this.lexicographer) { return; + } + if (!oToken.hasOwnProperty("lMorph")) { + oToken["lMorph"] = this.getMorph(oToken["sValue"]); + } + if (oToken["sType"] == "WORD") { + oToken["bValidToken"] = this.isValidToken(oToken["sValue"]); + let [sPrefix, sStem, sSuffix] = this.lexicographer.split(oToken["sValue"]); + if (sStem != oToken["sValue"]) { + oToken["lSubTokens"] = [ + { "sType": "WORD", "sValue": sPrefix, "lMorph": this.getMorph(sPrefix) }, + { "sType": "WORD", "sValue": sStem, "lMorph": this.getMorph(sStem) }, + { "sType": "WORD", "sValue": sSuffix, "lMorph": this.getMorph(sSuffix) } + ]; + } } this.lexicographer.setLabelsOnToken(oToken); } Index: graphspell/ibdawg.py ================================================================== --- graphspell/ibdawg.py +++ graphspell/ibdawg.py @@ -298,10 +298,12 @@ return False return bool(int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask) def getMorph (self, sWord): "retrieves morphologies list, different casing allowed" + if not sWord: + return [] sWord = st.spellingNormalization(sWord) l = self.morph(sWord) if sWord[0:1].isupper(): l.extend(self.morph(sWord.lower())) if sWord.isupper() and len(sWord) > 1: Index: graphspell/lexgraph_fr.py ================================================================== --- graphspell/lexgraph_fr.py +++ graphspell/lexgraph_fr.py @@ -399,10 +399,12 @@ return "" def readableMorph (sMorph): "returns string: readable tags" + if not sMorph: + 
return "mot inconnu" sRes = "" sMorph = re.sub("(?<=V[0123][ea_])[itpqnmr_eaxz]+", "", sMorph) for m in _zTag.finditer(sMorph): if m.group(0) in _dTAGS: sRes += _dTAGS[m.group(0)][0] @@ -452,17 +454,10 @@ if "lMorph" in dToken and dToken["lMorph"]: # with morphology dToken["aLabels"] = [] for sMorph in dToken["lMorph"]: dToken["aLabels"].append(readableMorph(sMorph)) - if "sTags" in dToken: - aTags = [] - for sTag in dToken["sTags"]: - if sTag in _dValues: - aTags.append(_dValues[sTag]) - if aTags: - dToken["aOtherLabels"] = aTags else: # no morphology, guessing if dToken["sValue"].count("-") > 4: dToken["aLabels"] = ["élément complexe indéterminé"] elif _zPartDemForm.search(dToken["sValue"]): @@ -472,10 +467,20 @@ # formes interrogatives dToken["aLabels"] = ["forme verbale impérative"] elif _zInterroVerb.search(dToken["sValue"]): # formes interrogatives dToken["aLabels"] = ["forme verbale interrogative"] + else: + dToken["aLabels"] = ["mot inconnu du dictionnaire"] + if "lSubTokens" in dToken: + for dSubToken in dToken["lSubTokens"]: + if dSubToken["sValue"]: + if dSubToken["sValue"] in _dValues: + dSubToken["lMorph"] = [ "" ] + dSubToken["aLabels"] = [ _dValues[dSubToken["sValue"]] ] + else: + dSubToken["aLabels"] = [ readableMorph(sMorph) for sMorph in dSubToken["lMorph"] ] else: dToken["aLabels"] = ["token de nature inconnue"] except: return Index: graphspell/spellchecker.py ================================================================== --- graphspell/spellchecker.py +++ graphspell/spellchecker.py @@ -133,10 +133,21 @@ return self.lexicographer.readableMorph(sMorph) def setLabelsOnToken (self, dToken): if not self.lexicographer: return + if "lMorph" not in dToken: + dToken["lMorph"] = self.getMorph(dToken["sValue"]) + if dToken["sType"] == "WORD": + dToken["bValidToken"] = self.isValidToken(dToken["sValue"]) + sPrefix, sStem, sSuffix = self.lexicographer.split(dToken["sValue"]) + if sStem != dToken["sValue"]: + dToken["lSubTokens"] = [ + { "sType": "WORD", "sValue": sPrefix, "lMorph": self.getMorph(sPrefix) }, + { "sType": "WORD", "sValue": sStem, "lMorph": self.getMorph(sStem) }, + { "sType": "WORD", "sValue": sSuffix, "lMorph": self.getMorph(sSuffix) } + ] self.lexicographer.setLabelsOnToken(dToken) # Storage