Index: gc_core/js/text.js ================================================================== --- gc_core/js/text.js +++ gc_core/js/text.js @@ -9,16 +9,16 @@ var helpers = require("resource://grammalecte/helpers.js"); } var text = { - getParagraph: function* (sText) { + getParagraph: function* (sText, sSepParagraph = "\n") { // generator: returns paragraphs of text let iStart = 0; let iEnd = 0; sText = sText.replace("\r\n", "\n").replace("\r", "\n"); - while ((iEnd = sText.indexOf("\n", iStart)) !== -1) { + while ((iEnd = sText.indexOf(sSepParagraph, iStart)) !== -1) { yield sText.slice(iStart, iEnd); iStart = iEnd + 1; } yield sText.slice(iStart); }, @@ -43,11 +43,11 @@ }, getReadableError: function (oErr) { // Returns an error oErr as a readable error try { - let sResult = "\n* " + oErr['nStart'] + ":" + oErr['nEnd'] + let sResult = "\n* " + oErr['nStart'] + ":" + oErr['nEnd'] + " # " + oErr['sLineId'] + " # " + oErr['sRuleId'] + ":\n"; sResult += " " + oErr["sMessage"]; if (oErr["aSuggestions"].length > 0) { sResult += "\n > Suggestions : " + oErr["aSuggestions"].join(" | "); } Index: gc_lang/fr/build_data.py ================================================================== --- gc_lang/fr/build_data.py +++ gc_lang/fr/build_data.py @@ -256,11 +256,11 @@ def makePhonetTable (sp, bJS=False): print("> Correspondances phonétiques ", end="") print("(Python et JavaScript)" if bJS else "(Python seulement)") - + try: oDict = ibdawg.IBDAWG("French.bdic") except: traceback.print_exc() return @@ -316,12 +316,14 @@ for sLine in hSrc.readlines(): if not sLine.startswith("#") and sLine.strip(): lElem = sLine.strip().split() dCur = dLocutions for sWord in lElem: - if sWord not in dCur: + if sWord not in dCur and not sWord.startswith(":"): dCur[sWord] = {} + if sWord not in dCur and sWord.startswith(":"): + dCur[sWord] = '' dCur = dCur[sWord] sCode = "# generated data (do not edit)\n\n" + \ "dLocutions = " + str(dLocutions) + "\n" open(sp+"/modules/locutions_data.py", "w", encoding="utf-8", newline="\n").write(sCode) Index: gc_lang/fr/data/locutions.txt ================================================================== --- gc_lang/fr/data/locutions.txt +++ gc_lang/fr/data/locutions.txt cannot compute difference between binary files Index: gc_lang/fr/modules-js/lexicographe.js ================================================================== --- gc_lang/fr/modules-js/lexicographe.js +++ gc_lang/fr/modules-js/lexicographe.js @@ -7,15 +7,15 @@ ${string} ${map} -if (typeof(require) !== 'undefined') { +if (typeof (require) !== 'undefined') { var helpers = require("resource://grammalecte/helpers.js"); } -const _dTAGS = new Map ([ +const _dTAGS = new Map([ [':G', "[mot grammatical]"], [':N', " nom,"], [':A', " adjectif,"], [':M1', " prénom,"], [':M2', " patronyme,"], @@ -41,16 +41,17 @@ [':Ot', " pronom interrogatif,"], [':Or', " pronom relatif,"], [':Ow', " pronom adverbial,"], [':Os', " pronom personnel sujet,"], [':Oo', " pronom personnel objet,"], - [':C', " conjonction,"], - [':Ĉ', " conjonction (él.),"], + [':C', " conjonction,"], [':Cc', " conjonction de coordination,"], [':Cs', " conjonction de subordination,"], - [':Ĉs', " conjonction de subordination (él.),"], - + + [':Ĺ', " locution,"], + [':Ĉ', " locution conjonctivale (él.),"], + [':Ĉs', " locution conjonctivale de subordination (él.),"], [':Ŵ', " locution adverbiale (él.),"], [':Ñ', " locution nominale (él.),"], [':Â', " locution adjectivale (él.),"], [':Ṽ', " locution verbale (él.),"], [':Ŕ', " locution prépositive (él.),"], @@ -66,11 +67,11 @@ [':V0a', " verbe,"], [':O1', " 1ʳᵉ pers.,"], [':O2', " 2ᵉ pers.,"], [':O3', " 3ᵉ pers.,"], - + [':e', " épicène"], [':m', " masculin"], [':f', " féminin"], [':s', " singulier"], [':p', " pluriel"], @@ -107,11 +108,11 @@ ['/R', " {réforme}"], ['/A', ""], ['/X', ""] ]); -const _dPFX = new Map ([ +const _dPFX = new Map([ ['d', "(de), déterminant épicène invariable"], ['l', "(le/la), déterminant masculin/féminin singulier"], ['j', "(je), pronom personnel sujet, 1ʳᵉ pers., épicène singulier"], ['m', "(me), pronom personnel objet, 1ʳᵉ pers., épicène singulier"], ['t', "(te), pronom personnel objet, 2ᵉ pers., épicène singulier"], @@ -123,28 +124,28 @@ ['lorsqu', "(lorsque), conjonction de subordination"], ['quoiqu', "(quoique), conjonction de subordination"], ['jusqu', "(jusque), préposition"] ]); -const _dAD = new Map ([ +const _dAD = new Map([ ['je', " pronom personnel sujet, 1ʳᵉ pers. sing."], ['tu', " pronom personnel sujet, 2ᵉ pers. sing."], ['il', " pronom personnel sujet, 3ᵉ pers. masc. sing."], ['on', " pronom personnel sujet, 3ᵉ pers. sing. ou plur."], ['elle', " pronom personnel sujet, 3ᵉ pers. fém. sing."], ['nous', " pronom personnel sujet/objet, 1ʳᵉ pers. plur."], ['vous', " pronom personnel sujet/objet, 2ᵉ pers. plur."], ['ils', " pronom personnel sujet, 3ᵉ pers. masc. plur."], ['elles', " pronom personnel sujet, 3ᵉ pers. masc. plur."], - + ["là", " particule démonstrative"], ["ci", " particule démonstrative"], - + ['le', " COD, masc. sing."], ['la', " COD, fém. sing."], ['les', " COD, plur."], - + ['moi', " COI (à moi), sing."], ['toi', " COI (à toi), sing."], ['lui', " COI (à lui ou à elle), sing."], ['nous2', " COI (à nous), plur."], ['vous2', " COI (à vous), plur."], @@ -159,11 +160,11 @@ ["m'en", " (me) pronom personnel objet + (en) pronom adverbial"], ["t'en", " (te) pronom personnel objet + (en) pronom adverbial"], ["s'en", " (se) pronom personnel objet + (en) pronom adverbial"] ]); -const _dSeparator = new Map ([ +const _dSeparator = new Map([ ['.', "point"], ['·', "point médian"], ['…', "points de suspension"], [':', "deux-points"], [';', "point-virgule"], @@ -194,74 +195,110 @@ ]); class Lexicographe { - constructor (oDict) { + constructor(oDict, oTokenizer, oLocution) { this.oDict = oDict; - this._zElidedPrefix = new RegExp ("^([dljmtsncç]|quoiqu|lorsqu|jusqu|puisqu|qu)['’](.+)", "i"); - this._zCompoundWord = new RegExp ("([a-zA-Zà-ö0-9À-Öø-ÿØ-ßĀ-ʯ]+)-((?:les?|la)-(?:moi|toi|lui|[nv]ous|leur)|t-(?:il|elle|on)|y|en|[mts][’'](?:y|en)|les?|l[aà]|[mt]oi|leur|lui|je|tu|ils?|elles?|on|[nv]ous)$", "i"); - this._zTag = new RegExp ("[:;/][a-zA-Zà-ö0-9À-Öø-ÿØ-ßĀ-ʯ*][^:;/]*", "g"); + this.oTokenizer = oTokenizer; + this.oLocution = JSON.parse(oLocution); + + this._zElidedPrefix = new RegExp("^([dljmtsncç]|quoiqu|lorsqu|jusqu|puisqu|qu)['’](.+)", "i"); + this._zCompoundWord = new RegExp("([a-zA-Zà-ö0-9À-Öø-ÿØ-ßĀ-ʯ]+)-((?:les?|la)-(?:moi|toi|lui|[nv]ous|leur)|t-(?:il|elle|on)|y|en|[mts][’'](?:y|en)|les?|l[aà]|[mt]oi|leur|lui|je|tu|ils?|elles?|on|[nv]ous)$", "i"); + this._zTag = new RegExp("[:;/][a-zA-Zà-ö0-9À-Öø-ÿØ-ßĀ-ʯ*][^:;/]*", "g"); + } - getInfoForToken (oToken) { + getInfoForToken(oToken) { // Token: .sType, .sValue, .nStart, .nEnd // return a list [type, token_string, values] let m = null; try { switch (oToken.sType) { case 'SEPARATOR': - return { sType: oToken.sType, sValue: oToken.sValue, aLabel: [_dSeparator.gl_get(oToken.sValue, "caractère indéterminé")] }; + return { + sType: oToken.sType, + sValue: oToken.sValue, + aLabel: [_dSeparator.gl_get(oToken.sValue, "caractère indéterminé")] + }; break; case 'NUM': - return { sType: oToken.sType, sValue: oToken.sValue, aLabel: ["nombre"] }; + return { + sType: oToken.sType, + sValue: oToken.sValue, + aLabel: ["nombre"] + }; break; case 'LINK': - return { sType: oToken.sType, sValue: oToken.sValue.slice(0,40)+"…", aLabel: ["hyperlien"] }; + return { + sType: oToken.sType, + sValue: oToken.sValue.slice(0, 40) + "…", + aLabel: ["hyperlien"] + }; break; case 'ELPFX': let sTemp = oToken.sValue.replace("’", "").replace("'", "").replace("`", "").toLowerCase(); - return { sType: oToken.sType, sValue: oToken.sValue, aLabel: [_dPFX.gl_get(sTemp, "préfixe élidé inconnu")] }; + return { + sType: oToken.sType, + sValue: oToken.sValue, + aLabel: [_dPFX.gl_get(sTemp, "préfixe élidé inconnu")] + }; break; case 'FOLDER': - return { sType: oToken.sType, sValue: oToken.sValue.slice(0,40)+"…", aLabel: ["dossier"] }; + return { + sType: oToken.sType, + sValue: oToken.sValue.slice(0, 40) + "…", + aLabel: ["dossier"] + }; break; - case 'WORD': + case 'WORD': if (oToken.sValue.gl_count("-") > 4) { - return { sType: "COMPLEX", sValue: oToken.sValue, aLabel: ["élément complexe indéterminé"] }; - } - else if (this.oDict.isValidToken(oToken.sValue)) { + return { + sType: "COMPLEX", + sValue: oToken.sValue, + aLabel: ["élément complexe indéterminé"] + }; + } else if (this.oDict.isValidToken(oToken.sValue)) { let lMorph = this.oDict.getMorph(oToken.sValue); let aElem = []; - for (let s of lMorph){ - if (s.includes(":")) aElem.push( this._formatTags(s) ); + for (let s of lMorph) { + if (s.includes(":")) aElem.push(this._formatTags(s)); } - return { sType: oToken.sType, sValue: oToken.sValue, aLabel: aElem}; - } - else if (m = this._zCompoundWord.exec(oToken.sValue)) { + return { + sType: oToken.sType, + sValue: oToken.sValue, + aLabel: aElem + }; + } else if (m = this._zCompoundWord.exec(oToken.sValue)) { // mots composés let lMorph = this.oDict.getMorph(m[1]); let aElem = []; - for (let s of lMorph){ - if (s.includes(":")) aElem.push( this._formatTags(s) ); + for (let s of lMorph) { + if (s.includes(":")) aElem.push(this._formatTags(s)); } aElem.push("-" + m[2] + ": " + this._formatSuffix(m[2].toLowerCase())); - return { sType: oToken.sType, sValue: oToken.sValue, aLabel: aElem }; - } - else { - return { sType: "UNKNOWN", sValue: oToken.sValue, aLabel: ["inconnu du dictionnaire"] }; + return { + sType: oToken.sType, + sValue: oToken.sValue, + aLabel: aElem + }; + } else { + return { + sType: "UNKNOWN", + sValue: oToken.sValue, + aLabel: ["inconnu du dictionnaire"] + }; } break; } - } - catch (e) { + } catch (e) { helpers.logerror(e); } return null; } - _formatTags (sTags) { + _formatTags(sTags) { let sRes = ""; sTags = sTags.replace(/V([0-3][ea]?)[itpqnmr_eaxz]+/, "V$1"); let m; while ((m = this._zTag.exec(sTags)) !== null) { sRes += _dTAGS.get(m[0]); @@ -278,11 +315,11 @@ return sRes; } return sRes.gl_trimRight(","); } - _formatSuffix (s) { + _formatSuffix(s) { if (s.startsWith("t-")) { return "“t” euphonique +" + _dAD.get(s.slice(2)); } if (!s.includes("-")) { return _dAD.get(s.replace("’", "'")); @@ -289,13 +326,120 @@ } if (s.endsWith("ous")) { s += '2'; } let nPos = s.indexOf("-"); - return _dAD.get(s.slice(0, nPos)) + " +" + _dAD.get(s.slice(nPos+1)); + return _dAD.get(s.slice(0, nPos)) + " +" + _dAD.get(s.slice(nPos + 1)); + } + + getListOfTokens(sText, bInfo = true) { + let aElem = []; + sText = sText.replace("'", "’").trim(); + if (sText !== "") { + let aRes = null; + for (let oToken of this.oTokenizer.genTokens(sText)) { + if (bInfo) { + aRes = this.getInfoForToken(oToken); + if (aRes) { + aElem.push(aRes); + } + } else if (oToken.sType !== "SPACE") { + aElem.push(oToken); + } + } + } + return aElem; + } + + elpfxToword(sELPFX){ + return sELPFX.replace('’', 'e'); + } + + getListOfTokensReduc(sText, bInfo = true) { + let lstToken = this.getListOfTokens(sText, false); + //console.log(lstToken); + //console.log(this.oLocution); + + let id = 0; + let aElem = []; + let aRes = null; + let isType = {'WORD':1,'ELPFX':1}; + do { + let oToken = lstToken[id] + let aLocution = this.oLocution[this.elpfxToword(oToken.sValue)]; + //console.log('Start cherche', oToken.sValue, aLocution); + let stop = false; + let start = id + 1; + let lastTokenWord = ''; + let ok = false; + let oLst = []; + oLst.push(oToken); + while (!stop && typeof aLocution !== "undefined") { + if (start > lstToken.length){ + break; + } + let typeToken = ''; + let nextToken = lstToken[start]; + //console.log(start, nextToken, aLocution); + + if ( typeof nextToken !== "undefined" ) { + aLocution = aLocution[this.elpfxToword(nextToken.sValue)]; + typeToken = nextToken.sType; + } else { + aLocution = "undefined"; + } + + if ( typeToken in isType && typeof aLocution !== "undefined") { + lastTokenWord = Object.keys(aLocution)[0]; + start++; + oLst.push(nextToken); + //console.log( nextToken.sValue ); + } else if ( !(typeToken in isType) || typeof aLocution == "undefined") { + stop = true; + if ( lastTokenWord.substring(0, 1) == ':' ) { + ok = true; + } + } + }; + + if ( ok ){ + let word = ''; + for (let oToken of oLst) { + word += oToken.sValue+' '; + //console.log('***',word); + } + id = id + oLst.length-1; + let tmpToken = { + 'nEnd':oLst[oLst.length-1].nEnd, + 'nStart':oLst[0].nStart, + 'sType':"WORD", + 'sValue':word.replace('’ ','’').trim() + }; + if (bInfo) { + aElem.push({ + sType: tmpToken.sType, + sValue: tmpToken.sValue, + aLabel: [this._formatTags(lastTokenWord).replace(/(\(él.\))/g,'').trim()] + }); + } else { + aElem.push(tmpToken); + } + } else { + if (bInfo) { + aRes = this.getInfoForToken(oToken); + if (aRes) { + aElem.push(aRes); + } + } else { + aElem.push(oToken); + } + } + id++; + } while (id < lstToken.length); + return aElem; } } -if (typeof(exports) !== 'undefined') { +if (typeof (exports) !== 'undefined') { exports.Lexicographe = Lexicographe; } Index: gc_lang/fr/modules-js/locutions_data.json ================================================================== --- gc_lang/fr/modules-js/locutions_data.json +++ gc_lang/fr/modules-js/locutions_data.json cannot compute difference between binary files Index: gc_lang/fr/modules-js/phonet_data.json ================================================================== --- gc_lang/fr/modules-js/phonet_data.json +++ gc_lang/fr/modules-js/phonet_data.json cannot compute difference between binary files Index: gc_lang/fr/modules/locutions_data.py ================================================================== --- gc_lang/fr/modules/locutions_data.py +++ gc_lang/fr/modules/locutions_data.py cannot compute difference between binary files Index: gc_lang/fr/modules/phonet_data.py ================================================================== --- gc_lang/fr/modules/phonet_data.py +++ gc_lang/fr/modules/phonet_data.py cannot compute difference between binary files Index: gc_lang/fr/webext/gce_worker.js ================================================================== --- gc_lang/fr/webext/gce_worker.js +++ gc_lang/fr/webext/gce_worker.js @@ -138,10 +138,11 @@ let oDict = null; let oTokenizer = null; let oLxg = null; let oTest = null; +let oLocution = null; /* Technical note: This worker don’t work as a PromiseWorker (which returns a promise), so when we send request @@ -160,15 +161,18 @@ mfsp.init(helpers.loadFile(sExtensionPath + "/grammalecte/fr/mfsp_data.json")); //console.log("[Worker] Modules have been initialized…"); gc_engine.load(sContext, sExtensionPath+"grammalecte/_dictionaries"); oDict = gc_engine.getDictionary(); oTest = new TestGrammarChecking(gc_engine, sExtensionPath+"/grammalecte/fr/tests_data.json"); - oLxg = new Lexicographe(oDict); + oTokenizer = new Tokenizer("fr"); + + oLocution = helpers.loadFile(sExtensionPath + "/grammalecte/fr/locutions_data.json"); + + oLxg = new Lexicographe(oDict, oTokenizer, oLocution); if (dOptions !== null) { gc_engine.setOptions(dOptions); } - oTokenizer = new Tokenizer("fr"); //tests(); bInitDone = true; } else { console.log("[Worker] Already initialized…") } @@ -296,23 +300,15 @@ function getListOfTokens (sText, dInfo={}) { try { for (let sParagraph of text.getParagraph(sText)) { if (sParagraph.trim() !== "") { - let aElem = []; - let aRes = null; - for (let oToken of oTokenizer.genTokens(sParagraph)) { - aRes = oLxg.getInfoForToken(oToken); - if (aRes) { - aElem.push(aRes); - } - } - postMessage(createResponse("getListOfTokens", aElem, dInfo, false)); + postMessage(createResponse("getListOfTokens", oLxg.getListOfTokensReduc(sParagraph, true), dInfo, false)); } } postMessage(createResponse("getListOfTokens", null, dInfo, true)); } catch (e) { helpers.logerror(e); postMessage(createResponse("getListOfTokens", createErrorResult(e, "no tokens"), dInfo, true, true)); } }