Index: gc_core/js/text.js
==================================================================
--- gc_core/js/text.js
+++ gc_core/js/text.js
@@ -9,16 +9,16 @@
     var helpers = require("resource://grammalecte/helpers.js");
 }

 var text = {
-    getParagraph: function* (sText) {
+    getParagraph: function* (sText, sSepParagraph = "\n") {
         // generator: returns paragraphs of text
         let iStart = 0;
         let iEnd = 0;
         sText = sText.replace("\r\n", "\n").replace("\r", "\n");
-        while ((iEnd = sText.indexOf("\n", iStart)) !== -1) {
+        while ((iEnd = sText.indexOf(sSepParagraph, iStart)) !== -1) {
             yield sText.slice(iStart, iEnd);
-            iStart = iEnd + 1;
+            iStart = iEnd + sSepParagraph.length;
         }
         yield sText.slice(iStart);
     },

@@ -43,11 +43,11 @@
     },

     getReadableError: function (oErr) {
         // Returns an error oErr as a readable error
         try {
-            let sResult = "\n* " + oErr['nStart'] + ":" + oErr['nEnd'] 
+            let sResult = "\n* " + oErr['nStart'] + ":" + oErr['nEnd']
                           + " # " + oErr['sLineId'] + " # " + oErr['sRuleId'] + ":\n";
             sResult += "  " + oErr["sMessage"];
             if (oErr["aSuggestions"].length > 0) {
                 sResult += "\n  > Suggestions : " + oErr["aSuggestions"].join(" | ");
             }
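
Note: getParagraph now accepts the paragraph separator as a parameter, and the start offset advances by the separator's length rather than by 1, so multi-character separators work too. A quick usage sketch (assuming the module is exposed as `text`, as in gce_worker.js):

    // splitting on blank lines instead of single newlines
    let sSample = "First paragraph.\n\nSecond paragraph.\n\nThird.";
    for (let sParagraph of text.getParagraph(sSample, "\n\n")) {
        console.log(sParagraph);
    }
    // → "First paragraph.", "Second paragraph.", "Third."
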
Index: gc_lang/fr/build_data.py
==================================================================
--- gc_lang/fr/build_data.py
+++ gc_lang/fr/build_data.py
@@ -5,15 +5,17 @@
 # by Olivier R.
 # License: MPL 2

 import json
 import os
+import itertools

 import grammalecte.ibdawg as ibdawg
 from grammalecte.echo import echo
 from grammalecte.str_transform import defineSuffixCode
 import grammalecte.fr.conj as conj
+import grammalecte.tokenizer as tkz


 class cd:
     """Context manager for changing the current working directory"""
     def __init__ (self, newPath):
@@ -24,10 +26,21 @@
         os.chdir(self.newPath)

     def __exit__ (self, etype, value, traceback):
         os.chdir(self.savedPath)


+def readFile (spf):
+    if os.path.isfile(spf):
+        with open(spf, "r", encoding="utf-8") as hSrc:
+            for sLine in hSrc:
+                sLine = sLine.strip()
+                if sLine and not sLine.startswith("#"):
+                    yield sLine
+    else:
+        raise OSError("# Error. File not found or not loadable: " + spf)
+

 def makeDictionaries (sp, sVersion):
     with cd(sp+"/dictionnaire"):
         os.system("genfrdic.py -s -gl -v "+sVersion)

@@ -47,75 +60,73 @@
              "1sg": ":1s", "2sg": ":2s", "3sg": ":3s", "1pl": ":1p", "2pl": ":2p", "3pl": ":3p", "1isg": ":1ś",
              "mas sg": ":Q1", "mas pl": ":Q2", "mas inv": ":Q1", "fem sg": ":Q3", "fem pl": ":Q4", "epi inv": ":Q1"
            }
     # read lexicon
-    with open(sp+"/data/dictConj.txt", 'r', encoding='utf-8') as hSrc:
-        nStop = 0
-        for n, line in enumerate(hSrc.readlines()):
-            line = line.strip()
-            nTab = line.count("\t")
-            if nTab == 1:
-                # new entry
-                sLemma, sVtyp = line.split("\t")
-                dConj = {   ":PQ": { ":P": "", ":Q1": "", ":Q2": "", ":Q3": "", ":Q4": ""},
-                            ":Ip": { ":1s": "", ":2s": "", ":3s": "", ":1p": "", ":2p": "", ":3p": "", ":1ś": "" },
-                            ":Iq": { ":1s": "", ":2s": "", ":3s": "", ":1p": "", ":2p": "", ":3p": "" },
-                            ":Is": { ":1s": "", ":2s": "", ":3s": "", ":1p": "", ":2p": "", ":3p": "" },
-                            ":If": { ":1s": "", ":2s": "", ":3s": "", ":1p": "", ":2p": "", ":3p": "" },
-                            ":K": { ":1s": "", ":2s": "", ":3s": "", ":1p": "", ":2p": "", ":3p": "" },
-                            ":Sp": { ":1s": "", ":2s": "", ":3s": "", ":1p": "", ":2p": "", ":3p": "", ":1ś": "" },
-                            ":Sq": { ":1s": "", ":2s": "", ":3s": "", ":1p": "", ":2p": "", ":3p": "", ":1ś": "" },
-                            ":E": { ":2s": "", ":1p": "", ":2p": "" }
-                        }
-                if sVtyp not in lVtyp:
-                    dVtyp[sVtyp] = nVtyp
-                    lVtyp.append(sVtyp)
-                    nVtyp += 1
-            elif nTab == 2:
-                # flexion
-                _, sTag, sFlex = line.split("\t")
-                if sTag.count(" ") == 0:
-                    if sTag == "ppre":
-                        dConj[":PQ"][":P"] = defineSuffixCode(sLemma, sFlex)
-                    else:
-                        try:
-                            mode, g = sTag.split(maxsplit=1)
-                            mode = dTrad[mode]
-                            g = dTrad[g]
-                            if dConj[mode][g] == "":
-                                dConj[mode][g] = defineSuffixCode(sLemma, sFlex)
-                            else:
-                                # comment gérer les autres graphies ?
-                                pass
-                        except:
-                            print(sLemma.encode("utf-8").decode("ascii"), " - ", sTag, " - non géré: ", mode, " / ", g)
-            elif line == "$":
-                # we store the dictionary of rules for this lemma
-                if dConj[":Ip"][":1ś"] == "2è":
-                    dConj[":Ip"][":1ś"] = "2é"
-                elif sLemma == "pouvoir":
-                    dConj[":Ip"][":1ś"] = "6uis"
-                lConjTags = []
-                for key in [":PQ", ":Ip", ":Iq", ":Is", ":If", ":K", ":Sp", ":Sq", ":E"]:
-                    bFound = False
-                    for i, d in enumerate(dPatternList[key]):
-                        if dConj[key] == d:
-                            bFound = True
-                            lConjTags.append(i)
-                            break
-                    if not bFound:
-                        lConjTags.append(len(dPatternList[key]))
-                        dPatternList[key].append(dConj[key])
-                tConjTags = tuple(lConjTags)
-                if tConjTags not in lTags:
-                    dTags[tConjTags] = nTags
-                    lTags.append(tConjTags)
-                    nTags += 1
-                dVerb[sLemma] = (dVtyp[sVtyp], dTags[tConjTags])
-            else:
-                print("# Error - unknown line #", n)
+    nStop = 0
+    for n, sLine in enumerate(readFile(sp+"/data/dictConj.txt")):
+        nTab = sLine.count("\t")
+        if nTab == 1:
+            # new entry
+            sLemma, sVtyp = sLine.split("\t")
+            dConj = {   ":PQ": { ":P": "", ":Q1": "", ":Q2": "", ":Q3": "", ":Q4": ""},
+                        ":Ip": { ":1s": "", ":2s": "", ":3s": "", ":1p": "", ":2p": "", ":3p": "", ":1ś": "" },
+                        ":Iq": { ":1s": "", ":2s": "", ":3s": "", ":1p": "", ":2p": "", ":3p": "" },
+                        ":Is": { ":1s": "", ":2s": "", ":3s": "", ":1p": "", ":2p": "", ":3p": "" },
+                        ":If": { ":1s": "", ":2s": "", ":3s": "", ":1p": "", ":2p": "", ":3p": "" },
+                        ":K": { ":1s": "", ":2s": "", ":3s": "", ":1p": "", ":2p": "", ":3p": "" },
+                        ":Sp": { ":1s": "", ":2s": "", ":3s": "", ":1p": "", ":2p": "", ":3p": "", ":1ś": "" },
+                        ":Sq": { ":1s": "", ":2s": "", ":3s": "", ":1p": "", ":2p": "", ":3p": "", ":1ś": "" },
+                        ":E": { ":2s": "", ":1p": "", ":2p": "" }
+                    }
+            if sVtyp not in lVtyp:
+                dVtyp[sVtyp] = nVtyp
+                lVtyp.append(sVtyp)
+                nVtyp += 1
+        elif nTab == 2:
+            # flexion
+            _, sTag, sFlex = sLine.split("\t")
+            if sTag.count(" ") == 0:
+                if sTag == "ppre":
+                    dConj[":PQ"][":P"] = defineSuffixCode(sLemma, sFlex)
+                else:
+                    try:
+                        mode, g = sTag.split(maxsplit=1)
+                        mode = dTrad[mode]
+                        g = dTrad[g]
+                        if dConj[mode][g] == "":
+                            dConj[mode][g] = defineSuffixCode(sLemma, sFlex)
+                        else:
+                            # how to handle alternative spellings?
+                            pass
+                    except:
+                        print(sLemma.encode("utf-8").decode("ascii"), " - ", sTag, " - non géré: ", mode, " / ", g)
+        elif sLine == "$":
+            # we store the dictionary of rules for this lemma
+            if dConj[":Ip"][":1ś"] == "2è":
+                dConj[":Ip"][":1ś"] = "2é"
+            elif sLemma == "pouvoir":
+                dConj[":Ip"][":1ś"] = "6uis"
+            lConjTags = []
+            for key in [":PQ", ":Ip", ":Iq", ":Is", ":If", ":K", ":Sp", ":Sq", ":E"]:
+                bFound = False
+                for i, d in enumerate(dPatternList[key]):
+                    if dConj[key] == d:
+                        bFound = True
+                        lConjTags.append(i)
+                        break
+                if not bFound:
+                    lConjTags.append(len(dPatternList[key]))
+                    dPatternList[key].append(dConj[key])
+            tConjTags = tuple(lConjTags)
+            if tConjTags not in lTags:
+                dTags[tConjTags] = nTags
+                lTags.append(tConjTags)
+                nTags += 1
+            dVerb[sLemma] = (dVtyp[sVtyp], dTags[tConjTags])
+        else:
+            print("# Error - unknown line #", n)

     # convert tuples to bytes string
     # si ça merde, toute la partie conversion peut être supprimée
     # lBytesTags = []
     # for t in lTags:
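
Note: the loop above interns each verb's conjugation patterns: identical per-mood suffix tables get a single index in dPatternList, the tuple of indices is itself interned in dTags, and dVerb then stores only two small integers per lemma. A minimal sketch of the same interning idea in JavaScript (illustrative names, not part of the build script; assumes consistent key order so JSON.stringify is a stable key):

    // makeInterner(): returns a function mapping structurally identical
    // objects to one stable index in a shared pool
    function makeInterner () {
        let lPool = [];          // index → canonical JSON string
        let dIndex = new Map();  // JSON string → index
        return function intern (oObj) {
            let sKey = JSON.stringify(oObj);
            if (!dIndex.has(sKey)) {
                dIndex.set(sKey, lPool.length);
                lPool.push(sKey);
            }
            return dIndex.get(sKey);
        };
    }

    let intern = makeInterner();
    intern({":1s": "2e", ":2s": "2es"});   // → 0
    intern({":1s": "2e", ":2s": "2es"});   // → 0 (same pattern, same index)
    intern({":1s": "2is", ":2s": "2is"});  // → 1
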
@@ -169,72 +180,69 @@
     dTag = {}
     lTagMasForm = []
     lTagMiscPlur = []
     dMiscPlur = {}
     dMasForm = {}
-    # read lexicon
-    with open(sp+"/data/dictDecl.txt", 'r', encoding='utf-8') as hSrc:
-        lTag = []
-        lTagMasPl = []
-        for n, line in enumerate(hSrc.readlines()):
-            line = line.strip()
-            nTab = line.count("\t")
-            if nTab == 1:
-                # new entry
-                lTag.clear()
-                lTagMasPl.clear()
-                sLemma, sFlags = line.split("\t")
-                if sFlags.startswith("S"):
-                    cType = "s"
-                elif sFlags.startswith("X"):
-                    cType = "p"
-                elif sFlags.startswith("A"):
-                    cType = "p"
-                elif sFlags.startswith("I"):
-                    cType = "p"
-                elif sFlags.startswith("F"):
-                    cType = "m"
-                elif sFlags.startswith("W"):
-                    cType = "m"
-                else:
-                    cType = "?"
-                    print(" > inconnu : " + sFlags)
-            elif nTab == 2:
-                if cType == "s":
-                    continue
-                _, sFlexTags, sFlex = line.split("\t")
-                if cType == "p":
-                    if sFlexTags.endswith("pl"):
-                        lTag.append(defineSuffixCode(sLemma, sFlex))
-                elif cType == "m":
-                    if sFlexTags.endswith("mas sg") or sFlexTags.endswith("mas inv"):
-                        lTag.append(defineSuffixCode(sLemma, sFlex))
-                    if sFlexTags.endswith("mas pl"):
-                        lTagMasPl.append(defineSuffixCode(sLemma, sFlex))
-                else:
-                    print("erreur: " + cType)
-            elif line == "$":
-                if cType == "s":
-                    aPlurS.add(sLemma)
-                elif cType == "p":
-                    sTag = "|".join(lTag)
-                    if sTag not in dTag:
-                        dTag[sTag] = len(lTagMiscPlur)
-                        lTagMiscPlur.append(sTag)
-                    dMiscPlur[sLemma] = dTag[sTag]
-                elif cType == "m":
-                    sTag = "|".join(lTag)
-                    if lTagMasPl:
-                        sTag += "/" + "|".join(lTagMasPl)
-                    if sTag not in dTag:
-                        dTag[sTag] = len(lTagMasForm)
-                        lTagMasForm.append(sTag)
-                    dMasForm[sLemma] = dTag[sTag]
-                else:
-                    print("unknown tag: " + ctype)
-            else:
-                print("# Error - unknown line #", n)
+    lTag = []
+    lTagMasPl = []
+    for n, sLine in enumerate(readFile(sp+"/data/dictDecl.txt")):
+        nTab = sLine.count("\t")
+        if nTab == 1:
+            # new entry
+            lTag.clear()
+            lTagMasPl.clear()
+            sLemma, sFlags = sLine.split("\t")
+            if sFlags.startswith("S"):
+                cType = "s"
+            elif sFlags.startswith("X"):
+                cType = "p"
+            elif sFlags.startswith("A"):
+                cType = "p"
+            elif sFlags.startswith("I"):
+                cType = "p"
+            elif sFlags.startswith("F"):
+                cType = "m"
+            elif sFlags.startswith("W"):
+                cType = "m"
+            else:
+                cType = "?"
+                print(" > inconnu : " + sFlags)
+        elif nTab == 2:
+            if cType == "s":
+                continue
+            _, sFlexTags, sFlex = sLine.split("\t")
+            if cType == "p":
+                if sFlexTags.endswith("pl"):
+                    lTag.append(defineSuffixCode(sLemma, sFlex))
+            elif cType == "m":
+                if sFlexTags.endswith("mas sg") or sFlexTags.endswith("mas inv"):
+                    lTag.append(defineSuffixCode(sLemma, sFlex))
+                if sFlexTags.endswith("mas pl"):
+                    lTagMasPl.append(defineSuffixCode(sLemma, sFlex))
+            else:
+                print("erreur: " + cType)
+        elif sLine == "$":
+            if cType == "s":
+                aPlurS.add(sLemma)
+            elif cType == "p":
+                sTag = "|".join(lTag)
+                if sTag not in dTag:
+                    dTag[sTag] = len(lTagMiscPlur)
+                    lTagMiscPlur.append(sTag)
+                dMiscPlur[sLemma] = dTag[sTag]
+            elif cType == "m":
+                sTag = "|".join(lTag)
+                if lTagMasPl:
+                    sTag += "/" + "|".join(lTagMasPl)
+                if sTag not in dTag:
+                    dTag[sTag] = len(lTagMasForm)
+                    lTagMasForm.append(sTag)
+                dMasForm[sLemma] = dTag[sTag]
+            else:
+                print("unknown tag: " + cType)
+        else:
+            print("# Error - unknown line #", n)

     ## write file for Python
     sCode = "# generated data (do not edit)\n\n" + \
             "# list of affix codes\n" + \
             "lTagMiscPlur = " + str(lTagMiscPlur) + "\n" + \

@@ -256,42 +264,39 @@

 def makePhonetTable (sp, bJS=False):
     print("> Correspondances phonétiques ", end="")
     print("(Python et JavaScript)" if bJS else "(Python seulement)")
-    
+
     try:
         oDict = ibdawg.IBDAWG("French.bdic")
     except:
         traceback.print_exc()
         return

-    with open(sp+"/data/phonet_simil.txt", 'r', encoding='utf-8') as hSrc:
-        # set of homophonic words
-        lSet = []
-        for sLine in hSrc.readlines():
-            if not sLine.startswith("#") and sLine.strip():
-                lWord = sLine.strip().split()
-                aMore = set()
-                for sWord in lWord:
-                    if sWord.endswith("er") and conj.isVerb(sWord):
-                        aMore = aMore.union(conj.getConjSimilInfiV1(sWord))
-                lWord.extend(list(aMore))
-                lSet.append(sorted(set(lWord)))
-                #print(lWord)
-        # dictionary of words
-        dWord = {}
-        for i, aSet in enumerate(lSet):
-            for sWord in aSet:
-                if oDict.lookup(sWord):
-                    dWord[sWord] = i  # warning, what if word in several sets?
-                else:
-                    echo("Mot inconnu : " + sWord)
-        # dictionary of morphologies
-        dMorph = {}
-        for sWord in dWord:
-            dMorph[sWord] = oDict.getMorph(sWord)
+    # set of homophonic words
+    lSet = []
+    for sLine in readFile(sp+"/data/phonet_simil.txt"):
+        lWord = sLine.split()
+        aMore = set()
+        for sWord in lWord:
+            if sWord.endswith("er") and conj.isVerb(sWord):
+                aMore = aMore.union(conj.getConjSimilInfiV1(sWord))
+        lWord.extend(list(aMore))
+        lSet.append(sorted(set(lWord)))
+    # dictionary of words
+    dWord = {}
+    for i, aSet in enumerate(lSet):
+        for sWord in aSet:
+            if oDict.lookup(sWord):
+                dWord[sWord] = i  # warning: what if a word belongs to several sets?
+            else:
+                echo("Mot inconnu : " + sWord)
+    # dictionary of morphologies
+    dMorph = {}
+    for sWord in dWord:
+        dMorph[sWord] = oDict.getMorph(sWord)

     # write file for Python
     sCode = "# generated data (do not edit)\n\n" + \
             "dWord = " + str(dWord) + "\n\n" + \
             "lSet = " + str(lSet) + "\n\n" + \

@@ -309,26 +314,27 @@

 def makeLocutions (sp, bJS=False):
     "compile list of locutions in JSON"
     print("> Locutions ", end="")
     print("(Python et JavaScript)" if bJS else "(Python seulement)")
-    with open(sp+"/data/locutions.txt", 'r', encoding='utf-8') as hSrc:
-        dLocutions = {}
-        for sLine in hSrc.readlines():
-            if not sLine.startswith("#") and sLine.strip():
-                lElem = sLine.strip().split()
-                dCur = dLocutions
-                for sWord in lElem:
-                    if sWord not in dCur:
-                        dCur[sWord] = {}
-                    dCur = dCur[sWord]
+    dLocGraph = {}
+    oTokenizer = tkz.Tokenizer("fr")
+    for sLine in itertools.chain(readFile(sp+"/data/locutions.txt"), readFile(sp+"/data/locutions_vrac.txt")):
+        dCur = dLocGraph
+        sLoc, sTag = sLine.split("\t")
+        for oToken in oTokenizer.genTokens(sLoc.strip()):
+            sWord = oToken["sValue"]
+            if sWord not in dCur:
+                dCur[sWord] = {}
+            dCur = dCur[sWord]
+        dCur[":"] = sTag

     sCode = "# generated data (do not edit)\n\n" + \
-            "dLocutions = " + str(dLocutions) + "\n"
+            "dLocutions = " + str(dLocGraph) + "\n"
     open(sp+"/modules/locutions_data.py", "w", encoding="utf-8", newline="\n").write(sCode)
     if bJS:
-        open(sp+"/modules-js/locutions_data.json", "w", encoding="utf-8", newline="\n").write(json.dumps(dLocutions, ensure_ascii=False))
+        open(sp+"/modules-js/locutions_data.json", "w", encoding="utf-8", newline="\n").write(json.dumps(dLocGraph, ensure_ascii=False))


 def before (spLaunch, dVars, bJS=False):
     print("========== Build Hunspell dictionaries ==========")
     makeDictionaries(spLaunch, dVars['oxt_version'])
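
Note: makeLocutions tokenizes each locution and stores it as a path through a nested dictionary (a trie keyed by tokens); the reserved key ":" marks where a valid locution ends and holds its tag, so a locution that is a prefix of another causes no ambiguity. A minimal JavaScript equivalent of this construction (illustrative; the Python above is what actually runs):

    function addLocution (dGraph, lWords, sTag) {
        // walk/extend one branch of the trie, one token per level
        let dCur = dGraph;
        for (let sWord of lWords) {
            if (!dCur.hasOwnProperty(sWord)) {
                dCur[sWord] = {};
            }
            dCur = dCur[sWord];
        }
        dCur[":"] = sTag;  // terminal marker carrying the tag
    }

    let dLoc = {};
    addLocution(dLoc, ["à", "cœur", "joie"], ":LW");
    addLocution(dLoc, ["à", "cœur", "ouvert"], ":LW");
    // → {"à": {"cœur": {"joie": {":": ":LW"}, "ouvert": {":": ":LW"}}}}
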
Index: gc_lang/fr/data/locutions.txt
==================================================================
--- gc_lang/fr/data/locutions.txt
+++ gc_lang/fr/data/locutions.txt
@@ -1,8 +1,14 @@
-à califourchon
-à cœur joie
-à cœur ouvert
-à corps perdu
-à perte de vue
-à visage découvert
-par ailleurs
-par acquit de conscience
+à califourchon	:LW
+à contrecœur	:LW
+à cœur joie	:LW
+à cœur ouvert	:LW
+à corps perdu	:LW
+à bâtons rompus	:LW
+à perte de vue	:LW
+à visage découvert	:LW
+à vue d’œil	:LW
+à l’aveuglette	:LW
+ad hominem	:LW
+en tout et pour tout	:LW
+par ailleurs	:LW
+par acquit de conscience	:LW

ADDED gc_lang/fr/data/locutions_vrac.txt
Index: gc_lang/fr/data/locutions_vrac.txt
==================================================================
--- /dev/null
+++ gc_lang/fr/data/locutions_vrac.txt
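
Note: each line of these data files is a locution, a tab, and its tag (:LW = locution adverbiale); locutions_vrac.txt starts empty and is presumably a staging file for unsorted entries. Running makeLocutions over the first line yields the nested shape that ships in locutions_data.json:

    // "à califourchon<TAB>:LW" compiles to:
    {"à": {"califourchon": {":": ":LW"}}}
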
Index: gc_lang/fr/modules-js/lexicographe.js
==================================================================
--- gc_lang/fr/modules-js/lexicographe.js
+++ gc_lang/fr/modules-js/lexicographe.js
@@ -7,15 +7,15 @@

 ${string}
 ${map}

-if (typeof(require) !== 'undefined') {
+if (typeof (require) !== 'undefined') {
     var helpers = require("resource://grammalecte/helpers.js");
 }

-const _dTAGS = new Map ([
+const _dTAGS = new Map([
     [':G', "[mot grammatical]"],
     [':N', " nom,"],
     [':A', " adjectif,"],
     [':M1', " prénom,"],
     [':M2', " patronyme,"],
@@ -46,11 +46,11 @@
     [':C', " conjonction,"],
     [':Ĉ', " conjonction (él.),"],
     [':Cc', " conjonction de coordination,"],
     [':Cs', " conjonction de subordination,"],
     [':Ĉs', " conjonction de subordination (él.),"],
-    
+
     [':Ŵ', " locution adverbiale (él.),"],
     [':Ñ', " locution nominale (él.),"],
     [':Â', " locution adjectivale (él.),"],
     [':Ṽ', " locution verbale (él.),"],
     [':Ŕ', " locution prépositive (él.),"],
@@ -66,11 +66,11 @@
     [':V0a', " verbe,"],
     [':O1', " 1ʳᵉ pers.,"],
     [':O2', " 2ᵉ pers.,"],
     [':O3', " 3ᵉ pers.,"],
-    
+
     [':e', " épicène"],
     [':m', " masculin"],
     [':f', " féminin"],
     [':s', " singulier"],
     [':p', " pluriel"],
@@ -107,11 +107,38 @@
     ['/R', " {réforme}"],
     ['/A', ""],
     ['/X', ""]
 ]);

-const _dPFX = new Map ([
+const _dLocTAGS = new Map([
+    [':LN', "locution nominale"],
+    [':LA', "locution adjectivale"],
+    [':LV', "locution verbale"],
+    [':LW', "locution adverbiale"],
+    [':LR', "locution prépositive"],
+    [':LO', "locution pronominale"],
+    [':LC', "locution conjonctive"],
+    [':LJ', "locution interjective"],
+
+    [':B', " cardinal"],
+    [':e', " épicène"],
+    [':m', " masculin"],
+    [':f', " féminin"],
+    [':s', " singulier"],
+    [':p', " pluriel"],
+    [':i', " invariable"],
+    ['/L', " {latin}"]
+]);
+const _dLocVERB = new Map([
+    ['i', " intransitif"],
+    ['n', " transitif indirect"],
+    ['t', " transitif direct"],
+    ['p', " pronominal"],
+    ['m', " impersonnel"],
+]);
+
+const _dPFX = new Map([
     ['d', "(de), déterminant épicène invariable"],
     ['l', "(le/la), déterminant masculin/féminin singulier"],
     ['j', "(je), pronom personnel sujet, 1ʳᵉ pers., épicène singulier"],
     ['m', "(me), pronom personnel objet, 1ʳᵉ pers., épicène singulier"],
     ['t', "(te), pronom personnel objet, 2ᵉ pers., épicène singulier"],
@@ -123,28 +150,28 @@
     ['lorsqu', "(lorsque), conjonction de subordination"],
     ['quoiqu', "(quoique), conjonction de subordination"],
     ['jusqu', "(jusque), préposition"]
 ]);

-const _dAD = new Map ([
+const _dAD = new Map([
     ['je', " pronom personnel sujet, 1ʳᵉ pers. sing."],
     ['tu', " pronom personnel sujet, 2ᵉ pers. sing."],
     ['il', " pronom personnel sujet, 3ᵉ pers. masc. sing."],
     ['on', " pronom personnel sujet, 3ᵉ pers. sing. ou plur."],
     ['elle', " pronom personnel sujet, 3ᵉ pers. fém. sing."],
     ['nous', " pronom personnel sujet/objet, 1ʳᵉ pers. plur."],
     ['vous', " pronom personnel sujet/objet, 2ᵉ pers. plur."],
     ['ils', " pronom personnel sujet, 3ᵉ pers. masc. plur."],
     ['elles', " pronom personnel sujet, 3ᵉ pers. masc. plur."],
-    
+
     ["là", " particule démonstrative"],
     ["ci", " particule démonstrative"],
-    
+
     ['le', " COD, masc. sing."],
     ['la', " COD, fém. sing."],
     ['les', " COD, plur."],
-    
+
     ['moi', " COI (à moi), sing."],
     ['toi', " COI (à toi), sing."],
     ['lui', " COI (à lui ou à elle), sing."],
     ['nous2', " COI (à nous), plur."],
     ['vous2', " COI (à vous), plur."],
@@ -159,11 +186,11 @@
     ["m'en", " (me) pronom personnel objet + (en) pronom adverbial"],
     ["t'en", " (te) pronom personnel objet + (en) pronom adverbial"],
     ["s'en", " (se) pronom personnel objet + (en) pronom adverbial"]
 ]);

-const _dSeparator = new Map ([
+const _dSeparator = new Map([
     ['.', "point"],
     ['·', "point médian"],
     ['…', "points de suspension"],
     [':', "deux-points"],
     [';', "point-virgule"],
@@ -194,68 +221,104 @@
 ]);


 class Lexicographe {

-    constructor (oDict) {
+    constructor (oDict, oTokenizer, oLocGraph) {
         this.oDict = oDict;
-        this._zElidedPrefix = new RegExp ("^([dljmtsncç]|quoiqu|lorsqu|jusqu|puisqu|qu)['’](.+)", "i");
-        this._zCompoundWord = new RegExp ("([a-zA-Zà-ö0-9À-Öø-ÿØ-ßĀ-ʯ]+)-((?:les?|la)-(?:moi|toi|lui|[nv]ous|leur)|t-(?:il|elle|on)|y|en|[mts][’'](?:y|en)|les?|l[aà]|[mt]oi|leur|lui|je|tu|ils?|elles?|on|[nv]ous)$", "i");
-        this._zTag = new RegExp ("[:;/][a-zA-Zà-ö0-9À-Öø-ÿØ-ßĀ-ʯ*][^:;/]*", "g");
+        this.oTokenizer = oTokenizer;
+        this.oLocGraph = JSON.parse(oLocGraph);
+
+        this._zElidedPrefix = new RegExp("^([dljmtsncç]|quoiqu|lorsqu|jusqu|puisqu|qu)['’](.+)", "i");
+        this._zCompoundWord = new RegExp("([a-zA-Zà-ö0-9À-Öø-ÿØ-ßĀ-ʯ]+)-((?:les?|la)-(?:moi|toi|lui|[nv]ous|leur)|t-(?:il|elle|on)|y|en|[mts][’'](?:y|en)|les?|l[aà]|[mt]oi|leur|lui|je|tu|ils?|elles?|on|[nv]ous)$", "i");
+        this._zTag = new RegExp("[:;/][a-zA-Zà-ö0-9À-Öø-ÿØ-ßĀ-ʯ*Ṽ][^:;/]*", "g");
+        this._zLocTag = new RegExp("(:L[A-Z])([a-z].?)?(.*)");
     }

     getInfoForToken (oToken) {
         // Token: .sType, .sValue, .nStart, .nEnd
-        // return a list [type, token_string, values]
+        // returns an object {sType, sValue, aLabel}
         let m = null;
         try {
             switch (oToken.sType) {
                 case 'SEPARATOR':
-                    return { sType: oToken.sType, sValue: oToken.sValue, aLabel: [_dSeparator.gl_get(oToken.sValue, "caractère indéterminé")] };
+                    return {
+                        sType: oToken.sType,
+                        sValue: oToken.sValue,
+                        aLabel: [_dSeparator.gl_get(oToken.sValue, "caractère indéterminé")]
+                    };
                     break;
                 case 'NUM':
-                    return { sType: oToken.sType, sValue: oToken.sValue, aLabel: ["nombre"] };
+                    return {
+                        sType: oToken.sType,
+                        sValue: oToken.sValue,
+                        aLabel: ["nombre"]
+                    };
                     break;
                 case 'LINK':
-                    return { sType: oToken.sType, sValue: oToken.sValue.slice(0,40)+"…", aLabel: ["hyperlien"] };
+                    return {
+                        sType: oToken.sType,
+                        sValue: oToken.sValue.slice(0, 40) + "…",
+                        aLabel: ["hyperlien"]
+                    };
                    break;
                 case 'ELPFX':
                     let sTemp = oToken.sValue.replace("’", "").replace("'", "").replace("`", "").toLowerCase();
-                    return { sType: oToken.sType, sValue: oToken.sValue, aLabel: [_dPFX.gl_get(sTemp, "préfixe élidé inconnu")] };
+                    return {
+                        sType: oToken.sType,
+                        sValue: oToken.sValue,
+                        aLabel: [_dPFX.gl_get(sTemp, "préfixe élidé inconnu")]
+                    };
                     break;
                 case 'FOLDER':
-                    return { sType: oToken.sType, sValue: oToken.sValue.slice(0,40)+"…", aLabel: ["dossier"] };
+                    return {
+                        sType: oToken.sType,
+                        sValue: oToken.sValue.slice(0, 40) + "…",
+                        aLabel: ["dossier"]
+                    };
                     break;
-                case 'WORD': 
+                case 'WORD':
                     if (oToken.sValue.gl_count("-") > 4) {
-                        return { sType: "COMPLEX", sValue: oToken.sValue, aLabel: ["élément complexe indéterminé"] };
-                    }
-                    else if (this.oDict.isValidToken(oToken.sValue)) {
+                        return {
+                            sType: "COMPLEX",
+                            sValue: oToken.sValue,
+                            aLabel: ["élément complexe indéterminé"]
+                        };
+                    } else if (this.oDict.isValidToken(oToken.sValue)) {
                         let lMorph = this.oDict.getMorph(oToken.sValue);
                         let aElem = [];
-                        for (let s of lMorph){
-                            if (s.includes(":")) aElem.push( this._formatTags(s) );
+                        for (let s of lMorph) {
+                            if (s.includes(":")) aElem.push(this._formatTags(s));
                         }
-                        return { sType: oToken.sType, sValue: oToken.sValue, aLabel: aElem};
-                    }
-                    else if (m = this._zCompoundWord.exec(oToken.sValue)) {
+                        return {
+                            sType: oToken.sType,
+                            sValue: oToken.sValue,
+                            aLabel: aElem
+                        };
+                    } else if (m = this._zCompoundWord.exec(oToken.sValue)) {
                         // mots composés
                         let lMorph = this.oDict.getMorph(m[1]);
                         let aElem = [];
-                        for (let s of lMorph){
-                            if (s.includes(":")) aElem.push( this._formatTags(s) );
+                        for (let s of lMorph) {
+                            if (s.includes(":")) aElem.push(this._formatTags(s));
                         }
                         aElem.push("-" + m[2] + ": " + this._formatSuffix(m[2].toLowerCase()));
-                        return { sType: oToken.sType, sValue: oToken.sValue, aLabel: aElem };
-                    }
-                    else {
-                        return { sType: "UNKNOWN", sValue: oToken.sValue, aLabel: ["inconnu du dictionnaire"] };
+                        return {
+                            sType: oToken.sType,
+                            sValue: oToken.sValue,
+                            aLabel: aElem
+                        };
+                    } else {
+                        return {
+                            sType: "UNKNOWN",
+                            sValue: oToken.sValue,
+                            aLabel: ["inconnu du dictionnaire"]
+                        };
                     }
                     break;
             }
-        }
-        catch (e) {
+        } catch (e) {
             helpers.logerror(e);
         }
         return null;
     }

@@ -277,10 +340,34 @@
             helpers.echo(sRes);
             return sRes;
         }
         return sRes.gl_trimRight(",");
     }
+
+    _formatTagsLoc (sTags) {
+        let sRes = "";
+        let oTagsVerb = this._zLocTag.exec(sTags);
+        sTags = oTagsVerb[1] + oTagsVerb[3];
+        let m;
+        while ((m = this._zTag.exec(sTags)) !== null) {
+            sRes += _dLocTAGS.get(m[0]);
+            if (m[0] == ':LV' && oTagsVerb[2]) {
+                oTagsVerb[2].split(/(?!$)/u).forEach(function(sKey) {
+                    sRes += _dLocVERB.get(sKey);
+                });
+            }
+            if (sRes.length > 100) {
+                break;
+            }
+        }
+        if (!sRes) {
+            sRes = "#Erreur. Étiquette inconnue : [" + sTags + "]";
+            helpers.echo(sRes);
+            return sRes;
+        }
+        return sRes.gl_trimRight(",");
+    }

     _formatSuffix (s) {
         if (s.startsWith("t-")) {
             return "“t” euphonique +" + _dAD.get(s.slice(2));
         }
@@ -289,13 +376,109 @@
         }
         if (s.endsWith("ous")) {
             s += '2';
         }
         let nPos = s.indexOf("-");
-        return _dAD.get(s.slice(0, nPos)) + " +" + _dAD.get(s.slice(nPos+1));
+        return _dAD.get(s.slice(0, nPos)) + " +" + _dAD.get(s.slice(nPos + 1));
+    }
+
+    getListOfTokens (sText, bInfo=true) {
+        let aElem = [];
+        if (sText !== "") {
+            for (let oToken of this.oTokenizer.genTokens(sText)) {
+                if (bInfo) {
+                    let aRes = this.getInfoForToken(oToken);
+                    if (aRes) {
+                        aElem.push(aRes);
+                    }
+                } else if (oToken.sType !== "SPACE") {
+                    aElem.push(oToken);
+                }
+            }
+        }
+        return aElem;
+    }
+
+    generateInfoForTokenList (lToken) {
+        let aElem = [];
+        for (let oToken of lToken) {
+            let aRes = this.getInfoForToken(oToken);
+            if (aRes) {
+                aElem.push(aRes);
+            }
+        }
+        return aElem;
+    }
+
+    getListOfTokensReduc (sText, bInfo=true) {
+        let aTokenList = this.getListOfTokens(sText.replace(/'/g, "’").trim(), false);
+        let iKey = 0;
+        let aElem = [];
+        do {
+            let oToken = aTokenList[iKey];
+            let sMorphLoc = '';
+            let aTokenTempList = [oToken];
+            if (oToken.sType == "WORD" || oToken.sType == "ELPFX") {
+                let iKeyTree = iKey + 1;
+                let oLocNode = this.oLocGraph[oToken.sValue.toLowerCase()];
+                while (oLocNode) {
+                    let oTokenNext = aTokenList[iKeyTree];
+                    iKeyTree++;
+                    if (oTokenNext) {
+                        oLocNode = oLocNode[oTokenNext.sValue.toLowerCase()];
+                    }
+                    if (oLocNode && iKeyTree <= aTokenList.length) {
+                        sMorphLoc = oLocNode[":"];
+                        aTokenTempList.push(oTokenNext);
+                    } else {
+                        break;
+                    }
+                }
+            }
+
+            if (sMorphLoc) {
+                let sValue = '';
+                for (let oTokenWord of aTokenTempList) {
+                    sValue += oTokenWord.sValue + ' ';
+                }
+                let oTokenLocution = {
+                    'nStart': aTokenTempList[0].nStart,
+                    'nEnd': aTokenTempList[aTokenTempList.length-1].nEnd,
+                    'sType': "LOC",
+                    'sValue': sValue.replace(/’ /g, '’').trim(),
+                    'aSubToken': aTokenTempList
+                };
+                if (bInfo) {
+                    let aFormattedTag = [];
+                    for (let sTagLoc of sMorphLoc.split('|')) {
+                        aFormattedTag.push(this._formatTagsLoc(sTagLoc));
+                    }
+                    aElem.push({
+                        sType: oTokenLocution.sType,
+                        sValue: oTokenLocution.sValue,
+                        aLabel: aFormattedTag,
+                        aSubElem: this.generateInfoForTokenList(aTokenTempList)
+                    });
+                } else {
+                    aElem.push(oTokenLocution);
+                }
+                iKey = iKey + aTokenTempList.length;
+            } else {
+                if (bInfo) {
+                    let aRes = this.getInfoForToken(oToken);
+                    if (aRes) {
+                        aElem.push(aRes);
+                    }
+                } else {
+                    aElem.push(oToken);
+                }
+                iKey++;
+            }
+        } while (iKey < aTokenList.length);
+        return aElem;
     }
 }


 if (typeof(exports) !== 'undefined') {
     exports.Lexicographe = Lexicographe;
 }
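
Note: getListOfTokensReduc greedily walks the locution trie: starting from a WORD/ELPFX token it descends one node per following token, remembers the tag found under the ":" key, and on success merges the matched tokens into one LOC pseudo-token (the originals are kept in aSubToken). A simplified sketch of the lookup step, runnable against the generated data (illustrative, not the class method itself; it assumes the longest successful descent ends on the terminal node):

    const dLoc = {"à": {"cœur": {"joie": {":": ":LW"}, "ouvert": {":": ":LW"}}}};

    // returns [nTokensConsumed, sTag] or null
    function matchLocution (lWords, iStart) {
        let oNode = dLoc[lWords[iStart]];
        let i = iStart + 1;
        while (oNode && i < lWords.length && oNode[lWords[i]]) {
            oNode = oNode[lWords[i]];
            i++;
        }
        return (oNode && oNode[":"]) ? [i - iStart, oNode[":"]] : null;
    }

    matchLocution(["elle", "s’en", "donne", "à", "cœur", "joie"], 3);
    // → [3, ":LW"] — tokens 3 to 5 become the single LOC token "à cœur joie"
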
Index: gc_lang/fr/modules-js/locutions_data.json
==================================================================
--- gc_lang/fr/modules-js/locutions_data.json
+++ gc_lang/fr/modules-js/locutions_data.json
@@ -1,1 +1,1 @@
-{"à": {"califourchon": {}, "cœur": {"joie": {}, "ouvert": {}}, "corps": {"perdu": {}}, "perte": {"de": {"vue": {}}}, "visage": {"découvert": {}}}, "par": {"ailleurs": {}, "acquit": {"de": {"conscience": {}}}}}
+{"à": {"califourchon": {":": ":LW"}, "contrecœur": {":": ":LW"}, "cœur": {"joie": {":": ":LW"}, "ouvert": {":": ":LW"}}, "corps": {"perdu": {":": ":LW"}}, "bâtons": {"rompus": {":": ":LW"}}, "perte": {"de": {"vue": {":": ":LW"}}}, "visage": {"découvert": {":": ":LW"}}, "vue": {"d’": {"œil": {":": ":LW"}}}, "l’": {"aveuglette": {":": ":LW"}}}, "ad": {"hominem": {":": ":LW"}}, "en": {"tout": {"et": {"pour": {"tout": {":": ":LW"}}}}}, "par": {"ailleurs": {":": ":LW"}, "acquit": {"de": {"conscience": {":": ":LW"}}}}}

Index: gc_lang/fr/modules/locutions_data.py
==================================================================
--- gc_lang/fr/modules/locutions_data.py
+++ gc_lang/fr/modules/locutions_data.py
@@ -1,3 +1,3 @@
 # generated data (do not edit)
-dLocutions = {'à': {'califourchon': {}, 'cœur': {'joie': {}, 'ouvert': {}}, 'corps': {'perdu': {}}, 'perte': {'de': {'vue': {}}}, 'visage': {'découvert': {}}}, 'par': {'ailleurs': {}, 'acquit': {'de': {'conscience': {}}}}}
+dLocutions = {'à': {'califourchon': {':': ':LW'}, 'contrecœur': {':': ':LW'}, 'cœur': {'joie': {':': ':LW'}, 'ouvert': {':': ':LW'}}, 'corps': {'perdu': {':': ':LW'}}, 'bâtons': {'rompus': {':': ':LW'}}, 'perte': {'de': {'vue': {':': ':LW'}}}, 'visage': {'découvert': {':': ':LW'}}, 'vue': {'d’': {'œil': {':': ':LW'}}}, 'l’': {'aveuglette': {':': ':LW'}}}, 'ad': {'hominem': {':': ':LW'}}, 'en': {'tout': {'et': {'pour': {'tout': {':': ':LW'}}}}}, 'par': {'ailleurs': {':': ':LW'}, 'acquit': {'de': {'conscience': {':': ':LW'}}}}}
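
Note: the generated data carries the :LW tags from locutions.txt, matching what _dLocTAGS and the :L[A-Z] regex in lexicographe.js expect. Using ":" as the terminal key is presumably collision-free, since the tokenizer never yields a bare colon as a locution word. For instance, in the JSON above:

    let oNode = dLocutions["par"]["acquit"]["de"]["conscience"];
    oNode[":"]                          // → ":LW" — a complete locution
    dLocutions["par"]["acquit"][":"]    // → undefined — "par acquit" alone is not
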
Index: gc_lang/fr/webext/content_scripts/panel_lxg.css
==================================================================
--- gc_lang/fr/webext/content_scripts/panel_lxg.css
+++ gc_lang/fr/webext/content_scripts/panel_lxg.css
@@ -7,12 +7,12 @@
 }

 .grammalecte_lxg_list_of_tokens {
     margin: 5px 0 10px 0;
     padding: 10px;
-    background-color: hsla(0, 0%, 96%, 1);
-    border-radius: 2px;
+    background-color: hsla(0, 0%, 95%, 1);
+    border-radius: 5px;
 }

 .grammalecte_lxg_list_num {
     float: right;
     margin: -12px 0 5px 10px;
@@ -33,10 +33,21 @@
     font-size: 20px;
 }

 .grammalecte_lxg_token_block {
     margin: 4px 0;
+}
+.grammalecte_lxg_token_subblock {
+    margin: 2px 0 2px 20px;
+    padding: 5px;
+    border-left: 4px solid hsl(150, 30%, 70%);
+    background-color: hsl(210, 10%, 90%);
+    border-radius: 2px;
+}
+.grammalecte_lxg_token_descr {
+    margin: 1px;
+    padding: 1px;
 }
 .grammalecte_lxg_token {
     display: inline-block;
     background-color: hsl(150, 0%, 50%);
     color: hsl(0, 0%, 96%);
@@ -49,15 +60,18 @@
     display: inline-block;
     padding: 2px 5px;
     color: hsl(0, 0%, 50%);
 }
 .grammalecte_lxg_morph_list {
-    padding: 2px 0 10px 20px;
+    padding: 2px 0 2px 20px;
 }
 .grammalecte_lxg_morph_elem {
     color: hsl(0, 0%, 0%);
 }
+.grammalecte_lxg_token_LOC {
+    background-color: hsla(150, 50%, 30%, 1);
+}
 .grammalecte_lxg_token_WORD {
     background-color: hsla(150, 50%, 50%, 1);
 }
 .grammalecte_lxg_token_ELPFX {
     background-color: hsla(150, 30%, 50%, 1);
Index: gc_lang/fr/webext/content_scripts/panel_lxg.js
==================================================================
--- gc_lang/fr/webext/content_scripts/panel_lxg.js
+++ gc_lang/fr/webext/content_scripts/panel_lxg.js
@@ -27,43 +27,61 @@

     addMessage (sMessage) {
         let xNode = oGrammalecte.createNode("div", {className: "grammalecte_panel_message", textContent: sMessage});
         this._xContentNode.appendChild(xNode);
     }

-    addListOfTokens (lTokens) {
+    addListOfTokens (lToken) {
        try {
-            if (lTokens) {
+            if (lToken) {
                 this._nCount += 1;
-                let xNodeDiv = oGrammalecte.createNode("div", {className: "grammalecte_lxg_list_of_tokens"});
-                xNodeDiv.appendChild(oGrammalecte.createNode("div", {className: "grammalecte_lxg_list_num", textContent: this._nCount}));
-                for (let oToken of lTokens) {
-                    xNodeDiv.appendChild(this._createTokenNode(oToken));
+                let xTokenList = oGrammalecte.createNode("div", {className: "grammalecte_lxg_list_of_tokens"});
+                xTokenList.appendChild(oGrammalecte.createNode("div", {className: "grammalecte_lxg_list_num", textContent: this._nCount}));
+                for (let oToken of lToken) {
+                    xTokenList.appendChild(this._createTokenBlock(oToken));
+                }
+                this._xContentNode.appendChild(xTokenList);
+            }
+        }
+        catch (e) {
+            showError(e);
+        }
+    }
+
+    _createTokenBlock (oToken) {
+        let xTokenBlock = oGrammalecte.createNode("div", {className: "grammalecte_lxg_token_block"});
+        xTokenBlock.appendChild(this._createTokenDescr(oToken));
+        if (oToken.aSubElem) {
+            let xSubBlock = oGrammalecte.createNode("div", {className: "grammalecte_lxg_token_subblock"});
+            for (let oSubElem of oToken.aSubElem) {
+                xSubBlock.appendChild(this._createTokenDescr(oSubElem));
+            }
+            xTokenBlock.appendChild(xSubBlock);
+        }
+        return xTokenBlock;
+    }
+
+    _createTokenDescr (oToken) {
+        try {
+            let xTokenDescr = oGrammalecte.createNode("div", {className: "grammalecte_lxg_token_descr"});
+            xTokenDescr.appendChild(oGrammalecte.createNode("div", {className: "grammalecte_lxg_token grammalecte_lxg_token_" + oToken.sType, textContent: oToken.sValue}));
+            xTokenDescr.appendChild(oGrammalecte.createNode("div", {className: "grammalecte_lxg_token_colon", textContent: ":"}));
+            if (oToken.aLabel.length === 1) {
+                xTokenDescr.appendChild(document.createTextNode(oToken.aLabel[0]));
+            } else {
+                let xMorphList = oGrammalecte.createNode("div", {className: "grammalecte_lxg_morph_list"});
+                for (let sLabel of oToken.aLabel) {
+                    xMorphList.appendChild(oGrammalecte.createNode("div", {className: "grammalecte_lxg_morph_elem", textContent: "• " + sLabel}));
                 }
-                this._xContentNode.appendChild(xNodeDiv);
+                xTokenDescr.appendChild(xMorphList);
             }
+            return xTokenDescr;
         }
         catch (e) {
             showError(e);
         }
     }

-    _createTokenNode (oToken) {
-        let xTokenNode = oGrammalecte.createNode("div", {className: "grammalecte_lxg_token_block"});
-        xTokenNode.appendChild(oGrammalecte.createNode("div", {className: "grammalecte_lxg_token grammalecte_lxg_token_" + oToken.sType, textContent: oToken.sValue}));
-        xTokenNode.appendChild(oGrammalecte.createNode("div", {className: "grammalecte_lxg_token_colon", textContent: ":"}));
-        if (oToken.aLabel.length === 1) {
-            xTokenNode.appendChild(document.createTextNode(oToken.aLabel[0]));
-        } else {
-            let xTokenList = oGrammalecte.createNode("div", {className: "grammalecte_lxg_morph_list"});
-            for (let sLabel of oToken.aLabel) {
-                xTokenList.appendChild(oGrammalecte.createNode("div", {className: "grammalecte_lxg_morph_elem", textContent: "• " + sLabel}));
-            }
-            xTokenNode.appendChild(xTokenList);
-        }
-        return xTokenNode;
-    }
-
     setHidden (sClass, bHidden) {
         for (let xNode of document.getElementsByClassName(sClass)) {
             xNode.hidden = bHidden;
         }
     }

Index: gc_lang/fr/webext/gce_worker.js
==================================================================
--- gc_lang/fr/webext/gce_worker.js
+++ gc_lang/fr/webext/gce_worker.js
@@ -138,10 +138,11 @@

 let oDict = null;
 let oTokenizer = null;
 let oLxg = null;
 let oTest = null;
+let oLocution = null;


 /*
     Technical note:
     This worker don’t work as a PromiseWorker (which returns a promise), so when we send request
@@ -160,15 +161,18 @@
         mfsp.init(helpers.loadFile(sExtensionPath + "/grammalecte/fr/mfsp_data.json"));
         //console.log("[Worker] Modules have been initialized…");
         gc_engine.load(sContext, sExtensionPath+"grammalecte/_dictionaries");
         oDict = gc_engine.getDictionary();
         oTest = new TestGrammarChecking(gc_engine, sExtensionPath+"/grammalecte/fr/tests_data.json");
-        oLxg = new Lexicographe(oDict);
+        oTokenizer = new Tokenizer("fr");
+
+        oLocution = helpers.loadFile(sExtensionPath + "/grammalecte/fr/locutions_data.json");
+
+        oLxg = new Lexicographe(oDict, oTokenizer, oLocution);
         if (dOptions !== null) {
             gc_engine.setOptions(dOptions);
         }
-        oTokenizer = new Tokenizer("fr");
         //tests();
         bInitDone = true;
     } else {
         console.log("[Worker] Already initialized…")
     }
@@ -296,23 +300,15 @@

 function getListOfTokens (sText, dInfo={}) {
     try {
         for (let sParagraph of text.getParagraph(sText)) {
             if (sParagraph.trim() !== "") {
-                let aElem = [];
-                let aRes = null;
-                for (let oToken of oTokenizer.genTokens(sParagraph)) {
-                    aRes = oLxg.getInfoForToken(oToken);
-                    if (aRes) {
-                        aElem.push(aRes);
-                    }
-                }
-                postMessage(createResponse("getListOfTokens", aElem, dInfo, false));
+                postMessage(createResponse("getListOfTokens", oLxg.getListOfTokensReduc(sParagraph, true), dInfo, false));
             }
         }
         postMessage(createResponse("getListOfTokens", null, dInfo, true));
     }
     catch (e) {
         helpers.logerror(e);
         postMessage(createResponse("getListOfTokens", createErrorResult(e, "no tokens"), dInfo, true, true));
     }
 }
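
Note: with these pieces in place, a lexicographer request for a sentence containing a known locution now returns it as a single element. A sketch of the result shape (field values inferred from the code above, assuming :LW resolves through _dLocTAGS):

    oLxg.getListOfTokensReduc("Elle s’en donne à cœur joie.", true);
    // → [ ...,
    //     { sType: "LOC", sValue: "à cœur joie",
    //       aLabel: ["locution adverbiale"],
    //       aSubElem: [ { sType: "WORD", sValue: "à", ... },
    //                   { sType: "WORD", sValue: "cœur", ... },
    //                   { sType: "WORD", sValue: "joie", ... } ] },
    //     { sType: "SEPARATOR", sValue: ".", aLabel: ["point"] } ]
    // panel_lxg.js renders the LOC entry, then each aSubElem line in the
    // indented grammalecte_lxg_token_subblock beneath it.
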