ADDED gc_core/js/char_player.js Index: gc_core/js/char_player.js ================================================================== --- /dev/null +++ gc_core/js/char_player.js @@ -0,0 +1,322 @@ +// list of similar chars +// useful for suggestion mechanism + +${map} + + +function distanceDamerauLevenshtein (s1, s2) { + // distance of Damerau-Levenshtein between and + // https://fr.wikipedia.org/wiki/Distance_de_Damerau-Levenshtein + let d = new Map(); + let nLen1 = s1.length; + let nLen2 = s2.length; + for (let i = -1; i <= nLen1; i++) { + d.set([i, -1], i + 1); + } + for (let j = -1; i <= nLen2; i++) { + d.set([-1, j], j + 1); + } + for (let i = 0; i < nLen1; i++) { + for (let j =0; j < nLen2; i++) { + let nCost = (s1[i] === s2[j]) ? 0 : 1; + d.set([i, j], Math.min( + d.get([i-1, j]) + 1, // Deletion + d.get([i, j-1]) + 1, // Insertion + d.get([i-1, j-1]) + nCost, // Substitution + )); + if (i && j && s1[i] == s2[j-1] && s1[i-1] == s2[j]) { + d.set([i, j], Math.min(d.get([i, j]), d.get([i-2, j-2]) + nCost)); // Transposition + } + } + } + return d.get([nLen1-1, nLen2-1]); +} + + +// Method: Remove Useless Chars + +const aVovels = new Set([ + 'a', 'e', 'i', 'o', 'u', 'y', + 'à', 'é', 'î', 'ô', 'û', 'ÿ', + 'â', 'è', 'ï', 'ö', 'ù', 'ŷ', + 'ä', 'ê', 'í', 'ó', 'ü', 'ý', + 'á', 'ë', 'ì', 'ò', 'ú', 'ỳ', + 'ā', 'ē', 'ī', 'ō', 'ū', 'ȳ', + 'h', 'œ', 'æ' +]); + + +function clearWord (sWord) { + // remove vovels and h + let sRes = ""; + for (let cChar of sWord.slice(1)) { + if (!aVovels.has(cChar)) { + sRes += cChar; + } + } + return sWord.slice(0, 1).replace("h", "") + sRes; +} + + +// Similar chars + +const d1to1 = new Map([ + ["1", "liîLIÎ"], + ["2", "zZ"], + ["3", "eéèêEÉÈÊ"], + ["4", "aàâAÀÂ"], + ["5", "sgSG"], + ["6", "bdgBDG"], + ["7", "ltLT"], + ["8", "bB"], + ["9", "gbdGBD"], + ["0", "oôOÔ"], + + ["a", "aàâáäæ"], + ["A", "AÀÂÁÄÆ"], + ["à", "aàâáäæ"], + ["À", "AÀÂÁÄÆ"], + ["â", "aàâáäæ"], + ["Â", "AÀÂÁÄÆ"], + ["á", "aàâáäæ"], + ["Á", "AÀÂÁÄÆ"], + ["ä", "aàâáäæ"], + ["Ä", "AÀÂÁÄÆ"], + + ["æ", "æéa"], + ["Æ", "ÆÉA"], + + ["c", "cçskqśŝ"], + ["C", "CÇSKQŚŜ"], + ["ç", "cçskqśŝ"], + ["Ç", "CÇSKQŚŜ"], + + ["e", "eéèêëœ"], + ["E", "EÉÈÊËŒ"], + ["é", "eéèêëœ"], + ["É", "EÉÈÊËŒ"], + ["ê", "eéèêëœ"], + ["Ê", "EÉÈÊËŒ"], + ["è", "eéèêëœ"], + ["È", "EÉÈÊËŒ"], + ["ë", "eéèêëœ"], + ["Ë", "EÉÈÊËŒ"], + + ["g", "gj"], + ["G", "GJ"], + + ["i", "iîïyíìÿ"], + ["I", "IÎÏYÍÌŸ"], + ["î", "iîïyíìÿ"], + ["Î", "IÎÏYÍÌŸ"], + ["ï", "iîïyíìÿ"], + ["Ï", "IÎÏYÍÌŸ"], + ["í", "iîïyíìÿ"], + ["Í", "IÎÏYÍÌŸ"], + ["ì", "iîïyíìÿ"], + ["Ì", "IÎÏYÍÌŸ"], + + ["j", "jg"], + ["J", "JG"], + + ["k", "kcq"], + ["K", "KCQ"], + + ["n", "nñ"], + ["N", "NÑ"], + + ["o", "oôóòöœ"], + ["O", "OÔÓÒÖŒ"], + ["ô", "oôóòöœ"], + ["Ô", "OÔÓÒÖŒ"], + ["ó", "oôóòöœ"], + ["Ó", "OÔÓÒÖŒ"], + ["ò", "oôóòöœ"], + ["Ò", "OÔÓÒÖŒ"], + ["ö", "oôóòöœ"], + ["Ö", "OÔÓÒÖŒ"], + + ["œ", "œoôeéèêë"], + ["Œ", "ŒOÔEÉÈÊË"], + + ["q", "qck"], + ["Q", "QCK"], + + ["s", "sśŝcç"], + ["S", "SŚŜCÇ"], + ["ś", "sśŝcç"], + ["Ś", "SŚŜCÇ"], + ["ŝ", "sśŝcç"], + ["Ŝ", "SŚŜCÇ"], + + ["u", "uûùüú"], + ["U", "UÛÙÜÚ"], + ["û", "uûùüú"], + ["Û", "UÛÙÜÚ"], + ["ù", "uûùüú"], + ["Ù", "UÛÙÜÚ"], + ["ü", "uûùüú"], + ["Ü", "UÛÙÜÚ"], + ["ú", "uûùüú"], + ["Ú", "UÛÙÜÚ"], + + ["v", "vw"], + ["V", "VW"], + + ["w", "wv"], + ["W", "WV"], + + ["x", "xck"], + ["X", "XCK"], + + ["y", "yÿiîŷýỳ"], + ["Y", "YŸIÎŶÝỲ"], + ["ÿ", "yÿiîŷýỳ"], + ["Ÿ", "YŸIÎŶÝỲ"], + ["ŷ", "yÿiîŷýỳ"], + ["Ŷ", "YŸIÎŶÝỲ"], + ["ý", "yÿiîŷýỳ"], + ["Ý", "YŸIÎŶÝỲ"], + ["ỳ", "yÿiîŷýỳ"], + ["Ỳ", "YŸIÎŶÝỲ"], + + ["z", "zs"], + ["Z", "ZS"], +]); + +const d1toX = new Map([ + ["æ", ["ae",]], + ["Æ", ["AE",]], + ["b", ["bb",]], + ["B", ["BB",]], + ["c", ["cc", "ss", "qu", "ch"]], + ["C", ["CC", "SS", "QU", "CH"]], + ["ç", ["ss", "cc", "qh", "ch"]], + ["Ç", ["SS", "CC", "QH", "CH"]], + ["d", ["dd",]], + ["D", ["DD",]], + ["f", ["ff", "ph"]], + ["F", ["FF", "PH"]], + ["g", ["gu", "ge", "gg", "gh"]], + ["G", ["GU", "GE", "GG", "GH"]], + ["i", ["ii",]], + ["I", ["II",]], + ["j", ["jj", "dj"]], + ["J", ["JJ", "DJ"]], + ["k", ["qu", "ck", "ch", "cu", "kk", "kh"]], + ["K", ["QU", "CK", "CH", "CU", "KK", "KH"]], + ["l", ["ll",]], + ["L", ["LL",]], + ["m", ["mm", "mn"]], + ["M", ["MM", "MN"]], + ["n", ["nn", "nm", "mn"]], + ["N", ["NN", "NM", "MN"]], + ["o", ["au", "eau", "aut"]], + ["O", ["AU", "EAU", "AUT"]], + ["œ", ["oe", "eu"]], + ["Œ", ["OE", "EU"]], + ["p", ["pp", "ph"]], + ["P", ["PP", "PH"]], + ["q", ["qu", "ch", "cq", "ck", "kk"]], + ["Q", ["QU", "CH", "CQ", "CK", "KK"]], + ["r", ["rr",]], + ["R", ["RR",]], + ["s", ["ss", "sh"]], + ["S", ["SS", "SH"]], + ["t", ["tt", "th"]], + ["T", ["TT", "TH"]], + ["x", ["cc", "ct", "xx"]], + ["X", ["CC", "CT", "XX"]], + ["z", ["ss", "zh"]], + ["Z", ["SS", "ZH"]], +]); + +const d2toX = new Map([ + ["an", ["en",]], + ["AN", ["EN",]], + ["en", ["an",]], + ["EN", ["AN",]], + ["ai", ["ei", "é", "è", "ê", "ë"]], + ["AI", ["EI", "É", "È", "Ê", "Ë"]], + ["ei", ["ai", "é", "è", "ê", "ë"]], + ["EI", ["AI", "É", "È", "Ê", "Ë"]], + ["ch", ["sh", "c", "ss"]], + ["CH", ["SH", "C", "SS"]], + ["ct", ["x", "cc"]], + ["CT", ["X", "CC"]], + ["oa", ["oi",]], + ["OA", ["OI",]], + ["oi", ["oa", "oie"]], + ["OI", ["OA", "OIE"]], + ["qu", ["q", "cq", "ck", "c", "k"]], + ["QU", ["Q", "CQ", "CK", "C", "K"]], + ["ss", ["c", "ç"]], + ["SS", ["C", "Ç"]], +]); + + +// End of word + +const dFinal1 = new Map([ + ["a", ["as", "at", "ant", "ah"]], + ["A", ["AS", "AT", "ANT", "AH"]], + ["c", ["ch",]], + ["C", ["CH",]], + ["e", ["et", "er", "ets", "ée", "ez", "ai", "ais", "ait", "ent", "eh"]], + ["E", ["ET", "ER", "ETS", "ÉE", "EZ", "AI", "AIS", "AIT", "ENT", "EH"]], + ["é", ["et", "er", "ets", "ée", "ez", "ai", "ais", "ait"]], + ["É", ["ET", "ER", "ETS", "ÉE", "EZ", "AI", "AIS", "AIT"]], + ["è", ["et", "er", "ets", "ée", "ez", "ai", "ais", "ait"]], + ["È", ["ET", "ER", "ETS", "ÉE", "EZ", "AI", "AIS", "AIT"]], + ["ê", ["et", "er", "ets", "ée", "ez", "ai", "ais", "ait"]], + ["Ê", ["ET", "ER", "ETS", "ÉE", "EZ", "AI", "AIS", "AIT"]], + ["ë", ["et", "er", "ets", "ée", "ez", "ai", "ais", "ait"]], + ["Ë", ["ET", "ER", "ETS", "ÉE", "EZ", "AI", "AIS", "AIT"]], + ["g", ["gh",]], + ["G", ["GH",]], + ["i", ["is", "it", "ie", "in"]], + ["I", ["IS", "IT", "IE", "IN"]], + ["n", ["nt", "nd", "ns", "nh"]], + ["N", ["NT", "ND", "NS", "NH"]], + ["o", ["aut", "ot", "os"]], + ["O", ["AUT", "OT", "OS"]], + ["ô", ["aut", "ot", "os"]], + ["Ô", ["AUT", "OT", "OS"]], + ["ö", ["aut", "ot", "os"]], + ["Ö", ["AUT", "OT", "OS"]], + ["p", ["ph",]], + ["P", ["PH",]], + ["s", ["sh",]], + ["S", ["SH",]], + ["t", ["th",]], + ["T", ["TH",]], + ["u", ["ut", "us", "uh"]], + ["U", ["UT", "US", "UH"]], +]); + +const dFinal2 = new Map([ + ["ai", ["aient", "ais", "et"]], + ["AI", ["AIENT", "AIS", "ET"]], + ["an", ["ant", "ent"]], + ["AN", ["ANT", "ENT"]], + ["en", ["ent", "ant"]], + ["EN", ["ENT", "ANT"]], + ["ei", ["ait", "ais"]], + ["EI", ["AIT", "AIS"]], + ["on", ["ons", "ont"]], + ["ON", ["ONS", "ONT"]], + ["oi", ["ois", "oit", "oix"]], + ["OI", ["OIS", "OIT", "OIX"]], +]); + + +// Préfixes + +aPfx1 = new Set([ + "anti", "archi", "contre", "hyper", "mé", "méta", "im", "in", "ir", "par", "proto", + "pseudo", "pré", "re", "ré", "sans", "sous", "supra", "sur", "ultra" +]); + +aPfx2 = new Set([ + "belgo", "franco", "génito", "gynéco", "médico", "russo" +]); Index: gc_core/js/ibdawg.js ================================================================== --- gc_core/js/ibdawg.js +++ gc_core/js/ibdawg.js @@ -9,14 +9,14 @@ var str_transform = require("resource://grammalecte/str_transform.js"); var helpers = require("resource://grammalecte/helpers.js"); } -// String -// Don’t remove. Necessary in TB. +// Don’t remove . Necessary in TB. ${string} - +${map} +${set} class IBDAWG { // INDEXABLE BINARY DIRECT ACYCLIC WORD GRAPH @@ -56,22 +56,25 @@ switch (this.nVersion) { case 1: this.morph = this._morph1; this.stem = this._stem1; this._lookupArcNode = this._lookupArcNode1; + this._getArcs = this._getArcs1; this._writeNodes = this._writeNodes1; break; case 2: this.morph = this._morph2; this.stem = this._stem2; this._lookupArcNode = this._lookupArcNode2; + this._getArcs = this._getArcs2; this._writeNodes = this._writeNodes2; break; case 3: this.morph = this._morph3; this.stem = this._stem3; this._lookupArcNode = this._lookupArcNode3; + this._getArcs = this._getArcs3; this._writeNodes = this._writeNodes3; break; default: throw ValueError("# Error: unknown code: " + this.nVersion); } @@ -166,10 +169,137 @@ l = l.concat(this.morph(sWord.gl_toCapitalize())); } } return l; } + + suggest (sWord, nMaxSugg=10) { + // returns a set of suggestions for + let aSugg = this._suggest(sWord, nMaxDel=Math.floor(sWord.length / 5)); + if (sWord.gl_isTitle()) { + aSugg.gl_update(this._suggest(sWord.lower(), nMaxDel=Math.floor(sWord.length / 5))); + aSugg = new Set(aSugg.map((sSugg) => { return sSugg.title(); })); + } + else if (sWord.gl_isLowerCase()) { + aSugg.gl_update(this._suggest(sWord.title(), nMaxDel=Math.floor(sWord.length / 5))); + } + if (aSugg.size == 0) { + aSugg.gl_update(this._suggestWithCrushedUselessChars(cp.clearWord(sWord))); + } + aSugg = aSugg.filter((sSugg) => { return !sSugg.endsWith("è") && !sSugg.endsWith("È"); }); // fr language + return aSugg.sort((sSugg) => { return cp.distanceDamerauLevenshtein(sWord, sSugg); }).slice(0, nMaxSugg); + } + + _suggest (sRemain, nMaxDel=0, nDeep=0, iAddr=0, sNewWord="", bAvoidLoop=False) { + // returns a set of suggestions + // recursive function + //show(nDeep, sNewWord + ":" + sRemain) + let aSugg = new Set(); + if (sRemain == "") { + if (this._convBytesToInteger(this.byDic.slice(iAddr, iAddr+this.nBytesArc)) & this._finalNodeMask) { + //show(nDeep, "___" + sNewWord + "___"); + aSugg.add(sNewWord); + } + for (let sTail of this._getTails(iAddr)) { + aSugg.add(sNewWord+sTail); + } + return aSugg; + } + let cCurrent = sRemain.slice(0, 1); + for (let [cChar, jAddr] of this._getSimilarArcs(cCurrent, iAddr)) { + aSugg.gl_update(this._suggest(sRemain.slice(1), nMaxDel, nDeep+1, jAddr, sNewWord+cChar)); + } + if (!bAvoidLoop) { // avoid infinite loop + if (cCurrent == sRemain.slice(1, 2)) { + // same char, we remove 1 char without adding 1 to + aSugg.gl_update(this._suggest(sRemain.slice(1), nMaxDel, nDeep+1, iAddr, sNewWord)); + } + else { + // switching chars + aSugg.gl_update(this._suggest(sRemain.slice(1, 2)+sRemain.slice(0, 1)+sRemain.slice(2), nMaxDel, nDeep+1, iAddr, sNewWord, true)); + // delete char + if (nMaxDel > 0) { + aSugg.gl_update(this._suggest(sRemain.slice(1), nMaxDel-1, nDeep+1, iAddr, sNewWord, true)); + } + } + // Replacements + for (let sRepl of cp.d1toX.gl_get(cCurrent, [])) { + aSugg.gl_update(this._suggest(sRepl + sRemain.slice(1), nMaxDel, nDeep+1, iAddr, sNewWord, true)); + } + for (let sRepl of cp.d2toX.gl_get(sRemain[0:2], [])) { + aSugg.gl_update(this._suggest(sRepl + sRemain.slice(2), nMaxDel, nDeep+1, iAddr, sNewWord, true)); + } + // end of word + if (sRemain.length == 2) { + for (let sRepl of cp.dFinal2.gl_get(sRemain, [])) { + aSugg.gl_update(this._suggest(sRepl, nMaxDel, nDeep+1, iAddr, sNewWord, true)); + } + } + else if (sRemain.length == 1) { + aSugg.gl_update(this._suggest("", nMaxDel, nDeep+1, iAddr, sNewWord, true)); // remove last char and go on + for (let sRepl of cp.dFinal1.gl_get(sRemain, [])) { + aSugg.gl_update(this._suggest(sRepl, nMaxDel, nDeep+1, iAddr, sNewWord, true)); + } + } + } + return aSugg; + } + + * _getSimilarArcs (cChar, iAddr) { + // generator: yield similar char of and address of the following node + for (let c of cp.d1to1.gl_get(cChar, [cChar])) { + if (this.dChar.has(c)) { + let jAddr = this._lookupArcNode(this.dChar.get(c), iAddr); + if (jAddr) { + yield [c, jAddr]; + } + } + } + } + + _getTails (iAddr, sTail="", n=2) { + // return a list of suffixes ending at a distance of from + let aTails = new Set(); + for (let [nVal, jAddr] of this._getArcs(iAddr)) { + if (nVal < this.nChar) { + if (this._convBytesToInteger(this.byDic.slice(jAddr, jAddr+this.nBytesArc)) & this._finalNodeMask) { + aTails.add(sTail + this.dCharVal.get(nVal)); + } + if (n && aTails.size == 0) { + aTails.update(this._getTails(jAddr, sTail+this.dCharVal.get(nVal), n-1)); + } + } + } + return aTails; + } + + _suggestWithCrushedUselessChars (sWord, nDeep=0, iAddr=0, sNewWord="", bAvoidLoop=False) { + let aSugg = new Set(); + if (sWord.length == 0) { + if (this._convBytesToInteger(this.byDic.slice(iAddr, iAddr+this.nBytesArc)) & this._finalNodeMask) { + show(nDeep, "!!! " + sNewWord + " !!!"); + aSugg.add(sNewWord); + } + return aSugg; + } + let cCurrent = sWord.slice(0, 1); + for (let [cChar, jAddr] of this._getSimilarArcsAndCrushedChars(cCurrent, iAddr)) { + show(nDeep, cChar); + aSugg.gl_update(this._suggestWithCrushedUselessChars(sWord[1:], nDeep+1, jAddr, sNewWord+cChar)); + } + return aSugg; + } + + * _getSimilarArcsAndCrushedChars (cChar, iAddr) { + // generator: yield similar char of and address of the following node + for (let [nVal, jAddr] of this._getArcs(iAddr)) { + if (this.dCharVal.get(nVal, null) in cp.aVovels) { + yield [this.dCharVal[nVal], jAddr]; + } + } + yield* this._getSimilarArcs(cChar, iAddr); + } // morph (sWord) { // is defined in constructor // } @@ -188,21 +318,21 @@ } if (this._convBytesToInteger(this.byDic.slice(iAddr, iAddr+this.nBytesArc)) & this._finalNodeMask) { let l = []; let nRawArc = 0; while (!(nRawArc & this._lastArcMask)) { - var iEndArcAddr = iAddr + this.nBytesArc; + let iEndArcAddr = iAddr + this.nBytesArc; nRawArc = this._convBytesToInteger(this.byDic.slice(iAddr, iEndArcAddr)); - var nArc = nRawArc & this._arcMask; + let nArc = nRawArc & this._arcMask; if (nArc >= this.nChar) { // This value is not a char, this is a stemming code - var sStem = ">" + this.funcStemming(sWord, this.lArcVal[nArc]); + let sStem = ">" + this.funcStemming(sWord, this.lArcVal[nArc]); // Now , we go to the next node and retrieve all following arcs values, all of them are tags - var iAddr2 = this._convBytesToInteger(this.byDic.slice(iEndArcAddr, iEndArcAddr+this.nBytesNodeAddress)); - var nRawArc2 = 0; + let iAddr2 = this._convBytesToInteger(this.byDic.slice(iEndArcAddr, iEndArcAddr+this.nBytesNodeAddress)); + let nRawArc2 = 0; while (!(nRawArc2 & this._lastArcMask)) { - var iEndArcAddr2 = iAddr2 + this.nBytesArc; + let iEndArcAddr2 = iAddr2 + this.nBytesArc; nRawArc2 = this._convBytesToInteger(this.byDic.slice(iAddr2, iEndArcAddr2)); l.push(sStem + " " + this.lArcVal[nRawArc2 & this._arcMask]); iAddr2 = iEndArcAddr2+this.nBytesNodeAddress; } } @@ -227,13 +357,13 @@ } if (this._convBytesToInteger(this.byDic.slice(iAddr, iAddr+this.nBytesArc)) & this._finalNodeMask) { let l = []; let nRawArc = 0; while (!(nRawArc & this._lastArcMask)) { - var iEndArcAddr = iAddr + this.nBytesArc; + let iEndArcAddr = iAddr + this.nBytesArc; nRawArc = this._convBytesToInteger(this.byDic.slice(iAddr, iEndArcAddr)); - var nArc = nRawArc & this._arcMask; + let nArc = nRawArc & this._arcMask; if (nArc >= this.nChar) { // This value is not a char, this is a stemming code l.push(this.funcStemming(sWord, this.lArcVal[nArc])); } iAddr = iEndArcAddr + this.nBytesNodeAddress; @@ -260,10 +390,23 @@ } iAddr = iEndArcAddr + this.nBytesNodeAddress; } } } + + _getArcs1 (iAddr) { + "generator: return all arcs at as tuples of (nVal, iAddr)" + while (true) { + let iEndArcAddr = iAddr+this.nBytesArc; + let nRawArc = this._convBytesToInteger(this.byDic.slice(iAddr, iEndArcAddr)); + yield [nRawArc & this._arcMask, this._convBytesToInteger(this.byDic.slice(iEndArcAddr, iEndArcAddr+this.nBytesNodeAddress))]; + if (nRawArc & this._lastArcMask) { + break; + } + iAddr = iEndArcAddr+this.nBytesNodeAddress; + } + } // VERSION 2 _morph2 (sWord) { // to do } ADDED gc_core/js/jsex_set.js Index: gc_core/js/jsex_set.js ================================================================== --- /dev/null +++ gc_core/js/jsex_set.js @@ -0,0 +1,13 @@ + +// Set +/*jslint esversion: 6*/ + +if (Set.prototype.grammalecte === undefined) { + Set.prototype.gl_update = function (aSet) { + for (let elem of aSet) { + this.add(elem); + } + }; + + Set.prototype.grammalecte = true; +} Index: gc_core/py/ibdawg.py ================================================================== --- gc_core/py/ibdawg.py +++ gc_core/py/ibdawg.py @@ -83,12 +83,12 @@ self._getArcs = self._getArcs2 self._writeNodes = self._writeNodes2 elif self.nVersion == 3: self.morph = self._morph3 self.stem = self._stem3 - self._getArcs = self._getArcs3 self._lookupArcNode = self._lookupArcNode3 + self._getArcs = self._getArcs3 self._writeNodes = self._writeNodes3 else: raise ValueError(" # Error: unknown code: {}".format(self.nVersion)) self.bOptNumSigle = False @@ -184,11 +184,10 @@ l.extend(self.morph(sWord.capitalize())) return l def suggest (self, sWord, nMaxSugg=10): "returns a set of suggestions for " - #return self._suggestWithCrushedUselessChars(cp.clearWord(sWord)) aSugg = set() if sWord.istitle(): aSugg.update(self._suggest(sWord, nMaxDel=len(sWord) // 5)) aSugg.update(self._suggest(sWord.lower(), nMaxDel=len(sWord) // 5)) aSugg = set(map(lambda sSugg: sSugg.title(), aSugg))