Index: gc_core/js/char_player.js ================================================================== --- gc_core/js/char_player.js +++ gc_core/js/char_player.js @@ -24,77 +24,10 @@ sRes += this._dTransChars.gl_get(c, c); } return sRes.replace("eau", "o").replace("au", "o"); }, - distanceDamerauLevenshtein: function (s1, s2) { - // distance of Damerau-Levenshtein between and - // https://fr.wikipedia.org/wiki/Distance_de_Damerau-Levenshtein - try { - let nLen1 = s1.length; - let nLen2 = s2.length; - let matrix = []; - for (let i = 0; i <= nLen1; i++) { - matrix[i] = new Array(nLen2 + 1); - } - for (let i = 0; i <= nLen1; i++) { - matrix[i][0] = i; - } - for (let j = 0; j <= nLen2; j++) { - matrix[0][j] = j; - } - for (let i = 1; i <= nLen1; i++) { - for (let j = 1; j <= nLen2; j++) { - let nCost = (s1[i] === s2[j]) ? 0 : 1; - matrix[i][j] = Math.min( - matrix[i-1][j] + 1, // Deletion - matrix[i][j-1] + 1, // Insertion - matrix[i-1][j-1] + nCost // Substitution - ); - if (i > 1 && j > 1 && s1[i] == s2[j-1] && s1[i-1] == s2[j]) { - matrix[i][j] = Math.min(matrix[i][j], matrix[i-2][j-2] + nCost); // Transposition - } - } - } - //console.log(s2 + ": " + matrix[nLen1][nLen2]); - return matrix[nLen1][nLen2]; - } - catch (e) { - helpers.logerror(e); - } - }, - - showDistance (s1, s2) { - let s1b = this.cleanWord(s1); - let s2b = this.cleanWord(s2); - console.log(`Distance: ${s1} / ${s2} = ${this.distanceDamerauLevenshtein(s1, s2)})`); - console.log(`Distance: ${s1b} / ${s2b} = ${this.distanceDamerauLevenshtein(s1b, s2b)})`); - }, - - // Method: Remove Useless Chars - - aVovels: new Set([ - 'a', 'e', 'i', 'o', 'u', 'y', - 'à', 'é', 'î', 'ô', 'û', 'ÿ', - 'â', 'è', 'ï', 'ö', 'ù', 'ŷ', - 'ä', 'ê', 'í', 'ó', 'ü', 'ý', - 'á', 'ë', 'ì', 'ò', 'ú', 'ỳ', - 'ā', 'ē', 'ī', 'ō', 'ū', 'ȳ', - 'h', 'œ', 'æ' - ]), - - shrinkWord: function (sWord) { - // remove vovels and h - let sRes = ""; - for (let cChar of sWord.slice(1)) { - if (!this.aVovels.has(cChar)) { - sRes += cChar; - } - } - return sWord.slice(0, 1).replace("h", "") + sRes; - }, - // Similar chars d1to1: new Map([ ["1", "liîLIÎ"], Index: gc_core/js/ibdawg.js ================================================================== --- gc_core/js/ibdawg.js +++ gc_core/js/ibdawg.js @@ -191,38 +191,39 @@ suggest (sWord, nMaxSugg=10) { // returns a array of suggestions for let sPfx = ""; let sSfx = ""; [sPfx, sWord, sSfx] = char_player.cut(sWord); + let nMaxSwitch = Math.max(Math.floor(sWord.length / 3), 1); let nMaxDel = Math.floor(sWord.length / 5); let nMaxHardRepl = Math.max(Math.floor((sWord.length - 5) / 4), 1); - let aSugg = this._suggest(sWord, nMaxDel, nMaxHardRepl); + let aSugg = this._suggest(sWord, nMaxSwitch, nMaxDel, nMaxHardRepl); if (sWord.gl_isTitle()) { - aSugg.gl_update(this._suggest(sWord.toLowerCase(), nMaxDel, nMaxHardRepl)); + aSugg.gl_update(this._suggest(sWord.toLowerCase(), nMaxSwitch, nMaxDel, nMaxHardRepl)); } else if (sWord.gl_isLowerCase()) { - aSugg.gl_update(this._suggest(sWord.gl_toCapitalize(), nMaxDel, nMaxHardRepl)); + aSugg.gl_update(this._suggest(sWord.gl_toCapitalize(), nMaxSwitch, nMaxDel, nMaxHardRepl)); } // Set to Array aSugg = Array.from(aSugg); aSugg = char_player.filterSugg(aSugg); if (sWord.gl_isTitle()) { aSugg = aSugg.map((sSugg) => { return sSugg.gl_toCapitalize(); }); } let dDistTemp = new Map(); let sCleanWord = char_player.cleanWord(sWord); - aSugg.forEach((sSugg) => { dDistTemp.set(sSugg, char_player.distanceDamerauLevenshtein(sCleanWord, char_player.cleanWord(sSugg))); }); + aSugg.forEach((sSugg) => { dDistTemp.set(sSugg, str_transform.distanceDamerauLevenshtein(sCleanWord, char_player.cleanWord(sSugg))); }); aSugg = aSugg.sort((sA, sB) => { return dDistTemp.get(sA) - dDistTemp.get(sB); }).slice(0, nMaxSugg); dDistTemp.clear(); if (sSfx || sPfx) { // we add what we removed return aSugg.map( (sSugg) => { return sPfx + sSugg + sSfx } ); } return aSugg; } - _suggest (sRemain, nMaxDel=0, nMaxHardRepl=0, nDeep=0, iAddr=0, sNewWord="", bAvoidLoop=false) { + _suggest (sRemain, nMaxSwitch=0, nMaxDel=0, nMaxHardRepl=0, nDeep=0, iAddr=0, sNewWord="", bAvoidLoop=false) { // returns a set of suggestions // recursive function let aSugg = new Set(); if (sRemain == "") { if (this._convBytesToInteger(this.byDic.slice(iAddr, iAddr+this.nBytesArc)) & this._finalNodeMask) { @@ -232,61 +233,71 @@ aSugg.add(sNewWord+sTail); } return aSugg; } let cCurrent = sRemain.slice(0, 1); - for (let [cChar, jAddr] of this._getSimilarArcs(cCurrent, iAddr)) { - aSugg.gl_update(this._suggest(sRemain.slice(1), nMaxDel, nMaxHardRepl, nDeep+1, jAddr, sNewWord+cChar)); + for (let [cChar, jAddr] of this._getSimilarCharArcs(cCurrent, iAddr)) { + aSugg.gl_update(this._suggest(sRemain.slice(1), nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, jAddr, sNewWord+cChar)); } if (!bAvoidLoop) { // avoid infinite loop - if (cCurrent == sRemain.slice(1, 2)) { - // same char, we remove 1 char without adding 1 to - aSugg.gl_update(this._suggest(sRemain.slice(1), nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord)); - } - else { - // switching chars - aSugg.gl_update(this._suggest(sRemain.slice(1, 2)+sRemain.slice(0, 1)+sRemain.slice(2), nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, true)); - // delete char - if (nMaxDel > 0) { - aSugg.gl_update(this._suggest(sRemain.slice(1), nMaxDel-1, nMaxHardRepl, nDeep+1, iAddr, sNewWord, true)); - } - } - // Phonetic replacements - for (let sRepl of char_player.d1toX.gl_get(cCurrent, [])) { - aSugg.gl_update(this._suggest(sRepl + sRemain.slice(1), nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, true)); - } - for (let sRepl of char_player.d2toX.gl_get(sRemain.slice(0, 2), [])) { - aSugg.gl_update(this._suggest(sRepl + sRemain.slice(2), nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, true)); - } - // Hard replacements - if (nDeep > 3 && nMaxHardRepl && sRemain.length >= 2) { - for (let [nVal, kAddr] of this._getArcs1(iAddr)) { - if (this.dCharVal.has(nVal)) { - let cChar = this.dCharVal.get(nVal); + if (sRemain.length > 1) { + if (cCurrent == sRemain.slice(1, 2)) { + // same char, we remove 1 char without adding 1 to + aSugg.gl_update(this._suggest(sRemain.slice(1), nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord)); + } + else { + // switching chars + if (nMaxSwitch > 0) { + aSugg.gl_update(this._suggest(sRemain.slice(1, 2)+sRemain.slice(0, 1)+sRemain.slice(2), nMaxSwitch-1, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, true)); + } + // delete char + if (nMaxDel > 0) { + aSugg.gl_update(this._suggest(sRemain.slice(1), nMaxSwitch, nMaxDel-1, nMaxHardRepl, nDeep+1, iAddr, sNewWord, true)); + } + } + // Phonetic replacements + for (let sRepl of char_player.d1toX.gl_get(cCurrent, [])) { + aSugg.gl_update(this._suggest(sRepl + sRemain.slice(1), nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, true)); + } + for (let sRepl of char_player.d2toX.gl_get(sRemain.slice(0, 2), [])) { + aSugg.gl_update(this._suggest(sRepl + sRemain.slice(2), nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, true)); + } + // Hard replacements + if (nDeep > 3 && nMaxHardRepl && sRemain.length >= 2) { + for (let [cChar, kAddr] of this._getCharArcs(iAddr)) { if (!char_player.d1to1.gl_get(cCurrent, "").includes(cChar)) { - aSugg.gl_update(this._suggest(sRemain.slice(1), nMaxDel, nMaxHardRepl-1, nDeep+1, kAddr, sNewWord+cChar, true)); + aSugg.gl_update(this._suggest(sRemain.slice(1), nMaxSwitch, nMaxDel, nMaxHardRepl-1, nDeep+1, kAddr, sNewWord+cChar, true)); } } } } // end of word if (sRemain.length == 2) { for (let sRepl of char_player.dFinal2.gl_get(sRemain, [])) { - aSugg.gl_update(this._suggest(sRepl, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, true)); + aSugg.gl_update(this._suggest(sRepl, nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, true)); } } else if (sRemain.length == 1) { - aSugg.gl_update(this._suggest("", nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, true)); // remove last char and go on + aSugg.gl_update(this._suggest("", nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, true)); // remove last char and go on for (let sRepl of char_player.dFinal1.gl_get(sRemain, [])) { - aSugg.gl_update(this._suggest(sRepl, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, true)); + aSugg.gl_update(this._suggest(sRepl, nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, true)); } } } return aSugg; } - * _getSimilarArcs (cChar, iAddr) { + * _getCharArcs (iAddr) { + // generator: yield all chars and addresses from node at address + for (let [nVal, jAddr] of this._getArcs(iAddr)) { + if (nVal < this.nChar) { + yield [this.dCharVal.get(nVal), jAddr]; + } + } + } + + * _getSimilarCharArcs (cChar, iAddr) { // generator: yield similar char of and address of the following node for (let c of char_player.d1to1.gl_get(cChar, [cChar])) { if (this.dChar.has(c)) { let jAddr = this._lookupArcNode(this.dChar.get(c), iAddr); if (jAddr) { Index: gc_core/js/str_transform.js ================================================================== --- gc_core/js/str_transform.js +++ gc_core/js/str_transform.js @@ -2,10 +2,52 @@ /*jslint esversion: 6*/ // Note: 48 is the ASCII code for "0" var str_transform = { + + distanceDamerauLevenshtein: function (s1, s2) { + // distance of Damerau-Levenshtein between and + // https://fr.wikipedia.org/wiki/Distance_de_Damerau-Levenshtein + try { + let nLen1 = s1.length; + let nLen2 = s2.length; + let matrix = []; + for (let i = 0; i <= nLen1; i++) { + matrix[i] = new Array(nLen2 + 1); + } + for (let i = 0; i <= nLen1; i++) { + matrix[i][0] = i; + } + for (let j = 0; j <= nLen2; j++) { + matrix[0][j] = j; + } + for (let i = 1; i <= nLen1; i++) { + for (let j = 1; j <= nLen2; j++) { + let nCost = (s1[i] === s2[j]) ? 0 : 1; + matrix[i][j] = Math.min( + matrix[i-1][j] + 1, // Deletion + matrix[i][j-1] + 1, // Insertion + matrix[i-1][j-1] + nCost // Substitution + ); + if (i > 1 && j > 1 && s1[i] == s2[j-1] && s1[i-1] == s2[j]) { + matrix[i][j] = Math.min(matrix[i][j], matrix[i-2][j-2] + nCost); // Transposition + } + } + } + //console.log(s2 + ": " + matrix[nLen1][nLen2]); + return matrix[nLen1][nLen2]; + } + catch (e) { + helpers.logerror(e); + } + }, + + showDistance (s1, s2) { + console.log(`Distance: ${s1} / ${s2} = ${this.distanceDamerauLevenshtein(s1, s2)})`); + }, + getStemFromSuffixCode: function (sFlex, sSfxCode) { // Suffix only if (sSfxCode == "0") { return sFlex; }