Overview
Comment: | [graphspell][js] suggest optimisation with Jaro-Winkler (thanks to IllusionPerdu) |
---|---|
Downloads: | Tarball | ZIP archive | SQL archive |
Timelines: | family | ancestors | descendants | both | graphspell | bdic_opt |
Files: | files | file ages | folders |
SHA3-256: |
3b3a02f4d385bf7b9d9c2ef3f992cc0c |
User & Date: | olr on 2020-09-15 13:50:00 |
Other Links: | branch diff | manifest | tags |
Context
2020-09-15
| ||
14:01 | [graphspell][js] remove specific trick in cleanWord() check-in: 6569849b49 user: olr tags: graphspell, bdic_opt | |
13:50 | [graphspell][js] suggest optimisation with Jaro-Winkler (thanks to IllusionPerdu) check-in: 3b3a02f4d3 user: olr tags: graphspell, bdic_opt | |
2020-09-14
| ||
14:38 | [graphspell] string comparison: use Jaro-Winkler check-in: efebe44d15 user: olr tags: graphspell, bdic_opt | |
Changes
Modified graphspell-js/char_player.js from [0602ec129b] to [8dac23cf9b].
1 2 3 4 5 6 7 8 9 10 | // list of similar chars // useful for suggestion mechanism /* jshint esversion:6 */ /* jslint esversion:6 */ ${map} var char_player = { | | > > > > | | | | | | | | | | | | | | | | | | | | | | | | | | | | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 | // list of similar chars // useful for suggestion mechanism /* jshint esversion:6 */ /* jslint esversion:6 */ ${map} var char_player = { /* oDistanceBetweenChars: - with Jaro-Winkler, values between 1 and 10 - with Damerau-Levenshtein, values / 10 (between 0 and 1: 0.1, 0.2 ... 0.9) */ oDistanceBetweenChars: { //"a": {}, "e": {"é": 5}, //"é": {"e": 5}, "i": {"y": 2}, //"o": {}, //"u": {}, "y": {"i": 3}, "b": {"d": 8, "h": 9}, "c": {"ç": 1, "k": 5, "q": 5, "s": 5, "x": 5, "z": 8}, "d": {"b": 8}, "f": {"v": 8}, "g": {"j": 5}, "h": {"b": 9}, "j": {"g": 5, "i": 9}, "k": {"c": 5, "q": 1, "x": 5}, "l": {"i": 9}, "m": {"n": 8}, "n": {"m": 8, "r": 9}, "p": {"q": 9}, "q": {"c": 5, "k": 1, "p": 9}, "r": {"n": 9, "j": 9}, "s": {"c": 5, "ç": 1, "x": 5, "z": 5}, "t": {"d": 9}, "v": {"f": 8, "w": 1}, "w": {"v": 1}, "x": {"c": 5, "k": 5, "q": 5, "s": 5}, "z": {"s": 5} }, distanceBetweenChars: function (c1, c2) { if (c1 == c2) { return 0; } if (this.oDistanceBetweenChars.hasOwnProperty(c1) && this.oDistanceBetweenChars[c1].hasOwnProperty(c2)) { |
︙ | ︙ |
Modified graphspell-js/ibdawg.js from [1cb5337715] to [2160aa77f7].
︙ | ︙ | |||
class SuggResult {
    // Structure for storing, classifying and filtering suggestions.
    //
    // Candidates are compared with the misspelled word through
    // str_transform.distanceJaroWinkler() applied to the simplified forms
    // (str_transform.simplifyWord()) and are sorted into two buckets:
    //   - dBestSugg: distance < 0.11
    //   - dGoodSugg: 0.11 <= distance < 0.33
    // Anything farther is only remembered in aAllSugg so it is not evaluated twice.

    /**
     * @param {string} sWord - the word for which suggestions are collected
     * @param {number} [nSuggLimit=10] - maximum number of suggestions returned by getSuggestions()
     * @param {number} [nDistLimit=-1] - initial distance limit; a negative value
     *        means "derive it from the word length" (floor(length / 3) + 1)
     */
    constructor (sWord, nSuggLimit=10, nDistLimit=-1) {
        this.sWord = sWord;
        this.sSimplifiedWord = str_transform.simplifyWord(sWord);
        this.nDistLimit = (nDistLimit >= 0) ? nDistLimit : Math.floor(sWord.length / 3) + 1;
        this.nMinDist = 1000;
        // Temporary sets
        this.aAllSugg = new Set();   // all suggestions seen, even the rejected ones
        this.dGoodSugg = new Map();  // acceptable suggestions -> distance * 1000 (rounded)
        this.dBestSugg = new Map();  // best suggestions -> distance * 1000 (rounded)
        // Parameters
        this.nSuggLimit = nSuggLimit;
        this.nSuggLimitExt = nSuggLimit + 2;                 // a few extra entries in case suggestions merge after casing modifications
        this.nBestSuggLimit = Math.floor(nSuggLimit * 1.5);  // n times the requested limit
        this.nGoodSuggLimit = nSuggLimit * 15;               // n times the requested limit
    }

    /**
     * Register a candidate suggestion and classify it by its distance to the word.
     * A candidate that was already submitted is ignored.
     * @param {string} sSugg - candidate suggestion
     */
    addSugg (sSugg) {
        if (this.aAllSugg.has(sSugg)) {
            return;
        }
        this.aAllSugg.add(sSugg);
        // Jaro-Winkler similarity goes from 0 to 1 (1: the strings are equal);
        // it is turned into a distance where 0 means identical.
        const nDistJaro = 1 - str_transform.distanceJaroWinkler(this.sSimplifiedWord, str_transform.simplifyWord(sSugg));
        const nDist = Math.floor(nDistJaro * 10);
        if (nDistJaro < 0.11) {
            // Best suggestions
            this.dBestSugg.set(sSugg, Math.round(nDistJaro * 1000));
            if (this.dBestSugg.size > this.nBestSuggLimit) {
                this.nDistLimit = -1; // make suggest() end the search
            }
        } else if (nDistJaro < 0.33) {
            // Good suggestions
            this.dGoodSugg.set(sSugg, Math.round(nDistJaro * 1000));
            if (this.dGoodSugg.size > this.nGoodSuggLimit) {
                this.nDistLimit = -1; // make suggest() end the search
            }
        } else {
            // Rejected suggestion: only used to tighten the distance limit
            if (nDist < this.nMinDist) {
                this.nMinDist = nDist;
            }
            this.nDistLimit = Math.min(this.nDistLimit, this.nMinDist);
        }
        if (nDist <= this.nDistLimit) {
            if (nDist < this.nMinDist) {
                this.nMinDist = nDist;
            }
            this.nDistLimit = Math.min(this.nDistLimit, this.nMinDist + 1);
        }
    }

    /**
     * Build the final list: best suggestions first (sorted by stored distance),
     * completed with good suggestions when there is room, then adapted to the
     * casing of the original word and truncated to nSuggLimit entries.
     * @returns {string[]} list of suggestions
     */
    getSuggestions () {
        let lRes = [];
        if (this.dBestSugg.size > 0) {
            // sort only with simplified words
            const lBest = [...this.dBestSugg.entries()].sort((a, b) => a[1] - b[1]);
            const nSize = Math.min(this.nSuggLimitExt, lBest.length);
            for (let i = 0; i < nSize; i++) {
                lRes.push(lBest[i][0]);
            }
        }
        if (lRes.length < this.nSuggLimitExt) {
            // sort with simplified words and original word
            const lGood = [...this.dGoodSugg.entries()].sort((a, b) => {
                // Low precision to rely more on simplified words
                const nJaroA = Math.round(str_transform.distanceJaroWinkler(this.sWord, a[0]) * 10);
                const nJaroB = Math.round(str_transform.distanceJaroWinkler(this.sWord, b[0]) * 10);
                if (nJaroA === nJaroB) {
                    return a[1] - b[1];     // stored distance: ascending (key: a-b)
                }
                return nJaroB - nJaroA;     // similarity: descending (key: b-a)
            }).slice(0, this.nSuggLimitExt);
            const nSize = Math.min(this.nSuggLimitExt, lGood.length);
            for (let i = 0; i < nSize; i++) {
                lRes.push(lGood[i][0]);
            }
        }
        // casing
        if (this.sWord.gl_isUpperCase()) {
            lRes = lRes.map((sSugg) => sSugg.toUpperCase());
            lRes = [...new Set(lRes)];
        }
        else if (this.sWord.slice(0, 1).gl_isUpperCase()) {
            lRes = lRes.map((sSugg) => sSugg.slice(0, 1).toUpperCase() + sSugg.slice(1));
            lRes = [...new Set(lRes)];
        }
        return lRes.slice(0, this.nSuggLimit);
    }

    /**
     * Forget every suggestion collected so far.
     * Only the three collections are emptied; nDistLimit and nMinDist keep
     * their current values.
     */
    reset () {
        // The constructor creates aAllSugg (there is no dSugg field): clearing
        // the right collection avoids a TypeError and lets candidates be
        // submitted again after a reset.
        this.aAllSugg.clear();
        this.dGoodSugg.clear();
        this.dBestSugg.clear();
    }
}
︙ | ︙ | |||
327 328 329 330 331 332 333 | if (this.lexicographer) { [sPfx, sWord, sSfx] = this.lexicographer.split(sWord); } let nMaxSwitch = Math.max(Math.floor(sWord.length / 3), 1); let nMaxDel = Math.floor(sWord.length / 5); let nMaxHardRepl = Math.max(Math.floor((sWord.length - 5) / 4), 1); let nMaxJump = Math.max(Math.floor(sWord.length / 4), 1); | | > | | 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 | if (this.lexicographer) { [sPfx, sWord, sSfx] = this.lexicographer.split(sWord); } let nMaxSwitch = Math.max(Math.floor(sWord.length / 3), 1); let nMaxDel = Math.floor(sWord.length / 5); let nMaxHardRepl = Math.max(Math.floor((sWord.length - 5) / 4), 1); let nMaxJump = Math.max(Math.floor(sWord.length / 4), 1); let oSuggResult = new SuggResult(sWord, nSuggLimit); let sWord = str_transform.cleanWord(sWord); if (bSplitTrailingNumbers) { this._splitTrailingNumbers(oSuggResult, sWord); } this._splitSuggest(oSuggResult, sWord); this._suggest(oSuggResult, sWord, nMaxSwitch, nMaxDel, nMaxHardRepl, nMaxJump); let aSugg = oSuggResult.getSuggestions(); if (this.lexicographer) { aSugg = this.lexicographer.filterSugg(aSugg); } if (sSfx || sPfx) { // we add what we removed return aSugg.map( (sSugg) => { return sPfx + sSugg + sSfx; } ); } |
︙ | ︙ |
Modified graphspell-js/str_transform.js from [5a573a5745] to [8ec0376c2c].
︙ | ︙ | |||
62 63 64 65 66 67 68 69 70 71 72 73 74 75 | if (c != sWord.slice(i, i+1) || (c == 'e' && sWord.slice(i, i+2) != "ee")) { // exception for <e> to avoid confusion between crée / créai sNewWord += c; } i++; } return sNewWord.replace(/eau/g, "o").replace(/au/g, "o").replace(/ai/g, "éi").replace(/ei/g, "é").replace(/ph/g, "f"); }, _xTransNumbersToExponent: new Map([ ["0", "⁰"], ["1", "¹"], ["2", "²"], ["3", "³"], ["4", "⁴"], ["5", "⁵"], ["6", "⁶"], ["7", "⁷"], ["8", "⁸"], ["9", "⁹"] ]), numbersToExponent: function (sWord) { let sNewWord = ""; | > > > > > > > > > > > > > | 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 | if (c != sWord.slice(i, i+1) || (c == 'e' && sWord.slice(i, i+2) != "ee")) { // exception for <e> to avoid confusion between crée / créai sNewWord += c; } i++; } return sNewWord.replace(/eau/g, "o").replace(/au/g, "o").replace(/ai/g, "éi").replace(/ei/g, "é").replace(/ph/g, "f"); }, cleanWord: function (sWord) { // word clean for the user who make commun and preditive error help suggest // remove letters repeated more than 2 times if (sWord.match(/(.)(\1){2,}/igm)){ sWord = sWord.replace(/(.*)(.)(.\2)/igm,'$1$2').replace(/(.)(\1)+/igm,'$1$1'); } // words ending with -ik -> replace with -ique if (sWord.match(/ik$/ig)){ sWord = sWord.replace(/(.*)ik$/ig,'$1ique'); } return sWord; }, _xTransNumbersToExponent: new Map([ ["0", "⁰"], ["1", "¹"], ["2", "²"], ["3", "³"], ["4", "⁴"], ["5", "⁵"], ["6", "⁶"], ["7", "⁷"], ["8", "⁸"], ["9", "⁹"] ]), numbersToExponent: function (sWord) { let sNewWord = ""; |
︙ | ︙ | |||
205 206 207 208 209 210 211 | let adjwt = char_player.oDistanceBetweenChars; if (minv > Num_com) { for (let i = 0; i < a_len; i++) { if (!a_flag[i]) { for (let j = 0; j < b_len; j++) { if (!b_flag[j]) { if (adjwt[a[i]] && adjwt[a[i]][b[j]]) { | | | 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 | let adjwt = char_player.oDistanceBetweenChars; if (minv > Num_com) { for (let i = 0; i < a_len; i++) { if (!a_flag[i]) { for (let j = 0; j < b_len; j++) { if (!b_flag[j]) { if (adjwt[a[i]] && adjwt[a[i]][b[j]]) { N_simi += adjwt[a[i]][b[j]]; b_flag[j] = 2; break; } } } } } |
︙ | ︙ |