Overview
| Comment: | [graphspell][js] suggest optimisation with Jaro-Winkler (thanks to IllusionPerdu) |
|---|---|
| Downloads: | Tarball | ZIP archive | SQL archive |
| Timelines: | family | ancestors | descendants | both | graphspell | bdic_opt |
| Files: | files | file ages | folders |
| SHA3-256: |
3b3a02f4d385bf7b9d9c2ef3f992cc0c |
| User & Date: | olr on 2020-09-15 13:50:00 |
| Other Links: | branch diff | manifest | tags |
Context
|
2020-09-15
| ||
| 14:01 | [graphspell][js] remove specific trick in cleanWord() check-in: 6569849b49 user: olr tags: graphspell, bdic_opt | |
| 13:50 | [graphspell][js] suggest optimisation with Jaro-Winkler (thanks to IllusionPerdu) check-in: 3b3a02f4d3 user: olr tags: graphspell, bdic_opt | |
|
2020-09-14
| ||
| 14:38 | [graphspell] string comparison: use Jaro-Winkler check-in: efebe44d15 user: olr tags: graphspell, bdic_opt | |
Changes
Modified graphspell-js/char_player.js from [0602ec129b] to [8dac23cf9b].
1 2 3 4 5 6 7 8 9 10 |
// list of similar chars
// useful for suggestion mechanism
/* jshint esversion:6 */
/* jslint esversion:6 */
${map}
var char_player = {
| | > > > > | | | | | | | | | | | | | | | | | | | | | | | | | | | | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 |
// list of similar chars
// useful for suggestion mechanism
/* jshint esversion:6 */
/* jslint esversion:6 */
${map}
var char_player = {
/*
oDistanceBetweenChars:
- with Jaro-Winkler, values between 1 and 10
- with Damerau-Levenshtein, values / 10 (between 0 and 1: 0.1, 0.2 ... 0.9)
*/
oDistanceBetweenChars: {
//"a": {},
"e": {"é": 5},
//"é": {"e": 5},
"i": {"y": 2},
//"o": {},
//"u": {},
"y": {"i": 3},
"b": {"d": 8, "h": 9},
"c": {"ç": 1, "k": 5, "q": 5, "s": 5, "x": 5, "z": 8},
"d": {"b": 8},
"f": {"v": 8},
"g": {"j": 5},
"h": {"b": 9},
"j": {"g": 5, "i": 9},
"k": {"c": 5, "q": 1, "x": 5},
"l": {"i": 9},
"m": {"n": 8},
"n": {"m": 8, "r": 9},
"p": {"q": 9},
"q": {"c": 5, "k": 1, "p": 9},
"r": {"n": 9, "j": 9},
"s": {"c": 5, "ç": 1, "x": 5, "z": 5},
"t": {"d": 9},
"v": {"f": 8, "w": 1},
"w": {"v": 1},
"x": {"c": 5, "k": 5, "q": 5, "s": 5},
"z": {"s": 5}
},
distanceBetweenChars: function (c1, c2) {
if (c1 == c2) {
return 0;
}
if (this.oDistanceBetweenChars.hasOwnProperty(c1) && this.oDistanceBetweenChars[c1].hasOwnProperty(c2)) {
|
| ︙ | ︙ |
Modified graphspell-js/ibdawg.js from [1cb5337715] to [2160aa77f7].
| ︙ | ︙ | |||
18 19 20 21 22 23 24 |
var char_player = require("./char_player.js");
}
class SuggResult {
// Structure for storing, classifying and filtering suggestions
| | > | | | > > > > > | < > | | | | > | | | | > > | < < > | | | | | > > > > > | > | | | < > > > | > > > | | < > > > > | > > > | < < > | | | > | 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 |
var char_player = require("./char_player.js");
}
class SuggResult {
// Structure for storing, classifying and filtering suggestions
constructor (sWord, nSuggLimit=10, nDistLimit=-1) {
this.sWord = sWord;
this.sSimplifiedWord = str_transform.simplifyWord(sWord);
this.nDistLimit = (nDistLimit >= 0) ? nDistLimit : Math.floor(sWord.length / 3) + 1;
this.nMinDist = 1000;
// Temporary sets
this.aAllSugg = new Set(); // All suggestions, even the one rejected
this.dGoodSugg = new Map(); // Acceptable suggestions
this.dBestSugg = new Map(); // Best suggestions
// Parameters
this.nSuggLimit = nSuggLimit;
this.nSuggLimitExt = nSuggLimit + 2; // we add few entries in case suggestions merge after casing modifications
this.nBestSuggLimit = Math.floor(nSuggLimit * 1.5); // n times the requested limit
this.nGoodSuggLimit = nSuggLimit * 15; // n times the requested limit
}
addSugg (sSugg) {
// add a suggestion
if (this.aAllSugg.has(sSugg)) {
return;
}
this.aAllSugg.add(sSugg);
// jaro 0->1 1 les chaines sont égale
let nDistJaro = 1 - str_transform.distanceJaroWinkler(this.sSimplifiedWord, str_transform.simplifyWord(sSugg));
let nDist = Math.floor(nDistJaro * 10);
if (nDistJaro < .11) { // Best suggestions
this.dBestSugg.set(sSugg, Math.round(nDistJaro*1000));
if (this.dBestSugg.size > this.nBestSuggLimit) {
this.nDistLimit = -1; // make suggest() to end search
}
} else if (nDistJaro < .33) { // Good suggestions
this.dGoodSugg.set(sSugg, Math.round(nDistJaro*1000));
if (this.dGoodSugg.size > this.nGoodSuggLimit) {
this.nDistLimit = -1; // make suggest() to end search
}
} else {
if (nDist < this.nMinDist) {
this.nMinDist = nDist;
}
this.nDistLimit = Math.min(this.nDistLimit, this.nMinDist);
}
if (nDist <= this.nDistLimit) {
if (nDist < this.nMinDist) {
this.nMinDist = nDist;
}
this.nDistLimit = Math.min(this.nDistLimit, this.nMinDist+1);
}
}
getSuggestions () {
// return a list of suggestions
let lRes = [];
if (this.dBestSugg.size > 0) {
// sort only with simplified words
let lResTmp = [...this.dBestSugg.entries()].sort((a, b) => { return a[1] - b[1]; });
let nSize = Math.min(this.nSuggLimitExt, lResTmp.length);
for (let i=0; i < nSize; i++){
lRes.push(lResTmp[i][0]);
}
}
if (lRes.length < this.nSuggLimitExt) {
// sort with simplified words and original word
let lResTmp = [...this.dGoodSugg.entries()].sort((a, b) => {
// Low precision to rely more on simplified words
let nJaroA = Math.round(str_transform.distanceJaroWinkler(this.sWord, a[0]) * 10);
let nJaroB = Math.round(str_transform.distanceJaroWinkler(this.sWord, b[0]) * 10);
if (nJaroA == nJaroB) {
return a[1] - b[1]; // warning: both lists are NOT sorted the same way (key: a-b)
} else {
return nJaroB - nJaroA; // warning: both lists are NOT sorted the same way (key: b-a)
}
}).slice(0, this.nSuggLimitExt);
let nSize = Math.min(this.nSuggLimitExt, lResTmp.length);
for (let i=0; i < nSize; i++){
lRes.push(lResTmp[i][0]);
}
}
// casing
if (this.sWord.gl_isUpperCase()) {
lRes = lRes.map((sSugg) => { return sSugg.toUpperCase(); });
lRes = [...new Set(lRes)];
}
else if (this.sWord.slice(0,1).gl_isUpperCase()) {
lRes = lRes.map((sSugg) => { return sSugg.slice(0,1).toUpperCase() + sSugg.slice(1); });
lRes = [...new Set(lRes)];
}
return lRes.slice(0, this.nSuggLimit);
}
reset () {
this.dSugg.clear();
this.dGoodSugg.clear();
this.dBestSugg.clear();
}
}
class IBDAWG {
// INDEXABLE BINARY DIRECT ACYCLIC WORD GRAPH
|
| ︙ | ︙ | |||
327 328 329 330 331 332 333 |
if (this.lexicographer) {
[sPfx, sWord, sSfx] = this.lexicographer.split(sWord);
}
let nMaxSwitch = Math.max(Math.floor(sWord.length / 3), 1);
let nMaxDel = Math.floor(sWord.length / 5);
let nMaxHardRepl = Math.max(Math.floor((sWord.length - 5) / 4), 1);
let nMaxJump = Math.max(Math.floor(sWord.length / 4), 1);
| | > | | 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 |
if (this.lexicographer) {
[sPfx, sWord, sSfx] = this.lexicographer.split(sWord);
}
let nMaxSwitch = Math.max(Math.floor(sWord.length / 3), 1);
let nMaxDel = Math.floor(sWord.length / 5);
let nMaxHardRepl = Math.max(Math.floor((sWord.length - 5) / 4), 1);
let nMaxJump = Math.max(Math.floor(sWord.length / 4), 1);
let oSuggResult = new SuggResult(sWord, nSuggLimit);
let sWord = str_transform.cleanWord(sWord);
if (bSplitTrailingNumbers) {
this._splitTrailingNumbers(oSuggResult, sWord);
}
this._splitSuggest(oSuggResult, sWord);
this._suggest(oSuggResult, sWord, nMaxSwitch, nMaxDel, nMaxHardRepl, nMaxJump);
let aSugg = oSuggResult.getSuggestions();
if (this.lexicographer) {
aSugg = this.lexicographer.filterSugg(aSugg);
}
if (sSfx || sPfx) {
// we add what we removed
return aSugg.map( (sSugg) => { return sPfx + sSugg + sSfx; } );
}
|
| ︙ | ︙ |
Modified graphspell-js/str_transform.js from [5a573a5745] to [8ec0376c2c].
| ︙ | ︙ | |||
62 63 64 65 66 67 68 69 70 71 72 73 74 75 |
if (c != sWord.slice(i, i+1) || (c == 'e' && sWord.slice(i, i+2) != "ee")) { // exception for <e> to avoid confusion between crée / créai
sNewWord += c;
}
i++;
}
return sNewWord.replace(/eau/g, "o").replace(/au/g, "o").replace(/ai/g, "éi").replace(/ei/g, "é").replace(/ph/g, "f");
},
_xTransNumbersToExponent: new Map([
["0", "⁰"], ["1", "¹"], ["2", "²"], ["3", "³"], ["4", "⁴"], ["5", "⁵"], ["6", "⁶"], ["7", "⁷"], ["8", "⁸"], ["9", "⁹"]
]),
numbersToExponent: function (sWord) {
let sNewWord = "";
| > > > > > > > > > > > > > | 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 |
if (c != sWord.slice(i, i+1) || (c == 'e' && sWord.slice(i, i+2) != "ee")) { // exception for <e> to avoid confusion between crée / créai
sNewWord += c;
}
i++;
}
return sNewWord.replace(/eau/g, "o").replace(/au/g, "o").replace(/ai/g, "éi").replace(/ei/g, "é").replace(/ph/g, "f");
},
cleanWord: function (sWord) {
// word clean for the user who make commun and preditive error help suggest
// remove letters repeated more than 2 times
if (sWord.match(/(.)(\1){2,}/igm)){
sWord = sWord.replace(/(.*)(.)(.\2)/igm,'$1$2').replace(/(.)(\1)+/igm,'$1$1');
}
// words ending with -ik -> replace with -ique
if (sWord.match(/ik$/ig)){
sWord = sWord.replace(/(.*)ik$/ig,'$1ique');
}
return sWord;
},
_xTransNumbersToExponent: new Map([
["0", "⁰"], ["1", "¹"], ["2", "²"], ["3", "³"], ["4", "⁴"], ["5", "⁵"], ["6", "⁶"], ["7", "⁷"], ["8", "⁸"], ["9", "⁹"]
]),
numbersToExponent: function (sWord) {
let sNewWord = "";
|
| ︙ | ︙ | |||
205 206 207 208 209 210 211 |
let adjwt = char_player.oDistanceBetweenChars;
if (minv > Num_com) {
for (let i = 0; i < a_len; i++) {
if (!a_flag[i]) {
for (let j = 0; j < b_len; j++) {
if (!b_flag[j]) {
if (adjwt[a[i]] && adjwt[a[i]][b[j]]) {
| | | 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 |
let adjwt = char_player.oDistanceBetweenChars;
if (minv > Num_com) {
for (let i = 0; i < a_len; i++) {
if (!a_flag[i]) {
for (let j = 0; j < b_len; j++) {
if (!b_flag[j]) {
if (adjwt[a[i]] && adjwt[a[i]][b[j]]) {
N_simi += adjwt[a[i]][b[j]];
b_flag[j] = 2;
break;
}
}
}
}
}
|
| ︙ | ︙ |