Grammalecte  Check-in [3b3a02f4d3]

Overview
Comment:[graphspell][js] suggest optimisation with Jaro-Winkler (thanks to IllusionPerdu)
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | graphspell | bdic_opt
Files: files | file ages | folders
SHA3-256: 3b3a02f4d385bf7b9d9c2ef3f992cc0c7cc150c8b7366c74ade85c8154499a54
User & Date: olr on 2020-09-15 13:50:00
Other Links: branch diff | manifest | tags
Context
2020-09-15
14:01
[graphspell][js] remove specific trick in cleanWord() check-in: 6569849b49 user: olr tags: graphspell, bdic_opt
13:50
[graphspell][js] suggest optimisation with Jaro-Winkler (thanks to IllusionPerdu) check-in: 3b3a02f4d3 user: olr tags: graphspell, bdic_opt
2020-09-14
14:38
[graphspell] string comparison: use Jaro-Winkler check-in: efebe44d15 user: olr tags: graphspell, bdic_opt
Changes

Modified graphspell-js/char_player.js from [0602ec129b] to [8dac23cf9b].

1
2
3
4
5
6
7
8
9
10
11





12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39



























40
41
42
43
44
45
46
1
2
3
4
5
6
7
8
9
10

11
12
13
14
15
16



























17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50










-
+
+
+
+
+

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+







// list of similar chars
// useful for suggestion mechanism

/* jshint esversion:6 */
/* jslint esversion:6 */

${map}


var char_player = {

    /*
        oDistanceBetweenChars:
            - with Jaro-Winkler, values between 1 and 10
            - with Damerau-Levenshtein, values / 10 (between 0 and 1: 0.1, 0.2 ... 0.9)
    */
    oDistanceBetweenChars: {
        "a": {},
        "e": {"é": 0.5},
        "é": {"e": 0.5},
        "i": {"y": 0.2},
        "o": {},
        "u": {},
        "y": {"i": 0.3},
        "b": {"d": 0.8, "h": 0.9},
        "c": {"ç": 0.1, "k": 0.5, "q": 0.5, "s": 0.5, "x": 0.5, "z": 0.8},
        "d": {"b": 0.8},
        "f": {"v": 0.8},
        "g": {"j": 0.5},
        "h": {"b": 0.9},
        "j": {"g": 0.5, "i": 0.9},
        "k": {"c": 0.5, "q": 0.1, "x": 0.5},
        "l": {"i": 0.9},
        "m": {"n": 0.8},
        "n": {"m": 0.8, "r": 0.9},
        "p": {"q": 0.9},
        "q": {"c": 0.5, "k": 0.1, "p": 0.9},
        "r": {"n": 0.9, "j": 0.9},
        "s": {"c": 0.5, "ç": 0.1, "x": 0.5, "z": 0.5},
        "t": {"d": 0.9},
        "v": {"f": 0.8, "w": 0.1},
        "w": {"v": 0.1},
        "x": {"c": 0.5, "k": 0.5, "q": 0.5, "s": 0.5},
        "z": {"s": 0.5}
        //"a": {},
        "e": {"é": 5},
        //"é": {"e": 5},
        "i": {"y": 2},
        //"o": {},
        //"u": {},
        "y": {"i": 3},
        "b": {"d": 8, "h": 9},
        "c": {"ç": 1, "k": 5, "q": 5, "s": 5, "x": 5, "z": 8},
        "d": {"b": 8},
        "f": {"v": 8},
        "g": {"j": 5},
        "h": {"b": 9},
        "j": {"g": 5, "i": 9},
        "k": {"c": 5, "q": 1, "x": 5},
        "l": {"i": 9},
        "m": {"n": 8},
        "n": {"m": 8, "r": 9},
        "p": {"q": 9},
        "q": {"c": 5, "k": 1, "p": 9},
        "r": {"n": 9, "j": 9},
        "s": {"c": 5, "ç": 1, "x": 5, "z": 5},
        "t": {"d": 9},
        "v": {"f": 8, "w": 1},
        "w": {"v": 1},
        "x": {"c": 5, "k": 5, "q": 5, "s": 5},
        "z": {"s": 5}
    },

    distanceBetweenChars: function (c1, c2) {
        if (c1 == c2) {
            return 0;
        }
        if (this.oDistanceBetweenChars.hasOwnProperty(c1) && this.oDistanceBetweenChars[c1].hasOwnProperty(c2)) {

Modified graphspell-js/ibdawg.js from [1cb5337715] to [2160aa77f7].

18
19
20
21
22
23
24
25

26
27
28
29

30
31
32








33
34
35

36
37
38
39
40
41
42
43
44
45
46
47
48
49
50













51
52
53
54
55
56
57











58
59
60
61

62
63

64
65
66



67


68

69
70
71






72
73
74









75
76
77
78

79
80
81
82
83
84
85
86
87

88
89
90
91
92



93
94
95
96
97
98
99
18
19
20
21
22
23
24

25
26
27
28
29
30



31
32
33
34
35
36
37
38
39
40

41
42
43
44
45
46










47
48
49
50
51
52
53
54
55
56
57
58
59







60
61
62
63
64
65
66
67
68
69
70
71
72
73

74
75
76
77



78
79
80

81
82
83
84



85
86
87
88
89
90



91
92
93
94
95
96
97
98
99


100
101
102
103
104
105
106
107
108
109
110

111
112
113
114


115
116
117
118
119
120
121
122
123
124







-
+




+
-
-
-
+
+
+
+
+
+
+
+


-
+





-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+



-
+


+
-
-
-
+
+
+
-
+
+

+
-
-
-
+
+
+
+
+
+
-
-
-
+
+
+
+
+
+
+
+
+
-
-


+








-
+



-
-
+
+
+







    var char_player = require("./char_player.js");
}


class SuggResult {
    // Structure for storing, classifying and filtering suggestions

    constructor (sWord, nDistLimit=-1) {
    constructor (sWord, nSuggLimit=10, nDistLimit=-1) {
        this.sWord = sWord;
        this.sSimplifiedWord = str_transform.simplifyWord(sWord);
        this.nDistLimit = (nDistLimit >= 0) ? nDistLimit :  Math.floor(sWord.length / 3) + 1;
        this.nMinDist = 1000;
        // Temporary sets
        this.aSugg = new Set();
        this.dSugg = new Map([ [0, []],  [1, []],  [2, []] ]);
        this.aAllSugg = new Set();      // all found words even those refused
        this.aAllSugg = new Set();  // All suggestions, even the one rejected
        this.dGoodSugg = new Map(); // Acceptable suggestions
        this.dBestSugg = new Map(); // Best suggestions
        // Parameters
        this.nSuggLimit = nSuggLimit;
        this.nSuggLimitExt = nSuggLimit + 2;                // we add few entries in case suggestions merge after casing modifications
        this.nBestSuggLimit = Math.floor(nSuggLimit * 1.5); // n times the requested limit
        this.nGoodSuggLimit = nSuggLimit * 15;              // n times the requested limit
    }

    addSugg (sSugg, nDeep=0) {
    addSugg (sSugg) {
        // add a suggestion
        if (this.aAllSugg.has(sSugg)) {
            return;
        }
        this.aAllSugg.add(sSugg);
        if (!this.aSugg.has(sSugg)) {
            //let nDist = Math.floor(str_transform.distanceDamerauLevenshtein(this.sSimplifiedWord, str_transform.simplifyWord(sSugg)));
            let nDist = Math.floor((1 - str_transform.distanceJaroWinkler(this.sSimplifiedWord, str_transform.simplifyWord(sSugg)))*10);
            if (nDist <= this.nDistLimit) {
                if (sSugg.includes(" ")) { // add 1 to distance for split suggestions
                    nDist += 1;
                }
                if (!this.dSugg.has(nDist)) {
                    this.dSugg.set(nDist, []);
                }
        // jaro 0->1 1 les chaines sont égale
        let nDistJaro = 1 - str_transform.distanceJaroWinkler(this.sSimplifiedWord, str_transform.simplifyWord(sSugg));
        let nDist = Math.floor(nDistJaro * 10);
        if (nDistJaro < .11) {        // Best suggestions
            this.dBestSugg.set(sSugg, Math.round(nDistJaro*1000));
            if (this.dBestSugg.size > this.nBestSuggLimit) {
                this.nDistLimit = -1; // make suggest() to end search
            }
        } else if (nDistJaro < .33) { // Good suggestions
            this.dGoodSugg.set(sSugg, Math.round(nDistJaro*1000));
            if (this.dGoodSugg.size > this.nGoodSuggLimit) {
                this.nDistLimit = -1; // make suggest() to end search
            }
                this.dSugg.get(nDist).push(sSugg);
                this.aSugg.add(sSugg);
                if (nDist < this.nMinDist) {
                    this.nMinDist = nDist;
                }
                this.nDistLimit = Math.min(this.nDistLimit, this.nMinDist+1);
            }
        } else {
            if (nDist < this.nMinDist) {
                this.nMinDist = nDist;
            }
            this.nDistLimit = Math.min(this.nDistLimit, this.nMinDist);
        }
        if (nDist <= this.nDistLimit) {
            if (nDist < this.nMinDist) {
                this.nMinDist = nDist;
            }
            this.nDistLimit = Math.min(this.nDistLimit, this.nMinDist+1);
        }
    }

    getSuggestions (nSuggLimit=10) {
    getSuggestions () {
        // return a list of suggestions
        let lRes = [];
        if (this.dBestSugg.size > 0) {
        let bFirstListSorted = false;
        for (let [nDist, lSugg] of this.dSugg.entries()) {
            if (nDist > this.nDistLimit) {
            // sort only with simplified words
            let lResTmp = [...this.dBestSugg.entries()].sort((a, b) => { return a[1] - b[1]; });
            let nSize = Math.min(this.nSuggLimitExt, lResTmp.length);
                break;
            for (let i=0;  i < nSize;  i++){
                lRes.push(lResTmp[i][0]);
            }
        }
            if (!bFirstListSorted && lSugg.length > 1) {
                //lRes.sort((a, b) => { return str_transform.distanceDamerauLevenshtein(this.sWord, a) - str_transform.distanceDamerauLevenshtein(this.sWord, b); });
                lRes.sort((a, b) => { return str_transform.distanceJaroWinkler(this.sWord, b) - str_transform.distanceJaroWinkler(this.sWord, a); });
        if (lRes.length < this.nSuggLimitExt) {
            // sort with simplified words and original word
            let lResTmp = [...this.dGoodSugg.entries()].sort((a, b) => {
                // Low precision to rely more on simplified words
                let nJaroA = Math.round(str_transform.distanceJaroWinkler(this.sWord, a[0]) * 10);
                let nJaroB = Math.round(str_transform.distanceJaroWinkler(this.sWord, b[0]) * 10);
                bFirstListSorted = true;
            }
            lRes.push(...lSugg);
                if (nJaroA == nJaroB) {
                    return a[1] - b[1];     // warning: both lists are NOT sorted the same way (key: a-b)
                } else {
                    return nJaroB - nJaroA; // warning: both lists are NOT sorted the same way (key: b-a)
                }
            }).slice(0, this.nSuggLimitExt);
            let nSize = Math.min(this.nSuggLimitExt, lResTmp.length);
            for (let i=0;  i < nSize;  i++){
                lRes.push(lResTmp[i][0]);
            if (lRes.length > nSuggLimit) {
                break;
            }
        }
        // casing
        if (this.sWord.gl_isUpperCase()) {
            lRes = lRes.map((sSugg) => { return sSugg.toUpperCase(); });
            lRes = [...new Set(lRes)];
        }
        else if (this.sWord.slice(0,1).gl_isUpperCase()) {
            lRes = lRes.map((sSugg) => { return sSugg.slice(0,1).toUpperCase() + sSugg.slice(1); });
            lRes = [...new Set(lRes)];
        }
        return lRes.slice(0, nSuggLimit);
        return lRes.slice(0, this.nSuggLimit);
    }

    reset () {
        this.aSugg.clear();
        this.dSugg.clear();
        this.dSugg.clear();
        this.dGoodSugg.clear();
        this.dBestSugg.clear();
    }
}


class IBDAWG {
    // INDEXABLE BINARY DIRECT ACYCLIC WORD GRAPH

327
328
329
330
331
332
333
334


335
336
337
338
339
340

341
342
343
344
345
346
347
352
353
354
355
356
357
358

359
360
361
362
363
364
365

366
367
368
369
370
371
372
373







-
+
+





-
+







        if (this.lexicographer) {
            [sPfx, sWord, sSfx] = this.lexicographer.split(sWord);
        }
        let nMaxSwitch = Math.max(Math.floor(sWord.length / 3), 1);
        let nMaxDel = Math.floor(sWord.length / 5);
        let nMaxHardRepl = Math.max(Math.floor((sWord.length - 5) / 4), 1);
        let nMaxJump = Math.max(Math.floor(sWord.length / 4), 1);
        let oSuggResult = new SuggResult(sWord);
        let oSuggResult = new SuggResult(sWord, nSuggLimit);
        let sWord = str_transform.cleanWord(sWord);
        if (bSplitTrailingNumbers) {
            this._splitTrailingNumbers(oSuggResult, sWord);
        }
        this._splitSuggest(oSuggResult, sWord);
        this._suggest(oSuggResult, sWord, nMaxSwitch, nMaxDel, nMaxHardRepl, nMaxJump);
        let aSugg = oSuggResult.getSuggestions(nSuggLimit);
        let aSugg = oSuggResult.getSuggestions();
        if (this.lexicographer) {
            aSugg = this.lexicographer.filterSugg(aSugg);
        }
        if (sSfx || sPfx) {
            // we add what we removed
            return aSugg.map( (sSugg) => { return sPfx + sSugg + sSfx; } );
        }

Modified graphspell-js/str_transform.js from [5a573a5745] to [8ec0376c2c].

62
63
64
65
66
67
68













69
70
71
72
73
74
75
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88







+
+
+
+
+
+
+
+
+
+
+
+
+







            if (c != sWord.slice(i, i+1) || (c == 'e' && sWord.slice(i, i+2) != "ee")) {  // exception for <e> to avoid confusion between crée / créai
                sNewWord += c;
            }
            i++;
        }
        return sNewWord.replace(/eau/g, "o").replace(/au/g, "o").replace(/ai/g, "éi").replace(/ei/g, "é").replace(/ph/g, "f");
    },

    cleanWord: function (sWord) {
        // word clean for the user who make commun and preditive error help suggest
        // remove letters repeated more than 2 times
        if (sWord.match(/(.)(\1){2,}/igm)){
            sWord = sWord.replace(/(.*)(.)(.\2)/igm,'$1$2').replace(/(.)(\1)+/igm,'$1$1');
        }
        // words ending with -ik -> replace with -ique
        if (sWord.match(/ik$/ig)){
            sWord = sWord.replace(/(.*)ik$/ig,'$1ique');
        }
        return sWord;
    },

    _xTransNumbersToExponent: new Map([
        ["0", "⁰"], ["1", "¹"], ["2", "²"], ["3", "³"], ["4", "⁴"], ["5", "⁵"], ["6", "⁶"], ["7", "⁷"], ["8", "⁸"], ["9", "⁹"]
    ]),

    numbersToExponent: function (sWord) {
        let sNewWord = "";
205
206
207
208
209
210
211
212

213
214
215
216
217
218
219
218
219
220
221
222
223
224

225
226
227
228
229
230
231
232







-
+







        let adjwt = char_player.oDistanceBetweenChars;
        if (minv > Num_com) {
          for (let i = 0; i < a_len; i++) {
            if (!a_flag[i]) {
              for (let j = 0; j < b_len; j++) {
                if (!b_flag[j]) {
                  if (adjwt[a[i]] && adjwt[a[i]][b[j]]) {
                    N_simi += adjwt[a[i]][b[j]] * 10; // le fois 10 est un ajustement pour que ça fonctionne bien dans cette fonction
                    N_simi += adjwt[a[i]][b[j]];
                    b_flag[j] = 2;
                    break;
                  }
                }
              }
            }
          }