Grammalecte  Changes On Branch 51e3a2e76e76dc47

Changes In Branch spellsugg Through [51e3a2e76e] Excluding Merge-Ins

This is equivalent to a diff from 8ddc15536e to 51e3a2e76e

2017-11-09
11:56
[core] ibdawg: suggestion mechanism > reduce 1toX replacements overload (much, much faster) check-in: 767e396f2d user: olr tags: core, spellsugg
2017-11-08
21:16
[core] ibdawg: update char_player check-in: 51e3a2e76e user: olr tags: core, spellsugg
15:57
[fr] màj: écriture dystypographique check-in: 0bd6a29f03 user: olr tags: trunk, fr
15:14
[core][js] ibdawg: use SuggResult check-in: 2f30b8f748 user: olr tags: core, spellsugg
2017-11-07
11:56
[core] ibdawg: another suggestion method check-in: 1edae62ad8 user: olr tags: core, spellsugg
2017-11-06
17:21
[core][py] timer for testing check-in: 8ddc15536e user: olr tags: trunk, core
14:21
[core][py] ibdawg: debugging check-in: cbe1422160 user: olr tags: trunk, core

Modified cli.py from [ed7458a4d3] to [68793c6981].

206
207
208
209
210
211
212

213
214
215
216
217
218
219
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220







+







                        echo("* " + sWord)
                        for sMorph in oDict.getMorph(sWord):
                            echo("  {:<32} {}".format(sMorph, oLexGraphe.formatTags(sMorph)))
            elif sText.startswith("!"):
                for sWord in sText[1:].strip().split():
                    if sWord:
                        echo(" | ".join(oDict.suggest(sWord)))
                        #echo(" | ".join(oDict.suggest2(sWord)))
            elif sText.startswith(">"):
                oDict.drawPath(sText[1:].strip())
            elif sText.startswith("="):
                for sRes in oDict.select(sText[1:].strip()):
                    echo(sRes)
            elif sText.startswith("/+ "):
                gce.setOptions({ opt:True  for opt in sText[3:].strip().split()  if opt in gce.getOptions() })

Modified gc_core/js/char_player.js from [f5b2267e7d] to [ac345212e4].

22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65

66
67
68
69
70
71
72

73
74

75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
22
23
24
25
26
27
28





































29







30


31




















32
33
34
35
36
37
38







-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
-
-
-
-
-
-
-
+
-
-
+
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-







        let sRes = "";
        for (let c of sWord) {
            sRes += this._dTransChars.gl_get(c, c);
        }
        return sRes.replace("eau", "o").replace("au", "o");
    },

    distanceDamerauLevenshtein: function (s1, s2) {
        // distance of Damerau-Levenshtein between <s1> and <s2>
        // https://fr.wikipedia.org/wiki/Distance_de_Damerau-Levenshtein
        try {
            let nLen1 = s1.length;
            let nLen2 = s2.length;
            let matrix = [];
            for (let i = 0;  i <= nLen1;  i++) {
                matrix[i] = new Array(nLen2 + 1);
            }
            for (let i = 0;  i <= nLen1;  i++) {
                matrix[i][0] = i;
            }
            for (let j = 0;  j <= nLen2;  j++) {
                matrix[0][j] = j;
            }
            for (let i = 1;  i <= nLen1;  i++) {
                for (let j = 1;  j <= nLen2;  j++) {
                    let nCost = (s1[i] === s2[j]) ? 0 : 1;
                    matrix[i][j] = Math.min(
                        matrix[i-1][j] + 1,         // Deletion
                        matrix[i][j-1] + 1,         // Insertion
                        matrix[i-1][j-1] + nCost    // Substitution
                    );
                    if (i > 1 && j > 1 && s1[i] == s2[j-1] && s1[i-1] == s2[j]) {
                        matrix[i][j] = Math.min(matrix[i][j], matrix[i-2][j-2] + nCost);  // Transposition
                    }
                }
            }
            //console.log(s2 + ": " + matrix[nLen1][nLen2]);
            return matrix[nLen1][nLen2];
        }
        catch (e) {
            helpers.logerror(e);
        }
    },

    aVowel: new Set("aáàâäāeéèêëēiíìîïīoóòôöōuúùûüūyýỳŷÿȳœæAÁÀÂÄĀEÉÈÊËĒIÍÌÎÏĪOÓÒÔÖŌUÚÙÛÜŪYÝỲŶŸȲŒÆ"),
    showDistance (s1, s2) {
        let s1b = this.cleanWord(s1);
        let s2b = this.cleanWord(s2);
        console.log(`Distance: ${s1} / ${s2} = ${this.distanceDamerauLevenshtein(s1, s2)})`);
        console.log(`Distance: ${s1b} / ${s2b} = ${this.distanceDamerauLevenshtein(s1b, s2b)})`);
    },

    aConsonant: new Set("bcdefghjklmnñpqrstvwxzBCDEFGHJKLMNÑPQRSTVWXZ"),
    // Method: Remove Useless Chars

    aDouble: new Set("bcdfjklmnprstzBCDFJKLMNPRSTZ"),  // letter that may be used twice successively
    aVovels: new Set([
        'a', 'e', 'i', 'o', 'u', 'y',
        'à', 'é', 'î', 'ô', 'û', 'ÿ',
        'â', 'è', 'ï', 'ö', 'ù', 'ŷ',
        'ä', 'ê', 'í', 'ó', 'ü', 'ý',
        'á', 'ë', 'ì', 'ò', 'ú', 'ỳ',
        'ā', 'ē', 'ī', 'ō', 'ū', 'ȳ',
        'h', 'œ', 'æ'
    ]),

    shrinkWord: function (sWord) {
        // remove vovels and h
        let sRes = "";
        for (let cChar of sWord.slice(1)) {
            if (!this.aVovels.has(cChar)) {
                sRes += cChar;
            }
        }
        return sWord.slice(0, 1).replace("h", "") + sRes;
    },


    // Similar chars

    d1to1: new Map([
        ["1", "liîLIÎ"],
        ["2", "zZ"],
227
228
229
230
231
232
233








234
235
236
237
238
239
240
241
242
243
244
245
246
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182


183
184
185
186
187
188
189







+
+
+
+
+
+
+
+




-
-







        ["B", ["BB",]],
        ["c", ["cc", "ss", "qu", "ch"]],
        ["C", ["CC", "SS", "QU", "CH"]],
        ["ç", ["ss", "cc", "qh", "ch"]],
        ["Ç", ["SS", "CC", "QH", "CH"]],
        ["d", ["dd",]],
        ["D", ["DD",]],
        ["é", ["ai", "ei"]],
        ["É", ["AI", "EI"]],
        ["è", ["ai", "ei"]],
        ["È", ["AI", "EI"]],
        ["ê", ["ai", "ei"]],
        ["Ê", ["AI", "EI"]],
        ["ë", ["ai", "ei"]],
        ["Ë", ["AI", "EI"]],
        ["f", ["ff", "ph"]],
        ["F", ["FF", "PH"]],
        ["g", ["gu", "ge", "gg", "gh"]],
        ["G", ["GU", "GE", "GG", "GH"]],
        ["i", ["ii",]],
        ["I", ["II",]],
        ["j", ["jj", "dj"]],
        ["J", ["JJ", "DJ"]],
        ["k", ["qu", "ck", "ch", "cu", "kk", "kh"]],
        ["K", ["QU", "CK", "CH", "CU", "KK", "KH"]],
        ["l", ["ll",]],
        ["L", ["LL",]],
        ["m", ["mm", "mn"]],

Modified gc_core/js/ibdawg.js from [c87880e4ad] to [ca747a7a44].

13
14
15
16
17
18
19































































20
21
22
23
24
25
26
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89







+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+









// Don’t remove <string>. Necessary in TB.
${string}
${map}
${set}


class SuggResult {
    // Structure for storing, classifying and filtering suggestions

    constructor (sWord, nDistLimit=-1) {
        this.sWord = sWord;
        this.sCleanWord = char_player.cleanWord(sWord);
        this.nDistLimit = (nDistLimit >= 0) ? nDistLimit :  Math.floor(sWord.length / 3) + 1;
        this.nMinDist = 1000;
        this.aSugg = new Set();
        this.dSugg = new Map([ [0, []],  [1, []] ]);
    }

    addSugg (sSugg, nDeep=0) {
        // add a suggestion
        if (!this.aSugg.has(sSugg)) {
            let nDist = str_transform.distanceDamerauLevenshtein(this.sCleanWord, char_player.cleanWord(sSugg));
            if (nDist <= this.nDistLimit) {
                if (!this.dSugg.has(nDist)) {
                    this.dSugg.set(nDist, []);
                }
                this.dSugg.get(nDist).push(sSugg);
                this.aSugg.add(sSugg);
                if (nDist < this.nMinDist) {
                    this.nMinDist = nDist;
                }
                this.nDistLimit = Math.min(this.nDistLimit, this.nMinDist+2);
            }
        }
    }

    getSuggestions (nSuggLimit=10, nDistLimit=-1) {
        // return a list of suggestions
        let lRes = [];
        if (this.dSugg.get(0).length) {
            // we sort the better results with the original word
            let dDistTemp = new Map();
            lRes.forEach((sSugg) => { dDistTemp.set(sSugg, str_transform.distanceDamerauLevenshtein(this.sWord, sSugg)); });
            lRes = lRes.sort((sA, sB) => { return dDistTemp.get(sA) - dDistTemp.get(sB); });
            dDistTemp.clear();
        }
        for (let lSugg of this.dSugg.values()) {
            for (let sSugg of lSugg) { lRes.push(sSugg); }
            if (lRes.length > nSuggLimit) {
                break;
            }
        }
        lRes = char_player.filterSugg(lRes);
        if (this.sWord.gl_isTitle()) {
            lRes = lRes.map((sSugg) => { return sSugg.gl_toCapitalize(); });
        }
        else if (this.sWord.gl_isUpperCase()) {
            lRes = lRes.map((sSugg) => { return sSugg.toUpperCase(); });
        }
        return lRes.slice(0, nSuggLimit);
    }

    reset () {
        this.aSugg.clear();
        this.dSugg.clear();
    }
}


class IBDAWG {
    // INDEXABLE BINARY DIRECT ACYCLIC WORD GRAPH

    constructor (sDicName, sPath="") {
        try {
            let sURL = (sPath !== "") ? sPath + "/" + sDicName : "resource://grammalecte/_dictionaries/"+sDicName;
189
190
191
192
193
194
195

196
197

198

199
200

201
202
203

204
205
206

207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223

224
225
226
227
228
229

230
231
232

233
234

235
236
237
238


239
240

241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
























263
264
265
266

267
268
269
270
271
272
273
274

275
276
277
278

279
280

281
282
283









284
285
286
287

288
289
290
291
292
293
294
252
253
254
255
256
257
258
259
260
261
262

263
264

265
266
267

268
269


270









271
272
273
274
275
276
277

278
279
280

281
282

283
284
285

286
287

288
289
290


291
292
293
294
295






















296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319


320

321
322
323
324
325
326
327
328

329
330
331
332

333
334

335
336
337
338
339
340
341
342
343
344
345
346
347

348
349

350
351
352
353
354
355
356
357







+


+
-
+

-
+


-
+

-
-
+
-
-
-
-
-
-
-
-
-







-
+


-


-
+


-
+

-
+


-
-
+
+


+
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
-
-

-
+







-
+



-
+

-
+



+
+
+
+
+
+
+
+
+
-


-
+







    }

    suggest (sWord, nMaxSugg=10) {
        // returns a array of suggestions for <sWord>
        let sPfx = "";
        let sSfx = "";
        [sPfx, sWord, sSfx] = char_player.cut(sWord);
        let nMaxSwitch = Math.max(Math.floor(sWord.length / 3), 1);
        let nMaxDel = Math.floor(sWord.length / 5);
        let nMaxHardRepl = Math.max(Math.floor((sWord.length - 5) / 4), 1);
        let oSuggResult = new SuggResult(sWord);
        let aSugg = this._suggest(sWord, nMaxDel, nMaxHardRepl);
        this._suggest(oSuggResult, sWord, nMaxSwitch, nMaxDel, nMaxHardRepl);
        if (sWord.gl_isTitle()) {
            aSugg.gl_update(this._suggest(sWord.toLowerCase(), nMaxDel, nMaxHardRepl));
            this._suggest(oSuggResult, sWord.toLowerCase(), nMaxSwitch, nMaxDel, nMaxHardRepl);
        }
        else if (sWord.gl_isLowerCase()) {
            aSugg.gl_update(this._suggest(sWord.gl_toCapitalize(), nMaxDel, nMaxHardRepl));
            this._suggest(oSuggResult, sWord.gl_toCapitalize(), nMaxSwitch, nMaxDel, nMaxHardRepl);
        }
        // Set to Array
        aSugg = Array.from(aSugg);
        let aSugg = oSuggResult.getSuggestions();
        aSugg = char_player.filterSugg(aSugg);
        if (sWord.gl_isTitle()) {
            aSugg = aSugg.map((sSugg) => { return sSugg.gl_toCapitalize(); });
        }
        let dDistTemp = new Map();
        let sCleanWord = char_player.cleanWord(sWord);
        aSugg.forEach((sSugg) => { dDistTemp.set(sSugg, char_player.distanceDamerauLevenshtein(sCleanWord, char_player.cleanWord(sSugg))); });
        aSugg = aSugg.sort((sA, sB) => { return dDistTemp.get(sA) - dDistTemp.get(sB); }).slice(0, nMaxSugg);
        dDistTemp.clear();
        if (sSfx || sPfx) {
            // we add what we removed
            return aSugg.map( (sSugg) => { return sPfx + sSugg + sSfx } );
        }
        return aSugg;
    }

    _suggest (sRemain, nMaxDel=0, nMaxHardRepl=0, nDeep=0, iAddr=0, sNewWord="", bAvoidLoop=false) {
    _suggest (oSuggResult, sRemain, nMaxSwitch=0, nMaxDel=0, nMaxHardRepl=0, nDeep=0, iAddr=0, sNewWord="", bAvoidLoop=false) {
        // returns a set of suggestions
        // recursive function
        let aSugg = new Set();
        if (sRemain == "") {
            if (this._convBytesToInteger(this.byDic.slice(iAddr, iAddr+this.nBytesArc)) & this._finalNodeMask) {
                aSugg.add(sNewWord);
                oSuggResult.addSugg(sNewWord);
            }
            for (let sTail of this._getTails(iAddr)) {
                aSugg.add(sNewWord+sTail);
                oSuggResult.addSugg(sNewWord+sTail);
            }
            return aSugg;
            return;
        }
        let cCurrent = sRemain.slice(0, 1);
        for (let [cChar, jAddr] of this._getSimilarArcs(cCurrent, iAddr)) {
            aSugg.gl_update(this._suggest(sRemain.slice(1), nMaxDel, nMaxHardRepl, nDeep+1, jAddr, sNewWord+cChar));
        for (let [cChar, jAddr] of this._getSimilarCharArcs(cCurrent, iAddr)) {
            this._suggest(oSuggResult, sRemain.slice(1), nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, jAddr, sNewWord+cChar);
        }
        if (!bAvoidLoop) { // avoid infinite loop
            if (sRemain.length > 1) {
            if (cCurrent == sRemain.slice(1, 2)) {
                // same char, we remove 1 char without adding 1 to <sNewWord>
                aSugg.gl_update(this._suggest(sRemain.slice(1), nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord));
            }
            else {
                // switching chars
                aSugg.gl_update(this._suggest(sRemain.slice(1, 2)+sRemain.slice(0, 1)+sRemain.slice(2), nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, true));
                // delete char
                if (nMaxDel > 0) {
                    aSugg.gl_update(this._suggest(sRemain.slice(1), nMaxDel-1, nMaxHardRepl, nDeep+1, iAddr, sNewWord, true));
                }
            }
            // Phonetic replacements
            for (let sRepl of char_player.d1toX.gl_get(cCurrent, [])) {
                aSugg.gl_update(this._suggest(sRepl + sRemain.slice(1), nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, true));
            }
            for (let sRepl of char_player.d2toX.gl_get(sRemain.slice(0, 2), [])) {
                aSugg.gl_update(this._suggest(sRepl + sRemain.slice(2), nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, true));
            }
            // Hard replacements
            if (nDeep > 3 && nMaxHardRepl && sRemain.length >= 2) {
                for (let [nVal, kAddr] of this._getArcs1(iAddr)) {
                if (cCurrent == sRemain.slice(1, 2)) {
                    // same char, we remove 1 char without adding 1 to <sNewWord>
                    this._suggest(oSuggResult, sRemain.slice(1), nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord);
                }
                else {
                    // switching chars
                    if (nMaxSwitch > 0) {
                        this._suggest(oSuggResult, sRemain.slice(1, 2)+sRemain.slice(0, 1)+sRemain.slice(2), nMaxSwitch-1, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, true);
                    }
                    // delete char
                    if (nMaxDel > 0) {
                        this._suggest(oSuggResult, sRemain.slice(1), nMaxSwitch, nMaxDel-1, nMaxHardRepl, nDeep+1, iAddr, sNewWord, true);
                    }
                }
                // Phonetic replacements
                for (let sRepl of char_player.d1toX.gl_get(cCurrent, [])) {
                    this._suggest(oSuggResult, sRepl + sRemain.slice(1), nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, true);
                }
                for (let sRepl of char_player.d2toX.gl_get(sRemain.slice(0, 2), [])) {
                    this._suggest(oSuggResult, sRepl + sRemain.slice(2), nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, true);
                }
                // Hard replacements
                if (nDeep > 3 && nMaxHardRepl && sRemain.length >= 2) {
                    for (let [cChar, kAddr] of this._getCharArcs(iAddr)) {
                    if (this.dCharVal.has(nVal)) {
                        let cChar = this.dCharVal.get(nVal);
                        if (!char_player.d1to1.gl_get(cCurrent, "").includes(cChar)) {
                            aSugg.gl_update(this._suggest(sRemain.slice(1), nMaxDel, nMaxHardRepl-1, nDeep+1, kAddr, sNewWord+cChar, true));
                            this._suggest(oSuggResult, sRemain.slice(1), nMaxSwitch, nMaxDel, nMaxHardRepl-1, nDeep+1, kAddr, sNewWord+cChar, true);
                        }
                    }
                }
            }
            // end of word
            if (sRemain.length == 2) {
                for (let sRepl of char_player.dFinal2.gl_get(sRemain, [])) {
                    aSugg.gl_update(this._suggest(sRepl, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, true));
                    this._suggest(oSuggResult, sRepl, nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, true);
                }
            }
            else if (sRemain.length == 1) {
                aSugg.gl_update(this._suggest("", nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, true)); // remove last char and go on
                this._suggest(oSuggResult, "", nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, true); // remove last char and go on
                for (let sRepl of char_player.dFinal1.gl_get(sRemain, [])) {
                    aSugg.gl_update(this._suggest(sRepl, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, true));
                    this._suggest(oSuggResult, sRepl, nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, true);
                }
            }
        }
    }

    * _getCharArcs (iAddr) {
        // generator: yield all chars and addresses from node at address <iAddr>
        for (let [nVal, jAddr] of this._getArcs(iAddr)) {
            if (nVal < this.nChar) {
                yield [this.dCharVal.get(nVal), jAddr];
            }
        }
        return aSugg;
    }

    * _getSimilarArcs (cChar, iAddr) {
    * _getSimilarCharArcs (cChar, iAddr) {
        // generator: yield similar char of <cChar> and address of the following node
        for (let c of char_player.d1to1.gl_get(cChar, [cChar])) {
            if (this.dChar.has(c)) {
                let jAddr = this._lookupArcNode(this.dChar.get(c), iAddr);
                if (jAddr) {
                    yield [c, jAddr];
                }

Modified gc_core/js/str_transform.js from [6745507121] to [e70f6ede55].

1
2
3
4
5
6










































7
8
9
10
11
12
13
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55






+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+







//// STRING TRANSFORMATION
/*jslint esversion: 6*/

// Note: 48 is the ASCII code for "0"

var str_transform = {

    distanceDamerauLevenshtein: function (s1, s2) {
        // distance of Damerau-Levenshtein between <s1> and <s2>
        // https://fr.wikipedia.org/wiki/Distance_de_Damerau-Levenshtein
        try {
            let nLen1 = s1.length;
            let nLen2 = s2.length;
            let matrix = [];
            for (let i = 0;  i <= nLen1;  i++) {
                matrix[i] = new Array(nLen2 + 1);
            }
            for (let i = 0;  i <= nLen1;  i++) {
                matrix[i][0] = i;
            }
            for (let j = 0;  j <= nLen2;  j++) {
                matrix[0][j] = j;
            }
            for (let i = 1;  i <= nLen1;  i++) {
                for (let j = 1;  j <= nLen2;  j++) {
                    let nCost = (s1[i] === s2[j]) ? 0 : 1;
                    matrix[i][j] = Math.min(
                        matrix[i-1][j] + 1,         // Deletion
                        matrix[i][j-1] + 1,         // Insertion
                        matrix[i-1][j-1] + nCost    // Substitution
                    );
                    if (i > 1 && j > 1 && s1[i] == s2[j-1] && s1[i-1] == s2[j]) {
                        matrix[i][j] = Math.min(matrix[i][j], matrix[i-2][j-2] + nCost);  // Transposition
                    }
                }
            }
            //console.log(s2 + ": " + matrix[nLen1][nLen2]);
            return matrix[nLen1][nLen2];
        }
        catch (e) {
            helpers.logerror(e);
        }
    },

    showDistance (s1, s2) {
        console.log(`Distance: ${s1} / ${s2} = ${this.distanceDamerauLevenshtein(s1, s2)})`);
    },

    getStemFromSuffixCode: function (sFlex, sSfxCode) {
        // Suffix only
        if (sSfxCode == "0") {
            return sFlex;
        }
        return sSfxCode[0] == '0' ? sFlex + sSfxCode.slice(1) : sFlex.slice(0, -(sSfxCode.charCodeAt(0)-48)) + sSfxCode.slice(1);
    },

Modified gc_core/py/char_player.py from [97f69bd70d] to [b008c1ffec].

15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44


45
46
47
48
49
50

51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
15
16
17
18
19
20
21























22
23






24






















25
26
27
28
29
30
31







-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+
-
-
-
-
-
-
+
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-







})

def cleanWord (sWord):
    "word simplication before calculating distance between words"
    return sWord.lower().translate(_xTransChars).replace("eau", "o").replace("au", "o")


def distanceDamerauLevenshtein (s1, s2):
    "distance of Damerau-Levenshtein between <s1> and <s2>"
    # https://fr.wikipedia.org/wiki/Distance_de_Damerau-Levenshtein
    d = {}
    nLen1 = len(s1)
    nLen2 = len(s2)
    for i in range(-1, nLen1+1):
        d[i, -1] = i + 1
    for j in range(-1, nLen2+1):
        d[-1, j] = j + 1
    for i in range(nLen1):
        for j in range(nLen2):
            nCost = 0  if s1[i] == s2[j]  else 1
            d[i, j] = min(
                d[i-1, j]   + 1,        # Deletion
                d[i,   j-1] + 1,        # Insertion
                d[i-1, j-1] + nCost,    # Substitution
            )
            if i and j and s1[i] == s2[j-1] and s1[i-1] == s2[j]:
                d[i, j] = min(d[i, j], d[i-2, j-2] + nCost)     # Transposition
    return d[nLen1-1, nLen2-1]


aVowel = set("aáàâäāeéèêëēiíìîïīoóòôöōuúùûüūyýỳŷÿȳœæAÁÀÂÄĀEÉÈÊËĒIÍÌÎÏĪOÓÒÔÖŌUÚÙÛÜŪYÝỲŶŸȲŒÆ")
aConsonant = set("bcdefghjklmnñpqrstvwxzBCDEFGHJKLMNÑPQRSTVWXZ")
def showDistance (s1, s2):
    s1b = cleanWord(s1);
    s2b = cleanWord(s2);
    print(f"Distance: {s1} / {s2} = {distanceDamerauLevenshtein(s1, s2)}")
    print(f"Distance: {s1b} / {s2b} = {distanceDamerauLevenshtein(s1b, s2b)}")

aDouble = set("bcdfjklmnprstzBCDFJKLMNPRSTZ")  # letter that may be used twice successively

# Method: Remove Useless Chars

_dVovels = {
    'a': '',  'e': '',  'i': '',  'o': '',  'u': '',  'y': '',
    'à': '',  'é': '',  'î': '',  'ô': '',  'û': '',  'ÿ': '',
    'â': '',  'è': '',  'ï': '',  'ö': '',  'ù': '',  'ŷ': '',
    'ä': '',  'ê': '',  'í': '',  'ó': '',  'ü': '',  'ý': '',
    'á': '',  'ë': '',  'ì': '',  'ò': '',  'ú': '',  'ỳ': '',
    'ā': '',  'ē': '',  'ī': '',  'ō': '',  'ū': '',  'ȳ': '',
    'h': '',  'œ': '',  'æ': ''
 }

_xTransVovels = str.maketrans(_dVovels)


aVovels = frozenset(_dVovels.keys())


def shrinkWord (sWord):
    "remove vovels and h"
    return sWord[0:1].replace("h", "") + sWord[1:].translate(_xTransVovels)


# Similar chars

d1to1 = {
    "1": "liîLIÎ",
    "2": "zZ",
205
206
207
208
209
210
211








212
213
214
215
216
217
218
219
220
221
222
223
224
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175


176
177
178
179
180
181
182







+
+
+
+
+
+
+
+




-
-







    "B": ("BB",),
    "c": ("cc", "ss", "qu", "ch"),
    "C": ("CC", "SS", "QU", "CH"),
    "ç": ("ss", "cc", "qh", "ch"),
    "Ç": ("SS", "CC", "QH", "CH"),
    "d": ("dd",),
    "D": ("DD",),
    "é": ("ai", "ei"),
    "É": ("AI", "EI"),
    "è": ("ai", "ei"),
    "È": ("AI", "EI"),
    "ê": ("ai", "ei"),
    "Ê": ("AI", "EI"),
    "ë": ("ai", "ei"),
    "Ë": ("AI", "EI"),
    "f": ("ff", "ph"),
    "F": ("FF", "PH"),
    "g": ("gu", "ge", "gg", "gh"),
    "G": ("GU", "GE", "GG", "GH"),
    "i": ("ii",),
    "I": ("II",),
    "j": ("jj", "dj"),
    "J": ("JJ", "DJ"),
    "k": ("qu", "ck", "ch", "cu", "kk", "kh"),
    "K": ("QU", "CK", "CH", "CU", "KK", "KH"),
    "l": ("ll",),
    "L": ("LL",),
    "m": ("mm", "mn"),

Modified gc_core/py/ibdawg.py from [a0b40e49a1] to [0203105f4e].

1
2
3
4
5
6
7
8
9
10
11
12


13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29















































30
31
32
33
34
35
36
1
2
3
4
5
6
7
8
9
10


11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83










-
-
+
+

















+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+







#!python3

import os
import traceback
import pkgutil
import re
from itertools import chain
from functools import wraps
import time

#import logging
#logging.basicConfig(filename="suggestions.log", level=logging.DEBUG)
import logging
logging.basicConfig(filename="suggestions.log", level=logging.DEBUG)

from . import str_transform as st
from . import char_player as cp
from .echo import echo


def timethis (func):
    "decorator for the execution time"
    @wraps(func)
    def wrapper (*args, **kwargs):
        fStart = time.time()
        result = func(*args, **kwargs)
        fEnd = time.time()
        print(func.__name__, fEnd - fStart)
        return result
    return wrapper


class SuggResult:
    """Structure for storing, classifying and filtering suggestions"""

    def __init__ (self, sWord, nDistLimit=-1):
        self.sWord = sWord
        self.sCleanWord = cp.cleanWord(sWord)
        self.nDistLimit = nDistLimit  if nDistLimit >= 0  else  (len(sWord) // 3) + 1
        self.nMinDist = 1000
        self.aSugg = set()
        self.dSugg = { 0: [],  1: [] }

    def addSugg (self, sSugg, nDeep=0):
        "add a suggestion"
        #logging.info((nDeep * "  ") + "__" + sSugg + "__")
        if sSugg not in self.aSugg:
            nDist = st.distanceDamerauLevenshtein(self.sCleanWord, cp.cleanWord(sSugg))
            if nDist <= self.nDistLimit:
                if nDist not in self.dSugg:
                    self.dSugg[nDist] = []
                self.dSugg[nDist].append(sSugg)
                self.aSugg.add(sSugg)
                if nDist < self.nMinDist:
                    self.nMinDist = nDist
                self.nDistLimit = min(self.nDistLimit, self.nMinDist+2)

    def getSuggestions (self, nSuggLimit=10, nDistLimit=-1):
        "return a list of suggestions"
        lRes = []
        if self.dSugg[0]:
            # we sort the better results with the original word
            self.dSugg[0].sort(key=lambda sSugg: st.distanceDamerauLevenshtein(self.sWord, sSugg))
        for lSugg in self.dSugg.values():
            lRes.extend(lSugg)
            if len(lRes) > nSuggLimit:
                break
        lRes = list(cp.filterSugg(lRes))
        if self.sWord.istitle():
            lRes = list(map(lambda sSugg: sSugg.title(), lRes))
        elif self.sWord.isupper():
            lRes = list(map(lambda sSugg: sSugg.upper(), lRes))
        return lRes[:nSuggLimit]

    def reset (self):
        self.aSugg.clear()
        self.dSugg.clear()


class IBDAWG:
    """INDEXABLE BINARY DIRECT ACYCLIC WORD GRAPH"""

    def __init__ (self, sDicName):
        self.by = pkgutil.get_data(__package__, "_dictionaries/" + sDicName)
        if not self.by:
200
201
202
203
204
205
206

207
208

209

210
211

212
213
214
215
216
217


























































218
219
220
221
222
223

224
225
226
227
228
229






230
231
232


233
234
235
236
237




238
239
240
241
242
243
244

245
246
247
248
249
250
251
252
253
254
255
256
257
258



259
260
261
262
263
264
265
266
267
268
269
270
271

272
273
274
275
276
277







278
279
280
281
282
283
284
285
286
287
288
289
290
291
292

293
294
295
296


297
298
299
300
301
302





303
304
305
306
307
308
309
247
248
249
250
251
252
253
254
255
256
257

258
259

260

261




262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324

325

326




327
328
329
330
331
332



333
334





335
336
337
338







339














340
341
342











343

344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371

372
373
374


375
376






377
378
379
380
381
382
383
384
385
386
387
388







+


+
-
+

-
+
-

-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+





-
+
-

-
-
-
-
+
+
+
+
+
+
-
-
-
+
+
-
-
-
-
-
+
+
+
+
-
-
-
-
-
-
-
+
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+
-
-
-
-
-
-
-
-
-
-
-

-
+






+
+
+
+
+
+
+














-
+


-
-
+
+
-
-
-
-
-
-
+
+
+
+
+







                l.extend(self.morph(sWord.capitalize()))
        return l

    @timethis
    def suggest (self, sWord, nMaxSugg=10):
        "returns a set of suggestions for <sWord>"
        sPfx, sWord, sSfx = cp.cut(sWord)
        nMaxSwitch = max(len(sWord) // 3, 1)
        nMaxDel = len(sWord) // 5
        nMaxHardRepl = max((len(sWord) - 5) // 4, 1)
        oSuggResult = SuggResult(sWord)
        aSugg = self._suggest(sWord, nMaxDel=nMaxDel, nMaxHardRepl=nMaxHardRepl)
        self._suggest(oSuggResult, sWord, nMaxSwitch=nMaxSwitch, nMaxDel=nMaxDel, nMaxHardRepl=nMaxHardRepl)
        if sWord.istitle():
            aSugg.update(self._suggest(sWord.lower(), nMaxDel=nMaxDel, nMaxHardRepl=nMaxHardRepl))
            self._suggest(oSuggResult, sWord.lower(), nMaxSwitch=nMaxSwitch, nMaxDel=nMaxDel, nMaxHardRepl=nMaxHardRepl)
            aSugg = set(map(lambda sSugg: sSugg.title(), aSugg))
        elif sWord.islower():
            aSugg.update(self._suggest(sWord.title(), nMaxDel=nMaxDel, nMaxHardRepl=nMaxHardRepl))
        aSugg = cp.filterSugg(aSugg)
        sCleanWord = cp.cleanWord(sWord)
        aSugg = sorted(aSugg, key=lambda sSugg: cp.distanceDamerauLevenshtein(sCleanWord, cp.cleanWord(sSugg)))[:nMaxSugg]
            self._suggest(oSuggResult, sWord.title(), nMaxSwitch=nMaxSwitch, nMaxDel=nMaxDel, nMaxHardRepl=nMaxHardRepl)
        aSugg = oSuggResult.getSuggestions()
        if sSfx or sPfx:
            # we add what we removed
            return list(map(lambda sSug: sPfx + sSug + sSfx, aSugg))
        return aSugg

    def _suggest (self, oSuggResult, sRemain, nMaxSwitch=0, nMaxDel=0, nMaxHardRepl=0, nDeep=0, iAddr=0, sNewWord="", sAction="", bAvoidLoop=False):
        # recursive function
        #logging.info((nDeep * "  ") + sNewWord + ":" + sRemain + " · " + sAction)
        if not sRemain:
            if int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask:
                oSuggResult.addSugg(sNewWord, nDeep)
            for sTail in self._getTails(iAddr):
                oSuggResult.addSugg(sNewWord+sTail, nDeep)
            return
        cCurrent = sRemain[0:1]
        for cChar, jAddr in self._getSimilarCharArcs(cCurrent, iAddr):
            self._suggest(oSuggResult, sRemain[1:], nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, jAddr, sNewWord+cChar, "*")
        if not bAvoidLoop: # avoid infinite loop
            if len(sRemain) > 1:
                if cCurrent == sRemain[1:2]:
                    # same char, we remove 1 char without adding 1 to <sNewWord>
                    self._suggest(oSuggResult, sRemain[1:], nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, cCurrent+"/2")
                else:
                    # switching chars
                    if nMaxSwitch:
                        self._suggest(oSuggResult, sRemain[1:2]+sRemain[0:1]+sRemain[2:], nMaxSwitch-1, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, "><",True)
                    # delete char
                    if nMaxDel:
                        self._suggest(oSuggResult, sRemain[1:], nMaxSwitch, nMaxDel-1, nMaxHardRepl, nDeep+1, iAddr, sNewWord, "-"+cCurrent, True)
                # Phonetic replacements
                for sRepl in cp.d1toX.get(cCurrent, ()):
                    self._suggest(oSuggResult, sRepl + sRemain[1:], nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, cCurrent+">"+sRepl, True)
                for sRepl in cp.d2toX.get(sRemain[0:2], ()):
                    self._suggest(oSuggResult, sRepl + sRemain[2:], nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, sRemain[0:2]+">"+sRepl, True)
                # Hard replacements
                if nDeep > 3 and nMaxHardRepl:
                    for cChar, kAddr in self._getCharArcs(iAddr):
                        if cChar not in cp.d1to1.get(cCurrent, ""):
                            self._suggest(oSuggResult, sRemain[1:], nMaxSwitch, nMaxDel, nMaxHardRepl-1, nDeep+1, kAddr, sNewWord+cChar, "[["+cChar+"]]", True)
            # end of word
            if len(sRemain) == 2:
                for sRepl in cp.dFinal2.get(sRemain, ()):
                    self._suggest(oSuggResult, sRepl, nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, sRemain + " >> " + sRepl, True)
            elif len(sRemain) == 1:
                self._suggest(oSuggResult, "", nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, sRemain + " [last char removed] ", True) # remove last char and go on
                for sRepl in cp.dFinal1.get(sRemain, ()):
                    self._suggest(oSuggResult, sRepl, nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, sRemain + " >> " + sRepl, True)
        return

    @timethis
    def suggest2 (self, sWord, nMaxSugg=10):
        "returns a set of suggestions for <sWord>"
        sPfx, sWord, sSfx = cp.cut(sWord)
        oSuggResult = SuggResult(sWord)
        self._suggest2(oSuggResult)
        aSugg = oSuggResult.getSuggestions()
        if sSfx or sPfx:
            # we add what we removed
            return list(map(lambda sSug: sPfx + sSug + sSfx, aSugg))
        return aSugg

    def _suggest (self, sRemain, nMaxDel=0, nMaxHardRepl=0, nDeep=0, iAddr=0, sNewWord="", bAvoidLoop=False):
    def _suggest2 (self, oSuggResult, nDeep=0, iAddr=0, sNewWord=""):
        "returns a set of suggestions"
        # recursive function
        #logging.info((nDeep * "  ") + sNewWord + ":" + sRemain)
        aSugg = set()
        if not sRemain:
            if int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask:
        #logging.info((nDeep * "  ") + sNewWord)
        if nDeep >= oSuggResult.nDistLimit:
            sCleanNewWord = cp.cleanWord(sNewWord)
            if st.distanceSift4(oSuggResult.sCleanWord[:len(sCleanNewWord)], sCleanNewWord) > oSuggResult.nDistLimit:
                return
        if int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask:
                #logging.info((nDeep * "  ") + "__" + sNewWord + "__")
                aSugg.add(sNewWord)
            for sTail in self._getTails(iAddr):
            oSuggResult.addSugg(sNewWord, nDeep)
        for cChar, jAddr in self._getCharArcsWithPriority(iAddr, oSuggResult.sWord[nDeep:nDeep+1]):
                #logging.info((nDeep * "  ") + "__" + sNewWord+sTail + "__")
                aSugg.add(sNewWord+sTail)
            return aSugg
        cCurrent = sRemain[0:1]
        for cChar, jAddr in self._getSimilarArcs(cCurrent, iAddr):
            self._suggest2(oSuggResult, nDeep+1, jAddr, sNewWord+cChar)
        return

    def _getCharArcs (self, iAddr):
            aSugg.update(self._suggest(sRemain[1:], nMaxDel, nMaxHardRepl, nDeep+1, jAddr, sNewWord+cChar))
        if not bAvoidLoop: # avoid infinite loop
            if cCurrent == sRemain[1:2]:
                # same char, we remove 1 char without adding 1 to <sNewWord>
                aSugg.update(self._suggest(sRemain[1:], nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord))
            else:
                # switching chars
        "generator: yield all chars and addresses from node at address <iAddr>"
                aSugg.update(self._suggest(sRemain[1:2]+sRemain[0:1]+sRemain[2:], nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, True))
                # delete char
                if nMaxDel > 0:
                    aSugg.update(self._suggest(sRemain[1:], nMaxDel-1, nMaxHardRepl, nDeep+1, iAddr, sNewWord, True))
            # Phonetic replacements
            for sRepl in cp.d1toX.get(cCurrent, ()):
                aSugg.update(self._suggest(sRepl + sRemain[1:], nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, True))
            for sRepl in cp.d2toX.get(sRemain[0:2], ()):
                aSugg.update(self._suggest(sRepl + sRemain[2:], nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, True))
            # Hard replacements
            if nDeep > 3 and nMaxHardRepl and len(sRemain) >= 2:
                for nVal, kAddr in self._getArcs1(iAddr):
                    if nVal in self.dCharVal:
                        cChar = self.dCharVal[nVal]
        for nVal, jAddr in self._getArcs(iAddr):
            if nVal < self.nChar:
                yield (self.dCharVal[nVal], jAddr)
                        if cChar not in cp.d1to1.get(cCurrent, ""):
                            aSugg.update(self._suggest(sRemain[1:], nMaxDel, nMaxHardRepl-1, nDeep+1, kAddr, sNewWord+cChar, True))
            # end of word
            if len(sRemain) == 2:
                for sRepl in cp.dFinal2.get(sRemain, ()):
                    aSugg.update(self._suggest(sRepl, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, True))
            elif len(sRemain) == 1:
                aSugg.update(self._suggest("", nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, True)) # remove last char and go on
                for sRepl in cp.dFinal1.get(sRemain, ()):
                    aSugg.update(self._suggest(sRepl, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, True))
        return aSugg

    def _getSimilarArcs (self, cChar, iAddr):
    def _getSimilarCharArcs (self, cChar, iAddr):
        "generator: yield similar char of <cChar> and address of the following node"
        for c in cp.d1to1.get(cChar, [cChar]):
            if c in self.dChar:
                jAddr = self._lookupArcNode(self.dChar[c], iAddr)
                if jAddr:
                    yield (c, jAddr)

    def _getCharArcsWithPriority (self, iAddr, cChar):
        if not cChar:
            yield from self._getCharArcs(iAddr)
        lTuple = list(self._getCharArcs(iAddr))
        lTuple.sort(key=lambda t: 0  if t[0] in cp.d1to1.get(cChar, cChar)  else  1)
        yield from lTuple

    def _getTails (self, iAddr, sTail="", n=2):
        "return a list of suffixes ending at a distance of <n> from <iAddr>"
        aTails = set()
        for nVal, jAddr in self._getArcs(iAddr):
            if nVal < self.nChar:
                if int.from_bytes(self.byDic[jAddr:jAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask:
                    aTails.add(sTail + self.dCharVal[nVal])
                if n and not aTails:
                    aTails.update(self._getTails(jAddr, sTail+self.dCharVal[nVal], n-1))
        return aTails

    def drawPath (self, sWord, iAddr=0):
        "show the path taken by <sWord> in the graph"
        cChar = sWord[0:1]  if sWord  else " "
        c1 = sWord[0:1]  if sWord  else " "
        iPos = -1
        n = 0
        print(cChar + ": ", end="")
        for nVal, jAddr in self._getArcs(iAddr):
        print(c1 + ": ", end="")
        for c2, jAddr in self._getCharArcs(iAddr):
            if nVal in self.dCharVal:
                print(self.dCharVal[nVal], end="")
                if self.dCharVal[nVal] == sWord[0:1]:
                    iNextNodeAddr = jAddr
                    iPos = n
                n += 1
            print(c2, end="")
            if c2 == sWord[0:1]:
                iNextNodeAddr = jAddr
                iPos = n
            n += 1
        if not sWord:
            return
        if iPos >= 0:
            print("\n   "+ " " * iPos + "|")
            self.drawPath(sWord[1:], iNextNodeAddr)

    def select (self, sPattern=""):

Modified gc_core/py/str_transform.py from [23bc31f0e4] to [3d25d1270b].

1
2

















































































































3
4
5
6
7
8
9
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122


+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+







#!python3


#### DISTANCE CALCULATIONS

def longestCommonSubstring (s1, s2):
    # http://en.wikipedia.org/wiki/Longest_common_substring_problem
    # http://en.wikibooks.org/wiki/Algorithm_implementation/Strings/Longest_common_substring
    M = [ [0]*(1+len(s2)) for i in range(1+len(s1)) ]
    longest, x_longest = 0, 0
    for x in range(1, 1+len(s1)):
        for y in range(1, 1+len(s2)):
            if s1[x-1] == s2[y-1]:
                M[x][y] = M[x-1][y-1] + 1
                if M[x][y] > longest:
                    longest = M[x][y]
                    x_longest = x
            else:
                M[x][y] = 0
    return s1[x_longest-longest : x_longest]


def distanceDamerauLevenshtein (s1, s2):
    "distance of Damerau-Levenshtein between <s1> and <s2>"
    # https://fr.wikipedia.org/wiki/Distance_de_Damerau-Levenshtein
    d = {}
    nLen1 = len(s1)
    nLen2 = len(s2)
    for i in range(-1, nLen1+1):
        d[i, -1] = i + 1
    for j in range(-1, nLen2+1):
        d[-1, j] = j + 1
    for i in range(nLen1):
        for j in range(nLen2):
            nCost = 0  if s1[i] == s2[j]  else 1
            d[i, j] = min(
                d[i-1, j]   + 1,        # Deletion
                d[i,   j-1] + 1,        # Insertion
                d[i-1, j-1] + nCost,    # Substitution
            )
            if i and j and s1[i] == s2[j-1] and s1[i-1] == s2[j]:
                d[i, j] = min(d[i, j], d[i-2, j-2] + nCost)     # Transposition
    return d[nLen1-1, nLen2-1]


def distanceSift4 (s1, s2, nMaxOffset=5):
    "implementation of general Sift4."
    # https://siderite.blogspot.com/2014/11/super-fast-and-accurate-string-distance.html
    if not s1:
        return len(s2)
    if not s2:
        return len(s1)
    nLen1, nLen2 = len(s1), len(s2)
    i1, i2 = 0, 0   # Cursors for each string
    nLargestCS = 0  # Largest common substring
    nLocalCS = 0    # Local common substring
    nTrans = 0      # Number of transpositions ('ab' vs 'ba')
    lOffset = []    # Offset pair array, for computing the transpositions
 
    while i1 < nLen1 and i2 < nLen2:
        if s1[i1] == s2[i2]:
            nLocalCS += 1
            # Check if current match is a transposition
            bTrans = False
            i = 0
            while i < len(lOffset):
                t = lOffset[i]
                if i1 <= t[0] or i2 <= t[1]:
                    bTrans = abs(i2-i1) >= abs(t[1] - t[0])
                    if bTrans:
                        nTrans += 1
                    elif not t[2]:
                        t[2] = True
                        nTrans += 1
                    break
                elif i1 > t[1] and i2 > t[0]:
                    del lOffset[i]
                else:
                    i += 1
            lOffset.append([i1, i2, bTrans])
        else:
            nLargestCS += nLocalCS
            nLocalCS = 0
            if i1 != i2:
                i1 = i2 = min(i1, i2)
            for i in range(nMaxOffset):
                if i1 + i >= nLen1 and i2 + i >= nLen2:
                    break
                elif i1 + i < nLen1 and s1[i1+i] == s2[i2]:
                    i1 += i - 1
                    i2 -= 1
                    break
                elif i2 + i < nLen2 and s1[i1] == s2[i2+i]:
                    i2 += i - 1
                    i1 -= 1
                    break
        i1 += 1
        i2 += 1
        if i1 >= nLen1 or i2 >= nLen2:
            nLargestCS += nLocalCS
            nLocalCS = 0
            i1 = i2 = min(i1, i2)
    nLargestCS += nLocalCS
    return round(max(nLen1, nLen2) - nLargestCS + nTrans)


def showDistance (s1, s2):
    print(f"Damerau-Levenshtein: {s1} / {s2} = {distanceDamerauLevenshtein(s1, s2)}")
    print(f"Sift4: {s1} / {s2} = {distanceSift4(s1, s2)}")




#### STEMMING OPERATIONS

## No stemming

def noStemming (sFlex, sStem):
    return sStem

def rebuildWord (sFlex, cmd1, cmd2):
    if cmd1 == "_":
36
37
38
39
40
41
42

43
44
45
46
47
48
49
50

51
52
53
54
55
56
57
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172







+








+







        return "0"
    jSfx = 0
    for i in range(min(len(sFlex), len(sStem))):
        if sFlex[i] != sStem[i]:
            break
        jSfx += 1
    return chr(len(sFlex)-jSfx+48) + sStem[jSfx:]  


def changeWordWithSuffixCode (sWord, sSfxCode):
    if sSfxCode == "0":
        return sWord
    return sWord[:-(ord(sSfxCode[0])-48)] + sSfxCode[1:]  if sSfxCode[0] != '0'  else sWord + sSfxCode[1:]


# Prefix and suffix

def defineAffixCode (sFlex, sStem):
    """ Returns a string defining how to get stem from flexion. Examples:
            "0" if stem = flexion
            "stem" if no common substring
            "n(pfx)/m(sfx)"
        with n and m: chars with numeric meaning, "0" = 0, "1" = 1, ... ":" = 10, etc. (See ASCII table.) Says how many letters to strip from flexion.
            pfx [optional]: string to add before the flexion 
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
187
188
189
190
191
192
193















194
195
196
197
198
199
200
201
202
203







-
-
-
-
-
-
-
-
-
-
-
-
-
-
-










        n = sFlex.find(sSubs)
        m = len(sFlex) - (len(sSubs)+n)
        sAff = "{}/".format(chr(n+48))  if not sPfx  else "{}{}/".format(chr(n+48), sPfx)
        sAff += chr(m+48)  if not sSfx  else "{}{}".format(chr(m+48), sSfx)
        return sAff
    return sStem

def longestCommonSubstring (s1, s2):
    # http://en.wikipedia.org/wiki/Longest_common_substring_problem
    # http://en.wikibooks.org/wiki/Algorithm_implementation/Strings/Longest_common_substring
    M = [ [0]*(1+len(s2)) for i in range(1+len(s1)) ]
    longest, x_longest = 0, 0
    for x in range(1, 1+len(s1)):
        for y in range(1, 1+len(s2)):
            if s1[x-1] == s2[y-1]:
                M[x][y] = M[x-1][y-1] + 1
                if M[x][y] > longest:
                    longest = M[x][y]
                    x_longest = x
            else:
                M[x][y] = 0
    return s1[x_longest-longest : x_longest]

def changeWordWithAffixCode (sWord, sAffCode):
    if sAffCode == "0":
        return sWord
    if '/' not in sAffCode:
        return "# error #"
    sPfxCode, sSfxCode = sAffCode.split('/')
    sWord = sPfxCode[1:] + sWord[(ord(sPfxCode[0])-48):] 
    return sWord[:-(ord(sSfxCode[0])-48)] + sSfxCode[1:]  if sSfxCode[0] != '0'  else sWord + sSfxCode[1:]