Grammalecte  Check-in [fb9b86e9e8]

Overview
Comment:[graphspell] ibdawg: performance improvements [fr] tests [fx] tests
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | trunk | fr | fx | graphspell
Files: files | file ages | folders
SHA3-256: fb9b86e9e88e9d8d1e8b3ea816cb5321d0ca5fe8b6f7a778b4b63dca648bcf58
User & Date: olr on 2020-09-16 06:09:45
Other Links: manifest | tags
Context
2020-09-16
08:04
[fr] ajustements check-in: 00ed7d0ed7 user: olr tags: trunk, fr
06:09
[graphspell] ibdawg: performance improvements [fr] tests [fx] tests check-in: fb9b86e9e8 user: olr tags: trunk, fr, fx, graphspell
2020-09-15
17:04
[fr] update suggest tests Closed-Leaf check-in: a8c66433af user: olr tags: fr, bdic_opt
2020-09-14
07:55
[fr] ajustements check-in: cd8c458e7b user: olr tags: trunk, fr
Changes

Modified gc_lang/fr/modules/tests_modules.py from [1156e862bc] to [420f61228e].

18
19
20
21
22
23
24
25

26
27
28
29
30
31
32
18
19
20
21
22
23
24

25
26
27
28
29
30
31
32







-
+







def timeblock (label, hDst=None):
    "performance counter (contextmanager)"
    start = time.perf_counter()
    try:
        yield
    finally:
        end = time.perf_counter()
        print('{} : {}'.format(label, end - start))
        print('{:<20} : {}'.format(label, end - start))
        if hDst:
            hDst.write("{:<12.6}".format(end-start))


class TestDictionary (unittest.TestCase):
    "Test du correcteur orthographique"

47
48
49
50
51
52
53

54




55
56


57
58
59
60
61
62
63
47
48
49
50
51
52
53
54

55
56
57
58
59

60
61
62
63
64
65
66
67
68







+
-
+
+
+
+

-
+
+







            self.assertTrue(self.oDic.isValid(sWord), sWord)

    def test_isvalid_failed (self):
        for sWord in ["BranchE", "BRanche", "BRAnCHE", "émilie", "éMILIE", "émiLie"]:
            self.assertFalse(self.oDic.isValid(sWord), sWord)

    def test_suggest (self):
        for sWord in [
        for sWord in ["déelirranttesss", "vallidasion", "Emilie", "exibission"]:
            "déelirranttesss", "vallidasion", "Emilie", "exibission", "ditirembique", "jai", "email",
            "fatiqué", "coeur", "trèèèèèèèèès", "vraaaaiiiimeeeeennnt", "apele", "email", "Co2",
            "emmppâiiiller", "testt", "apelaion", "exsepttion", "sintaxik", "ebriete", "ennormmement"
        ]:
            with timeblock(sWord):
                self.assertNotEqual(0, self.oDic.suggest(sWord))
                aSugg = self.oDic.suggest(sWord)
                print(sWord, "->", " ".join(aSugg))


class TestConjugation (unittest.TestCase):
    "Tests des conjugaisons"

    @classmethod
    def setUpClass (cls):

Modified gc_lang/fr/perf_memo.text from [6a0d81df00] to [ad156793c1].

26
27
28
29
30
31
32
33
34



26
27
28
29
30
31
32


33
34
35







-
-
+
+
+
0.6.2       2018.02.19 19:06    5.51302     1.29359     0.874157    0.260415    0.271596    0.290641    0.684754    0.376905    0.0815201   0.00919633  (spelling normalization)
1.0         2018.11.23 10:59    2.88577     0.702486    0.485648    0.139897    0.14079     0.148125    0.348751    0.201061    0.0360297   0.0043535   (x2, with new GC engine)
1.1         2019.05.16 09:42    1.50743     0.360923    0.261113    0.0749272   0.0763827   0.0771537   0.180504    0.102942    0.0182762   0.0021925   (×2, but new processor: AMD Ryzen 7 2700X)
1.2.1       2019.08.06 20:57    1.42886     0.358425    0.247356    0.0704405   0.0754886   0.0765604   0.177197    0.0988517   0.0188103   0.0020243
1.6.0       2020.01.03 20:22    1.38847     0.346214    0.240242    0.0709539   0.0737499   0.0748733   0.176477    0.0969171   0.0187857   0.0025143   (nouveau dictionnaire avec lemmes masculins)
1.9.0       2020.04.20 19:57    1.51183     0.369546    0.25681     0.0734314   0.0764396   0.0785668   0.183922    0.103674    0.0185812   0.002099    (NFC normalization)
1.9.2       2020.05.12 08:43    1.62465     0.398831    0.273012    0.0810811   0.080937    0.0845885   0.204133    0.114146    0.0212864   0.0029547
1.12.2      2020.09.09 13:34    1.50568     0.374504    0.233108    0.0798712   0.0804466   0.0769674   0.171519    0.0945132   0.0165344   0.0019474   
1.12.2      2020.09.09 13:35    1.41094     0.359093    0.236443    0.06968     0.0734418   0.0738087   0.169371    0.0946279   0.0167106   0.0019773   
1.12.2      2020.09.09 13:34    1.50568     0.374504    0.233108    0.0798712   0.0804466   0.0769674   0.171519    0.0945132   0.0165344   0.0019474
1.12.2      2020.09.09 13:35    1.41094     0.359093    0.236443    0.06968     0.0734418   0.0738087   0.169371    0.0946279   0.0167106   0.0019773
1.12.2      2020.09.11 19:16    1.35297     0.330545    0.221731    0.0666998   0.0692539   0.0701707   0.160564    0.0891676   0.015807    0.0045998

Modified gc_lang/fr/webext/gce_worker.js from [c94d68b34b] to [2672ecfd48].

288
289
290
291
292
293
294
295

296
297
298
299
300
301









302
303
304
305
306
307
308
288
289
290
291
292
293
294

295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317







-
+






+
+
+
+
+
+
+
+
+







function resetOptions (oInfo={}) {
    gc_engine.resetOptions();
    let dOptions = helpers.mapToObject(gc_engine.getOptions());
    postMessage(createResponse("resetOptions", dOptions, oInfo, true));
}

function tests () {
    console.log(conj.getConj("devenir", ":E", ":2s"));
    /*console.log(conj.getConj("devenir", ":E", ":2s"));
    console.log(mfsp.getMasForm("emmerdeuse", true));
    console.log(mfsp.getMasForm("pointilleuse", false));
    console.log(phonet.getSimil("est"));
    let aRes = gc_engine.parse("Je suit...");
    for (let oErr of aRes) {
        console.log(text.getReadableError(oErr));
    }*/
    for (let sWord of ["fatiqué", "coeur", "trèèèèèèèèès", "vraaaaiiiimeeeeennnt", "apele", "email", "Co2", "emmppâiiiller", "testt", "apelaion", "exsepttion", "sintaxik", "ebriete", "ennormmement"]) {
        console.time("Suggestions for " + sWord);
        for (let aSugg of oSpellChecker.suggest(sWord)) {
            if (aSugg.length) {
                console.log(sWord + " -> ", aSugg.join(" "));
            }
        }
        console.timeEnd("Suggestions for " + sWord);
    }
}

function textToTest (sText, sCountry, bDebug, bContext, oInfo={}) {
    if (!gc_engine) {
        postMessage(createResponse("textToTest", "# Grammar checker not loaded.", oInfo, true));
        return;

Modified graphspell-js/char_player.js from [0602ec129b] to [dab3d08473].

1
2
3
4
5
6
7
8
9
10
11





12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39



























40
41
42
43
44
45
46
1
2
3
4
5
6
7
8
9
10

11
12
13
14
15
16



























17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50










-
+
+
+
+
+

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+







// list of similar chars
// useful for suggestion mechanism

/* jshint esversion:6 */
/* jslint esversion:6 */

${map}


var char_player = {

    /*
        oDistanceBetweenChars:
            - with Jaro-Winkler, values between 1 and 10
            - with Damerau-Levenshtein, values / 10 (between 0 and 1: 0.1, 0.2 ... 0.9)
    */
    oDistanceBetweenChars: {
        "a": {},
        "e": {"é": 0.5},
        "é": {"e": 0.5},
        "i": {"y": 0.2},
        "o": {},
        "u": {},
        "y": {"i": 0.3},
        "b": {"d": 0.8, "h": 0.9},
        "c": {"ç": 0.1, "k": 0.5, "q": 0.5, "s": 0.5, "x": 0.5, "z": 0.8},
        "d": {"b": 0.8},
        "f": {"v": 0.8},
        "g": {"j": 0.5},
        "h": {"b": 0.9},
        "j": {"g": 0.5, "i": 0.9},
        "k": {"c": 0.5, "q": 0.1, "x": 0.5},
        "l": {"i": 0.9},
        "m": {"n": 0.8},
        "n": {"m": 0.8, "r": 0.9},
        "p": {"q": 0.9},
        "q": {"c": 0.5, "k": 0.1, "p": 0.9},
        "r": {"n": 0.9, "j": 0.9},
        "s": {"c": 0.5, "ç": 0.1, "x": 0.5, "z": 0.5},
        "t": {"d": 0.9},
        "v": {"f": 0.8, "w": 0.1},
        "w": {"v": 0.1},
        "x": {"c": 0.5, "k": 0.5, "q": 0.5, "s": 0.5},
        "z": {"s": 0.5}
        //"a": {},
        "e": {"é": 5},
        //"é": {"e": 5},
        "i": {"y": 2},
        //"o": {},
        //"u": {},
        "y": {"i": 3},
        "b": {"d": 8, "h": 9},
        "c": {"ç": 1, "k": 5, "q": 5, "s": 5, "x": 5, "z": 8},
        "d": {"b": 8},
        "f": {"v": 8},
        "g": {"j": 5},
        "h": {"b": 9},
        "j": {"g": 5, "i": 9},
        "k": {"c": 5, "q": 1, "x": 5},
        "l": {"i": 9},
        "m": {"n": 8},
        "n": {"m": 8, "r": 9},
        "p": {"q": 9},
        "q": {"c": 5, "k": 1, "p": 9},
        "r": {"n": 9, "j": 9},
        "s": {"c": 5, "ç": 1, "x": 5, "z": 5},
        "t": {"d": 9},
        "v": {"f": 8, "w": 1},
        "w": {"v": 1},
        "x": {"c": 5, "k": 5, "q": 5, "s": 5},
        "z": {"s": 5}
    },

    distanceBetweenChars: function (c1, c2) {
        if (c1 == c2) {
            return 0;
        }
        if (this.oDistanceBetweenChars.hasOwnProperty(c1) && this.oDistanceBetweenChars[c1].hasOwnProperty(c2)) {
316
317
318
319
320
321
322
323
324


325
326
327
328
329
330
331
332
333
334
335
336
337
338


339
340
341
342
343
344
345
320
321
322
323
324
325
326


327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351







-
-
+
+














+
+







        ["UN", ["EIN",]],
    ]),

    // End of word
    dFinal1: new Map([
        ["a", ["as", "at", "ant", "ah"]],
        ["A", ["AS", "AT", "ANT", "AH"]],
        ["c", ["ch",]],
        ["C", ["CH",]],
        ["c", ["ch", "que"]],
        ["C", ["CH", "QUE"]],
        ["e", ["et", "er", "ets", "ée", "ez", "ai", "ais", "ait", "ent", "eh"]],
        ["E", ["ET", "ER", "ETS", "ÉE", "EZ", "AI", "AIS", "AIT", "ENT", "EH"]],
        ["é", ["et", "er", "ets", "ée", "ez", "ai", "ais", "ait"]],
        ["É", ["ET", "ER", "ETS", "ÉE", "EZ", "AI", "AIS", "AIT"]],
        ["è", ["et", "er", "ets", "ée", "ez", "ai", "ais", "ait"]],
        ["È", ["ET", "ER", "ETS", "ÉE", "EZ", "AI", "AIS", "AIT"]],
        ["ê", ["et", "er", "ets", "ée", "ez", "ai", "ais", "ait"]],
        ["Ê", ["ET", "ER", "ETS", "ÉE", "EZ", "AI", "AIS", "AIT"]],
        ["ë", ["et", "er", "ets", "ée", "ez", "ai", "ais", "ait"]],
        ["Ë", ["ET", "ER", "ETS", "ÉE", "EZ", "AI", "AIS", "AIT"]],
        ["g", ["gh",]],
        ["G", ["GH",]],
        ["i", ["is", "it", "ie", "in"]],
        ["I", ["IS", "IT", "IE", "IN"]],
        ["k", ["que"]],
        ["K", ["QUE"]],
        ["n", ["nt", "nd", "ns", "nh"]],
        ["N", ["NT", "ND", "NS", "NH"]],
        ["o", ["aut", "ot", "os"]],
        ["O", ["AUT", "OT", "OS"]],
        ["ô", ["aut", "ot", "os"]],
        ["Ô", ["AUT", "OT", "OS"]],
        ["ö", ["aut", "ot", "os"]],

Modified graphspell-js/ibdawg.js from [69d7490b82] to [fa91a09215].

18
19
20
21
22
23
24
25

26
27
28
29

30
31
32








33
34
35

36
37
38
39
40
41
42
43
44
45
46
47
48
49













50
51
52
53
54
55
56











57
58
59
60

61
62

63
64
65



66


67

68
69





70
71
72










73
74
75
76

77
78
79
80
81
82
83
84
85

86
87
88
89
90



91
92
93
94
95
96
97
18
19
20
21
22
23
24

25
26
27
28
29
30



31
32
33
34
35
36
37
38
39
40

41
42
43
44
45
46









47
48
49
50
51
52
53
54
55
56
57
58
59







60
61
62
63
64
65
66
67
68
69
70
71
72
73

74
75
76
77



78
79
80

81
82
83
84


85
86
87
88
89



90
91
92
93
94
95
96
97
98
99


100
101
102
103
104
105
106
107
108
109
110

111
112
113
114


115
116
117
118
119
120
121
122
123
124







-
+




+
-
-
-
+
+
+
+
+
+
+
+


-
+





-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+



-
+


+
-
-
-
+
+
+
-
+
+

+
-
-
+
+
+
+
+
-
-
-
+
+
+
+
+
+
+
+
+
+
-
-


+








-
+



-
-
+
+
+







    var char_player = require("./char_player.js");
}


class SuggResult {
    // Structure for storing, classifying and filtering suggestions

    constructor (sWord, nDistLimit=-1) {
    constructor (sWord, nSuggLimit=10, nDistLimit=-1) {
        this.sWord = sWord;
        this.sSimplifiedWord = str_transform.simplifyWord(sWord);
        this.nDistLimit = (nDistLimit >= 0) ? nDistLimit :  Math.floor(sWord.length / 3) + 1;
        this.nMinDist = 1000;
        // Temporary sets
        this.aSugg = new Set();
        this.dSugg = new Map([ [0, []],  [1, []],  [2, []] ]);
        this.aAllSugg = new Set();      // all found words even those refused
        this.aAllSugg = new Set();  // All suggestions, even the one rejected
        this.dGoodSugg = new Map(); // Acceptable suggestions
        this.dBestSugg = new Map(); // Best suggestions
        // Parameters
        this.nSuggLimit = nSuggLimit;
        this.nSuggLimitExt = nSuggLimit + 2;                // we add few entries in case suggestions merge after casing modifications
        this.nBestSuggLimit = Math.floor(nSuggLimit * 1.5); // n times the requested limit
        this.nGoodSuggLimit = nSuggLimit * 15;              // n times the requested limit
    }

    addSugg (sSugg, nDeep=0) {
    addSugg (sSugg) {
        // add a suggestion
        if (this.aAllSugg.has(sSugg)) {
            return;
        }
        this.aAllSugg.add(sSugg);
        if (!this.aSugg.has(sSugg)) {
            let nDist = Math.floor(str_transform.distanceDamerauLevenshtein(this.sSimplifiedWord, str_transform.simplifyWord(sSugg)));
            if (nDist <= this.nDistLimit) {
                if (sSugg.includes(" ")) { // add 1 to distance for split suggestions
                    nDist += 1;
                }
                if (!this.dSugg.has(nDist)) {
                    this.dSugg.set(nDist, []);
                }
        // jaro 0->1 1 les chaines sont égale
        let nDistJaro = 1 - str_transform.distanceJaroWinkler(this.sSimplifiedWord, str_transform.simplifyWord(sSugg));
        let nDist = Math.floor(nDistJaro * 10);
        if (nDistJaro < .11) {        // Best suggestions
            this.dBestSugg.set(sSugg, Math.round(nDistJaro*1000));
            if (this.dBestSugg.size > this.nBestSuggLimit) {
                this.nDistLimit = -1; // make suggest() to end search
            }
        } else if (nDistJaro < .33) { // Good suggestions
            this.dGoodSugg.set(sSugg, Math.round(nDistJaro*1000));
            if (this.dGoodSugg.size > this.nGoodSuggLimit) {
                this.nDistLimit = -1; // make suggest() to end search
            }
                this.dSugg.get(nDist).push(sSugg);
                this.aSugg.add(sSugg);
                if (nDist < this.nMinDist) {
                    this.nMinDist = nDist;
                }
                this.nDistLimit = Math.min(this.nDistLimit, this.nMinDist+1);
            }
        } else {
            if (nDist < this.nMinDist) {
                this.nMinDist = nDist;
            }
            this.nDistLimit = Math.min(this.nDistLimit, this.nMinDist);
        }
        if (nDist <= this.nDistLimit) {
            if (nDist < this.nMinDist) {
                this.nMinDist = nDist;
            }
            this.nDistLimit = Math.min(this.nDistLimit, this.nMinDist+1);
        }
    }

    getSuggestions (nSuggLimit=10) {
    getSuggestions () {
        // return a list of suggestions
        let lRes = [];
        if (this.dBestSugg.size > 0) {
        let bFirstListSorted = false;
        for (let [nDist, lSugg] of this.dSugg.entries()) {
            if (nDist > this.nDistLimit) {
            // sort only with simplified words
            let lResTmp = [...this.dBestSugg.entries()].sort((a, b) => { return a[1] - b[1]; });
            let nSize = Math.min(this.nSuggLimitExt, lResTmp.length);
                break;
            for (let i=0;  i < nSize;  i++){
                lRes.push(lResTmp[i][0]);
            }
        }
            if (!bFirstListSorted && lSugg.length > 1) {
                lRes.sort((a, b) => { return str_transform.distanceDamerauLevenshtein(this.sWord, a) - str_transform.distanceDamerauLevenshtein(this.sWord, b); });
        if (lRes.length < this.nSuggLimitExt) {
            // sort with simplified words and original word
            let lResTmp = [...this.dGoodSugg.entries()].sort((a, b) => {
                // Low precision to rely more on simplified words
                let nJaroA = Math.round(str_transform.distanceJaroWinkler(this.sWord, a[0]) * 10);
                bFirstListSorted = true;
            }
            lRes.push(...lSugg);
                let nJaroB = Math.round(str_transform.distanceJaroWinkler(this.sWord, b[0]) * 10);
                if (nJaroA == nJaroB) {
                    return a[1] - b[1];     // warning: both lists are NOT sorted the same way (key: a-b)
                } else {
                    return nJaroB - nJaroA; // warning: both lists are NOT sorted the same way (key: b-a)
                }
            }).slice(0, this.nSuggLimitExt);
            let nSize = Math.min(this.nSuggLimitExt, lResTmp.length);
            for (let i=0;  i < nSize;  i++){
                lRes.push(lResTmp[i][0]);
            if (lRes.length > nSuggLimit) {
                break;
            }
        }
        // casing
        if (this.sWord.gl_isUpperCase()) {
            lRes = lRes.map((sSugg) => { return sSugg.toUpperCase(); });
            lRes = [...new Set(lRes)];
        }
        else if (this.sWord.slice(0,1).gl_isUpperCase()) {
            lRes = lRes.map((sSugg) => { return sSugg.slice(0,1).toUpperCase() + sSugg.slice(1); });
            lRes = [...new Set(lRes)];
        }
        return lRes.slice(0, nSuggLimit);
        return lRes.slice(0, this.nSuggLimit);
    }

    reset () {
        this.aSugg.clear();
        this.dSugg.clear();
        this.dSugg.clear();
        this.dGoodSugg.clear();
        this.dBestSugg.clear();
    }
}


class IBDAWG {
    // INDEXABLE BINARY DIRECT ACYCLIC WORD GRAPH

120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159












160
161
162
163
164














165
166
167

168
169
170




171
172
173
174
175
176
177



178
179
180
181
182
183
184
185
186
187
188
189
190

191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
147
148
149
150
151
152
153














154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184





185
186
187
188
189
190
191
192
193
194
195
196
197
198



199



200
201
202
203







204
205
206













207
208
209
210
211
212
213
214
215
216
217

218
219
220
221
222
223
224







-
-
-
-
-
-
-
-
-
-
-
-
-
-



















+
+
+
+
+
+
+
+
+
+
+
+
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
-
-
-
+
-
-
-
+
+
+
+
-
-
-
-
-
-
-
+
+
+
-
-
-
-
-
-
-
-
-
-
-
-
-
+










-







        }
        /*
            Properties:
            sName, nCompressionMethod, sHeader, lArcVal, nArcVal, sByDic, sLang, nChar, nBytesArc, nBytesNodeAddress,
            nEntry, nNode, nArc, nAff, cStemming, nTag, dChar, nBytesOffset,
        */

        /*
            Bug workaround.
            Mozilla’s JS parser sucks. Can’t read file bigger than 4 Mb!
            So we convert huge hexadecimal string to list of numbers…
            https://github.com/mozilla/addons-linter/issues/1361
        */
        let lTemp = [];
        for (let i = 0;  i < this.sByDic.length;  i+=2) {
            lTemp.push(parseInt(this.sByDic.slice(i, i+2), 16));
        }
        this.byDic = lTemp;
        //this.byDic = new Uint8Array(lTemp);  // not quicker, even slower
        /* end of bug workaround */

        if (!(this.sHeader.startsWith("/grammalecte-fsa/") || this.sHeader.startsWith("/pyfsa/"))) {
            throw TypeError("# Error. Not a grammalecte-fsa binary dictionary. Header: " + this.sHeader);
        }
        if (!(this.nCompressionMethod == 1 || this.nCompressionMethod == 2 || this.nCompressionMethod == 3)) {
            throw RangeError("# Error. Unknown dictionary compression method: " + this.nCompressionMethod);
        }
        // <dChar> to get the value of an arc, <dCharVal> to get the char of an arc with its value
        this.dChar = helpers.objectToMap(this.dChar);
        this.dCharVal = this.dChar.gl_reverse();
        this.a2grams = (this.l2grams) ? new Set(this.l2grams) : null;

        if (this.cStemming == "S") {
            this.funcStemming = str_transform.changeWordWithSuffixCode;
        } else if (this.cStemming == "A") {
            this.funcStemming = str_transform.changeWordWithAffixCode;
        } else {
            this.funcStemming = str_transform.noStemming;
        }

        /*
            Bug workaround.
            Mozilla’s JS parser sucks. Can’t read file bigger than 4 Mb!
            So we convert huge hexadecimal string to list of numbers…
            https://github.com/mozilla/addons-linter/issues/1361
        */
        /*
            Performance trick:
            Instead of converting bytes to integers each times we parse the binary dictionary,
            we do it once, then parse the array
        */
        let nAcc = 0;
        this._arcMask = (2 ** ((this.nBytesArc * 8) - 3)) - 1;
        this._finalNodeMask = 1 << ((this.nBytesArc * 8) - 1);
        this._lastArcMask = 1 << ((this.nBytesArc * 8) - 2);


        let lBytesBuffer = [];
        let lTemp = [];
        let nDivisor = (this.nBytesArc + this.nBytesNodeAddress) / 2;
        for (let i = 0;  i < this.sByDic.length;  i+=2) {
            lBytesBuffer.push(parseInt(this.sByDic.slice(i, i+2), 16));
            if (nAcc == (this.nBytesArc - 1)) {
                lTemp.push(this._convBytesToInteger(lBytesBuffer));
                lBytesBuffer = [];
            }
            else if (nAcc == (this.nBytesArc + this.nBytesNodeAddress - 1)) {
                lTemp.push(Math.round(this._convBytesToInteger(lBytesBuffer) / nDivisor));  // Math.round should be useless, BUT with JS who knowns what can happen…
                lBytesBuffer = [];
                nAcc = -1;
            }
        // Configuring DAWG functions according to nCompressionMethod
        switch (this.nCompressionMethod) {
            case 1:
            nAcc = nAcc + 1;
                this.morph = this._morph1;
                this.stem = this._stem1;
                this._lookupArcNode = this._lookupArcNode1;
        }
        this.byDic = lTemp;
        /* end of bug workaround */

                this._getArcs = this._getArcs1;
                this._writeNodes = this._writeNodes1;
                break;
            case 2:
                this.morph = this._morph2;
                this.stem = this._stem2;
                this._lookupArcNode = this._lookupArcNode2;
        this._arcMask = (2 ** ((this.nBytesArc * 8) - 3)) - 1;
        this._finalNodeMask = 1 << ((this.nBytesArc * 8) - 1);
        this._lastArcMask = 1 << ((this.nBytesArc * 8) - 2);
                this._getArcs = this._getArcs2;
                this._writeNodes = this._writeNodes2;
                break;
            case 3:
                this.morph = this._morph3;
                this.stem = this._stem3;
                this._lookupArcNode = this._lookupArcNode3;
                this._getArcs = this._getArcs3;
                this._writeNodes = this._writeNodes3;
                break;
            default:
                throw ValueError("# Error: unknown code: " + this.nCompressionMethod);
        }

        //console.log(this.getInfo());
        this.bAcronymValid = true;
        this.bNumAtLastValid = false;

        // lexicographer module ?
        this.lexicographer = null;
        // JS still sucks: we’ll try importation when importation will be available in Workers. Still waiting...
        if (self && self.hasOwnProperty("lexgraph_"+this.sLangCode)) { // self is the Worker
            this.lexicographer = self["lexgraph_"+this.sLangCode];
        }

    }

    getInfo () {
        return  `  Language: ${this.sLangName}   Lang code: ${this.sLangCode}   Dictionary name: ${this.sDicName}\n` +
                `  Compression method: ${this.nCompressionMethod}   Date: ${this.sDate}   Stemming: ${this.cStemming}FX\n` +
                `  Arcs values:  ${this.nArcVal} = ${this.nChar} characters,  ${this.nAff} affixes,  ${this.nTag} tags\n` +
                `  Dictionary: ${this.nEntry} entries,    ${this.nNode} nodes,   ${this.nArc} arcs\n` +
304
305
306
307
308
309
310
311

312
313
314
315
316
317
318
319
320

321
322

323
324

325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343


344
345
346
347
348
349

350
351
352
353
354
355
356
320
321
322
323
324
325
326

327
328
329
330
331
332
333
334
335

336
337

338
339

340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358

359
360
361
362
363
364
365

366
367
368
369
370
371
372
373







-
+








-
+

-
+

-
+


















-
+
+





-
+







                return false;
            }
            iAddr = this._lookupArcNode(this.dChar.get(c), iAddr);
            if (iAddr === null) {
                return false;
            }
        }
        return Boolean(this._convBytesToInteger(this.byDic.slice(iAddr, iAddr+this.nBytesArc)) & this._finalNodeMask);
        return Boolean(this.byDic[iAddr] & this._finalNodeMask);
    }

    getMorph (sWord) {
        // retrieves morphologies list, different casing allowed
        if (!sWord) {
            return [];
        }
        sWord = str_transform.spellingNormalization(sWord);
        let l = this.morph(sWord);
        let l = this._morph(sWord);
        if (sWord[0].gl_isUpperCase()) {
            l.push(...this.morph(sWord.toLowerCase()));
            l.push(...this._morph(sWord.toLowerCase()));
            if (sWord.gl_isUpperCase() && sWord.length > 1) {
                l.push(...this.morph(sWord.gl_toCapitalize()));
                l.push(...this._morph(sWord.gl_toCapitalize()));
            }
        }
        return l;
    }

    suggest (sWord, nSuggLimit=10, bSplitTrailingNumbers=false) {
        // returns a array of suggestions for <sWord>
        //console.time("Suggestions for " + sWord);
        sWord = str_transform.spellingNormalization(sWord);
        let sPfx = "";
        let sSfx = "";
        if (this.lexicographer) {
            [sPfx, sWord, sSfx] = this.lexicographer.split(sWord);
        }
        let nMaxSwitch = Math.max(Math.floor(sWord.length / 3), 1);
        let nMaxDel = Math.floor(sWord.length / 5);
        let nMaxHardRepl = Math.max(Math.floor((sWord.length - 5) / 4), 1);
        let nMaxJump = Math.max(Math.floor(sWord.length / 4), 1);
        let oSuggResult = new SuggResult(sWord);
        let oSuggResult = new SuggResult(sWord, nSuggLimit);
        sWord = str_transform.cleanWord(sWord);
        if (bSplitTrailingNumbers) {
            this._splitTrailingNumbers(oSuggResult, sWord);
        }
        this._splitSuggest(oSuggResult, sWord);
        this._suggest(oSuggResult, sWord, nMaxSwitch, nMaxDel, nMaxHardRepl, nMaxJump);
        let aSugg = oSuggResult.getSuggestions(nSuggLimit);
        let aSugg = oSuggResult.getSuggestions();
        if (this.lexicographer) {
            aSugg = this.lexicographer.filterSugg(aSugg);
        }
        if (sSfx || sPfx) {
            // we add what we removed
            return aSugg.map( (sSugg) => { return sPfx + sSugg + sSfx; } );
        }
376
377
378
379
380
381
382
383

384
385
386
387
388
389
390
393
394
395
396
397
398
399

400
401
402
403
404
405
406
407







-
+







            }
        }
    }

    _suggest (oSuggResult, sRemain, nMaxSwitch=0, nMaxDel=0, nMaxHardRepl=0, nMaxJump=0, nDist=0, nDeep=0, iAddr=0, sNewWord="", bAvoidLoop=false) {
        // returns a set of suggestions
        // recursive function
        if (this._convBytesToInteger(this.byDic.slice(iAddr, iAddr+this.nBytesArc)) & this._finalNodeMask) {
        if (this.byDic[iAddr] & this._finalNodeMask) {
            if (sRemain == "") {
                oSuggResult.addSugg(sNewWord);
                for (let sTail of this._getTails(iAddr)) {
                    oSuggResult.addSugg(sNewWord+sTail);
                }
                return;
            }
486
487
488
489
490
491
492
493

494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
503
504
505
506
507
508
509

510
511
512
513
514
515
516
517
518
519
520




521
522
523
524
525
526
527







-
+










-
-
-
-







    }

    _getTails (iAddr, sTail="", n=2) {
        // return a list of suffixes ending at a distance of <n> from <iAddr>
        let aTails = new Set();
        for (let [nVal, jAddr] of this._getArcs(iAddr)) {
            if (nVal <= this.nChar) {
                if (this._convBytesToInteger(this.byDic.slice(jAddr, jAddr+this.nBytesArc)) & this._finalNodeMask) {
                if (this.byDic[jAddr] & this._finalNodeMask) {
                    aTails.add(sTail + this.dCharVal.get(nVal));
                }
                if (n && aTails.size == 0) {
                    aTails.gl_update(this._getTails(jAddr, sTail+this.dCharVal.get(nVal), n-1));
                }
            }
        }
        return aTails;
    }

    // morph (sWord) {
    //     is defined in constructor
    // }

    getSimilarEntries (sWord, nSuggLimit=10) {
        // return a list of tuples (similar word, stem, morphology)
        if (sWord == "") {
            return [];
        }
        let lResult = [];
        for (let sSimilar of this.suggest(sWord, nSuggLimit)) {
528
529
530
531
532
533
534
535

536
537
538
539
540

541
542

543
544
545

546
547
548
549

550
551
552
553
554
555
556
557
558
559

560
561
562
563
564
565
566
567
568
569
570
571

572
573
574
575
576


577
578
579
580
581
582

583
584
585
586


587
588

589
590
591

592
593
594
595
596
597
598

599
600
601
602
603
604
605
606
607
608
609
610

611
612
613
614
615


616
617
618
619
620
621

622
623
624
625
626
627
628

629
630
631
632


633
634
635
636

637
638
639
640
641
642
643

644
645
646
647
648

649
650
651
652
653



654
655
656
657

658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
541
542
543
544
545
546
547

548
549
550



551
552

553
554
555

556
557
558
559

560
561
562
563
564
565
566
567
568
569

570
571
572
573
574
575
576
577
578
579
580
581

582
583
584
585


586
587
588
589
590
591
592

593
594
595


596
597
598

599
600
601

602
603
604
605
606
607
608

609
610
611
612
613
614
615
616
617
618
619
620

621
622
623
624


625
626
627
628
629
630
631

632
633
634
635
636
637
638

639
640
641


642
643
644
645
646

647
648
649
650
651
652
653

654
655
656
657
658

659
660
661



662
663
664
665
666
667

668
669



























670
671
672
673
674
675
676







-
+


-
-
-
+

-
+


-
+



-
+









-
+











-
+



-
-
+
+





-
+


-
-
+
+

-
+


-
+






-
+











-
+



-
-
+
+





-
+






-
+


-
-
+
+



-
+






-
+




-
+


-
-
-
+
+
+



-
+

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-







            zFlexPattern = (sFlexPattern !== "") ? new RegExp(sFlexPattern) : null;
            zTagsPattern = (sTagsPattern !== "") ? new RegExp(sTagsPattern) : null;
        }
        catch (e) {
            console.log("Error in regex pattern");
            console.log(e.message);
        }
        yield* this._select1(zFlexPattern, zTagsPattern, 0, "");
        yield* this._select(zFlexPattern, zTagsPattern, 0, "");
    }

    // VERSION 1

    * _select1 (zFlexPattern, zTagsPattern, iAddr, sWord) {
    * _select (zFlexPattern, zTagsPattern, iAddr, sWord) {
        // recursive generator
        for (let [nVal, jAddr] of this._getArcs1(iAddr)) {
        for (let [nVal, jAddr] of this._getArcs(iAddr)) {
            if (nVal <= this.nChar) {
                // simple character
                yield* this._select1(zFlexPattern, zTagsPattern, jAddr, sWord + this.lArcVal[nVal]);
                yield* this._select(zFlexPattern, zTagsPattern, jAddr, sWord + this.lArcVal[nVal]);
            } else {
                if (!zFlexPattern || zFlexPattern.test(sWord)) {
                    let sStem = this.funcStemming(sWord, this.lArcVal[nVal]);
                    for (let [nMorphVal, _] of this._getArcs1(jAddr)) {
                    for (let [nMorphVal, _] of this._getArcs(jAddr)) {
                        if (!zTagsPattern || zTagsPattern.test(this.lArcVal[nMorphVal])) {
                            yield [sWord, sStem, this.lArcVal[nMorphVal]];
                        }
                    }
                }
            }
        }
    }

    _morph1 (sWord) {
    _morph (sWord) {
        // returns morphologies of sWord
        let iAddr = 0;
        for (let c of sWord) {
            if (!this.dChar.has(c)) {
                return [];
            }
            iAddr = this._lookupArcNode(this.dChar.get(c), iAddr);
            if (iAddr === null) {
                return [];
            }
        }
        if (this._convBytesToInteger(this.byDic.slice(iAddr, iAddr+this.nBytesArc)) & this._finalNodeMask) {
        if (this.byDic[iAddr] & this._finalNodeMask) {
            let l = [];
            let nRawArc = 0;
            while (!(nRawArc & this._lastArcMask)) {
                let iEndArcAddr = iAddr + this.nBytesArc;
                nRawArc = this._convBytesToInteger(this.byDic.slice(iAddr, iEndArcAddr));
                let iEndArcAddr = iAddr + 1;
                nRawArc = this.byDic[iAddr];
                let nArc = nRawArc & this._arcMask;
                if (nArc > this.nChar) {
                    // This value is not a char, this is a stemming code
                    let sStem = ">" + this.funcStemming(sWord, this.lArcVal[nArc]);
                    // Now , we go to the next node and retrieve all following arcs values, all of them are tags
                    let iAddr2 = this._convBytesToInteger(this.byDic.slice(iEndArcAddr, iEndArcAddr+this.nBytesNodeAddress));
                    let iAddr2 = this.byDic[iEndArcAddr];
                    let nRawArc2 = 0;
                    while (!(nRawArc2 & this._lastArcMask)) {
                        let iEndArcAddr2 = iAddr2 + this.nBytesArc;
                        nRawArc2 = this._convBytesToInteger(this.byDic.slice(iAddr2, iEndArcAddr2));
                        let iEndArcAddr2 = iAddr2 + 1;
                        nRawArc2 = this.byDic[iAddr2];
                        l.push(sStem + "/" + this.lArcVal[nRawArc2 & this._arcMask]);
                        iAddr2 = iEndArcAddr2+this.nBytesNodeAddress;
                        iAddr2 = iEndArcAddr2 + 1;
                    }
                }
                iAddr = iEndArcAddr + this.nBytesNodeAddress;
                iAddr = iEndArcAddr + 1;
            }
            return l;
        }
        return [];
    }

    _stem1 (sWord) {
    _stem (sWord) {
        // returns stems list of sWord
        let iAddr = 0;
        for (let c of sWord) {
            if (!this.dChar.has(c)) {
                return [];
            }
            iAddr = this._lookupArcNode(this.dChar.get(c), iAddr);
            if (iAddr === null) {
                return [];
            }
        }
        if (this._convBytesToInteger(this.byDic.slice(iAddr, iAddr+this.nBytesArc)) & this._finalNodeMask) {
        if (this.byDic[iAddr] & this._finalNodeMask) {
            let l = [];
            let nRawArc = 0;
            while (!(nRawArc & this._lastArcMask)) {
                let iEndArcAddr = iAddr + this.nBytesArc;
                nRawArc = this._convBytesToInteger(this.byDic.slice(iAddr, iEndArcAddr));
                let iEndArcAddr = iAddr + 1;
                nRawArc = this.byDic[iAddr];
                let nArc = nRawArc & this._arcMask;
                if (nArc > this.nChar) {
                    // This value is not a char, this is a stemming code
                    l.push(this.funcStemming(sWord, this.lArcVal[nArc]));
                }
                iAddr = iEndArcAddr + this.nBytesNodeAddress;
                iAddr = iEndArcAddr + 1;
            }
            return l;
        }
        return [];
    }

    _lookupArcNode1 (nVal, iAddr) {
    _lookupArcNode (nVal, iAddr) {
        // looks if nVal is an arc at the node at iAddr, if yes, returns address of next node else None
        while (true) {
            let iEndArcAddr = iAddr+this.nBytesArc;
            let nRawArc = this._convBytesToInteger(this.byDic.slice(iAddr, iEndArcAddr));
            let iEndArcAddr = iAddr+1;
            let nRawArc = this.byDic[iAddr];
            if (nVal == (nRawArc & this._arcMask)) {
                // the value we are looking for
                // we return the address of the next node
                return this._convBytesToInteger(this.byDic.slice(iEndArcAddr, iEndArcAddr+this.nBytesNodeAddress));
                return this.byDic[iEndArcAddr];
            }
            else {
                // value not found
                if (nRawArc & this._lastArcMask) {
                    return null;
                }
                iAddr = iEndArcAddr + this.nBytesNodeAddress;
                iAddr = iEndArcAddr + 1;
            }
        }
    }

    * _getArcs1 (iAddr) {
    * _getArcs (iAddr) {
        // generator: return all arcs at <iAddr> as tuples of (nVal, iAddr)
        while (true) {
            let iEndArcAddr = iAddr+this.nBytesArc;
            let nRawArc = this._convBytesToInteger(this.byDic.slice(iAddr, iEndArcAddr));
            yield [nRawArc & this._arcMask, this._convBytesToInteger(this.byDic.slice(iEndArcAddr, iEndArcAddr+this.nBytesNodeAddress))];
            let iEndArcAddr = iAddr+1;
            let nRawArc = this.byDic[iAddr];
            yield [nRawArc & this._arcMask, this.byDic[iEndArcAddr]];
            if (nRawArc & this._lastArcMask) {
                break;
            }
            iAddr = iEndArcAddr+this.nBytesNodeAddress;
            iAddr = iEndArcAddr+1;
        }
    }

    // VERSION 2
    _morph2 (sWord) {
        // to do
    }

    _stem2 (sWord) {
        // to do
    }

    _lookupArcNode2 (nVal, iAddr) {
        // to do
    }


    // VERSION 3
    _morph3 (sWord) {
        // to do
    }

    _stem3 (sWord) {
        // to do
    }

    _lookupArcNode3 (nVal, iAddr) {
        // to do
    }
}


if (typeof(exports) !== 'undefined') {
    exports.IBDAWG = IBDAWG;
}

Modified graphspell-js/str_transform.js from [4c4ee6009e] to [3bf8e3b480].

1
2
3
4
5
6
7
8
9
10
11
12
13

14
15
16
17
18
19
20
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21













+







// STRING TRANSFORMATION

/* jshint esversion:6, -W097 */
/* jslint esversion:6 */
/* global exports, console */

"use strict";


if (typeof(process) !== 'undefined') {
    var char_player = require("./char_player.js");
}



// Note: 48 is the ASCII code for "0"

var str_transform = {

    getNgrams: function (sWord, n=2) {
        let lNgrams = [];
61
62
63
64
65
66
67






68
69
70
71
72
73
74
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81







+
+
+
+
+
+







            if (c != sWord.slice(i, i+1) || (c == 'e' && sWord.slice(i, i+2) != "ee")) {  // exception for <e> to avoid confusion between crée / créai
                sNewWord += c;
            }
            i++;
        }
        return sNewWord.replace(/eau/g, "o").replace(/au/g, "o").replace(/ai/g, "éi").replace(/ei/g, "é").replace(/ph/g, "f");
    },

    cleanWord: function (sWord) {
        // word clean for the user who make commun and preditive error help suggest
        // remove letters repeated more than 2 times
        return sWord.replace(/(.)\1{2,}/ig,'$1$1');
    },

    _xTransNumbersToExponent: new Map([
        ["0", "⁰"], ["1", "¹"], ["2", "²"], ["3", "³"], ["4", "⁴"], ["5", "⁵"], ["6", "⁶"], ["7", "⁷"], ["8", "⁸"], ["9", "⁹"]
    ]),

    numbersToExponent: function (sWord) {
        let sNewWord = "";
148
149
150
151
152
153
154



























































































155
156
157
158
159
160
161
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259







+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+







            }
            return matrix[nLen1][nLen2];
        }
        catch (e) {
            console.error(e);
        }
    },

    distanceJaroWinkler: function(a, b, boost = .666) {
        // https://github.com/thsig/jaro-winkler-JS
        //if (a == b) { return 1.0; }
        let a_len = a.length;
        let b_len = b.length;
        let a_flag = [];
        let b_flag = [];
        let search_range = Math.floor(Math.max(a_len, b_len) / 2) - 1;
        let minv = Math.min(a_len, b_len);

        // Looking only within the search range, count and flag the matched pairs.
        let Num_com = 0;
        let yl1 = b_len - 1;
        for (let i = 0; i < a_len; i++) {
          let lowlim = (i >= search_range) ? i - search_range : 0;
          let hilim  = ((i + search_range) <= yl1) ? (i + search_range) : yl1;
          for (let j = lowlim; j <= hilim; j++) {
            if (b_flag[j] !== 1 && a[j] === b[i]) {
              a_flag[j] = 1;
              b_flag[i] = 1;
              Num_com++;
              break;
            }
          }
        }

        // Return if no characters in common
        if (Num_com === 0) { return 0.0; }

        // Count the number of transpositions
        let k = 0;
        let N_trans = 0;
        for (let i = 0; i < a_len; i++) {
          if (a_flag[i] === 1) {
            let j;
            for (j = k; j < b_len; j++) {
              if (b_flag[j] === 1) {
                k = j + 1;
                break;
              }
            }
            if (a[i] !== b[j]) { N_trans++; }
          }
        }
        N_trans = Math.floor(N_trans / 2);

        // Adjust for similarities in nonmatched characters
        let N_simi = 0;
        let adjwt = char_player.oDistanceBetweenChars;
        if (minv > Num_com) {
          for (let i = 0; i < a_len; i++) {
            if (!a_flag[i]) {
              for (let j = 0; j < b_len; j++) {
                if (!b_flag[j]) {
                  if (adjwt[a[i]] && adjwt[a[i]][b[j]]) {
                    N_simi += adjwt[a[i]][b[j]];
                    b_flag[j] = 2;
                    break;
                  }
                }
              }
            }
          }
        }

        let Num_sim = (N_simi / 10.0) + Num_com;

        // Main weight computation
        let weight = Num_sim / a_len + Num_sim / b_len + (Num_com - N_trans) / Num_com;
        weight = weight / 3;

        // Continue to boost the weight if the strings are similar
        if (weight > boost) {
          // Adjust for having up to the first 4 characters in common
          let j = (minv >= 4) ? 4 : minv;
          let i;
          for (i = 0; (i < j) && a[i] === b[i]; i++) { }
          if (i) { weight += i * 0.1 * (1.0 - weight) };

          // Adjust for long strings.
          // After agreeing beginning chars, at least two more must agree
          // and the agreeing characters must be more than half of the
          // remaining characters.
          if (minv > 4 && Num_com > i + 1 && 2 * Num_com >= minv + i) {
            weight += (1 - weight) * ((Num_com - i - 1) / (a_len * b_len - i*2 + 2));
          }
        }

        return weight;
    },

    showDistance (s1, s2) {
        console.log(`Distance: ${s1} / ${s2} = ${this.distanceDamerauLevenshtein(s1, s2)})`);
    },

    // Suffix only
    defineSuffixCode: function (sFlex, sStem) {

Modified graphspell/char_player.py from [e2b351100d] to [24cf38a4cb].

1
2
3
4
5
6
7



8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34



























35
36
37
38
39
40
41
1
2
3
4
5
6
7
8
9
10



























11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44







+
+
+
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+







"""
List of similar chars
useful for suggestion mechanism
"""


dDistanceBetweenChars = {
    # dDistanceBetweenChars:
    # - with Jaro-Winkler, values between 1 and 10
    # - with Damerau-Levenshtein, values / 10 (between 0 and 1: 0.1, 0.2 ... 0.9)
    "a": {},
    "e": {"é": 0.5},
    "é": {"e": 0.5},
    "i": {"y": 0.2},
    "o": {},
    "u": {},
    "y": {"i": 0.3},
    "b": {"d": 0.8, "h": 0.9},
    "c": {"ç": 0.1, "k": 0.5, "q": 0.5, "s": 0.5, "x": 0.5, "z": 0.8},
    "d": {"b": 0.8},
    "f": {"v": 0.8},
    "g": {"j": 0.5},
    "h": {"b": 0.9},
    "j": {"g": 0.5, "i": 0.9},
    "k": {"c": 0.5, "q": 0.1, "x": 0.5},
    "l": {"i": 0.9},
    "m": {"n": 0.8},
    "n": {"m": 0.8, "r": 0.9},
    "p": {"q": 0.9},
    "q": {"c": 0.5, "k": 0.1, "p": 0.9},
    "r": {"n": 0.9, "j": 0.9},
    "s": {"c": 0.5, "ç": 0.1, "x": 0.5, "z": 0.5},
    "t": {"d": 0.9},
    "v": {"f": 0.8, "w": 0.1},
    "w": {"v": 0.1},
    "x": {"c": 0.5, "k": 0.5, "q": 0.5, "s": 0.5},
    "z": {"s": 0.5}
    #"a": {},
    "e": {"é": 5},
    "é": {"e": 5},
    "i": {"y": 2},
    #"o": {},
    #"u": {},
    "y": {"i": 3},
    "b": {"d": 8, "h": 9},
    "c": {"ç": 1, "k": 5, "q": 5, "s": 5, "x": 5, "z": 8},
    "d": {"b": 8},
    "f": {"v": 8},
    "g": {"j": 5},
    "h": {"b": 9},
    "j": {"g": 5, "i": 9},
    "k": {"c": 5, "q": 1, "x": 5},
    "l": {"i": 9},
    "m": {"n": 8},
    "n": {"m": 8, "r": 9},
    "p": {"q": 9},
    "q": {"c": 5, "k": 1, "p": 9},
    "r": {"n": 9, "j": 9},
    "s": {"c": 5, "ç": 1, "x": 5, "z": 5},
    "t": {"d": 9},
    "v": {"f": 8, "w": 1},
    "w": {"v": 1},
    "x": {"c": 5, "k": 5, "q": 5, "s": 5},
    "z": {"s": 5}
}


def distanceBetweenChars (c1, c2):
    "returns a float between 0 and 1"
    if c1 == c2:
        return 0
313
314
315
316
317
318
319
320
321


322
323
324
325
326
327
328
329
330
331
332
333
334
335


336
337
338
339
340
341
342
316
317
318
319
320
321
322


323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347







-
-
+
+














+
+









# End of word

dFinal1 = {
    "a": ("as", "at", "ant", "ah"),
    "A": ("AS", "AT", "ANT", "AH"),
    "c": ("ch",),
    "C": ("CH",),
    "c": ("ch", "que"),
    "C": ("CH", "QUE"),
    "e": ("et", "er", "ets", "ée", "ez", "ai", "ais", "ait", "ent", "eh"),
    "E": ("ET", "ER", "ETS", "ÉE", "EZ", "AI", "AIS", "AIT", "ENT", "EH"),
    "é": ("et", "er", "ets", "ée", "ez", "ai", "ais", "ait"),
    "É": ("ET", "ER", "ETS", "ÉE", "EZ", "AI", "AIS", "AIT"),
    "è": ("et", "er", "ets", "ée", "ez", "ai", "ais", "ait"),
    "È": ("ET", "ER", "ETS", "ÉE", "EZ", "AI", "AIS", "AIT"),
    "ê": ("et", "er", "ets", "ée", "ez", "ai", "ais", "ait"),
    "Ê": ("ET", "ER", "ETS", "ÉE", "EZ", "AI", "AIS", "AIT"),
    "ë": ("et", "er", "ets", "ée", "ez", "ai", "ais", "ait"),
    "Ë": ("ET", "ER", "ETS", "ÉE", "EZ", "AI", "AIS", "AIT"),
    "g": ("gh",),
    "G": ("GH",),
    "i": ("is", "it", "ie", "in"),
    "I": ("IS", "IT", "IE", "IN"),
    "k": ("que",),
    "K": ("QUE",),
    "n": ("nt", "nd", "ns", "nh"),
    "N": ("NT", "ND", "NS", "NH"),
    "o": ("aut", "ot", "os"),
    "O": ("AUT", "OT", "OS"),
    "ô": ("aut", "ot", "os"),
    "Ô": ("AUT", "OT", "OS"),
    "ö": ("aut", "ot", "os"),

Modified graphspell/ibdawg.py from [d16ed0d683] to [d672255b46].

11
12
13
14
15
16
17

18
19
20
21
22
23
24
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25







+







import re
from functools import wraps
import time
import json
import binascii
import importlib
from collections import OrderedDict
from math import floor

#import logging
#logging.basicConfig(filename="suggestions.log", level=logging.DEBUG)

from . import str_transform as st
from . import char_player as cp
from .echo import echo
36
37
38
39
40
41
42
43

44
45
46
47

48
49
50








51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71


















72
73

74
75
76
77
78
79
80
81
82
83



84
85

86
87
88
89







90
91
92
93
94
95

96
97
98
99
100
101
102
37
38
39
40
41
42
43

44
45
46
47
48
49



50
51
52
53
54
55
56
57
58
59
60
61
62
63
64














65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83

84
85
86
87







88
89
90


91




92
93
94
95
96
97
98
99
100
101
102
103

104
105
106
107
108
109
110
111







-
+




+
-
-
-
+
+
+
+
+
+
+
+







-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+

-
+



-
-
-
-
-
-
-
+
+
+
-
-
+
-
-
-
-
+
+
+
+
+
+
+





-
+







        return result
    return wrapper


class SuggResult:
    """Structure for storing, classifying and filtering suggestions"""

    def __init__ (self, sWord, nDistLimit=-1):
    def __init__ (self, sWord, nSuggLimit=10, nDistLimit=-1):
        self.sWord = sWord
        self.sSimplifiedWord = st.simplifyWord(sWord)
        self.nDistLimit = nDistLimit  if nDistLimit >= 0  else  (len(sWord) // 3) + 1
        self.nMinDist = 1000
        # Temporary sets
        self.aSugg = set()
        self.dSugg = { 0: [],  1: [],  2: [] }
        self.aAllSugg = set()       # all found words even those refused
        self.aAllSugg = set()   # All suggestions, even the one rejected
        self.dGoodSugg = {}     # Acceptable suggestions
        self.dBestSugg = {}     # Best suggestions
        # Parameters
        self.nSuggLimit = nSuggLimit
        self.nSuggLimitExt = nSuggLimit + 2             # we add few entries in case suggestions merge after casing modifications
        self.nBestSuggLimit = floor(nSuggLimit * 1.5)   # n times the requested limit
        self.nGoodSuggLimit = nSuggLimit * 15           # n times the requested limit

    def addSugg (self, sSugg, nDeep=0):
        "add a suggestion"
        #logging.info((nDeep * "  ") + "__" + sSugg + "__")
        if sSugg in self.aAllSugg:
            return
        self.aAllSugg.add(sSugg)
        if sSugg not in self.aSugg:
            #nDist = min(st.distanceDamerauLevenshtein(self.sWord, sSugg), st.distanceDamerauLevenshtein(self.sSimplifiedWord, st.simplifyWord(sSugg)))
            nDist = int(st.distanceDamerauLevenshtein(self.sSimplifiedWord, st.simplifyWord(sSugg)))
            #logging.info((nDeep * "  ") + "__" + sSugg + "__ :" + self.sSimplifiedWord +"|"+ st.simplifyWord(sSugg) +" -> "+ str(nDist))
            if nDist <= self.nDistLimit:
                if " " in sSugg:
                    nDist += 1
                if nDist not in self.dSugg:
                    self.dSugg[nDist] = []
                self.dSugg[nDist].append(sSugg)
                self.aSugg.add(sSugg)
                if nDist < self.nMinDist:
                    self.nMinDist = nDist
                self.nDistLimit = min(self.nDistLimit, self.nMinDist+1)
        nDistJaro = 1 - st.distanceJaroWinkler(self.sSimplifiedWord, st.simplifyWord(sSugg))
        nDist = floor(nDistJaro * 10)
        if nDistJaro < .11:     # Best suggestions
            self.dBestSugg[sSugg] = round(nDistJaro*1000)
            if len(self.dBestSugg) > self.nBestSuggLimit:
                self.nDistLimit = -1  # make suggest() to end search
        elif nDistJaro < .33:   # Good suggestions
            self.dGoodSugg[sSugg] = round(nDistJaro*1000)
            if len(self.dGoodSugg) > self.nGoodSuggLimit:
                self.nDistLimit = -1  # make suggest() to end search
        else:
            if nDist < self.nMinDist:
                self.nMinDist = nDist
            self.nDistLimit = min(self.nDistLimit, self.nMinDist)
        if nDist <= self.nDistLimit:
            if nDist < self.nMinDist:
                self.nMinDist = nDist
            self.nDistLimit = min(self.nDistLimit, self.nMinDist+1)

    def getSuggestions (self, nSuggLimit=10):
    def getSuggestions (self):
        "return a list of suggestions"
        # we sort the better results with the original word
        lRes = []
        bFirstListSorted = False
        for nDist, lSugg in self.dSugg.items():
            if nDist > self.nDistLimit:
                break
            if not bFirstListSorted and len(lSugg) > 1:
                lSugg.sort(key=lambda sSugg: st.distanceDamerauLevenshtein(self.sWord, sSugg))
                bFirstListSorted = True
        if len(self.dBestSugg) > 0:
            # sort only with simplified words
            lResTmp = sorted(self.dBestSugg.items(), key=lambda x: x[1])
            #print(nDist, "|".join(lSugg))
            #for sSugg in lSugg:
            for i in range(min(self.nSuggLimitExt, len(lResTmp))):
            #    print(sSugg, st.distanceDamerauLevenshtein(self.sWord, sSugg))
            lRes.extend(lSugg)
            if len(lRes) > nSuggLimit:
                break
                lRes.append(lResTmp[i][0])
        if len(lRes) < self.nSuggLimitExt:
            # sort with simplified words and original word
            lResTmp = sorted(self.dGoodSugg.items(), key=lambda x: ((1-st.distanceJaroWinkler(self.sWord, x[0]))*10, x[1]))
            for i in range(min(self.nSuggLimitExt, len(lResTmp))):
                lRes.append(lResTmp[i][0])
        # casing
        if self.sWord.isupper():
            lRes = list(OrderedDict.fromkeys(map(lambda sSugg: sSugg.upper(), lRes))) # use dict, when Python 3.6+
        elif self.sWord[0:1].isupper():
            # dont’ use <.istitle>
            lRes = list(OrderedDict.fromkeys(map(lambda sSugg: sSugg[0:1].upper()+sSugg[1:], lRes))) # use dict, when Python 3.6+
        return lRes[:nSuggLimit]
        return lRes[:self.nSuggLimit]

    def reset (self):
        "clear data"
        self.aSugg.clear()
        self.dSugg.clear()


116
117
118
119
120
121
122




















123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154

155
156
157
158
159
160
161
162
163






















164
165
166
167
168
169
170







+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+



-









-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-







            else:
                raise OSError("# Error. Unknown file type: "+source)
        else:
            self._initJSON(source)

        self.sFileName = source  if isinstance(source, str)  else "[None]"

        # Performance trick:
        #     Instead of converting bytes to integers each times we parse the binary dictionary,
        #     we do it once, then parse the array
        nAcc = 0
        byBuffer = b""
        lTemp = []
        nDivisor = (self.nBytesArc + self.nBytesNodeAddress) / 2
        for i in range(0, len(self.byDic)):
            byBuffer += self.byDic[i:i+1]
            if nAcc == (self.nBytesArc - 1):
                lTemp.append(int.from_bytes(byBuffer, byteorder="big"))
                byBuffer = b""
            elif nAcc == (self.nBytesArc + self.nBytesNodeAddress - 1):
                lTemp.append(round(int.from_bytes(byBuffer, byteorder="big") / nDivisor))
                byBuffer = b""
                nAcc = -1
            nAcc = nAcc + 1
        self.byDic = lTemp;

        # masks
        self._arcMask = (2 ** ((self.nBytesArc * 8) - 3)) - 1
        self._finalNodeMask = 1 << ((self.nBytesArc * 8) - 1)
        self._lastArcMask = 1 << ((self.nBytesArc * 8) - 2)
        self._addrBitMask = 1 << ((self.nBytesArc * 8) - 3)  # version 2

        # function to decode the affix/suffix code
        if self.cStemming == "S":
            self.funcStemming = st.changeWordWithSuffixCode
        elif self.cStemming == "A":
            self.funcStemming = st.changeWordWithAffixCode
        else:
            self.funcStemming = st.noStemming

        # Configuring DAWG functions according to nCompressionMethod
        if self.nCompressionMethod == 1:
            self.morph = self._morph1
            self.stem = self._stem1
            self._lookupArcNode = self._lookupArcNode1
            self._getArcs = self._getArcs1
            self._writeNodes = self._writeNodes1
        elif self.nCompressionMethod == 2:
            self.morph = self._morph2
            self.stem = self._stem2
            self._lookupArcNode = self._lookupArcNode2
            self._getArcs = self._getArcs2
            self._writeNodes = self._writeNodes2
        elif self.nCompressionMethod == 3:
            self.morph = self._morph3
            self.stem = self._stem3
            self._lookupArcNode = self._lookupArcNode3
            self._getArcs = self._getArcs3
            self._writeNodes = self._writeNodes3
        else:
            raise ValueError("  # Error: unknown code: {}".format(self.nCompressionMethod))

        self.bAcronymValid = False
        self.bNumAtLastValid = False

        # lexicographer module ?
        self.lexicographer = None
        try:
            self.lexicographer = importlib.import_module(".lexgraph_"+self.sLangCode, "grammalecte.graphspell")
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
206
207
208
209
210
211
212

213
214
215
216
217
218
219







-







        self.cStemming = l.pop(0)
        self.nTag = self.nArcVal - self.nChar - self.nAff
        # <dChar> to get the value of an arc, <dCharVal> to get the char of an arc with its value
        self.dChar = {}
        for i in range(1, self.nChar+1):
            self.dChar[self.lArcVal[i]] = i
        self.dCharVal = { v: k  for k, v in self.dChar.items() }
        self.nBytesOffset = 1 # version 3

    def _initJSON (self, oJSON):
        "initialize with a JSON text file"
        self.sByDic = ""  # init to prevent pylint whining
        self.__dict__.update(oJSON)
        self.byDic = binascii.unhexlify(self.sByDic)
        self.dCharVal = { v: k  for k, v in self.dChar.items() }
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
249
250
251
252
253
254
255

256
257
258
259
260
261
262







-







                "nNode": self.nNode,
                "nArc": self.nArc,
                "nArcVal": self.nArcVal,
                "lArcVal": self.lArcVal,
                "nCompressionMethod": self.nCompressionMethod,
                "nBytesArc": self.nBytesArc,
                "nBytesNodeAddress": self.nBytesNodeAddress,
                "nBytesOffset": self.nBytesOffset,
                # JavaScript is a pile of shit, so Mozilla’s JS parser don’t like file bigger than 4 Mb!
                # So, if necessary, we use an hexadecimal string, that we will convert later in Firefox’s extension.
                # https://github.com/mozilla/addons-linter/issues/1361
                "sByDic": self.byDic.hex()  if bBinaryDictAsHexString  else [ e  for e in self.byDic ],
                "l2grams": list(self.a2grams)
            }, ensure_ascii=False))
            if bInJSModule:
296
297
298
299
300
301
302
303

304
305
306
307
308
309
310

311
312

313
314

315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330


331
332
333
334
335

336
337
338
339
340
341
342
300
301
302
303
304
305
306

307
308
309
310
311
312
313

314
315

316
317

318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333

334
335
336
337
338
339

340
341
342
343
344
345
346
347







-
+






-
+

-
+

-
+















-
+
+




-
+







        iAddr = 0
        for c in sWord:
            if c not in self.dChar:
                return False
            iAddr = self._lookupArcNode(self.dChar[c], iAddr)
            if iAddr is None:
                return False
        return bool(int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask)
        return bool(self.byDic[iAddr] & self._finalNodeMask)

    def getMorph (self, sWord):
        "retrieves morphologies list, different casing allowed"
        if not sWord:
            return []
        sWord = st.spellingNormalization(sWord)
        l = self.morph(sWord)
        l = self._morph(sWord)
        if sWord[0:1].isupper():
            l.extend(self.morph(sWord.lower()))
            l.extend(self._morph(sWord.lower()))
            if sWord.isupper() and len(sWord) > 1:
                l.extend(self.morph(sWord.capitalize()))
                l.extend(self._morph(sWord.capitalize()))
        return l

    #@timethis
    def suggest (self, sWord, nSuggLimit=10, bSplitTrailingNumbers=False):
        "returns a set of suggestions for <sWord>"
        sWord = sWord.rstrip(".")   # useful for LibreOffice
        sWord = st.spellingNormalization(sWord)
        sPfx = ""
        sSfx = ""
        if self.lexicographer:
            sPfx, sWord, sSfx = self.lexicographer.split(sWord)
        nMaxSwitch = max(len(sWord) // 3, 1)
        nMaxDel = len(sWord) // 5
        nMaxHardRepl = max((len(sWord) - 5) // 4, 1)
        nMaxJump = max(len(sWord) // 4, 1)
        oSuggResult = SuggResult(sWord)
        oSuggResult = SuggResult(sWord, nSuggLimit)
        sWord = st.cleanWord(sWord)
        if bSplitTrailingNumbers:
            self._splitTrailingNumbers(oSuggResult, sWord)
        self._splitSuggest(oSuggResult, sWord)
        self._suggest(oSuggResult, sWord, nMaxSwitch, nMaxDel, nMaxHardRepl, nMaxJump)
        aSugg = oSuggResult.getSuggestions(nSuggLimit)
        aSugg = oSuggResult.getSuggestions()
        if self.lexicographer:
            aSugg = self.lexicographer.filterSugg(aSugg)
        if sSfx or sPfx:
            # we add what we removed
            return list(map(lambda sSug: sPfx + sSug + sSfx, aSugg))
        return aSugg

352
353
354
355
356
357
358
359

360
361
362
363
364
365
366
357
358
359
360
361
362
363

364
365
366
367
368
369
370
371







-
+







                sWord1, sWord2 = sWord.split(cSplitter, 1)
                if self.isValid(sWord1) and self.isValid(sWord2):
                    oSuggResult.addSugg(sWord1+" "+sWord2)

    def _suggest (self, oSuggResult, sRemain, nMaxSwitch=0, nMaxDel=0, nMaxHardRepl=0, nMaxJump=0, nDist=0, nDeep=0, iAddr=0, sNewWord="", bAvoidLoop=False):
        # recursive function
        #logging.info((nDeep * "  ") + sNewWord + ":" + sRemain)
        if int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask:
        if self.byDic[iAddr] & self._finalNodeMask:
            if not sRemain:
                oSuggResult.addSugg(sNewWord, nDeep)
                for sTail in self._getTails(iAddr):
                    oSuggResult.addSugg(sNewWord+sTail, nDeep)
                return
            if (len(sNewWord) + len(sRemain) == len(oSuggResult.sWord)) and oSuggResult.sWord.lower().startswith(sNewWord.lower()) and self.isValid(sRemain):
                if self.sLangCode == "fr" and sNewWord.lower() in ("l", "d", "n", "m", "t", "s", "c", "j", "qu", "lorsqu", "puisqu", "quoiqu", "jusqu", "quelqu") and sRemain[0:1] in cp.aVowel:
419
420
421
422
423
424
425
426

427
428
429
430
431
432
433
424
425
426
427
428
429
430

431
432
433
434
435
436
437
438







-
+







                yield (self.dCharVal[nVal], jAddr)

    def _getTails (self, iAddr, sTail="", n=2):
        "return a list of suffixes ending at a distance of <n> from <iAddr>"
        aTails = set()
        for nVal, jAddr in self._getArcs(iAddr):
            if nVal <= self.nChar:
                if int.from_bytes(self.byDic[jAddr:jAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask:
                if self.byDic[jAddr] & self._finalNodeMask:
                    aTails.add(sTail + self.dCharVal[nVal])
                if n and not aTails:
                    aTails.update(self._getTails(jAddr, sTail+self.dCharVal[nVal], n-1))
        return aTails

    def drawPath (self, sWord, iAddr=0):
        "show the path taken by <sWord> in the graph"
467
468
469
470
471
472
473
474

475
476
477
478
479
480

481
482

483
484
485

486
487
488
489

490
491
492
493

494
495
496
497
498
499
500
501
502

503
504
505
506
507


508
509
510
511
512
513

514
515
516
517


518
519
520


521
522
523
524

525
526
527
528
529
530
531
532
533

534
535
536
537
538


539
540
541
542
543

544
545
546
547

548
549
550
551


552
553
554
555

556
557
558
559

560
561

562
563
564
565
566



567
568
569

570
571

572
573
574
575
576
577
578
579


580
581

582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671

672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698

699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
472
473
474
475
476
477
478

479
480





481
482

483
484
485

486
487
488
489

490
491
492
493

494
495
496
497
498
499
500
501
502

503
504
505
506


507
508
509
510
511
512
513

514
515
516


517
518
519


520
521
522
523
524

525
526
527
528
529
530
531
532
533

534
535
536
537


538
539
540
541
542
543

544
545
546
547

548
549
550


551
552
553
554
555

556
557
558
559

560
561

562
563
564



565
566
567
568
569

570
571

572
573
574
575
576
577
578


579
580
581

582


























































































583



























584


































































































585
586







-
+

-
-
-
-
-
+

-
+


-
+



-
+



-
+








-
+



-
-
+
+





-
+


-
-
+
+

-
-
+
+



-
+








-
+



-
-
+
+




-
+



-
+


-
-
+
+



-
+



-
+

-
+


-
-
-
+
+
+


-
+

-
+






-
-
+
+

-
+
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-


            if sFlexPattern:
                zFlexPattern = re.compile(sFlexPattern)
            if sTagsPattern:
                zTagsPattern = re.compile(sTagsPattern)
        except re.error:
            print("# Error in regex pattern")
            traceback.print_exc()
        yield from self._select1(zFlexPattern, zTagsPattern, 0, "")
        yield from self._select(zFlexPattern, zTagsPattern, 0, "")

    # def morph (self, sWord):
    #     is defined in __init__

    # VERSION 1
    def _select1 (self, zFlexPattern, zTagsPattern, iAddr, sWord):
    def _select (self, zFlexPattern, zTagsPattern, iAddr, sWord):
        # recursive generator
        for nVal, jAddr in self._getArcs1(iAddr):
        for nVal, jAddr in self._getArcs(iAddr):
            if nVal <= self.nChar:
                # simple character
                yield from self._select1(zFlexPattern, zTagsPattern, jAddr, sWord + self.lArcVal[nVal])
                yield from self._select(zFlexPattern, zTagsPattern, jAddr, sWord + self.lArcVal[nVal])
            else:
                if not zFlexPattern or zFlexPattern.search(sWord):
                    sStem = self.funcStemming(sWord, self.lArcVal[nVal])
                    for nMorphVal, _ in self._getArcs1(jAddr):
                    for nMorphVal, _ in self._getArcs(jAddr):
                        if not zTagsPattern or zTagsPattern.search(self.lArcVal[nMorphVal]):
                            yield [sWord, sStem, self.lArcVal[nMorphVal]]

    def _morph1 (self, sWord):
    def _morph (self, sWord):
        "returns morphologies of <sWord>"
        iAddr = 0
        for c in sWord:
            if c not in self.dChar:
                return []
            iAddr = self._lookupArcNode(self.dChar[c], iAddr)
            if iAddr is None:
                return []
        if int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask:
        if self.byDic[iAddr] & self._finalNodeMask:
            l = []
            nRawArc = 0
            while not nRawArc & self._lastArcMask:
                iEndArcAddr = iAddr + self.nBytesArc
                nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big')
                iEndArcAddr = iAddr + 1
                nRawArc = self.byDic[iAddr]
                nArc = nRawArc & self._arcMask
                if nArc > self.nChar:
                    # This value is not a char, this is a stemming code
                    sStem = ">" + self.funcStemming(sWord, self.lArcVal[nArc])
                    # Now , we go to the next node and retrieve all following arcs values, all of them are tags
                    iAddr2 = int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big')
                    iAddr2 = self.byDic[iEndArcAddr]
                    nRawArc2 = 0
                    while not nRawArc2 & self._lastArcMask:
                        iEndArcAddr2 = iAddr2 + self.nBytesArc
                        nRawArc2 = int.from_bytes(self.byDic[iAddr2:iEndArcAddr2], byteorder='big')
                        iEndArcAddr2 = iAddr2 + 1
                        nRawArc2 = self.byDic[iAddr2]
                        l.append(sStem + "/" + self.lArcVal[nRawArc2 & self._arcMask])
                        iAddr2 = iEndArcAddr2+self.nBytesNodeAddress
                iAddr = iEndArcAddr+self.nBytesNodeAddress
                        iAddr2 = iEndArcAddr2 + 1
                iAddr = iEndArcAddr + 1
            return l
        return []

    def _stem1 (self, sWord):
    def _stem (self, sWord):
        "returns stems list of <sWord>"
        iAddr = 0
        for c in sWord:
            if c not in self.dChar:
                return []
            iAddr = self._lookupArcNode(self.dChar[c], iAddr)
            if iAddr is None:
                return []
        if int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask:
        if self.byDic[iAddr] & self._finalNodeMask:
            l = []
            nRawArc = 0
            while not nRawArc & self._lastArcMask:
                iEndArcAddr = iAddr + self.nBytesArc
                nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big')
                iEndArcAddr = iAddr + 1
                nRawArc = self.byDic[iAddr]
                nArc = nRawArc & self._arcMask
                if nArc > self.nChar:
                    # This value is not a char, this is a stemming code
                    l.append(self.funcStemming(sWord, self.lArcVal[nArc]))
                iAddr = iEndArcAddr+self.nBytesNodeAddress
                iAddr = iEndArcAddr + 1
            return l
        return []

    def _lookupArcNode1 (self, nVal, iAddr):
    def _lookupArcNode (self, nVal, iAddr):
        "looks if <nVal> is an arc at the node at <iAddr>, if yes, returns address of next node else None"
        while True:
            iEndArcAddr = iAddr+self.nBytesArc
            nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big')
            iEndArcAddr = iAddr + 1
            nRawArc = self.byDic[iAddr]
            if nVal == (nRawArc & self._arcMask):
                # the value we are looking for
                # we return the address of the next node
                return int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big')
                return self.byDic[iEndArcAddr]
            # value not found
            if nRawArc & self._lastArcMask:
                return None
            iAddr = iEndArcAddr+self.nBytesNodeAddress
            iAddr = iEndArcAddr + 1

    def _getArcs1 (self, iAddr):
    def _getArcs (self, iAddr):
        "generator: return all arcs at <iAddr> as tuples of (nVal, iAddr)"
        while True:
            iEndArcAddr = iAddr+self.nBytesArc
            nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big')
            yield nRawArc & self._arcMask, int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big')
            iEndArcAddr = iAddr + 1
            nRawArc = self.byDic[iAddr]
            yield nRawArc & self._arcMask, self.byDic[iEndArcAddr]
            if nRawArc & self._lastArcMask:
                break
            iAddr = iEndArcAddr+self.nBytesNodeAddress
            iAddr = iEndArcAddr + 1

    def _writeNodes1 (self, spfDest):
    def _writeNodes (self, spfDest):
        "for debugging only"
        print(" > Write binary nodes")
        with open(spfDest, 'w', 'utf-8', newline="\n") as hDst:
            iAddr = 0
            hDst.write("i{:_>10} -- #{:_>10}\n".format("0", iAddr))
            while iAddr < len(self.byDic):
                iEndArcAddr = iAddr+self.nBytesArc
                nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big')
                iEndArcAddr = iAddr + 1
                nRawArc = self.byDic[iAddr]
                nArc = nRawArc & self._arcMask
                hDst.write("  {:<20}  {:0>16}  i{:>10}   #{:_>10}\n".format(self.lArcVal[nArc], bin(nRawArc)[2:], "?", \
                hDst.write("  {:<20}  {:0>16}  i{:>10}   #{:_>10}\n".format(self.lArcVal[nArc], bin(nRawArc)[2:], "?", self.byDic[iEndArcAddr]))
                                                                            int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], \
                                                                                           byteorder='big')))
                iAddr = iEndArcAddr+self.nBytesNodeAddress
                if (nRawArc & self._lastArcMask) and iAddr < len(self.byDic):
                    hDst.write("\ni{:_>10} -- #{:_>10}\n".format("?", iAddr))
            hDst.close()

    # VERSION 2
    def _morph2 (self, sWord):
        "returns morphologies of <sWord>"
        iAddr = 0
        for c in sWord:
            if c not in self.dChar:
                return []
            iAddr = self._lookupArcNode(self.dChar[c], iAddr)
            if iAddr is None:
                return []
        if int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask:
            l = []
            nRawArc = 0
            while not nRawArc & self._lastArcMask:
                iEndArcAddr = iAddr + self.nBytesArc
                nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big')
                nArc = nRawArc & self._arcMask
                if nArc > self.nChar:
                    # This value is not a char, this is a stemming code
                    sStem = ">" + self.funcStemming(sWord, self.lArcVal[nArc])
                    # Now , we go to the next node and retrieve all following arcs values, all of them are tags
                    if not nRawArc & self._addrBitMask:
                        iAddr2 = int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big')
                    else:
                        # we go to the end of the node
                        iAddr2 = iEndArcAddr
                        while not nRawArc & self._lastArcMask:
                            nRawArc = int.from_bytes(self.byDic[iAddr2:iAddr2+self.nBytesArc], byteorder='big')
                            iAddr2 += self.nBytesArc + self.nBytesNodeAddress
                    nRawArc2 = 0
                    while not nRawArc2 & self._lastArcMask:
                        iEndArcAddr2 = iAddr2 + self.nBytesArc
                        nRawArc2 = int.from_bytes(self.byDic[iAddr2:iEndArcAddr2], byteorder='big')
                        l.append(sStem + "/" + self.lArcVal[nRawArc2 & self._arcMask])
                        iAddr2 = iEndArcAddr2+self.nBytesNodeAddress  if not nRawArc2 & self._addrBitMask else iEndArcAddr2
                iAddr = iEndArcAddr+self.nBytesNodeAddress  if not nRawArc & self._addrBitMask  else iEndArcAddr
            return l
        return []

    def _stem2 (self, sWord):
        "returns stems list of <sWord>"
        iAddr = 0
        for c in sWord:
            if c not in self.dChar:
                return []
            iAddr = self._lookupArcNode(self.dChar[c], iAddr)
            if iAddr is None:
                return []
        if int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask:
            l = []
            nRawArc = 0
            while not nRawArc & self._lastArcMask:
                iEndArcAddr = iAddr + self.nBytesArc
                nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big')
                nArc = nRawArc & self._arcMask
                if nArc > self.nChar:
                    # This value is not a char, this is a stemming code
                    l.append(self.funcStemming(sWord, self.lArcVal[nArc]))
                    # Now , we go to the next node
                    if not nRawArc & self._addrBitMask:
                        iAddr2 = int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big')
                    else:
                        # we go to the end of the node
                        iAddr2 = iEndArcAddr
                        while not nRawArc & self._lastArcMask:
                            nRawArc = int.from_bytes(self.byDic[iAddr2:iAddr2+self.nBytesArc], byteorder='big')
                            iAddr2 += self.nBytesArc + self.nBytesNodeAddress
                iAddr = iEndArcAddr+self.nBytesNodeAddress  if not nRawArc & self._addrBitMask  else iEndArcAddr
            return l
        return []

    def _lookupArcNode2 (self, nVal, iAddr):
        "looks if <nVal> is an arc at the node at <iAddr>, if yes, returns address of next node else None"
        while True:
            iEndArcAddr = iAddr+self.nBytesArc
            nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big')
            if nVal == (nRawArc & self._arcMask):
                # the value we are looking for
                if not nRawArc & self._addrBitMask:
                    # we return the address of the next node
                    return int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big')
                # we go to the end of the node
                iAddr = iEndArcAddr
                iAddr = iEndArcAddr + 1
                while not nRawArc & self._lastArcMask:
                    nRawArc = int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big')
                    iAddr += self.nBytesArc + self.nBytesNodeAddress  if not nRawArc & self._addrBitMask  else self.nBytesArc
                return iAddr
            # value not found
            if nRawArc & self._lastArcMask:
                return None
            iAddr = iEndArcAddr+self.nBytesNodeAddress  if not nRawArc & self._addrBitMask  else iEndArcAddr

    def _writeNodes2 (self, spfDest):
        "for debugging only"
        print(" > Write binary nodes")
        with open(spfDest, 'w', 'utf-8', newline="\n") as hDst:
            iAddr = 0
            hDst.write("i{:_>10} -- #{:_>10}\n".format("0", iAddr))
            while iAddr < len(self.byDic):
                iEndArcAddr = iAddr+self.nBytesArc
                nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big')
                nArc = nRawArc & self._arcMask
                if not nRawArc & self._addrBitMask:
                    iNextNodeAddr = int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big')
                    hDst.write("  {:<20}  {:0>16}  i{:>10}   #{:_>10}\n".format(self.lArcVal[nArc], bin(nRawArc)[2:], "?", iNextNodeAddr))
                    iAddr = iEndArcAddr+self.nBytesNodeAddress
                else:
                    hDst.write("  {:<20}  {:0>16}\n".format(self.lArcVal[nArc], bin(nRawArc)[2:]))
                    iAddr = iEndArcAddr
                if nRawArc & self._lastArcMask:
                if (nRawArc & self._lastArcMask) and iAddr < len(self.byDic):
                    hDst.write("\ni{:_>10} -- #{:_>10}\n".format("?", iAddr))
            hDst.close()

    # VERSION 3
    def _morph3 (self, sWord):
        "returns morphologies of <sWord>"
        iAddr = 0
        for c in sWord:
            if c not in self.dChar:
                return []
            iAddr = self._lookupArcNode(self.dChar[c], iAddr)
            if iAddr is None:
                return []
        if int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask:
            l = []
            nRawArc = 0
            iAddrNode = iAddr
            while not nRawArc & self._lastArcMask:
                iEndArcAddr = iAddr + self.nBytesArc
                nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big')
                nArc = nRawArc & self._arcMask
                if nArc > self.nChar:
                    # This value is not a char, this is a stemming code
                    sStem = ">" + self.funcStemming(sWord, self.lArcVal[nArc])
                    # Now , we go to the next node and retrieve all following arcs values, all of them are tags
                    if not nRawArc & self._addrBitMask:
                        iAddr2 = int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big')
                    else:
                        iAddr2 = iAddrNode + int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesOffset], byteorder='big')
                    nRawArc2 = 0
                    while not nRawArc2 & self._lastArcMask:
                        iEndArcAddr2 = iAddr2 + self.nBytesArc
                        nRawArc2 = int.from_bytes(self.byDic[iAddr2:iEndArcAddr2], byteorder='big')
                        l.append(sStem + "/" + self.lArcVal[nRawArc2 & self._arcMask])
                        iAddr2 = iEndArcAddr2+self.nBytesNodeAddress  if not nRawArc2 & self._addrBitMask  else iEndArcAddr2+self.nBytesOffset
                iAddr = iEndArcAddr+self.nBytesNodeAddress  if not nRawArc & self._addrBitMask  else iEndArcAddr+self.nBytesOffset
            return l
        return []

    def _stem3 (self, sWord):
        "returns stems list of <sWord>"
        iAddr = 0
        for c in sWord:
            if c not in self.dChar:
                return []
            iAddr = self._lookupArcNode(self.dChar[c], iAddr)
            if iAddr is None:
                return []
        if int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask:
            l = []
            nRawArc = 0
            #iAddrNode = iAddr
            while not nRawArc & self._lastArcMask:
                iEndArcAddr = iAddr + self.nBytesArc
                nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big')
                nArc = nRawArc & self._arcMask
                if nArc > self.nChar:
                    # This value is not a char, this is a stemming code
                    l.append(self.funcStemming(sWord, self.lArcVal[nArc]))
                iAddr = iEndArcAddr+self.nBytesNodeAddress  if not nRawArc & self._addrBitMask  else iEndArcAddr+self.nBytesOffset
            return l
        return []

    def _lookupArcNode3 (self, nVal, iAddr):
        "looks if <nVal> is an arc at the node at <iAddr>, if yes, returns address of next node else None"
        iAddrNode = iAddr
        while True:
            iEndArcAddr = iAddr+self.nBytesArc
            nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big')
            if nVal == (nRawArc & self._arcMask):
                # the value we are looking for
                if not nRawArc & self._addrBitMask:
                    return int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big')
                return iAddrNode + int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesOffset], byteorder='big')
            # value not found
            if nRawArc & self._lastArcMask:
                return None
            iAddr = iEndArcAddr+self.nBytesNodeAddress  if not nRawArc & self._addrBitMask  else iEndArcAddr+self.nBytesOffset

    def _writeNodes3 (self, spfDest):
        "for debugging only"
        print(" > Write binary nodes")
        with open(spfDest, 'w', 'utf-8', newline="\n") as hDst:
            iAddr = 0
            hDst.write("i{:_>10} -- #{:_>10}\n".format("0", iAddr))
            while iAddr < len(self.byDic):
                iEndArcAddr = iAddr+self.nBytesArc
                nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big')
                nArc = nRawArc & self._arcMask
                if not nRawArc & self._addrBitMask:
                    iNextNodeAddr = int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big')
                    hDst.write("  {:<20}  {:0>16}  i{:>10}   #{:_>10}\n".format(self.lArcVal[nArc], bin(nRawArc)[2:], "?", iNextNodeAddr))
                    iAddr = iEndArcAddr+self.nBytesNodeAddress
                else:
                    iNextNodeAddr = int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesOffset], byteorder='big')
                    hDst.write("  {:<20}  {:0>16}  i{:>10}   +{:_>10}\n".format(self.lArcVal[nArc], bin(nRawArc)[2:], "?", iNextNodeAddr))
                    iAddr = iEndArcAddr+self.nBytesOffset
                if nRawArc & self._lastArcMask:
                    hDst.write("\ni{:_>10} -- #{:_>10}\n".format("?", iAddr))
            hDst.close()

Modified graphspell/str_transform.py from [e27e70d363] to [98c57fa9ba].

1
2
3
4
5
6
7

8
9

10
11
12
13
14
15
16
1
2
3
4
5
6
7
8
9

10
11
12
13
14
15
16
17







+

-
+







"""
Operations on strings:
- calculate distance between two strings
- transform strings with transformation codes
"""

import unicodedata
import re

from .char_player import distanceBetweenChars
from .char_player import distanceBetweenChars, dDistanceBetweenChars


#### N-GRAMS

def getNgrams (sWord, n=2):
    "return a list of Ngrams strings"
    return [ sWord[i:i+n]  for i in range(len(sWord)-n+1) ]
48
49
50
51
52
53
54





55
56
57
58
59
60
61
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67







+
+
+
+
+







    sWord = sWord.lower().translate(_xTransCharsForSimplification)
    sNewWord = ""
    for i, c in enumerate(sWord, 1):
        if c != sWord[i:i+1] or (c == 'e' and sWord[i:i+2] != "ee"):  # exception for <e> to avoid confusion between crée / créai
            sNewWord += c
    return sNewWord.replace("eau", "o").replace("au", "o").replace("ai", "éi").replace("ei", "é").replace("ph", "f")


def cleanWord (sWord):
    "remove letters repeated more than 2 times"
    return re.sub("(.)\\1{2,}", '\\1\\1', sWord)


_xTransNumbersToExponent = str.maketrans({
    "0": "⁰", "1": "¹", "2": "²", "3": "³", "4": "⁴", "5": "⁵", "6": "⁶", "7": "⁷", "8": "⁸", "9": "⁹"
})

def numbersToExponent (sWord):
    "convert numeral chars to exponant chars"
102
103
104
105
106
107
108













































































109
110
111
112
113
114
115
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198







+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+







                d[i,   j-1] + 1,        # Insertion
                d[i-1, j-1] + nCost,    # Substitution
            )
            if i and j and s1[i] == s2[j-1] and s1[i-1] == s2[j]:
                d[i, j] = min(d[i, j], d[i-2, j-2] + nCost)     # Transposition
    return d[nLen1-1, nLen2-1]


def distanceJaroWinkler (a, b, boost = .666):
    # https://github.com/thsig/jaro-winkler-JS
    #if (a == b): return 1.0
    a_len = len(a)
    b_len = len(b)
    nMax = max(a_len, b_len)
    a_flag = [None for _ in range(nMax)]
    b_flag = [None for _ in range(nMax)]
    search_range = (max(a_len, b_len) // 2) - 1
    minv = min(a_len, b_len)

    # Looking only within the search range, count and flag the matched pairs.
    Num_com = 0
    yl1 = b_len - 1
    for i in range(a_len):
        lowlim = i - search_range  if i >= search_range  else 0
        hilim  = i + search_range  if (i + search_range) <= yl1  else yl1
        for j in range(lowlim, hilim+1):
            if b_flag[j] != 1 and a[j:j+1] == b[i:i+1]:
                a_flag[j] = 1
                b_flag[i] = 1
                Num_com += 1
                break

    # Return if no characters in common
    if Num_com == 0:
        return 0.0

    # Count the number of transpositions
    k = 0
    N_trans = 0
    for i in range(a_len):
        if a_flag[i] == 1:
            for j in range(k, b_len):
                if b_flag[j] == 1:
                    k = j + 1
                    break
            if a[i] != b[j]:
                N_trans += 1
    N_trans = N_trans // 2

    # Adjust for similarities in nonmatched characters
    N_simi = 0
    if minv > Num_com:
        for i in range(a_len):
            if not a_flag[i]:
                for j in range(b_len):
                    if not b_flag[j]:
                        if a[i] in dDistanceBetweenChars and b[j] in dDistanceBetweenChars[a[i]]:
                            N_simi += dDistanceBetweenChars[a[i]][b[j]]
                            b_flag[j] = 2
                            break

    Num_sim = (N_simi / 10.0) + Num_com

    # Main weight computation
    weight = Num_sim / a_len + Num_sim / b_len + (Num_com - N_trans) / Num_com
    weight = weight / 3

    # Continue to boost the weight if the strings are similar
    if weight > boost:
        # Adjust for having up to the first 4 characters in common
        j = 4  if minv >= 4  else minv
        i = 0
        while i < j  and a[i] == b[i]:
            i += 1
        if i:
            weight += i * 0.1 * (1.0 - weight)
        # Adjust for long strings.
        # After agreeing beginning chars, at least two more must agree
        # and the agreeing characters must be more than half of the
        # remaining characters.
        if minv > 4  and  Num_com > i + 1  and  2 * Num_com >= minv + i:
            weight += (1 - weight) * ((Num_com - i - 1) / (a_len * b_len - i*2 + 2))
    return weight


def distanceSift4 (s1, s2, nMaxOffset=5):
    "implementation of general Sift4."
    # https://siderite.blogspot.com/2014/11/super-fast-and-accurate-string-distance.html
    if not s1:
        return len(s2)
    if not s2: