Grammalecte  Changes On Branch b3f2f1d72a96d629

Changes In Branch bdic_opt Through [b3f2f1d72a] Excluding Merge-Ins

This is equivalent to a diff from 4a19028115 to b3f2f1d72a

2020-09-14
07:55
[fr] ajustements check-in: cd8c458e7b user: olr tags: trunk, fr
2020-09-12
12:45
[graphspell][py] ibdawg: code cleaning, remove version 2 and 3, never used check-in: 4fa9631623 user: olr tags: graphspell, bdic_opt
12:32
[graphspell][js] ibdawg: code cleaning, remove version 2 and 3, never used check-in: b3f2f1d72a user: olr tags: graphspell, bdic_opt
12:22
[fr] tests update check-in: 0207fe1b5b user: olr tags: fr, bdic_opt
2020-09-11
19:20
merge trunk check-in: 43afb8b856 user: olr tags: bdic_opt
19:18
[fr] tests: spellchecker.suggest() check-in: 4a19028115 user: olr tags: trunk, fr
14:21
[fr] faux positif check-in: 86f302f4ef user: olr tags: trunk, fr

Modified gc_lang/fr/modules/tests_modules.py from [1156e862bc] to [8a9c45807b].

18
19
20
21
22
23
24
25

26
27
28
29
30
31
32
18
19
20
21
22
23
24

25
26
27
28
29
30
31
32







-
+







def timeblock (label, hDst=None):
    "performance counter (contextmanager)"
    start = time.perf_counter()
    try:
        yield
    finally:
        end = time.perf_counter()
        print('{} : {}'.format(label, end - start))
        print('{:<20} : {}'.format(label, end - start))
        if hDst:
            hDst.write("{:<12.6}".format(end-start))


class TestDictionary (unittest.TestCase):
    "Test du correcteur orthographique"

47
48
49
50
51
52
53
54

55
56
57
58
59
60
61
47
48
49
50
51
52
53

54
55
56
57
58
59
60
61







-
+







            self.assertTrue(self.oDic.isValid(sWord), sWord)

    def test_isvalid_failed (self):
        for sWord in ["BranchE", "BRanche", "BRAnCHE", "émilie", "éMILIE", "émiLie"]:
            self.assertFalse(self.oDic.isValid(sWord), sWord)

    def test_suggest (self):
        for sWord in ["déelirranttesss", "vallidasion", "Emilie", "exibission"]:
        for sWord in ["déelirranttesss", "vallidasion", "Emilie", "exibission", "ditirembique", "jai", "email"]:
            with timeblock(sWord):
                self.assertNotEqual(0, self.oDic.suggest(sWord))


class TestConjugation (unittest.TestCase):
    "Tests des conjugaisons"

Modified gc_lang/fr/perf_memo.text from [6a0d81df00] to [ad156793c1].

26
27
28
29
30
31
32
33
34



26
27
28
29
30
31
32


33
34
35







-
-
+
+
+
0.6.2       2018.02.19 19:06    5.51302     1.29359     0.874157    0.260415    0.271596    0.290641    0.684754    0.376905    0.0815201   0.00919633  (spelling normalization)
1.0         2018.11.23 10:59    2.88577     0.702486    0.485648    0.139897    0.14079     0.148125    0.348751    0.201061    0.0360297   0.0043535   (x2, with new GC engine)
1.1         2019.05.16 09:42    1.50743     0.360923    0.261113    0.0749272   0.0763827   0.0771537   0.180504    0.102942    0.0182762   0.0021925   (×2, but new processor: AMD Ryzen 7 2700X)
1.2.1       2019.08.06 20:57    1.42886     0.358425    0.247356    0.0704405   0.0754886   0.0765604   0.177197    0.0988517   0.0188103   0.0020243
1.6.0       2020.01.03 20:22    1.38847     0.346214    0.240242    0.0709539   0.0737499   0.0748733   0.176477    0.0969171   0.0187857   0.0025143   (nouveau dictionnaire avec lemmes masculins)
1.9.0       2020.04.20 19:57    1.51183     0.369546    0.25681     0.0734314   0.0764396   0.0785668   0.183922    0.103674    0.0185812   0.002099    (NFC normalization)
1.9.2       2020.05.12 08:43    1.62465     0.398831    0.273012    0.0810811   0.080937    0.0845885   0.204133    0.114146    0.0212864   0.0029547
1.12.2      2020.09.09 13:34    1.50568     0.374504    0.233108    0.0798712   0.0804466   0.0769674   0.171519    0.0945132   0.0165344   0.0019474   
1.12.2      2020.09.09 13:35    1.41094     0.359093    0.236443    0.06968     0.0734418   0.0738087   0.169371    0.0946279   0.0167106   0.0019773   
1.12.2      2020.09.09 13:34    1.50568     0.374504    0.233108    0.0798712   0.0804466   0.0769674   0.171519    0.0945132   0.0165344   0.0019474
1.12.2      2020.09.09 13:35    1.41094     0.359093    0.236443    0.06968     0.0734418   0.0738087   0.169371    0.0946279   0.0167106   0.0019773
1.12.2      2020.09.11 19:16    1.35297     0.330545    0.221731    0.0666998   0.0692539   0.0701707   0.160564    0.0891676   0.015807    0.0045998

Modified graphspell-js/ibdawg.js from [69d7490b82] to [baafc4cde1].

120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159












160
161
162
163
164














165
166
167

168
169
170




171
172
173
174
175
176
177



178
179
180
181
182
183
184
185
186
187
188
189
190

191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
120
121
122
123
124
125
126














127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157





158
159
160
161
162
163
164
165
166
167
168
169
170
171



172



173
174
175
176







177
178
179













180
181
182
183
184
185
186
187
188
189
190

191
192
193
194
195
196
197







-
-
-
-
-
-
-
-
-
-
-
-
-
-



















+
+
+
+
+
+
+
+
+
+
+
+
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
-
-
-
+
-
-
-
+
+
+
+
-
-
-
-
-
-
-
+
+
+
-
-
-
-
-
-
-
-
-
-
-
-
-
+










-







        }
        /*
            Properties:
            sName, nCompressionMethod, sHeader, lArcVal, nArcVal, sByDic, sLang, nChar, nBytesArc, nBytesNodeAddress,
            nEntry, nNode, nArc, nAff, cStemming, nTag, dChar, nBytesOffset,
        */

        /*
            Bug workaround.
            Mozilla’s JS parser sucks. Can’t read file bigger than 4 Mb!
            So we convert huge hexadecimal string to list of numbers…
            https://github.com/mozilla/addons-linter/issues/1361
        */
        let lTemp = [];
        for (let i = 0;  i < this.sByDic.length;  i+=2) {
            lTemp.push(parseInt(this.sByDic.slice(i, i+2), 16));
        }
        this.byDic = lTemp;
        //this.byDic = new Uint8Array(lTemp);  // not quicker, even slower
        /* end of bug workaround */

        if (!(this.sHeader.startsWith("/grammalecte-fsa/") || this.sHeader.startsWith("/pyfsa/"))) {
            throw TypeError("# Error. Not a grammalecte-fsa binary dictionary. Header: " + this.sHeader);
        }
        if (!(this.nCompressionMethod == 1 || this.nCompressionMethod == 2 || this.nCompressionMethod == 3)) {
            throw RangeError("# Error. Unknown dictionary compression method: " + this.nCompressionMethod);
        }
        // <dChar> to get the value of an arc, <dCharVal> to get the char of an arc with its value
        this.dChar = helpers.objectToMap(this.dChar);
        this.dCharVal = this.dChar.gl_reverse();
        this.a2grams = (this.l2grams) ? new Set(this.l2grams) : null;

        if (this.cStemming == "S") {
            this.funcStemming = str_transform.changeWordWithSuffixCode;
        } else if (this.cStemming == "A") {
            this.funcStemming = str_transform.changeWordWithAffixCode;
        } else {
            this.funcStemming = str_transform.noStemming;
        }

        /*
            Bug workaround.
            Mozilla’s JS parser sucks. Can’t read file bigger than 4 Mb!
            So we convert huge hexadecimal string to list of numbers…
            https://github.com/mozilla/addons-linter/issues/1361
        */
        /*
            Performance trick:
            Instead of converting bytes to integers each times we parse the binary dictionary,
            we do it once, then parse the array
        */
        let nAcc = 0;
        this._arcMask = (2 ** ((this.nBytesArc * 8) - 3)) - 1;
        this._finalNodeMask = 1 << ((this.nBytesArc * 8) - 1);
        this._lastArcMask = 1 << ((this.nBytesArc * 8) - 2);


        let lBytesBuffer = [];
        let lTemp = [];
        let nDivisor = (this.nBytesArc + this.nBytesNodeAddress) / 2;
        for (let i = 0;  i < this.sByDic.length;  i+=2) {
            lBytesBuffer.push(parseInt(this.sByDic.slice(i, i+2), 16));
            if (nAcc == (this.nBytesArc - 1)) {
                lTemp.push(this._convBytesToInteger(lBytesBuffer));
                lBytesBuffer = [];
            }
            else if (nAcc == (this.nBytesArc + this.nBytesNodeAddress - 1)) {
                lTemp.push(Math.round(this._convBytesToInteger(lBytesBuffer) / nDivisor));  // Math.round should be useless, BUT with JS who knowns what can happen…
                lBytesBuffer = [];
                nAcc = -1;
            }
        // Configuring DAWG functions according to nCompressionMethod
        switch (this.nCompressionMethod) {
            case 1:
            nAcc = nAcc + 1;
                this.morph = this._morph1;
                this.stem = this._stem1;
                this._lookupArcNode = this._lookupArcNode1;
        }
        this.byDic = lTemp;
        /* end of bug workaround */

                this._getArcs = this._getArcs1;
                this._writeNodes = this._writeNodes1;
                break;
            case 2:
                this.morph = this._morph2;
                this.stem = this._stem2;
                this._lookupArcNode = this._lookupArcNode2;
        this._arcMask = (2 ** ((this.nBytesArc * 8) - 3)) - 1;
        this._finalNodeMask = 1 << ((this.nBytesArc * 8) - 1);
        this._lastArcMask = 1 << ((this.nBytesArc * 8) - 2);
                this._getArcs = this._getArcs2;
                this._writeNodes = this._writeNodes2;
                break;
            case 3:
                this.morph = this._morph3;
                this.stem = this._stem3;
                this._lookupArcNode = this._lookupArcNode3;
                this._getArcs = this._getArcs3;
                this._writeNodes = this._writeNodes3;
                break;
            default:
                throw ValueError("# Error: unknown code: " + this.nCompressionMethod);
        }

        //console.log(this.getInfo());
        this.bAcronymValid = true;
        this.bNumAtLastValid = false;

        // lexicographer module ?
        this.lexicographer = null;
        // JS still sucks: we’ll try importation when importation will be available in Workers. Still waiting...
        if (self && self.hasOwnProperty("lexgraph_"+this.sLangCode)) { // self is the Worker
            this.lexicographer = self["lexgraph_"+this.sLangCode];
        }

    }

    getInfo () {
        return  `  Language: ${this.sLangName}   Lang code: ${this.sLangCode}   Dictionary name: ${this.sDicName}\n` +
                `  Compression method: ${this.nCompressionMethod}   Date: ${this.sDate}   Stemming: ${this.cStemming}FX\n` +
                `  Arcs values:  ${this.nArcVal} = ${this.nChar} characters,  ${this.nAff} affixes,  ${this.nTag} tags\n` +
                `  Dictionary: ${this.nEntry} entries,    ${this.nNode} nodes,   ${this.nArc} arcs\n` +
304
305
306
307
308
309
310
311

312
313
314
315
316
317
318
319
320

321
322

323
324

325
326
327
328
329
330
331
293
294
295
296
297
298
299

300
301
302
303
304
305
306
307
308

309
310

311
312

313
314
315
316
317
318
319
320







-
+








-
+

-
+

-
+







                return false;
            }
            iAddr = this._lookupArcNode(this.dChar.get(c), iAddr);
            if (iAddr === null) {
                return false;
            }
        }
        return Boolean(this._convBytesToInteger(this.byDic.slice(iAddr, iAddr+this.nBytesArc)) & this._finalNodeMask);
        return Boolean(this.byDic[iAddr] & this._finalNodeMask);
    }

    getMorph (sWord) {
        // retrieves morphologies list, different casing allowed
        if (!sWord) {
            return [];
        }
        sWord = str_transform.spellingNormalization(sWord);
        let l = this.morph(sWord);
        let l = this._morph(sWord);
        if (sWord[0].gl_isUpperCase()) {
            l.push(...this.morph(sWord.toLowerCase()));
            l.push(...this._morph(sWord.toLowerCase()));
            if (sWord.gl_isUpperCase() && sWord.length > 1) {
                l.push(...this.morph(sWord.gl_toCapitalize()));
                l.push(...this._morph(sWord.gl_toCapitalize()));
            }
        }
        return l;
    }

    suggest (sWord, nSuggLimit=10, bSplitTrailingNumbers=false) {
        // returns a array of suggestions for <sWord>
376
377
378
379
380
381
382
383

384
385
386
387
388
389
390
365
366
367
368
369
370
371

372
373
374
375
376
377
378
379







-
+







            }
        }
    }

    _suggest (oSuggResult, sRemain, nMaxSwitch=0, nMaxDel=0, nMaxHardRepl=0, nMaxJump=0, nDist=0, nDeep=0, iAddr=0, sNewWord="", bAvoidLoop=false) {
        // returns a set of suggestions
        // recursive function
        if (this._convBytesToInteger(this.byDic.slice(iAddr, iAddr+this.nBytesArc)) & this._finalNodeMask) {
        if (this.byDic[iAddr] & this._finalNodeMask) {
            if (sRemain == "") {
                oSuggResult.addSugg(sNewWord);
                for (let sTail of this._getTails(iAddr)) {
                    oSuggResult.addSugg(sNewWord+sTail);
                }
                return;
            }
486
487
488
489
490
491
492
493

494
495
496
497
498
499
500
475
476
477
478
479
480
481

482
483
484
485
486
487
488
489







-
+







    }

    _getTails (iAddr, sTail="", n=2) {
        // return a list of suffixes ending at a distance of <n> from <iAddr>
        let aTails = new Set();
        for (let [nVal, jAddr] of this._getArcs(iAddr)) {
            if (nVal <= this.nChar) {
                if (this._convBytesToInteger(this.byDic.slice(jAddr, jAddr+this.nBytesArc)) & this._finalNodeMask) {
                if (this.byDic[jAddr] & this._finalNodeMask) {
                    aTails.add(sTail + this.dCharVal.get(nVal));
                }
                if (n && aTails.size == 0) {
                    aTails.gl_update(this._getTails(jAddr, sTail+this.dCharVal.get(nVal), n-1));
                }
            }
        }
528
529
530
531
532
533
534
535

536
537
538
539
540

541
542

543
544
545

546
547
548
549

550
551
552
553
554
555
556
557
558
559

560
561
562
563
564
565
566
567
568
569
570
571

572
573
574
575
576


577
578
579
580
581
582

583
584
585
586


587
588

589
590
591

592
593
594
595
596
597
598

599
600
601
602
603
604
605
606
607
608
609
610

611
612
613
614
615


616
617
618
619
620
621

622
623
624
625
626
627
628

629
630
631
632


633
634
635
636

637
638
639
640
641
642
643

644
645
646
647
648

649
650
651
652
653



654
655
656
657

658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
517
518
519
520
521
522
523

524
525
526



527
528

529
530
531

532
533
534
535

536
537
538
539
540
541
542
543
544
545

546
547
548
549
550
551
552
553
554
555
556
557

558
559
560
561


562
563
564
565
566
567
568

569
570
571


572
573
574

575
576
577

578
579
580
581
582
583
584

585
586
587
588
589
590
591
592
593
594
595
596

597
598
599
600


601
602
603
604
605
606
607

608
609
610
611
612
613
614

615
616
617


618
619
620
621
622

623
624
625
626
627
628
629

630
631
632
633
634

635
636
637



638
639
640
641
642
643

644
645



























646
647
648
649
650
651
652







-
+


-
-
-
+

-
+


-
+



-
+









-
+











-
+



-
-
+
+





-
+


-
-
+
+

-
+


-
+






-
+











-
+



-
-
+
+





-
+






-
+


-
-
+
+



-
+






-
+




-
+


-
-
-
+
+
+



-
+

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-







            zFlexPattern = (sFlexPattern !== "") ? new RegExp(sFlexPattern) : null;
            zTagsPattern = (sTagsPattern !== "") ? new RegExp(sTagsPattern) : null;
        }
        catch (e) {
            console.log("Error in regex pattern");
            console.log(e.message);
        }
        yield* this._select1(zFlexPattern, zTagsPattern, 0, "");
        yield* this._select(zFlexPattern, zTagsPattern, 0, "");
    }

    // VERSION 1

    * _select1 (zFlexPattern, zTagsPattern, iAddr, sWord) {
    * _select (zFlexPattern, zTagsPattern, iAddr, sWord) {
        // recursive generator
        for (let [nVal, jAddr] of this._getArcs1(iAddr)) {
        for (let [nVal, jAddr] of this._getArcs(iAddr)) {
            if (nVal <= this.nChar) {
                // simple character
                yield* this._select1(zFlexPattern, zTagsPattern, jAddr, sWord + this.lArcVal[nVal]);
                yield* this._select(zFlexPattern, zTagsPattern, jAddr, sWord + this.lArcVal[nVal]);
            } else {
                if (!zFlexPattern || zFlexPattern.test(sWord)) {
                    let sStem = this.funcStemming(sWord, this.lArcVal[nVal]);
                    for (let [nMorphVal, _] of this._getArcs1(jAddr)) {
                    for (let [nMorphVal, _] of this._getArcs(jAddr)) {
                        if (!zTagsPattern || zTagsPattern.test(this.lArcVal[nMorphVal])) {
                            yield [sWord, sStem, this.lArcVal[nMorphVal]];
                        }
                    }
                }
            }
        }
    }

    _morph1 (sWord) {
    _morph (sWord) {
        // returns morphologies of sWord
        let iAddr = 0;
        for (let c of sWord) {
            if (!this.dChar.has(c)) {
                return [];
            }
            iAddr = this._lookupArcNode(this.dChar.get(c), iAddr);
            if (iAddr === null) {
                return [];
            }
        }
        if (this._convBytesToInteger(this.byDic.slice(iAddr, iAddr+this.nBytesArc)) & this._finalNodeMask) {
        if (this.byDic[iAddr] & this._finalNodeMask) {
            let l = [];
            let nRawArc = 0;
            while (!(nRawArc & this._lastArcMask)) {
                let iEndArcAddr = iAddr + this.nBytesArc;
                nRawArc = this._convBytesToInteger(this.byDic.slice(iAddr, iEndArcAddr));
                let iEndArcAddr = iAddr + 1;
                nRawArc = this.byDic[iAddr];
                let nArc = nRawArc & this._arcMask;
                if (nArc > this.nChar) {
                    // This value is not a char, this is a stemming code
                    let sStem = ">" + this.funcStemming(sWord, this.lArcVal[nArc]);
                    // Now , we go to the next node and retrieve all following arcs values, all of them are tags
                    let iAddr2 = this._convBytesToInteger(this.byDic.slice(iEndArcAddr, iEndArcAddr+this.nBytesNodeAddress));
                    let iAddr2 = this.byDic[iEndArcAddr];
                    let nRawArc2 = 0;
                    while (!(nRawArc2 & this._lastArcMask)) {
                        let iEndArcAddr2 = iAddr2 + this.nBytesArc;
                        nRawArc2 = this._convBytesToInteger(this.byDic.slice(iAddr2, iEndArcAddr2));
                        let iEndArcAddr2 = iAddr2 + 1;
                        nRawArc2 = this.byDic[iAddr2];
                        l.push(sStem + "/" + this.lArcVal[nRawArc2 & this._arcMask]);
                        iAddr2 = iEndArcAddr2+this.nBytesNodeAddress;
                        iAddr2 = iEndArcAddr2 + 1;
                    }
                }
                iAddr = iEndArcAddr + this.nBytesNodeAddress;
                iAddr = iEndArcAddr + 1;
            }
            return l;
        }
        return [];
    }

    _stem1 (sWord) {
    _stem (sWord) {
        // returns stems list of sWord
        let iAddr = 0;
        for (let c of sWord) {
            if (!this.dChar.has(c)) {
                return [];
            }
            iAddr = this._lookupArcNode(this.dChar.get(c), iAddr);
            if (iAddr === null) {
                return [];
            }
        }
        if (this._convBytesToInteger(this.byDic.slice(iAddr, iAddr+this.nBytesArc)) & this._finalNodeMask) {
        if (this.byDic[iAddr] & this._finalNodeMask) {
            let l = [];
            let nRawArc = 0;
            while (!(nRawArc & this._lastArcMask)) {
                let iEndArcAddr = iAddr + this.nBytesArc;
                nRawArc = this._convBytesToInteger(this.byDic.slice(iAddr, iEndArcAddr));
                let iEndArcAddr = iAddr + 1;
                nRawArc = this.byDic[iAddr];
                let nArc = nRawArc & this._arcMask;
                if (nArc > this.nChar) {
                    // This value is not a char, this is a stemming code
                    l.push(this.funcStemming(sWord, this.lArcVal[nArc]));
                }
                iAddr = iEndArcAddr + this.nBytesNodeAddress;
                iAddr = iEndArcAddr + 1;
            }
            return l;
        }
        return [];
    }

    _lookupArcNode1 (nVal, iAddr) {
    _lookupArcNode (nVal, iAddr) {
        // looks if nVal is an arc at the node at iAddr, if yes, returns address of next node else None
        while (true) {
            let iEndArcAddr = iAddr+this.nBytesArc;
            let nRawArc = this._convBytesToInteger(this.byDic.slice(iAddr, iEndArcAddr));
            let iEndArcAddr = iAddr+1;
            let nRawArc = this.byDic[iAddr];
            if (nVal == (nRawArc & this._arcMask)) {
                // the value we are looking for
                // we return the address of the next node
                return this._convBytesToInteger(this.byDic.slice(iEndArcAddr, iEndArcAddr+this.nBytesNodeAddress));
                return this.byDic[iEndArcAddr];
            }
            else {
                // value not found
                if (nRawArc & this._lastArcMask) {
                    return null;
                }
                iAddr = iEndArcAddr + this.nBytesNodeAddress;
                iAddr = iEndArcAddr + 1;
            }
        }
    }

    * _getArcs1 (iAddr) {
    * _getArcs (iAddr) {
        // generator: return all arcs at <iAddr> as tuples of (nVal, iAddr)
        while (true) {
            let iEndArcAddr = iAddr+this.nBytesArc;
            let nRawArc = this._convBytesToInteger(this.byDic.slice(iAddr, iEndArcAddr));
            yield [nRawArc & this._arcMask, this._convBytesToInteger(this.byDic.slice(iEndArcAddr, iEndArcAddr+this.nBytesNodeAddress))];
            let iEndArcAddr = iAddr+1;
            let nRawArc = this.byDic[iAddr];
            yield [nRawArc & this._arcMask, this.byDic[iEndArcAddr]];
            if (nRawArc & this._lastArcMask) {
                break;
            }
            iAddr = iEndArcAddr+this.nBytesNodeAddress;
            iAddr = iEndArcAddr+1;
        }
    }

    // VERSION 2
    _morph2 (sWord) {
        // to do
    }

    _stem2 (sWord) {
        // to do
    }

    _lookupArcNode2 (nVal, iAddr) {
        // to do
    }


    // VERSION 3
    _morph3 (sWord) {
        // to do
    }

    _stem3 (sWord) {
        // to do
    }

    _lookupArcNode3 (nVal, iAddr) {
        // to do
    }
}


if (typeof(exports) !== 'undefined') {
    exports.IBDAWG = IBDAWG;
}

Modified graphspell/ibdawg.py from [d16ed0d683] to [0fe5cbd03f].

116
117
118
119
120
121
122




















123
124
125
126
127
128
129
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149







+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+







            else:
                raise OSError("# Error. Unknown file type: "+source)
        else:
            self._initJSON(source)

        self.sFileName = source  if isinstance(source, str)  else "[None]"

        # Performance trick:
        #     Instead of converting bytes to integers each times we parse the binary dictionary,
        #     we do it once, then parse the array
        nAcc = 0
        byBuffer = b""
        lTemp = []
        nDivisor = (self.nBytesArc + self.nBytesNodeAddress) / 2
        for i in range(0, len(self.byDic)):
            byBuffer += self.byDic[i:i+1]
            if nAcc == (self.nBytesArc - 1):
                lTemp.append(int.from_bytes(byBuffer, byteorder="big"))
                byBuffer = b""
            elif nAcc == (self.nBytesArc + self.nBytesNodeAddress - 1):
                lTemp.append(round(int.from_bytes(byBuffer, byteorder="big") / nDivisor))
                byBuffer = b""
                nAcc = -1
            nAcc = nAcc + 1
        self.byDic = lTemp;

        # masks
        self._arcMask = (2 ** ((self.nBytesArc * 8) - 3)) - 1
        self._finalNodeMask = 1 << ((self.nBytesArc * 8) - 1)
        self._lastArcMask = 1 << ((self.nBytesArc * 8) - 2)
        self._addrBitMask = 1 << ((self.nBytesArc * 8) - 3)  # version 2

        # function to decode the affix/suffix code
        if self.cStemming == "S":
296
297
298
299
300
301
302
303

304
305
306
307
308
309
310
316
317
318
319
320
321
322

323
324
325
326
327
328
329
330







-
+







        iAddr = 0
        for c in sWord:
            if c not in self.dChar:
                return False
            iAddr = self._lookupArcNode(self.dChar[c], iAddr)
            if iAddr is None:
                return False
        return bool(int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask)
        return bool(self.byDic[iAddr] & self._finalNodeMask)

    def getMorph (self, sWord):
        "retrieves morphologies list, different casing allowed"
        if not sWord:
            return []
        sWord = st.spellingNormalization(sWord)
        l = self.morph(sWord)
352
353
354
355
356
357
358
359

360
361
362
363
364
365
366
372
373
374
375
376
377
378

379
380
381
382
383
384
385
386







-
+







                sWord1, sWord2 = sWord.split(cSplitter, 1)
                if self.isValid(sWord1) and self.isValid(sWord2):
                    oSuggResult.addSugg(sWord1+" "+sWord2)

    def _suggest (self, oSuggResult, sRemain, nMaxSwitch=0, nMaxDel=0, nMaxHardRepl=0, nMaxJump=0, nDist=0, nDeep=0, iAddr=0, sNewWord="", bAvoidLoop=False):
        # recursive function
        #logging.info((nDeep * "  ") + sNewWord + ":" + sRemain)
        if int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask:
        if self.byDic[iAddr] & self._finalNodeMask:
            if not sRemain:
                oSuggResult.addSugg(sNewWord, nDeep)
                for sTail in self._getTails(iAddr):
                    oSuggResult.addSugg(sNewWord+sTail, nDeep)
                return
            if (len(sNewWord) + len(sRemain) == len(oSuggResult.sWord)) and oSuggResult.sWord.lower().startswith(sNewWord.lower()) and self.isValid(sRemain):
                if self.sLangCode == "fr" and sNewWord.lower() in ("l", "d", "n", "m", "t", "s", "c", "j", "qu", "lorsqu", "puisqu", "quoiqu", "jusqu", "quelqu") and sRemain[0:1] in cp.aVowel:
419
420
421
422
423
424
425
426

427
428
429
430
431
432
433
439
440
441
442
443
444
445

446
447
448
449
450
451
452
453







-
+







                yield (self.dCharVal[nVal], jAddr)

    def _getTails (self, iAddr, sTail="", n=2):
        "return a list of suffixes ending at a distance of <n> from <iAddr>"
        aTails = set()
        for nVal, jAddr in self._getArcs(iAddr):
            if nVal <= self.nChar:
                if int.from_bytes(self.byDic[jAddr:jAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask:
                if self.byDic[jAddr] & self._finalNodeMask:
                    aTails.add(sTail + self.dCharVal[nVal])
                if n and not aTails:
                    aTails.update(self._getTails(jAddr, sTail+self.dCharVal[nVal], n-1))
        return aTails

    def drawPath (self, sWord, iAddr=0):
        "show the path taken by <sWord> in the graph"
495
496
497
498
499
500
501
502

503
504
505
506
507


508
509
510
511
512
513

514
515
516
517


518
519
520


521
522
523
524
525
526
527
528
529
530
531
532
533

534
535
536
537
538


539
540
541
542
543

544
545
546
547
548
549
550
551


552
553
554
555

556
557
558
559

560
561
562
563
564
565
566



567
568
569

570
571
572
573
574
575
576
577
578
579


580
581

582
583
584

585
586
587
588
589
590
591
515
516
517
518
519
520
521

522
523
524
525


526
527
528
529
530
531
532

533
534
535


536
537
538


539
540
541
542
543
544
545
546
547
548
549
550
551
552

553
554
555
556


557
558
559
560
561
562

563
564
565
566
567
568
569


570
571
572
573
574

575
576
577
578

579
580
581
582
583



584
585
586
587
588

589
590
591
592
593
594
595
596
597


598
599
600

601



602
603
604
605
606
607
608
609







-
+



-
-
+
+





-
+


-
-
+
+

-
-
+
+












-
+



-
-
+
+




-
+






-
-
+
+



-
+



-
+




-
-
-
+
+
+


-
+








-
-
+
+

-
+
-
-
-
+







        iAddr = 0
        for c in sWord:
            if c not in self.dChar:
                return []
            iAddr = self._lookupArcNode(self.dChar[c], iAddr)
            if iAddr is None:
                return []
        if int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask:
        if self.byDic[iAddr] & self._finalNodeMask:
            l = []
            nRawArc = 0
            while not nRawArc & self._lastArcMask:
                iEndArcAddr = iAddr + self.nBytesArc
                nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big')
                iEndArcAddr = iAddr + 1
                nRawArc = self.byDic[iAddr]
                nArc = nRawArc & self._arcMask
                if nArc > self.nChar:
                    # This value is not a char, this is a stemming code
                    sStem = ">" + self.funcStemming(sWord, self.lArcVal[nArc])
                    # Now , we go to the next node and retrieve all following arcs values, all of them are tags
                    iAddr2 = int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big')
                    iAddr2 = self.byDic[iEndArcAddr]
                    nRawArc2 = 0
                    while not nRawArc2 & self._lastArcMask:
                        iEndArcAddr2 = iAddr2 + self.nBytesArc
                        nRawArc2 = int.from_bytes(self.byDic[iAddr2:iEndArcAddr2], byteorder='big')
                        iEndArcAddr2 = iAddr2 + 1
                        nRawArc2 = self.byDic[iAddr2]
                        l.append(sStem + "/" + self.lArcVal[nRawArc2 & self._arcMask])
                        iAddr2 = iEndArcAddr2+self.nBytesNodeAddress
                iAddr = iEndArcAddr+self.nBytesNodeAddress
                        iAddr2 = iEndArcAddr2 + 1
                iAddr = iEndArcAddr + 1
            return l
        return []

    def _stem1 (self, sWord):
        "returns stems list of <sWord>"
        iAddr = 0
        for c in sWord:
            if c not in self.dChar:
                return []
            iAddr = self._lookupArcNode(self.dChar[c], iAddr)
            if iAddr is None:
                return []
        if int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask:
        if self.byDic[iAddr] & self._finalNodeMask:
            l = []
            nRawArc = 0
            while not nRawArc & self._lastArcMask:
                iEndArcAddr = iAddr + self.nBytesArc
                nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big')
                iEndArcAddr = iAddr + 1
                nRawArc = self.byDic[iAddr]
                nArc = nRawArc & self._arcMask
                if nArc > self.nChar:
                    # This value is not a char, this is a stemming code
                    l.append(self.funcStemming(sWord, self.lArcVal[nArc]))
                iAddr = iEndArcAddr+self.nBytesNodeAddress
                iAddr = iEndArcAddr + 1
            return l
        return []

    def _lookupArcNode1 (self, nVal, iAddr):
        "looks if <nVal> is an arc at the node at <iAddr>, if yes, returns address of next node else None"
        while True:
            iEndArcAddr = iAddr+self.nBytesArc
            nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big')
            iEndArcAddr = iAddr + 1
            nRawArc = self.byDic[iAddr]
            if nVal == (nRawArc & self._arcMask):
                # the value we are looking for
                # we return the address of the next node
                return int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big')
                return self.byDic[iEndArcAddr]
            # value not found
            if nRawArc & self._lastArcMask:
                return None
            iAddr = iEndArcAddr+self.nBytesNodeAddress
            iAddr = iEndArcAddr + 1

    def _getArcs1 (self, iAddr):
        "generator: return all arcs at <iAddr> as tuples of (nVal, iAddr)"
        while True:
            iEndArcAddr = iAddr+self.nBytesArc
            nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big')
            yield nRawArc & self._arcMask, int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big')
            iEndArcAddr = iAddr + 1
            nRawArc = self.byDic[iAddr]
            yield nRawArc & self._arcMask, self.byDic[iEndArcAddr]
            if nRawArc & self._lastArcMask:
                break
            iAddr = iEndArcAddr+self.nBytesNodeAddress
            iAddr = iEndArcAddr + 1

    def _writeNodes1 (self, spfDest):
        "for debugging only"
        print(" > Write binary nodes")
        with open(spfDest, 'w', 'utf-8', newline="\n") as hDst:
            iAddr = 0
            hDst.write("i{:_>10} -- #{:_>10}\n".format("0", iAddr))
            while iAddr < len(self.byDic):
                iEndArcAddr = iAddr+self.nBytesArc
                nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big')
                iEndArcAddr = iAddr + 1
                nRawArc = self.byDic[iAddr]
                nArc = nRawArc & self._arcMask
                hDst.write("  {:<20}  {:0>16}  i{:>10}   #{:_>10}\n".format(self.lArcVal[nArc], bin(nRawArc)[2:], "?", \
                hDst.write("  {:<20}  {:0>16}  i{:>10}   #{:_>10}\n".format(self.lArcVal[nArc], bin(nRawArc)[2:], "?", self.byDic[iEndArcAddr]))
                                                                            int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], \
                                                                                           byteorder='big')))
                iAddr = iEndArcAddr+self.nBytesNodeAddress
                iAddr = iEndArcAddr + 1
                if (nRawArc & self._lastArcMask) and iAddr < len(self.byDic):
                    hDst.write("\ni{:_>10} -- #{:_>10}\n".format("?", iAddr))
            hDst.close()

    # VERSION 2
    def _morph2 (self, sWord):
        "returns morphologies of <sWord>"