Grammalecte  Check-in [40ebc5eada]

Overview
Comment:[graphspell][build][lo][fx] merge dict2: use binary list instead of binary string, drop support for binary file -> use JSON, code cleaning
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | trunk | build | major_change | fx | lo | graphspell
Files: files | file ages | folders
SHA3-256: 40ebc5eada1d3af48331420ebea6da504933163fc3dacc9382b2f71e1725b356
User & Date: olr on 2020-11-07 11:40:39
Other Links: manifest | tags
Context
2020-11-07
18:35
[fr] immunités check-in: 18f6c0c93d user: olr tags: trunk, fr
11:40
[graphspell][build][lo][fx] merge dict2: use binary list instead of binary string, drop support for binary file -> use JSON, code cleaning check-in: 40ebc5eada user: olr tags: trunk, build, major_change, fx, lo, graphspell
09:46
[fr] ajustements check-in: 422d57b490 user: olr tags: trunk, fr
2020-11-05
16:25
[graphspell][fx] dawg: remove useless parameters Closed-Leaf check-in: 5538934848 user: olr tags: fx, graphspell, dict2
Changes

Modified gc_lang/fr/build_data.py from [6e865955c0] to [3d9c0f4ca9].

48
49
50
51
52
53
54
55

56
57
58
59
60
61
62
48
49
50
51
52
53
54

55
56
57
58
59
60
61
62







-
+







        raise OSError("# Error. File not found or not loadable: " + spf)


def loadDictionary ():
    global oDict
    if not oDict:
        try:
            oDict = ibdawg.IBDAWG("fr-allvars.bdic")
            oDict = ibdawg.IBDAWG("fr-allvars.json")
        except:
            traceback.print_exc()


def makeDictionaries (sp, sVersion):
    with cd(sp+"/dictionnaire"):
        if platform.system() == "Windows":

Modified gc_lang/fr/oxt/ContextMenu/ContextMenu.py from [c17c3b29b1] to [f3255b533a].

131
132
133
134
135
136
137
138

139
140
141
142
143
144
145
131
132
133
134
135
136
137

138
139
140
141
142
143
144
145







-
+







            if not oSpellChecker:
                xCurCtx = uno.getComponentContext()
                oGC = self.ctx.ServiceManager.createInstanceWithContext("org.openoffice.comp.pyuno.Lightproof.grammalecte", self.ctx)
                if hasattr(oGC, "getSpellChecker"):
                    # https://bugs.documentfoundation.org/show_bug.cgi?id=97790
                    oSpellChecker = oGC.getSpellChecker()
                else:
                    oSpellChecker = SpellChecker("${lang}", "fr-allvars.bdic")
                    oSpellChecker = SpellChecker("${lang}", "fr-allvars.json")
        except:
            traceback.print_exc()

    def execute (self, args):
        if not args:
            return
        try:

Modified gc_lang/fr/oxt/DictOptions/LexiconEditor.py from [828f4f365e] to [5ef5214006].

404
405
406
407
408
409
410
411

412
413
414
415
416
417
418
404
405
406
407
408
409
410

411
412
413
414
415
416
417
418







-
+








    @_waitPointer
    def importDictionary (self):
        spfImported = ""
        try:
            xFilePicker = self.xSvMgr.createInstanceWithContext('com.sun.star.ui.dialogs.FilePicker', self.ctx)  # other possibility: com.sun.star.ui.dialogs.SystemFilePicker
            xFilePicker.initialize([uno.getConstantByName("com.sun.star.ui.dialogs.TemplateDescription.FILEOPEN_SIMPLE")]) # seems useless
            xFilePicker.appendFilter("Supported files", "*.json; *.bdic")
            xFilePicker.appendFilter("Supported files", "*.json")
            xFilePicker.setDefaultName("fr.__personal__.json") # useless, doesn’t work
            xFilePicker.setDisplayDirectory("")
            xFilePicker.setMultiSelectionMode(False)
            nResult = xFilePicker.execute()
            if nResult == 1:
                # lFile = xFilePicker.getSelectedFiles()
                lFile = xFilePicker.getFiles()
459
460
461
462
463
464
465
466

467
468
469
470
471
472
473
459
460
461
462
463
464
465

466
467
468
469
470
471
472
473







-
+







            self.xDateDic.Label = self.dUI.get("void", "#err")
        MessageBox(self.xDocument, self.dUI.get('save_message', "#err"), self.dUI.get('save_title', "#err"))

    def exportDictionary (self):
        try:
            xFilePicker = self.xSvMgr.createInstanceWithContext('com.sun.star.ui.dialogs.FilePicker', self.ctx)  # other possibility: com.sun.star.ui.dialogs.SystemFilePicker
            xFilePicker.initialize([uno.getConstantByName("com.sun.star.ui.dialogs.TemplateDescription.FILESAVE_SIMPLE")]) # seems useless
            xFilePicker.appendFilter("Supported files", "*.json; *.bdic")
            xFilePicker.appendFilter("Supported files", "*.json")
            xFilePicker.setDefaultName("fr.__personal__.json") # useless, doesn’t work
            xFilePicker.setDisplayDirectory("")
            xFilePicker.setMultiSelectionMode(False)
            nResult = xFilePicker.execute()
            if nResult == 1:
                # lFile = xFilePicker.getSelectedFiles()
                lFile = xFilePicker.getFiles()

Modified gc_lang/fr/oxt/DictOptions/SearchWords.py from [764a885065] to [2c4ada79ef].

182
183
184
185
186
187
188
189

190
191
192
193
194
195
196
182
183
184
185
186
187
188

189
190
191
192
193
194
195
196







-
+







            elif xActionEvent.ActionCommand == "Close":
                self.xContainer.endExecute()
        except:
            traceback.print_exc()

    def initSpellChecker (self):
        if not self.oSpellChecker:
            self.oSpellChecker = sc.SpellChecker("fr", "fr-allvars.bdic", "", self.oPersonalDicJSON)
            self.oSpellChecker = sc.SpellChecker("fr", "fr-allvars.json", "", self.oPersonalDicJSON)

    @_waitPointer
    def searchSimilar (self):
        self.initSpellChecker()
        sWord = self.xWord.Text.strip()
        if sWord:
            xGridDataModel = self.xGridModel.GridDataModel

Modified gc_lang/fr/oxt/Graphspell.py from [810dc52bd8] to [76770ac233].

65
66
67
68
69
70
71
72

73
74
75
76
77
78
79
65
66
67
68
69
70
71

72
73
74
75
76
77
78
79







-
+







                sPersonalDicJSON = self.xOptionNode.getPropertyValue("personal_dic")
                if sPersonalDicJSON:
                    try:
                        personal_dic = json.loads(sPersonalDicJSON)
                    except:
                        print("Graphspell: wrong personal_dic")
                        traceback.print_exc()
            self.oGraphspell = SpellChecker("fr", "fr-"+sMainDicName+".bdic", "", personal_dic)
            self.oGraphspell = SpellChecker("fr", "fr-"+sMainDicName+".json", "", personal_dic)
            self.loadHunspell()
            # print("Graphspell: init done")
        except:
            print("Graphspell: init failed")
            traceback.print_exc()

    def loadHunspell (self):

Modified gc_lang/fr/setup.py from [8a0db0631b] to [955f5741fe].

89
90
91
92
93
94
95
96

97
98
99
100
101
102
103
89
90
91
92
93
94
95

96
97
98
99
100
101
102
103







-
+







    #     'test': ['coverage'],
    # },

    # If there are data files included in your packages that need to be
    # installed, specify them here.  If using Python 2.6 or less, then these
    # have to be included in MANIFEST.in as well.
    package_data={
        'grammalecte': ['graphspell/_dictionaries/*.bdic', '*.txt']
        'grammalecte': ['graphspell/_dictionaries/*.json', '*.txt']
    },

    # Although 'package_data' is the preferred approach, in some case you may
    # need to place data files outside of your packages. See:
    # http://docs.python.org/3.4/distutils/setupscript.html#installing-additional-files # noqa
    # In this case, 'data_file' will be installed into '<sys.prefix>/my_data'
    # data_files=[('my_data', ['data/data_file'])],

Modified gc_lang/fr/webext/panel/lex_editor.js from [a95656b530] to [ffa654539b].

757
758
759
760
761
762
763
764

765
766
767
768
769
770
771
757
758
759
760
761
762
763

764
765
766
767
768
769
770
771







-
+







    },

    build: function () {
        let xProgressNode = document.getElementById("wait_progress");
        let lEntry = oLexiconTable.getEntries();
        if (lEntry.length > 0) {
            let oDAWG = new DAWG(lEntry, "S", "fr", "Français", this.sName, this.sDescription, xProgressNode);
            let oJSON = oDAWG.createBinaryJSON(1);
            let oJSON = oDAWG.createBinaryJSON();
            oDictHandler.saveDictionary(this.sName, oJSON);
            this.oIBDAWG = new IBDAWG(oJSON);
            this.setDictData(this.oIBDAWG.nEntry, this.oIBDAWG.sDate);
        } else {
            oDictHandler.saveDictionary(this.sName, null);
            this.setDictData(0, "[néant]");
        }

Modified graphspell-js/dawg.js from [525275df92] to [82fa7fdc68].

340
341
342
343
344
345
346
347
348


349
350
351
352
353
354
355







356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371

372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387


































388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410


411
412
413
414
415
416
417
340
341
342
343
344
345
346


347
348







349
350
351
352
353
354
355
356
357
358
359

360
361
362
363
364
365
366
367
368
369

370
371
372
373
374
375
376
377
378








379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430

431
432
433

434
435
436
437
438
439
440
441
442







-
-
+
+
-
-
-
-
-
-
-
+
+
+
+
+
+
+




-










-
+








-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+


















-



-
+
+







                    }
                }
            }
        }
    }

    // BINARY CONVERSION
    createBinaryJSON (nCompressionMethod) {
        console.log("Write DAWG as an indexable binary dictionary [method: "+nCompressionMethod+"]");
    _calculateBinary () {
        console.log("Write DAWG as an indexable binary dictionary");
        if (nCompressionMethod == 1) {
            this.nBytesArc = Math.floor( (this.nArcVal.toString(2).length + 2) / 8 ) + 1;     // We add 2 bits. See DawgNode.convToBytes1()
            this.nBytesOffset = 0;
            this._calcNumBytesNodeAddress();
            this._calcNodesAddress1();
        } else {
            console.log("Error: unknown compression method");
        this.nBytesArc = Math.floor( (this.nArcVal.toString(2).length + 2) / 8 ) + 1;     // We add 2 bits. See DawgNode.convToBytes()
        this.nBytesOffset = 0;
        this._calcNumBytesNodeAddress();
        this._calcNodesAddress();
        this.sByDic = this.oRoot.convToBytes(this.nBytesArc, this.nBytesNodeAddress);
        for (let oNode of this.dMinimizedNodes.values()) {
            this.sByDic += oNode.convToBytes(this.nBytesArc, this.nBytesNodeAddress);
        }
        console.log("Arc values (chars, affixes and tags): " + this.nArcVal);
        console.log("Arc size: "+this.nBytesArc+" bytes, Address size: "+this.nBytesNodeAddress+" bytes");
        console.log("-> " + this.nBytesArc+this.nBytesNodeAddress + " * " + this.nArc + " = " + (this.nBytesArc+this.nBytesNodeAddress)*this.nArc + " bytes");
        return this._createJSON(nCompressionMethod);
    }

    _calcNumBytesNodeAddress () {
        // how many bytes needed to store all nodes/arcs in the binary dictionary
        this.nBytesNodeAddress = 1;
        while (((this.nBytesArc + this.nBytesNodeAddress) * this.nArc) > (2 ** (this.nBytesNodeAddress * 8))) {
            this.nBytesNodeAddress += 1;
        }
    }

    _calcNodesAddress1 () {
    _calcNodesAddress () {
        let nBytesNode = this.nBytesArc + this.nBytesNodeAddress;
        let iAddr = this.oRoot.arcs.size * nBytesNode;
        for (let oNode of this.dMinimizedNodes.values()) {
            oNode.addr = iAddr;
            iAddr += Math.max(oNode.arcs.size, 1) * nBytesNode;
        }
    }

    _createJSON (nCompressionMethod) {
        let sByDic = "";
        if (nCompressionMethod == 1) {
            sByDic = this.oRoot.convToBytes1(this.nBytesArc, this.nBytesNodeAddress);
            for (let oNode of this.dMinimizedNodes.values()) {
                sByDic += oNode.convToBytes1(this.nBytesArc, this.nBytesNodeAddress);
            }
        }
    _binaryToList () {
        this.lByDic = [];
        let nAcc = 0;
        let lBytesBuffer = [];
        let nDivisor = (this.nBytesArc + this.nBytesNodeAddress) / 2;
        for (let i = 0;  i < this.sByDic.length;  i+=2) {
            lBytesBuffer.push(parseInt(this.sByDic.slice(i, i+2), 16));
            if (nAcc == (this.nBytesArc - 1)) {
                this.lByDic.push(this._convBytesToInteger(lBytesBuffer));
                lBytesBuffer = [];
            }
            else if (nAcc == (this.nBytesArc + this.nBytesNodeAddress - 1)) {
                this.lByDic.push(Math.round(this._convBytesToInteger(lBytesBuffer) / nDivisor));  // Math.round should be useless, BUT with JS who knowns what can happen…
                lBytesBuffer = [];
                nAcc = -1;
            }
            nAcc = nAcc + 1;
        }
    }

    _convBytesToInteger (aBytes) {
        // Byte order = Big Endian (bigger first)
        let nVal = 0;
        let nWeight = (aBytes.length - 1) * 8;
        for (let n of aBytes) {
            nVal += n << nWeight;
            nWeight = nWeight - 8;
        }
        return nVal;
    }

    createBinaryJSON () {
        this._calculateBinary();
        this._binaryToList();
        let oJSON = {
            "sHeader": "/grammalecte-fsa/",
            "sLangCode": this.sLangCode,
            "sLangName": this.sLangName,
            "sDicName": this.sDicName,
            "sDescription": this.sDescription,
            "sFileName": "[none]",
            "sDate": this._getDate(),
            "nEntry": this.nEntry,
            "nChar": this.nChar,
            "nAff": this.nAff,
            "nTag": this.nTag,
            "cStemming": this.cStemming,
            "dChar": helpers.mapToObject(this.dChar),
            "nNode": this.nNode,
            "nArc": this.nArc,
            "lArcVal": this.lArcVal,
            "nArcVal": this.nArcVal,
            "nCompressionMethod": nCompressionMethod,
            "nBytesArc": this.nBytesArc,
            "nBytesNodeAddress": this.nBytesNodeAddress,
            "nBytesOffset": this.nBytesOffset,
            "sByDic": sByDic,    // binary word graph
            //"sByDic": this.sByDic,    // binary word graph
            "lByDic": this.lByDic,
            "l2grams": Array.from(this.a2grams)
        };
        return oJSON;
    }

    _getDate () {
        let oDate = new Date();
492
493
494
495
496
497
498
499

500
501
502
503
504
505
506
517
518
519
520
521
522
523

524
525
526
527
528
529
530
531







-
+







            for (let oNode of this.arcs.values()) {
                oNode.display(nTab+1, lArcVal, bRecur);
            }
        }
    }

    // VERSION 1 =====================================================================================================
    convToBytes1 (nBytesArc, nBytesNodeAddress) {
    convToBytes (nBytesArc, nBytesNodeAddress) {
        /*
            Node scheme:
            - Arc length is defined by nBytesArc
            - Address length is defined by nBytesNodeAddress

            |                Arc                |                         Address of next node                          |
            |                                   |                                                                       |

Modified graphspell-js/dic_merger.js from [dea1fd0b02] to [10f02569ea].

34
35
36
37
38
39
40
41

42
43
44
45
46
47
48
49
50
34
35
36
37
38
39
40

41
42
43
44
45
46
47
48
49
50







-
+









            }
        }
        if (xProgressBar) {
            xProgressBar.value = xProgressBar.max;
        }
        try {
            let oDAWG = new DAWG(lEntry, cStemming, sLangCode, sLangName, sDicName, sDescription, xProgressBar);
            let oDict = oDAWG.createBinaryJSON(1);
            let oDict = oDAWG.createBinaryJSON();
            return oDict;
        }
        catch (e) {
            console.log("Dictionaries merger: unable to generate merged dictionary");
            console.error(e);
            return null;
        }
    }
}

Modified graphspell-js/ibdawg.js from [280555859f] to [73dd04c644].

143
144
145
146
147
148
149
150

151
152
153
154
155
156
157
158
159
160
161
162
163
164






























165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221

222
223
224
225
226
227
228
143
144
145
146
147
148
149

150
151
152
153
154
155
156



157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199


































200
201
202
203
204
205
206
207
208
209
210
211
212
213

214
215
216
217
218
219
220
221







-
+






-
-
-





+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+








-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-














-
+







            console.error(e);
            console.log("path: " + sPath);
            console.log("dic:" + source.slice(0, 1000));
            throw Error("# Error. File not found or not loadable.\n" + e.message + "\n");
        }
        /*
            Properties:
            sName, nCompressionMethod, sHeader, lArcVal, nArcVal, sByDic, sLang, nChar, nBytesArc, nBytesNodeAddress,
            sName, sHeader, lArcVal, nArcVal, sByDic, sLang, nChar, nBytesArc, nBytesNodeAddress,
            nEntry, nNode, nArc, nAff, cStemming, nTag, dChar, nBytesOffset,
        */

        if (!(this.sHeader.startsWith("/grammalecte-fsa/") || this.sHeader.startsWith("/pyfsa/"))) {
            throw TypeError("# Error. Not a grammalecte-fsa binary dictionary. Header: " + this.sHeader);
        }
        if (!(this.nCompressionMethod == 1 || this.nCompressionMethod == 2 || this.nCompressionMethod == 3)) {
            throw RangeError("# Error. Unknown dictionary compression method: " + this.nCompressionMethod);
        }
        // <dChar> to get the value of an arc, <dCharVal> to get the char of an arc with its value
        this.dChar = helpers.objectToMap(this.dChar);
        this.dCharVal = this.dChar.gl_reverse();
        this.a2grams = (this.l2grams) ? new Set(this.l2grams) : null;

        if (!this.hasOwnProperty("lByDic")) {
            // old dictionary version
            if (!this.hasOwnProperty("sByDic")) {
                throw TypeError("# Error. No usable data in the dictionary.");
            }
            this.lByDic = [];
            let nAcc = 0;
            let lBytesBuffer = [];
            let nDivisor = (this.nBytesArc + this.nBytesNodeAddress) / 2;
            for (let i = 0;  i < this.sByDic.length;  i+=2) {
                lBytesBuffer.push(parseInt(this.sByDic.slice(i, i+2), 16));
                if (nAcc == (this.nBytesArc - 1)) {
                    this.lByDic.push(this._convBytesToInteger(lBytesBuffer));
                    lBytesBuffer = [];
                }
                else if (nAcc == (this.nBytesArc + this.nBytesNodeAddress - 1)) {
                    this.lByDic.push(Math.round(this._convBytesToInteger(lBytesBuffer) / nDivisor));  // Math.round should be useless, BUT with JS who knowns what can happen…
                    lBytesBuffer = [];
                    nAcc = -1;
                }
                nAcc = nAcc + 1;
            }
        }

        // masks
        this._arcMask = (2 ** ((this.nBytesArc * 8) - 3)) - 1;
        this._finalNodeMask = 1 << ((this.nBytesArc * 8) - 1);
        this._lastArcMask = 1 << ((this.nBytesArc * 8) - 2);

        // function to decode the affix/suffix code
        if (this.cStemming == "S") {
            this.funcStemming = str_transform.changeWordWithSuffixCode;
        } else if (this.cStemming == "A") {
            this.funcStemming = str_transform.changeWordWithAffixCode;
        } else {
            this.funcStemming = str_transform.noStemming;
        }

        /*
            Bug workaround.
            Mozilla’s JS parser sucks. Can’t read file bigger than 4 Mb!
            So we convert huge hexadecimal string to list of numbers…
            https://github.com/mozilla/addons-linter/issues/1361
        */
        /*
            Performance trick:
            Instead of converting bytes to integers each times we parse the binary dictionary,
            we do it once, then parse the array
        */
        this.lByDic = [];
        let nAcc = 0;
        let lBytesBuffer = [];
        let nDivisor = (this.nBytesArc + this.nBytesNodeAddress) / 2;
        for (let i = 0;  i < this.sByDic.length;  i+=2) {
            lBytesBuffer.push(parseInt(this.sByDic.slice(i, i+2), 16));
            if (nAcc == (this.nBytesArc - 1)) {
                this.lByDic.push(this._convBytesToInteger(lBytesBuffer));
                lBytesBuffer = [];
            }
            else if (nAcc == (this.nBytesArc + this.nBytesNodeAddress - 1)) {
                this.lByDic.push(Math.round(this._convBytesToInteger(lBytesBuffer) / nDivisor));  // Math.round should be useless, BUT with JS who knowns what can happen…
                lBytesBuffer = [];
                nAcc = -1;
            }
            nAcc = nAcc + 1;
        }
        /* end of bug workaround */

        this._arcMask = (2 ** ((this.nBytesArc * 8) - 3)) - 1;
        this._finalNodeMask = 1 << ((this.nBytesArc * 8) - 1);
        this._lastArcMask = 1 << ((this.nBytesArc * 8) - 2);

        //console.log(this.getInfo());
        this.bAcronymValid = true;
        this.bNumAtLastValid = false;

        // lexicographer module ?
        this.lexicographer = null;
        // JS still sucks: we’ll try importation when importation will be available in Workers. Still waiting...
        if (self && self.hasOwnProperty("lexgraph_"+this.sLangCode)) { // self is the Worker
            this.lexicographer = self["lexgraph_"+this.sLangCode];
        }
    }

    getInfo () {
        return  `  Language: ${this.sLangName}   Lang code: ${this.sLangCode}   Dictionary name: ${this.sDicName}\n` +
                `  Compression method: ${this.nCompressionMethod}   Date: ${this.sDate}   Stemming: ${this.cStemming}FX\n` +
                `  Date: ${this.sDate}   Stemming: ${this.cStemming}FX\n` +
                `  Arcs values:  ${this.nArcVal} = ${this.nChar} characters,  ${this.nAff} affixes,  ${this.nTag} tags\n` +
                `  Dictionary: ${this.nEntry} entries,    ${this.nNode} nodes,   ${this.nArc} arcs\n` +
                `  Address size: ${this.nBytesNodeAddress} bytes,  Arc size: ${this.nBytesArc} bytes\n`;
    }

    getJSON () {
        let oJSON = {
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
232
233
234
235
236
237
238

239
240
241
242
243
244
245







-







            "nTag": this.nTag,
            "cStemming": this.cStemming,
            "dChar": helpers.mapToObject(this.dChar),
            "nNode": this.nNode,
            "nArc": this.nArc,
            "lArcVal": this.lArcVal,
            "nArcVal": this.nArcVal,
            "nCompressionMethod": this.nCompressionMethod,
            "nBytesArc": this.nBytesArc,
            "nBytesNodeAddress": this.nBytesNodeAddress,
            "nBytesOffset": this.nBytesOffset,
            "sByDic": this.sByDic,  // binary word graph
            "l2grams": this.l2grams
        };
        return oJSON;

Modified graphspell/dawg.py from [b60434a390] to [729715ac89].

354
355
356
357
358
359
360
361
362


363
364
365
366
367




368
369
370
371


372
373
374
375

376
377

378
379
380
381
382
383
384
385
386
387
388
389
390
391
392

393
394
395
396
397
398
399

400
401


402
403
404
405
406
407

408

409


410
411




412
413
414


415
416
417

418
419
420


421
422
423
424



425
426
427
428
429
430
431

432
433
434

435
436

437
438

439
440
441
442
443
444
445
446

447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477


478
479
480
481

482
483
484

485
486
487
488

489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570

571
572
573

574
575
576
577
578




579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
354
355
356
357
358
359
360


361
362





363
364
365
366




367
368




369


370



371
372
373
374
375
376
377
378
379
380
381

382
383
384
385
386
387
388

389


390
391






392

393
394
395
396


397
398
399
400



401
402



403



404
405




406
407
408







409



410
411

412
413

414








415





416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433

434
435
436
437
438
439

440
441
442
443
444

445
446
447

448
449



450


451











































































452
453
454

455
456
457

458





459
460
461
462









463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478

479
480
481
482
483
484
485







-
-
+
+
-
-
-
-
-
+
+
+
+
-
-
-
-
+
+
-
-
-
-
+
-
-
+
-
-
-











-
+






-
+
-
-
+
+
-
-
-
-
-
-
+
-
+

+
+
-
-
+
+
+
+
-
-
-
+
+
-
-
-
+
-
-
-
+
+
-
-
-
-
+
+
+
-
-
-
-
-
-
-
+
-
-
-
+

-
+

-
+
-
-
-
-
-
-
-
-
+
-
-
-
-
-


















-






-
+
+



-
+


-
+

-
-
-
+
-
-

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-



-
+


-
+
-
-
-
-
-
+
+
+
+
-
-
-
-
-
-
-
-
-
















-







                sEntry = sWord + "\t" + self.funcStemming(sWord, self.lArcVal[nVal])
                for nMorphVal, _ in oNextNode.arcs.items():
                    if not zPattern or zPattern.search(self.lArcVal[nMorphVal]):
                        yield sEntry + "\t" + self.lArcVal[nMorphVal]


    # BINARY CONVERSION
    def _calculateBinary (self, nCompressionMethod):
        print(" > Write DAWG as an indexable binary dictionary [method: %d]" % nCompressionMethod)
    def _calculateBinary (self):
        print(" > Write DAWG as an indexable binary dictionary")
        if nCompressionMethod == 1:
            self.nBytesArc = ( (self.nArcVal.bit_length() + 2) // 8 ) + 1   # We add 2 bits. See DawgNode.convToBytes1()
            self.nBytesOffset = 0
            self._calcNumBytesNodeAddress()
            self._calcNodesAddress1()
        self.nBytesArc = ( (self.nArcVal.bit_length() + 2) // 8 ) + 1   # We add 2 bits. See DawgNode.convToBytes()
        self.nBytesOffset = 0
        self._calcNumBytesNodeAddress()
        self._calcNodesAddress()
        elif nCompressionMethod == 2:
            self.nBytesArc = ( (self.nArcVal.bit_length() + 3) // 8 ) + 1   # We add 3 bits. See DawgNode.convToBytes2()
            self.nBytesOffset = 0
            self._calcNumBytesNodeAddress()
        self.byDic = b""
        self.byDic = self.oRoot.convToBytes(self.nBytesArc, self.nBytesNodeAddress)
            self._calcNodesAddress2()
        elif nCompressionMethod == 3:
            self.nBytesArc = ( (self.nArcVal.bit_length() + 3) // 8 ) + 1   # We add 3 bits. See DawgNode.convToBytes3()
            self.nBytesOffset = 1
        for oNode in self.lMinimizedNodes:
            self.nMaxOffset = (2 ** (self.nBytesOffset * 8)) - 1
            self._calcNumBytesNodeAddress()
            self.byDic += oNode.convToBytes(self.nBytesArc, self.nBytesNodeAddress)
            self._calcNodesAddress3()
        else:
            print(" # Error: unknown compression method")
        print("   Arc values (chars, affixes and tags): {}  ->  {} bytes".format( self.nArcVal, len("\t".join(self.lArcVal).encode("utf-8")) ))
        print("   Arc size: {} bytes, Address size: {} bytes   ->   {} * {} = {} bytes".format( self.nBytesArc, self.nBytesNodeAddress, \
                                                                                                self.nBytesArc+self.nBytesNodeAddress, self.nArc, \
                                                                                                (self.nBytesArc+self.nBytesNodeAddress)*self.nArc ))

    def _calcNumBytesNodeAddress (self):
        "how many bytes needed to store all nodes/arcs in the binary dictionary"
        self.nBytesNodeAddress = 1
        while ((self.nBytesArc + self.nBytesNodeAddress) * self.nArc) > (2 ** (self.nBytesNodeAddress * 8)):
            self.nBytesNodeAddress += 1

    def _calcNodesAddress1 (self):
    def _calcNodesAddress (self):
        nBytesNode = self.nBytesArc + self.nBytesNodeAddress
        iAddr = len(self.oRoot.arcs) * nBytesNode
        for oNode in self.lMinimizedNodes:
            oNode.addr = iAddr
            iAddr += max(len(oNode.arcs), 1) * nBytesNode

    def _calcNodesAddress2 (self):
    def _binaryToList (self):
        nBytesNode = self.nBytesArc + self.nBytesNodeAddress
        iAddr = len(self.oRoot.arcs) * nBytesNode
        """
        Convert binary string to binary list
        for oNode in self.lSortedNodes:
            oNode.addr = iAddr
            iAddr += max(len(oNode.arcs), 1) * nBytesNode
            for oNextNode in oNode.arcs.values():
                if (oNode.pos + 1) == oNextNode.pos:
                    iAddr -= self.nBytesNodeAddress
        BEFORE: Arc                 Address                                 Arc                 Address
                    #break
                ||||||||| ||||||||| ||||||||| ||||||||| ||||||||| ||||||||| ||||||||| ||||||||| ||||||||| ||||||||| ||||||||| ||||||||| ...

        AFTER:  list of integers: [ arc, address, arc, address, arc, address, ... arc, address ]
        """
    def _calcNodesAddress3 (self):
        nBytesNode = self.nBytesArc + self.nBytesNodeAddress
        self.lByDic = []
        nAcc = 0
        byBuffer = b""
        nDivisor = (self.nBytesArc + self.nBytesNodeAddress) / 2
        # theorical nodes size if only addresses and no offset
        self.oRoot.size = len(self.oRoot.arcs) * nBytesNode
        for oNode in self.lSortedNodes:
        for i in range(0, len(self.byDic)):
            byBuffer += self.byDic[i:i+1]
            oNode.size = max(len(oNode.arcs), 1) * nBytesNode
        # rewind and calculate dropdown from the end, several times
        nDiff = self.nBytesNodeAddress - self.nBytesOffset
            if nAcc == (self.nBytesArc - 1):
        bEnd = False
        while not bEnd:
            bEnd = True
                self.lByDic.append(int.from_bytes(byBuffer, byteorder="big"))
                byBuffer = b""
            # recalculate addresses
            iAddr = self.oRoot.size
            for oNode in self.lSortedNodes:
                oNode.addr = iAddr
            elif nAcc == (self.nBytesArc + self.nBytesNodeAddress - 1):
                self.lByDic.append(round(int.from_bytes(byBuffer, byteorder="big") / nDivisor))
                byBuffer = b""
                iAddr += oNode.size
            # rewind and calculate dropdown from the end, several times
            for i in range(self.nNode-1, -1, -1):
                nSize = max(len(self.lSortedNodes[i].arcs), 1) * nBytesNode
                for oNextNode in self.lSortedNodes[i].arcs.values():
                    if 1 < (oNextNode.addr - self.lSortedNodes[i].addr) < self.nMaxOffset:
                        nSize -= nDiff
                nAcc = -1
                if self.lSortedNodes[i].size != nSize:
                    self.lSortedNodes[i].size = nSize
                    bEnd = False
            nAcc = nAcc + 1

    def getBinaryAsJSON (self, nCompressionMethod=1, bBinaryDictAsHexString=True):
    def getBinaryAsJSON (self):
        "return a JSON string containing all necessary data of the dictionary (compressed as a binary string)"
        self._calculateBinary(nCompressionMethod)
        self._calculateBinary()
        byDic = b""
        if nCompressionMethod == 1:
            byDic = self.oRoot.convToBytes1(self.nBytesArc, self.nBytesNodeAddress)
            for oNode in self.lMinimizedNodes:
                byDic += oNode.convToBytes1(self.nBytesArc, self.nBytesNodeAddress)
        elif nCompressionMethod == 2:
            byDic = self.oRoot.convToBytes2(self.nBytesArc, self.nBytesNodeAddress)
            for oNode in self.lSortedNodes:
        self._binaryToList()
                byDic += oNode.convToBytes2(self.nBytesArc, self.nBytesNodeAddress)
        elif nCompressionMethod == 3:
            byDic = self.oRoot.convToBytes3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset)
            for oNode in self.lSortedNodes:
                byDic += oNode.convToBytes3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset)
        return {
            "sHeader": "/grammalecte-fsa/",
            "sLangCode": self.sLangCode,
            "sLangName": self.sLangName,
            "sDicName": self.sDicName,
            "sDescription": self.sDescription,
            "sFileName": self.sFileName,
            "sDate": self._getDate(),
            "nEntry": self.nEntry,
            "nChar": self.nChar,
            "nAff": self.nAff,
            "nTag": self.nTag,
            "cStemming": self.cStemming,
            "dChar": self.dChar,
            "nNode": self.nNode,
            "nArc": self.nArc,
            "nArcVal": self.nArcVal,
            "lArcVal": self.lArcVal,
            "nCompressionMethod": nCompressionMethod,
            "nBytesArc": self.nBytesArc,
            "nBytesNodeAddress": self.nBytesNodeAddress,
            "nBytesOffset": self.nBytesOffset,
            # Mozilla’s JS parser don’t like file bigger than 4 Mb!
            # So, if necessary, we use an hexadecimal string, that we will convert later in Firefox’s extension.
            # https://github.com/mozilla/addons-linter/issues/1361
            "sByDic": byDic.hex()  if bBinaryDictAsHexString  else [ e  for e in byDic ],
            #"sByDic": self.byDic.hex(),
            "lByDic": self.lByDic,
            "l2grams": list(self.a2grams)
        }

    def writeAsJSObject (self, spfDst, nCompressionMethod, bInJSModule=False, bBinaryDictAsHexString=True):
    def writeAsJSObject (self, spfDst):
        "write a file (JSON or JS module) with all the necessary data"
        if not spfDst.endswith(".json"):
            spfDst += "."+str(nCompressionMethod)+".json"
            spfDst += ".json"
        with open(spfDst, "w", encoding="utf-8", newline="\n") as hDst:
            if bInJSModule:
                hDst.write('// JavaScript\n// Generated data (do not edit)\n\n"use strict";\n\nconst dictionary = ')
            hDst.write( json.dumps(self.getBinaryAsJSON(nCompressionMethod, bBinaryDictAsHexString), ensure_ascii=False) )
            hDst.write( json.dumps(self.getBinaryAsJSON(), ensure_ascii=False) )
            if bInJSModule:
                hDst.write(";\n\nexports.dictionary = dictionary;\n")

    def writeBinary (self, sPathFile, nCompressionMethod, bDebug=False):
        """
        Save as a binary file.

        Format of the binary indexable dictionary:
        Each section is separated with 4 bytes of \0

        - Section Header:
            /grammalecte-fsa/[compression method]
                * compression method is an ASCII string

        - Section Informations:
            /[lang code]
            /[lang name]
            /[dictionary name]
            /[date creation]
            /[number of chars]
            /[number of bytes for each arc]
            /[number of bytes for each address node]
            /[number of entries]
            /[number of nodes]
            /[number of arcs]
            /[number of affixes]
                * each field is a ASCII string
            /[stemming code]
                * "S" means stems are generated by /suffix_code/,
                  "A" means they are generated by /affix_code/
                  See defineSuffixCode() and defineAffixCode() for details.
                  "N" means no stemming

        - Section Values:
                * a list of strings encoded in binary from utf-8, each value separated with a tabulation

        - Section Word Graph (nodes / arcs)
                * A list of nodes which are a list of arcs with an address of the next node.
                  See DawgNode.convToBytes() for details.

        - Section 2grams:
                * A list of 2grams (as strings: 2 chars) encoded in binary from utf-8, each value separated with a tabulation
        """
        self._calculateBinary(nCompressionMethod)
        if not sPathFile.endswith(".bdic"):
            sPathFile += "."+str(nCompressionMethod)+".bdic"
        with open(sPathFile, 'wb') as hDst:
            # header
            hDst.write("/grammalecte-fsa/{}/".format(nCompressionMethod).encode("utf-8"))
            hDst.write(b"\0\0\0\0")
            # infos
            sInfo = "{}//{}//{}//{}//{}//{}//{}//{}//{}//{}//{}//{}//{}".format(self.sLangCode, self.sLangName, self.sDicName, self.sDescription, self._getDate(), \
                                                                                self.nChar, self.nBytesArc, self.nBytesNodeAddress, \
                                                                                self.nEntry, self.nNode, self.nArc, self.nAff, self.cStemming)
            hDst.write(sInfo.encode("utf-8"))
            hDst.write(b"\0\0\0\0")
            # lArcVal
            hDst.write("\t".join(self.lArcVal).encode("utf-8"))
            hDst.write(b"\0\0\0\0")
            # 2grams
            hDst.write("\t".join(self.a2grams).encode("utf-8"))
            hDst.write(b"\0\0\0\0")
            # DAWG: nodes / arcs
            if nCompressionMethod == 1:
                hDst.write(self.oRoot.convToBytes1(self.nBytesArc, self.nBytesNodeAddress))
                for oNode in self.lMinimizedNodes:
                    hDst.write(oNode.convToBytes1(self.nBytesArc, self.nBytesNodeAddress))
            elif nCompressionMethod == 2:
                hDst.write(self.oRoot.convToBytes2(self.nBytesArc, self.nBytesNodeAddress))
                for oNode in self.lSortedNodes:
                    hDst.write(oNode.convToBytes2(self.nBytesArc, self.nBytesNodeAddress))
            elif nCompressionMethod == 3:
                hDst.write(self.oRoot.convToBytes3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset))
                for oNode in self.lSortedNodes:
                    hDst.write(oNode.convToBytes3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset))
        if bDebug:
            self._writeNodes(sPathFile, nCompressionMethod)

    def _getDate (self):
        return time.strftime("%Y-%m-%d %H:%M:%S")

    def _writeNodes (self, sPathFile, nCompressionMethod):
    def _writeNodes (self, sPathFile):
        "for debugging only"
        print(" > Write nodes")
        with open(sPathFile+".nodes."+str(nCompressionMethod)+".txt", 'w', encoding='utf-8', newline="\n") as hDst:
        with open(sPathFile+".nodes.txt", 'w', encoding='utf-8', newline="\n") as hDst:
            if nCompressionMethod == 1:
                hDst.write(self.oRoot.getTxtRepr1(self.nBytesArc, self.lArcVal)+"\n")
                #hDst.write( ''.join( [ "%02X " %  z  for z in self.oRoot.convToBytes1(self.nBytesArc, self.nBytesNodeAddress) ] ).strip() )
                for oNode in self.lMinimizedNodes:
                    hDst.write(oNode.getTxtRepr1(self.nBytesArc, self.lArcVal)+"\n")
            hDst.write(self.oRoot.getTxtRepr(self.nBytesArc, self.lArcVal)+"\n")
            #hDst.write( ''.join( [ "%02X " %  z  for z in self.oRoot.convToBytes(self.nBytesArc, self.nBytesNodeAddress) ] ).strip() )
            for oNode in self.lMinimizedNodes:
                hDst.write(oNode.getTxtRepr(self.nBytesArc, self.lArcVal)+"\n")
            if nCompressionMethod == 2:
                hDst.write(self.oRoot.getTxtRepr2(self.nBytesArc, self.lArcVal)+"\n")
                for oNode in self.lSortedNodes:
                    hDst.write(oNode.getTxtRepr2(self.nBytesArc, self.lArcVal)+"\n")
            if nCompressionMethod == 3:
                hDst.write(self.oRoot.getTxtRepr3(self.nBytesArc, self.nBytesOffset, self.lArcVal)+"\n")
                #hDst.write( ''.join( [ "%02X " %  z  for z in self.oRoot.convToBytes3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset) ] ).strip() )
                for oNode in self.lSortedNodes:
                    hDst.write(oNode.getTxtRepr3(self.nBytesArc, self.nBytesOffset, self.lArcVal)+"\n")



class DawgNode:
    """Node of the word graph"""

    NextId = 0
    NextPos = 1 # (version 2)

    def __init__ (self):
        self.i = DawgNode.NextId
        DawgNode.NextId += 1
        self.final = False
        self.arcs = {}          # key: arc value; value: a node
        self.addr = 0           # address in the binary dictionary
        self.pos = 0            # position in the binary dictionary (version 2)
        self.size = 0           # size of node in bytes (version 3)

    @classmethod
    def resetNextId (cls):
        "set NextId to 0 "
        cls.NextId = 0

    def setPos (self): # version 2
644
645
646
647
648
649
650
651

652
653
654
655
656
657
658
518
519
520
521
522
523
524

525
526
527
528
529
530
531
532







-
+







        self.arcs = collections.OrderedDict(sorted(self.arcs.items(), key=lambda t: dValOccur.get(t[0], 0), reverse=True))

    def sortArcs2 (self, dValOccur, lArcVal):
        "sort arcs of each node depending on the previous char"
        self.arcs = collections.OrderedDict(sorted(self.arcs.items(), key=lambda t: dValOccur.get(lArcVal[t[0]], 0), reverse=True))

    # VERSION 1 =====================================================================================================
    def convToBytes1 (self, nBytesArc, nBytesNodeAddress):
    def convToBytes (self, nBytesArc, nBytesNodeAddress):
        """
        Convert to bytes (method 1).

        Node scheme:
        - Arc length is defined by nBytesArc
        - Address length is defined by nBytesNodeAddress

686
687
688
689
690
691
692
693

694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
560
561
562
563
564
565
566

567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583























































































































































584
585
586
587
588
589
590







-
+
















-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-







                val = val | nFinalNodeMask
            if i == nArc:
                val = val | nFinalArcMask
            by += val.to_bytes(nBytesArc, byteorder='big')
            by += self.arcs[arc].addr.to_bytes(nBytesNodeAddress, byteorder='big')
        return by

    def getTxtRepr1 (self, nBytesArc, lVal):
    def getTxtRepr (self, nBytesArc, lVal):
        "return representation as string of node (method 1)"
        nArc = len(self.arcs)
        nFinalNodeMask = 1 << ((nBytesArc*8)-1)
        nFinalArcMask = 1 << ((nBytesArc*8)-2)
        s = "i{:_>10} -- #{:_>10}\n".format(self.i, self.addr)
        if not nArc:
            s += "  {:<20}  {:0>16}  i{:_>10}   #{:_>10}\n".format("", bin(nFinalNodeMask | nFinalArcMask)[2:], "0", "0")
            return s
        for i, arc in enumerate(self.arcs, 1):
            val = arc
            if i == 1 and self.final:
                val = val | nFinalNodeMask
            if i == nArc:
                val = val | nFinalArcMask
            s += "  {:<20}  {:0>16}  i{:_>10}   #{:_>10}\n".format(lVal[arc], bin(val)[2:], self.arcs[arc].i, self.arcs[arc].addr)
        return s

    # VERSION 2 =====================================================================================================
    def convToBytes2 (self, nBytesArc, nBytesNodeAddress):
        """
        Convert to bytes (method 2).

        Node scheme:
        - Arc length is defined by nBytesArc
        - Address length is defined by nBytesNodeAddress

        |                Arc                |                         Address of next node                          |
        |                                   |                                                                       |
         ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓
         ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃
         ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛
         [...]
         ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓
         ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃
         ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛
          ^ ^ ^
          ┃ ┃ ┃
          ┃ ┃ ┗━━ if 1, caution, no address: next node is the following node
          ┃ ┗━━━━ if 1, last arc of this node
          ┗━━━━━━ if 1, this node is final (only on the first arc)
        """
        nArc = len(self.arcs)
        nFinalNodeMask = 1 << ((nBytesArc*8)-1)
        nFinalArcMask = 1 << ((nBytesArc*8)-2)
        nNextNodeMask = 1 << ((nBytesArc*8)-3)
        if not nArc:
            val = nFinalNodeMask | nFinalArcMask
            by = val.to_bytes(nBytesArc, byteorder='big')
            by += (0).to_bytes(nBytesNodeAddress, byteorder='big')
            return by
        by = b""
        for i, arc in enumerate(self.arcs, 1):
            val = arc
            if i == 1 and self.final:
                val = val | nFinalNodeMask
            if i == nArc:
                val = val | nFinalArcMask
            if (self.pos + 1) == self.arcs[arc].pos and self.i != 0:
                val = val | nNextNodeMask
                by += val.to_bytes(nBytesArc, byteorder='big')
            else:
                by += val.to_bytes(nBytesArc, byteorder='big')
                by += self.arcs[arc].addr.to_bytes(nBytesNodeAddress, byteorder='big')
        return by

    def getTxtRepr2 (self, nBytesArc, lVal):
        "return representation as string of node (method 2)"
        nArc = len(self.arcs)
        nFinalNodeMask = 1 << ((nBytesArc*8)-1)
        nFinalArcMask = 1 << ((nBytesArc*8)-2)
        nNextNodeMask = 1 << ((nBytesArc*8)-3)
        s = "i{:_>10} -- #{:_>10}\n".format(self.i, self.addr)
        if not nArc:
            s += "  {:<20}  {:0>16}  i{:_>10}   #{:_>10}\n".format("", bin(nFinalNodeMask | nFinalArcMask)[2:], "0", "0")
            return s
        for i, arc in enumerate(self.arcs, 1):
            val = arc
            if i == 1 and self.final:
                val = val | nFinalNodeMask
            if i == nArc:
                val = val | nFinalArcMask
            if (self.pos + 1) == self.arcs[arc].pos  and self.i != 0:
                val = val | nNextNodeMask
                s += "  {:<20}  {:0>16}\n".format(lVal[arc], bin(val)[2:])
            else:
                s += "  {:<20}  {:0>16}  i{:_>10}   #{:_>10}\n".format(lVal[arc], bin(val)[2:], self.arcs[arc].i, self.arcs[arc].addr)
        return s

    # VERSION 3 =====================================================================================================
    def convToBytes3 (self, nBytesArc, nBytesNodeAddress, nBytesOffset):
        """
        Convert to bytes (method 3).

        Node scheme:
        - Arc length is defined by nBytesArc
        - Address length is defined by nBytesNodeAddress
        - Offset length is defined by nBytesOffset

        |                Arc                |            Address of next node  or  offset to next node              |
        |                                   |                                                                       |
         ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓
         ┃1┃0┃0┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃
         ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛
         [...]
         ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓
         ┃0┃0┃1┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃     Offsets are shorter than addresses
         ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛
         ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓
         ┃0┃1┃0┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃
         ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛

          ^ ^ ^
          ┃ ┃ ┃
          ┃ ┃ ┗━━ if 1, offset instead of address of next node
          ┃ ┗━━━━ if 1, last arc of this node
          ┗━━━━━━ if 1, this node is final (only on the first arc)
        """
        nArc = len(self.arcs)
        nFinalNodeMask = 1 << ((nBytesArc*8)-1)
        nFinalArcMask = 1 << ((nBytesArc*8)-2)
        nNextNodeMask = 1 << ((nBytesArc*8)-3)
        nMaxOffset = (2 ** (nBytesOffset * 8)) - 1
        if not nArc:
            val = nFinalNodeMask | nFinalArcMask
            by = val.to_bytes(nBytesArc, byteorder='big')
            by += (0).to_bytes(nBytesNodeAddress, byteorder='big')
            return by
        by = b""
        for i, arc in enumerate(self.arcs, 1):
            val = arc
            if i == 1 and self.final:
                val = val | nFinalNodeMask
            if i == nArc:
                val = val | nFinalArcMask
            if 1 < (self.arcs[arc].addr - self.addr) < nMaxOffset and self.i != 0:
                val = val | nNextNodeMask
                by += val.to_bytes(nBytesArc, byteorder='big')
                by += (self.arcs[arc].addr-self.addr).to_bytes(nBytesOffset, byteorder='big')
            else:
                by += val.to_bytes(nBytesArc, byteorder='big')
                by += self.arcs[arc].addr.to_bytes(nBytesNodeAddress, byteorder='big')
        return by

    def getTxtRepr3 (self, nBytesArc, nBytesOffset, lVal):
        "return representation as string of node (method 3)"
        nArc = len(self.arcs)
        nFinalNodeMask = 1 << ((nBytesArc*8)-1)
        nFinalArcMask = 1 << ((nBytesArc*8)-2)
        nNextNodeMask = 1 << ((nBytesArc*8)-3)
        nMaxOffset = (2 ** (nBytesOffset * 8)) - 1
        s = "i{:_>10} -- #{:_>10}  ({})\n".format(self.i, self.addr, self.size)
        if not nArc:
            s += "  {:<20}  {:0>16}  i{:_>10}   #{:_>10}\n".format("", bin(nFinalNodeMask | nFinalArcMask)[2:], "0", "0")
            return s
        for i, arc in enumerate(self.arcs, 1):
            val = arc
            if i == 1 and self.final:
                val = val | nFinalNodeMask
            if i == nArc:
                val = val | nFinalArcMask
            if 1 < (self.arcs[arc].addr - self.addr) < nMaxOffset and self.i != 0:
                val = val | nNextNodeMask
                s += "  {:<20}  {:0>16}  i{:_>10}   +{:_>10}\n".format(lVal[arc], bin(val)[2:], self.arcs[arc].i, self.arcs[arc].addr - self.addr)
            else:
                s += "  {:<20}  {:0>16}  i{:_>10}   #{:_>10}\n".format(lVal[arc], bin(val)[2:], self.arcs[arc].i, self.arcs[arc].addr)
        return s



# Another attempt to sort node arcs

_dCharOrder = {
    # key: previous char, value: dictionary of chars {c: nValue}
    "": {}

Modified graphspell/ibdawg.py from [2cf8f6a51c] to [15700e29f3].

110
111
112
113
114
115
116
117
118


119
120

121
122
123
124
125


126
127
128


129
130



131


132
133
134
135
136







137
138
139
140
141
142
143
144
145
146
147
148











149
150
151
152
153
154
155
110
111
112
113
114
115
116


117
118
119

120





121
122



123
124
125

126
127
128
129
130
131





132
133
134
135
136
137
138












139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156







-
-
+
+

-
+
-
-
-
-
-
+
+
-
-
-
+
+

-
+
+
+

+
+
-
-
-
-
-
+
+
+
+
+
+
+
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+









class IBDAWG:
    """INDEXABLE BINARY DIRECT ACYCLIC WORD GRAPH"""

    def __init__ (self, source):
        if isinstance(source, str):
            self.by = pkgutil.get_data(__package__, "_dictionaries/" + source)
            if not self.by:
            by = pkgutil.get_data(__package__, "_dictionaries/" + source)
            if not by:
                raise OSError("# Error. File not found or not loadable: "+source)

            self.sFileName = source
            if source.endswith(".bdic"):
                self._initBinary()
            elif source.endswith(".json"):
                self._initJSON(json.loads(self.by.decode("utf-8")))     #json.loads(self.by)    # In Python 3.6, can read directly binary strings
            else:
            oData = json.loads(by.decode("utf-8"))     #json.loads(by)    # In Python 3.6, can read directly binary strings
        else:
                raise OSError("# Error. Unknown file type: "+source)
        else:
            self._initJSON(source)
            self.sFileName = "[None]"
            oData = source

        self.sFileName = source  if isinstance(source, str)  else "[None]"
        self.__dict__.update(oData)
        self.dCharVal = { v: k  for k, v in self.dChar.items() }
        self.a2grams = set(getattr(self, 'l2grams'))  if hasattr(self, 'l2grams')  else None

        if "lByDic" not in oData:
            print(">>>> lByDic not in oData")
        # Performance trick:
        #     Instead of converting bytes to integers each times we parse the binary dictionary,
        #     we do it once, then parse the array
        nAcc = 0
        byBuffer = b""
            if "sByDic" not in oData:
                raise TypeError("# Error. No usable data in the dictionary.")
            # old dictionary version
            self.lByDic = []
            self.byDic = binascii.unhexlify(oData["sByDic"])
            nAcc = 0
            byBuffer = b""
        self.lByDic = []
        nDivisor = (self.nBytesArc + self.nBytesNodeAddress) / 2
        for i in range(0, len(self.byDic)):
            byBuffer += self.byDic[i:i+1]
            if nAcc == (self.nBytesArc - 1):
                self.lByDic.append(int.from_bytes(byBuffer, byteorder="big"))
                byBuffer = b""
            elif nAcc == (self.nBytesArc + self.nBytesNodeAddress - 1):
                self.lByDic.append(round(int.from_bytes(byBuffer, byteorder="big") / nDivisor))
                byBuffer = b""
                nAcc = -1
            nAcc = nAcc + 1
            nDivisor = (self.nBytesArc + self.nBytesNodeAddress) / 2
            for i in range(0, len(self.byDic)):
                byBuffer += self.byDic[i:i+1]
                if nAcc == (self.nBytesArc - 1):
                    self.lByDic.append(int.from_bytes(byBuffer, byteorder="big"))
                    byBuffer = b""
                elif nAcc == (self.nBytesArc + self.nBytesNodeAddress - 1):
                    self.lByDic.append(round(int.from_bytes(byBuffer, byteorder="big") / nDivisor))
                    byBuffer = b""
                    nAcc = -1
                nAcc = nAcc + 1

        # masks
        self._arcMask = (2 ** ((self.nBytesArc * 8) - 3)) - 1
        self._finalNodeMask = 1 << ((self.nBytesArc * 8) - 1)
        self._lastArcMask = 1 << ((self.nBytesArc * 8) - 2)

        # function to decode the affix/suffix code
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224

225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
167
168
169
170
171
172
173
















































174
175
176

177
178
179
180
181



































182
183
184
185
186
187
188







-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-



-
+




-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-







        # lexicographer module ?
        self.lexicographer = None
        try:
            self.lexicographer = importlib.import_module(".lexgraph_"+self.sLangCode, "grammalecte.graphspell")
        except ImportError:
            print("# No module <graphspell.lexgraph_"+self.sLangCode+".py>")


    def _initBinary (self):
        "initialize with binary structure file"
        if self.by[0:17] != b"/grammalecte-fsa/":
            raise TypeError("# Error. Not a grammalecte-fsa binary dictionary. Header: {}".format(self.by[0:9]))
        if not(self.by[17:18] == b"1" or self.by[17:18] == b"2" or self.by[17:18] == b"3"):
            raise ValueError("# Error. Unknown dictionary version: {}".format(self.by[17:18]))
        try:
            byHeader, byInfo, byValues, by2grams, byDic = self.by.split(b"\0\0\0\0", 4)
        except Exception:
            raise Exception

        self.nCompressionMethod = int(self.by[17:18].decode("utf-8"))
        self.sHeader = byHeader.decode("utf-8")
        self.lArcVal = byValues.decode("utf-8").split("\t")
        self.nArcVal = len(self.lArcVal)
        self.byDic = byDic
        self.a2grams = set(by2grams.decode("utf-8").split("\t"))

        l = byInfo.decode("utf-8").split("//")
        self.sLangCode = l.pop(0)
        self.sLangName = l.pop(0)
        self.sDicName = l.pop(0)
        self.sDescription = l.pop(0)
        self.sDate = l.pop(0)
        self.nChar = int(l.pop(0))
        self.nBytesArc = int(l.pop(0))
        self.nBytesNodeAddress = int(l.pop(0))
        self.nEntry = int(l.pop(0))
        self.nNode = int(l.pop(0))
        self.nArc = int(l.pop(0))
        self.nAff = int(l.pop(0))
        self.cStemming = l.pop(0)
        self.nTag = self.nArcVal - self.nChar - self.nAff
        # <dChar> to get the value of an arc, <dCharVal> to get the char of an arc with its value
        self.dChar = {}
        for i in range(1, self.nChar+1):
            self.dChar[self.lArcVal[i]] = i
        self.dCharVal = { v: k  for k, v in self.dChar.items() }

    def _initJSON (self, oJSON):
        "initialize with a JSON text file"
        self.sByDic = ""  # init to prevent pylint whining
        self.__dict__.update(oJSON)
        self.byDic = binascii.unhexlify(self.sByDic)
        self.dCharVal = { v: k  for k, v in self.dChar.items() }
        self.a2grams = set(getattr(self, 'l2grams'))  if hasattr(self, 'l2grams')  else None

    def getInfo (self):
        "return string about the IBDAWG"
        return  "  Language: {0.sLangName}   Lang code: {0.sLangCode}   Dictionary name: {0.sDicName}" \
                "  Compression method: {0.nCompressionMethod:>2}   Date: {0.sDate}   Stemming: {0.cStemming}FX\n" \
                "  Date: {0.sDate}   Stemming: {0.cStemming}FX\n" \
                "  Arcs values:  {0.nArcVal:>10,} = {0.nChar:>5,} characters,  {0.nAff:>6,} affixes,  {0.nTag:>6,} tags\n" \
                "  Dictionary: {0.nEntry:>12,} entries,    {0.nNode:>11,} nodes,   {0.nArc:>11,} arcs\n" \
                "  Address size: {0.nBytesNodeAddress:>1} bytes,  Arc size: {0.nBytesArc:>1} bytes\n".format(self)

    def writeAsJSObject (self, spfDest, bInJSModule=False, bBinaryDictAsHexString=False):
        "write IBDAWG as a JavaScript object in a JavaScript module"
        with open(spfDest, "w", encoding="utf-8", newline="\n") as hDst:
            if bInJSModule:
                hDst.write('// JavaScript\n// Generated data (do not edit)\n\n"use strict";\n\nconst dictionary = ')
            hDst.write(json.dumps({
                "sHeader": "/grammalecte-fsa/",
                "sLangCode": self.sLangCode,
                "sLangName": self.sLangName,
                "sDicName": self.sDicName,
                "sDescription": self.sDescription,
                "sFileName": self.sFileName,
                "sDate": self.sDate,
                "nEntry": self.nEntry,
                "nChar": self.nChar,
                "nAff": self.nAff,
                "nTag": self.nTag,
                "cStemming": self.cStemming,
                "dChar": self.dChar,
                "nNode": self.nNode,
                "nArc": self.nArc,
                "nArcVal": self.nArcVal,
                "lArcVal": self.lArcVal,
                "nCompressionMethod": self.nCompressionMethod,
                "nBytesArc": self.nBytesArc,
                "nBytesNodeAddress": self.nBytesNodeAddress,
                # JavaScript is a pile of shit, so Mozilla’s JS parser don’t like file bigger than 4 Mb!
                # So, if necessary, we use an hexadecimal string, that we will convert later in Firefox’s extension.
                # https://github.com/mozilla/addons-linter/issues/1361
                "sByDic": self.byDic.hex()  if bBinaryDictAsHexString  else [ e  for e in self.byDic ],
                "l2grams": list(self.a2grams)
            }, ensure_ascii=False))
            if bInJSModule:
                hDst.write(";\n\nexports.dictionary = dictionary;\n")

    def isValidToken (self, sToken):
        "checks if <sToken> is valid (if there is hyphens in <sToken>, <sToken> is split, each part is checked)"
        sToken = st.spellingNormalization(sToken)
        if self.isValid(sToken):
            return True
        if "-" in sToken:
            if sToken.count("-") > 4:

Modified graphspell/spellchecker.py from [2bdbe76996] to [9b47d651ea].

12
13
14
15
16
17
18
19
20


21
22
23
24
25
26
27
12
13
14
15
16
17
18


19
20
21
22
23
24
25
26
27







-
-
+
+







import traceback

from . import ibdawg
from . import tokenizer


dDefaultDictionaries = {
    "fr": "fr-allvars.bdic",
    "en": "en.bdic"
    "fr": "fr-allvars.json",
    "en": "en.json"
}


class SpellChecker ():
    "SpellChecker: wrapper for the IBDAWG class"

    def __init__ (self, sLangCode, sfMainDic="", sfCommunityDic="", sfPersonalDic=""):

Modified lex_build.py from [0d00b07703] to [5bdf726eee].

1
2
3
4
5
6
7
8
9
10
11
12
13
14

15
16
17
18

19
20

21
22
23

24
25
26
27
28
29
30
1
2
3
4
5
6
7
8
9
10
11
12
13

14
15
16
17

18


19
20


21
22
23
24
25
26
27
28













-
+



-
+
-
-
+

-
-
+







#!python3

"""
Lexicon builder
"""

import argparse
from distutils import dir_util

import graphspell.dawg as fsa
from graphspell.ibdawg import IBDAWG


def build (spfSrc, sLangCode, sLangName, sfDict, bJSON=False, sDicName="", sDescription="", sFilter="", cStemmingMethod="S", nCompressMethod=1):
def build (spfSrc, sLangCode, sLangName, sfDict, bJavaScript=False, sDicName="", sDescription="", sFilter="", cStemmingMethod="S", nCompressMethod=1):
    "transform a text lexicon as a binary indexable dictionary"
    oDAWG = fsa.DAWG(spfSrc, cStemmingMethod, sLangCode, sLangName, sDicName, sDescription, sFilter)
    dir_util.mkpath("graphspell/_dictionaries")
    oDAWG.writeInfo("graphspell/_dictionaries/" + sfDict + ".info.txt")
    oDAWG.writeAsJSObject("graphspell/_dictionaries/" + sfDict + ".json")
    oDAWG.writeBinary("graphspell/_dictionaries/" + sfDict + ".bdic", int(nCompressMethod))
    if bJSON:
    if bJavaScript:
        dir_util.mkpath("graphspell-js/_dictionaries")
        oDic = IBDAWG(sfDict + ".bdic")
        oDic.writeAsJSObject("graphspell-js/_dictionaries/" + sfDict + ".json", bBinaryDictAsHexString=True)
        oDAWG.writeAsJSObject("graphspell-js/_dictionaries/" + sfDict + ".json")


def main ():
    "parse args from CLI"
    xParser = argparse.ArgumentParser()
    xParser.add_argument("src_lexicon", type=str, help="path and file name of the source lexicon")
    xParser.add_argument("lang_code", type=str, help="language code")

Modified make.py from [f59af684eb] to [a76be310e9].

313
314
315
316
317
318
319
320

321
322
323
324
325
326

327
328
329
330
331

332
333
334
335
336
337
338
313
314
315
316
317
318
319

320
321
322
323
324
325

326
327
328
329
330

331
332
333
334
335
336
337
338







-
+





-
+




-
+







    dVars["dic_personal_filename_js"] = ""
    lDict = [ ("main", s)  for s in dVars['dic_filenames'].split(",") ]
    if bCommunityDict:
        lDict.append(("community", dVars['dic_community_filename']))
    if bPersonalDict:
        lDict.append(("personal", dVars['dic_personal_filename']))
    for sType, sFileName in lDict:
        spfPyDic = f"graphspell/_dictionaries/{sFileName}.bdic"
        spfPyDic = f"graphspell/_dictionaries/{sFileName}.json"
        spfJSDic = f"graphspell-js/_dictionaries/{sFileName}.json"
        if not os.path.isfile(spfPyDic) or (bJavaScript and not os.path.isfile(spfJSDic)):
            buildDictionary(dVars, sType, bJavaScript)
        print("  +", spfPyDic)
        file_util.copy_file(spfPyDic, "grammalecte/graphspell/_dictionaries")
        dVars['dic_'+sType+'_filename_py'] = sFileName + '.bdic'
        dVars['dic_'+sType+'_filename_py'] = sFileName + '.json'
        if bJavaScript:
            print("  +", spfJSDic)
            file_util.copy_file(spfJSDic, "grammalecte-js/graphspell/_dictionaries")
            dVars['dic_'+sType+'_filename_js'] = sFileName + '.json'
    dVars['dic_main_filename_py'] = dVars['dic_default_filename_py'] + ".bdic"
    dVars['dic_main_filename_py'] = dVars['dic_default_filename_py'] + ".json"
    dVars['dic_main_filename_js'] = dVars['dic_default_filename_js'] + ".json"


def buildDictionary (dVars, sType, bJavaScript=False):
    "build binary dictionary for Graphspell from lexicons"
    if sType == "main":
        spfLexSrc = dVars['lexicon_src']

Modified reader.py from [66f5eb17ae] to [e2706fc6a2].

1
2
3
4
5
6
7
8
9
10

11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29

30
31
32
33
34
35
36
1
2
3
4
5
6
7
8
9

10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28

29
30
31
32
33
34
35
36









-
+


















-
+







#!python3
# Just a file for one-shot scripts

import os
import sys
import re

import graphspell.ibdawg as ibdawg

oDict = ibdawg.IBDAWG("French.bdic")
oDict = ibdawg.IBDAWG("fr-allvars.json")


def readFile (spf):
    if os.path.isfile(spf):
        with open(spf, "r", encoding="utf-8") as hSrc:
            for sLine in hSrc:
                yield sLine
    else:
        print("# Error: file not found.")

# --------------------------------------------------------------------------------------------------

def listUnknownWords (spf):
    with open(spf+".res.txt", "w", encoding="utf-8") as hDst:
        for sLine in readFile(spfSrc):
            sLine = sLine.strip()
            if sLine:
                for sWord in sLine.split():
                    if not oDict.isValid(sWord): 
                    if not oDict.isValid(sWord):
                        hDst.write(sWord+"\n")

# --------------------------------------------------------------------------------------------------

def createLexStatFile (spf, dStat):
    dWord = {}
    for i, sLine in enumerate(readFile(spf)):