Grammalecte  Check-in [5538934848]

Overview
Comment:[graphspell][fx] dawg: remove useless parameters
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | fx | graphspell | dict2
Files: files | file ages | folders
SHA3-256: 5538934848f82156ff524ccf9654299bbfe1ee5ea832b765b1d0529c28123028
User & Date: olr on 2020-11-05 16:25:58
Other Links: branch diff | manifest | tags
Context
2020-11-07
11:40
[graphspell][build][lo][fx] merge dict2: use binary list instead of binary string, drop support for binary file -> use JSON, code cleaning check-in: 40ebc5eada user: olr tags: trunk, build, major_change, fx, lo, graphspell
2020-11-05
16:25
[graphspell][fx] dawg: remove useless parameters Closed-Leaf check-in: 5538934848 user: olr tags: fx, graphspell, dict2
13:27
[graphspell][js] fix syntax error check-in: 6d54aadbb1 user: olr tags: graphspell, dict2
Changes

Modified gc_lang/fr/webext/panel/lex_editor.js from [a95656b530] to [ffa654539b].

757
758
759
760
761
762
763
764

765
766
767
768
769
770
771
757
758
759
760
761
762
763

764
765
766
767
768
769
770
771







-
+







    },

    build: function () {
        let xProgressNode = document.getElementById("wait_progress");
        let lEntry = oLexiconTable.getEntries();
        if (lEntry.length > 0) {
            let oDAWG = new DAWG(lEntry, "S", "fr", "Français", this.sName, this.sDescription, xProgressNode);
            let oJSON = oDAWG.createBinaryJSON(1);
            let oJSON = oDAWG.createBinaryJSON();
            oDictHandler.saveDictionary(this.sName, oJSON);
            this.oIBDAWG = new IBDAWG(oJSON);
            this.setDictData(this.oIBDAWG.nEntry, this.oIBDAWG.sDate);
        } else {
            oDictHandler.saveDictionary(this.sName, null);
            this.setDictData(0, "[néant]");
        }

Modified graphspell-js/dawg.js from [4cbdb7b217] to [82fa7fdc68].

340
341
342
343
344
345
346
347

348
349
350
351
352
353
354
340
341
342
343
344
345
346

347
348
349
350
351
352
353
354







-
+







                    }
                }
            }
        }
    }

    // BINARY CONVERSION
    _calculateBinary (nCompressionMethod=1) {
    _calculateBinary () {
        console.log("Write DAWG as an indexable binary dictionary");
        this.nBytesArc = Math.floor( (this.nArcVal.toString(2).length + 2) / 8 ) + 1;     // We add 2 bits. See DawgNode.convToBytes()
        this.nBytesOffset = 0;
        this._calcNumBytesNodeAddress();
        this._calcNodesAddress();
        this.sByDic = this.oRoot.convToBytes(this.nBytesArc, this.nBytesNodeAddress);
        for (let oNode of this.dMinimizedNodes.values()) {
403
404
405
406
407
408
409
410
411


412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
403
404
405
406
407
408
409


410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430

431
432
433
434
435
436
437







-
-
+
+



















-







        for (let n of aBytes) {
            nVal += n << nWeight;
            nWeight = nWeight - 8;
        }
        return nVal;
    }

    createBinaryJSON (nCompressionMethod=1) {
        this._calculateBinary(nCompressionMethod);
    createBinaryJSON () {
        this._calculateBinary();
        this._binaryToList();
        let oJSON = {
            "sHeader": "/grammalecte-fsa/",
            "sLangCode": this.sLangCode,
            "sLangName": this.sLangName,
            "sDicName": this.sDicName,
            "sDescription": this.sDescription,
            "sFileName": "[none]",
            "sDate": this._getDate(),
            "nEntry": this.nEntry,
            "nChar": this.nChar,
            "nAff": this.nAff,
            "nTag": this.nTag,
            "cStemming": this.cStemming,
            "dChar": helpers.mapToObject(this.dChar),
            "nNode": this.nNode,
            "nArc": this.nArc,
            "lArcVal": this.lArcVal,
            "nArcVal": this.nArcVal,
            "nCompressionMethod": nCompressionMethod,
            "nBytesArc": this.nBytesArc,
            "nBytesNodeAddress": this.nBytesNodeAddress,
            "nBytesOffset": this.nBytesOffset,
            //"sByDic": this.sByDic,    // binary word graph
            "lByDic": this.lByDic,
            "l2grams": Array.from(this.a2grams)
        };

Modified graphspell-js/dic_merger.js from [dea1fd0b02] to [10f02569ea].

34
35
36
37
38
39
40
41

42
43
44
45
46
47
48
34
35
36
37
38
39
40

41
42
43
44
45
46
47
48







-
+







            }
        }
        if (xProgressBar) {
            xProgressBar.value = xProgressBar.max;
        }
        try {
            let oDAWG = new DAWG(lEntry, cStemming, sLangCode, sLangName, sDicName, sDescription, xProgressBar);
            let oDict = oDAWG.createBinaryJSON(1);
            let oDict = oDAWG.createBinaryJSON();
            return oDict;
        }
        catch (e) {
            console.log("Dictionaries merger: unable to generate merged dictionary");
            console.error(e);
            return null;
        }

Modified graphspell-js/ibdawg.js from [8c83fe3dde] to [73dd04c644].

143
144
145
146
147
148
149
150

151
152
153
154
155
156
157
143
144
145
146
147
148
149

150
151
152
153
154
155
156
157







-
+







            console.error(e);
            console.log("path: " + sPath);
            console.log("dic:" + source.slice(0, 1000));
            throw Error("# Error. File not found or not loadable.\n" + e.message + "\n");
        }
        /*
            Properties:
            sName, nCompressionMethod, sHeader, lArcVal, nArcVal, sByDic, sLang, nChar, nBytesArc, nBytesNodeAddress,
            sName, sHeader, lArcVal, nArcVal, sByDic, sLang, nChar, nBytesArc, nBytesNodeAddress,
            nEntry, nNode, nArc, nAff, cStemming, nTag, dChar, nBytesOffset,
        */

        if (!(this.sHeader.startsWith("/grammalecte-fsa/") || this.sHeader.startsWith("/pyfsa/"))) {
            throw TypeError("# Error. Not a grammalecte-fsa binary dictionary. Header: " + this.sHeader);
        }
        // <dChar> to get the value of an arc, <dCharVal> to get the char of an arc with its value
207
208
209
210
211
212
213
214

215
216
217
218
219
220
221
207
208
209
210
211
212
213

214
215
216
217
218
219
220
221







-
+







        if (self && self.hasOwnProperty("lexgraph_"+this.sLangCode)) { // self is the Worker
            this.lexicographer = self["lexgraph_"+this.sLangCode];
        }
    }

    getInfo () {
        return  `  Language: ${this.sLangName}   Lang code: ${this.sLangCode}   Dictionary name: ${this.sDicName}\n` +
                `  Compression method: ${this.nCompressionMethod}   Date: ${this.sDate}   Stemming: ${this.cStemming}FX\n` +
                `  Date: ${this.sDate}   Stemming: ${this.cStemming}FX\n` +
                `  Arcs values:  ${this.nArcVal} = ${this.nChar} characters,  ${this.nAff} affixes,  ${this.nTag} tags\n` +
                `  Dictionary: ${this.nEntry} entries,    ${this.nNode} nodes,   ${this.nArc} arcs\n` +
                `  Address size: ${this.nBytesNodeAddress} bytes,  Arc size: ${this.nBytesArc} bytes\n`;
    }

    getJSON () {
        let oJSON = {
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
232
233
234
235
236
237
238

239
240
241
242
243
244
245







-







            "nTag": this.nTag,
            "cStemming": this.cStemming,
            "dChar": helpers.mapToObject(this.dChar),
            "nNode": this.nNode,
            "nArc": this.nArc,
            "lArcVal": this.lArcVal,
            "nArcVal": this.nArcVal,
            "nCompressionMethod": this.nCompressionMethod,
            "nBytesArc": this.nBytesArc,
            "nBytesNodeAddress": this.nBytesNodeAddress,
            "nBytesOffset": this.nBytesOffset,
            "sByDic": this.sByDic,  // binary word graph
            "l2grams": this.l2grams
        };
        return oJSON;

Modified graphspell/dawg.py from [781b24100a] to [729715ac89].

354
355
356
357
358
359
360
361

362
363
364
365
366
367
368
354
355
356
357
358
359
360

361
362
363
364
365
366
367
368







-
+







                sEntry = sWord + "\t" + self.funcStemming(sWord, self.lArcVal[nVal])
                for nMorphVal, _ in oNextNode.arcs.items():
                    if not zPattern or zPattern.search(self.lArcVal[nMorphVal]):
                        yield sEntry + "\t" + self.lArcVal[nMorphVal]


    # BINARY CONVERSION
    def _calculateBinary (self, nCompressionMethod=1):
    def _calculateBinary (self):
        print(" > Write DAWG as an indexable binary dictionary")
        self.nBytesArc = ( (self.nArcVal.bit_length() + 2) // 8 ) + 1   # We add 2 bits. See DawgNode.convToBytes()
        self.nBytesOffset = 0
        self._calcNumBytesNodeAddress()
        self._calcNodesAddress()
        self.byDic = b""
        self.byDic = self.oRoot.convToBytes(self.nBytesArc, self.nBytesNodeAddress)
383
384
385
386
387
388
389







390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405

406
407

408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439

440
441
442

443
444
445
446

447
448
449
450
451
452
453

454
455
456

457
458
459
460
461
462
463
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411

412
413

414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433

434
435
436
437
438
439
440
441
442
443
444

445
446
447

448
449



450


451
452
453
454

455
456
457

458
459
460
461
462
463
464
465







+
+
+
+
+
+
+















-
+

-
+



















-











-
+


-
+

-
-
-
+
-
-




-
+


-
+







        nBytesNode = self.nBytesArc + self.nBytesNodeAddress
        iAddr = len(self.oRoot.arcs) * nBytesNode
        for oNode in self.lMinimizedNodes:
            oNode.addr = iAddr
            iAddr += max(len(oNode.arcs), 1) * nBytesNode

    def _binaryToList (self):
        """
        Convert binary string to binary list
        BEFORE: Arc                 Address                                 Arc                 Address
                ||||||||| ||||||||| ||||||||| ||||||||| ||||||||| ||||||||| ||||||||| ||||||||| ||||||||| ||||||||| ||||||||| ||||||||| ...

        AFTER:  list of integers: [ arc, address, arc, address, arc, address, ... arc, address ]
        """
        self.lByDic = []
        nAcc = 0
        byBuffer = b""
        nDivisor = (self.nBytesArc + self.nBytesNodeAddress) / 2
        for i in range(0, len(self.byDic)):
            byBuffer += self.byDic[i:i+1]
            if nAcc == (self.nBytesArc - 1):
                self.lByDic.append(int.from_bytes(byBuffer, byteorder="big"))
                byBuffer = b""
            elif nAcc == (self.nBytesArc + self.nBytesNodeAddress - 1):
                self.lByDic.append(round(int.from_bytes(byBuffer, byteorder="big") / nDivisor))
                byBuffer = b""
                nAcc = -1
            nAcc = nAcc + 1

    def getBinaryAsJSON (self, nCompressionMethod=1):
    def getBinaryAsJSON (self):
        "return a JSON string containing all necessary data of the dictionary (compressed as a binary string)"
        self._calculateBinary(nCompressionMethod)
        self._calculateBinary()
        self._binaryToList()
        return {
            "sHeader": "/grammalecte-fsa/",
            "sLangCode": self.sLangCode,
            "sLangName": self.sLangName,
            "sDicName": self.sDicName,
            "sDescription": self.sDescription,
            "sFileName": self.sFileName,
            "sDate": self._getDate(),
            "nEntry": self.nEntry,
            "nChar": self.nChar,
            "nAff": self.nAff,
            "nTag": self.nTag,
            "cStemming": self.cStemming,
            "dChar": self.dChar,
            "nNode": self.nNode,
            "nArc": self.nArc,
            "nArcVal": self.nArcVal,
            "lArcVal": self.lArcVal,
            "nCompressionMethod": nCompressionMethod,
            "nBytesArc": self.nBytesArc,
            "nBytesNodeAddress": self.nBytesNodeAddress,
            "nBytesOffset": self.nBytesOffset,
            # Mozilla’s JS parser doesn’t like files bigger than 4 MB!
            # So, if necessary, we use a hexadecimal string, which we will convert later in Firefox’s extension.
            # https://github.com/mozilla/addons-linter/issues/1361
            #"sByDic": self.byDic.hex(),
            "lByDic": self.lByDic,
            "l2grams": list(self.a2grams)
        }

    def writeAsJSObject (self, spfDst, nCompressionMethod=1, bInJSModule=False):
    def writeAsJSObject (self, spfDst):
        "write a file (JSON or JS module) with all the necessary data"
        if not spfDst.endswith(".json"):
            spfDst += "."+str(nCompressionMethod)+".json"
            spfDst += ".json"
        with open(spfDst, "w", encoding="utf-8", newline="\n") as hDst:
            if bInJSModule:
                hDst.write('// JavaScript\n// Generated data (do not edit)\n\n"use strict";\n\nconst dictionary = ')
            hDst.write( json.dumps(self.getBinaryAsJSON(nCompressionMethod), ensure_ascii=False) )
            hDst.write( json.dumps(self.getBinaryAsJSON(), ensure_ascii=False) )
            if bInJSModule:
                hDst.write(";\n\nexports.dictionary = dictionary;\n")

    def _getDate (self):
        return time.strftime("%Y-%m-%d %H:%M:%S")

    def _writeNodes (self, sPathFile, nCompressionMethod=1):
    def _writeNodes (self, sPathFile):
        "for debugging only"
        print(" > Write nodes")
        with open(sPathFile+".nodes."+str(nCompressionMethod)+".txt", 'w', encoding='utf-8', newline="\n") as hDst:
        with open(sPathFile+".nodes.txt", 'w', encoding='utf-8', newline="\n") as hDst:
            hDst.write(self.oRoot.getTxtRepr(self.nBytesArc, self.lArcVal)+"\n")
            #hDst.write( ''.join( [ "%02X " %  z  for z in self.oRoot.convToBytes(self.nBytesArc, self.nBytesNodeAddress) ] ).strip() )
            for oNode in self.lMinimizedNodes:
                hDst.write(oNode.getTxtRepr(self.nBytesArc, self.lArcVal)+"\n")



Modified graphspell/ibdawg.py from [953707753a] to [15700e29f3].

170
171
172
173
174
175
176
177

178
179
180
181
182
183
184
170
171
172
173
174
175
176

177
178
179
180
181
182
183
184







-
+







            self.lexicographer = importlib.import_module(".lexgraph_"+self.sLangCode, "grammalecte.graphspell")
        except ImportError:
            print("# No module <graphspell.lexgraph_"+self.sLangCode+".py>")

    def getInfo (self):
        "return string about the IBDAWG"
        return  "  Language: {0.sLangName}   Lang code: {0.sLangCode}   Dictionary name: {0.sDicName}" \
                "  Compression method: {0.nCompressionMethod:>2}   Date: {0.sDate}   Stemming: {0.cStemming}FX\n" \
                "  Date: {0.sDate}   Stemming: {0.cStemming}FX\n" \
                "  Arcs values:  {0.nArcVal:>10,} = {0.nChar:>5,} characters,  {0.nAff:>6,} affixes,  {0.nTag:>6,} tags\n" \
                "  Dictionary: {0.nEntry:>12,} entries,    {0.nNode:>11,} nodes,   {0.nArc:>11,} arcs\n" \
                "  Address size: {0.nBytesNodeAddress:>1} bytes,  Arc size: {0.nBytesArc:>1} bytes\n".format(self)

    def isValidToken (self, sToken):
        "checks if <sToken> is valid (if there is hyphens in <sToken>, <sToken> is split, each part is checked)"
        sToken = st.spellingNormalization(sToken)