Grammalecte  Check-in [8333a8bf1b]

Overview
Comment:[graphspell] new header <grammalecte-fsa> for binary dictionaries
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | graphspell | multid
Files: files | file ages | folders
SHA3-256: 8333a8bf1bb8a85bc0b361a59d2fe3c81874c2aef94c4879717f49044168f011
User & Date: olr on 2018-04-01 08:16:11
Other Links: branch diff | manifest | tags
Context
2018-04-01
09:31
[fr][bug] conj: test if tTags exists check-in: a89587a82c user: olr tags: fr, multid
08:16
[graphspell] new header <grammalecte-fsa> for binary dictionaries check-in: 8333a8bf1b user: olr tags: graphspell, multid
08:01
[fx] main panel: dictionaries check-in: 24a9f4dab6 user: olr tags: fx, multid
Changes

Modified graphspell-js/dawg.js from [e2cd530970] to [3711bc314d].

372
373
374
375
376
377
378
379

380
381
382
383
384
385
386
372
373
374
375
376
377
378

379
380
381
382
383
384
385
386







-
+







        if (nCompressionMethod == 1) {
            sByDic = this.oRoot.convToBytes1(this.nBytesArc, this.nBytesNodeAddress);
            for (let oNode of this.dMinimizedNodes.values()) {
                sByDic += oNode.convToBytes1(this.nBytesArc, this.nBytesNodeAddress);
            }
        }
        let oJSON = {
            "sHeader": "/pyfsa/",
            "sHeader": "/grammalecte-fsa/",
            "sLangCode": this.sLangCode,
            "sLangName": this.sLangName,
            "sDicName": this.sDicName,
            "sFileName": "[none]",
            "sDate": this._getDate(),
            "nEntry": this.nEntry,
            "nChar": this.nChar,

Modified graphspell-js/ibdawg.js from [050f6e0036] to [1b3dff6227].

115
116
117
118
119
120
121
122
123


124
125
126
127
128
129
130
115
116
117
118
119
120
121


122
123
124
125
126
127
128
129
130







-
-
+
+







        for (let i = 0;  i < this.sByDic.length;  i+=2) {
            lTemp.push(parseInt(this.sByDic.slice(i, i+2), 16));
        }
        this.byDic = lTemp;
        //this.byDic = new Uint8Array(lTemp);  // not quicker, even slower
        /* end of bug workaround */

        if (!this.sHeader.startsWith("/pyfsa/")) {
            throw TypeError("# Error. Not a pyfsa binary dictionary. Header: " + this.sHeader);
        if (!this.sHeader.startsWith("/grammalecte-fsa/")) {
            throw TypeError("# Error. Not a grammalecte-fsa binary dictionary. Header: " + this.sHeader);
        }
        if (!(this.nCompressionMethod == 1 || this.nCompressionMethod == 2 || this.nCompressionMethod == 3)) {
            throw RangeError("# Error. Unknown dictionary compression method: " + this.nCompressionMethod);
        }
        // <dChar> to get the value of an arc, <dCharVal> to get the char of an arc with its value
        this.dChar = helpers.objectToMap(this.dChar);
        this.dCharVal = this.dChar.gl_reverse();
179
180
181
182
183
184
185
186

187
188
189
190
191
192
193
179
180
181
182
183
184
185

186
187
188
189
190
191
192
193







-
+







                `  Arcs values:  ${this.nArcVal} = ${this.nChar} characters,  ${this.nAff} affixes,  ${this.nTag} tags\n` +
                `  Dictionary: ${this.nEntry} entries,    ${this.nNode} nodes,   ${this.nArc} arcs\n` +
                `  Address size: ${this.nBytesNodeAddress} bytes,  Arc size: ${this.nBytesArc} bytes\n`;
    }

    getJSON () {
        let oJSON = {
            "sHeader": "/pyfsa/",
            "sHeader": "/grammalecte-fsa/",
            "sLangCode": this.sLangCode,
            "sLangName": this.sLangName,
            "sDicName": this.sDicName,
            "sFileName": this.sFileName,
            "sDate": this.sDate,
            "nEntry": this.nEntry,
            "nChar": this.nChar,

Modified graphspell/dawg.py from [7cd1b4dc56] to [63684196d2].

396
397
398
399
400
401
402
403

404
405
406
407
408
409
410
396
397
398
399
400
401
402

403
404
405
406
407
408
409
410







-
+







            for oNode in self.lSortedNodes:
                byDic += oNode.convToBytes2(self.nBytesArc, self.nBytesNodeAddress)
        elif nCompressionMethod == 3:
            byDic = self.oRoot.convToBytes3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset)
            for oNode in self.lSortedNodes:
                byDic += oNode.convToBytes3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset)
        return {
            "sHeader": "/pyfsa/",
            "sHeader": "/grammalecte-fsa/",
            "sLangCode": self.sLangCode,
            "sLangName": self.sLangName,
            "sDicName": self.sDicName,
            "sFileName": self.sFileName,
            "sDate": self._getDate(),
            "nEntry": self.nEntry,
            "nChar": self.nChar,
438
439
440
441
442
443
444
445

446
447
448
449
450
451
452
438
439
440
441
442
443
444

445
446
447
448
449
450
451
452







-
+








    def writeBinary (self, sPathFile, nCompressionMethod, bDebug=False):
        """
        Format of the binary indexable dictionary:
        Each section is separated with 4 bytes of \0
        
        - Section Header:
            /pyfsa/[compression method]
            /grammalecte-fsa/[compression method]
                * compression method is an ASCII string
        
        - Section Informations:
            /[lang code]
            /[lang name]
            /[dictionary name]
            /[date creation]
472
473
474
475
476
477
478
479

480
481
482
483
484
485
486
472
473
474
475
476
477
478

479
480
481
482
483
484
485
486







-
+







                  See DawgNode.convToBytes() for details.
        """
        self._calculateBinary(nCompressionMethod)
        if not sPathFile.endswith(".bdic"):
            sPathFile += "."+str(nCompressionMethod)+".bdic"
        with open(sPathFile, 'wb') as hDst:
            # header
            hDst.write("/pyfsa/{}/".format(nCompressionMethod).encode("utf-8"))
            hDst.write("/grammalecte-fsa/{}/".format(nCompressionMethod).encode("utf-8"))
            hDst.write(b"\0\0\0\0")
            # infos
            sInfo = "{}//{}//{}//{}//{}//{}//{}//{}//{}//{}//{}//{}//".format(self.sLangCode, self.sLangName, self.sDicName, self._getDate(), \
                                                                              self.nChar, self.nBytesArc, self.nBytesNodeAddress, \
                                                                              self.nEntry, self.nNode, self.nArc, self.nAff, self.cStemming)
            hDst.write(sInfo.encode("utf-8"))
            hDst.write(b"\0\0\0\0")

Modified graphspell/ibdawg.py from [b794aeec36] to [f523996e8f].

132
133
134
135
136
137
138
139
140
141
142




143
144
145
146
147
148

149
150
151
152
153
154
155
132
133
134
135
136
137
138




139
140
141
142
143
144
145
146
147

148
149
150
151
152
153
154
155







-
-
-
-
+
+
+
+





-
+







            raise ValueError("  # Error: unknown code: {}".format(self.nCompressionMethod))

        self.bOptNumSigle = False
        self.bOptNumAtLast = False

    def _initBinary (self):
        "initialize with binary structure file"
        if self.by[0:7] != b"/pyfsa/":
            raise TypeError("# Error. Not a pyfsa binary dictionary. Header: {}".format(self.by[0:9]))
        if not(self.by[7:8] == b"1" or self.by[7:8] == b"2" or self.by[7:8] == b"3"):
            raise ValueError("# Error. Unknown dictionary version: {}".format(self.by[7:8]))
        if self.by[0:17] != b"/grammalecte-fsa/":
            raise TypeError("# Error. Not a grammalecte-fsa binary dictionary. Header: {}".format(self.by[0:9]))
        if not(self.by[17:18] == b"1" or self.by[17:18] == b"2" or self.by[17:18] == b"3"):
            raise ValueError("# Error. Unknown dictionary version: {}".format(self.by[17:18]))
        try:
            header, info, values, bdic = self.by.split(b"\0\0\0\0", 3)
        except Exception:
            raise Exception
        
        self.nCompressionMethod = int(self.by[7:8].decode("utf-8"))
        self.nCompressionMethod = int(self.by[17:18].decode("utf-8"))
        self.sHeader = header.decode("utf-8")
        self.lArcVal = values.decode("utf-8").split("\t")
        self.nArcVal = len(self.lArcVal)
        self.byDic = bdic

        l = info.decode("utf-8").split("//")
        self.sLangCode = l.pop(0)
187
188
189
190
191
192
193
194

195
196
197
198
199
200
201
187
188
189
190
191
192
193

194
195
196
197
198
199
200
201







-
+








    def writeAsJSObject (self, spfDest, bInJSModule=False, bBinaryDictAsHexString=False):
        "write IBDAWG as a JavaScript object in a JavaScript module"
        with open(spfDest, "w", encoding="utf-8", newline="\n") as hDst:
            if bInJSModule:
                hDst.write('// JavaScript\n// Generated data (do not edit)\n\n"use strict";\n\nconst dictionary = ')
            hDst.write(json.dumps({
                            "sHeader": "/pyfsa/",
                            "sHeader": "/grammalecte-fsa/",
                            "sLangCode": self.sLangCode,
                            "sLangName": self.sLangName,
                            "sDicName": self.sDicName,
                            "sFileName": self.sFileName,
                            "sDate": self.sDate,
                            "nEntry": self.nEntry,
                            "nChar": self.nChar,