Overview
Comment: | [graphspell] dawg: consistency |
---|---|
Downloads: | Tarball | ZIP archive | SQL archive |
Timelines: | family | ancestors | descendants | both | trunk | graphspell |
Files: | files | file ages | folders |
SHA3-256: | 665bb08741ffd67e3f608d4ed35e6235 |
User & Date: | olr on 2018-02-10 20:01:43 |
Other Links: | manifest | tags |
Context
2018-02-10
| ||
21:32 | [fr] tests: <comment vous Vinf> / <comment vous 2p> check-in: b1ca5b3203 user: olr tags: trunk, fr | |
20:01 | [graphspell] dawg: consistency check-in: 665bb08741 user: olr tags: trunk, graphspell | |
13:53 | [graphspell] add date to JSON dictionary check-in: 0a4c113f2c user: olr tags: trunk, graphspell | |
Changes
Modified graphspell-js/dawg.js from [f93ec574ae] to [68590304c9].
︙ | ︙ | |||
329 330 331 332 333 334 335 | } // BINARY CONVERSION createBinary (nMethod) { console.log("Write DAWG as an indexable binary dictionary [method: "+nMethod+"]"); if (nMethod == 1) { this.nBytesArc = Math.floor( (this.nArcVal.toString(2).length + 2) / 8 ) + 1; // We add 2 bits. See DawgNode.convToBytes1() | > | | | 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 | } // BINARY CONVERSION createBinary (nMethod) { console.log("Write DAWG as an indexable binary dictionary [method: "+nMethod+"]"); if (nMethod == 1) { this.nBytesArc = Math.floor( (this.nArcVal.toString(2).length + 2) / 8 ) + 1; // We add 2 bits. See DawgNode.convToBytes1() this.nBytesOffset = 0; this._calcNumBytesNodeAddress(); this._calcNodesAddress1(); } else { console.log("Error: unknown compression method"); } console.log("Arc values (chars, affixes and tags): " + this.nArcVal); console.log("Arc size: "+this.nBytesArc+" bytes, Address size: "+this.nBytesNodeAddress+" bytes"); console.log("-> " + this.nBytesArc+this.nBytesNodeAddress + " * " + this.nArc + " = " + (this.nBytesArc+this.nBytesNodeAddress)*this.nArc + " bytes"); return this._createJSON(nMethod); |
︙ | ︙ | |||
414 415 416 417 418 419 420 | "nEntries": this.nEntry, "nNode": this.nNode, "nArc": this.nArc, "nAff": this.nAff, "cStemming": this.cStemming, "nTag": this.nTag, "dChar": helpers.mapToObject(this.dChar), | | | 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 | "nEntries": this.nEntry, "nNode": this.nNode, "nArc": this.nArc, "nAff": this.nAff, "cStemming": this.cStemming, "nTag": this.nTag, "dChar": helpers.mapToObject(this.dChar), "nBytesOffset": this.nBytesOffset }; return oJSON; }, _getDate () { let oDate = new Date(); let sMonth = (oDate.getMonth() + 1).toString().padStart(2, "0"); // Month+1: Because JS always sucks somehow. |
︙ | ︙ |
Modified graphspell-js/ibdawg.js from [f5e06268c1] to [7af6f8dfaf].
︙ | ︙ | |||
99 100 101 102 103 104 105 | } catch (e) { throw Error("# Error. File not found or not loadable.\n" + e.message + "\n"); } /* Properties: sName, nVersion, sHeader, lArcVal, nArcVal, byDic, sLang, nChar, nBytesArc, nBytesNodeAddress, | | | 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 | } catch (e) { throw Error("# Error. File not found or not loadable.\n" + e.message + "\n"); } /* Properties: sName, nVersion, sHeader, lArcVal, nArcVal, byDic, sLang, nChar, nBytesArc, nBytesNodeAddress, nEntries, nNode, nArc, nAff, cStemming, nTag, dChar, nBytesOffset, */ /* Bug workaround. Mozilla’s JS parser sucks. Can’t read file bigger than 4 Mb! So we convert huge hexadecimal string to list of numbers… https://github.com/mozilla/addons-linter/issues/1361 |
︙ | ︙ |
Modified graphspell/dawg.py from [1156fb2ccd] to [9c1c8b10c8].
︙ | ︙ | |||
8 9 10 11 12 13 14 15 16 17 18 19 20 21 | # This tool encodes lexicon into an indexable binary dictionary # Input files MUST be encoded in UTF-8. import sys import os import collections from . import str_transform as st from .progressbar import ProgressBar def readFile (spf): | > > | 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 | # This tool encodes lexicon into an indexable binary dictionary # Input files MUST be encoded in UTF-8. import sys import os import collections import json import datetime from . import str_transform as st from .progressbar import ProgressBar def readFile (spf): |
︙ | ︙ | |||
306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 | # BINARY CONVERSION def createBinary (self, sPathFile, nMethod, bDebug=False): print(" > Write DAWG as an indexable binary dictionary [method: %d]" % nMethod) if nMethod == 1: self.nBytesArc = ( (self.nArcVal.bit_length() + 2) // 8 ) + 1 # We add 2 bits. See DawgNode.convToBytes1() self._calcNumBytesNodeAddress() self._calcNodesAddress1() elif nMethod == 2: self.nBytesArc = ( (self.nArcVal.bit_length() + 3) // 8 ) + 1 # We add 3 bits. See DawgNode.convToBytes2() self._calcNumBytesNodeAddress() self._calcNodesAddress2() elif nMethod == 3: self.nBytesArc = ( (self.nArcVal.bit_length() + 3) // 8 ) + 1 # We add 3 bits. See DawgNode.convToBytes3() self.nBytesOffset = 1 self.nMaxOffset = (2 ** (self.nBytesOffset * 8)) - 1 self._calcNumBytesNodeAddress() self._calcNodesAddress3() else: print(" # Error: unknown compression method") print(" Arc values (chars, affixes and tags): {} -> {} bytes".format( self.nArcVal, len("\t".join(self.lArcVal).encode("utf-8")) )) print(" Arc size: {} bytes, Address size: {} bytes -> {} * {} = {} bytes".format( self.nBytesArc, self.nBytesNodeAddress, \ self.nBytesArc+self.nBytesNodeAddress, self.nArc, \ (self.nBytesArc+self.nBytesNodeAddress)*self.nArc )) self._writeBinary(sPathFile, nMethod) if bDebug: self._writeNodes(sPathFile, nMethod) def _calcNumBytesNodeAddress (self): "how many bytes needed to store all nodes/arcs in the binary dictionary" self.nBytesNodeAddress = 1 while ((self.nBytesArc + self.nBytesNodeAddress) * self.nArc) > (2 ** (self.nBytesNodeAddress * 8)): | > > > | 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 | # BINARY CONVERSION def createBinary (self, sPathFile, nMethod, bDebug=False): print(" > Write DAWG as an indexable binary dictionary [method: %d]" % nMethod) if nMethod == 1: 
self.nBytesArc = ( (self.nArcVal.bit_length() + 2) // 8 ) + 1 # We add 2 bits. See DawgNode.convToBytes1() self.nBytesOffset = 0 self._calcNumBytesNodeAddress() self._calcNodesAddress1() elif nMethod == 2: self.nBytesArc = ( (self.nArcVal.bit_length() + 3) // 8 ) + 1 # We add 3 bits. See DawgNode.convToBytes2() self.nBytesOffset = 0 self._calcNumBytesNodeAddress() self._calcNodesAddress2() elif nMethod == 3: self.nBytesArc = ( (self.nArcVal.bit_length() + 3) // 8 ) + 1 # We add 3 bits. See DawgNode.convToBytes3() self.nBytesOffset = 1 self.nMaxOffset = (2 ** (self.nBytesOffset * 8)) - 1 self._calcNumBytesNodeAddress() self._calcNodesAddress3() else: print(" # Error: unknown compression method") print(" Arc values (chars, affixes and tags): {} -> {} bytes".format( self.nArcVal, len("\t".join(self.lArcVal).encode("utf-8")) )) print(" Arc size: {} bytes, Address size: {} bytes -> {} * {} = {} bytes".format( self.nBytesArc, self.nBytesNodeAddress, \ self.nBytesArc+self.nBytesNodeAddress, self.nArc, \ (self.nBytesArc+self.nBytesNodeAddress)*self.nArc )) self._writeBinary(sPathFile, nMethod) self._writeAsJSObject(sPathFile, nMethod) if bDebug: self._writeNodes(sPathFile, nMethod) def _calcNumBytesNodeAddress (self): "how many bytes needed to store all nodes/arcs in the binary dictionary" self.nBytesNodeAddress = 1 while ((self.nBytesArc + self.nBytesNodeAddress) * self.nArc) > (2 ** (self.nBytesNodeAddress * 8)): |
︙ | ︙ | |||
377 378 379 380 381 382 383 384 385 386 387 388 389 390 | nSize = max(len(self.lSortedNodes[i].arcs), 1) * nBytesNode for oNextNode in self.lSortedNodes[i].arcs.values(): if 1 < (oNextNode.addr - self.lSortedNodes[i].addr) < self.nMaxOffset: nSize -= nDiff if self.lSortedNodes[i].size != nSize: self.lSortedNodes[i].size = nSize bEnd = False def _writeBinary (self, sPathFile, nMethod): """ Format of the binary indexable dictionary: Each section is separated with 4 bytes of \0 - Section Header: | > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > | 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 | nSize = max(len(self.lSortedNodes[i].arcs), 1) * nBytesNode for oNextNode in self.lSortedNodes[i].arcs.values(): if 1 < (oNextNode.addr - self.lSortedNodes[i].addr) < self.nMaxOffset: nSize -= nDiff if self.lSortedNodes[i].size != nSize: self.lSortedNodes[i].size = nSize bEnd = False def _writeAsJSObject (self, spfDst, nMethod, bInJSModule=False, bBinaryDictAsHexString=True): if not spfDst.endswith(".json"): spfDst += "."+str(nMethod)+".json" byDic = b"" if nMethod == 1: byDic = self.oRoot.convToBytes1(self.nBytesArc, self.nBytesNodeAddress) for oNode in self.lMinimizedNodes: byDic += oNode.convToBytes1(self.nBytesArc, self.nBytesNodeAddress) elif nMethod == 2: byDic = self.oRoot.convToBytes2(self.nBytesArc, self.nBytesNodeAddress) for oNode in self.lSortedNodes: byDic += oNode.convToBytes2(self.nBytesArc, self.nBytesNodeAddress) elif nMethod == 3: byDic = self.oRoot.convToBytes3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset) for oNode in self.lSortedNodes: byDic += oNode.convToBytes3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset) with open(spfDst, "w", encoding="utf-8", newline="\n") as hDst: if bInJSModule: 
hDst.write('// JavaScript\n// Generated data (do not edit)\n\n"use strict";\n\nconst dictionary = ') hDst.write(json.dumps({ "sName": "todo", "nVersion": nMethod, "sDate": str(datetime.datetime.now())[:-7], "sHeader": "/pyfsa/"+str(nMethod)+"/", "lArcVal": self.lArcVal, "nArcVal": self.nArcVal, # JavaScript is a pile of shit, so Mozilla’s JS parser don’t like file bigger than 4 Mb! # So, if necessary, we use an hexadecimal string, that we will convert later in Firefox’s extension. # https://github.com/mozilla/addons-linter/issues/1361 "byDic": byDic.hex() if bBinaryDictAsHexString else [ e for e in byDic ], "sLang": self.sLang, "nChar": self.nChar, "nBytesArc": self.nBytesArc, "nBytesNodeAddress": self.nBytesNodeAddress, "nEntries": self.nEntry, "nNode": self.nNode, "nArc": self.nArc, "nAff": self.nAff, "cStemming": self.cStemming, "nTag": self.nTag, "dChar": self.dChar, "nBytesOffset": self.nBytesOffset }, ensure_ascii=False)) if bInJSModule: hDst.write(";\n\nexports.dictionary = dictionary;\n") def _writeBinary (self, sPathFile, nMethod): """ Format of the binary indexable dictionary: Each section is separated with 4 bytes of \0 - Section Header: |
︙ | ︙ |
Modified graphspell/ibdawg.py from [cf92ef19a3] to [c090c02520].
︙ | ︙ | |||
151 152 153 154 155 156 157 | else: raise ValueError(" # Error: unknown code: {}".format(self.nVersion)) self.bOptNumSigle = False self.bOptNumAtLast = False def getInfo (self): | | | 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 | else: raise ValueError(" # Error: unknown code: {}".format(self.nVersion)) self.bOptNumSigle = False self.bOptNumAtLast = False def getInfo (self): return " Language: {0.sLang:>10} Version: {0.nVersion:>2} Date: {0.sDate} Stemming: {0.cStemming}FX\n" \ " Arcs values: {0.nArcVal:>10,} = {0.nChar:>5,} characters, {0.nAff:>6,} affixes, {0.nTag:>6,} tags\n" \ " Dictionary: {0.nEntries:>12,} entries, {0.nNode:>11,} nodes, {0.nArc:>11,} arcs\n" \ " Address size: {0.nBytesNodeAddress:>1} bytes, Arc size: {0.nBytesArc:>1} bytes\n".format(self) def writeAsJSObject (self, spfDest, bInJSModule=False, bBinaryDictAsHexString=False): "write IBDAWG as a JavaScript object in a JavaScript module" import json |
︙ | ︙ |