Index: graphspell-js/dawg.js ================================================================== --- graphspell-js/dawg.js +++ graphspell-js/dawg.js @@ -331,12 +331,13 @@ // BINARY CONVERSION createBinary (nMethod) { console.log("Write DAWG as an indexable binary dictionary [method: "+nMethod+"]"); if (nMethod == 1) { this.nBytesArc = Math.floor( (this.nArcVal.toString(2).length + 2) / 8 ) + 1; // We add 2 bits. See DawgNode.convToBytes1() - this._calcNumBytesNodeAddress() - this._calcNodesAddress1() + this.nBytesOffset = 0; + this._calcNumBytesNodeAddress(); + this._calcNodesAddress1(); } else { console.log("Error: unknown compression method"); } console.log("Arc values (chars, affixes and tags): " + this.nArcVal); console.log("Arc size: "+this.nBytesArc+" bytes, Address size: "+this.nBytesNodeAddress+" bytes"); @@ -416,11 +417,11 @@ "nArc": this.nArc, "nAff": this.nAff, "cStemming": this.cStemming, "nTag": this.nTag, "dChar": helpers.mapToObject(this.dChar), - "nBytesOffset": 1 + "nBytesOffset": this.nBytesOffset }; return oJSON; }, _getDate () { Index: graphspell-js/ibdawg.js ================================================================== --- graphspell-js/ibdawg.js +++ graphspell-js/ibdawg.js @@ -101,11 +101,11 @@ throw Error("# Error. File not found or not loadable.\n" + e.message + "\n"); } /* Properties: sName, nVersion, sHeader, lArcVal, nArcVal, byDic, sLang, nChar, nBytesArc, nBytesNodeAddress, - nEntries, nNode, nArc, nAff, cStemming, nTag, dChar, _arcMask, _finalNodeMask, _lastArcMask, _addrBitMask, nBytesOffset, + nEntries, nNode, nArc, nAff, cStemming, nTag, dChar, nBytesOffset, */ /* Bug workaround. Mozilla’s JS parser sucks. Can’t read file bigger than 4 Mb! Index: graphspell/dawg.py ================================================================== --- graphspell/dawg.py +++ graphspell/dawg.py @@ -10,10 +10,12 @@ import sys import os import collections +import json +import datetime from . import str_transform as st from .progressbar import ProgressBar @@ -308,14 +310,16 @@ # BINARY CONVERSION def createBinary (self, sPathFile, nMethod, bDebug=False): print(" > Write DAWG as an indexable binary dictionary [method: %d]" % nMethod) if nMethod == 1: self.nBytesArc = ( (self.nArcVal.bit_length() + 2) // 8 ) + 1 # We add 2 bits. See DawgNode.convToBytes1() + self.nBytesOffset = 0 self._calcNumBytesNodeAddress() self._calcNodesAddress1() elif nMethod == 2: self.nBytesArc = ( (self.nArcVal.bit_length() + 3) // 8 ) + 1 # We add 3 bits. See DawgNode.convToBytes2() + self.nBytesOffset = 0 self._calcNumBytesNodeAddress() self._calcNodesAddress2() elif nMethod == 3: self.nBytesArc = ( (self.nArcVal.bit_length() + 3) // 8 ) + 1 # We add 3 bits. See DawgNode.convToBytes3() self.nBytesOffset = 1 @@ -327,10 +331,11 @@ print(" Arc values (chars, affixes and tags): {} -> {} bytes".format( self.nArcVal, len("\t".join(self.lArcVal).encode("utf-8")) )) print(" Arc size: {} bytes, Address size: {} bytes -> {} * {} = {} bytes".format( self.nBytesArc, self.nBytesNodeAddress, \ self.nBytesArc+self.nBytesNodeAddress, self.nArc, \ (self.nBytesArc+self.nBytesNodeAddress)*self.nArc )) self._writeBinary(sPathFile, nMethod) + self._writeAsJSObject(sPathFile, nMethod) if bDebug: self._writeNodes(sPathFile, nMethod) def _calcNumBytesNodeAddress (self): "how many bytes needed to store all nodes/arcs in the binary dictionary" @@ -379,10 +384,57 @@ if 1 < (oNextNode.addr - self.lSortedNodes[i].addr) < self.nMaxOffset: nSize -= nDiff if self.lSortedNodes[i].size != nSize: self.lSortedNodes[i].size = nSize bEnd = False + + def _writeAsJSObject (self, spfDst, nMethod, bInJSModule=False, bBinaryDictAsHexString=True): + if not spfDst.endswith(".json"): + spfDst += "."+str(nMethod)+".json" + byDic = b"" + if nMethod == 1: + byDic = self.oRoot.convToBytes1(self.nBytesArc, self.nBytesNodeAddress) + for oNode in self.lMinimizedNodes: + byDic += oNode.convToBytes1(self.nBytesArc, self.nBytesNodeAddress) + elif nMethod == 2: + byDic = self.oRoot.convToBytes2(self.nBytesArc, self.nBytesNodeAddress) + for oNode in self.lSortedNodes: + byDic += oNode.convToBytes2(self.nBytesArc, self.nBytesNodeAddress) + elif nMethod == 3: + byDic = self.oRoot.convToBytes3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset) + for oNode in self.lSortedNodes: + byDic += oNode.convToBytes3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset) + + with open(spfDst, "w", encoding="utf-8", newline="\n") as hDst: + if bInJSModule: + hDst.write('// JavaScript\n// Generated data (do not edit)\n\n"use strict";\n\nconst dictionary = ') + hDst.write(json.dumps({ + "sName": "todo", + "nVersion": nMethod, + "sDate": str(datetime.datetime.now())[:-7], + "sHeader": "/pyfsa/"+str(nMethod)+"/", + "lArcVal": self.lArcVal, + "nArcVal": self.nArcVal, + # JavaScript is a pile of shit, so Mozilla’s JS parser don’t like file bigger than 4 Mb! + # So, if necessary, we use an hexadecimal string, that we will convert later in Firefox’s extension. + # https://github.com/mozilla/addons-linter/issues/1361 + "byDic": byDic.hex() if bBinaryDictAsHexString else [ e for e in byDic ], + "sLang": self.sLang, + "nChar": self.nChar, + "nBytesArc": self.nBytesArc, + "nBytesNodeAddress": self.nBytesNodeAddress, + "nEntries": self.nEntry, + "nNode": self.nNode, + "nArc": self.nArc, + "nAff": self.nAff, + "cStemming": self.cStemming, + "nTag": self.nTag, + "dChar": self.dChar, + "nBytesOffset": self.nBytesOffset + }, ensure_ascii=False)) + if bInJSModule: + hDst.write(";\n\nexports.dictionary = dictionary;\n") def _writeBinary (self, sPathFile, nMethod): """ Format of the binary indexable dictionary: Each section is separated with 4 bytes of \0 Index: graphspell/ibdawg.py ================================================================== --- graphspell/ibdawg.py +++ graphspell/ibdawg.py @@ -153,11 +153,11 @@ self.bOptNumSigle = False self.bOptNumAtLast = False def getInfo (self): - return " Language: {0.sLang:>10} Version: {0.nVersion:>2} Stemming: {0.cStemming}FX\n" \ + return " Language: {0.sLang:>10} Version: {0.nVersion:>2} Date: {0.sDate} Stemming: {0.cStemming}FX\n" \ " Arcs values: {0.nArcVal:>10,} = {0.nChar:>5,} characters, {0.nAff:>6,} affixes, {0.nTag:>6,} tags\n" \ " Dictionary: {0.nEntries:>12,} entries, {0.nNode:>11,} nodes, {0.nArc:>11,} arcs\n" \ " Address size: {0.nBytesNodeAddress:>1} bytes, Arc size: {0.nBytesArc:>1} bytes\n".format(self) def writeAsJSObject (self, spfDest, bInJSModule=False, bBinaryDictAsHexString=False):