Index: graphspell-js/dawg.js ================================================================== --- graphspell-js/dawg.js +++ graphspell-js/dawg.js @@ -342,20 +342,23 @@ } } } // BINARY CONVERSION - createBinaryJSON (nCompressionMethod=1) { + _calculateBinary (nCompressionMethod=1) { console.log("Write DAWG as an indexable binary dictionary"); this.nBytesArc = Math.floor( (this.nArcVal.toString(2).length + 2) / 8 ) + 1; // We add 2 bits. See DawgNode.convToBytes() this.nBytesOffset = 0; this._calcNumBytesNodeAddress(); this._calcNodesAddress(); + this.sByDic = this.oRoot.convToBytes(this.nBytesArc, this.nBytesNodeAddress); + for (let oNode of this.dMinimizedNodes.values()) { + this.sByDic += oNode.convToBytes(this.nBytesArc, this.nBytesNodeAddress); + } console.log("Arc values (chars, affixes and tags): " + this.nArcVal); console.log("Arc size: "+this.nBytesArc+" bytes, Address size: "+this.nBytesNodeAddress+" bytes"); console.log("-> " + this.nBytesArc+this.nBytesNodeAddress + " * " + this.nArc + " = " + (this.nBytesArc+this.nBytesNodeAddress)*this.nArc + " bytes"); - return this._createJSON(nCompressionMethod); } _calcNumBytesNodeAddress () { // how many bytes needed to store all nodes/arcs in the binary dictionary this.nBytesNodeAddress = 1; @@ -371,15 +374,44 @@ oNode.addr = iAddr; iAddr += Math.max(oNode.arcs.size, 1) * nBytesNode; } } - _createJSON (nCompressionMethod=1) { - let sByDic = this.oRoot.convToBytes(this.nBytesArc, this.nBytesNodeAddress); - for (let oNode of this.dMinimizedNodes.values()) { - sByDic += oNode.convToBytes(this.nBytesArc, this.nBytesNodeAddress); + _binaryToList () { + this.lByDic = []; + let nAcc = 0; + let lBytesBuffer = []; + let nDivisor = (this.nBytesArc + this.nBytesNodeAddress) / 2; + for (let i = 0; i < this.sByDic.length; i+=2) { + lBytesBuffer.push(parseInt(this.sByDic.slice(i, i+2), 16)); + if (nAcc == (this.nBytesArc - 1)) { + this.lByDic.push(this._convBytesToInteger(lBytesBuffer)); + lBytesBuffer = []; + } + else if 
(nAcc == (this.nBytesArc + this.nBytesNodeAddress - 1)) { + this.lByDic.push(Math.round(this._convBytesToInteger(lBytesBuffer) / nDivisor)); // Math.round should be useless, BUT with JS who knowns what can happen… + lBytesBuffer = []; + nAcc = -1; + } + nAcc = nAcc + 1; + } + } + + _convBytesToInteger (aBytes) { + // Byte order = Big Endian (bigger first) + let nVal = 0; + let nWeight = (aBytes.length - 1) * 8; + for (let n of aBytes) { + nVal += n << nWeight; + nWeight = nWeight - 8; } + return nVal; + } + + createBinaryJSON (nCompressionMethod=1) { + this._calculateBinary(nCompressionMethod); + this._binaryToList(); let oJSON = { "sHeader": "/grammalecte-fsa/", "sLangCode": this.sLangCode, "sLangName": this.sLangName, "sDicName": this.sDicName, @@ -398,11 +430,12 @@ "nArcVal": this.nArcVal, "nCompressionMethod": nCompressionMethod, "nBytesArc": this.nBytesArc, "nBytesNodeAddress": this.nBytesNodeAddress, "nBytesOffset": this.nBytesOffset, - "sByDic": sByDic, // binary word graph + //"sByDic": this.sByDic, // binary word graph + "lByDic": this.lByDic, "l2grams": Array.from(this.a2grams) }; return oJSON; } Index: graphspell-js/ibdawg.js ================================================================== --- graphspell-js/ibdawg.js +++ graphspell-js/ibdawg.js @@ -152,60 +152,53 @@ */ if (!(this.sHeader.startsWith("/grammalecte-fsa/") || this.sHeader.startsWith("/pyfsa/"))) { throw TypeError("# Error. Not a grammalecte-fsa binary dictionary. Header: " + this.sHeader); } - if (!(this.nCompressionMethod == 1 || this.nCompressionMethod == 2 || this.nCompressionMethod == 3)) { - throw RangeError("# Error. Unknown dictionary compression method: " + this.nCompressionMethod); - } // to get the value of an arc, to get the char of an arc with its value this.dChar = helpers.objectToMap(this.dChar); this.dCharVal = this.dChar.gl_reverse(); this.a2grams = (this.l2grams) ? 
new Set(this.l2grams) : null; + if (!this.hasOwnProperty("lByDic")) { + // old dictionary version + if (!this.hasOwnProperty("sByDic")) { + throw TypeError("# Error. No usable data in the dictionary."); + } + this.lByDic = []; + let nAcc = 0; + let lBytesBuffer = []; + let nDivisor = (this.nBytesArc + this.nBytesNodeAddress) / 2; + for (let i = 0; i < this.sByDic.length; i+=2) { + lBytesBuffer.push(parseInt(this.sByDic.slice(i, i+2), 16)); + if (nAcc == (this.nBytesArc - 1)) { + this.lByDic.push(this._convBytesToInteger(lBytesBuffer)); + lBytesBuffer = []; + } + else if (nAcc == (this.nBytesArc + this.nBytesNodeAddress - 1)) { + this.lByDic.push(Math.round(this._convBytesToInteger(lBytesBuffer) / nDivisor)); // Math.round should be useless, BUT with JS who knowns what can happen… + lBytesBuffer = []; + nAcc = -1; + } + nAcc = nAcc + 1; + } + } + + // masks + this._arcMask = (2 ** ((this.nBytesArc * 8) - 3)) - 1; + this._finalNodeMask = 1 << ((this.nBytesArc * 8) - 1); + this._lastArcMask = 1 << ((this.nBytesArc * 8) - 2); + + // function to decode the affix/suffix code if (this.cStemming == "S") { this.funcStemming = str_transform.changeWordWithSuffixCode; } else if (this.cStemming == "A") { this.funcStemming = str_transform.changeWordWithAffixCode; } else { this.funcStemming = str_transform.noStemming; } - /* - Bug workaround. - Mozilla’s JS parser sucks. Can’t read file bigger than 4 Mb! 
- So we convert huge hexadecimal string to list of numbers… - https://github.com/mozilla/addons-linter/issues/1361 - */ - /* - Performance trick: - Instead of converting bytes to integers each times we parse the binary dictionary, - we do it once, then parse the array - */ - this.lByDic = []; - let nAcc = 0; - let lBytesBuffer = []; - let nDivisor = (this.nBytesArc + this.nBytesNodeAddress) / 2; - for (let i = 0; i < this.sByDic.length; i+=2) { - lBytesBuffer.push(parseInt(this.sByDic.slice(i, i+2), 16)); - if (nAcc == (this.nBytesArc - 1)) { - this.lByDic.push(this._convBytesToInteger(lBytesBuffer)); - lBytesBuffer = []; - } - else if (nAcc == (this.nBytesArc + this.nBytesNodeAddress - 1)) { - this.lByDic.push(Math.round(this._convBytesToInteger(lBytesBuffer) / nDivisor)); // Math.round should be useless, BUT with JS who knowns what can happen… - lBytesBuffer = []; - nAcc = -1; - } - nAcc = nAcc + 1; - } - /* end of bug workaround */ - - this._arcMask = (2 ** ((this.nBytesArc * 8) - 3)) - 1; - this._finalNodeMask = 1 << ((this.nBytesArc * 8) - 1); - this._lastArcMask = 1 << ((this.nBytesArc * 8) - 2); - //console.log(this.getInfo()); this.bAcronymValid = true; this.bNumAtLastValid = false; // lexicographer module ? Index: graphspell/dawg.py ================================================================== --- graphspell/dawg.py +++ graphspell/dawg.py @@ -362,10 +362,14 @@ print(" > Write DAWG as an indexable binary dictionary") self.nBytesArc = ( (self.nArcVal.bit_length() + 2) // 8 ) + 1 # We add 2 bits. 
See DawgNode.convToBytes() self.nBytesOffset = 0 self._calcNumBytesNodeAddress() self._calcNodesAddress() + self.byDic = b"" + self.byDic = self.oRoot.convToBytes(self.nBytesArc, self.nBytesNodeAddress) + for oNode in self.lMinimizedNodes: + self.byDic += oNode.convToBytes(self.nBytesArc, self.nBytesNodeAddress) print(" Arc values (chars, affixes and tags): {} -> {} bytes".format( self.nArcVal, len("\t".join(self.lArcVal).encode("utf-8")) )) print(" Arc size: {} bytes, Address size: {} bytes -> {} * {} = {} bytes".format( self.nBytesArc, self.nBytesNodeAddress, \ self.nBytesArc+self.nBytesNodeAddress, self.nArc, \ (self.nBytesArc+self.nBytesNodeAddress)*self.nArc )) @@ -380,17 +384,30 @@ iAddr = len(self.oRoot.arcs) * nBytesNode for oNode in self.lMinimizedNodes: oNode.addr = iAddr iAddr += max(len(oNode.arcs), 1) * nBytesNode - def getBinaryAsJSON (self, nCompressionMethod=1, bBinaryDictAsHexString=True): + def _binaryToList (self): + self.lByDic = [] + nAcc = 0 + byBuffer = b"" + nDivisor = (self.nBytesArc + self.nBytesNodeAddress) / 2 + for i in range(0, len(self.byDic)): + byBuffer += self.byDic[i:i+1] + if nAcc == (self.nBytesArc - 1): + self.lByDic.append(int.from_bytes(byBuffer, byteorder="big")) + byBuffer = b"" + elif nAcc == (self.nBytesArc + self.nBytesNodeAddress - 1): + self.lByDic.append(round(int.from_bytes(byBuffer, byteorder="big") / nDivisor)) + byBuffer = b"" + nAcc = -1 + nAcc = nAcc + 1 + + def getBinaryAsJSON (self, nCompressionMethod=1): "return a JSON string containing all necessary data of the dictionary (compressed as a binary string)" self._calculateBinary(nCompressionMethod) - byDic = b"" - byDic = self.oRoot.convToBytes(self.nBytesArc, self.nBytesNodeAddress) - for oNode in self.lMinimizedNodes: - byDic += oNode.convToBytes(self.nBytesArc, self.nBytesNodeAddress) + self._binaryToList() return { "sHeader": "/grammalecte-fsa/", "sLangCode": self.sLangCode, "sLangName": self.sLangName, "sDicName": self.sDicName, @@ -412,22 +429,23 @@ 
"nBytesNodeAddress": self.nBytesNodeAddress, "nBytesOffset": self.nBytesOffset, # Mozilla’s JS parser don’t like file bigger than 4 Mb! # So, if necessary, we use an hexadecimal string, that we will convert later in Firefox’s extension. # https://github.com/mozilla/addons-linter/issues/1361 - "sByDic": byDic.hex() if bBinaryDictAsHexString else [ e for e in byDic ], + #"sByDic": self.byDic.hex(), + "lByDic": self.lByDic, "l2grams": list(self.a2grams) } - def writeAsJSObject (self, spfDst, nCompressionMethod=1, bInJSModule=False, bBinaryDictAsHexString=True): + def writeAsJSObject (self, spfDst, nCompressionMethod=1, bInJSModule=False): "write a file (JSON or JS module) with all the necessary data" if not spfDst.endswith(".json"): spfDst += "."+str(nCompressionMethod)+".json" with open(spfDst, "w", encoding="utf-8", newline="\n") as hDst: if bInJSModule: hDst.write('// JavaScript\n// Generated data (do not edit)\n\n"use strict";\n\nconst dictionary = ') - hDst.write( json.dumps(self.getBinaryAsJSON(nCompressionMethod, bBinaryDictAsHexString), ensure_ascii=False) ) + hDst.write( json.dumps(self.getBinaryAsJSON(nCompressionMethod), ensure_ascii=False) ) if bInJSModule: hDst.write(";\n\nexports.dictionary = dictionary;\n") def _getDate (self): return time.strftime("%Y-%m-%d %H:%M:%S") @@ -454,11 +472,10 @@ DawgNode.NextId += 1 self.final = False self.arcs = {} # key: arc value; value: a node self.addr = 0 # address in the binary dictionary self.pos = 0 # position in the binary dictionary (version 2) - self.size = 0 # size of node in bytes (version 3) @classmethod def resetNextId (cls): "set NextId to 0 " cls.NextId = 0 Index: graphspell/ibdawg.py ================================================================== --- graphspell/ibdawg.py +++ graphspell/ibdawg.py @@ -121,33 +121,34 @@ oData = json.loads(by.decode("utf-8")) #json.loads(by) # In Python 3.6, can read directly binary strings else: self.sFileName = "[None]" oData = source - self.sByDic = "" # init to prevent 
pylint whining self.__dict__.update(oData) - self.byDic = binascii.unhexlify(self.sByDic) self.dCharVal = { v: k for k, v in self.dChar.items() } self.a2grams = set(getattr(self, 'l2grams')) if hasattr(self, 'l2grams') else None - # Performance trick: - # Instead of converting bytes to integers each times we parse the binary dictionary, - # we do it once, then parse the array - nAcc = 0 - byBuffer = b"" - self.lByDic = [] - nDivisor = (self.nBytesArc + self.nBytesNodeAddress) / 2 - for i in range(0, len(self.byDic)): - byBuffer += self.byDic[i:i+1] - if nAcc == (self.nBytesArc - 1): - self.lByDic.append(int.from_bytes(byBuffer, byteorder="big")) - byBuffer = b"" - elif nAcc == (self.nBytesArc + self.nBytesNodeAddress - 1): - self.lByDic.append(round(int.from_bytes(byBuffer, byteorder="big") / nDivisor)) - byBuffer = b"" - nAcc = -1 - nAcc = nAcc + 1 + if "lByDic" not in oData: + print(">>>> lByDic not in oData") + if "sByDic" not in oData: + raise TypeError("# Error. No usable data in the dictionary.") + # old dictionary version + self.lByDic = [] + self.byDic = binascii.unhexlify(oData["sByDic"]) + nAcc = 0 + byBuffer = b"" + nDivisor = (self.nBytesArc + self.nBytesNodeAddress) / 2 + for i in range(0, len(self.byDic)): + byBuffer += self.byDic[i:i+1] + if nAcc == (self.nBytesArc - 1): + self.lByDic.append(int.from_bytes(byBuffer, byteorder="big")) + byBuffer = b"" + elif nAcc == (self.nBytesArc + self.nBytesNodeAddress - 1): + self.lByDic.append(round(int.from_bytes(byBuffer, byteorder="big") / nDivisor)) + byBuffer = b"" + nAcc = -1 + nAcc = nAcc + 1 # masks self._arcMask = (2 ** ((self.nBytesArc * 8) - 3)) - 1 self._finalNodeMask = 1 << ((self.nBytesArc * 8) - 1) self._lastArcMask = 1 << ((self.nBytesArc * 8) - 2)