Index: graphspell-js/dawg.js ================================================================== --- graphspell-js/dawg.js +++ graphspell-js/dawg.js @@ -327,13 +327,13 @@ } } } // BINARY CONVERSION - createBinary (nMethod) { - console.log("Write DAWG as an indexable binary dictionary [method: "+nMethod+"]"); - if (nMethod == 1) { + createBinary (nCompressionMethod) { + console.log("Write DAWG as an indexable binary dictionary [method: "+nCompressionMethod+"]"); + if (nCompressionMethod == 1) { this.nBytesArc = Math.floor( (this.nArcVal.toString(2).length + 2) / 8 ) + 1; // We add 2 bits. See DawgNode.convToBytes1() this.nBytesOffset = 0; this._calcNumBytesNodeAddress(); this._calcNodesAddress1(); } else { @@ -340,11 +340,11 @@ console.log("Error: unknown compression method"); } console.log("Arc values (chars, affixes and tags): " + this.nArcVal); console.log("Arc size: "+this.nBytesArc+" bytes, Address size: "+this.nBytesNodeAddress+" bytes"); console.log("-> " + this.nBytesArc+this.nBytesNodeAddress + " * " + this.nArc + " = " + (this.nBytesArc+this.nBytesNodeAddress)*this.nArc + " bytes"); - return this._createJSON(nMethod); + return this._createJSON(nCompressionMethod); } _calcNumBytesNodeAddress () { // how many bytes needed to store all nodes/arcs in the binary dictionary this.nBytesNodeAddress = 1; @@ -360,23 +360,23 @@ oNode.addr = iAddr; iAddr += Math.max(oNode.arcs.size, 1) * nBytesNode; } } - _createJSON (nMethod) { + _createJSON (nCompressionMethod) { let sByDic = ""; - if (nMethod == 1) { + if (nCompressionMethod == 1) { sByDic = this.oRoot.convToBytes1(this.nBytesArc, this.nBytesNodeAddress); for (let oNode of this.dMinimizedNodes.values()) { sByDic += oNode.convToBytes1(this.nBytesArc, this.nBytesNodeAddress); } } let oJSON = { "sName": this.sName, - "nVersion": nMethod, + "nCompressionMethod": nCompressionMethod, "sDate": this._getDate(), - "sHeader": this.sHeader + nMethod + "/", + "sHeader": this.sHeader + nCompressionMethod + "/", "lArcVal": this.lArcVal, "nArcVal": this.nArcVal, "byDic": sByDic, // binary word graph "sLang": this.sLang, "nChar": this.nChar, Index: graphspell-js/ibdawg.js ================================================================== --- graphspell-js/ibdawg.js +++ graphspell-js/ibdawg.js @@ -100,11 +100,11 @@ catch (e) { throw Error("# Error. File not found or not loadable.\n" + e.message + "\n"); } /* Properties: - sName, nVersion, sHeader, lArcVal, nArcVal, byDic, sLang, nChar, nBytesArc, nBytesNodeAddress, + sName, nCompressionMethod, sHeader, lArcVal, nArcVal, byDic, sLang, nChar, nBytesArc, nBytesNodeAddress, nEntries, nNode, nArc, nAff, cStemming, nTag, dChar, nBytesOffset, */ /* Bug workaround. @@ -120,12 +120,12 @@ /* end of bug workaround */ if (!this.sHeader.startsWith("/pyfsa/")) { throw TypeError("# Error. Not a pyfsa binary dictionary. Header: " + this.sHeader); } - if (!(this.nVersion == "1" || this.nVersion == "2" || this.nVersion == "3")) { - throw RangeError("# Error. Unknown dictionary version: " + this.nVersion); + if (!(this.nCompressionMethod == "1" || this.nCompressionMethod == "2" || this.nCompressionMethod == "3")) { + throw RangeError("# Error. Unknown dictionary compression method: " + this.nCompressionMethod); } // to get the value of an arc, to get the char of an arc with its value this.dChar = helpers.objectToMap(this.dChar); this.dCharVal = this.dChar.gl_reverse(); //this.byDic = new Uint8Array(this.byDic); // not quicker, even slower @@ -141,12 +141,12 @@ this._arcMask = (2 ** ((this.nBytesArc * 8) - 3)) - 1; this._finalNodeMask = 1 << ((this.nBytesArc * 8) - 1); this._lastArcMask = 1 << ((this.nBytesArc * 8) - 2); - // Configuring DAWG functions according to nVersion - switch (this.nVersion) { + // Configuring DAWG functions according to nCompressionMethod + switch (this.nCompressionMethod) { case 1: this.morph = this._morph1; this.stem = this._stem1; this._lookupArcNode = this._lookupArcNode1; this._getArcs = this._getArcs1; @@ -165,19 +165,19 @@ this._lookupArcNode = this._lookupArcNode3; this._getArcs = this._getArcs3; this._writeNodes = this._writeNodes3; break; default: - throw ValueError("# Error: unknown code: " + this.nVersion); + throw ValueError("# Error: unknown code: " + this.nCompressionMethod); } //console.log(this.getInfo()); this.bOptNumSigle = true; this.bOptNumAtLast = false; } getInfo () { - return ` Language: ${this.sLang} Version: ${this.nVersion} Date: ${this.sDate} Stemming: ${this.cStemming}FX\n` + + return ` Language: ${this.sLang} Version: ${this.nCompressionMethod} Date: ${this.sDate} Stemming: ${this.cStemming}FX\n` + ` Arcs values: ${this.nArcVal} = ${this.nChar} characters, ${this.nAff} affixes, ${this.nTag} tags\n` + ` Dictionary: ${this.nEntries} entries, ${this.nNode} nodes, ${this.nArc} arcs\n` + ` Address size: ${this.nBytesNodeAddress} bytes, Arc size: ${this.nBytesArc} bytes\n`; } Index: graphspell/dawg.py ================================================================== --- graphspell/dawg.py +++ graphspell/dawg.py @@ -306,23 +306,23 @@ if not zPattern or zPattern.search(self.lArcVal[nMorphVal]): yield sEntry + "\t" + self.lArcVal[nMorphVal] # BINARY CONVERSION - def createBinary (self, sPathFile, nMethod, bDebug=False): - print(" > Write DAWG as an indexable binary dictionary [method: %d]" % nMethod) - if nMethod == 1: + def createBinary (self, sPathFile, nCompressionMethod, bDebug=False): + print(" > Write DAWG as an indexable binary dictionary [method: %d]" % nCompressionMethod) + if nCompressionMethod == 1: self.nBytesArc = ( (self.nArcVal.bit_length() + 2) // 8 ) + 1 # We add 2 bits. See DawgNode.convToBytes1() self.nBytesOffset = 0 self._calcNumBytesNodeAddress() self._calcNodesAddress1() - elif nMethod == 2: + elif nCompressionMethod == 2: self.nBytesArc = ( (self.nArcVal.bit_length() + 3) // 8 ) + 1 # We add 3 bits. See DawgNode.convToBytes2() self.nBytesOffset = 0 self._calcNumBytesNodeAddress() self._calcNodesAddress2() - elif nMethod == 3: + elif nCompressionMethod == 3: self.nBytesArc = ( (self.nArcVal.bit_length() + 3) // 8 ) + 1 # We add 3 bits. See DawgNode.convToBytes3() self.nBytesOffset = 1 self.nMaxOffset = (2 ** (self.nBytesOffset * 8)) - 1 self._calcNumBytesNodeAddress() self._calcNodesAddress3() @@ -330,14 +330,14 @@ print(" # Error: unknown compression method") print(" Arc values (chars, affixes and tags): {} -> {} bytes".format( self.nArcVal, len("\t".join(self.lArcVal).encode("utf-8")) )) print(" Arc size: {} bytes, Address size: {} bytes -> {} * {} = {} bytes".format( self.nBytesArc, self.nBytesNodeAddress, \ self.nBytesArc+self.nBytesNodeAddress, self.nArc, \ (self.nBytesArc+self.nBytesNodeAddress)*self.nArc )) - self._writeBinary(sPathFile, nMethod) - self._writeAsJSObject(sPathFile, nMethod) + self._writeBinary(sPathFile, nCompressionMethod) + self._writeAsJSObject(sPathFile, nCompressionMethod) if bDebug: - self._writeNodes(sPathFile, nMethod) + self._writeNodes(sPathFile, nCompressionMethod) def _calcNumBytesNodeAddress (self): "how many bytes needed to store all nodes/arcs in the binary dictionary" self.nBytesNodeAddress = 1 while ((self.nBytesArc + self.nBytesNodeAddress) * self.nArc) > (2 ** (self.nBytesNodeAddress * 8)): @@ -385,35 +385,35 @@ nSize -= nDiff if self.lSortedNodes[i].size != nSize: self.lSortedNodes[i].size = nSize bEnd = False - def _writeAsJSObject (self, spfDst, nMethod, bInJSModule=False, bBinaryDictAsHexString=True): + def _writeAsJSObject (self, spfDst, nCompressionMethod, bInJSModule=False, bBinaryDictAsHexString=True): if not spfDst.endswith(".json"): - spfDst += "."+str(nMethod)+".json" + spfDst += "."+str(nCompressionMethod)+".json" byDic = b"" - if nMethod == 1: + if nCompressionMethod == 1: byDic = self.oRoot.convToBytes1(self.nBytesArc, self.nBytesNodeAddress) for oNode in self.lMinimizedNodes: byDic += oNode.convToBytes1(self.nBytesArc, self.nBytesNodeAddress) - elif nMethod == 2: + elif nCompressionMethod == 2: byDic = self.oRoot.convToBytes2(self.nBytesArc, self.nBytesNodeAddress) for oNode in self.lSortedNodes: byDic += oNode.convToBytes2(self.nBytesArc, self.nBytesNodeAddress) - elif nMethod == 3: + elif nCompressionMethod == 3: byDic = self.oRoot.convToBytes3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset) for oNode in self.lSortedNodes: byDic += oNode.convToBytes3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset) with open(spfDst, "w", encoding="utf-8", newline="\n") as hDst: if bInJSModule: hDst.write('// JavaScript\n// Generated data (do not edit)\n\n"use strict";\n\nconst dictionary = ') hDst.write(json.dumps({ "sName": "todo", - "nVersion": nMethod, + "nCompressionMethod": nCompressionMethod, "sDate": str(datetime.datetime.now())[:-7], - "sHeader": "/pyfsa/"+str(nMethod)+"/", + "sHeader": "/pyfsa/"+str(nCompressionMethod)+"/", "lArcVal": self.lArcVal, "nArcVal": self.nArcVal, # JavaScript is a pile of shit, so Mozilla’s JS parser don’t like file bigger than 4 Mb! # So, if necessary, we use an hexadecimal string, that we will convert later in Firefox’s extension. # https://github.com/mozilla/addons-linter/issues/1361 @@ -432,18 +432,18 @@ "nBytesOffset": self.nBytesOffset }, ensure_ascii=False)) if bInJSModule: hDst.write(";\n\nexports.dictionary = dictionary;\n") - def _writeBinary (self, sPathFile, nMethod): + def _writeBinary (self, sPathFile, nCompressionMethod): """ Format of the binary indexable dictionary: Each section is separated with 4 bytes of \0 - Section Header: - /pyfsa/[version] - * version is an ASCII string + /pyfsa/[compression method] + * compression method is an ASCII string - Section Informations: /[tag_lang] /[number of chars] /[number of bytes for each arc] @@ -464,51 +464,51 @@ - Section Word Graph (nodes / arcs) * A list of nodes which are a list of arcs with an address of the next node. See DawgNode.convToBytes() for details. """ if not sPathFile.endswith(".bdic"): - sPathFile += "."+str(nMethod)+".bdic" + sPathFile += "."+str(nCompressionMethod)+".bdic" with open(sPathFile, 'wb') as hDst: # header - hDst.write("/pyfsa/{}/".format(nMethod).encode("utf-8")) + hDst.write("/pyfsa/{}/".format(nCompressionMethod).encode("utf-8")) hDst.write(b"\0\0\0\0") # infos hDst.write("{}/{}/{}/{}/{}/{}/{}/{}/{}".format(self.sLang, self.nChar, self.nBytesArc, self.nBytesNodeAddress, \ self.nEntry, self.nNode, self.nArc, self.nAff, self.cStemming).encode("utf-8")) hDst.write(b"\0\0\0\0") # lArcVal hDst.write("\t".join(self.lArcVal).encode("utf-8")) hDst.write(b"\0\0\0\0") # DAWG: nodes / arcs - if nMethod == 1: + if nCompressionMethod == 1: hDst.write(self.oRoot.convToBytes1(self.nBytesArc, self.nBytesNodeAddress)) for oNode in self.lMinimizedNodes: hDst.write(oNode.convToBytes1(self.nBytesArc, self.nBytesNodeAddress)) - elif nMethod == 2: + elif nCompressionMethod == 2: hDst.write(self.oRoot.convToBytes2(self.nBytesArc, self.nBytesNodeAddress)) for oNode in self.lSortedNodes: hDst.write(oNode.convToBytes2(self.nBytesArc, self.nBytesNodeAddress)) - elif nMethod == 3: + elif nCompressionMethod == 3: hDst.write(self.oRoot.convToBytes3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset)) for oNode in self.lSortedNodes: hDst.write(oNode.convToBytes3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset)) hDst.close() - def _writeNodes (self, sPathFile, nMethod): + def _writeNodes (self, sPathFile, nCompressionMethod): "for debugging only" print(" > Write nodes") - with open(sPathFile+".nodes."+str(nMethod)+".txt", 'w', encoding='utf-8', newline="\n") as hDst: - if nMethod == 1: + with open(sPathFile+".nodes."+str(nCompressionMethod)+".txt", 'w', encoding='utf-8', newline="\n") as hDst: + if nCompressionMethod == 1: hDst.write(self.oRoot.getTxtRepr1(self.nBytesArc, self.nBytesNodeAddress, self.lArcVal)+"\n") #hDst.write( ''.join( [ "%02X " % z for z in self.oRoot.convToBytes1(self.nBytesArc, self.nBytesNodeAddress) ] ).strip() ) for oNode in self.lMinimizedNodes: hDst.write(oNode.getTxtRepr1(self.nBytesArc, self.nBytesNodeAddress, self.lArcVal)+"\n") - if nMethod == 2: + if nCompressionMethod == 2: hDst.write(self.oRoot.getTxtRepr2(self.nBytesArc, self.nBytesNodeAddress, self.lArcVal)+"\n") for oNode in self.lSortedNodes: hDst.write(oNode.getTxtRepr2(self.nBytesArc, self.nBytesNodeAddress, self.lArcVal)+"\n") - if nMethod == 3: + if nCompressionMethod == 3: hDst.write(self.oRoot.getTxtRepr3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset, self.lArcVal)+"\n") #hDst.write( ''.join( [ "%02X " % z for z in self.oRoot.convToBytes3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset) ] ).strip() ) for oNode in self.lSortedNodes: hDst.write(oNode.getTxtRepr3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset, self.lArcVal)+"\n") hDst.close() Index: graphspell/ibdawg.py ================================================================== --- graphspell/ibdawg.py +++ graphspell/ibdawg.py @@ -91,11 +91,11 @@ header, info, values, bdic = self.by.split(b"\0\0\0\0", 3) except Exception: raise Exception self.sName = sDicName - self.nVersion = int(self.by[7:8].decode("utf-8")) + self.nCompressionMethod = int(self.by[7:8].decode("utf-8")) self.sHeader = header.decode("utf-8") self.lArcVal = values.decode("utf-8").split("\t") self.nArcVal = len(self.lArcVal) self.byDic = bdic @@ -127,37 +127,37 @@ self._lastArcMask = 1 << ((self.nBytesArc * 8) - 2) self._addrBitMask = 1 << ((self.nBytesArc * 8) - 3) # version 2 self.nBytesOffset = 1 # version 3 - # Configuring DAWG functions according to nVersion - if self.nVersion == 1: + # Configuring DAWG functions according to nCompressionMethod + if self.nCompressionMethod == 1: self.morph = self._morph1 self.stem = self._stem1 self._lookupArcNode = self._lookupArcNode1 self._getArcs = self._getArcs1 self._writeNodes = self._writeNodes1 - elif self.nVersion == 2: + elif self.nCompressionMethod == 2: self.morph = self._morph2 self.stem = self._stem2 self._lookupArcNode = self._lookupArcNode2 self._getArcs = self._getArcs2 self._writeNodes = self._writeNodes2 - elif self.nVersion == 3: + elif self.nCompressionMethod == 3: self.morph = self._morph3 self.stem = self._stem3 self._lookupArcNode = self._lookupArcNode3 self._getArcs = self._getArcs3 self._writeNodes = self._writeNodes3 else: - raise ValueError(" # Error: unknown code: {}".format(self.nVersion)) + raise ValueError(" # Error: unknown code: {}".format(self.nCompressionMethod)) self.bOptNumSigle = False self.bOptNumAtLast = False def getInfo (self): - return " Language: {0.sLang:>10} Version: {0.nVersion:>2} Date: {0.sDate} Stemming: {0.cStemming}FX\n" \ + return " Language: {0.sLang:>10} Version: {0.nCompressionMethod:>2} Date: {0.sDate} Stemming: {0.cStemming}FX\n" \ " Arcs values: {0.nArcVal:>10,} = {0.nChar:>5,} characters, {0.nAff:>6,} affixes, {0.nTag:>6,} tags\n" \ " Dictionary: {0.nEntries:>12,} entries, {0.nNode:>11,} nodes, {0.nArc:>11,} arcs\n" \ " Address size: {0.nBytesNodeAddress:>1} bytes, Arc size: {0.nBytesArc:>1} bytes\n".format(self) def writeAsJSObject (self, spfDest, bInJSModule=False, bBinaryDictAsHexString=False): @@ -166,11 +166,11 @@ with open(spfDest, "w", encoding="utf-8", newline="\n") as hDst: if bInJSModule: hDst.write('// JavaScript\n// Generated data (do not edit)\n\n"use strict";\n\nconst dictionary = ') hDst.write(json.dumps({ "sName": self.sName, - "nVersion": self.nVersion, + "nCompressionMethod": self.nCompressionMethod, "sDate": str(datetime.datetime.now())[:-7], "sHeader": self.sHeader, "lArcVal": self.lArcVal, "nArcVal": self.nArcVal, # JavaScript is a pile of shit, so Mozilla’s JS parser don’t like file bigger than 4 Mb!