Index: graphspell/dawg.py
==================================================================
--- graphspell/dawg.py
+++ graphspell/dawg.py
@@ -307,11 +307,11 @@
             if not zPattern or zPattern.search(self.lArcVal[nMorphVal]):
                 yield sEntry + "\t" + self.lArcVal[nMorphVal]
 
     # BINARY CONVERSION
 
-    def createBinary (self, sPathFile, nCompressionMethod, bDebug=False):
+    def _calculateBinary (self, nCompressionMethod):
         print(" > Write DAWG as an indexable binary dictionary [method: %d]" % nCompressionMethod)
         if nCompressionMethod == 1:
             self.nBytesArc = ( (self.nArcVal.bit_length() + 2) // 8 ) + 1     # We add 2 bits. See DawgNode.convToBytes1()
             self.nBytesOffset = 0
             self._calcNumBytesNodeAddress()
@@ -331,14 +331,10 @@
             print(" # Error: unknown compression method")
         print(" Arc values (chars, affixes and tags): {} -> {} bytes".format( self.nArcVal, len("\t".join(self.lArcVal).encode("utf-8")) ))
         print(" Arc size: {} bytes, Address size: {} bytes -> {} * {} = {} bytes".format( self.nBytesArc, self.nBytesNodeAddress, \
                                                                                           self.nBytesArc+self.nBytesNodeAddress, self.nArc, \
                                                                                           (self.nBytesArc+self.nBytesNodeAddress)*self.nArc ))
-        self._writeBinary(sPathFile, nCompressionMethod)
-        self._writeAsJSObject(sPathFile, nCompressionMethod)
-        if bDebug:
-            self._writeNodes(sPathFile, nCompressionMethod)
 
     def _calcNumBytesNodeAddress (self):
         "how many bytes needed to store all nodes/arcs in the binary dictionary"
         self.nBytesNodeAddress = 1
         while ((self.nBytesArc + self.nBytesNodeAddress) * self.nArc) > (2 ** (self.nBytesNodeAddress * 8)):
@@ -386,13 +382,12 @@
                     nSize -= nDiff
                 if self.lSortedNodes[i].size != nSize:
                     self.lSortedNodes[i].size = nSize
                     bEnd = False
 
-    def _writeAsJSObject (self, spfDst, nCompressionMethod, bInJSModule=False, bBinaryDictAsHexString=True):
-        if not spfDst.endswith(".json"):
-            spfDst += "."+str(nCompressionMethod)+".json"
+    def getBinaryAsJSON (self, nCompressionMethod=1, bBinaryDictAsHexString=True):
+        self._calculateBinary(nCompressionMethod)
         byDic = b""
         if nCompressionMethod == 1:
             byDic = self.oRoot.convToBytes1(self.nBytesArc, self.nBytesNodeAddress)
             for oNode in self.lMinimizedNodes:
                 byDic += oNode.convToBytes1(self.nBytesArc, self.nBytesNodeAddress)
@@ -402,16 +397,11 @@
                 byDic += oNode.convToBytes2(self.nBytesArc, self.nBytesNodeAddress)
         elif nCompressionMethod == 3:
             byDic = self.oRoot.convToBytes3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset)
             for oNode in self.lSortedNodes:
                 byDic += oNode.convToBytes3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset)
-
-        with open(spfDst, "w", encoding="utf-8", newline="\n") as hDst:
-            if bInJSModule:
-                hDst.write('// JavaScript\n// Generated data (do not edit)\n\n"use strict";\n\nconst dictionary = ')
-            hDst.write(json.dumps({
-                "sHeader": "/pyfsa/",
+        return json.dumps({ "sHeader": "/pyfsa/",
             "sLangCode": self.sLangCode,
             "sLangName": self.sLangName,
             "sDicName": self.sDicName,
             "sFileName": self.sFileName,
             "sDate": self._getDate(),
@@ -427,19 +417,28 @@
             "lArcVal": self.lArcVal,
             "nCompressionMethod": nCompressionMethod,
             "nBytesArc": self.nBytesArc,
             "nBytesNodeAddress": self.nBytesNodeAddress,
             "nBytesOffset": self.nBytesOffset,
-            # JavaScript is a pile of shit, so Mozilla’s JS parser don’t like file bigger than 4 Mb!
+            # Mozilla’s JS parser doesn’t like files bigger than 4 MB!
             # So, if necessary, we use an hexadecimal string, that we will convert later in Firefox’s extension.
             # https://github.com/mozilla/addons-linter/issues/1361
             "sByDic": byDic.hex()  if bBinaryDictAsHexString  else [ e  for e in byDic ]
-        }, ensure_ascii=False))
+        }, ensure_ascii=False)
+
+
+    def writeAsJSObject (self, spfDst, nCompressionMethod, bInJSModule=False, bBinaryDictAsHexString=True):
+        if not spfDst.endswith(".json"):
+            spfDst += "."+str(nCompressionMethod)+".json"
+        with open(spfDst, "w", encoding="utf-8", newline="\n") as hDst:
+            if bInJSModule:
+                hDst.write('// JavaScript\n// Generated data (do not edit)\n\n"use strict";\n\nconst dictionary = ')
+            hDst.write( self.getBinaryAsJSON(nCompressionMethod, bBinaryDictAsHexString) )
             if bInJSModule:
                 hDst.write(";\n\nexports.dictionary = dictionary;\n")
 
-    def _writeBinary (self, sPathFile, nCompressionMethod):
+    def writeBinary (self, sPathFile, nCompressionMethod, bDebug=False):
         """
         Format of the binary indexable dictionary:
         Each section is separated with 4 bytes of \0
 
         - Section Header:
@@ -470,10 +469,11 @@
 
         - Section Word Graph (nodes / arcs)
             * A list of nodes which are a list of arcs with an address of the next node.
              See DawgNode.convToBytes() for details.
         """
+        self._calculateBinary(nCompressionMethod)
         if not sPathFile.endswith(".bdic"):
             sPathFile += "."+str(nCompressionMethod)+".bdic"
         with open(sPathFile, 'wb') as hDst:
             # header
             hDst.write("/pyfsa/{}/".format(nCompressionMethod).encode("utf-8"))
@@ -498,11 +498,12 @@
                     hDst.write(oNode.convToBytes2(self.nBytesArc, self.nBytesNodeAddress))
             elif nCompressionMethod == 3:
                 hDst.write(self.oRoot.convToBytes3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset))
                 for oNode in self.lSortedNodes:
                     hDst.write(oNode.convToBytes3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset))
-            hDst.close()
+        if bDebug:
+            self._writeNodes(sPathFile, nCompressionMethod)
 
     def _getDate (self):
         return time.strftime("%Y.%m.%d, %H:%M")
 
     def _writeNodes (self, sPathFile, nCompressionMethod):
@@ -521,23 +522,10 @@
             if nCompressionMethod == 3:
                 hDst.write(self.oRoot.getTxtRepr3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset, self.lArcVal)+"\n")
                 #hDst.write( ''.join( [ "%02X " %  z  for z in self.oRoot.convToBytes3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset) ] ).strip() )
                 for oNode in self.lSortedNodes:
                     hDst.write(oNode.getTxtRepr3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset, self.lArcVal)+"\n")
-            hDst.close()
-
-    def writeResults (self, sPathFile):
-        bFileExits = os.path.isfile("_lexicons.res.txt")
-        with open("_lexicons.res.txt", "a", encoding='utf-8', newline="\n") as hDst:
-            sFormat1 = "{:<12} {:>12} {:>5} {:>8} {:>8} {:>6} {:>8} {:>9} {:>9} {:>15} {:>12} {:>12}\n"
-            sFormat2 = "{:<12} {:>12,} {:>5,} {:>8,} {:>8} {:>6,} {:>8,} {:>9,} {:>9,} {:>15,} {:>12,} {:>12,}\n"
-            if not bFileExits:
-                hDst.write(sFormat1.format("Lexicon", "Entries", "Chars", "Affixes", "Stemming", "Tags", "Values", "Nodes", "Arcs", "Lexicon (Kb)", "Dict (Kb)", "LT Dict (Kb)"))
-            hDst.write(sFormat2.format(self.sLangName, self.nEntry, self.nChar, self.nAff, self.cStemming + "FX", self.nTag, self.nArcVal, \
-                                       self.nNode, self.nArc, os.path.getsize(self.sFileName), os.path.getsize(sPathFile), \
-                                       os.path.getsize("cfsa/dict/{}.dict".format(self.sLangName)) if os.path.isfile("cfsa/dict/{}.dict".format(self.sLangName)) else 0))
-            hDst.close()
 
 
 class DawgNode:
     NextId = 0

Index: lex_build.py
==================================================================
--- lex_build.py
+++ lex_build.py
@@ -12,11 +12,11 @@
 
 
 def build (spfSrc, sLangCode, sLangName, sfDict, bJSON=False, sDicName="", cStemmingMethod="S",
            nCompressMethod=1):
     "transform a text lexicon as a binary indexable dictionary"
     oDAWG = fsa.DAWG(spfSrc, cStemmingMethod, sLangCode, sLangName, sDicName)
     dir_util.mkpath("graphspell/_dictionaries")
     oDAWG.writeInfo("graphspell/_dictionaries/" + sfDict + ".info.txt")
-    oDAWG.createBinary("graphspell/_dictionaries/" + sfDict + ".bdic", int(nCompressMethod))
+    oDAWG.writeBinary("graphspell/_dictionaries/" + sfDict + ".bdic", int(nCompressMethod))
     if bJSON:
         dir_util.mkpath("graphspell-js/_dictionaries")
         oDic = IBDAWG(sfDict + ".bdic")
         oDic.writeAsJSObject("graphspell-js/_dictionaries/" + sfDict + ".json", bBinaryDictAsHexString=True)
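
Note on the refactoring (not part of the patch): createBinary() used to compute the arc/address sizes and then unconditionally write both the .bdic file and the JSON file. After this change the size computation lives in _calculateBinary(), which each public writer calls for itself, so a caller picks exactly the output it wants. Below is a minimal sketch of driving the new API, assuming graphspell.dawg is imported as fsa (as lex_build.py does); the lexicon file, language code/name and output paths are placeholders, not values from the patch.

    import graphspell.dawg as fsa

    # Build the DAWG from a text lexicon (placeholder arguments).
    oDAWG = fsa.DAWG("my_lexicon.txt", "S", "xx", "Some Language", "SomeDic")

    # Binary dictionary: writeBinary() now triggers _calculateBinary() itself
    # and takes over the bDebug flag that createBinary() used to own.
    oDAWG.writeBinary("my_dic.bdic", 1, bDebug=False)

    # JSON: getBinaryAsJSON() returns the serialized dictionary as a string...
    sJSON = oDAWG.getBinaryAsJSON(nCompressionMethod=1, bBinaryDictAsHexString=True)

    # ...and writeAsJSObject() is now a thin wrapper that writes that string
    # to disk, optionally wrapped as a JS module.
    oDAWG.writeAsJSObject("my_dic.json", 1, bInJSModule=False)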