Index: graphspell-js/dawg.js ================================================================== --- graphspell-js/dawg.js +++ graphspell-js/dawg.js @@ -26,11 +26,11 @@ A word is a list of numbers [ c1, c2, c3 . . . cN, iAffix, iTags] Each arc is an index in this.lArcVal, where are stored characters, suffix/affix codes for stemming and tags. Important: As usual, the last node (after ‘iTags’) is tagged final, AND the node after ‘cN’ is ALSO tagged final. */ - constructor (lEntrySrc, sLang, cStemming, xProgressBarNode=null) { + constructor (lEntrySrc, sLangCode, sLangName, sDicName, cStemming, xProgressBarNode=null) { console.log("===== Direct Acyclic Word Graph - Minimal Acyclic Finite State Automaton ====="); let funcStemmingGen = null; switch (cStemming.toUpperCase()) { case "A": funcStemmingGen = str_transform.defineAffixCode; break; @@ -102,12 +102,13 @@ for (let sAff of dAff.keys()) { lKeyVal.push([dAff.get(sAff)+nChar, dAffOccur.get(sAff)]); } for (let sTag of dTag.keys()) { lKeyVal.push([dTag.get(sTag)+nChar+nAff, dTagOccur.get(sTag)]); } let dValOccur = new Map(lKeyVal); lKeyVal.length = 0; // clear the array - this.sHeader = "/pyfsa/"; - this.sLang = sLang; + this.sLangCode = sLangCode; + this.sLangName = sLangName; + this.sDicName = sDicName; this.nEntry = lWord.length; this.aPreviousEntry = []; oNodeCounter.reset(); this.oRoot = new DawgNode(); this.lUncheckedNodes = []; // list of nodes that have not been checked for duplication. @@ -369,29 +370,31 @@ for (let oNode of this.dMinimizedNodes.values()) { sByDic += oNode.convToBytes1(this.nBytesArc, this.nBytesNodeAddress); } } let oJSON = { - "sName": this.sName, - "nCompressionMethod": nCompressionMethod, + "sHeader": "/pyfsa/", + "sLangCode": this.sLangCode, + "sLangName": this.sLangName, + "sDicName": this.sDicName, + "sFileName": "[none]", "sDate": this._getDate(), - "sHeader": this.sHeader + nCompressionMethod + "/", - "lArcVal": this.lArcVal, - "nArcVal": this.nArcVal, - "byDic": sByDic, // binary word graph - "sLang": this.sLang, + "nEntries": this.nEntry, "nChar": this.nChar, - "nBytesArc": this.nBytesArc, - "nBytesNodeAddress": this.nBytesNodeAddress, - "nEntries": this.nEntry, + "nAff": this.nAff, + "nTag": this.nTag, + "cStemming": this.cStemming, + "dChar": helpers.mapToObject(this.dChar), "nNode": this.nNode, "nArc": this.nArc, - "nAff": this.nAff, - "cStemming": this.cStemming, - "nTag": this.nTag, - "dChar": helpers.mapToObject(this.dChar), - "nBytesOffset": this.nBytesOffset + "lArcVal": this.lArcVal, + "nArcVal": this.nArcVal, + "nCompressionMethod": nCompressionMethod, + "nBytesArc": this.nBytesArc, + "nBytesNodeAddress": this.nBytesNodeAddress, + "nBytesOffset": this.nBytesOffset, + "sByDic": sByDic // binary word graph }; return oJSON; }, _getDate () { Index: graphspell-js/ibdawg.js ================================================================== --- graphspell-js/ibdawg.js +++ graphspell-js/ibdawg.js @@ -100,11 +100,11 @@ catch (e) { throw Error("# Error. File not found or not loadable.\n" + e.message + "\n"); } /* Properties: - sName, nCompressionMethod, sHeader, lArcVal, nArcVal, byDic, sLang, nChar, nBytesArc, nBytesNodeAddress, + sName, nCompressionMethod, sHeader, lArcVal, nArcVal, sByDic, sLang, nChar, nBytesArc, nBytesNodeAddress, nEntries, nNode, nArc, nAff, cStemming, nTag, dChar, nBytesOffset, */ /* Bug workaround. @@ -111,14 +111,16 @@ Mozilla’s JS parser sucks. Can’t read file bigger than 4 Mb! So we convert huge hexadecimal string to list of numbers… https://github.com/mozilla/addons-linter/issues/1361 */ let lTemp = []; - for (let i = 0; i < this.byDic.length; i+=2) { - lTemp.push(parseInt(this.byDic.slice(i, i+2), 16)); + for (let i = 0; i < this.sByDic.length; i+=2) { + lTemp.push(parseInt(this.sByDic.slice(i, i+2), 16)); } + this.sByDic = ""; this.byDic = lTemp; + //this.byDic = new Uint8Array(lTemp); // not quicker, even slower /* end of bug workaround */ if (!this.sHeader.startsWith("/pyfsa/")) { throw TypeError("# Error. Not a pyfsa binary dictionary. Header: " + this.sHeader); } @@ -126,11 +128,10 @@ throw RangeError("# Error. Unknown dictionary compression method: " + this.nCompressionMethod); } // to get the value of an arc, to get the char of an arc with its value this.dChar = helpers.objectToMap(this.dChar); this.dCharVal = this.dChar.gl_reverse(); - //this.byDic = new Uint8Array(this.byDic); // not quicker, even slower if (this.cStemming == "S") { this.funcStemming = str_transform.changeWordWithSuffixCode; } else if (this.cStemming == "A") { this.funcStemming = str_transform.changeWordWithAffixCode; @@ -173,11 +174,12 @@ this.bOptNumSigle = true; this.bOptNumAtLast = false; } getInfo () { - return ` Language: ${this.sLang} Version: ${this.nCompressionMethod} Date: ${this.sDate} Stemming: ${this.cStemming}FX\n` + + return ` Language: ${this.sLangName} Lang code: ${this.sLangCode} Dictionary name: ${this.sDicName}\n` + + ` Compression method: ${this.nCompressionMethod} Date: ${this.sDate} Stemming: ${this.cStemming}FX\n` + ` Arcs values: ${this.nArcVal} = ${this.nChar} characters, ${this.nAff} affixes, ${this.nTag} tags\n` + ` Dictionary: ${this.nEntries} entries, ${this.nNode} nodes, ${this.nArc} arcs\n` + ` Address size: ${this.nBytesNodeAddress} bytes, Arc size: ${this.nBytesArc} bytes\n`; } Index: graphspell/dawg.py ================================================================== --- graphspell/dawg.py +++ graphspell/dawg.py @@ -39,11 +39,11 @@ # We store suffix/affix codes and tags within the graph after the “real” word. # A word is a list of numbers [ c1, c2, c3 . . . cN, iAffix, iTags] # Each arc is an index in self.lArcVal, where are stored characters, suffix/affix codes for stemming and tags. # Important: As usual, the last node (after ‘iTags’) is tagged final, AND the node after ‘cN’ is ALSO tagged final. - def __init__ (self, spfSrc, sLangName, cStemming): + def __init__ (self, spfSrc, sLangCode, sLangName, sDicName, cStemming): print("===== Direct Acyclic Word Graph - Minimal Acyclic Finite State Automaton =====") cStemming = cStemming.upper() if cStemming == "A": funcStemmingGen = st.defineAffixCode elif cStemming == "S": @@ -100,12 +100,14 @@ #with open(spfSrc[:-8]+".valuesfreq.txt", 'w', encoding='utf-8') as hFreqDst: # DEBUG # for iKey, nOcc in sorted(dValOccur.items(), key=lambda t: t[1], reverse=True): # hFreqDst.write("{}: {}\n".format(lVal[iKey], nOcc)) # hFreqDst.close() - self.sFile = spfSrc - self.sLang = sLangName + self.sFileName = spfSrc + self.sLangCode = sLangCode + self.sLangName = sLangName + self.sDicName = sDicName self.nEntry = len(lWord) self.aPreviousEntry = [] DawgNode.resetNextId() self.oRoot = DawgNode() self.lUncheckedNodes = [] # list of nodes that have not been checked for duplication. @@ -406,32 +408,34 @@ with open(spfDst, "w", encoding="utf-8", newline="\n") as hDst: if bInJSModule: hDst.write('// JavaScript\n// Generated data (do not edit)\n\n"use strict";\n\nconst dictionary = ') hDst.write(json.dumps({ - "sName": "todo", - "nCompressionMethod": nCompressionMethod, + "sHeader": "/pyfsa/", + "sLangCode": self.sLangCode, + "sLangName": self.sLangName, + "sDicName": self.sDicName, + "sFileName": self.sFileName, "sDate": str(datetime.datetime.now())[:-7], - "sHeader": "/pyfsa/"+str(nCompressionMethod)+"/", - "lArcVal": self.lArcVal, + "nEntries": self.nEntry, + "nChar": self.nChar, + "nAff": self.nAff, + "nTag": self.nTag, + "cStemming": self.cStemming, + "dChar": self.dChar, + "nNode": self.nNode, + "nArc": self.nArc, "nArcVal": self.nArcVal, + "lArcVal": self.lArcVal, + "nCompressionMethod": nCompressionMethod, + "nBytesArc": self.nBytesArc, + "nBytesNodeAddress": self.nBytesNodeAddress, + "nBytesOffset": self.nBytesOffset # JavaScript is a pile of shit, so Mozilla’s JS parser don’t like file bigger than 4 Mb! # So, if necessary, we use an hexadecimal string, that we will convert later in Firefox’s extension. # https://github.com/mozilla/addons-linter/issues/1361 - "byDic": byDic.hex() if bBinaryDictAsHexString else [ e for e in byDic ], - "sLang": self.sLang, - "nChar": self.nChar, - "nBytesArc": self.nBytesArc, - "nBytesNodeAddress": self.nBytesNodeAddress, - "nEntries": self.nEntry, - "nNode": self.nNode, - "nArc": self.nArc, - "nAff": self.nAff, - "cStemming": self.cStemming, - "nTag": self.nTag, - "dChar": self.dChar, - "nBytesOffset": self.nBytesOffset + "sByDic": byDic.hex() if bBinaryDictAsHexString else [ e for e in byDic ], }, ensure_ascii=False)) if bInJSModule: hDst.write(";\n\nexports.dictionary = dictionary;\n") def _writeBinary (self, sPathFile, nCompressionMethod): @@ -470,11 +474,11 @@ with open(sPathFile, 'wb') as hDst: # header hDst.write("/pyfsa/{}/".format(nCompressionMethod).encode("utf-8")) hDst.write(b"\0\0\0\0") # infos - hDst.write("{}/{}/{}/{}/{}/{}/{}/{}/{}".format(self.sLang, self.nChar, self.nBytesArc, self.nBytesNodeAddress, \ + hDst.write("{}/{}/{}/{}/{}/{}/{}/{}/{}".format(self.sLangName, self.nChar, self.nBytesArc, self.nBytesNodeAddress, \ self.nEntry, self.nNode, self.nArc, self.nAff, self.cStemming).encode("utf-8")) hDst.write(b"\0\0\0\0") # lArcVal hDst.write("\t".join(self.lArcVal).encode("utf-8")) hDst.write(b"\0\0\0\0") @@ -518,13 +522,13 @@ with open("_lexicons.res.txt", "a", encoding='utf-8', newline="\n") as hDst: sFormat1 = "{:<12} {:>12} {:>5} {:>8} {:>8} {:>6} {:>8} {:>9} {:>9} {:>15} {:>12} {:>12}\n" sFormat2 = "{:<12} {:>12,} {:>5,} {:>8,} {:>8} {:>6,} {:>8,} {:>9,} {:>9,} {:>15,} {:>12,} {:>12,}\n" if not bFileExits: hDst.write(sFormat1.format("Lexicon", "Entries", "Chars", "Affixes", "Stemming", "Tags", "Values", "Nodes", "Arcs", "Lexicon (Kb)", "Dict (Kb)", "LT Dict (Kb)")) - hDst.write(sFormat2.format(self.sLang, self.nEntry, self.nChar, self.nAff, self.cStemming + "FX", self.nTag, self.nArcVal, \ - self.nNode, self.nArc, os.path.getsize(self.sFile), os.path.getsize(sPathFile), \ - os.path.getsize("cfsa/dict/{}.dict".format(self.sLang)) if os.path.isfile("cfsa/dict/{}.dict".format(self.sLang)) else 0)) + hDst.write(sFormat2.format(self.sLangName, self.nEntry, self.nChar, self.nAff, self.cStemming + "FX", self.nTag, self.nArcVal, \ + self.nNode, self.nArc, os.path.getsize(self.sFileName), os.path.getsize(sPathFile), \ + os.path.getsize("cfsa/dict/{}.dict".format(self.sLangName)) if os.path.isfile("cfsa/dict/{}.dict".format(self.sLangName)) else 0)) hDst.close() class DawgNode: Index: graphspell/ibdawg.py ================================================================== --- graphspell/ibdawg.py +++ graphspell/ibdawg.py @@ -153,11 +153,12 @@ self.bOptNumSigle = False self.bOptNumAtLast = False def getInfo (self): - return " Language: {0.sLang:>10} Version: {0.nCompressionMethod:>2} Date: {0.sDate} Stemming: {0.cStemming}FX\n" \ + return " Language: {0.sLangName} Lang code: {0.sLangCode} Dictionary name: {0.sDicName}" \ + " Compression method: {0.nCompressionMethod:>2} Date: {0.sDate} Stemming: {0.cStemming}FX\n" \ " Arcs values: {0.nArcVal:>10,} = {0.nChar:>5,} characters, {0.nAff:>6,} affixes, {0.nTag:>6,} tags\n" \ " Dictionary: {0.nEntries:>12,} entries, {0.nNode:>11,} nodes, {0.nArc:>11,} arcs\n" \ " Address size: {0.nBytesNodeAddress:>1} bytes, Arc size: {0.nBytesArc:>1} bytes\n".format(self) def writeAsJSObject (self, spfDest, bInJSModule=False, bBinaryDictAsHexString=False):