Index: gc_lang/fr/build_data.py ================================================================== --- gc_lang/fr/build_data.py +++ gc_lang/fr/build_data.py @@ -50,11 +50,11 @@ def loadDictionary (): global oDict if not oDict: try: - oDict = ibdawg.IBDAWG("fr-allvars.bdic") + oDict = ibdawg.IBDAWG("fr-allvars.json") except: traceback.print_exc() def makeDictionaries (sp, sVersion): Index: gc_lang/fr/oxt/ContextMenu/ContextMenu.py ================================================================== --- gc_lang/fr/oxt/ContextMenu/ContextMenu.py +++ gc_lang/fr/oxt/ContextMenu/ContextMenu.py @@ -133,11 +133,11 @@ oGC = self.ctx.ServiceManager.createInstanceWithContext("org.openoffice.comp.pyuno.Lightproof.grammalecte", self.ctx) if hasattr(oGC, "getSpellChecker"): # https://bugs.documentfoundation.org/show_bug.cgi?id=97790 oSpellChecker = oGC.getSpellChecker() else: - oSpellChecker = SpellChecker("${lang}", "fr-allvars.bdic") + oSpellChecker = SpellChecker("${lang}", "fr-allvars.json") except: traceback.print_exc() def execute (self, args): if not args: Index: gc_lang/fr/oxt/DictOptions/LexiconEditor.py ================================================================== --- gc_lang/fr/oxt/DictOptions/LexiconEditor.py +++ gc_lang/fr/oxt/DictOptions/LexiconEditor.py @@ -406,11 +406,11 @@ def importDictionary (self): spfImported = "" try: xFilePicker = self.xSvMgr.createInstanceWithContext('com.sun.star.ui.dialogs.FilePicker', self.ctx) # other possibility: com.sun.star.ui.dialogs.SystemFilePicker xFilePicker.initialize([uno.getConstantByName("com.sun.star.ui.dialogs.TemplateDescription.FILEOPEN_SIMPLE")]) # seems useless - xFilePicker.appendFilter("Supported files", "*.json; *.bdic") + xFilePicker.appendFilter("Supported files", "*.json") xFilePicker.setDefaultName("fr.__personal__.json") # useless, doesn’t work xFilePicker.setDisplayDirectory("") xFilePicker.setMultiSelectionMode(False) nResult = xFilePicker.execute() if nResult == 1: @@ -461,11 +461,11 @@ def exportDictionary (self): try: xFilePicker = self.xSvMgr.createInstanceWithContext('com.sun.star.ui.dialogs.FilePicker', self.ctx) # other possibility: com.sun.star.ui.dialogs.SystemFilePicker xFilePicker.initialize([uno.getConstantByName("com.sun.star.ui.dialogs.TemplateDescription.FILESAVE_SIMPLE")]) # seems useless - xFilePicker.appendFilter("Supported files", "*.json; *.bdic") + xFilePicker.appendFilter("Supported files", "*.json") xFilePicker.setDefaultName("fr.__personal__.json") # useless, doesn’t work xFilePicker.setDisplayDirectory("") xFilePicker.setMultiSelectionMode(False) nResult = xFilePicker.execute() if nResult == 1: Index: gc_lang/fr/oxt/DictOptions/SearchWords.py ================================================================== --- gc_lang/fr/oxt/DictOptions/SearchWords.py +++ gc_lang/fr/oxt/DictOptions/SearchWords.py @@ -184,11 +184,11 @@ except: traceback.print_exc() def initSpellChecker (self): if not self.oSpellChecker: - self.oSpellChecker = sc.SpellChecker("fr", "fr-allvars.bdic", "", self.oPersonalDicJSON) + self.oSpellChecker = sc.SpellChecker("fr", "fr-allvars.json", "", self.oPersonalDicJSON) @_waitPointer def searchSimilar (self): self.initSpellChecker() sWord = self.xWord.Text.strip() Index: gc_lang/fr/oxt/Graphspell.py ================================================================== --- gc_lang/fr/oxt/Graphspell.py +++ gc_lang/fr/oxt/Graphspell.py @@ -67,11 +67,11 @@ try: personal_dic = json.loads(sPersonalDicJSON) except: print("Graphspell: wrong personal_dic") traceback.print_exc() - self.oGraphspell = SpellChecker("fr", "fr-"+sMainDicName+".bdic", "", personal_dic) + self.oGraphspell = SpellChecker("fr", "fr-"+sMainDicName+".json", "", personal_dic) self.loadHunspell() # print("Graphspell: init done") except: print("Graphspell: init failed") traceback.print_exc() Index: gc_lang/fr/setup.py ================================================================== --- gc_lang/fr/setup.py +++ gc_lang/fr/setup.py @@ -91,11 +91,11 @@ # If there are data files included in your packages that need to be # installed, specify them here. If using Python 2.6 or less, then these # have to be included in MANIFEST.in as well. package_data={ - 'grammalecte': ['graphspell/_dictionaries/*.bdic', '*.txt'] + 'grammalecte': ['graphspell/_dictionaries/*.json', '*.txt'] }, # Although 'package_data' is the preferred approach, in some case you may # need to place data files outside of your packages. See: # http://docs.python.org/3.4/distutils/setupscript.html#installing-additional-files # noqa Index: gc_lang/fr/webext/panel/lex_editor.js ================================================================== --- gc_lang/fr/webext/panel/lex_editor.js +++ gc_lang/fr/webext/panel/lex_editor.js @@ -759,11 +759,11 @@ build: function () { let xProgressNode = document.getElementById("wait_progress"); let lEntry = oLexiconTable.getEntries(); if (lEntry.length > 0) { let oDAWG = new DAWG(lEntry, "S", "fr", "Français", this.sName, this.sDescription, xProgressNode); - let oJSON = oDAWG.createBinaryJSON(1); + let oJSON = oDAWG.createBinaryJSON(); oDictHandler.saveDictionary(this.sName, oJSON); this.oIBDAWG = new IBDAWG(oJSON); this.setDictData(this.oIBDAWG.nEntry, this.oIBDAWG.sDate); } else { oDictHandler.saveDictionary(this.sName, null); Index: graphspell-js/dawg.js ================================================================== --- graphspell-js/dawg.js +++ graphspell-js/dawg.js @@ -342,24 +342,23 @@ } } } // BINARY CONVERSION - createBinaryJSON (nCompressionMethod) { - console.log("Write DAWG as an indexable binary dictionary [method: "+nCompressionMethod+"]"); - if (nCompressionMethod == 1) { - this.nBytesArc = Math.floor( (this.nArcVal.toString(2).length + 2) / 8 ) + 1; // We add 2 bits. See DawgNode.convToBytes1() - this.nBytesOffset = 0; - this._calcNumBytesNodeAddress(); - this._calcNodesAddress1(); - } else { - console.log("Error: unknown compression method"); + _calculateBinary () { + console.log("Write DAWG as an indexable binary dictionary"); + this.nBytesArc = Math.floor( (this.nArcVal.toString(2).length + 2) / 8 ) + 1; // We add 2 bits. See DawgNode.convToBytes() + this.nBytesOffset = 0; + this._calcNumBytesNodeAddress(); + this._calcNodesAddress(); + this.sByDic = this.oRoot.convToBytes(this.nBytesArc, this.nBytesNodeAddress); + for (let oNode of this.dMinimizedNodes.values()) { + this.sByDic += oNode.convToBytes(this.nBytesArc, this.nBytesNodeAddress); } console.log("Arc values (chars, affixes and tags): " + this.nArcVal); console.log("Arc size: "+this.nBytesArc+" bytes, Address size: "+this.nBytesNodeAddress+" bytes"); console.log("-> " + this.nBytesArc+this.nBytesNodeAddress + " * " + this.nArc + " = " + (this.nBytesArc+this.nBytesNodeAddress)*this.nArc + " bytes"); - return this._createJSON(nCompressionMethod); } _calcNumBytesNodeAddress () { // how many bytes needed to store all nodes/arcs in the binary dictionary this.nBytesNodeAddress = 1; @@ -366,27 +365,53 @@ while (((this.nBytesArc + this.nBytesNodeAddress) * this.nArc) > (2 ** (this.nBytesNodeAddress * 8))) { this.nBytesNodeAddress += 1; } } - _calcNodesAddress1 () { + _calcNodesAddress () { let nBytesNode = this.nBytesArc + this.nBytesNodeAddress; let iAddr = this.oRoot.arcs.size * nBytesNode; for (let oNode of this.dMinimizedNodes.values()) { oNode.addr = iAddr; iAddr += Math.max(oNode.arcs.size, 1) * nBytesNode; } } - _createJSON (nCompressionMethod) { - let sByDic = ""; - if (nCompressionMethod == 1) { - sByDic = this.oRoot.convToBytes1(this.nBytesArc, this.nBytesNodeAddress); - for (let oNode of this.dMinimizedNodes.values()) { - sByDic += oNode.convToBytes1(this.nBytesArc, this.nBytesNodeAddress); - } - } + _binaryToList () { + this.lByDic = []; + let nAcc = 0; + let lBytesBuffer = []; + let nDivisor = (this.nBytesArc + this.nBytesNodeAddress) / 2; + for (let i = 0; i < this.sByDic.length; i+=2) { + lBytesBuffer.push(parseInt(this.sByDic.slice(i, i+2), 16)); + if (nAcc == (this.nBytesArc - 1)) { + this.lByDic.push(this._convBytesToInteger(lBytesBuffer)); + lBytesBuffer = []; + } + else if (nAcc == (this.nBytesArc + this.nBytesNodeAddress - 1)) { + this.lByDic.push(Math.round(this._convBytesToInteger(lBytesBuffer) / nDivisor)); // Math.round should be useless, BUT with JS who knowns what can happen… + lBytesBuffer = []; + nAcc = -1; + } + nAcc = nAcc + 1; + } + } + + _convBytesToInteger (aBytes) { + // Byte order = Big Endian (bigger first) + let nVal = 0; + let nWeight = (aBytes.length - 1) * 8; + for (let n of aBytes) { + nVal += n << nWeight; + nWeight = nWeight - 8; + } + return nVal; + } + + createBinaryJSON () { + this._calculateBinary(); + this._binaryToList(); let oJSON = { "sHeader": "/grammalecte-fsa/", "sLangCode": this.sLangCode, "sLangName": this.sLangName, "sDicName": this.sDicName, @@ -401,15 +426,15 @@ "dChar": helpers.mapToObject(this.dChar), "nNode": this.nNode, "nArc": this.nArc, "lArcVal": this.lArcVal, "nArcVal": this.nArcVal, - "nCompressionMethod": nCompressionMethod, "nBytesArc": this.nBytesArc, "nBytesNodeAddress": this.nBytesNodeAddress, "nBytesOffset": this.nBytesOffset, - "sByDic": sByDic, // binary word graph + //"sByDic": this.sByDic, // binary word graph + "lByDic": this.lByDic, "l2grams": Array.from(this.a2grams) }; return oJSON; } @@ -494,11 +519,11 @@ } } } // VERSION 1 ===================================================================================================== - convToBytes1 (nBytesArc, nBytesNodeAddress) { + convToBytes (nBytesArc, nBytesNodeAddress) { /* Node scheme: - Arc length is defined by nBytesArc - Address length is defined by nBytesNodeAddress Index: graphspell-js/dic_merger.js ================================================================== --- graphspell-js/dic_merger.js +++ graphspell-js/dic_merger.js @@ -36,11 +36,11 @@ if (xProgressBar) { xProgressBar.value = xProgressBar.max; } try { let oDAWG = new DAWG(lEntry, cStemming, sLangCode, sLangName, sDicName, sDescription, xProgressBar); - let oDict = oDAWG.createBinaryJSON(1); + let oDict = oDAWG.createBinaryJSON(); return oDict; } catch (e) { console.log("Dictionaries merger: unable to generate merged dictionary"); console.error(e); Index: graphspell-js/ibdawg.js ================================================================== --- graphspell-js/ibdawg.js +++ graphspell-js/ibdawg.js @@ -145,67 +145,60 @@ console.log("dic:" + source.slice(0, 1000)); throw Error("# Error. File not found or not loadable.\n" + e.message + "\n"); } /* Properties: - sName, nCompressionMethod, sHeader, lArcVal, nArcVal, sByDic, sLang, nChar, nBytesArc, nBytesNodeAddress, + sName, sHeader, lArcVal, nArcVal, sByDic, sLang, nChar, nBytesArc, nBytesNodeAddress, nEntry, nNode, nArc, nAff, cStemming, nTag, dChar, nBytesOffset, */ if (!(this.sHeader.startsWith("/grammalecte-fsa/") || this.sHeader.startsWith("/pyfsa/"))) { throw TypeError("# Error. Not a grammalecte-fsa binary dictionary. Header: " + this.sHeader); } - if (!(this.nCompressionMethod == 1 || this.nCompressionMethod == 2 || this.nCompressionMethod == 3)) { - throw RangeError("# Error. Unknown dictionary compression method: " + this.nCompressionMethod); - } // to get the value of an arc, to get the char of an arc with its value this.dChar = helpers.objectToMap(this.dChar); this.dCharVal = this.dChar.gl_reverse(); this.a2grams = (this.l2grams) ? new Set(this.l2grams) : null; + if (!this.hasOwnProperty("lByDic")) { + // old dictionary version + if (!this.hasOwnProperty("sByDic")) { + throw TypeError("# Error. No usable data in the dictionary."); + } + this.lByDic = []; + let nAcc = 0; + let lBytesBuffer = []; + let nDivisor = (this.nBytesArc + this.nBytesNodeAddress) / 2; + for (let i = 0; i < this.sByDic.length; i+=2) { + lBytesBuffer.push(parseInt(this.sByDic.slice(i, i+2), 16)); + if (nAcc == (this.nBytesArc - 1)) { + this.lByDic.push(this._convBytesToInteger(lBytesBuffer)); + lBytesBuffer = []; + } + else if (nAcc == (this.nBytesArc + this.nBytesNodeAddress - 1)) { + this.lByDic.push(Math.round(this._convBytesToInteger(lBytesBuffer) / nDivisor)); // Math.round should be useless, BUT with JS who knowns what can happen… + lBytesBuffer = []; + nAcc = -1; + } + nAcc = nAcc + 1; + } + } + + // masks + this._arcMask = (2 ** ((this.nBytesArc * 8) - 3)) - 1; + this._finalNodeMask = 1 << ((this.nBytesArc * 8) - 1); + this._lastArcMask = 1 << ((this.nBytesArc * 8) - 2); + + // function to decode the affix/suffix code if (this.cStemming == "S") { this.funcStemming = str_transform.changeWordWithSuffixCode; } else if (this.cStemming == "A") { this.funcStemming = str_transform.changeWordWithAffixCode; } else { this.funcStemming = str_transform.noStemming; } - /* - Bug workaround. - Mozilla’s JS parser sucks. Can’t read file bigger than 4 Mb! - So we convert huge hexadecimal string to list of numbers… - https://github.com/mozilla/addons-linter/issues/1361 - */ - /* - Performance trick: - Instead of converting bytes to integers each times we parse the binary dictionary, - we do it once, then parse the array - */ - this.lByDic = []; - let nAcc = 0; - let lBytesBuffer = []; - let nDivisor = (this.nBytesArc + this.nBytesNodeAddress) / 2; - for (let i = 0; i < this.sByDic.length; i+=2) { - lBytesBuffer.push(parseInt(this.sByDic.slice(i, i+2), 16)); - if (nAcc == (this.nBytesArc - 1)) { - this.lByDic.push(this._convBytesToInteger(lBytesBuffer)); - lBytesBuffer = []; - } - else if (nAcc == (this.nBytesArc + this.nBytesNodeAddress - 1)) { - this.lByDic.push(Math.round(this._convBytesToInteger(lBytesBuffer) / nDivisor)); // Math.round should be useless, BUT with JS who knowns what can happen… - lBytesBuffer = []; - nAcc = -1; - } - nAcc = nAcc + 1; - } - /* end of bug workaround */ - - this._arcMask = (2 ** ((this.nBytesArc * 8) - 3)) - 1; - this._finalNodeMask = 1 << ((this.nBytesArc * 8) - 1); - this._lastArcMask = 1 << ((this.nBytesArc * 8) - 2); - //console.log(this.getInfo()); this.bAcronymValid = true; this.bNumAtLastValid = false; // lexicographer module ? @@ -216,11 +209,11 @@ } } getInfo () { return ` Language: ${this.sLangName} Lang code: ${this.sLangCode} Dictionary name: ${this.sDicName}\n` + - ` Compression method: ${this.nCompressionMethod} Date: ${this.sDate} Stemming: ${this.cStemming}FX\n` + + ` Date: ${this.sDate} Stemming: ${this.cStemming}FX\n` + ` Arcs values: ${this.nArcVal} = ${this.nChar} characters, ${this.nAff} affixes, ${this.nTag} tags\n` + ` Dictionary: ${this.nEntry} entries, ${this.nNode} nodes, ${this.nArc} arcs\n` + ` Address size: ${this.nBytesNodeAddress} bytes, Arc size: ${this.nBytesArc} bytes\n`; } @@ -241,11 +234,10 @@ "dChar": helpers.mapToObject(this.dChar), "nNode": this.nNode, "nArc": this.nArc, "lArcVal": this.lArcVal, "nArcVal": this.nArcVal, - "nCompressionMethod": this.nCompressionMethod, "nBytesArc": this.nBytesArc, "nBytesNodeAddress": this.nBytesNodeAddress, "nBytesOffset": this.nBytesOffset, "sByDic": this.sByDic, // binary word graph "l2grams": this.l2grams Index: graphspell/dawg.py ================================================================== --- graphspell/dawg.py +++ graphspell/dawg.py @@ -356,30 +356,20 @@ if not zPattern or zPattern.search(self.lArcVal[nMorphVal]): yield sEntry + "\t" + self.lArcVal[nMorphVal] # BINARY CONVERSION - def _calculateBinary (self, nCompressionMethod): - print(" > Write DAWG as an indexable binary dictionary [method: %d]" % nCompressionMethod) - if nCompressionMethod == 1: - self.nBytesArc = ( (self.nArcVal.bit_length() + 2) // 8 ) + 1 # We add 2 bits. See DawgNode.convToBytes1() - self.nBytesOffset = 0 - self._calcNumBytesNodeAddress() - self._calcNodesAddress1() - elif nCompressionMethod == 2: - self.nBytesArc = ( (self.nArcVal.bit_length() + 3) // 8 ) + 1 # We add 3 bits. See DawgNode.convToBytes2() - self.nBytesOffset = 0 - self._calcNumBytesNodeAddress() - self._calcNodesAddress2() - elif nCompressionMethod == 3: - self.nBytesArc = ( (self.nArcVal.bit_length() + 3) // 8 ) + 1 # We add 3 bits. See DawgNode.convToBytes3() - self.nBytesOffset = 1 - self.nMaxOffset = (2 ** (self.nBytesOffset * 8)) - 1 - self._calcNumBytesNodeAddress() - self._calcNodesAddress3() - else: - print(" # Error: unknown compression method") + def _calculateBinary (self): + print(" > Write DAWG as an indexable binary dictionary") + self.nBytesArc = ( (self.nArcVal.bit_length() + 2) // 8 ) + 1 # We add 2 bits. See DawgNode.convToBytes() + self.nBytesOffset = 0 + self._calcNumBytesNodeAddress() + self._calcNodesAddress() + self.byDic = b"" + self.byDic = self.oRoot.convToBytes(self.nBytesArc, self.nBytesNodeAddress) + for oNode in self.lMinimizedNodes: + self.byDic += oNode.convToBytes(self.nBytesArc, self.nBytesNodeAddress) print(" Arc values (chars, affixes and tags): {} -> {} bytes".format( self.nArcVal, len("\t".join(self.lArcVal).encode("utf-8")) )) print(" Arc size: {} bytes, Address size: {} bytes -> {} * {} = {} bytes".format( self.nBytesArc, self.nBytesNodeAddress, \ self.nBytesArc+self.nBytesNodeAddress, self.nArc, \ (self.nBytesArc+self.nBytesNodeAddress)*self.nArc )) @@ -387,70 +377,44 @@ "how many bytes needed to store all nodes/arcs in the binary dictionary" self.nBytesNodeAddress = 1 while ((self.nBytesArc + self.nBytesNodeAddress) * self.nArc) > (2 ** (self.nBytesNodeAddress * 8)): self.nBytesNodeAddress += 1 - def _calcNodesAddress1 (self): + def _calcNodesAddress (self): nBytesNode = self.nBytesArc + self.nBytesNodeAddress iAddr = len(self.oRoot.arcs) * nBytesNode for oNode in self.lMinimizedNodes: oNode.addr = iAddr iAddr += max(len(oNode.arcs), 1) * nBytesNode - def _calcNodesAddress2 (self): - nBytesNode = self.nBytesArc + self.nBytesNodeAddress - iAddr = len(self.oRoot.arcs) * nBytesNode - for oNode in self.lSortedNodes: - oNode.addr = iAddr - iAddr += max(len(oNode.arcs), 1) * nBytesNode - for oNextNode in oNode.arcs.values(): - if (oNode.pos + 1) == oNextNode.pos: - iAddr -= self.nBytesNodeAddress - #break - - def _calcNodesAddress3 (self): - nBytesNode = self.nBytesArc + self.nBytesNodeAddress - # theorical nodes size if only addresses and no offset - self.oRoot.size = len(self.oRoot.arcs) * nBytesNode - for oNode in self.lSortedNodes: - oNode.size = max(len(oNode.arcs), 1) * nBytesNode - # rewind and calculate dropdown from the end, several times - nDiff = self.nBytesNodeAddress - self.nBytesOffset - bEnd = False - while not bEnd: - bEnd = True - # recalculate addresses - iAddr = self.oRoot.size - for oNode in self.lSortedNodes: - oNode.addr = iAddr - iAddr += oNode.size - # rewind and calculate dropdown from the end, several times - for i in range(self.nNode-1, -1, -1): - nSize = max(len(self.lSortedNodes[i].arcs), 1) * nBytesNode - for oNextNode in self.lSortedNodes[i].arcs.values(): - if 1 < (oNextNode.addr - self.lSortedNodes[i].addr) < self.nMaxOffset: - nSize -= nDiff - if self.lSortedNodes[i].size != nSize: - self.lSortedNodes[i].size = nSize - bEnd = False - - def getBinaryAsJSON (self, nCompressionMethod=1, bBinaryDictAsHexString=True): + def _binaryToList (self): + """ + Convert binary string to binary list + BEFORE: Arc Address Arc Address + ||||||||| ||||||||| ||||||||| ||||||||| ||||||||| ||||||||| ||||||||| ||||||||| ||||||||| ||||||||| ||||||||| ||||||||| ... + + AFTER: list of integers: [ arc, address, arc, address, arc, address, ... arc, address ] + """ + self.lByDic = [] + nAcc = 0 + byBuffer = b"" + nDivisor = (self.nBytesArc + self.nBytesNodeAddress) / 2 + for i in range(0, len(self.byDic)): + byBuffer += self.byDic[i:i+1] + if nAcc == (self.nBytesArc - 1): + self.lByDic.append(int.from_bytes(byBuffer, byteorder="big")) + byBuffer = b"" + elif nAcc == (self.nBytesArc + self.nBytesNodeAddress - 1): + self.lByDic.append(round(int.from_bytes(byBuffer, byteorder="big") / nDivisor)) + byBuffer = b"" + nAcc = -1 + nAcc = nAcc + 1 + + def getBinaryAsJSON (self): "return a JSON string containing all necessary data of the dictionary (compressed as a binary string)" - self._calculateBinary(nCompressionMethod) - byDic = b"" - if nCompressionMethod == 1: - byDic = self.oRoot.convToBytes1(self.nBytesArc, self.nBytesNodeAddress) - for oNode in self.lMinimizedNodes: - byDic += oNode.convToBytes1(self.nBytesArc, self.nBytesNodeAddress) - elif nCompressionMethod == 2: - byDic = self.oRoot.convToBytes2(self.nBytesArc, self.nBytesNodeAddress) - for oNode in self.lSortedNodes: - byDic += oNode.convToBytes2(self.nBytesArc, self.nBytesNodeAddress) - elif nCompressionMethod == 3: - byDic = self.oRoot.convToBytes3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset) - for oNode in self.lSortedNodes: - byDic += oNode.convToBytes3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset) + self._calculateBinary() + self._binaryToList() return { "sHeader": "/grammalecte-fsa/", "sLangCode": self.sLangCode, "sLangName": self.sLangName, "sDicName": self.sDicName, @@ -465,128 +429,39 @@ "dChar": self.dChar, "nNode": self.nNode, "nArc": self.nArc, "nArcVal": self.nArcVal, "lArcVal": self.lArcVal, - "nCompressionMethod": nCompressionMethod, "nBytesArc": self.nBytesArc, "nBytesNodeAddress": self.nBytesNodeAddress, "nBytesOffset": self.nBytesOffset, # Mozilla’s JS parser don’t like file bigger than 4 Mb! # So, if necessary, we use an hexadecimal string, that we will convert later in Firefox’s extension. # https://github.com/mozilla/addons-linter/issues/1361 - "sByDic": byDic.hex() if bBinaryDictAsHexString else [ e for e in byDic ], + #"sByDic": self.byDic.hex(), + "lByDic": self.lByDic, "l2grams": list(self.a2grams) } - def writeAsJSObject (self, spfDst, nCompressionMethod, bInJSModule=False, bBinaryDictAsHexString=True): + def writeAsJSObject (self, spfDst): "write a file (JSON or JS module) with all the necessary data" if not spfDst.endswith(".json"): - spfDst += "."+str(nCompressionMethod)+".json" + spfDst += ".json" with open(spfDst, "w", encoding="utf-8", newline="\n") as hDst: - if bInJSModule: - hDst.write('// JavaScript\n// Generated data (do not edit)\n\n"use strict";\n\nconst dictionary = ') - hDst.write( json.dumps(self.getBinaryAsJSON(nCompressionMethod, bBinaryDictAsHexString), ensure_ascii=False) ) - if bInJSModule: - hDst.write(";\n\nexports.dictionary = dictionary;\n") - - def writeBinary (self, sPathFile, nCompressionMethod, bDebug=False): - """ - Save as a binary file. - - Format of the binary indexable dictionary: - Each section is separated with 4 bytes of \0 - - - Section Header: - /grammalecte-fsa/[compression method] - * compression method is an ASCII string - - - Section Informations: - /[lang code] - /[lang name] - /[dictionary name] - /[date creation] - /[number of chars] - /[number of bytes for each arc] - /[number of bytes for each address node] - /[number of entries] - /[number of nodes] - /[number of arcs] - /[number of affixes] - * each field is a ASCII string - /[stemming code] - * "S" means stems are generated by /suffix_code/, - "A" means they are generated by /affix_code/ - See defineSuffixCode() and defineAffixCode() for details. - "N" means no stemming - - - Section Values: - * a list of strings encoded in binary from utf-8, each value separated with a tabulation - - - Section Word Graph (nodes / arcs) - * A list of nodes which are a list of arcs with an address of the next node. - See DawgNode.convToBytes() for details. - - - Section 2grams: - * A list of 2grams (as strings: 2 chars) encoded in binary from utf-8, each value separated with a tabulation - """ - self._calculateBinary(nCompressionMethod) - if not sPathFile.endswith(".bdic"): - sPathFile += "."+str(nCompressionMethod)+".bdic" - with open(sPathFile, 'wb') as hDst: - # header - hDst.write("/grammalecte-fsa/{}/".format(nCompressionMethod).encode("utf-8")) - hDst.write(b"\0\0\0\0") - # infos - sInfo = "{}//{}//{}//{}//{}//{}//{}//{}//{}//{}//{}//{}//{}".format(self.sLangCode, self.sLangName, self.sDicName, self.sDescription, self._getDate(), \ - self.nChar, self.nBytesArc, self.nBytesNodeAddress, \ - self.nEntry, self.nNode, self.nArc, self.nAff, self.cStemming) - hDst.write(sInfo.encode("utf-8")) - hDst.write(b"\0\0\0\0") - # lArcVal - hDst.write("\t".join(self.lArcVal).encode("utf-8")) - hDst.write(b"\0\0\0\0") - # 2grams - hDst.write("\t".join(self.a2grams).encode("utf-8")) - hDst.write(b"\0\0\0\0") - # DAWG: nodes / arcs - if nCompressionMethod == 1: - hDst.write(self.oRoot.convToBytes1(self.nBytesArc, self.nBytesNodeAddress)) - for oNode in self.lMinimizedNodes: - hDst.write(oNode.convToBytes1(self.nBytesArc, self.nBytesNodeAddress)) - elif nCompressionMethod == 2: - hDst.write(self.oRoot.convToBytes2(self.nBytesArc, self.nBytesNodeAddress)) - for oNode in self.lSortedNodes: - hDst.write(oNode.convToBytes2(self.nBytesArc, self.nBytesNodeAddress)) - elif nCompressionMethod == 3: - hDst.write(self.oRoot.convToBytes3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset)) - for oNode in self.lSortedNodes: - hDst.write(oNode.convToBytes3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset)) - if bDebug: - self._writeNodes(sPathFile, nCompressionMethod) + hDst.write( json.dumps(self.getBinaryAsJSON(), ensure_ascii=False) ) def _getDate (self): return time.strftime("%Y-%m-%d %H:%M:%S") - def _writeNodes (self, sPathFile, nCompressionMethod): + def _writeNodes (self, sPathFile): "for debugging only" print(" > Write nodes") - with open(sPathFile+".nodes."+str(nCompressionMethod)+".txt", 'w', encoding='utf-8', newline="\n") as hDst: - if nCompressionMethod == 1: - hDst.write(self.oRoot.getTxtRepr1(self.nBytesArc, self.lArcVal)+"\n") - #hDst.write( ''.join( [ "%02X " % z for z in self.oRoot.convToBytes1(self.nBytesArc, self.nBytesNodeAddress) ] ).strip() ) - for oNode in self.lMinimizedNodes: - hDst.write(oNode.getTxtRepr1(self.nBytesArc, self.lArcVal)+"\n") - if nCompressionMethod == 2: - hDst.write(self.oRoot.getTxtRepr2(self.nBytesArc, self.lArcVal)+"\n") - for oNode in self.lSortedNodes: - hDst.write(oNode.getTxtRepr2(self.nBytesArc, self.lArcVal)+"\n") - if nCompressionMethod == 3: - hDst.write(self.oRoot.getTxtRepr3(self.nBytesArc, self.nBytesOffset, self.lArcVal)+"\n") - #hDst.write( ''.join( [ "%02X " % z for z in self.oRoot.convToBytes3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset) ] ).strip() ) - for oNode in self.lSortedNodes: - hDst.write(oNode.getTxtRepr3(self.nBytesArc, self.nBytesOffset, self.lArcVal)+"\n") + with open(sPathFile+".nodes.txt", 'w', encoding='utf-8', newline="\n") as hDst: + hDst.write(self.oRoot.getTxtRepr(self.nBytesArc, self.lArcVal)+"\n") + #hDst.write( ''.join( [ "%02X " % z for z in self.oRoot.convToBytes(self.nBytesArc, self.nBytesNodeAddress) ] ).strip() ) + for oNode in self.lMinimizedNodes: + hDst.write(oNode.getTxtRepr(self.nBytesArc, self.lArcVal)+"\n") class DawgNode: """Node of the word graph""" @@ -599,11 +474,10 @@ DawgNode.NextId += 1 self.final = False self.arcs = {} # key: arc value; value: a node self.addr = 0 # address in the binary dictionary self.pos = 0 # position in the binary dictionary (version 2) - self.size = 0 # size of node in bytes (version 3) @classmethod def resetNextId (cls): "set NextId to 0 " cls.NextId = 0 @@ -646,11 +520,11 @@ def sortArcs2 (self, dValOccur, lArcVal): "sort arcs of each node depending on the previous char" self.arcs = collections.OrderedDict(sorted(self.arcs.items(), key=lambda t: dValOccur.get(lArcVal[t[0]], 0), reverse=True)) # VERSION 1 ===================================================================================================== - def convToBytes1 (self, nBytesArc, nBytesNodeAddress): + def convToBytes (self, nBytesArc, nBytesNodeAddress): """ Convert to bytes (method 1). Node scheme: - Arc length is defined by nBytesArc @@ -688,11 +562,11 @@ val = val | nFinalArcMask by += val.to_bytes(nBytesArc, byteorder='big') by += self.arcs[arc].addr.to_bytes(nBytesNodeAddress, byteorder='big') return by - def getTxtRepr1 (self, nBytesArc, lVal): + def getTxtRepr (self, nBytesArc, lVal): "return representation as string of node (method 1)" nArc = len(self.arcs) nFinalNodeMask = 1 << ((nBytesArc*8)-1) nFinalArcMask = 1 << ((nBytesArc*8)-2) s = "i{:_>10} -- #{:_>10}\n".format(self.i, self.addr) @@ -705,161 +579,10 @@ val = val | nFinalNodeMask if i == nArc: val = val | nFinalArcMask s += " {:<20} {:0>16} i{:_>10} #{:_>10}\n".format(lVal[arc], bin(val)[2:], self.arcs[arc].i, self.arcs[arc].addr) return s - - # VERSION 2 ===================================================================================================== - def convToBytes2 (self, nBytesArc, nBytesNodeAddress): - """ - Convert to bytes (method 2). - - Node scheme: - - Arc length is defined by nBytesArc - - Address length is defined by nBytesNodeAddress - - | Arc | Address of next node | - | | | - ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓ - ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ - ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛ - [...] - ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓ - ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ - ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛ - ^ ^ ^ - ┃ ┃ ┃ - ┃ ┃ ┗━━ if 1, caution, no address: next node is the following node - ┃ ┗━━━━ if 1, last arc of this node - ┗━━━━━━ if 1, this node is final (only on the first arc) - """ - nArc = len(self.arcs) - nFinalNodeMask = 1 << ((nBytesArc*8)-1) - nFinalArcMask = 1 << ((nBytesArc*8)-2) - nNextNodeMask = 1 << ((nBytesArc*8)-3) - if not nArc: - val = nFinalNodeMask | nFinalArcMask - by = val.to_bytes(nBytesArc, byteorder='big') - by += (0).to_bytes(nBytesNodeAddress, byteorder='big') - return by - by = b"" - for i, arc in enumerate(self.arcs, 1): - val = arc - if i == 1 and self.final: - val = val | nFinalNodeMask - if i == nArc: - val = val | nFinalArcMask - if (self.pos + 1) == self.arcs[arc].pos and self.i != 0: - val = val | nNextNodeMask - by += val.to_bytes(nBytesArc, byteorder='big') - else: - by += val.to_bytes(nBytesArc, byteorder='big') - by += self.arcs[arc].addr.to_bytes(nBytesNodeAddress, byteorder='big') - return by - - def getTxtRepr2 (self, nBytesArc, lVal): - "return representation as string of node (method 2)" - nArc = len(self.arcs) - nFinalNodeMask = 1 << ((nBytesArc*8)-1) - nFinalArcMask = 1 << ((nBytesArc*8)-2) - nNextNodeMask = 1 << ((nBytesArc*8)-3) - s = "i{:_>10} -- #{:_>10}\n".format(self.i, self.addr) - if not nArc: - s += " {:<20} {:0>16} i{:_>10} #{:_>10}\n".format("", bin(nFinalNodeMask | nFinalArcMask)[2:], "0", "0") - return s - for i, arc in enumerate(self.arcs, 1): - val = arc - if i == 1 and self.final: - val = val | nFinalNodeMask - if i == nArc: - val = val | nFinalArcMask - if (self.pos + 1) == self.arcs[arc].pos and self.i != 0: - val = val | nNextNodeMask - s += " {:<20} {:0>16}\n".format(lVal[arc], bin(val)[2:]) - else: - s += " {:<20} {:0>16} i{:_>10} #{:_>10}\n".format(lVal[arc], bin(val)[2:], self.arcs[arc].i, self.arcs[arc].addr) - return s - - # VERSION 3 ===================================================================================================== - def convToBytes3 (self, nBytesArc, nBytesNodeAddress, nBytesOffset): - """ - Convert to bytes (method 3). - - Node scheme: - - Arc length is defined by nBytesArc - - Address length is defined by nBytesNodeAddress - - Offset length is defined by nBytesOffset - - | Arc | Address of next node or offset to next node | - | | | - ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓ - ┃1┃0┃0┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ - ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛ - [...] - ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓ - ┃0┃0┃1┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ Offsets are shorter than addresses - ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛ - ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓ - ┃0┃1┃0┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ - ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛ - - ^ ^ ^ - ┃ ┃ ┃ - ┃ ┃ ┗━━ if 1, offset instead of address of next node - ┃ ┗━━━━ if 1, last arc of this node - ┗━━━━━━ if 1, this node is final (only on the first arc) - """ - nArc = len(self.arcs) - nFinalNodeMask = 1 << ((nBytesArc*8)-1) - nFinalArcMask = 1 << ((nBytesArc*8)-2) - nNextNodeMask = 1 << ((nBytesArc*8)-3) - nMaxOffset = (2 ** (nBytesOffset * 8)) - 1 - if not nArc: - val = nFinalNodeMask | nFinalArcMask - by = val.to_bytes(nBytesArc, byteorder='big') - by += (0).to_bytes(nBytesNodeAddress, byteorder='big') - return by - by = b"" - for i, arc in enumerate(self.arcs, 1): - val = arc - if i == 1 and self.final: - val = val | nFinalNodeMask - if i == nArc: - val = val | nFinalArcMask - if 1 < (self.arcs[arc].addr - self.addr) < nMaxOffset and self.i != 0: - val = val | nNextNodeMask - by += val.to_bytes(nBytesArc, byteorder='big') - by += (self.arcs[arc].addr-self.addr).to_bytes(nBytesOffset, byteorder='big') - else: - by += val.to_bytes(nBytesArc, byteorder='big') - by += self.arcs[arc].addr.to_bytes(nBytesNodeAddress, byteorder='big') - return by - - def getTxtRepr3 (self, nBytesArc, nBytesOffset, lVal): - "return representation as string of node (method 3)" - nArc = len(self.arcs) - nFinalNodeMask = 1 << ((nBytesArc*8)-1) - nFinalArcMask = 1 << ((nBytesArc*8)-2) - nNextNodeMask = 1 << ((nBytesArc*8)-3) - nMaxOffset = (2 ** (nBytesOffset * 8)) - 1 - s = "i{:_>10} -- #{:_>10} ({})\n".format(self.i, self.addr, self.size) - if not nArc: - s += " {:<20} {:0>16} i{:_>10} #{:_>10}\n".format("", bin(nFinalNodeMask | nFinalArcMask)[2:], "0", "0") - return s - for i, arc in enumerate(self.arcs, 1): - val = arc - if i == 1 and self.final: - val = val | nFinalNodeMask - if i == nArc: - val = val | nFinalArcMask - if 1 < (self.arcs[arc].addr - self.addr) < nMaxOffset and self.i != 0: - val = val | nNextNodeMask - s += " {:<20} {:0>16} i{:_>10} +{:_>10}\n".format(lVal[arc], bin(val)[2:], self.arcs[arc].i, self.arcs[arc].addr - self.addr) - else: - s += " {:<20} {:0>16} i{:_>10} #{:_>10}\n".format(lVal[arc], bin(val)[2:], self.arcs[arc].i, self.arcs[arc].addr) - return s - # Another attempt to sort node arcs _dCharOrder = { Index: graphspell/ibdawg.py ================================================================== --- graphspell/ibdawg.py +++ graphspell/ibdawg.py @@ -112,42 +112,43 @@ class IBDAWG: """INDEXABLE BINARY DIRECT ACYCLIC WORD GRAPH""" def __init__ (self, source): if isinstance(source, str): - self.by = pkgutil.get_data(__package__, "_dictionaries/" + source) - if not self.by: - raise OSError("# Error. File not found or not loadable: "+source) - - if source.endswith(".bdic"): - self._initBinary() - elif source.endswith(".json"): - self._initJSON(json.loads(self.by.decode("utf-8"))) #json.loads(self.by) # In Python 3.6, can read directly binary strings - else: - raise OSError("# Error. Unknown file type: "+source) - else: - self._initJSON(source) - - self.sFileName = source if isinstance(source, str) else "[None]" - - # Performance trick: - # Instead of converting bytes to integers each times we parse the binary dictionary, - # we do it once, then parse the array - nAcc = 0 - byBuffer = b"" - self.lByDic = [] - nDivisor = (self.nBytesArc + self.nBytesNodeAddress) / 2 - for i in range(0, len(self.byDic)): - byBuffer += self.byDic[i:i+1] - if nAcc == (self.nBytesArc - 1): - self.lByDic.append(int.from_bytes(byBuffer, byteorder="big")) - byBuffer = b"" - elif nAcc == (self.nBytesArc + self.nBytesNodeAddress - 1): - self.lByDic.append(round(int.from_bytes(byBuffer, byteorder="big") / nDivisor)) - byBuffer = b"" - nAcc = -1 - nAcc = nAcc + 1 + by = pkgutil.get_data(__package__, "_dictionaries/" + source) + if not by: + raise OSError("# Error. File not found or not loadable: "+source) + self.sFileName = source + oData = json.loads(by.decode("utf-8")) #json.loads(by) # In Python 3.6, can read directly binary strings + else: + self.sFileName = "[None]" + oData = source + + self.__dict__.update(oData) + self.dCharVal = { v: k for k, v in self.dChar.items() } + self.a2grams = set(getattr(self, 'l2grams')) if hasattr(self, 'l2grams') else None + + if "lByDic" not in oData: + print(">>>> lByDic not in oData") + if "sByDic" not in oData: + raise TypeError("# Error. No usable data in the dictionary.") + # old dictionary version + self.lByDic = [] + self.byDic = binascii.unhexlify(oData["sByDic"]) + nAcc = 0 + byBuffer = b"" + nDivisor = (self.nBytesArc + self.nBytesNodeAddress) / 2 + for i in range(0, len(self.byDic)): + byBuffer += self.byDic[i:i+1] + if nAcc == (self.nBytesArc - 1): + self.lByDic.append(int.from_bytes(byBuffer, byteorder="big")) + byBuffer = b"" + elif nAcc == (self.nBytesArc + self.nBytesNodeAddress - 1): + self.lByDic.append(round(int.from_bytes(byBuffer, byteorder="big") / nDivisor)) + byBuffer = b"" + nAcc = -1 + nAcc = nAcc + 1 # masks self._arcMask = (2 ** ((self.nBytesArc * 8) - 3)) - 1 self._finalNodeMask = 1 << ((self.nBytesArc * 8) - 1) self._lastArcMask = 1 << ((self.nBytesArc * 8) - 2) @@ -168,101 +169,18 @@ try: self.lexicographer = importlib.import_module(".lexgraph_"+self.sLangCode, "grammalecte.graphspell") except ImportError: print("# No module ") - - def _initBinary (self): - "initialize with binary structure file" - if self.by[0:17] != b"/grammalecte-fsa/": - raise TypeError("# Error. Not a grammalecte-fsa binary dictionary. Header: {}".format(self.by[0:9])) - if not(self.by[17:18] == b"1" or self.by[17:18] == b"2" or self.by[17:18] == b"3"): - raise ValueError("# Error. Unknown dictionary version: {}".format(self.by[17:18])) - try: - byHeader, byInfo, byValues, by2grams, byDic = self.by.split(b"\0\0\0\0", 4) - except Exception: - raise Exception - - self.nCompressionMethod = int(self.by[17:18].decode("utf-8")) - self.sHeader = byHeader.decode("utf-8") - self.lArcVal = byValues.decode("utf-8").split("\t") - self.nArcVal = len(self.lArcVal) - self.byDic = byDic - self.a2grams = set(by2grams.decode("utf-8").split("\t")) - - l = byInfo.decode("utf-8").split("//") - self.sLangCode = l.pop(0) - self.sLangName = l.pop(0) - self.sDicName = l.pop(0) - self.sDescription = l.pop(0) - self.sDate = l.pop(0) - self.nChar = int(l.pop(0)) - self.nBytesArc = int(l.pop(0)) - self.nBytesNodeAddress = int(l.pop(0)) - self.nEntry = int(l.pop(0)) - self.nNode = int(l.pop(0)) - self.nArc = int(l.pop(0)) - self.nAff = int(l.pop(0)) - self.cStemming = l.pop(0) - self.nTag = self.nArcVal - self.nChar - self.nAff - # to get the value of an arc, to get the char of an arc with its value - self.dChar = {} - for i in range(1, self.nChar+1): - self.dChar[self.lArcVal[i]] = i - self.dCharVal = { v: k for k, v in self.dChar.items() } - - def _initJSON (self, oJSON): - "initialize with a JSON text file" - self.sByDic = "" # init to prevent pylint whining - self.__dict__.update(oJSON) - self.byDic = binascii.unhexlify(self.sByDic) - self.dCharVal = { v: k for k, v in self.dChar.items() } - self.a2grams = set(getattr(self, 'l2grams')) if hasattr(self, 'l2grams') else None - def getInfo (self): "return string about the IBDAWG" return " Language: {0.sLangName} Lang code: {0.sLangCode} Dictionary name: {0.sDicName}" \ - " Compression method: {0.nCompressionMethod:>2} Date: {0.sDate} Stemming: {0.cStemming}FX\n" \ + " Date: {0.sDate} Stemming: {0.cStemming}FX\n" \ " Arcs values: {0.nArcVal:>10,} = {0.nChar:>5,} characters, {0.nAff:>6,} affixes, {0.nTag:>6,} tags\n" \ " Dictionary: {0.nEntry:>12,} entries, {0.nNode:>11,} nodes, {0.nArc:>11,} arcs\n" \ " Address size: {0.nBytesNodeAddress:>1} bytes, Arc size: {0.nBytesArc:>1} bytes\n".format(self) - def writeAsJSObject (self, spfDest, bInJSModule=False, bBinaryDictAsHexString=False): - "write IBDAWG as a JavaScript object in a JavaScript module" - with open(spfDest, "w", encoding="utf-8", newline="\n") as hDst: - if bInJSModule: - hDst.write('// JavaScript\n// Generated data (do not edit)\n\n"use strict";\n\nconst dictionary = ') - hDst.write(json.dumps({ - "sHeader": "/grammalecte-fsa/", - "sLangCode": self.sLangCode, - "sLangName": self.sLangName, - "sDicName": self.sDicName, - "sDescription": self.sDescription, - "sFileName": self.sFileName, - "sDate": self.sDate, - "nEntry": self.nEntry, - "nChar": self.nChar, - "nAff": self.nAff, - "nTag": self.nTag, - "cStemming": self.cStemming, - "dChar": self.dChar, - "nNode": self.nNode, - "nArc": self.nArc, - "nArcVal": self.nArcVal, - "lArcVal": self.lArcVal, - "nCompressionMethod": self.nCompressionMethod, - "nBytesArc": self.nBytesArc, - "nBytesNodeAddress": self.nBytesNodeAddress, - # JavaScript is a pile of shit, so Mozilla’s JS parser don’t like file bigger than 4 Mb! - # So, if necessary, we use an hexadecimal string, that we will convert later in Firefox’s extension. - # https://github.com/mozilla/addons-linter/issues/1361 - "sByDic": self.byDic.hex() if bBinaryDictAsHexString else [ e for e in self.byDic ], - "l2grams": list(self.a2grams) - }, ensure_ascii=False)) - if bInJSModule: - hDst.write(";\n\nexports.dictionary = dictionary;\n") - def isValidToken (self, sToken): "checks if is valid (if there is hyphens in , is split, each part is checked)" sToken = st.spellingNormalization(sToken) if self.isValid(sToken): return True Index: graphspell/spellchecker.py ================================================================== --- graphspell/spellchecker.py +++ graphspell/spellchecker.py @@ -14,12 +14,12 @@ from . import ibdawg from . import tokenizer dDefaultDictionaries = { - "fr": "fr-allvars.bdic", - "en": "en.bdic" + "fr": "fr-allvars.json", + "en": "en.json" } class SpellChecker (): "SpellChecker: wrapper for the IBDAWG class" Index: lex_build.py ================================================================== --- lex_build.py +++ lex_build.py @@ -9,20 +9,18 @@ import graphspell.dawg as fsa from graphspell.ibdawg import IBDAWG -def build (spfSrc, sLangCode, sLangName, sfDict, bJSON=False, sDicName="", sDescription="", sFilter="", cStemmingMethod="S", nCompressMethod=1): +def build (spfSrc, sLangCode, sLangName, sfDict, bJavaScript=False, sDicName="", sDescription="", sFilter="", cStemmingMethod="S", nCompressMethod=1): "transform a text lexicon as a binary indexable dictionary" oDAWG = fsa.DAWG(spfSrc, cStemmingMethod, sLangCode, sLangName, sDicName, sDescription, sFilter) dir_util.mkpath("graphspell/_dictionaries") - oDAWG.writeInfo("graphspell/_dictionaries/" + sfDict + ".info.txt") - oDAWG.writeBinary("graphspell/_dictionaries/" + sfDict + ".bdic", int(nCompressMethod)) - if bJSON: + oDAWG.writeAsJSObject("graphspell/_dictionaries/" + sfDict + ".json") + if bJavaScript: dir_util.mkpath("graphspell-js/_dictionaries") - oDic = IBDAWG(sfDict + ".bdic") - oDic.writeAsJSObject("graphspell-js/_dictionaries/" + sfDict + ".json", bBinaryDictAsHexString=True) + oDAWG.writeAsJSObject("graphspell-js/_dictionaries/" + sfDict + ".json") def main (): "parse args from CLI" xParser = argparse.ArgumentParser() Index: make.py ================================================================== --- make.py +++ make.py @@ -315,22 +315,22 @@ if bCommunityDict: lDict.append(("community", dVars['dic_community_filename'])) if bPersonalDict: lDict.append(("personal", dVars['dic_personal_filename'])) for sType, sFileName in lDict: - spfPyDic = f"graphspell/_dictionaries/{sFileName}.bdic" + spfPyDic = f"graphspell/_dictionaries/{sFileName}.json" spfJSDic = f"graphspell-js/_dictionaries/{sFileName}.json" if not os.path.isfile(spfPyDic) or (bJavaScript and not os.path.isfile(spfJSDic)): buildDictionary(dVars, sType, bJavaScript) print(" +", spfPyDic) file_util.copy_file(spfPyDic, "grammalecte/graphspell/_dictionaries") - dVars['dic_'+sType+'_filename_py'] = sFileName + '.bdic' + dVars['dic_'+sType+'_filename_py'] = sFileName + '.json' if bJavaScript: print(" +", spfJSDic) file_util.copy_file(spfJSDic, "grammalecte-js/graphspell/_dictionaries") dVars['dic_'+sType+'_filename_js'] = sFileName + '.json' - dVars['dic_main_filename_py'] = dVars['dic_default_filename_py'] + ".bdic" + dVars['dic_main_filename_py'] = dVars['dic_default_filename_py'] + ".json" dVars['dic_main_filename_js'] = dVars['dic_default_filename_js'] + ".json" def buildDictionary (dVars, sType, bJavaScript=False): "build binary dictionary for Graphspell from lexicons" Index: reader.py ================================================================== --- reader.py +++ reader.py @@ -5,11 +5,11 @@ import sys import re import graphspell.ibdawg as ibdawg -oDict = ibdawg.IBDAWG("French.bdic") +oDict = ibdawg.IBDAWG("fr-allvars.json") def readFile (spf): if os.path.isfile(spf): with open(spf, "r", encoding="utf-8") as hSrc: @@ -24,11 +24,11 @@ with open(spf+".res.txt", "w", encoding="utf-8") as hDst: for sLine in readFile(spfSrc): sLine = sLine.strip() if sLine: for sWord in sLine.split(): - if not oDict.isValid(sWord): + if not oDict.isValid(sWord): hDst.write(sWord+"\n") # -------------------------------------------------------------------------------------------------- def createLexStatFile (spf, dStat):