Index: gc_lang/fr/build_data.py
==================================================================
--- gc_lang/fr/build_data.py
+++ gc_lang/fr/build_data.py
@@ -50,11 +50,11 @@

 def loadDictionary ():
     global oDict
     if not oDict:
         try:
-            oDict = ibdawg.IBDAWG("fr-allvars.bdic")
+            oDict = ibdawg.IBDAWG("fr-allvars.json")
         except:
             traceback.print_exc()


 def makeDictionaries (sp, sVersion):

Index: gc_lang/fr/oxt/ContextMenu/ContextMenu.py
==================================================================
--- gc_lang/fr/oxt/ContextMenu/ContextMenu.py
+++ gc_lang/fr/oxt/ContextMenu/ContextMenu.py
@@ -133,11 +133,11 @@
             oGC = self.ctx.ServiceManager.createInstanceWithContext("org.openoffice.comp.pyuno.Lightproof.grammalecte", self.ctx)
             if hasattr(oGC, "getSpellChecker"):
                 # https://bugs.documentfoundation.org/show_bug.cgi?id=97790
                 oSpellChecker = oGC.getSpellChecker()
             else:
-                oSpellChecker = SpellChecker("${lang}", "fr-allvars.bdic")
+                oSpellChecker = SpellChecker("${lang}", "fr-allvars.json")
         except:
             traceback.print_exc()

     def execute (self, args):
         if not args:

Index: gc_lang/fr/oxt/DictOptions/LexiconEditor.py
==================================================================
--- gc_lang/fr/oxt/DictOptions/LexiconEditor.py
+++ gc_lang/fr/oxt/DictOptions/LexiconEditor.py
@@ -406,11 +406,11 @@

     def importDictionary (self):
         spfImported = ""
         try:
             xFilePicker = self.xSvMgr.createInstanceWithContext('com.sun.star.ui.dialogs.FilePicker', self.ctx)  # other possibility: com.sun.star.ui.dialogs.SystemFilePicker
             xFilePicker.initialize([uno.getConstantByName("com.sun.star.ui.dialogs.TemplateDescription.FILEOPEN_SIMPLE")])  # seems useless
-            xFilePicker.appendFilter("Supported files", "*.json; *.bdic")
+            xFilePicker.appendFilter("Supported files", "*.json")
             xFilePicker.setDefaultName("fr.__personal__.json")  # useless, doesn’t work
             xFilePicker.setDisplayDirectory("")
             xFilePicker.setMultiSelectionMode(False)
             nResult = xFilePicker.execute()
             if nResult == 1:
@@ -461,11 +461,11 @@

     def exportDictionary (self):
         try:
             xFilePicker = self.xSvMgr.createInstanceWithContext('com.sun.star.ui.dialogs.FilePicker', self.ctx)  # other possibility: com.sun.star.ui.dialogs.SystemFilePicker
             xFilePicker.initialize([uno.getConstantByName("com.sun.star.ui.dialogs.TemplateDescription.FILESAVE_SIMPLE")])  # seems useless
-            xFilePicker.appendFilter("Supported files", "*.json; *.bdic")
+            xFilePicker.appendFilter("Supported files", "*.json")
             xFilePicker.setDefaultName("fr.__personal__.json")  # useless, doesn’t work
             xFilePicker.setDisplayDirectory("")
             xFilePicker.setMultiSelectionMode(False)
             nResult = xFilePicker.execute()
             if nResult == 1:

Index: gc_lang/fr/oxt/DictOptions/SearchWords.py
==================================================================
--- gc_lang/fr/oxt/DictOptions/SearchWords.py
+++ gc_lang/fr/oxt/DictOptions/SearchWords.py
@@ -184,11 +184,11 @@
         except:
             traceback.print_exc()

     def initSpellChecker (self):
         if not self.oSpellChecker:
-            self.oSpellChecker = sc.SpellChecker("fr", "fr-allvars.bdic", "", self.oPersonalDicJSON)
+            self.oSpellChecker = sc.SpellChecker("fr", "fr-allvars.json", "", self.oPersonalDicJSON)

     @_waitPointer
     def searchSimilar (self):
         self.initSpellChecker()
         sWord = self.xWord.Text.strip()

Index: gc_lang/fr/oxt/Graphspell.py
==================================================================
--- gc_lang/fr/oxt/Graphspell.py
+++ gc_lang/fr/oxt/Graphspell.py
@@ -67,11 +67,11 @@
             try:
                 personal_dic = json.loads(sPersonalDicJSON)
             except:
                 print("Graphspell: wrong personal_dic")
                 traceback.print_exc()
-            self.oGraphspell = SpellChecker("fr", "fr-"+sMainDicName+".bdic", "", personal_dic)
+            self.oGraphspell = SpellChecker("fr", "fr-"+sMainDicName+".json", "", personal_dic)
             self.loadHunspell()
             # print("Graphspell: init done")
         except:
             print("Graphspell: init failed")
             traceback.print_exc()

Index: gc_lang/fr/setup.py
==================================================================
--- gc_lang/fr/setup.py
+++ gc_lang/fr/setup.py
@@ -91,11 +91,11 @@
     # If there are data files included in your packages that need to be
     # installed, specify them here.  If using Python 2.6 or less, then these
     # have to be included in MANIFEST.in as well.
     package_data={
-        'grammalecte': ['graphspell/_dictionaries/*.bdic', '*.txt']
+        'grammalecte': ['graphspell/_dictionaries/*.json', '*.txt']
     },
     # Although 'package_data' is the preferred approach, in some case you may
     # need to place data files outside of your packages. See:
     # http://docs.python.org/3.4/distutils/setupscript.html#installing-additional-files  # noqa
Index: graphspell-js/dawg.js
==================================================================
--- graphspell-js/dawg.js
+++ graphspell-js/dawg.js
@@ -342,20 +342,16 @@
             }
         }
     }

     // BINARY CONVERSION
-    createBinaryJSON (nCompressionMethod) {
-        console.log("Write DAWG as an indexable binary dictionary [method: "+nCompressionMethod+"]");
-        if (nCompressionMethod == 1) {
-            this.nBytesArc = Math.floor( (this.nArcVal.toString(2).length + 2) / 8 ) + 1;     // We add 2 bits. See DawgNode.convToBytes1()
-            this.nBytesOffset = 0;
-            this._calcNumBytesNodeAddress();
-            this._calcNodesAddress1();
-        } else {
-            console.log("Error: unknown compression method");
-        }
+    createBinaryJSON (nCompressionMethod=1) {
+        console.log("Write DAWG as an indexable binary dictionary");
+        this.nBytesArc = Math.floor( (this.nArcVal.toString(2).length + 2) / 8 ) + 1;     // We add 2 bits. See DawgNode.convToBytes()
+        this.nBytesOffset = 0;
+        this._calcNumBytesNodeAddress();
+        this._calcNodesAddress();
         console.log("Arc values (chars, affixes and tags): " + this.nArcVal);
         console.log("Arc size: "+this.nBytesArc+" bytes, Address size: "+this.nBytesNodeAddress+" bytes");
         console.log("-> " + this.nBytesArc+this.nBytesNodeAddress + " * " + this.nArc + " = " + (this.nBytesArc+this.nBytesNodeAddress)*this.nArc + " bytes");
         return this._createJSON(nCompressionMethod);
     }

@@ -366,26 +362,23 @@
         while (((this.nBytesArc + this.nBytesNodeAddress) * this.nArc) > (2 ** (this.nBytesNodeAddress * 8))) {
            this.nBytesNodeAddress += 1;
         }
     }

-    _calcNodesAddress1 () {
+    _calcNodesAddress () {
         let nBytesNode = this.nBytesArc + this.nBytesNodeAddress;
         let iAddr = this.oRoot.arcs.size * nBytesNode;
         for (let oNode of this.dMinimizedNodes.values()) {
             oNode.addr = iAddr;
             iAddr += Math.max(oNode.arcs.size, 1) * nBytesNode;
         }
     }

-    _createJSON (nCompressionMethod) {
-        let sByDic = "";
-        if (nCompressionMethod == 1) {
-            sByDic = this.oRoot.convToBytes1(this.nBytesArc, this.nBytesNodeAddress);
-            for (let oNode of this.dMinimizedNodes.values()) {
-                sByDic += oNode.convToBytes1(this.nBytesArc, this.nBytesNodeAddress);
-            }
+    _createJSON (nCompressionMethod=1) {
+        let sByDic = this.oRoot.convToBytes(this.nBytesArc, this.nBytesNodeAddress);
+        for (let oNode of this.dMinimizedNodes.values()) {
+            sByDic += oNode.convToBytes(this.nBytesArc, this.nBytesNodeAddress);
         }
         let oJSON = {
             "sHeader": "/grammalecte-fsa/",
             "sLangCode": this.sLangCode,
             "sLangName": this.sLangName,
@@ -494,11 +487,11 @@
         }
     }

     // VERSION 1 =====================================================================================================
-    convToBytes1 (nBytesArc, nBytesNodeAddress) {
+    convToBytes (nBytesArc, nBytesNodeAddress) {
         /*
             Node scheme:
             - Arc length is defined by nBytesArc
             - Address length is defined by nBytesNodeAddress

Index: graphspell/dawg.py
==================================================================
--- graphspell/dawg.py
+++ graphspell/dawg.py
@@ -356,30 +356,16 @@
             if not zPattern or zPattern.search(self.lArcVal[nMorphVal]):
                 yield sEntry + "\t" + self.lArcVal[nMorphVal]

     # BINARY CONVERSION
-    def _calculateBinary (self, nCompressionMethod):
-        print(" > Write DAWG as an indexable binary dictionary [method: %d]" % nCompressionMethod)
-        if nCompressionMethod == 1:
-            self.nBytesArc = ( (self.nArcVal.bit_length() + 2) // 8 ) + 1     # We add 2 bits. See DawgNode.convToBytes1()
-            self.nBytesOffset = 0
-            self._calcNumBytesNodeAddress()
-            self._calcNodesAddress1()
-        elif nCompressionMethod == 2:
-            self.nBytesArc = ( (self.nArcVal.bit_length() + 3) // 8 ) + 1     # We add 3 bits. See DawgNode.convToBytes2()
-            self.nBytesOffset = 0
-            self._calcNumBytesNodeAddress()
-            self._calcNodesAddress2()
-        elif nCompressionMethod == 3:
-            self.nBytesArc = ( (self.nArcVal.bit_length() + 3) // 8 ) + 1     # We add 3 bits. See DawgNode.convToBytes3()
-            self.nBytesOffset = 1
-            self.nMaxOffset = (2 ** (self.nBytesOffset * 8)) - 1
-            self._calcNumBytesNodeAddress()
-            self._calcNodesAddress3()
-        else:
-            print(" # Error: unknown compression method")
+    def _calculateBinary (self, nCompressionMethod=1):
+        print(" > Write DAWG as an indexable binary dictionary")
+        self.nBytesArc = ( (self.nArcVal.bit_length() + 2) // 8 ) + 1     # We add 2 bits. See DawgNode.convToBytes()
+        self.nBytesOffset = 0
+        self._calcNumBytesNodeAddress()
+        self._calcNodesAddress()
         print("   Arc values (chars, affixes and tags): {}  ->  {} bytes".format( self.nArcVal, len("\t".join(self.lArcVal).encode("utf-8")) ))
         print("   Arc size: {} bytes, Address size: {} bytes  ->  {} * {} = {} bytes".format( self.nBytesArc, self.nBytesNodeAddress, \
                                                                                               self.nBytesArc+self.nBytesNodeAddress, self.nArc, \
                                                                                               (self.nBytesArc+self.nBytesNodeAddress)*self.nArc ))

@@ -387,70 +373,24 @@
         "how many bytes needed to store all nodes/arcs in the binary dictionary"
         self.nBytesNodeAddress = 1
         while ((self.nBytesArc + self.nBytesNodeAddress) * self.nArc) > (2 ** (self.nBytesNodeAddress * 8)):
             self.nBytesNodeAddress += 1

-    def _calcNodesAddress1 (self):
+    def _calcNodesAddress (self):
         nBytesNode = self.nBytesArc + self.nBytesNodeAddress
         iAddr = len(self.oRoot.arcs) * nBytesNode
         for oNode in self.lMinimizedNodes:
             oNode.addr = iAddr
             iAddr += max(len(oNode.arcs), 1) * nBytesNode

-    def _calcNodesAddress2 (self):
-        nBytesNode = self.nBytesArc + self.nBytesNodeAddress
-        iAddr = len(self.oRoot.arcs) * nBytesNode
-        for oNode in self.lSortedNodes:
-            oNode.addr = iAddr
-            iAddr += max(len(oNode.arcs), 1) * nBytesNode
-            for oNextNode in oNode.arcs.values():
-                if (oNode.pos + 1) == oNextNode.pos:
-                    iAddr -= self.nBytesNodeAddress
-                    #break
-
-    def _calcNodesAddress3 (self):
-        nBytesNode = self.nBytesArc + self.nBytesNodeAddress
-        # theorical nodes size if only addresses and no offset
-        self.oRoot.size = len(self.oRoot.arcs) * nBytesNode
-        for oNode in self.lSortedNodes:
-            oNode.size = max(len(oNode.arcs), 1) * nBytesNode
-        # rewind and calculate dropdown from the end, several times
-        nDiff = self.nBytesNodeAddress - self.nBytesOffset
-        bEnd = False
-        while not bEnd:
-            bEnd = True
-            # recalculate addresses
-            iAddr = self.oRoot.size
-            for oNode in self.lSortedNodes:
-                oNode.addr = iAddr
-                iAddr += oNode.size
-            # rewind and calculate dropdown from the end, several times
-            for i in range(self.nNode-1, -1, -1):
-                nSize = max(len(self.lSortedNodes[i].arcs), 1) * nBytesNode
-                for oNextNode in self.lSortedNodes[i].arcs.values():
-                    if 1 < (oNextNode.addr - self.lSortedNodes[i].addr) < self.nMaxOffset:
-                        nSize -= nDiff
-                if self.lSortedNodes[i].size != nSize:
-                    self.lSortedNodes[i].size = nSize
-                    bEnd = False

     def getBinaryAsJSON (self, nCompressionMethod=1, bBinaryDictAsHexString=True):
         "return a JSON string containing all necessary data of the dictionary (compressed as a binary string)"
         self._calculateBinary(nCompressionMethod)
         byDic = b""
-        if nCompressionMethod == 1:
-            byDic = self.oRoot.convToBytes1(self.nBytesArc, self.nBytesNodeAddress)
-            for oNode in self.lMinimizedNodes:
-                byDic += oNode.convToBytes1(self.nBytesArc, self.nBytesNodeAddress)
-        elif nCompressionMethod == 2:
-            byDic = self.oRoot.convToBytes2(self.nBytesArc, self.nBytesNodeAddress)
-            for oNode in self.lSortedNodes:
-                byDic += oNode.convToBytes2(self.nBytesArc, self.nBytesNodeAddress)
-        elif nCompressionMethod == 3:
-            byDic = self.oRoot.convToBytes3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset)
-            for oNode in self.lSortedNodes:
-                byDic += oNode.convToBytes3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset)
+        byDic = self.oRoot.convToBytes(self.nBytesArc, self.nBytesNodeAddress)
+        for oNode in self.lMinimizedNodes:
+            byDic += oNode.convToBytes(self.nBytesArc, self.nBytesNodeAddress)
         return {
             "sHeader": "/grammalecte-fsa/",
             "sLangCode": self.sLangCode,
             "sLangName": self.sLangName,
             "sDicName": self.sDicName,
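
Note: only the method-1 sizing arithmetic survives. An arc cell holds the largest arc value plus two flag bits, rounded up to whole bytes, and the node-address width grows until the whole graph is addressable. A standalone sketch of that computation, with made-up dictionary counts:

    # Mirrors _calculateBinary() / _calcNumBytesNodeAddress(); the sample numbers are invented.
    nArcVal = 150000        # distinct arc values (chars + affixes + tags)
    nArc = 1200000          # total arcs in the DAWG

    nBytesArc = ((nArcVal.bit_length() + 2) // 8) + 1   # +2 bits: "final node" and "last arc" flags
    nBytesNodeAddress = 1
    while ((nBytesArc + nBytesNodeAddress) * nArc) > (2 ** (nBytesNodeAddress * 8)):
        nBytesNodeAddress += 1

    print(nBytesArc, nBytesNodeAddress)   # -> 3 3: 6-byte cells, a ~7.2 MB graph fits in 2**24 bytes
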
@@ -476,11 +416,11 @@
             # https://github.com/mozilla/addons-linter/issues/1361
             "sByDic": byDic.hex() if bBinaryDictAsHexString else [ e for e in byDic ],
             "l2grams": list(self.a2grams)
         }

-    def writeAsJSObject (self, spfDst, nCompressionMethod, bInJSModule=False, bBinaryDictAsHexString=True):
+    def writeAsJSObject (self, spfDst, nCompressionMethod=1, bInJSModule=False, bBinaryDictAsHexString=True):
         "write a file (JSON or JS module) with all the necessary data"
         if not spfDst.endswith(".json"):
             spfDst += "."+str(nCompressionMethod)+".json"
         with open(spfDst, "w", encoding="utf-8", newline="\n") as hDst:
             if bInJSModule:
@@ -487,106 +427,21 @@
                 hDst.write('// JavaScript\n// Generated data (do not edit)\n\n"use strict";\n\nconst dictionary = ')
             hDst.write( json.dumps(self.getBinaryAsJSON(nCompressionMethod, bBinaryDictAsHexString), ensure_ascii=False) )
             if bInJSModule:
                 hDst.write(";\n\nexports.dictionary = dictionary;\n")

-    def writeBinary (self, sPathFile, nCompressionMethod, bDebug=False):
-        """
-        Save as a binary file.
-
-        Format of the binary indexable dictionary:
-        Each section is separated with 4 bytes of \0
-
-        - Section Header:
-            /grammalecte-fsa/[compression method]
-                * compression method is an ASCII string
-
-        - Section Informations:
-            /[lang code]
-            /[lang name]
-            /[dictionary name]
-            /[date creation]
-            /[number of chars]
-            /[number of bytes for each arc]
-            /[number of bytes for each address node]
-            /[number of entries]
-            /[number of nodes]
-            /[number of arcs]
-            /[number of affixes]
-                * each field is a ASCII string
-            /[stemming code]
-                * "S" means stems are generated by /suffix_code/,
-                  "A" means they are generated by /affix_code/
-                  See defineSuffixCode() and defineAffixCode() for details.
-                  "N" means no stemming
-
-        - Section Values:
-                * a list of strings encoded in binary from utf-8, each value separated with a tabulation
-
-        - Section Word Graph (nodes / arcs)
-                * A list of nodes which are a list of arcs with an address of the next node.
-                  See DawgNode.convToBytes() for details.
-
-        - Section 2grams:
-                * A list of 2grams (as strings: 2 chars) encoded in binary from utf-8, each value separated with a tabulation
-        """
-        self._calculateBinary(nCompressionMethod)
-        if not sPathFile.endswith(".bdic"):
-            sPathFile += "."+str(nCompressionMethod)+".bdic"
-        with open(sPathFile, 'wb') as hDst:
-            # header
-            hDst.write("/grammalecte-fsa/{}/".format(nCompressionMethod).encode("utf-8"))
-            hDst.write(b"\0\0\0\0")
-            # infos
-            sInfo = "{}//{}//{}//{}//{}//{}//{}//{}//{}//{}//{}//{}//{}".format(self.sLangCode, self.sLangName, self.sDicName, self.sDescription, self._getDate(), \
-                                                                                self.nChar, self.nBytesArc, self.nBytesNodeAddress, \
-                                                                                self.nEntry, self.nNode, self.nArc, self.nAff, self.cStemming)
-            hDst.write(sInfo.encode("utf-8"))
-            hDst.write(b"\0\0\0\0")
-            # lArcVal
-            hDst.write("\t".join(self.lArcVal).encode("utf-8"))
-            hDst.write(b"\0\0\0\0")
-            # 2grams
-            hDst.write("\t".join(self.a2grams).encode("utf-8"))
-            hDst.write(b"\0\0\0\0")
-            # DAWG: nodes / arcs
-            if nCompressionMethod == 1:
-                hDst.write(self.oRoot.convToBytes1(self.nBytesArc, self.nBytesNodeAddress))
-                for oNode in self.lMinimizedNodes:
-                    hDst.write(oNode.convToBytes1(self.nBytesArc, self.nBytesNodeAddress))
-            elif nCompressionMethod == 2:
-                hDst.write(self.oRoot.convToBytes2(self.nBytesArc, self.nBytesNodeAddress))
-                for oNode in self.lSortedNodes:
-                    hDst.write(oNode.convToBytes2(self.nBytesArc, self.nBytesNodeAddress))
-            elif nCompressionMethod == 3:
-                hDst.write(self.oRoot.convToBytes3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset))
-                for oNode in self.lSortedNodes:
-                    hDst.write(oNode.convToBytes3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset))
-        if bDebug:
-            self._writeNodes(sPathFile, nCompressionMethod)

     def _getDate (self):
         return time.strftime("%Y-%m-%d %H:%M:%S")

-    def _writeNodes (self, sPathFile, nCompressionMethod):
+    def _writeNodes (self, sPathFile, nCompressionMethod=1):
         "for debugging only"
         print(" > Write nodes")
         with open(sPathFile+".nodes."+str(nCompressionMethod)+".txt", 'w', encoding='utf-8', newline="\n") as hDst:
-            if nCompressionMethod == 1:
-                hDst.write(self.oRoot.getTxtRepr1(self.nBytesArc, self.lArcVal)+"\n")
-                #hDst.write( ''.join( [ "%02X " % z  for z in self.oRoot.convToBytes1(self.nBytesArc, self.nBytesNodeAddress) ] ).strip() )
-                for oNode in self.lMinimizedNodes:
-                    hDst.write(oNode.getTxtRepr1(self.nBytesArc, self.lArcVal)+"\n")
-            if nCompressionMethod == 2:
-                hDst.write(self.oRoot.getTxtRepr2(self.nBytesArc, self.lArcVal)+"\n")
-                for oNode in self.lSortedNodes:
-                    hDst.write(oNode.getTxtRepr2(self.nBytesArc, self.lArcVal)+"\n")
-            if nCompressionMethod == 3:
-                hDst.write(self.oRoot.getTxtRepr3(self.nBytesArc, self.nBytesOffset, self.lArcVal)+"\n")
-                #hDst.write( ''.join( [ "%02X " % z  for z in self.oRoot.convToBytes3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset) ] ).strip() )
-                for oNode in self.lSortedNodes:
-                    hDst.write(oNode.getTxtRepr3(self.nBytesArc, self.nBytesOffset, self.lArcVal)+"\n")
+            hDst.write(self.oRoot.getTxtRepr(self.nBytesArc, self.lArcVal)+"\n")
+            #hDst.write( ''.join( [ "%02X " % z  for z in self.oRoot.convToBytes(self.nBytesArc, self.nBytesNodeAddress) ] ).strip() )
+            for oNode in self.lMinimizedNodes:
+                hDst.write(oNode.getTxtRepr(self.nBytesArc, self.lArcVal)+"\n")


class DawgNode:
    """Node of the word graph"""
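
Note: the JSON container is self-describing. getBinaryAsJSON() puts the header fields, the arc values and the word graph into one object, and sByDic carries the graph as a hex string because Mozilla's JS tooling chokes on files bigger than 4 MB when the bytes are emitted as an array (see the addons-linter issue linked above). Reading the payload back needs only the standard library; a sketch (the file name is an example):

    import json
    import binascii

    with open("fr-allvars.json", "r", encoding="utf-8") as hSrc:
        oData = json.load(hSrc)

    byDic = binascii.unhexlify(oData["sByDic"])    # hex string -> binary word graph
    print(oData["sLangName"], oData["nEntry"], len(byDic))
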
@@ -646,11 +501,11 @@

     def sortArcs2 (self, dValOccur, lArcVal):
         "sort arcs of each node depending on the previous char"
         self.arcs = collections.OrderedDict(sorted(self.arcs.items(), key=lambda t: dValOccur.get(lArcVal[t[0]], 0), reverse=True))

     # VERSION 1 =====================================================================================================
-    def convToBytes1 (self, nBytesArc, nBytesNodeAddress):
+    def convToBytes (self, nBytesArc, nBytesNodeAddress):
         """
         Convert to bytes (method 1).

         Node scheme:
         - Arc length is defined by nBytesArc
@@ -688,11 +543,11 @@
                 val = val | nFinalArcMask
             by += val.to_bytes(nBytesArc, byteorder='big')
             by += self.arcs[arc].addr.to_bytes(nBytesNodeAddress, byteorder='big')
         return by

-    def getTxtRepr1 (self, nBytesArc, lVal):
+    def getTxtRepr (self, nBytesArc, lVal):
         "return representation as string of node (method 1)"
         nArc = len(self.arcs)
         nFinalNodeMask = 1 << ((nBytesArc*8)-1)
         nFinalArcMask = 1 << ((nBytesArc*8)-2)
         s = "i{:_>10} -- #{:_>10}\n".format(self.i, self.addr)
@@ -705,161 +560,10 @@
                 val = val | nFinalNodeMask
             if i == nArc:
                 val = val | nFinalArcMask
             s += "  {:<20}  {:0>16}  i{:_>10}   #{:_>10}\n".format(lVal[arc], bin(val)[2:], self.arcs[arc].i, self.arcs[arc].addr)
         return s
-
-    # VERSION 2 =====================================================================================================
-    def convToBytes2 (self, nBytesArc, nBytesNodeAddress):
-        """
-        Convert to bytes (method 2).
-
-        Node scheme:
-        - Arc length is defined by nBytesArc
-        - Address length is defined by nBytesNodeAddress
-
-        | Arc              |            Address of next node            |
-        |                  |                                            |
-        ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓
-        ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃
-        ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛
-        [...]
-        ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓
-        ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃
-        ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛
-         ^  ^  ^
-         ┃  ┃  ┃
-         ┃  ┃  ┗━━ if 1, caution, no address: next node is the following node
-         ┃  ┗━━━━ if 1, last arc of this node
-         ┗━━━━━━ if 1, this node is final (only on the first arc)
-        """
-        nArc = len(self.arcs)
-        nFinalNodeMask = 1 << ((nBytesArc*8)-1)
-        nFinalArcMask = 1 << ((nBytesArc*8)-2)
-        nNextNodeMask = 1 << ((nBytesArc*8)-3)
-        if not nArc:
-            val = nFinalNodeMask | nFinalArcMask
-            by = val.to_bytes(nBytesArc, byteorder='big')
-            by += (0).to_bytes(nBytesNodeAddress, byteorder='big')
-            return by
-        by = b""
-        for i, arc in enumerate(self.arcs, 1):
-            val = arc
-            if i == 1 and self.final:
-                val = val | nFinalNodeMask
-            if i == nArc:
-                val = val | nFinalArcMask
-            if (self.pos + 1) == self.arcs[arc].pos and self.i != 0:
-                val = val | nNextNodeMask
-                by += val.to_bytes(nBytesArc, byteorder='big')
-            else:
-                by += val.to_bytes(nBytesArc, byteorder='big')
-                by += self.arcs[arc].addr.to_bytes(nBytesNodeAddress, byteorder='big')
-        return by
-
-    def getTxtRepr2 (self, nBytesArc, lVal):
-        "return representation as string of node (method 2)"
-        nArc = len(self.arcs)
-        nFinalNodeMask = 1 << ((nBytesArc*8)-1)
-        nFinalArcMask = 1 << ((nBytesArc*8)-2)
-        nNextNodeMask = 1 << ((nBytesArc*8)-3)
-        s = "i{:_>10} -- #{:_>10}\n".format(self.i, self.addr)
-        if not nArc:
-            s += "  {:<20}  {:0>16}  i{:_>10}   #{:_>10}\n".format("", bin(nFinalNodeMask | nFinalArcMask)[2:], "0", "0")
-            return s
-        for i, arc in enumerate(self.arcs, 1):
-            val = arc
-            if i == 1 and self.final:
-                val = val | nFinalNodeMask
-            if i == nArc:
-                val = val | nFinalArcMask
-            if (self.pos + 1) == self.arcs[arc].pos and self.i != 0:
-                val = val | nNextNodeMask
-                s += "  {:<20}  {:0>16}\n".format(lVal[arc], bin(val)[2:])
-            else:
-                s += "  {:<20}  {:0>16}  i{:_>10}   #{:_>10}\n".format(lVal[arc], bin(val)[2:], self.arcs[arc].i, self.arcs[arc].addr)
-        return s
-
-    # VERSION 3 =====================================================================================================
-    def convToBytes3 (self, nBytesArc, nBytesNodeAddress, nBytesOffset):
-        """
-        Convert to bytes (method 3).
-
-        Node scheme:
-        - Arc length is defined by nBytesArc
-        - Address length is defined by nBytesNodeAddress
-        - Offset length is defined by nBytesOffset
-
-        | Arc              |  Address of next node or offset to next node  |
-        |                  |                                                |
-        ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓
-        ┃1┃0┃0┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃
-        ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛
-        [...]
-        ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓
-        ┃0┃0┃1┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃    Offsets are shorter than addresses
-        ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛
-        ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓
-        ┃0┃1┃0┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃
-        ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛
-
-         ^  ^  ^
-         ┃  ┃  ┃
-         ┃  ┃  ┗━━ if 1, offset instead of address of next node
-         ┃  ┗━━━━ if 1, last arc of this node
-         ┗━━━━━━ if 1, this node is final (only on the first arc)
-        """
-        nArc = len(self.arcs)
-        nFinalNodeMask = 1 << ((nBytesArc*8)-1)
-        nFinalArcMask = 1 << ((nBytesArc*8)-2)
-        nNextNodeMask = 1 << ((nBytesArc*8)-3)
-        nMaxOffset = (2 ** (nBytesOffset * 8)) - 1
-        if not nArc:
-            val = nFinalNodeMask | nFinalArcMask
-            by = val.to_bytes(nBytesArc, byteorder='big')
-            by += (0).to_bytes(nBytesNodeAddress, byteorder='big')
-            return by
-        by = b""
-        for i, arc in enumerate(self.arcs, 1):
-            val = arc
-            if i == 1 and self.final:
-                val = val | nFinalNodeMask
-            if i == nArc:
-                val = val | nFinalArcMask
-            if 1 < (self.arcs[arc].addr - self.addr) < nMaxOffset and self.i != 0:
-                val = val | nNextNodeMask
-                by += val.to_bytes(nBytesArc, byteorder='big')
-                by += (self.arcs[arc].addr-self.addr).to_bytes(nBytesOffset, byteorder='big')
-            else:
-                by += val.to_bytes(nBytesArc, byteorder='big')
-                by += self.arcs[arc].addr.to_bytes(nBytesNodeAddress, byteorder='big')
-        return by
-
-    def getTxtRepr3 (self, nBytesArc, nBytesOffset, lVal):
-        "return representation as string of node (method 3)"
-        nArc = len(self.arcs)
-        nFinalNodeMask = 1 << ((nBytesArc*8)-1)
-        nFinalArcMask = 1 << ((nBytesArc*8)-2)
-        nNextNodeMask = 1 << ((nBytesArc*8)-3)
-        nMaxOffset = (2 ** (nBytesOffset * 8)) - 1
-        s = "i{:_>10} -- #{:_>10} ({})\n".format(self.i, self.addr, self.size)
-        if not nArc:
-            s += "  {:<20}  {:0>16}  i{:_>10}   #{:_>10}\n".format("", bin(nFinalNodeMask | nFinalArcMask)[2:], "0", "0")
-            return s
-        for i, arc in enumerate(self.arcs, 1):
-            val = arc
-            if i == 1 and self.final:
-                val = val | nFinalNodeMask
-            if i == nArc:
-                val = val | nFinalArcMask
-            if 1 < (self.arcs[arc].addr - self.addr) < nMaxOffset and self.i != 0:
-                val = val | nNextNodeMask
-                s += "  {:<20}  {:0>16}  i{:_>10}   +{:_>10}\n".format(lVal[arc], bin(val)[2:], self.arcs[arc].i, self.arcs[arc].addr - self.addr)
-            else:
-                s += "  {:<20}  {:0>16}  i{:_>10}   #{:_>10}\n".format(lVal[arc], bin(val)[2:], self.arcs[arc].i, self.arcs[arc].addr)
-        return s
-

 # Another attempt to sort node arcs

 _dCharOrder = {

Index: graphspell/ibdawg.py
==================================================================
--- graphspell/ibdawg.py
+++ graphspell/ibdawg.py
@@ -112,24 +112,24 @@

class IBDAWG:
    """INDEXABLE BINARY DIRECT ACYCLIC WORD GRAPH"""

    def __init__ (self, source):
        if isinstance(source, str):
-            self.by = pkgutil.get_data(__package__, "_dictionaries/" + source)
-            if not self.by:
+            by = pkgutil.get_data(__package__, "_dictionaries/" + source)
+            if not by:
                raise OSError("# Error. File not found or not loadable: "+source)
-
-            if source.endswith(".bdic"):
-                self._initBinary()
-            elif source.endswith(".json"):
-                self._initJSON(json.loads(self.by.decode("utf-8")))     #json.loads(self.by)  # In Python 3.6, can read directly binary strings
-            else:
-                raise OSError("# Error. Unknown file type: "+source)
-        else:
-            self._initJSON(source)
-
-        self.sFileName = source  if isinstance(source, str)  else "[None]"
+            self.sFileName = source
+            oData = json.loads(by.decode("utf-8"))      #json.loads(by)  # In Python 3.6, can read directly binary strings
+        else:
+            self.sFileName = "[None]"
+            oData = source
+
+        self.sByDic = ""    # init to prevent pylint whining
+        self.__dict__.update(oData)
+        self.byDic = binascii.unhexlify(self.sByDic)
+        self.dCharVal = { v: k  for k, v in self.dChar.items() }
+        self.a2grams = set(getattr(self, 'l2grams'))  if hasattr(self, 'l2grams')  else None

        # Performance trick:
        #     Instead of converting bytes to integers each times we parse the binary dictionary,
        #     we do it once, then parse the array
        nAcc = 0
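
Note: what remains is the single node layout. Each arc is the arc value with a "final node" bit and a "last arc" bit packed into nBytesArc bytes, immediately followed by the next node's address in nBytesNodeAddress bytes. A decoding sketch consistent with convToBytes()/getTxtRepr() above (the two sizes are sample values; a real reader takes them from the dictionary header):

    NBYTES_ARC = 3
    NBYTES_NODE_ADDRESS = 3
    FINAL_NODE_MASK = 1 << ((NBYTES_ARC * 8) - 1)   # set on the first arc if the node is final
    FINAL_ARC_MASK = 1 << ((NBYTES_ARC * 8) - 2)    # set on the last arc of a node

    def readArc (byDic, iAddr):
        "return (arc value, final-node flag, last-arc flag, address of the next node)"
        val = int.from_bytes(byDic[iAddr:iAddr+NBYTES_ARC], byteorder='big')
        nNextAddr = int.from_bytes(byDic[iAddr+NBYTES_ARC:iAddr+NBYTES_ARC+NBYTES_NODE_ADDRESS], byteorder='big')
        return (val & ~(FINAL_NODE_MASK | FINAL_ARC_MASK),
                bool(val & FINAL_NODE_MASK),
                bool(val & FINAL_ARC_MASK),
                nNextAddr)
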
@@ -168,101 +168,18 @@
        try:
            self.lexicographer = importlib.import_module(".lexgraph_"+self.sLangCode, "grammalecte.graphspell")
        except ImportError:
            print("# No module <lexgraph_"+self.sLangCode+">")

-    def _initBinary (self):
-        "initialize with binary structure file"
-        if self.by[0:17] != b"/grammalecte-fsa/":
-            raise TypeError("# Error. Not a grammalecte-fsa binary dictionary. Header: {}".format(self.by[0:9]))
-        if not(self.by[17:18] == b"1" or self.by[17:18] == b"2" or self.by[17:18] == b"3"):
-            raise ValueError("# Error. Unknown dictionary version: {}".format(self.by[17:18]))
-        try:
-            byHeader, byInfo, byValues, by2grams, byDic = self.by.split(b"\0\0\0\0", 4)
-        except Exception:
-            raise Exception
-
-        self.nCompressionMethod = int(self.by[17:18].decode("utf-8"))
-        self.sHeader = byHeader.decode("utf-8")
-        self.lArcVal = byValues.decode("utf-8").split("\t")
-        self.nArcVal = len(self.lArcVal)
-        self.byDic = byDic
-        self.a2grams = set(by2grams.decode("utf-8").split("\t"))
-
-        l = byInfo.decode("utf-8").split("//")
-        self.sLangCode = l.pop(0)
-        self.sLangName = l.pop(0)
-        self.sDicName = l.pop(0)
-        self.sDescription = l.pop(0)
-        self.sDate = l.pop(0)
-        self.nChar = int(l.pop(0))
-        self.nBytesArc = int(l.pop(0))
-        self.nBytesNodeAddress = int(l.pop(0))
-        self.nEntry = int(l.pop(0))
-        self.nNode = int(l.pop(0))
-        self.nArc = int(l.pop(0))
-        self.nAff = int(l.pop(0))
-        self.cStemming = l.pop(0)
-        self.nTag = self.nArcVal - self.nChar - self.nAff
-        # <dChar> to get the value of an arc, <dCharVal> to get the char of an arc with its value
-        self.dChar = {}
-        for i in range(1, self.nChar+1):
-            self.dChar[self.lArcVal[i]] = i
-        self.dCharVal = { v: k  for k, v in self.dChar.items() }
-
-    def _initJSON (self, oJSON):
-        "initialize with a JSON text file"
-        self.sByDic = ""    # init to prevent pylint whining
-        self.__dict__.update(oJSON)
-        self.byDic = binascii.unhexlify(self.sByDic)
-        self.dCharVal = { v: k  for k, v in self.dChar.items() }
-        self.a2grams = set(getattr(self, 'l2grams'))  if hasattr(self, 'l2grams')  else None

    def getInfo (self):
        "return string about the IBDAWG"
        return  "  Language: {0.sLangName}   Lang code: {0.sLangCode}   Dictionary name: {0.sDicName}" \
                "  Compression method: {0.nCompressionMethod:>2}   Date: {0.sDate}   Stemming: {0.cStemming}FX\n" \
                "  Arcs values:  {0.nArcVal:>10,} = {0.nChar:>5,} characters,  {0.nAff:>6,} affixes, {0.nTag:>6,} tags\n" \
                "  Dictionary: {0.nEntry:>12,} entries,    {0.nNode:>11,} nodes,   {0.nArc:>11,} arcs\n" \
                "  Address size: {0.nBytesNodeAddress:>1} bytes,  Arc size: {0.nBytesArc:>1} bytes\n".format(self)

-    def writeAsJSObject (self, spfDest, bInJSModule=False, bBinaryDictAsHexString=False):
-        "write IBDAWG as a JavaScript object in a JavaScript module"
-        with open(spfDest, "w", encoding="utf-8", newline="\n") as hDst:
-            if bInJSModule:
-                hDst.write('// JavaScript\n// Generated data (do not edit)\n\n"use strict";\n\nconst dictionary = ')
-            hDst.write(json.dumps({
-                "sHeader": "/grammalecte-fsa/",
-                "sLangCode": self.sLangCode,
-                "sLangName": self.sLangName,
-                "sDicName": self.sDicName,
-                "sDescription": self.sDescription,
-                "sFileName": self.sFileName,
-                "sDate": self.sDate,
-                "nEntry": self.nEntry,
-                "nChar": self.nChar,
-                "nAff": self.nAff,
-                "nTag": self.nTag,
-                "cStemming": self.cStemming,
-                "dChar": self.dChar,
-                "nNode": self.nNode,
-                "nArc": self.nArc,
-                "nArcVal": self.nArcVal,
-                "lArcVal": self.lArcVal,
-                "nCompressionMethod": self.nCompressionMethod,
-                "nBytesArc": self.nBytesArc,
-                "nBytesNodeAddress": self.nBytesNodeAddress,
-                # JavaScript is a pile of shit, so Mozilla’s JS parser don’t like file bigger than 4 Mb!
-                # So, if necessary, we use an hexadecimal string, that we will convert later in Firefox’s extension.
-                # https://github.com/mozilla/addons-linter/issues/1361
-                "sByDic": self.byDic.hex()  if bBinaryDictAsHexString  else [ e  for e in self.byDic ],
-                "l2grams": list(self.a2grams)
-            }, ensure_ascii=False))
-            if bInJSModule:
-                hDst.write(";\n\nexports.dictionary = dictionary;\n")

    def isValidToken (self, sToken):
        "checks if <sToken> is valid (if there is hyphens in <sToken>, <sToken> is split, each part is checked)"
        sToken = st.spellingNormalization(sToken)
        if self.isValid(sToken):
            return True

Index: graphspell/spellchecker.py
==================================================================
--- graphspell/spellchecker.py
+++ graphspell/spellchecker.py
@@ -14,12 +14,12 @@
 from . import ibdawg
 from . import tokenizer


 dDefaultDictionaries = {
-    "fr": "fr-allvars.bdic",
-    "en": "en.bdic"
+    "fr": "fr-allvars.json",
+    "en": "en.json"
 }


 class SpellChecker ():
     "SpellChecker: wrapper for the IBDAWG class"

Index: lex_build.py
==================================================================
--- lex_build.py
+++ lex_build.py
@@ -9,20 +9,18 @@

 import graphspell.dawg as fsa
 from graphspell.ibdawg import IBDAWG


-def build (spfSrc, sLangCode, sLangName, sfDict, bJSON=False, sDicName="", sDescription="", sFilter="", cStemmingMethod="S", nCompressMethod=1):
+def build (spfSrc, sLangCode, sLangName, sfDict, bJavaScript=False, sDicName="", sDescription="", sFilter="", cStemmingMethod="S", nCompressMethod=1):
     "transform a text lexicon as a binary indexable dictionary"
     oDAWG = fsa.DAWG(spfSrc, cStemmingMethod, sLangCode, sLangName, sDicName, sDescription, sFilter)
     dir_util.mkpath("graphspell/_dictionaries")
-    oDAWG.writeInfo("graphspell/_dictionaries/" + sfDict + ".info.txt")
-    oDAWG.writeBinary("graphspell/_dictionaries/" + sfDict + ".bdic", int(nCompressMethod))
-    if bJSON:
+    oDAWG.writeAsJSObject("graphspell/_dictionaries/" + sfDict + ".json")
+    if bJavaScript:
         dir_util.mkpath("graphspell-js/_dictionaries")
-        oDic = IBDAWG(sfDict + ".bdic")
-        oDic.writeAsJSObject("graphspell-js/_dictionaries/" + sfDict + ".json", bBinaryDictAsHexString=True)
+        oDAWG.writeAsJSObject("graphspell-js/_dictionaries/" + sfDict + ".json")


 def main ():
     "parse args from CLI"
     xParser = argparse.ArgumentParser()

Index: make.py
==================================================================
--- make.py
+++ make.py
@@ -315,22 +315,22 @@
     if bCommunityDict:
         lDict.append(("community", dVars['dic_community_filename']))
     if bPersonalDict:
         lDict.append(("personal", dVars['dic_personal_filename']))
     for sType, sFileName in lDict:
-        spfPyDic = f"graphspell/_dictionaries/{sFileName}.bdic"
+        spfPyDic = f"graphspell/_dictionaries/{sFileName}.json"
         spfJSDic = f"graphspell-js/_dictionaries/{sFileName}.json"
         if not os.path.isfile(spfPyDic) or (bJavaScript and not os.path.isfile(spfJSDic)):
             buildDictionary(dVars, sType, bJavaScript)
         print("  +", spfPyDic)
         file_util.copy_file(spfPyDic, "grammalecte/graphspell/_dictionaries")
-        dVars['dic_'+sType+'_filename_py'] = sFileName + '.bdic'
+        dVars['dic_'+sType+'_filename_py'] = sFileName + '.json'
         if bJavaScript:
             print("  +", spfJSDic)
             file_util.copy_file(spfJSDic, "grammalecte-js/graphspell/_dictionaries")
             dVars['dic_'+sType+'_filename_js'] = sFileName + '.json'
-    dVars['dic_main_filename_py'] = dVars['dic_default_filename_py'] + ".bdic"
+    dVars['dic_main_filename_py'] = dVars['dic_default_filename_py'] + ".json"
     dVars['dic_main_filename_js'] = dVars['dic_default_filename_js'] + ".json"


 def buildDictionary (dVars, sType, bJavaScript=False):
     "build binary dictionary for Graphspell from lexicons"
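
Note: lex_build.build() now writes the same JSON container for the Python side and the JavaScript side, instead of producing a .bdic and re-exporting it through IBDAWG. A usage sketch (the lexicon path and the dictionary name are made up):

    import lex_build

    lex_build.build("lexicons/French.lex", "fr", "French", "fr-allvars",
                    bJavaScript=True, sDicName="classic")
    # -> graphspell/_dictionaries/fr-allvars.json
    # -> graphspell-js/_dictionaries/fr-allvars.json
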
Index: reader.py
==================================================================
--- reader.py
+++ reader.py
@@ -5,11 +5,11 @@

 import sys
 import re

 import graphspell.ibdawg as ibdawg

-oDict = ibdawg.IBDAWG("French.bdic")
+oDict = ibdawg.IBDAWG("fr-allvars.json")


 def readFile (spf):
     if os.path.isfile(spf):
         with open(spf, "r", encoding="utf-8") as hSrc:
@@ -24,11 +24,11 @@
     with open(spf+".res.txt", "w", encoding="utf-8") as hDst:
         for sLine in readFile(spfSrc):
             sLine = sLine.strip()
             if sLine:
                 for sWord in sLine.split():
-                    if not oDict.isValid(sWord):
+                    if not oDict.isValid(sWord):
                         hDst.write(sWord+"\n")

# --------------------------------------------------------------------------------------------------

 def createLexStatFile (spf, dStat):
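
Note: end to end, a dictionary built this way loads through the slimmed-down IBDAWG constructor, which now accepts only a JSON file name or an already-parsed dict (a .bdic name would simply fail in json.loads). A final sanity-check sketch; the import path is assumed from the packaged grammalecte tree, and reader.py above relies on the same isValid() call:

    from grammalecte.graphspell.ibdawg import IBDAWG

    oDict = IBDAWG("fr-allvars.json")
    print(oDict.getInfo())
    print(oDict.isValid("être"))
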