Index: graphspell/ibdawg.py ================================================================== --- graphspell/ibdawg.py +++ graphspell/ibdawg.py @@ -112,24 +112,24 @@ class IBDAWG: """INDEXABLE BINARY DIRECT ACYCLIC WORD GRAPH""" def __init__ (self, source): if isinstance(source, str): - self.by = pkgutil.get_data(__package__, "_dictionaries/" + source) - if not self.by: + by = pkgutil.get_data(__package__, "_dictionaries/" + source) + if not by: raise OSError("# Error. File not found or not loadable: "+source) - - if source.endswith(".bdic"): - self._initBinary() - elif source.endswith(".json"): - self._initJSON(json.loads(self.by.decode("utf-8"))) #json.loads(self.by) # In Python 3.6, can read directly binary strings - else: - raise OSError("# Error. Unknown file type: "+source) - else: - self._initJSON(source) - - self.sFileName = source if isinstance(source, str) else "[None]" + self.sFileName = source + oData = json.loads(by.decode("utf-8")) #json.loads(by) # In Python 3.6, can read directly binary strings + else: + self.sFileName = "[None]" + oData = source + + self.sByDic = "" # init to prevent pylint whining + self.__dict__.update(oData) + self.byDic = binascii.unhexlify(self.sByDic) + self.dCharVal = { v: k for k, v in self.dChar.items() } + self.a2grams = set(getattr(self, 'l2grams')) if hasattr(self, 'l2grams') else None # Performance trick: # Instead of converting bytes to integers each times we parse the binary dictionary, # we do it once, then parse the array nAcc = 0 @@ -168,58 +168,10 @@ try: self.lexicographer = importlib.import_module(".lexgraph_"+self.sLangCode, "grammalecte.graphspell") except ImportError: print("# No module ") - - def _initBinary (self): - "initialize with binary structure file" - if self.by[0:17] != b"/grammalecte-fsa/": - raise TypeError("# Error. Not a grammalecte-fsa binary dictionary. Header: {}".format(self.by[0:9])) - if not(self.by[17:18] == b"1" or self.by[17:18] == b"2" or self.by[17:18] == b"3"): - raise ValueError("# Error. Unknown dictionary version: {}".format(self.by[17:18])) - try: - byHeader, byInfo, byValues, by2grams, byDic = self.by.split(b"\0\0\0\0", 4) - except Exception: - raise Exception - - self.nCompressionMethod = int(self.by[17:18].decode("utf-8")) - self.sHeader = byHeader.decode("utf-8") - self.lArcVal = byValues.decode("utf-8").split("\t") - self.nArcVal = len(self.lArcVal) - self.byDic = byDic - self.a2grams = set(by2grams.decode("utf-8").split("\t")) - - l = byInfo.decode("utf-8").split("//") - self.sLangCode = l.pop(0) - self.sLangName = l.pop(0) - self.sDicName = l.pop(0) - self.sDescription = l.pop(0) - self.sDate = l.pop(0) - self.nChar = int(l.pop(0)) - self.nBytesArc = int(l.pop(0)) - self.nBytesNodeAddress = int(l.pop(0)) - self.nEntry = int(l.pop(0)) - self.nNode = int(l.pop(0)) - self.nArc = int(l.pop(0)) - self.nAff = int(l.pop(0)) - self.cStemming = l.pop(0) - self.nTag = self.nArcVal - self.nChar - self.nAff - # to get the value of an arc, to get the char of an arc with its value - self.dChar = {} - for i in range(1, self.nChar+1): - self.dChar[self.lArcVal[i]] = i - self.dCharVal = { v: k for k, v in self.dChar.items() } - - def _initJSON (self, oJSON): - "initialize with a JSON text file" - self.sByDic = "" # init to prevent pylint whining - self.__dict__.update(oJSON) - self.byDic = binascii.unhexlify(self.sByDic) - self.dCharVal = { v: k for k, v in self.dChar.items() } - self.a2grams = set(getattr(self, 'l2grams')) if hasattr(self, 'l2grams') else None - def getInfo (self): "return string about the IBDAWG" return " Language: {0.sLangName} Lang code: {0.sLangCode} Dictionary name: {0.sDicName}" \ " Compression method: {0.nCompressionMethod:>2} Date: {0.sDate} Stemming: {0.cStemming}FX\n" \ " Arcs values: {0.nArcVal:>10,} = {0.nChar:>5,} characters, {0.nAff:>6,} affixes, {0.nTag:>6,} tags\n" \