Index: graphspell/dawg.py ================================================================== --- graphspell/dawg.py +++ graphspell/dawg.py @@ -69,23 +69,22 @@ aEntry = set() lChar = ['']; dChar = {}; nChar = 1; dCharOccur = {} lAff = []; dAff = {}; nAff = 0; dAffOccur = {} lTag = []; dTag = {}; nTag = 0; dTagOccur = {} - nErr = 0 self.a2grams = set() try: zFilter = re.compile(sSelectFilterRegex) if sSelectFilterRegex else None - except: - print(" # Error. Wrong filter regex. Filter ignored.") + except re.error: + print("# Error. Wrong filter regex. Filter ignored: ", sSelectFilterRegex) traceback.print_exc() zFilter = None # read lexicon - if type(src) is str: + if isinstance(src, str): iterable = readFile(src) else: iterable = src for sFlex, sStem, sTag in iterable: if not zFilter or zFilter.search(sTag): @@ -125,11 +124,11 @@ # Dictionary of arc values occurrency, to sort arcs of each node dValOccur = dict( [ (dChar[c], dCharOccur[c]) for c in dChar ] \ + [ (dAff[aff]+nChar, dAffOccur[aff]) for aff in dAff ] \ + [ (dTag[tag]+nChar+nAff, dTagOccur[tag]) for tag in dTag ] ) - self.sFileName = src if type(src) is str else "[None]" + self.sFileName = src if isinstance(src, str) else "[None]" self.sLangCode = sLangCode self.sLangName = sLangName self.sDicName = sDicName self.sDescription = sDescription if dLexiconData: @@ -332,11 +331,11 @@ "generator: returns all entries which morphology fits " zPattern = None if sPattern: try: zPattern = re.compile(sPattern) - except: + except re.error: print("# Error in regex pattern") traceback.print_exc() yield from self._select(zPattern, self.oRoot, "") def _select (self, zPattern, oNode, sWord): Index: graphspell/ibdawg.py ================================================================== --- graphspell/ibdawg.py +++ graphspell/ibdawg.py @@ -90,11 +90,11 @@ class IBDAWG: """INDEXABLE BINARY DIRECT ACYCLIC WORD GRAPH""" def __init__ (self, source): - if type(source) is str: + if isinstance(source, str): self.by = 
pkgutil.get_data(__package__, "_dictionaries/" + source) if not self.by: raise OSError("# Error. File not found or not loadable: "+source) if source.endswith(".bdic"): @@ -104,11 +104,11 @@ else: raise OSError("# Error. Unknown file type: "+source) else: self._initJSON(source) - self.sFileName = source if type(source) is str else "[None]" + self.sFileName = source if isinstance(source, str) else "[None]" self._arcMask = (2 ** ((self.nBytesArc * 8) - 3)) - 1 self._finalNodeMask = 1 << ((self.nBytesArc * 8) - 1) self._lastArcMask = 1 << ((self.nBytesArc * 8) - 2) self._addrBitMask = 1 << ((self.nBytesArc * 8) - 3) # version 2 @@ -478,11 +478,11 @@ try: if sFlexPattern: zFlexPattern = re.compile(sFlexPattern) if sTagsPattern: zTagsPattern = re.compile(sTagsPattern) - except: + except re.error: print("# Error in regex pattern") traceback.print_exc() yield from self._select1(zFlexPattern, zTagsPattern, 0, "") # def morph (self, sWord): @@ -512,21 +512,21 @@ if iAddr is None: return [] if int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask: l = [] nRawArc = 0 - while not (nRawArc & self._lastArcMask): + while not nRawArc & self._lastArcMask: iEndArcAddr = iAddr + self.nBytesArc nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') nArc = nRawArc & self._arcMask if nArc > self.nChar: # This value is not a char, this is a stemming code sStem = ">" + self.funcStemming(sWord, self.lArcVal[nArc]) # Now , we go to the next node and retrieve all following arcs values, all of them are tags iAddr2 = int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big') nRawArc2 = 0 - while not (nRawArc2 & self._lastArcMask): + while not nRawArc2 & self._lastArcMask: iEndArcAddr2 = iAddr2 + self.nBytesArc nRawArc2 = int.from_bytes(self.byDic[iAddr2:iEndArcAddr2], byteorder='big') l.append(sStem + "/" + self.lArcVal[nRawArc2 & self._arcMask]) iAddr2 = iEndArcAddr2+self.nBytesNodeAddress iAddr = 
iEndArcAddr+self.nBytesNodeAddress @@ -543,11 +543,11 @@ if iAddr is None: return [] if int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask: l = [] nRawArc = 0 - while not (nRawArc & self._lastArcMask): + while not nRawArc & self._lastArcMask: iEndArcAddr = iAddr + self.nBytesArc nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') nArc = nRawArc & self._arcMask if nArc > self.nChar: # This value is not a char, this is a stemming code @@ -563,15 +563,14 @@ nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') if nVal == (nRawArc & self._arcMask): # the value we are looking for # we return the address of the next node return int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big') - else: - # value not found - if nRawArc & self._lastArcMask: - return None - iAddr = iEndArcAddr+self.nBytesNodeAddress + # value not found + if nRawArc & self._lastArcMask: + return None + iAddr = iEndArcAddr+self.nBytesNodeAddress def _getArcs1 (self, iAddr): "generator: return all arcs at as tuples of (nVal, iAddr)" while True: iEndArcAddr = iAddr+self.nBytesArc @@ -610,33 +609,33 @@ if iAddr is None: return [] if int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask: l = [] nRawArc = 0 - while not (nRawArc & self._lastArcMask): + while not nRawArc & self._lastArcMask: iEndArcAddr = iAddr + self.nBytesArc nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') nArc = nRawArc & self._arcMask if nArc > self.nChar: # This value is not a char, this is a stemming code sStem = ">" + self.funcStemming(sWord, self.lArcVal[nArc]) # Now , we go to the next node and retrieve all following arcs values, all of them are tags - if not (nRawArc & self._addrBitMask): + if not nRawArc & self._addrBitMask: iAddr2 = int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big') else: # we go to the end of 
the node iAddr2 = iEndArcAddr - while not (nRawArc & self._lastArcMask): + while not nRawArc & self._lastArcMask: nRawArc = int.from_bytes(self.byDic[iAddr2:iAddr2+self.nBytesArc], byteorder='big') iAddr2 += self.nBytesArc + self.nBytesNodeAddress nRawArc2 = 0 - while not (nRawArc2 & self._lastArcMask): + while not nRawArc2 & self._lastArcMask: iEndArcAddr2 = iAddr2 + self.nBytesArc nRawArc2 = int.from_bytes(self.byDic[iAddr2:iEndArcAddr2], byteorder='big') l.append(sStem + "/" + self.lArcVal[nRawArc2 & self._arcMask]) - iAddr2 = iEndArcAddr2+self.nBytesNodeAddress if not (nRawArc2 & self._addrBitMask) else iEndArcAddr2 - iAddr = iEndArcAddr+self.nBytesNodeAddress if not (nRawArc & self._addrBitMask) else iEndArcAddr + iAddr2 = iEndArcAddr2+self.nBytesNodeAddress if not nRawArc2 & self._addrBitMask else iEndArcAddr2 + iAddr = iEndArcAddr+self.nBytesNodeAddress if not nRawArc & self._addrBitMask else iEndArcAddr return l return [] def _stem2 (self, sWord): "returns stems list of " @@ -648,27 +647,27 @@ if iAddr is None: return [] if int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask: l = [] nRawArc = 0 - while not (nRawArc & self._lastArcMask): + while not nRawArc & self._lastArcMask: iEndArcAddr = iAddr + self.nBytesArc nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') nArc = nRawArc & self._arcMask if nArc > self.nChar: # This value is not a char, this is a stemming code l.append(self.funcStemming(sWord, self.lArcVal[nArc])) # Now , we go to the next node - if not (nRawArc & self._addrBitMask): + if not nRawArc & self._addrBitMask: iAddr2 = int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big') else: # we go to the end of the node iAddr2 = iEndArcAddr - while not (nRawArc & self._lastArcMask): + while not nRawArc & self._lastArcMask: nRawArc = int.from_bytes(self.byDic[iAddr2:iAddr2+self.nBytesArc], byteorder='big') iAddr2 += self.nBytesArc + 
self.nBytesNodeAddress - iAddr = iEndArcAddr+self.nBytesNodeAddress if not (nRawArc & self._addrBitMask) else iEndArcAddr + iAddr = iEndArcAddr+self.nBytesNodeAddress if not nRawArc & self._addrBitMask else iEndArcAddr return l return [] def _lookupArcNode2 (self, nVal, iAddr): "looks if is an arc at the node at , if yes, returns address of next node else None" @@ -675,25 +674,23 @@ while True: iEndArcAddr = iAddr+self.nBytesArc nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') if nVal == (nRawArc & self._arcMask): # the value we are looking for - if not (nRawArc & self._addrBitMask): + if not nRawArc & self._addrBitMask: # we return the address of the next node return int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big') - else: - # we go to the end of the node - iAddr = iEndArcAddr - while not (nRawArc & self._lastArcMask): - nRawArc = int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') - iAddr += self.nBytesArc + self.nBytesNodeAddress if not (nRawArc & self._addrBitMask) else self.nBytesArc - return iAddr - else: - # value not found - if nRawArc & self._lastArcMask: - return None - iAddr = iEndArcAddr+self.nBytesNodeAddress if not (nRawArc & self._addrBitMask) else iEndArcAddr + # we go to the end of the node + iAddr = iEndArcAddr + while not nRawArc & self._lastArcMask: + nRawArc = int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') + iAddr += self.nBytesArc + self.nBytesNodeAddress if not nRawArc & self._addrBitMask else self.nBytesArc + return iAddr + # value not found + if nRawArc & self._lastArcMask: + return None + iAddr = iEndArcAddr+self.nBytesNodeAddress if not nRawArc & self._addrBitMask else iEndArcAddr def _writeNodes2 (self, spfDest): "for debugging only" print(" > Write binary nodes") with open(spfDest, 'w', 'utf-8', newline="\n") as hDst: @@ -701,11 +698,11 @@ hDst.write("i{:_>10} -- #{:_>10}\n".format("0", iAddr)) while iAddr < len(self.byDic): 
iEndArcAddr = iAddr+self.nBytesArc nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') nArc = nRawArc & self._arcMask - if not (nRawArc & self._addrBitMask): + if not nRawArc & self._addrBitMask: iNextNodeAddr = int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big') hDst.write(" {:<20} {:0>16} i{:>10} #{:_>10}\n".format(self.lArcVal[nArc], bin(nRawArc)[2:], "?", iNextNodeAddr)) iAddr = iEndArcAddr+self.nBytesNodeAddress else: hDst.write(" {:<20} {:0>16}\n".format(self.lArcVal[nArc], bin(nRawArc)[2:])) @@ -726,29 +723,29 @@ return [] if int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask: l = [] nRawArc = 0 iAddrNode = iAddr - while not (nRawArc & self._lastArcMask): + while not nRawArc & self._lastArcMask: iEndArcAddr = iAddr + self.nBytesArc nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') nArc = nRawArc & self._arcMask if nArc > self.nChar: # This value is not a char, this is a stemming code sStem = ">" + self.funcStemming(sWord, self.lArcVal[nArc]) # Now , we go to the next node and retrieve all following arcs values, all of them are tags - if not (nRawArc & self._addrBitMask): + if not nRawArc & self._addrBitMask: iAddr2 = int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big') else: iAddr2 = iAddrNode + int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesOffset], byteorder='big') nRawArc2 = 0 - while not (nRawArc2 & self._lastArcMask): + while not nRawArc2 & self._lastArcMask: iEndArcAddr2 = iAddr2 + self.nBytesArc nRawArc2 = int.from_bytes(self.byDic[iAddr2:iEndArcAddr2], byteorder='big') l.append(sStem + "/" + self.lArcVal[nRawArc2 & self._arcMask]) - iAddr2 = iEndArcAddr2+self.nBytesNodeAddress if not (nRawArc2 & self._addrBitMask) else iEndArcAddr2+self.nBytesOffset - iAddr = iEndArcAddr+self.nBytesNodeAddress if not (nRawArc & self._addrBitMask) else iEndArcAddr+self.nBytesOffset + iAddr2 
= iEndArcAddr2+self.nBytesNodeAddress if not nRawArc2 & self._addrBitMask else iEndArcAddr2+self.nBytesOffset + iAddr = iEndArcAddr+self.nBytesNodeAddress if not nRawArc & self._addrBitMask else iEndArcAddr+self.nBytesOffset return l return [] def _stem3 (self, sWord): "returns stems list of " @@ -761,18 +758,18 @@ return [] if int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask: l = [] nRawArc = 0 #iAddrNode = iAddr - while not (nRawArc & self._lastArcMask): + while not nRawArc & self._lastArcMask: iEndArcAddr = iAddr + self.nBytesArc nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') nArc = nRawArc & self._arcMask if nArc > self.nChar: # This value is not a char, this is a stemming code l.append(self.funcStemming(sWord, self.lArcVal[nArc])) - iAddr = iEndArcAddr+self.nBytesNodeAddress if not (nRawArc & self._addrBitMask) else iEndArcAddr+self.nBytesOffset + iAddr = iEndArcAddr+self.nBytesNodeAddress if not nRawArc & self._addrBitMask else iEndArcAddr+self.nBytesOffset return l return [] def _lookupArcNode3 (self, nVal, iAddr): "looks if is an arc at the node at , if yes, returns address of next node else None" @@ -780,19 +777,17 @@ while True: iEndArcAddr = iAddr+self.nBytesArc nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') if nVal == (nRawArc & self._arcMask): # the value we are looking for - if not (nRawArc & self._addrBitMask): + if not nRawArc & self._addrBitMask: return int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big') - else: - return iAddrNode + int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesOffset], byteorder='big') - else: - # value not found - if nRawArc & self._lastArcMask: - return None - iAddr = iEndArcAddr+self.nBytesNodeAddress if not (nRawArc & self._addrBitMask) else iEndArcAddr+self.nBytesOffset + return iAddrNode + int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesOffset], byteorder='big') 
+ # value not found + if nRawArc & self._lastArcMask: + return None + iAddr = iEndArcAddr+self.nBytesNodeAddress if not nRawArc & self._addrBitMask else iEndArcAddr+self.nBytesOffset def _writeNodes3 (self, spfDest): "for debugging only" print(" > Write binary nodes") with open(spfDest, 'w', 'utf-8', newline="\n") as hDst: @@ -800,11 +795,11 @@ hDst.write("i{:_>10} -- #{:_>10}\n".format("0", iAddr)) while iAddr < len(self.byDic): iEndArcAddr = iAddr+self.nBytesArc nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') nArc = nRawArc & self._arcMask - if not (nRawArc & self._addrBitMask): + if not nRawArc & self._addrBitMask: iNextNodeAddr = int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big') hDst.write(" {:<20} {:0>16} i{:>10} #{:_>10}\n".format(self.lArcVal[nArc], bin(nRawArc)[2:], "?", iNextNodeAddr)) iAddr = iEndArcAddr+self.nBytesNodeAddress else: iNextNodeAddr = int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesOffset], byteorder='big')