Index: graphspell/ibdawg.py ================================================================== --- graphspell/ibdawg.py +++ graphspell/ibdawg.py @@ -151,32 +151,10 @@ elif self.cStemming == "A": self.funcStemming = st.changeWordWithAffixCode else: self.funcStemming = st.noStemming - # Configuring DAWG functions according to nCompressionMethod - if self.nCompressionMethod == 1: - self.morph = self._morph1 - self.stem = self._stem1 - self._lookupArcNode = self._lookupArcNode1 - self._getArcs = self._getArcs1 - self._writeNodes = self._writeNodes1 - elif self.nCompressionMethod == 2: - self.morph = self._morph2 - self.stem = self._stem2 - self._lookupArcNode = self._lookupArcNode2 - self._getArcs = self._getArcs2 - self._writeNodes = self._writeNodes2 - elif self.nCompressionMethod == 3: - self.morph = self._morph3 - self.stem = self._stem3 - self._lookupArcNode = self._lookupArcNode3 - self._getArcs = self._getArcs3 - self._writeNodes = self._writeNodes3 - else: - raise ValueError(" # Error: unknown code: {}".format(self.nCompressionMethod)) - self.bAcronymValid = False self.bNumAtLastValid = False # lexicographer module ? self.lexicographer = None @@ -325,15 +303,15 @@ def getMorph (self, sWord): "retrieves morphologies list, different casing allowed" if not sWord: return [] sWord = st.spellingNormalization(sWord) - l = self.morph(sWord) + l = self._morph(sWord) if sWord[0:1].isupper(): - l.extend(self.morph(sWord.lower())) + l.extend(self._morph(sWord.lower())) if sWord.isupper() and len(sWord) > 1: - l.extend(self.morph(sWord.capitalize())) + l.extend(self._morph(sWord.capitalize())) return l #@timethis def suggest (self, sWord, nSuggLimit=10, bSplitTrailingNumbers=False): "returns a set of suggestions for " @@ -489,30 +467,26 @@ if sTagsPattern: zTagsPattern = re.compile(sTagsPattern) except re.error: print("# Error in regex pattern") traceback.print_exc() - yield from self._select1(zFlexPattern, zTagsPattern, 0, "") - - # def morph (self, sWord): - # is defined in __init__ - - # VERSION 1 - def _select1 (self, zFlexPattern, zTagsPattern, iAddr, sWord): - # recursive generator - for nVal, jAddr in self._getArcs1(iAddr): + yield from self._select(zFlexPattern, zTagsPattern, 0, "") + + def _select (self, zFlexPattern, zTagsPattern, iAddr, sWord): + # recursive generator + for nVal, jAddr in self._getArcs(iAddr): if nVal <= self.nChar: # simple character - yield from self._select1(zFlexPattern, zTagsPattern, jAddr, sWord + self.lArcVal[nVal]) + yield from self._select(zFlexPattern, zTagsPattern, jAddr, sWord + self.lArcVal[nVal]) else: if not zFlexPattern or zFlexPattern.search(sWord): sStem = self.funcStemming(sWord, self.lArcVal[nVal]) - for nMorphVal, _ in self._getArcs1(jAddr): + for nMorphVal, _ in self._getArcs(jAddr): if not zTagsPattern or zTagsPattern.search(self.lArcVal[nMorphVal]): yield [sWord, sStem, self.lArcVal[nMorphVal]] - def _morph1 (self, sWord): + def _morph (self, sWord): "returns morphologies of " iAddr = 0 for c in sWord: if c not in self.dChar: return [] @@ -539,11 +513,11 @@ iAddr2 = iEndArcAddr2 + 1 iAddr = iEndArcAddr + 1 return l return [] - def _stem1 (self, sWord): + def _stem (self, sWord): "returns stems list of " iAddr = 0 for c in sWord: if c not in self.dChar: return [] @@ -562,11 +536,11 @@ l.append(self.funcStemming(sWord, self.lArcVal[nArc])) iAddr = iEndArcAddr + 1 return l return [] - def _lookupArcNode1 (self, nVal, iAddr): + def _lookupArcNode (self, nVal, iAddr): "looks if is an arc at the node at , if yes, returns address of next node else None" while True: iEndArcAddr = iAddr + 1 nRawArc = self.byDic[iAddr] if nVal == (nRawArc & self._arcMask): @@ -576,21 +550,21 @@ # value not found if nRawArc & self._lastArcMask: return None iAddr = iEndArcAddr + 1 - def _getArcs1 (self, iAddr): + def _getArcs (self, iAddr): "generator: return all arcs at as tuples of (nVal, iAddr)" while True: iEndArcAddr = iAddr + 1 nRawArc = self.byDic[iAddr] yield nRawArc & self._arcMask, self.byDic[iEndArcAddr] if nRawArc & self._lastArcMask: break iAddr = iEndArcAddr + 1 - def _writeNodes1 (self, spfDest): + def _writeNodes (self, spfDest): "for debugging only" print(" > Write binary nodes") with open(spfDest, 'w', 'utf-8', newline="\n") as hDst: iAddr = 0 hDst.write("i{:_>10} -- #{:_>10}\n".format("0", iAddr)) @@ -599,218 +573,7 @@ nRawArc = self.byDic[iAddr] nArc = nRawArc & self._arcMask hDst.write(" {:<20} {:0>16} i{:>10} #{:_>10}\n".format(self.lArcVal[nArc], bin(nRawArc)[2:], "?", self.byDic[iEndArcAddr])) iAddr = iEndArcAddr + 1 if (nRawArc & self._lastArcMask) and iAddr < len(self.byDic): - hDst.write("\ni{:_>10} -- #{:_>10}\n".format("?", iAddr)) - hDst.close() - - # VERSION 2 - def _morph2 (self, sWord): - "returns morphologies of " - iAddr = 0 - for c in sWord: - if c not in self.dChar: - return [] - iAddr = self._lookupArcNode(self.dChar[c], iAddr) - if iAddr is None: - return [] - if int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask: - l = [] - nRawArc = 0 - while not nRawArc & self._lastArcMask: - iEndArcAddr = iAddr + self.nBytesArc - nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') - nArc = nRawArc & self._arcMask - if nArc > self.nChar: - # This value is not a char, this is a stemming code - sStem = ">" + self.funcStemming(sWord, self.lArcVal[nArc]) - # Now , we go to the next node and retrieve all following arcs values, all of them are tags - if not nRawArc & self._addrBitMask: - iAddr2 = int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big') - else: - # we go to the end of the node - iAddr2 = iEndArcAddr - while not nRawArc & self._lastArcMask: - nRawArc = int.from_bytes(self.byDic[iAddr2:iAddr2+self.nBytesArc], byteorder='big') - iAddr2 += self.nBytesArc + self.nBytesNodeAddress - nRawArc2 = 0 - while not nRawArc2 & self._lastArcMask: - iEndArcAddr2 = iAddr2 + self.nBytesArc - nRawArc2 = int.from_bytes(self.byDic[iAddr2:iEndArcAddr2], byteorder='big') - l.append(sStem + "/" + self.lArcVal[nRawArc2 & self._arcMask]) - iAddr2 = iEndArcAddr2+self.nBytesNodeAddress if not nRawArc2 & self._addrBitMask else iEndArcAddr2 - iAddr = iEndArcAddr+self.nBytesNodeAddress if not nRawArc & self._addrBitMask else iEndArcAddr - return l - return [] - - def _stem2 (self, sWord): - "returns stems list of " - iAddr = 0 - for c in sWord: - if c not in self.dChar: - return [] - iAddr = self._lookupArcNode(self.dChar[c], iAddr) - if iAddr is None: - return [] - if int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask: - l = [] - nRawArc = 0 - while not nRawArc & self._lastArcMask: - iEndArcAddr = iAddr + self.nBytesArc - nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') - nArc = nRawArc & self._arcMask - if nArc > self.nChar: - # This value is not a char, this is a stemming code - l.append(self.funcStemming(sWord, self.lArcVal[nArc])) - # Now , we go to the next node - if not nRawArc & self._addrBitMask: - iAddr2 = int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big') - else: - # we go to the end of the node - iAddr2 = iEndArcAddr - while not nRawArc & self._lastArcMask: - nRawArc = int.from_bytes(self.byDic[iAddr2:iAddr2+self.nBytesArc], byteorder='big') - iAddr2 += self.nBytesArc + self.nBytesNodeAddress - iAddr = iEndArcAddr+self.nBytesNodeAddress if not nRawArc & self._addrBitMask else iEndArcAddr - return l - return [] - - def _lookupArcNode2 (self, nVal, iAddr): - "looks if is an arc at the node at , if yes, returns address of next node else None" - while True: - iEndArcAddr = iAddr+self.nBytesArc - nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') - if nVal == (nRawArc & self._arcMask): - # the value we are looking for - if not nRawArc & self._addrBitMask: - # we return the address of the next node - return int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big') - # we go to the end of the node - iAddr = iEndArcAddr - while not nRawArc & self._lastArcMask: - nRawArc = int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') - iAddr += self.nBytesArc + self.nBytesNodeAddress if not nRawArc & self._addrBitMask else self.nBytesArc - return iAddr - # value not found - if nRawArc & self._lastArcMask: - return None - iAddr = iEndArcAddr+self.nBytesNodeAddress if not nRawArc & self._addrBitMask else iEndArcAddr - - def _writeNodes2 (self, spfDest): - "for debugging only" - print(" > Write binary nodes") - with open(spfDest, 'w', 'utf-8', newline="\n") as hDst: - iAddr = 0 - hDst.write("i{:_>10} -- #{:_>10}\n".format("0", iAddr)) - while iAddr < len(self.byDic): - iEndArcAddr = iAddr+self.nBytesArc - nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') - nArc = nRawArc & self._arcMask - if not nRawArc & self._addrBitMask: - iNextNodeAddr = int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big') - hDst.write(" {:<20} {:0>16} i{:>10} #{:_>10}\n".format(self.lArcVal[nArc], bin(nRawArc)[2:], "?", iNextNodeAddr)) - iAddr = iEndArcAddr+self.nBytesNodeAddress - else: - hDst.write(" {:<20} {:0>16}\n".format(self.lArcVal[nArc], bin(nRawArc)[2:])) - iAddr = iEndArcAddr - if nRawArc & self._lastArcMask: - hDst.write("\ni{:_>10} -- #{:_>10}\n".format("?", iAddr)) - hDst.close() - - # VERSION 3 - def _morph3 (self, sWord): - "returns morphologies of " - iAddr = 0 - for c in sWord: - if c not in self.dChar: - return [] - iAddr = self._lookupArcNode(self.dChar[c], iAddr) - if iAddr is None: - return [] - if int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask: - l = [] - nRawArc = 0 - iAddrNode = iAddr - while not nRawArc & self._lastArcMask: - iEndArcAddr = iAddr + self.nBytesArc - nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') - nArc = nRawArc & self._arcMask - if nArc > self.nChar: - # This value is not a char, this is a stemming code - sStem = ">" + self.funcStemming(sWord, self.lArcVal[nArc]) - # Now , we go to the next node and retrieve all following arcs values, all of them are tags - if not nRawArc & self._addrBitMask: - iAddr2 = int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big') - else: - iAddr2 = iAddrNode + int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesOffset], byteorder='big') - nRawArc2 = 0 - while not nRawArc2 & self._lastArcMask: - iEndArcAddr2 = iAddr2 + self.nBytesArc - nRawArc2 = int.from_bytes(self.byDic[iAddr2:iEndArcAddr2], byteorder='big') - l.append(sStem + "/" + self.lArcVal[nRawArc2 & self._arcMask]) - iAddr2 = iEndArcAddr2+self.nBytesNodeAddress if not nRawArc2 & self._addrBitMask else iEndArcAddr2+self.nBytesOffset - iAddr = iEndArcAddr+self.nBytesNodeAddress if not nRawArc & self._addrBitMask else iEndArcAddr+self.nBytesOffset - return l - return [] - - def _stem3 (self, sWord): - "returns stems list of " - iAddr = 0 - for c in sWord: - if c not in self.dChar: - return [] - iAddr = self._lookupArcNode(self.dChar[c], iAddr) - if iAddr is None: - return [] - if int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask: - l = [] - nRawArc = 0 - #iAddrNode = iAddr - while not nRawArc & self._lastArcMask: - iEndArcAddr = iAddr + self.nBytesArc - nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') - nArc = nRawArc & self._arcMask - if nArc > self.nChar: - # This value is not a char, this is a stemming code - l.append(self.funcStemming(sWord, self.lArcVal[nArc])) - iAddr = iEndArcAddr+self.nBytesNodeAddress if not nRawArc & self._addrBitMask else iEndArcAddr+self.nBytesOffset - return l - return [] - - def _lookupArcNode3 (self, nVal, iAddr): - "looks if is an arc at the node at , if yes, returns address of next node else None" - iAddrNode = iAddr - while True: - iEndArcAddr = iAddr+self.nBytesArc - nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') - if nVal == (nRawArc & self._arcMask): - # the value we are looking for - if not nRawArc & self._addrBitMask: - return int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big') - return iAddrNode + int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesOffset], byteorder='big') - # value not found - if nRawArc & self._lastArcMask: - return None - iAddr = iEndArcAddr+self.nBytesNodeAddress if not nRawArc & self._addrBitMask else iEndArcAddr+self.nBytesOffset - - def _writeNodes3 (self, spfDest): - "for debugging only" - print(" > Write binary nodes") - with open(spfDest, 'w', 'utf-8', newline="\n") as hDst: - iAddr = 0 - hDst.write("i{:_>10} -- #{:_>10}\n".format("0", iAddr)) - while iAddr < len(self.byDic): - iEndArcAddr = iAddr+self.nBytesArc - nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') - nArc = nRawArc & self._arcMask - if not nRawArc & self._addrBitMask: - iNextNodeAddr = int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big') - hDst.write(" {:<20} {:0>16} i{:>10} #{:_>10}\n".format(self.lArcVal[nArc], bin(nRawArc)[2:], "?", iNextNodeAddr)) - iAddr = iEndArcAddr+self.nBytesNodeAddress - else: - iNextNodeAddr = int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesOffset], byteorder='big') - hDst.write(" {:<20} {:0>16} i{:>10} +{:_>10}\n".format(self.lArcVal[nArc], bin(nRawArc)[2:], "?", iNextNodeAddr)) - iAddr = iEndArcAddr+self.nBytesOffset - if nRawArc & self._lastArcMask: hDst.write("\ni{:_>10} -- #{:_>10}\n".format("?", iAddr)) hDst.close()