Index: gc_core/py/dawg.py ================================================================== --- gc_core/py/dawg.py +++ gc_core/py/dawg.py @@ -147,16 +147,16 @@ # hFreqDst.close() self.sFile = spfSrc self.sLang = sLangName self.nEntry = len(lWord) - self.previousWord = [] + self.aPreviousEntry = [] DawgNode.resetNextId() - self.root = DawgNode() - self.uncheckedNodes = [] # list of nodes that have not been checked for duplication. - self.minimizedNodes = {} # list of unique nodes that have been checked for duplication. - self.sortedNodes = [] # version 2 and 3 + self.oRoot = DawgNode() + self.lUncheckedNodes = [] # list of nodes that have not been checked for duplication. + self.lMinimizedNodes = {} # list of unique nodes that have been checked for duplication. + self.lSortedNodes = [] # version 2 and 3 self.nNode = 0 self.nArc = 0 self.dChar = dChar self.nChar = len(dChar) self.nAff = nAff @@ -172,118 +172,118 @@ self.funcStemming = st.noStemming # build lWord.sort() oProgBar = ProgressBar(0, len(lWord)) - for word in lWord: - self.insert(word) + for aEntry in lWord: + self.insert(aEntry) oProgBar.increment(1) oProgBar.done() self.finish() self.countNodes() self.countArcs() self.sortNodes() self.sortNodeArcs(dValOccur) - #self.sortNodeArcs2 (self.root, "") + #self.sortNodeArcs2 (self.oRoot, "") self.displayInfo() # BUILD DAWG - def insert (self, word): - if word < self.previousWord: + def insert (self, aEntry): + if aEntry < self.aPreviousEntry: sys.exit("# Error: Words must be inserted in alphabetical order.") # find common prefix between word and previous word - commonPrefix = 0 - for i in range(min(len(word), len(self.previousWord))): - if word[i] != self.previousWord[i]: + nCommonPrefix = 0 + for i in range(min(len(aEntry), len(self.aPreviousEntry))): + if aEntry[i] != self.aPreviousEntry[i]: break - commonPrefix += 1 + nCommonPrefix += 1 - # Check the uncheckedNodes for redundant nodes, proceeding from last + # Check the lUncheckedNodes for redundant nodes, proceeding from last # one down to the common prefix size. Then truncate the list at that point. - self._minimize(commonPrefix) + self._minimize(nCommonPrefix) # add the suffix, starting from the correct node mid-way through the graph - if len(self.uncheckedNodes) == 0: - oNode = self.root + if len(self.lUncheckedNodes) == 0: + oNode = self.oRoot else: - oNode = self.uncheckedNodes[-1][2] + oNode = self.lUncheckedNodes[-1][2] - iChar = commonPrefix - for c in word[commonPrefix:]: + iChar = nCommonPrefix + for c in aEntry[nCommonPrefix:]: oNextNode = DawgNode() oNode.arcs[c] = oNextNode - self.uncheckedNodes.append((oNode, c, oNextNode)) - if iChar == (len(word) - 2): + self.lUncheckedNodes.append((oNode, c, oNextNode)) + if iChar == (len(aEntry) - 2): oNode.final = True iChar += 1 oNode = oNextNode oNode.final = True - self.previousWord = word + self.aPreviousEntry = aEntry def finish (self): "minimize unchecked nodes" self._minimize(0) def _minimize (self, downTo): # proceed from the leaf up to a certain point - for i in range( len(self.uncheckedNodes)-1, downTo-1, -1 ): - (parent, char, child) = self.uncheckedNodes[i] - if child in self.minimizedNodes: + for i in range( len(self.lUncheckedNodes)-1, downTo-1, -1 ): + oNode, char, oChildNode = self.lUncheckedNodes[i] + if oChildNode in self.lMinimizedNodes: # replace the child with the previously encountered one - parent.arcs[char] = self.minimizedNodes[child] + oNode.arcs[char] = self.lMinimizedNodes[oChildNode] else: # add the state to the minimized nodes. - self.minimizedNodes[child] = child - self.uncheckedNodes.pop() + self.lMinimizedNodes[oChildNode] = oChildNode + self.lUncheckedNodes.pop() def countNodes (self): - self.nNode = len(self.minimizedNodes) + self.nNode = len(self.lMinimizedNodes) def countArcs (self): self.nArc = 0 - for node in self.minimizedNodes: - self.nArc += len(node.arcs) + for oNode in self.lMinimizedNodes: + self.nArc += len(oNode.arcs) def sortNodeArcs (self, dValOccur): print(" > Sort node arcs") - self.root.sortArcs(dValOccur) - for oNode in self.minimizedNodes: + self.oRoot.sortArcs(dValOccur) + for oNode in self.lMinimizedNodes: oNode.sortArcs(dValOccur) def sortNodeArcs2 (self, oNode, cPrevious=""): # recursive function - dCharOccur = getCharOrderForPreviousChar(cPrevious) + dCharOccur = getCharOrderAfterChar(cPrevious) if dCharOccur: oNode.sortArcs2(dCharOccur, self.lArcVal) for nArcVal, oNextNode in oNode.arcs.items(): self.sortNodeArcs2(oNextNode, self.lArcVal[nArcVal]) def sortNodes (self): print(" > Sort nodes") - for oNode in self.root.arcs.values(): + for oNode in self.oRoot.arcs.values(): self._parseNodes(oNode) def _parseNodes (self, oNode): # Warning: recursive method if oNode.pos > 0: return oNode.setPos() - self.sortedNodes.append(oNode) + self.lSortedNodes.append(oNode) for oNextNode in oNode.arcs.values(): self._parseNodes(oNextNode) def lookup (self, sWord): - oNode = self.root + oNode = self.oRoot for c in sWord: if self.dChar.get(c, '') not in oNode.arcs: return False oNode = oNode.arcs[self.dChar[c]] return oNode.final def morph (self, sWord): - oNode = self.root + oNode = self.oRoot for c in sWord: if self.dChar.get(c, '') not in oNode.arcs: return '' oNode = oNode.arcs[self.dChar[c]] if oNode.final: @@ -308,11 +308,11 @@ print(" * {:<12} {:>16,}".format("Arcs:", self.nArc)) print(" * {:<12} {:>16}".format("Stemming:", self.cStemming + "FX")) def getArcStats (self): d = {} - for oNode in self.minimizedNodes: + for oNode in self.lMinimizedNodes: n = len(oNode.arcs) d[n] = d.get(n, 0) + 1 s = " * Nodes:\n" for n in d: s = s + " {:>9} nodes have {:>3} arcs\n".format(d[n], n) @@ -329,19 +329,19 @@ # BINARY CONVERSION def createBinary (self, sPathFile, nMethod, bDebug=False): print(" > Write DAWG as an indexable binary dictionary [method: %d]" % nMethod) if nMethod == 1: - self.nBytesArc = ( ( (self.nArcVal).bit_length() + 2 ) // 8 ) + 1 # We add 2 bits. See DawgNode.convToBytes1() + self.nBytesArc = ( (self.nArcVal.bit_length() + 2) // 8 ) + 1 # We add 2 bits. See DawgNode.convToBytes1() self._calcNumBytesNodeAddress() self._calcNodesAddress1() elif nMethod == 2: - self.nBytesArc = ( ( (self.nArcVal).bit_length() + 3 ) // 8 ) + 1 # We add 3 bits. See DawgNode.convToBytes2() + self.nBytesArc = ( (self.nArcVal.bit_length() + 3) // 8 ) + 1 # We add 3 bits. See DawgNode.convToBytes2() self._calcNumBytesNodeAddress() self._calcNodesAddress2() elif nMethod == 3: - self.nBytesArc = ( ( (self.nArcVal).bit_length() + 3 ) // 8 ) + 1 # We add 3 bits. See DawgNode.convToBytes3() + self.nBytesArc = ( (self.nArcVal.bit_length() + 3) // 8 ) + 1 # We add 3 bits. See DawgNode.convToBytes3() self.nBytesOffset = 1 self.nMaxOffset = (2 ** (self.nBytesOffset * 8)) - 1 self._calcNumBytesNodeAddress() self._calcNodesAddress3() else: @@ -360,19 +360,19 @@ while ((self.nBytesArc + self.nBytesNodeAddress) * self.nArc) > (2 ** (self.nBytesNodeAddress * 8)): self.nBytesNodeAddress += 1 def _calcNodesAddress1 (self): nBytesNode = self.nBytesArc + self.nBytesNodeAddress - iAddr = len(self.root.arcs) * nBytesNode - for oNode in self.minimizedNodes: + iAddr = len(self.oRoot.arcs) * nBytesNode + for oNode in self.lMinimizedNodes: oNode.addr = iAddr iAddr += max(len(oNode.arcs), 1) * nBytesNode def _calcNodesAddress2 (self): nBytesNode = self.nBytesArc + self.nBytesNodeAddress - iAddr = len(self.root.arcs) * nBytesNode - for oNode in self.sortedNodes: + iAddr = len(self.oRoot.arcs) * nBytesNode + for oNode in self.lSortedNodes: oNode.addr = iAddr iAddr += max(len(oNode.arcs), 1) * nBytesNode for oNextNode in oNode.arcs.values(): if (oNode.pos + 1) == oNextNode.pos: iAddr -= self.nBytesNodeAddress @@ -379,31 +379,31 @@ #break def _calcNodesAddress3 (self): nBytesNode = self.nBytesArc + self.nBytesNodeAddress # theorical nodes size if only addresses and no offset - self.root.size = len(self.root.arcs) * nBytesNode - for oNode in self.sortedNodes: + self.oRoot.size = len(self.oRoot.arcs) * nBytesNode + for oNode in self.lSortedNodes: oNode.size = max(len(oNode.arcs), 1) * nBytesNode # rewind and calculate dropdown from the end, several times nDiff = self.nBytesNodeAddress - self.nBytesOffset bEnd = False while not bEnd: bEnd = True # recalculate addresses - iAddr = self.root.size - for oNode in self.sortedNodes: + iAddr = self.oRoot.size + for oNode in self.lSortedNodes: oNode.addr = iAddr iAddr += oNode.size # rewind and calculate dropdown from the end, several times for i in range(self.nNode-1, -1, -1): - nSize = max(len(self.sortedNodes[i].arcs), 1) * nBytesNode - for oNextNode in self.sortedNodes[i].arcs.values(): - if 1 < (oNextNode.addr - self.sortedNodes[i].addr) < self.nMaxOffset: + nSize = max(len(self.lSortedNodes[i].arcs), 1) * nBytesNode + for oNextNode in self.lSortedNodes[i].arcs.values(): + if 1 < (oNextNode.addr - self.lSortedNodes[i].addr) < self.nMaxOffset: nSize -= nDiff - if self.sortedNodes[i].size != nSize: - self.sortedNodes[i].size = nSize + if self.lSortedNodes[i].size != nSize: + self.lSortedNodes[i].size = nSize bEnd = False def _writeBinary (self, sPathFile, nMethod): """ Format of the binary indexable dictionary: @@ -448,40 +448,40 @@ # lArcVal hDst.write("\t".join(self.lArcVal).encode("utf-8")) hDst.write(b"\0\0\0\0") # DAWG: nodes / arcs if nMethod == 1: - hDst.write(self.root.convToBytes1(self.nBytesArc, self.nBytesNodeAddress)) - for oNode in self.minimizedNodes: + hDst.write(self.oRoot.convToBytes1(self.nBytesArc, self.nBytesNodeAddress)) + for oNode in self.lMinimizedNodes: hDst.write(oNode.convToBytes1(self.nBytesArc, self.nBytesNodeAddress)) elif nMethod == 2: - hDst.write(self.root.convToBytes2(self.nBytesArc, self.nBytesNodeAddress)) - for oNode in self.sortedNodes: + hDst.write(self.oRoot.convToBytes2(self.nBytesArc, self.nBytesNodeAddress)) + for oNode in self.lSortedNodes: hDst.write(oNode.convToBytes2(self.nBytesArc, self.nBytesNodeAddress)) elif nMethod == 3: - hDst.write(self.root.convToBytes3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset)) - for oNode in self.sortedNodes: + hDst.write(self.oRoot.convToBytes3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset)) + for oNode in self.lSortedNodes: hDst.write(oNode.convToBytes3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset)) hDst.close() def _writeNodes (self, sPathFile, nMethod): "for debugging only" print(" > Write nodes") with open(sPathFile+".nodes."+str(nMethod)+".txt", 'w', encoding='utf-8', newline="\n") as hDst: if nMethod == 1: - hDst.write(self.root.getTxtRepr1(self.nBytesArc, self.nBytesNodeAddress, self.lArcVal)+"\n") - #hDst.write( ''.join( [ "%02X " % z for z in self.root.convToBytes1(self.nBytesArc, self.nBytesNodeAddress) ] ).strip() ) - for oNode in self.minimizedNodes: + hDst.write(self.oRoot.getTxtRepr1(self.nBytesArc, self.nBytesNodeAddress, self.lArcVal)+"\n") + #hDst.write( ''.join( [ "%02X " % z for z in self.oRoot.convToBytes1(self.nBytesArc, self.nBytesNodeAddress) ] ).strip() ) + for oNode in self.lMinimizedNodes: hDst.write(oNode.getTxtRepr1(self.nBytesArc, self.nBytesNodeAddress, self.lArcVal)+"\n") if nMethod == 2: - hDst.write(self.root.getTxtRepr2(self.nBytesArc, self.nBytesNodeAddress, self.lArcVal)+"\n") - for oNode in self.sortedNodes: + hDst.write(self.oRoot.getTxtRepr2(self.nBytesArc, self.nBytesNodeAddress, self.lArcVal)+"\n") + for oNode in self.lSortedNodes: hDst.write(oNode.getTxtRepr2(self.nBytesArc, self.nBytesNodeAddress, self.lArcVal)+"\n") if nMethod == 3: - hDst.write(self.root.getTxtRepr3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset, self.lArcVal)+"\n") - #hDst.write( ''.join( [ "%02X " % z for z in self.root.convToBytes3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset) ] ).strip() ) - for oNode in self.sortedNodes: + hDst.write(self.oRoot.getTxtRepr3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset, self.lArcVal)+"\n") + #hDst.write( ''.join( [ "%02X " % z for z in self.oRoot.convToBytes3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset) ] ).strip() ) + for oNode in self.lSortedNodes: hDst.write(oNode.getTxtRepr3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset, self.lArcVal)+"\n") hDst.close() def writeResults (self, sPathFile): bFileExits = os.path.isfile("_lexicons.res.txt") @@ -764,12 +764,12 @@ _dCharOrder[cPrevious] = {} _dCharOrder[cPrevious][cChar] = _dCharOrder[cPrevious].get(cChar, 0) + 1 cPrevious = cChar -def getCharOrderForPreviousChar (cChar): +def getCharOrderAfterChar (cChar): return _dCharOrder.get(cChar, None) def displayCharOrder (): for key, value in _dCharOrder.items(): print("[" + key + "]: ", ", ".join([ c+":"+str(n) for c, n in sorted(value.items(), key=lambda t: t[1], reverse=True) ]))