Index: gc_core/py/dawg.py ================================================================== --- gc_core/py/dawg.py +++ gc_core/py/dawg.py @@ -14,10 +14,11 @@ import collections from . import str_transform as st from .progressbar import ProgressBar + def readFile (spf): print(" < Read lexicon: " + spf) if os.path.isfile(spf): with open(spf, "r", encoding="utf-8") as hSrc: @@ -103,10 +104,11 @@ lTag = []; dTag = {}; nTag = 0; dTagOccur = {} nErr = 0 # read lexicon for sFlex, sStem, sTag in getElemsFromFile(spfSrc): + addWordToCharDict(sFlex) # chars for c in sFlex: if c not in dChar: dChar[c] = nChar lChar.append(c) @@ -179,10 +181,11 @@ self.finish() self.countNodes() self.countArcs() self.sortNodes() self.sortNodeArcs(dValOccur) + #self.sortNodeArcs2 (self.root, "") self.displayInfo() # BUILD DAWG def insert (self, word): if word < self.previousWord: @@ -245,10 +248,18 @@ print(" > Sort node arcs") self.root.sortArcs(dValOccur) for oNode in self.minimizedNodes: oNode.sortArcs(dValOccur) + def sortNodeArcs2 (self, oNode, cPrevious=""): + # recursive function + dCharOccur = getCharOrderForPreviousChar(cPrevious) + if dCharOccur: + oNode.sortArcs2(dCharOccur, self.lArcVal) + for nArcVal, oNextNode in oNode.arcs.items(): + self.sortNodeArcs2(oNextNode, self.lArcVal[nArcVal]) + def sortNodes (self): print(" > Sort nodes") for oNode in self.root.arcs.values(): self._parseNodes(oNode) @@ -527,11 +538,14 @@ # Used as a key in a python dictionary. # Nodes are equivalent if they have identical arcs, and each identical arc leads to identical states. return self.__str__() == other.__str__() def sortArcs (self, dValOccur): - self.arcs = collections.OrderedDict(sorted(self.arcs.items(), key=lambda t: dValOccur[t[0]], reverse=True)) + self.arcs = collections.OrderedDict(sorted(self.arcs.items(), key=lambda t: dValOccur.get(t[0], 0), reverse=True)) + + def sortArcs2 (self, dValOccur, lArcVal): + self.arcs = collections.OrderedDict(sorted(self.arcs.items(), key=lambda t: dValOccur.get(lArcVal[t[0]], 0), reverse=True)) # VERSION 1 ===================================================================================================== def convToBytes1 (self, nBytesArc, nBytesNodeAddress): """ Node scheme: @@ -730,5 +744,32 @@ val = val | nNextNodeMask s += " {:<20} {:0>16} i{:_>10} +{:_>10}\n".format(lVal[arc], bin(val)[2:], self.arcs[arc].i, self.arcs[arc].addr - self.addr) else: s += " {:<20} {:0>16} i{:_>10} #{:_>10}\n".format(lVal[arc], bin(val)[2:], self.arcs[arc].i, self.arcs[arc].addr) return s + + + +# Another attempt to sort node arcs + +_dCharOrder = { + # key: previous char, value: dictionary of chars {c: nValue} + "": {} +} + + +def addWordToCharDict (sWord): + cPrevious = "" + for cChar in sWord: + if cPrevious not in _dCharOrder: + _dCharOrder[cPrevious] = {} + _dCharOrder[cPrevious][cChar] = _dCharOrder[cPrevious].get(cChar, 0) + 1 + cPrevious = cChar + + +def getCharOrderForPreviousChar (cChar): + return _dCharOrder.get(cChar, None) + + +def displayCharOrder (): + for key, value in _dCharOrder.items(): + print("[" + key + "]: ", ", ".join([ c+":"+str(n) for c, n in sorted(value.items(), key=lambda t: t[1], reverse=True) ]))