Overview
Comment: | [core] dawg: attempt to speed up the dictionary lookup by reordering arcs (pointless ATM) |
---|---|
Downloads: | Tarball | ZIP archive | SQL archive |
Timelines: | family | ancestors | descendants | both | trunk | core |
Files: | files | file ages | folders |
SHA3-256: |
b03874043469e152d3aefca8b7936b5a |
User & Date: | olr on 2017-06-29 13:20:59 |
Other Links: | manifest | tags |
Context
2017-06-29
| ||
19:40 | [core] code cleaning check-in: fe361b4e8d user: olr tags: trunk, core | |
13:20 | [core] dawg: attempt to speed up the dictionary lookup by reordering arcs (pointless ATM) check-in: b038740434 user: olr tags: trunk, core | |
04:16 | [core] ibdawg: add next node when drawing path of a word check-in: 94f5da9f4a user: olr tags: trunk, core | |
Changes
Modified gc_core/py/dawg.py from [ddd6fe1cc6] to [d8c1a249fb].
︙ | ︙ | |||
12 13 14 15 16 17 18 19 20 21 22 23 24 25 | import sys import os import collections from . import str_transform as st from .progressbar import ProgressBar def readFile (spf): print(" < Read lexicon: " + spf) if os.path.isfile(spf): with open(spf, "r", encoding="utf-8") as hSrc: for sLine in hSrc: sLine = sLine.strip() | > | 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 | import sys import os import collections from . import str_transform as st from .progressbar import ProgressBar def readFile (spf): print(" < Read lexicon: " + spf) if os.path.isfile(spf): with open(spf, "r", encoding="utf-8") as hSrc: for sLine in hSrc: sLine = sLine.strip() |
︙ | ︙ | |||
101 102 103 104 105 106 107 108 109 110 111 112 113 114 | lChar = ['']; dChar = {}; nChar = 1; dCharOccur = {} lAff = []; dAff = {}; nAff = 0; dAffOccur = {} lTag = []; dTag = {}; nTag = 0; dTagOccur = {} nErr = 0 # read lexicon for sFlex, sStem, sTag in getElemsFromFile(spfSrc): # chars for c in sFlex: if c not in dChar: dChar[c] = nChar lChar.append(c) nChar += 1 dCharOccur[c] = dCharOccur.get(c, 0) + 1 | > | 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 | lChar = ['']; dChar = {}; nChar = 1; dCharOccur = {} lAff = []; dAff = {}; nAff = 0; dAffOccur = {} lTag = []; dTag = {}; nTag = 0; dTagOccur = {} nErr = 0 # read lexicon for sFlex, sStem, sTag in getElemsFromFile(spfSrc): addWordToCharDict(sFlex) # chars for c in sFlex: if c not in dChar: dChar[c] = nChar lChar.append(c) nChar += 1 dCharOccur[c] = dCharOccur.get(c, 0) + 1 |
︙ | ︙ | |||
177 178 179 180 181 182 183 184 185 186 187 188 189 190 | oProgBar.increment(1) oProgBar.done() self.finish() self.countNodes() self.countArcs() self.sortNodes() self.sortNodeArcs(dValOccur) self.displayInfo() # BUILD DAWG def insert (self, word): if word < self.previousWord: sys.exit("# Error: Words must be inserted in alphabetical order.") | > | 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 | oProgBar.increment(1) oProgBar.done() self.finish() self.countNodes() self.countArcs() self.sortNodes() self.sortNodeArcs(dValOccur) #self.sortNodeArcs2 (self.root, "") self.displayInfo() # BUILD DAWG def insert (self, word): if word < self.previousWord: sys.exit("# Error: Words must be inserted in alphabetical order.") |
︙ | ︙ | |||
243 244 245 246 247 248 249 250 251 252 253 254 255 256 | def sortNodeArcs (self, dValOccur): print(" > Sort node arcs") self.root.sortArcs(dValOccur) for oNode in self.minimizedNodes: oNode.sortArcs(dValOccur) def sortNodes (self): print(" > Sort nodes") for oNode in self.root.arcs.values(): self._parseNodes(oNode) def _parseNodes (self, oNode): # Warning: recursive method | > > > > > > > > | 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 | def sortNodeArcs (self, dValOccur): print(" > Sort node arcs") self.root.sortArcs(dValOccur) for oNode in self.minimizedNodes: oNode.sortArcs(dValOccur) def sortNodeArcs2 (self, oNode, cPrevious=""): # recursive function dCharOccur = getCharOrderForPreviousChar(cPrevious) if dCharOccur: oNode.sortArcs2(dCharOccur, self.lArcVal) for nArcVal, oNextNode in oNode.arcs.items(): self.sortNodeArcs2(oNextNode, self.lArcVal[nArcVal]) def sortNodes (self): print(" > Sort nodes") for oNode in self.root.arcs.values(): self._parseNodes(oNode) def _parseNodes (self, oNode): # Warning: recursive method |
︙ | ︙ | |||
525 526 527 528 529 530 531 | def __eq__ (self, other): # Used as a key in a python dictionary. # Nodes are equivalent if they have identical arcs, and each identical arc leads to identical states. return self.__str__() == other.__str__() def sortArcs (self, dValOccur): | | > > > | 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 | def __eq__ (self, other): # Used as a key in a python dictionary. # Nodes are equivalent if they have identical arcs, and each identical arc leads to identical states. return self.__str__() == other.__str__() def sortArcs (self, dValOccur): self.arcs = collections.OrderedDict(sorted(self.arcs.items(), key=lambda t: dValOccur.get(t[0], 0), reverse=True)) def sortArcs2 (self, dValOccur, lArcVal): self.arcs = collections.OrderedDict(sorted(self.arcs.items(), key=lambda t: dValOccur.get(lArcVal[t[0]], 0), reverse=True)) # VERSION 1 ===================================================================================================== def convToBytes1 (self, nBytesArc, nBytesNodeAddress): """ Node scheme: - Arc length is defined by nBytesArc - Address length is defined by nBytesNodeAddress |
︙ | ︙ | |||
728 729 730 731 732 733 734 | val = val | nFinalArcMask if 1 < (self.arcs[arc].addr - self.addr) < nMaxOffset and self.i != 0: val = val | nNextNodeMask s += " {:<20} {:0>16} i{:_>10} +{:_>10}\n".format(lVal[arc], bin(val)[2:], self.arcs[arc].i, self.arcs[arc].addr - self.addr) else: s += " {:<20} {:0>16} i{:_>10} #{:_>10}\n".format(lVal[arc], bin(val)[2:], self.arcs[arc].i, self.arcs[arc].addr) return s | > > > > > > > > > > > > > > > > > > > > > > > > > > > | 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 | val = val | nFinalArcMask if 1 < (self.arcs[arc].addr - self.addr) < nMaxOffset and self.i != 0: val = val | nNextNodeMask s += " {:<20} {:0>16} i{:_>10} +{:_>10}\n".format(lVal[arc], bin(val)[2:], self.arcs[arc].i, self.arcs[arc].addr - self.addr) else: s += " {:<20} {:0>16} i{:_>10} #{:_>10}\n".format(lVal[arc], bin(val)[2:], self.arcs[arc].i, self.arcs[arc].addr) return s # Another attempt to sort node arcs _dCharOrder = { # key: previous char, value: dictionary of chars {c: nValue} "": {} } def addWordToCharDict (sWord): cPrevious = "" for cChar in sWord: if cPrevious not in _dCharOrder: _dCharOrder[cPrevious] = {} _dCharOrder[cPrevious][cChar] = _dCharOrder[cPrevious].get(cChar, 0) + 1 cPrevious = cChar def getCharOrderForPreviousChar (cChar): return _dCharOrder.get(cChar, None) def displayCharOrder (): for key, value in _dCharOrder.items(): print("[" + key + "]: ", ", ".join([ c+":"+str(n) for c, n in sorted(value.items(), key=lambda t: t[1], reverse=True) ])) |