︙ | | | ︙ | |
12
13
14
15
16
17
18
19
20
21
22
23
24
25
|
import sys
import os
import collections
from . import str_transform as st
from .progressbar import ProgressBar
def readFile (spf):
print(" < Read lexicon: " + spf)
if os.path.isfile(spf):
with open(spf, "r", encoding="utf-8") as hSrc:
for sLine in hSrc:
sLine = sLine.strip()
|
>
|
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
|
import sys
import os
import collections
from . import str_transform as st
from .progressbar import ProgressBar
def readFile (spf):
print(" < Read lexicon: " + spf)
if os.path.isfile(spf):
with open(spf, "r", encoding="utf-8") as hSrc:
for sLine in hSrc:
sLine = sLine.strip()
|
︙ | | | ︙ | |
101
102
103
104
105
106
107
108
109
110
111
112
113
114
|
lChar = ['']; dChar = {}; nChar = 1; dCharOccur = {}
lAff = []; dAff = {}; nAff = 0; dAffOccur = {}
lTag = []; dTag = {}; nTag = 0; dTagOccur = {}
nErr = 0
# read lexicon
for sFlex, sStem, sTag in getElemsFromFile(spfSrc):
# chars
for c in sFlex:
if c not in dChar:
dChar[c] = nChar
lChar.append(c)
nChar += 1
dCharOccur[c] = dCharOccur.get(c, 0) + 1
|
>
|
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
|
lChar = ['']; dChar = {}; nChar = 1; dCharOccur = {}
lAff = []; dAff = {}; nAff = 0; dAffOccur = {}
lTag = []; dTag = {}; nTag = 0; dTagOccur = {}
nErr = 0
# read lexicon
for sFlex, sStem, sTag in getElemsFromFile(spfSrc):
addWordToCharDict(sFlex)
# chars
for c in sFlex:
if c not in dChar:
dChar[c] = nChar
lChar.append(c)
nChar += 1
dCharOccur[c] = dCharOccur.get(c, 0) + 1
|
︙ | | | ︙ | |
177
178
179
180
181
182
183
184
185
186
187
188
189
190
|
oProgBar.increment(1)
oProgBar.done()
self.finish()
self.countNodes()
self.countArcs()
self.sortNodes()
self.sortNodeArcs(dValOccur)
self.displayInfo()
# BUILD DAWG
def insert (self, word):
if word < self.previousWord:
sys.exit("# Error: Words must be inserted in alphabetical order.")
|
>
|
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
|
oProgBar.increment(1)
oProgBar.done()
self.finish()
self.countNodes()
self.countArcs()
self.sortNodes()
self.sortNodeArcs(dValOccur)
#self.sortNodeArcs2 (self.root, "")
self.displayInfo()
# BUILD DAWG
def insert (self, word):
if word < self.previousWord:
sys.exit("# Error: Words must be inserted in alphabetical order.")
|
︙ | | | ︙ | |
243
244
245
246
247
248
249
250
251
252
253
254
255
256
|
def sortNodeArcs (self, dValOccur):
print(" > Sort node arcs")
self.root.sortArcs(dValOccur)
for oNode in self.minimizedNodes:
oNode.sortArcs(dValOccur)
def sortNodes (self):
print(" > Sort nodes")
for oNode in self.root.arcs.values():
self._parseNodes(oNode)
def _parseNodes (self, oNode):
# Warning: recursive method
|
>
>
>
>
>
>
>
>
|
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
|
def sortNodeArcs (self, dValOccur):
print(" > Sort node arcs")
self.root.sortArcs(dValOccur)
for oNode in self.minimizedNodes:
oNode.sortArcs(dValOccur)
def sortNodeArcs2 (self, oNode, cPrevious=""):
# recursive function
dCharOccur = getCharOrderForPreviousChar(cPrevious)
if dCharOccur:
oNode.sortArcs2(dCharOccur, self.lArcVal)
for nArcVal, oNextNode in oNode.arcs.items():
self.sortNodeArcs2(oNextNode, self.lArcVal[nArcVal])
def sortNodes (self):
print(" > Sort nodes")
for oNode in self.root.arcs.values():
self._parseNodes(oNode)
def _parseNodes (self, oNode):
# Warning: recursive method
|
︙ | | | ︙ | |
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
|
def __eq__ (self, other):
# Used as a key in a python dictionary.
# Nodes are equivalent if they have identical arcs, and each identical arc leads to identical states.
return self.__str__() == other.__str__()
def sortArcs (self, dValOccur):
self.arcs = collections.OrderedDict(sorted(self.arcs.items(), key=lambda t: dValOccur[t[0]], reverse=True))
# VERSION 1 =====================================================================================================
def convToBytes1 (self, nBytesArc, nBytesNodeAddress):
"""
Node scheme:
- Arc length is defined by nBytesArc
- Address length is defined by nBytesNodeAddress
|
|
>
>
>
|
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
|
def __eq__ (self, other):
# Used as a key in a python dictionary.
# Nodes are equivalent if they have identical arcs, and each identical arc leads to identical states.
return self.__str__() == other.__str__()
def sortArcs (self, dValOccur):
self.arcs = collections.OrderedDict(sorted(self.arcs.items(), key=lambda t: dValOccur.get(t[0], 0), reverse=True))
def sortArcs2 (self, dValOccur, lArcVal):
self.arcs = collections.OrderedDict(sorted(self.arcs.items(), key=lambda t: dValOccur.get(lArcVal[t[0]], 0), reverse=True))
# VERSION 1 =====================================================================================================
def convToBytes1 (self, nBytesArc, nBytesNodeAddress):
"""
Node scheme:
- Arc length is defined by nBytesArc
- Address length is defined by nBytesNodeAddress
|
︙ | | | ︙ | |
728
729
730
731
732
733
734
|
val = val | nFinalArcMask
if 1 < (self.arcs[arc].addr - self.addr) < nMaxOffset and self.i != 0:
val = val | nNextNodeMask
s += " {:<20} {:0>16} i{:_>10} +{:_>10}\n".format(lVal[arc], bin(val)[2:], self.arcs[arc].i, self.arcs[arc].addr - self.addr)
else:
s += " {:<20} {:0>16} i{:_>10} #{:_>10}\n".format(lVal[arc], bin(val)[2:], self.arcs[arc].i, self.arcs[arc].addr)
return s
|
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
|
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
|
val = val | nFinalArcMask
if 1 < (self.arcs[arc].addr - self.addr) < nMaxOffset and self.i != 0:
val = val | nNextNodeMask
s += " {:<20} {:0>16} i{:_>10} +{:_>10}\n".format(lVal[arc], bin(val)[2:], self.arcs[arc].i, self.arcs[arc].addr - self.addr)
else:
s += " {:<20} {:0>16} i{:_>10} #{:_>10}\n".format(lVal[arc], bin(val)[2:], self.arcs[arc].i, self.arcs[arc].addr)
return s
# Another attempt to sort node arcs
_dCharOrder = {
# key: previous char, value: dictionary of chars {c: nValue}
"": {}
}
def addWordToCharDict (sWord):
cPrevious = ""
for cChar in sWord:
if cPrevious not in _dCharOrder:
_dCharOrder[cPrevious] = {}
_dCharOrder[cPrevious][cChar] = _dCharOrder[cPrevious].get(cChar, 0) + 1
cPrevious = cChar
def getCharOrderForPreviousChar (cChar):
return _dCharOrder.get(cChar, None)
def displayCharOrder ():
for key, value in _dCharOrder.items():
print("[" + key + "]: ", ", ".join([ c+":"+str(n) for c, n in sorted(value.items(), key=lambda t: t[1], reverse=True) ]))
|