18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
|
import re
import traceback
from . import str_transform as st
from .progressbar import ProgressBar
def readFile (spf):
    """Generator: yield the tab-separated fields of each useful line of <spf>.

    Every line is stripped of surrounding whitespace. Empty lines and lines
    starting with "#" are skipped; every other line is split on tabulations
    and yielded as a list of strings.

    Raises OSError if <spf> is not an existing regular file. As this is a
    generator, the message is printed and the check is made only when the
    iteration starts.
    """
    print(" < Read lexicon: " + spf)
    # Guard clause: refuse anything that is not a regular file.
    if not os.path.isfile(spf):
        raise OSError("# Error. File not found or not loadable: " + spf)
    with open(spf, "r", encoding="utf-8") as hLexicon:
        for sRaw in hLexicon:
            sEntry = sRaw.strip()
            if not sEntry or sEntry.startswith("#"):
                # blank line or comment
                continue
            yield sEntry.split("\t")
class DAWG:
"""DIRECT ACYCLIC WORD GRAPH"""
|
>
>
>
>
>
>
>
|
>
>
>
|
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
|
import re
import traceback
from . import str_transform as st
from .progressbar import ProgressBar
# Metadata collected from the "## Key: value" header lines of the last lexicon read by readFile().
dLexiconData = {}


def readFile (spf):
    """Generator: yield the tab-separated fields of each useful line of <spf>.

    Every line is stripped of surrounding whitespace, then:
    - a line starting with "##" is treated as a metadata line; if it looks
      like "## Key: value", the pair is stored in the module-level
      dictionary <dLexiconData> (which is emptied before reading);
    - empty lines and other lines starting with "#" are skipped;
    - any other line is split on tabulations and yielded as a list of strings.

    Once the file is exhausted, the collected metadata (if any) is printed.

    Raises OSError if <spf> is not an existing regular file. As this is a
    generator, nothing happens before the iteration starts.
    """
    print(" < Read lexicon: " + spf)
    # Guard clause: refuse anything that is not a regular file.
    if not os.path.isfile(spf):
        raise OSError("# Error. File not found or not loadable: " + spf)
    dLexiconData.clear()
    with open(spf, "r", encoding="utf-8") as hLexicon:
        for sRaw in hLexicon:
            sEntry = sRaw.strip()
            if sEntry.startswith("##"):
                # metadata line: keep it only if it matches "## Key: value"
                oMatch = re.match("## *(\\w+) *:(.*)$", sEntry)
                if oMatch:
                    sKey, sValue = oMatch.groups()
                    dLexiconData[sKey] = sValue.strip()
                continue
            if not sEntry or sEntry.startswith("#"):
                # blank line or ordinary comment
                continue
            yield sEntry.split("\t")
    if dLexiconData:
        print("Data from dictionary:")
        print(dLexiconData)
class DAWG:
"""DIRECT ACYCLIC WORD GRAPH"""
|
118
119
120
121
122
123
124
125
126
127
128
129
130
131
|
+ [ (dTag[tag]+nChar+nAff, dTagOccur[tag]) for tag in dTag ] )
self.sFileName = src if type(src) is str else "[None]"
self.sLangCode = sLangCode
self.sLangName = sLangName
self.sDicName = sDicName
self.sDescription = sDescription
self.nEntry = len(lWord)
self.aPreviousEntry = []
DawgNode.resetNextId()
self.oRoot = DawgNode()
self.lUncheckedNodes = [] # list of nodes that have not been checked for duplication.
self.lMinimizedNodes = {} # list of unique nodes that have been checked for duplication.
self.lSortedNodes = [] # version 2 and 3
|
>
>
>
>
>
|
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
|
+ [ (dTag[tag]+nChar+nAff, dTagOccur[tag]) for tag in dTag ] )
self.sFileName = src if type(src) is str else "[None]"
self.sLangCode = sLangCode
self.sLangName = sLangName
self.sDicName = sDicName
self.sDescription = sDescription
if dLexiconData:
self.sLangCode = dLexiconData.get("LangCode", self.sLangCode)
self.sLangName = dLexiconData.get("LangName", self.sLangName)
self.sDicName = dLexiconData.get("DicName", self.sDicName)
self.sDescription = dLexiconData.get("Description", self.sDescription)
self.nEntry = len(lWord)
self.aPreviousEntry = []
DawgNode.resetNextId()
self.oRoot = DawgNode()
self.lUncheckedNodes = [] # list of nodes that have not been checked for duplication.
self.lMinimizedNodes = {} # list of unique nodes that have been checked for duplication.
self.lSortedNodes = [] # version 2 and 3
|
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
|
hDst.write(self.oRoot.convToBytes3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset))
for oNode in self.lSortedNodes:
hDst.write(oNode.convToBytes3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset))
if bDebug:
self._writeNodes(sPathFile, nCompressionMethod)
def _getDate (self):
    "Return the current local time as a string formatted like 'YYYY.MM.DD, HH:MM'."
    sDateFormat = "%Y.%m.%d, %H:%M"
    return time.strftime(sDateFormat)
def _writeNodes (self, sPathFile, nCompressionMethod):
"for debugging only"
print(" > Write nodes")
with open(sPathFile+".nodes."+str(nCompressionMethod)+".txt", 'w', encoding='utf-8', newline="\n") as hDst:
if nCompressionMethod == 1:
hDst.write(self.oRoot.getTxtRepr1(self.nBytesArc, self.lArcVal)+"\n")
|
|
|
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
|
hDst.write(self.oRoot.convToBytes3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset))
for oNode in self.lSortedNodes:
hDst.write(oNode.convToBytes3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset))
if bDebug:
self._writeNodes(sPathFile, nCompressionMethod)
def _getDate (self):
    "Return the current local time as a string formatted like 'YYYY-MM-DD HH:MM:SS'."
    sDateFormat = "%Y-%m-%d %H:%M:%S"
    return time.strftime(sDateFormat)
def _writeNodes (self, sPathFile, nCompressionMethod):
"for debugging only"
print(" > Write nodes")
with open(sPathFile+".nodes."+str(nCompressionMethod)+".txt", 'w', encoding='utf-8', newline="\n") as hDst:
if nCompressionMethod == 1:
hDst.write(self.oRoot.getTxtRepr1(self.nBytesArc, self.lArcVal)+"\n")
|