Grammalecte  Diff

Differences From Artifact [a9c487538f]:

To Artifact [eb4c8506bd]:


12
13
14
15
16
17
18





























































19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38

39
40
41
42
43
44
45
46
47
48

49
50
51
52
53
54
55

56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78





















79
80
81
82
83


84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104

105
106
107
108
109




110
111
112
113
114
115
116
117
118

119
120

121
122

123
124
125
126
127
128
129
130


131
132
133
134
135
136
137

138
139
140
141
142


143
144
145
146
147
148



149
150

151
152

153
154

155
156
157
158


159
160

161
162
163


164
165
166
167


168
169
170
171
172

173
174
175
176
177
178
179
180
181
182



183
184

185
186
187
188


189
190
191

192
193
194
195
196


197
198
199
200
201


202
203








204
205
206

207
208
209
210
211
212
213
214

215
216
217
218
219

220
221
222
223
224
225
226
227

228
229
230
231
232
233
234
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98

99

100
101
102
103
104
105
106
107

108







109























110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130





131
132

133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151

152
153




154
155
156
157
158
159
160
161
162
163
164
165

166
167

168
169

170
171
172
173
174
175
176


177
178
179
180
181
182
183
184
185
186
187
188
189


190
191
192
193
194



195
196
197
198

199
200

201
202

203
204
205


206
207
208

209
210


211
212
213
214


215
216
217
218
219
220

221
222
223
224
225
226
227
228



229
230
231
232

233
234
235


236
237
238
239

240
241
242
243


244
245
246
247
248


249
250
251
252
253
254
255
256
257
258
259
260
261
262

263
264
265
266
267
268
269
270

271
272
273
274
275

276
277
278
279
280
281
282
283

284
285
286
287
288
289
290
291







+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+



















-
+
-








-
+
-
-
-
-
-
-
-
+
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
-
-
-
-
-
+
+
-



















-
+

-
-
-
-
+
+
+
+








-
+

-
+

-
+






-
-
+
+







+



-
-
+
+



-
-
-
+
+
+

-
+

-
+

-
+


-
-
+
+

-
+

-
-
+
+


-
-
+
+




-
+







-
-
-
+
+
+

-
+


-
-
+
+


-
+



-
-
+
+



-
-
+
+


+
+
+
+
+
+
+
+


-
+







-
+




-
+







-
+







import sys
import os
import collections

from . import str_transform as st
from .progressbar import ProgressBar



def readFile (spf):
    print(" < Read lexicon: " + spf)
    if os.path.isfile(spf):
        with open(spf, "r", encoding="utf-8") as hSrc:
            for sLine in hSrc:
                sLine = sLine.strip()
                if sLine and not sLine.startswith("#"):
                    yield sLine
    else:
        raise OSError("# Error. File not found or not loadable: " + spf)


def getElemsFromFile (spf):
    "returns tuple of (flexion, stem, tags) from lexicon file"
    nErr = 0
    if not spf.endswith(".clex"):
        for sLine in readFile(spf):
            try:
                sFlex, sStem, sTag = sLine.split("\t")
                yield (sFlex, sStem, sTag)
            except:
                nErr += 1
    else:
        sTag = "_" # neutral tag
        sTag2 = ""
        for sLine in readFile(spf):
            if sLine.startswith("[") and sLine.endswith("]"):
                # tag line
                if "-->" in sLine:
                    try:
                        sTag, sSfxCode, sTag2 = sLine[1:-1].split(" --> ")
                    except:
                        nErr += 1
                        continue
                    sTag = sTag.strip()
                    sSfxCode = sSfxCode.strip()
                    sTag2 = sTag2.strip()
                else:
                    sTag = sLine[1:-1]
                    sTag2 = ""
            else:
                # entry line
                if "\t" in sLine:
                    if sLine.count("\t") > 1:
                        nErr += 1
                        continue
                    sFlex, sStem = sLine.split("\t")
                else:
                    sFlex = sStem = sLine
                #print(sFlex, sStem, sTag)
                yield (sFlex, sStem, sTag)
                if sTag2:
                    sFlex2 = st.changeWordWithSuffixCode(sFlex, sSfxCode)
                    #print(sFlex2, sStem, sTag2)
                    yield (sFlex2, sStem, sTag2)
    if nErr:
        print(" # Lines ignored: {:>10}".format(nErr))



class DAWG:
    """DIRECT ACYCLIC WORD GRAPH"""
    # This code is inspired from Steve Hanov’s DAWG, 2011. (http://stevehanov.ca/blog/index.php?id=115)
    # We store suffix/affix codes and tags within the graph after the “real” word.
    # A word is a list of numbers [ c1, c2, c3 . . . cN, iAffix, iTags]
    # Each arc is an index in self.lArcVal, where are stored characters, suffix/affix codes for stemming and tags.
    # Important: As usual, the last node (after ‘iTags’) is tagged final, AND the node after ‘cN’ is ALSO tagged final.

    def __init__ (self, spfSrc, sLangName, cStemming):
        print("===== Direct Acyclic Word Graph - Minimal Acyclic Finite State Automaton =====")
        cStemming = cStemming.upper()
        if cStemming == "A":
            funcStemmingGen = st.defineAffixCode
        elif cStemming == "S":
            funcStemmingGen = st.defineSuffixCode
        elif cStemming == "N":
            funcStemmingGen = st.noStemming
        else:
            print("# Error code: {}".format(cStemming))
            raise ValueError("# Error. Unknown stemming code: {}".format(cStemming))
            exit()

        lEntry = []
        lChar = ['']; dChar = {}; nChar = 1; dCharOccur = {}
        lAff  = [];   dAff  = {}; nAff  = 0; dAffOccur = {}
        lTag  = [];   dTag  = {}; nTag  = 0; dTagOccur = {}
        nErr = 0
        
        # read lexicon
        with open(spfSrc, 'r', encoding='utf-8') as hSrc:
        for sFlex, sStem, sTag in getElemsFromFile(spfSrc):
            print(" > Reading lexicon: " + spfSrc + " ...")
            for line in hSrc:
                line = line.strip()
                if not (line.startswith('#') or line == ''):
                    try:
                        flex, stem, tag = line.split("\t")
                    except:
            addWordToCharDict(sFlex)
                        nErr += 1
                        continue
                    # chars
                    for c in flex:
                        if c not in dChar:
                            dChar[c] = nChar
                            lChar.append(c)
                            nChar += 1
                        dCharOccur[c] = dCharOccur.get(c, 0) + 1
                    # affixes to find stem from flexion
                    aff = funcStemmingGen(flex, stem)
                    if aff not in dAff:
                        dAff[aff] = nAff
                        lAff.append(aff)
                        nAff += 1
                    dAffOccur[aff] = dCharOccur.get(aff, 0) + 1
                    # tags
                    if tag not in dTag:
                        dTag[tag] = nTag
                        lTag.append(tag)
                        nTag += 1
                    dTagOccur[tag] = dTagOccur.get(tag, 0) + 1
                    lEntry.append((flex, dAff[aff], dTag[tag]))
            # chars
            for c in sFlex:
                if c not in dChar:
                    dChar[c] = nChar
                    lChar.append(c)
                    nChar += 1
                dCharOccur[c] = dCharOccur.get(c, 0) + 1
            # affixes to find stem from flexion
            aff = funcStemmingGen(sFlex, sStem)
            if aff not in dAff:
                dAff[aff] = nAff
                lAff.append(aff)
                nAff += 1
            dAffOccur[aff] = dCharOccur.get(aff, 0) + 1
            # tags
            if sTag not in dTag:
                dTag[sTag] = nTag
                lTag.append(sTag)
                nTag += 1
            dTagOccur[sTag] = dTagOccur.get(sTag, 0) + 1
            lEntry.append((sFlex, dAff[aff], dTag[sTag]))
            hSrc.close()
        if nErr:
            print(" # Lines ignored: {:>10}".format(nErr))
        if not(lEntry):
            print(" # Empty lexicon")
        if not lEntry:
            raise ValueError("# Error. Empty lexicon")
            exit()
        
        # Preparing DAWG
        print(" > Preparing list of words")
        lVal = lChar + lAff + lTag
        lWord = [ [dChar[c] for c in sFlex] + [iAff+nChar] + [iTag+nChar+nAff]  for sFlex, iAff, iTag in lEntry ]
        lEntry = None
        
        # Dictionary of arc values occurrency, to sort arcs of each node
        dValOccur = dict( [ (dChar[c], dCharOccur[c])  for c in dChar ] \
                        + [ (dAff[aff]+nChar, dAffOccur[aff]) for aff in dAff ] \
                        + [ (dTag[tag]+nChar+nAff, dTagOccur[tag]) for tag in dTag ] )
        #with open(spfSrc[:-8]+".valuesfreq.txt", 'w', encoding='utf-8') as hFreqDst:  # DEBUG
        #    for iKey, nOcc in sorted(dValOccur.items(), key=lambda t: t[1], reverse=True):
        #        hFreqDst.write("{}: {}\n".format(lVal[iKey], nOcc))
        #    hFreqDst.close()
        
        self.sFile = spfSrc
        self.sLang = sLangName
        self.nEntry = len(lWord)
        self.previousWord = []
        self.aPreviousEntry = []
        DawgNode.resetNextId()
        self.root = DawgNode()
        self.uncheckedNodes = []  # list of nodes that have not been checked for duplication.
        self.minimizedNodes = {}  # list of unique nodes that have been checked for duplication.
        self.sortedNodes = []     # version 2 and 3
        self.oRoot = DawgNode()
        self.lUncheckedNodes = []  # list of nodes that have not been checked for duplication.
        self.lMinimizedNodes = {}  # list of unique nodes that have been checked for duplication.
        self.lSortedNodes = []     # version 2 and 3
        self.nNode = 0
        self.nArc = 0
        self.dChar = dChar
        self.nChar = len(dChar)
        self.nAff = nAff
        self.lArcVal = lVal
        self.nArcVal = len(lVal)
        self.nTag = self.nArcVal - self.nChar - nAff
        self.cStemming = cStemming.upper()
        self.cStemming = cStemming
        if cStemming == "A":
            self.funcStemming = st.getStemFromAffixCode
            self.funcStemming = st.changeWordWithAffixCode
        elif cStemming == "S":    
            self.funcStemming = st.getStemFromSuffixCode
            self.funcStemming = st.changeWordWithSuffixCode
        else:
            self.funcStemming = st.noStemming
        
        # build
        lWord.sort()
        oProgBar = ProgressBar(0, len(lWord))
        for word in lWord:
            self.insert(word)
        for aEntry in lWord:
            self.insert(aEntry)
            oProgBar.increment(1)
        oProgBar.done()
        self.finish()
        self.countNodes()
        self.countArcs()
        self.sortNodes()
        self.sortNodeArcs(dValOccur)
        #self.sortNodeArcs2 (self.oRoot, "")
        self.displayInfo()

    # BUILD DAWG
    def insert (self, word):
        if word < self.previousWord:
    def insert (self, aEntry):
        if aEntry < self.aPreviousEntry:
            sys.exit("# Error: Words must be inserted in alphabetical order.")
        
        # find common prefix between word and previous word
        commonPrefix = 0
        for i in range(min(len(word), len(self.previousWord))):
            if word[i] != self.previousWord[i]:
        nCommonPrefix = 0
        for i in range(min(len(aEntry), len(self.aPreviousEntry))):
            if aEntry[i] != self.aPreviousEntry[i]:
                break
            commonPrefix += 1
            nCommonPrefix += 1

        # Check the uncheckedNodes for redundant nodes, proceeding from last
        # Check the lUncheckedNodes for redundant nodes, proceeding from last
        # one down to the common prefix size. Then truncate the list at that point.
        self._minimize(commonPrefix)
        self._minimize(nCommonPrefix)

        # add the suffix, starting from the correct node mid-way through the graph
        if len(self.uncheckedNodes) == 0:
            oNode = self.root
        if len(self.lUncheckedNodes) == 0:
            oNode = self.oRoot
        else:
            oNode = self.uncheckedNodes[-1][2]
            oNode = self.lUncheckedNodes[-1][2]

        iChar = commonPrefix
        for c in word[commonPrefix:]:
        iChar = nCommonPrefix
        for c in aEntry[nCommonPrefix:]:
            oNextNode = DawgNode()
            oNode.arcs[c] = oNextNode
            self.uncheckedNodes.append((oNode, c, oNextNode))
            if iChar == (len(word) - 2): 
            self.lUncheckedNodes.append((oNode, c, oNextNode))
            if iChar == (len(aEntry) - 2): 
                oNode.final = True
            iChar += 1
            oNode = oNextNode
        oNode.final = True
        self.previousWord = word
        self.aPreviousEntry = aEntry

    def finish (self):
        "minimize unchecked nodes"
        self._minimize(0)

    def _minimize (self, downTo):
        # proceed from the leaf up to a certain point
        for i in range( len(self.uncheckedNodes)-1, downTo-1, -1 ):
            (parent, char, child) = self.uncheckedNodes[i]
            if child in self.minimizedNodes:
        for i in range( len(self.lUncheckedNodes)-1, downTo-1, -1 ):
            oNode, char, oChildNode = self.lUncheckedNodes[i]
            if oChildNode in self.lMinimizedNodes:
                # replace the child with the previously encountered one
                parent.arcs[char] = self.minimizedNodes[child]
                oNode.arcs[char] = self.lMinimizedNodes[oChildNode]
            else:
                # add the state to the minimized nodes.
                self.minimizedNodes[child] = child
            self.uncheckedNodes.pop()
                self.lMinimizedNodes[oChildNode] = oChildNode
            self.lUncheckedNodes.pop()

    def countNodes (self):
        self.nNode = len(self.minimizedNodes)
        self.nNode = len(self.lMinimizedNodes)

    def countArcs (self):
        self.nArc = 0
        for node in self.minimizedNodes:
            self.nArc += len(node.arcs)
        for oNode in self.lMinimizedNodes:
            self.nArc += len(oNode.arcs)
    
    def sortNodeArcs (self, dValOccur):
        print(" > Sort node arcs")
        self.root.sortArcs(dValOccur)
        for oNode in self.minimizedNodes:
        self.oRoot.sortArcs(dValOccur)
        for oNode in self.lMinimizedNodes:
            oNode.sortArcs(dValOccur)
    
    def sortNodeArcs2 (self, oNode, cPrevious=""):
        # recursive function
        dCharOccur = getCharOrderAfterChar(cPrevious)
        if dCharOccur:
            oNode.sortArcs2(dCharOccur, self.lArcVal)
        for nArcVal, oNextNode in oNode.arcs.items():
            self.sortNodeArcs2(oNextNode, self.lArcVal[nArcVal])

    def sortNodes (self):
        print(" > Sort nodes")
        for oNode in self.root.arcs.values():
        for oNode in self.oRoot.arcs.values():
            self._parseNodes(oNode)
    
    def _parseNodes (self, oNode):
        # Warning: recursive method
        if oNode.pos > 0:
            return
        oNode.setPos()
        self.sortedNodes.append(oNode)
        self.lSortedNodes.append(oNode)
        for oNextNode in oNode.arcs.values():
             self._parseNodes(oNextNode)
        
    def lookup (self, sWord):
        oNode = self.root
        oNode = self.oRoot
        for c in sWord:
            if self.dChar.get(c, '') not in oNode.arcs:
                return False
            oNode = oNode.arcs[self.dChar[c]]
        return oNode.final

    def morph (self, sWord):
        oNode = self.root
        oNode = self.oRoot
        for c in sWord:
            if self.dChar.get(c, '') not in oNode.arcs:
                return ''
            oNode = oNode.arcs[self.dChar[c]]
        if oNode.final:
            s = "* "
            for arc in oNode.arcs:
249
250
251
252
253
254
255
256

257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277

278
279
280
281

282
283
284
285

286
287
288
289
290
291
292
306
307
308
309
310
311
312

313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333

334
335
336
337

338
339
340
341

342
343
344
345
346
347
348
349







-
+




















-
+



-
+



-
+







        print(" * {:<12} {:>16,}".format("Arc values:", self.nArcVal))
        print(" * {:<12} {:>16,}".format("Nodes:", self.nNode))
        print(" * {:<12} {:>16,}".format("Arcs:", self.nArc))
        print(" * {:<12} {:>16}".format("Stemming:", self.cStemming + "FX"))

    def getArcStats (self):
        d = {}
        for oNode in self.minimizedNodes:
        for oNode in self.lMinimizedNodes:
            n = len(oNode.arcs)
            d[n] = d.get(n, 0) + 1
        s = " * Nodes:\n"
        for n in d:
            s = s + " {:>9} nodes have {:>3} arcs\n".format(d[n], n)
        return s

    def writeInfo (self, sPathFile):
        print(" > Write informations")
        with open(sPathFile, 'w', encoding='utf-8', newline="\n") as hDst:
            hDst.write(self.getArcStats())
            hDst.write("\n * Values:\n")
            for i, s in enumerate(self.lArcVal):
                hDst.write(" {:>6}. {}\n".format(i, s))
            hDst.close()

    # BINARY CONVERSION
    def createBinary (self, sPathFile, nMethod, bDebug=False):
        print(" > Write DAWG as an indexable binary dictionary [method: %d]" % nMethod)
        if nMethod == 1:
            self.nBytesArc = ( ( (self.nArcVal).bit_length() + 2 ) // 8 ) + 1   # We add 2 bits. See DawgNode.convToBytes1()
            self.nBytesArc = ( (self.nArcVal.bit_length() + 2) // 8 ) + 1   # We add 2 bits. See DawgNode.convToBytes1()
            self._calcNumBytesNodeAddress()
            self._calcNodesAddress1()
        elif nMethod == 2:
            self.nBytesArc = ( ( (self.nArcVal).bit_length() + 3 ) // 8 ) + 1   # We add 3 bits. See DawgNode.convToBytes2()
            self.nBytesArc = ( (self.nArcVal.bit_length() + 3) // 8 ) + 1   # We add 3 bits. See DawgNode.convToBytes2()
            self._calcNumBytesNodeAddress()
            self._calcNodesAddress2()
        elif nMethod == 3:
            self.nBytesArc = ( ( (self.nArcVal).bit_length() + 3 ) // 8 ) + 1   # We add 3 bits. See DawgNode.convToBytes3()
            self.nBytesArc = ( (self.nArcVal.bit_length() + 3) // 8 ) + 1   # We add 3 bits. See DawgNode.convToBytes3()
            self.nBytesOffset = 1
            self.nMaxOffset = (2 ** (self.nBytesOffset * 8)) - 1
            self._calcNumBytesNodeAddress()
            self._calcNodesAddress3()
        else:
            print(" # Error: unknown compression method")
        print("   Arc values (chars, affixes and tags): {}  ->  {} bytes".format( self.nArcVal, len("\t".join(self.lArcVal).encode("utf-8")) ))
301
302
303
304
305
306
307
308
309


310
311
312
313
314
315
316


317
318
319
320
321
322
323
324
325
326
327
328


329
330
331
332
333
334
335
336
337


338
339
340
341
342
343
344



345
346
347


348
349
350
351
352
353
354
358
359
360
361
362
363
364


365
366
367
368
369
370
371


372
373
374
375
376
377
378
379
380
381
382
383


384
385
386
387
388
389
390
391
392


393
394
395
396
397
398



399
400
401
402


403
404
405
406
407
408
409
410
411







-
-
+
+





-
-
+
+










-
-
+
+







-
-
+
+




-
-
-
+
+
+

-
-
+
+







        "how many bytes needed to store all nodes/arcs in the binary dictionary"
        self.nBytesNodeAddress = 1
        while ((self.nBytesArc + self.nBytesNodeAddress) * self.nArc) > (2 ** (self.nBytesNodeAddress * 8)):
            self.nBytesNodeAddress += 1

    def _calcNodesAddress1 (self):
        nBytesNode = self.nBytesArc + self.nBytesNodeAddress
        iAddr = len(self.root.arcs) * nBytesNode
        for oNode in self.minimizedNodes:
        iAddr = len(self.oRoot.arcs) * nBytesNode
        for oNode in self.lMinimizedNodes:
            oNode.addr = iAddr
            iAddr += max(len(oNode.arcs), 1) * nBytesNode

    def _calcNodesAddress2 (self):
        nBytesNode = self.nBytesArc + self.nBytesNodeAddress
        iAddr = len(self.root.arcs) * nBytesNode
        for oNode in self.sortedNodes:
        iAddr = len(self.oRoot.arcs) * nBytesNode
        for oNode in self.lSortedNodes:
            oNode.addr = iAddr
            iAddr += max(len(oNode.arcs), 1) * nBytesNode
            for oNextNode in oNode.arcs.values():
                if (oNode.pos + 1) == oNextNode.pos:
                    iAddr -= self.nBytesNodeAddress
                    #break

    def _calcNodesAddress3 (self):
        nBytesNode = self.nBytesArc + self.nBytesNodeAddress
        # theorical nodes size if only addresses and no offset
        self.root.size = len(self.root.arcs) * nBytesNode
        for oNode in self.sortedNodes:
        self.oRoot.size = len(self.oRoot.arcs) * nBytesNode
        for oNode in self.lSortedNodes:
            oNode.size = max(len(oNode.arcs), 1) * nBytesNode
        # rewind and calculate dropdown from the end, several times
        nDiff = self.nBytesNodeAddress - self.nBytesOffset
        bEnd = False
        while not bEnd:
            bEnd = True
            # recalculate addresses
            iAddr = self.root.size
            for oNode in self.sortedNodes:
            iAddr = self.oRoot.size
            for oNode in self.lSortedNodes:
                oNode.addr = iAddr
                iAddr += oNode.size
            # rewind and calculate dropdown from the end, several times
            for i in range(self.nNode-1, -1, -1):
                nSize = max(len(self.sortedNodes[i].arcs), 1) * nBytesNode
                for oNextNode in self.sortedNodes[i].arcs.values():
                    if 1 < (oNextNode.addr - self.sortedNodes[i].addr) < self.nMaxOffset:
                nSize = max(len(self.lSortedNodes[i].arcs), 1) * nBytesNode
                for oNextNode in self.lSortedNodes[i].arcs.values():
                    if 1 < (oNextNode.addr - self.lSortedNodes[i].addr) < self.nMaxOffset:
                        nSize -= nDiff
                if self.sortedNodes[i].size != nSize:
                    self.sortedNodes[i].size = nSize
                if self.lSortedNodes[i].size != nSize:
                    self.lSortedNodes[i].size = nSize
                    bEnd = False

    def _writeBinary (self, sPathFile, nMethod):
        """
        Format of the binary indexable dictionary:
        Each section is separated with 4 bytes of \0
        
389
390
391
392
393
394
395
396
397


398
399
400
401


402
403
404
405


406
407
408
409
410
411
412
413
414
415
416



417
418
419
420


421
422
423
424
425



426
427
428
429
430
431
432
446
447
448
449
450
451
452


453
454
455
456


457
458
459
460


461
462
463
464
465
466
467
468
469
470



471
472
473
474
475


476
477
478
479



480
481
482
483
484
485
486
487
488
489







-
-
+
+


-
-
+
+


-
-
+
+








-
-
-
+
+
+


-
-
+
+


-
-
-
+
+
+







                                                           self.nEntry, self.nNode, self.nArc, self.nAff, self.cStemming).encode("utf-8"))
            hDst.write(b"\0\0\0\0")
            # lArcVal
            hDst.write("\t".join(self.lArcVal).encode("utf-8"))
            hDst.write(b"\0\0\0\0")
            # DAWG: nodes / arcs
            if nMethod == 1:
                hDst.write(self.root.convToBytes1(self.nBytesArc, self.nBytesNodeAddress))
                for oNode in self.minimizedNodes:
                hDst.write(self.oRoot.convToBytes1(self.nBytesArc, self.nBytesNodeAddress))
                for oNode in self.lMinimizedNodes:
                    hDst.write(oNode.convToBytes1(self.nBytesArc, self.nBytesNodeAddress))
            elif nMethod == 2:
                hDst.write(self.root.convToBytes2(self.nBytesArc, self.nBytesNodeAddress))
                for oNode in self.sortedNodes:
                hDst.write(self.oRoot.convToBytes2(self.nBytesArc, self.nBytesNodeAddress))
                for oNode in self.lSortedNodes:
                    hDst.write(oNode.convToBytes2(self.nBytesArc, self.nBytesNodeAddress))
            elif nMethod == 3:
                hDst.write(self.root.convToBytes3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset))
                for oNode in self.sortedNodes:
                hDst.write(self.oRoot.convToBytes3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset))
                for oNode in self.lSortedNodes:
                    hDst.write(oNode.convToBytes3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset))
            hDst.close()

    def _writeNodes (self, sPathFile, nMethod):
        "for debugging only"
        print(" > Write nodes")
        with open(sPathFile+".nodes."+str(nMethod)+".txt", 'w', encoding='utf-8', newline="\n") as hDst:
            if nMethod == 1:
                hDst.write(self.root.getTxtRepr1(self.nBytesArc, self.nBytesNodeAddress, self.lArcVal)+"\n")
                #hDst.write( ''.join( [ "%02X " %  z  for z in self.root.convToBytes1(self.nBytesArc, self.nBytesNodeAddress) ] ).strip() )
                for oNode in self.minimizedNodes:
                hDst.write(self.oRoot.getTxtRepr1(self.nBytesArc, self.nBytesNodeAddress, self.lArcVal)+"\n")
                #hDst.write( ''.join( [ "%02X " %  z  for z in self.oRoot.convToBytes1(self.nBytesArc, self.nBytesNodeAddress) ] ).strip() )
                for oNode in self.lMinimizedNodes:
                    hDst.write(oNode.getTxtRepr1(self.nBytesArc, self.nBytesNodeAddress, self.lArcVal)+"\n")
            if nMethod == 2:
                hDst.write(self.root.getTxtRepr2(self.nBytesArc, self.nBytesNodeAddress, self.lArcVal)+"\n")
                for oNode in self.sortedNodes:
                hDst.write(self.oRoot.getTxtRepr2(self.nBytesArc, self.nBytesNodeAddress, self.lArcVal)+"\n")
                for oNode in self.lSortedNodes:
                    hDst.write(oNode.getTxtRepr2(self.nBytesArc, self.nBytesNodeAddress, self.lArcVal)+"\n")
            if nMethod == 3:
                hDst.write(self.root.getTxtRepr3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset, self.lArcVal)+"\n")
                #hDst.write( ''.join( [ "%02X " %  z  for z in self.root.convToBytes3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset) ] ).strip() )
                for oNode in self.sortedNodes:
                hDst.write(self.oRoot.getTxtRepr3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset, self.lArcVal)+"\n")
                #hDst.write( ''.join( [ "%02X " %  z  for z in self.oRoot.convToBytes3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset) ] ).strip() )
                for oNode in self.lSortedNodes:
                    hDst.write(oNode.getTxtRepr3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset, self.lArcVal)+"\n")
            hDst.close()
    
    def writeResults (self, sPathFile):
        bFileExits = os.path.isfile("_lexicons.res.txt")
        with open("_lexicons.res.txt", "a", encoding='utf-8', newline="\n") as hDst:
            sFormat1 = "{:<12} {:>12} {:>5} {:>8} {:>8} {:>6} {:>8} {:>9} {:>9} {:>15} {:>12} {:>12}\n"
479
480
481
482
483
484
485
486




487
488
489
490
491
492
493
536
537
538
539
540
541
542

543
544
545
546
547
548
549
550
551
552
553







-
+
+
+
+








    def __eq__ (self, other):
        # Used as a key in a python dictionary.
        # Nodes are equivalent if they have identical arcs, and each identical arc leads to identical states.
        return self.__str__() == other.__str__()

    def sortArcs (self, dValOccur):
        self.arcs = collections.OrderedDict(sorted(self.arcs.items(), key=lambda t: dValOccur[t[0]], reverse=True))
        self.arcs = collections.OrderedDict(sorted(self.arcs.items(), key=lambda t: dValOccur.get(t[0], 0), reverse=True))

    def sortArcs2 (self, dValOccur, lArcVal):
        self.arcs = collections.OrderedDict(sorted(self.arcs.items(), key=lambda t: dValOccur.get(lArcVal[t[0]], 0), reverse=True))

    # VERSION 1 =====================================================================================================
    def convToBytes1 (self, nBytesArc, nBytesNodeAddress):
        """
        Node scheme:
        - Arc length is defined by nBytesArc
        - Address length is defined by nBytesNodeAddress
682
683
684
685
686
687
688



























742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775







+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
                val = val | nFinalArcMask
            if 1 < (self.arcs[arc].addr - self.addr) < nMaxOffset and self.i != 0:
                val = val | nNextNodeMask
                s += "  {:<20}  {:0>16}  i{:_>10}   +{:_>10}\n".format(lVal[arc], bin(val)[2:], self.arcs[arc].i, self.arcs[arc].addr - self.addr)
            else:
                s += "  {:<20}  {:0>16}  i{:_>10}   #{:_>10}\n".format(lVal[arc], bin(val)[2:], self.arcs[arc].i, self.arcs[arc].addr)
        return s



# Another attempt to sort node arcs

_dCharOrder = {
    # key: previous char, value: dictionary of chars {c: nValue}
    "": {}
}


def addWordToCharDict (sWord):
    cPrevious = ""
    for cChar in sWord:
        if cPrevious not in _dCharOrder:
            _dCharOrder[cPrevious] = {}
        _dCharOrder[cPrevious][cChar] = _dCharOrder[cPrevious].get(cChar, 0) + 1
        cPrevious = cChar


def getCharOrderAfterChar (cChar):
    return _dCharOrder.get(cChar, None)


def displayCharOrder ():
    for key, value in _dCharOrder.items():
        print("[" + key + "]: ", ", ".join([ c+":"+str(n)  for c, n  in  sorted(value.items(), key=lambda t: t[1], reverse=True) ]))