Grammalecte  Check-in [fe361b4e8d]

Overview
Comment:[core] code cleaning
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | trunk | core
Files: files | file ages | folders
SHA3-256: fe361b4e8d6378fe8c08482252cb7d6c157a1c26b9ccdedf0bba76c5d9f95b41
User & Date: olr on 2017-06-29 19:40:12
Other Links: manifest | tags
Context
2017-06-30
08:10
[fr] faux positifs check-in: c39ac2a56a user: olr tags: trunk, fr
2017-06-29
19:40
[core] code cleaning check-in: fe361b4e8d user: olr tags: trunk, core
13:20
[core] dawg: attempt to speed up the dictionary lookup by reordering arcs (pointless ATM) check-in: b038740434 user: olr tags: trunk, core
Changes

Modified gc_core/py/dawg.py from [d8c1a249fb] to [eb4c8506bd].

145
146
147
148
149
150
151
152

153
154
155
156
157




158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178


179
180
181
182
183
184
185
186

187
188
189
190
191


192
193
194
195
196
197



198
199

200
201

202
203

204
205
206
207


208
209

210
211
212


213
214
215
216


217
218
219
220
221

222
223
224
225
226
227
228
229
230
231



232
233

234
235
236
237


238
239
240

241
242
243
244
245


246
247
248
249
250


251
252
253
254
255

256
257
258
259
260
261
262
263

264
265
266
267
268
269
270
271

272
273
274
275
276

277
278
279
280
281
282
283
284

285
286
287
288
289
290
291
145
146
147
148
149
150
151

152
153




154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176


177
178
179
180
181
182
183
184
185

186
187
188
189


190
191
192
193
194



195
196
197
198

199
200

201
202

203
204
205


206
207
208

209
210


211
212
213
214


215
216
217
218
219
220

221
222
223
224
225
226
227
228



229
230
231
232

233
234
235


236
237
238
239

240
241
242
243


244
245
246
247
248


249
250
251
252
253
254

255
256
257
258
259
260
261
262

263
264
265
266
267
268
269
270

271
272
273
274
275

276
277
278
279
280
281
282
283

284
285
286
287
288
289
290
291







-
+

-
-
-
-
+
+
+
+



















-
-
+
+







-
+



-
-
+
+



-
-
-
+
+
+

-
+

-
+

-
+


-
-
+
+

-
+

-
-
+
+


-
-
+
+




-
+







-
-
-
+
+
+

-
+


-
-
+
+


-
+



-
-
+
+



-
-
+
+




-
+







-
+







-
+




-
+







-
+







        #    for iKey, nOcc in sorted(dValOccur.items(), key=lambda t: t[1], reverse=True):
        #        hFreqDst.write("{}: {}\n".format(lVal[iKey], nOcc))
        #    hFreqDst.close()
        
        self.sFile = spfSrc
        self.sLang = sLangName
        self.nEntry = len(lWord)
        self.previousWord = []
        self.aPreviousEntry = []
        DawgNode.resetNextId()
        self.root = DawgNode()
        self.uncheckedNodes = []  # list of nodes that have not been checked for duplication.
        self.minimizedNodes = {}  # list of unique nodes that have been checked for duplication.
        self.sortedNodes = []     # version 2 and 3
        self.oRoot = DawgNode()
        self.lUncheckedNodes = []  # list of nodes that have not been checked for duplication.
        self.lMinimizedNodes = {}  # list of unique nodes that have been checked for duplication.
        self.lSortedNodes = []     # version 2 and 3
        self.nNode = 0
        self.nArc = 0
        self.dChar = dChar
        self.nChar = len(dChar)
        self.nAff = nAff
        self.lArcVal = lVal
        self.nArcVal = len(lVal)
        self.nTag = self.nArcVal - self.nChar - nAff
        self.cStemming = cStemming
        if cStemming == "A":
            self.funcStemming = st.changeWordWithAffixCode
        elif cStemming == "S":    
            self.funcStemming = st.changeWordWithSuffixCode
        else:
            self.funcStemming = st.noStemming
        
        # build
        lWord.sort()
        oProgBar = ProgressBar(0, len(lWord))
        for word in lWord:
            self.insert(word)
        for aEntry in lWord:
            self.insert(aEntry)
            oProgBar.increment(1)
        oProgBar.done()
        self.finish()
        self.countNodes()
        self.countArcs()
        self.sortNodes()
        self.sortNodeArcs(dValOccur)
        #self.sortNodeArcs2 (self.root, "")
        #self.sortNodeArcs2 (self.oRoot, "")
        self.displayInfo()

    # BUILD DAWG
    def insert (self, word):
        if word < self.previousWord:
    def insert (self, aEntry):
        if aEntry < self.aPreviousEntry:
            sys.exit("# Error: Words must be inserted in alphabetical order.")
        
        # find common prefix between word and previous word
        commonPrefix = 0
        for i in range(min(len(word), len(self.previousWord))):
            if word[i] != self.previousWord[i]:
        nCommonPrefix = 0
        for i in range(min(len(aEntry), len(self.aPreviousEntry))):
            if aEntry[i] != self.aPreviousEntry[i]:
                break
            commonPrefix += 1
            nCommonPrefix += 1

        # Check the uncheckedNodes for redundant nodes, proceeding from last
        # Check the lUncheckedNodes for redundant nodes, proceeding from last
        # one down to the common prefix size. Then truncate the list at that point.
        self._minimize(commonPrefix)
        self._minimize(nCommonPrefix)

        # add the suffix, starting from the correct node mid-way through the graph
        if len(self.uncheckedNodes) == 0:
            oNode = self.root
        if len(self.lUncheckedNodes) == 0:
            oNode = self.oRoot
        else:
            oNode = self.uncheckedNodes[-1][2]
            oNode = self.lUncheckedNodes[-1][2]

        iChar = commonPrefix
        for c in word[commonPrefix:]:
        iChar = nCommonPrefix
        for c in aEntry[nCommonPrefix:]:
            oNextNode = DawgNode()
            oNode.arcs[c] = oNextNode
            self.uncheckedNodes.append((oNode, c, oNextNode))
            if iChar == (len(word) - 2): 
            self.lUncheckedNodes.append((oNode, c, oNextNode))
            if iChar == (len(aEntry) - 2): 
                oNode.final = True
            iChar += 1
            oNode = oNextNode
        oNode.final = True
        self.previousWord = word
        self.aPreviousEntry = aEntry

    def finish (self):
        "minimize unchecked nodes"
        self._minimize(0)

    def _minimize (self, downTo):
        # proceed from the leaf up to a certain point
        for i in range( len(self.uncheckedNodes)-1, downTo-1, -1 ):
            (parent, char, child) = self.uncheckedNodes[i]
            if child in self.minimizedNodes:
        for i in range( len(self.lUncheckedNodes)-1, downTo-1, -1 ):
            oNode, char, oChildNode = self.lUncheckedNodes[i]
            if oChildNode in self.lMinimizedNodes:
                # replace the child with the previously encountered one
                parent.arcs[char] = self.minimizedNodes[child]
                oNode.arcs[char] = self.lMinimizedNodes[oChildNode]
            else:
                # add the state to the minimized nodes.
                self.minimizedNodes[child] = child
            self.uncheckedNodes.pop()
                self.lMinimizedNodes[oChildNode] = oChildNode
            self.lUncheckedNodes.pop()

    def countNodes (self):
        self.nNode = len(self.minimizedNodes)
        self.nNode = len(self.lMinimizedNodes)

    def countArcs (self):
        self.nArc = 0
        for node in self.minimizedNodes:
            self.nArc += len(node.arcs)
        for oNode in self.lMinimizedNodes:
            self.nArc += len(oNode.arcs)
    
    def sortNodeArcs (self, dValOccur):
        print(" > Sort node arcs")
        self.root.sortArcs(dValOccur)
        for oNode in self.minimizedNodes:
        self.oRoot.sortArcs(dValOccur)
        for oNode in self.lMinimizedNodes:
            oNode.sortArcs(dValOccur)
    
    def sortNodeArcs2 (self, oNode, cPrevious=""):
        # recursive function
        dCharOccur = getCharOrderForPreviousChar(cPrevious)
        dCharOccur = getCharOrderAfterChar(cPrevious)
        if dCharOccur:
            oNode.sortArcs2(dCharOccur, self.lArcVal)
        for nArcVal, oNextNode in oNode.arcs.items():
            self.sortNodeArcs2(oNextNode, self.lArcVal[nArcVal])

    def sortNodes (self):
        print(" > Sort nodes")
        for oNode in self.root.arcs.values():
        for oNode in self.oRoot.arcs.values():
            self._parseNodes(oNode)
    
    def _parseNodes (self, oNode):
        # Warning: recursive method
        if oNode.pos > 0:
            return
        oNode.setPos()
        self.sortedNodes.append(oNode)
        self.lSortedNodes.append(oNode)
        for oNextNode in oNode.arcs.values():
             self._parseNodes(oNextNode)
        
    def lookup (self, sWord):
        oNode = self.root
        oNode = self.oRoot
        for c in sWord:
            if self.dChar.get(c, '') not in oNode.arcs:
                return False
            oNode = oNode.arcs[self.dChar[c]]
        return oNode.final

    def morph (self, sWord):
        oNode = self.root
        oNode = self.oRoot
        for c in sWord:
            if self.dChar.get(c, '') not in oNode.arcs:
                return ''
            oNode = oNode.arcs[self.dChar[c]]
        if oNode.final:
            s = "* "
            for arc in oNode.arcs:
306
307
308
309
310
311
312
313

314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334

335
336
337
338

339
340
341
342

343
344
345
346
347
348
349
306
307
308
309
310
311
312

313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333

334
335
336
337

338
339
340
341

342
343
344
345
346
347
348
349







-
+




















-
+



-
+



-
+







        print(" * {:<12} {:>16,}".format("Arc values:", self.nArcVal))
        print(" * {:<12} {:>16,}".format("Nodes:", self.nNode))
        print(" * {:<12} {:>16,}".format("Arcs:", self.nArc))
        print(" * {:<12} {:>16}".format("Stemming:", self.cStemming + "FX"))

    def getArcStats (self):
        d = {}
        for oNode in self.minimizedNodes:
        for oNode in self.lMinimizedNodes:
            n = len(oNode.arcs)
            d[n] = d.get(n, 0) + 1
        s = " * Nodes:\n"
        for n in d:
            s = s + " {:>9} nodes have {:>3} arcs\n".format(d[n], n)
        return s

    def writeInfo (self, sPathFile):
        print(" > Write informations")
        with open(sPathFile, 'w', encoding='utf-8', newline="\n") as hDst:
            hDst.write(self.getArcStats())
            hDst.write("\n * Values:\n")
            for i, s in enumerate(self.lArcVal):
                hDst.write(" {:>6}. {}\n".format(i, s))
            hDst.close()

    # BINARY CONVERSION
    def createBinary (self, sPathFile, nMethod, bDebug=False):
        print(" > Write DAWG as an indexable binary dictionary [method: %d]" % nMethod)
        if nMethod == 1:
            self.nBytesArc = ( ( (self.nArcVal).bit_length() + 2 ) // 8 ) + 1   # We add 2 bits. See DawgNode.convToBytes1()
            self.nBytesArc = ( (self.nArcVal.bit_length() + 2) // 8 ) + 1   # We add 2 bits. See DawgNode.convToBytes1()
            self._calcNumBytesNodeAddress()
            self._calcNodesAddress1()
        elif nMethod == 2:
            self.nBytesArc = ( ( (self.nArcVal).bit_length() + 3 ) // 8 ) + 1   # We add 3 bits. See DawgNode.convToBytes2()
            self.nBytesArc = ( (self.nArcVal.bit_length() + 3) // 8 ) + 1   # We add 3 bits. See DawgNode.convToBytes2()
            self._calcNumBytesNodeAddress()
            self._calcNodesAddress2()
        elif nMethod == 3:
            self.nBytesArc = ( ( (self.nArcVal).bit_length() + 3 ) // 8 ) + 1   # We add 3 bits. See DawgNode.convToBytes3()
            self.nBytesArc = ( (self.nArcVal.bit_length() + 3) // 8 ) + 1   # We add 3 bits. See DawgNode.convToBytes3()
            self.nBytesOffset = 1
            self.nMaxOffset = (2 ** (self.nBytesOffset * 8)) - 1
            self._calcNumBytesNodeAddress()
            self._calcNodesAddress3()
        else:
            print(" # Error: unknown compression method")
        print("   Arc values (chars, affixes and tags): {}  ->  {} bytes".format( self.nArcVal, len("\t".join(self.lArcVal).encode("utf-8")) ))
358
359
360
361
362
363
364
365
366


367
368
369
370
371
372
373


374
375
376
377
378
379
380
381
382
383
384
385


386
387
388
389
390
391
392
393
394


395
396
397
398
399
400
401



402
403
404


405
406
407
408
409
410
411
358
359
360
361
362
363
364


365
366
367
368
369
370
371


372
373
374
375
376
377
378
379
380
381
382
383


384
385
386
387
388
389
390
391
392


393
394
395
396
397
398



399
400
401
402


403
404
405
406
407
408
409
410
411







-
-
+
+





-
-
+
+










-
-
+
+







-
-
+
+




-
-
-
+
+
+

-
-
+
+







        "how many bytes needed to store all nodes/arcs in the binary dictionary"
        self.nBytesNodeAddress = 1
        while ((self.nBytesArc + self.nBytesNodeAddress) * self.nArc) > (2 ** (self.nBytesNodeAddress * 8)):
            self.nBytesNodeAddress += 1

    def _calcNodesAddress1 (self):
        nBytesNode = self.nBytesArc + self.nBytesNodeAddress
        iAddr = len(self.root.arcs) * nBytesNode
        for oNode in self.minimizedNodes:
        iAddr = len(self.oRoot.arcs) * nBytesNode
        for oNode in self.lMinimizedNodes:
            oNode.addr = iAddr
            iAddr += max(len(oNode.arcs), 1) * nBytesNode

    def _calcNodesAddress2 (self):
        nBytesNode = self.nBytesArc + self.nBytesNodeAddress
        iAddr = len(self.root.arcs) * nBytesNode
        for oNode in self.sortedNodes:
        iAddr = len(self.oRoot.arcs) * nBytesNode
        for oNode in self.lSortedNodes:
            oNode.addr = iAddr
            iAddr += max(len(oNode.arcs), 1) * nBytesNode
            for oNextNode in oNode.arcs.values():
                if (oNode.pos + 1) == oNextNode.pos:
                    iAddr -= self.nBytesNodeAddress
                    #break

    def _calcNodesAddress3 (self):
        nBytesNode = self.nBytesArc + self.nBytesNodeAddress
        # theorical nodes size if only addresses and no offset
        self.root.size = len(self.root.arcs) * nBytesNode
        for oNode in self.sortedNodes:
        self.oRoot.size = len(self.oRoot.arcs) * nBytesNode
        for oNode in self.lSortedNodes:
            oNode.size = max(len(oNode.arcs), 1) * nBytesNode
        # rewind and calculate dropdown from the end, several times
        nDiff = self.nBytesNodeAddress - self.nBytesOffset
        bEnd = False
        while not bEnd:
            bEnd = True
            # recalculate addresses
            iAddr = self.root.size
            for oNode in self.sortedNodes:
            iAddr = self.oRoot.size
            for oNode in self.lSortedNodes:
                oNode.addr = iAddr
                iAddr += oNode.size
            # rewind and calculate dropdown from the end, several times
            for i in range(self.nNode-1, -1, -1):
                nSize = max(len(self.sortedNodes[i].arcs), 1) * nBytesNode
                for oNextNode in self.sortedNodes[i].arcs.values():
                    if 1 < (oNextNode.addr - self.sortedNodes[i].addr) < self.nMaxOffset:
                nSize = max(len(self.lSortedNodes[i].arcs), 1) * nBytesNode
                for oNextNode in self.lSortedNodes[i].arcs.values():
                    if 1 < (oNextNode.addr - self.lSortedNodes[i].addr) < self.nMaxOffset:
                        nSize -= nDiff
                if self.sortedNodes[i].size != nSize:
                    self.sortedNodes[i].size = nSize
                if self.lSortedNodes[i].size != nSize:
                    self.lSortedNodes[i].size = nSize
                    bEnd = False

    def _writeBinary (self, sPathFile, nMethod):
        """
        Format of the binary indexable dictionary:
        Each section is separated with 4 bytes of \0
        
446
447
448
449
450
451
452
453
454


455
456
457
458


459
460
461
462


463
464
465
466
467
468
469
470
471
472
473



474
475
476
477


478
479
480
481
482



483
484
485
486
487
488
489
446
447
448
449
450
451
452


453
454
455
456


457
458
459
460


461
462
463
464
465
466
467
468
469
470



471
472
473
474
475


476
477
478
479



480
481
482
483
484
485
486
487
488
489







-
-
+
+


-
-
+
+


-
-
+
+








-
-
-
+
+
+


-
-
+
+


-
-
-
+
+
+







                                                           self.nEntry, self.nNode, self.nArc, self.nAff, self.cStemming).encode("utf-8"))
            hDst.write(b"\0\0\0\0")
            # lArcVal
            hDst.write("\t".join(self.lArcVal).encode("utf-8"))
            hDst.write(b"\0\0\0\0")
            # DAWG: nodes / arcs
            if nMethod == 1:
                hDst.write(self.root.convToBytes1(self.nBytesArc, self.nBytesNodeAddress))
                for oNode in self.minimizedNodes:
                hDst.write(self.oRoot.convToBytes1(self.nBytesArc, self.nBytesNodeAddress))
                for oNode in self.lMinimizedNodes:
                    hDst.write(oNode.convToBytes1(self.nBytesArc, self.nBytesNodeAddress))
            elif nMethod == 2:
                hDst.write(self.root.convToBytes2(self.nBytesArc, self.nBytesNodeAddress))
                for oNode in self.sortedNodes:
                hDst.write(self.oRoot.convToBytes2(self.nBytesArc, self.nBytesNodeAddress))
                for oNode in self.lSortedNodes:
                    hDst.write(oNode.convToBytes2(self.nBytesArc, self.nBytesNodeAddress))
            elif nMethod == 3:
                hDst.write(self.root.convToBytes3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset))
                for oNode in self.sortedNodes:
                hDst.write(self.oRoot.convToBytes3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset))
                for oNode in self.lSortedNodes:
                    hDst.write(oNode.convToBytes3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset))
            hDst.close()

    def _writeNodes (self, sPathFile, nMethod):
        "for debugging only"
        print(" > Write nodes")
        with open(sPathFile+".nodes."+str(nMethod)+".txt", 'w', encoding='utf-8', newline="\n") as hDst:
            if nMethod == 1:
                hDst.write(self.root.getTxtRepr1(self.nBytesArc, self.nBytesNodeAddress, self.lArcVal)+"\n")
                #hDst.write( ''.join( [ "%02X " %  z  for z in self.root.convToBytes1(self.nBytesArc, self.nBytesNodeAddress) ] ).strip() )
                for oNode in self.minimizedNodes:
                hDst.write(self.oRoot.getTxtRepr1(self.nBytesArc, self.nBytesNodeAddress, self.lArcVal)+"\n")
                #hDst.write( ''.join( [ "%02X " %  z  for z in self.oRoot.convToBytes1(self.nBytesArc, self.nBytesNodeAddress) ] ).strip() )
                for oNode in self.lMinimizedNodes:
                    hDst.write(oNode.getTxtRepr1(self.nBytesArc, self.nBytesNodeAddress, self.lArcVal)+"\n")
            if nMethod == 2:
                hDst.write(self.root.getTxtRepr2(self.nBytesArc, self.nBytesNodeAddress, self.lArcVal)+"\n")
                for oNode in self.sortedNodes:
                hDst.write(self.oRoot.getTxtRepr2(self.nBytesArc, self.nBytesNodeAddress, self.lArcVal)+"\n")
                for oNode in self.lSortedNodes:
                    hDst.write(oNode.getTxtRepr2(self.nBytesArc, self.nBytesNodeAddress, self.lArcVal)+"\n")
            if nMethod == 3:
                hDst.write(self.root.getTxtRepr3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset, self.lArcVal)+"\n")
                #hDst.write( ''.join( [ "%02X " %  z  for z in self.root.convToBytes3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset) ] ).strip() )
                for oNode in self.sortedNodes:
                hDst.write(self.oRoot.getTxtRepr3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset, self.lArcVal)+"\n")
                #hDst.write( ''.join( [ "%02X " %  z  for z in self.oRoot.convToBytes3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset) ] ).strip() )
                for oNode in self.lSortedNodes:
                    hDst.write(oNode.getTxtRepr3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset, self.lArcVal)+"\n")
            hDst.close()
    
    def writeResults (self, sPathFile):
        bFileExits = os.path.isfile("_lexicons.res.txt")
        with open("_lexicons.res.txt", "a", encoding='utf-8', newline="\n") as hDst:
            sFormat1 = "{:<12} {:>12} {:>5} {:>8} {:>8} {:>6} {:>8} {:>9} {:>9} {:>15} {:>12} {:>12}\n"
762
763
764
765
766
767
768
769

770
771
772
773
774
775
762
763
764
765
766
767
768

769
770
771
772
773
774
775







-
+






    for cChar in sWord:
        if cPrevious not in _dCharOrder:
            _dCharOrder[cPrevious] = {}
        _dCharOrder[cPrevious][cChar] = _dCharOrder[cPrevious].get(cChar, 0) + 1
        cPrevious = cChar


def getCharOrderForPreviousChar (cChar):
def getCharOrderAfterChar (cChar):
    return _dCharOrder.get(cChar, None)


def displayCharOrder ():
    for key, value in _dCharOrder.items():
        print("[" + key + "]: ", ", ".join([ c+":"+str(n)  for c, n  in  sorted(value.items(), key=lambda t: t[1], reverse=True) ]))