Grammalecte  Diff

Differences From Artifact [9c1c8b10c8]:

To Artifact [1fe1ba5da8]:


304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
                sEntry = sWord + "\t" + self.funcStemming(sWord, self.lArcVal[nVal])
                for nMorphVal, _ in oNextNode.arcs.items():
                    if not zPattern or zPattern.search(self.lArcVal[nMorphVal]):
                        yield sEntry + "\t" + self.lArcVal[nMorphVal]


    # BINARY CONVERSION
    def createBinary (self, sPathFile, nMethod, bDebug=False):
        print(" > Write DAWG as an indexable binary dictionary [method: %d]" % nMethod)
        if nMethod == 1:
            self.nBytesArc = ( (self.nArcVal.bit_length() + 2) // 8 ) + 1   # We add 2 bits. See DawgNode.convToBytes1()
            self.nBytesOffset = 0
            self._calcNumBytesNodeAddress()
            self._calcNodesAddress1()
        elif nMethod == 2:
            self.nBytesArc = ( (self.nArcVal.bit_length() + 3) // 8 ) + 1   # We add 3 bits. See DawgNode.convToBytes2()
            self.nBytesOffset = 0
            self._calcNumBytesNodeAddress()
            self._calcNodesAddress2()
        elif nMethod == 3:
            self.nBytesArc = ( (self.nArcVal.bit_length() + 3) // 8 ) + 1   # We add 3 bits. See DawgNode.convToBytes3()
            self.nBytesOffset = 1
            self.nMaxOffset = (2 ** (self.nBytesOffset * 8)) - 1
            self._calcNumBytesNodeAddress()
            self._calcNodesAddress3()
        else:
            print(" # Error: unknown compression method")
        print("   Arc values (chars, affixes and tags): {}  ->  {} bytes".format( self.nArcVal, len("\t".join(self.lArcVal).encode("utf-8")) ))
        print("   Arc size: {} bytes, Address size: {} bytes   ->   {} * {} = {} bytes".format( self.nBytesArc, self.nBytesNodeAddress, \
                                                                                                self.nBytesArc+self.nBytesNodeAddress, self.nArc, \
                                                                                                (self.nBytesArc+self.nBytesNodeAddress)*self.nArc ))
        self._writeBinary(sPathFile, nMethod)
        self._writeAsJSObject(sPathFile, nMethod)
        if bDebug:
            self._writeNodes(sPathFile, nMethod)

    def _calcNumBytesNodeAddress (self):
        "how many bytes needed to store all nodes/arcs in the binary dictionary"
        self.nBytesNodeAddress = 1
        while ((self.nBytesArc + self.nBytesNodeAddress) * self.nArc) > (2 ** (self.nBytesNodeAddress * 8)):
            self.nBytesNodeAddress += 1








|
|
|




|




|











|
|

|







304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
                sEntry = sWord + "\t" + self.funcStemming(sWord, self.lArcVal[nVal])
                for nMorphVal, _ in oNextNode.arcs.items():
                    if not zPattern or zPattern.search(self.lArcVal[nMorphVal]):
                        yield sEntry + "\t" + self.lArcVal[nMorphVal]


    # BINARY CONVERSION
    def createBinary (self, sPathFile, nCompressionMethod, bDebug=False):
        print(" > Write DAWG as an indexable binary dictionary [method: %d]" % nCompressionMethod)
        if nCompressionMethod == 1:
            self.nBytesArc = ( (self.nArcVal.bit_length() + 2) // 8 ) + 1   # We add 2 bits. See DawgNode.convToBytes1()
            self.nBytesOffset = 0
            self._calcNumBytesNodeAddress()
            self._calcNodesAddress1()
        elif nCompressionMethod == 2:
            self.nBytesArc = ( (self.nArcVal.bit_length() + 3) // 8 ) + 1   # We add 3 bits. See DawgNode.convToBytes2()
            self.nBytesOffset = 0
            self._calcNumBytesNodeAddress()
            self._calcNodesAddress2()
        elif nCompressionMethod == 3:
            self.nBytesArc = ( (self.nArcVal.bit_length() + 3) // 8 ) + 1   # We add 3 bits. See DawgNode.convToBytes3()
            self.nBytesOffset = 1
            self.nMaxOffset = (2 ** (self.nBytesOffset * 8)) - 1
            self._calcNumBytesNodeAddress()
            self._calcNodesAddress3()
        else:
            print(" # Error: unknown compression method")
        print("   Arc values (chars, affixes and tags): {}  ->  {} bytes".format( self.nArcVal, len("\t".join(self.lArcVal).encode("utf-8")) ))
        print("   Arc size: {} bytes, Address size: {} bytes   ->   {} * {} = {} bytes".format( self.nBytesArc, self.nBytesNodeAddress, \
                                                                                                self.nBytesArc+self.nBytesNodeAddress, self.nArc, \
                                                                                                (self.nBytesArc+self.nBytesNodeAddress)*self.nArc ))
        self._writeBinary(sPathFile, nCompressionMethod)
        self._writeAsJSObject(sPathFile, nCompressionMethod)
        if bDebug:
            self._writeNodes(sPathFile, nCompressionMethod)

    def _calcNumBytesNodeAddress (self):
        "how many bytes needed to store all nodes/arcs in the binary dictionary"
        self.nBytesNodeAddress = 1
        while ((self.nBytesArc + self.nBytesNodeAddress) * self.nArc) > (2 ** (self.nBytesNodeAddress * 8)):
            self.nBytesNodeAddress += 1

383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
                for oNextNode in self.lSortedNodes[i].arcs.values():
                    if 1 < (oNextNode.addr - self.lSortedNodes[i].addr) < self.nMaxOffset:
                        nSize -= nDiff
                if self.lSortedNodes[i].size != nSize:
                    self.lSortedNodes[i].size = nSize
                    bEnd = False

    def _writeAsJSObject (self, spfDst, nMethod, bInJSModule=False, bBinaryDictAsHexString=True):
        if not spfDst.endswith(".json"):
            spfDst += "."+str(nMethod)+".json"
        byDic = b""
        if nMethod == 1:
            byDic = self.oRoot.convToBytes1(self.nBytesArc, self.nBytesNodeAddress)
            for oNode in self.lMinimizedNodes:
                byDic += oNode.convToBytes1(self.nBytesArc, self.nBytesNodeAddress)
        elif nMethod == 2:
            byDic = self.oRoot.convToBytes2(self.nBytesArc, self.nBytesNodeAddress)
            for oNode in self.lSortedNodes:
                byDic += oNode.convToBytes2(self.nBytesArc, self.nBytesNodeAddress)
        elif nMethod == 3:
            byDic = self.oRoot.convToBytes3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset)
            for oNode in self.lSortedNodes:
                byDic += oNode.convToBytes3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset)

        with open(spfDst, "w", encoding="utf-8", newline="\n") as hDst:
            if bInJSModule:
                hDst.write('// JavaScript\n// Generated data (do not edit)\n\n"use strict";\n\nconst dictionary = ')
            hDst.write(json.dumps({
                            "sName": "todo",
                            "nVersion": nMethod,
                            "sDate": str(datetime.datetime.now())[:-7],
                            "sHeader": "/pyfsa/"+str(nMethod)+"/",
                            "lArcVal": self.lArcVal,
                            "nArcVal": self.nArcVal,
                            # JavaScript is a pile of shit, so Mozilla’s JS parser don’t like file bigger than 4 Mb!
                            # So, if necessary, we use an hexadecimal string, that we will convert later in Firefox’s extension.
                            # https://github.com/mozilla/addons-linter/issues/1361
                            "byDic": byDic.hex()  if bBinaryDictAsHexString  else [ e  for e in byDic ],
                            "sLang": self.sLang,







|

|

|



|



|









|

|







383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
                for oNextNode in self.lSortedNodes[i].arcs.values():
                    if 1 < (oNextNode.addr - self.lSortedNodes[i].addr) < self.nMaxOffset:
                        nSize -= nDiff
                if self.lSortedNodes[i].size != nSize:
                    self.lSortedNodes[i].size = nSize
                    bEnd = False

    def _writeAsJSObject (self, spfDst, nCompressionMethod, bInJSModule=False, bBinaryDictAsHexString=True):
        if not spfDst.endswith(".json"):
            spfDst += "."+str(nCompressionMethod)+".json"
        byDic = b""
        if nCompressionMethod == 1:
            byDic = self.oRoot.convToBytes1(self.nBytesArc, self.nBytesNodeAddress)
            for oNode in self.lMinimizedNodes:
                byDic += oNode.convToBytes1(self.nBytesArc, self.nBytesNodeAddress)
        elif nCompressionMethod == 2:
            byDic = self.oRoot.convToBytes2(self.nBytesArc, self.nBytesNodeAddress)
            for oNode in self.lSortedNodes:
                byDic += oNode.convToBytes2(self.nBytesArc, self.nBytesNodeAddress)
        elif nCompressionMethod == 3:
            byDic = self.oRoot.convToBytes3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset)
            for oNode in self.lSortedNodes:
                byDic += oNode.convToBytes3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset)

        with open(spfDst, "w", encoding="utf-8", newline="\n") as hDst:
            if bInJSModule:
                hDst.write('// JavaScript\n// Generated data (do not edit)\n\n"use strict";\n\nconst dictionary = ')
            hDst.write(json.dumps({
                            "sName": "todo",
                            "nCompressionMethod": nCompressionMethod,
                            "sDate": str(datetime.datetime.now())[:-7],
                            "sHeader": "/pyfsa/"+str(nCompressionMethod)+"/",
                            "lArcVal": self.lArcVal,
                            "nArcVal": self.nArcVal,
                            # JavaScript is a pile of shit, so Mozilla’s JS parser don’t like file bigger than 4 Mb!
                            # So, if necessary, we use an hexadecimal string, that we will convert later in Firefox’s extension.
                            # https://github.com/mozilla/addons-linter/issues/1361
                            "byDic": byDic.hex()  if bBinaryDictAsHexString  else [ e  for e in byDic ],
                            "sLang": self.sLang,
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
                            "nTag": self.nTag,
                            "dChar": self.dChar,
                            "nBytesOffset": self.nBytesOffset
                        }, ensure_ascii=False))
            if bInJSModule:
                hDst.write(";\n\nexports.dictionary = dictionary;\n")

    def _writeBinary (self, sPathFile, nMethod):
        """
        Format of the binary indexable dictionary:
        Each section is separated with 4 bytes of \0
        
        - Section Header:
            /pyfsa/[version]
                * version is an ASCII string
        
        - Section Informations:
            /[tag_lang]
            /[number of chars]
            /[number of bytes for each arc]
            /[number of bytes for each address node]
            /[number of entries]







|





|
|







430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
                            "nTag": self.nTag,
                            "dChar": self.dChar,
                            "nBytesOffset": self.nBytesOffset
                        }, ensure_ascii=False))
            if bInJSModule:
                hDst.write(";\n\nexports.dictionary = dictionary;\n")

    def _writeBinary (self, sPathFile, nCompressionMethod):
        """
        Format of the binary indexable dictionary:
        Each section is separated with 4 bytes of \0
        
        - Section Header:
            /pyfsa/[compression method]
                * compression method is an ASCII string
        
        - Section Informations:
            /[tag_lang]
            /[number of chars]
            /[number of bytes for each arc]
            /[number of bytes for each address node]
            /[number of entries]
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
                * a list of strings encoded in binary from utf-8, each value separated with a tabulation
        
        - Section Word Graph (nodes / arcs)
                * A list of nodes which are a list of arcs with an address of the next node.
                  See DawgNode.convToBytes() for details.
        """
        if not sPathFile.endswith(".bdic"):
            sPathFile += "."+str(nMethod)+".bdic"
        with open(sPathFile, 'wb') as hDst:
            # header
            hDst.write("/pyfsa/{}/".format(nMethod).encode("utf-8"))
            hDst.write(b"\0\0\0\0")
            # infos
            hDst.write("{}/{}/{}/{}/{}/{}/{}/{}/{}".format(self.sLang, self.nChar, self.nBytesArc, self.nBytesNodeAddress, \
                                                           self.nEntry, self.nNode, self.nArc, self.nAff, self.cStemming).encode("utf-8"))
            hDst.write(b"\0\0\0\0")
            # lArcVal
            hDst.write("\t".join(self.lArcVal).encode("utf-8"))
            hDst.write(b"\0\0\0\0")
            # DAWG: nodes / arcs
            if nMethod == 1:
                hDst.write(self.oRoot.convToBytes1(self.nBytesArc, self.nBytesNodeAddress))
                for oNode in self.lMinimizedNodes:
                    hDst.write(oNode.convToBytes1(self.nBytesArc, self.nBytesNodeAddress))
            elif nMethod == 2:
                hDst.write(self.oRoot.convToBytes2(self.nBytesArc, self.nBytesNodeAddress))
                for oNode in self.lSortedNodes:
                    hDst.write(oNode.convToBytes2(self.nBytesArc, self.nBytesNodeAddress))
            elif nMethod == 3:
                hDst.write(self.oRoot.convToBytes3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset))
                for oNode in self.lSortedNodes:
                    hDst.write(oNode.convToBytes3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset))
            hDst.close()

    def _writeNodes (self, sPathFile, nMethod):
        "for debugging only"
        print(" > Write nodes")
        with open(sPathFile+".nodes."+str(nMethod)+".txt", 'w', encoding='utf-8', newline="\n") as hDst:
            if nMethod == 1:
                hDst.write(self.oRoot.getTxtRepr1(self.nBytesArc, self.nBytesNodeAddress, self.lArcVal)+"\n")
                #hDst.write( ''.join( [ "%02X " %  z  for z in self.oRoot.convToBytes1(self.nBytesArc, self.nBytesNodeAddress) ] ).strip() )
                for oNode in self.lMinimizedNodes:
                    hDst.write(oNode.getTxtRepr1(self.nBytesArc, self.nBytesNodeAddress, self.lArcVal)+"\n")
            if nMethod == 2:
                hDst.write(self.oRoot.getTxtRepr2(self.nBytesArc, self.nBytesNodeAddress, self.lArcVal)+"\n")
                for oNode in self.lSortedNodes:
                    hDst.write(oNode.getTxtRepr2(self.nBytesArc, self.nBytesNodeAddress, self.lArcVal)+"\n")
            if nMethod == 3:
                hDst.write(self.oRoot.getTxtRepr3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset, self.lArcVal)+"\n")
                #hDst.write( ''.join( [ "%02X " %  z  for z in self.oRoot.convToBytes3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset) ] ).strip() )
                for oNode in self.lSortedNodes:
                    hDst.write(oNode.getTxtRepr3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset, self.lArcVal)+"\n")
            hDst.close()
    
    def writeResults (self, sPathFile):







|


|









|



|



|





|


|
|




|



|







462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
                * a list of strings encoded in binary from utf-8, each value separated with a tabulation
        
        - Section Word Graph (nodes / arcs)
                * A list of nodes which are a list of arcs with an address of the next node.
                  See DawgNode.convToBytes() for details.
        """
        if not sPathFile.endswith(".bdic"):
            sPathFile += "."+str(nCompressionMethod)+".bdic"
        with open(sPathFile, 'wb') as hDst:
            # header
            hDst.write("/pyfsa/{}/".format(nCompressionMethod).encode("utf-8"))
            hDst.write(b"\0\0\0\0")
            # infos
            hDst.write("{}/{}/{}/{}/{}/{}/{}/{}/{}".format(self.sLang, self.nChar, self.nBytesArc, self.nBytesNodeAddress, \
                                                           self.nEntry, self.nNode, self.nArc, self.nAff, self.cStemming).encode("utf-8"))
            hDst.write(b"\0\0\0\0")
            # lArcVal
            hDst.write("\t".join(self.lArcVal).encode("utf-8"))
            hDst.write(b"\0\0\0\0")
            # DAWG: nodes / arcs
            if nCompressionMethod == 1:
                hDst.write(self.oRoot.convToBytes1(self.nBytesArc, self.nBytesNodeAddress))
                for oNode in self.lMinimizedNodes:
                    hDst.write(oNode.convToBytes1(self.nBytesArc, self.nBytesNodeAddress))
            elif nCompressionMethod == 2:
                hDst.write(self.oRoot.convToBytes2(self.nBytesArc, self.nBytesNodeAddress))
                for oNode in self.lSortedNodes:
                    hDst.write(oNode.convToBytes2(self.nBytesArc, self.nBytesNodeAddress))
            elif nCompressionMethod == 3:
                hDst.write(self.oRoot.convToBytes3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset))
                for oNode in self.lSortedNodes:
                    hDst.write(oNode.convToBytes3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset))
            hDst.close()

    def _writeNodes (self, sPathFile, nCompressionMethod):
        "for debugging only"
        print(" > Write nodes")
        with open(sPathFile+".nodes."+str(nCompressionMethod)+".txt", 'w', encoding='utf-8', newline="\n") as hDst:
            if nCompressionMethod == 1:
                hDst.write(self.oRoot.getTxtRepr1(self.nBytesArc, self.nBytesNodeAddress, self.lArcVal)+"\n")
                #hDst.write( ''.join( [ "%02X " %  z  for z in self.oRoot.convToBytes1(self.nBytesArc, self.nBytesNodeAddress) ] ).strip() )
                for oNode in self.lMinimizedNodes:
                    hDst.write(oNode.getTxtRepr1(self.nBytesArc, self.nBytesNodeAddress, self.lArcVal)+"\n")
            if nCompressionMethod == 2:
                hDst.write(self.oRoot.getTxtRepr2(self.nBytesArc, self.nBytesNodeAddress, self.lArcVal)+"\n")
                for oNode in self.lSortedNodes:
                    hDst.write(oNode.getTxtRepr2(self.nBytesArc, self.nBytesNodeAddress, self.lArcVal)+"\n")
            if nCompressionMethod == 3:
                hDst.write(self.oRoot.getTxtRepr3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset, self.lArcVal)+"\n")
                #hDst.write( ''.join( [ "%02X " %  z  for z in self.oRoot.convToBytes3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset) ] ).strip() )
                for oNode in self.lSortedNodes:
                    hDst.write(oNode.getTxtRepr3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset, self.lArcVal)+"\n")
            hDst.close()
    
    def writeResults (self, sPathFile):