Grammalecte  Diff

Differences From Artifact [1fe1ba5da8]:

To Artifact [8edab9530f]:


37
38
39
40
41
42
43
44

45
46
47
48
49
50
51
37
38
39
40
41
42
43

44
45
46
47
48
49
50
51







-
+







    """DIRECT ACYCLIC WORD GRAPH"""
    # This code is inspired from Steve Hanov’s DAWG, 2011. (http://stevehanov.ca/blog/index.php?id=115)
    # We store suffix/affix codes and tags within the graph after the “real” word.
    # A word is a list of numbers [ c1, c2, c3 . . . cN, iAffix, iTags]
    # Each arc is an index in self.lArcVal, where are stored characters, suffix/affix codes for stemming and tags.
    # Important: As usual, the last node (after ‘iTags’) is tagged final, AND the node after ‘cN’ is ALSO tagged final.

    def __init__ (self, spfSrc, sLangName, cStemming):
    def __init__ (self, spfSrc, sLangCode, sLangName, sDicName, cStemming):
        print("===== Direct Acyclic Word Graph - Minimal Acyclic Finite State Automaton =====")
        cStemming = cStemming.upper()
        if cStemming == "A":
            funcStemmingGen = st.defineAffixCode
        elif cStemming == "S":
            funcStemmingGen = st.defineSuffixCode
        elif cStemming == "N":
98
99
100
101
102
103
104
105
106




107
108
109
110
111
112
113
98
99
100
101
102
103
104


105
106
107
108
109
110
111
112
113
114
115







-
-
+
+
+
+







                        + [ (dAff[aff]+nChar, dAffOccur[aff]) for aff in dAff ] \
                        + [ (dTag[tag]+nChar+nAff, dTagOccur[tag]) for tag in dTag ] )
        #with open(spfSrc[:-8]+".valuesfreq.txt", 'w', encoding='utf-8') as hFreqDst:  # DEBUG
        #    for iKey, nOcc in sorted(dValOccur.items(), key=lambda t: t[1], reverse=True):
        #        hFreqDst.write("{}: {}\n".format(lVal[iKey], nOcc))
        #    hFreqDst.close()
        
        self.sFile = spfSrc
        self.sLang = sLangName
        self.sFileName = spfSrc
        self.sLangCode = sLangCode
        self.sLangName = sLangName
        self.sDicName = sDicName
        self.nEntry = len(lWord)
        self.aPreviousEntry = []
        DawgNode.resetNextId()
        self.oRoot = DawgNode()
        self.lUncheckedNodes = []  # list of nodes that have not been checked for duplication.
        self.lMinimizedNodes = {}  # list of unique nodes that have been checked for duplication.
        self.lSortedNodes = []     # version 2 and 3
404
405
406
407
408
409
410



411

412

413

414
415







416





417
418
419
420

421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
406
407
408
409
410
411
412
413
414
415

416

417
418
419


420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435

436












437
438
439
440
441
442
443







+
+
+
-
+
-
+

+
-
-
+
+
+
+
+
+
+

+
+
+
+
+



-
+
-
-
-
-
-
-
-
-
-
-
-
-







            for oNode in self.lSortedNodes:
                byDic += oNode.convToBytes3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset)

        with open(spfDst, "w", encoding="utf-8", newline="\n") as hDst:
            if bInJSModule:
                hDst.write('// JavaScript\n// Generated data (do not edit)\n\n"use strict";\n\nconst dictionary = ')
            hDst.write(json.dumps({
                            "sHeader": "/pyfsa/",
                            "sLangCode": self.sLangCode,
                            "sLangName": self.sLangName,
                            "sName": "todo",
                            "sDicName": self.sDicName,
                            "nCompressionMethod": nCompressionMethod,
                            "sFileName": self.sFileName,
                            "sDate": str(datetime.datetime.now())[:-7],
                            "nEntries": self.nEntry,
                            "sHeader": "/pyfsa/"+str(nCompressionMethod)+"/",
                            "lArcVal": self.lArcVal,
                            "nChar": self.nChar,
                            "nAff": self.nAff,
                            "nTag": self.nTag,
                            "cStemming": self.cStemming,
                            "dChar": self.dChar,
                            "nNode": self.nNode,
                            "nArc": self.nArc,
                            "nArcVal": self.nArcVal,
                            "lArcVal": self.lArcVal,
                            "nCompressionMethod": nCompressionMethod,
                            "nBytesArc": self.nBytesArc,
                            "nBytesNodeAddress": self.nBytesNodeAddress,
                            "nBytesOffset": self.nBytesOffset
                            # JavaScript is a pile of shit, so Mozilla’s JS parser don’t like file bigger than 4 Mb!
                            # So, if necessary, we use an hexadecimal string, that we will convert later in Firefox’s extension.
                            # https://github.com/mozilla/addons-linter/issues/1361
                            "byDic": byDic.hex()  if bBinaryDictAsHexString  else [ e  for e in byDic ],
                            "sByDic": byDic.hex()  if bBinaryDictAsHexString  else [ e  for e in byDic ],
                            "sLang": self.sLang,
                            "nChar": self.nChar,
                            "nBytesArc": self.nBytesArc,
                            "nBytesNodeAddress": self.nBytesNodeAddress,
                            "nEntries": self.nEntry,
                            "nNode": self.nNode,
                            "nArc": self.nArc,
                            "nAff": self.nAff,
                            "cStemming": self.cStemming,
                            "nTag": self.nTag,
                            "dChar": self.dChar,
                            "nBytesOffset": self.nBytesOffset
                        }, ensure_ascii=False))
            if bInJSModule:
                hDst.write(";\n\nexports.dictionary = dictionary;\n")

    def _writeBinary (self, sPathFile, nCompressionMethod):
        """
        Format of the binary indexable dictionary:
468
469
470
471
472
473
474
475

476
477
478
479
480
481
482
472
473
474
475
476
477
478

479
480
481
482
483
484
485
486







-
+







        if not sPathFile.endswith(".bdic"):
            sPathFile += "."+str(nCompressionMethod)+".bdic"
        with open(sPathFile, 'wb') as hDst:
            # header
            hDst.write("/pyfsa/{}/".format(nCompressionMethod).encode("utf-8"))
            hDst.write(b"\0\0\0\0")
            # infos
            hDst.write("{}/{}/{}/{}/{}/{}/{}/{}/{}".format(self.sLang, self.nChar, self.nBytesArc, self.nBytesNodeAddress, \
            hDst.write("{}/{}/{}/{}/{}/{}/{}/{}/{}".format(self.sLangName, self.nChar, self.nBytesArc, self.nBytesNodeAddress, \
                                                           self.nEntry, self.nNode, self.nArc, self.nAff, self.cStemming).encode("utf-8"))
            hDst.write(b"\0\0\0\0")
            # lArcVal
            hDst.write("\t".join(self.lArcVal).encode("utf-8"))
            hDst.write(b"\0\0\0\0")
            # DAWG: nodes / arcs
            if nCompressionMethod == 1:
516
517
518
519
520
521
522
523
524
525



526
527
528
529
530
531
532
520
521
522
523
524
525
526



527
528
529
530
531
532
533
534
535
536







-
-
-
+
+
+







    def writeResults (self, sPathFile):
        bFileExits = os.path.isfile("_lexicons.res.txt")
        with open("_lexicons.res.txt", "a", encoding='utf-8', newline="\n") as hDst:
            sFormat1 = "{:<12} {:>12} {:>5} {:>8} {:>8} {:>6} {:>8} {:>9} {:>9} {:>15} {:>12} {:>12}\n"
            sFormat2 = "{:<12} {:>12,} {:>5,} {:>8,} {:>8} {:>6,} {:>8,} {:>9,} {:>9,} {:>15,} {:>12,} {:>12,}\n"
            if not bFileExits:
                hDst.write(sFormat1.format("Lexicon", "Entries", "Chars", "Affixes", "Stemming", "Tags", "Values", "Nodes", "Arcs", "Lexicon (Kb)", "Dict (Kb)", "LT Dict (Kb)"))
            hDst.write(sFormat2.format(self.sLang, self.nEntry, self.nChar, self.nAff, self.cStemming + "FX", self.nTag, self.nArcVal, \
                                       self.nNode, self.nArc, os.path.getsize(self.sFile), os.path.getsize(sPathFile), \
                                       os.path.getsize("cfsa/dict/{}.dict".format(self.sLang)) if os.path.isfile("cfsa/dict/{}.dict".format(self.sLang)) else 0))
            hDst.write(sFormat2.format(self.sLangName, self.nEntry, self.nChar, self.nAff, self.cStemming + "FX", self.nTag, self.nArcVal, \
                                       self.nNode, self.nArc, os.path.getsize(self.sFileName), os.path.getsize(sPathFile), \
                                       os.path.getsize("cfsa/dict/{}.dict".format(self.sLangName)) if os.path.isfile("cfsa/dict/{}.dict".format(self.sLangName)) else 0))
            hDst.close()



class DawgNode:
    NextId = 0
    NextPos = 1 # (version 2)