Grammalecte  Diff

Differences From Artifact [ade3c3ba0a]:

To Artifact [70e7a3c81c]:


41
42
43
44
45
46
47
48

49
50
51
52
53
54
55
41
42
43
44
45
46
47

48
49
50
51
52
53
54
55







-
+







    """DIRECT ACYCLIC WORD GRAPH"""
    # This code is inspired by Steve Hanov’s DAWG, 2011. (http://stevehanov.ca/blog/index.php?id=115)
    # We store suffix/affix codes and tags within the graph after the “real” word.
    # A word is a list of numbers [ c1, c2, c3 . . . cN, iAffix, iTags]
    # Each arc is an index into self.lArcVal, which stores characters, suffix/affix codes for stemming, and tags.
    # Important: As usual, the last node (after ‘iTags’) is tagged final, AND the node after ‘cN’ is ALSO tagged final.

    def __init__ (self, src, cStemming, sLangCode, sLangName="", sDicName="", sSelectFilterRegex=""):
    def __init__ (self, src, cStemming, sLangCode, sLangName="", sDicName="", sDescription="", sSelectFilterRegex=""):
        print("===== Direct Acyclic Word Graph - Minimal Acyclic Finite State Automaton =====")
        cStemming = cStemming.upper()
        if cStemming == "A":
            funcStemmingGen = st.defineAffixCode
        elif cStemming == "S":
            funcStemmingGen = st.defineSuffixCode
        elif cStemming == "N":
117
118
119
120
121
122
123

124
125
126
127
128
129
130
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131







+







                        + [ (dAff[aff]+nChar, dAffOccur[aff]) for aff in dAff ] \
                        + [ (dTag[tag]+nChar+nAff, dTagOccur[tag]) for tag in dTag ] )

        self.sFileName = src  if type(src) is str  else "[None]"
        self.sLangCode = sLangCode
        self.sLangName = sLangName
        self.sDicName = sDicName
        self.sDescription = sDescription
        self.nEntry = len(lWord)
        self.aPreviousEntry = []
        DawgNode.resetNextId()
        self.oRoot = DawgNode()
        self.lUncheckedNodes = []  # list of nodes that have not been checked for duplication.
        self.lMinimizedNodes = {}  # list of unique nodes that have been checked for duplication.
        self.lSortedNodes = []     # version 2 and 3
429
430
431
432
433
434
435

436
437
438
439
440
441
442
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444







+







            for oNode in self.lSortedNodes:
                byDic += oNode.convToBytes3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset)
        return {
            "sHeader": "/grammalecte-fsa/",
            "sLangCode": self.sLangCode,
            "sLangName": self.sLangName,
            "sDicName": self.sDicName,
            "sDescription": self.sDescription,
            "sFileName": self.sFileName,
            "sDate": self._getDate(),
            "nEntry": self.nEntry,
            "nChar": self.nChar,
            "nAff": self.nAff,
            "nTag": self.nTag,
            "cStemming": self.cStemming,
511
512
513
514
515
516
517
518
519
520



521
522
523
524
525
526
527
513
514
515
516
517
518
519



520
521
522
523
524
525
526
527
528
529







-
-
-
+
+
+







        if not sPathFile.endswith(".bdic"):
            sPathFile += "."+str(nCompressionMethod)+".bdic"
        with open(sPathFile, 'wb') as hDst:
            # header
            hDst.write("/grammalecte-fsa/{}/".format(nCompressionMethod).encode("utf-8"))
            hDst.write(b"\0\0\0\0")
            # infos
            sInfo = "{}//{}//{}//{}//{}//{}//{}//{}//{}//{}//{}//{}//".format(self.sLangCode, self.sLangName, self.sDicName, self._getDate(), \
                                                                              self.nChar, self.nBytesArc, self.nBytesNodeAddress, \
                                                                              self.nEntry, self.nNode, self.nArc, self.nAff, self.cStemming)
            sInfo = "{}//{}//{}//{}//{}//{}//{}//{}//{}//{}//{}//{}//{}".format(self.sLangCode, self.sLangName, self.sDicName, self.sDescription, self._getDate(), \
                                                                                self.nChar, self.nBytesArc, self.nBytesNodeAddress, \
                                                                                self.nEntry, self.nNode, self.nArc, self.nAff, self.cStemming)
            hDst.write(sInfo.encode("utf-8"))
            hDst.write(b"\0\0\0\0")
            # lArcVal
            hDst.write("\t".join(self.lArcVal).encode("utf-8"))
            hDst.write(b"\0\0\0\0")
            # 2grams
            hDst.write("\t".join(self.a2grams).encode("utf-8"))