Grammalecte  Diff

Differences From Artifact [257e064164]:

To Artifact [ade3c3ba0a]:


59
60
61
62
63
64
65


66
67
68
69
70
71
72
73
74
75
76
77
78
79

80
81
82
83
84
85
86

        aEntry = set()
        lChar = ['']; dChar = {}; nChar = 1; dCharOccur = {}
        lAff  = [];   dAff  = {}; nAff  = 0; dAffOccur = {}
        lTag  = [];   dTag  = {}; nTag  = 0; dTagOccur = {}
        nErr = 0



        try:
            zFilter = re.compile(sSelectFilterRegex)  if sSelectFilterRegex  else None
        except:
            print(" # Error. Wrong filter regex. Filter ignored.")
            traceback.print_exc()
            zFilter = None

        # read lexicon
        if type(src) is str:
            iterable = readFile(src)
        else:
            iterable = src
        for sFlex, sStem, sTag in iterable:
            if not zFilter or zFilter.search(sTag):

                addWordToCharDict(sFlex)
                # chars
                for c in sFlex:
                    if c not in dChar:
                        dChar[c] = nChar
                        lChar.append(c)
                        nChar += 1







>
>














>







59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89

        aEntry = set()
        lChar = ['']; dChar = {}; nChar = 1; dCharOccur = {}
        lAff  = [];   dAff  = {}; nAff  = 0; dAffOccur = {}
        lTag  = [];   dTag  = {}; nTag  = 0; dTagOccur = {}
        nErr = 0

        self.a2grams = set()

        try:
            zFilter = re.compile(sSelectFilterRegex)  if sSelectFilterRegex  else None
        except:
            print(" # Error. Wrong filter regex. Filter ignored.")
            traceback.print_exc()
            zFilter = None

        # read lexicon
        if type(src) is str:
            iterable = readFile(src)
        else:
            iterable = src
        for sFlex, sStem, sTag in iterable:
            if not zFilter or zFilter.search(sTag):
                self.a2grams.update(st.getNgrams(sFlex))
                addWordToCharDict(sFlex)
                # chars
                for c in sFlex:
                    if c not in dChar:
                        dChar[c] = nChar
                        lChar.append(c)
                        nChar += 1
281
282
283
284
285
286
287

288
289
290
291
292
293
294
        print(" * {:<12} {:>16,}".format("Entries:", self.nEntry))
        print(" * {:<12} {:>16,}".format("Characters:", self.nChar))
        print(" * {:<12} {:>16,}".format("Affixes:", self.nAff))
        print(" * {:<12} {:>16,}".format("Tags:", self.nTag))
        print(" * {:<12} {:>16,}".format("Arc values:", self.nArcVal))
        print(" * {:<12} {:>16,}".format("Nodes:", self.nNode))
        print(" * {:<12} {:>16,}".format("Arcs:", self.nArc))

        print(" * {:<12} {:>16}".format("Stemming:", self.cStemming + "FX"))

    def getArcStats (self):
        "return a string with statistics about nodes and arcs"
        d = {}
        for oNode in self.lMinimizedNodes:
            n = len(oNode.arcs)







>







284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
        print(" * {:<12} {:>16,}".format("Entries:", self.nEntry))
        print(" * {:<12} {:>16,}".format("Characters:", self.nChar))
        print(" * {:<12} {:>16,}".format("Affixes:", self.nAff))
        print(" * {:<12} {:>16,}".format("Tags:", self.nTag))
        print(" * {:<12} {:>16,}".format("Arc values:", self.nArcVal))
        print(" * {:<12} {:>16,}".format("Nodes:", self.nNode))
        print(" * {:<12} {:>16,}".format("Arcs:", self.nArc))
        print(" * {:<12} {:>16,}".format("2grams:", len(self.a2grams)))
        print(" * {:<12} {:>16}".format("Stemming:", self.cStemming + "FX"))

    def getArcStats (self):
        "return a string with statistics about nodes and arcs"
        d = {}
        for oNode in self.lMinimizedNodes:
            n = len(oNode.arcs)
444
445
446
447
448
449
450
451

452
453
454
455
456
457
458
            "nCompressionMethod": nCompressionMethod,
            "nBytesArc": self.nBytesArc,
            "nBytesNodeAddress": self.nBytesNodeAddress,
            "nBytesOffset": self.nBytesOffset,
            # Mozilla’s JS parser don’t like file bigger than 4 Mb!
            # So, if necessary, we use an hexadecimal string, that we will convert later in Firefox’s extension.
            # https://github.com/mozilla/addons-linter/issues/1361
            "sByDic": byDic.hex()  if bBinaryDictAsHexString  else [ e  for e in byDic ]

        }

    def writeAsJSObject (self, spfDst, nCompressionMethod, bInJSModule=False, bBinaryDictAsHexString=True):
        "write a file (JSON or JS module) with all the necessary data"
        if not spfDst.endswith(".json"):
            spfDst += "."+str(nCompressionMethod)+".json"
        with open(spfDst, "w", encoding="utf-8", newline="\n") as hDst:







|
>







448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
            "nCompressionMethod": nCompressionMethod,
            "nBytesArc": self.nBytesArc,
            "nBytesNodeAddress": self.nBytesNodeAddress,
            "nBytesOffset": self.nBytesOffset,
            # Mozilla’s JS parser don’t like file bigger than 4 Mb!
            # So, if necessary, we use an hexadecimal string, that we will convert later in Firefox’s extension.
            # https://github.com/mozilla/addons-linter/issues/1361
            "sByDic": byDic.hex()  if bBinaryDictAsHexString  else [ e  for e in byDic ],
            "l2grams": list(self.a2grams)
        }

    def writeAsJSObject (self, spfDst, nCompressionMethod, bInJSModule=False, bBinaryDictAsHexString=True):
        "write a file (JSON or JS module) with all the necessary data"
        if not spfDst.endswith(".json"):
            spfDst += "."+str(nCompressionMethod)+".json"
        with open(spfDst, "w", encoding="utf-8", newline="\n") as hDst:
494
495
496
497
498
499
500



501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517



518
519
520
521
522
523
524

        - Section Values:
                * a list of strings encoded in binary from utf-8, each value separated with a tabulation

        - Section Word Graph (nodes / arcs)
                * A list of nodes which are a list of arcs with an address of the next node.
                  See DawgNode.convToBytes() for details.



        """
        self._calculateBinary(nCompressionMethod)
        if not sPathFile.endswith(".bdic"):
            sPathFile += "."+str(nCompressionMethod)+".bdic"
        with open(sPathFile, 'wb') as hDst:
            # header
            hDst.write("/grammalecte-fsa/{}/".format(nCompressionMethod).encode("utf-8"))
            hDst.write(b"\0\0\0\0")
            # infos
            sInfo = "{}//{}//{}//{}//{}//{}//{}//{}//{}//{}//{}//{}//".format(self.sLangCode, self.sLangName, self.sDicName, self._getDate(), \
                                                                              self.nChar, self.nBytesArc, self.nBytesNodeAddress, \
                                                                              self.nEntry, self.nNode, self.nArc, self.nAff, self.cStemming)
            hDst.write(sInfo.encode("utf-8"))
            hDst.write(b"\0\0\0\0")
            # lArcVal
            hDst.write("\t".join(self.lArcVal).encode("utf-8"))
            hDst.write(b"\0\0\0\0")



            # DAWG: nodes / arcs
            if nCompressionMethod == 1:
                hDst.write(self.oRoot.convToBytes1(self.nBytesArc, self.nBytesNodeAddress))
                for oNode in self.lMinimizedNodes:
                    hDst.write(oNode.convToBytes1(self.nBytesArc, self.nBytesNodeAddress))
            elif nCompressionMethod == 2:
                hDst.write(self.oRoot.convToBytes2(self.nBytesArc, self.nBytesNodeAddress))







>
>
>

















>
>
>







499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535

        - Section Values:
                * a list of strings encoded in binary from utf-8, each value separated with a tabulation

        - Section Word Graph (nodes / arcs)
                * A list of nodes which are a list of arcs with an address of the next node.
                  See DawgNode.convToBytes() for details.

        - Section 2grams:
                * A list of 2grams (as strings: 2 chars) encoded in binary from utf-8, each value separated with a tabulation
        """
        self._calculateBinary(nCompressionMethod)
        if not sPathFile.endswith(".bdic"):
            sPathFile += "."+str(nCompressionMethod)+".bdic"
        with open(sPathFile, 'wb') as hDst:
            # header
            hDst.write("/grammalecte-fsa/{}/".format(nCompressionMethod).encode("utf-8"))
            hDst.write(b"\0\0\0\0")
            # infos
            sInfo = "{}//{}//{}//{}//{}//{}//{}//{}//{}//{}//{}//{}//".format(self.sLangCode, self.sLangName, self.sDicName, self._getDate(), \
                                                                              self.nChar, self.nBytesArc, self.nBytesNodeAddress, \
                                                                              self.nEntry, self.nNode, self.nArc, self.nAff, self.cStemming)
            hDst.write(sInfo.encode("utf-8"))
            hDst.write(b"\0\0\0\0")
            # lArcVal
            hDst.write("\t".join(self.lArcVal).encode("utf-8"))
            hDst.write(b"\0\0\0\0")
            # 2grams
            hDst.write("\t".join(self.a2grams).encode("utf-8"))
            hDst.write(b"\0\0\0\0")
            # DAWG: nodes / arcs
            if nCompressionMethod == 1:
                hDst.write(self.oRoot.convToBytes1(self.nBytesArc, self.nBytesNodeAddress))
                for oNode in self.lMinimizedNodes:
                    hDst.write(oNode.convToBytes1(self.nBytesArc, self.nBytesNodeAddress))
            elif nCompressionMethod == 2:
                hDst.write(self.oRoot.convToBytes2(self.nBytesArc, self.nBytesNodeAddress))