Grammalecte  Diff

Differences From Artifact [c41b426a86]:

To Artifact [b794aeec36]:


75
76
77
78
79
80
81
82
83
84
85





86
87
88
89
90
91
92








93
94

95
96
97
98
99
100
101
75
76
77
78
79
80
81




82
83
84
85
86
87






88
89
90
91
92
93
94
95
96

97
98
99
100
101
102
103
104







-
-
-
-
+
+
+
+
+

-
-
-
-
-
-
+
+
+
+
+
+
+
+

-
+







        self.aSugg.clear()
        self.dSugg.clear()


class IBDAWG:
    """INDEXABLE BINARY DIRECT ACYCLIC WORD GRAPH"""

    def __init__ (self, sfDict):
        self.by = pkgutil.get_data(__package__, "_dictionaries/" + sfDict)
        if not self.by:
            raise OSError("# Error. File not found or not loadable: "+sfDict)
    def __init__ (self, source):
        if type(source) is str:
            self.by = pkgutil.get_data(__package__, "_dictionaries/" + source)
            if not self.by:
                raise OSError("# Error. File not found or not loadable: "+source)

        if sfDict.endswith(".bdic"):
            self._initBinary()
        elif sfDict.endswith(".json"):
            self._initJSON()
        else:
            raise OSError("# Error. Unknown file type: "+sfDict)
            if source.endswith(".bdic"):
                self._initBinary()
            elif source.endswith(".json"):
                self._initJSON(json.loads(self.by.decode("utf-8")))     #json.loads(self.by)    # In Python 3.6, can read directly binary strings
            else:
                raise OSError("# Error. Unknown file type: "+source)
        else:
            self._initJSON(source)

        self.sFileName = sfDict
        self.sFileName = source  if type(source) is str  else "[None]"

        self._arcMask = (2 ** ((self.nBytesArc * 8) - 3)) - 1
        self._finalNodeMask = 1 << ((self.nBytesArc * 8) - 1)
        self._lastArcMask = 1 << ((self.nBytesArc * 8) - 2)
        self._addrBitMask = 1 << ((self.nBytesArc * 8) - 3)  # version 2

        # function to decode the affix/suffix code
165
166
167
168
169
170
171
172

173
174

175
176

177
178
179
180
181
182
183
168
169
170
171
172
173
174

175
176

177

178
179
180
181
182
183
184
185
186







-
+

-
+
-

+







        # <dChar> to get the value of an arc, <dCharVal> to get the char of an arc with its value
        self.dChar = {}
        for i in range(1, self.nChar+1):
            self.dChar[self.lArcVal[i]] = i
        self.dCharVal = { v: k  for k, v in self.dChar.items() }
        self.nBytesOffset = 1 # version 3

    def _initJSON (self):
    def _initJSON (self, oJSON):
        "initialize with a JSON text file"
        self.__dict__.update(json.loads(self.by.decode("utf-8")))
        self.__dict__.update(oJSON)
        #self.__dict__.update(json.loads(self.by))                  # In Python 3.6, can read directly binary strings
        self.byDic = binascii.unhexlify(self.sByDic)
        self.dCharVal = { v: k  for k, v in self.dChar.items() }

    def getInfo (self):
        return  "  Language: {0.sLangName}   Lang code: {0.sLangCode}   Dictionary name: {0.sDicName}" \
                "  Compression method: {0.nCompressionMethod:>2}   Date: {0.sDate}   Stemming: {0.cStemming}FX\n" \
                "  Arcs values:  {0.nArcVal:>10,} = {0.nChar:>5,} characters,  {0.nAff:>6,} affixes,  {0.nTag:>6,} tags\n" \
                "  Dictionary: {0.nEntry:>12,} entries,    {0.nNode:>11,} nodes,   {0.nArc:>11,} arcs\n" \
                "  Address size: {0.nBytesNodeAddress:>1} bytes,  Arc size: {0.nBytesArc:>1} bytes\n".format(self)
403
404
405
406
407
408
409











410
411
412
413
414
415
416
417
418
419













420
421
422
423
424
425

426
427
428
429
430

431

432
433
434
435




436
437
438
439
440
441
442
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423










424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441

442
443
444
445
446

447
448
449




450
451
452
453
454
455
456
457
458
459
460







+
+
+
+
+
+
+
+
+
+
+
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+





-
+




-
+

+
-
-
-
-
+
+
+
+







            n += 1
        if not sWord:
            return
        if iPos >= 0:
            print("\n   "+ " " * iPos + "|")
            self.drawPath(sWord[1:], iNextNodeAddr)

    def getSimilarEntries (self, sWord, nSuggLimit=10):
        "return a list of tuples (similar word, stem, morphology)"
        if not sWord:
            return []
        lResult = []
        for sSimilar in self.suggest(sWord, nSuggLimit):
            for sMorph in self.getMorph(sSimilar):
                nCut = sMorph.find(" ")
                lResult.append( (sSimilar, sMorph[1:nCut], sMorph[nCut+1:]) )
        return lResult

    def select (self, sPattern=""):
        "generator: returns all entries which morphology fits <sPattern>"
        zPattern = None
        if sPattern:
            try:
                zPattern = re.compile(sPattern)
            except:
                print("# Error in regex pattern")
                traceback.print_exc()
        yield from self._select1(zPattern, 0, "")
    def select (self, sFlexPattern="", sTagsPattern=""):
        "generator: returns all entries which flexion fits <sFlexPattern> and morphology fits <sTagsPattern>"
        zFlexPattern = None
        zTagsPattern = None
        try:
            if sFlexPattern:
                zFlexPattern = re.compile(sFlexPattern)
            if sTagsPattern:
                zTagsPattern = re.compile(sTagsPattern)
        except:
            print("# Error in regex pattern")
            traceback.print_exc()
        yield from self._select1(zFlexPattern, zTagsPattern, 0, "")

    # def morph (self, sWord):
    #     is defined in __init__

    # VERSION 1
    def _select1 (self, zPattern, iAddr, sWord):
    def _select1 (self, zFlexPattern, zTagsPattern, iAddr, sWord):
        # recursive generator
        for nVal, jAddr in self._getArcs1(iAddr):
            if nVal <= self.nChar:
                # simple character
                yield from self._select1(zPattern, jAddr, sWord + self.lArcVal[nVal])
                yield from self._select1(zFlexPattern, zTagsPattern, jAddr, sWord + self.lArcVal[nVal])
            else:
                if not zFlexPattern or zFlexPattern.search(sWord):
                sEntry = sWord + "\t" + self.funcStemming(sWord, self.lArcVal[nVal])
                for nMorphVal, _ in self._getArcs1(jAddr):
                    if not zPattern or zPattern.search(self.lArcVal[nMorphVal]):
                        yield sEntry + "\t" + self.lArcVal[nMorphVal]
                    sStem = self.funcStemming(sWord, self.lArcVal[nVal])
                    for nMorphVal, _ in self._getArcs1(jAddr):
                        if not zTagsPattern or zTagsPattern.search(self.lArcVal[nMorphVal]):
                            yield [sWord, sStem, self.lArcVal[nMorphVal]]

    def _morph1 (self, sWord):
        "returns morphologies of <sWord>"
        iAddr = 0
        for c in sWord:
            if c not in self.dChar:
                return []