75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
|
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
|
-
-
-
-
+
+
+
+
+
-
-
-
-
-
-
+
+
+
+
+
+
+
+
-
+
|
self.aSugg.clear()
self.dSugg.clear()
class IBDAWG:
"""INDEXABLE BINARY DIRECT ACYCLIC WORD GRAPH"""
def __init__ (self, sfDict):
self.by = pkgutil.get_data(__package__, "_dictionaries/" + sfDict)
if not self.by:
raise OSError("# Error. File not found or not loadable: "+sfDict)
def __init__ (self, source):
if type(source) is str:
self.by = pkgutil.get_data(__package__, "_dictionaries/" + source)
if not self.by:
raise OSError("# Error. File not found or not loadable: "+source)
if sfDict.endswith(".bdic"):
self._initBinary()
elif sfDict.endswith(".json"):
self._initJSON()
else:
raise OSError("# Error. Unknown file type: "+sfDict)
if source.endswith(".bdic"):
self._initBinary()
elif source.endswith(".json"):
self._initJSON(json.loads(self.by.decode("utf-8"))) #json.loads(self.by) # In Python 3.6, can read directly binary strings
else:
raise OSError("# Error. Unknown file type: "+source)
else:
self._initJSON(source)
self.sFileName = sfDict
self.sFileName = source if type(source) is str else "[None]"
self._arcMask = (2 ** ((self.nBytesArc * 8) - 3)) - 1
self._finalNodeMask = 1 << ((self.nBytesArc * 8) - 1)
self._lastArcMask = 1 << ((self.nBytesArc * 8) - 2)
self._addrBitMask = 1 << ((self.nBytesArc * 8) - 3) # version 2
# function to decode the affix/suffix code
|
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
|
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
|
-
+
-
+
-
|
# <dChar> to get the value of an arc, <dCharVal> to get the char of an arc with its value
self.dChar = {}
for i in range(1, self.nChar+1):
self.dChar[self.lArcVal[i]] = i
self.dCharVal = { v: k for k, v in self.dChar.items() }
self.nBytesOffset = 1 # version 3
def _initJSON (self):
def _initJSON (self, oJSON):
"initialize with a JSON text file"
self.__dict__.update(json.loads(self.by.decode("utf-8")))
self.__dict__.update(oJSON)
#self.__dict__.update(json.loads(self.by)) # In Python 3.6, can read directly binary strings
self.byDic = binascii.unhexlify(self.sByDic)
def getInfo (self):
return " Language: {0.sLangName} Lang code: {0.sLangCode} Dictionary name: {0.sDicName}" \
" Compression method: {0.nCompressionMethod:>2} Date: {0.sDate} Stemming: {0.cStemming}FX\n" \
" Arcs values: {0.nArcVal:>10,} = {0.nChar:>5,} characters, {0.nAff:>6,} affixes, {0.nTag:>6,} tags\n" \
" Dictionary: {0.nEntry:>12,} entries, {0.nNode:>11,} nodes, {0.nArc:>11,} arcs\n" \
|
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
|
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
|
+
+
+
+
+
+
+
+
+
+
+
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
-
+
-
+
+
-
-
-
-
+
+
+
+
|
n += 1
if not sWord:
return
if iPos >= 0:
print("\n "+ " " * iPos + "|")
self.drawPath(sWord[1:], iNextNodeAddr)
def getSimilarEntries (self, sWord, nSuggLimit=10):
    """Return a list of tuples (similar word, stem, morphology).

    Args:
        sWord: the word to find neighbours for; an empty value yields [].
        nSuggLimit: passed through to ``self.suggest`` as its limit.

    Returns:
        One tuple per morphology string returned by ``self.getMorph`` for
        each word returned by ``self.suggest``. The morphology string is
        split at its first space: what precedes it (minus the first
        character) is the stem, what follows is the morphology.
    """
    if not sWord:
        return []
    lResult = []
    for sSimilar in self.suggest(sWord, nSuggLimit):
        for sMorph in self.getMorph(sSimilar):
            nCut = sMorph.find(" ")
            if nCut == -1:
                # No separator: the whole string is the stem and there are
                # no tags. Without this, find()'s -1 would chop the last
                # character off the stem and return the full string as tags.
                nCut = len(sMorph)
            # The first character is skipped -- presumably a marker prefix
            # on the stem; TODO confirm against getMorph's output format.
            lResult.append( (sSimilar, sMorph[1:nCut], sMorph[nCut+1:]) )
    return lResult
def select (self, sFlexPattern="", sTagsPattern=""):
    """Generator: return all entries whose flexion fits <sFlexPattern> and morphology fits <sTagsPattern>.

    Args:
        sFlexPattern: regular expression searched in the flexion; an empty
            value disables this filter.
        sTagsPattern: regular expression searched in the morphology; an
            empty value disables this filter.

    Yields:
        Whatever ``self._select1`` yields, starting from address 0 with an
        empty word.

    An invalid pattern is reported on the console and does not abort the
    selection: the walk goes on with whichever patterns compiled before the
    failure (a filter that did not compile is simply not applied).
    """
    zFlexPattern = None
    zTagsPattern = None
    try:
        if sFlexPattern:
            zFlexPattern = re.compile(sFlexPattern)
        if sTagsPattern:
            zTagsPattern = re.compile(sTagsPattern)
    except re.error:
        # Only a malformed regex is expected here; anything else (e.g.
        # KeyboardInterrupt) must not be swallowed.
        print("# Error in regex pattern")
        traceback.print_exc()
    yield from self._select1(zFlexPattern, zTagsPattern, 0, "")
# def morph (self, sWord):
# is defined in __init__
# VERSION 1
def _select1 (self, zPattern, iAddr, sWord):
def _select1 (self, zFlexPattern, zTagsPattern, iAddr, sWord):
# recursive generator
for nVal, jAddr in self._getArcs1(iAddr):
if nVal <= self.nChar:
# simple character
yield from self._select1(zPattern, jAddr, sWord + self.lArcVal[nVal])
yield from self._select1(zFlexPattern, zTagsPattern, jAddr, sWord + self.lArcVal[nVal])
else:
if not zFlexPattern or zFlexPattern.search(sWord):
sEntry = sWord + "\t" + self.funcStemming(sWord, self.lArcVal[nVal])
for nMorphVal, _ in self._getArcs1(jAddr):
if not zPattern or zPattern.search(self.lArcVal[nMorphVal]):
yield sEntry + "\t" + self.lArcVal[nMorphVal]
sStem = self.funcStemming(sWord, self.lArcVal[nVal])
for nMorphVal, _ in self._getArcs1(jAddr):
if not zTagsPattern or zTagsPattern.search(self.lArcVal[nMorphVal]):
yield [sWord, sStem, self.lArcVal[nMorphVal]]
def _morph1 (self, sWord):
"returns morphologies of <sWord>"
iAddr = 0
for c in sWord:
if c not in self.dChar:
return []
|