︙ | | |
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
|
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
|
+
+
+
+
|
def __init__ (self, sWord, nDistLimit=-1):
    """Create an empty suggestion store for <sWord>.

    sWord       -- the word for which suggestions are collected
    nDistLimit  -- maximum distance accepted between <sWord> and a suggestion;
                   any negative value (the default) selects a limit computed
                   from the word length: (len(sWord) // 3) + 1
    """
    self.sWord = sWord
    # simplified form of the word, used as the reference when measuring distances
    self.sSimplifiedWord = cp.simplifyWord(sWord)
    if nDistLimit >= 0:
        self.nDistLimit = nDistLimit
    else:
        self.nDistLimit = (len(sWord) // 3) + 1
    self.nMinDist = 1000    # NOTE(review): looks like a "nothing found yet" sentinel -- confirm
    self.aSugg = set()      # accepted suggestions
    self.dSugg = { 0: [],  1: [],  2: [] }  # accepted suggestions, grouped by distance
    self.aAllSugg = set()   # all found words even those refused
def addSugg (self, sSugg, nDeep=0):
"add a suggestion"
#logging.info((nDeep * " ") + "__" + sSugg + "__")
if sSugg in self.aAllSugg:
return
self.aAllSugg.add(sSugg)
if sSugg not in self.aSugg:
nDist = st.distanceDamerauLevenshtein(self.sSimplifiedWord, cp.simplifyWord(sSugg))
if nDist <= self.nDistLimit:
if nDist not in self.dSugg:
self.dSugg[nDist] = []
self.dSugg[nDist].append(sSugg)
self.aSugg.add(sSugg)
|
︙ | | |
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
|
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
|
-
-
-
-
+
+
+
+
+
-
-
-
-
-
-
+
+
+
+
+
+
+
+
-
+
|
self.aSugg.clear()
self.dSugg.clear()
class IBDAWG:
"""INDEXABLE BINARY DIRECT ACYCLIC WORD GRAPH"""
def __init__ (self, sfDict):
self.by = pkgutil.get_data(__package__, "_dictionaries/" + sfDict)
if not self.by:
raise OSError("# Error. File not found or not loadable: "+sfDict)
def __init__ (self, source):
if type(source) is str:
self.by = pkgutil.get_data(__package__, "_dictionaries/" + source)
if not self.by:
raise OSError("# Error. File not found or not loadable: "+source)
if sfDict.endswith(".bdic"):
self._initBinary()
elif sfDict.endswith(".json"):
self._initJSON()
else:
raise OSError("# Error. Unknown file type: "+sfDict)
if source.endswith(".bdic"):
self._initBinary()
elif source.endswith(".json"):
self._initJSON(json.loads(self.by.decode("utf-8"))) #json.loads(self.by) # In Python 3.6, can read directly binary strings
else:
raise OSError("# Error. Unknown file type: "+source)
else:
self._initJSON(source)
self.sFileName = sfDict
self.sFileName = source if type(source) is str else "[None]"
self._arcMask = (2 ** ((self.nBytesArc * 8) - 3)) - 1
self._finalNodeMask = 1 << ((self.nBytesArc * 8) - 1)
self._lastArcMask = 1 << ((self.nBytesArc * 8) - 2)
self._addrBitMask = 1 << ((self.nBytesArc * 8) - 3) # version 2
# function to decode the affix/suffix code
|
︙ | | |
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
|
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
|
-
-
-
-
+
+
+
+
-
+
|
raise ValueError(" # Error: unknown code: {}".format(self.nCompressionMethod))
self.bOptNumSigle = False
self.bOptNumAtLast = False
def _initBinary (self):
"initialize with binary structure file"
if self.by[0:7] != b"/pyfsa/":
raise TypeError("# Error. Not a pyfsa binary dictionary. Header: {}".format(self.by[0:9]))
if not(self.by[7:8] == b"1" or self.by[7:8] == b"2" or self.by[7:8] == b"3"):
raise ValueError("# Error. Unknown dictionary version: {}".format(self.by[7:8]))
if self.by[0:17] != b"/grammalecte-fsa/":
raise TypeError("# Error. Not a grammalecte-fsa binary dictionary. Header: {}".format(self.by[0:9]))
if not(self.by[17:18] == b"1" or self.by[17:18] == b"2" or self.by[17:18] == b"3"):
raise ValueError("# Error. Unknown dictionary version: {}".format(self.by[17:18]))
try:
header, info, values, bdic = self.by.split(b"\0\0\0\0", 3)
except Exception:
raise Exception
self.nCompressionMethod = int(self.by[7:8].decode("utf-8"))
self.nCompressionMethod = int(self.by[17:18].decode("utf-8"))
self.sHeader = header.decode("utf-8")
self.lArcVal = values.decode("utf-8").split("\t")
self.nArcVal = len(self.lArcVal)
self.byDic = bdic
l = info.decode("utf-8").split("//")
self.sLangCode = l.pop(0)
|
︙ | | |
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
|
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
|
-
+
-
+
-
+
-
+
|
# <dChar> to get the value of an arc, <dCharVal> to get the char of an arc with its value
self.dChar = {}
for i in range(1, self.nChar+1):
self.dChar[self.lArcVal[i]] = i
self.dCharVal = { v: k for k, v in self.dChar.items() }
self.nBytesOffset = 1 # version 3
def _initJSON (self, oJSON):
    """Initialize the instance from an already decoded JSON object.

    oJSON -- mapping whose items become attributes of the instance; it must
             provide at least <sByDic> (the binary dictionary as a hexadecimal
             string) and <dChar> (character -> arc value).
    """
    self.__dict__.update(oJSON)
    # the binary graph is stored as an hexadecimal string in the JSON form
    self.byDic = binascii.unhexlify(self.sByDic)
    # reverse mapping: arc value -> character
    self.dCharVal = { v: k  for k, v in self.dChar.items() }
def getInfo (self):
    """Return a human-readable summary of the dictionary.

    The text is built from attributes of the instance: language and
    dictionary names, compression method, date, stemming code, numbers of
    arc values / characters / affixes / tags, numbers of entries / nodes /
    arcs, and the sizes in bytes of a node address and of an arc.
    """
    sTemplate = " Language: {0.sLangName} Lang code: {0.sLangCode} Dictionary name: {0.sDicName}" \
                " Compression method: {0.nCompressionMethod:>2} Date: {0.sDate} Stemming: {0.cStemming}FX\n" \
                " Arcs values: {0.nArcVal:>10,} = {0.nChar:>5,} characters, {0.nAff:>6,} affixes, {0.nTag:>6,} tags\n" \
                " Dictionary: {0.nEntry:>12,} entries, {0.nNode:>11,} nodes, {0.nArc:>11,} arcs\n" \
                " Address size: {0.nBytesNodeAddress:>1} bytes, Arc size: {0.nBytesArc:>1} bytes\n"
    return sTemplate.format(self)
def writeAsJSObject (self, spfDest, bInJSModule=False, bBinaryDictAsHexString=False):
"write IBDAWG as a JavaScript object in a JavaScript module"
with open(spfDest, "w", encoding="utf-8", newline="\n") as hDst:
if bInJSModule:
hDst.write('// JavaScript\n// Generated data (do not edit)\n\n"use strict";\n\nconst dictionary = ')
hDst.write(json.dumps({
"sHeader": "/pyfsa/",
"sHeader": "/grammalecte-fsa/",
"sLangCode": self.sLangCode,
"sLangName": self.sLangName,
"sDicName": self.sDicName,
"sFileName": self.sFileName,
"sDate": self.sDate,
"nEntry": self.nEntry,
"nChar": self.nChar,
|
︙ | | |
230
231
232
233
234
235
236
237
238
239
240
241
242
243
|
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
|
+
+
|
def isValid (self, sWord):
"checks if <sWord> is valid (different casing tested if the first letter is a capital)"
if not sWord:
return None
if "’" in sWord: # ugly hack
sWord = sWord.replace("’", "'")
if self.lookup(sWord):
return True
if sWord.isdigit():
return True
if sWord[0:1].isupper():
if len(sWord) > 1:
if sWord.istitle():
return self.lookup(sWord.lower())
if sWord.isupper():
if self.bOptNumSigle:
|
︙ | | |
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
|
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
|
+
+
+
+
+
+
+
+
+
+
+
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
-
+
-
+
+
-
-
-
-
+
+
+
+
|
n += 1
if not sWord:
return
if iPos >= 0:
print("\n "+ " " * iPos + "|")
self.drawPath(sWord[1:], iNextNodeAddr)
def getSimilarEntries (self, sWord, nSuggLimit=10):
    """Return a list of tuples (similar word, stem, morphology).

    sWord       -- the word for which similar entries are wanted;
                   an empty word gives an empty list
    nSuggLimit  -- passed unchanged to <self.suggest>

    Each morphology returned by <self.getMorph> is split at its first space:
    the part before it, minus its first character, is the stem; the part
    after it is the morphology. A morphology without any space gives an
    empty morphology field instead of a stem with its last character cut off.
    """
    if not sWord:
        return []
    lResult = []
    for sSimilar in self.suggest(sWord, nSuggLimit):
        for sMorph in self.getMorph(sSimilar):
            # partition() never yields a negative index, unlike find()
            sHead, _, sTags = sMorph.partition(" ")
            lResult.append( (sSimilar, sHead[1:], sTags) )
    return lResult
def select (self, sFlexPattern="", sTagsPattern=""):
    """generator: returns all entries which flexion fits <sFlexPattern> and morphology fits <sTagsPattern>

    sFlexPattern -- regular expression searched in the flexion; empty = no filter
    sTagsPattern -- regular expression searched in the morphology; empty = no filter

    If a pattern cannot be compiled, the error is printed and the selection
    still runs, using only what was compiled before the failure.
    """
    zFlexPattern = None
    zTagsPattern = None
    try:
        if sFlexPattern:
            zFlexPattern = re.compile(sFlexPattern)
        if sTagsPattern:
            zTagsPattern = re.compile(sTagsPattern)
    except (re.error, TypeError):
        # best effort: report the faulty pattern and carry on
        print("# Error in regex pattern")
        traceback.print_exc()
    yield from self._select1(zFlexPattern, zTagsPattern, 0, "")
# def morph (self, sWord):
# is defined in __init__
# VERSION 1
def _select1 (self, zFlexPattern, zTagsPattern, iAddr, sWord):
    """recursive generator: yields [flexion, stem, morphology] for the entries found from node <iAddr>

    zFlexPattern -- compiled regex searched in the flexion, or None for no filter
    zTagsPattern -- compiled regex searched in the morphology, or None for no filter
    iAddr        -- address of the node to explore
    sWord        -- characters collected on the path leading to <iAddr>
    """
    for nVal, jAddr in self._getArcs1(iAddr):
        if nVal <= self.nChar:
            # simple character: extend the word and go deeper
            yield from self._select1(zFlexPattern, zTagsPattern, jAddr, sWord + self.lArcVal[nVal])
        elif not zFlexPattern or zFlexPattern.search(sWord):
            # the arc value is a stemming code, the arcs of the next node carry the morphologies
            sStem = self.funcStemming(sWord, self.lArcVal[nVal])
            for nMorphVal, _ in self._getArcs1(jAddr):
                if not zTagsPattern or zTagsPattern.search(self.lArcVal[nMorphVal]):
                    yield [sWord, sStem, self.lArcVal[nMorphVal]]
def _morph1 (self, sWord):
"returns morphologies of <sWord>"
iAddr = 0
for c in sWord:
if c not in self.dChar:
return []
|
︙ | | |