52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
|
if self.cStemming == "S":
self.funcStemming = st.changeWordWithSuffixCode
elif self.cStemming == "A":
self.funcStemming = st.changeWordWithAffixCode
else:
self.funcStemming = st.noStemming
self.nTag = self.nArcVal - self.nChar - self.nAff
# <dChar> to get the value of an arc, <dArcVal> to get the char of an arc with its value
self.dChar = {}
for i in range(1, self.nChar):
self.dChar[self.lArcVal[i]] = i
self.dArcVal = { v: k for k, v in self.dChar.items() }
self._arcMask = (2 ** ((self.nBytesArc * 8) - 3)) - 1
self._finalNodeMask = 1 << ((self.nBytesArc * 8) - 1)
self._lastArcMask = 1 << ((self.nBytesArc * 8) - 2)
self._addrBitMask = 1 << ((self.nBytesArc * 8) - 3) # version 2
self.nBytesOffset = 1 # version 3
|
|
|
|
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
|
if self.cStemming == "S":
self.funcStemming = st.changeWordWithSuffixCode
elif self.cStemming == "A":
self.funcStemming = st.changeWordWithAffixCode
else:
self.funcStemming = st.noStemming
self.nTag = self.nArcVal - self.nChar - self.nAff
# <dChar> to get the value of an arc, <dCharVal> to get the char of an arc with its value
self.dChar = {}
for i in range(1, self.nChar):
self.dChar[self.lArcVal[i]] = i
self.dCharVal = { v: k for k, v in self.dChar.items() }
self._arcMask = (2 ** ((self.nBytesArc * 8) - 3)) - 1
self._finalNodeMask = 1 << ((self.nBytesArc * 8) - 1)
self._lastArcMask = 1 << ((self.nBytesArc * 8) - 2)
self._addrBitMask = 1 << ((self.nBytesArc * 8) - 3) # version 2
self.nBytesOffset = 1 # version 3
|
170
171
172
173
174
175
176
177
178
179
180
181
182
183
|
for c in sWord:
if c not in self.dChar:
return False
iAddr = self._lookupArcNode(self.dChar[c], iAddr)
if iAddr == None:
return False
return bool(int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask)
def suggest (self, sWord):
"returns a set of similar words"
# first, we check for similar words
#return set(self._suggestWithCrushedUselessChars(cp.clearWord(sWord)))
lSugg = self._suggest(sWord)
if not lSugg:
|
>
>
>
>
>
>
>
>
>
|
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
|
for c in sWord:
if c not in self.dChar:
return False
iAddr = self._lookupArcNode(self.dChar[c], iAddr)
if iAddr == None:
return False
return bool(int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask)
def getMorph (self, sWord):
"retrieves morphologies list, different casing allowed"
l = self.morph(sWord)
if sWord[0:1].isupper():
l.extend(self.morph(sWord.lower()))
if sWord.isupper() and len(sWord) > 1:
l.extend(self.morph(sWord.capitalize()))
return l
def suggest (self, sWord):
"returns a set of similar words"
# first, we check for similar words
#return set(self._suggestWithCrushedUselessChars(cp.clearWord(sWord)))
lSugg = self._suggest(sWord)
if not lSugg:
|
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
|
show(nDeep, cChar)
lSugg.extend(self._suggestWithCrushedUselessChars(sWord[1:], nDeep+1, jAddr, sNewWord+cChar))
return lSugg
def _getSimilarArcsAndCrushedChars (self, cChar, iAddr):
"generator: yield similar char of <cChar> and address of the following node"
for nVal, jAddr in self._getArcs(iAddr):
if self.dArcVal.get(nVal, "") in cp.aUselessChar:
yield (self.dArcVal[nVal], jAddr)
for c in cp.d1to1.get(cChar, [cChar]):
if c in self.dChar:
jAddr = self._lookupArcNode(self.dChar[c], iAddr)
if jAddr:
yield (c, jAddr)
def getMorph (self, sWord):
"retrieves morphologies list, different casing allowed"
l = self.morph(sWord)
if sWord[0:1].isupper():
l.extend(self.morph(sWord.lower()))
if sWord.isupper() and len(sWord) > 1:
l.extend(self.morph(sWord.capitalize()))
return l
# def morph (self, sWord):
# is defined in __init__
# VERSION 1
def _morph1 (self, sWord):
"returns morphologies of sWord"
|
|
|
|
>
|
>
|
|
>
>
|
>
>
>
>
|
>
|
|
|
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
|
show(nDeep, cChar)
lSugg.extend(self._suggestWithCrushedUselessChars(sWord[1:], nDeep+1, jAddr, sNewWord+cChar))
return lSugg
def _getSimilarArcsAndCrushedChars (self, cChar, iAddr):
"generator: yield similar char of <cChar> and address of the following node"
for nVal, jAddr in self._getArcs(iAddr):
if self.dCharVal.get(nVal, None) in cp.aUselessChar:
yield (self.dCharVal[nVal], jAddr)
for c in cp.d1to1.get(cChar, [cChar]):
if c in self.dChar:
jAddr = self._lookupArcNode(self.dChar[c], iAddr)
if jAddr:
yield (c, jAddr)
def drawPath (self, sWord, iAddr=0):
if not sWord:
return
iPos = -1
n = 0
print(sWord[0:1] + ": ", end="")
for nVal, jAddr in self._getArcs(iAddr):
if nVal in self.dCharVal:
print(self.dCharVal[nVal], end="")
if self.dCharVal[nVal] == sWord[0:1]:
iNextNodeAddr = jAddr
iPos = n
n += 1
if iPos >= 0:
print("\n "+ " " * iPos + "|")
self.drawPath(sWord[1:], iNextNodeAddr)
# def morph (self, sWord):
# is defined in __init__
# VERSION 1
def _morph1 (self, sWord):
"returns morphologies of sWord"
|