︙ | | |
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
|
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
|
-
+
-
+
-
+
-
|
# Performance trick:
# Instead of converting bytes to integers each time we parse the binary dictionary,
# we convert them once up front, then work on the resulting list of integers.
nAcc = 0
byBuffer = b""
lTemp = []
self.lByDic = []
nDivisor = (self.nBytesArc + self.nBytesNodeAddress) / 2
for i in range(0, len(self.byDic)):
byBuffer += self.byDic[i:i+1]
if nAcc == (self.nBytesArc - 1):
lTemp.append(int.from_bytes(byBuffer, byteorder="big"))
self.lByDic.append(int.from_bytes(byBuffer, byteorder="big"))
byBuffer = b""
elif nAcc == (self.nBytesArc + self.nBytesNodeAddress - 1):
lTemp.append(round(int.from_bytes(byBuffer, byteorder="big") / nDivisor))
self.lByDic.append(round(int.from_bytes(byBuffer, byteorder="big") / nDivisor))
byBuffer = b""
nAcc = -1
nAcc = nAcc + 1
self.byDic = lTemp;
# masks
self._arcMask = (2 ** ((self.nBytesArc * 8) - 3)) - 1
self._finalNodeMask = 1 << ((self.nBytesArc * 8) - 1)
self._lastArcMask = 1 << ((self.nBytesArc * 8) - 2)
|
︙ | | |
300
301
302
303
304
305
306
307
308
309
310
|
299
300
301
302
303
304
305
306
307
308
309
|
-
+
|
if c not in self.dChar:
return False
iAddr = self._lookupArcNode(self.dChar[c], iAddr)
if iAddr is None:
return False
return bool(self.byDic[iAddr] & self._finalNodeMask)
return bool(self.lByDic[iAddr] & self._finalNodeMask)
def getMorph (self, sWord):
"retrieves morphologies list, different casing allowed"
if not sWord:
return []
|
︙ | | |
357
358
359
360
361
362
363
364
365
366
367
|
356
357
358
359
360
361
362
363
364
365
366
|
-
+
|
oSuggResult.addSugg(sWord1+" "+sWord2)
def _suggest (self, oSuggResult, sRemain, nMaxSwitch=0, nMaxDel=0, nMaxHardRepl=0, nMaxJump=0, nDist=0, nDeep=0, iAddr=0, sNewWord="", bAvoidLoop=False):
# recursive function
#logging.info((nDeep * " ") + sNewWord + ":" + sRemain)
if self.byDic[iAddr] & self._finalNodeMask:
if self.lByDic[iAddr] & self._finalNodeMask:
if not sRemain:
oSuggResult.addSugg(sNewWord, nDeep)
for sTail in self._getTails(iAddr):
oSuggResult.addSugg(sNewWord+sTail, nDeep)
return
|
︙ | | |
424
425
426
427
428
429
430
431
432
433
434
|
423
424
425
426
427
428
429
430
431
432
433
|
-
+
|
def _getTails (self, iAddr, sTail="", n=2):
    """Return the set of suffixes that reach a final node from the node at <iAddr>.

    Args:
        iAddr: index in self.lByDic of the node whose arcs are explored.
        sTail: characters already collected on the way to <iAddr>.
        n: number of additional levels the search may still descend.

    Returns:
        A set of strings, each one being <sTail> followed by the characters
        of the arcs leading to a final node. Empty if none is found.
    """
    aTails = set()
    for nVal, jAddr in self._getArcs(iAddr):
        if nVal > self.nChar:
            # values above nChar are not characters: nothing to append
            continue
        sNewTail = sTail + self.dCharVal[nVal]
        # The pre-converted integer array is the only one consulted; the
        # superseded self.byDic test has been dropped.
        if self.lByDic[jAddr] & self._finalNodeMask:
            aTails.add(sNewTail)
        # Descend only while nothing has been found at this level yet
        # and some depth budget remains.
        if n and not aTails:
            aTails.update(self._getTails(jAddr, sNewTail, n-1))
    return aTails
|
︙ | | |
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
|
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
|
-
+
-
+
-
+
-
+
|
if c not in self.dChar:
return []
iAddr = self._lookupArcNode(self.dChar[c], iAddr)
if iAddr is None:
return []
if self.byDic[iAddr] & self._finalNodeMask:
if self.lByDic[iAddr] & self._finalNodeMask:
l = []
nRawArc = 0
while not nRawArc & self._lastArcMask:
iEndArcAddr = iAddr + 1
nRawArc = self.byDic[iAddr]
nRawArc = self.lByDic[iAddr]
nArc = nRawArc & self._arcMask
if nArc > self.nChar:
# This value is not a char, this is a stemming code
sStem = ">" + self.funcStemming(sWord, self.lArcVal[nArc])
# Now , we go to the next node and retrieve all following arcs values, all of them are tags
iAddr2 = self.byDic[iEndArcAddr]
iAddr2 = self.lByDic[iEndArcAddr]
nRawArc2 = 0
while not nRawArc2 & self._lastArcMask:
iEndArcAddr2 = iAddr2 + 1
nRawArc2 = self.byDic[iAddr2]
nRawArc2 = self.lByDic[iAddr2]
l.append(sStem + "/" + self.lArcVal[nRawArc2 & self._arcMask])
iAddr2 = iEndArcAddr2 + 1
iAddr = iEndArcAddr + 1
return l
return []
|
︙ | | |
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
|
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
|
-
+
-
+
-
+
-
+
-
-
+
+
-
+
-
+
-
+
-
+
|
if c not in self.dChar:
return []
iAddr = self._lookupArcNode(self.dChar[c], iAddr)
if iAddr is None:
return []
if self.byDic[iAddr] & self._finalNodeMask:
if self.lByDic[iAddr] & self._finalNodeMask:
l = []
nRawArc = 0
while not nRawArc & self._lastArcMask:
iEndArcAddr = iAddr + 1
nRawArc = self.byDic[iAddr]
nRawArc = self.lByDic[iAddr]
nArc = nRawArc & self._arcMask
if nArc > self.nChar:
# This value is not a char, this is a stemming code
l.append(self.funcStemming(sWord, self.lArcVal[nArc]))
iAddr = iEndArcAddr + 1
return l
return []
def _lookupArcNode (self, nVal, iAddr):
    """Look for the arc <nVal> among the arcs of the node at <iAddr>.

    Each arc occupies two consecutive entries of self.lByDic: the raw arc
    word (value bits plus flag bits), then the address of the next node.

    Args:
        nVal: arc value to search for.
        iAddr: index in self.lByDic of the first arc of the node.

    Returns:
        The address of the next node if the arc exists, else None.
    """
    while True:
        iEndArcAddr = iAddr + 1
        # single read from the pre-converted integer array
        # (the stale self.byDic read and return have been removed)
        nRawArc = self.lByDic[iAddr]
        if nVal == (nRawArc & self._arcMask):
            # the value we are looking for:
            # we return the address of the next node
            return self.lByDic[iEndArcAddr]
        # value not found on this arc
        if nRawArc & self._lastArcMask:
            # that was the last arc of the node
            return None
        iAddr = iEndArcAddr + 1
def _getArcs (self, iAddr):
    """Generator: yield every arc of the node at <iAddr>.

    Args:
        iAddr: index in self.lByDic of the first arc of the node.

    Yields:
        Tuples (nVal, jAddr): the arc value (raw arc word masked with
        self._arcMask) and the entry that follows it in self.lByDic,
        i.e. the address of the next node.
    """
    while True:
        iEndArcAddr = iAddr + 1
        nRawArc = self.lByDic[iAddr]
        # exactly one yield per arc: the superseded self.byDic yield is gone,
        # so arcs are no longer produced twice
        yield nRawArc & self._arcMask, self.lByDic[iEndArcAddr]
        if nRawArc & self._lastArcMask:
            # last arc of this node
            break
        iAddr = iEndArcAddr + 1
def _writeNodes (self, spfDest):
    """Dump the content of self.lByDic to a text file (for debugging only).

    A header line is written for the first node and after every arc
    flagged as the last arc of its node; every arc is written as its
    label (self.lArcVal), its raw word in binary and the address stored
    in the following entry.

    Args:
        spfDest: path of the text file to create (UTF-8, "\\n" line endings).
    """
    print(" > Write binary nodes")
    # The encoding must be passed by keyword: the third positional
    # parameter of open() is <buffering>, so 'utf-8' there raises TypeError.
    with open(spfDest, 'w', encoding='utf-8', newline="\n") as hDst:
        nSize = len(self.lByDic)
        iAddr = 0
        hDst.write("i{:_>10} -- #{:_>10}\n".format("0", iAddr))
        while iAddr < nSize:
            iEndArcAddr = iAddr + 1
            nRawArc = self.lByDic[iAddr]
            nArc = nRawArc & self._arcMask
            hDst.write(" {:<20} {:0>16} i{:>10} #{:_>10}\n".format(self.lArcVal[nArc], bin(nRawArc)[2:], "?", self.lByDic[iEndArcAddr]))
            iAddr = iEndArcAddr + 1
            # a new node starts after the last arc, unless the array is exhausted
            if (nRawArc & self._lastArcMask) and iAddr < nSize:
                hDst.write("\ni{:_>10} -- #{:_>10}\n".format("?", iAddr))
    # no explicit close(): the with-statement already closes the file
|