130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
|
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
|
-
+
-
+
-
+
-
|
self.sFileName = source if isinstance(source, str) else "[None]"
# Performance trick:
# Instead of converting bytes to integers each times we parse the binary dictionary,
# we do it once, then parse the array
nAcc = 0
byBuffer = b""
lTemp = []
self.lByDic = []
nDivisor = (self.nBytesArc + self.nBytesNodeAddress) / 2
for i in range(0, len(self.byDic)):
byBuffer += self.byDic[i:i+1]
if nAcc == (self.nBytesArc - 1):
lTemp.append(int.from_bytes(byBuffer, byteorder="big"))
self.lByDic.append(int.from_bytes(byBuffer, byteorder="big"))
byBuffer = b""
elif nAcc == (self.nBytesArc + self.nBytesNodeAddress - 1):
lTemp.append(round(int.from_bytes(byBuffer, byteorder="big") / nDivisor))
self.lByDic.append(round(int.from_bytes(byBuffer, byteorder="big") / nDivisor))
byBuffer = b""
nAcc = -1
nAcc = nAcc + 1
self.byDic = lTemp;
# masks
self._arcMask = (2 ** ((self.nBytesArc * 8) - 3)) - 1
self._finalNodeMask = 1 << ((self.nBytesArc * 8) - 1)
self._lastArcMask = 1 << ((self.nBytesArc * 8) - 2)
# function to decode the affix/suffix code
|
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
|
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
|
-
+
|
iAddr = 0
for c in sWord:
if c not in self.dChar:
return False
iAddr = self._lookupArcNode(self.dChar[c], iAddr)
if iAddr is None:
return False
return bool(self.byDic[iAddr] & self._finalNodeMask)
return bool(self.lByDic[iAddr] & self._finalNodeMask)
def getMorph (self, sWord):
"retrieves morphologies list, different casing allowed"
if not sWord:
return []
sWord = st.spellingNormalization(sWord)
l = self._morph(sWord)
|
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
|
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
|
-
+
|
sWord1, sWord2 = sWord.split(cSplitter, 1)
if self.isValid(sWord1) and self.isValid(sWord2):
oSuggResult.addSugg(sWord1+" "+sWord2)
def _suggest (self, oSuggResult, sRemain, nMaxSwitch=0, nMaxDel=0, nMaxHardRepl=0, nMaxJump=0, nDist=0, nDeep=0, iAddr=0, sNewWord="", bAvoidLoop=False):
# recursive function
#logging.info((nDeep * " ") + sNewWord + ":" + sRemain)
if self.byDic[iAddr] & self._finalNodeMask:
if self.lByDic[iAddr] & self._finalNodeMask:
if not sRemain:
oSuggResult.addSugg(sNewWord, nDeep)
for sTail in self._getTails(iAddr):
oSuggResult.addSugg(sNewWord+sTail, nDeep)
return
if (len(sNewWord) + len(sRemain) == len(oSuggResult.sWord)) and oSuggResult.sWord.lower().startswith(sNewWord.lower()) and self.isValid(sRemain):
if self.sLangCode == "fr" and sNewWord.lower() in ("l", "d", "n", "m", "t", "s", "c", "j", "qu", "lorsqu", "puisqu", "quoiqu", "jusqu", "quelqu") and sRemain[0:1] in cp.aVowel:
|
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
|
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
|
-
+
|
yield (self.dCharVal[nVal], jAddr)
def _getTails (self, iAddr, sTail="", n=2):
"return a list of suffixes ending at a distance of <n> from <iAddr>"
aTails = set()
for nVal, jAddr in self._getArcs(iAddr):
if nVal <= self.nChar:
if self.byDic[jAddr] & self._finalNodeMask:
if self.lByDic[jAddr] & self._finalNodeMask:
aTails.add(sTail + self.dCharVal[nVal])
if n and not aTails:
aTails.update(self._getTails(jAddr, sTail+self.dCharVal[nVal], n-1))
return aTails
def drawPath (self, sWord, iAddr=0):
"show the path taken by <sWord> in the graph"
|
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
|
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
|
-
+
-
+
-
+
-
+
-
+
-
+
-
+
-
+
-
-
+
+
-
+
-
+
-
+
-
+
|
iAddr = 0
for c in sWord:
if c not in self.dChar:
return []
iAddr = self._lookupArcNode(self.dChar[c], iAddr)
if iAddr is None:
return []
if self.byDic[iAddr] & self._finalNodeMask:
if self.lByDic[iAddr] & self._finalNodeMask:
l = []
nRawArc = 0
while not nRawArc & self._lastArcMask:
iEndArcAddr = iAddr + 1
nRawArc = self.byDic[iAddr]
nRawArc = self.lByDic[iAddr]
nArc = nRawArc & self._arcMask
if nArc > self.nChar:
# This value is not a char, this is a stemming code
sStem = ">" + self.funcStemming(sWord, self.lArcVal[nArc])
# Now , we go to the next node and retrieve all following arcs values, all of them are tags
iAddr2 = self.byDic[iEndArcAddr]
iAddr2 = self.lByDic[iEndArcAddr]
nRawArc2 = 0
while not nRawArc2 & self._lastArcMask:
iEndArcAddr2 = iAddr2 + 1
nRawArc2 = self.byDic[iAddr2]
nRawArc2 = self.lByDic[iAddr2]
l.append(sStem + "/" + self.lArcVal[nRawArc2 & self._arcMask])
iAddr2 = iEndArcAddr2 + 1
iAddr = iEndArcAddr + 1
return l
return []
def _stem (self, sWord):
"returns stems list of <sWord>"
iAddr = 0
for c in sWord:
if c not in self.dChar:
return []
iAddr = self._lookupArcNode(self.dChar[c], iAddr)
if iAddr is None:
return []
if self.byDic[iAddr] & self._finalNodeMask:
if self.lByDic[iAddr] & self._finalNodeMask:
l = []
nRawArc = 0
while not nRawArc & self._lastArcMask:
iEndArcAddr = iAddr + 1
nRawArc = self.byDic[iAddr]
nRawArc = self.lByDic[iAddr]
nArc = nRawArc & self._arcMask
if nArc > self.nChar:
# This value is not a char, this is a stemming code
l.append(self.funcStemming(sWord, self.lArcVal[nArc]))
iAddr = iEndArcAddr + 1
return l
return []
def _lookupArcNode (self, nVal, iAddr):
"looks if <nVal> is an arc at the node at <iAddr>, if yes, returns address of next node else None"
while True:
iEndArcAddr = iAddr + 1
nRawArc = self.byDic[iAddr]
nRawArc = self.lByDic[iAddr]
if nVal == (nRawArc & self._arcMask):
# the value we are looking for
# we return the address of the next node
return self.byDic[iEndArcAddr]
return self.lByDic[iEndArcAddr]
# value not found
if nRawArc & self._lastArcMask:
return None
iAddr = iEndArcAddr + 1
def _getArcs (self, iAddr):
"generator: return all arcs at <iAddr> as tuples of (nVal, iAddr)"
while True:
iEndArcAddr = iAddr + 1
nRawArc = self.byDic[iAddr]
yield nRawArc & self._arcMask, self.byDic[iEndArcAddr]
nRawArc = self.lByDic[iAddr]
yield nRawArc & self._arcMask, self.lByDic[iEndArcAddr]
if nRawArc & self._lastArcMask:
break
iAddr = iEndArcAddr + 1
def _writeNodes (self, spfDest):
"for debugging only"
print(" > Write binary nodes")
with open(spfDest, 'w', 'utf-8', newline="\n") as hDst:
iAddr = 0
hDst.write("i{:_>10} -- #{:_>10}\n".format("0", iAddr))
while iAddr < len(self.byDic):
while iAddr < len(self.lByDic):
iEndArcAddr = iAddr + 1
nRawArc = self.byDic[iAddr]
nRawArc = self.lByDic[iAddr]
nArc = nRawArc & self._arcMask
hDst.write(" {:<20} {:0>16} i{:>10} #{:_>10}\n".format(self.lArcVal[nArc], bin(nRawArc)[2:], "?", self.byDic[iEndArcAddr]))
hDst.write(" {:<20} {:0>16} i{:>10} #{:_>10}\n".format(self.lArcVal[nArc], bin(nRawArc)[2:], "?", self.lByDic[iEndArcAddr]))
iAddr = iEndArcAddr + 1
if (nRawArc & self._lastArcMask) and iAddr < len(self.byDic):
if (nRawArc & self._lastArcMask) and iAddr < len(self.lByDic):
hDst.write("\ni{:_>10} -- #{:_>10}\n".format("?", iAddr))
hDst.close()
|