Index: cli.py ================================================================== --- cli.py +++ cli.py @@ -208,10 +208,11 @@ echo(" {:<32} {}".format(sMorph, oLexGraphe.formatTags(sMorph))) elif sText.startswith("!"): for sWord in sText[1:].strip().split(): if sWord: echo(" | ".join(oDict.suggest(sWord))) + echo(" | ".join(oDict.suggest2(sWord))) elif sText.startswith(">"): oDict.drawPath(sText[1:].strip()) elif sText.startswith("="): for sRes in oDict.select(sText[1:].strip()): echo(sRes) Index: gc_core/py/char_player.py ================================================================== --- gc_core/py/char_player.py +++ gc_core/py/char_player.py @@ -46,10 +46,71 @@ s1b = cleanWord(s1); s2b = cleanWord(s2); print(f"Distance: {s1} / {s2} = {distanceDamerauLevenshtein(s1, s2)}") print(f"Distance: {s1b} / {s2b} = {distanceDamerauLevenshtein(s1b, s2b)}") + +def distanceSift4 (s1, s2, nMaxOffset=5): + "implementation of general Sift4." + # https://siderite.blogspot.com/2014/11/super-fast-and-accurate-string-distance.html + if not s1: + return len(s2) + if not s2: + return len(s1) + nLen1, nLen2 = len(s1), len(s2) + i1, i2 = 0, 0 # Cursors for each string + nLargestCS = 0 # Largest common substring + nLocalCS = 0 # Local common substring + nTrans = 0 # Number of transpositions ('ab' vs 'ba') + lOffset = [] # Offset pair array, for computing the transpositions + + while i1 < nLen1 and i2 < nLen2: + if s1[i1] == s2[i2]: + nLocalCS += 1 + # Check if current match is a transposition + bTrans = False + i = 0 + while i < len(lOffset): + t = lOffset[i] + if i1 <= t[0] or i2 <= t[1]: + bTrans = abs(i2-i1) >= abs(t[1] - t[0]) + if bTrans: + nTrans += 1 + elif not t[2]: + t[2] = True + nTrans += 1 + break + elif i1 > t[1] and i2 > t[0]: + del lOffset[i] + else: + i += 1 + lOffset.append([i1, i2, bTrans]) + else: + nLargestCS += nLocalCS + nLocalCS = 0 + if i1 != i2: + i1 = i2 = min(i1, i2) + for i in range(nMaxOffset): + if i1 + i >= nLen1 and i2 + i >= nLen2: + break + elif i1 + i < nLen1 and s1[i1+i] == s2[i2]: + i1 += i - 1 + i2 -= 1 + break + elif i2 + i < nLen2 and s1[i1] == s2[i2+i]: + i2 += i - 1 + i1 -= 1 + break + i1 += 1 + i2 += 1 + if i1 >= nLen1 or i2 >= nLen2: + nLargestCS += nLocalCS + nLocalCS = 0 + i1 = i2 = min(i1, i2) + nLargestCS += nLocalCS + return round(max(nLen1, nLen2) - nLargestCS + nTrans) + # Method: Remove Useless Chars _dVovels = { 'a': '', 'e': '', 'i': '', 'o': '', 'u': '', 'y': '', Index: gc_core/py/ibdawg.py ================================================================== --- gc_core/py/ibdawg.py +++ gc_core/py/ibdawg.py @@ -6,12 +6,12 @@ import re from itertools import chain from functools import wraps import time -#import logging -#logging.basicConfig(filename="suggestions.log", level=logging.DEBUG) +import logging +logging.basicConfig(filename="suggestions.log", level=logging.DEBUG) from . import str_transform as st from . import char_player as cp from .echo import echo @@ -218,14 +218,14 @@ if sSfx or sPfx: # we add what we removed return list(map(lambda sSug: sPfx + sSug + sSfx, aSugg)) return aSugg - def _suggest (self, sRemain, nMaxDel=0, nMaxHardRepl=0, nDeep=0, iAddr=0, sNewWord="", bAvoidLoop=False): + def _suggest (self, sRemain, nMaxDel=0, nMaxHardRepl=0, nDeep=0, iAddr=0, sNewWord="", sAction="", bAvoidLoop=False): "returns a set of suggestions" # recursive function - #logging.info((nDeep * " ") + sNewWord + ":" + sRemain) + #logging.info((nDeep * " ") + sNewWord + ":" + sRemain + " ยท " + sAction) aSugg = set() if not sRemain: if int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask: #logging.info((nDeep * " ") + "__" + sNewWord + "__") aSugg.add(sNewWord) @@ -233,43 +233,84 @@ #logging.info((nDeep * " ") + "__" + sNewWord+sTail + "__") aSugg.add(sNewWord+sTail) return aSugg cCurrent = sRemain[0:1] for cChar, jAddr in self._getSimilarArcs(cCurrent, iAddr): - aSugg.update(self._suggest(sRemain[1:], nMaxDel, nMaxHardRepl, nDeep+1, jAddr, sNewWord+cChar)) + aSugg.update(self._suggest(sRemain[1:], nMaxDel, nMaxHardRepl, nDeep+1, jAddr, sNewWord+cChar, "*")) if not bAvoidLoop: # avoid infinite loop if cCurrent == sRemain[1:2]: # same char, we remove 1 char without adding 1 to - aSugg.update(self._suggest(sRemain[1:], nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord)) + aSugg.update(self._suggest(sRemain[1:], nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, cCurrent+"/2")) else: # switching chars - aSugg.update(self._suggest(sRemain[1:2]+sRemain[0:1]+sRemain[2:], nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, True)) + aSugg.update(self._suggest(sRemain[1:2]+sRemain[0:1]+sRemain[2:], nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, "><",True)) # delete char if nMaxDel > 0: - aSugg.update(self._suggest(sRemain[1:], nMaxDel-1, nMaxHardRepl, nDeep+1, iAddr, sNewWord, True)) + aSugg.update(self._suggest(sRemain[1:], nMaxDel-1, nMaxHardRepl, nDeep+1, iAddr, sNewWord, "-"+cCurrent, True)) # Phonetic replacements for sRepl in cp.d1toX.get(cCurrent, ()): - aSugg.update(self._suggest(sRepl + sRemain[1:], nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, True)) + aSugg.update(self._suggest(sRepl + sRemain[1:], nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, cCurrent+">"+sRepl, True)) for sRepl in cp.d2toX.get(sRemain[0:2], ()): - aSugg.update(self._suggest(sRepl + sRemain[2:], nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, True)) + aSugg.update(self._suggest(sRepl + sRemain[2:], nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, sRemain[0:2]+">"+sRepl, True)) # Hard replacements if nDeep > 3 and nMaxHardRepl and len(sRemain) >= 2: for nVal, kAddr in self._getArcs1(iAddr): if nVal in self.dCharVal: cChar = self.dCharVal[nVal] if cChar not in cp.d1to1.get(cCurrent, ""): - aSugg.update(self._suggest(sRemain[1:], nMaxDel, nMaxHardRepl-1, nDeep+1, kAddr, sNewWord+cChar, True)) + aSugg.update(self._suggest(sRemain[1:], nMaxDel, nMaxHardRepl-1, nDeep+1, kAddr, sNewWord+cChar, "[["+cChar+"]]", True)) # end of word if len(sRemain) == 2: for sRepl in cp.dFinal2.get(sRemain, ()): - aSugg.update(self._suggest(sRepl, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, True)) + aSugg.update(self._suggest(sRepl, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, sRemain + " >> " + sRepl, True)) elif len(sRemain) == 1: - aSugg.update(self._suggest("", nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, True)) # remove last char and go on + aSugg.update(self._suggest("", nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, sRemain + " [last char removed] ", True)) # remove last char and go on for sRepl in cp.dFinal1.get(sRemain, ()): - aSugg.update(self._suggest(sRepl, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, True)) + aSugg.update(self._suggest(sRepl, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, sRemain + " >> " + sRepl, True)) + return aSugg + + @timethis + def suggest2 (self, sWord, nMaxSugg=10): + "returns a set of suggestions for " + sPfx, sWord, sSfx = cp.cut(sWord) + nMaxDist = (len(sWord) // 3) + 1 + sCleanWord = cp.cleanWord(sWord) + aSugg = self._suggest2(sWord, sCleanWord, nMaxDist) + if sWord.istitle(): + aSugg.update(self._suggest2(sWord.lower(), sCleanWord, nMaxDist)) + aSugg = set(map(lambda sSugg: sSugg.title(), aSugg)) + elif sWord.islower(): + aSugg.update(self._suggest2(sWord.title(), sCleanWord, nMaxDist)) + aSugg = cp.filterSugg(aSugg) + aSugg = sorted(aSugg, key=lambda sSugg: cp.distanceSift4(sCleanWord, cp.cleanWord(sSugg)))[:nMaxSugg] + if sSfx or sPfx: + # we add what we removed + return list(map(lambda sSug: sPfx + sSug + sSfx, aSugg)) + return aSugg + + def _suggest2 (self, sWord, sCleanWord, nMaxDist, nDeep=0, iAddr=0, sNewWord=""): + #logging.info((nDeep * " ") + sNewWord) + if nDeep >= nMaxDist: + sCleanNewWord = cp.cleanWord(sNewWord) + if cp.distanceSift4(sCleanWord[:len(sCleanNewWord)], sCleanNewWord) > nMaxDist: + return set() + aSugg = set() + if int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask: + #logging.info((nDeep * " ") + "__" + sNewWord + "__") + sCleanNewWord = cp.cleanWord(sNewWord) + if cp.distanceSift4(sCleanWord, sCleanNewWord) <= nMaxDist: + aSugg.add(sNewWord) + for cChar, jAddr in self._getCharArcs(iAddr): + aSugg.update(self._suggest2(sWord, sCleanWord, nMaxDist, nDeep+1, jAddr, sNewWord+cChar)) return aSugg + def _getCharArcs (self, iAddr): + "generator: yield all chars and addresses from node at address " + for nVal, jAddr in self._getArcs(iAddr): + if nVal < self.nChar: + yield (self.dCharVal[nVal], jAddr) + def _getSimilarArcs (self, cChar, iAddr): "generator: yield similar char of and address of the following node" for c in cp.d1to1.get(cChar, [cChar]): if c in self.dChar: jAddr = self._lookupArcNode(self.dChar[c], iAddr)