Index: cli.py ================================================================== --- cli.py +++ cli.py @@ -208,10 +208,11 @@ echo(" {:<32} {}".format(sMorph, oLexGraphe.formatTags(sMorph))) elif sText.startswith("!"): for sWord in sText[1:].strip().split(): if sWord: echo(" | ".join(oDict.suggest(sWord))) + echo(" | ".join(oDict.suggest2(sWord))) elif sText.startswith(">"): oDict.drawPath(sText[1:].strip()) elif sText.startswith("="): for sRes in oDict.select(sText[1:].strip()): echo(sRes) Index: gc_core/py/char_player.py ================================================================== --- gc_core/py/char_player.py +++ gc_core/py/char_player.py @@ -16,63 +16,10 @@ def cleanWord (sWord): "word simplication before calculating distance between words" return sWord.lower().translate(_xTransChars).replace("eau", "o").replace("au", "o") - -def distanceDamerauLevenshtein (s1, s2): - "distance of Damerau-Levenshtein between and " - # https://fr.wikipedia.org/wiki/Distance_de_Damerau-Levenshtein - d = {} - nLen1 = len(s1) - nLen2 = len(s2) - for i in range(-1, nLen1+1): - d[i, -1] = i + 1 - for j in range(-1, nLen2+1): - d[-1, j] = j + 1 - for i in range(nLen1): - for j in range(nLen2): - nCost = 0 if s1[i] == s2[j] else 1 - d[i, j] = min( - d[i-1, j] + 1, # Deletion - d[i, j-1] + 1, # Insertion - d[i-1, j-1] + nCost, # Substitution - ) - if i and j and s1[i] == s2[j-1] and s1[i-1] == s2[j]: - d[i, j] = min(d[i, j], d[i-2, j-2] + nCost) # Transposition - return d[nLen1-1, nLen2-1] - - -def showDistance (s1, s2): - s1b = cleanWord(s1); - s2b = cleanWord(s2); - print(f"Distance: {s1} / {s2} = {distanceDamerauLevenshtein(s1, s2)}") - print(f"Distance: {s1b} / {s2b} = {distanceDamerauLevenshtein(s1b, s2b)}") - - -# Method: Remove Useless Chars - -_dVovels = { - 'a': '', 'e': '', 'i': '', 'o': '', 'u': '', 'y': '', - 'à': '', 'é': '', 'î': '', 'ô': '', 'û': '', 'ÿ': '', - 'â': '', 'è': '', 'ï': '', 'ö': '', 'ù': '', 'ŷ': '', - 'ä': '', 'ê': '', 'í': '', 'ó': '', 'ü': '', 'ý': '', - 'á': '', 'ë': '', 'ì': '', 'ò': '', 'ú': '', 'ỳ': '', - 'ā': '', 'ē': '', 'ī': '', 'ō': '', 'ū': '', 'ȳ': '', - 'h': '', 'œ': '', 'æ': '' - } - -_xTransVovels = str.maketrans(_dVovels) - - -aVovels = frozenset(_dVovels.keys()) - - -def shrinkWord (sWord): - "remove vovels and h" - return sWord[0:1].replace("h", "") + sWord[1:].translate(_xTransVovels) - # Similar chars d1to1 = { "1": "liîLIÎ", Index: gc_core/py/ibdawg.py ================================================================== --- gc_core/py/ibdawg.py +++ gc_core/py/ibdawg.py @@ -6,12 +6,12 @@ import re from itertools import chain from functools import wraps import time -#import logging -#logging.basicConfig(filename="suggestions.log", level=logging.DEBUG) +import logging +logging.basicConfig(filename="suggestions.log", level=logging.DEBUG) from . import str_transform as st from . import char_player as cp from .echo import echo @@ -25,10 +25,56 @@ fEnd = time.time() print(func.__name__, fEnd - fStart) return result return wrapper + +class SuggResult: + """Structure for storing, classifying and filtering suggestions""" + + def __init__ (self, sWord, nDistLimit=-1): + self.sWord = sWord + self.sCleanWord = cp.cleanWord(sWord) + self.nDistLimit = nDistLimit if nDistLimit >= 0 else (len(sWord) // 3) + 1 + self.nMinDist = 1000 + self.aSugg = set() + self.dSugg = { 0: [], 1: [] } + + def addSugg (self, sSugg, nDeep=0): + "add a suggestion" + #print(sSugg) + if sSugg not in self.aSugg: + nDist = st.distanceDamerauLevenshtein(self.sCleanWord, cp.cleanWord(sSugg)) + if nDist <= self.nDistLimit: + if nDist not in self.dSugg: + self.dSugg[nDist] = [] + self.dSugg[nDist].append(sSugg) + self.aSugg.add(sSugg) + #logging.info((nDeep * " ") + "__" + sSugg + "__") + if nDist < self.nMinDist: + self.nMinDist = nDist + self.nDistLimit = min(self.nDistLimit, self.nMinDist+2) + + def getSuggestions (self, nSuggLimit=10, nDistLimit=-1): + "return a list of suggestions" + lRes = [] + #if self.dSugg[0]: + # # we sort the better results with the original word + # self.dSugg[0].sort(key=lambda sSugg: cp.distanceDamerauLevenshtein(self.sWord, sSugg)) + for lSugg in self.dSugg.values(): + lRes.extend(lSugg) + if len(lRes) > nSuggLimit: + break + lRes = list(cp.filterSugg(lRes)) + if self.sWord.istitle(): + lRes = list(map(lambda sSugg: sSugg.title(), lRes)) + return lRes[:nSuggLimit] + + def reset (self): + self.aSugg.clear() + self.dSugg.clear() + class IBDAWG: """INDEXABLE BINARY DIRECT ACYCLIC WORD GRAPH""" def __init__ (self, sDicName): @@ -204,71 +250,98 @@ def suggest (self, sWord, nMaxSugg=10): "returns a set of suggestions for " sPfx, sWord, sSfx = cp.cut(sWord) nMaxDel = len(sWord) // 5 nMaxHardRepl = max((len(sWord) - 5) // 4, 1) - aSugg = self._suggest(sWord, nMaxDel=nMaxDel, nMaxHardRepl=nMaxHardRepl) + oSuggResult = SuggResult(sWord) + self._suggest(oSuggResult, sWord, nMaxDel=nMaxDel, nMaxHardRepl=nMaxHardRepl) if sWord.istitle(): - aSugg.update(self._suggest(sWord.lower(), nMaxDel=nMaxDel, nMaxHardRepl=nMaxHardRepl)) - aSugg = set(map(lambda sSugg: sSugg.title(), aSugg)) + self._suggest(oSuggResult, sWord.lower(), nMaxDel=nMaxDel, nMaxHardRepl=nMaxHardRepl) elif sWord.islower(): - aSugg.update(self._suggest(sWord.title(), nMaxDel=nMaxDel, nMaxHardRepl=nMaxHardRepl)) - aSugg = cp.filterSugg(aSugg) - sCleanWord = cp.cleanWord(sWord) - aSugg = sorted(aSugg, key=lambda sSugg: cp.distanceDamerauLevenshtein(sCleanWord, cp.cleanWord(sSugg)))[:nMaxSugg] + self._suggest(oSuggResult, sWord.title(), nMaxDel=nMaxDel, nMaxHardRepl=nMaxHardRepl) + aSugg = oSuggResult.getSuggestions() if sSfx or sPfx: # we add what we removed return list(map(lambda sSug: sPfx + sSug + sSfx, aSugg)) return aSugg - def _suggest (self, sRemain, nMaxDel=0, nMaxHardRepl=0, nDeep=0, iAddr=0, sNewWord="", bAvoidLoop=False): - "returns a set of suggestions" + def _suggest (self, oSuggResult, sRemain, nMaxDel=0, nMaxHardRepl=0, nDeep=0, iAddr=0, sNewWord="", sAction="", bAvoidLoop=False): # recursive function - #logging.info((nDeep * " ") + sNewWord + ":" + sRemain) - aSugg = set() + #logging.info((nDeep * " ") + sNewWord + ":" + sRemain + " · " + sAction) if not sRemain: if int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask: #logging.info((nDeep * " ") + "__" + sNewWord + "__") - aSugg.add(sNewWord) + oSuggResult.addSugg(sNewWord) for sTail in self._getTails(iAddr): #logging.info((nDeep * " ") + "__" + sNewWord+sTail + "__") - aSugg.add(sNewWord+sTail) - return aSugg + oSuggResult.addSugg(sNewWord+sTail) + return cCurrent = sRemain[0:1] for cChar, jAddr in self._getSimilarArcs(cCurrent, iAddr): - aSugg.update(self._suggest(sRemain[1:], nMaxDel, nMaxHardRepl, nDeep+1, jAddr, sNewWord+cChar)) + self._suggest(oSuggResult, sRemain[1:], nMaxDel, nMaxHardRepl, nDeep+1, jAddr, sNewWord+cChar, "*") if not bAvoidLoop: # avoid infinite loop if cCurrent == sRemain[1:2]: # same char, we remove 1 char without adding 1 to - aSugg.update(self._suggest(sRemain[1:], nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord)) + self._suggest(oSuggResult, sRemain[1:], nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, cCurrent+"/2") else: # switching chars - aSugg.update(self._suggest(sRemain[1:2]+sRemain[0:1]+sRemain[2:], nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, True)) + self._suggest(oSuggResult, sRemain[1:2]+sRemain[0:1]+sRemain[2:], nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, "><",True) # delete char if nMaxDel > 0: - aSugg.update(self._suggest(sRemain[1:], nMaxDel-1, nMaxHardRepl, nDeep+1, iAddr, sNewWord, True)) + self._suggest(oSuggResult, sRemain[1:], nMaxDel-1, nMaxHardRepl, nDeep+1, iAddr, sNewWord, "-"+cCurrent, True) # Phonetic replacements for sRepl in cp.d1toX.get(cCurrent, ()): - aSugg.update(self._suggest(sRepl + sRemain[1:], nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, True)) + self._suggest(oSuggResult, sRepl + sRemain[1:], nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, cCurrent+">"+sRepl, True) for sRepl in cp.d2toX.get(sRemain[0:2], ()): - aSugg.update(self._suggest(sRepl + sRemain[2:], nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, True)) + self._suggest(oSuggResult, sRepl + sRemain[2:], nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, sRemain[0:2]+">"+sRepl, True) # Hard replacements if nDeep > 3 and nMaxHardRepl and len(sRemain) >= 2: for nVal, kAddr in self._getArcs1(iAddr): if nVal in self.dCharVal: cChar = self.dCharVal[nVal] if cChar not in cp.d1to1.get(cCurrent, ""): - aSugg.update(self._suggest(sRemain[1:], nMaxDel, nMaxHardRepl-1, nDeep+1, kAddr, sNewWord+cChar, True)) + self._suggest(oSuggResult, sRemain[1:], nMaxDel, nMaxHardRepl-1, nDeep+1, kAddr, sNewWord+cChar, "[["+cChar+"]]", True) # end of word if len(sRemain) == 2: for sRepl in cp.dFinal2.get(sRemain, ()): - aSugg.update(self._suggest(sRepl, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, True)) + self._suggest(oSuggResult, sRepl, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, sRemain + " >> " + sRepl, True) elif len(sRemain) == 1: - aSugg.update(self._suggest("", nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, True)) # remove last char and go on + self._suggest(oSuggResult, "", nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, sRemain + " [last char removed] ", True) # remove last char and go o for sRepl in cp.dFinal1.get(sRemain, ()): - aSugg.update(self._suggest(sRepl, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, True)) + self._suggest(oSuggResult, sRepl, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, sRemain + " >> " + sRepl, True) + return + + @timethis + def suggest2 (self, sWord, nMaxSugg=10): + "returns a set of suggestions for " + sPfx, sWord, sSfx = cp.cut(sWord) + oSuggResult = SuggResult(sWord) + self._suggest2(oSuggResult) + aSugg = oSuggResult.getSuggestions() + if sSfx or sPfx: + # we add what we removed + return list(map(lambda sSug: sPfx + sSug + sSfx, aSugg)) return aSugg + + def _suggest2 (self, oSuggResult, nDeep=0, iAddr=0, sNewWord=""): + #logging.info((nDeep * " ") + sNewWord) + if nDeep >= oSuggResult.nDistLimit: + sCleanNewWord = cp.cleanWord(sNewWord) + if st.distanceSift4(oSuggResult.sCleanWord[:len(sCleanNewWord)], sCleanNewWord) > oSuggResult.nDistLimit: + return + if int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask: + #logging.info((nDeep * " ") + "__" + sNewWord + "__") + oSuggResult.addSugg(sNewWord, nDeep) + for cChar, jAddr in self._getCharArcs(iAddr): + self._suggest2(oSuggResult, nDeep+1, jAddr, sNewWord+cChar) + return + + def _getCharArcs (self, iAddr): + "generator: yield all chars and addresses from node at address " + for nVal, jAddr in self._getArcs(iAddr): + if nVal < self.nChar: + yield (self.dCharVal[nVal], jAddr) def _getSimilarArcs (self, cChar, iAddr): "generator: yield similar char of and address of the following node" for c in cp.d1to1.get(cChar, [cChar]): if c in self.dChar: Index: gc_core/py/str_transform.py ================================================================== --- gc_core/py/str_transform.py +++ gc_core/py/str_transform.py @@ -1,7 +1,120 @@ #!python3 + +#### DISTANCE CALCULATIONS + +def longestCommonSubstring (s1, s2): + # http://en.wikipedia.org/wiki/Longest_common_substring_problem + # http://en.wikibooks.org/wiki/Algorithm_implementation/Strings/Longest_common_substring + M = [ [0]*(1+len(s2)) for i in range(1+len(s1)) ] + longest, x_longest = 0, 0 + for x in range(1, 1+len(s1)): + for y in range(1, 1+len(s2)): + if s1[x-1] == s2[y-1]: + M[x][y] = M[x-1][y-1] + 1 + if M[x][y] > longest: + longest = M[x][y] + x_longest = x + else: + M[x][y] = 0 + return s1[x_longest-longest : x_longest] + + +def distanceDamerauLevenshtein (s1, s2): + "distance of Damerau-Levenshtein between and " + # https://fr.wikipedia.org/wiki/Distance_de_Damerau-Levenshtein + d = {} + nLen1 = len(s1) + nLen2 = len(s2) + for i in range(-1, nLen1+1): + d[i, -1] = i + 1 + for j in range(-1, nLen2+1): + d[-1, j] = j + 1 + for i in range(nLen1): + for j in range(nLen2): + nCost = 0 if s1[i] == s2[j] else 1 + d[i, j] = min( + d[i-1, j] + 1, # Deletion + d[i, j-1] + 1, # Insertion + d[i-1, j-1] + nCost, # Substitution + ) + if i and j and s1[i] == s2[j-1] and s1[i-1] == s2[j]: + d[i, j] = min(d[i, j], d[i-2, j-2] + nCost) # Transposition + return d[nLen1-1, nLen2-1] + + +def distanceSift4 (s1, s2, nMaxOffset=5): + "implementation of general Sift4." + # https://siderite.blogspot.com/2014/11/super-fast-and-accurate-string-distance.html + if not s1: + return len(s2) + if not s2: + return len(s1) + nLen1, nLen2 = len(s1), len(s2) + i1, i2 = 0, 0 # Cursors for each string + nLargestCS = 0 # Largest common substring + nLocalCS = 0 # Local common substring + nTrans = 0 # Number of transpositions ('ab' vs 'ba') + lOffset = [] # Offset pair array, for computing the transpositions + + while i1 < nLen1 and i2 < nLen2: + if s1[i1] == s2[i2]: + nLocalCS += 1 + # Check if current match is a transposition + bTrans = False + i = 0 + while i < len(lOffset): + t = lOffset[i] + if i1 <= t[0] or i2 <= t[1]: + bTrans = abs(i2-i1) >= abs(t[1] - t[0]) + if bTrans: + nTrans += 1 + elif not t[2]: + t[2] = True + nTrans += 1 + break + elif i1 > t[1] and i2 > t[0]: + del lOffset[i] + else: + i += 1 + lOffset.append([i1, i2, bTrans]) + else: + nLargestCS += nLocalCS + nLocalCS = 0 + if i1 != i2: + i1 = i2 = min(i1, i2) + for i in range(nMaxOffset): + if i1 + i >= nLen1 and i2 + i >= nLen2: + break + elif i1 + i < nLen1 and s1[i1+i] == s2[i2]: + i1 += i - 1 + i2 -= 1 + break + elif i2 + i < nLen2 and s1[i1] == s2[i2+i]: + i2 += i - 1 + i1 -= 1 + break + i1 += 1 + i2 += 1 + if i1 >= nLen1 or i2 >= nLen2: + nLargestCS += nLocalCS + nLocalCS = 0 + i1 = i2 = min(i1, i2) + nLargestCS += nLocalCS + return round(max(nLen1, nLen2) - nLargestCS + nTrans) + + +def showDistance (s1, s2): + print(f"Damerau-Levenshtein: {s1} / {s2} = {distanceDamerauLevenshtein(s1, s2)}") + print(f"Sift4: {s1} / {s2} = {distanceSift4(s1, s2)}") + + + + +#### STEMMING OPERATIONS + ## No stemming def noStemming (sFlex, sStem): return sStem @@ -38,18 +151,20 @@ for i in range(min(len(sFlex), len(sStem))): if sFlex[i] != sStem[i]: break jSfx += 1 return chr(len(sFlex)-jSfx+48) + sStem[jSfx:] + def changeWordWithSuffixCode (sWord, sSfxCode): if sSfxCode == "0": return sWord return sWord[:-(ord(sSfxCode[0])-48)] + sSfxCode[1:] if sSfxCode[0] != '0' else sWord + sSfxCode[1:] # Prefix and suffix + def defineAffixCode (sFlex, sStem): """ Returns a string defining how to get stem from flexion. Examples: "0" if stem = flexion "stem" if no common substring "n(pfx)/m(sfx)" @@ -74,25 +189,10 @@ sAff = "{}/".format(chr(n+48)) if not sPfx else "{}{}/".format(chr(n+48), sPfx) sAff += chr(m+48) if not sSfx else "{}{}".format(chr(m+48), sSfx) return sAff return sStem -def longestCommonSubstring (s1, s2): - # http://en.wikipedia.org/wiki/Longest_common_substring_problem - # http://en.wikibooks.org/wiki/Algorithm_implementation/Strings/Longest_common_substring - M = [ [0]*(1+len(s2)) for i in range(1+len(s1)) ] - longest, x_longest = 0, 0 - for x in range(1, 1+len(s1)): - for y in range(1, 1+len(s2)): - if s1[x-1] == s2[y-1]: - M[x][y] = M[x-1][y-1] + 1 - if M[x][y] > longest: - longest = M[x][y] - x_longest = x - else: - M[x][y] = 0 - return s1[x_longest-longest : x_longest] def changeWordWithAffixCode (sWord, sAffCode): if sAffCode == "0": return sWord if '/' not in sAffCode: