Index: grammalecte-cli.py ================================================================== --- grammalecte-cli.py +++ grammalecte-cli.py @@ -272,25 +272,33 @@ # pseudo-console sInputText = "\n~==========~ Enter your text [/h /q] ~==========~\n" sText = _getText(sInputText) while True: if sText.startswith("?"): + # morphologies for sWord in sText[1:].strip().split(): if sWord: echo("* " + sWord) for sElem, aRes in oSpellChecker.analyze(sWord): echo(" - " + sElem) for sMorph, sMeaning in aRes: echo(" {:<40} {}".format(sMorph, sMeaning)) elif sText.startswith("!"): + # spelling suggestions for sWord in sText[1:].strip().split(): if sWord: for lSugg in oSpellChecker.suggest(sWord): echo(" | ".join(lSugg)) + if xArgs.debug: + for sSugg in lSugg: + nDist = strt.distanceDamerauLevenshteinX(sWord, sSugg) + echo(f"{sSugg:30} {nDist}") elif sText.startswith(">"): + # path in the word graph oSpellChecker.drawPath(sText[1:].strip()) elif sText.startswith("="): + # search in dictionnary sSearch = sText[1:].strip() if "=" in sSearch: nCut = sSearch.find("=") sFlexPattern = sSearch[0:nCut] sTagsPattern = sSearch[nCut+1:] @@ -298,14 +306,14 @@ sFlexPattern = sSearch sTagsPattern = "" for aRes in oSpellChecker.select(sFlexPattern, sTagsPattern): echo("{:<30} {:<30} {}".format(*aRes)) elif sText.startswith("≠"): + # distances calculation lWords = sText[1:].split("|") for s1, s2 in itertools.combinations(lWords, 2): - nDist = strt.distanceDamerauLevenshtein(s1, s2) - print(f"{s1} ≠ {s2}: {nDist}") + strt.showDistance(s1, s2) elif sText.startswith("/o+ "): oGrammarChecker.gce.setOptions({ opt:True for opt in sText[3:].strip().split() if opt in oGrammarChecker.gce.getOptions() }) echo("done") elif sText.startswith("/o- "): oGrammarChecker.gce.setOptions({ opt:False for opt in sText[3:].strip().split() if opt in oGrammarChecker.gce.getOptions() }) Index: graphspell/char_player.py ================================================================== --- graphspell/char_player.py +++ graphspell/char_player.py @@ -1,42 +1,72 @@ """ List of similar chars useful for suggestion mechanism """ - dDistanceBetweenChars = { - # dDistanceBetweenChars: + # - with Damerau-Levenshtein, values / 10 (between 0 and 1: 0.1, 0.2 ... 0.9) # - with Jaro-Winkler, values between 1 and 10 - # - with Damerau-Levenshtein, values / 10 (between 0 and 1: 0.1, 0.2 ... 0.9) - #"a": {}, - "e": {"é": 5}, - "é": {"e": 5}, - "i": {"y": 2}, - #"o": {}, - #"u": {}, - "y": {"i": 3}, - "b": {"d": 8, "h": 9}, - "c": {"ç": 1, "k": 5, "q": 5, "s": 5, "x": 5, "z": 8}, - "d": {"b": 8}, - "f": {"v": 8}, - "g": {"j": 5}, - "h": {"b": 9}, - "j": {"g": 5, "i": 9}, - "k": {"c": 5, "q": 1, "x": 5}, - "l": {"i": 9}, - "m": {"n": 8}, - "n": {"m": 8, "r": 9}, - "p": {"q": 9}, - "q": {"c": 5, "k": 1, "p": 9}, - "r": {"n": 9, "j": 9}, - "s": {"c": 5, "ç": 1, "x": 5, "z": 5}, - "t": {"d": 9}, - "v": {"f": 8, "w": 1}, - "w": {"v": 1}, - "x": {"c": 5, "k": 5, "q": 5, "s": 5}, - "z": {"s": 5} + # voyelles + "a": { "a": 0, "á": .1, "à": .1, "â": .1, "ã": .1 }, + "á": { "a": .1, "á": 0, "à": .1, "â": .1, "ã": .1 }, + "à": { "a": .1, "á": .1, "à": 0, "â": .1, "ã": .1 }, + "â": { "a": .1, "á": .1, "à": .1, "â": 0, "ã": .1 }, + "ã": { "a": .1, "á": .1, "à": .1, "â": .1, "ã": 0 }, + + "e": { "e": 0, "é": .1, "è": .1, "ê": .1, "ẽ": .1 }, + "é": { "e": .1, "é": 0, "è": .1, "ê": .1, "ẽ": .1 }, + "è": { "e": .1, "é": .1, "è": 0, "ê": .1, "ẽ": .1 }, + "ê": { "e": .1, "é": .1, "è": .1, "ê": 0, "ẽ": .1 }, + "ẽ": { "e": .1, "é": .1, "è": .1, "ê": .1, "ẽ": 0 }, + + "i": { "i": 0, "í": .1, "ì": .1, "î": .1, "ĩ": .1 }, + "í": { "i": .1, "í": 0, "ì": .1, "î": .1, "ĩ": .1 }, + "ì": { "i": .1, "í": .1, "ì": 0, "î": .1, "ĩ": .1 }, + "î": { "i": .1, "í": .1, "ì": .1, "î": 0, "ĩ": .1 }, + "ĩ": { "i": .1, "í": .1, "ì": .1, "î": .1, "ĩ": 0 }, + + "o": { "o": 0, "ó": .1, "ò": .1, "ô": .1, "õ": .1 }, + "ó": { "o": .1, "ó": 0, "ò": .1, "ô": .1, "õ": .1 }, + "ò": { "o": .1, "ó": .1, "ò": 0, "ô": .1, "õ": .1 }, + "ô": { "o": .1, "ó": .1, "ò": .1, "ô": 0, "õ": .1 }, + "õ": { "o": .1, "ó": .1, "ò": .1, "ô": .1, "õ": 0 }, + + "u": { "u": 0, "ú": .1, "ù": .1, "û": .1, "ũ": .1 }, + "ú": { "u": .1, "ú": 0, "ù": .1, "û": .1, "ũ": .1 }, + "ù": { "u": .1, "ú": .1, "ù": 0, "û": .1, "ũ": .1 }, + "û": { "u": .1, "ú": .1, "ù": .1, "û": 0, "ũ": .1 }, + "ũ": { "u": .1, "ú": .1, "ù": .1, "û": .1, "ũ": 0 }, + + "y": { "y": 0, "ý": .1, "ỳ": .1, "ŷ": .1, "ỹ": .1 }, + "ý": { "y": .1, "ý": 0, "ỳ": .1, "ŷ": .1, "ỹ": .1 }, + "ỳ": { "y": .1, "ý": .1, "ỳ": 0, "ŷ": .1, "ỹ": .1 }, + "ŷ": { "y": .1, "ý": .1, "ỳ": .1, "ŷ": 0, "ỹ": .1 }, + "ỹ": { "y": .1, "ý": .1, "ỳ": .1, "ŷ": .1, "ỹ": 0 }, + + ## consonnes + "b": { "b": 0, "d": .8, "h": .9 }, + "c": { "c": 0, "ç": .1, "k": .5, "q": .5, "s": .5, "x": .5, "z": .8 }, + "ç": { "c": .1, "ç": 0, "k": .5, "q": .5, "s": .5, "x": .5, "z": .8 }, + "d": { "d": 0, "b": .8 }, + "f": { "f": 0, "v": .8 }, + "g": { "g": 0, "j": .5, "q": .8 }, + "h": { "h": 0, "b": .9 }, + "j": { "j": 0, "g": .5, "i": .8 }, + "k": { "k": 0, "c": .5, "q": .1, "x": .5 }, + "l": { "l": 0, "i": .8 }, + "m": { "m": 0, "n": .6 }, + "n": { "n": 0, "ñ": .1, "m": .6, "r": .8 }, + "p": { "p": 0, "q": .8 }, + "q": { "q": 0, "c": .5, "k": .1, "p": .8, "g": .8 }, + "r": { "r": 0, "n": .8, "j": .9 }, + "s": { "s": 0, "c": .5, "ç": .1, "x": .5, "z": .5 }, + "t": { "t": 0, "d": .9 }, + "v": { "v": 0, "f": .8, "w": .2 }, + "w": { "w": 0, "v": .2 }, + "x": { "x": 0, "c": .5, "k": .5, "q": .5, "s": .5 }, + "z": { "z": 0, "s": .5 } } def distanceBetweenChars (c1, c2): "returns a float between 0 and 1" Index: graphspell/ibdawg.py ================================================================== --- graphspell/ibdawg.py +++ graphspell/ibdawg.py @@ -60,23 +60,24 @@ nSimDist = st.distanceSift4(self.sSimplifiedWord, st.simplifyWord(sSugg)) #st.showDistance(self.sSimplifiedWord, st.simplifyWord(sSugg)) if nSimDist < self.nMinDist: self.nMinDist = nSimDist if nSimDist <= (self.nMinDist + 1): - nDist = min(st.distanceDamerauLevenshtein(self.sWord, sSugg), st.distanceDamerauLevenshtein(self.sSimplifiedWord, st.simplifyWord(sSugg))) + nDist = min(st.distanceDamerauLevenshteinX(self.sWord, sSugg), st.distanceDamerauLevenshteinX(self.sSimplifiedWord, st.simplifyWord(sSugg))) #print(">", end="") #st.showDistance(self.sWord, sSugg) - self.dAccSugg[sSugg] = min(nDist, nSimDist+1) + self.dAccSugg[sSugg] = nDist if " " not in sSugg else nDist+1 if len(self.dAccSugg) > self.nTempSuggLimit: self.nDistLimit = -1 # suggest() ends searching when this variable = -1 self.nDistLimit = min(self.nDistLimit, self.nMinDist+1) def getSuggestions (self): "return a list of suggestions" # sort according to distance lRes = [] lResTmp = sorted(self.dAccSugg.items(), key=lambda x: (x[1], x[0])) + #print("\n>", lResTmp) for i in range(min(self.nSuggLimit, len(lResTmp))): lRes.append(lResTmp[i][0]) #st.showDistance(self.sWord, lResTmp[i][0]) # casing if self.sWord.isupper(): Index: graphspell/str_transform.py ================================================================== --- graphspell/str_transform.py +++ graphspell/str_transform.py @@ -8,10 +8,26 @@ import re from .char_player import distanceBetweenChars, dDistanceBetweenChars from .echo import echo + +# import time +# from functools import wraps +# +# def timethis (func): +# "decorator for the execution time" +# @wraps(func) +# def wrapper (*args, **kwargs): +# "something to prevent pylint whining" +# fStart = time.perf_counter_ns() +# result = func(*args, **kwargs) +# fEnd = time.perf_counter_ns() +# print(func.__name__, fEnd - fStart) +# return result +# return wrapper + #### N-GRAMS def getNgrams (sWord, n=2): "return a list of Ngrams strings" @@ -89,10 +105,80 @@ else: lMatrix[x][y] = 0 return s1[nLongestX-nLongest : nLongestX] +llMatrix = [ + [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41 ], + [ 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ], + [ 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ], + [ 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ], + [ 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ], + [ 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ], + [ 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ], + [ 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ], + [ 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ], + [ 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ], + [ 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ], + [ 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ], + [ 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ], + [ 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ], + [ 14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ], + [ 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ], + [ 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ], + [ 17, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ], + [ 18, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ], + [ 19, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ], + [ 20, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ], + [ 21, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ], + [ 22, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ], + [ 23, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ], + [ 24, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ], + [ 25, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ], + [ 26, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ], + [ 27, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ], + [ 28, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ], + [ 29, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ], + [ 30, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ], + [ 31, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ], + [ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ], + [ 33, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ], + [ 34, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ], + [ 35, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ], + [ 36, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ], + [ 37, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ], + [ 38, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ], + [ 39, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ], + [ 40, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ], + [ 41, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ] +] + +#@timethis +def distanceDamerauLevenshteinX (s1, s2): + "distance of Damerau-Levenshtein between and " + # https://fr.wikipedia.org/wiki/Distance_de_Damerau-Levenshtein + if s1 == s2: + return 0 + nLen1 = len(s1) + nLen2 = len(s2) + if nLen1 > 40 or nLen2 > 40: + return distanceDamerauLevenshtein(s1, s2) + for i in range(1, nLen1+1): + for j in range(1, nLen2+1): + #nCost = 0 if s1[i-1] == s2[j-1] else 1 + nCost = distanceBetweenChars(s1[i-1], s2[j-1]) + llMatrix[i][j] = min( + llMatrix[i-1][j] + 1, # Deletion + llMatrix[i][j-1] + 1, # Insertion + llMatrix[i-1][j-1] + nCost, # Substitution + ) + if i > 1 and j > 1 and s1[i-1] == s2[j-2] and s1[i-2] == s2[j-1]: + llMatrix[i][j] = min(llMatrix[i][j], llMatrix[i-2][j-2] + nCost) # Transposition + #_showLLMatrix(s1, s2) + return llMatrix[nLen1][nLen2] + +#@timethis def distanceDamerauLevenshtein (s1, s2): "distance of Damerau-Levenshtein between and " # https://fr.wikipedia.org/wiki/Distance_de_Damerau-Levenshtein d = {} nLen1 = len(s1) @@ -101,26 +187,59 @@ d[i, -1] = i + 1 for j in range(-1, nLen2+1): d[-1, j] = j + 1 for i in range(nLen1): for j in range(nLen2): - nCost = 0 if s1[i] == s2[j] else 1 #nCost = distanceBetweenChars(s1[i], s2[j]) + # We don’t use distanceBetweenChars to speed up the calc, + # as we use this function only for very long words. + nCost = 0 if s1[i] == s2[j] else 1 d[i, j] = min( d[i-1, j] + 1, # Deletion d[i, j-1] + 1, # Insertion - d[i-1, j-1] + nCost, # Substitution + d[i-1, j-1] + nCost # Substitution ) if i and j and s1[i] == s2[j-1] and s1[i-1] == s2[j]: d[i, j] = min(d[i, j], d[i-2, j-2] + nCost) # Transposition + #_showMatrix(s1, s2, d) return d[nLen1-1, nLen2-1] +def _showMatrix (s1, s2, dMatrix): + "display the matrix for Damerau-Levenshtein distance calculation" + nLen1 = len(s1) + nLen2 = len(s2) + print("\n ", end="") + for j in range(-1, nLen2+1): + print(f"{s2[j:j+1]:>4}", end="") + print("") + for i in range(-1, nLen1+1): + print(f"{s1[i:i+1]:2}", end="") + for j in range(-1, nLen2+1): + print(f"{dMatrix.get((i, j),' ·'):4}", end="") + print("") + +def _showLLMatrix (s1, s2): + "display the matrix for fast Damerau-Levenshtein distance calculation" + nLen1 = len(s1) + nLen2 = len(s2) + print("\n ", end="") + for j in range(-1, nLen2+1): + print(f"{s2[j:j+1]:>4}", end="") + print("") + for i in range(0, nLen1+2): + print(f"{s1[i-1:i]:2}", end="") + for v in llMatrix[i][:nLen2+2]: + print(f"{v:4}", end="") + print("") + +#@timethis def distanceJaroWinkler (sWord1, sWord2, fBoost = .666): "distance of Jaro-Winkler between and , returns a float" # https://github.com/thsig/jaro-winkler-JS - #if (sWord1 == sWord2): return 1.0 + if (sWord1 == sWord2): + return 1.0 nLen1 = len(sWord1) nLen2 = len(sWord2) nMax = max(nLen1, nLen2) aFlags1 = [ None for _ in range(nMax) ] aFlags2 = [ None for _ in range(nMax) ] @@ -191,10 +310,11 @@ if nMinLen > 4 and nCommon > i + 1 and 2 * nCommon >= nMinLen + i: fWeight += (1 - fWeight) * ((nCommon - i - 1) / (nLen1 * nLen2 - i*2 + 2)) return fWeight +#@timethis def distanceSift4 (s1, s2, nMaxOffset=5): "implementation of general Sift4." # faster than Damerau-Levenshtein and Jaro-Winkler # https://siderite.dev/blog/super-fast-and-accurate-string-distance.html if not s1: @@ -254,15 +374,16 @@ nLargestCS += nLocalCS return round(max(nLen1, nLen2) - nLargestCS + nTrans) def showDistance (s1, s2): - "display Damerau-Levenshtein distance and Sift4 distance between and " + "display distances between and " nDL = distanceDamerauLevenshtein(s1, s2) + fDLX = float(distanceDamerauLevenshteinX(s1, s2)) nS4 = distanceSift4(s1, s2) fJW = distanceJaroWinkler(s1, s2) - echo(f"{s1:22} ≠ {s2:22} \tDL: {nDL}\tS4: {nS4}\tJW: {fJW}") + echo(f"{s1:22} ≠ {s2:22} \tDL: {nDL:2}\tDLX: {fDLX:2.2}\tS4: {nS4:2}\tJW: {fJW}") #### STEMMING OPERATIONS