Index: graphspell/char_player.py ================================================================== --- graphspell/char_player.py +++ graphspell/char_player.py @@ -3,37 +3,40 @@ useful for suggestion mechanism """ dDistanceBetweenChars = { - "a": {}, - "e": {"é": 0.5}, - "é": {"e": 0.5}, - "i": {"y": 0.2}, - "o": {}, - "u": {}, - "y": {"i": 0.3}, - "b": {"d": 0.8, "h": 0.9}, - "c": {"ç": 0.1, "k": 0.5, "q": 0.5, "s": 0.5, "x": 0.5, "z": 0.8}, - "d": {"b": 0.8}, - "f": {"v": 0.8}, - "g": {"j": 0.5}, - "h": {"b": 0.9}, - "j": {"g": 0.5, "i": 0.9}, - "k": {"c": 0.5, "q": 0.1, "x": 0.5}, - "l": {"i": 0.9}, - "m": {"n": 0.8}, - "n": {"m": 0.8, "r": 0.9}, - "p": {"q": 0.9}, - "q": {"c": 0.5, "k": 0.1, "p": 0.9}, - "r": {"n": 0.9, "j": 0.9}, - "s": {"c": 0.5, "ç": 0.1, "x": 0.5, "z": 0.5}, - "t": {"d": 0.9}, - "v": {"f": 0.8, "w": 0.1}, - "w": {"v": 0.1}, - "x": {"c": 0.5, "k": 0.5, "q": 0.5, "s": 0.5}, - "z": {"s": 0.5} + # dDistanceBetweenChars: + # - with Jaro-Winkler, values between 1 and 10 + # - with Damerau-Levenshtein, values / 10 (between 0 and 1: 0.1, 0.2 ... 0.9) + #"a": {}, + "e": {"é": 5}, + "é": {"e": 5}, + "i": {"y": 2}, + #"o": {}, + #"u": {}, + "y": {"i": 3}, + "b": {"d": 8, "h": 9}, + "c": {"ç": 1, "k": 5, "q": 5, "s": 5, "x": 5, "z": 8}, + "d": {"b": 8}, + "f": {"v": 8}, + "g": {"j": 5}, + "h": {"b": 9}, + "j": {"g": 5, "i": 9}, + "k": {"c": 5, "q": 1, "x": 5}, + "l": {"i": 9}, + "m": {"n": 8}, + "n": {"m": 8, "r": 9}, + "p": {"q": 9}, + "q": {"c": 5, "k": 1, "p": 9}, + "r": {"n": 9, "j": 9}, + "s": {"c": 5, "ç": 1, "x": 5, "z": 5}, + "t": {"d": 9}, + "v": {"f": 8, "w": 1}, + "w": {"v": 1}, + "x": {"c": 5, "k": 5, "q": 5, "s": 5}, + "z": {"s": 5} } def distanceBetweenChars (c1, c2): "returns a float between 0 and 1" @@ -315,12 +318,12 @@ # End of word dFinal1 = { "a": ("as", "at", "ant", "ah"), "A": ("AS", "AT", "ANT", "AH"), - "c": ("ch",), - "C": ("CH",), + "c": ("ch", "que"), + "C": ("CH", "QUE"), "e": ("et", "er", "ets", "ée", "ez", "ai", "ais", "ait", "ent", "eh"), "E": ("ET", "ER", "ETS", "ÉE", "EZ", "AI", "AIS", "AIT", "ENT", "EH"), "é": ("et", "er", "ets", "ée", "ez", "ai", "ais", "ait"), "É": ("ET", "ER", "ETS", "ÉE", "EZ", "AI", "AIS", "AIT"), "è": ("et", "er", "ets", "ée", "ez", "ai", "ais", "ait"), @@ -331,10 +334,12 @@ "Ë": ("ET", "ER", "ETS", "ÉE", "EZ", "AI", "AIS", "AIT"), "g": ("gh",), "G": ("GH",), "i": ("is", "it", "ie", "in"), "I": ("IS", "IT", "IE", "IN"), + "k": ("que",), + "K": ("QUE",), "n": ("nt", "nd", "ns", "nh"), "N": ("NT", "ND", "NS", "NH"), "o": ("aut", "ot", "os"), "O": ("AUT", "OT", "OS"), "ô": ("aut", "ot", "os"), Index: graphspell/ibdawg.py ================================================================== --- graphspell/ibdawg.py +++ graphspell/ibdawg.py @@ -13,10 +13,11 @@ import time import json import binascii import importlib from collections import OrderedDict +from math import floor #import logging #logging.basicConfig(filename="suggestions.log", level=logging.DEBUG) from . import str_transform as st @@ -38,63 +39,71 @@ class SuggResult: """Structure for storing, classifying and filtering suggestions""" - def __init__ (self, sWord, nDistLimit=-1): + def __init__ (self, sWord, nSuggLimit=10, nDistLimit=-1): self.sWord = sWord self.sSimplifiedWord = st.simplifyWord(sWord) self.nDistLimit = nDistLimit if nDistLimit >= 0 else (len(sWord) // 3) + 1 self.nMinDist = 1000 - self.aSugg = set() - self.dSugg = { 0: [], 1: [], 2: [] } - self.aAllSugg = set() # all found words even those refused + # Temporary sets + self.aAllSugg = set() # All suggestions, even the one rejected + self.dGoodSugg = {} # Acceptable suggestions + self.dBestSugg = {} # Best suggestions + # Parameters + self.nSuggLimit = nSuggLimit + self.nSuggLimitExt = nSuggLimit + 2 # we add few entries in case suggestions merge after casing modifications + self.nBestSuggLimit = floor(nSuggLimit * 1.5) # n times the requested limit + self.nGoodSuggLimit = nSuggLimit * 15 # n times the requested limit def addSugg (self, sSugg, nDeep=0): "add a suggestion" #logging.info((nDeep * " ") + "__" + sSugg + "__") if sSugg in self.aAllSugg: return self.aAllSugg.add(sSugg) - if sSugg not in self.aSugg: - #nDist = min(st.distanceDamerauLevenshtein(self.sWord, sSugg), st.distanceDamerauLevenshtein(self.sSimplifiedWord, st.simplifyWord(sSugg))) - nDist = int(st.distanceDamerauLevenshtein(self.sSimplifiedWord, st.simplifyWord(sSugg))) - #logging.info((nDeep * " ") + "__" + sSugg + "__ :" + self.sSimplifiedWord +"|"+ st.simplifyWord(sSugg) +" -> "+ str(nDist)) - if nDist <= self.nDistLimit: - if " " in sSugg: - nDist += 1 - if nDist not in self.dSugg: - self.dSugg[nDist] = [] - self.dSugg[nDist].append(sSugg) - self.aSugg.add(sSugg) - if nDist < self.nMinDist: - self.nMinDist = nDist - self.nDistLimit = min(self.nDistLimit, self.nMinDist+1) - - def getSuggestions (self, nSuggLimit=10): + nDistJaro = 1 - st.distanceJaroWinkler(self.sSimplifiedWord, st.simplifyWord(sSugg)) + nDist = floor(nDistJaro * 10) + if nDistJaro < .11: # Best suggestions + self.dBestSugg[sSugg] = round(nDistJaro*1000) + if len(self.dBestSugg) > self.nBestSuggLimit: + self.nDistLimit = -1 # make suggest() to end search + elif nDistJaro < .33: # Good suggestions + self.dGoodSugg[sSugg] = round(nDistJaro*1000) + if len(self.dGoodSugg) > self.nGoodSuggLimit: + self.nDistLimit = -1 # make suggest() to end search + else: + if nDist < self.nMinDist: + self.nMinDist = nDist + self.nDistLimit = min(self.nDistLimit, self.nMinDist) + if nDist <= self.nDistLimit: + if nDist < self.nMinDist: + self.nMinDist = nDist + self.nDistLimit = min(self.nDistLimit, self.nMinDist+1) + + def getSuggestions (self): "return a list of suggestions" # we sort the better results with the original word lRes = [] - bFirstListSorted = False - for nDist, lSugg in self.dSugg.items(): - if nDist > self.nDistLimit: - break - if not bFirstListSorted and len(lSugg) > 1: - lSugg.sort(key=lambda sSugg: st.distanceDamerauLevenshtein(self.sWord, sSugg)) - bFirstListSorted = True - #print(nDist, "|".join(lSugg)) - #for sSugg in lSugg: - # print(sSugg, st.distanceDamerauLevenshtein(self.sWord, sSugg)) - lRes.extend(lSugg) - if len(lRes) > nSuggLimit: - break + if len(self.dBestSugg) > 0: + # sort only with simplified words + lResTmp = sorted(self.dBestSugg.items(), key=lambda x: x[1]) + for i in range(min(self.nSuggLimitExt, len(lResTmp))): + lRes.append(lResTmp[i][0]) + if len(lRes) < self.nSuggLimitExt: + # sort with simplified words and original word + lResTmp = sorted(self.dGoodSugg.items(), key=lambda x: ((1-st.distanceJaroWinkler(self.sWord, x[0]))*10, x[1])) + for i in range(min(self.nSuggLimitExt, len(lResTmp))): + lRes.append(lResTmp[i][0]) + # casing if self.sWord.isupper(): lRes = list(OrderedDict.fromkeys(map(lambda sSugg: sSugg.upper(), lRes))) # use dict, when Python 3.6+ elif self.sWord[0:1].isupper(): # dont’ use <.istitle> lRes = list(OrderedDict.fromkeys(map(lambda sSugg: sSugg[0:1].upper()+sSugg[1:], lRes))) # use dict, when Python 3.6+ - return lRes[:nSuggLimit] + return lRes[:self.nSuggLimit] def reset (self): "clear data" self.aSugg.clear() self.dSugg.clear() @@ -320,16 +329,17 @@ sPfx, sWord, sSfx = self.lexicographer.split(sWord) nMaxSwitch = max(len(sWord) // 3, 1) nMaxDel = len(sWord) // 5 nMaxHardRepl = max((len(sWord) - 5) // 4, 1) nMaxJump = max(len(sWord) // 4, 1) - oSuggResult = SuggResult(sWord) + oSuggResult = SuggResult(sWord, nSuggLimit) + sWord = st.cleanWord(sWord) if bSplitTrailingNumbers: self._splitTrailingNumbers(oSuggResult, sWord) self._splitSuggest(oSuggResult, sWord) self._suggest(oSuggResult, sWord, nMaxSwitch, nMaxDel, nMaxHardRepl, nMaxJump) - aSugg = oSuggResult.getSuggestions(nSuggLimit) + aSugg = oSuggResult.getSuggestions() if self.lexicographer: aSugg = self.lexicographer.filterSugg(aSugg) if sSfx or sPfx: # we add what we removed return list(map(lambda sSug: sPfx + sSug + sSfx, aSugg)) Index: graphspell/str_transform.py ================================================================== --- graphspell/str_transform.py +++ graphspell/str_transform.py @@ -3,12 +3,13 @@ - calculate distance between two strings - transform strings with transformation codes """ import unicodedata +import re -from .char_player import distanceBetweenChars +from .char_player import distanceBetweenChars, dDistanceBetweenChars #### N-GRAMS def getNgrams (sWord, n=2): @@ -50,10 +51,15 @@ for i, c in enumerate(sWord, 1): if c != sWord[i:i+1] or (c == 'e' and sWord[i:i+2] != "ee"): # exception for to avoid confusion between crée / créai sNewWord += c return sNewWord.replace("eau", "o").replace("au", "o").replace("ai", "éi").replace("ei", "é").replace("ph", "f") + +def cleanWord (sWord): + "remove letters repeated more than 2 times" + return re.sub("(.)\\1{2,}", '\\1\\1', sWord) + _xTransNumbersToExponent = str.maketrans({ "0": "⁰", "1": "¹", "2": "²", "3": "³", "4": "⁴", "5": "⁵", "6": "⁶", "7": "⁷", "8": "⁸", "9": "⁹" }) @@ -104,10 +110,87 @@ ) if i and j and s1[i] == s2[j-1] and s1[i-1] == s2[j]: d[i, j] = min(d[i, j], d[i-2, j-2] + nCost) # Transposition return d[nLen1-1, nLen2-1] + +def distanceJaroWinkler (a, b, boost = .666): + # https://github.com/thsig/jaro-winkler-JS + #if (a == b): return 1.0 + a_len = len(a) + b_len = len(b) + nMax = max(a_len, b_len) + a_flag = [None for _ in range(nMax)] + b_flag = [None for _ in range(nMax)] + search_range = (max(a_len, b_len) // 2) - 1 + minv = min(a_len, b_len) + + # Looking only within the search range, count and flag the matched pairs. + Num_com = 0 + yl1 = b_len - 1 + for i in range(a_len): + lowlim = i - search_range if i >= search_range else 0 + hilim = i + search_range if (i + search_range) <= yl1 else yl1 + for j in range(lowlim, hilim+1): + if b_flag[j] != 1 and a[j:j+1] == b[i:i+1]: + a_flag[j] = 1 + b_flag[i] = 1 + Num_com += 1 + break + + # Return if no characters in common + if Num_com == 0: + return 0.0 + + # Count the number of transpositions + k = 0 + N_trans = 0 + for i in range(a_len): + if a_flag[i] == 1: + for j in range(k, b_len): + if b_flag[j] == 1: + k = j + 1 + break + if a[i] != b[j]: + N_trans += 1 + N_trans = N_trans // 2 + + # Adjust for similarities in nonmatched characters + N_simi = 0 + if minv > Num_com: + for i in range(a_len): + if not a_flag[i]: + for j in range(b_len): + if not b_flag[j]: + if a[i] in dDistanceBetweenChars and b[j] in dDistanceBetweenChars[a[i]]: + N_simi += dDistanceBetweenChars[a[i]][b[j]] + b_flag[j] = 2 + break + + Num_sim = (N_simi / 10.0) + Num_com + + # Main weight computation + weight = Num_sim / a_len + Num_sim / b_len + (Num_com - N_trans) / Num_com + weight = weight / 3 + + # Continue to boost the weight if the strings are similar + if weight > boost: + # Adjust for having up to the first 4 characters in common + j = 4 if minv >= 4 else minv + i = 0 + while i < j and a[i] == b[i]: + i += 1 + if i: + weight += i * 0.1 * (1.0 - weight) + # Adjust for long strings. + # After agreeing beginning chars, at least two more must agree + # and the agreeing characters must be more than half of the + # remaining characters. + if minv > 4 and Num_com > i + 1 and 2 * Num_com >= minv + i: + weight += (1 - weight) * ((Num_com - i - 1) / (a_len * b_len - i*2 + 2)) + return weight + def distanceSift4 (s1, s2, nMaxOffset=5): "implementation of general Sift4." # https://siderite.blogspot.com/2014/11/super-fast-and-accurate-string-distance.html if not s1: