Index: gc_core/py/char_player.py
==================================================================
--- gc_core/py/char_player.py
+++ gc_core/py/char_player.py
@@ -16,124 +16,10 @@
 
 def cleanWord (sWord):
     "word simplification before calculating distance between words"
     return sWord.lower().translate(_xTransChars).replace("eau", "o").replace("au", "o")
 
-
-def distanceDamerauLevenshtein (s1, s2):
-    "distance of Damerau-Levenshtein between <s1> and <s2>"
-    # https://fr.wikipedia.org/wiki/Distance_de_Damerau-Levenshtein
-    d = {}
-    nLen1 = len(s1)
-    nLen2 = len(s2)
-    for i in range(-1, nLen1+1):
-        d[i, -1] = i + 1
-    for j in range(-1, nLen2+1):
-        d[-1, j] = j + 1
-    for i in range(nLen1):
-        for j in range(nLen2):
-            nCost = 0 if s1[i] == s2[j] else 1
-            d[i, j] = min(
-                d[i-1, j] + 1,        # Deletion
-                d[i, j-1] + 1,        # Insertion
-                d[i-1, j-1] + nCost,  # Substitution
-            )
-            if i and j and s1[i] == s2[j-1] and s1[i-1] == s2[j]:
-                d[i, j] = min(d[i, j], d[i-2, j-2] + nCost)   # Transposition
-    return d[nLen1-1, nLen2-1]
-
-
-def showDistance (s1, s2):
-    s1b = cleanWord(s1);
-    s2b = cleanWord(s2);
-    print(f"Distance: {s1} / {s2} = {distanceDamerauLevenshtein(s1, s2)}")
-    print(f"Distance: {s1b} / {s2b} = {distanceDamerauLevenshtein(s1b, s2b)}")
-
-
-def distanceSift4 (s1, s2, nMaxOffset=5):
-    "implementation of general Sift4."
-    # https://siderite.blogspot.com/2014/11/super-fast-and-accurate-string-distance.html
-    if not s1:
-        return len(s2)
-    if not s2:
-        return len(s1)
-    nLen1, nLen2 = len(s1), len(s2)
-    i1, i2 = 0, 0     # Cursors for each string
-    nLargestCS = 0    # Largest common substring
-    nLocalCS = 0      # Local common substring
-    nTrans = 0        # Number of transpositions ('ab' vs 'ba')
-    lOffset = []      # Offset pair array, for computing the transpositions
-
-    while i1 < nLen1 and i2 < nLen2:
-        if s1[i1] == s2[i2]:
-            nLocalCS += 1
-            # Check if current match is a transposition
-            bTrans = False
-            i = 0
-            while i < len(lOffset):
-                t = lOffset[i]
-                if i1 <= t[0] or i2 <= t[1]:
-                    bTrans = abs(i2-i1) >= abs(t[1] - t[0])
-                    if bTrans:
-                        nTrans += 1
-                    elif not t[2]:
-                        t[2] = True
-                        nTrans += 1
-                    break
-                elif i1 > t[1] and i2 > t[0]:
-                    del lOffset[i]
-                else:
-                    i += 1
-            lOffset.append([i1, i2, bTrans])
-        else:
-            nLargestCS += nLocalCS
-            nLocalCS = 0
-            if i1 != i2:
-                i1 = i2 = min(i1, i2)
-            for i in range(nMaxOffset):
-                if i1 + i >= nLen1 and i2 + i >= nLen2:
-                    break
-                elif i1 + i < nLen1 and s1[i1+i] == s2[i2]:
-                    i1 += i - 1
-                    i2 -= 1
-                    break
-                elif i2 + i < nLen2 and s1[i1] == s2[i2+i]:
-                    i2 += i - 1
-                    i1 -= 1
-                    break
-        i1 += 1
-        i2 += 1
-        if i1 >= nLen1 or i2 >= nLen2:
-            nLargestCS += nLocalCS
-            nLocalCS = 0
-            i1 = i2 = min(i1, i2)
-    nLargestCS += nLocalCS
-    return round(max(nLen1, nLen2) - nLargestCS + nTrans)
-
-
-# Method: Remove Useless Chars
-
-_dVovels = {
-    'a': '', 'e': '', 'i': '', 'o': '', 'u': '', 'y': '',
-    'à': '', 'é': '', 'î': '', 'ô': '', 'û': '', 'ÿ': '',
-    'â': '', 'è': '', 'ï': '', 'ö': '', 'ù': '', 'ŷ': '',
-    'ä': '', 'ê': '', 'í': '', 'ó': '', 'ü': '', 'ý': '',
-    'á': '', 'ë': '', 'ì': '', 'ò': '', 'ú': '', 'ỳ': '',
-    'ā': '', 'ē': '', 'ī': '', 'ō': '', 'ū': '', 'ȳ': '',
-    'h': '', 'œ': '', 'æ': ''
-    }
-
-_xTransVovels = str.maketrans(_dVovels)
-
-
-aVovels = frozenset(_dVovels.keys())
-
-
-def shrinkWord (sWord):
-    "remove vovels and h"
-    return sWord[0:1].replace("h", "") + sWord[1:].translate(_xTransVovels)
-
 
 # Similar chars
 
 d1to1 = {
     "1": "liîLIÎ",
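
Note: cleanWord() is the normalization step every distance computation in this patch goes through: it lowercases, folds characters through the _xTransChars table (defined earlier in char_player.py, outside this hunk) and reduces "eau"/"au" to "o". A minimal sketch of the idea, with a made-up translation table standing in for the real _xTransChars:

    # Sketch only: _xDemoTrans is a hypothetical stand-in for _xTransChars.
    _xDemoTrans = str.maketrans({ 'à': 'a', 'â': 'a', 'é': 'e', 'è': 'e', 'ê': 'e', 'î': 'i', 'ô': 'o', 'û': 'u', 'ç': 'c' })

    def cleanWordDemo (sWord):
        "lowercase, fold some diacritics, reduce eau/au to o"
        return sWord.lower().translate(_xDemoTrans).replace("eau", "o").replace("au", "o")

    print(cleanWordDemo("Château"))   # -> "chato"
    print(cleanWordDemo("Bateau"))    # -> "bato"
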
Index: gc_core/py/ibdawg.py
==================================================================
--- gc_core/py/ibdawg.py
+++ gc_core/py/ibdawg.py
@@ -25,10 +25,49 @@
         fEnd = time.time()
         print(func.__name__, fEnd - fStart)
         return result
     return wrapper
 
+
+class SuggResult:
+
+    def __init__ (self, sWord, nDistLimit=-1):
+        self.sWord = sWord
+        self.sCleanWord = cp.cleanWord(sWord)
+        self.nDistLimit = nDistLimit if nDistLimit >= 0 else (len(sWord) // 3) + 1
+        self.nMinDist = 1000
+        self.nMaxDist = 0
+        self.aSugg = set()
+        self.dSugg = { 0: [], 1: [], 2: [] }
+
+    def addSugg (self, sSugg, nDeep=0):
+        "add a suggestion"
+        if sSugg not in self.aSugg:
+            nDist = st.distanceSift4(self.sCleanWord, cp.cleanWord(sSugg))
+            if nDist <= self.nDistLimit:
+                if nDist not in self.dSugg:
+                    self.dSugg[nDist] = []
+                self.dSugg[nDist].append(sSugg)
+                logging.info((nDeep * "  ") + "__" + sSugg + "__")
+                if nDist > self.nMaxDist:
+                    self.nMaxDist = nDist
+                if nDist < self.nMinDist:
+                    self.nMinDist = nDist
+                self.nDistLimit = min(self.nDistLimit, self.nMinDist+2)
+
+    def getSuggestions (self, nSuggLimit=10, nDistLimit=-1):
+        "return a list of suggestions"
+        lRes = []
+        for nDist, lSugg in sorted(self.dSugg.items()):   # smallest distances first (buckets beyond 2 may be created out of order)
+            lRes.extend(lSugg)
+            if len(lRes) > nSuggLimit:
+                break
+        lRes = list(cp.filterSugg(lRes))
+        if self.sWord.istitle():
+            lRes = list(map(lambda sSugg: sSugg.title(), lRes))
+        return lRes[:nSuggLimit]
+
 
 class IBDAWG:
     """INDEXABLE BINARY DIRECT ACYCLIC WORD GRAPH"""
 
     def __init__ (self, sDicName):
@@ -212,11 +251,11 @@
             aSugg = set(map(lambda sSugg: sSugg.title(), aSugg))
         elif sWord.islower():
             aSugg.update(self._suggest(sWord.title(), nMaxDel=nMaxDel, nMaxHardRepl=nMaxHardRepl))
         aSugg = cp.filterSugg(aSugg)
         sCleanWord = cp.cleanWord(sWord)
-        aSugg = sorted(aSugg, key=lambda sSugg: cp.distanceDamerauLevenshtein(sCleanWord, cp.cleanWord(sSugg)))[:nMaxSugg]
+        aSugg = sorted(aSugg, key=lambda sSugg: st.distanceDamerauLevenshtein(sCleanWord, cp.cleanWord(sSugg)))[:nMaxSugg]
         if sSfx or sPfx:
             # we add what we removed
             return list(map(lambda sSug: sPfx + sSug + sSfx, aSugg))
         return aSugg
 
@@ -270,40 +309,30 @@
 
     @timethis
     def suggest2 (self, sWord, nMaxSugg=10):
         "returns a set of suggestions for <sWord>"
         sPfx, sWord, sSfx = cp.cut(sWord)
-        nMaxDist = (len(sWord) // 3) + 1
-        sCleanWord = cp.cleanWord(sWord)
-        aSugg = self._suggest2(sWord, sCleanWord, nMaxDist)
-        if sWord.istitle():
-            aSugg.update(self._suggest2(sWord.lower(), sCleanWord, nMaxDist))
-            aSugg = set(map(lambda sSugg: sSugg.title(), aSugg))
-        elif sWord.islower():
-            aSugg.update(self._suggest2(sWord.title(), sCleanWord, nMaxDist))
-        aSugg = cp.filterSugg(aSugg)
-        aSugg = sorted(aSugg, key=lambda sSugg: cp.distanceSift4(sCleanWord, cp.cleanWord(sSugg)))[:nMaxSugg]
+        oSuggResult = SuggResult(sWord)
+        self._suggest2(oSuggResult)
+        aSugg = oSuggResult.getSuggestions()
         if sSfx or sPfx:
             # we add what we removed
             return list(map(lambda sSug: sPfx + sSug + sSfx, aSugg))
         return aSugg
 
-    def _suggest2 (self, sWord, sCleanWord, nMaxDist, nDeep=0, iAddr=0, sNewWord=""):
+    def _suggest2 (self, oSuggResult, nDeep=0, iAddr=0, sNewWord=""):
         #logging.info((nDeep * "  ") + sNewWord)
-        if nDeep >= nMaxDist:
+        if nDeep >= oSuggResult.nDistLimit:
             sCleanNewWord = cp.cleanWord(sNewWord)
-            if cp.distanceSift4(sCleanWord[:len(sCleanNewWord)], sCleanNewWord) > nMaxDist:
-                return set()
-        aSugg = set()
+            if st.distanceSift4(oSuggResult.sCleanWord[:len(sCleanNewWord)], sCleanNewWord) > oSuggResult.nDistLimit:
+                return
         if int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask:
             #logging.info((nDeep * "  ") + "__" + sNewWord + "__")
-            sCleanNewWord = cp.cleanWord(sNewWord)
-            if cp.distanceSift4(sCleanWord, sCleanNewWord) <= nMaxDist:
-                aSugg.add(sNewWord)
+            oSuggResult.addSugg(sNewWord, nDeep)
         for cChar, jAddr in self._getCharArcs(iAddr):
-            aSugg.update(self._suggest2(sWord, sCleanWord, nMaxDist, nDeep+1, jAddr, sNewWord+cChar))
-        return aSugg
+            self._suggest2(oSuggResult, nDeep+1, jAddr, sNewWord+cChar)
+        return
 
     def _getCharArcs (self, iAddr):
         "generator: yield all chars and addresses from node at address <iAddr>"
         for nVal, jAddr in self._getArcs(iAddr):
             if nVal < self.nChar:
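
Note: suggest2() no longer merges sets of candidates; _suggest2() walks the DAWG and hands every final node to SuggResult.addSugg(), which buckets candidates by Sift4 distance from the cleaned word and tightens nDistLimit as closer matches arrive, so deeper branches can be pruned. A standalone sketch of that bucketing, with made-up distances in place of st.distanceSift4():

    # Candidates grouped by distance; best buckets are emitted first.
    dSugg = { 0: [], 1: [], 2: [] }
    nDistLimit = 3

    for sCandidate, nDist in [("bonjour", 1), ("bonsoir", 3), ("banjo", 5)]:
        if nDist <= nDistLimit:
            dSugg.setdefault(nDist, []).append(sCandidate)
            nDistLimit = min(nDistLimit, nDist + 2)   # same tightening as addSugg()

    lRes = []
    for nDist in sorted(dSugg):
        lRes.extend(dSugg[nDist])
    print(lRes)   # ['bonjour', 'bonsoir']
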
Index: gc_core/py/str_transform.py
==================================================================
--- gc_core/py/str_transform.py
+++ gc_core/py/str_transform.py
@@ -1,7 +1,120 @@
 #!python3
+
+
+#### DISTANCE CALCULATIONS
+
+def longestCommonSubstring (s1, s2):
+    # http://en.wikipedia.org/wiki/Longest_common_substring_problem
+    # http://en.wikibooks.org/wiki/Algorithm_implementation/Strings/Longest_common_substring
+    M = [ [0]*(1+len(s2)) for i in range(1+len(s1)) ]
+    longest, x_longest = 0, 0
+    for x in range(1, 1+len(s1)):
+        for y in range(1, 1+len(s2)):
+            if s1[x-1] == s2[y-1]:
+                M[x][y] = M[x-1][y-1] + 1
+                if M[x][y] > longest:
+                    longest = M[x][y]
+                    x_longest = x
+            else:
+                M[x][y] = 0
+    return s1[x_longest-longest : x_longest]
+
+
+def distanceDamerauLevenshtein (s1, s2):
+    "distance of Damerau-Levenshtein between <s1> and <s2>"
+    # https://fr.wikipedia.org/wiki/Distance_de_Damerau-Levenshtein
+    d = {}
+    nLen1 = len(s1)
+    nLen2 = len(s2)
+    for i in range(-1, nLen1+1):
+        d[i, -1] = i + 1
+    for j in range(-1, nLen2+1):
+        d[-1, j] = j + 1
+    for i in range(nLen1):
+        for j in range(nLen2):
+            nCost = 0 if s1[i] == s2[j] else 1
+            d[i, j] = min(
+                d[i-1, j] + 1,        # Deletion
+                d[i, j-1] + 1,        # Insertion
+                d[i-1, j-1] + nCost,  # Substitution
+            )
+            if i and j and s1[i] == s2[j-1] and s1[i-1] == s2[j]:
+                d[i, j] = min(d[i, j], d[i-2, j-2] + nCost)   # Transposition
+    return d[nLen1-1, nLen2-1]
+
+
+def distanceSift4 (s1, s2, nMaxOffset=5):
+    "implementation of general Sift4."
+    # https://siderite.blogspot.com/2014/11/super-fast-and-accurate-string-distance.html
+    if not s1:
+        return len(s2)
+    if not s2:
+        return len(s1)
+    nLen1, nLen2 = len(s1), len(s2)
+    i1, i2 = 0, 0     # Cursors for each string
+    nLargestCS = 0    # Largest common substring
+    nLocalCS = 0      # Local common substring
+    nTrans = 0        # Number of transpositions ('ab' vs 'ba')
+    lOffset = []      # Offset pair array, for computing the transpositions
+
+    while i1 < nLen1 and i2 < nLen2:
+        if s1[i1] == s2[i2]:
+            nLocalCS += 1
+            # Check if current match is a transposition
+            bTrans = False
+            i = 0
+            while i < len(lOffset):
+                t = lOffset[i]
+                if i1 <= t[0] or i2 <= t[1]:
+                    bTrans = abs(i2-i1) >= abs(t[1] - t[0])
+                    if bTrans:
+                        nTrans += 1
+                    elif not t[2]:
+                        t[2] = True
+                        nTrans += 1
+                    break
+                elif i1 > t[1] and i2 > t[0]:
+                    del lOffset[i]
+                else:
+                    i += 1
+            lOffset.append([i1, i2, bTrans])
+        else:
+            nLargestCS += nLocalCS
+            nLocalCS = 0
+            if i1 != i2:
+                i1 = i2 = min(i1, i2)
+            for i in range(nMaxOffset):
+                if i1 + i >= nLen1 and i2 + i >= nLen2:
+                    break
+                elif i1 + i < nLen1 and s1[i1+i] == s2[i2]:
+                    i1 += i - 1
+                    i2 -= 1
+                    break
+                elif i2 + i < nLen2 and s1[i1] == s2[i2+i]:
+                    i2 += i - 1
+                    i1 -= 1
+                    break
+        i1 += 1
+        i2 += 1
+        if i1 >= nLen1 or i2 >= nLen2:
+            nLargestCS += nLocalCS
+            nLocalCS = 0
+            i1 = i2 = min(i1, i2)
+    nLargestCS += nLocalCS
+    return round(max(nLen1, nLen2) - nLargestCS + nTrans)
+
+
+def showDistance (s1, s2):
+    print(f"Damerau-Levenshtein: {s1} / {s2} = {distanceDamerauLevenshtein(s1, s2)}")
+    print(f"Sift4: {s1} / {s2} = {distanceSift4(s1, s2)}")
+
+
+
+
+#### STEMMING OPERATIONS
+
 ## No stemming
 
 def noStemming (sFlex, sStem):
     return sStem
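
Note: the three distance helpers now live side by side in this module. A quick check on a transposition typo, assuming the module is importable directly as str_transform:

    import str_transform as st

    print(st.distanceDamerauLevenshtein("bonjuor", "bonjour"))   # 1 (one transposition)
    print(st.distanceSift4("bonjuor", "bonjour"))                # 1
    print(st.longestCommonSubstring("bonjuor", "bonjour"))       # "bonj"
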
@@ -38,18 +151,20 @@
     for i in range(min(len(sFlex), len(sStem))):
         if sFlex[i] != sStem[i]:
             break
         jSfx += 1
     return chr(len(sFlex)-jSfx+48) + sStem[jSfx:]
 
+
 def changeWordWithSuffixCode (sWord, sSfxCode):
     if sSfxCode == "0":
         return sWord
     return sWord[:-(ord(sSfxCode[0])-48)] + sSfxCode[1:] if sSfxCode[0] != '0' else sWord + sSfxCode[1:]
 
 
 # Prefix and suffix
 
+
 def defineAffixCode (sFlex, sStem):
     """ Returns a string defining how to get stem from flexion. Examples:
             "0" if stem = flexion
             "stem" if no common substring
             "n(pfx)/m(sfx)"
@@ -74,25 +189,10 @@
         sAff = "{}/".format(chr(n+48)) if not sPfx else "{}{}/".format(chr(n+48), sPfx)
         sAff += chr(m+48) if not sSfx else "{}{}".format(chr(m+48), sSfx)
         return sAff
     return sStem
 
-def longestCommonSubstring (s1, s2):
-    # http://en.wikipedia.org/wiki/Longest_common_substring_problem
-    # http://en.wikibooks.org/wiki/Algorithm_implementation/Strings/Longest_common_substring
-    M = [ [0]*(1+len(s2)) for i in range(1+len(s1)) ]
-    longest, x_longest = 0, 0
-    for x in range(1, 1+len(s1)):
-        for y in range(1, 1+len(s2)):
-            if s1[x-1] == s2[y-1]:
-                M[x][y] = M[x-1][y-1] + 1
-                if M[x][y] > longest:
-                    longest = M[x][y]
-                    x_longest = x
-            else:
-                M[x][y] = 0
-    return s1[x_longest-longest : x_longest]
 
 def changeWordWithAffixCode (sWord, sAffCode):
     if sAffCode == "0":
         return sWord
     if '/' not in sAffCode:
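
Note: suffix codes store the number of trailing characters to strip as chr(n+48) (an ASCII digit for n < 10), followed by the string to append; for example, the flexion "chevaux" maps to the stem "cheval" with the code "2l". Assuming the module is importable directly as str_transform:

    import str_transform as st

    # "2l": strip ord("2")-48 = 2 trailing chars, then append "l"
    print(st.changeWordWithSuffixCode("chevaux", "2l"))   # -> "cheval"
    print(st.changeWordWithSuffixCode("cheval", "0"))     # -> "cheval" (unchanged)
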