Index: graphspell-js/str_transform.js ================================================================== --- graphspell-js/str_transform.js +++ graphspell-js/str_transform.js @@ -250,13 +250,67 @@ } return weight; }, - showDistance (s1, s2) { - console.log(`Distance Jaro-Winkler: ${s1} / ${s2} = ${this.distanceJaroWinkler(s1, s2)})`); - console.log(`Distance Damerau-Levenshtein: ${s1} / ${s2} = ${this.distanceDamerauLevenshtein(s1, s2)})`); + distanceSift4: function (s1, s2, maxOffset=5) { + // Sift4 - simplest version : https://siderite.dev/blog/super-fast-and-accurate-string-distance.html + // online algorithm to compute the distance between two strings in O(n) + // maxOffset is the number of characters to search for matching letters + if (!s1 || !s1.length) { + if (!s2) { + return 0; + } + return s2.length; + } + + if (!s2 || !s2.length) { + return s1.length; + } + + var l1 = s1.length; + var l2 = s2.length; + + var c1 = 0; //cursor for string 1 + var c2 = 0; //cursor for string 2 + var lcss = 0; //largest common subsequence + var local_cs = 0; //local common substring + while ((c1 < l1) && (c2 < l2)) { + if (s1.charAt(c1) == s2.charAt(c2)) { + local_cs++; + } else { + lcss += local_cs; + local_cs = 0; + if (c1 != c2) { + c1 = c2 = Math.max(c1, c2); //using max to bypass the need for computer transpositions ('ab' vs 'ba') + } + for (var i = 0; i < maxOffset && (c1 + i < l1 || c2 + i < l2); i++) { + if ((c1 + i < l1) && (s1.charAt(c1 + i) == s2.charAt(c2))) { + c1 += i; + local_cs++; + break; + } + if ((c2 + i < l2) && (s1.charAt(c1) == s2.charAt(c2 + i))) { + c2 += i; + local_cs++; + break; + } + } + } + c1++; + c2++; + } + lcss += local_cs; + return Math.round(Math.max(l1, l2) - lcss); + }, + + showDistance: function (s1, s2) { + console.log(`${s1} ≠ ${s2}`); + let nDL = this.distanceDamerauLevenshtein(s1, s2); + let nS4 = this.distanceSift4(s1, s2); + let fJW = this.distanceJaroWinkler(s1, s2); + console.log(`DL: ${nDL} — S4: ${nS4} — JW: ${fJW}`); }, // Suffix only defineSuffixCode: function (sFlex, sStem) { /* Index: graphspell/str_transform.py ================================================================== --- graphspell/str_transform.py +++ graphspell/str_transform.py @@ -192,11 +192,12 @@ return fWeight def distanceSift4 (s1, s2, nMaxOffset=5): "implementation of general Sift4." - # https://siderite.blogspot.com/2014/11/super-fast-and-accurate-string-distance.html + # faster than Damerau-Levenshtein and Jaro-Winkler + # https://siderite.dev/blog/super-fast-and-accurate-string-distance.html if not s1: return len(s2) if not s2: return len(s1) nLen1, nLen2 = len(s1), len(s2) @@ -253,14 +254,14 @@ return round(max(nLen1, nLen2) - nLargestCS + nTrans) def showDistance (s1, s2): "display Damerau-Levenshtein distance and Sift4 distance between and " - print("Jaro-Winkler: " + s1 + "/" + s2 + " = " + distanceJaroWinkler(s1, s2)) - print("Damerau-Levenshtein: " + s1 + "/" + s2 + " = " + distanceDamerauLevenshtein(s1, s2)) - print("Sift4:" + s1 + "/" + s2 + " = " + distanceSift4(s1, s2)) - + nDL = distanceDamerauLevenshtein(s1, s2) + nS4 = distanceSift4(s1, s2) + fJW = distanceJaroWinkler(s1, s2) + print(s1, "≠", s2, "\tDL:", nDL, "\tS4:", nS4, "\tJW:", fJW) #### STEMMING OPERATIONS