Grammalecte  Diff

Differences From Artifact [4b10b3e705]:

To Artifact [928aa819c5]:


190
191
192
193
194
195
196

197

198
199
200
201
202
203
204
190
191
192
193
194
195
196
197

198
199
200
201
202
203
204
205







+
-
+







        if nMinLen > 4  and  nCommon > i + 1  and  2 * nCommon >= nMinLen + i:
            fWeight += (1 - fWeight) * ((nCommon - i - 1) / (nLen1 * nLen2 - i*2 + 2))
    return fWeight


def distanceSift4 (s1, s2, nMaxOffset=5):
    "implementation of general Sift4."
    # faster than Damerau-Levenshtein and Jaro-Winkler
    # https://siderite.blogspot.com/2014/11/super-fast-and-accurate-string-distance.html
    # https://siderite.dev/blog/super-fast-and-accurate-string-distance.html
    if not s1:
        return len(s2)
    if not s2:
        return len(s1)
    nLen1, nLen2 = len(s1), len(s2)
    i1, i2 = 0, 0   # Cursors for each string
    nLargestCS = 0  # Largest common substring
251
252
253
254
255
256
257
258
259
260
261




262
263
264
265
266
267
268
252
253
254
255
256
257
258




259
260
261
262
263
264
265
266
267
268
269







-
-
-
-
+
+
+
+







            i1 = i2 = min(i1, i2)
    nLargestCS += nLocalCS
    return round(max(nLen1, nLen2) - nLargestCS + nTrans)


def showDistance (s1, s2):
    "display Damerau-Levenshtein, Sift4 and Jaro-Winkler distances between <s1> and <s2>"
    # Each distance is computed once and handed to print() as a separate
    # argument: distanceSift4 and distanceJaroWinkler return numbers, so
    # concatenating their results to a str with "+" raises TypeError.
    nDL = distanceDamerauLevenshtein(s1, s2)
    nS4 = distanceSift4(s1, s2)
    fJW = distanceJaroWinkler(s1, s2)
    # One line per pair: "<s1> ≠ <s2>", then the three tab-separated scores.
    print(s1, "≠", s2, "\tDL:", nDL, "\tS4:", nS4, "\tJW:", fJW)



#### STEMMING OPERATIONS

## No stemming