190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
|
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
|
+
-
+
|
if nMinLen > 4 and nCommon > i + 1 and 2 * nCommon >= nMinLen + i:
fWeight += (1 - fWeight) * ((nCommon - i - 1) / (nLen1 * nLen2 - i*2 + 2))
return fWeight
def distanceSift4 (s1, s2, nMaxOffset=5):
"implementation of general Sift4."
# faster than Damerau-Levenshtein and Jaro-Winkler
# https://siderite.blogspot.com/2014/11/super-fast-and-accurate-string-distance.html
# https://siderite.dev/blog/super-fast-and-accurate-string-distance.html
if not s1:
return len(s2)
if not s2:
return len(s1)
nLen1, nLen2 = len(s1), len(s2)
i1, i2 = 0, 0 # Cursors for each string
nLargestCS = 0 # Largest common substring
|
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
|
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
|
-
-
-
-
+
+
+
+
|
i1 = i2 = min(i1, i2)
nLargestCS += nLocalCS
return round(max(nLen1, nLen2) - nLargestCS + nTrans)
def showDistance (s1, s2):
    """Display the Damerau-Levenshtein, Sift4 and Jaro-Winkler results for <s1> and <s2>.

    Prints a single tab-separated summary line to stdout and returns None.
    """
    # Each metric is computed exactly once. The values are passed to print()
    # as separate arguments rather than concatenated with "+", because the
    # distance functions return numbers and `str + number` raises TypeError.
    nDL = distanceDamerauLevenshtein(s1, s2)
    nS4 = distanceSift4(s1, s2)
    fJW = distanceJaroWinkler(s1, s2)
    print(s1, "≠", s2, "\tDL:", nDL, "\tS4:", nS4, "\tJW:", fJW)
#### STEMMING OPERATIONS
## No stemming
|