Grammalecte  Check-in [debfd84fcd]

Overview
Comment:[graphspell] str_transform: màj
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | trunk | graphspell
Files: files | file ages | folders
SHA3-256: debfd84fcda654ca1923968b6521e27d8eec8861ba3099e1e50180a63a22895a
User & Date: olr on 2024-06-11 09:11:17
Other Links: manifest | tags
Context
2024-06-11
17:17
[fr] faux positifs check-in: 4d40f61f04 user: olr tags: trunk, fr
09:11
[graphspell] str_transform: màj check-in: debfd84fcd user: olr tags: trunk, graphspell
08:59
[graphspell] new ad hoc suggestions check-in: 3148bfaece user: olr tags: trunk, graphspell
Changes

Modified graphspell-js/str_transform.js from [2de0e81f6b] to [3da9c81f36].

248
249
250
251
252
253
254



















































255
256
257






258
259
260
261
262
263
264
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305



306
307
308
309
310
311
312
313
314
315
316
317
318







+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
-
-
-
+
+
+
+
+
+







            weight += (1 - weight) * ((Num_com - i - 1) / (a_len * b_len - i*2 + 2));
          }
        }

        return weight;
    },

    distanceSift4: function (s1, s2, maxOffset=5) {
        // Sift4 string distance, simplest version.
        // Reference: https://siderite.dev/blog/super-fast-and-accurate-string-distance.html
        //
        // Walks both strings with one cursor each, adds up the lengths of the runs of
        // matching characters it finds, and returns the longer length minus that total.
        // Every pass of the main loop advances both cursors and scans at most
        // <maxOffset> positions ahead.
        //
        // Parameters:
        //   s1, s2     values to compare; expected to be strings (charAt and length are used).
        //              A falsy or zero-length value is handled as an empty string.
        //   maxOffset  maximum number of positions scanned ahead to find a matching
        //              character after a mismatch (default: 5)
        // Returns:
        //   Math.max(s1.length, s2.length) minus the accumulated length of the matched runs.

        // s1 is empty: the distance is the length of s2
        // (0 when s2 is falsy, which includes the empty string)
        if (!s1 || !s1.length) {
            if (!s2) {
                return 0;
            }
            return s2.length;
        }

        // s2 is empty and s1 is not: the distance is the length of s1
        if (!s2 || !s2.length) {
            return s1.length;
        }

        var l1 = s1.length;
        var l2 = s2.length;

        var c1 = 0; // cursor in s1
        var c2 = 0; // cursor in s2
        var lcss = 0; // accumulated length of all matched runs (named "largest common subsequence" in the reference)
        var local_cs = 0; // length of the run of matches currently being extended
        while ((c1 < l1) && (c2 < l2)) {
            if (s1.charAt(c1) == s2.charAt(c2)) {
                // same character under both cursors: extend the current run
                local_cs++;
            } else {
                // mismatch: close the current run and add it to the total
                lcss += local_cs;
                local_cs = 0;
                if (c1 != c2) {
                    // realign both cursors on the larger one;
                    // per the reference, using max avoids having to compute transpositions ('ab' vs 'ba')
                    c1 = c2 = Math.max(c1, c2);
                }
                // look ahead, at most <maxOffset> positions and while at least one string
                // still has characters at that offset, for a character that matches the
                // other string's current character; s1 is tested before s2 at each offset
                for (var i = 0; i < maxOffset && (c1 + i < l1 || c2 + i < l2); i++) {
                    if ((c1 + i < l1) && (s1.charAt(c1 + i) == s2.charAt(c2))) {
                        // match found further in s1: move its cursor there and start a new run
                        c1 += i;
                        local_cs++;
                        break;
                    }
                    if ((c2 + i < l2) && (s1.charAt(c1) == s2.charAt(c2 + i))) {
                        // match found further in s2: move its cursor there and start a new run
                        c2 += i;
                        local_cs++;
                        break;
                    }
                }
            }
            // both cursors always move forward by one, whether or not a match was found
            c1++;
            c2++;
        }
        // add the run that was still open when a cursor reached the end of its string
        lcss += local_cs;
        return Math.round(Math.max(l1, l2) - lcss);
    },

    showDistance (s1, s2) {
        console.log(`Distance Jaro-Winkler: ${s1} / ${s2} = ${this.distanceJaroWinkler(s1, s2)})`);
        console.log(`Distance Damerau-Levenshtein: ${s1} / ${s2} = ${this.distanceDamerauLevenshtein(s1, s2)})`);
    showDistance: function (s1, s2) {
        // Debugging helper: logs <s1> and <s2> on one line, then on a second line the
        // values returned by distanceDamerauLevenshtein (DL), distanceSift4 (S4) and
        // distanceJaroWinkler (JW) for this pair. Writes to the console only; no return value.
        // Called through <this>, so it must be invoked as a method of the object
        // that holds these three distance functions.
        console.log(`${s1}  ${s2}`);
        let nDL = this.distanceDamerauLevenshtein(s1, s2);
        let nS4 = this.distanceSift4(s1, s2);
        let fJW = this.distanceJaroWinkler(s1, s2);
        console.log(`DL: ${nDL} — S4: ${nS4} — JW: ${fJW}`);
    },

    // Suffix only
    defineSuffixCode: function (sFlex, sStem) {
        /*
            Returns a string defining how to get stem from flexion
                "n(sfx)"

Modified graphspell/str_transform.py from [4b10b3e705] to [928aa819c5].

190
191
192
193
194
195
196

197

198
199
200
201
202
203
204
190
191
192
193
194
195
196
197

198
199
200
201
202
203
204
205







+
-
+







        if nMinLen > 4  and  nCommon > i + 1  and  2 * nCommon >= nMinLen + i:
            fWeight += (1 - fWeight) * ((nCommon - i - 1) / (nLen1 * nLen2 - i*2 + 2))
    return fWeight


def distanceSift4 (s1, s2, nMaxOffset=5):
    "implementation of general Sift4."
    # faster than Damerau-Levenshtein and Jaro-Winkler
    # https://siderite.blogspot.com/2014/11/super-fast-and-accurate-string-distance.html
    # https://siderite.dev/blog/super-fast-and-accurate-string-distance.html
    if not s1:
        return len(s2)
    if not s2:
        return len(s1)
    nLen1, nLen2 = len(s1), len(s2)
    i1, i2 = 0, 0   # Cursors for each string
    nLargestCS = 0  # Largest common substring
251
252
253
254
255
256
257
258
259
260
261




262
263
264
265
266
267
268
252
253
254
255
256
257
258




259
260
261
262
263
264
265
266
267
268
269







-
-
-
-
+
+
+
+







            i1 = i2 = min(i1, i2)
    nLargestCS += nLocalCS
    return round(max(nLen1, nLen2) - nLargestCS + nTrans)


def showDistance (s1, s2):
    "display Damerau-Levenshtein, Sift4 and Jaro-Winkler distances between <s1> and <s2>"
    # NOTE(review): the three print() calls below look like the removed ("-") side of the
    # diff this listing comes from -- confirm against the checked-in file.
    # Each one concatenates str with the return value of a distance function; if those
    # values are numbers (the Sift4 code above ends with `return round(...)`), the
    # concatenation raises TypeError.
    print("Jaro-Winkler: " + s1 + "/" + s2 + " = " + distanceJaroWinkler(s1, s2))
    print("Damerau-Levenshtein: " + s1 + "/" + s2 + " = " + distanceDamerauLevenshtein(s1, s2))
    print("Sift4:" + s1 + "/" + s2 + " = " + distanceSift4(s1, s2))

    # Compute the three distances once, then print them on a single tab-separated line.
    # print() receives the values as separate arguments, so no str conversion is needed.
    nDL = distanceDamerauLevenshtein(s1, s2)
    nS4 = distanceSift4(s1, s2)
    fJW = distanceJaroWinkler(s1, s2)
    print(s1, "≠", s2, "\tDL:", nDL, "\tS4:", nS4, "\tJW:", fJW)



#### STEMMING OPERATIONS

## No stemming