Grammalecte Check-in [90478790e5]

Overview
Comment: [graphspell] experiment: Damerau-Levenshtein distance modified by a function calculating the distance between characters
SHA3-256: 90478790e5f24f1080fba675fedafa45f731c5ec9b4ba52ce24981e5a268b0e8
User & Date: olr on 2020-05-03 09:12:35
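
The gist of the experiment: the Damerau-Levenshtein computation keeps its usual dynamic programming, but the flat substitution cost of 1 is replaced by a weight looked up per character pair (for instance k/q = 0.1, e/é = 0.5, and 1 for unrelated pairs). A minimal usage sketch in Python, assuming the graphspell package is importable and that the modified function in str_transform.py keeps the name distanceDamerauLevenshtein (its header lies outside the hunk shown below); the values follow from the tables and code in this check-in:

from graphspell.char_player import distanceBetweenChars
from graphspell.str_transform import distanceDamerauLevenshtein

# Substitution cost is now graded per character pair instead of a flat 1.
print(distanceBetweenChars("k", "q"))    # 0.1 -- listed as near-interchangeable
print(distanceBetweenChars("e", "é"))    # 0.5 -- accent confusion
print(distanceBetweenChars("b", "z"))    # 1   -- no entry, default cost

# A single accent error no longer counts as a full edit: the accumulated
# cost is 0.5, which int() truncates to 0 (the classic distance would be 1).
print(distanceDamerauLevenshtein("ecrire", "écrire"))    # 0
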
Context
2020-05-03
09:21  [graphspell] remove useless code  check-in: 2e960183fa  user: olr  tags: trunk, graphspell
09:12  [graphspell] experiment: Damerau-Levenshtein distance modified by a function calculating the distance between characters  check-in: 90478790e5  user: olr  tags: trunk, graphspell
06:32  [fr] adjustments  check-in: 7108ce2dc1  user: olr  tags: trunk, fr
Changes

Modified graphspell-js/char_player.js from [60a9fdaff6] to [a12e814d2d].

    spellingNormalization: function (sWord) {
        let sNewWord = "";
        for (let c of sWord) {
            sNewWord += this._xTransCharsForSpelling.gl_get(c, c);
        }
        return sNewWord.normalize("NFC");
    },

    oDistanceBetweenChars: {
        "a": {},
        "e": {"é": 0.5},
        "é": {"e": 0.5},
        "i": {"y": 0.2},
        "o": {},
        "u": {},
        "y": {"i": 0.3},
        "b": {"d": 0.8, "h": 0.9},
        "c": {"ç": 0.1, "k": 0.5, "q": 0.5, "s": 0.5, "x": 0.5, "z": 0.8},
        "d": {"b": 0.8},
        "f": {"v": 0.8},
        "g": {"j": 0.5},
        "h": {"b": 0.9},
        "j": {"g": 0.5, "i": 0.9},
        "k": {"c": 0.5, "q": 0.1, "x": 0.5},
        "l": {"i": 0.9},
        "m": {"n": 0.8},
        "n": {"m": 0.8, "r": 0.9},
        "p": {"q": 0.9},
        "q": {"c": 0.5, "k": 0.1, "p": 0.9},
        "r": {"n": 0.9, "j": 0.9},
        "s": {"c": 0.5, "ç": 0.1, "x": 0.5, "z": 0.5},
        "t": {"d": 0.9},
        "v": {"f": 0.8, "w": 0.1},
        "w": {"v": 0.1},
        "x": {"c": 0.5, "k": 0.5, "q": 0.5, "s": 0.5},
        "z": {"s": 0.5}
    },

    distanceBetweenChars: function (c1, c2) {
        if (c1 == c2) {
            return 0;
        }
        if (this.oDistanceBetweenChars.hasOwnProperty(c1) && this.oDistanceBetweenChars[c1].hasOwnProperty(c2)) {
            return this.oDistanceBetweenChars[c1][c2];
        }
        return 1;
    },

    _xTransCharsForSimplification: new Map([
-        ['à', 'a'],  ['é', 'é'],  ['î', 'i'],  ['ô', 'o'],  ['û', 'u'],  ['ÿ', 'i'],  ['y', 'i'],
-        ['â', 'a'],  ['è', 'é'],  ['ï', 'i'],  ['ö', 'o'],  ['ù', 'u'],  ['ŷ', 'i'],
-        ['ä', 'a'],  ['ê', 'é'],  ['í', 'i'],  ['ó', 'o'],  ['ü', 'u'],  ['ý', 'i'],
-        ['á', 'a'],  ['ë', 'é'],  ['ì', 'i'],  ['ò', 'o'],  ['ú', 'u'],  ['ỳ', 'i'],
-        ['ā', 'a'],  ['ē', 'é'],  ['ī', 'i'],  ['ō', 'o'],  ['ū', 'u'],  ['ȳ', 'i'],
+        ['à', 'a'],  ['é', 'é'],  ['î', 'i'],  ['ô', 'o'],  ['û', 'u'],  ['ÿ', 'y'],
+        ['â', 'a'],  ['è', 'é'],  ['ï', 'i'],  ['ö', 'o'],  ['ù', 'u'],  ['ŷ', 'y'],
+        ['ä', 'a'],  ['ê', 'é'],  ['í', 'i'],  ['ó', 'o'],  ['ü', 'u'],  ['ý', 'y'],
+        ['á', 'a'],  ['ë', 'é'],  ['ì', 'i'],  ['ò', 'o'],  ['ú', 'u'],  ['ỳ', 'y'],
+        ['ā', 'a'],  ['ē', 'é'],  ['ī', 'i'],  ['ō', 'o'],  ['ū', 'u'],  ['ȳ', 'y'],
        ['ç', 'c'],  ['ñ', 'n'],  ['k', 'q'],  ['w', 'v'],
        ['œ', 'oe'], ['æ', 'ae'],
        ['ſ', 's'],  ['ffi', 'ffi'],  ['ffl', 'ffl'],  ['ff', 'ff'],  ['ſt', 'ft'],  ['fi', 'fi'],  ['fl', 'fl'],  ['st', 'st'],
        ["⁰", "0"], ["¹", "1"], ["²", "2"], ["³", "3"], ["⁴", "4"], ["⁵", "5"], ["⁶", "6"], ["⁷", "7"], ["⁸", "8"], ["⁹", "9"],
        ["₀", "0"], ["₁", "1"], ["₂", "2"], ["₃", "3"], ["₄", "4"], ["₅", "5"], ["₆", "6"], ["₇", "7"], ["₈", "8"], ["₉", "9"]
    ]),

Modified graphspell-js/str_transform.js from [d8c3ab3e0d] to [b9cc7d5a41].

// STRING TRANSFORMATION

/* jshint esversion:6, -W097 */
/* jslint esversion:6 */
/* global exports, console */

"use strict";

if (typeof(process) !== 'undefined') {
    var char_player = require("./char_player.js");
} else if (typeof(require) !== 'undefined') {
    var char_player = require("resource://grammalecte/graphspell/char_player.js");
}


// Note: 48 is the ASCII code for "0"

var str_transform = {

    getNgrams: function (sWord, n=2) {
        let lNgrams = [];

[...]

                    table[i+1][j+1] = 0;
                }
            }
        }
        return longestCommonSubstring;
    },

-    distanceDamerauLevenshtein2: function (s1, s2) {
+    distanceDamerauLevenshtein: function (s1, s2) {
        // distance of Damerau-Levenshtein between <s1> and <s2>
        // https://fr.wikipedia.org/wiki/Distance_de_Damerau-Levenshtein
        try {
            let nLen1 = s1.length;
            let nLen2 = s2.length;
            let matrix = [];
-            for (let i = 0;  i <= nLen1;  i++) {
-                matrix[i] = new Array(nLen2 + 1);
+            for (let i = 0;  i <= nLen1+1;  i++) {
+                matrix[i] = new Array(nLen2 + 2);
            }
-            for (let i = 0;  i <= nLen1;  i++) {
+            for (let i = 0;  i <= nLen1+1;  i++) {
                matrix[i][0] = i;
            }
-            for (let j = 0;  j <= nLen2;  j++) {
+            for (let j = 0;  j <= nLen2+1;  j++) {
                matrix[0][j] = j;
            }
            for (let i = 1;  i <= nLen1;  i++) {
                for (let j = 1;  j <= nLen2;  j++) {
-                    let nCost = (s1[i] === s2[j]) ? 0 : 1;
+                    //let nCost = (s1[i-1] === s2[j-1]) ? 0 : 1;
+                    let nCost = char_player.distanceBetweenChars(s1[i-1], s2[j-1]);
                    matrix[i][j] = Math.min(
                        matrix[i-1][j] + 1,         // Deletion
                        matrix[i][j-1] + 1,         // Insertion
                        matrix[i-1][j-1] + nCost    // Substitution
                    );
                    if (i > 1 && j > 1 && s1[i] == s2[j-1] && s1[i-1] == s2[j]) {
                        matrix[i][j] = Math.min(matrix[i][j], matrix[i-2][j-2] + nCost);  // Transposition
                    }
                }
            }
-            return matrix[nLen1][nLen2];
+            return Math.floor(matrix[nLen1][nLen2]);
        }
        catch (e) {
            console.error(e);
        }
    },

-    distanceDamerauLevenshtein: function (s1, s2) {
+    distanceDamerauLevenshtein2: function (s1, s2) {
        // distance of Damerau-Levenshtein between <s1> and <s2>
        // https://fr.wikipedia.org/wiki/Distance_de_Damerau-Levenshtein
        try {
            let nLen1 = s1.length;
            let nLen2 = s2.length;
            let INF = nLen1 + nLen2;
            let matrix = [];

Modified graphspell/char_player.py from [d15991830e] to [72875ebb27].

    'ſ': 's',  'ffi': 'ffi',  'ffl': 'ffl',  'ff': 'ff',  'ſt': 'ft',  'fi': 'fi',  'fl': 'fl',  'st': 'st'
})

def spellingNormalization (sWord):
    "nomalization NFC and removing ligatures"
    return unicodedata.normalize("NFC", sWord.translate(_xTransCharsForSpelling))


dDistanceBetweenChars = {
    "a": {},
    "e": {"é": 0.5},
    "é": {"e": 0.5},
    "i": {"y": 0.2},
    "o": {},
    "u": {},
    "y": {"i": 0.3},
    "b": {"d": 0.8, "h": 0.9},
    "c": {"ç": 0.1, "k": 0.5, "q": 0.5, "s": 0.5, "x": 0.5, "z": 0.8},
    "d": {"b": 0.8},
    "f": {"v": 0.8},
    "g": {"j": 0.5},
    "h": {"b": 0.9},
    "j": {"g": 0.5, "i": 0.9},
    "k": {"c": 0.5, "q": 0.1, "x": 0.5},
    "l": {"i": 0.9},
    "m": {"n": 0.8},
    "n": {"m": 0.8, "r": 0.9},
    "p": {"q": 0.9},
    "q": {"c": 0.5, "k": 0.1, "p": 0.9},
    "r": {"n": 0.9, "j": 0.9},
    "s": {"c": 0.5, "ç": 0.1, "x": 0.5, "z": 0.5},
    "t": {"d": 0.9},
    "v": {"f": 0.8, "w": 0.1},
    "w": {"v": 0.1},
    "x": {"c": 0.5, "k": 0.5, "q": 0.5, "s": 0.5},
    "z": {"s": 0.5}
}


def distanceBetweenChars (c1, c2):
    if c1 == c2:
        return 0
    if c1 not in dDistanceBetweenChars:
        return 1
    return dDistanceBetweenChars[c1].get(c2, 1)


_xTransCharsForSimplification = str.maketrans({
-    'à': 'a',  'é': 'é',  'î': 'i',  'ô': 'o',  'û': 'u',  'ÿ': 'i',  "y": "i",
-    'â': 'a',  'è': 'é',  'ï': 'i',  'ö': 'o',  'ù': 'u',  'ŷ': 'i',
-    'ä': 'a',  'ê': 'é',  'í': 'i',  'ó': 'o',  'ü': 'u',  'ý': 'i',
-    'á': 'a',  'ë': 'é',  'ì': 'i',  'ò': 'o',  'ú': 'u',  'ỳ': 'i',
-    'ā': 'a',  'ē': 'é',  'ī': 'i',  'ō': 'o',  'ū': 'u',  'ȳ': 'i',
+    'à': 'a',  'é': 'é',  'î': 'i',  'ô': 'o',  'û': 'u',  'ÿ': 'y',
+    'â': 'a',  'è': 'é',  'ï': 'i',  'ö': 'o',  'ù': 'u',  'ŷ': 'y',
+    'ä': 'a',  'ê': 'é',  'í': 'i',  'ó': 'o',  'ü': 'u',  'ý': 'y',
+    'á': 'a',  'ë': 'é',  'ì': 'i',  'ò': 'o',  'ú': 'u',  'ỳ': 'y',
+    'ā': 'a',  'ē': 'é',  'ī': 'i',  'ō': 'o',  'ū': 'u',  'ȳ': 'y',
    'ç': 'c',  'ñ': 'n',  'k': 'q',  'w': 'v',
    'œ': 'oe',  'æ': 'ae',
    'ſ': 's',  'ffi': 'ffi',  'ffl': 'ffl',  'ff': 'ff',  'ſt': 'ft',  'fi': 'fi',  'fl': 'fl',  'st': 'st',
    "⁰": "0", "¹": "1", "²": "2", "³": "3", "⁴": "4", "⁵": "5", "⁶": "6", "⁷": "7", "⁸": "8", "⁹": "9",
    "₀": "0", "₁": "1", "₂": "2", "₃": "3", "₄": "4", "₅": "5", "₆": "6", "₇": "7", "₈": "8", "₉": "9"
})
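
A note on the table added above: it is consulted only one way round, so the weights need not be symmetric ("i" to "y" costs 0.2 while "y" to "i" costs 0.3), and any character without an entry, uppercase letters and digits included, falls back to the full cost of 1. A quick check, under the same assumption that the graphspell package is importable:

from graphspell.char_player import distanceBetweenChars

print(distanceBetweenChars("i", "y"))    # 0.2
print(distanceBetweenChars("y", "i"))    # 0.3 -- the table is not symmetric
print(distanceBetweenChars("A", "A"))    # 0   -- identical characters short-circuit
print(distanceBetweenChars("A", "B"))    # 1   -- no entry for "A", default cost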

Modified graphspell/str_transform.py from [7dcad03ac9] to [452d0bdcef].

"""
Operations on strings:
- calculate distance between two strings
- transform strings with transformation codes
"""

from .char_player import distanceBetweenChars


#### Ngrams

def getNgrams (sWord, n=2):
    "return a list of Ngrams strings"
    return [ sWord[i:i+n]  for i in range(len(sWord)-n+1) ]


[...]

    nLen2 = len(s2)
    for i in range(-1, nLen1+1):
        d[i, -1] = i + 1
    for j in range(-1, nLen2+1):
        d[-1, j] = j + 1
    for i in range(nLen1):
        for j in range(nLen2):
-            nCost = 0  if s1[i] == s2[j]  else 1
+            #nCost = 0  if s1[i] == s2[j]  else 1
+            nCost = distanceBetweenChars(s1[i], s2[j])
            d[i, j] = min(
                d[i-1, j]   + 1,        # Deletion
                d[i,   j-1] + 1,        # Insertion
                d[i-1, j-1] + nCost,    # Substitution
            )
            if i and j and s1[i] == s2[j-1] and s1[i-1] == s2[j]:
                d[i, j] = min(d[i, j], d[i-2, j-2] + nCost)     # Transposition
-    return d[nLen1-1, nLen2-1]
+    return int(d[nLen1-1, nLen2-1])


def distanceSift4 (s1, s2, nMaxOffset=5):
    "implementation of general Sift4."
    # https://siderite.blogspot.com/2014/11/super-fast-and-accurate-string-distance.html
    if not s1:
        return len(s2)