Grammalecte  Check-in [1329ae8f1c]

Overview
Comment:[core] ibdawg: clean words before damerau-levenshtein comparison
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | trunk | core
Files: files | file ages | folders
SHA3-256: 1329ae8f1c84636f6974834cc2bb7383def1d59e208f6573baeea10795b5faa6
User & Date: olr on 2017-10-25 11:37:33
Other Links: manifest | tags
Context
2017-10-25
14:30
[core][fr] ibdawg: char_player > phonème o check-in: 0ad1970e9c user: olr tags: trunk, fr, core
11:37
[core] ibdawg: clean words before damerau-levenshtein comparison check-in: 1329ae8f1c user: olr tags: trunk, core
09:41
[core] ibdawg: suggestion mechanism > split word function check-in: 388e8809cf user: olr tags: trunk, core
Changes

Modified gc_core/js/char_player.js from [c0ed55106f] to [9c8e1eeca8].

1
2
3
4
5
6
7




















8
9
10
11
12
13
14
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34







+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+







// list of similar chars
// useful for suggestion mechanism

${map}


var char_player = {

    _dTransChars: new Map([
        ['à', 'a'],  ['é', 'e'],  ['î', 'i'],  ['ô', 'o'],  ['û', 'u'],  ['ÿ', 'y'],
        ['â', 'a'],  ['è', 'e'],  ['ï', 'i'],  ['ö', 'o'],  ['ù', 'u'],  ['ŷ', 'y'],
        ['ä', 'a'],  ['ê', 'e'],  ['í', 'i'],  ['ó', 'o'],  ['ü', 'u'],  ['ý', 'y'],
        ['á', 'a'],  ['ë', 'e'],  ['ì', 'i'],  ['ò', 'o'],  ['ú', 'u'],  ['ỳ', 'y'],
        ['ā', 'a'],  ['ē', 'e'],  ['ī', 'i'],  ['ō', 'o'],  ['ū', 'u'],  ['ȳ', 'y'],
        ['ñ', 'n'],
        ['œ', 'oe'], ['æ', 'ae'], 
    ]),

    cleanWord: function (sWord) {
        // word simplication before calculating distance between words
        sWord = sWord.toLowerCase();
        let sRes = "";
        for (let c of sWord) {
            sRes += this._dTransChars.gl_get(c, c);
        }
        return sWord;
    },

    distanceDamerauLevenshtein: function (s1, s2) {
        // distance of Damerau-Levenshtein between <s1> and <s2>
        // https://fr.wikipedia.org/wiki/Distance_de_Damerau-Levenshtein
        try {
            let nLen1 = s1.length;
            let nLen2 = s2.length;
52
53
54
55
56
57
58
59

60
61
62
63
64
65
66
72
73
74
75
76
77
78

79
80
81
82
83
84
85
86







-
+







        'â', 'è', 'ï', 'ö', 'ù', 'ŷ',
        'ä', 'ê', 'í', 'ó', 'ü', 'ý',
        'á', 'ë', 'ì', 'ò', 'ú', 'ỳ',
        'ā', 'ē', 'ī', 'ō', 'ū', 'ȳ',
        'h', 'œ', 'æ'
    ]),

    clearWord: function (sWord) {
    shrinkWord: function (sWord) {
        // remove vovels and h
        let sRes = "";
        for (let cChar of sWord.slice(1)) {
            if (!this.aVovels.has(cChar)) {
                sRes += cChar;
            }
        }

Modified gc_core/js/ibdawg.js from [c871817c8f] to [82209aec2c].

199
200
201
202
203
204
205
206

207
208
209
210
211
212
213
214

215

216
217
218
219
220
221
222
199
200
201
202
203
204
205

206
207
208
209
210
211
212
213
214
215

216
217
218
219
220
221
222
223







-
+








+
-
+







        if (sWord.gl_isTitle()) {
            aSugg.gl_update(this._suggest(sWord.toLowerCase(), nMaxDel, nMaxHardRepl));
        }
        else if (sWord.gl_isLowerCase()) {
            aSugg.gl_update(this._suggest(sWord.gl_toCapitalize(), nMaxDel, nMaxHardRepl));
        }
        if (aSugg.size == 0) {
            aSugg.gl_update(this._suggestWithCrushedUselessChars(char_player.clearWord(sWord)));
            aSugg.gl_update(this._suggestWithCrushedUselessChars(char_player.shrinkWord(sWord)));
        }
        // Set to Array
        aSugg = Array.from(aSugg);
        aSugg = aSugg.filter((sSugg) => { return !sSugg.endsWith("è") && !sSugg.endsWith("È"); }); // fr language 
        if (sWord.gl_isTitle()) {
            aSugg = aSugg.map((sSugg) => { return sSugg.gl_toCapitalize(); });
        }
        let dDistTemp = new Map();
        let sCleanWord = char_player.cleanWord(sWord)
        aSugg.forEach((sSugg) => { dDistTemp.set(sSugg, char_player.distanceDamerauLevenshtein(sWord, sSugg)); });
        aSugg.forEach((sSugg) => { dDistTemp.set(sSugg, char_player.distanceDamerauLevenshtein(sCleanWord, char_player.cleanWord(sSugg))); });
        aSugg = aSugg.sort((sA, sB) => { return dDistTemp.get(sA) - dDistTemp.get(sB); }).slice(0, nMaxSugg);
        dDistTemp.clear();
        if (sSfx || sPfx) {
            // we add what we removed
            return aSugg.map( (sSugg) => { return sPfx + sSugg + sSfx } );
        }
        return aSugg;

Modified gc_core/py/char_player.py from [2ac4c0eb20] to [83d2f45b7c].

1
2
3
4
5















6
7
8
9
10
11
12
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27





+
+
+
+
+
+
+
+
+
+
+
+
+
+
+







# list of similar chars
# useful for suggestion mechanism

import re


_xTransChars = str.maketrans({
    'à': 'a',  'é': 'e',  'î': 'i',  'ô': 'o',  'û': 'u',  'ÿ': 'y',
    'â': 'a',  'è': 'e',  'ï': 'i',  'ö': 'o',  'ù': 'u',  'ŷ': 'y',
    'ä': 'a',  'ê': 'e',  'í': 'i',  'ó': 'o',  'ü': 'u',  'ý': 'y',
    'á': 'a',  'ë': 'e',  'ì': 'i',  'ò': 'o',  'ú': 'u',  'ỳ': 'y',
    'ā': 'a',  'ē': 'e',  'ī': 'i',  'ō': 'o',  'ū': 'u',  'ȳ': 'y',
    'ñ': 'n',
    'œ': 'oe',  'æ': 'ae', 
})

def cleanWord (sWord):
    "word simplication before calculating distance between words"
    return sWord.lower().translate(_xTransChars)


def distanceDamerauLevenshtein (s1, s2):
    "distance of Damerau-Levenshtein between <s1> and <s2>"
    # https://fr.wikipedia.org/wiki/Distance_de_Damerau-Levenshtein
    d = {}
    nLen1 = len(s1)
    nLen2 = len(s2)
41
42
43
44
45
46
47
48

49
50
51
52
53
54
55
56
57
58
59
60
61
62

63
64
65
66
67
68
69
70







-
+








_xTransVovels = str.maketrans(_dVovels)


aVovels = frozenset(_dVovels.keys())


def clearWord (sWord):
def shrinkWord (sWord):
    "remove vovels and h"
    return sWord[0:1].replace("h", "") + sWord[1:].translate(_xTransVovels)


# Similar chars

d1to1 = {

Modified gc_core/py/ibdawg.py from [e132c3a736] to [f563ae7bdb].

196
197
198
199
200
201
202
203

204

205

206
207
208
209
210
211
212
196
197
198
199
200
201
202

203
204
205

206
207
208
209
210
211
212
213







-
+

+
-
+







        if sWord.istitle():
            aSugg.update(self._suggest(sWord.lower(), nMaxDel=nMaxDel, nMaxHardRepl=nMaxHardRepl))
            aSugg = set(map(lambda sSugg: sSugg.title(), aSugg))
        elif sWord.islower():
            aSugg.update(self._suggest(sWord.title(), nMaxDel=nMaxDel, nMaxHardRepl=nMaxHardRepl))
        if not aSugg:
            #print("crush useless chars")
            aSugg.update(self._suggestWithCrushedUselessChars(cp.clearWord(sWord)))
            aSugg.update(self._suggestWithCrushedUselessChars(cp.shrinkWord(sWord)))
        aSugg = cp.filterSugg(aSugg)
        sCleanWord = cp.cleanWord(sWord)
        aSugg = sorted(aSugg, key=lambda sSugg: cp.distanceDamerauLevenshtein(sWord, sSugg))[:nMaxSugg]
        aSugg = sorted(aSugg, key=lambda sSugg: cp.distanceDamerauLevenshtein(sCleanWord, cp.cleanWord(sSugg)))[:nMaxSugg]
        if sSfx or sPfx:
            # we add what we removed
            return list(map(lambda sSug: sPfx + sSug + sSfx, aSugg))
        return aSugg

    def _suggest (self, sRemain, nMaxDel=0, nMaxHardRepl=0, nDeep=0, iAddr=0, sNewWord="", bAvoidLoop=False):
        "returns a set of suggestions"