Overview
| Comment: | [core] ibdawg: clean words before damerau-levenshtein comparison | 
|---|---|
| Downloads: | Tarball | ZIP archive | SQL archive | 
| Timelines: | family | ancestors | descendants | both | trunk | core | 
| Files: | files | file ages | folders | 
| SHA3-256: | 1329ae8f1c84636f6974834cc2bb7383 | 
| User & Date: | olr on 2017-10-25 11:37:33 | 
| Other Links: | manifest | tags | 
Context
| 2017-10-25 | ||
| 14:30 | [core][fr] ibdawg: char_player > phonème o check-in: 0ad1970e9c user: olr tags: trunk, fr, core | |
| 11:37 | [core] ibdawg: clean words before damerau-levenshtein comparison check-in: 1329ae8f1c user: olr tags: trunk, core | |
| 09:41 | [core] ibdawg: suggestion mechanism > split word function check-in: 388e8809cf user: olr tags: trunk, core | |
Changes
Modified gc_core/js/char_player.js from [c0ed55106f] to [9c8e1eeca8].
| 1 2 3 4 5 6 7 8 9 10 11 12 13 14 | 
// list of similar chars
// useful for suggestion mechanism
${map}
var char_player = {
    distanceDamerauLevenshtein: function (s1, s2) {
        // distance of Damerau-Levenshtein between <s1> and <s2>
        // https://fr.wikipedia.org/wiki/Distance_de_Damerau-Levenshtein
        try {
            let nLen1 = s1.length;
            let nLen2 = s2.length;
 | > > > > > > > > > > > > > > > > > > > > | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 | 
// list of similar chars
// useful for suggestion mechanism
${map}
var char_player = {
    _dTransChars: new Map([
        ['à', 'a'],  ['é', 'e'],  ['î', 'i'],  ['ô', 'o'],  ['û', 'u'],  ['ÿ', 'y'],
        ['â', 'a'],  ['è', 'e'],  ['ï', 'i'],  ['ö', 'o'],  ['ù', 'u'],  ['ŷ', 'y'],
        ['ä', 'a'],  ['ê', 'e'],  ['í', 'i'],  ['ó', 'o'],  ['ü', 'u'],  ['ý', 'y'],
        ['á', 'a'],  ['ë', 'e'],  ['ì', 'i'],  ['ò', 'o'],  ['ú', 'u'],  ['ỳ', 'y'],
        ['ā', 'a'],  ['ē', 'e'],  ['ī', 'i'],  ['ō', 'o'],  ['ū', 'u'],  ['ȳ', 'y'],
        ['ñ', 'n'],
        ['œ', 'oe'], ['æ', 'ae'], 
    ]),
    cleanWord: function (sWord) {
        // word simplication before calculating distance between words
        sWord = sWord.toLowerCase();
        let sRes = "";
        for (let c of sWord) {
            sRes += this._dTransChars.gl_get(c, c);
        }
        return sWord;
    },
    distanceDamerauLevenshtein: function (s1, s2) {
        // distance of Damerau-Levenshtein between <s1> and <s2>
        // https://fr.wikipedia.org/wiki/Distance_de_Damerau-Levenshtein
        try {
            let nLen1 = s1.length;
            let nLen2 = s2.length;
 | 
| ︙ | ︙ | |||
| 52 53 54 55 56 57 58 | 
        'â', 'è', 'ï', 'ö', 'ù', 'ŷ',
        'ä', 'ê', 'í', 'ó', 'ü', 'ý',
        'á', 'ë', 'ì', 'ò', 'ú', 'ỳ',
        'ā', 'ē', 'ī', 'ō', 'ū', 'ȳ',
        'h', 'œ', 'æ'
    ]),
 | | | 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 | 
        'â', 'è', 'ï', 'ö', 'ù', 'ŷ',
        'ä', 'ê', 'í', 'ó', 'ü', 'ý',
        'á', 'ë', 'ì', 'ò', 'ú', 'ỳ',
        'ā', 'ē', 'ī', 'ō', 'ū', 'ȳ',
        'h', 'œ', 'æ'
    ]),
    shrinkWord: function (sWord) {
        // remove vovels and h
        let sRes = "";
        for (let cChar of sWord.slice(1)) {
            if (!this.aVovels.has(cChar)) {
                sRes += cChar;
            }
        }
 | 
| ︙ | ︙ | 
Modified gc_core/js/ibdawg.js from [c871817c8f] to [82209aec2c].
| ︙ | ︙ | |||
| 199 200 201 202 203 204 205 | 
        if (sWord.gl_isTitle()) {
            aSugg.gl_update(this._suggest(sWord.toLowerCase(), nMaxDel, nMaxHardRepl));
        }
        else if (sWord.gl_isLowerCase()) {
            aSugg.gl_update(this._suggest(sWord.gl_toCapitalize(), nMaxDel, nMaxHardRepl));
        }
        if (aSugg.size == 0) {
 | | > | | 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 | 
        if (sWord.gl_isTitle()) {
            aSugg.gl_update(this._suggest(sWord.toLowerCase(), nMaxDel, nMaxHardRepl));
        }
        else if (sWord.gl_isLowerCase()) {
            aSugg.gl_update(this._suggest(sWord.gl_toCapitalize(), nMaxDel, nMaxHardRepl));
        }
        if (aSugg.size == 0) {
            aSugg.gl_update(this._suggestWithCrushedUselessChars(char_player.shrinkWord(sWord)));
        }
        // Set to Array
        aSugg = Array.from(aSugg);
        aSugg = aSugg.filter((sSugg) => { return !sSugg.endsWith("è") && !sSugg.endsWith("È"); }); // fr language 
        if (sWord.gl_isTitle()) {
            aSugg = aSugg.map((sSugg) => { return sSugg.gl_toCapitalize(); });
        }
        let dDistTemp = new Map();
        let sCleanWord = char_player.cleanWord(sWord)
        aSugg.forEach((sSugg) => { dDistTemp.set(sSugg, char_player.distanceDamerauLevenshtein(sCleanWord, char_player.cleanWord(sSugg))); });
        aSugg = aSugg.sort((sA, sB) => { return dDistTemp.get(sA) - dDistTemp.get(sB); }).slice(0, nMaxSugg);
        dDistTemp.clear();
        if (sSfx || sPfx) {
            // we add what we removed
            return aSugg.map( (sSugg) => { return sPfx + sSugg + sSfx } );
        }
        return aSugg;
 | 
| ︙ | ︙ | 
Modified gc_core/py/char_player.py from [2ac4c0eb20] to [83d2f45b7c].
| 1 2 3 4 5 6 7 8 9 10 11 12 | 
# list of similar chars
# useful for suggestion mechanism
import re
def distanceDamerauLevenshtein (s1, s2):
    "distance of Damerau-Levenshtein between <s1> and <s2>"
    # https://fr.wikipedia.org/wiki/Distance_de_Damerau-Levenshtein
    d = {}
    nLen1 = len(s1)
    nLen2 = len(s2)
 | > > > > > > > > > > > > > > > | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 | 
# list of similar chars
# useful for suggestion mechanism
import re
_xTransChars = str.maketrans({
    'à': 'a',  'é': 'e',  'î': 'i',  'ô': 'o',  'û': 'u',  'ÿ': 'y',
    'â': 'a',  'è': 'e',  'ï': 'i',  'ö': 'o',  'ù': 'u',  'ŷ': 'y',
    'ä': 'a',  'ê': 'e',  'í': 'i',  'ó': 'o',  'ü': 'u',  'ý': 'y',
    'á': 'a',  'ë': 'e',  'ì': 'i',  'ò': 'o',  'ú': 'u',  'ỳ': 'y',
    'ā': 'a',  'ē': 'e',  'ī': 'i',  'ō': 'o',  'ū': 'u',  'ȳ': 'y',
    'ñ': 'n',
    'œ': 'oe',  'æ': 'ae', 
})
def cleanWord (sWord):
    "word simplication before calculating distance between words"
    return sWord.lower().translate(_xTransChars)
def distanceDamerauLevenshtein (s1, s2):
    "distance of Damerau-Levenshtein between <s1> and <s2>"
    # https://fr.wikipedia.org/wiki/Distance_de_Damerau-Levenshtein
    d = {}
    nLen1 = len(s1)
    nLen2 = len(s2)
 | 
| ︙ | ︙ | |||
| 41 42 43 44 45 46 47 | _xTransVovels = str.maketrans(_dVovels) aVovels = frozenset(_dVovels.keys()) | | | 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 | 
_xTransVovels = str.maketrans(_dVovels)
aVovels = frozenset(_dVovels.keys())
def shrinkWord (sWord):
    "remove vovels and h"
    return sWord[0:1].replace("h", "") + sWord[1:].translate(_xTransVovels)
# Similar chars
d1to1 = {
 | 
| ︙ | ︙ | 
Modified gc_core/py/ibdawg.py from [e132c3a736] to [f563ae7bdb].
| ︙ | ︙ | |||
| 196 197 198 199 200 201 202 | 
        if sWord.istitle():
            aSugg.update(self._suggest(sWord.lower(), nMaxDel=nMaxDel, nMaxHardRepl=nMaxHardRepl))
            aSugg = set(map(lambda sSugg: sSugg.title(), aSugg))
        elif sWord.islower():
            aSugg.update(self._suggest(sWord.title(), nMaxDel=nMaxDel, nMaxHardRepl=nMaxHardRepl))
        if not aSugg:
            #print("crush useless chars")
 | | > | | 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 | 
        if sWord.istitle():
            aSugg.update(self._suggest(sWord.lower(), nMaxDel=nMaxDel, nMaxHardRepl=nMaxHardRepl))
            aSugg = set(map(lambda sSugg: sSugg.title(), aSugg))
        elif sWord.islower():
            aSugg.update(self._suggest(sWord.title(), nMaxDel=nMaxDel, nMaxHardRepl=nMaxHardRepl))
        if not aSugg:
            #print("crush useless chars")
            aSugg.update(self._suggestWithCrushedUselessChars(cp.shrinkWord(sWord)))
        aSugg = cp.filterSugg(aSugg)
        sCleanWord = cp.cleanWord(sWord)
        aSugg = sorted(aSugg, key=lambda sSugg: cp.distanceDamerauLevenshtein(sCleanWord, cp.cleanWord(sSugg)))[:nMaxSugg]
        if sSfx or sPfx:
            # we add what we removed
            return list(map(lambda sSug: sPfx + sSug + sSfx, aSugg))
        return aSugg
    def _suggest (self, sRemain, nMaxDel=0, nMaxHardRepl=0, nDeep=0, iAddr=0, sNewWord="", bAvoidLoop=False):
        "returns a set of suggestions"
 | 
| ︙ | ︙ |