Grammalecte  Check-in [fbf59c7547]

Overview
Comment:[core] ibdawg: bug fixed and code cleaning
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | trunk | core
Files: files | file ages | folders
SHA3-256: fbf59c7547496f1405546022c62cb095fdb2162aaf96fe449f57c3634bcfc709
User & Date: olr on 2017-11-22 14:57:05
Other Links: manifest | tags
Context
2017-11-22
14:58
[core][js] Damerau-Levenshtein distance buggy: can’t find out why -> another version check-in: 300e9c7d3d user: olr tags: trunk, core
14:57
[core] ibdawg: bug fixed and code cleaning check-in: fbf59c7547 user: olr tags: trunk, core
2017-11-21
17:54
[core] suggestion engine: word simplification check-in: fa01465ba8 user: olr tags: trunk, core
Changes

Modified gc_core/js/char_player.js from [2d3bad14c9] to [c9b14a8774].

12
13
14
15
16
17
18
19

20
21
22
23
24
25
26
27
28
29
30
31
32

33
34
35
36
37
38
39
12
13
14
15
16
17
18

19
20
21
22
23
24
25
26
27
28
29
30
31

32
33
34
35
36
37
38
39







-
+












-
+







        ['ä', 'a'],  ['ê', 'e'],  ['í', 'i'],  ['ó', 'o'],  ['ü', 'u'],  ['ý', 'i'],
        ['á', 'a'],  ['ë', 'e'],  ['ì', 'i'],  ['ò', 'o'],  ['ú', 'u'],  ['ỳ', 'i'],
        ['ā', 'a'],  ['ē', 'e'],  ['ī', 'i'],  ['ō', 'o'],  ['ū', 'u'],  ['ȳ', 'i'],
        ['ñ', 'n'],  ['k', 'q'],  ['w', 'v'],
        ['œ', 'oe'], ['æ', 'ae'], 
    ]),

    cleanWord: function (sWord) {
    simplifyWord: function (sWord) {
        // word simplication before calculating distance between words
        sWord = sWord.toLowerCase();
        let sNewWord = "";
        let i = 1;
        for (let c of sWord) {
            let cNew = this._dTransChars.gl_get(c, c);
            let cNext = sWord.slice(i, i+1)
            if (cNew != this._dTransChars.gl_get(cNext, cNext)) {
                sNewWord += cNew;
            }
            i++;
        }
        return sNewWord.replace("eau", "o").replace("au", "o").replace("ai", "e").replace("ei", "e").replace("ph", "f");
        return sNewWord.replace(/eau/g, "o").replace(/au/g, "o").replace(/ai/g, "e").replace(/ei/g, "e").replace(/ph/g, "f");
    },

    aVowel: new Set("aáàâäāeéèêëēiíìîïīoóòôöōuúùûüūyýỳŷÿȳœæAÁÀÂÄĀEÉÈÊËĒIÍÌÎÏĪOÓÒÔÖŌUÚÙÛÜŪYÝỲŶŸȲŒÆ"),
    aConsonant: new Set("bcçdfghjklmnñpqrstvwxzBCÇDFGHJKLMNÑPQRSTVWXZ"),
    aDouble: new Set("bcdfjklmnprstzBCDFJKLMNPRSTZ"),  // letters that may be used twice successively


Modified gc_core/js/ibdawg.js from [0055aa4ed5] to [c1be157216].

19
20
21
22
23
24
25
26

27
28
29
30

31
32
33
34
35
36

37
38
39
40
41
42
43
19
20
21
22
23
24
25

26
27
28
29

30
31
32
33
34
35

36
37
38
39
40
41
42
43







-
+



-
+





-
+









class SuggResult {
    // Structure for storing, classifying and filtering suggestions

    constructor (sWord, nDistLimit=-1) {
        this.sWord = sWord;
        this.sCleanWord = char_player.cleanWord(sWord);
        this.sSimplifiedWord = char_player.simplifyWord(sWord);
        this.nDistLimit = (nDistLimit >= 0) ? nDistLimit :  Math.floor(sWord.length / 3) + 1;
        this.nMinDist = 1000;
        this.aSugg = new Set();
        this.dSugg = new Map([ [0, []],  [1, []] ]);
        this.dSugg = new Map([ [0, []],  [1, []],  [2, []] ]);
    }

    addSugg (sSugg, nDeep=0) {
        // add a suggestion
        if (!this.aSugg.has(sSugg)) {
            let nDist = str_transform.distanceDamerauLevenshtein(this.sCleanWord, char_player.cleanWord(sSugg));
            let nDist = str_transform.distanceDamerauLevenshtein(this.sSimplifiedWord, char_player.simplifyWord(sSugg));
            if (nDist <= this.nDistLimit) {
                if (!this.dSugg.has(nDist)) {
                    this.dSugg.set(nDist, []);
                }
                this.dSugg.get(nDist).push(sSugg);
                this.aSugg.add(sSugg);
                if (nDist < this.nMinDist) {
247
248
249
250
251
252
253
254

255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270

271
272
273
274
275
276
277
247
248
249
250
251
252
253

254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269

270
271
272
273
274
275
276
277







-
+















-
+







            if (sWord.gl_isUpperCase() && sWord.length > 1) {
                l = l.concat(this.morph(sWord.gl_toCapitalize()));
            }
        }
        return l;
    }

    suggest (sWord, nMaxSugg=10) {
    suggest (sWord, nSuggLimit=10) {
        // returns a array of suggestions for <sWord>
        let sPfx = "";
        let sSfx = "";
        [sPfx, sWord, sSfx] = char_player.cut(sWord);
        let nMaxSwitch = Math.max(Math.floor(sWord.length / 3), 1);
        let nMaxDel = Math.floor(sWord.length / 5);
        let nMaxHardRepl = Math.max(Math.floor((sWord.length - 5) / 4), 1);
        let oSuggResult = new SuggResult(sWord);
        this._suggest(oSuggResult, sWord, nMaxSwitch, nMaxDel, nMaxHardRepl);
        if (sWord.gl_isTitle()) {
            this._suggest(oSuggResult, sWord.toLowerCase(), nMaxSwitch, nMaxDel, nMaxHardRepl);
        }
        else if (sWord.gl_isLowerCase()) {
            this._suggest(oSuggResult, sWord.gl_toCapitalize(), nMaxSwitch, nMaxDel, nMaxHardRepl);
        }
        let aSugg = oSuggResult.getSuggestions();
        let aSugg = oSuggResult.getSuggestions(nSuggLimit);
        if (sSfx || sPfx) {
            // we add what we removed
            return aSugg.map( (sSugg) => { return sPfx + sSugg + sSfx } );
        }
        return aSugg;
    }

285
286
287
288
289
290
291
292

293
294
295
296
297
298
299
285
286
287
288
289
290
291

292
293
294
295
296
297
298
299







-
+







            for (let sTail of this._getTails(iAddr)) {
                oSuggResult.addSugg(sNewWord+sTail);
            }
            return;
        }
        let cCurrent = sRemain.slice(0, 1);
        for (let [cChar, jAddr] of this._getCharArcs(iAddr)) {
            if (char_player.d1to1.gl_get(cCurrent, [cCurrent]).includes(cChar)) {
            if (char_player.d1to1.gl_get(cCurrent, cCurrent).indexOf(cChar) != -1) {
                this._suggest(oSuggResult, sRemain.slice(1), nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, jAddr, sNewWord+cChar);
            }
            else if (!bAvoidLoop && nMaxHardRepl) {
                this._suggest(oSuggResult, sRemain.slice(1), nMaxSwitch, nMaxDel, nMaxHardRepl-1, nDeep+1, jAddr, sNewWord+cChar, true);
            }
        }
        if (!bAvoidLoop) { // avoid infinite loop

Modified gc_core/js/str_transform.js from [e70f6ede55] to [9dddadeae1].

30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
30
31
32
33
34
35
36

37
38
39
40
41
42
43







-







                        matrix[i-1][j-1] + nCost    // Substitution
                    );
                    if (i > 1 && j > 1 && s1[i] == s2[j-1] && s1[i-1] == s2[j]) {
                        matrix[i][j] = Math.min(matrix[i][j], matrix[i-2][j-2] + nCost);  // Transposition
                    }
                }
            }
            //console.log(s2 + ": " + matrix[nLen1][nLen2]);
            return matrix[nLen1][nLen2];
        }
        catch (e) {
            helpers.logerror(e);
        }
    },

Modified gc_core/py/char_player.py from [a33649b94f] to [82e97eae54].

10
11
12
13
14
15
16
17

18
19
20
21
22
23
24
10
11
12
13
14
15
16

17
18
19
20
21
22
23
24







-
+







    'ä': 'a',  'ê': 'e',  'í': 'i',  'ó': 'o',  'ü': 'u',  'ý': 'i',
    'á': 'a',  'ë': 'e',  'ì': 'i',  'ò': 'o',  'ú': 'u',  'ỳ': 'i',
    'ā': 'a',  'ē': 'e',  'ī': 'i',  'ō': 'o',  'ū': 'u',  'ȳ': 'i',
    'ñ': 'n',  'k': 'q',  'w': 'v',
    'œ': 'oe',  'æ': 'ae', 
})

def cleanWord (sWord):
def simplifyWord (sWord):
    "word simplication before calculating distance between words"
    sWord = sWord.lower().translate(_xTransChars)
    sNewWord = ""
    for i, c in enumerate(sWord, 1):
        if c != sWord[i:i+1]:
            sNewWord += c
    return sNewWord.replace("eau", "o").replace("au", "o").replace("ai", "e").replace("ei", "e").replace("ph", "f")

Modified gc_core/py/ibdawg.py from [4376dde7d7] to [0bb28ceca0].

28
29
30
31
32
33
34
35

36
37
38
39

40
41
42
43
44
45

46
47
48
49
50
51
52
28
29
30
31
32
33
34

35
36
37
38

39
40
41
42
43
44

45
46
47
48
49
50
51
52







-
+



-
+





-
+









class SuggResult:
    """Structure for storing, classifying and filtering suggestions"""

    def __init__ (self, sWord, nDistLimit=-1):
        self.sWord = sWord
        self.sCleanWord = cp.cleanWord(sWord)
        self.sSimplifiedWord = cp.simplifyWord(sWord)
        self.nDistLimit = nDistLimit  if nDistLimit >= 0  else  (len(sWord) // 3) + 1
        self.nMinDist = 1000
        self.aSugg = set()
        self.dSugg = { 0: [],  1: [] }
        self.dSugg = { 0: [],  1: [],  2: [] }

    def addSugg (self, sSugg, nDeep=0):
        "add a suggestion"
        #logging.info((nDeep * "  ") + "__" + sSugg + "__")
        if sSugg not in self.aSugg:
            nDist = st.distanceDamerauLevenshtein(self.sCleanWord, cp.cleanWord(sSugg))
            nDist = st.distanceDamerauLevenshtein(self.sSimplifiedWord, cp.simplifyWord(sSugg))
            if nDist <= self.nDistLimit:
                if nDist not in self.dSugg:
                    self.dSugg[nDist] = []
                self.dSugg[nDist].append(sSugg)
                self.aSugg.add(sSugg)
                if nDist < self.nMinDist:
                    self.nMinDist = nDist
243
244
245
246
247
248
249
250

251
252
253
254
255
256
257
258
259
260
261
262

263
264
265
266
267
268
269
243
244
245
246
247
248
249

250
251
252
253
254
255
256
257
258
259
260
261

262
263
264
265
266
267
268
269







-
+











-
+







        if sWord[0:1].isupper():
            l.extend(self.morph(sWord.lower()))
            if sWord.isupper() and len(sWord) > 1:
                l.extend(self.morph(sWord.capitalize()))
        return l

    #@timethis
    def suggest (self, sWord, nMaxSugg=10):
    def suggest (self, sWord, nSuggLimit=10):
        "returns a set of suggestions for <sWord>"
        sPfx, sWord, sSfx = cp.cut(sWord)
        nMaxSwitch = max(len(sWord) // 3, 1)
        nMaxDel = len(sWord) // 5
        nMaxHardRepl = max((len(sWord) - 5) // 4, 1)
        oSuggResult = SuggResult(sWord)
        self._suggest(oSuggResult, sWord, nMaxSwitch=nMaxSwitch, nMaxDel=nMaxDel, nMaxHardRepl=nMaxHardRepl)
        if sWord.istitle():
            self._suggest(oSuggResult, sWord.lower(), nMaxSwitch=nMaxSwitch, nMaxDel=nMaxDel, nMaxHardRepl=nMaxHardRepl)
        elif sWord.islower():
            self._suggest(oSuggResult, sWord.title(), nMaxSwitch=nMaxSwitch, nMaxDel=nMaxDel, nMaxHardRepl=nMaxHardRepl)
        aSugg = oSuggResult.getSuggestions()
        aSugg = oSuggResult.getSuggestions(nSuggLimit)
        if sSfx or sPfx:
            # we add what we removed
            return list(map(lambda sSug: sPfx + sSug + sSfx, aSugg))
        return aSugg

    def _suggest (self, oSuggResult, sRemain, nMaxSwitch=0, nMaxDel=0, nMaxHardRepl=0, nDeep=0, iAddr=0, sNewWord="", bAvoidLoop=False):
        # recursive function
318
319
320
321
322
323
324
325

326
327
328
329
330
331
332
318
319
320
321
322
323
324

325
326
327
328
329
330
331
332







-
+







            return list(map(lambda sSug: sPfx + sSug + sSfx, aSugg))
        return aSugg

    def _suggest2 (self, oSuggResult, nDeep=0, iAddr=0, sNewWord=""):
        # recursive function
        #logging.info((nDeep * "  ") + sNewWord)
        if nDeep >= oSuggResult.nDistLimit:
            sCleanNewWord = cp.cleanWord(sNewWord)
            sCleanNewWord = cp.simplifyWord(sNewWord)
            if st.distanceSift4(oSuggResult.sCleanWord[:len(sCleanNewWord)], sCleanNewWord) > oSuggResult.nDistLimit:
                return
        if int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask:
            oSuggResult.addSugg(sNewWord, nDeep)
        for cChar, jAddr in self._getCharArcsWithPriority(iAddr, oSuggResult.sWord[nDeep:nDeep+1]):
            self._suggest2(oSuggResult, nDeep+1, jAddr, sNewWord+cChar)
        return

Modified make.py from [37cd0bb45e] to [2aed0ffb52].

353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
353
354
355
356
357
358
359

360
361
362
363
364
365
366







-







            if False:
                # obsolete
                with helpers.cd("_build/xpi/"+sLang):
                    spfFirefox = dVars['win_fx_dev_path']  if platform.system() == "Windows"  else dVars['linux_fx_dev_path']
                    os.system('jpm run -b "' + spfFirefox + '"')

            if xArgs.web_ext or xArgs.firefox:
                
                with helpers.cd("_build/webext/"+sLang):
                    if xArgs.firefox:
                        # Firefox Developper edition
                        spfFirefox = dVars['win_fx_dev_path']  if platform.system() == "Windows"  else dVars['linux_fx_dev_path']
                    else:
                        # Firefox Nightly edition
                        spfFirefox = dVars['win_fx_nightly_path']  if platform.system() == "Windows"  else dVars['linux_fx_nightly_path']