Grammalecte  Check-in [64ccfa7e38]

Overview
Comment:[core][bug] ibdawg: avoid storing several times the same suggestion
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | core | spellsugg
Files: files | file ages | folders
SHA3-256: 64ccfa7e38801aa95ad1ce8437396c96c4674b80d78ba5ff4c8ceb5af276e268
User & Date: olr on 2017-11-07 18:25:09
Other Links: branch diff | manifest | tags
Context
2017-11-07
19:28
[core] sort first range of suggestions + code clarification check-in: d22466bd67 user: olr tags: core, spellsugg
18:25
[core][bug] ibdawg: avoid storing several times the same suggestion check-in: 64ccfa7e38 user: olr tags: core, spellsugg
17:59
[core] ibdawg: use SuggResult for the first suggestion method also check-in: 515e7f3768 user: olr tags: core, spellsugg
Changes

Modified gc_core/py/ibdawg.py from [8db71f38ab] to [2152c0fca3].

25
26
27
28
29
30
31

32
33
34
35
36
37
38
39

40
41
42

43
44
45
46
47
48

49

50
51
52
53
54
55
56



57
58
59
60
61
62
63
64
65




66
67
68
69
70
71
72
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39

40
41
42
43
44
45
46
47
48
49
50
51

52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82







+







-
+



+






+
-
+







+
+
+









+
+
+
+







        fEnd = time.time()
        print(func.__name__, fEnd - fStart)
        return result
    return wrapper


class SuggResult:
    """Structure for storing, classifying and filtering suggestions.

    Suggestions are grouped in buckets keyed by their Damerau-Levenshtein
    distance to the cleaned form of the word being corrected.  The accepted
    distance shrinks as closer suggestions are found (never more than two
    above the best distance seen so far).
    """

    def __init__ (self, sWord, nDistLimit=-1):
        """Prepare an empty result set for <sWord>.

        sWord       the word for which suggestions are collected
        nDistLimit  maximum accepted distance; a negative value selects the
                    default limit (len(sWord) // 3) + 1
        """
        self.sWord = sWord
        self.sCleanWord = cp.cleanWord(sWord)
        self.nDistLimit = nDistLimit  if nDistLimit >= 0  else  (len(sWord) // 3) + 1
        self.nMinDist = 1000    # sentinel: larger than any distance compared against it
        self.aSugg = set()      # every suggestion already stored (prevents duplicates)
        self.dSugg = { 0: [],  1: [] }  # distance -> suggestions, other buckets are created on demand

    def addSugg (self, sSugg, nDeep=0):
        """Add a suggestion unless it is already stored or too distant.

        sSugg   the suggested word
        nDeep   recursion depth of the caller, only used to indent the log line
        """
        if sSugg in self.aSugg:
            return
        nDist = st.distanceDamerauLevenshtein(self.sCleanWord, cp.cleanWord(sSugg))
        if nDist > self.nDistLimit:
            return
        self.dSugg.setdefault(nDist, []).append(sSugg)
        self.aSugg.add(sSugg)
        logging.info((nDeep * "  ") + "__" + sSugg + "__")
        if nDist < self.nMinDist:
            self.nMinDist = nDist
        # once a close suggestion is known, stop accepting much more distant ones
        self.nDistLimit = min(self.nDistLimit, self.nMinDist+2)

    def _collectByDistance (self, nSuggLimit):
        """Return stored suggestions ordered by increasing distance.

        Buckets are visited in ascending distance order rather than in dict
        insertion order, because buckets are created lazily by addSugg and may
        therefore have been inserted out of order.  Buckets beyond the current
        distance limit are skipped: they can only hold suggestions accepted
        before the limit was tightened.  Whole buckets are taken until more
        than <nSuggLimit> suggestions have been gathered.
        """
        lRes = []
        for nDist in sorted(self.dSugg):
            if nDist > self.nDistLimit:
                break
            lRes.extend(self.dSugg[nDist])
            if len(lRes) > nSuggLimit:
                break
        return lRes

    def getSuggestions (self, nSuggLimit=10, nDistLimit=-1):
        """Return a list of at most <nSuggLimit> suggestions, best first.

        nSuggLimit  maximum number of suggestions returned
        nDistLimit  accepted for interface compatibility; currently ignored

        The candidates are passed through cp.filterSugg, and are title-cased
        when the original word is itself title-cased.
        """
        lRes = list(cp.filterSugg(self._collectByDistance(nSuggLimit)))
        if self.sWord.istitle():
            lRes = [ sSugg.title()  for sSugg in lRes ]
        return lRes[:nSuggLimit]

    def reset (self):
        """Forget every stored suggestion.

        Only the containers are emptied: nMinDist and nDistLimit keep the
        values reached so far.
        """
        self.aSugg.clear()
        self.dSugg.clear()


class IBDAWG:
    """INDEXABLE BINARY DIRECT ACYCLIC WORD GRAPH"""

    def __init__ (self, sDicName):
        self.by = pkgutil.get_data(__package__, "_dictionaries/" + sDicName)
        if not self.by:
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
261
262
263
264
265
266
267

268
269
270
271
272
273
274







-







        aSugg = oSuggResult.getSuggestions()
        if sSfx or sPfx:
            # we add what we removed
            return list(map(lambda sSug: sPfx + sSug + sSfx, aSugg))
        return aSugg

    def _suggest (self, oSuggResult, sRemain, nMaxDel=0, nMaxHardRepl=0, nDeep=0, iAddr=0, sNewWord="", sAction="", bAvoidLoop=False):
        "returns a set of suggestions"
        # recursive function
        #logging.info((nDeep * "  ") + sNewWord + ":" + sRemain + " · " + sAction)
        if not sRemain:
            if int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask:
                #logging.info((nDeep * "  ") + "__" + sNewWord + "__")
                oSuggResult.addSugg(sNewWord)
            for sTail in self._getTails(iAddr):