Grammalecte  Check-in [b56f2d61d0]

Overview
Comment: [graphspell] ibdawg: spelling suggestion mechanism test
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | trunk | graphspell
Files: files | file ages | folders
SHA3-256: b56f2d61d0f35e920e9e3c56bee89df4a17f3f7cf0c027c439b612100932495c
User & Date: olr on 2024-06-11 17:27:55
Other Links: manifest | tags
Context
2024-06-11
17:44
[fr] dictionnaires: màj check-in: 800b3de9ea user: olr tags: trunk, fr, v2.2
17:27
[graphspell] ibdawg: spelling suggestion mechanism test check-in: b56f2d61d0 user: olr tags: trunk, graphspell
17:17
[fr] faux positifs check-in: 4d40f61f04 user: olr tags: trunk, fr
Changes

Modified graphspell/ibdawg.py from [71b7ddd53e] to [6f7d77073f].

40
41
42
43
44
45
46
47

48
49
50
51
52

53

54
55
56
57

58
59
60
61
62
63
64
65
66
67




68
69
70
71
72



73
74
75
76



77
78
79
80
81
82
83
84
85
86
87




88
89
90
91
92

93
94
95
96
97

98
99
100
101
102
103
104
40
41
42
43
44
45
46

47
48
49
50


51
52
53
54



55
56
57
58
59
60
61




62
63
64
65





66
67
68




69
70
71
72
73
74
75
76
77





78
79
80
81





82
83
84
85
86

87
88
89
90
91
92
93
94







-
+



-
-
+

+

-
-
-
+






-
-
-
-
+
+
+
+
-
-
-
-
-
+
+
+
-
-
-
-
+
+
+






-
-
-
-
-
+
+
+
+
-
-
-
-
-
+




-
+








class SuggResult:
    """Structure for storing, classifying and filtering suggestions"""

    def __init__ (self, sWord, nSuggLimit=10, nDistLimit=-1):
        self.sWord = sWord
        self.sSimplifiedWord = st.simplifyWord(sWord)
        self.nDistLimit = nDistLimit  if nDistLimit >= 0  else  (len(sWord) // 3) + 1
        self.nDistLimit = nDistLimit  if nDistLimit >= 0  else  (len(sWord) // 3) + 1 # used in suggest()
        self.nMinDist = 1000
        # Temporary sets
        self.aAllSugg = set()   # All suggestions, even the one rejected
        self.dGoodSugg = {}     # Acceptable suggestions
        self.dBestSugg = {}     # Best suggestions
        self.dAccSugg = {}      # Accepted suggestions
        # Parameters

        self.nSuggLimit = nSuggLimit
        self.nSuggLimitExt = nSuggLimit + 2             # we add few entries in case suggestions merge after casing modifications
        self.nBestSuggLimit = floor(nSuggLimit * 2)     # n times the requested limit
        self.nGoodSuggLimit = nSuggLimit * 15           # n times the requested limit
        self.nTempSuggLimit = nSuggLimit * 6

    def addSugg (self, sSugg, nDeep=0):
        "add a suggestion"
        if sSugg in self.aAllSugg:
            return
        self.aAllSugg.add(sSugg)
        nDistJaro = 1 - st.distanceJaroWinkler(self.sSimplifiedWord, st.simplifyWord(sSugg))
        nDist = floor(nDistJaro * 10)
        if nDist < self.nMinDist:
            self.nMinDist = nDist
        nSimDist = st.distanceSift4(self.sSimplifiedWord, st.simplifyWord(sSugg))
        st.showDistance(self.sSimplifiedWord, st.simplifyWord(sSugg))
        if nSimDist < self.nMinDist:
            self.nMinDist = nSimDist
        #logging.info((nDeep * "  ") + "__" + sSugg + "__ " + str(round(nDistJaro*1000)))
        if nDistJaro < .11:     # Best suggestions
            self.dBestSugg[sSugg] = round(nDistJaro*1000)
            if len(self.dBestSugg) > self.nBestSuggLimit:
                self.nDistLimit = -1  # make suggest() to end search
        if nSimDist <= (self.nMinDist + 1):
            nDist = st.distanceJaroWinkler(self.sWord, sSugg)
            st.showDistance(self.sWord, sSugg)
        elif nDistJaro < .33:   # Good suggestions
            self.dGoodSugg[sSugg] = round(nDistJaro*1000)
            if len(self.dGoodSugg) > self.nGoodSuggLimit:
                self.nDistLimit = -1  # make suggest() to end search
            self.dAccSugg[sSugg] = min(nDist, nSimDist+1)
            if len(self.dAccSugg) > self.nTempSuggLimit:
                self.nDistLimit = -1  # suggest() ends searching when this variable = -1
        self.nDistLimit = min(self.nDistLimit, self.nMinDist+1)

    def getSuggestions (self):
        "return a list of suggestions"
        # we sort the better results with the original word
        lRes = []
        if len(self.dBestSugg) > 0:
            # sort only with simplified words
            lResTmp = sorted(self.dBestSugg.items(), key=lambda x: x[1])
            for i in range(min(self.nSuggLimitExt, len(lResTmp))):
                lRes.append(lResTmp[i][0])
        # sort only with simplified words
        lResTmp = sorted(self.dAccSugg.items(), key=lambda x: (x[1], x[0]))
        for i in range(min(self.nSuggLimit, len(lResTmp))):
            lRes.append(lResTmp[i][0])
        if len(lRes) < self.nSuggLimitExt:
            # sort with simplified words and original word
            lResTmp = sorted(self.dGoodSugg.items(), key=lambda x: ((1-st.distanceJaroWinkler(self.sWord, x[0]))*10, x[1]))
            for i in range(min(self.nSuggLimitExt, len(lResTmp))):
                lRes.append(lResTmp[i][0])
            #st.showDistance(self.sWord, lResTmp[i][0])
        # casing
        if self.sWord.isupper():
            lRes = list(OrderedDict.fromkeys(map(lambda sSugg: sSugg.upper(), lRes))) # use dict, when Python 3.6+
        elif self.sWord[0:1].isupper():
            # dont’ use <.istitle>
            # don’t use <.istitle>
            lRes = list(OrderedDict.fromkeys(map(lambda sSugg: sSugg[0:1].upper()+sSugg[1:], lRes))) # use dict, when Python 3.6+
        return lRes[:self.nSuggLimit]


class IBDAWG:
    """INDEXABLE BINARY DIRECT ACYCLIC WORD GRAPH"""