Grammalecte  Check-in [b56f2d61d0]

Overview
Comment:[graphspell] ibdawg: spelling suggestion mechanism test
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | trunk | graphspell
Files: files | file ages | folders
SHA3-256: b56f2d61d0f35e920e9e3c56bee89df4a17f3f7cf0c027c439b612100932495c
User & Date: olr on 2024-06-11 17:27:55
Other Links: manifest | tags
Context
2024-06-11
17:44
[fr] dictionnaires: màj check-in: 800b3de9ea user: olr tags: fr, trunk, v2.2
17:27
[graphspell] ibdawg: spelling suggestion mechanism test check-in: b56f2d61d0 user: olr tags: graphspell, trunk
17:17
[fr] faux positifs check-in: 4d40f61f04 user: olr tags: fr, trunk
Changes

Modified graphspell/ibdawg.py from [71b7ddd53e] to [6f7d77073f].

40
41
42
43
44
45
46
47
48
49
50
51
52
53

54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104

class SuggResult:
    """Structure for storing, classifying and filtering suggestions"""

    def __init__ (self, sWord, nSuggLimit=10, nDistLimit=-1):
        self.sWord = sWord
        self.sSimplifiedWord = st.simplifyWord(sWord)
        self.nDistLimit = nDistLimit  if nDistLimit >= 0  else  (len(sWord) // 3) + 1
        self.nMinDist = 1000
        # Temporary sets
        self.aAllSugg = set()   # All suggestions, even the one rejected
        self.dGoodSugg = {}     # Acceptable suggestions
        self.dBestSugg = {}     # Best suggestions
        # Parameters

        self.nSuggLimit = nSuggLimit
        self.nSuggLimitExt = nSuggLimit + 2             # we add few entries in case suggestions merge after casing modifications
        self.nBestSuggLimit = floor(nSuggLimit * 2)     # n times the requested limit
        self.nGoodSuggLimit = nSuggLimit * 15           # n times the requested limit

    def addSugg (self, sSugg, nDeep=0):
        "add a suggestion"
        if sSugg in self.aAllSugg:
            return
        self.aAllSugg.add(sSugg)
        nDistJaro = 1 - st.distanceJaroWinkler(self.sSimplifiedWord, st.simplifyWord(sSugg))
        nDist = floor(nDistJaro * 10)
        if nDist < self.nMinDist:
            self.nMinDist = nDist
        #logging.info((nDeep * "  ") + "__" + sSugg + "__ " + str(round(nDistJaro*1000)))
        if nDistJaro < .11:     # Best suggestions
            self.dBestSugg[sSugg] = round(nDistJaro*1000)
            if len(self.dBestSugg) > self.nBestSuggLimit:
                self.nDistLimit = -1  # make suggest() to end search
        elif nDistJaro < .33:   # Good suggestions
            self.dGoodSugg[sSugg] = round(nDistJaro*1000)
            if len(self.dGoodSugg) > self.nGoodSuggLimit:
                self.nDistLimit = -1  # make suggest() to end search
        self.nDistLimit = min(self.nDistLimit, self.nMinDist+1)

    def getSuggestions (self):
        "return a list of suggestions"
        # we sort the better results with the original word
        lRes = []
        if len(self.dBestSugg) > 0:
            # sort only with simplified words
            lResTmp = sorted(self.dBestSugg.items(), key=lambda x: x[1])
            for i in range(min(self.nSuggLimitExt, len(lResTmp))):
                lRes.append(lResTmp[i][0])
        if len(lRes) < self.nSuggLimitExt:
            # sort with simplified words and original word
            lResTmp = sorted(self.dGoodSugg.items(), key=lambda x: ((1-st.distanceJaroWinkler(self.sWord, x[0]))*10, x[1]))
            for i in range(min(self.nSuggLimitExt, len(lResTmp))):
                lRes.append(lResTmp[i][0])
        # casing
        if self.sWord.isupper():
            lRes = list(OrderedDict.fromkeys(map(lambda sSugg: sSugg.upper(), lRes))) # use dict, when Python 3.6+
        elif self.sWord[0:1].isupper():
            # dont’ use <.istitle>
            lRes = list(OrderedDict.fromkeys(map(lambda sSugg: sSugg[0:1].upper()+sSugg[1:], lRes))) # use dict, when Python 3.6+
        return lRes[:self.nSuggLimit]


class IBDAWG:
    """INDEXABLE BINARY DIRECT ACYCLIC WORD GRAPH"""








|



<
|

>

<
<
|






|
|
|
|
<
<
|
|
|
<
|
|
|






<
|
|
|
|
<
<
<
<
|




|







40
41
42
43
44
45
46
47
48
49
50

51
52
53
54


55
56
57
58
59
60
61
62
63
64
65


66
67
68

69
70
71
72
73
74
75
76
77

78
79
80
81




82
83
84
85
86
87
88
89
90
91
92
93
94

class SuggResult:
    """Structure for storing, classifying and filtering suggestions"""

    def __init__ (self, sWord, nSuggLimit=10, nDistLimit=-1):
        self.sWord = sWord
        self.sSimplifiedWord = st.simplifyWord(sWord)
        self.nDistLimit = nDistLimit  if nDistLimit >= 0  else  (len(sWord) // 3) + 1 # used in suggest()
        self.nMinDist = 1000
        # Temporary sets
        self.aAllSugg = set()   # All suggestions, even the one rejected

        self.dAccSugg = {}      # Accepted suggestions
        # Parameters

        self.nSuggLimit = nSuggLimit


        self.nTempSuggLimit = nSuggLimit * 6

    def addSugg (self, sSugg, nDeep=0):
        "add a suggestion"
        if sSugg in self.aAllSugg:
            return
        self.aAllSugg.add(sSugg)
        nSimDist = st.distanceSift4(self.sSimplifiedWord, st.simplifyWord(sSugg))
        st.showDistance(self.sSimplifiedWord, st.simplifyWord(sSugg))
        if nSimDist < self.nMinDist:
            self.nMinDist = nSimDist


        if nSimDist <= (self.nMinDist + 1):
            nDist = st.distanceJaroWinkler(self.sWord, sSugg)
            st.showDistance(self.sWord, sSugg)

            self.dAccSugg[sSugg] = min(nDist, nSimDist+1)
            if len(self.dAccSugg) > self.nTempSuggLimit:
                self.nDistLimit = -1  # suggest() ends searching when this variable = -1
        self.nDistLimit = min(self.nDistLimit, self.nMinDist+1)

    def getSuggestions (self):
        "return a list of suggestions"
        # we sort the better results with the original word
        lRes = []

        # sort only with simplified words
        lResTmp = sorted(self.dAccSugg.items(), key=lambda x: (x[1], x[0]))
        for i in range(min(self.nSuggLimit, len(lResTmp))):
            lRes.append(lResTmp[i][0])




            #st.showDistance(self.sWord, lResTmp[i][0])
        # casing
        if self.sWord.isupper():
            lRes = list(OrderedDict.fromkeys(map(lambda sSugg: sSugg.upper(), lRes))) # use dict, when Python 3.6+
        elif self.sWord[0:1].isupper():
            # don’t use <.istitle>
            lRes = list(OrderedDict.fromkeys(map(lambda sSugg: sSugg[0:1].upper()+sSugg[1:], lRes))) # use dict, when Python 3.6+
        return lRes[:self.nSuggLimit]


class IBDAWG:
    """INDEXABLE BINARY DIRECT ACYCLIC WORD GRAPH"""