Grammalecte  Check-in [0346ff5361]

Overview
Comment:[core] ibdawg: suggest() try by removing chars
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | trunk | core
Files: files | file ages | folders
SHA3-256: 0346ff53610cfc086c20acda84c3454754e9b8c75e126bd9a28ed992d49db62d
User & Date: olr on 2017-09-13 12:56:02
Other Links: manifest | tags
Context
2017-09-13
13:06
[core] ibdawg: suggest() with caps check-in: 8a163526c6 user: olr tags: trunk, core
12:56
[core] ibdawg: suggest() try by removing chars check-in: 0346ff5361 user: olr tags: trunk, core
09:55
[core][py] comments for spell suggestion engine check-in: 6edb221239 user: olr tags: trunk, core
Changes

Modified gc_core/py/char_player.py from [845d36e2d1] to [67d42e5b68].

1
2
3
4
5

6
7
8
9
10
11
12
1
2
3
4

5
6
7
8
9
10
11
12




-
+







# list of similar chars
# useful for suggestion mechanism


def distanceBetweenWords (s1, s2):
def distanceDamerauLevenshtein (s1, s2):
    "distance of Damerau-Levenshtein between <s1> and <s2>"
    # https://fr.wikipedia.org/wiki/Distance_de_Damerau-Levenshtein
    d = {}
    nLen1 = len(s1)
    nLen2 = len(s2)
    for i in range(-1, nLen1+1):
        d[i, -1] = i + 1

Modified gc_core/py/ibdawg.py from [a443118d5b] to [dbac860573].

182
183
184
185
186
187
188
189
190
191

192
193
194
195
196
197
198




199
200

201
202
203
204
205
206
207
208
209
210
211

212
213
214
215


216
217
218
219

220

221
222

223






224
225
226


227
228
229



230
231
232
233


234
235
236
237

238
239
240


241
242
243
244
245
246
247
182
183
184
185
186
187
188

189

190
191






192
193
194
195
196

197
198
199
200
201
202
203
204
205
206
207

208
209
210


211
212
213
214
215
216
217

218
219
220
221

222
223
224
225
226
227
228


229
230
231


232
233
234
235
236


237
238
239



240
241


242
243
244
245
246
247
248
249
250







-

-
+

-
-
-
-
-
-
+
+
+
+

-
+










-
+


-
-
+
+




+
-
+


+
-
+
+
+
+
+
+

-
-
+
+

-
-
+
+
+


-
-
+
+

-
-
-
+

-
-
+
+







            l.extend(self.morph(sWord.lower()))
            if sWord.isupper() and len(sWord) > 1:
                l.extend(self.morph(sWord.capitalize()))
        return l

    def suggest (self, sWord, nMaxSugg=10):
        """Return spelling suggestions for <sWord>, closest match first.

        sWord    -- the word to find suggestions for
        nMaxSugg -- maximum number of suggestions returned

        Returns a list of words sorted by increasing Damerau-Levenshtein
        distance to <sWord> (as computed by cp.distanceDamerauLevenshtein),
        truncated to <nMaxSugg> entries.
        """
        # The search may drop one character for every five characters of input.
        aSugg = self._suggest(sWord, nMaxDel=len(sWord) // 5)
        if not aSugg:
            # Nothing found: retry through the "crushed chars" lookup on the
            # cleaned-up word (both helpers are defined outside this block).
            print("crush useless chars")
            aSugg.update(self._suggestWithCrushedUselessChars(cp.clearWord(sWord)))
        # fr language: candidates ending with "è" are discarded
        lSugg = [sSugg for sSugg in aSugg if not sSugg.endswith("è")]
        lSugg.sort(key=lambda sSugg: cp.distanceDamerauLevenshtein(sWord, sSugg))
        return lSugg[:nMaxSugg]

    def _suggest (self, sRemain, nMaxDel=0, nDeep=0, iAddr=0, sNewWord="", bAvoidLoop=False):
        """Recursively collect suggestions and return them as a set.

        sRemain    -- part of the input word not consumed yet
        nMaxDel    -- number of characters that may still be deleted from <sRemain>
        nDeep      -- recursion depth (only passed along; handy for debug traces)
        iAddr      -- address in self.byDic of the node reached so far
        sNewWord   -- candidate word built so far
        bAvoidLoop -- True when this call already results from a rewrite of
                      <sRemain>; further rewrites are then skipped so that the
                      recursion cannot loop forever
        """
        aSugg = set()
        if not sRemain:
            # Input exhausted: keep <sNewWord> if the arc value at <iAddr> has
            # the final-node bit set, and add every tail reported by _getTails().
            if int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask:
                aSugg.add(sNewWord)
            for sTail in self._getTails(iAddr):
                aSugg.add(sNewWord+sTail)
            return aSugg
        cCurrent = sRemain[0:1]
        # Follow every arc yielded for the current char, consuming one char.
        for cChar, jAddr in self._getSimilarArcs(cCurrent, iAddr):
            aSugg.update(self._suggest(sRemain[1:], nMaxDel, nDeep+1, jAddr, sNewWord+cChar))
        if not bAvoidLoop: # avoid infinite loop
            if cCurrent == sRemain[1:2]:
                # same char, we remove 1 char without adding 1 to <sNewWord>
                aSugg.update(self._suggest(sRemain[1:], nMaxDel, nDeep+1, iAddr, sNewWord))
            else:
                # switching chars
                aSugg.update(self._suggest(sRemain[1:2]+sRemain[0:1]+sRemain[2:], nMaxDel, nDeep+1, iAddr, sNewWord, True))
                # delete char (spends one unit of the deletion budget)
                if nMaxDel > 0:
                    aSugg.update(self._suggest(sRemain[1:], nMaxDel-1, nDeep+1, iAddr, sNewWord, True))
            # Replacements: one char, then two chars, rewritten per the cp tables
            for sRepl in cp.d1toX.get(cCurrent, ()):
                aSugg.update(self._suggest(sRepl + sRemain[1:], nMaxDel, nDeep+1, iAddr, sNewWord, True))
            for sRepl in cp.d2toX.get(sRemain[0:2], ()):
                aSugg.update(self._suggest(sRepl + sRemain[2:], nMaxDel, nDeep+1, iAddr, sNewWord, True))
            # end of word
            if len(sRemain) == 2:
                for sRepl in cp.dFinal2.get(sRemain, ()):
                    aSugg.update(self._suggest(sRepl, nMaxDel, nDeep+1, iAddr, sNewWord, True))
            elif len(sRemain) == 1:
                aSugg.update(self._suggest("", nMaxDel, nDeep+1, iAddr, sNewWord, True)) # remove last char and go on
                for sRepl in cp.dFinal1.get(sRemain, ()):
                    aSugg.update(self._suggest(sRepl, nMaxDel, nDeep+1, iAddr, sNewWord, True))
        return aSugg

    def _getSimilarArcs (self, cChar, iAddr):
        "generator: yield similar char of <cChar> and address of the following node"
        for c in cp.d1to1.get(cChar, [cChar]):
            if c in self.dChar:
                jAddr = self._lookupArcNode(self.dChar[c], iAddr)
321
322
323
324
325
326
327
328

329
330
331
332
333
334
335
324
325
326
327
328
329
330

331
332
333
334
335
336
337
338







-
+







            else:
                sEntry = sWord + "\t" + self.funcStemming(sWord, self.lArcVal[nVal])
                for nMorphVal, _ in self._getArcs1(jAddr):
                    if not zPattern or zPattern.search(self.lArcVal[nMorphVal]):
                        yield sEntry + "\t" + self.lArcVal[nMorphVal]

    def _morph1 (self, sWord):
        "returns morphologies of sWord"
        "returns morphologies of <sWord>"
        iAddr = 0
        for c in sWord:
            if c not in self.dChar:
                return []
            iAddr = self._lookupArcNode(self.dChar[c], iAddr)
            if iAddr == None:
                return []