Grammalecte  Check-in [0346ff5361]

Overview
Comment:[core] ibdawg: suggest() try by removing chars
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | trunk | core
Files: files | file ages | folders
SHA3-256: 0346ff53610cfc086c20acda84c3454754e9b8c75e126bd9a28ed992d49db62d
User & Date: olr on 2017-09-13 12:56:02
Other Links: manifest | tags
Context
2017-09-13
13:06
[core] ibdawg: suggest() with caps check-in: 8a163526c6 user: olr tags: trunk, core
12:56
[core] ibdawg: suggest() try by removing chars check-in: 0346ff5361 user: olr tags: trunk, core
09:55
[core][py] comments for spell suggestion engine check-in: 6edb221239 user: olr tags: trunk, core
Changes

Modified gc_core/py/char_player.py from [845d36e2d1] to [67d42e5b68].

1
2
3
4
5
6
7
8
9
10
11
12
# list of similar chars
# useful for suggestion mechanism


def distanceBetweenWords (s1, s2):
    "distance of Damerau-Levenshtein between <s1> and <s2>"
    # https://fr.wikipedia.org/wiki/Distance_de_Damerau-Levenshtein
    d = {}
    nLen1 = len(s1)
    nLen2 = len(s2)
    for i in range(-1, nLen1+1):
        d[i, -1] = i + 1




|







1
2
3
4
5
6
7
8
9
10
11
12
# list of similar chars
# useful for suggestion mechanism


def distanceDamerauLevenshtein (s1, s2):
    "distance of Damerau-Levenshtein between <s1> and <s2>"
    # https://fr.wikipedia.org/wiki/Distance_de_Damerau-Levenshtein
    d = {}
    nLen1 = len(s1)
    nLen2 = len(s2)
    for i in range(-1, nLen1+1):
        d[i, -1] = i + 1

Modified gc_core/py/ibdawg.py from [a443118d5b] to [dbac860573].

182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197

198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219

220
221
222

223





224
225
226
227
228
229

230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
            l.extend(self.morph(sWord.lower()))
            if sWord.isupper() and len(sWord) > 1:
                l.extend(self.morph(sWord.capitalize()))
        return l

    def suggest (self, sWord, nMaxSugg=10):
        """Return the suggestions found for <sWord> as a list, closest first.

        Three strategies are tried in turn, each one only when the previous
        ones produced nothing:
          1. search from <sWord> itself,
          2. search from <sWord> without its first character,
          3. search from the "cleared" form of <sWord> (cp.clearWord).
        The result is ordered by cp.distanceBetweenWords(sWord, suggestion).
        <nMaxSugg> is accepted but not used by this method.
        """
        aFound = self._suggest(sWord)
        if not aFound:
            print("try without first char")
            aFound.update(self._suggest(sWord[1:]))
        if not aFound:
            # still nothing: last resort
            print("crush useless chars")
            aFound.update(self._suggestWithCrushedUselessChars(cp.clearWord(sWord)))

        def _distanceToWord (sSugg):
            return cp.distanceBetweenWords(sWord, sSugg)

        lResult = list(aFound)
        lResult.sort(key=_distanceToWord)
        return lResult

    def _suggest (self, sRemain, nDeep=0, iAddr=0, sNewWord="", bAvoidLoop=False):
        """Return a set of suggestions (recursive search).

        <sRemain>     part of the input word still to be consumed
        <nDeep>       recursion depth; incremented on each call, only used by
                      the commented-out show() debugging calls
        <iAddr>       offset in self.byDic of the node currently examined
        <sNewWord>    suggestion built so far
        <bAvoidLoop>  when True, only the similar-arc step is performed; the
                      calls that rewrite <sRemain> without moving to another
                      node pass True so that they cannot rewrite it again
        """
        # recursive function
        aSugg = set()
        if not sRemain:
            # Nothing left to consume: <sNewWord> is kept if the value read at <iAddr>
            # has the _finalNodeMask bit set (presumably: a word may end on this node).
            if int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask:
                #show(nDeep, "___" + sNewWord + "___")
                aSugg.add(sNewWord)
            # every tail returned by _getTails() for this node is also proposed
            for sTail in self._getTails(iAddr):
                aSugg.add(sNewWord+sTail)
            return aSugg
        #show(nDeep, "<" + sRemain + ">  ===>  " + sNewWord)
        cCurrent = sRemain[0:1]
        # consume the first char: follow each arc yielded for it and go on from node <jAddr>
        for cChar, jAddr in self._getSimilarArcs(cCurrent, iAddr):
            #show(nDeep, cChar)
            aSugg.update(self._suggest(sRemain[1:], nDeep+1, jAddr, sNewWord+cChar))
        if not bAvoidLoop: # avoid infinite loop
            #show(nDeep, ":no loop:")
            if cCurrent == sRemain[1:2]:
                # same char, we remove 1 char without adding 1 to <sNewWord>

                aSugg.update(self._suggest(sRemain[1:], nDeep+1, iAddr, sNewWord))
            else:
                # switching chars
                # (the first two chars of <sRemain> are swapped, same node)
                aSugg.update(self._suggest(sRemain[1:2]+sRemain[0:1]+sRemain[2:], nDeep+1, iAddr, sNewWord, True))





            # replacements: the first char (cp.d1toX) or the first two chars (cp.d2toX)
            # of <sRemain> are replaced by each string listed for them
            for sRepl in cp.d1toX.get(cCurrent, ()):
                #show(nDeep, sRepl)
                aSugg.update(self._suggest(sRepl + sRemain[1:], nDeep+1, iAddr, sNewWord, True))
            for sRepl in cp.d2toX.get(sRemain[0:2], ()):
                #show(nDeep, sRepl)
                aSugg.update(self._suggest(sRepl + sRemain[2:], nDeep+1, iAddr, sNewWord, True))

            # word endings: replacements applied only when 2 chars (cp.dFinal2)
            # or 1 char (cp.dFinal1) remain
            if len(sRemain) == 2:
                for sRepl in cp.dFinal2.get(sRemain, ()):
                    #show(nDeep, sRepl)
                    aSugg.update(self._suggest(sRepl, nDeep+1, iAddr, sNewWord, True))
            elif len(sRemain) == 1:
                #show(nDeep, ":end of word:")
                # end of word
                aSugg.update(self._suggest("", nDeep+1, iAddr, sNewWord, True)) # remove last char and go on
                for sRepl in cp.dFinal1.get(sRemain, ()):
                    #show(nDeep, sRepl)
                    aSugg.update(self._suggest(sRepl, nDeep+1, iAddr, sNewWord, True))
        return aSugg

    def _getSimilarArcs (self, cChar, iAddr):
        "generator: yield similar char of <cChar> and address of the following node"
        for c in cp.d1to1.get(cChar, [cChar]):
            if c in self.dChar:
                jAddr = self._lookupArcNode(self.dChar[c], iAddr)







<

|

<
<
<
|
|
>
|

|










|


|
|




>
|


>
|
>
>
>
>
>

|
|

|
|
>


|
|

<
<
|

|
|







182
183
184
185
186
187
188

189
190
191



192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239


240
241
242
243
244
245
246
247
248
249
250
            l.extend(self.morph(sWord.lower()))
            if sWord.isupper() and len(sWord) > 1:
                l.extend(self.morph(sWord.capitalize()))
        return l

    def suggest (self, sWord, nMaxSugg=10):
        """Return at most <nMaxSugg> suggestions for <sWord>, closest first.

        <sWord>     the word to find suggestions for
        <nMaxSugg>  maximum number of suggestions returned; None means no
                    limit, a negative value is treated as 0

        The search allows one character deletion per 5 characters of <sWord>
        (len(sWord) // 5). If it finds nothing, a second search is run on the
        "cleared" form of the word (cp.clearWord).
        The result is a list sorted by Damerau-Levenshtein distance to <sWord>.
        """
        aSugg = self._suggest(sWord, nMaxDel=len(sWord) // 5)
        if not aSugg:
            print("crush useless chars")
            aSugg.update(self._suggestWithCrushedUselessChars(cp.clearWord(sWord)))
        # fr language: suggestions ending with "è" are discarded
        lSugg = sorted((sSugg  for sSugg in aSugg  if not sSugg.endswith("è")),
                       key=lambda sSugg: cp.distanceDamerauLevenshtein(sWord, sSugg))
        if nMaxSugg is None:
            return lSugg
        # <nMaxSugg> used to be ignored: keep only the closest suggestions
        return lSugg[:max(nMaxSugg, 0)]

    def _suggest (self, sRemain, nMaxDel=0, nDeep=0, iAddr=0, sNewWord="", bAvoidLoop=False):
        """Return a set of suggestions (recursive search).

        <sRemain>     part of the input word still to be consumed
        <nMaxDel>     number of character deletions still allowed; decremented
                      each time a character is dropped by the "delete char" step
        <nDeep>       recursion depth; incremented on each call, only used by
                      the commented-out show() debugging calls
        <iAddr>       offset in self.byDic of the node currently examined
        <sNewWord>    suggestion built so far
        <bAvoidLoop>  when True, only the similar-arc step is performed; the
                      calls that rewrite <sRemain> without moving to another
                      node pass True so that they cannot rewrite it again
        """
        # recursive function
        aSugg = set()
        if not sRemain:
            # Nothing left to consume: <sNewWord> is kept if the value read at <iAddr>
            # has the _finalNodeMask bit set (presumably: a word may end on this node).
            if int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask:
                #show(nDeep, "___" + sNewWord + "___")
                aSugg.add(sNewWord)
            # every tail returned by _getTails() for this node is also proposed
            for sTail in self._getTails(iAddr):
                aSugg.add(sNewWord+sTail)
            return aSugg
        #show(nDeep, ":" + sRemain + ":  ===>  " + sNewWord)
        cCurrent = sRemain[0:1]
        # consume the first char: follow each arc yielded for it and go on from node <jAddr>
        for cChar, jAddr in self._getSimilarArcs(cCurrent, iAddr):
            #show(nDeep, "<"+cChar+">")
            aSugg.update(self._suggest(sRemain[1:], nMaxDel, nDeep+1, jAddr, sNewWord+cChar))
        if not bAvoidLoop: # avoid infinite loop
            #show(nDeep, ":no loop:")
            if cCurrent == sRemain[1:2]:
                # same char, we remove 1 char without adding 1 to <sNewWord>
                #show(nDeep, cCurrent*2 + " /2")
                aSugg.update(self._suggest(sRemain[1:], nMaxDel, nDeep+1, iAddr, sNewWord))
            else:
                # switching chars
                #show(nDeep, "switch: "+sRemain[0:2])
                aSugg.update(self._suggest(sRemain[1:2]+sRemain[0:1]+sRemain[2:], nMaxDel, nDeep+1, iAddr, sNewWord, True))
                # delete char
                # (only while the deletion budget <nMaxDel> is not exhausted)
                if nMaxDel > 0:
                    #show(nDeep, "delete: "+sRemain[0:1])
                    aSugg.update(self._suggest(sRemain[1:], nMaxDel-1, nDeep+1, iAddr, sNewWord, True))
            # Replacements
            # the first char (cp.d1toX) or the first two chars (cp.d2toX) of <sRemain>
            # are replaced by each string listed for them
            for sRepl in cp.d1toX.get(cCurrent, ()):
                #show(nDeep, cCurrent + " >> " + sRepl)
                aSugg.update(self._suggest(sRepl + sRemain[1:], nMaxDel, nDeep+1, iAddr, sNewWord, True))
            for sRepl in cp.d2toX.get(sRemain[0:2], ()):
                #show(nDeep, sRemain[0:2] + " >> " + sRepl)
                aSugg.update(self._suggest(sRepl + sRemain[2:], nMaxDel, nDeep+1, iAddr, sNewWord, True))
            # end of word
            # replacements applied only when 2 chars (cp.dFinal2) or 1 char (cp.dFinal1) remain
            if len(sRemain) == 2:
                for sRepl in cp.dFinal2.get(sRemain, ()):
                    #show(nDeep, "$ " + sRemain + " >> " + sRepl)
                    aSugg.update(self._suggest(sRepl, nMaxDel, nDeep+1, iAddr, sNewWord, True))
            elif len(sRemain) == 1:
                # one char left: also try the word without it


                aSugg.update(self._suggest("", nMaxDel, nDeep+1, iAddr, sNewWord, True)) # remove last char and go on
                for sRepl in cp.dFinal1.get(sRemain, ()):
                    #show(nDeep, "$ " + sRemain + " >> " + sRepl)
                    aSugg.update(self._suggest(sRepl, nMaxDel, nDeep+1, iAddr, sNewWord, True))
        return aSugg

    def _getSimilarArcs (self, cChar, iAddr):
        "generator: yield similar char of <cChar> and address of the following node"
        for c in cp.d1to1.get(cChar, [cChar]):
            if c in self.dChar:
                jAddr = self._lookupArcNode(self.dChar[c], iAddr)
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
            else:
                sEntry = sWord + "\t" + self.funcStemming(sWord, self.lArcVal[nVal])
                for nMorphVal, _ in self._getArcs1(jAddr):
                    if not zPattern or zPattern.search(self.lArcVal[nMorphVal]):
                        yield sEntry + "\t" + self.lArcVal[nMorphVal]

    def _morph1 (self, sWord):
        "returns morphologies of sWord"
        iAddr = 0
        for c in sWord:
            if c not in self.dChar:
                return []
            iAddr = self._lookupArcNode(self.dChar[c], iAddr)
            if iAddr == None:
                return []







|







324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
            else:
                sEntry = sWord + "\t" + self.funcStemming(sWord, self.lArcVal[nVal])
                for nMorphVal, _ in self._getArcs1(jAddr):
                    if not zPattern or zPattern.search(self.lArcVal[nMorphVal]):
                        yield sEntry + "\t" + self.lArcVal[nMorphVal]

    def _morph1 (self, sWord):
        "returns morphologies of <sWord>"
        iAddr = 0
        for c in sWord:
            if c not in self.dChar:
                return []
            iAddr = self._lookupArcNode(self.dChar[c], iAddr)
            if iAddr == None:
                return []