Index: gc_core/py/char_player.py ================================================================== --- gc_core/py/char_player.py +++ gc_core/py/char_player.py @@ -1,10 +1,10 @@ # list of similar chars # useful for suggestion mechanism -def distanceBetweenWords (s1, s2): +def distanceDamerauLevenshtein (s1, s2): "distance of Damerau-Levenshtein between <s1> and <s2>" # https://fr.wikipedia.org/wiki/Distance_de_Damerau-Levenshtein d = {} nLen1 = len(s1) nLen2 = len(s2) Index: gc_core/py/ibdawg.py ================================================================== --- gc_core/py/ibdawg.py +++ gc_core/py/ibdawg.py @@ -184,22 +184,19 @@ l.extend(self.morph(sWord.capitalize())) return l def suggest (self, sWord, nMaxSugg=10): "returns a set of suggestions for <sWord>" - # first, we check for similar words - #return self._suggestWithCrushedUselessChars(cp.clearWord(sWord)) - aSugg = self._suggest(sWord) - if not aSugg: - print("try without first char") - aSugg.update(self._suggest(sWord[1:])) - if not aSugg: - print("crush useless chars") - aSugg.update(self._suggestWithCrushedUselessChars(cp.clearWord(sWord))) - return sorted(aSugg, key=lambda sSugg: cp.distanceBetweenWords(sWord, sSugg)) - - def _suggest (self, sRemain, nDeep=0, iAddr=0, sNewWord="", bAvoidLoop=False): + #return self._suggestWithCrushedUselessChars(cp.clearWord(sWord)) + aSugg = self._suggest(sWord, nMaxDel=len(sWord) // 5) + if not aSugg: + print("crush useless chars") + aSugg.update(self._suggestWithCrushedUselessChars(cp.clearWord(sWord))) + aSugg = filter(lambda x: not x.endswith("è"), aSugg) # fr language + return sorted(aSugg, key=lambda sSugg: cp.distanceDamerauLevenshtein(sWord, sSugg)) + + def _suggest (self, sRemain, nMaxDel=0, nDeep=0, iAddr=0, sNewWord="", bAvoidLoop=False): "returns a set of suggestions" # recursive function aSugg = set() if not sRemain: if int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask: @@ -206,40 +203,46 @@ #show(nDeep, "___" + sNewWord + 
"___") aSugg.add(sNewWord) for sTail in self._getTails(iAddr): aSugg.add(sNewWord+sTail) return aSugg - #show(nDeep, "<" + sRemain + "> ===> " + sNewWord) + #show(nDeep, ":" + sRemain + ": ===> " + sNewWord) cCurrent = sRemain[0:1] for cChar, jAddr in self._getSimilarArcs(cCurrent, iAddr): - #show(nDeep, cChar) - aSugg.update(self._suggest(sRemain[1:], nDeep+1, jAddr, sNewWord+cChar)) + #show(nDeep, "<"+cChar+">") + aSugg.update(self._suggest(sRemain[1:], nMaxDel, nDeep+1, jAddr, sNewWord+cChar)) if not bAvoidLoop: # avoid infinite loop #show(nDeep, ":no loop:") if cCurrent == sRemain[1:2]: # same char, we remove 1 char without adding 1 to <sNewWord> - aSugg.update(self._suggest(sRemain[1:], nDeep+1, iAddr, sNewWord)) + #show(nDeep, cCurrent*2 + " /2") + aSugg.update(self._suggest(sRemain[1:], nMaxDel, nDeep+1, iAddr, sNewWord)) else: # switching chars - aSugg.update(self._suggest(sRemain[1:2]+sRemain[0:1]+sRemain[2:], nDeep+1, iAddr, sNewWord, True)) + #show(nDeep, "switch: "+sRemain[0:2]) + aSugg.update(self._suggest(sRemain[1:2]+sRemain[0:1]+sRemain[2:], nMaxDel, nDeep+1, iAddr, sNewWord, True)) + # delete char + if nMaxDel > 0: + #show(nDeep, "delete: "+sRemain[0:1]) + aSugg.update(self._suggest(sRemain[1:], nMaxDel-1, nDeep+1, iAddr, sNewWord, True)) + # Replacements for sRepl in cp.d1toX.get(cCurrent, ()): - #show(nDeep, sRepl) - aSugg.update(self._suggest(sRepl + sRemain[1:], nDeep+1, iAddr, sNewWord, True)) + #show(nDeep, cCurrent + " >> " + sRepl) + aSugg.update(self._suggest(sRepl + sRemain[1:], nMaxDel, nDeep+1, iAddr, sNewWord, True)) for sRepl in cp.d2toX.get(sRemain[0:2], ()): - #show(nDeep, sRepl) - aSugg.update(self._suggest(sRepl + sRemain[2:], nDeep+1, iAddr, sNewWord, True)) + #show(nDeep, sRemain[0:2] + " >> " + sRepl) + aSugg.update(self._suggest(sRepl + sRemain[2:], nMaxDel, nDeep+1, iAddr, sNewWord, True)) + # end of word if len(sRemain) == 2: for sRepl in cp.dFinal2.get(sRemain, ()): - #show(nDeep, sRepl) - aSugg.update(self._suggest(sRepl, nDeep+1, 
iAddr, sNewWord, True)) + #show(nDeep, "$ " + sRemain + " >> " + sRepl) + aSugg.update(self._suggest(sRepl, nMaxDel, nDeep+1, iAddr, sNewWord, True)) elif len(sRemain) == 1: - #show(nDeep, ":end of word:") - # end of word - aSugg.update(self._suggest("", nDeep+1, iAddr, sNewWord, True)) # remove last char and go on + aSugg.update(self._suggest("", nMaxDel, nDeep+1, iAddr, sNewWord, True)) # remove last char and go on for sRepl in cp.dFinal1.get(sRemain, ()): - #show(nDeep, sRepl) - aSugg.update(self._suggest(sRepl, nDeep+1, iAddr, sNewWord, True)) + #show(nDeep, "$ " + sRemain + " >> " + sRepl) + aSugg.update(self._suggest(sRepl, nMaxDel, nDeep+1, iAddr, sNewWord, True)) return aSugg def _getSimilarArcs (self, cChar, iAddr): "generator: yield similar char of <cChar> and address of the following node" for c in cp.d1to1.get(cChar, [cChar]): @@ -323,11 +326,11 @@ for nMorphVal, _ in self._getArcs1(jAddr): if not zPattern or zPattern.search(self.lArcVal[nMorphVal]): yield sEntry + "\t" + self.lArcVal[nMorphVal] def _morph1 (self, sWord): - "returns morphologies of sWord" + "returns morphologies of <sWord>" iAddr = 0 for c in sWord: if c not in self.dChar: return [] iAddr = self._lookupArcNode(self.dChar[c], iAddr)