Overview
Comment: | [core] sort first range of suggestions + code clarification |
---|---|
Downloads: | Tarball | ZIP archive | SQL archive |
Timelines: | family | ancestors | descendants | both | core | spellsugg |
Files: | files | file ages | folders |
SHA3-256: | d22466bd67f2df09b8e8d933ea100cc4 |
User & Date: | olr on 2017-11-07 19:28:50 |
Other Links: | branch diff | manifest | tags |
Context
2017-11-07
| ||
19:56 | [core] ibdawg: suggest2 > char priority check-in: 8ea89d19b5 user: olr tags: core, spellsugg | |
19:28 | [core] sort first range of suggestions + code clarification check-in: d22466bd67 user: olr tags: core, spellsugg | |
18:25 | [core][bug] ibdawg: avoid storing several times the same suggestion check-in: 64ccfa7e38 user: olr tags: core, spellsugg | |
Changes
Modified gc_core/py/ibdawg.py from [2152c0fca3] to [69a64ae917].
︙ | ︙ | |||
37 38 39 40 41 42 43 | self.nDistLimit = nDistLimit if nDistLimit >= 0 else (len(sWord) // 3) + 1 self.nMinDist = 1000 self.aSugg = set() self.dSugg = { 0: [], 1: [] } def addSugg (self, sSugg, nDeep=0): "add a suggestion" | | < | | | > > | 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 | self.nDistLimit = nDistLimit if nDistLimit >= 0 else (len(sWord) // 3) + 1 self.nMinDist = 1000 self.aSugg = set() self.dSugg = { 0: [], 1: [] } def addSugg (self, sSugg, nDeep=0): "add a suggestion" #logging.info((nDeep * " ") + "__" + sSugg + "__") if sSugg not in self.aSugg: nDist = st.distanceDamerauLevenshtein(self.sCleanWord, cp.cleanWord(sSugg)) if nDist <= self.nDistLimit: if nDist not in self.dSugg: self.dSugg[nDist] = [] self.dSugg[nDist].append(sSugg) self.aSugg.add(sSugg) if nDist < self.nMinDist: self.nMinDist = nDist self.nDistLimit = min(self.nDistLimit, self.nMinDist+2) def getSuggestions (self, nSuggLimit=10, nDistLimit=-1): "return a list of suggestions" lRes = [] if self.dSugg[0]: # we sort the better results with the original word self.dSugg[0].sort(key=lambda sSugg: st.distanceDamerauLevenshtein(self.sWord, sSugg)) for lSugg in self.dSugg.values(): lRes.extend(lSugg) if len(lRes) > nSuggLimit: break lRes = list(cp.filterSugg(lRes)) if self.sWord.istitle(): lRes = list(map(lambda sSugg: sSugg.title(), lRes)) elif self.sWord.isupper(): lRes = list(map(lambda sSugg: sSugg.upper(), lRes)) return lRes[:nSuggLimit] def reset (self): self.aSugg.clear() self.dSugg.clear() |
︙ | ︙ | |||
265 266 267 268 269 270 271 | return aSugg def _suggest (self, oSuggResult, sRemain, nMaxDel=0, nMaxHardRepl=0, nDeep=0, iAddr=0, sNewWord="", sAction="", bAvoidLoop=False): # recursive function #logging.info((nDeep * " ") + sNewWord + ":" + sRemain + " · " + sAction) if not sRemain: if int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask: | < < | | 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 | return aSugg def _suggest (self, oSuggResult, sRemain, nMaxDel=0, nMaxHardRepl=0, nDeep=0, iAddr=0, sNewWord="", sAction="", bAvoidLoop=False): # recursive function #logging.info((nDeep * " ") + sNewWord + ":" + sRemain + " · " + sAction) if not sRemain: if int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask: oSuggResult.addSugg(sNewWord) for sTail in self._getTails(iAddr): oSuggResult.addSugg(sNewWord+sTail) return cCurrent = sRemain[0:1] for cChar, jAddr in self._getSimilarCharArcs(cCurrent, iAddr): self._suggest(oSuggResult, sRemain[1:], nMaxDel, nMaxHardRepl, nDeep+1, jAddr, sNewWord+cChar, "*") if not bAvoidLoop: # avoid infinite loop if cCurrent == sRemain[1:2]: # same char, we remove 1 char without adding 1 to <sNewWord> self._suggest(oSuggResult, sRemain[1:], nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, cCurrent+"/2") else: # switching chars |
︙ | ︙ | |||
319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 | aSugg = oSuggResult.getSuggestions() if sSfx or sPfx: # we add what we removed return list(map(lambda sSug: sPfx + sSug + sSfx, aSugg)) return aSugg def _suggest2 (self, oSuggResult, nDeep=0, iAddr=0, sNewWord=""): #logging.info((nDeep * " ") + sNewWord) if nDeep >= oSuggResult.nDistLimit: sCleanNewWord = cp.cleanWord(sNewWord) if st.distanceSift4(oSuggResult.sCleanWord[:len(sCleanNewWord)], sCleanNewWord) > oSuggResult.nDistLimit: return if int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask: #logging.info((nDeep * " ") + "__" + sNewWord + "__") oSuggResult.addSugg(sNewWord, nDeep) for cChar, jAddr in self._getCharArcs(iAddr): self._suggest2(oSuggResult, nDeep+1, jAddr, sNewWord+cChar) return def _getCharArcs (self, iAddr): "generator: yield all chars and addresses from node at address <iAddr>" for nVal, jAddr in self._getArcs(iAddr): if nVal < self.nChar: yield (self.dCharVal[nVal], jAddr) | > | | | | < | | | | | | 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 | aSugg = oSuggResult.getSuggestions() if sSfx or sPfx: # we add what we removed return list(map(lambda sSug: sPfx + sSug + sSfx, aSugg)) return aSugg def _suggest2 (self, oSuggResult, nDeep=0, iAddr=0, sNewWord=""): # recursive function #logging.info((nDeep * " ") + sNewWord) if nDeep >= oSuggResult.nDistLimit: sCleanNewWord = cp.cleanWord(sNewWord) if st.distanceSift4(oSuggResult.sCleanWord[:len(sCleanNewWord)], sCleanNewWord) > oSuggResult.nDistLimit: return if int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask: #logging.info((nDeep * " ") + "__" + sNewWord + "__") oSuggResult.addSugg(sNewWord, nDeep) for cChar, 
jAddr in self._getCharArcs(iAddr): self._suggest2(oSuggResult, nDeep+1, jAddr, sNewWord+cChar) return def _getCharArcs (self, iAddr): "generator: yield all chars and addresses from node at address <iAddr>" for nVal, jAddr in self._getArcs(iAddr): if nVal < self.nChar: yield (self.dCharVal[nVal], jAddr) def _getSimilarCharArcs (self, cChar, iAddr): "generator: yield similar char of <cChar> and address of the following node" for c in cp.d1to1.get(cChar, [cChar]): if c in self.dChar: jAddr = self._lookupArcNode(self.dChar[c], iAddr) if jAddr: yield (c, jAddr) def _getTails (self, iAddr, sTail="", n=2): "return a list of suffixes ending at a distance of <n> from <iAddr>" aTails = set() for nVal, jAddr in self._getArcs(iAddr): if nVal < self.nChar: if int.from_bytes(self.byDic[jAddr:jAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask: aTails.add(sTail + self.dCharVal[nVal]) if n and not aTails: aTails.update(self._getTails(jAddr, sTail+self.dCharVal[nVal], n-1)) return aTails def drawPath (self, sWord, iAddr=0): "show the path taken by <sWord> in the graph" c1 = sWord[0:1] if sWord else " " iPos = -1 n = 0 print(c1 + ": ", end="") for c2, jAddr in self._getCharArcs(iAddr): print(c2, end="") if c2 == sWord[0:1]: iNextNodeAddr = jAddr iPos = n n += 1 if not sWord: return if iPos >= 0: print("\n "+ " " * iPos + "|") self.drawPath(sWord[1:], iNextNodeAddr) def select (self, sPattern=""): |
︙ | ︙ |