Overview
| Comment: | [graphspell] suggestion mechanism optimization: parse graph arcs according to similar chars |
|---|---|
| Downloads: | Tarball | ZIP archive | SQL archive |
| Timelines: | family | ancestors | descendants | both | trunk | graphspell |
| Files: | files | file ages | folders |
| SHA3-256: |
58fed3978799489194f877e74844d386 |
| User & Date: | olr on 2025-09-23 11:48:05 |
| Other Links: | manifest | tags |
Context
|
2025-10-01
| ||
| 09:37 | [fr] faux positifs et ajustements check-in: 82748a4a2e user: olr tags: trunk, fr | |
|
2025-09-23
| ||
| 11:48 | [graphspell] suggestion mechanism optimization: parse graph arcs according to similar chars check-in: 58fed39787 user: olr tags: trunk, graphspell | |
| 11:14 | [cli] clarity for spelling suggestion check-in: 4858dc598e user: olr tags: trunk, cli | |
Changes
Modified gc_lang/fr/modules/tests_modules.py from [10ff0f96cf] to [69c65c5b78].
| ︙ | ︙ | |||
63 64 65 66 67 68 69 |
("Emilie", "Émilie"),
("exibission", "exhibition"),
("ditirembique", "dithyrambique"),
("jai", "j’ai"),
("email", "courriel"),
("fatiqué", "fatigué"),
("coeur", "cœur"),
| | > | 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 |
("Emilie", "Émilie"),
("exibission", "exhibition"),
("ditirembique", "dithyrambique"),
("jai", "j’ai"),
("email", "courriel"),
("fatiqué", "fatigué"),
("coeur", "cœur"),
("triiiiicheuuuur", "tricheur"),
("vraaaaiiiimeeeeennnt", "vraiment"),
("oeil", "œil"),
("Oeil", "Œil"),
("OEIL", "ŒIL"),
("apele", "appel"),
("Co2", "CO₂"),
("emmppâiiiller", "empailler"),
("testt", "test"),
("apelaion", "appellation"),
("exsepttion", "exception"),
("ebriete", "ébriété"),
("ennormmement", "énormément"),
("maîtnesse", "maîtresse"),
("sintaxik", "syntaxique")
]:
#with timeblock(sWord):
for lSugg in self.oSpellChecker.suggest(sWrong):
#print(sWord, "->", " ".join(lSugg))
self.assertIn(sSugg, lSugg)
|
| ︙ | ︙ |
Modified graphspell-js/ibdawg.js from [d23b81aee3] to [f51bd17e6f].
| ︙ | ︙ | |||
363 364 365 366 367 368 369 |
}
}
}
if (nDist > oSuggResult.nDistLimit) {
return;
}
let cCurrent = sRemain.slice(0, 1);
| | | 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 |
}
}
}
if (nDist > oSuggResult.nDistLimit) {
return;
}
let cCurrent = sRemain.slice(0, 1);
for (let [cChar, jAddr] of this._getCharArcs(iAddr, cCurrent)) {
if (char_player.d1to1.gl_get(cCurrent, cCurrent).indexOf(cChar) != -1) {
this._suggest(oSuggResult, sRemain.slice(1), nMaxSwitch, nMaxDel, nMaxHardRepl, nMaxJump, nDist, nDeep+1, jAddr, sNewWord+cChar);
}
else if (!bAvoidLoop) {
if (nMaxHardRepl && this.isNgramsOK(cChar+sRemain.slice(1,2))) {
this._suggest(oSuggResult, sRemain.slice(1), nMaxSwitch, nMaxDel, nMaxHardRepl-1, nMaxJump, nDist+1, nDeep+1, jAddr, sNewWord+cChar, true);
}
|
| ︙ | ︙ | |||
425 426 427 428 429 430 431 |
}
if (!this.a2grams) {
return true;
}
return this.a2grams.has(sChars);
}
| | > > | | > > > > > > > | 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 |
}
if (!this.a2grams) {
return true;
}
return this.a2grams.has(sChars);
}
* _getCharArcs (iAddr, cChar="") {
// generator: yield all chars and addresses from node at address <iAddr>
let lStack = [];
for (let [nVal, jAddr] of this._getArcs(iAddr)) {
if (nVal <= this.nChar) {
if (char_player.d1to1.gl_get(cChar, cChar).indexOf(this.dCharVal.get(nVal)) != -1) {
yield [this.dCharVal.get(nVal), jAddr];
}
else {
lStack.push([this.dCharVal.get(nVal), jAddr]);
}
}
}
while (lStack.length > 0) {
yield lStack.shift();
}
}
* _getSimilarCharArcs (cChar, iAddr) {
// generator: yield similar char of <cChar> and address of the following node
for (let c of char_player.d1to1.gl_get(cChar, [cChar])) {
if (this.dChar.has(c)) {
|
| ︙ | ︙ |
Modified graphspell/ibdawg.py from [b0bfbd049d] to [df9af9aea6].
| ︙ | ︙ | |||
249 250 251 252 253 254 255 |
if cSplitter in sWord:
sWord1, sWord2 = sWord.split(cSplitter, 1)
if self.isValid(sWord1) and self.isValid(sWord2):
oSuggResult.addSugg(sWord1+" "+sWord2)
def _suggest (self, oSuggResult, sRemain, nMaxSwitch=0, nMaxDel=0, nMaxHardRepl=0, nMaxJump=0, nDist=0, nDeep=0, iAddr=0, sNewWord="", bAvoidLoop=False):
# recursive function
| | | | 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 |
if cSplitter in sWord:
sWord1, sWord2 = sWord.split(cSplitter, 1)
if self.isValid(sWord1) and self.isValid(sWord2):
oSuggResult.addSugg(sWord1+" "+sWord2)
def _suggest (self, oSuggResult, sRemain, nMaxSwitch=0, nMaxDel=0, nMaxHardRepl=0, nMaxJump=0, nDist=0, nDeep=0, iAddr=0, sNewWord="", bAvoidLoop=False):
# recursive function
#logging.info((nDeep * " ") + f"{sNewWord}:{sRemain} nMaxSwitch:{nMaxSwitch} nMaxDel:{nMaxDel} nMaxHardRepl:{nMaxHardRepl} nMaxJump:{nMaxJump} | nDist:{nDist} / {oSuggResult.nDistLimit}")
if self.lByDic[iAddr] & self._finalNodeMask:
if not sRemain:
oSuggResult.addSugg(sNewWord, nDeep)
for sTail in self._getTails(iAddr):
oSuggResult.addSugg(sNewWord+sTail, nDeep)
return
if (len(sNewWord) + len(sRemain) == len(oSuggResult.sWord)) and oSuggResult.sWord.lower().startswith(sNewWord.lower()) and self.isValid(sRemain):
if self.sLangCode == "fr" and sNewWord.lower() in ("l", "d", "n", "m", "t", "s", "c", "j", "qu", "lorsqu", "puisqu", "quoiqu", "jusqu", "quelqu") and sRemain[0:1] in cp.aVowel:
oSuggResult.addSugg(sNewWord+"’"+sRemain, nDeep)
if (len(sNewWord) > 1 and len(sRemain) > 1) or sNewWord in "aày" or sRemain in "aày":
oSuggResult.addSugg(sNewWord+" "+sRemain, nDeep)
if nDist > oSuggResult.nDistLimit:
return
cCurrent = sRemain[0:1]
for cChar, jAddr in self._getCharArcs(iAddr, cCurrent):
if cChar in cp.d1to1.get(cCurrent, cCurrent):
self._suggest(oSuggResult, sRemain[1:], nMaxSwitch, nMaxDel, nMaxHardRepl, nMaxJump, nDist, nDeep+1, jAddr, sNewWord+cChar)
elif not bAvoidLoop:
if nMaxHardRepl and self.isNgramsOK(cChar+sRemain[1:2]):
self._suggest(oSuggResult, sRemain[1:], nMaxSwitch, nMaxDel, nMaxHardRepl-1, nMaxJump, nDist+1, nDeep+1, jAddr, sNewWord+cChar, True)
if nMaxJump:
self._suggest(oSuggResult, sRemain, nMaxSwitch, nMaxDel, nMaxHardRepl, nMaxJump-1, nDist+1, nDeep+1, jAddr, sNewWord+cChar, True) # True for avoiding loop?
|
| ︙ | ︙ | |||
306 307 308 309 310 311 312 |
"returns True if sChars in known 2grams"
if len(sChars) != 2:
return True
if not self.a2grams:
return True
return sChars in self.a2grams
| | > > | > > > > | 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 |
"returns True if sChars in known 2grams"
if len(sChars) != 2:
return True
if not self.a2grams:
return True
return sChars in self.a2grams
def _getCharArcs (self, iAddr, cChar=""):
    """generator: yield (char, address) for every char arc leaving the node at address <iAddr>

    Arcs whose char is found in the cp.d1to1 entry of <cChar> (or in <cChar>
    itself when there is no entry) are yielded first; all other char arcs
    follow, in the order returned by self._getArcs. Arcs whose value is
    greater than self.nChar are skipped.
    """
    # The lookup does not depend on the arc: do it once instead of once per arc.
    sSimilarChars = cp.d1to1.get(cChar, cChar)
    lOthers = []
    for nVal, jAddr in self._getArcs(iAddr):
        if nVal > self.nChar:
            continue
        c = self.dCharVal[nVal]
        if c in sSimilarChars:
            yield (c, jAddr)
        else:
            # deferred: yielded only after all similar chars
            lOthers.append((c, jAddr))
    # yield from walks the list in order without the O(n) cost of repeated pop(0)
    yield from lOthers
def _getTails (self, iAddr, sTail="", n=2):
"return a list of suffixes ending at a distance of <n> from <iAddr>"
aTails = set()
for nVal, jAddr in self._getArcs(iAddr):
if nVal <= self.nChar:
if self.lByDic[jAddr] & self._finalNodeMask:
|
| ︙ | ︙ |