Grammalecte  Check-in [58fed39787]

Overview
Comment:[graphspell] suggestion mechanism optimization: parse graph arcs according to similar chars
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | trunk | graphspell
Files: files | file ages | folders
SHA3-256: 58fed3978799489194f877e74844d386131b2b32d1a44557b542b15a7de640e8
User & Date: olr on 2025-09-23 11:48:05
Other Links: manifest | tags
Context
2025-09-23
11:48
[graphspell] suggestion mechanism optimization: parse graph arcs according to similar chars Leaf check-in: 58fed39787 user: olr tags: trunk, graphspell
11:14
[cli] clarity for spelling suggestion check-in: 4858dc598e user: olr tags: trunk, cli
Changes

Modified gc_lang/fr/modules/tests_modules.py from [10ff0f96cf] to [69c65c5b78].

63
64
65
66
67
68
69
70

71
72
73
74
75
76
77
78
79
80
81
82

83
84
85
86
87
88
89
63
64
65
66
67
68
69

70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90







-
+












+







            ("Emilie", "Émilie"),
            ("exibission", "exhibition"),
            ("ditirembique", "dithyrambique"),
            ("jai", "j’ai"),
            ("email", "courriel"),
            ("fatiqué", "fatigué"),
            ("coeur", "cœur"),
            ("trèèèèèèèèès", "très"),
            ("triiiiicheuuuur", "tricheur"),
            ("vraaaaiiiimeeeeennnt", "vraiment"),
            ("oeil", "œil"),
            ("Oeil", "Œil"),
            ("OEIL", "ŒIL"),
            ("apele", "appel"),
            ("Co2", "CO₂"),
            ("emmppâiiiller", "empailler"),
            ("testt", "test"),
            ("apelaion", "appellation"),
            ("exsepttion", "exception"),
            ("ebriete", "ébriété"),
            ("ennormmement", "énormément"),
            ("maîtnesse", "maîtresse"),
            ("sintaxik", "syntaxique")
        ]:
            #with timeblock(sWord):
            for lSugg in self.oSpellChecker.suggest(sWrong):
                #print(sWord, "->", " ".join(lSugg))
                self.assertIn(sSugg, lSugg)

Modified graphspell-js/ibdawg.js from [d23b81aee3] to [f51bd17e6f].

363
364
365
366
367
368
369
370

371
372
373
374
375
376
377
363
364
365
366
367
368
369

370
371
372
373
374
375
376
377







-
+







                }
            }
        }
        if (nDist > oSuggResult.nDistLimit) {
            return;
        }
        let cCurrent = sRemain.slice(0, 1);
        for (let [cChar, jAddr] of this._getCharArcs(iAddr)) {
        for (let [cChar, jAddr] of this._getCharArcs(iAddr, cCurrent)) {
            if (char_player.d1to1.gl_get(cCurrent, cCurrent).indexOf(cChar) != -1) {
                this._suggest(oSuggResult, sRemain.slice(1), nMaxSwitch, nMaxDel, nMaxHardRepl, nMaxJump, nDist, nDeep+1, jAddr, sNewWord+cChar);
            }
            else if (!bAvoidLoop) {
                if (nMaxHardRepl  &&  this.isNgramsOK(cChar+sRemain.slice(1,2))) {
                    this._suggest(oSuggResult, sRemain.slice(1), nMaxSwitch, nMaxDel, nMaxHardRepl-1, nMaxJump, nDist+1, nDeep+1, jAddr, sNewWord+cChar, true);
                }
425
426
427
428
429
430
431
432

433

434
435

436
437









438
439
440
441
442
443
444
425
426
427
428
429
430
431

432
433
434
435
436
437


438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453







-
+

+


+
-
-
+
+
+
+
+
+
+
+
+







        }
        if (!this.a2grams) {
            return true;
        }
        return this.a2grams.has(sChars);
    }

    * _getCharArcs (iAddr) {
    * _getCharArcs (iAddr, cChar="") {
        // generator: yield all chars and addresses from node at address <iAddr>
        let lStack = [];
        for (let [nVal, jAddr] of this._getArcs(iAddr)) {
            if (nVal <= this.nChar) {
                if (char_player.d1to1.gl_get(cChar, cChar).indexOf(this.dCharVal.get(nVal)) != -1) {
                yield [this.dCharVal.get(nVal), jAddr];
            }
                    yield [this.dCharVal.get(nVal), jAddr];
                }
                else {
                    lStack.push([this.dCharVal.get(nVal), jAddr]);
                }
            }
        }
        while (lStack.length > 0) {
            yield lStack.shift();
        }
    }

    * _getSimilarCharArcs (cChar, iAddr) {
        // generator: yield similar char of <cChar> and address of the following node
        for (let c of char_player.d1to1.gl_get(cChar, [cChar])) {
            if (this.dChar.has(c)) {

Modified graphspell/ibdawg.py from [b0bfbd049d] to [df9af9aea6].

249
250
251
252
253
254
255
256

257
258
259
260
261
262
263
264
265
266
267
268
269
270
271

272
273
274
275
276
277
278
249
250
251
252
253
254
255

256
257
258
259
260
261
262
263
264
265
266
267
268
269
270

271
272
273
274
275
276
277
278







-
+














-
+







            if cSplitter in sWord:
                sWord1, sWord2 = sWord.split(cSplitter, 1)
                if self.isValid(sWord1) and self.isValid(sWord2):
                    oSuggResult.addSugg(sWord1+" "+sWord2)

    def _suggest (self, oSuggResult, sRemain, nMaxSwitch=0, nMaxDel=0, nMaxHardRepl=0, nMaxJump=0, nDist=0, nDeep=0, iAddr=0, sNewWord="", bAvoidLoop=False):
        # recursive function
        #logging.info((nDeep * "  ") + sNewWord + ":" + sRemain)
        #logging.info((nDeep * "  ") + f"{sNewWord}:{sRemain} nMaxSwitch:{nMaxSwitch} nMaxDel:{nMaxDel} nMaxHardRepl:{nMaxHardRepl} nMaxJump:{nMaxJump} | nDist:{nDist} / {oSuggResult.nDistLimit}")
        if self.lByDic[iAddr] & self._finalNodeMask:
            if not sRemain:
                oSuggResult.addSugg(sNewWord, nDeep)
                for sTail in self._getTails(iAddr):
                    oSuggResult.addSugg(sNewWord+sTail, nDeep)
                return
            if (len(sNewWord) + len(sRemain) == len(oSuggResult.sWord)) and oSuggResult.sWord.lower().startswith(sNewWord.lower()) and self.isValid(sRemain):
                if self.sLangCode == "fr" and sNewWord.lower() in ("l", "d", "n", "m", "t", "s", "c", "j", "qu", "lorsqu", "puisqu", "quoiqu", "jusqu", "quelqu") and sRemain[0:1] in cp.aVowel:
                    oSuggResult.addSugg(sNewWord+"’"+sRemain, nDeep)
                if (len(sNewWord) > 1 and len(sRemain) > 1) or sNewWord in "aày" or sRemain in "aày":
                    oSuggResult.addSugg(sNewWord+" "+sRemain, nDeep)
        if nDist > oSuggResult.nDistLimit:
            return
        cCurrent = sRemain[0:1]
        for cChar, jAddr in self._getCharArcs(iAddr):
        for cChar, jAddr in self._getCharArcs(iAddr, cCurrent):
            if cChar in cp.d1to1.get(cCurrent, cCurrent):
                self._suggest(oSuggResult, sRemain[1:], nMaxSwitch, nMaxDel, nMaxHardRepl, nMaxJump, nDist, nDeep+1, jAddr, sNewWord+cChar)
            elif not bAvoidLoop:
                if nMaxHardRepl and self.isNgramsOK(cChar+sRemain[1:2]):
                    self._suggest(oSuggResult, sRemain[1:], nMaxSwitch, nMaxDel, nMaxHardRepl-1, nMaxJump, nDist+1, nDeep+1, jAddr, sNewWord+cChar, True)
                if nMaxJump:
                    self._suggest(oSuggResult, sRemain, nMaxSwitch, nMaxDel, nMaxHardRepl, nMaxJump-1, nDist+1, nDeep+1, jAddr, sNewWord+cChar, True) # True for avoiding loop?
306
307
308
309
310
311
312
313

314

315
316

317





318
319
320
321
322
323
324
306
307
308
309
310
311
312

313
314
315
316
317
318

319
320
321
322
323
324
325
326
327
328
329
330







-
+

+


+
-
+
+
+
+
+







        "returns True if sChars in known 2grams"
        if len(sChars) != 2:
            return True
        if not self.a2grams:
            return True
        return sChars in self.a2grams

    def _getCharArcs (self, iAddr, cChar=""):
        """generator: yield (char, address) for every character arc of the node at address <iAddr>

        Arcs whose char is found in cp.d1to1.get(cChar, cChar) are yielded first, as they
        are met; the other character arcs are yielded afterwards, in the order they were met.
        Arcs whose value is greater than self.nChar are ignored.
        """
        # The lookup does not depend on the arc, so it is done once before the loop.
        # NOTE(review): presumably d1to1 maps a char to its look-alike chars -- confirm in char_player.
        sSimilarChars = cp.d1to1.get(cChar, cChar)
        lDeferred = []
        for nVal, jAddr in self._getArcs(iAddr):
            if nVal <= self.nChar:
                cArc = self.dCharVal[nVal]
                if cArc in sSimilarChars:
                    yield (cArc, jAddr)
                else:
                    lDeferred.append((cArc, jAddr))
        # same front-to-back order as popping index 0 repeatedly, without the O(n) shift per item
        yield from lDeferred

    def _getTails (self, iAddr, sTail="", n=2):
        "return a list of suffixes ending at a distance of <n> from <iAddr>"
        aTails = set()
        for nVal, jAddr in self._getArcs(iAddr):
            if nVal <= self.nChar:
                if self.lByDic[jAddr] & self._finalNodeMask: