Overview
Comment: | [graphspell] better suggestions filtering |
---|---|
Downloads: | Tarball | ZIP archive | SQL archive |
Timelines: | family | ancestors | descendants | both | trunk | graphspell |
Files: | files | file ages | folders |
SHA3-256: |
d0fa31b1837bf73ad08e6495e081fd38 |
User & Date: | olr on 2021-02-22 09:28:31 |
Other Links: | manifest | tags |
Context
2021-02-22
| ||
12:13 | [fr] ajustements check-in: 531a642f75 user: olr tags: fr, trunk | |
09:28 | [graphspell] better suggestions filtering check-in: d0fa31b183 user: olr tags: graphspell, trunk | |
08:05 | [fr] faux positifs et ajustements check-in: f48e7eee2c user: olr tags: fr, trunk | |
Changes
Modified graphspell-js/ibdawg.js from [20fbadf805] to [9375ebc47a].
︙ | ︙ | |||
336 337 338 339 340 341 342 | if (bSplitTrailingNumbers) { this._splitTrailingNumbers(oSuggResult, sWord); } this._splitSuggest(oSuggResult, sWord); this._suggest(oSuggResult, sWord); this._suggest(oSuggResult, sWord, nMaxSwitch, nMaxDel, nMaxHardRepl, nMaxJump); let aSugg = oSuggResult.getSuggestions(); | < < < | 336 337 338 339 340 341 342 343 344 345 346 347 348 349 | if (bSplitTrailingNumbers) { this._splitTrailingNumbers(oSuggResult, sWord); } this._splitSuggest(oSuggResult, sWord); this._suggest(oSuggResult, sWord); this._suggest(oSuggResult, sWord, nMaxSwitch, nMaxDel, nMaxHardRepl, nMaxJump); let aSugg = oSuggResult.getSuggestions(); if (sSfx || sPfx) { // we add what we removed return aSugg.map( (sSugg) => { return sPfx + sSugg + sSfx; } ); } //console.timeEnd("Suggestions for " + sWord); return aSugg; } |
︙ | ︙ |
Modified graphspell-js/lexgraph_fr.js from [305b9fc724] to [a345135237].
︙ | ︙ | |||
544 545 546 547 548 549 550 | console.error(e); } }, // Other functions | | > | > > > > > > > > > > > > > > > > > | 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 | console.error(e); } }, // Other functions isValidSugg: function (sSugg, oSpellChecker) { "return True if <sSugg> is valid" if (sSugg.endsWith("è") || sSugg.endsWith("È")) { return false; } if (sSugg.includes("’")) { if (sSugg.search(/^[dD]’/) == 0 && !oSpellChecker.morph(sSugg.slice(2), ":[YNAW]")) { return false; } if (sSugg.search(/^[nmtsNMTS]’/) == 0 && !oSpellChecker.morph(sSugg.slice(2), ":V")) { return false; } if (sSugg.search(/^[jJ]’/) == 0 && !oSpellChecker.morph(sSugg.slice(2), ":(?:Y|[123][sp])")) { return false; } if (sSugg.search(/^[cçCÇ]’/) == 0 && !oSpellChecker.morph(sSugg.slice(2), ":3[sp]")) { return false; } } return true; } } if (typeof(exports) !== 'undefined') { exports.lexgraph_fr = lexgraph_fr; |
︙ | ︙ |
Modified graphspell-js/spellchecker.js from [8801cae354] to [299826c38f].
︙ | ︙ | |||
287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 | if (this.bStorage) { this._dMorphologies.set(sWord, lMorph); this._dLemmas.set(sWord, Array.from(new Set(this.getMorph(sWord).map((sMorph) => { return sMorph.slice(1, sMorph.indexOf("/")); })))); //console.log(sWord, this._dLemmas.get(sWord)); } return lMorph; } getLemma (sWord) { // retrieves lemmas if (this.bStorage) { if (!this._dLemmas.has(sWord)) { this.getMorph(sWord); } return this._dLemmas.get(sWord); } return Array.from(new Set(this.getMorph(sWord).map((sMorph) => { return sMorph.slice(1, sMorph.indexOf("/")); }))); } * suggest (sWord, nSuggLimit=10) { // generator: returns 1, 2 or 3 lists of suggestions if (this.lexicographer) { if (this.lexicographer.dSugg.has(sWord)) { yield this.lexicographer.dSugg.get(sWord).split("|"); } else if (sWord.gl_isTitle() && this.lexicographer.dSugg.has(sWord.toLowerCase())) { | > > > > > > > > > > > > > > > > > > > > > > | | | > > | 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 | if (this.bStorage) { this._dMorphologies.set(sWord, lMorph); this._dLemmas.set(sWord, Array.from(new Set(this.getMorph(sWord).map((sMorph) => { return sMorph.slice(1, sMorph.indexOf("/")); })))); //console.log(sWord, this._dLemmas.get(sWord)); } return lMorph; } morph (sWord, sPattern, sNegPattern="") { // analyse a token, return True if <sNegPattern> not in morphologies and <sPattern> in morphologies let lMorph = this.getMorph(sWord); if (lMorph.length == 0) { return false; } // check negative condition if (sNegPattern) { if (sNegPattern == "*") { // all morph must match sPattern return lMorph.every(sMorph => (sMorph.search(sPattern) !== -1)); } else { if (lMorph.some(sMorph => (sMorph.search(sNegPattern) !== -1))) { return false; } } } // search sPattern 
return lMorph.some(sMorph => (sMorph.search(sPattern) !== -1)); } getLemma (sWord) { // retrieves lemmas if (this.bStorage) { if (!this._dLemmas.has(sWord)) { this.getMorph(sWord); } return this._dLemmas.get(sWord); } return Array.from(new Set(this.getMorph(sWord).map((sMorph) => { return sMorph.slice(1, sMorph.indexOf("/")); }))); } * suggest (sWord, nSuggLimit=10) { // generator: returns 1, 2 or 3 lists of suggestions if (this.lexicographer) { if (this.lexicographer.dSugg.has(sWord)) { yield this.lexicographer.dSugg.get(sWord).split("|"); } else if (sWord.gl_isTitle() && this.lexicographer.dSugg.has(sWord.toLowerCase())) { let lSuggs = this.lexicographer.dSugg.get(sWord.toLowerCase()).split("|"); yield lSuggs.map((sSugg) => { return sSugg.slice(0,1).toUpperCase() + sSugg.slice(1); }); } else { let lSuggs = this.oMainDic.suggest(sWord, nSuggLimit, true); lSuggs = lSuggs.filter((sSugg) => this.lexicographer.isValidSugg(sSugg, this)); yield lSuggs; } } else { yield this.oMainDic.suggest(sWord, nSuggLimit, true); } if (this.bCommunityDic) { yield this.oCommunityDic.suggest(sWord, Math.floor(nSuggLimit/2)+1); } |
︙ | ︙ |
Modified graphspell/ibdawg.py from [e27ae4ab79] to [71b7ddd53e].
︙ | ︙ | |||
240 241 242 243 244 245 246 | sWord = st.cleanWord(sWord) if bSplitTrailingNumbers: self._splitTrailingNumbers(oSuggResult, sWord) self._splitSuggest(oSuggResult, sWord) self._suggest(oSuggResult, sWord) self._suggest(oSuggResult, sWord, nMaxSwitch, nMaxDel, nMaxHardRepl, nMaxJump) aSugg = oSuggResult.getSuggestions() | < < | 240 241 242 243 244 245 246 247 248 249 250 251 252 253 | sWord = st.cleanWord(sWord) if bSplitTrailingNumbers: self._splitTrailingNumbers(oSuggResult, sWord) self._splitSuggest(oSuggResult, sWord) self._suggest(oSuggResult, sWord) self._suggest(oSuggResult, sWord, nMaxSwitch, nMaxDel, nMaxHardRepl, nMaxJump) aSugg = oSuggResult.getSuggestions() if sSfx or sPfx: # we add what we removed return list(map(lambda sSug: sPfx + sSug + sSfx, aSugg)) return aSugg def _splitTrailingNumbers (self, oSuggResult, sWord): m = re.match(r"(\D+)([0-9]+)$", sWord) |
︙ | ︙ |
Modified graphspell/lexgraph_fr.py from [fab156e863] to [49c48029bd].
1 2 3 4 5 6 7 8 9 10 11 12 13 | """ Lexicographer for the French language """ # Note: # This mode must contains at least: # <dSugg> : a dictionary for default suggestions. # <bLexicographer> : a boolean False # if the boolean is True, 4 functions are required: # split(sWord) -> returns a list of string (that will be analyzed) # analyze(sWord) -> returns a string with the meaning of word # readableMorph(sMorph) -> returns a string with the meaning of tags # setLabelsOnToken(dToken) -> adds readable information on token | | | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 | """ Lexicographer for the French language """ # Note: # This mode must contains at least: # <dSugg> : a dictionary for default suggestions. # <bLexicographer> : a boolean False # if the boolean is True, 4 functions are required: # split(sWord) -> returns a list of string (that will be analyzed) # analyze(sWord) -> returns a string with the meaning of word # readableMorph(sMorph) -> returns a string with the meaning of tags # setLabelsOnToken(dToken) -> adds readable information on token # isValidSugg(sSugg, oSpellChecker) -> returns True if <sSugg> is a valid suggestion import re #### Suggestions dSugg = { |
︙ | ︙ | |||
506 507 508 509 510 511 512 | dToken["aLabels"] = ["token de nature inconnue"] except: return # Other functions | | | | | > > > > > > > > > > | 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 | dToken["aLabels"] = ["token de nature inconnue"] except: return # Other functions def isValidSugg (sSugg, oSpellChecker): "return True if <sSugg> is valid" if sSugg.endswith(("è", "È")): return False if "’" in sSugg: if sSugg.startswith(("d’", "D’")) and not oSpellChecker.morph(sSugg[2:], ":[YNAW]"): return False if sSugg.startswith(("n’", "m’", "t’", "s’", "N’", "M’", "T’", "S’")) and not oSpellChecker.morph(sSugg[2:], ":V"): return False if sSugg.startswith(("j’", "J’")) and not oSpellChecker.morph(sSugg[2:], ":(?:Y|[123][sp])"): return False if sSugg.startswith(("c’", "C’")) and not oSpellChecker.morph(sSugg[2:], ":3[sp]"): return False return True |
Modified graphspell/spellchecker.py from [d4b9b2fec8] to [8c24055087].
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 | """ Spellchecker. Useful to check several dictionaries at once. To avoid iterating over a pile of dictionaries, it is assumed that 3 are enough: - the main dictionary, bundled with the package - the community dictionary, added by an organization - the personal dictionary, created by the user for its own convenience """ import importlib import traceback from . import ibdawg from . import tokenizer | > | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 | """ Spellchecker. Useful to check several dictionaries at once. To avoid iterating over a pile of dictionaries, it is assumed that 3 are enough: - the main dictionary, bundled with the package - the community dictionary, added by an organization - the personal dictionary, created by the user for its own convenience """ import re import importlib import traceback from . import ibdawg from . import tokenizer |
︙ | ︙ | |||
252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 | if self.bPersonalDic: lMorph.extend(self.oPersonalDic.getMorph(sWord)) if self.bStorage: self._dMorphologies[sWord] = lMorph self._dLemmas[sWord] = { s[1:s.find("/")] for s in lMorph } return lMorph def getLemma (self, sWord): "retrieves lemmas" if self.bStorage: if sWord not in self._dLemmas: self.getMorph(sWord) return self._dLemmas[sWord] return { s[1:s.find("/")] for s in self.getMorph(sWord) } def suggest (self, sWord, nSuggLimit=10): "generator: returns 1, 2 or 3 lists of suggestions" if self.lexicographer: if sWord in self.lexicographer.dSugg: yield self.lexicographer.dSugg[sWord].split("|") elif sWord.istitle() and sWord.lower() in self.lexicographer.dSugg: | > > > > > > > > > > > > > > > > > > | | | > > | 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 | if self.bPersonalDic: lMorph.extend(self.oPersonalDic.getMorph(sWord)) if self.bStorage: self._dMorphologies[sWord] = lMorph self._dLemmas[sWord] = { s[1:s.find("/")] for s in lMorph } return lMorph def morph (self, sWord, sPattern, sNegPattern=""): "analyse a word, return True if <sNegPattern> not in morphologies and <sPattern> in morphologies" lMorph = self.getMorph(sWord) if not lMorph: return False # check negative condition if sNegPattern: if sNegPattern == "*": # all morph must match sPattern zPattern = re.compile(sPattern) return all(zPattern.search(sMorph) for sMorph in lMorph) zNegPattern = re.compile(sNegPattern) if any(zNegPattern.search(sMorph) for sMorph in lMorph): return False # search sPattern zPattern = re.compile(sPattern) return any(zPattern.search(sMorph) for sMorph in lMorph) def getLemma (self, sWord): "retrieves lemmas" if self.bStorage: if sWord not in self._dLemmas: self.getMorph(sWord) return self._dLemmas[sWord] return { s[1:s.find("/")] 
for s in self.getMorph(sWord) } def suggest (self, sWord, nSuggLimit=10): "generator: returns 1, 2 or 3 lists of suggestions" if self.lexicographer: if sWord in self.lexicographer.dSugg: yield self.lexicographer.dSugg[sWord].split("|") elif sWord.istitle() and sWord.lower() in self.lexicographer.dSugg: lSuggs = self.lexicographer.dSugg[sWord.lower()].split("|") yield list(map(lambda sSugg: sSugg[0:1].upper()+sSugg[1:], lSuggs)) else: lSuggs = self.oMainDic.suggest(sWord, nSuggLimit, True) lSuggs = [ sSugg for sSugg in lSuggs if self.lexicographer.isValidSugg(sSugg, self) ] yield lSuggs else: yield self.oMainDic.suggest(sWord, nSuggLimit, True) if self.bCommunityDic: yield self.oCommunityDic.suggest(sWord, (nSuggLimit//2)+1) if self.bPersonalDic: yield self.oPersonalDic.suggest(sWord, (nSuggLimit//2)+1) |
︙ | ︙ |