Overview
Comment: [graphspell] move functions from char_player to str_transform and lexicographer
SHA3-256: 19fccd89d6a445f0d8ca8ff3f3659ddd
User & Date: olr on 2020-08-05 09:30:48
Context
2020-08-05
09:42  [graphspell] improve suggestion mechanism (check-in: e90761a163, user: olr, tags: trunk, graphspell)
09:30  [graphspell] move functions from char_player to str_transform and lexicographer (this check-in: 19fccd89d6, user: olr, tags: trunk, graphspell)
06:56  [graphspell] move function from char_player to str_transform (check-in: 2381f7c9ae, user: olr, tags: trunk, graphspell)
Changes
Modified graphspell-js/char_player.js from [ba78268468] to [0602ec129b].
︙

(old lines 45-51 → new lines 45-58)

        }
        if (this.oDistanceBetweenChars.hasOwnProperty(c1) && this.oDistanceBetweenChars[c1].hasOwnProperty(c2)) {
            return this.oDistanceBetweenChars[c1][c2];
        }
        return 1;
    },

    [a block of lines deleted from the old version is not shown in this rendering]

    aVowel: new Set("aáàâäāeéèêëēiíìîïīoóòôöōuúùûüūyýỳŷÿȳœæAÁÀÂÄĀEÉÈÊËĒIÍÌÎÏĪOÓÒÔÖŌUÚÙÛÜŪYÝỲŶŸȲŒÆ"),
    aConsonant: new Set("bcçdfghjklmnñpqrstvwxzBCÇDFGHJKLMNÑPQRSTVWXZ"),
    aDouble: new Set("bcdfjklmnprstzBCDFJKLMNPRSTZ"),  // letters that may be used twice successively

    // Similar chars
︙

(old lines 402-408 → new lines 363-386; changed and deleted old lines are not shown in this rendering)

        ["EN", ["ENT", "ANT"]],
        ["ei", ["ait", "ais"]],
        ["EI", ["AIT", "AIS"]],
        ["on", ["ons", "ont"]],
        ["ON", ["ONS", "ONT"]],
        ["oi", ["ois", "oit", "oix"]],
        ["OI", ["OIS", "OIT", "OIX"]],
    ])
};

if (typeof(exports) !== 'undefined') {
    exports.aVowel = char_player.aVowel;
    exports.aConsonant = char_player.aConsonant;
    exports.aDouble = char_player.aDouble;
    exports.d1to1 = char_player.d1to1;
    exports.d1toX = char_player.d1toX;
    exports.get1toXReplacement = char_player.get1toXReplacement;
    exports.d2toX = char_player.d2toX;
    exports.dFinal1 = char_player.dFinal1;
    exports.dFinal2 = char_player.dFinal2;
    exports.aPfx1 = char_player.aPfx1;
    exports.aPfx2 = char_player.aPfx2;
}
Modified graphspell-js/ibdawg.js from [39524e288d] to [a6b92daf69].
︙

(old lines 23-29 → new lines 23-52; changed old lines are not shown in this rendering)

class SuggResult {
    // Structure for storing, classifying and filtering suggestions

    constructor (sWord, nDistLimit=-1) {
        this.sWord = sWord;
        this.sSimplifiedWord = str_transform.simplifyWord(sWord);
        this.nDistLimit = (nDistLimit >= 0) ? nDistLimit : Math.floor(sWord.length / 3) + 1;
        this.nMinDist = 1000;
        this.aSugg = new Set();
        this.dSugg = new Map([ [0, []], [1, []], [2, []] ]);
        this.aAllSugg = new Set();  // all found words even those refused
    }

    addSugg (sSugg, nDeep=0) {
        // add a suggestion
        if (this.aAllSugg.has(sSugg)) {
            return;
        }
        this.aAllSugg.add(sSugg);
        if (!this.aSugg.has(sSugg)) {
            let nDist = str_transform.distanceDamerauLevenshtein(this.sSimplifiedWord, str_transform.simplifyWord(sSugg));
            if (nDist <= this.nDistLimit) {
                if (sSugg.includes(" ")) {  // add 1 to distance for split suggestions
                    nDist += 1;
                }
                if (!this.dSugg.has(nDist)) {
                    this.dSugg.set(nDist, []);
                }
︙

(old lines 75-81 → new lines 75-88; one deleted old line is not shown in this rendering)

                break;
            }
            lRes.push(...lSugg);
            if (lRes.length > nSuggLimit) {
                break;
            }
        }
        if (this.sWord.gl_isUpperCase()) {
            lRes = lRes.map((sSugg) => { return sSugg.toUpperCase(); });
            lRes = [...new Set(lRes)];
        }
        else if (this.sWord.slice(0,1).gl_isUpperCase()) {
            lRes = lRes.map((sSugg) => { return sSugg.slice(0,1).toUpperCase() + sSugg.slice(1); });
            lRes = [...new Set(lRes)];
︙

(old lines 193-206 → new lines 192-213)

                break;
            default:
                throw ValueError("# Error: unknown code: " + this.nCompressionMethod);
        }
        //console.log(this.getInfo());
        this.bAcronymValid = true;
        this.bNumAtLastValid = false;

        // lexicographer module ?
        this.lexicographer = null;
        // JS still sucks: we’ll try importation when importation will be available in Workers. Still waiting...
        if (self && self.hasOwnProperty("lexgraph_"+this.sLangCode)) {  // self is the Worker
            this.lexicographer = self["lexgraph_"+this.sLangCode];
        }
    }

    getInfo () {
        return  `  Language: ${this.sLangName}   Lang code: ${this.sLangCode}   Dictionary name: ${this.sDicName}\n` +
                `  Compression method: ${this.nCompressionMethod}   Date: ${this.sDate}   Stemming: ${this.cStemming}FX\n` +
                `  Arcs values: ${this.nArcVal} = ${this.nChar} characters, ${this.nAff} affixes, ${this.nTag} tags\n` +
                `  Dictionary: ${this.nEntry} entries, ${this.nNode} nodes, ${this.nArc} arcs\n` +
︙

(old lines 324-330 → new lines 331-373; changed old lines are not shown in this rendering)

    suggest (sWord, nSuggLimit=10, bSplitTrailingNumbers=false) {
        // returns an array of suggestions for <sWord>
        //console.time("Suggestions for " + sWord);
        sWord = str_transform.spellingNormalization(sWord);
        let sPfx = "";
        let sSfx = "";
        if (this.lexicographer) {
            [sPfx, sWord, sSfx] = this.lexicographer.split(sWord);
        }
        let nMaxSwitch = Math.max(Math.floor(sWord.length / 3), 1);
        let nMaxDel = Math.floor(sWord.length / 5);
        let nMaxHardRepl = Math.max(Math.floor((sWord.length - 5) / 4), 1);
        let nMaxJump = Math.max(Math.floor(sWord.length / 4), 1);
        let oSuggResult = new SuggResult(sWord);
        if (bSplitTrailingNumbers) {
            this._splitTrailingNumbers(oSuggResult, sWord);
        }
        this._splitSuggest(oSuggResult, sWord);
        this._suggest(oSuggResult, sWord, nMaxSwitch, nMaxDel, nMaxHardRepl, nMaxJump);
        let aSugg = oSuggResult.getSuggestions(nSuggLimit);
        if (this.lexicographer) {
            aSugg = this.lexicographer.filterSugg(aSugg);
        }
        if (sSfx || sPfx) {
            // we add what we removed
            return aSugg.map( (sSugg) => { return sPfx + sSugg + sSfx; } );
        }
        //console.timeEnd("Suggestions for " + sWord);
        return aSugg;
    }

    _splitTrailingNumbers (oSuggResult, sWord) {
        let m = /^([a-zA-Zà-öÀ-Ö_ø-ÿØ-ßĀ-ʯfi-st][a-zA-Zà-öÀ-Ö_ø-ÿØ-ßĀ-ʯfi-st-]+)([0-9]+)$/.exec(sWord);
        if (m && !m[1].endsWith("-") && !m[1].endsWith("_")) {
            oSuggResult.addSugg(m[1] + " " + str_transform.numbersToExponent(m[2]));
        }
    }

    _splitSuggest (oSuggResult, sWord) {
        // split at apostrophes
        for (let cSplitter of "'’") {
            if (sWord.includes(cSplitter)) {
︙
Modified graphspell-js/lexgraph_fr.js from [7474bcdd06] to [964be7c67e].
︙

(old lines 132-145 → new lines 132-156)

//// Lexicographer

var lexgraph_fr = {

    dSugg: _dSugg,

    // Préfixes et suffixes
    aPfx1: new Set([
        "anti", "archi", "contre", "hyper", "mé", "méta", "im", "in", "ir", "par", "proto",
        "pseudo", "pré", "re", "ré", "sans", "sous", "supra", "sur", "ultra"
    ]),
    aPfx2: new Set([
        "belgo", "franco", "génito", "gynéco", "médico", "russo"
    ]),

    // Étiquettes
    dTag: new Map([
        [':N', [" nom,", "Nom"]],
        [':A', [" adjectif,", "Adjectif"]],
        [':M1', [" prénom,", "Prénom"]],
        [':M2', [" patronyme,", "Patronyme, matronyme, nom de famille…"]],
        [':MP', [" nom propre,", "Nom propre"]],
        [':W', [" adverbe,", "Adverbe"]],
︙

(old lines 369-382 → new lines 380-413)

    load: function (oSpellChecker, oTokenizer, oLocGraph) {
        this.oSpellChecker = oSpellChecker;
        this.oTokenizer = oTokenizer;
        this.oLocGraph = JSON.parse(oLocGraph);
    },

    split: function (sWord) {
        // returns an array of strings (prefix, trimmed_word, suffix)
        let sPrefix = "";
        let sSuffix = "";
        // préfixe élidé
        let m = /^([ldmtsnjcç]|lorsqu|presqu|jusqu|puisqu|quoiqu|quelqu|qu)[’'‘`ʼ]([a-zA-Zà-öÀ-Ö0-9_ø-ÿØ-ßĀ-ʯfi-st-]+)/i.exec(sWord);
        if (m) {
            sPrefix = m[1] + "’";
            sWord = m[2];
        }
        // mots composés
        m = /^([a-zA-Zà-öÀ-Ö0-9_ø-ÿØ-ßĀ-ʯfi-st-]+)(-(?:(?:les?|la)-(?:moi|toi|lui|[nv]ous|leur)|t-(?:il|elle|on)|y|en|[mts]’(?:y|en)|les?|l[aà]|[mt]oi|leur|lui|je|tu|ils?|elles?|on|[nv]ous|ce))$/i.exec(sWord);
        if (m) {
            sWord = m[1];
            sSuffix = m[2];
        }
        // split word in 3 parts: prefix, root, suffix
        return [sPrefix, sWord, sSuffix];
    },

    getInfoForToken: function (oToken) {
        // Token: .sType, .sValue, .nStart, .nEnd
        // returns an object {sType, sValue, aLabel}
        let m = null;
        try {
            switch (oToken.sType) {
︙

(old lines 713-727 → new lines 744-763)

                }
                else {
                    aElem.push(oToken);
                }
                iToken++;
            }
        } while (iToken < lToken.length);
        return aElem;
    },

    // Other functions

    filterSugg: function (aSugg) {
        return aSugg.filter((sSugg) => { return !sSugg.endsWith("è") && !sSugg.endsWith("È"); });
    }

}

if (typeof(exports) !== 'undefined') {
    exports.lexgraph_fr = lexgraph_fr;
}
Modified graphspell-js/str_transform.js from [1ca4ee03ac] to [b78c1f2098].
︙

(old lines 32-45 → new lines 32-85)

    spellingNormalization: function (sWord) {
        let sNewWord = "";
        for (let c of sWord) {
            sNewWord += this._xTransCharsForSpelling.gl_get(c, c);
        }
        return sNewWord.normalize("NFC");
    },

    _xTransCharsForSimplification: new Map([
        ['à', 'a'], ['é', 'é'], ['î', 'i'], ['ô', 'o'], ['û', 'u'], ['ÿ', 'y'],
        ['â', 'a'], ['è', 'é'], ['ï', 'i'], ['ö', 'o'], ['ù', 'u'], ['ŷ', 'y'],
        ['ä', 'a'], ['ê', 'é'], ['í', 'i'], ['ó', 'o'], ['ü', 'u'], ['ý', 'y'],
        ['á', 'a'], ['ë', 'é'], ['ì', 'i'], ['ò', 'o'], ['ú', 'u'], ['ỳ', 'y'],
        ['ā', 'a'], ['ē', 'é'], ['ī', 'i'], ['ō', 'o'], ['ū', 'u'], ['ȳ', 'y'],
        ['ç', 'c'], ['ñ', 'n'], ['œ', 'oe'], ['æ', 'ae'], ['ſ', 's'],
        ['ffi', 'ffi'], ['ffl', 'ffl'], ['ff', 'ff'], ['ſt', 'ft'], ['fi', 'fi'], ['fl', 'fl'], ['st', 'st'],
        ["⁰", "0"], ["¹", "1"], ["²", "2"], ["³", "3"], ["⁴", "4"], ["⁵", "5"], ["⁶", "6"], ["⁷", "7"], ["⁸", "8"], ["⁹", "9"],
        ["₀", "0"], ["₁", "1"], ["₂", "2"], ["₃", "3"], ["₄", "4"], ["₅", "5"], ["₆", "6"], ["₇", "7"], ["₈", "8"], ["₉", "9"]
    ]),

    simplifyWord: function (sWord) {
        // word simplification before calculating distance between words
        sWord = sWord.toLowerCase();
        sWord = [...sWord].map(c => this._xTransCharsForSimplification.gl_get(c, c)).join('');
        let sNewWord = "";
        let i = 1;
        for (let c of sWord) {
            if (c == 'e' || c != sWord.slice(i, i+1)) {  // exception for <e> to avoid confusion between crée / créai
                sNewWord += c;
            }
            i++;
        }
        return sNewWord.replace(/eau/g, "o").replace(/au/g, "o").replace(/ai/g, "é").replace(/ei/g, "é").replace(/ph/g, "f");
    },

    _xTransNumbersToExponent: new Map([
        ["0", "⁰"], ["1", "¹"], ["2", "²"], ["3", "³"], ["4", "⁴"], ["5", "⁵"], ["6", "⁶"], ["7", "⁷"], ["8", "⁸"], ["9", "⁹"]
    ]),

    numbersToExponent: function (sWord) {
        let sNewWord = "";
        for (let c of sWord) {
            sNewWord += this._xTransNumbersToExponent.gl_get(c, c);
        }
        return sNewWord;
    },

    longestCommonSubstring: function (string1, string2) {
        // https://en.wikibooks.org/wiki/Algorithm_Implementation/Strings/Longest_common_substring
        // untested
        // init max value
        let longestCommonSubstring = 0;
︙

(old lines 192-207 → new lines 232-250)

        sWord = sPfxCode.slice(1) + sWord.slice(sPfxCode.charCodeAt(0)-48);
        return sSfxCode[0] == '0' ? sWord + sSfxCode.slice(1) : sWord.slice(0, -(sSfxCode.charCodeAt(0)-48)) + sSfxCode.slice(1);
    }

};

if (typeof(exports) !== 'undefined') {
    exports.simplifyWord = str_transform.simplifyWord;
    exports.numbersToExponent = str_transform.numbersToExponent;
    exports.spellingNormalization = str_transform.spellingNormalization;
    exports.longestCommonSubstring = str_transform.longestCommonSubstring;
    exports.distanceDamerauLevenshtein = str_transform.distanceDamerauLevenshtein;
    exports.distanceDamerauLevenshtein2 = str_transform.distanceDamerauLevenshtein2;
    exports.showDistance = str_transform.showDistance;
    exports.changeWordWithSuffixCode = str_transform.changeWordWithSuffixCode;
    exports.changeWordWithAffixCode = str_transform.changeWordWithAffixCode;
    exports.defineAffixCode = str_transform.defineAffixCode;
    exports.defineSuffixCode = str_transform.defineSuffixCode;
}
Modified graphspell/char_player.py from [5484ce7bef] to [75ad6388f2].
︙

(old lines 40-46 → new lines 40-53)

def distanceBetweenChars (c1, c2):
    if c1 == c2:
        return 0
    if c1 not in dDistanceBetweenChars:
        return 1
    return dDistanceBetweenChars[c1].get(c2, 1)

[a block of lines deleted from the old version is not shown in this rendering]

aVowel = set("aáàâäāeéèêëēiíìîïīoóòôöōuúùûüūyýỳŷÿȳœæAÁÀÂÄĀEÉÈÊËĒIÍÌÎÏĪOÓÒÔÖŌUÚÙÛÜŪYÝỲŶŸȲŒÆ")
aConsonant = set("bcçdfghjklmnñpqrstvwxzBCÇDFGHJKLMNÑPQRSTVWXZ")
aDouble = set("bcdfjklmnprstzBCDFJKLMNPRSTZ")  # letters that may be used twice successively


# Similar chars
︙

(old lines 394-400 → new lines 362-368; a block deleted at the end of the old file is not shown in this rendering)

    "ei": ("ait", "ais"),
    "EI": ("AIT", "AIS"),
    "on": ("ons", "ont"),
    "ON": ("ONS", "ONT"),
    "oi": ("ois", "oit", "oix"),
    "OI": ("OIS", "OIT", "OIX"),
}
Modified graphspell/ibdawg.py from [78af5c9dd3] to [4f7f19d7d1].
︙

(old lines 9-22 → new lines 9-23)

import traceback
import pkgutil
import re
from functools import wraps
import time
import json
import binascii
import importlib
from collections import OrderedDict

#import logging
#logging.basicConfig(filename="suggestions.log", level=logging.DEBUG)

from . import str_transform as st
from . import char_player as cp
︙

(old lines 37-43 → new lines 38-68; changed old lines are not shown in this rendering)

class SuggResult:
    """Structure for storing, classifying and filtering suggestions"""

    def __init__ (self, sWord, nDistLimit=-1):
        self.sWord = sWord
        self.sSimplifiedWord = st.simplifyWord(sWord)
        self.nDistLimit = nDistLimit  if nDistLimit >= 0  else (len(sWord) // 3) + 1
        self.nMinDist = 1000
        self.aSugg = set()
        self.dSugg = { 0: [], 1: [], 2: [] }
        self.aAllSugg = set()  # all found words even those refused

    def addSugg (self, sSugg, nDeep=0):
        "add a suggestion"
        #logging.info((nDeep * "  ") + "__" + sSugg + "__")
        if sSugg in self.aAllSugg:
            return
        self.aAllSugg.add(sSugg)
        if sSugg not in self.aSugg:
            #nDist = min(st.distanceDamerauLevenshtein(self.sWord, sSugg), st.distanceDamerauLevenshtein(self.sSimplifiedWord, st.simplifyWord(sSugg)))
            nDist = st.distanceDamerauLevenshtein(self.sSimplifiedWord, st.simplifyWord(sSugg))
            #logging.info((nDeep * "  ") + "__" + sSugg + "__ :" + self.sSimplifiedWord +"|"+ st.simplifyWord(sSugg) +" -> "+ str(nDist))
            if nDist <= self.nDistLimit:
                if " " in sSugg:
                    nDist += 1
                if nDist not in self.dSugg:
                    self.dSugg[nDist] = []
                self.dSugg[nDist].append(sSugg)
                self.aSugg.add(sSugg)
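The distance budget set in the constructor, (len(sWord) // 3) + 1, and the distance-keyed buckets are the heart of the classifier. A self-contained sketch of the same idea, with osa_distance as a simplified stand-in for st.distanceDamerauLevenshtein and a caller-supplied simplify function standing in for st.simplifyWord:

    def osa_distance (s1, s2):
        "restricted Damerau-Levenshtein distance (optimal string alignment)"
        d = [[0] * (len(s2) + 1) for _ in range(len(s1) + 1)]
        for i in range(len(s1) + 1):
            d[i][0] = i
        for j in range(len(s2) + 1):
            d[0][j] = j
        for i in range(1, len(s1) + 1):
            for j in range(1, len(s2) + 1):
                nCost = 0 if s1[i-1] == s2[j-1] else 1
                d[i][j] = min(d[i-1][j] + 1, d[i][j-1] + 1, d[i-1][j-1] + nCost)
                if i > 1 and j > 1 and s1[i-1] == s2[j-2] and s1[i-2] == s2[j-1]:
                    d[i][j] = min(d[i][j], d[i-2][j-2] + nCost)  # transposition
        return d[len(s1)][len(s2)]

    def bucket_suggestions (sWord, lCandidates, simplify=str.lower):
        "classify candidates by edit distance on simplified forms, as SuggResult.addSugg does"
        dSugg = { 0: [], 1: [], 2: [] }
        nLimit = (len(sWord) // 3) + 1
        for sSugg in lCandidates:
            nDist = osa_distance(simplify(sWord), simplify(sSugg))
            if " " in sSugg:
                nDist += 1  # split suggestions are penalized by one point
            if nDist <= nLimit:
                dSugg.setdefault(nDist, []).append(sSugg)
        return dSugg

For instance, bucket_suggestions("chein", ["chien", "chez", "che in"]) returns {0: [], 1: ['chien'], 2: ['chez', 'che in']}: the transposition costs 1, while the split suggestion pays the extra point.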
︙

(old lines 78-84 → new lines 79-92; one deleted old line is not shown in this rendering)

        self.dSugg[1].sort(key=lambda sSugg: st.distanceDamerauLevenshtein(self.sWord, sSugg))
        lRes = self.dSugg.pop(0)
        for nDist, lSugg in self.dSugg.items():
            if nDist <= self.nDistLimit:
                lRes.extend(lSugg)
                if len(lRes) > nSuggLimit:
                    break
        if self.sWord.isupper():
            lRes = list(OrderedDict.fromkeys(map(lambda sSugg: sSugg.upper(), lRes)))  # use dict, when Python 3.6+
        elif self.sWord[0:1].isupper():  # don’t use <.istitle>
            lRes = list(OrderedDict.fromkeys(map(lambda sSugg: sSugg[0:1].upper()+sSugg[1:], lRes)))  # use dict, when Python 3.6+
        return lRes[:nSuggLimit]
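The new tail of getSuggestions() restores the case of the original word while deduplicating; in isolation (assuming lowercase suggestions coming out of the buckets):

    from collections import OrderedDict

    sWord = "CHEIN"
    lRes = ["chien", "chien", "chenil"]
    if sWord.isupper():
        lRes = list(OrderedDict.fromkeys(map(lambda sSugg: sSugg.upper(), lRes)))
    elif sWord[0:1].isupper():
        lRes = list(OrderedDict.fromkeys(map(lambda sSugg: sSugg[0:1].upper()+sSugg[1:], lRes)))
    print(lRes)  # ['CHIEN', 'CHENIL']: deduplicated, case restored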
︙

(old lines 149-162 → new lines 149-170)

            self._getArcs = self._getArcs3
            self._writeNodes = self._writeNodes3
        else:
            raise ValueError("  # Error: unknown code: {}".format(self.nCompressionMethod))
        self.bAcronymValid = False
        self.bNumAtLastValid = False

        # lexicographer module ?
        self.lexicographer = None
        try:
            self.lexicographer = importlib.import_module("graphspell.lexgraph_"+self.sLangCode)
        except ImportError:
            print("# No module <graphspell.lexgraph_"+self.sLangCode+".py>")

    def _initBinary (self):
        "initialize with binary structure file"
        if self.by[0:17] != b"/grammalecte-fsa/":
            raise TypeError("# Error. Not a grammalecte-fsa binary dictionary. Header: {}".format(self.by[0:9]))
        if not(self.by[17:18] == b"1" or self.by[17:18] == b"2" or self.by[17:18] == b"3"):
            raise ValueError("# Error. Unknown dictionary version: {}".format(self.by[17:18]))
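This is the standard optional-import pattern: try to load a per-language module and degrade gracefully when none exists. Isolated for clarity (loadLexicographer is a hypothetical helper name, not part of the module):

    import importlib

    def loadLexicographer (sLangCode):
        "return the graphspell.lexgraph_<sLangCode> module, or None if it does not exist"
        try:
            return importlib.import_module("graphspell.lexgraph_" + sLangCode)
        except ImportError:
            return None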
︙

(old lines 301-307 → new lines 309-347; changed old lines are not shown in this rendering)

        return l

    #@timethis
    def suggest (self, sWord, nSuggLimit=10, bSplitTrailingNumbers=False):
        "returns a set of suggestions for <sWord>"
        sWord = sWord.rstrip(".")   # useful for LibreOffice
        sWord = st.spellingNormalization(sWord)
        sPfx = ""
        sSfx = ""
        if self.lexicographer:
            sPfx, sWord, sSfx = self.lexicographer.split(sWord)
        nMaxSwitch = max(len(sWord) // 3, 1)
        nMaxDel = len(sWord) // 5
        nMaxHardRepl = max((len(sWord) - 5) // 4, 1)
        nMaxJump = max(len(sWord) // 4, 1)
        oSuggResult = SuggResult(sWord)
        if bSplitTrailingNumbers:
            self._splitTrailingNumbers(oSuggResult, sWord)
        self._splitSuggest(oSuggResult, sWord)
        self._suggest(oSuggResult, sWord, nMaxSwitch, nMaxDel, nMaxHardRepl, nMaxJump)
        aSugg = oSuggResult.getSuggestions(nSuggLimit)
        if self.lexicographer:
            aSugg = self.lexicographer.filterSugg(aSugg)
        if sSfx or sPfx:
            # we add what we removed
            return list(map(lambda sSug: sPfx + sSug + sSfx, aSugg))
        return aSugg

    def _splitTrailingNumbers (self, oSuggResult, sWord):
        m = re.match(r"(\D+)([0-9]+)$", sWord)
        if m and m.group(1)[-1:].isalpha():
            oSuggResult.addSugg(m.group(1) + " " + st.numbersToExponent(m.group(2)))

    def _splitSuggest (self, oSuggResult, sWord):
        # split at apostrophes
        for cSplitter in "'’":
            if cSplitter in sWord:
                sWord1, sWord2 = sWord.split(cSplitter, 1)
                if self.isValid(sWord1) and self.isValid(sWord2):
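A toy trace of the new affix handling in suggest(), assuming graphspell.lexgraph_fr is importable: the elided article is split off, only the core word is searched in the DAWG, and the affixes are glued back onto every suggestion.

    from graphspell import lexgraph_fr

    sPfx, sWord, sSfx = lexgraph_fr.split("l’arbrre")   # ('l’', 'arbrre', '')
    aSugg = ["arbre"]                                   # pretend the DAWG walk on "arbrre" returned this
    aSugg = lexgraph_fr.filterSugg(aSugg)
    print(list(map(lambda sSug: sPfx + sSug + sSfx, aSugg)))   # ['l’arbre']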
︙
Modified graphspell/lexgraph_fr.py from [a30a19637d] to [67e2780114].
(old lines 1-8 → new lines 1-20; changed old lines are not shown in this rendering)

"""
Lexicographer for the French language
"""

# Note:
# This module must contain at least:
#     <dSugg> : a dictionary for default suggestions.
#     <bLexicographer> : a boolean False
#     if the boolean is True, 4 functions are required:
#         split(sWord) -> returns a list of strings (that will be analyzed)
#         analyze(sWord) -> returns a string with the meaning of the word
#         formatTags(sTags) -> returns a string with the meaning of the tags
#         filterSugg(aWord) -> returns a filtered list of suggestions

import re

#### Suggestions

dSugg = {
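For a language without lexicographer support, this contract reduces to a stub (lexgraph_xx is a hypothetical module name):

    """
    Lexicographer stub for language <xx>
    """

    dSugg = {}                # no default suggestions
    bLexicographer = False    # split, analyze, formatTags and filterSugg not provided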
︙

(old lines 132-145 → new lines 133-157)

    "XXVIème": "XXVIᵉ",
    "XXVIIème": "XXVIIᵉ",
    "XXVIIIème": "XXVIIIᵉ",
    "XXIXème": "XXIXᵉ",
    "XXXème": "XXXᵉ"
}

# Préfixes et suffixes
aPfx1 = frozenset([
    "anti", "archi", "contre", "hyper", "mé", "méta", "im", "in", "ir", "par", "proto",
    "pseudo", "pré", "re", "ré", "sans", "sous", "supra", "sur", "ultra"
])
aPfx2 = frozenset([
    "belgo", "franco", "génito", "gynéco", "médico", "russo"
])


#### Lexicographer

bLexicographer = True

_dTAGS = {
    ':N': (" nom,", "Nom"),
︙

(old lines 312-318 → new lines 324-349; changed and deleted old lines are not shown in this rendering)

    '-en': " pronom adverbial",
    "-m’en": " (me) pronom personnel objet + (en) pronom adverbial",
    "-t’en": " (te) pronom personnel objet + (en) pronom adverbial",
    "-s’en": " (se) pronom personnel objet + (en) pronom adverbial",
}

_zElidedPrefix = re.compile("(?i)^([ldmtsnjcç]|lorsqu|presqu|jusqu|puisqu|quoiqu|quelqu|qu)[’'‘`ʼ]([\\w-]+)")
_zCompoundWord = re.compile("(?i)(\\w+)(-(?:(?:les?|la)-(?:moi|toi|lui|[nv]ous|leur)|t-(?:il|elle|on)|y|en|[mts]’(?:y|en)|les?|l[aà]|[mt]oi|leur|lui|je|tu|ils?|elles?|on|[nv]ous|ce))$")
_zTag = re.compile("[:;/][\\w*][^:;/]*")

def split (sWord):
    "split word in 3 parts: prefix, root, suffix"
    sPrefix = ""
    sSuffix = ""
    # préfixe élidé
    m = _zElidedPrefix.match(sWord)
    if m:
        sPrefix = m.group(1) + "’"
        sWord = m.group(2)
    # mots composés
    m = _zCompoundWord.match(sWord)
    if m:
        sWord = m.group(1)
        sSuffix = m.group(2)
    return sPrefix, sWord, sSuffix
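Traced against the two regexes above, split() behaves like this (doctest-style):

    >>> split("l’arbre")       # elided prefix
    ('l’', 'arbre', '')
    >>> split("jusqu’ici")
    ('jusqu’', 'ici', '')
    >>> split("donne-moi")     # compound word
    ('', 'donne', '-moi')
    >>> split("parle")         # nothing to strip
    ('', 'parle', '')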
︙

(old lines 352-358 → new lines 363-376)

    sTags = re.sub("(?<=V[1-3])[itpqnmr_eaxz]+", "", sTags)
    sTags = re.sub("(?<=V0[ea])[itpqnmr_eaxz]+", "", sTags)
    for m in _zTag.finditer(sTags):
        sRes += _dTAGS.get(m.group(0), " [{}]".format(m.group(0)))[0]
    if sRes.startswith(" verbe") and not sRes.endswith("infinitif"):
        sRes += " [{}]".format(sTags[1:sTags.find("/")])
    return sRes.rstrip(",")


# Other functions

def filterSugg (aSugg):
    "exclude suggestions"
    return filter(lambda sSugg: not sSugg.endswith(("è", "È")), aSugg)
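Note that filterSugg() returns a lazy filter object, which suggest() either passes through list(map(...)) or returns as-is; materialized, it drops any suggestion ending in "è" or "È":

    >>> list(filterSugg(["café", "cafè", "CAFÈ", "chèvre"]))
    ['café', 'chèvre']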
Modified graphspell/str_transform.py from [e25e3e9b20] to [65dd9b4e2a].
︙

(old lines 23-36 → new lines 23-68)

    'ſ': 's',  'ffi': 'ffi',  'ffl': 'ffl',  'ff': 'ff',  'ſt': 'ft',  'fi': 'fi',  'fl': 'fl',  'st': 'st'
})

def spellingNormalization (sWord):
    "normalization NFC and removing ligatures"
    return unicodedata.normalize("NFC", sWord.translate(_xTransCharsForSpelling))


_xTransCharsForSimplification = str.maketrans({
    'à': 'a',  'é': 'é',  'î': 'i',  'ô': 'o',  'û': 'u',  'ÿ': 'y',
    'â': 'a',  'è': 'é',  'ï': 'i',  'ö': 'o',  'ù': 'u',  'ŷ': 'y',
    'ä': 'a',  'ê': 'é',  'í': 'i',  'ó': 'o',  'ü': 'u',  'ý': 'y',
    'á': 'a',  'ë': 'é',  'ì': 'i',  'ò': 'o',  'ú': 'u',  'ỳ': 'y',
    'ā': 'a',  'ē': 'é',  'ī': 'i',  'ō': 'o',  'ū': 'u',  'ȳ': 'y',
    'ç': 'c',  'ñ': 'n',  'œ': 'oe',  'æ': 'ae',  'ſ': 's',
    'ffi': 'ffi',  'ffl': 'ffl',  'ff': 'ff',  'ſt': 'ft',  'fi': 'fi',  'fl': 'fl',  'st': 'st',
    "⁰": "0",  "¹": "1",  "²": "2",  "³": "3",  "⁴": "4",  "⁵": "5",  "⁶": "6",  "⁷": "7",  "⁸": "8",  "⁹": "9",
    "₀": "0",  "₁": "1",  "₂": "2",  "₃": "3",  "₄": "4",  "₅": "5",  "₆": "6",  "₇": "7",  "₈": "8",  "₉": "9"
})

def simplifyWord (sWord):
    "word simplification before calculating distance between words"
    sWord = sWord.lower().translate(_xTransCharsForSimplification)
    sNewWord = ""
    for i, c in enumerate(sWord, 1):
        if c == 'e' or c != sWord[i:i+1]:  # exception for <e> to avoid confusion between crée / créai
            sNewWord += c
    return sNewWord.replace("eau", "o").replace("au", "o").replace("ai", "é").replace("ei", "é").replace("ph", "f")


_xTransNumbersToExponent = str.maketrans({
    "0": "⁰", "1": "¹", "2": "²", "3": "³", "4": "⁴", "5": "⁵", "6": "⁶", "7": "⁷", "8": "⁸", "9": "⁹"
})

def numbersToExponent (sWord):
    "convert numeral chars to exponent chars"
    return sWord.translate(_xTransNumbersToExponent)


#### DISTANCE CALCULATIONS

def longestCommonSubstring (s1, s2):
    "longest common substring"
    # http://en.wikipedia.org/wiki/Longest_common_substring_problem
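Traced through the code above, the two relocated helpers give, for example:

    >>> simplifyWord("Château")
    'chato'
    >>> simplifyWord("Photographie")
    'fotografie'
    >>> numbersToExponent("mp3")
    'mp³'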
︙