Overview
Comment: | [graphspell] move functions from char_player to str_transform and lexicographer |
---|---|
Downloads: | Tarball | ZIP archive | SQL archive |
Timelines: | family | ancestors | descendants | both | trunk | graphspell |
Files: | files | file ages | folders |
SHA3-256: |
19fccd89d6a445f0d8ca8ff3f3659ddd |
User & Date: | olr on 2020-08-05 09:30:48 |
Other Links: | manifest | tags |
Context
2020-08-05
| ||
09:42 | [graphspell] improve suggestion mechanism check-in: e90761a163 user: olr tags: trunk, graphspell | |
09:30 | [graphspell] move functions from char_player to str_transform and lexicographer check-in: 19fccd89d6 user: olr tags: trunk, graphspell | |
06:56 | [graphspell] move function from char_player to str_transform check-in: 2381f7c9ae user: olr tags: trunk, graphspell | |
Changes
Modified graphspell-js/char_player.js from [ba78268468] to [0602ec129b].
︙ | |||
45 46 47 48 49 50 51 | 45 46 47 48 49 50 51 52 53 54 55 56 57 58 | - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - | } if (this.oDistanceBetweenChars.hasOwnProperty(c1) && this.oDistanceBetweenChars[c1].hasOwnProperty(c2)) { return this.oDistanceBetweenChars[c1][c2]; } return 1; }, |
︙ | |||
402 403 404 405 406 407 408 | 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 | - + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - | ["EN", ["ENT", "ANT"]], ["ei", ["ait", "ais"]], ["EI", ["AIT", "AIS"]], ["on", ["ons", "ont"]], ["ON", ["ONS", "ONT"]], ["oi", ["ois", "oit", "oix"]], ["OI", ["OIS", "OIT", "OIX"]], |
Modified graphspell-js/ibdawg.js from [39524e288d] to [a6b92daf69].
︙ | |||
23 24 25 26 27 28 29 | 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 | - + - + | class SuggResult { // Structure for storing, classifying and filtering suggestions constructor (sWord, nDistLimit=-1) { this.sWord = sWord; |
︙ | |||
75 76 77 78 79 80 81 | 75 76 77 78 79 80 81 82 83 84 85 86 87 88 | - | break; } lRes.push(...lSugg); if (lRes.length > nSuggLimit) { break; } } |
︙ | |||
193 194 195 196 197 198 199 200 201 202 203 204 205 206 | 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 | + + + + + + + + | break; default: throw ValueError("# Error: unknown code: " + this.nCompressionMethod); } //console.log(this.getInfo()); this.bAcronymValid = true; this.bNumAtLastValid = false; // lexicographer module ? this.lexicographer = null; // JS still sucks: we’ll try importation when importation will be available in Workers. Still waiting... if (self && self.hasOwnProperty("lexgraph_"+this.sLangCode)) { // self is the Worker this.lexicographer = self["lexgraph_"+this.sLangCode]; } } getInfo () { return ` Language: ${this.sLangName} Lang code: ${this.sLangCode} Dictionary name: ${this.sDicName}\n` + ` Compression method: ${this.nCompressionMethod} Date: ${this.sDate} Stemming: ${this.cStemming}FX\n` + ` Arcs values: ${this.nArcVal} = ${this.nChar} characters, ${this.nAff} affixes, ${this.nTag} tags\n` + ` Dictionary: ${this.nEntry} entries, ${this.nNode} nodes, ${this.nArc} arcs\n` + |
︙ | |||
324 325 326 327 328 329 330 | 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 | + - + + + + + - + | suggest (sWord, nSuggLimit=10, bSplitTrailingNumbers=false) { // returns an array of suggestions for <sWord> //console.time("Suggestions for " + sWord); sWord = str_transform.spellingNormalization(sWord); let sPfx = ""; let sSfx = ""; if (this.lexicographer) { |
︙ |
Modified graphspell-js/lexgraph_fr.js from [7474bcdd06] to [964be7c67e].
︙ | |||
132 133 134 135 136 137 138 139 140 141 142 143 144 145 | 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 | + + + + + + + + + + + | //// Lexicographer var lexgraph_fr = { dSugg: _dSugg, // Préfixes et suffixes aPfx1: new Set([ "anti", "archi", "contre", "hyper", "mé", "méta", "im", "in", "ir", "par", "proto", "pseudo", "pré", "re", "ré", "sans", "sous", "supra", "sur", "ultra" ]), aPfx2: new Set([ "belgo", "franco", "génito", "gynéco", "médico", "russo" ]), // Étiquettes dTag: new Map([ [':N', [" nom,", "Nom"]], [':A', [" adjectif,", "Adjectif"]], [':M1', [" prénom,", "Prénom"]], [':M2', [" patronyme,", "Patronyme, matronyme, nom de famille…"]], [':MP', [" nom propre,", "Nom propre"]], [':W', [" adverbe,", "Adverbe"]], |
︙ | |||
369 370 371 372 373 374 375 376 377 378 379 380 381 382 | 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 | + + + + + + + + + + + + + + + + + + + + | load: function (oSpellChecker, oTokenizer, oLocGraph) { this.oSpellChecker = oSpellChecker; this.oTokenizer = oTokenizer; this.oLocGraph = JSON.parse(oLocGraph); }, split: function (sWord) { // returns an array of strings (prefix, trimmed_word, suffix) let sPrefix = ""; let sSuffix = ""; // préfixe élidé let m = /^([ldmtsnjcç]|lorsqu|presqu|jusqu|puisqu|quoiqu|quelqu|qu)[’'‘`ʼ]([a-zA-Zà-öÀ-Ö0-9_ø-ÿØ-ßĀ-ʯfi-st-]+)/i.exec(sWord); if (m) { sPrefix = m[1] + "’"; sWord = m[2]; } // mots composés m = /^([a-zA-Zà-öÀ-Ö0-9_ø-ÿØ-ßĀ-ʯfi-st-]+)(-(?:(?:les?|la)-(?:moi|toi|lui|[nv]ous|leur)|t-(?:il|elle|on)|y|en|[mts]’(?:y|en)|les?|l[aà]|[mt]oi|leur|lui|je|tu|ils?|elles?|on|[nv]ous|ce))$/i.exec(sWord); if (m) { sWord = m[1]; sSuffix = m[2]; } // split word in 3 parts: prefix, root, suffix return [sPrefix, sWord, sSuffix]; }, getInfoForToken: function (oToken) { // Token: .sType, .sValue, .nStart, .nEnd // return a object {sType, sValue, aLabel} let m = null; try { switch (oToken.sType) { |
︙ | |||
713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 | 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 | + + + + + | } else { aElem.push(oToken); } iToken++; } } while (iToken < lToken.length); return aElem; }, // Other functions filterSugg: function (aSugg) { return aSugg.filter((sSugg) => { return !sSugg.endsWith("è") && !sSugg.endsWith("È"); }); } } if (typeof(exports) !== 'undefined') { exports.lexgraph_fr = lexgraph_fr; } |
Modified graphspell-js/str_transform.js from [1ca4ee03ac] to [b78c1f2098].
︙ | |||
32 33 34 35 36 37 38 39 40 41 42 43 44 45 | 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 | + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + | spellingNormalization: function (sWord) { let sNewWord = ""; for (let c of sWord) { sNewWord += this._xTransCharsForSpelling.gl_get(c, c); } return sNewWord.normalize("NFC"); }, _xTransCharsForSimplification: new Map([ ['à', 'a'], ['é', 'é'], ['î', 'i'], ['ô', 'o'], ['û', 'u'], ['ÿ', 'y'], ['â', 'a'], ['è', 'é'], ['ï', 'i'], ['ö', 'o'], ['ù', 'u'], ['ŷ', 'y'], ['ä', 'a'], ['ê', 'é'], ['í', 'i'], ['ó', 'o'], ['ü', 'u'], ['ý', 'y'], ['á', 'a'], ['ë', 'é'], ['ì', 'i'], ['ò', 'o'], ['ú', 'u'], ['ỳ', 'y'], ['ā', 'a'], ['ē', 'é'], ['ī', 'i'], ['ō', 'o'], ['ū', 'u'], ['ȳ', 'y'], ['ç', 'c'], ['ñ', 'n'], ['œ', 'oe'], ['æ', 'ae'], ['ſ', 's'], ['ffi', 'ffi'], ['ffl', 'ffl'], ['ff', 'ff'], ['ſt', 'ft'], ['fi', 'fi'], ['fl', 'fl'], ['st', 'st'], ["⁰", "0"], ["¹", "1"], ["²", "2"], ["³", "3"], ["⁴", "4"], ["⁵", "5"], ["⁶", "6"], ["⁷", "7"], ["⁸", "8"], ["⁹", "9"], ["₀", "0"], ["₁", "1"], ["₂", "2"], ["₃", "3"], ["₄", "4"], ["₅", "5"], ["₆", "6"], ["₇", "7"], ["₈", "8"], ["₉", "9"] ]), simplifyWord: function (sWord) { // word simplication before calculating distance between words sWord = sWord.toLowerCase(); sWord = [...sWord].map(c => this._xTransCharsForSimplification.gl_get(c, c)).join(''); let sNewWord = ""; let i = 1; for (let c of sWord) { if (c == 'e' || c != sWord.slice(i, i+1)) { // exception for <e> to avoid confusion between crée / créai sNewWord += c; } i++; } return sNewWord.replace(/eau/g, "o").replace(/au/g, "o").replace(/ai/g, "é").replace(/ei/g, "é").replace(/ph/g, "f"); }, _xTransNumbersToExponent: new Map([ ["0", "⁰"], ["1", "¹"], ["2", "²"], ["3", "³"], ["4", "⁴"], ["5", "⁵"], ["6", "⁶"], ["7", "⁷"], ["8", "⁸"], ["9", "⁹"] ]), numbersToExponent: function (sWord) { let 
sNewWord = ""; for (let c of sWord) { sNewWord += this._xTransNumbersToExponent.gl_get(c, c); } return sNewWord; }, longestCommonSubstring: function (string1, string2) { // https://en.wikibooks.org/wiki/Algorithm_Implementation/Strings/Longest_common_substring // untested // init max value let longestCommonSubstring = 0; |
︙ | |||
192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 | 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 | + + + | sWord = sPfxCode.slice(1) + sWord.slice(sPfxCode.charCodeAt(0)-48); return sSfxCode[0] == '0' ? sWord + sSfxCode.slice(1) : sWord.slice(0, -(sSfxCode.charCodeAt(0)-48)) + sSfxCode.slice(1); } }; if (typeof(exports) !== 'undefined') { exports.simplifyWord = str_transform.simplifyWord; exports.numbersToExponent = str_transform.numbersToExponent; exports.spellingNormalization = str_transform.spellingNormalization; exports.longestCommonSubstring = str_transform.longestCommonSubstring; exports.distanceDamerauLevenshtein = str_transform.distanceDamerauLevenshtein; exports.distanceDamerauLevenshtein2 = str_transform.distanceDamerauLevenshtein2; exports.showDistance = str_transform.showDistance; exports.changeWordWithSuffixCode = str_transform.changeWordWithSuffixCode; exports.changeWordWithAffixCode = str_transform.changeWordWithAffixCode; exports.defineAffixCode = str_transform.defineAffixCode; exports.defineSuffixCode = str_transform.defineSuffixCode; } |
Modified graphspell/char_player.py from [5484ce7bef] to [75ad6388f2].
︙ | |||
40 41 42 43 44 45 46 | 40 41 42 43 44 45 46 47 48 49 50 51 52 53 | - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - | def distanceBetweenChars (c1, c2): if c1 == c2: return 0 if c1 not in dDistanceBetweenChars: return 1 return dDistanceBetweenChars[c1].get(c2, 1) |
︙ | |||
394 395 396 397 398 399 400 | 362 363 364 365 366 367 368 | - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - | "ei": ("ait", "ais"), "EI": ("AIT", "AIS"), "on": ("ons", "ont"), "ON": ("ONS", "ONT"), "oi": ("ois", "oit", "oix"), "OI": ("OIS", "OIT", "OIX"), } |
Modified graphspell/ibdawg.py from [78af5c9dd3] to [4f7f19d7d1].
︙ | |||
9 10 11 12 13 14 15 16 17 18 19 20 21 22 | 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 | + | import traceback import pkgutil import re from functools import wraps import time import json import binascii import importlib from collections import OrderedDict #import logging #logging.basicConfig(filename="suggestions.log", level=logging.DEBUG) from . import str_transform as st from . import char_player as cp |
︙ | |||
37 38 39 40 41 42 43 | 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 | - + - - - + + + | class SuggResult: """Structure for storing, classifying and filtering suggestions""" def __init__ (self, sWord, nDistLimit=-1): self.sWord = sWord |
︙ | |||
78 79 80 81 82 83 84 | 79 80 81 82 83 84 85 86 87 88 89 90 91 92 | - | self.dSugg[1].sort(key=lambda sSugg: st.distanceDamerauLevenshtein(self.sWord, sSugg)) lRes = self.dSugg.pop(0) for nDist, lSugg in self.dSugg.items(): if nDist <= self.nDistLimit: lRes.extend(lSugg) if len(lRes) > nSuggLimit: break |
︙ | |||
149 150 151 152 153 154 155 156 157 158 159 160 161 162 | 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 | + + + + + + + + | self._getArcs = self._getArcs3 self._writeNodes = self._writeNodes3 else: raise ValueError(" # Error: unknown code: {}".format(self.nCompressionMethod)) self.bAcronymValid = False self.bNumAtLastValid = False # lexicographer module ? self.lexicographer = None try: self.lexicographer = importlib.import_module("graphspell.lexgraph_"+self.sLangCode) except ImportError: print("# No module <graphspell.lexgraph_"+self.sLangCode+".py>") def _initBinary (self): "initialize with binary structure file" if self.by[0:17] != b"/grammalecte-fsa/": raise TypeError("# Error. Not a grammalecte-fsa binary dictionary. Header: {}".format(self.by[0:9])) if not(self.by[17:18] == b"1" or self.by[17:18] == b"2" or self.by[17:18] == b"3"): raise ValueError("# Error. Unknown dictionary version: {}".format(self.by[17:18])) |
︙ | |||
301 302 303 304 305 306 307 | 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 | + + + - + + + - + | return l #@timethis def suggest (self, sWord, nSuggLimit=10, bSplitTrailingNumbers=False): "returns a set of suggestions for <sWord>" sWord = sWord.rstrip(".") # useful for LibreOffice sWord = st.spellingNormalization(sWord) sPfx = "" sSfx = "" if self.lexicographer: |
︙ |
Modified graphspell/lexgraph_fr.py from [a30a19637d] to [67e2780114].
1 2 3 4 5 6 7 8 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 | - + + | """ Lexicographer for the French language """ # Note: # This module must contain at least: # <dSugg> : a dictionary for default suggestions. # <bLexicographer> : a boolean False |
︙ | |||
132 133 134 135 136 137 138 139 140 141 142 143 144 145 | 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 | + + + + + + + + + + + | "XXVIème": "XXVIᵉ", "XXVIIème": "XXVIIᵉ", "XXVIIIème": "XXVIIIᵉ", "XXIXème": "XXIXᵉ", "XXXème": "XXXᵉ" } # Préfixes et suffixes aPfx1 = frozenset([ "anti", "archi", "contre", "hyper", "mé", "méta", "im", "in", "ir", "par", "proto", "pseudo", "pré", "re", "ré", "sans", "sous", "supra", "sur", "ultra" ]) aPfx2 = frozenset([ "belgo", "franco", "génito", "gynéco", "médico", "russo" ]) #### Lexicographer bLexicographer = True _dTAGS = { ':N': (" nom,", "Nom"), |
︙ | |||
312 313 314 315 316 317 318 | 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 | - - + + - - + | '-en': " pronom adverbial", "-m’en": " (me) pronom personnel objet + (en) pronom adverbial", "-t’en": " (te) pronom personnel objet + (en) pronom adverbial", "-s’en": " (se) pronom personnel objet + (en) pronom adverbial", } |
︙ | |||
352 353 354 355 356 357 358 | 363 364 365 366 367 368 369 370 371 372 373 374 375 376 | + + + + + + + | sTags = re.sub("(?<=V[1-3])[itpqnmr_eaxz]+", "", sTags) sTags = re.sub("(?<=V0[ea])[itpqnmr_eaxz]+", "", sTags) for m in _zTag.finditer(sTags): sRes += _dTAGS.get(m.group(0), " [{}]".format(m.group(0)))[0] if sRes.startswith(" verbe") and not sRes.endswith("infinitif"): sRes += " [{}]".format(sTags[1:sTags.find("/")]) return sRes.rstrip(",") # Other functions def filterSugg (aSugg): "exclude suggestions" return filter(lambda sSugg: not sSugg.endswith(("è", "È")), aSugg) |
Modified graphspell/str_transform.py from [e25e3e9b20] to [65dd9b4e2a].
︙ | |||
23 24 25 26 27 28 29 30 31 32 33 34 35 36 | 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 | + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + | 'ſ': 's', 'ffi': 'ffi', 'ffl': 'ffl', 'ff': 'ff', 'ſt': 'ft', 'fi': 'fi', 'fl': 'fl', 'st': 'st' }) def spellingNormalization (sWord): "nomalization NFC and removing ligatures" return unicodedata.normalize("NFC", sWord.translate(_xTransCharsForSpelling)) _xTransCharsForSimplification = str.maketrans({ 'à': 'a', 'é': 'é', 'î': 'i', 'ô': 'o', 'û': 'u', 'ÿ': 'y', 'â': 'a', 'è': 'é', 'ï': 'i', 'ö': 'o', 'ù': 'u', 'ŷ': 'y', 'ä': 'a', 'ê': 'é', 'í': 'i', 'ó': 'o', 'ü': 'u', 'ý': 'y', 'á': 'a', 'ë': 'é', 'ì': 'i', 'ò': 'o', 'ú': 'u', 'ỳ': 'y', 'ā': 'a', 'ē': 'é', 'ī': 'i', 'ō': 'o', 'ū': 'u', 'ȳ': 'y', 'ç': 'c', 'ñ': 'n', 'œ': 'oe', 'æ': 'ae', 'ſ': 's', 'ffi': 'ffi', 'ffl': 'ffl', 'ff': 'ff', 'ſt': 'ft', 'fi': 'fi', 'fl': 'fl', 'st': 'st', "⁰": "0", "¹": "1", "²": "2", "³": "3", "⁴": "4", "⁵": "5", "⁶": "6", "⁷": "7", "⁸": "8", "⁹": "9", "₀": "0", "₁": "1", "₂": "2", "₃": "3", "₄": "4", "₅": "5", "₆": "6", "₇": "7", "₈": "8", "₉": "9" }) def simplifyWord (sWord): "word simplication before calculating distance between words" sWord = sWord.lower().translate(_xTransCharsForSimplification) sNewWord = "" for i, c in enumerate(sWord, 1): if c == 'e' or c != sWord[i:i+1]: # exception for <e> to avoid confusion between crée / créai sNewWord += c return sNewWord.replace("eau", "o").replace("au", "o").replace("ai", "é").replace("ei", "é").replace("ph", "f") _xTransNumbersToExponent = str.maketrans({ "0": "⁰", "1": "¹", "2": "²", "3": "³", "4": "⁴", "5": "⁵", "6": "⁶", "7": "⁷", "8": "⁸", "9": "⁹" }) def numbersToExponent (sWord): "convert numeral chars to exponant chars" return sWord.translate(_xTransNumbersToExponent) #### DISTANCE CALCULATIONS def longestCommonSubstring (s1, s2): "longest common substring" # 
http://en.wikipedia.org/wiki/Longest_common_substring_problem |
︙ |