Index: graphspell-js/char_player.js
==================================================================
--- graphspell-js/char_player.js
+++ graphspell-js/char_player.js
@@ -47,49 +47,10 @@
             return this.oDistanceBetweenChars[c1][c2];
         }
         return 1;
     },
 
-    _xTransCharsForSimplification: new Map([
-        ['à', 'a'], ['é', 'é'], ['î', 'i'], ['ô', 'o'], ['û', 'u'], ['ÿ', 'y'],
-        ['â', 'a'], ['è', 'é'], ['ï', 'i'], ['ö', 'o'], ['ù', 'u'], ['ŷ', 'y'],
-        ['ä', 'a'], ['ê', 'é'], ['í', 'i'], ['ó', 'o'], ['ü', 'u'], ['ý', 'y'],
-        ['á', 'a'], ['ë', 'é'], ['ì', 'i'], ['ò', 'o'], ['ú', 'u'], ['ỳ', 'y'],
-        ['ā', 'a'], ['ē', 'é'], ['ī', 'i'], ['ō', 'o'], ['ū', 'u'], ['ȳ', 'y'],
-        ['ç', 'c'], ['ñ', 'n'],
-        ['œ', 'oe'], ['æ', 'ae'],
-        ['ſ', 's'], ['ﬃ', 'ffi'], ['ﬄ', 'ffl'], ['ﬀ', 'ff'], ['ﬅ', 'ft'], ['ﬁ', 'fi'], ['ﬂ', 'fl'], ['ﬆ', 'st'],
-        ["⁰", "0"], ["¹", "1"], ["²", "2"], ["³", "3"], ["⁴", "4"], ["⁵", "5"], ["⁶", "6"], ["⁷", "7"], ["⁸", "8"], ["⁹", "9"],
-        ["₀", "0"], ["₁", "1"], ["₂", "2"], ["₃", "3"], ["₄", "4"], ["₅", "5"], ["₆", "6"], ["₇", "7"], ["₈", "8"], ["₉", "9"]
-    ]),
-
-    simplifyWord: function (sWord) {
-        // word simplication before calculating distance between words
-        sWord = sWord.toLowerCase();
-        sWord = [...sWord].map(c => this._xTransCharsForSimplification.gl_get(c, c)).join('');
-        let sNewWord = "";
-        let i = 1;
-        for (let c of sWord) {
-            if (c == 'e' || c != sWord.slice(i, i+1)) {     // exception for <e> to avoid confusion between crée / créai
-                sNewWord += c;
-            }
-            i++;
-        }
-        return sNewWord.replace(/eau/g, "o").replace(/au/g, "o").replace(/ai/g, "é").replace(/ei/g, "é").replace(/ph/g, "f");
-    },
-
-    _xTransNumbersToExponent: new Map([
-        ["0", "⁰"], ["1", "¹"], ["2", "²"], ["3", "³"], ["4", "⁴"], ["5", "⁵"], ["6", "⁶"], ["7", "⁷"], ["8", "⁸"], ["9", "⁹"]
-    ]),
-
-    numbersToExponent: function (sWord) {
-        let sNewWord = "";
-        for (let c of sWord) {
-            sNewWord += this._xTransNumbersToExponent.gl_get(c, c);
-        }
-        return sNewWord;
-    },
 
     aVowel: new Set("aáàâäāeéèêëēiíìîïīoóòôöōuúùûüūyýỳŷÿȳœæAÁÀÂÄĀEÉÈÊËĒIÍÌÎÏĪOÓÒÔÖŌUÚÙÛÜŪYÝỲŶŸȲŒÆ"),
     aConsonant: new Set("bcçdfghjklmnñpqrstvwxzBCÇDFGHJKLMNÑPQRSTVWXZ"),
     aDouble: new Set("bcdfjklmnprstzBCDFJKLMNPRSTZ"),   // letters that may be used twice successively
@@ -404,54 +365,15 @@
         ["EI", ["AIT", "AIS"]],
         ["on", ["ons", "ont"]], ["ON", ["ONS", "ONT"]],
         ["oi", ["ois", "oit", "oix"]], ["OI", ["OIS", "OIT", "OIX"]],
-    ]),
-
-
-    // Préfixes et suffixes
-    aPfx1: new Set([
-        "anti", "archi", "contre", "hyper", "mé", "méta", "im", "in", "ir", "par", "proto",
-        "pseudo", "pré", "re", "ré", "sans", "sous", "supra", "sur", "ultra"
-    ]),
-
-    aPfx2: new Set([
-        "belgo", "franco", "génito", "gynéco", "médico", "russo"
-    ]),
-
-
-    cut: function (sWord) {
-        // returns an arry of strings (prefix, trimed_word, suffix)
-        let sPrefix = "";
-        let sSuffix = "";
-        let m = /^([ldmtsnjcç]|lorsqu|presqu|jusqu|puisqu|quoiqu|quelqu|qu)[’'‘`ʼ]([a-zA-Zà-öÀ-Ö0-9_ø-ÿØ-ßĀ-ʯﬁ-ﬆ-]+)/i.exec(sWord);
-        if (m) {
-            sPrefix = m[1] + "’";
-            sWord = m[2];
-        }
-        m = /^([a-zA-Zà-öÀ-Ö0-9_ø-ÿØ-ßĀ-ʯﬁ-ﬆ]+)(-(?:t-|)(?:ils?|elles?|on|je|tu|nous|vous|ce)$)/i.exec(sWord);
-        if (m) {
-            sWord = m[1];
-            sSuffix = m[2];
-        }
-        return [sPrefix, sWord, sSuffix];
-    },
-
-    // Other functions
-    filterSugg: function (aSugg) {
-        return aSugg.filter((sSugg) => { return !sSugg.endsWith("è") && !sSugg.endsWith("È"); });
-    }
-
+    ])
 };
 
 
 if (typeof(exports) !== 'undefined') {
-    exports._xTransCharsForSpelling = char_player._xTransCharsForSpelling;
-    exports.spellingNormalization = char_player.spellingNormalization;
-    exports._xTransCharsForSimplification = char_player._xTransCharsForSimplification;
-    exports.simplifyWord = char_player.simplifyWord;
     exports.aVowel = char_player.aVowel;
     exports.aConsonant = char_player.aConsonant;
     exports.aDouble = char_player.aDouble;
     exports.d1to1 = char_player.d1to1;
     exports.d1toX = char_player.d1toX;
@@ -459,8 +381,6 @@
     exports.d2toX = char_player.d2toX;
     exports.dFinal1 = char_player.dFinal1;
     exports.dFinal2 = char_player.dFinal2;
     exports.aPfx1 = char_player.aPfx1;
     exports.aPfx2 = char_player.aPfx2;
-    exports.cut = char_player.cut;
-    exports.filterSugg = char_player.filterSugg;
 }
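The block removed above is not dropped: it reappears in str_transform (both ports) further down in this patch. For reference, simplifyWord() collapses a word to a rough phonetic skeleton before any suggestion distance is computed. A minimal Python sketch of its behaviour — import path assumed to be the post-patch graphspell package; the expected outputs follow from the code itself:

    # Sketch only — assumes the post-patch graphspell package is importable.
    from graphspell import str_transform as st

    print(st.simplifyWord("Châteaux"))      # "chatox"     (â -> a, then "eau" -> "o")
    print(st.simplifyWord("appelle"))       # "apele"      (doubled consonant collapsed)
    print(st.simplifyWord("philosophie"))   # "filosofie"  ("ph" -> "f")
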
Index: graphspell-js/ibdawg.js
==================================================================
--- graphspell-js/ibdawg.js
+++ graphspell-js/ibdawg.js
@@ -25,11 +25,11 @@
 class SuggResult {
     // Structure for storing, classifying and filtering suggestions
 
     constructor (sWord, nDistLimit=-1) {
         this.sWord = sWord;
-        this.sSimplifiedWord = char_player.simplifyWord(sWord);
+        this.sSimplifiedWord = str_transform.simplifyWord(sWord);
         this.nDistLimit = (nDistLimit >= 0) ? nDistLimit : Math.floor(sWord.length / 3) + 1;
         this.nMinDist = 1000;
         this.aSugg = new Set();
         this.dSugg = new Map([ [0, []], [1, []], [2, []] ]);
         this.aAllSugg = new Set();      // all found words even those refused
@@ -40,11 +40,11 @@
         if (this.aAllSugg.has(sSugg)) {
             return;
         }
         this.aAllSugg.add(sSugg);
         if (!this.aSugg.has(sSugg)) {
-            let nDist = str_transform.distanceDamerauLevenshtein(this.sSimplifiedWord, char_player.simplifyWord(sSugg));
+            let nDist = str_transform.distanceDamerauLevenshtein(this.sSimplifiedWord, str_transform.simplifyWord(sSugg));
             if (nDist <= this.nDistLimit) {
                 if (sSugg.includes(" ")) {    // add 1 to distance for split suggestions
                     nDist += 1;
                 }
                 if (!this.dSugg.has(nDist)) {
@@ -77,11 +77,10 @@
             lRes.push(...lSugg);
             if (lRes.length > nSuggLimit) {
                 break;
             }
         }
-        lRes = char_player.filterSugg(lRes);
         if (this.sWord.gl_isUpperCase()) {
             lRes = lRes.map((sSugg) => { return sSugg.toUpperCase(); });
             lRes = [...new Set(lRes)];
         }
         else if (this.sWord.slice(0,1).gl_isUpperCase()) {
@@ -195,10 +194,18 @@
             throw ValueError("# Error: unknown code: " + this.nCompressionMethod);
         }
         //console.log(this.getInfo());
         this.bAcronymValid = true;
         this.bNumAtLastValid = false;
+
+        // lexicographer module ?
+        this.lexicographer = null;
+        // JS still sucks: we’ll switch to a real import once imports are available in Workers. Still waiting...
+        if (self && self.hasOwnProperty("lexgraph_"+this.sLangCode)) {    // self is the Worker
+            this.lexicographer = self["lexgraph_"+this.sLangCode];
+        }
+
     }
 
     getInfo () {
         return  `  Language: ${this.sLangName}      Lang code: ${this.sLangCode}      Dictionary name: ${this.sDicName}\n` +
                 `  Compression method: ${this.nCompressionMethod}      Date: ${this.sDate}      Stemming: ${this.cStemming}FX\n` +
@@ -326,11 +333,13 @@
         // returns an array of suggestions for <sWord>
         //console.time("Suggestions for " + sWord);
         sWord = str_transform.spellingNormalization(sWord);
         let sPfx = "";
         let sSfx = "";
-        [sPfx, sWord, sSfx] = char_player.cut(sWord);
+        if (this.lexicographer) {
+            [sPfx, sWord, sSfx] = this.lexicographer.split(sWord);
+        }
         let nMaxSwitch = Math.max(Math.floor(sWord.length / 3), 1);
         let nMaxDel = Math.floor(sWord.length / 5);
         let nMaxHardRepl = Math.max(Math.floor((sWord.length - 5) / 4), 1);
         let nMaxJump = Math.max(Math.floor(sWord.length / 4), 1);
         let oSuggResult = new SuggResult(sWord);
@@ -338,10 +347,13 @@
         if (bSplitTrailingNumbers) {
             this._splitTrailingNumbers(oSuggResult, sWord);
         }
         this._splitSuggest(oSuggResult, sWord);
         this._suggest(oSuggResult, sWord, nMaxSwitch, nMaxDel, nMaxHardRepl, nMaxJump);
         let aSugg = oSuggResult.getSuggestions(nSuggLimit);
+        if (this.lexicographer) {
+            aSugg = this.lexicographer.filterSugg(aSugg);
+        }
         if (sSfx || sPfx) {
             // we add what we removed
             return aSugg.map( (sSugg) => { return sPfx + sSugg + sSfx; } );
         }
         //console.timeEnd("Suggestions for " + sWord);
@@ -349,11 +361,11 @@
     }
 
     _splitTrailingNumbers (oSuggResult, sWord) {
         let m = /^([a-zA-Zà-öÀ-Ö_ø-ÿØ-ßĀ-ʯﬁ-ﬆ][a-zA-Zà-öÀ-Ö_ø-ÿØ-ßĀ-ʯﬁ-ﬆ-]+)([0-9]+)$/.exec(sWord);
         if (m && !m[1].endsWith("-") && !m[1].endsWith("_")) {
-            oSuggResult.addSugg(m[1] + " " + char_player.numbersToExponent(m[2]));
+            oSuggResult.addSugg(m[1] + " " + str_transform.numbersToExponent(m[2]));
         }
     }
 
     _splitSuggest (oSuggResult, sWord) {
         // split at apostrophes
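Both ports now treat the lexicographer as an optional plug-in: the JS constructor probes the Worker global self for a lexgraph_<lang> object, while the Python constructor (further down) relies on importlib. A hedged Python sketch of the same pattern — the helper name is hypothetical; the real lookup lives inside the constructors:

    import importlib

    def load_lexicographer(sLangCode):
        "hypothetical standalone version of the lookup done in IBDAWG.__init__"
        try:
            return importlib.import_module("graphspell.lexgraph_" + sLangCode)
        except ImportError:
            return None   # suggestions still work, just without split()/filterSugg()

    lexgraph = load_lexicographer("fr")
    if lexgraph:
        # prefix "lorsqu’", root "arrive", suffix "-t-il"
        print(lexgraph.split("lorsqu’arrive-t-il"))
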
Index: graphspell-js/lexgraph_fr.js
==================================================================
--- graphspell-js/lexgraph_fr.js
+++ graphspell-js/lexgraph_fr.js
@@ -134,10 +134,21 @@
 
 var lexgraph_fr = {
 
     dSugg: _dSugg,
 
+
+    // Préfixes et suffixes
+    aPfx1: new Set([
+        "anti", "archi", "contre", "hyper", "mé", "méta", "im", "in", "ir", "par", "proto",
+        "pseudo", "pré", "re", "ré", "sans", "sous", "supra", "sur", "ultra"
+    ]),
+
+    aPfx2: new Set([
+        "belgo", "franco", "génito", "gynéco", "médico", "russo"
+    ]),
+
     // Étiquettes
     dTag: new Map([
         [':N', [" nom,", "Nom"]],
         [':A', [" adjectif,", "Adjectif"]],
         [':M1', [" prénom,", "Prénom"]],
         [':M2', [" patronyme,", "Patronyme, matronyme, nom de famille…"]],
@@ -371,10 +382,30 @@
 
     load: function (oSpellChecker, oTokenizer, oLocGraph) {
         this.oSpellChecker = oSpellChecker;
         this.oTokenizer = oTokenizer;
         this.oLocGraph = JSON.parse(oLocGraph);
     },
+
+    split: function (sWord) {
+        // returns an array of strings (prefix, trimmed word, suffix)
+        let sPrefix = "";
+        let sSuffix = "";
+        // préfixe élidé
+        let m = /^([ldmtsnjcç]|lorsqu|presqu|jusqu|puisqu|quoiqu|quelqu|qu)[’'‘`ʼ]([a-zA-Zà-öÀ-Ö0-9_ø-ÿØ-ßĀ-ʯﬁ-ﬆ-]+)/i.exec(sWord);
+        if (m) {
+            sPrefix = m[1] + "’";
+            sWord = m[2];
+        }
+        // mots composés
+        m = /^([a-zA-Zà-öÀ-Ö0-9_ø-ÿØ-ßĀ-ʯﬁ-ﬆ-]+)(-(?:(?:les?|la)-(?:moi|toi|lui|[nv]ous|leur)|t-(?:il|elle|on)|y|en|[mts]’(?:y|en)|les?|l[aà]|[mt]oi|leur|lui|je|tu|ils?|elles?|on|[nv]ous|ce))$/i.exec(sWord);
+        if (m) {
+            sWord = m[1];
+            sSuffix = m[2];
+        }
+        // split word in 3 parts: prefix, root, suffix
+        return [sPrefix, sWord, sSuffix];
+    },
 
     getInfoForToken: function (oToken) {
         // Token: .sType, .sValue, .nStart, .nEnd
         // return an object {sType, sValue, aLabel}
         let m = null;
@@ -715,13 +746,18 @@
             }
             iToken++;
         }
     } while (iToken < lToken.length);
     return aElem;
+    },
+
+    // Other functions
+    filterSugg: function (aSugg) {
+        return aSugg.filter((sSugg) => { return !sSugg.endsWith("è") && !sSugg.endsWith("È"); });
     }
 }
 
 
 if (typeof(exports) !== 'undefined') {
     exports.lexgraph_fr = lexgraph_fr;
 }

Index: graphspell-js/str_transform.js
==================================================================
--- graphspell-js/str_transform.js
+++ graphspell-js/str_transform.js
@@ -34,10 +34,50 @@
         for (let c of sWord) {
             sNewWord += this._xTransCharsForSpelling.gl_get(c, c);
         }
         return sNewWord.normalize("NFC");
     },
+
+    _xTransCharsForSimplification: new Map([
+        ['à', 'a'], ['é', 'é'], ['î', 'i'], ['ô', 'o'], ['û', 'u'], ['ÿ', 'y'],
+        ['â', 'a'], ['è', 'é'], ['ï', 'i'], ['ö', 'o'], ['ù', 'u'], ['ŷ', 'y'],
+        ['ä', 'a'], ['ê', 'é'], ['í', 'i'], ['ó', 'o'], ['ü', 'u'], ['ý', 'y'],
+        ['á', 'a'], ['ë', 'é'], ['ì', 'i'], ['ò', 'o'], ['ú', 'u'], ['ỳ', 'y'],
+        ['ā', 'a'], ['ē', 'é'], ['ī', 'i'], ['ō', 'o'], ['ū', 'u'], ['ȳ', 'y'],
+        ['ç', 'c'], ['ñ', 'n'],
+        ['œ', 'oe'], ['æ', 'ae'],
+        ['ſ', 's'], ['ﬃ', 'ffi'], ['ﬄ', 'ffl'], ['ﬀ', 'ff'], ['ﬅ', 'ft'], ['ﬁ', 'fi'], ['ﬂ', 'fl'], ['ﬆ', 'st'],
+        ["⁰", "0"], ["¹", "1"], ["²", "2"], ["³", "3"], ["⁴", "4"], ["⁵", "5"], ["⁶", "6"], ["⁷", "7"], ["⁸", "8"], ["⁹", "9"],
+        ["₀", "0"], ["₁", "1"], ["₂", "2"], ["₃", "3"], ["₄", "4"], ["₅", "5"], ["₆", "6"], ["₇", "7"], ["₈", "8"], ["₉", "9"]
+    ]),
+
+    simplifyWord: function (sWord) {
+        // word simplification before calculating distance between words
+        sWord = sWord.toLowerCase();
+        sWord = [...sWord].map(c => this._xTransCharsForSimplification.gl_get(c, c)).join('');
+        let sNewWord = "";
+        let i = 1;
+        for (let c of sWord) {
+            if (c == 'e' || c != sWord.slice(i, i+1)) {     // exception for <e> to avoid confusion between crée / créai
+                sNewWord += c;
+            }
+            i++;
+        }
+        return sNewWord.replace(/eau/g, "o").replace(/au/g, "o").replace(/ai/g, "é").replace(/ei/g, "é").replace(/ph/g, "f");
+    },
+
+    _xTransNumbersToExponent: new Map([
+        ["0", "⁰"], ["1", "¹"], ["2", "²"], ["3", "³"], ["4", "⁴"], ["5", "⁵"], ["6", "⁶"], ["7", "⁷"], ["8", "⁸"], ["9", "⁹"]
+    ]),
+
+    numbersToExponent: function (sWord) {
+        let sNewWord = "";
+        for (let c of sWord) {
+            sNewWord += this._xTransNumbersToExponent.gl_get(c, c);
+        }
+        return sNewWord;
+    },
 
     longestCommonSubstring: function (string1, string2) {
         // https://en.wikibooks.org/wiki/Algorithm_Implementation/Strings/Longest_common_substring
         // untested
 
@@ -194,14 +234,17 @@
     }
 };
 
 
 if (typeof(exports) !== 'undefined') {
+    exports.simplifyWord = str_transform.simplifyWord;
+    exports.numbersToExponent = str_transform.numbersToExponent;
+    exports.spellingNormalization = str_transform.spellingNormalization;
     exports.longestCommonSubstring = str_transform.longestCommonSubstring;
     exports.distanceDamerauLevenshtein = str_transform.distanceDamerauLevenshtein;
     exports.distanceDamerauLevenshtein2 = str_transform.distanceDamerauLevenshtein2;
     exports.showDistance = str_transform.showDistance;
     exports.changeWordWithSuffixCode = str_transform.changeWordWithSuffixCode;
     exports.changeWordWithAffixCode = str_transform.changeWordWithAffixCode;
     exports.defineAffixCode = str_transform.defineAffixCode;
     exports.defineSuffixCode = str_transform.defineSuffixCode;
 }
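numbersToExponent() is the counterpart of _splitTrailingNumbers() in ibdawg: when a word carries a trailing number, the split suggestion renders that number in superscript. And since _xTransCharsForSimplification maps superscript digits back to plain digits, the superscripted variant stays cheap when suggestion distances are computed. A small Python sketch (package path assumed; the Python twins appear at the end of this patch):

    from graphspell import str_transform as st

    print(st.numbersToExponent("2024"))   # "²⁰²⁴"
    # simplifyWord() folds exponent digits back to plain digits:
    print(st.simplifyWord("X²⁰²⁴"))       # "x2024"
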
Index: graphspell/char_player.py
==================================================================
--- graphspell/char_player.py
+++ graphspell/char_player.py
@@ -42,42 +42,10 @@
         return 0
     if c1 not in dDistanceBetweenChars:
         return 1
     return dDistanceBetweenChars[c1].get(c2, 1)
 
-
-_xTransCharsForSimplification = str.maketrans({
-    'à': 'a', 'é': 'é', 'î': 'i', 'ô': 'o', 'û': 'u', 'ÿ': 'y',
-    'â': 'a', 'è': 'é', 'ï': 'i', 'ö': 'o', 'ù': 'u', 'ŷ': 'y',
-    'ä': 'a', 'ê': 'é', 'í': 'i', 'ó': 'o', 'ü': 'u', 'ý': 'y',
-    'á': 'a', 'ë': 'é', 'ì': 'i', 'ò': 'o', 'ú': 'u', 'ỳ': 'y',
-    'ā': 'a', 'ē': 'é', 'ī': 'i', 'ō': 'o', 'ū': 'u', 'ȳ': 'y',
-    'ç': 'c', 'ñ': 'n',
-    'œ': 'oe', 'æ': 'ae',
-    'ſ': 's', 'ﬃ': 'ffi', 'ﬄ': 'ffl', 'ﬀ': 'ff', 'ﬅ': 'ft', 'ﬁ': 'fi', 'ﬂ': 'fl', 'ﬆ': 'st',
-    "⁰": "0", "¹": "1", "²": "2", "³": "3", "⁴": "4", "⁵": "5", "⁶": "6", "⁷": "7", "⁸": "8", "⁹": "9",
-    "₀": "0", "₁": "1", "₂": "2", "₃": "3", "₄": "4", "₅": "5", "₆": "6", "₇": "7", "₈": "8", "₉": "9"
-})
-
-def simplifyWord (sWord):
-    "word simplication before calculating distance between words"
-    sWord = sWord.lower().translate(_xTransCharsForSimplification)
-    sNewWord = ""
-    for i, c in enumerate(sWord, 1):
-        if c == 'e' or c != sWord[i:i+1]:   # exception for <e> to avoid confusion between crée / créai
-            sNewWord += c
-    return sNewWord.replace("eau", "o").replace("au", "o").replace("ai", "é").replace("ei", "é").replace("ph", "f")
-
-
-_xTransNumbersToExponent = str.maketrans({
-    "0": "⁰", "1": "¹", "2": "²", "3": "³", "4": "⁴", "5": "⁵", "6": "⁶", "7": "⁷", "8": "⁸", "9": "⁹"
-})
-
-def numbersToExponent (sWord):
-    "convert numeral chars to exponant chars"
-    return sWord.translate(_xTransNumbersToExponent)
-
 
 aVowel = set("aáàâäāeéèêëēiíìîïīoóòôöōuúùûüūyýỳŷÿȳœæAÁÀÂÄĀEÉÈÊËĒIÍÌÎÏĪOÓÒÔÖŌUÚÙÛÜŪYÝỲŶŸȲŒÆ")
 aConsonant = set("bcçdfghjklmnñpqrstvwxzBCÇDFGHJKLMNÑPQRSTVWXZ")
 aDouble = set("bcdfjklmnprstzBCDFJKLMNPRSTZ")   # letters that may be used twice successively
 
@@ -396,41 +364,5 @@
     "on": ("ons", "ont"), "ON": ("ONS", "ONT"),
     "oi": ("ois", "oit", "oix"), "OI": ("OIS", "OIT", "OIX"),
 }
-
-
-# Préfixes et suffixes
-
-aPfx1 = frozenset([
-    "anti", "archi", "contre", "hyper", "mé", "méta", "im", "in", "ir", "par", "proto",
-    "pseudo", "pré", "re", "ré", "sans", "sous", "supra", "sur", "ultra"
-])
-aPfx2 = frozenset([
-    "belgo", "franco", "génito", "gynéco", "médico", "russo"
-])
-
-
-_zWordPrefixes = re.compile("(?i)^([ldmtsnjcç]|lorsqu|presqu|jusqu|puisqu|quoiqu|quelqu|qu)[’'‘`ʼ]([\\w-]+)")
-_zWordSuffixes = re.compile("(?i)^(\\w+)(-(?:t-|)(?:ils?|elles?|on|je|tu|nous|vous|ce))$")
-
-def cut (sWord):
-    "returns a tuple of strings (prefix, trimed_word, suffix)"
-    sPrefix = ""
-    sSuffix = ""
-    m = _zWordPrefixes.search(sWord)
-    if m:
-        sPrefix = m.group(1) + "’"
-        sWord = m.group(2)
-    m = _zWordSuffixes.search(sWord)
-    if m:
-        sWord = m.group(1)
-        sSuffix = m.group(2)
-    return (sPrefix, sWord, sSuffix)
-
-
-# Other functions
-
-def filterSugg (aSugg):
-    "exclude suggestions"
-    return filter(lambda sSugg: not sSugg.endswith(("è", "È")), aSugg)
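After this cleanup, char_player.py keeps only character-level data: the proximity table, the vowel/consonant sets and the final-letter substitution maps. A usage sketch for what remains — the 0/1 fallback values follow from the code shown above; the pair chosen for the table lookup is illustrative:

    from graphspell import char_player as cp

    print(cp.distanceBetweenChars("a", "a"))   # 0 — identical characters
    print(cp.distanceBetweenChars("a", "k"))   # 1 — default for pairs not in dDistanceBetweenChars
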
Index: graphspell/ibdawg.py
==================================================================
--- graphspell/ibdawg.py
+++ graphspell/ibdawg.py
@@ -11,10 +11,11 @@
 import re
 from functools import wraps
 import time
 import json
 import binascii
+import importlib
 from collections import OrderedDict
 
 #import logging
 #logging.basicConfig(filename="suggestions.log", level=logging.DEBUG)
 
@@ -39,11 +40,11 @@
 class SuggResult:
     """Structure for storing, classifying and filtering suggestions"""
 
     def __init__ (self, sWord, nDistLimit=-1):
         self.sWord = sWord
-        self.sSimplifiedWord = cp.simplifyWord(sWord)
+        self.sSimplifiedWord = st.simplifyWord(sWord)
         self.nDistLimit = nDistLimit  if nDistLimit >= 0  else (len(sWord) // 3) + 1
         self.nMinDist = 1000
         self.aSugg = set()
         self.dSugg = { 0: [], 1: [], 2: [] }
         self.aAllSugg = set()   # all found words even those refused
@@ -53,13 +54,13 @@
         #logging.info((nDeep * " ") + "__" + sSugg + "__")
         if sSugg in self.aAllSugg:
             return
         self.aAllSugg.add(sSugg)
         if sSugg not in self.aSugg:
-            #nDist = min(st.distanceDamerauLevenshtein(self.sWord, sSugg), st.distanceDamerauLevenshtein(self.sSimplifiedWord, cp.simplifyWord(sSugg)))
-            nDist = st.distanceDamerauLevenshtein(self.sSimplifiedWord, cp.simplifyWord(sSugg))
-            #logging.info((nDeep * " ") + "__" + sSugg + "__ :" + self.sSimplifiedWord +"|"+ cp.simplifyWord(sSugg) +" -> "+ str(nDist))
+            #nDist = min(st.distanceDamerauLevenshtein(self.sWord, sSugg), st.distanceDamerauLevenshtein(self.sSimplifiedWord, st.simplifyWord(sSugg)))
+            nDist = st.distanceDamerauLevenshtein(self.sSimplifiedWord, st.simplifyWord(sSugg))
+            #logging.info((nDeep * " ") + "__" + sSugg + "__ :" + self.sSimplifiedWord +"|"+ st.simplifyWord(sSugg) +" -> "+ str(nDist))
             if nDist <= self.nDistLimit:
                 if " " in sSugg:
                     nDist += 1
                 if nDist not in self.dSugg:
                     self.dSugg[nDist] = []
@@ -80,11 +81,10 @@
         for nDist, lSugg in self.dSugg.items():
             if nDist <= self.nDistLimit:
                 lRes.extend(lSugg)
                 if len(lRes) > nSuggLimit:
                     break
-        lRes = list(cp.filterSugg(lRes))
         if self.sWord.isupper():
             lRes = list(OrderedDict.fromkeys(map(lambda sSugg: sSugg.upper(), lRes)))   # use dict, when Python 3.6+
         elif self.sWord[0:1].isupper():
             # don’t use <.istitle>
             lRes = list(OrderedDict.fromkeys(map(lambda sSugg: sSugg[0:1].upper()+sSugg[1:], lRes)))   # use dict, when Python 3.6+
@@ -151,10 +151,18 @@
         else:
             raise ValueError("  # Error: unknown code: {}".format(self.nCompressionMethod))
         self.bAcronymValid = False
         self.bNumAtLastValid = False
+
+        # lexicographer module ?
+        self.lexicographer = None
+        try:
+            self.lexicographer = importlib.import_module("graphspell.lexgraph_"+self.sLangCode)
+        except ImportError:
+            print("# No module <graphspell.lexgraph_"+self.sLangCode+">")
+
 
     def _initBinary (self):
         "initialize with binary structure file"
         if self.by[0:17] != b"/grammalecte-fsa/":
             raise TypeError("# Error. Not a grammalecte-fsa binary dictionary. Header: {}".format(self.by[0:9]))
Header: {}".format(self.by[0:9])) @@ -303,11 +311,14 @@ #@timethis def suggest (self, sWord, nSuggLimit=10, bSplitTrailingNumbers=False): "returns a set of suggestions for " sWord = sWord.rstrip(".") # useful for LibreOffice sWord = st.spellingNormalization(sWord) - sPfx, sWord, sSfx = cp.cut(sWord) + sPfx = "" + sSfx = "" + if self.lexicographer: + sPfx, sWord, sSfx = self.lexicographer.split(sWord) nMaxSwitch = max(len(sWord) // 3, 1) nMaxDel = len(sWord) // 5 nMaxHardRepl = max((len(sWord) - 5) // 4, 1) nMaxJump = max(len(sWord) // 4, 1) oSuggResult = SuggResult(sWord) @@ -314,19 +325,21 @@ if bSplitTrailingNumbers: self._splitTrailingNumbers(oSuggResult, sWord) self._splitSuggest(oSuggResult, sWord) self._suggest(oSuggResult, sWord, nMaxSwitch, nMaxDel, nMaxHardRepl, nMaxJump) aSugg = oSuggResult.getSuggestions(nSuggLimit) + if self.lexicographer: + aSugg = self.lexicographer.filterSugg(aSugg) if sSfx or sPfx: # we add what we removed return list(map(lambda sSug: sPfx + sSug + sSfx, aSugg)) return aSugg def _splitTrailingNumbers (self, oSuggResult, sWord): m = re.match(r"(\D+)([0-9]+)$", sWord) if m and m.group(1)[-1:].isalpha(): - oSuggResult.addSugg(m.group(1) + " " + cp.numbersToExponent(m.group(2))) + oSuggResult.addSugg(m.group(1) + " " + st.numbersToExponent(m.group(2))) def _splitSuggest (self, oSuggResult, sWord): # split at apostrophes for cSplitter in "'’": if cSplitter in sWord: Index: graphspell/lexgraph_fr.py ================================================================== --- graphspell/lexgraph_fr.py +++ graphspell/lexgraph_fr.py @@ -4,14 +4,15 @@ # Note: # This mode must contains at least: # : a dictionary for default suggestions. # : a boolean False -# if the boolean is True, 3 functions are required: +# if the boolean is True, 4 functions are required: # split(sWord) -> returns a list of string (that will be analyzed) # analyze(sWord) -> returns a string with the meaning of word # formatTags(sTags) -> returns a string with the meaning of tags +# filterSugg(aWord) -> returns a filtered list of suggestions import re #### Suggestions @@ -134,10 +135,21 @@ "XXVIIIème": "XXVIIIᵉ", "XXIXème": "XXIXᵉ", "XXXème": "XXXᵉ" } + +# Préfixes et suffixes + +aPfx1 = frozenset([ + "anti", "archi", "contre", "hyper", "mé", "méta", "im", "in", "ir", "par", "proto", + "pseudo", "pré", "re", "ré", "sans", "sous", "supra", "sur", "ultra" +]) +aPfx2 = frozenset([ + "belgo", "franco", "génito", "gynéco", "médico", "russo" +]) + #### Lexicographer bLexicographer = True @@ -314,23 +326,22 @@ "-t’en": " (te) pronom personnel objet + (en) pronom adverbial", "-s’en": " (se) pronom personnel objet + (en) pronom adverbial", } -_zElidedPrefix = re.compile("(?i)^((?:[dljmtsncç]|quoiqu|lorsqu|jusqu|puisqu|qu)’)(.+)") -_zCompoundWord = re.compile("(?i)(\\w+)(-(?:(?:les?|la)-(?:moi|toi|lui|[nv]ous|leur)|t-(?:il|elle|on)|y|en|[mts]’(?:y|en)|les?|l[aà]|[mt]oi|leur|lui|je|tu|ils?|elles?|on|[nv]ous))$") +_zElidedPrefix = re.compile("(?i)^([ldmtsnjcç]|lorsqu|presqu|jusqu|puisqu|quoiqu|quelqu|qu)[’'‘`ʼ]([\\w-]+)") +_zCompoundWord = re.compile("(?i)(\\w+)(-(?:(?:les?|la)-(?:moi|toi|lui|[nv]ous|leur)|t-(?:il|elle|on)|y|en|[mts]’(?:y|en)|les?|l[aà]|[mt]oi|leur|lui|je|tu|ils?|elles?|on|[nv]ous|ce))$") _zTag = re.compile("[:;/][\\w*][^:;/]*") def split (sWord): "split word in 3 parts: prefix, root, suffix" - sWord = sWord.replace("'", "’") sPrefix = "" sSuffix = "" # préfixe élidé m = _zElidedPrefix.match(sWord) if m: - sPrefix = m.group(1) + sPrefix = m.group(1) + "’" sWord = m.group(2) # mots composés m = 
Index: graphspell/lexgraph_fr.py
==================================================================
--- graphspell/lexgraph_fr.py
+++ graphspell/lexgraph_fr.py
@@ -4,14 +4,15 @@
 
 # Note:
 # This module must contain at least:
 #     <dSugg> : a dictionary for default suggestions.
 #     <bLexicographer> : a boolean False
-#       if the boolean is True, 3 functions are required:
+#       if the boolean is True, 4 functions are required:
 #           split(sWord) -> returns a list of strings (that will be analyzed)
 #           analyze(sWord) -> returns a string with the meaning of the word
 #           formatTags(sTags) -> returns a string with the meaning of the tags
+#           filterSugg(aSugg) -> returns a filtered list of suggestions
 
 import re
 
 #### Suggestions
 
@@ -134,10 +135,21 @@
     "XXVIIIème": "XXVIIIᵉ",
     "XXIXème": "XXIXᵉ",
     "XXXème": "XXXᵉ"
 }
 
+
+# Préfixes et suffixes
+
+aPfx1 = frozenset([
+    "anti", "archi", "contre", "hyper", "mé", "méta", "im", "in", "ir", "par", "proto",
+    "pseudo", "pré", "re", "ré", "sans", "sous", "supra", "sur", "ultra"
+])
+aPfx2 = frozenset([
+    "belgo", "franco", "génito", "gynéco", "médico", "russo"
+])
+
 
 #### Lexicographer
 
 bLexicographer = True
 
@@ -314,23 +326,22 @@
     "-t’en": " (te) pronom personnel objet + (en) pronom adverbial",
     "-s’en": " (se) pronom personnel objet + (en) pronom adverbial",
 }
 
-_zElidedPrefix = re.compile("(?i)^((?:[dljmtsncç]|quoiqu|lorsqu|jusqu|puisqu|qu)’)(.+)")
-_zCompoundWord = re.compile("(?i)(\\w+)(-(?:(?:les?|la)-(?:moi|toi|lui|[nv]ous|leur)|t-(?:il|elle|on)|y|en|[mts]’(?:y|en)|les?|l[aà]|[mt]oi|leur|lui|je|tu|ils?|elles?|on|[nv]ous))$")
+_zElidedPrefix = re.compile("(?i)^([ldmtsnjcç]|lorsqu|presqu|jusqu|puisqu|quoiqu|quelqu|qu)[’'‘`ʼ]([\\w-]+)")
+_zCompoundWord = re.compile("(?i)(\\w+)(-(?:(?:les?|la)-(?:moi|toi|lui|[nv]ous|leur)|t-(?:il|elle|on)|y|en|[mts]’(?:y|en)|les?|l[aà]|[mt]oi|leur|lui|je|tu|ils?|elles?|on|[nv]ous|ce))$")
 _zTag = re.compile("[:;/][\\w*][^:;/]*")
 
 def split (sWord):
     "split word in 3 parts: prefix, root, suffix"
-    sWord = sWord.replace("'", "’")
     sPrefix = ""
     sSuffix = ""
     # préfixe élidé
     m = _zElidedPrefix.match(sWord)
     if m:
-        sPrefix = m.group(1)
+        sPrefix = m.group(1) + "’"
         sWord = m.group(2)
     # mots composés
     m = _zCompoundWord.match(sWord)
     if m:
         sWord = m.group(1)
@@ -354,5 +365,12 @@
     for m in _zTag.finditer(sTags):
         sRes += _dTAGS.get(m.group(0), " [{}]".format(m.group(0)))[0]
     if sRes.startswith(" verbe") and not sRes.endswith("infinitif"):
         sRes += " [{}]".format(sTags[1:sTags.find("/")])
     return sRes.rstrip(",")
+
+
+# Other functions
+
+def filterSugg (aSugg):
+    "exclude suggestions"
+    return filter(lambda sSugg: not sSugg.endswith(("è", "È")), aSugg)

Index: graphspell/str_transform.py
==================================================================
--- graphspell/str_transform.py
+++ graphspell/str_transform.py
@@ -25,10 +25,42 @@
 
 def spellingNormalization (sWord):
     "normalization NFC and removing ligatures"
     return unicodedata.normalize("NFC", sWord.translate(_xTransCharsForSpelling))
 
+
+_xTransCharsForSimplification = str.maketrans({
+    'à': 'a', 'é': 'é', 'î': 'i', 'ô': 'o', 'û': 'u', 'ÿ': 'y',
+    'â': 'a', 'è': 'é', 'ï': 'i', 'ö': 'o', 'ù': 'u', 'ŷ': 'y',
+    'ä': 'a', 'ê': 'é', 'í': 'i', 'ó': 'o', 'ü': 'u', 'ý': 'y',
+    'á': 'a', 'ë': 'é', 'ì': 'i', 'ò': 'o', 'ú': 'u', 'ỳ': 'y',
+    'ā': 'a', 'ē': 'é', 'ī': 'i', 'ō': 'o', 'ū': 'u', 'ȳ': 'y',
+    'ç': 'c', 'ñ': 'n',
+    'œ': 'oe', 'æ': 'ae',
+    'ſ': 's', 'ﬃ': 'ffi', 'ﬄ': 'ffl', 'ﬀ': 'ff', 'ﬅ': 'ft', 'ﬁ': 'fi', 'ﬂ': 'fl', 'ﬆ': 'st',
+    "⁰": "0", "¹": "1", "²": "2", "³": "3", "⁴": "4", "⁵": "5", "⁶": "6", "⁷": "7", "⁸": "8", "⁹": "9",
+    "₀": "0", "₁": "1", "₂": "2", "₃": "3", "₄": "4", "₅": "5", "₆": "6", "₇": "7", "₈": "8", "₉": "9"
+})
+
+def simplifyWord (sWord):
+    "word simplification before calculating distance between words"
+    sWord = sWord.lower().translate(_xTransCharsForSimplification)
+    sNewWord = ""
+    for i, c in enumerate(sWord, 1):
+        if c == 'e' or c != sWord[i:i+1]:   # exception for <e> to avoid confusion between crée / créai
+            sNewWord += c
+    return sNewWord.replace("eau", "o").replace("au", "o").replace("ai", "é").replace("ei", "é").replace("ph", "f")
+
+
+_xTransNumbersToExponent = str.maketrans({
+    "0": "⁰", "1": "¹", "2": "²", "3": "³", "4": "⁴", "5": "⁵", "6": "⁶", "7": "⁷", "8": "⁸", "9": "⁹"
+})
+
+def numbersToExponent (sWord):
+    "convert numeral chars to exponent chars"
+    return sWord.translate(_xTransNumbersToExponent)
+
 
 #### DISTANCE CALCULATIONS
 
 def longestCommonSubstring (s1, s2):
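The point of moving simplifyWord() here becomes visible in SuggResult: candidates are ranked by the Damerau-Levenshtein distance between simplified forms, so purely orthographic differences cost almost nothing. A last Python sketch (package path assumed; values follow from the code above):

    from graphspell import str_transform as st

    nDist = st.distanceDamerauLevenshtein(st.simplifyWord("apelle"),
                                          st.simplifyWord("appelle"))
    print(nDist)   # 0 — both simplify to "apele"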