Index: gc_lang/fr/build_data.py ================================================================== --- gc_lang/fr/build_data.py +++ gc_lang/fr/build_data.py @@ -269,11 +269,11 @@ print("(Python et JavaScript)" if bJS else "(Python seulement)") import gc_lang.fr.modules.conj as conj try: - oDict = ibdawg.IBDAWG("French.bdic") + oDict = ibdawg.IBDAWG("fr.bdic") except: traceback.print_exc() return # set of homophonic words Index: gc_lang/fr/config.ini ================================================================== --- gc_lang/fr/config.ini +++ gc_lang/fr/config.ini @@ -4,11 +4,11 @@ locales = fr_FR fr_BE fr_CA fr_CH fr_LU fr_MC fr_BF fr_CI fr_SN fr_ML fr_NE fr_TG fr_BJ country_default = FR name = Grammalecte implname = grammalecte # always use 3 numbers for version: x.y.z -version = 0.6.1 +version = 0.6.2 author = Olivier R. provider = Dicollecte link = http://grammalecte.net description = Correcteur grammatical pour le français. extras = README_fr.txt Index: gc_lang/fr/dictionnaire/genfrdic.py ================================================================== --- gc_lang/fr/dictionnaire/genfrdic.py +++ gc_lang/fr/dictionnaire/genfrdic.py @@ -815,11 +815,11 @@ def check (self): sErr = '' if self.lemma == '': sErr += 'lemme vide' - if not re.match(r"[a-zA-ZéÉôÔàâÂîÎïèÈêÊÜœŒæÆçÇ0-9µåÅΩ&αβγδεζηθικλμνξοπρστυφχψωΔℓΩ_]", self.lemma): + if not re.match(r"[a-zA-ZéÉôÔàâáÂîÎïèÈêÊÜœŒæÆçÇ0-9µåÅΩ&αβγδεζηθικλμνξοπρστυφχψωΔℓΩ_]", self.lemma): sErr += 'premier caractère inconnu: ' + self.lemma[0] if re.search(r"\s$", self.lemma): sErr += 'espace en fin de lemme' if re.match(r"v[0123]", self.po) and not re.match(r"[eas_][ix_][tx_][nx_][pqreuvx_][mx_][ex_z][ax_z]\b", self.po[2:]): sErr += 'verbe inconnu: ' + self.po Index: gc_lang/fr/dictionnaire/orthographe/FRANCAIS_5.aff ================================================================== --- gc_lang/fr/dictionnaire/orthographe/FRANCAIS_5.aff +++ gc_lang/fr/dictionnaire/orthographe/FRANCAIS_5.aff @@ -50,11 +50,11 @@ MAP wW MAP xX 
MAP zZ # Remplacements envisagés & barbarismes -REP 84 +REP 82 REP ^Ca$ Ça REP ^l l' REP ^d d' REP ^n n' REP ^s s' @@ -133,12 +133,10 @@ REP sanctionnable punissable REP questionnable discutable REP antitartre détartrant REP email courriel REP construirent construisirent -REP cad$ c’est-à-dire -REP càd$ c’est-à-dire # Phonétique #PHONE 69 #PHONE AN(DT)$ @ @@ -324,20 +322,21 @@ # La première colonne dresse une liste de caractères écrits avec des diacritiques combinants : # http://www.unicode.org/charts/ U0300 + # La seconde colonne établit l’équivalent en Latin-1 étendu : # Hunspell fait la modification pour vérifier l’orthographe. (Peut-être pas utile pour Mozilla) # Apostrophes: U+2019, U+02BC -ICONV 41 +ICONV 42 ICONV ’ ' ICONV ʼ ' ICONV ﬃ ffi ICONV ﬄ ffl ICONV ﬀ ff ICONV ﬅ ft ICONV ﬁ fi ICONV ﬂ fl ICONV ﬆ st +ICONV ſ s ICONV à à ICONV â â ICONV ä ä ICONV é é ICONV è è Index: gc_lang/fr/perf_memo.txt ================================================================== --- gc_lang/fr/perf_memo.txt +++ gc_lang/fr/perf_memo.txt @@ -21,5 +21,6 @@ 0.5.15 2017.01.22 11:47 4.85593 1.15248 0.762924 0.22744 0.243461 0.254609 0.586741 0.317503 0.0588827 0.00701016 (unicode normalisation NFC) 0.5.15 2017.01.31 12:06 4.88227 1.18008 0.782217 0.232617 0.247672 0.257628 0.596903 0.32169 0.0603505 0.00695196 0.5.15 2017.02.05 10:10 4.90222 1.18444 0.786696 0.233413 0.25071 0.260214 0.602112 0.325235 0.0609932 0.00706897 0.5.16 2017.05.12 07:41 4.92201 1.19269 0.80639 0.239147 0.257518 0.266523 0.62111 0.33359 0.0634668 0.00757178 0.6.1 2018.02.12 09:58 5.25924 1.2649 0.878442 0.257465 0.280558 0.293903 0.686887 0.391275 0.0672474 0.00824723 +0.6.2 2018.02.19 09:06 6.20116 1.44334 1.02936 0.272956 0.311561 0.362367 0.812705 0.419061 0.0773003 0.00845671 (spelling normalization) Index: gc_lang/fr/webext/manifest.json ================================================================== --- gc_lang/fr/webext/manifest.json +++ gc_lang/fr/webext/manifest.json @@ -1,10 +1,10 @@ {
"manifest_version": 2, "name": "Grammalecte [fr]", "short_name": "Grammalecte [fr]", - "version": "0.6.1", + "version": "0.6.2", "applications": { "gecko": { "id": "French-GC@grammalecte.net", "strict_min_version": "56.0" Index: graphspell-js/char_player.js ================================================================== --- graphspell-js/char_player.js +++ graphspell-js/char_player.js @@ -4,29 +4,42 @@ ${map} var char_player = { - _dTransChars: new Map([ + _xTransCharsForSpelling: new Map([ + ['ſ', 's'], ['ﬃ', 'ffi'], ['ﬄ', 'ffl'], ['ﬀ', 'ff'], ['ﬅ', 'ft'], ['ﬁ', 'fi'], ['ﬂ', 'fl'], ['ﬆ', 'st'] + ]), + + spellingNormalization: function (sWord) { + let sNewWord = ""; + for (let c of sWord) { + sNewWord += this._xTransCharsForSpelling.gl_get(c, c); + } + return sNewWord.normalize("NFC"); + }, + + _xTransCharsForSimplification: new Map([ ['à', 'a'], ['é', 'e'], ['î', 'i'], ['ô', 'o'], ['û', 'u'], ['ÿ', 'i'], ['y', 'i'], ['â', 'a'], ['è', 'e'], ['ï', 'i'], ['ö', 'o'], ['ù', 'u'], ['ŷ', 'i'], ['ä', 'a'], ['ê', 'e'], ['í', 'i'], ['ó', 'o'], ['ü', 'u'], ['ý', 'i'], ['á', 'a'], ['ë', 'e'], ['ì', 'i'], ['ò', 'o'], ['ú', 'u'], ['ỳ', 'i'], ['ā', 'a'], ['ē', 'e'], ['ī', 'i'], ['ō', 'o'], ['ū', 'u'], ['ȳ', 'i'], ['ñ', 'n'], ['k', 'q'], ['w', 'v'], ['œ', 'oe'], ['æ', 'ae'], + ['ſ', 's'], ['ﬃ', 'ffi'], ['ﬄ', 'ffl'], ['ﬀ', 'ff'], ['ﬅ', 'ft'], ['ﬁ', 'fi'], ['ﬂ', 'fl'], ['ﬆ', 'st'] ]), simplifyWord: function (sWord) { // word simplication before calculating distance between words sWord = sWord.toLowerCase(); let sNewWord = ""; let i = 1; for (let c of sWord) { - let cNew = this._dTransChars.gl_get(c, c); + let cNew = this._xTransCharsForSimplification.gl_get(c, c); let cNext = sWord.slice(i, i+1) - if (cNew != this._dTransChars.gl_get(cNext, cNext)) { + if (cNew != this._xTransCharsForSimplification.gl_get(cNext, cNext)) { sNewWord += cNew; } i++; } return sNewWord.replace(/eau/g, "o").replace(/au/g, "o").replace(/ai/g, "e").replace(/ei/g, "e").replace(/ph/g,
"f"); Index: graphspell-js/ibdawg.js ================================================================== --- graphspell-js/ibdawg.js +++ graphspell-js/ibdawg.js @@ -208,10 +208,11 @@ return oJSON; } isValidToken (sToken) { // checks if sToken is valid (if there is hyphens in sToken, sToken is split, each part is checked) + sToken = char_player.spellingNormalization(sToken) if (this.isValid(sToken)) { return true; } if (sToken.includes("-")) { if (sToken.gl_count("-") > 4) { @@ -278,10 +279,11 @@ return Boolean(this._convBytesToInteger(this.byDic.slice(iAddr, iAddr+this.nBytesArc)) & this._finalNodeMask); } getMorph (sWord) { // retrieves morphologies list, different casing allowed + sWord = char_player.spellingNormalization(sWord) let l = this.morph(sWord); if (sWord[0].gl_isUpperCase()) { l.push(...this.morph(sWord.toLowerCase())); if (sWord.gl_isUpperCase() && sWord.length > 1) { l.push(...this.morph(sWord.gl_toCapitalize())); @@ -290,10 +292,11 @@ return l; } suggest (sWord, nSuggLimit=10) { // returns a array of suggestions for <sWord> + sWord = char_player.spellingNormalization(sWord) let sPfx = ""; let sSfx = ""; [sPfx, sWord, sSfx] = char_player.cut(sWord); let nMaxSwitch = Math.max(Math.floor(sWord.length / 3), 1); let nMaxDel = Math.floor(sWord.length / 5); Index: graphspell/char_player.py ================================================================== --- graphspell/char_player.py +++ graphspell/char_player.py @@ -1,24 +1,34 @@ # list of similar chars # useful for suggestion mechanism import re +import unicodedata + + +_xTransCharsForSpelling = str.maketrans({ + 'ſ': 's', 'ﬃ': 'ffi', 'ﬄ': 'ffl', 'ﬀ': 'ff', 'ﬅ': 'ft', 'ﬁ': 'fi', 'ﬂ': 'fl', 'ﬆ': 'st' +}) + +def spellingNormalization (sWord): + return unicodedata.normalize("NFC", sWord.translate(_xTransCharsForSpelling)) -_xTransChars = str.maketrans({ +_xTransCharsForSimplification = str.maketrans({ 'à': 'a', 'é': 'e', 'î': 'i', 'ô': 'o', 'û': 'u', 'ÿ': 'i', "y": "i", 'â': 'a', 'è': 'e', 'ï': 'i', 'ö':
'o', 'ù': 'u', 'ŷ': 'i', 'ä': 'a', 'ê': 'e', 'í': 'i', 'ó': 'o', 'ü': 'u', 'ý': 'i', 'á': 'a', 'ë': 'e', 'ì': 'i', 'ò': 'o', 'ú': 'u', 'ỳ': 'i', 'ā': 'a', 'ē': 'e', 'ī': 'i', 'ō': 'o', 'ū': 'u', 'ȳ': 'i', 'ñ': 'n', 'k': 'q', 'w': 'v', - 'œ': 'oe', 'æ': 'ae', + 'œ': 'oe', 'æ': 'ae', + 'ſ': 's', 'ﬃ': 'ffi', 'ﬄ': 'ffl', 'ﬀ': 'ff', 'ﬅ': 'ft', 'ﬁ': 'fi', 'ﬂ': 'fl', 'ﬆ': 'st', }) def simplifyWord (sWord): "word simplication before calculating distance between words" - sWord = sWord.lower().translate(_xTransChars) + sWord = sWord.lower().translate(_xTransCharsForSimplification) sNewWord = "" for i, c in enumerate(sWord, 1): if c != sWord[i:i+1]: sNewWord += c return sNewWord.replace("eau", "o").replace("au", "o").replace("ai", "e").replace("ei", "e").replace("ph", "f") Index: graphspell/ibdawg.py ================================================================== --- graphspell/ibdawg.py +++ graphspell/ibdawg.py @@ -216,10 +216,11 @@ if bInJSModule: hDst.write(";\n\nexports.dictionary = dictionary;\n") def isValidToken (self, sToken): "checks if <sToken> is valid (if there is hyphens in <sToken>, <sToken> is split, each part is checked)" + sToken = cp.spellingNormalization(sToken) if self.isValid(sToken): return True if "-" in sToken: if sToken.count("-") > 4: return True @@ -258,10 +259,11 @@ return False return bool(int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask) def getMorph (self, sWord): "retrieves morphologies list, different casing allowed" + sWord = cp.spellingNormalization(sWord) l = self.morph(sWord) if sWord[0:1].isupper(): l.extend(self.morph(sWord.lower())) if sWord.isupper() and len(sWord) > 1: l.extend(self.morph(sWord.capitalize())) @@ -268,10 +270,11 @@ return l #@timethis def suggest (self, sWord, nSuggLimit=10): "returns a set of suggestions for <sWord>" + sWord = cp.spellingNormalization(sWord) sPfx, sWord, sSfx = cp.cut(sWord) nMaxSwitch = max(len(sWord) // 3, 1) nMaxDel = len(sWord) // 5 nMaxHardRepl = max((len(sWord) - 5) // 4,
1) oSuggResult = SuggResult(sWord) @@ -328,10 +331,11 @@ self._suggest(oSuggResult, sRepl, nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, True) #@timethis def suggest2 (self, sWord, nMaxSugg=10): "returns a set of suggestions for <sWord>" + sWord = cp.spellingNormalization(sWord) sPfx, sWord, sSfx = cp.cut(sWord) oSuggResult = SuggResult(sWord) self._suggest2(oSuggResult) aSugg = oSuggResult.getSuggestions() if sSfx or sPfx: @@ -384,10 +388,11 @@ aTails.update(self._getTails(jAddr, sTail+self.dCharVal[nVal], n-1)) return aTails def drawPath (self, sWord, iAddr=0): "show the path taken by <sWord> in the graph" + sWord = cp.spellingNormalization(sWord) c1 = sWord[0:1] if sWord else " " iPos = -1 n = 0 print(c1 + ": ", end="") for c2, jAddr in self._getCharArcs(iAddr):