Index: cli.py ================================================================== --- cli.py +++ cli.py @@ -208,10 +208,11 @@ echo(" {:<32} {}".format(sMorph, oLexGraphe.formatTags(sMorph))) elif sText.startswith("!"): for sWord in sText[1:].strip().split(): if sWord: echo(" | ".join(oDict.suggest(sWord))) + #echo(" | ".join(oDict.suggest2(sWord))) elif sText.startswith(">"): oDict.drawPath(sText[1:].strip()) elif sText.startswith("="): for sRes in oDict.select(sText[1:].strip()): echo(sRes) Index: gc_core/js/char_player.js ================================================================== --- gc_core/js/char_player.js +++ gc_core/js/char_player.js @@ -24,76 +24,13 @@ sRes += this._dTransChars.gl_get(c, c); } return sRes.replace("eau", "o").replace("au", "o"); }, - distanceDamerauLevenshtein: function (s1, s2) { - // distance of Damerau-Levenshtein between and - // https://fr.wikipedia.org/wiki/Distance_de_Damerau-Levenshtein - try { - let nLen1 = s1.length; - let nLen2 = s2.length; - let matrix = []; - for (let i = 0; i <= nLen1; i++) { - matrix[i] = new Array(nLen2 + 1); - } - for (let i = 0; i <= nLen1; i++) { - matrix[i][0] = i; - } - for (let j = 0; j <= nLen2; j++) { - matrix[0][j] = j; - } - for (let i = 1; i <= nLen1; i++) { - for (let j = 1; j <= nLen2; j++) { - let nCost = (s1[i] === s2[j]) ? 0 : 1; - matrix[i][j] = Math.min( - matrix[i-1][j] + 1, // Deletion - matrix[i][j-1] + 1, // Insertion - matrix[i-1][j-1] + nCost // Substitution - ); - if (i > 1 && j > 1 && s1[i] == s2[j-1] && s1[i-1] == s2[j]) { - matrix[i][j] = Math.min(matrix[i][j], matrix[i-2][j-2] + nCost); // Transposition - } - } - } - //console.log(s2 + ": " + matrix[nLen1][nLen2]); - return matrix[nLen1][nLen2]; - } - catch (e) { - helpers.logerror(e); - } - }, - - showDistance (s1, s2) { - let s1b = this.cleanWord(s1); - let s2b = this.cleanWord(s2); - console.log(`Distance: ${s1} / ${s2} = ${this.distanceDamerauLevenshtein(s1, s2)})`); - console.log(`Distance: ${s1b} / ${s2b} = ${this.distanceDamerauLevenshtein(s1b, s2b)})`); - }, - - // Method: Remove Useless Chars - - aVovels: new Set([ - 'a', 'e', 'i', 'o', 'u', 'y', - 'à', 'é', 'î', 'ô', 'û', 'ÿ', - 'â', 'è', 'ï', 'ö', 'ù', 'ŷ', - 'ä', 'ê', 'í', 'ó', 'ü', 'ý', - 'á', 'ë', 'ì', 'ò', 'ú', 'ỳ', - 'ā', 'ē', 'ī', 'ō', 'ū', 'ȳ', - 'h', 'œ', 'æ' - ]), - - shrinkWord: function (sWord) { - // remove vovels and h - let sRes = ""; - for (let cChar of sWord.slice(1)) { - if (!this.aVovels.has(cChar)) { - sRes += cChar; - } - } - return sWord.slice(0, 1).replace("h", "") + sRes; - }, + aVowel: new Set("aáàâäāeéèêëēiíìîïīoóòôöōuúùûüūyýỳŷÿȳœæAÁÀÂÄĀEÉÈÊËĒIÍÌÎÏĪOÓÒÔÖŌUÚÙÛÜŪYÝỲŶŸȲŒÆ"), + aConsonant: new Set("bcçdfghjklmnñpqrstvwxzBCÇDFGHJKLMNÑPQRSTVWXZ"), + aDouble: new Set("bcçdfjklmnprstzBCÇDFJKLMNPRSTZ"), // letters that may be used twice successively // Similar chars d1to1: new Map([ @@ -225,20 +162,18 @@ ["Æ", ["AE",]], ["b", ["bb",]], ["B", ["BB",]], ["c", ["cc", "ss", "qu", "ch"]], ["C", ["CC", "SS", "QU", "CH"]], - ["ç", ["ss", "cc", "qh", "ch"]], - ["Ç", ["SS", "CC", "QH", "CH"]], ["d", ["dd",]], ["D", ["DD",]], + ["é", ["ai", "ei"]], + ["É", ["AI", "EI"]], ["f", ["ff", "ph"]], ["F", ["FF", "PH"]], ["g", ["gu", "ge", "gg", "gh"]], ["G", ["GU", "GE", "GG", "GH"]], - ["i", ["ii",]], - ["I", ["II",]], ["j", ["jj", "dj"]], ["J", ["JJ", "DJ"]], ["k", ["qu", "ck", "ch", "cu", "kk", "kh"]], ["K", ["QU", "CK", "CH", "CU", "KK", "KH"]], ["l", ["ll",]], @@ -264,10 +199,17 @@ ["x", ["cc", "ct", "xx"]], ["X", ["CC", "CT", "XX"]], ["z", ["ss", "zh"]], ["Z", ["SS", "ZH"]], ]), + + get1toXReplacement: function (cPrev, cCur, cNext) { + if (this.aConsonant.has(cCur) && (this.aConsonant.has(cPrev) || this.aConsonant.has(cNext))) { + return []; + } + return this.d1toX.gl_get(cCur, []); + }, d2toX: new Map([ ["an", ["en",]], ["AN", ["EN",]], ["au", ["eau", "o", "ô"]], Index: gc_core/js/ibdawg.js ================================================================== --- gc_core/js/ibdawg.js +++ gc_core/js/ibdawg.js @@ -15,10 +15,73 @@ // Don’t remove . Necessary in TB. ${string} ${map} ${set} + +class SuggResult { + // Structure for storing, classifying and filtering suggestions + + constructor (sWord, nDistLimit=-1) { + this.sWord = sWord; + this.sCleanWord = char_player.cleanWord(sWord); + this.nDistLimit = (nDistLimit >= 0) ? nDistLimit : Math.floor(sWord.length / 3) + 1; + this.nMinDist = 1000; + this.aSugg = new Set(); + this.dSugg = new Map([ [0, []], [1, []] ]); + } + + addSugg (sSugg, nDeep=0) { + // add a suggestion + if (!this.aSugg.has(sSugg)) { + let nDist = str_transform.distanceDamerauLevenshtein(this.sCleanWord, char_player.cleanWord(sSugg)); + if (nDist <= this.nDistLimit) { + if (!this.dSugg.has(nDist)) { + this.dSugg.set(nDist, []); + } + this.dSugg.get(nDist).push(sSugg); + this.aSugg.add(sSugg); + if (nDist < this.nMinDist) { + this.nMinDist = nDist; + } + this.nDistLimit = Math.min(this.nDistLimit, this.nMinDist+2); + } + } + } + + getSuggestions (nSuggLimit=10, nDistLimit=-1) { + // return a list of suggestions + let lRes = []; + if (this.dSugg.get(0).length) { + // we sort the better results with the original word + let dDistTemp = new Map(); + lRes.forEach((sSugg) => { dDistTemp.set(sSugg, str_transform.distanceDamerauLevenshtein(this.sWord, sSugg)); }); + lRes = lRes.sort((sA, sB) => { return dDistTemp.get(sA) - dDistTemp.get(sB); }); + dDistTemp.clear(); + } + for (let lSugg of this.dSugg.values()) { + for (let sSugg of lSugg) { lRes.push(sSugg); } + if (lRes.length > nSuggLimit) { + break; + } + } + lRes = char_player.filterSugg(lRes); + if (this.sWord.gl_isTitle()) { + lRes = lRes.map((sSugg) => { return sSugg.gl_toCapitalize(); }); + } + else if (this.sWord.gl_isUpperCase()) { + lRes = lRes.map((sSugg) => { return sSugg.toUpperCase(); }); + } + return lRes.slice(0, nSuggLimit); + } + + reset () { + this.aSugg.clear(); + this.dSugg.clear(); + } +} + class IBDAWG { // INDEXABLE BINARY DIRECT ACYCLIC WORD GRAPH constructor (sDicName, sPath="") { @@ -191,93 +254,94 @@ suggest (sWord, nMaxSugg=10) { // returns a array of suggestions for let sPfx = ""; let sSfx = ""; [sPfx, sWord, sSfx] = char_player.cut(sWord); + let nMaxSwitch = Math.max(Math.floor(sWord.length / 3), 1); let nMaxDel = Math.floor(sWord.length / 5); let nMaxHardRepl = Math.max(Math.floor((sWord.length - 5) / 4), 1); - let aSugg = this._suggest(sWord, nMaxDel, nMaxHardRepl); + let oSuggResult = new SuggResult(sWord); + this._suggest(oSuggResult, sWord, nMaxSwitch, nMaxDel, nMaxHardRepl); if (sWord.gl_isTitle()) { - aSugg.gl_update(this._suggest(sWord.toLowerCase(), nMaxDel, nMaxHardRepl)); + this._suggest(oSuggResult, sWord.toLowerCase(), nMaxSwitch, nMaxDel, nMaxHardRepl); } else if (sWord.gl_isLowerCase()) { - aSugg.gl_update(this._suggest(sWord.gl_toCapitalize(), nMaxDel, nMaxHardRepl)); - } - // Set to Array - aSugg = Array.from(aSugg); - aSugg = char_player.filterSugg(aSugg); - if (sWord.gl_isTitle()) { - aSugg = aSugg.map((sSugg) => { return sSugg.gl_toCapitalize(); }); - } - let dDistTemp = new Map(); - let sCleanWord = char_player.cleanWord(sWord); - aSugg.forEach((sSugg) => { dDistTemp.set(sSugg, char_player.distanceDamerauLevenshtein(sCleanWord, char_player.cleanWord(sSugg))); }); - aSugg = aSugg.sort((sA, sB) => { return dDistTemp.get(sA) - dDistTemp.get(sB); }).slice(0, nMaxSugg); - dDistTemp.clear(); + this._suggest(oSuggResult, sWord.gl_toCapitalize(), nMaxSwitch, nMaxDel, nMaxHardRepl); + } + let aSugg = oSuggResult.getSuggestions(); if (sSfx || sPfx) { // we add what we removed return aSugg.map( (sSugg) => { return sPfx + sSugg + sSfx } ); } return aSugg; } - _suggest (sRemain, nMaxDel=0, nMaxHardRepl=0, nDeep=0, iAddr=0, sNewWord="", bAvoidLoop=false) { + _suggest (oSuggResult, sRemain, nMaxSwitch=0, nMaxDel=0, nMaxHardRepl=0, nDeep=0, iAddr=0, sNewWord="", bAvoidLoop=false) { // returns a set of suggestions // recursive function - let aSugg = new Set(); if (sRemain == "") { if (this._convBytesToInteger(this.byDic.slice(iAddr, iAddr+this.nBytesArc)) & this._finalNodeMask) { - aSugg.add(sNewWord); + oSuggResult.addSugg(sNewWord); } for (let sTail of this._getTails(iAddr)) { - aSugg.add(sNewWord+sTail); + oSuggResult.addSugg(sNewWord+sTail); } - return aSugg; + return; } let cCurrent = sRemain.slice(0, 1); - for (let [cChar, jAddr] of this._getSimilarArcs(cCurrent, iAddr)) { - aSugg.gl_update(this._suggest(sRemain.slice(1), nMaxDel, nMaxHardRepl, nDeep+1, jAddr, sNewWord+cChar)); + for (let [cChar, jAddr] of this._getSimilarCharArcs(cCurrent, iAddr)) { + this._suggest(oSuggResult, sRemain.slice(1), nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, jAddr, sNewWord+cChar); } if (!bAvoidLoop) { // avoid infinite loop - if (cCurrent == sRemain.slice(1, 2)) { - // same char, we remove 1 char without adding 1 to - aSugg.gl_update(this._suggest(sRemain.slice(1), nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord)); - } - else { - // switching chars - aSugg.gl_update(this._suggest(sRemain.slice(1, 2)+sRemain.slice(0, 1)+sRemain.slice(2), nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, true)); - // delete char - if (nMaxDel > 0) { - aSugg.gl_update(this._suggest(sRemain.slice(1), nMaxDel-1, nMaxHardRepl, nDeep+1, iAddr, sNewWord, true)); - } - } - // Phonetic replacements - for (let sRepl of char_player.d1toX.gl_get(cCurrent, [])) { - aSugg.gl_update(this._suggest(sRepl + sRemain.slice(1), nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, true)); - } - for (let sRepl of char_player.d2toX.gl_get(sRemain.slice(0, 2), [])) { - aSugg.gl_update(this._suggest(sRepl + sRemain.slice(2), nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, true)); - } - // Hard replacements - if (nDeep > 3 && nMaxHardRepl && sRemain.length >= 2) { - for (let [nVal, kAddr] of this._getArcs1(iAddr)) { - if (this.dCharVal.has(nVal)) { - let cChar = this.dCharVal.get(nVal); + if (sRemain.length > 1) { + if (cCurrent == sRemain.slice(1, 2)) { + // same char, we remove 1 char without adding 1 to + this._suggest(oSuggResult, sRemain.slice(1), nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord); + } + else { + // switching chars + if (nMaxSwitch > 0) { + this._suggest(oSuggResult, sRemain.slice(1, 2)+sRemain.slice(0, 1)+sRemain.slice(2), nMaxSwitch-1, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, true); + } + // delete char + if (nMaxDel > 0) { + this._suggest(oSuggResult, sRemain.slice(1), nMaxSwitch, nMaxDel-1, nMaxHardRepl, nDeep+1, iAddr, sNewWord, true); + } + } + // Phonetic replacements + for (let sRepl of char_player.get1toXReplacement(sNewWord.slice(-1), cCurrent, sRemain.slice(1,2))) { + this._suggest(oSuggResult, sRepl + sRemain.slice(1), nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, true); + } + for (let sRepl of char_player.d2toX.gl_get(sRemain.slice(0, 2), [])) { + this._suggest(oSuggResult, sRepl + sRemain.slice(2), nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, true); + } + // Hard replacements + if (nDeep > 3 && nMaxHardRepl && sRemain.length >= 2) { + for (let [cChar, kAddr] of this._getCharArcs(iAddr)) { if (!char_player.d1to1.gl_get(cCurrent, "").includes(cChar)) { - aSugg.gl_update(this._suggest(sRemain.slice(1), nMaxDel, nMaxHardRepl-1, nDeep+1, kAddr, sNewWord+cChar, true)); + this._suggest(oSuggResult, sRemain.slice(1), nMaxSwitch, nMaxDel, nMaxHardRepl-1, nDeep+1, kAddr, sNewWord+cChar, true); } } } } // end of word if (sRemain.length == 2) { for (let sRepl of char_player.dFinal2.gl_get(sRemain, [])) { - aSugg.gl_update(this._suggest(sRepl, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, true)); + this._suggest(oSuggResult, sRepl, nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, true); } } else if (sRemain.length == 1) { - aSugg.gl_update(this._suggest("", nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, true)); // remove last char and go on + this._suggest(oSuggResult, "", nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, true); // remove last char and go on for (let sRepl of char_player.dFinal1.gl_get(sRemain, [])) { - aSugg.gl_update(this._suggest(sRepl, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, true)); + this._suggest(oSuggResult, sRepl, nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, true); } } } + } + + * _getCharArcs (iAddr) { + // generator: yield all chars and addresses from node at address + for (let [nVal, jAddr] of this._getArcs(iAddr)) { + if (nVal < this.nChar) { + yield [this.dCharVal.get(nVal), jAddr]; + } + } @@ -284,9 +348,8 @@ - return aSugg; } - * _getSimilarArcs (cChar, iAddr) { + * _getSimilarCharArcs (cChar, iAddr) { // generator: yield similar char of and address of the following node for (let c of char_player.d1to1.gl_get(cChar, [cChar])) { if (this.dChar.has(c)) { let jAddr = this._lookupArcNode(this.dChar.get(c), iAddr); if (jAddr) { Index: gc_core/js/str_transform.js ================================================================== --- gc_core/js/str_transform.js +++ gc_core/js/str_transform.js @@ -2,10 +2,52 @@ /*jslint esversion: 6*/ // Note: 48 is the ASCII code for "0" var str_transform = { + + distanceDamerauLevenshtein: function (s1, s2) { + // distance of Damerau-Levenshtein between and + // https://fr.wikipedia.org/wiki/Distance_de_Damerau-Levenshtein + try { + let nLen1 = s1.length; + let nLen2 = s2.length; + let matrix = []; + for (let i = 0; i <= nLen1; i++) { + matrix[i] = new Array(nLen2 + 1); + } + for (let i = 0; i <= nLen1; i++) { + matrix[i][0] = i; + } + for (let j = 0; j <= nLen2; j++) { + matrix[0][j] = j; + } + for (let i = 1; i <= nLen1; i++) { + for (let j = 1; j <= nLen2; j++) { + let nCost = (s1[i] === s2[j]) ? 0 : 1; + matrix[i][j] = Math.min( + matrix[i-1][j] + 1, // Deletion + matrix[i][j-1] + 1, // Insertion + matrix[i-1][j-1] + nCost // Substitution + ); + if (i > 1 && j > 1 && s1[i] == s2[j-1] && s1[i-1] == s2[j]) { + matrix[i][j] = Math.min(matrix[i][j], matrix[i-2][j-2] + nCost); // Transposition + } + } + } + //console.log(s2 + ": " + matrix[nLen1][nLen2]); + return matrix[nLen1][nLen2]; + } + catch (e) { + helpers.logerror(e); + } + }, + + showDistance (s1, s2) { + console.log(`Distance: ${s1} / ${s2} = ${this.distanceDamerauLevenshtein(s1, s2)})`); + }, + getStemFromSuffixCode: function (sFlex, sSfxCode) { // Suffix only if (sSfxCode == "0") { return sFlex; } Index: gc_core/py/char_player.py ================================================================== --- gc_core/py/char_player.py +++ gc_core/py/char_player.py @@ -17,61 +17,13 @@ def cleanWord (sWord): "word simplication before calculating distance between words" return sWord.lower().translate(_xTransChars).replace("eau", "o").replace("au", "o") -def distanceDamerauLevenshtein (s1, s2): - "distance of Damerau-Levenshtein between and " - # https://fr.wikipedia.org/wiki/Distance_de_Damerau-Levenshtein - d = {} - nLen1 = len(s1) - nLen2 = len(s2) - for i in range(-1, nLen1+1): - d[i, -1] = i + 1 - for j in range(-1, nLen2+1): - d[-1, j] = j + 1 - for i in range(nLen1): - for j in range(nLen2): - nCost = 0 if s1[i] == s2[j] else 1 - d[i, j] = min( - d[i-1, j] + 1, # Deletion - d[i, j-1] + 1, # Insertion - d[i-1, j-1] + nCost, # Substitution - ) - if i and j and s1[i] == s2[j-1] and s1[i-1] == s2[j]: - d[i, j] = min(d[i, j], d[i-2, j-2] + nCost) # Transposition - return d[nLen1-1, nLen2-1] - - -def showDistance (s1, s2): - s1b = cleanWord(s1); - s2b = cleanWord(s2); - print(f"Distance: {s1} / {s2} = {distanceDamerauLevenshtein(s1, s2)}") - print(f"Distance: {s1b} / {s2b} = {distanceDamerauLevenshtein(s1b, s2b)}") - - -# Method: Remove Useless Chars - -_dVovels = { - 'a': '', 'e': '', 'i': '', 'o': '', 'u': '', 'y': '', - 'à': '', 'é': '', 'î': '', 'ô': '', 'û': '', 'ÿ': '', - 'â': '', 'è': '', 'ï': '', 'ö': '', 'ù': '', 'ŷ': '', - 'ä': '', 'ê': '', 'í': '', 'ó': '', 'ü': '', 'ý': '', - 'á': '', 'ë': '', 'ì': '', 'ò': '', 'ú': '', 'ỳ': '', - 'ā': '', 'ē': '', 'ī': '', 'ō': '', 'ū': '', 'ȳ': '', - 'h': '', 'œ': '', 'æ': '' - } - -_xTransVovels = str.maketrans(_dVovels) - - -aVovels = frozenset(_dVovels.keys()) - - -def shrinkWord (sWord): - "remove vovels and h" - return sWord[0:1].replace("h", "") + sWord[1:].translate(_xTransVovels) +aVowel = set("aáàâäāeéèêëēiíìîïīoóòôöōuúùûüūyýỳŷÿȳœæAÁÀÂÄĀEÉÈÊËĒIÍÌÎÏĪOÓÒÔÖŌUÚÙÛÜŪYÝỲŶŸȲŒÆ") +aConsonant = set("bcçdfghjklmnñpqrstvwxzBCÇDFGHJKLMNÑPQRSTVWXZ") +aDouble = set("bcçdfjklmnprstzBCÇDFJKLMNPRSTZ") # letters that may be used twice successively # Similar chars d1to1 = { @@ -203,20 +155,18 @@ "Æ": ("AE",), "b": ("bb",), "B": ("BB",), "c": ("cc", "ss", "qu", "ch"), "C": ("CC", "SS", "QU", "CH"), - "ç": ("ss", "cc", "qh", "ch"), - "Ç": ("SS", "CC", "QH", "CH"), "d": ("dd",), "D": ("DD",), + "é": ("ai", "ei"), + "É": ("AI", "EI"), "f": ("ff", "ph"), "F": ("FF", "PH"), "g": ("gu", "ge", "gg", "gh"), "G": ("GU", "GE", "GG", "GH"), - "i": ("ii",), - "I": ("II",), "j": ("jj", "dj"), "J": ("JJ", "DJ"), "k": ("qu", "ck", "ch", "cu", "kk", "kh"), "K": ("QU", "CK", "CH", "CU", "KK", "KH"), "l": ("ll",), @@ -242,10 +192,17 @@ "x": ("cc", "ct", "xx"), "X": ("CC", "CT", "XX"), "z": ("ss", "zh"), "Z": ("SS", "ZH"), } + + +def get1toXReplacement (cPrev, cCur, cNext): + if cCur in aConsonant and (cPrev in aConsonant or cNext in aConsonant): + return () + return d1toX.get(cCur, ()) + d2toX = { "an": ("en",), "AN": ("EN",), "au": ("eau", "o", "ô"), Index: gc_core/py/ibdawg.py ================================================================== --- gc_core/py/ibdawg.py +++ gc_core/py/ibdawg.py @@ -6,12 +6,12 @@ import re from itertools import chain from functools import wraps import time -#import logging -#logging.basicConfig(filename="suggestions.log", level=logging.DEBUG) +import logging +logging.basicConfig(filename="suggestions.log", level=logging.DEBUG) from . import str_transform as st from . import char_player as cp from .echo import echo @@ -25,10 +25,57 @@ fEnd = time.time() print(func.__name__, fEnd - fStart) return result return wrapper + +class SuggResult: + """Structure for storing, classifying and filtering suggestions""" + + def __init__ (self, sWord, nDistLimit=-1): + self.sWord = sWord + self.sCleanWord = cp.cleanWord(sWord) + self.nDistLimit = nDistLimit if nDistLimit >= 0 else (len(sWord) // 3) + 1 + self.nMinDist = 1000 + self.aSugg = set() + self.dSugg = { 0: [], 1: [] } + + def addSugg (self, sSugg, nDeep=0): + "add a suggestion" + #logging.info((nDeep * " ") + "__" + sSugg + "__") + if sSugg not in self.aSugg: + nDist = st.distanceDamerauLevenshtein(self.sCleanWord, cp.cleanWord(sSugg)) + if nDist <= self.nDistLimit: + if nDist not in self.dSugg: + self.dSugg[nDist] = [] + self.dSugg[nDist].append(sSugg) + self.aSugg.add(sSugg) + if nDist < self.nMinDist: + self.nMinDist = nDist + self.nDistLimit = min(self.nDistLimit, self.nMinDist+2) + + def getSuggestions (self, nSuggLimit=10, nDistLimit=-1): + "return a list of suggestions" + lRes = [] + if self.dSugg[0]: + # we sort the better results with the original word + self.dSugg[0].sort(key=lambda sSugg: st.distanceDamerauLevenshtein(self.sWord, sSugg)) + for lSugg in self.dSugg.values(): + lRes.extend(lSugg) + if len(lRes) > nSuggLimit: + break + lRes = list(cp.filterSugg(lRes)) + if self.sWord.istitle(): + lRes = list(map(lambda sSugg: sSugg.title(), lRes)) + elif self.sWord.isupper(): + lRes = list(map(lambda sSugg: sSugg.upper(), lRes)) + return lRes[:nSuggLimit] + + def reset (self): + self.aSugg.clear() + self.dSugg.clear() + class IBDAWG: """INDEXABLE BINARY DIRECT ACYCLIC WORD GRAPH""" def __init__ (self, sDicName): @@ -202,81 +249,114 @@ @timethis def suggest (self, sWord, nMaxSugg=10): "returns a set of suggestions for " sPfx, sWord, sSfx = cp.cut(sWord) + nMaxSwitch = max(len(sWord) // 3, 1) nMaxDel = len(sWord) // 5 nMaxHardRepl = max((len(sWord) - 5) // 4, 1) - aSugg = self._suggest(sWord, nMaxDel=nMaxDel, nMaxHardRepl=nMaxHardRepl) + oSuggResult = SuggResult(sWord) + self._suggest(oSuggResult, sWord, nMaxSwitch=nMaxSwitch, nMaxDel=nMaxDel, nMaxHardRepl=nMaxHardRepl) if sWord.istitle(): - aSugg.update(self._suggest(sWord.lower(), nMaxDel=nMaxDel, nMaxHardRepl=nMaxHardRepl)) - aSugg = set(map(lambda sSugg: sSugg.title(), aSugg)) + self._suggest(oSuggResult, sWord.lower(), nMaxSwitch=nMaxSwitch, nMaxDel=nMaxDel, nMaxHardRepl=nMaxHardRepl) elif sWord.islower(): - aSugg.update(self._suggest(sWord.title(), nMaxDel=nMaxDel, nMaxHardRepl=nMaxHardRepl)) - aSugg = cp.filterSugg(aSugg) - sCleanWord = cp.cleanWord(sWord) - aSugg = sorted(aSugg, key=lambda sSugg: cp.distanceDamerauLevenshtein(sCleanWord, cp.cleanWord(sSugg)))[:nMaxSugg] + self._suggest(oSuggResult, sWord.title(), nMaxSwitch=nMaxSwitch, nMaxDel=nMaxDel, nMaxHardRepl=nMaxHardRepl) + aSugg = oSuggResult.getSuggestions() + if sSfx or sPfx: + # we add what we removed + return list(map(lambda sSug: sPfx + sSug + sSfx, aSugg)) + return aSugg + + def _suggest (self, oSuggResult, sRemain, nMaxSwitch=0, nMaxDel=0, nMaxHardRepl=0, nDeep=0, iAddr=0, sNewWord="", sAction="", bAvoidLoop=False): + # recursive function + #logging.info((nDeep * " ") + sNewWord + ":" + sRemain + " · " + sAction) + if not sRemain: + if int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask: + oSuggResult.addSugg(sNewWord, nDeep) + for sTail in self._getTails(iAddr): + oSuggResult.addSugg(sNewWord+sTail, nDeep) + return + cCurrent = sRemain[0:1] + for cChar, jAddr in self._getSimilarCharArcs(cCurrent, iAddr): + self._suggest(oSuggResult, sRemain[1:], nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, jAddr, sNewWord+cChar, "*") + if not bAvoidLoop: # avoid infinite loop + if len(sRemain) > 1: + if cCurrent == sRemain[1:2]: + # same char, we remove 1 char without adding 1 to + self._suggest(oSuggResult, sRemain[1:], nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, cCurrent+"/2") + else: + # switching chars + if nMaxSwitch: + self._suggest(oSuggResult, sRemain[1:2]+sRemain[0:1]+sRemain[2:], nMaxSwitch-1, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, "><",True) + # delete char + if nMaxDel: + self._suggest(oSuggResult, sRemain[1:], nMaxSwitch, nMaxDel-1, nMaxHardRepl, nDeep+1, iAddr, sNewWord, "-"+cCurrent, True) + # Phonetic replacements + for sRepl in cp.get1toXReplacement(sNewWord[-1:], cCurrent, sRemain[1:2]): + self._suggest(oSuggResult, sRepl + sRemain[1:], nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, cCurrent+">"+sRepl, True) + for sRepl in cp.d2toX.get(sRemain[0:2], ()): + self._suggest(oSuggResult, sRepl + sRemain[2:], nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, sRemain[0:2]+">"+sRepl, True) + # Hard replacements + if nDeep > 3 and nMaxHardRepl: + for cChar, kAddr in self._getCharArcs(iAddr): + if cChar not in cp.d1to1.get(cCurrent, ""): + self._suggest(oSuggResult, sRemain[1:], nMaxSwitch, nMaxDel, nMaxHardRepl-1, nDeep+1, kAddr, sNewWord+cChar, "[["+cChar+"]]", True) + # end of word + if len(sRemain) == 2: + for sRepl in cp.dFinal2.get(sRemain, ()): + self._suggest(oSuggResult, sRepl, nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, sRemain + " >> " + sRepl, True) + elif len(sRemain) == 1: + self._suggest(oSuggResult, "", nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, sRemain + " [last char removed] ", True) # remove last char and go on + for sRepl in cp.dFinal1.get(sRemain, ()): + self._suggest(oSuggResult, sRepl, nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, sRemain + " >> " + sRepl, True) + return + + @timethis + def suggest2 (self, sWord, nMaxSugg=10): + "returns a set of suggestions for " + sPfx, sWord, sSfx = cp.cut(sWord) + oSuggResult = SuggResult(sWord) + self._suggest2(oSuggResult) + aSugg = oSuggResult.getSuggestions() if sSfx or sPfx: # we add what we removed return list(map(lambda sSug: sPfx + sSug + sSfx, aSugg)) return aSugg - def _suggest (self, sRemain, nMaxDel=0, nMaxHardRepl=0, nDeep=0, iAddr=0, sNewWord="", bAvoidLoop=False): - "returns a set of suggestions" + def _suggest2 (self, oSuggResult, nDeep=0, iAddr=0, sNewWord=""): # recursive function - #logging.info((nDeep * " ") + sNewWord + ":" + sRemain) - aSugg = set() - if not sRemain: - if int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask: - #logging.info((nDeep * " ") + "__" + sNewWord + "__") - aSugg.add(sNewWord) - for sTail in self._getTails(iAddr): - #logging.info((nDeep * " ") + "__" + sNewWord+sTail + "__") - aSugg.add(sNewWord+sTail) - return aSugg - cCurrent = sRemain[0:1] - for cChar, jAddr in self._getSimilarArcs(cCurrent, iAddr): - aSugg.update(self._suggest(sRemain[1:], nMaxDel, nMaxHardRepl, nDeep+1, jAddr, sNewWord+cChar)) - if not bAvoidLoop: # avoid infinite loop - if cCurrent == sRemain[1:2]: - # same char, we remove 1 char without adding 1 to - aSugg.update(self._suggest(sRemain[1:], nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord)) - else: - # switching chars - aSugg.update(self._suggest(sRemain[1:2]+sRemain[0:1]+sRemain[2:], nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, True)) - # delete char - if nMaxDel > 0: - aSugg.update(self._suggest(sRemain[1:], nMaxDel-1, nMaxHardRepl, nDeep+1, iAddr, sNewWord, True)) - # Phonetic replacements - for sRepl in cp.d1toX.get(cCurrent, ()): - aSugg.update(self._suggest(sRepl + sRemain[1:], nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, True)) - for sRepl in cp.d2toX.get(sRemain[0:2], ()): - aSugg.update(self._suggest(sRepl + sRemain[2:], nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, True)) - # Hard replacements - if nDeep > 3 and nMaxHardRepl and len(sRemain) >= 2: - for nVal, kAddr in self._getArcs1(iAddr): - if nVal in self.dCharVal: - cChar = self.dCharVal[nVal] - if cChar not in cp.d1to1.get(cCurrent, ""): - aSugg.update(self._suggest(sRemain[1:], nMaxDel, nMaxHardRepl-1, nDeep+1, kAddr, sNewWord+cChar, True)) - # end of word - if len(sRemain) == 2: - for sRepl in cp.dFinal2.get(sRemain, ()): - aSugg.update(self._suggest(sRepl, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, True)) - elif len(sRemain) == 1: - aSugg.update(self._suggest("", nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, True)) # remove last char and go on - for sRepl in cp.dFinal1.get(sRemain, ()): - aSugg.update(self._suggest(sRepl, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, True)) - return aSugg - - def _getSimilarArcs (self, cChar, iAddr): + #logging.info((nDeep * " ") + sNewWord) + if nDeep >= oSuggResult.nDistLimit: + sCleanNewWord = cp.cleanWord(sNewWord) + if st.distanceSift4(oSuggResult.sCleanWord[:len(sCleanNewWord)], sCleanNewWord) > oSuggResult.nDistLimit: + return + if int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask: + oSuggResult.addSugg(sNewWord, nDeep) + for cChar, jAddr in self._getCharArcsWithPriority(iAddr, oSuggResult.sWord[nDeep:nDeep+1]): + self._suggest2(oSuggResult, nDeep+1, jAddr, sNewWord+cChar) + return + + def _getCharArcs (self, iAddr): + "generator: yield all chars and addresses from node at address " + for nVal, jAddr in self._getArcs(iAddr): + if nVal < self.nChar: + yield (self.dCharVal[nVal], jAddr) + + def _getSimilarCharArcs (self, cChar, iAddr): "generator: yield similar char of and address of the following node" for c in cp.d1to1.get(cChar, [cChar]): if c in self.dChar: jAddr = self._lookupArcNode(self.dChar[c], iAddr) if jAddr: yield (c, jAddr) + + def _getCharArcsWithPriority (self, iAddr, cChar): + if not cChar: + yield from self._getCharArcs(iAddr) + lTuple = list(self._getCharArcs(iAddr)) + lTuple.sort(key=lambda t: 0 if t[0] in cp.d1to1.get(cChar, cChar) else 1) + yield from lTuple def _getTails (self, iAddr, sTail="", n=2): "return a list of suffixes ending at a distance of from " aTails = set() for nVal, jAddr in self._getArcs(iAddr): @@ -287,21 +367,20 @@ aTails.update(self._getTails(jAddr, sTail+self.dCharVal[nVal], n-1)) return aTails def drawPath (self, sWord, iAddr=0): "show the path taken by in the graph" - cChar = sWord[0:1] if sWord else " " + c1 = sWord[0:1] if sWord else " " iPos = -1 n = 0 - print(cChar + ": ", end="") - for nVal, jAddr in self._getArcs(iAddr): - if nVal in self.dCharVal: - print(self.dCharVal[nVal], end="") - if self.dCharVal[nVal] == sWord[0:1]: - iNextNodeAddr = jAddr - iPos = n - n += 1 + print(c1 + ": ", end="") + for c2, jAddr in self._getCharArcs(iAddr): + print(c2, end="") + if c2 == sWord[0:1]: + iNextNodeAddr = jAddr + iPos = n + n += 1 if not sWord: return if iPos >= 0: print("\n "+ " " * iPos + "|") self.drawPath(sWord[1:], iNextNodeAddr) Index: gc_core/py/str_transform.py ================================================================== --- gc_core/py/str_transform.py +++ gc_core/py/str_transform.py @@ -1,7 +1,120 @@ #!python3 + +#### DISTANCE CALCULATIONS + +def longestCommonSubstring (s1, s2): + # http://en.wikipedia.org/wiki/Longest_common_substring_problem + # http://en.wikibooks.org/wiki/Algorithm_implementation/Strings/Longest_common_substring + M = [ [0]*(1+len(s2)) for i in range(1+len(s1)) ] + longest, x_longest = 0, 0 + for x in range(1, 1+len(s1)): + for y in range(1, 1+len(s2)): + if s1[x-1] == s2[y-1]: + M[x][y] = M[x-1][y-1] + 1 + if M[x][y] > longest: + longest = M[x][y] + x_longest = x + else: + M[x][y] = 0 + return s1[x_longest-longest : x_longest] + + +def distanceDamerauLevenshtein (s1, s2): + "distance of Damerau-Levenshtein between and " + # https://fr.wikipedia.org/wiki/Distance_de_Damerau-Levenshtein + d = {} + nLen1 = len(s1) + nLen2 = len(s2) + for i in range(-1, nLen1+1): + d[i, -1] = i + 1 + for j in range(-1, nLen2+1): + d[-1, j] = j + 1 + for i in range(nLen1): + for j in range(nLen2): + nCost = 0 if s1[i] == s2[j] else 1 + d[i, j] = min( + d[i-1, j] + 1, # Deletion + d[i, j-1] + 1, # Insertion + d[i-1, j-1] + nCost, # Substitution + ) + if i and j and s1[i] == s2[j-1] and s1[i-1] == s2[j]: + d[i, j] = min(d[i, j], d[i-2, j-2] + nCost) # Transposition + return d[nLen1-1, nLen2-1] + + +def distanceSift4 (s1, s2, nMaxOffset=5): + "implementation of general Sift4." + # https://siderite.blogspot.com/2014/11/super-fast-and-accurate-string-distance.html + if not s1: + return len(s2) + if not s2: + return len(s1) + nLen1, nLen2 = len(s1), len(s2) + i1, i2 = 0, 0 # Cursors for each string + nLargestCS = 0 # Largest common substring + nLocalCS = 0 # Local common substring + nTrans = 0 # Number of transpositions ('ab' vs 'ba') + lOffset = [] # Offset pair array, for computing the transpositions + + while i1 < nLen1 and i2 < nLen2: + if s1[i1] == s2[i2]: + nLocalCS += 1 + # Check if current match is a transposition + bTrans = False + i = 0 + while i < len(lOffset): + t = lOffset[i] + if i1 <= t[0] or i2 <= t[1]: + bTrans = abs(i2-i1) >= abs(t[1] - t[0]) + if bTrans: + nTrans += 1 + elif not t[2]: + t[2] = True + nTrans += 1 + break + elif i1 > t[1] and i2 > t[0]: + del lOffset[i] + else: + i += 1 + lOffset.append([i1, i2, bTrans]) + else: + nLargestCS += nLocalCS + nLocalCS = 0 + if i1 != i2: + i1 = i2 = min(i1, i2) + for i in range(nMaxOffset): + if i1 + i >= nLen1 and i2 + i >= nLen2: + break + elif i1 + i < nLen1 and s1[i1+i] == s2[i2]: + i1 += i - 1 + i2 -= 1 + break + elif i2 + i < nLen2 and s1[i1] == s2[i2+i]: + i2 += i - 1 + i1 -= 1 + break + i1 += 1 + i2 += 1 + if i1 >= nLen1 or i2 >= nLen2: + nLargestCS += nLocalCS + nLocalCS = 0 + i1 = i2 = min(i1, i2) + nLargestCS += nLocalCS + return round(max(nLen1, nLen2) - nLargestCS + nTrans) + + +def showDistance (s1, s2): + print(f"Damerau-Levenshtein: {s1} / {s2} = {distanceDamerauLevenshtein(s1, s2)}") + print(f"Sift4: {s1} / {s2} = {distanceSift4(s1, s2)}") + + + + +#### STEMMING OPERATIONS + ## No stemming def noStemming (sFlex, sStem): return sStem @@ -38,18 +151,20 @@ for i in range(min(len(sFlex), len(sStem))): if sFlex[i] != sStem[i]: break jSfx += 1 return chr(len(sFlex)-jSfx+48) + sStem[jSfx:] + def changeWordWithSuffixCode (sWord, sSfxCode): if sSfxCode == "0": return sWord return sWord[:-(ord(sSfxCode[0])-48)] + sSfxCode[1:] if sSfxCode[0] != '0' else sWord + sSfxCode[1:] # Prefix and suffix + def defineAffixCode (sFlex, sStem): """ Returns a string defining how to get stem from flexion. Examples: "0" if stem = flexion "stem" if no common substring "n(pfx)/m(sfx)" @@ -74,25 +189,10 @@ sAff = "{}/".format(chr(n+48)) if not sPfx else "{}{}/".format(chr(n+48), sPfx) sAff += chr(m+48) if not sSfx else "{}{}".format(chr(m+48), sSfx) return sAff return sStem -def longestCommonSubstring (s1, s2): - # http://en.wikipedia.org/wiki/Longest_common_substring_problem - # http://en.wikibooks.org/wiki/Algorithm_implementation/Strings/Longest_common_substring - M = [ [0]*(1+len(s2)) for i in range(1+len(s1)) ] - longest, x_longest = 0, 0 - for x in range(1, 1+len(s1)): - for y in range(1, 1+len(s2)): - if s1[x-1] == s2[y-1]: - M[x][y] = M[x-1][y-1] + 1 - if M[x][y] > longest: - longest = M[x][y] - x_longest = x - else: - M[x][y] = 0 - return s1[x_longest-longest : x_longest] def changeWordWithAffixCode (sWord, sAffCode): if sAffCode == "0": return sWord if '/' not in sAffCode: