Index: gc_lang/fr/modules/tests_modules.py ================================================================== --- gc_lang/fr/modules/tests_modules.py +++ gc_lang/fr/modules/tests_modules.py @@ -20,11 +20,11 @@ start = time.perf_counter() try: yield finally: end = time.perf_counter() - print('{} : {}'.format(label, end - start)) + print('{:<20} : {}'.format(label, end - start)) if hDst: hDst.write("{:<12.6}".format(end-start)) class TestDictionary (unittest.TestCase): @@ -49,13 +49,18 @@ def test_isvalid_failed (self): for sWord in ["BranchE", "BRanche", "BRAnCHE", "émilie", "éMILIE", "émiLie"]: self.assertFalse(self.oDic.isValid(sWord), sWord) def test_suggest (self): - for sWord in ["déelirranttesss", "vallidasion", "Emilie", "exibission"]: + for sWord in [ + "déelirranttesss", "vallidasion", "Emilie", "exibission", "ditirembique", "jai", "email", + "fatiqué", "coeur", "trèèèèèèèèès", "vraaaaiiiimeeeeennnt", "apele", "email", "Co2", + "emmppâiiiller", "testt", "apelaion", "exsepttion", "sintaxik", "ebriete", "ennormmement" + ]: with timeblock(sWord): - self.assertNotEqual(0, self.oDic.suggest(sWord)) + aSugg = self.oDic.suggest(sWord) + print(sWord, "->", " ".join(aSugg)) class TestConjugation (unittest.TestCase): "Tests des conjugaisons" Index: gc_lang/fr/perf_memo.text ================================================================== --- gc_lang/fr/perf_memo.text +++ gc_lang/fr/perf_memo.text @@ -28,7 +28,8 @@ 1.1 2019.05.16 09:42 1.50743 0.360923 0.261113 0.0749272 0.0763827 0.0771537 0.180504 0.102942 0.0182762 0.0021925 (×2, but new processor: AMD Ryzen 7 2700X) 1.2.1 2019.08.06 20:57 1.42886 0.358425 0.247356 0.0704405 0.0754886 0.0765604 0.177197 0.0988517 0.0188103 0.0020243 1.6.0 2020.01.03 20:22 1.38847 0.346214 0.240242 0.0709539 0.0737499 0.0748733 0.176477 0.0969171 0.0187857 0.0025143 (nouveau dictionnaire avec lemmes masculins) 1.9.0 2020.04.20 19:57 1.51183 0.369546 0.25681 0.0734314 0.0764396 0.0785668 0.183922 0.103674 0.0185812 0.002099 (NFC normalization) 1.9.2 2020.05.12 08:43 1.62465 0.398831 0.273012 0.0810811 0.080937 0.0845885 0.204133 0.114146 0.0212864 0.0029547 -1.12.2 2020.09.09 13:34 1.50568 0.374504 0.233108 0.0798712 0.0804466 0.0769674 0.171519 0.0945132 0.0165344 0.0019474 -1.12.2 2020.09.09 13:35 1.41094 0.359093 0.236443 0.06968 0.0734418 0.0738087 0.169371 0.0946279 0.0167106 0.0019773 +1.12.2 2020.09.09 13:34 1.50568 0.374504 0.233108 0.0798712 0.0804466 0.0769674 0.171519 0.0945132 0.0165344 0.0019474 +1.12.2 2020.09.09 13:35 1.41094 0.359093 0.236443 0.06968 0.0734418 0.0738087 0.169371 0.0946279 0.0167106 0.0019773 +1.12.2 2020.09.11 19:16 1.35297 0.330545 0.221731 0.0666998 0.0692539 0.0701707 0.160564 0.0891676 0.015807 0.0045998 Index: gc_lang/fr/webext/gce_worker.js ================================================================== --- gc_lang/fr/webext/gce_worker.js +++ gc_lang/fr/webext/gce_worker.js @@ -290,17 +290,26 @@ let dOptions = helpers.mapToObject(gc_engine.getOptions()); postMessage(createResponse("resetOptions", dOptions, oInfo, true)); } function tests () { - console.log(conj.getConj("devenir", ":E", ":2s")); + /*console.log(conj.getConj("devenir", ":E", ":2s")); console.log(mfsp.getMasForm("emmerdeuse", true)); console.log(mfsp.getMasForm("pointilleuse", false)); console.log(phonet.getSimil("est")); let aRes = gc_engine.parse("Je suit..."); for (let oErr of aRes) { console.log(text.getReadableError(oErr)); + }*/ + for (let sWord of ["fatiqué", "coeur", "trèèèèèèèèès", "vraaaaiiiimeeeeennnt", "apele", "email", "Co2", 
"emmppâiiiller", "testt", "apelaion", "exsepttion", "sintaxik", "ebriete", "ennormmement"]) { + console.time("Suggestions for " + sWord); + for (let aSugg of oSpellChecker.suggest(sWord)) { + if (aSugg.length) { + console.log(sWord + " -> ", aSugg.join(" ")); + } + } + console.timeEnd("Suggestions for " + sWord); } } function textToTest (sText, sCountry, bDebug, bContext, oInfo={}) { if (!gc_engine) { Index: graphspell-js/char_player.js ================================================================== --- graphspell-js/char_player.js +++ graphspell-js/char_player.js @@ -6,39 +6,43 @@ ${map} var char_player = { - + /* + oDistanceBetweenChars: + - with Jaro-Winkler, values between 1 and 10 + - with Damerau-Levenshtein, values / 10 (between 0 and 1: 0.1, 0.2 ... 0.9) + */ oDistanceBetweenChars: { - "a": {}, - "e": {"é": 0.5}, - "é": {"e": 0.5}, - "i": {"y": 0.2}, - "o": {}, - "u": {}, - "y": {"i": 0.3}, - "b": {"d": 0.8, "h": 0.9}, - "c": {"ç": 0.1, "k": 0.5, "q": 0.5, "s": 0.5, "x": 0.5, "z": 0.8}, - "d": {"b": 0.8}, - "f": {"v": 0.8}, - "g": {"j": 0.5}, - "h": {"b": 0.9}, - "j": {"g": 0.5, "i": 0.9}, - "k": {"c": 0.5, "q": 0.1, "x": 0.5}, - "l": {"i": 0.9}, - "m": {"n": 0.8}, - "n": {"m": 0.8, "r": 0.9}, - "p": {"q": 0.9}, - "q": {"c": 0.5, "k": 0.1, "p": 0.9}, - "r": {"n": 0.9, "j": 0.9}, - "s": {"c": 0.5, "ç": 0.1, "x": 0.5, "z": 0.5}, - "t": {"d": 0.9}, - "v": {"f": 0.8, "w": 0.1}, - "w": {"v": 0.1}, - "x": {"c": 0.5, "k": 0.5, "q": 0.5, "s": 0.5}, - "z": {"s": 0.5} + //"a": {}, + "e": {"é": 5}, + //"é": {"e": 5}, + "i": {"y": 2}, + //"o": {}, + //"u": {}, + "y": {"i": 3}, + "b": {"d": 8, "h": 9}, + "c": {"ç": 1, "k": 5, "q": 5, "s": 5, "x": 5, "z": 8}, + "d": {"b": 8}, + "f": {"v": 8}, + "g": {"j": 5}, + "h": {"b": 9}, + "j": {"g": 5, "i": 9}, + "k": {"c": 5, "q": 1, "x": 5}, + "l": {"i": 9}, + "m": {"n": 8}, + "n": {"m": 8, "r": 9}, + "p": {"q": 9}, + "q": {"c": 5, "k": 1, "p": 9}, + "r": {"n": 9, "j": 9}, + "s": {"c": 5, "ç": 1, "x": 5, "z": 5}, + "t": {"d": 9}, + "v": {"f": 8, "w": 1}, + "w": {"v": 1}, + "x": {"c": 5, "k": 5, "q": 5, "s": 5}, + "z": {"s": 5} }, distanceBetweenChars: function (c1, c2) { if (c1 == c2) { return 0; @@ -318,12 +322,12 @@ // End of word dFinal1: new Map([ ["a", ["as", "at", "ant", "ah"]], ["A", ["AS", "AT", "ANT", "AH"]], - ["c", ["ch",]], - ["C", ["CH",]], + ["c", ["ch", "que"]], + ["C", ["CH", "QUE"]], ["e", ["et", "er", "ets", "ée", "ez", "ai", "ais", "ait", "ent", "eh"]], ["E", ["ET", "ER", "ETS", "ÉE", "EZ", "AI", "AIS", "AIT", "ENT", "EH"]], ["é", ["et", "er", "ets", "ée", "ez", "ai", "ais", "ait"]], ["É", ["ET", "ER", "ETS", "ÉE", "EZ", "AI", "AIS", "AIT"]], ["è", ["et", "er", "ets", "ée", "ez", "ai", "ais", "ait"]], @@ -334,10 +338,12 @@ ["Ë", ["ET", "ER", "ETS", "ÉE", "EZ", "AI", "AIS", "AIT"]], ["g", ["gh",]], ["G", ["GH",]], ["i", ["is", "it", "ie", "in"]], ["I", ["IS", "IT", "IE", "IN"]], + ["k", ["que"]], + ["K", ["QUE"]], ["n", ["nt", "nd", "ns", "nh"]], ["N", ["NT", "ND", "NS", "NH"]], ["o", ["aut", "ot", "os"]], ["O", ["AUT", "OT", "OS"]], ["ô", ["aut", "ot", "os"]], Index: graphspell-js/ibdawg.js ================================================================== --- graphspell-js/ibdawg.js +++ graphspell-js/ibdawg.js @@ -20,76 +20,103 @@ class SuggResult { // Structure for storing, classifying and filtering suggestions - constructor (sWord, nDistLimit=-1) { + constructor (sWord, nSuggLimit=10, nDistLimit=-1) { this.sWord = sWord; this.sSimplifiedWord = str_transform.simplifyWord(sWord); this.nDistLimit = (nDistLimit >= 0) ? 
nDistLimit : Math.floor(sWord.length / 3) + 1;
        this.nMinDist = 1000;
-        this.aSugg = new Set();
-        this.dSugg = new Map([ [0, []], [1, []], [2, []] ]);
-        this.aAllSugg = new Set();  // all found words even those refused
+        // Temporary sets
+        this.aAllSugg = new Set();      // All suggestions, even the ones rejected
+        this.dGoodSugg = new Map();     // Acceptable suggestions
+        this.dBestSugg = new Map();     // Best suggestions
+        // Parameters
+        this.nSuggLimit = nSuggLimit;
+        this.nSuggLimitExt = nSuggLimit + 2;                    // we add a few entries in case suggestions merge after casing modifications
+        this.nBestSuggLimit = Math.floor(nSuggLimit * 1.5);     // n times the requested limit
+        this.nGoodSuggLimit = nSuggLimit * 15;                  // n times the requested limit
    }

-    addSugg (sSugg, nDeep=0) {
+    addSugg (sSugg) {
        // add a suggestion
        if (this.aAllSugg.has(sSugg)) {
            return;
        }
        this.aAllSugg.add(sSugg);
-        if (!this.aSugg.has(sSugg)) {
-            let nDist = Math.floor(str_transform.distanceDamerauLevenshtein(this.sSimplifiedWord, str_transform.simplifyWord(sSugg)));
-            if (nDist <= this.nDistLimit) {
-                if (sSugg.includes(" ")) { // add 1 to distance for split suggestions
-                    nDist += 1;
-                }
-                if (!this.dSugg.has(nDist)) {
-                    this.dSugg.set(nDist, []);
-                }
-                this.dSugg.get(nDist).push(sSugg);
-                this.aSugg.add(sSugg);
-                if (nDist < this.nMinDist) {
-                    this.nMinDist = nDist;
-                }
-                this.nDistLimit = Math.min(this.nDistLimit, this.nMinDist+1);
-            }
+        // Jaro-Winkler ranges from 0 to 1; 1 means the strings are identical
+        let nDistJaro = 1 - str_transform.distanceJaroWinkler(this.sSimplifiedWord, str_transform.simplifyWord(sSugg));
+        let nDist = Math.floor(nDistJaro * 10);
+        if (nDistJaro < .11) {          // Best suggestions
+            this.dBestSugg.set(sSugg, Math.round(nDistJaro*1000));
+            if (this.dBestSugg.size > this.nBestSuggLimit) {
+                this.nDistLimit = -1;   // make suggest() end the search
+            }
+        } else if (nDistJaro < .33) {   // Good suggestions
+            this.dGoodSugg.set(sSugg, Math.round(nDistJaro*1000));
+            if (this.dGoodSugg.size > this.nGoodSuggLimit) {
+                this.nDistLimit = -1;   // make suggest() end the search
+            }
+        } else {
+            if (nDist < this.nMinDist) {
+                this.nMinDist = nDist;
+            }
+            this.nDistLimit = Math.min(this.nDistLimit, this.nMinDist);
+        }
+        if (nDist <= this.nDistLimit) {
+            if (nDist < this.nMinDist) {
+                this.nMinDist = nDist;
+            }
+            this.nDistLimit = Math.min(this.nDistLimit, this.nMinDist+1);
        }
    }

-    getSuggestions (nSuggLimit=10) {
+    getSuggestions () {
        // return a list of suggestions
        let lRes = [];
-        let bFirstListSorted = false;
-        for (let [nDist, lSugg] of this.dSugg.entries()) {
-            if (nDist > this.nDistLimit) {
-                break;
-            }
-            if (!bFirstListSorted && lSugg.length > 1) {
-                lRes.sort((a, b) => { return str_transform.distanceDamerauLevenshtein(this.sWord, a) - str_transform.distanceDamerauLevenshtein(this.sWord, b); });
-                bFirstListSorted = true;
-            }
-            lRes.push(...lSugg);
-            if (lRes.length > nSuggLimit) {
-                break;
+        if (this.dBestSugg.size > 0) {
+            // sort only with simplified words
+            let lResTmp = [...this.dBestSugg.entries()].sort((a, b) => { return a[1] - b[1]; });
+            let nSize = Math.min(this.nSuggLimitExt, lResTmp.length);
+            for (let i=0; i < nSize; i++){
+                lRes.push(lResTmp[i][0]);
+            }
+        }
+        if (lRes.length < this.nSuggLimitExt) {
+            // sort with simplified words and original word
+            let lResTmp = [...this.dGoodSugg.entries()].sort((a, b) => {
+                // Low precision to rely more on simplified words
+                let nJaroA = Math.round(str_transform.distanceJaroWinkler(this.sWord, a[0]) * 10);
+                let nJaroB = Math.round(str_transform.distanceJaroWinkler(this.sWord, b[0]) * 10);
+                if (nJaroA == nJaroB) {
+ return a[1] - b[1]; // warning: both lists are NOT sorted the same way (key: a-b) + } else { + return nJaroB - nJaroA; // warning: both lists are NOT sorted the same way (key: b-a) + } + }).slice(0, this.nSuggLimitExt); + let nSize = Math.min(this.nSuggLimitExt, lResTmp.length); + for (let i=0; i < nSize; i++){ + lRes.push(lResTmp[i][0]); } } + // casing if (this.sWord.gl_isUpperCase()) { lRes = lRes.map((sSugg) => { return sSugg.toUpperCase(); }); lRes = [...new Set(lRes)]; } else if (this.sWord.slice(0,1).gl_isUpperCase()) { lRes = lRes.map((sSugg) => { return sSugg.slice(0,1).toUpperCase() + sSugg.slice(1); }); lRes = [...new Set(lRes)]; } - return lRes.slice(0, nSuggLimit); + return lRes.slice(0, this.nSuggLimit); } reset () { - this.aSugg.clear(); this.dSugg.clear(); + this.dGoodSugg.clear(); + this.dBestSugg.clear(); } } class IBDAWG { @@ -122,24 +149,10 @@ Properties: sName, nCompressionMethod, sHeader, lArcVal, nArcVal, sByDic, sLang, nChar, nBytesArc, nBytesNodeAddress, nEntry, nNode, nArc, nAff, cStemming, nTag, dChar, nBytesOffset, */ - /* - Bug workaround. - Mozilla’s JS parser sucks. Can’t read file bigger than 4 Mb! - So we convert huge hexadecimal string to list of numbers… - https://github.com/mozilla/addons-linter/issues/1361 - */ - let lTemp = []; - for (let i = 0; i < this.sByDic.length; i+=2) { - lTemp.push(parseInt(this.sByDic.slice(i, i+2), 16)); - } - this.byDic = lTemp; - //this.byDic = new Uint8Array(lTemp); // not quicker, even slower - /* end of bug workaround */ - if (!(this.sHeader.startsWith("/grammalecte-fsa/") || this.sHeader.startsWith("/pyfsa/"))) { throw TypeError("# Error. Not a grammalecte-fsa binary dictionary. Header: " + this.sHeader); } if (!(this.nCompressionMethod == 1 || this.nCompressionMethod == 2 || this.nCompressionMethod == 3)) { throw RangeError("# Error. Unknown dictionary compression method: " + this.nCompressionMethod); @@ -155,41 +168,45 @@ this.funcStemming = str_transform.changeWordWithAffixCode; } else { this.funcStemming = str_transform.noStemming; } + /* + Bug workaround. + Mozilla’s JS parser sucks. Can’t read file bigger than 4 Mb! 
+ So we convert huge hexadecimal string to list of numbers… + https://github.com/mozilla/addons-linter/issues/1361 + */ + /* + Performance trick: + Instead of converting bytes to integers each times we parse the binary dictionary, + we do it once, then parse the array + */ + let nAcc = 0; + let lBytesBuffer = []; + let lTemp = []; + let nDivisor = (this.nBytesArc + this.nBytesNodeAddress) / 2; + for (let i = 0; i < this.sByDic.length; i+=2) { + lBytesBuffer.push(parseInt(this.sByDic.slice(i, i+2), 16)); + if (nAcc == (this.nBytesArc - 1)) { + lTemp.push(this._convBytesToInteger(lBytesBuffer)); + lBytesBuffer = []; + } + else if (nAcc == (this.nBytesArc + this.nBytesNodeAddress - 1)) { + lTemp.push(Math.round(this._convBytesToInteger(lBytesBuffer) / nDivisor)); // Math.round should be useless, BUT with JS who knowns what can happen… + lBytesBuffer = []; + nAcc = -1; + } + nAcc = nAcc + 1; + } + this.byDic = lTemp; + /* end of bug workaround */ + this._arcMask = (2 ** ((this.nBytesArc * 8) - 3)) - 1; this._finalNodeMask = 1 << ((this.nBytesArc * 8) - 1); this._lastArcMask = 1 << ((this.nBytesArc * 8) - 2); - - // Configuring DAWG functions according to nCompressionMethod - switch (this.nCompressionMethod) { - case 1: - this.morph = this._morph1; - this.stem = this._stem1; - this._lookupArcNode = this._lookupArcNode1; - this._getArcs = this._getArcs1; - this._writeNodes = this._writeNodes1; - break; - case 2: - this.morph = this._morph2; - this.stem = this._stem2; - this._lookupArcNode = this._lookupArcNode2; - this._getArcs = this._getArcs2; - this._writeNodes = this._writeNodes2; - break; - case 3: - this.morph = this._morph3; - this.stem = this._stem3; - this._lookupArcNode = this._lookupArcNode3; - this._getArcs = this._getArcs3; - this._writeNodes = this._writeNodes3; - break; - default: - throw ValueError("# Error: unknown code: " + this.nCompressionMethod); - } //console.log(this.getInfo()); this.bAcronymValid = true; this.bNumAtLastValid = false; // lexicographer module ? @@ -196,11 +213,10 @@ this.lexicographer = null; // JS still sucks: we’ll try importation when importation will be available in Workers. Still waiting... 
if (self && self.hasOwnProperty("lexgraph_"+this.sLangCode)) { // self is the Worker this.lexicographer = self["lexgraph_"+this.sLangCode]; } - } getInfo () { return ` Language: ${this.sLangName} Lang code: ${this.sLangCode} Dictionary name: ${this.sDicName}\n` + ` Compression method: ${this.nCompressionMethod} Date: ${this.sDate} Stemming: ${this.cStemming}FX\n` + @@ -306,24 +322,24 @@ iAddr = this._lookupArcNode(this.dChar.get(c), iAddr); if (iAddr === null) { return false; } } - return Boolean(this._convBytesToInteger(this.byDic.slice(iAddr, iAddr+this.nBytesArc)) & this._finalNodeMask); + return Boolean(this.byDic[iAddr] & this._finalNodeMask); } getMorph (sWord) { // retrieves morphologies list, different casing allowed if (!sWord) { return []; } sWord = str_transform.spellingNormalization(sWord); - let l = this.morph(sWord); + let l = this._morph(sWord); if (sWord[0].gl_isUpperCase()) { - l.push(...this.morph(sWord.toLowerCase())); + l.push(...this._morph(sWord.toLowerCase())); if (sWord.gl_isUpperCase() && sWord.length > 1) { - l.push(...this.morph(sWord.gl_toCapitalize())); + l.push(...this._morph(sWord.gl_toCapitalize())); } } return l; } @@ -338,17 +354,18 @@ } let nMaxSwitch = Math.max(Math.floor(sWord.length / 3), 1); let nMaxDel = Math.floor(sWord.length / 5); let nMaxHardRepl = Math.max(Math.floor((sWord.length - 5) / 4), 1); let nMaxJump = Math.max(Math.floor(sWord.length / 4), 1); - let oSuggResult = new SuggResult(sWord); + let oSuggResult = new SuggResult(sWord, nSuggLimit); + sWord = str_transform.cleanWord(sWord); if (bSplitTrailingNumbers) { this._splitTrailingNumbers(oSuggResult, sWord); } this._splitSuggest(oSuggResult, sWord); this._suggest(oSuggResult, sWord, nMaxSwitch, nMaxDel, nMaxHardRepl, nMaxJump); - let aSugg = oSuggResult.getSuggestions(nSuggLimit); + let aSugg = oSuggResult.getSuggestions(); if (this.lexicographer) { aSugg = this.lexicographer.filterSugg(aSugg); } if (sSfx || sPfx) { // we add what we removed @@ -378,11 +395,11 @@ } _suggest (oSuggResult, sRemain, nMaxSwitch=0, nMaxDel=0, nMaxHardRepl=0, nMaxJump=0, nDist=0, nDeep=0, iAddr=0, sNewWord="", bAvoidLoop=false) { // returns a set of suggestions // recursive function - if (this._convBytesToInteger(this.byDic.slice(iAddr, iAddr+this.nBytesArc)) & this._finalNodeMask) { + if (this.byDic[iAddr] & this._finalNodeMask) { if (sRemain == "") { oSuggResult.addSugg(sNewWord); for (let sTail of this._getTails(iAddr)) { oSuggResult.addSugg(sNewWord+sTail); } @@ -488,11 +505,11 @@ _getTails (iAddr, sTail="", n=2) { // return a list of suffixes ending at a distance of from let aTails = new Set(); for (let [nVal, jAddr] of this._getArcs(iAddr)) { if (nVal <= this.nChar) { - if (this._convBytesToInteger(this.byDic.slice(jAddr, jAddr+this.nBytesArc)) & this._finalNodeMask) { + if (this.byDic[jAddr] & this._finalNodeMask) { aTails.add(sTail + this.dCharVal.get(nVal)); } if (n && aTails.size == 0) { aTails.gl_update(this._getTails(jAddr, sTail+this.dCharVal.get(nVal), n-1)); } @@ -499,14 +516,10 @@ } } return aTails; } - // morph (sWord) { - // is defined in constructor - // } - getSimilarEntries (sWord, nSuggLimit=10) { // return a list of tuples (similar word, stem, morphology) if (sWord == "") { return []; } @@ -530,35 +543,33 @@ } catch (e) { console.log("Error in regex pattern"); console.log(e.message); } - yield* this._select1(zFlexPattern, zTagsPattern, 0, ""); + yield* this._select(zFlexPattern, zTagsPattern, 0, ""); } - // VERSION 1 - - * _select1 (zFlexPattern, zTagsPattern, iAddr, sWord) { + * _select 
(zFlexPattern, zTagsPattern, iAddr, sWord) { // recursive generator - for (let [nVal, jAddr] of this._getArcs1(iAddr)) { + for (let [nVal, jAddr] of this._getArcs(iAddr)) { if (nVal <= this.nChar) { // simple character - yield* this._select1(zFlexPattern, zTagsPattern, jAddr, sWord + this.lArcVal[nVal]); + yield* this._select(zFlexPattern, zTagsPattern, jAddr, sWord + this.lArcVal[nVal]); } else { if (!zFlexPattern || zFlexPattern.test(sWord)) { let sStem = this.funcStemming(sWord, this.lArcVal[nVal]); - for (let [nMorphVal, _] of this._getArcs1(jAddr)) { + for (let [nMorphVal, _] of this._getArcs(jAddr)) { if (!zTagsPattern || zTagsPattern.test(this.lArcVal[nMorphVal])) { yield [sWord, sStem, this.lArcVal[nMorphVal]]; } } } } } } - _morph1 (sWord) { + _morph (sWord) { // returns morphologies of sWord let iAddr = 0; for (let c of sWord) { if (!this.dChar.has(c)) { return []; @@ -566,38 +577,38 @@ iAddr = this._lookupArcNode(this.dChar.get(c), iAddr); if (iAddr === null) { return []; } } - if (this._convBytesToInteger(this.byDic.slice(iAddr, iAddr+this.nBytesArc)) & this._finalNodeMask) { + if (this.byDic[iAddr] & this._finalNodeMask) { let l = []; let nRawArc = 0; while (!(nRawArc & this._lastArcMask)) { - let iEndArcAddr = iAddr + this.nBytesArc; - nRawArc = this._convBytesToInteger(this.byDic.slice(iAddr, iEndArcAddr)); + let iEndArcAddr = iAddr + 1; + nRawArc = this.byDic[iAddr]; let nArc = nRawArc & this._arcMask; if (nArc > this.nChar) { // This value is not a char, this is a stemming code let sStem = ">" + this.funcStemming(sWord, this.lArcVal[nArc]); // Now , we go to the next node and retrieve all following arcs values, all of them are tags - let iAddr2 = this._convBytesToInteger(this.byDic.slice(iEndArcAddr, iEndArcAddr+this.nBytesNodeAddress)); + let iAddr2 = this.byDic[iEndArcAddr]; let nRawArc2 = 0; while (!(nRawArc2 & this._lastArcMask)) { - let iEndArcAddr2 = iAddr2 + this.nBytesArc; - nRawArc2 = this._convBytesToInteger(this.byDic.slice(iAddr2, iEndArcAddr2)); + let iEndArcAddr2 = iAddr2 + 1; + nRawArc2 = this.byDic[iAddr2]; l.push(sStem + "/" + this.lArcVal[nRawArc2 & this._arcMask]); - iAddr2 = iEndArcAddr2+this.nBytesNodeAddress; + iAddr2 = iEndArcAddr2 + 1; } } - iAddr = iEndArcAddr + this.nBytesNodeAddress; + iAddr = iEndArcAddr + 1; } return l; } return []; } - _stem1 (sWord) { + _stem (sWord) { // returns stems list of sWord let iAddr = 0; for (let c of sWord) { if (!this.dChar.has(c)) { return []; @@ -605,88 +616,61 @@ iAddr = this._lookupArcNode(this.dChar.get(c), iAddr); if (iAddr === null) { return []; } } - if (this._convBytesToInteger(this.byDic.slice(iAddr, iAddr+this.nBytesArc)) & this._finalNodeMask) { + if (this.byDic[iAddr] & this._finalNodeMask) { let l = []; let nRawArc = 0; while (!(nRawArc & this._lastArcMask)) { - let iEndArcAddr = iAddr + this.nBytesArc; - nRawArc = this._convBytesToInteger(this.byDic.slice(iAddr, iEndArcAddr)); + let iEndArcAddr = iAddr + 1; + nRawArc = this.byDic[iAddr]; let nArc = nRawArc & this._arcMask; if (nArc > this.nChar) { // This value is not a char, this is a stemming code l.push(this.funcStemming(sWord, this.lArcVal[nArc])); } - iAddr = iEndArcAddr + this.nBytesNodeAddress; + iAddr = iEndArcAddr + 1; } return l; } return []; } - _lookupArcNode1 (nVal, iAddr) { + _lookupArcNode (nVal, iAddr) { // looks if nVal is an arc at the node at iAddr, if yes, returns address of next node else None while (true) { - let iEndArcAddr = iAddr+this.nBytesArc; - let nRawArc = this._convBytesToInteger(this.byDic.slice(iAddr, iEndArcAddr)); 
+            let iEndArcAddr = iAddr+1;
+            let nRawArc = this.byDic[iAddr];
            if (nVal == (nRawArc & this._arcMask)) {
                // the value we are looking for
                // we return the address of the next node
-                return this._convBytesToInteger(this.byDic.slice(iEndArcAddr, iEndArcAddr+this.nBytesNodeAddress));
+                return this.byDic[iEndArcAddr];
            } else {
                // value not found
                if (nRawArc & this._lastArcMask) {
                    return null;
                }
-                iAddr = iEndArcAddr + this.nBytesNodeAddress;
-            }
-        }
-    }
-
-    * _getArcs1 (iAddr) {
-        // generator: return all arcs at <iAddr> as tuples of (nVal, iAddr)
-        while (true) {
-            let iEndArcAddr = iAddr+this.nBytesArc;
-            let nRawArc = this._convBytesToInteger(this.byDic.slice(iAddr, iEndArcAddr));
-            yield [nRawArc & this._arcMask, this._convBytesToInteger(this.byDic.slice(iEndArcAddr, iEndArcAddr+this.nBytesNodeAddress))];
-            if (nRawArc & this._lastArcMask) {
-                break;
-            }
-            iAddr = iEndArcAddr+this.nBytesNodeAddress;
-        }
-    }
-
-    // VERSION 2
-    _morph2 (sWord) {
-        // to do
-    }
-
-    _stem2 (sWord) {
-        // to do
-    }
-
-    _lookupArcNode2 (nVal, iAddr) {
-        // to do
-    }
-
-
-    // VERSION 3
-    _morph3 (sWord) {
-        // to do
-    }
-
-    _stem3 (sWord) {
-        // to do
-    }
-
-    _lookupArcNode3 (nVal, iAddr) {
-        // to do
+                iAddr = iEndArcAddr + 1;
+            }
+        }
+    }
+
+    * _getArcs (iAddr) {
+        // generator: return all arcs at <iAddr> as tuples of (nVal, iAddr)
+        while (true) {
+            let iEndArcAddr = iAddr+1;
+            let nRawArc = this.byDic[iAddr];
+            yield [nRawArc & this._arcMask, this.byDic[iEndArcAddr]];
+            if (nRawArc & this._lastArcMask) {
+                break;
+            }
+            iAddr = iEndArcAddr+1;
+        }
    }
}

if (typeof(exports) !== 'undefined') {
    exports.IBDAWG = IBDAWG;
}

Index: graphspell-js/str_transform.js
==================================================================
--- graphspell-js/str_transform.js
+++ graphspell-js/str_transform.js
@@ -9,10 +9,11 @@

if (typeof(process) !== 'undefined') {
    var char_player = require("./char_player.js");
}

+// Note: 48 is the ASCII code for "0"

var str_transform = {

@@ -63,10 +64,16 @@
            }
            i++;
        }
        return sNewWord.replace(/eau/g, "o").replace(/au/g, "o").replace(/ai/g, "éi").replace(/ei/g, "é").replace(/ph/g, "f");
    },
+
+    cleanWord: function (sWord) {
+        // clean the word of common typing and predictive-keyboard errors, to help the suggestion engine:
+        // remove letters repeated more than 2 times
+        return sWord.replace(/(.)\1{2,}/ig,'$1$1');
+    },

    _xTransNumbersToExponent: new Map([
        ["0", "⁰"], ["1", "¹"], ["2", "²"], ["3", "³"], ["4", "⁴"], ["5", "⁵"], ["6", "⁶"], ["7", "⁷"], ["8", "⁸"], ["9", "⁹"]
    ]),

@@ -150,10 +157,101 @@
        }
        catch (e) {
            console.error(e);
        }
    },
+
+    distanceJaroWinkler: function(a, b, boost = .666) {
+        // https://github.com/thsig/jaro-winkler-JS
+        //if (a == b) { return 1.0; }
+        let a_len = a.length;
+        let b_len = b.length;
+        let a_flag = [];
+        let b_flag = [];
+        let search_range = Math.floor(Math.max(a_len, b_len) / 2) - 1;
+        let minv = Math.min(a_len, b_len);
+
+        // Looking only within the search range, count and flag the matched pairs.
+        let Num_com = 0;
+        let yl1 = b_len - 1;
+        for (let i = 0; i < a_len; i++) {
+            let lowlim = (i >= search_range) ? i - search_range : 0;
+            let hilim = ((i + search_range) <= yl1) ?
(i + search_range) : yl1; + for (let j = lowlim; j <= hilim; j++) { + if (b_flag[j] !== 1 && a[j] === b[i]) { + a_flag[j] = 1; + b_flag[i] = 1; + Num_com++; + break; + } + } + } + + // Return if no characters in common + if (Num_com === 0) { return 0.0; } + + // Count the number of transpositions + let k = 0; + let N_trans = 0; + for (let i = 0; i < a_len; i++) { + if (a_flag[i] === 1) { + let j; + for (j = k; j < b_len; j++) { + if (b_flag[j] === 1) { + k = j + 1; + break; + } + } + if (a[i] !== b[j]) { N_trans++; } + } + } + N_trans = Math.floor(N_trans / 2); + + // Adjust for similarities in nonmatched characters + let N_simi = 0; + let adjwt = char_player.oDistanceBetweenChars; + if (minv > Num_com) { + for (let i = 0; i < a_len; i++) { + if (!a_flag[i]) { + for (let j = 0; j < b_len; j++) { + if (!b_flag[j]) { + if (adjwt[a[i]] && adjwt[a[i]][b[j]]) { + N_simi += adjwt[a[i]][b[j]]; + b_flag[j] = 2; + break; + } + } + } + } + } + } + + let Num_sim = (N_simi / 10.0) + Num_com; + + // Main weight computation + let weight = Num_sim / a_len + Num_sim / b_len + (Num_com - N_trans) / Num_com; + weight = weight / 3; + + // Continue to boost the weight if the strings are similar + if (weight > boost) { + // Adjust for having up to the first 4 characters in common + let j = (minv >= 4) ? 4 : minv; + let i; + for (i = 0; (i < j) && a[i] === b[i]; i++) { } + if (i) { weight += i * 0.1 * (1.0 - weight) }; + + // Adjust for long strings. + // After agreeing beginning chars, at least two more must agree + // and the agreeing characters must be more than half of the + // remaining characters. + if (minv > 4 && Num_com > i + 1 && 2 * Num_com >= minv + i) { + weight += (1 - weight) * ((Num_com - i - 1) / (a_len * b_len - i*2 + 2)); + } + } + + return weight; + }, showDistance (s1, s2) { console.log(`Distance: ${s1} / ${s2} = ${this.distanceDamerauLevenshtein(s1, s2)})`); }, Index: graphspell/char_player.py ================================================================== --- graphspell/char_player.py +++ graphspell/char_player.py @@ -3,37 +3,40 @@ useful for suggestion mechanism """ dDistanceBetweenChars = { - "a": {}, - "e": {"é": 0.5}, - "é": {"e": 0.5}, - "i": {"y": 0.2}, - "o": {}, - "u": {}, - "y": {"i": 0.3}, - "b": {"d": 0.8, "h": 0.9}, - "c": {"ç": 0.1, "k": 0.5, "q": 0.5, "s": 0.5, "x": 0.5, "z": 0.8}, - "d": {"b": 0.8}, - "f": {"v": 0.8}, - "g": {"j": 0.5}, - "h": {"b": 0.9}, - "j": {"g": 0.5, "i": 0.9}, - "k": {"c": 0.5, "q": 0.1, "x": 0.5}, - "l": {"i": 0.9}, - "m": {"n": 0.8}, - "n": {"m": 0.8, "r": 0.9}, - "p": {"q": 0.9}, - "q": {"c": 0.5, "k": 0.1, "p": 0.9}, - "r": {"n": 0.9, "j": 0.9}, - "s": {"c": 0.5, "ç": 0.1, "x": 0.5, "z": 0.5}, - "t": {"d": 0.9}, - "v": {"f": 0.8, "w": 0.1}, - "w": {"v": 0.1}, - "x": {"c": 0.5, "k": 0.5, "q": 0.5, "s": 0.5}, - "z": {"s": 0.5} + # dDistanceBetweenChars: + # - with Jaro-Winkler, values between 1 and 10 + # - with Damerau-Levenshtein, values / 10 (between 0 and 1: 0.1, 0.2 ... 
0.9) + #"a": {}, + "e": {"é": 5}, + "é": {"e": 5}, + "i": {"y": 2}, + #"o": {}, + #"u": {}, + "y": {"i": 3}, + "b": {"d": 8, "h": 9}, + "c": {"ç": 1, "k": 5, "q": 5, "s": 5, "x": 5, "z": 8}, + "d": {"b": 8}, + "f": {"v": 8}, + "g": {"j": 5}, + "h": {"b": 9}, + "j": {"g": 5, "i": 9}, + "k": {"c": 5, "q": 1, "x": 5}, + "l": {"i": 9}, + "m": {"n": 8}, + "n": {"m": 8, "r": 9}, + "p": {"q": 9}, + "q": {"c": 5, "k": 1, "p": 9}, + "r": {"n": 9, "j": 9}, + "s": {"c": 5, "ç": 1, "x": 5, "z": 5}, + "t": {"d": 9}, + "v": {"f": 8, "w": 1}, + "w": {"v": 1}, + "x": {"c": 5, "k": 5, "q": 5, "s": 5}, + "z": {"s": 5} } def distanceBetweenChars (c1, c2): "returns a float between 0 and 1" @@ -315,12 +318,12 @@ # End of word dFinal1 = { "a": ("as", "at", "ant", "ah"), "A": ("AS", "AT", "ANT", "AH"), - "c": ("ch",), - "C": ("CH",), + "c": ("ch", "que"), + "C": ("CH", "QUE"), "e": ("et", "er", "ets", "ée", "ez", "ai", "ais", "ait", "ent", "eh"), "E": ("ET", "ER", "ETS", "ÉE", "EZ", "AI", "AIS", "AIT", "ENT", "EH"), "é": ("et", "er", "ets", "ée", "ez", "ai", "ais", "ait"), "É": ("ET", "ER", "ETS", "ÉE", "EZ", "AI", "AIS", "AIT"), "è": ("et", "er", "ets", "ée", "ez", "ai", "ais", "ait"), @@ -331,10 +334,12 @@ "Ë": ("ET", "ER", "ETS", "ÉE", "EZ", "AI", "AIS", "AIT"), "g": ("gh",), "G": ("GH",), "i": ("is", "it", "ie", "in"), "I": ("IS", "IT", "IE", "IN"), + "k": ("que",), + "K": ("QUE",), "n": ("nt", "nd", "ns", "nh"), "N": ("NT", "ND", "NS", "NH"), "o": ("aut", "ot", "os"), "O": ("AUT", "OT", "OS"), "ô": ("aut", "ot", "os"), Index: graphspell/ibdawg.py ================================================================== --- graphspell/ibdawg.py +++ graphspell/ibdawg.py @@ -13,10 +13,11 @@ import time import json import binascii import importlib from collections import OrderedDict +from math import floor #import logging #logging.basicConfig(filename="suggestions.log", level=logging.DEBUG) from . 
import str_transform as st @@ -38,63 +39,71 @@ class SuggResult: """Structure for storing, classifying and filtering suggestions""" - def __init__ (self, sWord, nDistLimit=-1): + def __init__ (self, sWord, nSuggLimit=10, nDistLimit=-1): self.sWord = sWord self.sSimplifiedWord = st.simplifyWord(sWord) self.nDistLimit = nDistLimit if nDistLimit >= 0 else (len(sWord) // 3) + 1 self.nMinDist = 1000 - self.aSugg = set() - self.dSugg = { 0: [], 1: [], 2: [] } - self.aAllSugg = set() # all found words even those refused + # Temporary sets + self.aAllSugg = set() # All suggestions, even the one rejected + self.dGoodSugg = {} # Acceptable suggestions + self.dBestSugg = {} # Best suggestions + # Parameters + self.nSuggLimit = nSuggLimit + self.nSuggLimitExt = nSuggLimit + 2 # we add few entries in case suggestions merge after casing modifications + self.nBestSuggLimit = floor(nSuggLimit * 1.5) # n times the requested limit + self.nGoodSuggLimit = nSuggLimit * 15 # n times the requested limit def addSugg (self, sSugg, nDeep=0): "add a suggestion" #logging.info((nDeep * " ") + "__" + sSugg + "__") if sSugg in self.aAllSugg: return self.aAllSugg.add(sSugg) - if sSugg not in self.aSugg: - #nDist = min(st.distanceDamerauLevenshtein(self.sWord, sSugg), st.distanceDamerauLevenshtein(self.sSimplifiedWord, st.simplifyWord(sSugg))) - nDist = int(st.distanceDamerauLevenshtein(self.sSimplifiedWord, st.simplifyWord(sSugg))) - #logging.info((nDeep * " ") + "__" + sSugg + "__ :" + self.sSimplifiedWord +"|"+ st.simplifyWord(sSugg) +" -> "+ str(nDist)) - if nDist <= self.nDistLimit: - if " " in sSugg: - nDist += 1 - if nDist not in self.dSugg: - self.dSugg[nDist] = [] - self.dSugg[nDist].append(sSugg) - self.aSugg.add(sSugg) - if nDist < self.nMinDist: - self.nMinDist = nDist - self.nDistLimit = min(self.nDistLimit, self.nMinDist+1) - - def getSuggestions (self, nSuggLimit=10): + nDistJaro = 1 - st.distanceJaroWinkler(self.sSimplifiedWord, st.simplifyWord(sSugg)) + nDist = floor(nDistJaro * 10) + if nDistJaro < .11: # Best suggestions + self.dBestSugg[sSugg] = round(nDistJaro*1000) + if len(self.dBestSugg) > self.nBestSuggLimit: + self.nDistLimit = -1 # make suggest() to end search + elif nDistJaro < .33: # Good suggestions + self.dGoodSugg[sSugg] = round(nDistJaro*1000) + if len(self.dGoodSugg) > self.nGoodSuggLimit: + self.nDistLimit = -1 # make suggest() to end search + else: + if nDist < self.nMinDist: + self.nMinDist = nDist + self.nDistLimit = min(self.nDistLimit, self.nMinDist) + if nDist <= self.nDistLimit: + if nDist < self.nMinDist: + self.nMinDist = nDist + self.nDistLimit = min(self.nDistLimit, self.nMinDist+1) + + def getSuggestions (self): "return a list of suggestions" # we sort the better results with the original word lRes = [] - bFirstListSorted = False - for nDist, lSugg in self.dSugg.items(): - if nDist > self.nDistLimit: - break - if not bFirstListSorted and len(lSugg) > 1: - lSugg.sort(key=lambda sSugg: st.distanceDamerauLevenshtein(self.sWord, sSugg)) - bFirstListSorted = True - #print(nDist, "|".join(lSugg)) - #for sSugg in lSugg: - # print(sSugg, st.distanceDamerauLevenshtein(self.sWord, sSugg)) - lRes.extend(lSugg) - if len(lRes) > nSuggLimit: - break + if len(self.dBestSugg) > 0: + # sort only with simplified words + lResTmp = sorted(self.dBestSugg.items(), key=lambda x: x[1]) + for i in range(min(self.nSuggLimitExt, len(lResTmp))): + lRes.append(lResTmp[i][0]) + if len(lRes) < self.nSuggLimitExt: + # sort with simplified words and original word + lResTmp = sorted(self.dGoodSugg.items(), 
key=lambda x: ((1-st.distanceJaroWinkler(self.sWord, x[0]))*10, x[1])) + for i in range(min(self.nSuggLimitExt, len(lResTmp))): + lRes.append(lResTmp[i][0]) + # casing if self.sWord.isupper(): lRes = list(OrderedDict.fromkeys(map(lambda sSugg: sSugg.upper(), lRes))) # use dict, when Python 3.6+ elif self.sWord[0:1].isupper(): # dont’ use <.istitle> lRes = list(OrderedDict.fromkeys(map(lambda sSugg: sSugg[0:1].upper()+sSugg[1:], lRes))) # use dict, when Python 3.6+ - return lRes[:nSuggLimit] + return lRes[:self.nSuggLimit] def reset (self): "clear data" self.aSugg.clear() self.dSugg.clear() @@ -118,45 +127,42 @@ else: self._initJSON(source) self.sFileName = source if isinstance(source, str) else "[None]" + # Performance trick: + # Instead of converting bytes to integers each times we parse the binary dictionary, + # we do it once, then parse the array + nAcc = 0 + byBuffer = b"" + lTemp = [] + nDivisor = (self.nBytesArc + self.nBytesNodeAddress) / 2 + for i in range(0, len(self.byDic)): + byBuffer += self.byDic[i:i+1] + if nAcc == (self.nBytesArc - 1): + lTemp.append(int.from_bytes(byBuffer, byteorder="big")) + byBuffer = b"" + elif nAcc == (self.nBytesArc + self.nBytesNodeAddress - 1): + lTemp.append(round(int.from_bytes(byBuffer, byteorder="big") / nDivisor)) + byBuffer = b"" + nAcc = -1 + nAcc = nAcc + 1 + self.byDic = lTemp; + + # masks self._arcMask = (2 ** ((self.nBytesArc * 8) - 3)) - 1 self._finalNodeMask = 1 << ((self.nBytesArc * 8) - 1) self._lastArcMask = 1 << ((self.nBytesArc * 8) - 2) - self._addrBitMask = 1 << ((self.nBytesArc * 8) - 3) # version 2 # function to decode the affix/suffix code if self.cStemming == "S": self.funcStemming = st.changeWordWithSuffixCode elif self.cStemming == "A": self.funcStemming = st.changeWordWithAffixCode else: self.funcStemming = st.noStemming - # Configuring DAWG functions according to nCompressionMethod - if self.nCompressionMethod == 1: - self.morph = self._morph1 - self.stem = self._stem1 - self._lookupArcNode = self._lookupArcNode1 - self._getArcs = self._getArcs1 - self._writeNodes = self._writeNodes1 - elif self.nCompressionMethod == 2: - self.morph = self._morph2 - self.stem = self._stem2 - self._lookupArcNode = self._lookupArcNode2 - self._getArcs = self._getArcs2 - self._writeNodes = self._writeNodes2 - elif self.nCompressionMethod == 3: - self.morph = self._morph3 - self.stem = self._stem3 - self._lookupArcNode = self._lookupArcNode3 - self._getArcs = self._getArcs3 - self._writeNodes = self._writeNodes3 - else: - raise ValueError(" # Error: unknown code: {}".format(self.nCompressionMethod)) - self.bAcronymValid = False self.bNumAtLastValid = False # lexicographer module ? self.lexicographer = None @@ -202,11 +208,10 @@ # to get the value of an arc, to get the char of an arc with its value self.dChar = {} for i in range(1, self.nChar+1): self.dChar[self.lArcVal[i]] = i self.dCharVal = { v: k for k, v in self.dChar.items() } - self.nBytesOffset = 1 # version 3 def _initJSON (self, oJSON): "initialize with a JSON text file" self.sByDic = "" # init to prevent pylint whining self.__dict__.update(oJSON) @@ -246,11 +251,10 @@ "nArcVal": self.nArcVal, "lArcVal": self.lArcVal, "nCompressionMethod": self.nCompressionMethod, "nBytesArc": self.nBytesArc, "nBytesNodeAddress": self.nBytesNodeAddress, - "nBytesOffset": self.nBytesOffset, # JavaScript is a pile of shit, so Mozilla’s JS parser don’t like file bigger than 4 Mb! # So, if necessary, we use an hexadecimal string, that we will convert later in Firefox’s extension. 
# https://github.com/mozilla/addons-linter/issues/1361 "sByDic": self.byDic.hex() if bBinaryDictAsHexString else [ e for e in self.byDic ], "l2grams": list(self.a2grams) @@ -298,22 +302,22 @@ if c not in self.dChar: return False iAddr = self._lookupArcNode(self.dChar[c], iAddr) if iAddr is None: return False - return bool(int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask) + return bool(self.byDic[iAddr] & self._finalNodeMask) def getMorph (self, sWord): "retrieves morphologies list, different casing allowed" if not sWord: return [] sWord = st.spellingNormalization(sWord) - l = self.morph(sWord) + l = self._morph(sWord) if sWord[0:1].isupper(): - l.extend(self.morph(sWord.lower())) + l.extend(self._morph(sWord.lower())) if sWord.isupper() and len(sWord) > 1: - l.extend(self.morph(sWord.capitalize())) + l.extend(self._morph(sWord.capitalize())) return l #@timethis def suggest (self, sWord, nSuggLimit=10, bSplitTrailingNumbers=False): "returns a set of suggestions for " @@ -325,16 +329,17 @@ sPfx, sWord, sSfx = self.lexicographer.split(sWord) nMaxSwitch = max(len(sWord) // 3, 1) nMaxDel = len(sWord) // 5 nMaxHardRepl = max((len(sWord) - 5) // 4, 1) nMaxJump = max(len(sWord) // 4, 1) - oSuggResult = SuggResult(sWord) + oSuggResult = SuggResult(sWord, nSuggLimit) + sWord = st.cleanWord(sWord) if bSplitTrailingNumbers: self._splitTrailingNumbers(oSuggResult, sWord) self._splitSuggest(oSuggResult, sWord) self._suggest(oSuggResult, sWord, nMaxSwitch, nMaxDel, nMaxHardRepl, nMaxJump) - aSugg = oSuggResult.getSuggestions(nSuggLimit) + aSugg = oSuggResult.getSuggestions() if self.lexicographer: aSugg = self.lexicographer.filterSugg(aSugg) if sSfx or sPfx: # we add what we removed return list(map(lambda sSug: sPfx + sSug + sSfx, aSugg)) @@ -354,11 +359,11 @@ oSuggResult.addSugg(sWord1+" "+sWord2) def _suggest (self, oSuggResult, sRemain, nMaxSwitch=0, nMaxDel=0, nMaxHardRepl=0, nMaxJump=0, nDist=0, nDeep=0, iAddr=0, sNewWord="", bAvoidLoop=False): # recursive function #logging.info((nDeep * " ") + sNewWord + ":" + sRemain) - if int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask: + if self.byDic[iAddr] & self._finalNodeMask: if not sRemain: oSuggResult.addSugg(sNewWord, nDeep) for sTail in self._getTails(iAddr): oSuggResult.addSugg(sNewWord+sTail, nDeep) return @@ -421,11 +426,11 @@ def _getTails (self, iAddr, sTail="", n=2): "return a list of suffixes ending at a distance of from " aTails = set() for nVal, jAddr in self._getArcs(iAddr): if nVal <= self.nChar: - if int.from_bytes(self.byDic[jAddr:jAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask: + if self.byDic[jAddr] & self._finalNodeMask: aTails.add(sTail + self.dCharVal[nVal]) if n and not aTails: aTails.update(self._getTails(jAddr, sTail+self.dCharVal[nVal], n-1)) return aTails @@ -469,330 +474,113 @@ if sTagsPattern: zTagsPattern = re.compile(sTagsPattern) except re.error: print("# Error in regex pattern") traceback.print_exc() - yield from self._select1(zFlexPattern, zTagsPattern, 0, "") - - # def morph (self, sWord): - # is defined in __init__ - - # VERSION 1 - def _select1 (self, zFlexPattern, zTagsPattern, iAddr, sWord): - # recursive generator - for nVal, jAddr in self._getArcs1(iAddr): + yield from self._select(zFlexPattern, zTagsPattern, 0, "") + + def _select (self, zFlexPattern, zTagsPattern, iAddr, sWord): + # recursive generator + for nVal, jAddr in self._getArcs(iAddr): if nVal <= self.nChar: # simple character - yield from 
self._select1(zFlexPattern, zTagsPattern, jAddr, sWord + self.lArcVal[nVal]) + yield from self._select(zFlexPattern, zTagsPattern, jAddr, sWord + self.lArcVal[nVal]) else: if not zFlexPattern or zFlexPattern.search(sWord): sStem = self.funcStemming(sWord, self.lArcVal[nVal]) - for nMorphVal, _ in self._getArcs1(jAddr): + for nMorphVal, _ in self._getArcs(jAddr): if not zTagsPattern or zTagsPattern.search(self.lArcVal[nMorphVal]): yield [sWord, sStem, self.lArcVal[nMorphVal]] - def _morph1 (self, sWord): + def _morph (self, sWord): "returns morphologies of " iAddr = 0 for c in sWord: if c not in self.dChar: return [] iAddr = self._lookupArcNode(self.dChar[c], iAddr) if iAddr is None: return [] - if int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask: + if self.byDic[iAddr] & self._finalNodeMask: l = [] nRawArc = 0 while not nRawArc & self._lastArcMask: - iEndArcAddr = iAddr + self.nBytesArc - nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') + iEndArcAddr = iAddr + 1 + nRawArc = self.byDic[iAddr] nArc = nRawArc & self._arcMask if nArc > self.nChar: # This value is not a char, this is a stemming code sStem = ">" + self.funcStemming(sWord, self.lArcVal[nArc]) # Now , we go to the next node and retrieve all following arcs values, all of them are tags - iAddr2 = int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big') + iAddr2 = self.byDic[iEndArcAddr] nRawArc2 = 0 while not nRawArc2 & self._lastArcMask: - iEndArcAddr2 = iAddr2 + self.nBytesArc - nRawArc2 = int.from_bytes(self.byDic[iAddr2:iEndArcAddr2], byteorder='big') + iEndArcAddr2 = iAddr2 + 1 + nRawArc2 = self.byDic[iAddr2] l.append(sStem + "/" + self.lArcVal[nRawArc2 & self._arcMask]) - iAddr2 = iEndArcAddr2+self.nBytesNodeAddress - iAddr = iEndArcAddr+self.nBytesNodeAddress + iAddr2 = iEndArcAddr2 + 1 + iAddr = iEndArcAddr + 1 return l return [] - def _stem1 (self, sWord): + def _stem (self, sWord): "returns stems list of " iAddr = 0 for c in sWord: if c not in self.dChar: return [] iAddr = self._lookupArcNode(self.dChar[c], iAddr) if iAddr is None: return [] - if int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask: + if self.byDic[iAddr] & self._finalNodeMask: l = [] nRawArc = 0 while not nRawArc & self._lastArcMask: - iEndArcAddr = iAddr + self.nBytesArc - nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') + iEndArcAddr = iAddr + 1 + nRawArc = self.byDic[iAddr] nArc = nRawArc & self._arcMask if nArc > self.nChar: # This value is not a char, this is a stemming code l.append(self.funcStemming(sWord, self.lArcVal[nArc])) - iAddr = iEndArcAddr+self.nBytesNodeAddress + iAddr = iEndArcAddr + 1 return l return [] - def _lookupArcNode1 (self, nVal, iAddr): + def _lookupArcNode (self, nVal, iAddr): "looks if is an arc at the node at , if yes, returns address of next node else None" while True: - iEndArcAddr = iAddr+self.nBytesArc - nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') + iEndArcAddr = iAddr + 1 + nRawArc = self.byDic[iAddr] if nVal == (nRawArc & self._arcMask): # the value we are looking for # we return the address of the next node - return int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big') + return self.byDic[iEndArcAddr] # value not found if nRawArc & self._lastArcMask: return None - iAddr = iEndArcAddr+self.nBytesNodeAddress + iAddr = iEndArcAddr + 1 - def _getArcs1 (self, iAddr): + def _getArcs (self, 
iAddr): "generator: return all arcs at as tuples of (nVal, iAddr)" while True: - iEndArcAddr = iAddr+self.nBytesArc - nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') - yield nRawArc & self._arcMask, int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big') + iEndArcAddr = iAddr + 1 + nRawArc = self.byDic[iAddr] + yield nRawArc & self._arcMask, self.byDic[iEndArcAddr] if nRawArc & self._lastArcMask: break - iAddr = iEndArcAddr+self.nBytesNodeAddress + iAddr = iEndArcAddr + 1 - def _writeNodes1 (self, spfDest): + def _writeNodes (self, spfDest): "for debugging only" print(" > Write binary nodes") with open(spfDest, 'w', 'utf-8', newline="\n") as hDst: iAddr = 0 hDst.write("i{:_>10} -- #{:_>10}\n".format("0", iAddr)) while iAddr < len(self.byDic): - iEndArcAddr = iAddr+self.nBytesArc - nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') - nArc = nRawArc & self._arcMask - hDst.write(" {:<20} {:0>16} i{:>10} #{:_>10}\n".format(self.lArcVal[nArc], bin(nRawArc)[2:], "?", \ - int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], \ - byteorder='big'))) - iAddr = iEndArcAddr+self.nBytesNodeAddress - if (nRawArc & self._lastArcMask) and iAddr < len(self.byDic): - hDst.write("\ni{:_>10} -- #{:_>10}\n".format("?", iAddr)) - hDst.close() - - # VERSION 2 - def _morph2 (self, sWord): - "returns morphologies of " - iAddr = 0 - for c in sWord: - if c not in self.dChar: - return [] - iAddr = self._lookupArcNode(self.dChar[c], iAddr) - if iAddr is None: - return [] - if int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask: - l = [] - nRawArc = 0 - while not nRawArc & self._lastArcMask: - iEndArcAddr = iAddr + self.nBytesArc - nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') - nArc = nRawArc & self._arcMask - if nArc > self.nChar: - # This value is not a char, this is a stemming code - sStem = ">" + self.funcStemming(sWord, self.lArcVal[nArc]) - # Now , we go to the next node and retrieve all following arcs values, all of them are tags - if not nRawArc & self._addrBitMask: - iAddr2 = int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big') - else: - # we go to the end of the node - iAddr2 = iEndArcAddr - while not nRawArc & self._lastArcMask: - nRawArc = int.from_bytes(self.byDic[iAddr2:iAddr2+self.nBytesArc], byteorder='big') - iAddr2 += self.nBytesArc + self.nBytesNodeAddress - nRawArc2 = 0 - while not nRawArc2 & self._lastArcMask: - iEndArcAddr2 = iAddr2 + self.nBytesArc - nRawArc2 = int.from_bytes(self.byDic[iAddr2:iEndArcAddr2], byteorder='big') - l.append(sStem + "/" + self.lArcVal[nRawArc2 & self._arcMask]) - iAddr2 = iEndArcAddr2+self.nBytesNodeAddress if not nRawArc2 & self._addrBitMask else iEndArcAddr2 - iAddr = iEndArcAddr+self.nBytesNodeAddress if not nRawArc & self._addrBitMask else iEndArcAddr - return l - return [] - - def _stem2 (self, sWord): - "returns stems list of " - iAddr = 0 - for c in sWord: - if c not in self.dChar: - return [] - iAddr = self._lookupArcNode(self.dChar[c], iAddr) - if iAddr is None: - return [] - if int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask: - l = [] - nRawArc = 0 - while not nRawArc & self._lastArcMask: - iEndArcAddr = iAddr + self.nBytesArc - nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') - nArc = nRawArc & self._arcMask - if nArc > self.nChar: - # This value is not a char, this is a stemming 
code - l.append(self.funcStemming(sWord, self.lArcVal[nArc])) - # Now , we go to the next node - if not nRawArc & self._addrBitMask: - iAddr2 = int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big') - else: - # we go to the end of the node - iAddr2 = iEndArcAddr - while not nRawArc & self._lastArcMask: - nRawArc = int.from_bytes(self.byDic[iAddr2:iAddr2+self.nBytesArc], byteorder='big') - iAddr2 += self.nBytesArc + self.nBytesNodeAddress - iAddr = iEndArcAddr+self.nBytesNodeAddress if not nRawArc & self._addrBitMask else iEndArcAddr - return l - return [] - - def _lookupArcNode2 (self, nVal, iAddr): - "looks if is an arc at the node at , if yes, returns address of next node else None" - while True: - iEndArcAddr = iAddr+self.nBytesArc - nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') - if nVal == (nRawArc & self._arcMask): - # the value we are looking for - if not nRawArc & self._addrBitMask: - # we return the address of the next node - return int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big') - # we go to the end of the node - iAddr = iEndArcAddr - while not nRawArc & self._lastArcMask: - nRawArc = int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') - iAddr += self.nBytesArc + self.nBytesNodeAddress if not nRawArc & self._addrBitMask else self.nBytesArc - return iAddr - # value not found - if nRawArc & self._lastArcMask: - return None - iAddr = iEndArcAddr+self.nBytesNodeAddress if not nRawArc & self._addrBitMask else iEndArcAddr - - def _writeNodes2 (self, spfDest): - "for debugging only" - print(" > Write binary nodes") - with open(spfDest, 'w', 'utf-8', newline="\n") as hDst: - iAddr = 0 - hDst.write("i{:_>10} -- #{:_>10}\n".format("0", iAddr)) - while iAddr < len(self.byDic): - iEndArcAddr = iAddr+self.nBytesArc - nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') - nArc = nRawArc & self._arcMask - if not nRawArc & self._addrBitMask: - iNextNodeAddr = int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big') - hDst.write(" {:<20} {:0>16} i{:>10} #{:_>10}\n".format(self.lArcVal[nArc], bin(nRawArc)[2:], "?", iNextNodeAddr)) - iAddr = iEndArcAddr+self.nBytesNodeAddress - else: - hDst.write(" {:<20} {:0>16}\n".format(self.lArcVal[nArc], bin(nRawArc)[2:])) - iAddr = iEndArcAddr - if nRawArc & self._lastArcMask: - hDst.write("\ni{:_>10} -- #{:_>10}\n".format("?", iAddr)) - hDst.close() - - # VERSION 3 - def _morph3 (self, sWord): - "returns morphologies of " - iAddr = 0 - for c in sWord: - if c not in self.dChar: - return [] - iAddr = self._lookupArcNode(self.dChar[c], iAddr) - if iAddr is None: - return [] - if int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask: - l = [] - nRawArc = 0 - iAddrNode = iAddr - while not nRawArc & self._lastArcMask: - iEndArcAddr = iAddr + self.nBytesArc - nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') - nArc = nRawArc & self._arcMask - if nArc > self.nChar: - # This value is not a char, this is a stemming code - sStem = ">" + self.funcStemming(sWord, self.lArcVal[nArc]) - # Now , we go to the next node and retrieve all following arcs values, all of them are tags - if not nRawArc & self._addrBitMask: - iAddr2 = int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big') - else: - iAddr2 = iAddrNode + int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesOffset], byteorder='big') 
- nRawArc2 = 0 - while not nRawArc2 & self._lastArcMask: - iEndArcAddr2 = iAddr2 + self.nBytesArc - nRawArc2 = int.from_bytes(self.byDic[iAddr2:iEndArcAddr2], byteorder='big') - l.append(sStem + "/" + self.lArcVal[nRawArc2 & self._arcMask]) - iAddr2 = iEndArcAddr2+self.nBytesNodeAddress if not nRawArc2 & self._addrBitMask else iEndArcAddr2+self.nBytesOffset - iAddr = iEndArcAddr+self.nBytesNodeAddress if not nRawArc & self._addrBitMask else iEndArcAddr+self.nBytesOffset - return l - return [] - - def _stem3 (self, sWord): - "returns stems list of " - iAddr = 0 - for c in sWord: - if c not in self.dChar: - return [] - iAddr = self._lookupArcNode(self.dChar[c], iAddr) - if iAddr is None: - return [] - if int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask: - l = [] - nRawArc = 0 - #iAddrNode = iAddr - while not nRawArc & self._lastArcMask: - iEndArcAddr = iAddr + self.nBytesArc - nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') - nArc = nRawArc & self._arcMask - if nArc > self.nChar: - # This value is not a char, this is a stemming code - l.append(self.funcStemming(sWord, self.lArcVal[nArc])) - iAddr = iEndArcAddr+self.nBytesNodeAddress if not nRawArc & self._addrBitMask else iEndArcAddr+self.nBytesOffset - return l - return [] - - def _lookupArcNode3 (self, nVal, iAddr): - "looks if is an arc at the node at , if yes, returns address of next node else None" - iAddrNode = iAddr - while True: - iEndArcAddr = iAddr+self.nBytesArc - nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') - if nVal == (nRawArc & self._arcMask): - # the value we are looking for - if not nRawArc & self._addrBitMask: - return int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big') - return iAddrNode + int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesOffset], byteorder='big') - # value not found - if nRawArc & self._lastArcMask: - return None - iAddr = iEndArcAddr+self.nBytesNodeAddress if not nRawArc & self._addrBitMask else iEndArcAddr+self.nBytesOffset - - def _writeNodes3 (self, spfDest): - "for debugging only" - print(" > Write binary nodes") - with open(spfDest, 'w', 'utf-8', newline="\n") as hDst: - iAddr = 0 - hDst.write("i{:_>10} -- #{:_>10}\n".format("0", iAddr)) - while iAddr < len(self.byDic): - iEndArcAddr = iAddr+self.nBytesArc - nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') - nArc = nRawArc & self._arcMask - if not nRawArc & self._addrBitMask: - iNextNodeAddr = int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big') - hDst.write(" {:<20} {:0>16} i{:>10} #{:_>10}\n".format(self.lArcVal[nArc], bin(nRawArc)[2:], "?", iNextNodeAddr)) - iAddr = iEndArcAddr+self.nBytesNodeAddress - else: - iNextNodeAddr = int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesOffset], byteorder='big') - hDst.write(" {:<20} {:0>16} i{:>10} +{:_>10}\n".format(self.lArcVal[nArc], bin(nRawArc)[2:], "?", iNextNodeAddr)) - iAddr = iEndArcAddr+self.nBytesOffset - if nRawArc & self._lastArcMask: + iEndArcAddr = iAddr + 1 + nRawArc = self.byDic[iAddr] + nArc = nRawArc & self._arcMask + hDst.write(" {:<20} {:0>16} i{:>10} #{:_>10}\n".format(self.lArcVal[nArc], bin(nRawArc)[2:], "?", self.byDic[iEndArcAddr])) + iAddr = iEndArcAddr + 1 + if (nRawArc & self._lastArcMask) and iAddr < len(self.byDic): hDst.write("\ni{:_>10} -- #{:_>10}\n".format("?", iAddr)) hDst.close() Index: graphspell/str_transform.py 
================================================================== --- graphspell/str_transform.py +++ graphspell/str_transform.py @@ -3,12 +3,13 @@ - calculate distance between two strings - transform strings with transformation codes """ import unicodedata +import re -from .char_player import distanceBetweenChars +from .char_player import distanceBetweenChars, dDistanceBetweenChars #### N-GRAMS def getNgrams (sWord, n=2): @@ -50,10 +51,15 @@ for i, c in enumerate(sWord, 1): if c != sWord[i:i+1] or (c == 'e' and sWord[i:i+2] != "ee"): # exception for to avoid confusion between crée / créai sNewWord += c return sNewWord.replace("eau", "o").replace("au", "o").replace("ai", "éi").replace("ei", "é").replace("ph", "f") + +def cleanWord (sWord): + "remove letters repeated more than 2 times" + return re.sub("(.)\\1{2,}", '\\1\\1', sWord) + _xTransNumbersToExponent = str.maketrans({ "0": "⁰", "1": "¹", "2": "²", "3": "³", "4": "⁴", "5": "⁵", "6": "⁶", "7": "⁷", "8": "⁸", "9": "⁹" }) @@ -104,10 +110,87 @@ ) if i and j and s1[i] == s2[j-1] and s1[i-1] == s2[j]: d[i, j] = min(d[i, j], d[i-2, j-2] + nCost) # Transposition return d[nLen1-1, nLen2-1] + +def distanceJaroWinkler (a, b, boost = .666): + # https://github.com/thsig/jaro-winkler-JS + #if (a == b): return 1.0 + a_len = len(a) + b_len = len(b) + nMax = max(a_len, b_len) + a_flag = [None for _ in range(nMax)] + b_flag = [None for _ in range(nMax)] + search_range = (max(a_len, b_len) // 2) - 1 + minv = min(a_len, b_len) + + # Looking only within the search range, count and flag the matched pairs. + Num_com = 0 + yl1 = b_len - 1 + for i in range(a_len): + lowlim = i - search_range if i >= search_range else 0 + hilim = i + search_range if (i + search_range) <= yl1 else yl1 + for j in range(lowlim, hilim+1): + if b_flag[j] != 1 and a[j:j+1] == b[i:i+1]: + a_flag[j] = 1 + b_flag[i] = 1 + Num_com += 1 + break + + # Return if no characters in common + if Num_com == 0: + return 0.0 + + # Count the number of transpositions + k = 0 + N_trans = 0 + for i in range(a_len): + if a_flag[i] == 1: + for j in range(k, b_len): + if b_flag[j] == 1: + k = j + 1 + break + if a[i] != b[j]: + N_trans += 1 + N_trans = N_trans // 2 + + # Adjust for similarities in nonmatched characters + N_simi = 0 + if minv > Num_com: + for i in range(a_len): + if not a_flag[i]: + for j in range(b_len): + if not b_flag[j]: + if a[i] in dDistanceBetweenChars and b[j] in dDistanceBetweenChars[a[i]]: + N_simi += dDistanceBetweenChars[a[i]][b[j]] + b_flag[j] = 2 + break + + Num_sim = (N_simi / 10.0) + Num_com + + # Main weight computation + weight = Num_sim / a_len + Num_sim / b_len + (Num_com - N_trans) / Num_com + weight = weight / 3 + + # Continue to boost the weight if the strings are similar + if weight > boost: + # Adjust for having up to the first 4 characters in common + j = 4 if minv >= 4 else minv + i = 0 + while i < j and a[i] == b[i]: + i += 1 + if i: + weight += i * 0.1 * (1.0 - weight) + # Adjust for long strings. + # After agreeing beginning chars, at least two more must agree + # and the agreeing characters must be more than half of the + # remaining characters. + if minv > 4 and Num_com > i + 1 and 2 * Num_com >= minv + i: + weight += (1 - weight) * ((Num_com - i - 1) / (a_len * b_len - i*2 + 2)) + return weight + def distanceSift4 (s1, s2, nMaxOffset=5): "implementation of general Sift4." # https://siderite.blogspot.com/2014/11/super-fast-and-accurate-string-distance.html if not s1: