Index: graphspell-js/dawg.js ================================================================== --- graphspell-js/dawg.js +++ graphspell-js/dawg.js @@ -59,11 +59,11 @@ nChar += 1; } dCharOccur.set(c, dCharOccur.gl_get(c, 0) + 1); } // affixes to find stem from flexion - sAff = funcStemmingGen(sFlex, sStem); + let sAff = funcStemmingGen(sFlex, sStem); if (!dAff.get(sAff)) { dAff.set(sAff, nAff); lAff.push(sAff); nAff += 1; } @@ -99,11 +99,11 @@ // Dictionary of arc values occurrency, to sort arcs of each node let lKeyVal = []; for (let c of dChar.keys()) { lKeyVal.push([dChar[c], dCharOccur[c]]); } for (let sAff of dAff.keys()) { lKeyVal.push([dAff[sAff]+nChar, dAffOccur[sAff]]); } for (let sTag in dTag.keys()) { lKeyVal.push([dTag[sTag]+nChar+nAff, dTagOccur[sTag]]); } - dValOccur = new Map(lKeyVal); + let dValOccur = new Map(lKeyVal); lKeyVal.length = 0; // clear the array //with open(spfSrc[:-8]+".valuesfreq.txt", 'w', encoding='utf-8') as hFreqDst: # DEBUG // for iKey, nOcc in sorted(dValOccur.entries(), key=lambda t: t[1], reverse=True): // hFreqDst.write("{}: {}\n".format(lVal[iKey], nOcc)) @@ -240,12 +240,12 @@ _parseNodes (oNode) { // Warning: recursive method if (oNode.pos > 0) { return; } - oNode.setPos(); - this.lSortedNodes.append(oNode); + //oNode.setPos(); // version 2 + this.lSortedNodes.push(oNode); for (let oNextNode of oNode.arcs.values()) { this._parseNodes(oNextNode); } } Index: graphspell-js/ibdawg.js ================================================================== --- graphspell-js/ibdawg.js +++ graphspell-js/ibdawg.js @@ -122,13 +122,13 @@ this.dChar = helpers.objectToMap(this.dChar); this.dCharVal = this.dChar.gl_reverse(); //this.byDic = new Uint8Array(this.byDic); // not quicker, even slower if (this.cStemming == "S") { - this.funcStemming = str_transform.getStemFromSuffixCode; + this.funcStemming = str_transform.changeWordWithSuffixCode; } else if (this.cStemming == "A") { - this.funcStemming = str_transform.getStemFromAffixCode; + this.funcStemming = str_transform.changeWordWithAffixCode; } else { this.funcStemming = str_transform.noStemming; } // Configuring DAWG functions according to nVersion Index: graphspell-js/str_transform.js ================================================================== --- graphspell-js/str_transform.js +++ graphspell-js/str_transform.js @@ -1,11 +1,52 @@ //// STRING TRANSFORMATION /*jslint esversion: 6*/ + +"use strict"; + // Note: 48 is the ASCII code for "0" var str_transform = { + + longestCommonSubstring: function (string1, string2) { + // https://en.wikibooks.org/wiki/Algorithm_Implementation/Strings/Longest_common_substring + // untested + + // init max value + let longestCommonSubstring = 0; + // init 2D array with 0 + let table = [], + len1 = string1.length, + len2 = string2.length, + row, col; + for (row = 0; row <= len1; row++) { + table[row] = []; + for (col = 0; col <= len2; col++) { + table[row][col] = 0; + } + } + // fill table + let i, j; + for (i = 0; i < len1; i++) { + for (j = 0; j < len2; j++) { + if (string1[i] === string2[j]) { + if (table[i][j] === 0){ + table[i+1][j+1] = 1; + } else { + table[i+1][j+1] = table[i][j] + 1; + } + if (table[i+1][j+1] > longestCommonSubstring) { + longestCommonSubstring = table[i+1][j+1]; + } + } else { + table[i+1][j+1] = 0; + } + } + } + return longestCommonSubstring; + }, distanceDamerauLevenshtein2: function (s1, s2) { // distance of Damerau-Levenshtein between and // https://fr.wikipedia.org/wiki/Distance_de_Damerau-Levenshtein try { @@ -115,32 +156,63 @@ jSfx += 1; } return String.fromCharCode(sFlex.length-jSfx+48) + sStem.slice(jSfx); }, - getStemFromSuffixCode: function (sFlex, sSfxCode) { - // Suffix only + changeWordWithSuffixCode: function (sWord, sSfxCode) { if (sSfxCode == "0") { - return sFlex; + return sWord; } - return sSfxCode[0] == '0' ? sFlex + sSfxCode.slice(1) : sFlex.slice(0, -(sSfxCode.charCodeAt(0)-48)) + sSfxCode.slice(1); + return sSfxCode[0] == '0' ? sWord + sSfxCode.slice(1) : sWord.slice(0, -(sSfxCode.charCodeAt(0)-48)) + sSfxCode.slice(1); }, - getStemFromAffixCode: function (sFlex, sAffCode) { - // Prefix and suffix + // Prefix and suffix + defineAffixCode: function (sFlex, sStem) { + /* + UNTESTED! + Returns a string defining how to get stem from flexion. Examples: + "0" if stem = flexion + "stem" if no common substring + "n(pfx)/m(sfx)" + with n and m: chars with numeric meaning, "0" = 0, "1" = 1, ... ":" = 10, etc. (See ASCII table.) Says how many letters to strip from flexion. + pfx [optional]: string to add before the flexion + sfx [optional]: string to add after the flexion + */ + if (sFlex == sStem) { + return "0"; + } + // is stem a substring of flexion? + let n = sFlex.indexOf(sStem); + if (n >= 0) { + return String.fromCharCode(n+48) + "/" + String.fromCharCode(sFlex.length-(sStem.length+n)+48); + } + // no, so we are looking for common substring + let sSubs = this.longestCommonSubstring(sFlex, sStem); + if (sSubs.length > 1) { + let iPos = sStem.indexOf(sSubs); + let sPfx = sStem.slice(0, iPos); + let sSfx = sStem.slice(iPos+sSubs.length); + let n = sFlex.indexOf(sSubs); + let m = sFlex.length - (sSubs.length+n); + return String.fromCharCode(n+48) + sPfx + "/" + String.fromCharCode(m+48) + sSfx; + } + return sStem; + }, + + changeWordWithAffixCode: function (sWord, sAffCode) { if (sAffCode == "0") { - return sFlex; + return sWord; } if (!sAffCode.includes("/")) { - return "# error #"; + return sAffCode; } let [sPfxCode, sSfxCode] = sAffCode.split('/'); - sFlex = sPfxCode.slice(1) + sFlex.slice(sPfxCode.charCodeAt(0)-48); - return sSfxCode[0] == '0' ? sFlex + sSfxCode.slice(1) : sFlex.slice(0, -(sSfxCode.charCodeAt(0)-48)) + sSfxCode.slice(1); + sWord = sPfxCode.slice(1) + sWord.slice(sPfxCode.charCodeAt(0)-48); + return sSfxCode[0] == '0' ? sWord + sSfxCode.slice(1) : sWord.slice(0, -(sSfxCode.charCodeAt(0)-48)) + sSfxCode.slice(1); } }; if (typeof(exports) !== 'undefined') { exports.getStemFromSuffixCode = str_transform.getStemFromSuffixCode; exports.getStemFromAffixCode = str_transform.getStemFromAffixCode; } Index: graphspell/str_transform.py ================================================================== --- graphspell/str_transform.py +++ graphspell/str_transform.py @@ -184,20 +184,18 @@ iPos = sStem.find(sSubs) sPfx = sStem[:iPos] sSfx = sStem[iPos+len(sSubs):] n = sFlex.find(sSubs) m = len(sFlex) - (len(sSubs)+n) - sAff = "{}/".format(chr(n+48)) if not sPfx else "{}{}/".format(chr(n+48), sPfx) - sAff += chr(m+48) if not sSfx else "{}{}".format(chr(m+48), sSfx) - return sAff + return chr(n+48) + sPfx + "/" + chr(m+48) + sSfx return sStem def changeWordWithAffixCode (sWord, sAffCode): if sAffCode == "0": return sWord if '/' not in sAffCode: - return "# error #" + return sAffCode sPfxCode, sSfxCode = sAffCode.split('/') sWord = sPfxCode[1:] + sWord[(ord(sPfxCode[0])-48):] return sWord[:-(ord(sSfxCode[0])-48)] + sSfxCode[1:] if sSfxCode[0] != '0' else sWord + sSfxCode[1:]