Index: gc_lang/fr/modules/tests_modules.py ================================================================== --- gc_lang/fr/modules/tests_modules.py +++ gc_lang/fr/modules/tests_modules.py @@ -20,11 +20,11 @@ start = time.perf_counter() try: yield finally: end = time.perf_counter() - print('{} : {}'.format(label, end - start)) + print('{:<20} : {}'.format(label, end - start)) if hDst: hDst.write("{:<12.6}".format(end-start)) class TestDictionary (unittest.TestCase): @@ -49,11 +49,11 @@ def test_isvalid_failed (self): for sWord in ["BranchE", "BRanche", "BRAnCHE", "émilie", "éMILIE", "émiLie"]: self.assertFalse(self.oDic.isValid(sWord), sWord) def test_suggest (self): - for sWord in ["déelirranttesss", "vallidasion", "Emilie", "exibission"]: + for sWord in ["déelirranttesss", "vallidasion", "Emilie", "exibission", "ditirembique", "jai", "email"]: with timeblock(sWord): self.assertNotEqual(0, self.oDic.suggest(sWord)) class TestConjugation (unittest.TestCase): Index: gc_lang/fr/perf_memo.text ================================================================== --- gc_lang/fr/perf_memo.text +++ gc_lang/fr/perf_memo.text @@ -28,7 +28,8 @@ 1.1 2019.05.16 09:42 1.50743 0.360923 0.261113 0.0749272 0.0763827 0.0771537 0.180504 0.102942 0.0182762 0.0021925 (×2, but new processor: AMD Ryzen 7 2700X) 1.2.1 2019.08.06 20:57 1.42886 0.358425 0.247356 0.0704405 0.0754886 0.0765604 0.177197 0.0988517 0.0188103 0.0020243 1.6.0 2020.01.03 20:22 1.38847 0.346214 0.240242 0.0709539 0.0737499 0.0748733 0.176477 0.0969171 0.0187857 0.0025143 (nouveau dictionnaire avec lemmes masculins) 1.9.0 2020.04.20 19:57 1.51183 0.369546 0.25681 0.0734314 0.0764396 0.0785668 0.183922 0.103674 0.0185812 0.002099 (NFC normalization) 1.9.2 2020.05.12 08:43 1.62465 0.398831 0.273012 0.0810811 0.080937 0.0845885 0.204133 0.114146 0.0212864 0.0029547 -1.12.2 2020.09.09 13:34 1.50568 0.374504 0.233108 0.0798712 0.0804466 0.0769674 0.171519 0.0945132 0.0165344 0.0019474 -1.12.2 2020.09.09 13:35 1.41094 0.359093 0.236443 0.06968 0.0734418 0.0738087 0.169371 0.0946279 0.0167106 0.0019773 +1.12.2 2020.09.09 13:34 1.50568 0.374504 0.233108 0.0798712 0.0804466 0.0769674 0.171519 0.0945132 0.0165344 0.0019474 +1.12.2 2020.09.09 13:35 1.41094 0.359093 0.236443 0.06968 0.0734418 0.0738087 0.169371 0.0946279 0.0167106 0.0019773 +1.12.2 2020.09.11 19:16 1.35297 0.330545 0.221731 0.0666998 0.0692539 0.0701707 0.160564 0.0891676 0.015807 0.0045998 Index: gc_lang/fr/webext/gce_worker.js ================================================================== --- gc_lang/fr/webext/gce_worker.js +++ gc_lang/fr/webext/gce_worker.js @@ -183,11 +183,11 @@ if (!(dOptions instanceof Map)) { dOptions = helpers.objectToMap(dOptions); } gc_engine.setOptions(dOptions); } - //tests(); + tests(); bInitDone = true; } else { console.log("[Worker] Already initialized…") } // we always retrieve options from the gc_engine, for setOptions filters obsolete options @@ -290,17 +290,26 @@ let dOptions = helpers.mapToObject(gc_engine.getOptions()); postMessage(createResponse("resetOptions", dOptions, oInfo, true)); } function tests () { - console.log(conj.getConj("devenir", ":E", ":2s")); + /*console.log(conj.getConj("devenir", ":E", ":2s")); console.log(mfsp.getMasForm("emmerdeuse", true)); console.log(mfsp.getMasForm("pointilleuse", false)); console.log(phonet.getSimil("est")); let aRes = gc_engine.parse("Je suit..."); for (let oErr of aRes) { console.log(text.getReadableError(oErr)); + }*/ + for (let sWord of ["fatiqué", "coeur", "trèèèèèèèèès", "vraaaaiiiimeeeeennnt", "apele", "email", "Co2", "emmppâiiiller", "testt", "apelaion", "exsepttion", "sintaxik", "ebriete", "ennormmement"]) { + console.time("Suggestions for " + sWord); + for (let aSugg of oSpellChecker.suggest(sWord)) { + if (aSugg.length) { + console.log(sWord + " -> ", aSugg.join(" ")); + } + } + console.timeEnd("Suggestions for " + sWord); } } function textToTest (sText, sCountry, bDebug, bContext, oInfo={}) { if (!gc_engine) { Index: graphspell-js/ibdawg.js ================================================================== --- graphspell-js/ibdawg.js +++ graphspell-js/ibdawg.js @@ -122,24 +122,10 @@ Properties: sName, nCompressionMethod, sHeader, lArcVal, nArcVal, sByDic, sLang, nChar, nBytesArc, nBytesNodeAddress, nEntry, nNode, nArc, nAff, cStemming, nTag, dChar, nBytesOffset, */ - /* - Bug workaround. - Mozilla’s JS parser sucks. Can’t read file bigger than 4 Mb! - So we convert huge hexadecimal string to list of numbers… - https://github.com/mozilla/addons-linter/issues/1361 - */ - let lTemp = []; - for (let i = 0; i < this.sByDic.length; i+=2) { - lTemp.push(parseInt(this.sByDic.slice(i, i+2), 16)); - } - this.byDic = lTemp; - //this.byDic = new Uint8Array(lTemp); // not quicker, even slower - /* end of bug workaround */ - if (!(this.sHeader.startsWith("/grammalecte-fsa/") || this.sHeader.startsWith("/pyfsa/"))) { throw TypeError("# Error. Not a grammalecte-fsa binary dictionary. Header: " + this.sHeader); } if (!(this.nCompressionMethod == 1 || this.nCompressionMethod == 2 || this.nCompressionMethod == 3)) { throw RangeError("# Error. Unknown dictionary compression method: " + this.nCompressionMethod); @@ -155,41 +141,45 @@ this.funcStemming = str_transform.changeWordWithAffixCode; } else { this.funcStemming = str_transform.noStemming; } + /* + Bug workaround. + Mozilla’s JS parser sucks. Can’t read file bigger than 4 Mb! + So we convert huge hexadecimal string to list of numbers… + https://github.com/mozilla/addons-linter/issues/1361 + */ + /* + Performance trick: + Instead of converting bytes to integers each times we parse the binary dictionary, + we do it once, then parse the array + */ + let nAcc = 0; + let lBytesBuffer = []; + let lTemp = []; + let nDivisor = (this.nBytesArc + this.nBytesNodeAddress) / 2; + for (let i = 0; i < this.sByDic.length; i+=2) { + lBytesBuffer.push(parseInt(this.sByDic.slice(i, i+2), 16)); + if (nAcc == (this.nBytesArc - 1)) { + lTemp.push(this._convBytesToInteger(lBytesBuffer)); + lBytesBuffer = []; + } + else if (nAcc == (this.nBytesArc + this.nBytesNodeAddress - 1)) { + lTemp.push(Math.round(this._convBytesToInteger(lBytesBuffer) / nDivisor)); // Math.round should be useless, BUT with JS who knowns what can happen… + lBytesBuffer = []; + nAcc = -1; + } + nAcc = nAcc + 1; + } + this.byDic = lTemp; + /* end of bug workaround */ + this._arcMask = (2 ** ((this.nBytesArc * 8) - 3)) - 1; this._finalNodeMask = 1 << ((this.nBytesArc * 8) - 1); this._lastArcMask = 1 << ((this.nBytesArc * 8) - 2); - - // Configuring DAWG functions according to nCompressionMethod - switch (this.nCompressionMethod) { - case 1: - this.morph = this._morph1; - this.stem = this._stem1; - this._lookupArcNode = this._lookupArcNode1; - this._getArcs = this._getArcs1; - this._writeNodes = this._writeNodes1; - break; - case 2: - this.morph = this._morph2; - this.stem = this._stem2; - this._lookupArcNode = this._lookupArcNode2; - this._getArcs = this._getArcs2; - this._writeNodes = this._writeNodes2; - break; - case 3: - this.morph = this._morph3; - this.stem = this._stem3; - this._lookupArcNode = this._lookupArcNode3; - this._getArcs = this._getArcs3; - this._writeNodes = this._writeNodes3; - break; - default: - throw ValueError("# Error: unknown code: " + this.nCompressionMethod); - } //console.log(this.getInfo()); this.bAcronymValid = true; this.bNumAtLastValid = false; // lexicographer module ? @@ -196,11 +186,10 @@ this.lexicographer = null; // JS still sucks: we’ll try importation when importation will be available in Workers. Still waiting... if (self && self.hasOwnProperty("lexgraph_"+this.sLangCode)) { // self is the Worker this.lexicographer = self["lexgraph_"+this.sLangCode]; } - } getInfo () { return ` Language: ${this.sLangName} Lang code: ${this.sLangCode} Dictionary name: ${this.sDicName}\n` + ` Compression method: ${this.nCompressionMethod} Date: ${this.sDate} Stemming: ${this.cStemming}FX\n` + @@ -306,24 +295,24 @@ iAddr = this._lookupArcNode(this.dChar.get(c), iAddr); if (iAddr === null) { return false; } } - return Boolean(this._convBytesToInteger(this.byDic.slice(iAddr, iAddr+this.nBytesArc)) & this._finalNodeMask); + return Boolean(this.byDic[iAddr] & this._finalNodeMask); } getMorph (sWord) { // retrieves morphologies list, different casing allowed if (!sWord) { return []; } sWord = str_transform.spellingNormalization(sWord); - let l = this.morph(sWord); + let l = this._morph(sWord); if (sWord[0].gl_isUpperCase()) { - l.push(...this.morph(sWord.toLowerCase())); + l.push(...this._morph(sWord.toLowerCase())); if (sWord.gl_isUpperCase() && sWord.length > 1) { - l.push(...this.morph(sWord.gl_toCapitalize())); + l.push(...this._morph(sWord.gl_toCapitalize())); } } return l; } @@ -378,11 +367,11 @@ } _suggest (oSuggResult, sRemain, nMaxSwitch=0, nMaxDel=0, nMaxHardRepl=0, nMaxJump=0, nDist=0, nDeep=0, iAddr=0, sNewWord="", bAvoidLoop=false) { // returns a set of suggestions // recursive function - if (this._convBytesToInteger(this.byDic.slice(iAddr, iAddr+this.nBytesArc)) & this._finalNodeMask) { + if (this.byDic[iAddr] & this._finalNodeMask) { if (sRemain == "") { oSuggResult.addSugg(sNewWord); for (let sTail of this._getTails(iAddr)) { oSuggResult.addSugg(sNewWord+sTail); } @@ -488,11 +477,11 @@ _getTails (iAddr, sTail="", n=2) { // return a list of suffixes ending at a distance of from let aTails = new Set(); for (let [nVal, jAddr] of this._getArcs(iAddr)) { if (nVal <= this.nChar) { - if (this._convBytesToInteger(this.byDic.slice(jAddr, jAddr+this.nBytesArc)) & this._finalNodeMask) { + if (this.byDic[jAddr] & this._finalNodeMask) { aTails.add(sTail + this.dCharVal.get(nVal)); } if (n && aTails.size == 0) { aTails.gl_update(this._getTails(jAddr, sTail+this.dCharVal.get(nVal), n-1)); } @@ -499,14 +488,10 @@ } } return aTails; } - // morph (sWord) { - // is defined in constructor - // } - getSimilarEntries (sWord, nSuggLimit=10) { // return a list of tuples (similar word, stem, morphology) if (sWord == "") { return []; } @@ -530,35 +515,33 @@ } catch (e) { console.log("Error in regex pattern"); console.log(e.message); } - yield* this._select1(zFlexPattern, zTagsPattern, 0, ""); + yield* this._select(zFlexPattern, zTagsPattern, 0, ""); } - // VERSION 1 - - * _select1 (zFlexPattern, zTagsPattern, iAddr, sWord) { + * _select (zFlexPattern, zTagsPattern, iAddr, sWord) { // recursive generator - for (let [nVal, jAddr] of this._getArcs1(iAddr)) { + for (let [nVal, jAddr] of this._getArcs(iAddr)) { if (nVal <= this.nChar) { // simple character - yield* this._select1(zFlexPattern, zTagsPattern, jAddr, sWord + this.lArcVal[nVal]); + yield* this._select(zFlexPattern, zTagsPattern, jAddr, sWord + this.lArcVal[nVal]); } else { if (!zFlexPattern || zFlexPattern.test(sWord)) { let sStem = this.funcStemming(sWord, this.lArcVal[nVal]); - for (let [nMorphVal, _] of this._getArcs1(jAddr)) { + for (let [nMorphVal, _] of this._getArcs(jAddr)) { if (!zTagsPattern || zTagsPattern.test(this.lArcVal[nMorphVal])) { yield [sWord, sStem, this.lArcVal[nMorphVal]]; } } } } } } - _morph1 (sWord) { + _morph (sWord) { // returns morphologies of sWord let iAddr = 0; for (let c of sWord) { if (!this.dChar.has(c)) { return []; @@ -566,38 +549,38 @@ iAddr = this._lookupArcNode(this.dChar.get(c), iAddr); if (iAddr === null) { return []; } } - if (this._convBytesToInteger(this.byDic.slice(iAddr, iAddr+this.nBytesArc)) & this._finalNodeMask) { + if (this.byDic[iAddr] & this._finalNodeMask) { let l = []; let nRawArc = 0; while (!(nRawArc & this._lastArcMask)) { - let iEndArcAddr = iAddr + this.nBytesArc; - nRawArc = this._convBytesToInteger(this.byDic.slice(iAddr, iEndArcAddr)); + let iEndArcAddr = iAddr + 1; + nRawArc = this.byDic[iAddr]; let nArc = nRawArc & this._arcMask; if (nArc > this.nChar) { // This value is not a char, this is a stemming code let sStem = ">" + this.funcStemming(sWord, this.lArcVal[nArc]); // Now , we go to the next node and retrieve all following arcs values, all of them are tags - let iAddr2 = this._convBytesToInteger(this.byDic.slice(iEndArcAddr, iEndArcAddr+this.nBytesNodeAddress)); + let iAddr2 = this.byDic[iEndArcAddr]; let nRawArc2 = 0; while (!(nRawArc2 & this._lastArcMask)) { - let iEndArcAddr2 = iAddr2 + this.nBytesArc; - nRawArc2 = this._convBytesToInteger(this.byDic.slice(iAddr2, iEndArcAddr2)); + let iEndArcAddr2 = iAddr2 + 1; + nRawArc2 = this.byDic[iAddr2]; l.push(sStem + "/" + this.lArcVal[nRawArc2 & this._arcMask]); - iAddr2 = iEndArcAddr2+this.nBytesNodeAddress; + iAddr2 = iEndArcAddr2 + 1; } } - iAddr = iEndArcAddr + this.nBytesNodeAddress; + iAddr = iEndArcAddr + 1; } return l; } return []; } - _stem1 (sWord) { + _stem (sWord) { // returns stems list of sWord let iAddr = 0; for (let c of sWord) { if (!this.dChar.has(c)) { return []; @@ -605,88 +588,61 @@ iAddr = this._lookupArcNode(this.dChar.get(c), iAddr); if (iAddr === null) { return []; } } - if (this._convBytesToInteger(this.byDic.slice(iAddr, iAddr+this.nBytesArc)) & this._finalNodeMask) { + if (this.byDic[iAddr] & this._finalNodeMask) { let l = []; let nRawArc = 0; while (!(nRawArc & this._lastArcMask)) { - let iEndArcAddr = iAddr + this.nBytesArc; - nRawArc = this._convBytesToInteger(this.byDic.slice(iAddr, iEndArcAddr)); + let iEndArcAddr = iAddr + 1; + nRawArc = this.byDic[iAddr]; let nArc = nRawArc & this._arcMask; if (nArc > this.nChar) { // This value is not a char, this is a stemming code l.push(this.funcStemming(sWord, this.lArcVal[nArc])); } - iAddr = iEndArcAddr + this.nBytesNodeAddress; + iAddr = iEndArcAddr + 1; } return l; } return []; } - _lookupArcNode1 (nVal, iAddr) { + _lookupArcNode (nVal, iAddr) { // looks if nVal is an arc at the node at iAddr, if yes, returns address of next node else None while (true) { - let iEndArcAddr = iAddr+this.nBytesArc; - let nRawArc = this._convBytesToInteger(this.byDic.slice(iAddr, iEndArcAddr)); + let iEndArcAddr = iAddr+1; + let nRawArc = this.byDic[iAddr]; if (nVal == (nRawArc & this._arcMask)) { // the value we are looking for // we return the address of the next node - return this._convBytesToInteger(this.byDic.slice(iEndArcAddr, iEndArcAddr+this.nBytesNodeAddress)); + return this.byDic[iEndArcAddr]; } else { // value not found if (nRawArc & this._lastArcMask) { return null; } - iAddr = iEndArcAddr + this.nBytesNodeAddress; - } - } - } - - * _getArcs1 (iAddr) { - // generator: return all arcs at as tuples of (nVal, iAddr) - while (true) { - let iEndArcAddr = iAddr+this.nBytesArc; - let nRawArc = this._convBytesToInteger(this.byDic.slice(iAddr, iEndArcAddr)); - yield [nRawArc & this._arcMask, this._convBytesToInteger(this.byDic.slice(iEndArcAddr, iEndArcAddr+this.nBytesNodeAddress))]; - if (nRawArc & this._lastArcMask) { - break; - } - iAddr = iEndArcAddr+this.nBytesNodeAddress; - } - } - - // VERSION 2 - _morph2 (sWord) { - // to do - } - - _stem2 (sWord) { - // to do - } - - _lookupArcNode2 (nVal, iAddr) { - // to do - } - - - // VERSION 3 - _morph3 (sWord) { - // to do - } - - _stem3 (sWord) { - // to do - } - - _lookupArcNode3 (nVal, iAddr) { - // to do + iAddr = iEndArcAddr + 1; + } + } + } + + * _getArcs (iAddr) { + // generator: return all arcs at as tuples of (nVal, iAddr) + while (true) { + let iEndArcAddr = iAddr+1; + let nRawArc = this.byDic[iAddr]; + yield [nRawArc & this._arcMask, this.byDic[iEndArcAddr]]; + if (nRawArc & this._lastArcMask) { + break; + } + iAddr = iEndArcAddr+1; + } } } if (typeof(exports) !== 'undefined') { exports.IBDAWG = IBDAWG; } Index: graphspell-js/str_transform.js ================================================================== --- graphspell-js/str_transform.js +++ graphspell-js/str_transform.js @@ -9,10 +9,11 @@ if (typeof(process) !== 'undefined') { var char_player = require("./char_player.js"); } + // Note: 48 is the ASCII code for "0" var str_transform = { @@ -150,10 +151,101 @@ } catch (e) { console.error(e); } }, + + distanceJaroWinkler: function(a, b, boost = .666) { + // https://github.com/thsig/jaro-winkler-JS + //if (a == b) { return 1.0; } + let a_len = a.length; + let b_len = b.length; + let a_flag = []; + let b_flag = []; + let search_range = Math.floor(Math.max(a_len, b_len) / 2) - 1; + let minv = Math.min(a_len, b_len); + + // Looking only within the search range, count and flag the matched pairs. + let Num_com = 0; + let yl1 = b_len - 1; + for (let i = 0; i < a_len; i++) { + let lowlim = (i >= search_range) ? i - search_range : 0; + let hilim = ((i + search_range) <= yl1) ? (i + search_range) : yl1; + for (let j = lowlim; j <= hilim; j++) { + if (b_flag[j] !== 1 && a[j] === b[i]) { + a_flag[j] = 1; + b_flag[i] = 1; + Num_com++; + break; + } + } + } + + // Return if no characters in common + if (Num_com === 0) { return 0.0; } + + // Count the number of transpositions + let k = 0; + let N_trans = 0; + for (let i = 0; i < a_len; i++) { + if (a_flag[i] === 1) { + let j; + for (j = k; j < b_len; j++) { + if (b_flag[j] === 1) { + k = j + 1; + break; + } + } + if (a[i] !== b[j]) { N_trans++; } + } + } + N_trans = Math.floor(N_trans / 2); + + // Adjust for similarities in nonmatched characters + let N_simi = 0; + let adjwt = char_player.oDistanceBetweenChars; + if (minv > Num_com) { + for (let i = 0; i < a_len; i++) { + if (!a_flag[i]) { + for (let j = 0; j < b_len; j++) { + if (!b_flag[j]) { + if (adjwt[a[i]] && adjwt[a[i]][b[j]]) { + N_simi += adjwt[a[i]][b[j]] * 10; // le fois 10 est un ajustement pour que ça fonctionne bien dans cette fonction + b_flag[j] = 2; + break; + } + } + } + } + } + } + + let Num_sim = (N_simi / 10.0) + Num_com; + + // Main weight computation + let weight = Num_sim / a_len + Num_sim / b_len + (Num_com - N_trans) / Num_com; + weight = weight / 3; + + // Continue to boost the weight if the strings are similar + if (weight > boost) { + // Adjust for having up to the first 4 characters in common + let j = (minv >= 4) ? 4 : minv; + let i; + for (i = 0; (i < j) && a[i] === b[i]; i++) { } + if (i) { weight += i * 0.1 * (1.0 - weight) }; + + // Adjust for long strings. + // After agreeing beginning chars, at least two more must agree + // and the agreeing characters must be more than half of the + // remaining characters. + if (minv > 4 && Num_com > i + 1 && 2 * Num_com >= minv + i) { + weight += (1 - weight) * ((Num_com - i - 1) / (a_len * b_len - i*2 + 2)); + } + } + + return weight; + }, showDistance (s1, s2) { console.log(`Distance: ${s1} / ${s2} = ${this.distanceDamerauLevenshtein(s1, s2)})`); }, Index: graphspell/ibdawg.py ================================================================== --- graphspell/ibdawg.py +++ graphspell/ibdawg.py @@ -118,45 +118,42 @@ else: self._initJSON(source) self.sFileName = source if isinstance(source, str) else "[None]" + # Performance trick: + # Instead of converting bytes to integers each times we parse the binary dictionary, + # we do it once, then parse the array + nAcc = 0 + byBuffer = b"" + lTemp = [] + nDivisor = (self.nBytesArc + self.nBytesNodeAddress) / 2 + for i in range(0, len(self.byDic)): + byBuffer += self.byDic[i:i+1] + if nAcc == (self.nBytesArc - 1): + lTemp.append(int.from_bytes(byBuffer, byteorder="big")) + byBuffer = b"" + elif nAcc == (self.nBytesArc + self.nBytesNodeAddress - 1): + lTemp.append(round(int.from_bytes(byBuffer, byteorder="big") / nDivisor)) + byBuffer = b"" + nAcc = -1 + nAcc = nAcc + 1 + self.byDic = lTemp; + + # masks self._arcMask = (2 ** ((self.nBytesArc * 8) - 3)) - 1 self._finalNodeMask = 1 << ((self.nBytesArc * 8) - 1) self._lastArcMask = 1 << ((self.nBytesArc * 8) - 2) - self._addrBitMask = 1 << ((self.nBytesArc * 8) - 3) # version 2 # function to decode the affix/suffix code if self.cStemming == "S": self.funcStemming = st.changeWordWithSuffixCode elif self.cStemming == "A": self.funcStemming = st.changeWordWithAffixCode else: self.funcStemming = st.noStemming - # Configuring DAWG functions according to nCompressionMethod - if self.nCompressionMethod == 1: - self.morph = self._morph1 - self.stem = self._stem1 - self._lookupArcNode = self._lookupArcNode1 - self._getArcs = self._getArcs1 - self._writeNodes = self._writeNodes1 - elif self.nCompressionMethod == 2: - self.morph = self._morph2 - self.stem = self._stem2 - self._lookupArcNode = self._lookupArcNode2 - self._getArcs = self._getArcs2 - self._writeNodes = self._writeNodes2 - elif self.nCompressionMethod == 3: - self.morph = self._morph3 - self.stem = self._stem3 - self._lookupArcNode = self._lookupArcNode3 - self._getArcs = self._getArcs3 - self._writeNodes = self._writeNodes3 - else: - raise ValueError(" # Error: unknown code: {}".format(self.nCompressionMethod)) - self.bAcronymValid = False self.bNumAtLastValid = False # lexicographer module ? self.lexicographer = None @@ -202,11 +199,10 @@ # to get the value of an arc, to get the char of an arc with its value self.dChar = {} for i in range(1, self.nChar+1): self.dChar[self.lArcVal[i]] = i self.dCharVal = { v: k for k, v in self.dChar.items() } - self.nBytesOffset = 1 # version 3 def _initJSON (self, oJSON): "initialize with a JSON text file" self.sByDic = "" # init to prevent pylint whining self.__dict__.update(oJSON) @@ -246,11 +242,10 @@ "nArcVal": self.nArcVal, "lArcVal": self.lArcVal, "nCompressionMethod": self.nCompressionMethod, "nBytesArc": self.nBytesArc, "nBytesNodeAddress": self.nBytesNodeAddress, - "nBytesOffset": self.nBytesOffset, # JavaScript is a pile of shit, so Mozilla’s JS parser don’t like file bigger than 4 Mb! # So, if necessary, we use an hexadecimal string, that we will convert later in Firefox’s extension. # https://github.com/mozilla/addons-linter/issues/1361 "sByDic": self.byDic.hex() if bBinaryDictAsHexString else [ e for e in self.byDic ], "l2grams": list(self.a2grams) @@ -298,22 +293,22 @@ if c not in self.dChar: return False iAddr = self._lookupArcNode(self.dChar[c], iAddr) if iAddr is None: return False - return bool(int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask) + return bool(self.byDic[iAddr] & self._finalNodeMask) def getMorph (self, sWord): "retrieves morphologies list, different casing allowed" if not sWord: return [] sWord = st.spellingNormalization(sWord) - l = self.morph(sWord) + l = self._morph(sWord) if sWord[0:1].isupper(): - l.extend(self.morph(sWord.lower())) + l.extend(self._morph(sWord.lower())) if sWord.isupper() and len(sWord) > 1: - l.extend(self.morph(sWord.capitalize())) + l.extend(self._morph(sWord.capitalize())) return l #@timethis def suggest (self, sWord, nSuggLimit=10, bSplitTrailingNumbers=False): "returns a set of suggestions for " @@ -354,11 +349,11 @@ oSuggResult.addSugg(sWord1+" "+sWord2) def _suggest (self, oSuggResult, sRemain, nMaxSwitch=0, nMaxDel=0, nMaxHardRepl=0, nMaxJump=0, nDist=0, nDeep=0, iAddr=0, sNewWord="", bAvoidLoop=False): # recursive function #logging.info((nDeep * " ") + sNewWord + ":" + sRemain) - if int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask: + if self.byDic[iAddr] & self._finalNodeMask: if not sRemain: oSuggResult.addSugg(sNewWord, nDeep) for sTail in self._getTails(iAddr): oSuggResult.addSugg(sNewWord+sTail, nDeep) return @@ -421,11 +416,11 @@ def _getTails (self, iAddr, sTail="", n=2): "return a list of suffixes ending at a distance of from " aTails = set() for nVal, jAddr in self._getArcs(iAddr): if nVal <= self.nChar: - if int.from_bytes(self.byDic[jAddr:jAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask: + if self.byDic[jAddr] & self._finalNodeMask: aTails.add(sTail + self.dCharVal[nVal]) if n and not aTails: aTails.update(self._getTails(jAddr, sTail+self.dCharVal[nVal], n-1)) return aTails @@ -469,330 +464,113 @@ if sTagsPattern: zTagsPattern = re.compile(sTagsPattern) except re.error: print("# Error in regex pattern") traceback.print_exc() - yield from self._select1(zFlexPattern, zTagsPattern, 0, "") - - # def morph (self, sWord): - # is defined in __init__ - - # VERSION 1 - def _select1 (self, zFlexPattern, zTagsPattern, iAddr, sWord): - # recursive generator - for nVal, jAddr in self._getArcs1(iAddr): + yield from self._select(zFlexPattern, zTagsPattern, 0, "") + + def _select (self, zFlexPattern, zTagsPattern, iAddr, sWord): + # recursive generator + for nVal, jAddr in self._getArcs(iAddr): if nVal <= self.nChar: # simple character - yield from self._select1(zFlexPattern, zTagsPattern, jAddr, sWord + self.lArcVal[nVal]) + yield from self._select(zFlexPattern, zTagsPattern, jAddr, sWord + self.lArcVal[nVal]) else: if not zFlexPattern or zFlexPattern.search(sWord): sStem = self.funcStemming(sWord, self.lArcVal[nVal]) - for nMorphVal, _ in self._getArcs1(jAddr): + for nMorphVal, _ in self._getArcs(jAddr): if not zTagsPattern or zTagsPattern.search(self.lArcVal[nMorphVal]): yield [sWord, sStem, self.lArcVal[nMorphVal]] - def _morph1 (self, sWord): + def _morph (self, sWord): "returns morphologies of " iAddr = 0 for c in sWord: if c not in self.dChar: return [] iAddr = self._lookupArcNode(self.dChar[c], iAddr) if iAddr is None: return [] - if int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask: + if self.byDic[iAddr] & self._finalNodeMask: l = [] nRawArc = 0 while not nRawArc & self._lastArcMask: - iEndArcAddr = iAddr + self.nBytesArc - nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') + iEndArcAddr = iAddr + 1 + nRawArc = self.byDic[iAddr] nArc = nRawArc & self._arcMask if nArc > self.nChar: # This value is not a char, this is a stemming code sStem = ">" + self.funcStemming(sWord, self.lArcVal[nArc]) # Now , we go to the next node and retrieve all following arcs values, all of them are tags - iAddr2 = int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big') + iAddr2 = self.byDic[iEndArcAddr] nRawArc2 = 0 while not nRawArc2 & self._lastArcMask: - iEndArcAddr2 = iAddr2 + self.nBytesArc - nRawArc2 = int.from_bytes(self.byDic[iAddr2:iEndArcAddr2], byteorder='big') + iEndArcAddr2 = iAddr2 + 1 + nRawArc2 = self.byDic[iAddr2] l.append(sStem + "/" + self.lArcVal[nRawArc2 & self._arcMask]) - iAddr2 = iEndArcAddr2+self.nBytesNodeAddress - iAddr = iEndArcAddr+self.nBytesNodeAddress + iAddr2 = iEndArcAddr2 + 1 + iAddr = iEndArcAddr + 1 return l return [] - def _stem1 (self, sWord): + def _stem (self, sWord): "returns stems list of " iAddr = 0 for c in sWord: if c not in self.dChar: return [] iAddr = self._lookupArcNode(self.dChar[c], iAddr) if iAddr is None: return [] - if int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask: + if self.byDic[iAddr] & self._finalNodeMask: l = [] nRawArc = 0 while not nRawArc & self._lastArcMask: - iEndArcAddr = iAddr + self.nBytesArc - nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') + iEndArcAddr = iAddr + 1 + nRawArc = self.byDic[iAddr] nArc = nRawArc & self._arcMask if nArc > self.nChar: # This value is not a char, this is a stemming code l.append(self.funcStemming(sWord, self.lArcVal[nArc])) - iAddr = iEndArcAddr+self.nBytesNodeAddress + iAddr = iEndArcAddr + 1 return l return [] - def _lookupArcNode1 (self, nVal, iAddr): + def _lookupArcNode (self, nVal, iAddr): "looks if is an arc at the node at , if yes, returns address of next node else None" while True: - iEndArcAddr = iAddr+self.nBytesArc - nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') + iEndArcAddr = iAddr + 1 + nRawArc = self.byDic[iAddr] if nVal == (nRawArc & self._arcMask): # the value we are looking for # we return the address of the next node - return int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big') + return self.byDic[iEndArcAddr] # value not found if nRawArc & self._lastArcMask: return None - iAddr = iEndArcAddr+self.nBytesNodeAddress + iAddr = iEndArcAddr + 1 - def _getArcs1 (self, iAddr): + def _getArcs (self, iAddr): "generator: return all arcs at as tuples of (nVal, iAddr)" while True: - iEndArcAddr = iAddr+self.nBytesArc - nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') - yield nRawArc & self._arcMask, int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big') + iEndArcAddr = iAddr + 1 + nRawArc = self.byDic[iAddr] + yield nRawArc & self._arcMask, self.byDic[iEndArcAddr] if nRawArc & self._lastArcMask: break - iAddr = iEndArcAddr+self.nBytesNodeAddress + iAddr = iEndArcAddr + 1 - def _writeNodes1 (self, spfDest): + def _writeNodes (self, spfDest): "for debugging only" print(" > Write binary nodes") with open(spfDest, 'w', 'utf-8', newline="\n") as hDst: iAddr = 0 hDst.write("i{:_>10} -- #{:_>10}\n".format("0", iAddr)) while iAddr < len(self.byDic): - iEndArcAddr = iAddr+self.nBytesArc - nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') - nArc = nRawArc & self._arcMask - hDst.write(" {:<20} {:0>16} i{:>10} #{:_>10}\n".format(self.lArcVal[nArc], bin(nRawArc)[2:], "?", \ - int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], \ - byteorder='big'))) - iAddr = iEndArcAddr+self.nBytesNodeAddress - if (nRawArc & self._lastArcMask) and iAddr < len(self.byDic): - hDst.write("\ni{:_>10} -- #{:_>10}\n".format("?", iAddr)) - hDst.close() - - # VERSION 2 - def _morph2 (self, sWord): - "returns morphologies of " - iAddr = 0 - for c in sWord: - if c not in self.dChar: - return [] - iAddr = self._lookupArcNode(self.dChar[c], iAddr) - if iAddr is None: - return [] - if int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask: - l = [] - nRawArc = 0 - while not nRawArc & self._lastArcMask: - iEndArcAddr = iAddr + self.nBytesArc - nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') - nArc = nRawArc & self._arcMask - if nArc > self.nChar: - # This value is not a char, this is a stemming code - sStem = ">" + self.funcStemming(sWord, self.lArcVal[nArc]) - # Now , we go to the next node and retrieve all following arcs values, all of them are tags - if not nRawArc & self._addrBitMask: - iAddr2 = int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big') - else: - # we go to the end of the node - iAddr2 = iEndArcAddr - while not nRawArc & self._lastArcMask: - nRawArc = int.from_bytes(self.byDic[iAddr2:iAddr2+self.nBytesArc], byteorder='big') - iAddr2 += self.nBytesArc + self.nBytesNodeAddress - nRawArc2 = 0 - while not nRawArc2 & self._lastArcMask: - iEndArcAddr2 = iAddr2 + self.nBytesArc - nRawArc2 = int.from_bytes(self.byDic[iAddr2:iEndArcAddr2], byteorder='big') - l.append(sStem + "/" + self.lArcVal[nRawArc2 & self._arcMask]) - iAddr2 = iEndArcAddr2+self.nBytesNodeAddress if not nRawArc2 & self._addrBitMask else iEndArcAddr2 - iAddr = iEndArcAddr+self.nBytesNodeAddress if not nRawArc & self._addrBitMask else iEndArcAddr - return l - return [] - - def _stem2 (self, sWord): - "returns stems list of " - iAddr = 0 - for c in sWord: - if c not in self.dChar: - return [] - iAddr = self._lookupArcNode(self.dChar[c], iAddr) - if iAddr is None: - return [] - if int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask: - l = [] - nRawArc = 0 - while not nRawArc & self._lastArcMask: - iEndArcAddr = iAddr + self.nBytesArc - nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') - nArc = nRawArc & self._arcMask - if nArc > self.nChar: - # This value is not a char, this is a stemming code - l.append(self.funcStemming(sWord, self.lArcVal[nArc])) - # Now , we go to the next node - if not nRawArc & self._addrBitMask: - iAddr2 = int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big') - else: - # we go to the end of the node - iAddr2 = iEndArcAddr - while not nRawArc & self._lastArcMask: - nRawArc = int.from_bytes(self.byDic[iAddr2:iAddr2+self.nBytesArc], byteorder='big') - iAddr2 += self.nBytesArc + self.nBytesNodeAddress - iAddr = iEndArcAddr+self.nBytesNodeAddress if not nRawArc & self._addrBitMask else iEndArcAddr - return l - return [] - - def _lookupArcNode2 (self, nVal, iAddr): - "looks if is an arc at the node at , if yes, returns address of next node else None" - while True: - iEndArcAddr = iAddr+self.nBytesArc - nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') - if nVal == (nRawArc & self._arcMask): - # the value we are looking for - if not nRawArc & self._addrBitMask: - # we return the address of the next node - return int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big') - # we go to the end of the node - iAddr = iEndArcAddr - while not nRawArc & self._lastArcMask: - nRawArc = int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') - iAddr += self.nBytesArc + self.nBytesNodeAddress if not nRawArc & self._addrBitMask else self.nBytesArc - return iAddr - # value not found - if nRawArc & self._lastArcMask: - return None - iAddr = iEndArcAddr+self.nBytesNodeAddress if not nRawArc & self._addrBitMask else iEndArcAddr - - def _writeNodes2 (self, spfDest): - "for debugging only" - print(" > Write binary nodes") - with open(spfDest, 'w', 'utf-8', newline="\n") as hDst: - iAddr = 0 - hDst.write("i{:_>10} -- #{:_>10}\n".format("0", iAddr)) - while iAddr < len(self.byDic): - iEndArcAddr = iAddr+self.nBytesArc - nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') - nArc = nRawArc & self._arcMask - if not nRawArc & self._addrBitMask: - iNextNodeAddr = int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big') - hDst.write(" {:<20} {:0>16} i{:>10} #{:_>10}\n".format(self.lArcVal[nArc], bin(nRawArc)[2:], "?", iNextNodeAddr)) - iAddr = iEndArcAddr+self.nBytesNodeAddress - else: - hDst.write(" {:<20} {:0>16}\n".format(self.lArcVal[nArc], bin(nRawArc)[2:])) - iAddr = iEndArcAddr - if nRawArc & self._lastArcMask: - hDst.write("\ni{:_>10} -- #{:_>10}\n".format("?", iAddr)) - hDst.close() - - # VERSION 3 - def _morph3 (self, sWord): - "returns morphologies of " - iAddr = 0 - for c in sWord: - if c not in self.dChar: - return [] - iAddr = self._lookupArcNode(self.dChar[c], iAddr) - if iAddr is None: - return [] - if int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask: - l = [] - nRawArc = 0 - iAddrNode = iAddr - while not nRawArc & self._lastArcMask: - iEndArcAddr = iAddr + self.nBytesArc - nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') - nArc = nRawArc & self._arcMask - if nArc > self.nChar: - # This value is not a char, this is a stemming code - sStem = ">" + self.funcStemming(sWord, self.lArcVal[nArc]) - # Now , we go to the next node and retrieve all following arcs values, all of them are tags - if not nRawArc & self._addrBitMask: - iAddr2 = int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big') - else: - iAddr2 = iAddrNode + int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesOffset], byteorder='big') - nRawArc2 = 0 - while not nRawArc2 & self._lastArcMask: - iEndArcAddr2 = iAddr2 + self.nBytesArc - nRawArc2 = int.from_bytes(self.byDic[iAddr2:iEndArcAddr2], byteorder='big') - l.append(sStem + "/" + self.lArcVal[nRawArc2 & self._arcMask]) - iAddr2 = iEndArcAddr2+self.nBytesNodeAddress if not nRawArc2 & self._addrBitMask else iEndArcAddr2+self.nBytesOffset - iAddr = iEndArcAddr+self.nBytesNodeAddress if not nRawArc & self._addrBitMask else iEndArcAddr+self.nBytesOffset - return l - return [] - - def _stem3 (self, sWord): - "returns stems list of " - iAddr = 0 - for c in sWord: - if c not in self.dChar: - return [] - iAddr = self._lookupArcNode(self.dChar[c], iAddr) - if iAddr is None: - return [] - if int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask: - l = [] - nRawArc = 0 - #iAddrNode = iAddr - while not nRawArc & self._lastArcMask: - iEndArcAddr = iAddr + self.nBytesArc - nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') - nArc = nRawArc & self._arcMask - if nArc > self.nChar: - # This value is not a char, this is a stemming code - l.append(self.funcStemming(sWord, self.lArcVal[nArc])) - iAddr = iEndArcAddr+self.nBytesNodeAddress if not nRawArc & self._addrBitMask else iEndArcAddr+self.nBytesOffset - return l - return [] - - def _lookupArcNode3 (self, nVal, iAddr): - "looks if is an arc at the node at , if yes, returns address of next node else None" - iAddrNode = iAddr - while True: - iEndArcAddr = iAddr+self.nBytesArc - nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') - if nVal == (nRawArc & self._arcMask): - # the value we are looking for - if not nRawArc & self._addrBitMask: - return int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big') - return iAddrNode + int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesOffset], byteorder='big') - # value not found - if nRawArc & self._lastArcMask: - return None - iAddr = iEndArcAddr+self.nBytesNodeAddress if not nRawArc & self._addrBitMask else iEndArcAddr+self.nBytesOffset - - def _writeNodes3 (self, spfDest): - "for debugging only" - print(" > Write binary nodes") - with open(spfDest, 'w', 'utf-8', newline="\n") as hDst: - iAddr = 0 - hDst.write("i{:_>10} -- #{:_>10}\n".format("0", iAddr)) - while iAddr < len(self.byDic): - iEndArcAddr = iAddr+self.nBytesArc - nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') - nArc = nRawArc & self._arcMask - if not nRawArc & self._addrBitMask: - iNextNodeAddr = int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big') - hDst.write(" {:<20} {:0>16} i{:>10} #{:_>10}\n".format(self.lArcVal[nArc], bin(nRawArc)[2:], "?", iNextNodeAddr)) - iAddr = iEndArcAddr+self.nBytesNodeAddress - else: - iNextNodeAddr = int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesOffset], byteorder='big') - hDst.write(" {:<20} {:0>16} i{:>10} +{:_>10}\n".format(self.lArcVal[nArc], bin(nRawArc)[2:], "?", iNextNodeAddr)) - iAddr = iEndArcAddr+self.nBytesOffset - if nRawArc & self._lastArcMask: + iEndArcAddr = iAddr + 1 + nRawArc = self.byDic[iAddr] + nArc = nRawArc & self._arcMask + hDst.write(" {:<20} {:0>16} i{:>10} #{:_>10}\n".format(self.lArcVal[nArc], bin(nRawArc)[2:], "?", self.byDic[iEndArcAddr])) + iAddr = iEndArcAddr + 1 + if (nRawArc & self._lastArcMask) and iAddr < len(self.byDic): hDst.write("\ni{:_>10} -- #{:_>10}\n".format("?", iAddr)) hDst.close()