Index: gc_lang/fr/perf_memo.text ================================================================== --- gc_lang/fr/perf_memo.text +++ gc_lang/fr/perf_memo.text @@ -28,7 +28,8 @@ 1.1 2019.05.16 09:42 1.50743 0.360923 0.261113 0.0749272 0.0763827 0.0771537 0.180504 0.102942 0.0182762 0.0021925 (×2, but new processor: AMD Ryzen 7 2700X) 1.2.1 2019.08.06 20:57 1.42886 0.358425 0.247356 0.0704405 0.0754886 0.0765604 0.177197 0.0988517 0.0188103 0.0020243 1.6.0 2020.01.03 20:22 1.38847 0.346214 0.240242 0.0709539 0.0737499 0.0748733 0.176477 0.0969171 0.0187857 0.0025143 (nouveau dictionnaire avec lemmes masculins) 1.9.0 2020.04.20 19:57 1.51183 0.369546 0.25681 0.0734314 0.0764396 0.0785668 0.183922 0.103674 0.0185812 0.002099 (NFC normalization) 1.9.2 2020.05.12 08:43 1.62465 0.398831 0.273012 0.0810811 0.080937 0.0845885 0.204133 0.114146 0.0212864 0.0029547 -1.12.2 2020.09.09 13:34 1.50568 0.374504 0.233108 0.0798712 0.0804466 0.0769674 0.171519 0.0945132 0.0165344 0.0019474 -1.12.2 2020.09.09 13:35 1.41094 0.359093 0.236443 0.06968 0.0734418 0.0738087 0.169371 0.0946279 0.0167106 0.0019773 +1.12.2 2020.09.09 13:34 1.50568 0.374504 0.233108 0.0798712 0.0804466 0.0769674 0.171519 0.0945132 0.0165344 0.0019474 +1.12.2 2020.09.09 13:35 1.41094 0.359093 0.236443 0.06968 0.0734418 0.0738087 0.169371 0.0946279 0.0167106 0.0019773 +1.12.2 2020.09.11 19:16 1.35297 0.330545 0.221731 0.0666998 0.0692539 0.0701707 0.160564 0.0891676 0.015807 0.0045998 Index: graphspell-js/ibdawg.js ================================================================== --- graphspell-js/ibdawg.js +++ graphspell-js/ibdawg.js @@ -128,16 +128,33 @@ Bug workaround. Mozilla’s JS parser sucks. Can’t read file bigger than 4 Mb! 
So we convert huge hexadecimal string to list of numbers… https://github.com/mozilla/addons-linter/issues/1361 */ + /* + Performance trick: + Instead of converting bytes to integers each time we parse the binary dictionary, + we do it once, then parse the array + */ + let nAcc = 0; + let lBytesBuffer = []; let lTemp = []; + let nDivisor = (this.nBytesArc + this.nBytesNodeAddress) / 2; for (let i = 0; i < this.sByDic.length; i+=2) { - lTemp.push(parseInt(this.sByDic.slice(i, i+2), 16)); + lBytesBuffer.push(parseInt(this.sByDic.slice(i, i+2), 16)); + if (nAcc == (this.nBytesArc - 1)) { + lTemp.push(this._convBytesToInteger(lBytesBuffer)); + lBytesBuffer = []; + } + else if (nAcc == (this.nBytesArc + this.nBytesNodeAddress - 1)) { + lTemp.push(Math.round(this._convBytesToInteger(lBytesBuffer) / nDivisor)); // Math.round should be useless, BUT with JS who knows what can happen… + lBytesBuffer = []; + nAcc = -1; + } + nAcc = nAcc + 1; } this.byDic = lTemp; - //this.byDic = new Uint8Array(lTemp); // not quicker, even slower /* end of bug workaround */ if (!(this.sHeader.startsWith("/grammalecte-fsa/") || this.sHeader.startsWith("/pyfsa/"))) { throw TypeError("# Error. Not a grammalecte-fsa binary dictionary. Header: " + this.sHeader); } @@ -196,11 +213,10 @@ this.lexicographer = null; // JS still sucks: we’ll try importation when importation will be available in Workers. Still waiting... 
if (self && self.hasOwnProperty("lexgraph_"+this.sLangCode)) { // self is the Worker this.lexicographer = self["lexgraph_"+this.sLangCode]; } - } getInfo () { return ` Language: ${this.sLangName} Lang code: ${this.sLangCode} Dictionary name: ${this.sDicName}\n` + ` Compression method: ${this.nCompressionMethod} Date: ${this.sDate} Stemming: ${this.cStemming}FX\n` + @@ -306,11 +322,11 @@ iAddr = this._lookupArcNode(this.dChar.get(c), iAddr); if (iAddr === null) { return false; } } - return Boolean(this._convBytesToInteger(this.byDic.slice(iAddr, iAddr+this.nBytesArc)) & this._finalNodeMask); + return Boolean(this.byDic[iAddr] & this._finalNodeMask); } getMorph (sWord) { // retrieves morphologies list, different casing allowed if (!sWord) { @@ -378,11 +394,11 @@ } _suggest (oSuggResult, sRemain, nMaxSwitch=0, nMaxDel=0, nMaxHardRepl=0, nMaxJump=0, nDist=0, nDeep=0, iAddr=0, sNewWord="", bAvoidLoop=false) { // returns a set of suggestions // recursive function - if (this._convBytesToInteger(this.byDic.slice(iAddr, iAddr+this.nBytesArc)) & this._finalNodeMask) { + if (this.byDic[iAddr] & this._finalNodeMask) { if (sRemain == "") { oSuggResult.addSugg(sNewWord); for (let sTail of this._getTails(iAddr)) { oSuggResult.addSugg(sNewWord+sTail); } @@ -488,11 +504,11 @@ _getTails (iAddr, sTail="", n=2) { // return a list of suffixes ending at a distance of from let aTails = new Set(); for (let [nVal, jAddr] of this._getArcs(iAddr)) { if (nVal <= this.nChar) { - if (this._convBytesToInteger(this.byDic.slice(jAddr, jAddr+this.nBytesArc)) & this._finalNodeMask) { + if (this.byDic[jAddr] & this._finalNodeMask) { aTails.add(sTail + this.dCharVal.get(nVal)); } if (n && aTails.size == 0) { aTails.gl_update(this._getTails(jAddr, sTail+this.dCharVal.get(nVal), n-1)); } @@ -566,31 +582,31 @@ iAddr = this._lookupArcNode(this.dChar.get(c), iAddr); if (iAddr === null) { return []; } } - if (this._convBytesToInteger(this.byDic.slice(iAddr, iAddr+this.nBytesArc)) & this._finalNodeMask) { + 
if (this.byDic[iAddr] & this._finalNodeMask) { let l = []; let nRawArc = 0; while (!(nRawArc & this._lastArcMask)) { - let iEndArcAddr = iAddr + this.nBytesArc; - nRawArc = this._convBytesToInteger(this.byDic.slice(iAddr, iEndArcAddr)); + let iEndArcAddr = iAddr + 1; + nRawArc = this.byDic[iAddr]; let nArc = nRawArc & this._arcMask; if (nArc > this.nChar) { // This value is not a char, this is a stemming code let sStem = ">" + this.funcStemming(sWord, this.lArcVal[nArc]); // Now , we go to the next node and retrieve all following arcs values, all of them are tags - let iAddr2 = this._convBytesToInteger(this.byDic.slice(iEndArcAddr, iEndArcAddr+this.nBytesNodeAddress)); + let iAddr2 = this.byDic[iEndArcAddr]; let nRawArc2 = 0; while (!(nRawArc2 & this._lastArcMask)) { - let iEndArcAddr2 = iAddr2 + this.nBytesArc; - nRawArc2 = this._convBytesToInteger(this.byDic.slice(iAddr2, iEndArcAddr2)); + let iEndArcAddr2 = iAddr2 + 1; + nRawArc2 = this.byDic[iAddr2]; l.push(sStem + "/" + this.lArcVal[nRawArc2 & this._arcMask]); - iAddr2 = iEndArcAddr2+this.nBytesNodeAddress; + iAddr2 = iEndArcAddr2 + 1; } } - iAddr = iEndArcAddr + this.nBytesNodeAddress; + iAddr = iEndArcAddr + 1; } return l; } return []; } @@ -605,58 +621,58 @@ iAddr = this._lookupArcNode(this.dChar.get(c), iAddr); if (iAddr === null) { return []; } } - if (this._convBytesToInteger(this.byDic.slice(iAddr, iAddr+this.nBytesArc)) & this._finalNodeMask) { + if (this.byDic[iAddr] & this._finalNodeMask) { let l = []; let nRawArc = 0; while (!(nRawArc & this._lastArcMask)) { - let iEndArcAddr = iAddr + this.nBytesArc; - nRawArc = this._convBytesToInteger(this.byDic.slice(iAddr, iEndArcAddr)); + let iEndArcAddr = iAddr + 1; + nRawArc = this.byDic[iAddr]; let nArc = nRawArc & this._arcMask; if (nArc > this.nChar) { // This value is not a char, this is a stemming code l.push(this.funcStemming(sWord, this.lArcVal[nArc])); } - iAddr = iEndArcAddr + this.nBytesNodeAddress; + iAddr = iEndArcAddr + 1; } return l; } return 
[]; } _lookupArcNode1 (nVal, iAddr) { // looks if nVal is an arc at the node at iAddr, if yes, returns address of next node else None while (true) { - let iEndArcAddr = iAddr+this.nBytesArc; - let nRawArc = this._convBytesToInteger(this.byDic.slice(iAddr, iEndArcAddr)); + let iEndArcAddr = iAddr+1; + let nRawArc = this.byDic[iAddr]; if (nVal == (nRawArc & this._arcMask)) { // the value we are looking for // we return the address of the next node - return this._convBytesToInteger(this.byDic.slice(iEndArcAddr, iEndArcAddr+this.nBytesNodeAddress)); + return this.byDic[iEndArcAddr]; } else { // value not found if (nRawArc & this._lastArcMask) { return null; } - iAddr = iEndArcAddr + this.nBytesNodeAddress; + iAddr = iEndArcAddr + 1; } } } * _getArcs1 (iAddr) { // generator: return all arcs at as tuples of (nVal, iAddr) while (true) { - let iEndArcAddr = iAddr+this.nBytesArc; - let nRawArc = this._convBytesToInteger(this.byDic.slice(iAddr, iEndArcAddr)); - yield [nRawArc & this._arcMask, this._convBytesToInteger(this.byDic.slice(iEndArcAddr, iEndArcAddr+this.nBytesNodeAddress))]; + let iEndArcAddr = iAddr+1; + let nRawArc = this.byDic[iAddr]; + yield [nRawArc & this._arcMask, this.byDic[iEndArcAddr]]; if (nRawArc & this._lastArcMask) { break; } - iAddr = iEndArcAddr+this.nBytesNodeAddress; + iAddr = iEndArcAddr+1; } } // VERSION 2 _morph2 (sWord) { Index: graphspell/ibdawg.py ================================================================== --- graphspell/ibdawg.py +++ graphspell/ibdawg.py @@ -118,10 +118,30 @@ else: self._initJSON(source) self.sFileName = source if isinstance(source, str) else "[None]" + # Performance trick: + # Instead of converting bytes to integers each time we parse the binary dictionary, + # we do it once, then parse the array + nAcc = 0 + byBuffer = b"" + lTemp = [] + nDivisor = (self.nBytesArc + self.nBytesNodeAddress) / 2 + for i in range(0, len(self.byDic)): + byBuffer += self.byDic[i:i+1] + if nAcc == (self.nBytesArc - 1): + 
lTemp.append(int.from_bytes(byBuffer, byteorder="big")) + byBuffer = b"" + elif nAcc == (self.nBytesArc + self.nBytesNodeAddress - 1): + lTemp.append(round(int.from_bytes(byBuffer, byteorder="big") / nDivisor)) + byBuffer = b"" + nAcc = -1 + nAcc = nAcc + 1 + self.byDic = lTemp; + + # masks self._arcMask = (2 ** ((self.nBytesArc * 8) - 3)) - 1 self._finalNodeMask = 1 << ((self.nBytesArc * 8) - 1) self._lastArcMask = 1 << ((self.nBytesArc * 8) - 2) self._addrBitMask = 1 << ((self.nBytesArc * 8) - 3) # version 2 @@ -298,11 +318,11 @@ if c not in self.dChar: return False iAddr = self._lookupArcNode(self.dChar[c], iAddr) if iAddr is None: return False - return bool(int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask) + return bool(self.byDic[iAddr] & self._finalNodeMask) def getMorph (self, sWord): "retrieves morphologies list, different casing allowed" if not sWord: return [] @@ -354,11 +374,11 @@ oSuggResult.addSugg(sWord1+" "+sWord2) def _suggest (self, oSuggResult, sRemain, nMaxSwitch=0, nMaxDel=0, nMaxHardRepl=0, nMaxJump=0, nDist=0, nDeep=0, iAddr=0, sNewWord="", bAvoidLoop=False): # recursive function #logging.info((nDeep * " ") + sNewWord + ":" + sRemain) - if int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask: + if self.byDic[iAddr] & self._finalNodeMask: if not sRemain: oSuggResult.addSugg(sNewWord, nDeep) for sTail in self._getTails(iAddr): oSuggResult.addSugg(sNewWord+sTail, nDeep) return @@ -421,11 +441,11 @@ def _getTails (self, iAddr, sTail="", n=2): "return a list of suffixes ending at a distance of from " aTails = set() for nVal, jAddr in self._getArcs(iAddr): if nVal <= self.nChar: - if int.from_bytes(self.byDic[jAddr:jAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask: + if self.byDic[jAddr] & self._finalNodeMask: aTails.add(sTail + self.dCharVal[nVal]) if n and not aTails: aTails.update(self._getTails(jAddr, sTail+self.dCharVal[nVal], n-1)) return aTails 
@@ -497,29 +517,29 @@ if c not in self.dChar: return [] iAddr = self._lookupArcNode(self.dChar[c], iAddr) if iAddr is None: return [] - if int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask: + if self.byDic[iAddr] & self._finalNodeMask: l = [] nRawArc = 0 while not nRawArc & self._lastArcMask: - iEndArcAddr = iAddr + self.nBytesArc - nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') + iEndArcAddr = iAddr + 1 + nRawArc = self.byDic[iAddr] nArc = nRawArc & self._arcMask if nArc > self.nChar: # This value is not a char, this is a stemming code sStem = ">" + self.funcStemming(sWord, self.lArcVal[nArc]) # Now , we go to the next node and retrieve all following arcs values, all of them are tags - iAddr2 = int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big') + iAddr2 = self.byDic[iEndArcAddr] nRawArc2 = 0 while not nRawArc2 & self._lastArcMask: - iEndArcAddr2 = iAddr2 + self.nBytesArc - nRawArc2 = int.from_bytes(self.byDic[iAddr2:iEndArcAddr2], byteorder='big') + iEndArcAddr2 = iAddr2 + 1 + nRawArc2 = self.byDic[iAddr2] l.append(sStem + "/" + self.lArcVal[nRawArc2 & self._arcMask]) - iAddr2 = iEndArcAddr2+self.nBytesNodeAddress - iAddr = iEndArcAddr+self.nBytesNodeAddress + iAddr2 = iEndArcAddr2 + 1 + iAddr = iEndArcAddr + 1 return l return [] def _stem1 (self, sWord): "returns stems list of " @@ -528,62 +548,60 @@ if c not in self.dChar: return [] iAddr = self._lookupArcNode(self.dChar[c], iAddr) if iAddr is None: return [] - if int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask: + if self.byDic[iAddr] & self._finalNodeMask: l = [] nRawArc = 0 while not nRawArc & self._lastArcMask: - iEndArcAddr = iAddr + self.nBytesArc - nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') + iEndArcAddr = iAddr + 1 + nRawArc = self.byDic[iAddr] nArc = nRawArc & self._arcMask if nArc > self.nChar: # This value is not a 
char, this is a stemming code l.append(self.funcStemming(sWord, self.lArcVal[nArc])) - iAddr = iEndArcAddr+self.nBytesNodeAddress + iAddr = iEndArcAddr + 1 return l return [] def _lookupArcNode1 (self, nVal, iAddr): "looks if is an arc at the node at , if yes, returns address of next node else None" while True: - iEndArcAddr = iAddr+self.nBytesArc - nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') + iEndArcAddr = iAddr + 1 + nRawArc = self.byDic[iAddr] if nVal == (nRawArc & self._arcMask): # the value we are looking for # we return the address of the next node - return int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big') + return self.byDic[iEndArcAddr] # value not found if nRawArc & self._lastArcMask: return None - iAddr = iEndArcAddr+self.nBytesNodeAddress + iAddr = iEndArcAddr + 1 def _getArcs1 (self, iAddr): "generator: return all arcs at as tuples of (nVal, iAddr)" while True: - iEndArcAddr = iAddr+self.nBytesArc - nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') - yield nRawArc & self._arcMask, int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big') + iEndArcAddr = iAddr + 1 + nRawArc = self.byDic[iAddr] + yield nRawArc & self._arcMask, self.byDic[iEndArcAddr] if nRawArc & self._lastArcMask: break - iAddr = iEndArcAddr+self.nBytesNodeAddress + iAddr = iEndArcAddr + 1 def _writeNodes1 (self, spfDest): "for debugging only" print(" > Write binary nodes") with open(spfDest, 'w', 'utf-8', newline="\n") as hDst: iAddr = 0 hDst.write("i{:_>10} -- #{:_>10}\n".format("0", iAddr)) while iAddr < len(self.byDic): - iEndArcAddr = iAddr+self.nBytesArc - nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') + iEndArcAddr = iAddr + 1 + nRawArc = self.byDic[iAddr] nArc = nRawArc & self._arcMask - hDst.write(" {:<20} {:0>16} i{:>10} #{:_>10}\n".format(self.lArcVal[nArc], bin(nRawArc)[2:], "?", \ - 
int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], \ - byteorder='big'))) - iAddr = iEndArcAddr+self.nBytesNodeAddress + hDst.write(" {:<20} {:0>16} i{:>10} #{:_>10}\n".format(self.lArcVal[nArc], bin(nRawArc)[2:], "?", self.byDic[iEndArcAddr])) + iAddr = iEndArcAddr + 1 if (nRawArc & self._lastArcMask) and iAddr < len(self.byDic): hDst.write("\ni{:_>10} -- #{:_>10}\n".format("?", iAddr)) hDst.close() # VERSION 2