Index: graphspell-js/ibdawg.js ================================================================== --- graphspell-js/ibdawg.js +++ graphspell-js/ibdawg.js @@ -179,28 +179,27 @@ /* Performance trick: Instead of converting bytes to integers each times we parse the binary dictionary, we do it once, then parse the array */ + this.lByDic = []; let nAcc = 0; let lBytesBuffer = []; - let lTemp = []; let nDivisor = (this.nBytesArc + this.nBytesNodeAddress) / 2; for (let i = 0; i < this.sByDic.length; i+=2) { lBytesBuffer.push(parseInt(this.sByDic.slice(i, i+2), 16)); if (nAcc == (this.nBytesArc - 1)) { - lTemp.push(this._convBytesToInteger(lBytesBuffer)); + this.lByDic.push(this._convBytesToInteger(lBytesBuffer)); lBytesBuffer = []; } else if (nAcc == (this.nBytesArc + this.nBytesNodeAddress - 1)) { - lTemp.push(Math.round(this._convBytesToInteger(lBytesBuffer) / nDivisor)); // Math.round should be useless, BUT with JS who knowns what can happen… + this.lByDic.push(Math.round(this._convBytesToInteger(lBytesBuffer) / nDivisor)); // Math.round should be useless, BUT with JS who knowns what can happen… lBytesBuffer = []; nAcc = -1; } nAcc = nAcc + 1; } - this.byDic = lTemp; /* end of bug workaround */ this._arcMask = (2 ** ((this.nBytesArc * 8) - 3)) - 1; this._finalNodeMask = 1 << ((this.nBytesArc * 8) - 1); this._lastArcMask = 1 << ((this.nBytesArc * 8) - 2); @@ -319,11 +318,11 @@ iAddr = this._lookupArcNode(this.dChar.get(c), iAddr); if (iAddr === null) { return false; } } - return Boolean(this.byDic[iAddr] & this._finalNodeMask); + return Boolean(this.lByDic[iAddr] & this._finalNodeMask); } getMorph (sWord) { // retrieves morphologies list, different casing allowed if (!sWord) { @@ -392,11 +391,11 @@ } _suggest (oSuggResult, sRemain, nMaxSwitch=0, nMaxDel=0, nMaxHardRepl=0, nMaxJump=0, nDist=0, nDeep=0, iAddr=0, sNewWord="", bAvoidLoop=false) { // returns a set of suggestions // recursive function - if (this.byDic[iAddr] & this._finalNodeMask) { + if (this.lByDic[iAddr] & this._finalNodeMask) { if (sRemain == "") { oSuggResult.addSugg(sNewWord); for (let sTail of this._getTails(iAddr)) { oSuggResult.addSugg(sNewWord+sTail); } @@ -502,11 +501,11 @@ _getTails (iAddr, sTail="", n=2) { // return a list of suffixes ending at a distance of from let aTails = new Set(); for (let [nVal, jAddr] of this._getArcs(iAddr)) { if (nVal <= this.nChar) { - if (this.byDic[jAddr] & this._finalNodeMask) { + if (this.lByDic[jAddr] & this._finalNodeMask) { aTails.add(sTail + this.dCharVal.get(nVal)); } if (n && aTails.size == 0) { aTails.gl_update(this._getTails(jAddr, sTail+this.dCharVal.get(nVal), n-1)); } @@ -574,26 +573,26 @@ iAddr = this._lookupArcNode(this.dChar.get(c), iAddr); if (iAddr === null) { return []; } } - if (this.byDic[iAddr] & this._finalNodeMask) { + if (this.lByDic[iAddr] & this._finalNodeMask) { let l = []; let nRawArc = 0; while (!(nRawArc & this._lastArcMask)) { let iEndArcAddr = iAddr + 1; - nRawArc = this.byDic[iAddr]; + nRawArc = this.lByDic[iAddr]; let nArc = nRawArc & this._arcMask; if (nArc > this.nChar) { // This value is not a char, this is a stemming code let sStem = ">" + this.funcStemming(sWord, this.lArcVal[nArc]); // Now , we go to the next node and retrieve all following arcs values, all of them are tags - let iAddr2 = this.byDic[iEndArcAddr]; + let iAddr2 = this.lByDic[iEndArcAddr]; let nRawArc2 = 0; while (!(nRawArc2 & this._lastArcMask)) { let iEndArcAddr2 = iAddr2 + 1; - nRawArc2 = this.byDic[iAddr2]; + nRawArc2 = this.lByDic[iAddr2]; l.push(sStem + "/" + this.lArcVal[nRawArc2 & this._arcMask]); iAddr2 = iEndArcAddr2 + 1; } } iAddr = iEndArcAddr + 1; @@ -613,16 +612,16 @@ iAddr = this._lookupArcNode(this.dChar.get(c), iAddr); if (iAddr === null) { return []; } } - if (this.byDic[iAddr] & this._finalNodeMask) { + if (this.lByDic[iAddr] & this._finalNodeMask) { let l = []; let nRawArc = 0; while (!(nRawArc & this._lastArcMask)) { let iEndArcAddr = iAddr + 1; - nRawArc = this.byDic[iAddr]; + nRawArc = this.lByDic[iAddr]; let nArc = nRawArc & this._arcMask; if (nArc > this.nChar) { // This value is not a char, this is a stemming code l.push(this.funcStemming(sWord, this.lArcVal[nArc])); } @@ -635,15 +634,15 @@ _lookupArcNode (nVal, iAddr) { // looks if nVal is an arc at the node at iAddr, if yes, returns address of next node else None while (true) { let iEndArcAddr = iAddr+1; - let nRawArc = this.byDic[iAddr]; + let nRawArc = this.lByDic[iAddr]; if (nVal == (nRawArc & this._arcMask)) { // the value we are looking for // we return the address of the next node - return this.byDic[iEndArcAddr]; + return this.lByDic[iEndArcAddr]; } else { // value not found if (nRawArc & this._lastArcMask) { return null; @@ -655,12 +654,12 @@ * _getArcs (iAddr) { // generator: return all arcs at as tuples of (nVal, iAddr) while (true) { let iEndArcAddr = iAddr+1; - let nRawArc = this.byDic[iAddr]; - yield [nRawArc & this._arcMask, this.byDic[iEndArcAddr]]; + let nRawArc = this.lByDic[iAddr]; + yield [nRawArc & this._arcMask, this.lByDic[iEndArcAddr]]; if (nRawArc & this._lastArcMask) { break; } iAddr = iEndArcAddr+1; } Index: graphspell/ibdawg.py ================================================================== --- graphspell/ibdawg.py +++ graphspell/ibdawg.py @@ -132,23 +132,22 @@ # Performance trick: # Instead of converting bytes to integers each times we parse the binary dictionary, # we do it once, then parse the array nAcc = 0 byBuffer = b"" - lTemp = [] + self.lByDic = [] nDivisor = (self.nBytesArc + self.nBytesNodeAddress) / 2 for i in range(0, len(self.byDic)): byBuffer += self.byDic[i:i+1] if nAcc == (self.nBytesArc - 1): - lTemp.append(int.from_bytes(byBuffer, byteorder="big")) + self.lByDic.append(int.from_bytes(byBuffer, byteorder="big")) byBuffer = b"" elif nAcc == (self.nBytesArc + self.nBytesNodeAddress - 1): - lTemp.append(round(int.from_bytes(byBuffer, byteorder="big") / nDivisor)) + self.lByDic.append(round(int.from_bytes(byBuffer, byteorder="big") / nDivisor)) byBuffer = b"" nAcc = -1 nAcc = nAcc + 1 - self.byDic = lTemp; # masks self._arcMask = (2 ** ((self.nBytesArc * 8) - 3)) - 1 self._finalNodeMask = 1 << ((self.nBytesArc * 8) - 1) self._lastArcMask = 1 << ((self.nBytesArc * 8) - 2) @@ -300,11 +299,11 @@ if c not in self.dChar: return False iAddr = self._lookupArcNode(self.dChar[c], iAddr) if iAddr is None: return False - return bool(self.byDic[iAddr] & self._finalNodeMask) + return bool(self.lByDic[iAddr] & self._finalNodeMask) def getMorph (self, sWord): "retrieves morphologies list, different casing allowed" if not sWord: return [] @@ -357,11 +356,11 @@ oSuggResult.addSugg(sWord1+" "+sWord2) def _suggest (self, oSuggResult, sRemain, nMaxSwitch=0, nMaxDel=0, nMaxHardRepl=0, nMaxJump=0, nDist=0, nDeep=0, iAddr=0, sNewWord="", bAvoidLoop=False): # recursive function #logging.info((nDeep * " ") + sNewWord + ":" + sRemain) - if self.byDic[iAddr] & self._finalNodeMask: + if self.lByDic[iAddr] & self._finalNodeMask: if not sRemain: oSuggResult.addSugg(sNewWord, nDeep) for sTail in self._getTails(iAddr): oSuggResult.addSugg(sNewWord+sTail, nDeep) return @@ -424,11 +423,11 @@ def _getTails (self, iAddr, sTail="", n=2): "return a list of suffixes ending at a distance of from " aTails = set() for nVal, jAddr in self._getArcs(iAddr): if nVal <= self.nChar: - if self.byDic[jAddr] & self._finalNodeMask: + if self.lByDic[jAddr] & self._finalNodeMask: aTails.add(sTail + self.dCharVal[nVal]) if n and not aTails: aTails.update(self._getTails(jAddr, sTail+self.dCharVal[nVal], n-1)) return aTails @@ -496,26 +495,26 @@ if c not in self.dChar: return [] iAddr = self._lookupArcNode(self.dChar[c], iAddr) if iAddr is None: return [] - if self.byDic[iAddr] & self._finalNodeMask: + if self.lByDic[iAddr] & self._finalNodeMask: l = [] nRawArc = 0 while not nRawArc & self._lastArcMask: iEndArcAddr = iAddr + 1 - nRawArc = self.byDic[iAddr] + nRawArc = self.lByDic[iAddr] nArc = nRawArc & self._arcMask if nArc > self.nChar: # This value is not a char, this is a stemming code sStem = ">" + self.funcStemming(sWord, self.lArcVal[nArc]) # Now , we go to the next node and retrieve all following arcs values, all of them are tags - iAddr2 = self.byDic[iEndArcAddr] + iAddr2 = self.lByDic[iEndArcAddr] nRawArc2 = 0 while not nRawArc2 & self._lastArcMask: iEndArcAddr2 = iAddr2 + 1 - nRawArc2 = self.byDic[iAddr2] + nRawArc2 = self.lByDic[iAddr2] l.append(sStem + "/" + self.lArcVal[nRawArc2 & self._arcMask]) iAddr2 = iEndArcAddr2 + 1 iAddr = iEndArcAddr + 1 return l return [] @@ -527,16 +526,16 @@ if c not in self.dChar: return [] iAddr = self._lookupArcNode(self.dChar[c], iAddr) if iAddr is None: return [] - if self.byDic[iAddr] & self._finalNodeMask: + if self.lByDic[iAddr] & self._finalNodeMask: l = [] nRawArc = 0 while not nRawArc & self._lastArcMask: iEndArcAddr = iAddr + 1 - nRawArc = self.byDic[iAddr] + nRawArc = self.lByDic[iAddr] nArc = nRawArc & self._arcMask if nArc > self.nChar: # This value is not a char, this is a stemming code l.append(self.funcStemming(sWord, self.lArcVal[nArc])) iAddr = iEndArcAddr + 1 @@ -545,26 +544,26 @@ def _lookupArcNode (self, nVal, iAddr): "looks if is an arc at the node at , if yes, returns address of next node else None" while True: iEndArcAddr = iAddr + 1 - nRawArc = self.byDic[iAddr] + nRawArc = self.lByDic[iAddr] if nVal == (nRawArc & self._arcMask): # the value we are looking for # we return the address of the next node - return self.byDic[iEndArcAddr] + return self.lByDic[iEndArcAddr] # value not found if nRawArc & self._lastArcMask: return None iAddr = iEndArcAddr + 1 def _getArcs (self, iAddr): "generator: return all arcs at as tuples of (nVal, iAddr)" while True: iEndArcAddr = iAddr + 1 - nRawArc = self.byDic[iAddr] - yield nRawArc & self._arcMask, self.byDic[iEndArcAddr] + nRawArc = self.lByDic[iAddr] + yield nRawArc & self._arcMask, self.lByDic[iEndArcAddr] if nRawArc & self._lastArcMask: break iAddr = iEndArcAddr + 1 def _writeNodes (self, spfDest): @@ -571,14 +570,14 @@ "for debugging only" print(" > Write binary nodes") with open(spfDest, 'w', 'utf-8', newline="\n") as hDst: iAddr = 0 hDst.write("i{:_>10} -- #{:_>10}\n".format("0", iAddr)) - while iAddr < len(self.byDic): + while iAddr < len(self.lByDic): iEndArcAddr = iAddr + 1 - nRawArc = self.byDic[iAddr] + nRawArc = self.lByDic[iAddr] nArc = nRawArc & self._arcMask - hDst.write(" {:<20} {:0>16} i{:>10} #{:_>10}\n".format(self.lArcVal[nArc], bin(nRawArc)[2:], "?", self.byDic[iEndArcAddr])) + hDst.write(" {:<20} {:0>16} i{:>10} #{:_>10}\n".format(self.lArcVal[nArc], bin(nRawArc)[2:], "?", self.lByDic[iEndArcAddr])) iAddr = iEndArcAddr + 1 - if (nRawArc & self._lastArcMask) and iAddr < len(self.byDic): + if (nRawArc & self._lastArcMask) and iAddr < len(self.lByDic): hDst.write("\ni{:_>10} -- #{:_>10}\n".format("?", iAddr)) hDst.close()