@@ -1,8 +1,13 @@ #!python3 -import os +""" +INDEXABLE BINARY DIRECTED ACYCLIC WORD GRAPH +Implementation of a spellchecker as a transducer (storing transformation code to get lemma and morphologies) +and a spell suggestion mechanism +""" + import traceback import pkgutil import re from functools import wraps import time @@ -19,10 +24,11 @@ def timethis (func): "decorator for the execution time" @wraps(func) def wrapper (*args, **kwargs): + "something to prevent pylint whining" fStart = time.time() result = func(*args, **kwargs) fEnd = time.time() print(func.__name__, fEnd - fStart) return result @@ -56,11 +62,11 @@ self.aSugg.add(sSugg) if nDist < self.nMinDist: self.nMinDist = nDist self.nDistLimit = min(self.nDistLimit, self.nMinDist+2) - def getSuggestions (self, nSuggLimit=10, nDistLimit=-1): + def getSuggestions (self, nSuggLimit=10): "return a list of suggestions" if self.dSugg[0]: # we sort the better results with the original word self.dSugg[0].sort(key=lambda sSugg: st.distanceDamerauLevenshtein(self.sWord, sSugg)) lRes = self.dSugg.pop(0) @@ -75,10 +81,11 @@ elif self.sWord[0:1].isupper(): lRes = list(map(lambda sSugg: sSugg[0:1].upper()+sSugg[1:], lRes)) # dont’ use <.istitle> return lRes[:nSuggLimit] def reset (self): + "clear data" self.aSugg.clear() self.dSugg.clear() class IBDAWG: @@ -182,10 +189,11 @@ self.__dict__.update(oJSON) self.byDic = binascii.unhexlify(self.sByDic) self.dCharVal = { v: k for k, v in self.dChar.items() } def getInfo (self): + "return string about the IBDAWG" return " Language: {0.sLangName} Lang code: {0.sLangCode} Dictionary name: {0.sDicName}" \ " Compression method: {0.nCompressionMethod:>2} Date: {0.sDate} Stemming: {0.cStemming}FX\n" \ " Arcs values: {0.nArcVal:>10,} = {0.nChar:>5,} characters, {0.nAff:>6,} affixes, {0.nTag:>6,} tags\n" \ " Dictionary: {0.nEntry:>12,} entries, {0.nNode:>11,} nodes, {0.nArc:>11,} arcs\n" \ " Address size: {0.nBytesNodeAddress:>1} bytes, Arc size: {0.nBytesArc:>1} bytes\n".format(self) @@
-194,35 +202,35 @@ "write IBDAWG as a JavaScript object in a JavaScript module" with open(spfDest, "w", encoding="utf-8", newline="\n") as hDst: if bInJSModule: hDst.write('// JavaScript\n// Generated data (do not edit)\n\n"use strict";\n\nconst dictionary = ') hDst.write(json.dumps({ - "sHeader": "/grammalecte-fsa/", - "sLangCode": self.sLangCode, - "sLangName": self.sLangName, - "sDicName": self.sDicName, - "sFileName": self.sFileName, - "sDate": self.sDate, - "nEntry": self.nEntry, - "nChar": self.nChar, - "nAff": self.nAff, - "nTag": self.nTag, - "cStemming": self.cStemming, - "dChar": self.dChar, - "nNode": self.nNode, - "nArc": self.nArc, - "nArcVal": self.nArcVal, - "lArcVal": self.lArcVal, - "nCompressionMethod": self.nCompressionMethod, - "nBytesArc": self.nBytesArc, - "nBytesNodeAddress": self.nBytesNodeAddress, - "nBytesOffset": self.nBytesOffset, - # JavaScript is a pile of shit, so Mozilla’s JS parser don’t like file bigger than 4 Mb! - # So, if necessary, we use an hexadecimal string, that we will convert later in Firefox’s extension. - # https://github.com/mozilla/addons-linter/issues/1361 - "sByDic": self.byDic.hex() if bBinaryDictAsHexString else [ e for e in self.byDic ] - }, ensure_ascii=False)) + "sHeader": "/grammalecte-fsa/", + "sLangCode": self.sLangCode, + "sLangName": self.sLangName, + "sDicName": self.sDicName, + "sFileName": self.sFileName, + "sDate": self.sDate, + "nEntry": self.nEntry, + "nChar": self.nChar, + "nAff": self.nAff, + "nTag": self.nTag, + "cStemming": self.cStemming, + "dChar": self.dChar, + "nNode": self.nNode, + "nArc": self.nArc, + "nArcVal": self.nArcVal, + "lArcVal": self.lArcVal, + "nCompressionMethod": self.nCompressionMethod, + "nBytesArc": self.nBytesArc, + "nBytesNodeAddress": self.nBytesNodeAddress, + "nBytesOffset": self.nBytesOffset, + # JavaScript is a pile of shit, so Mozilla’s JS parser don’t like file bigger than 4 Mb! 
+ # So, if necessary, we use an hexadecimal string, that we will convert later in Firefox’s extension. + # https://github.com/mozilla/addons-linter/issues/1361 + "sByDic": self.byDic.hex() if bBinaryDictAsHexString else [ e for e in self.byDic ] + }, ensure_ascii=False)) if bInJSModule: hDst.write(";\n\nexports.dictionary = dictionary;\n") def isValidToken (self, sToken): "checks if is valid (if there is hyphens in , is split, each part is checked)" @@ -239,12 +247,12 @@ def isValid (self, sWord): "checks if is valid (different casing tested if the first letter is a capital)" if not sWord: return None - if "’" in sWord: # ugly hack - sWord = sWord.replace("’", "'") + if "'" in sWord: # ugly hack + sWord = sWord.replace("'", "’") if self.lookup(sWord): return True if sWord[0:1].isupper(): if len(sWord) > 1: if sWord.istitle(): @@ -265,11 +273,11 @@ iAddr = 0 for c in sWord: if c not in self.dChar: return False iAddr = self._lookupArcNode(self.dChar[c], iAddr) - if iAddr == None: + if iAddr is None: return False return bool(int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask) def getMorph (self, sWord): "retrieves morphologies list, different casing allowed" @@ -344,17 +352,17 @@ self._suggest(oSuggResult, "", nMaxSwitch, nMaxDel, nMaxHardRepl, nMaxJump, nDist, nDeep+1, iAddr, sNewWord, True) # remove last char and go on for sRepl in cp.dFinal1.get(sRemain, ()): self._suggest(oSuggResult, sRepl, nMaxSwitch, nMaxDel, nMaxHardRepl, nMaxJump, nDist, nDeep+1, iAddr, sNewWord, True) #@timethis - def suggest2 (self, sWord, nMaxSugg=10): + def suggest2 (self, sWord, nSuggLimit=10): "returns a set of suggestions for " sWord = cp.spellingNormalization(sWord) sPfx, sWord, sSfx = cp.cut(sWord) oSuggResult = SuggResult(sWord) self._suggest2(oSuggResult) - aSugg = oSuggResult.getSuggestions() + aSugg = oSuggResult.getSuggestions(nSuggLimit) if sSfx or sPfx: # we add what we removed return list(map(lambda sSug: sPfx + sSug + sSfx, aSugg)) 
return aSugg @@ -407,21 +415,21 @@ "show the path taken by in the graph" sWord = cp.spellingNormalization(sWord) c1 = sWord[0:1] if sWord else " " iPos = -1 n = 0 - print(c1 + ": ", end="") + echo(c1 + ": ", end="") for c2, jAddr in self._getCharArcs(iAddr): - print(c2, end="") + echo(c2, end="") if c2 == sWord[0:1]: iNextNodeAddr = jAddr iPos = n n += 1 if not sWord: return if iPos >= 0: - print("\n "+ " " * iPos + "|") + echo("\n " + " " * iPos + "|") self.drawPath(sWord[1:], iNextNodeAddr) def getSimilarEntries (self, sWord, nSuggLimit=10): "return a list of tuples (similar word, stem, morphology)" if not sWord: @@ -469,13 +477,13 @@ iAddr = 0 for c in sWord: if c not in self.dChar: return [] iAddr = self._lookupArcNode(self.dChar[c], iAddr) - if iAddr == None: + if iAddr is None: return [] - if (int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask): + if int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask: l = [] nRawArc = 0 while not (nRawArc & self._lastArcMask): iEndArcAddr = iAddr + self.nBytesArc nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') @@ -487,11 +495,11 @@ iAddr2 = int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big') nRawArc2 = 0 while not (nRawArc2 & self._lastArcMask): iEndArcAddr2 = iAddr2 + self.nBytesArc nRawArc2 = int.from_bytes(self.byDic[iAddr2:iEndArcAddr2], byteorder='big') - l.append(sStem + " " + self.lArcVal[nRawArc2 & self._arcMask]) + l.append(sStem + "/" + self.lArcVal[nRawArc2 & self._arcMask]) iAddr2 = iEndArcAddr2+self.nBytesNodeAddress iAddr = iEndArcAddr+self.nBytesNodeAddress return l return [] @@ -500,13 +508,13 @@ iAddr = 0 for c in sWord: if c not in self.dChar: return [] iAddr = self._lookupArcNode(self.dChar[c], iAddr) - if iAddr == None: + if iAddr is None: return [] - if (int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask): + if 
int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask: l = [] nRawArc = 0 while not (nRawArc & self._lastArcMask): iEndArcAddr = iAddr + self.nBytesArc nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') @@ -527,28 +535,28 @@ # the value we are looking for # we return the address of the next node return int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big') else: # value not found - if (nRawArc & self._lastArcMask): + if nRawArc & self._lastArcMask: return None iAddr = iEndArcAddr+self.nBytesNodeAddress def _getArcs1 (self, iAddr): "generator: return all arcs at as tuples of (nVal, iAddr)" while True: iEndArcAddr = iAddr+self.nBytesArc nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') - yield (nRawArc & self._arcMask, int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big')) - if (nRawArc & self._lastArcMask): + yield nRawArc & self._arcMask, int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big') + if nRawArc & self._lastArcMask: break iAddr = iEndArcAddr+self.nBytesNodeAddress def _writeNodes1 (self, spfDest): "for debugging only" print(" > Write binary nodes") - with codecs.open(spfDest, 'w', 'utf-8', newline="\n") as hDst: + with open(spfDest, 'w', 'utf-8', newline="\n") as hDst: iAddr = 0 hDst.write("i{:_>10} -- #{:_>10}\n".format("0", iAddr)) while iAddr < len(self.byDic): iEndArcAddr = iAddr+self.nBytesArc nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') @@ -567,13 +575,13 @@ iAddr = 0 for c in sWord: if c not in self.dChar: return [] iAddr = self._lookupArcNode(self.dChar[c], iAddr) - if iAddr == None: + if iAddr is None: return [] - if (int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask): + if int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask: l = [] nRawArc 
= 0 while not (nRawArc & self._lastArcMask): iEndArcAddr = iAddr + self.nBytesArc nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') @@ -592,11 +600,11 @@ iAddr2 += self.nBytesArc + self.nBytesNodeAddress nRawArc2 = 0 while not (nRawArc2 & self._lastArcMask): iEndArcAddr2 = iAddr2 + self.nBytesArc nRawArc2 = int.from_bytes(self.byDic[iAddr2:iEndArcAddr2], byteorder='big') - l.append(sStem + " " + self.lArcVal[nRawArc2 & self._arcMask]) + l.append(sStem + "/" + self.lArcVal[nRawArc2 & self._arcMask]) iAddr2 = iEndArcAddr2+self.nBytesNodeAddress if not (nRawArc2 & self._addrBitMask) else iEndArcAddr2 iAddr = iEndArcAddr+self.nBytesNodeAddress if not (nRawArc & self._addrBitMask) else iEndArcAddr return l return [] @@ -605,13 +613,13 @@ iAddr = 0 for c in sWord: if c not in self.dChar: return [] iAddr = self._lookupArcNode(self.dChar[c], iAddr) - if iAddr == None: + if iAddr is None: return [] - if (int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask): + if int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask: l = [] nRawArc = 0 while not (nRawArc & self._lastArcMask): iEndArcAddr = iAddr + self.nBytesArc nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') @@ -649,18 +657,18 @@ nRawArc = int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') iAddr += self.nBytesArc + self.nBytesNodeAddress if not (nRawArc & self._addrBitMask) else self.nBytesArc return iAddr else: # value not found - if (nRawArc & self._lastArcMask): + if nRawArc & self._lastArcMask: return None iAddr = iEndArcAddr+self.nBytesNodeAddress if not (nRawArc & self._addrBitMask) else iEndArcAddr def _writeNodes2 (self, spfDest): "for debugging only" print(" > Write binary nodes") - with codecs.open(spfDest, 'w', 'utf-8', newline="\n") as hDst: + with open(spfDest, 'w', 'utf-8', newline="\n") as hDst: iAddr = 0 hDst.write("i{:_>10} -- #{:_>10}\n".format("0", iAddr)) 
while iAddr < len(self.byDic): iEndArcAddr = iAddr+self.nBytesArc nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') @@ -670,11 +678,11 @@ hDst.write(" {:<20} {:0>16} i{:>10} #{:_>10}\n".format(self.lArcVal[nArc], bin(nRawArc)[2:], "?", iNextNodeAddr)) iAddr = iEndArcAddr+self.nBytesNodeAddress else: hDst.write(" {:<20} {:0>16}\n".format(self.lArcVal[nArc], bin(nRawArc)[2:])) iAddr = iEndArcAddr - if (nRawArc & self._lastArcMask): + if nRawArc & self._lastArcMask: hDst.write("\ni{:_>10} -- #{:_>10}\n".format("?", iAddr)) hDst.close() # VERSION 3 def _morph3 (self, sWord): @@ -682,13 +690,13 @@ iAddr = 0 for c in sWord: if c not in self.dChar: return [] iAddr = self._lookupArcNode(self.dChar[c], iAddr) - if iAddr == None: + if iAddr is None: return [] - if (int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask): + if int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask: l = [] nRawArc = 0 iAddrNode = iAddr while not (nRawArc & self._lastArcMask): iEndArcAddr = iAddr + self.nBytesArc @@ -704,11 +712,11 @@ iAddr2 = iAddrNode + int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesOffset], byteorder='big') nRawArc2 = 0 while not (nRawArc2 & self._lastArcMask): iEndArcAddr2 = iAddr2 + self.nBytesArc nRawArc2 = int.from_bytes(self.byDic[iAddr2:iEndArcAddr2], byteorder='big') - l.append(sStem + " " + self.lArcVal[nRawArc2 & self._arcMask]) + l.append(sStem + "/" + self.lArcVal[nRawArc2 & self._arcMask]) iAddr2 = iEndArcAddr2+self.nBytesNodeAddress if not (nRawArc2 & self._addrBitMask) else iEndArcAddr2+self.nBytesOffset iAddr = iEndArcAddr+self.nBytesNodeAddress if not (nRawArc & self._addrBitMask) else iEndArcAddr+self.nBytesOffset return l return [] @@ -717,16 +725,16 @@ iAddr = 0 for c in sWord: if c not in self.dChar: return [] iAddr = self._lookupArcNode(self.dChar[c], iAddr) - if iAddr == None: + if iAddr is None: return [] - if 
(int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask): + if int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask: l = [] nRawArc = 0 - iAddrNode = iAddr + #iAddrNode = iAddr while not (nRawArc & self._lastArcMask): iEndArcAddr = iAddr + self.nBytesArc nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') nArc = nRawArc & self._arcMask if nArc > self.nChar: @@ -748,18 +756,18 @@ return int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big') else: return iAddrNode + int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesOffset], byteorder='big') else: # value not found - if (nRawArc & self._lastArcMask): + if nRawArc & self._lastArcMask: return None iAddr = iEndArcAddr+self.nBytesNodeAddress if not (nRawArc & self._addrBitMask) else iEndArcAddr+self.nBytesOffset def _writeNodes3 (self, spfDest): "for debugging only" print(" > Write binary nodes") - with codecs.open(spfDest, 'w', 'utf-8', newline="\n") as hDst: + with open(spfDest, 'w', 'utf-8', newline="\n") as hDst: iAddr = 0 hDst.write("i{:_>10} -- #{:_>10}\n".format("0", iAddr)) while iAddr < len(self.byDic): iEndArcAddr = iAddr+self.nBytesArc nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') @@ -770,8 +778,8 @@ iAddr = iEndArcAddr+self.nBytesNodeAddress else: iNextNodeAddr = int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesOffset], byteorder='big') hDst.write(" {:<20} {:0>16} i{:>10} +{:_>10}\n".format(self.lArcVal[nArc], bin(nRawArc)[2:], "?", iNextNodeAddr)) iAddr = iEndArcAddr+self.nBytesOffset - if (nRawArc & self._lastArcMask): + if nRawArc & self._lastArcMask: hDst.write("\ni{:_>10} -- #{:_>10}\n".format("?", iAddr)) hDst.close()