Index: graphspell/__init__.py ================================================================== --- graphspell/__init__.py +++ graphspell/__init__.py @@ -1,2 +1,11 @@ + +""" +SPELLCHECKER +using a Directed Acyclic Word Graph +with a transducer to retrieve +- lemma of words +- morphologies +with a spell suggestion mechanism +""" from .spellchecker import * Index: graphspell/char_player.py ================================================================== --- graphspell/char_player.py +++ graphspell/char_player.py @@ -1,7 +1,9 @@ -# list of similar chars -# useful for suggestion mechanism +""" +List of similar chars +useful for suggestion mechanism +""" import re import unicodedata @@ -8,10 +10,11 @@ _xTransCharsForSpelling = str.maketrans({ 'ſ': 's', 'ffi': 'ffi', 'ffl': 'ffl', 'ff': 'ff', 'ſt': 'ft', 'fi': 'fi', 'fl': 'fl', 'st': 'st' }) def spellingNormalization (sWord): + "normalization NFC and removing ligatures" return unicodedata.normalize("NFC", sWord.translate(_xTransCharsForSpelling)) _xTransCharsForSimplification = str.maketrans({ 'à': 'a', 'é': 'e', 'î': 'i', 'ô': 'o', 'û': 'u', 'ÿ': 'i', "y": "i", 'ä': 'a', 'ê': 'e', 'í': 'i', 'ó': 'o', 'ü': 'u', 'ý': 'i', 'á': 'a', 'ë': 'e', 'ì': 'i', 'ò': 'o', 'ú': 'u', 'ỳ': 'i', 'ā': 'a', 'ē': 'e', 'ī': 'i', 'ō': 'o', 'ū': 'u', 'ȳ': 'i', 'ç': 'c', 'ñ': 'n', 'k': 'q', 'w': 'v', 'œ': 'oe', 'æ': 'ae', - 'ſ': 's', 'ffi': 'ffi', 'ffl': 'ffl', 'ff': 'ff', 'ſt': 'ft', 'fi': 'fi', 'fl': 'fl', 'st': 'st', + 'ſ': 's', 'ffi': 'ffi', 'ffl': 'ffl', 'ff': 'ff', 'ſt': 'ft', 'fi': 'fi', 'fl': 'fl', 'st': 'st', }) def simplifyWord (sWord): "word simplification before calculating distance between words" sWord = sWord.lower().translate(_xTransCharsForSimplification) @@ -92,11 +95,11 @@ "f": "fF", "F": "Ff", "g": "gGjJĵĴ", "G": "GgJjĴĵ", - + "h": "hH", "H": "Hh", "i": "iIîÎïÏyYíÍìÌīĪÿŸ", "I": "IiÎîÏïYyÍíÌìĪīŸÿ", @@ -237,10 +240,11 @@ "Z": ("SS", "ZH"), } def get1toXReplacement (cPrev, cCur, cNext): + "return tuple of replacements for <cCur>" if cCur in aConsonant and (cPrev in aConsonant or cNext in aConsonant): return () return d1toX.get(cCur, ()) Index: graphspell/dawg.py ================================================================== --- graphspell/dawg.py +++ graphspell/dawg.py @@ -1,15 +1,16 @@ #!python3 -# FSA DICTIONARY BUILDER -# -# by Olivier R. -# License: MPL 2 -# -# This tool encodes lexicon into an indexable binary dictionary -# Input files MUST be encoded in UTF-8. - +""" +FSA DICTIONARY BUILDER + +by Olivier R. +License: MPL 2 + +This tool encodes lexicon into an indexable binary dictionary +Input files MUST be encoded in UTF-8. +""" import sys import os import collections import json @@ -21,10 +22,11 @@ from .progressbar import ProgressBar def readFile (spf): + "generator: read file <spf> and return for each line a list of elements separated by a tabulation." print(" < Read lexicon: " + spf) if os.path.isfile(spf): with open(spf, "r", encoding="utf-8") as hSrc: for sLine in hSrc: sLine = sLine.strip() @@ -97,23 +99,23 @@ nTag += 1 dTagOccur[sTag] = dTagOccur.get(sTag, 0) + 1 aEntry.add((sFlex, dAff[sAff], dTag[sTag])) if not aEntry: raise ValueError("# Error. 
Empty lexicon") - + # Preparing DAWG print(" > Preparing list of words") print(" Filter: " + (sSelectFilterRegex or "[None]")) lVal = lChar + lAff + lTag lWord = [ [dChar[c] for c in sFlex] + [iAff+nChar] + [iTag+nChar+nAff] for sFlex, iAff, iTag in aEntry ] aEntry = None - + # Dictionary of arc values occurrency, to sort arcs of each node dValOccur = dict( [ (dChar[c], dCharOccur[c]) for c in dChar ] \ + [ (dAff[aff]+nChar, dAffOccur[aff]) for aff in dAff ] \ + [ (dTag[tag]+nChar+nAff, dTagOccur[tag]) for tag in dTag ] ) - + self.sFileName = src if type(src) is str else "[None]" self.sLangCode = sLangCode self.sLangName = sLangName self.sDicName = sDicName self.nEntry = len(lWord) @@ -132,15 +134,15 @@ self.nArcVal = len(lVal) self.nTag = self.nArcVal - self.nChar - nAff self.cStemming = cStemming if cStemming == "A": self.funcStemming = st.changeWordWithAffixCode - elif cStemming == "S": + elif cStemming == "S": self.funcStemming = st.changeWordWithSuffixCode else: self.funcStemming = st.noStemming - + # build lWord.sort() oProgBar = ProgressBar(0, len(lWord)) for aEntry in lWord: self.insert(aEntry) @@ -147,20 +149,21 @@ oProgBar.increment(1) oProgBar.done() self.finish() self.countNodes() self.countArcs() - self.sortNodes() # version 2 and 3 + self.sortNodes() # version 2 and 3 self.sortNodeArcs(dValOccur) #self.sortNodeArcs2 (self.oRoot, "") self.displayInfo() # BUILD DAWG def insert (self, aEntry): + "insert a new entry (insertion must be made in alphabetical order)." if aEntry < self.aPreviousEntry: sys.exit("# Error: Words must be inserted in alphabetical order.") - + # find common prefix between word and previous word nCommonPrefix = 0 for i in range(min(len(aEntry), len(self.aPreviousEntry))): if aEntry[i] != self.aPreviousEntry[i]: break @@ -179,11 +182,11 @@ iChar = nCommonPrefix for c in aEntry[nCommonPrefix:]: oNextNode = DawgNode() oNode.arcs[c] = oNextNode self.lUncheckedNodes.append((oNode, c, oNextNode)) - if iChar == (len(aEntry) - 2): + if iChar == (len(aEntry) - 2): oNode.final = True iChar += 1 oNode = oNextNode oNode.final = True self.aPreviousEntry = aEntry @@ -203,54 +206,61 @@ # add the state to the minimized nodes. 
self.lMinimizedNodes[oChildNode] = oChildNode self.lUncheckedNodes.pop() def countNodes (self): + "count the number of nodes of the whole word graph" self.nNode = len(self.lMinimizedNodes) def countArcs (self): + "count the number of arcs in the whole word graph" self.nArc = 0 for oNode in self.lMinimizedNodes: self.nArc += len(oNode.arcs) - + def sortNodeArcs (self, dValOccur): + "sort arcs of each node according to <dValOccur>" print(" > Sort node arcs") self.oRoot.sortArcs(dValOccur) for oNode in self.lMinimizedNodes: oNode.sortArcs(dValOccur) - + def sortNodeArcs2 (self, oNode, cPrevious=""): + "sort arcs of each node depending on the previous char" # recursive function dCharOccur = getCharOrderAfterChar(cPrevious) if dCharOccur: oNode.sortArcs2(dCharOccur, self.lArcVal) for nArcVal, oNextNode in oNode.arcs.items(): self.sortNodeArcs2(oNextNode, self.lArcVal[nArcVal]) def sortNodes (self): + "sort nodes" print(" > Sort nodes") for oNode in self.oRoot.arcs.values(): self._parseNodes(oNode) - + def _parseNodes (self, oNode): # Warning: recursive method if oNode.pos > 0: return oNode.setPos() self.lSortedNodes.append(oNode) for oNextNode in oNode.arcs.values(): - self._parseNodes(oNextNode) - + self._parseNodes(oNextNode) + def lookup (self, sWord): + "return True if <sWord> is within the word graph (debugging)" oNode = self.oRoot for c in sWord: if self.dChar.get(c, '') not in oNode.arcs: return False oNode = oNode.arcs[self.dChar[c]] return oNode.final def morph (self, sWord): + "return a string of the morphologies of <sWord> (debugging)" oNode = self.oRoot for c in sWord: if self.dChar.get(c, '') not in oNode.arcs: return '' oNode = oNode.arcs[self.dChar[c]] @@ -265,10 +275,11 @@ s += "]" return s return '' def displayInfo (self): + "display information about the word graph" print(" * {:<12} {:>16,}".format("Entries:", self.nEntry)) print(" * {:<12} {:>16,}".format("Characters:", self.nChar)) print(" * {:<12} {:>16,}".format("Affixes:", self.nAff)) print(" * {:<12} {:>16,}".format("Tags:", self.nTag)) print(" * {:<12} {:>16,}".format("Arc values:", self.nArcVal)) @@ -275,10 +286,11 @@ print(" * {:<12} {:>16,}".format("Nodes:", self.nNode)) print(" * {:<12} {:>16,}".format("Arcs:", self.nArc)) print(" * {:<12} {:>16}".format("Stemming:", self.cStemming + "FX")) def getArcStats (self): + "return a string with statistics about nodes and arcs" d = {} for oNode in self.lMinimizedNodes: n = len(oNode.arcs) d[n] = d.get(n, 0) + 1 s = " * Nodes:\n" @@ -285,10 +297,11 @@ for n in d: s = s + " {:>9} nodes have {:>3} arcs\n".format(d[n], n) return s def writeInfo (self, sPathFile): + "write information in file <sPathFile>" print(" > Write informations") with open(sPathFile, 'w', encoding='utf-8', newline="\n") as hDst: hDst.write(self.getArcStats()) hDst.write("\n * Values:\n") for i, s in enumerate(self.lArcVal): @@ -394,10 +407,11 @@ if self.lSortedNodes[i].size != nSize: self.lSortedNodes[i].size = nSize bEnd = False def getBinaryAsJSON (self, nCompressionMethod=1, bBinaryDictAsHexString=True): + "return a JSON string containing all necessary data of the dictionary (compressed as a binary string)" self._calculateBinary(nCompressionMethod) byDic = b"" if nCompressionMethod == 1: byDic = self.oRoot.convToBytes1(self.nBytesArc, self.nBytesNodeAddress) for oNode in self.lMinimizedNodes: @@ -436,10 +450,11 @@ # https://github.com/mozilla/addons-linter/issues/1361 "sByDic": byDic.hex() if bBinaryDictAsHexString else [ e for e in byDic ] } def writeAsJSObject (self, spfDst, nCompressionMethod, bInJSModule=False, 
bBinaryDictAsHexString=True): + "write a file (JSON or JS module) with all the necessary data" if not spfDst.endswith(".json"): spfDst += "."+str(nCompressionMethod)+".json" with open(spfDst, "w", encoding="utf-8", newline="\n") as hDst: if bInJSModule: hDst.write('// JavaScript\n// Generated data (do not edit)\n\n"use strict";\n\nconst dictionary = ') @@ -447,17 +462,19 @@ if bInJSModule: hDst.write(";\n\nexports.dictionary = dictionary;\n") def writeBinary (self, sPathFile, nCompressionMethod, bDebug=False): """ + Save as a binary file. + Format of the binary indexable dictionary: Each section is separated with 4 bytes of \0 - + - Section Header: /grammalecte-fsa/[compression method] * compression method is an ASCII string - + - Section Informations: /[lang code] /[lang name] /[dictionary name] /[date creation] @@ -472,14 +489,14 @@ /[stemming code] * "S" means stems are generated by /suffix_code/, "A" means they are generated by /affix_code/ See defineSuffixCode() and defineAffixCode() for details. "N" means no stemming - + - Section Values: * a list of strings encoded in binary from utf-8, each value separated with a tabulation - + - Section Word Graph (nodes / arcs) * A list of nodes which are a list of arcs with an address of the next node. See DawgNode.convToBytes() for details. """ self._calculateBinary(nCompressionMethod) @@ -520,30 +537,32 @@ def _writeNodes (self, sPathFile, nCompressionMethod): "for debugging only" print(" > Write nodes") with open(sPathFile+".nodes."+str(nCompressionMethod)+".txt", 'w', encoding='utf-8', newline="\n") as hDst: if nCompressionMethod == 1: - hDst.write(self.oRoot.getTxtRepr1(self.nBytesArc, self.nBytesNodeAddress, self.lArcVal)+"\n") + hDst.write(self.oRoot.getTxtRepr1(self.nBytesArc, self.lArcVal)+"\n") #hDst.write( ''.join( [ "%02X " % z for z in self.oRoot.convToBytes1(self.nBytesArc, self.nBytesNodeAddress) ] ).strip() ) for oNode in self.lMinimizedNodes: - hDst.write(oNode.getTxtRepr1(self.nBytesArc, self.nBytesNodeAddress, self.lArcVal)+"\n") + hDst.write(oNode.getTxtRepr1(self.nBytesArc, self.lArcVal)+"\n") if nCompressionMethod == 2: - hDst.write(self.oRoot.getTxtRepr2(self.nBytesArc, self.nBytesNodeAddress, self.lArcVal)+"\n") + hDst.write(self.oRoot.getTxtRepr2(self.nBytesArc, self.lArcVal)+"\n") for oNode in self.lSortedNodes: - hDst.write(oNode.getTxtRepr2(self.nBytesArc, self.nBytesNodeAddress, self.lArcVal)+"\n") + hDst.write(oNode.getTxtRepr2(self.nBytesArc, self.lArcVal)+"\n") if nCompressionMethod == 3: - hDst.write(self.oRoot.getTxtRepr3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset, self.lArcVal)+"\n") + hDst.write(self.oRoot.getTxtRepr3(self.nBytesArc, self.nBytesOffset, self.lArcVal)+"\n") #hDst.write( ''.join( [ "%02X " % z for z in self.oRoot.convToBytes3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset) ] ).strip() ) for oNode in self.lSortedNodes: - hDst.write(oNode.getTxtRepr3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset, self.lArcVal)+"\n") + hDst.write(oNode.getTxtRepr3(self.nBytesArc, self.nBytesOffset, self.lArcVal)+"\n") class DawgNode: + """Node of the word graph""" + NextId = 0 NextPos = 1 # (version 2) - + def __init__ (self): self.i = DawgNode.NextId DawgNode.NextId += 1 self.final = False self.arcs = {} # key: arc value; value: a node @@ -551,19 +570,21 @@ self.pos = 0 # position in the binary dictionary (version 2) self.size = 0 # size of node in bytes (version 3) @classmethod def resetNextId (cls): + "set NextId to 0 " cls.NextId = 0 def setPos (self): # version 2 + "define 
a position for node (version 2)" self.pos = DawgNode.NextPos DawgNode.NextPos += 1 def __str__ (self): # Caution! this function is used for hashing and comparison! - sFinalChar = "1" if self.final else "0"; + sFinalChar = "1" if self.final else "0" l = [sFinalChar] for (key, node) in self.arcs.items(): l.append(str(key)) l.append(str(node.i)) return "_".join(l) @@ -576,36 +597,40 @@ # Used as a key in a python dictionary. # Nodes are equivalent if they have identical arcs, and each identical arc leads to identical states. return self.__str__() == other.__str__() def sortArcs (self, dValOccur): + "sort arcs of node according to " self.arcs = collections.OrderedDict(sorted(self.arcs.items(), key=lambda t: dValOccur.get(t[0], 0), reverse=True)) def sortArcs2 (self, dValOccur, lArcVal): + "sort arcs of each node depending on the previous char" self.arcs = collections.OrderedDict(sorted(self.arcs.items(), key=lambda t: dValOccur.get(lArcVal[t[0]], 0), reverse=True)) # VERSION 1 ===================================================================================================== def convToBytes1 (self, nBytesArc, nBytesNodeAddress): """ + Convert to bytes (method 1). + Node scheme: - Arc length is defined by nBytesArc - Address length is defined by nBytesNodeAddress - + | Arc | Address of next node | | | | - /---------------\ /---------------\ /---------------\ /---------------\ /---------------\ /---------------\ - | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - \---------------/ \---------------/ \---------------/ \---------------/ \---------------/ \---------------/ + ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓ + ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ + ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛ [...] 
- /---------------\ /---------------\ /---------------\ /---------------\ /---------------\ /---------------\ - | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - \---------------/ \---------------/ \---------------/ \---------------/ \---------------/ \---------------/ + ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓ + ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ + ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛ ^ ^ - | | - | | - | \___ if 1, last arc of this node - \_____ if 1, this node is final (only on the first arc) + ┃ ┃ + ┃ ┃ + ┃ ┗━━━ if 1, last arc of this node + ┗━━━━━ if 1, this node is final (only on the first arc) """ nArc = len(self.arcs) nFinalNodeMask = 1 << ((nBytesArc*8)-1) nFinalArcMask = 1 << ((nBytesArc*8)-2) if len(self.arcs) == 0: @@ -621,12 +646,13 @@ if i == nArc: val = val | nFinalArcMask by += val.to_bytes(nBytesArc, byteorder='big') by += self.arcs[arc].addr.to_bytes(nBytesNodeAddress, byteorder='big') return by - - def getTxtRepr1 (self, nBytesArc, nBytesNodeAddress, lVal): + + def getTxtRepr1 (self, nBytesArc, lVal): + "return representation as string of node (method 1)" nArc = len(self.arcs) nFinalNodeMask = 1 << ((nBytesArc*8)-1) nFinalArcMask = 1 << ((nBytesArc*8)-2) s = "i{:_>10} -- #{:_>10}\n".format(self.i, self.addr) if len(self.arcs) == 0: @@ -642,28 +668,30 @@ return s # VERSION 2 ===================================================================================================== def convToBytes2 (self, nBytesArc, nBytesNodeAddress): """ + Convert to bytes (method 2). + Node scheme: - Arc length is defined by nBytesArc - Address length is defined by nBytesNodeAddress - + | Arc | Address of next node | | | | - /---------------\ /---------------\ /---------------\ /---------------\ /---------------\ /---------------\ - | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - \---------------/ \---------------/ \---------------/ \---------------/ \---------------/ \---------------/ + ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓ + ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ + ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛ [...] 
- /---------------\ /---------------\ /---------------\ /---------------\ /---------------\ /---------------\ - | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - \---------------/ \---------------/ \---------------/ \---------------/ \---------------/ \---------------/ + ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓ + ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ + ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛ ^ ^ ^ - | | | - | | \_ if 1, caution, no address: next node is the following node - | \___ if 1, last arc of this node - \_____ if 1, this node is final (only on the first arc) + ┃ ┃ ┃ + ┃ ┃ ┗━━ if 1, caution, no address: next node is the following node + ┃ ┗━━━━ if 1, last arc of this node + ┗━━━━━━ if 1, this node is final (only on the first arc) """ nArc = len(self.arcs) nFinalNodeMask = 1 << ((nBytesArc*8)-1) nFinalArcMask = 1 << ((nBytesArc*8)-2) nNextNodeMask = 1 << ((nBytesArc*8)-3) @@ -684,12 +712,13 @@ by += val.to_bytes(nBytesArc, byteorder='big') else: by += val.to_bytes(nBytesArc, byteorder='big') by += self.arcs[arc].addr.to_bytes(nBytesNodeAddress, byteorder='big') return by - - def getTxtRepr2 (self, nBytesArc, nBytesNodeAddress, lVal): + + def getTxtRepr2 (self, nBytesArc, lVal): + "return representation as string of node (method 2)" nArc = len(self.arcs) nFinalNodeMask = 1 << ((nBytesArc*8)-1) nFinalArcMask = 1 << ((nBytesArc*8)-2) nNextNodeMask = 1 << ((nBytesArc*8)-3) s = "i{:_>10} -- #{:_>10}\n".format(self.i, self.addr) @@ -702,41 +731,43 @@ val = val | nFinalNodeMask if i == nArc: val = val | nFinalArcMask if (self.pos + 1) == self.arcs[arc].pos and self.i != 0: val = val | nNextNodeMask - s += " {:<20} {:0>16}\n".format(lVal[arc], bin(val)[2:], "") + s += " {:<20} {:0>16}\n".format(lVal[arc], bin(val)[2:]) else: s += " {:<20} {:0>16} i{:_>10} #{:_>10}\n".format(lVal[arc], bin(val)[2:], self.arcs[arc].i, self.arcs[arc].addr) return s # VERSION 3 ===================================================================================================== def convToBytes3 (self, nBytesArc, nBytesNodeAddress, nBytesOffset): """ + Convert to bytes (method 3). + Node scheme: - Arc length is defined by nBytesArc - Address length is defined by nBytesNodeAddress - Offset length is defined by nBytesOffset - + | Arc | Address of next node or offset to next node | | | | - /---------------\ /---------------\ /---------------\ /---------------\ /---------------\ /---------------\ - |1|0|0| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - \---------------/ \---------------/ \---------------/ \---------------/ \---------------/ \---------------/ + ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓ + ┃1┃0┃0┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ + ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛ [...] 
- /---------------\ /---------------\ /---------------\ - |0|0|1| | | | | | | | | | | | | | | | | | | | | | | | Offsets are shorter than addresses - \---------------/ \---------------/ \---------------/ - /---------------\ /---------------\ /---------------\ /---------------\ /---------------\ /---------------\ - |0|1|0| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - \---------------/ \---------------/ \---------------/ \---------------/ \---------------/ \---------------/ + ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓ + ┃0┃0┃1┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ Offsets are shorter than addresses + ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛ + ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓ + ┃0┃1┃0┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ + ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛ ^ ^ ^ - | | | - | | \_ if 1, offset instead of address of next node - | \___ if 1, last arc of this node - \_____ if 1, this node is final (only on the first arc) + ┃ ┃ ┃ + ┃ ┃ ┗━━ if 1, offset instead of address of next node + ┃ ┗━━━━ if 1, last arc of this node + ┗━━━━━━ if 1, this node is final (only on the first arc) """ nArc = len(self.arcs) nFinalNodeMask = 1 << ((nBytesArc*8)-1) nFinalArcMask = 1 << ((nBytesArc*8)-2) nNextNodeMask = 1 << ((nBytesArc*8)-3) @@ -759,12 +790,13 @@ by += (self.arcs[arc].addr-self.addr).to_bytes(nBytesOffset, byteorder='big') else: by += val.to_bytes(nBytesArc, byteorder='big') by += self.arcs[arc].addr.to_bytes(nBytesNodeAddress, byteorder='big') return by - - def getTxtRepr3 (self, nBytesArc, nBytesNodeAddress, nBytesOffset, lVal): + + def getTxtRepr3 (self, nBytesArc, nBytesOffset, lVal): + "return representation as string of node (method 3)" nArc = len(self.arcs) nFinalNodeMask = 1 << ((nBytesArc*8)-1) nFinalArcMask = 1 << ((nBytesArc*8)-2) nNextNodeMask = 1 << ((nBytesArc*8)-3) nMaxOffset = (2 ** (nBytesOffset * 8)) - 1 @@ -794,20 +826,23 @@ "": {} } def addWordToCharDict (sWord): + "for each character of , count how many times it appears after the previous character, and store result in a <_dCharOrder>" cPrevious = "" for cChar in sWord: if cPrevious not in _dCharOrder: _dCharOrder[cPrevious] = {} _dCharOrder[cPrevious][cChar] = _dCharOrder[cPrevious].get(cChar, 0) + 1 cPrevious = cChar def getCharOrderAfterChar (cChar): + "return a dictionary of chars with number of times it appears after character " return _dCharOrder.get(cChar, None) def displayCharOrder (): + "display how many times each character appear after another one" for key, value in _dCharOrder.items(): print("[" + key + "]: ", ", ".join([ c+":"+str(n) for c, n in sorted(value.items(), key=lambda t: t[1], reverse=True) ])) Index: graphspell/echo.py ================================================================== --- graphspell/echo.py +++ graphspell/echo.py @@ -1,9 +1,13 @@ #!python3 -# The most boring yet indispensable function: print! +""" +The most boring yet indispensable function: print! +Because you can print on Windows console without being sure the script won’t crash… +Windows console don’t accept many characters. 
+""" import sys _CHARMAP = str.maketrans({ 'œ': 'ö', 'Œ': 'Ö', 'ʳ': "r", 'ᵉ': "e", '…': "_", \ @@ -22,8 +26,8 @@ if sys.platform != "win32": print(obj, sep=sep, end=end, file=file, flush=flush) return True try: print(str(obj).translate(_CHARMAP), sep=sep, end=end, file=file, flush=flush) - except: + except Exception: print(str(obj).encode('ascii', 'replace').decode('ascii', 'replace'), sep=sep, end=end, file=file, flush=flush) return True Index: graphspell/fr.py ================================================================== --- graphspell/fr.py +++ graphspell/fr.py @@ -1,6 +1,8 @@ -# French language +""" +Default suggestion for French language +""" dSugg = { "bcp": "beaucoup", "ca": "ça", "cad": "c’est-à-dire", Index: graphspell/ibdawg.py ================================================================== --- graphspell/ibdawg.py +++ graphspell/ibdawg.py @@ -1,8 +1,13 @@ #!python3 -import os +""" +INDEXABLE BINARY DIRECT ACYCLIC WORD GRAPH +Implementation of a spellchecker as a transducer (storing transformation code to get lemma and morphologies) +and a spell suggestion mechanim +""" + import traceback import pkgutil import re from functools import wraps import time @@ -19,10 +24,11 @@ def timethis (func): "decorator for the execution time" @wraps(func) def wrapper (*args, **kwargs): + "something to prevent pylint whining" fStart = time.time() result = func(*args, **kwargs) fEnd = time.time() print(func.__name__, fEnd - fStart) return result @@ -56,11 +62,11 @@ self.aSugg.add(sSugg) if nDist < self.nMinDist: self.nMinDist = nDist self.nDistLimit = min(self.nDistLimit, self.nMinDist+2) - def getSuggestions (self, nSuggLimit=10, nDistLimit=-1): + def getSuggestions (self, nSuggLimit=10): "return a list of suggestions" if self.dSugg[0]: # we sort the better results with the original word self.dSugg[0].sort(key=lambda sSugg: st.distanceDamerauLevenshtein(self.sWord, sSugg)) lRes = self.dSugg.pop(0) @@ -75,10 +81,11 @@ elif self.sWord[0:1].isupper(): lRes = list(map(lambda sSugg: sSugg[0:1].upper()+sSugg[1:], lRes)) # dont’ use <.istitle> return lRes[:nSuggLimit] def reset (self): + "clear data" self.aSugg.clear() self.dSugg.clear() class IBDAWG: @@ -147,11 +154,11 @@ raise ValueError("# Error. 
Unknown dictionary version: {}".format(self.by[17:18])) try: header, info, values, bdic = self.by.split(b"\0\0\0\0", 3) except Exception: raise Exception - + self.nCompressionMethod = int(self.by[17:18].decode("utf-8")) self.sHeader = header.decode("utf-8") self.lArcVal = values.decode("utf-8").split("\t") self.nArcVal = len(self.lArcVal) self.byDic = bdic @@ -182,10 +189,11 @@ self.__dict__.update(oJSON) self.byDic = binascii.unhexlify(self.sByDic) self.dCharVal = { v: k for k, v in self.dChar.items() } def getInfo (self): + "return string about the IBDAWG" return " Language: {0.sLangName} Lang code: {0.sLangCode} Dictionary name: {0.sDicName}" \ " Compression method: {0.nCompressionMethod:>2} Date: {0.sDate} Stemming: {0.cStemming}FX\n" \ " Arcs values: {0.nArcVal:>10,} = {0.nChar:>5,} characters, {0.nAff:>6,} affixes, {0.nTag:>6,} tags\n" \ " Dictionary: {0.nEntry:>12,} entries, {0.nNode:>11,} nodes, {0.nArc:>11,} arcs\n" \ " Address size: {0.nBytesNodeAddress:>1} bytes, Arc size: {0.nBytesArc:>1} bytes\n".format(self) @@ -194,35 +202,35 @@ "write IBDAWG as a JavaScript object in a JavaScript module" with open(spfDest, "w", encoding="utf-8", newline="\n") as hDst: if bInJSModule: hDst.write('// JavaScript\n// Generated data (do not edit)\n\n"use strict";\n\nconst dictionary = ') hDst.write(json.dumps({ - "sHeader": "/grammalecte-fsa/", - "sLangCode": self.sLangCode, - "sLangName": self.sLangName, - "sDicName": self.sDicName, - "sFileName": self.sFileName, - "sDate": self.sDate, - "nEntry": self.nEntry, - "nChar": self.nChar, - "nAff": self.nAff, - "nTag": self.nTag, - "cStemming": self.cStemming, - "dChar": self.dChar, - "nNode": self.nNode, - "nArc": self.nArc, - "nArcVal": self.nArcVal, - "lArcVal": self.lArcVal, - "nCompressionMethod": self.nCompressionMethod, - "nBytesArc": self.nBytesArc, - "nBytesNodeAddress": self.nBytesNodeAddress, - "nBytesOffset": self.nBytesOffset, - # JavaScript is a pile of shit, so Mozilla’s JS parser don’t like file bigger than 4 Mb! - # So, if necessary, we use an hexadecimal string, that we will convert later in Firefox’s extension. - # https://github.com/mozilla/addons-linter/issues/1361 - "sByDic": self.byDic.hex() if bBinaryDictAsHexString else [ e for e in self.byDic ] - }, ensure_ascii=False)) + "sHeader": "/grammalecte-fsa/", + "sLangCode": self.sLangCode, + "sLangName": self.sLangName, + "sDicName": self.sDicName, + "sFileName": self.sFileName, + "sDate": self.sDate, + "nEntry": self.nEntry, + "nChar": self.nChar, + "nAff": self.nAff, + "nTag": self.nTag, + "cStemming": self.cStemming, + "dChar": self.dChar, + "nNode": self.nNode, + "nArc": self.nArc, + "nArcVal": self.nArcVal, + "lArcVal": self.lArcVal, + "nCompressionMethod": self.nCompressionMethod, + "nBytesArc": self.nBytesArc, + "nBytesNodeAddress": self.nBytesNodeAddress, + "nBytesOffset": self.nBytesOffset, + # JavaScript is a pile of shit, so Mozilla’s JS parser doesn’t like files bigger than 4 MB! + # So, if necessary, we use a hexadecimal string that we will convert later in Firefox’s extension. 
+ # https://github.com/mozilla/addons-linter/issues/1361 + "sByDic": self.byDic.hex() if bBinaryDictAsHexString else [ e for e in self.byDic ] + }, ensure_ascii=False)) if bInJSModule: hDst.write(";\n\nexports.dictionary = dictionary;\n") def isValidToken (self, sToken): "checks if <sToken> is valid (if there are hyphens in <sToken>, it is split and each part is checked)" @@ -265,11 +273,11 @@ iAddr = 0 for c in sWord: if c not in self.dChar: return False iAddr = self._lookupArcNode(self.dChar[c], iAddr) - if iAddr == None: + if iAddr is None: return False return bool(int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask) def getMorph (self, sWord): "retrieves morphologies list, different casing allowed" @@ -344,17 +352,17 @@ self._suggest(oSuggResult, "", nMaxSwitch, nMaxDel, nMaxHardRepl, nMaxJump, nDist, nDeep+1, iAddr, sNewWord, True) # remove last char and go on for sRepl in cp.dFinal1.get(sRemain, ()): self._suggest(oSuggResult, sRepl, nMaxSwitch, nMaxDel, nMaxHardRepl, nMaxJump, nDist, nDeep+1, iAddr, sNewWord, True) #@timethis - def suggest2 (self, sWord, nMaxSugg=10): + def suggest2 (self, sWord, nSuggLimit=10): "returns a set of suggestions for <sWord>" sWord = cp.spellingNormalization(sWord) sPfx, sWord, sSfx = cp.cut(sWord) oSuggResult = SuggResult(sWord) self._suggest2(oSuggResult) - aSugg = oSuggResult.getSuggestions() + aSugg = oSuggResult.getSuggestions(nSuggLimit) if sSfx or sPfx: # we add what we removed return list(map(lambda sSug: sPfx + sSug + sSfx, aSugg)) return aSugg @@ -407,21 +415,21 @@ "show the path taken by <sWord> in the graph" sWord = cp.spellingNormalization(sWord) c1 = sWord[0:1] if sWord else " " iPos = -1 n = 0 - print(c1 + ": ", end="") + echo(c1 + ": ", end="") for c2, jAddr in self._getCharArcs(iAddr): - print(c2, end="") + echo(c2, end="") if c2 == sWord[0:1]: iNextNodeAddr = jAddr iPos = n n += 1 if not sWord: return if iPos >= 0: - print("\n "+ " " * iPos + "|") + echo("\n " + " " * iPos + "|") self.drawPath(sWord[1:], iNextNodeAddr) def getSimilarEntries (self, sWord, nSuggLimit=10): "return a list of tuples (similar word, stem, morphology)" if not sWord: @@ -469,21 +477,21 @@ iAddr = 0 for c in sWord: if c not in self.dChar: return [] iAddr = self._lookupArcNode(self.dChar[c], iAddr) - if iAddr == None: + if iAddr is None: return [] - if (int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask): + if int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask: l = [] 
nRawArc = 0 while not (nRawArc & self._lastArcMask): iEndArcAddr = iAddr + self.nBytesArc nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') nArc = nRawArc & self._arcMask if nArc > self.nChar: - # This value is not a char, this is a stemming code + # This value is not a char, this is a stemming code sStem = ">" + self.funcStemming(sWord, self.lArcVal[nArc]) # Now , we go to the next node and retrieve all following arcs values, all of them are tags iAddr2 = int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big') nRawArc2 = 0 while not (nRawArc2 & self._lastArcMask): @@ -500,21 +508,21 @@ iAddr = 0 for c in sWord: if c not in self.dChar: return [] iAddr = self._lookupArcNode(self.dChar[c], iAddr) - if iAddr == None: + if iAddr is None: return [] - if (int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask): + if int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask: l = [] nRawArc = 0 while not (nRawArc & self._lastArcMask): iEndArcAddr = iAddr + self.nBytesArc nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') nArc = nRawArc & self._arcMask if nArc > self.nChar: - # This value is not a char, this is a stemming code + # This value is not a char, this is a stemming code l.append(self.funcStemming(sWord, self.lArcVal[nArc])) iAddr = iEndArcAddr+self.nBytesNodeAddress return l return [] @@ -522,33 +530,33 @@ "looks if <nVal> is an arc at the node at <iAddr>, if yes, returns address of next node else None" while True: iEndArcAddr = iAddr+self.nBytesArc nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') if nVal == (nRawArc & self._arcMask): - # the value we are looking for + # the value we are looking for # we return the address of the next node return int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big') else: # value not found - if (nRawArc & self._lastArcMask): + if nRawArc & self._lastArcMask: return None iAddr = iEndArcAddr+self.nBytesNodeAddress def _getArcs1 (self, iAddr): "generator: return all arcs at <iAddr> as tuples of (nVal, iAddr)" while True: iEndArcAddr = iAddr+self.nBytesArc nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') - yield (nRawArc & self._arcMask, int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big')) - if (nRawArc & self._lastArcMask): + yield nRawArc & self._arcMask, int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big') + if nRawArc & self._lastArcMask: break iAddr = iEndArcAddr+self.nBytesNodeAddress def _writeNodes1 (self, spfDest): "for debugging only" print(" > Write binary nodes") - with codecs.open(spfDest, 'w', 'utf-8', newline="\n") as hDst: + with open(spfDest, 'w', encoding='utf-8', newline="\n") as hDst: iAddr = 0 hDst.write("i{:_>10} -- #{:_>10}\n".format("0", iAddr)) while iAddr < len(self.byDic): iEndArcAddr = iAddr+self.nBytesArc nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') @@ -567,21 +575,21 @@ iAddr = 0 for c in sWord: if c not in self.dChar: return [] iAddr = self._lookupArcNode(self.dChar[c], iAddr) - if iAddr == None: + if iAddr is None: return [] - if (int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask): + if int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask: l = [] nRawArc = 0 while not (nRawArc & self._lastArcMask): iEndArcAddr = iAddr + self.nBytesArc nRawArc 
= int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') nArc = nRawArc & self._arcMask if nArc > self.nChar: - # This value is not a char, this is a stemming code + # This value is not a char, this is a stemming code sStem = ">" + self.funcStemming(sWord, self.lArcVal[nArc]) # Now , we go to the next node and retrieve all following arcs values, all of them are tags if not (nRawArc & self._addrBitMask): iAddr2 = int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big') else: @@ -605,21 +613,21 @@ iAddr = 0 for c in sWord: if c not in self.dChar: return [] iAddr = self._lookupArcNode(self.dChar[c], iAddr) - if iAddr == None: + if iAddr is None: return [] - if (int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask): + if int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask: l = [] nRawArc = 0 while not (nRawArc & self._lastArcMask): iEndArcAddr = iAddr + self.nBytesArc nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') nArc = nRawArc & self._arcMask if nArc > self.nChar: - # This value is not a char, this is a stemming code + # This value is not a char, this is a stemming code l.append(self.funcStemming(sWord, self.lArcVal[nArc])) # Now , we go to the next node if not (nRawArc & self._addrBitMask): iAddr2 = int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big') else: @@ -636,11 +644,11 @@ "looks if <nVal> is an arc at the node at <iAddr>, if yes, returns address of next node else None" while True: iEndArcAddr = iAddr+self.nBytesArc nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') if nVal == (nRawArc & self._arcMask): - # the value we are looking for + # the value we are looking for if not (nRawArc & self._addrBitMask): # we return the address of the next node return int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big') else: # we go to the end of the node @@ -649,18 +657,18 @@ nRawArc = int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') iAddr += self.nBytesArc + self.nBytesNodeAddress if not (nRawArc & self._addrBitMask) else self.nBytesArc return iAddr else: # value not found - if (nRawArc & self._lastArcMask): + if nRawArc & self._lastArcMask: return None iAddr = iEndArcAddr+self.nBytesNodeAddress if not (nRawArc & self._addrBitMask) else iEndArcAddr def _writeNodes2 (self, spfDest): "for debugging only" print(" > Write binary nodes") - with codecs.open(spfDest, 'w', 'utf-8', newline="\n") as hDst: + with open(spfDest, 'w', encoding='utf-8', newline="\n") as hDst: iAddr = 0 hDst.write("i{:_>10} -- #{:_>10}\n".format("0", iAddr)) while iAddr < len(self.byDic): iEndArcAddr = iAddr+self.nBytesArc nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') @@ -670,11 +678,11 @@ hDst.write(" {:<20} {:0>16} i{:>10} #{:_>10}\n".format(self.lArcVal[nArc], bin(nRawArc)[2:], "?", iNextNodeAddr)) iAddr = iEndArcAddr+self.nBytesNodeAddress else: hDst.write(" {:<20} {:0>16}\n".format(self.lArcVal[nArc], bin(nRawArc)[2:])) iAddr = iEndArcAddr - if (nRawArc & self._lastArcMask): + if nRawArc & self._lastArcMask: hDst.write("\ni{:_>10} -- #{:_>10}\n".format("?", iAddr)) hDst.close() # VERSION 3 def _morph3 (self, sWord): @@ -682,22 +690,22 @@ iAddr = 0 for c in sWord: if c not in self.dChar: return [] iAddr = self._lookupArcNode(self.dChar[c], iAddr) - if iAddr == None: + if iAddr is None: return [] - if (int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask): + if int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask: l = [] nRawArc = 0 iAddrNode = iAddr while not (nRawArc & self._lastArcMask): iEndArcAddr = iAddr + self.nBytesArc nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') nArc = nRawArc & self._arcMask if nArc > self.nChar: - # This value is not a char, this is a stemming code + # This value is not a char, this is a stemming code sStem = ">" + self.funcStemming(sWord, self.lArcVal[nArc]) # Now , we go to the next node and retrieve all following arcs values, all of them are tags if not (nRawArc & self._addrBitMask): iAddr2 = int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big') else: @@ -717,22 +725,22 @@ iAddr = 0 for c in sWord: if c not in self.dChar: return [] iAddr = self._lookupArcNode(self.dChar[c], iAddr) - if iAddr == None: + if iAddr is None: return [] - if 
(int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask): + if int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask: l = [] nRawArc = 0 - iAddrNode = iAddr + #iAddrNode = iAddr while not (nRawArc & self._lastArcMask): iEndArcAddr = iAddr + self.nBytesArc nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') nArc = nRawArc & self._arcMask if nArc > self.nChar: - # This value is not a char, this is a stemming code + # This value is not a char, this is a stemming code l.append(self.funcStemming(sWord, self.lArcVal[nArc])) iAddr = iEndArcAddr+self.nBytesNodeAddress if not (nRawArc & self._addrBitMask) else iEndArcAddr+self.nBytesOffset return l return [] @@ -741,25 +749,25 @@ iAddrNode = iAddr while True: iEndArcAddr = iAddr+self.nBytesArc nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') if nVal == (nRawArc & self._arcMask): - # the value we are looking for + # the value we are looking for if not (nRawArc & self._addrBitMask): return int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big') else: return iAddrNode + int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesOffset], byteorder='big') else: # value not found - if (nRawArc & self._lastArcMask): + if nRawArc & self._lastArcMask: return None iAddr = iEndArcAddr+self.nBytesNodeAddress if not (nRawArc & self._addrBitMask) else iEndArcAddr+self.nBytesOffset def _writeNodes3 (self, spfDest): "for debugging only" print(" > Write binary nodes") - with codecs.open(spfDest, 'w', 'utf-8', newline="\n") as hDst: + with open(spfDest, 'w', encoding='utf-8', newline="\n") as hDst: iAddr = 0 hDst.write("i{:_>10} -- #{:_>10}\n".format("0", iAddr)) while iAddr < len(self.byDic): iEndArcAddr = iAddr+self.nBytesArc nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') @@ -770,8 +778,8 @@ iAddr = iEndArcAddr+self.nBytesNodeAddress else: iNextNodeAddr = int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesOffset], byteorder='big') hDst.write(" {:<20} {:0>16} i{:>10} +{:_>10}\n".format(self.lArcVal[nArc], bin(nRawArc)[2:], "?", iNextNodeAddr)) iAddr = iEndArcAddr+self.nBytesOffset - if (nRawArc & self._lastArcMask): + if nRawArc & self._lastArcMask: hDst.write("\ni{:_>10} -- #{:_>10}\n".format("?", iAddr)) hDst.close() Index: graphspell/keyboard_chars_proximity.py ================================================================== --- graphspell/keyboard_chars_proximity.py +++ graphspell/keyboard_chars_proximity.py @@ -1,13 +1,18 @@ -# Keyboard chars proximity + +""" +Keyboard chars proximity +""" def getKeyboardMap (sKeyboard): + "return keyboard map as a dictionary of chars" return _dKeyboardMap.get(sKeyboard.lower(), {}) def getKeyboardList (): + "return list of keyboards available" return _dKeyboardMap.keys() _dKeyboardMap = { # keyboards by alphabetical order Index: graphspell/progressbar.py ================================================================== --- graphspell/progressbar.py +++ graphspell/progressbar.py @@ -1,14 +1,17 @@ -# Textual progressbar +""" +Textual progressbar +""" + # by Olivier R. 
# License: MPL 2 import time class ProgressBar: "Textual progressbar" - + def __init__ (self, nMin=0, nMax=100, nWidth=78): "initiate with minimum nMin to maximum nMax" self.nMin = nMin self.nMax = nMax self.nSpan = nMax - nMin @@ -17,19 +20,19 @@ self.nCurVal = nMin self.startTime = time.time() self._update() def _update (self): - fDone = ((self.nCurVal - self.nMin) / self.nSpan) + fDone = (self.nCurVal - self.nMin) / self.nSpan nAdvance = int(fDone * self.nWidth) - if (nAdvance > self.nAdvance): + if nAdvance > self.nAdvance: self.nAdvance = nAdvance print("\r[ {}{} {}% ] ".format('>'*nAdvance, ' '*(self.nWidth-nAdvance), round(fDone*100)), end="") def increment (self, n=1): "increment value by n (1 by default)" self.nCurVal += n self._update() - + def done (self): "to call when it’s finished" print("\r[ task done in {:.1f} s ] ".format(time.time() - self.startTime)) Index: graphspell/spellchecker.py ================================================================== --- graphspell/spellchecker.py +++ graphspell/spellchecker.py @@ -1,14 +1,15 @@ -# Spellchecker -# Wrapper for the IBDAWG class. -# Useful to check several dictionaries at once. - -# To avoid iterating over a pile of dictionaries, it is assumed that 3 are enough: -# - the main dictionary, bundled with the package -# - the extended dictionary -# - the community dictionary, added by an organization -# - the personal dictionary, created by the user for its own convenience +""" +Spellchecker. +Useful to check several dictionaries at once. + +To avoid iterating over a pile of dictionaries, it is assumed that 4 are enough: +- the main dictionary, bundled with the package +- the extended dictionary +- the community dictionary, added by an organization +- the personal dictionary, created by the user for their own convenience +""" import importlib import traceback from . 
import ibdawg @@ -20,10 +21,11 @@ "en": "en.bdic" } class SpellChecker (): + "SpellChecker: wrapper for the IBDAWG class" def __init__ (self, sLangCode, sfMainDic="", sfExtendedDic="", sfCommunityDic="", sfPersonalDic=""): "returns True if the main dictionary is loaded" self.sLangCode = sLangCode if not sfMainDic: @@ -55,23 +57,24 @@ raise Exception(str(e), "Error: <" + str(source) + "> not loaded.") print("Error: <" + str(source) + "> not loaded.") traceback.print_exc() return None - def loadTokenizer (self): + def _loadTokenizer (self): self.oTokenizer = tokenizer.Tokenizer(self.sLangCode) def getTokenizer (self): + "load and return the tokenizer object" if not self.oTokenizer: - self.loadTokenizer() + self._loadTokenizer() return self.oTokenizer def setMainDictionary (self, source): "returns True if the dictionary is loaded" self.oMainDic = self._loadDictionary(source, True) return bool(self.oMainDic) - + def setExtendedDictionary (self, source, bActivate=True): "returns True if the dictionary is loaded" self.oExtendedDic = self._loadDictionary(source) self.bExtendedDic = False if not bActivate else bool(self.oExtendedDic) return bool(self.oExtendedDic) @@ -87,57 +90,68 @@ self.oPersonalDic = self._loadDictionary(source) self.bPersonalDic = False if not bActivate else bool(self.oPersonalDic) return bool(self.oPersonalDic) def activateExtendedDictionary (self): + "activate extended dictionary (if available)" self.bExtendedDic = bool(self.oExtendedDic) def activateCommunityDictionary (self): + "activate community dictionary (if available)" self.bCommunityDic = bool(self.oCommunityDic) def activatePersonalDictionary (self): + "activate personal dictionary (if available)" self.bPersonalDic = bool(self.oPersonalDic) def deactivateExtendedDictionary (self): + "deactivate extended dictionary" self.bExtendedDic = False def deactivateCommunityDictionary (self): + "deactivate community dictionary" self.bCommunityDic = False def deactivatePersonalDictionary (self): + "deactivate personal dictionary" self.bPersonalDic = False # Default suggestions def loadSuggestions (self, sLangCode): + "load default suggestion module for <sLangCode>" try: - suggest_module = importlib.import_module("."+sLangCode, "graphspell") - except: + suggest = importlib.import_module("."+sLangCode, "graphspell") + except ImportError: print("No suggestion module for language <"+sLangCode+">") return - self.dDefaultSugg = suggest_module.dSugg + self.dDefaultSugg = suggest.dSugg # Storage def activateStorage (self): + "store all lemmas and morphologies retrieved from the word graph" self.bStorage = True def deactivateStorage (self): + "stop storing all lemmas and morphologies retrieved from the word graph" self.bStorage = False def clearStorage (self): + "clear all stored data" self._dLemmas.clear() self._dMorphologies.clear() # parse text functions def parseParagraph (self, sText, bSpellSugg=False): + "return a list of tokens where token value doesn’t exist in the word graph" if not self.oTokenizer: - self.loadTokenizer() + self._loadTokenizer() aSpellErrs = [] for dToken in self.oTokenizer.genTokens(sText): if dToken['sType'] == "WORD" and not self.isValidToken(dToken['sValue']): if bSpellSugg: dToken['aSuggestions'] = [] @@ -145,12 +159,14 @@ dToken['aSuggestions'].extend(lSugg) aSpellErrs.append(dToken) return aSpellErrs def countWordsOccurrences (self, sText, bByLemma=False, bOnlyUnknownWords=False, dWord={}): + """count word occurrences. 
<dWord> can be used to cumulate counts from several texts.""" if not self.oTokenizer: - self.loadTokenizer() + self._loadTokenizer() for dToken in self.oTokenizer.genTokens(sText): if dToken['sType'] == "WORD": if bOnlyUnknownWords: if not self.isValidToken(dToken['sValue']): dWord[dToken['sValue']] = dWord.get(dToken['sValue'], 0) + 1 @@ -180,11 +196,11 @@ "checks if sWord is valid (different casing tested if the first letter is a capital)" if self.oMainDic.isValid(sWord): return True if self.bExtendedDic and self.oExtendedDic.isValid(sWord): return True - if self.bCommunityDic and self.oCommunityDic.isValid(sToken): + if self.bCommunityDic and self.oCommunityDic.isValid(sWord): return True if self.bPersonalDic and self.oPersonalDic.isValid(sWord): return True return False @@ -192,11 +208,11 @@ "checks if sWord is in dictionary as is (strict verification)" if self.oMainDic.lookup(sWord): return True if self.bExtendedDic and self.oExtendedDic.lookup(sWord): return True - if self.bCommunityDic and self.oCommunityDic.lookup(sToken): + if self.bCommunityDic and self.oCommunityDic.lookup(sWord): return True if self.bPersonalDic and self.oPersonalDic.lookup(sWord): return True return False @@ -252,10 +268,11 @@ yield from self.oCommunityDic.select(sFlexPattern, sTagsPattern) if self.bPersonalDic: yield from self.oPersonalDic.select(sFlexPattern, sTagsPattern) def drawPath (self, sWord): + "draw the path taken by <sWord> within the word graph: display matching nodes and their arcs" self.oMainDic.drawPath(sWord) if self.bExtendedDic: print("-----") self.oExtendedDic.drawPath(sWord) if self.bCommunityDic: Index: graphspell/str_transform.py ================================================================== --- graphspell/str_transform.py +++ graphspell/str_transform.py @@ -1,25 +1,32 @@ #!python3 +""" +Operations on strings: +- calculate distance between two strings +- transform strings with transformation codes +""" + #### DISTANCE CALCULATIONS def longestCommonSubstring (s1, s2): + "longest common substring" # http://en.wikipedia.org/wiki/Longest_common_substring_problem # http://en.wikibooks.org/wiki/Algorithm_implementation/Strings/Longest_common_substring - M = [ [0]*(1+len(s2)) for i in range(1+len(s1)) ] - longest, x_longest = 0, 0 + lMatrix = [ [0]*(1+len(s2)) for i in range(1+len(s1)) ] + nLongest, nLongestX = 0, 0 for x in range(1, 1+len(s1)): for y in range(1, 1+len(s2)): if s1[x-1] == s2[y-1]: - M[x][y] = M[x-1][y-1] + 1 - if M[x][y] > longest: - longest = M[x][y] - x_longest = x + lMatrix[x][y] = lMatrix[x-1][y-1] + 1 + if lMatrix[x][y] > nLongest: + nLongest = lMatrix[x][y] + nLongestX = x else: - M[x][y] = 0 - return s1[x_longest-longest : x_longest] + lMatrix[x][y] = 0 + return s1[nLongestX-nLongest : nLongestX] def distanceDamerauLevenshtein (s1, s2): "distance of Damerau-Levenshtein between <s1> and <s2>" # https://fr.wikipedia.org/wiki/Distance_de_Damerau-Levenshtein @@ -54,11 +61,11 @@ i1, i2 = 0, 0 # Cursors for each string nLargestCS = 0 # Largest common substring nLocalCS = 0 # Local common substring nTrans = 0 # Number of transpositions ('ab' vs 'ba') lOffset = [] # Offset pair array, for computing the transpositions - + while i1 < nLen1 and i2 < nLen2: if s1[i1] == s2[i2]: nLocalCS += 1 # Check if current match is a transposition bTrans = False @@ -103,10 +110,11 @@ nLargestCS += nLocalCS return round(max(nLen1, nLen2) - nLargestCS + nTrans) def showDistance (s1, s2): + "display Damerau-Levenshtein distance and Sift4 distance between <s1> and <s2>" print("Damerau-Levenshtein: " + s1 + "/" + s2 + " = " + 
distanceDamerauLevenshtein(s1, s2)) print("Sift4:" + s1 + "/" + s2 + " = " + distanceSift4(s1, s2)) @@ -114,23 +122,27 @@ #### STEMMING OPERATIONS ## No stemming def noStemming (sFlex, sStem): + "return <sStem>" return sStem -def rebuildWord (sFlex, cmd1, cmd2): - if cmd1 == "_": +def rebuildWord (sFlex, sCode1, sCode2): + """ Change <sFlex> with codes <sCode1> and <sCode2> (each code inserts a char at a defined position). + + """ + if sCode1 == "_": + return sFlex + n, c = sCode1.split(":") + n = int(n) # the position is stored as text in the code + sFlex = sFlex[:n] + c + sFlex[n:] + if sCode2 == "_": return sFlex - n, c = cmd1.split(":") - s = s[:n] + c + s[n:] - if cmd2 == "_": - return s - n, c = cmd2.split(":") - return s[:n] + c + s[n:] - - + n, c = sCode2.split(":") + n = int(n) + return sFlex[:n] + c + sFlex[n:] + + ## Define affixes for stemming # Note: 48 is the ASCII code for "0" @@ -150,14 +162,15 @@ jSfx = 0 for i in range(min(len(sFlex), len(sStem))): if sFlex[i] != sStem[i]: break jSfx += 1 - return chr(len(sFlex)-jSfx+48) + sStem[jSfx:] + return chr(len(sFlex)-jSfx+48) + sStem[jSfx:] def changeWordWithSuffixCode (sWord, sSfxCode): + "apply transformation code <sSfxCode> on <sWord> and return the result string" if sSfxCode == "0": return sWord return sWord[:-(ord(sSfxCode[0])-48)] + sSfxCode[1:] if sSfxCode[0] != '0' else sWord + sSfxCode[1:] @@ -167,11 +180,11 @@ """ Returns a string defining how to get stem from flexion. Examples: "0" if stem = flexion "stem" if no common substring "n(pfx)/m(sfx)" with n and m: chars with numeric meaning, "0" = 0, "1" = 1, ... ":" = 10, etc. (See ASCII table.) Says how many letters to strip from flexion. - pfx [optional]: string to add before the flexion + pfx [optional]: string to add before the flexion sfx [optional]: string to add after the flexion """ if sFlex == sStem: return "0" # is stem a substring of flexion? @@ -189,13 +202,13 @@ return chr(n+48) + sPfx + "/" + chr(m+48) + sSfx return sStem def changeWordWithAffixCode (sWord, sAffCode): + "apply transformation code <sAffCode> on <sWord> and return the result string" if sAffCode == "0": return sWord if '/' not in sAffCode: return sAffCode sPfxCode, sSfxCode = sAffCode.split('/') - sWord = sPfxCode[1:] + sWord[(ord(sPfxCode[0])-48):] + sWord = sPfxCode[1:] + sWord[(ord(sPfxCode[0])-48):] return sWord[:-(ord(sSfxCode[0])-48)] + sSfxCode[1:] if sSfxCode[0] != '0' else sWord + sSfxCode[1:] - Index: graphspell/tokenizer.py ================================================================== --- graphspell/tokenizer.py +++ graphspell/tokenizer.py @@ -1,6 +1,9 @@ -# Very simple tokenizer +""" +Very simple tokenizer +using regular expressions +""" import re _PATTERNS = { "default": @@ -37,21 +40,23 @@ ) } class Tokenizer: + "Tokenizer: transforms a text into a list of tokens" def __init__ (self, sLang): self.sLang = sLang if sLang not in _PATTERNS: self.sLang = "default" self.zToken = re.compile( "(?i)" + '|'.join(sRegex for sRegex in _PATTERNS[sLang]) ) def genTokens (self, sText, bStartEndToken=False): + "generator: tokenize <sText>" i = 0 if bStartEndToken: yield { "i": 0, "sType": "INFO", "sValue": "<start>", "nStart": 0, "nEnd": 0 } for i, m in enumerate(self.zToken.finditer(sText), 1): yield { "i": i, "sType": m.lastgroup, "sValue": m.group(), "nStart": m.start(), "nEnd": m.end() } if bStartEndToken: iEnd = len(sText) yield { "i": i+1, "sType": "INFO", "sValue": "<end>", "nStart": iEnd, "nEnd": iEnd }
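
Reviewer note — a minimal usage sketch of the stemming transformation codes that str_transform.py documents above ("48 is the ASCII code for '0'"). This is not part of the patch; it assumes graphspell is importable as a package, and the example words are chosen only for illustration:

    from graphspell.str_transform import changeWordWithSuffixCode, changeWordWithAffixCode

    # Suffix code "1r": strip 1 trailing char ("z"), then append "r".
    print(changeWordWithSuffixCode("mangez", "1r"))    # -> "manger"

    # Affix code "2/1r": strip 2 leading chars, then strip 1 trailing char and append "r".
    print(changeWordWithAffixCode("relevez", "2/1r"))  # -> "lever"

Both calls round-trip exactly as encoded by defineSuffixCode()/defineAffixCode(): the leading char of each code part is a count offset from ASCII "0", the rest is the string to append.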