DELETED gc_core/py/char_player.py Index: gc_core/py/char_player.py ================================================================== --- gc_core/py/char_player.py +++ gc_core/py/char_player.py @@ -1,324 +0,0 @@ -# list of similar chars -# useful for suggestion mechanism - -import re - - -_xTransChars = str.maketrans({ - 'à': 'a', 'é': 'e', 'î': 'i', 'ô': 'o', 'û': 'u', 'ÿ': 'i', "y": "i", - 'â': 'a', 'è': 'e', 'ï': 'i', 'ö': 'o', 'ù': 'u', 'ŷ': 'i', - 'ä': 'a', 'ê': 'e', 'í': 'i', 'ó': 'o', 'ü': 'u', 'ý': 'i', - 'á': 'a', 'ë': 'e', 'ì': 'i', 'ò': 'o', 'ú': 'u', 'ỳ': 'i', - 'ā': 'a', 'ē': 'e', 'ī': 'i', 'ō': 'o', 'ū': 'u', 'ȳ': 'i', - 'ñ': 'n', 'k': 'q', 'w': 'v', - 'œ': 'oe', 'æ': 'ae', -}) - -def simplifyWord (sWord): - "word simplication before calculating distance between words" - sWord = sWord.lower().translate(_xTransChars) - sNewWord = "" - for i, c in enumerate(sWord, 1): - if c != sWord[i:i+1]: - sNewWord += c - return sNewWord.replace("eau", "o").replace("au", "o").replace("ai", "e").replace("ei", "e").replace("ph", "f") - - -aVowel = set("aáàâäāeéèêëēiíìîïīoóòôöōuúùûüūyýỳŷÿȳœæAÁÀÂÄĀEÉÈÊËĒIÍÌÎÏĪOÓÒÔÖŌUÚÙÛÜŪYÝỲŶŸȲŒÆ") -aConsonant = set("bcçdfghjklmnñpqrstvwxzBCÇDFGHJKLMNÑPQRSTVWXZ") -aDouble = set("bcdfjklmnprstzBCDFJKLMNPRSTZ") # letters that may be used twice successively - - -# Similar chars - -d1to1 = { - "1": "liîLIÎ", - "2": "zZ", - "3": "eéèêEÉÈÊ", - "4": "aàâAÀÂ", - "5": "sgSG", - "6": "bdgBDG", - "7": "ltLT", - "8": "bB", - "9": "gbdGBD", - "0": "oôOÔ", - - "a": "aàâáäæ", - "A": "AÀÂÁÄÆ", - "à": "aàâáäæ", - "À": "AÀÂÁÄÆ", - "â": "aàâáäæ", - "Â": "AÀÂÁÄÆ", - "á": "aàâáäæ", - "Á": "AÀÂÁÄÆ", - "ä": "aàâáäæ", - "Ä": "AÀÂÁÄÆ", - - "æ": "æéa", - "Æ": "ÆÉA", - - "c": "cçskqśŝ", - "C": "CÇSKQŚŜ", - "ç": "cçskqśŝ", - "Ç": "CÇSKQŚŜ", - - "e": "eéèêëœ", - "E": "EÉÈÊËŒ", - "é": "eéèêëœ", - "É": "EÉÈÊËŒ", - "ê": "eéèêëœ", - "Ê": "EÉÈÊËŒ", - "è": "eéèêëœ", - "È": "EÉÈÊËŒ", - "ë": "eéèêëœ", - "Ë": "EÉÈÊËŒ", - - "g": "gj", - "G": "GJ", - - "i": "iîïyíìÿ", - "I": "IÎÏYÍÌŸ", - "î": "iîïyíìÿ", - "Î": "IÎÏYÍÌŸ", - "ï": "iîïyíìÿ", - "Ï": "IÎÏYÍÌŸ", - "í": "iîïyíìÿ", - "Í": "IÎÏYÍÌŸ", - "ì": "iîïyíìÿ", - "Ì": "IÎÏYÍÌŸ", - - "j": "jg", - "J": "JG", - - "k": "kcq", - "K": "KCQ", - - "n": "nñ", - "N": "NÑ", - - "o": "oôóòöœ", - "O": "OÔÓÒÖŒ", - "ô": "oôóòöœ", - "Ô": "OÔÓÒÖŒ", - "ó": "oôóòöœ", - "Ó": "OÔÓÒÖŒ", - "ò": "oôóòöœ", - "Ò": "OÔÓÒÖŒ", - "ö": "oôóòöœ", - "Ö": "OÔÓÒÖŒ", - - "œ": "œoôeéèêë", - "Œ": "ŒOÔEÉÈÊË", - - "q": "qck", - "Q": "QCK", - - "s": "sśŝcç", - "S": "SŚŜCÇ", - "ś": "sśŝcç", - "Ś": "SŚŜCÇ", - "ŝ": "sśŝcç", - "Ŝ": "SŚŜCÇ", - - "u": "uûùüú", - "U": "UÛÙÜÚ", - "û": "uûùüú", - "Û": "UÛÙÜÚ", - "ù": "uûùüú", - "Ù": "UÛÙÜÚ", - "ü": "uûùüú", - "Ü": "UÛÙÜÚ", - "ú": "uûùüú", - "Ú": "UÛÙÜÚ", - - "v": "vw", - "V": "VW", - - "w": "wv", - "W": "WV", - - "x": "xck", - "X": "XCK", - - "y": "yÿiîŷýỳ", - "Y": "YŸIÎŶÝỲ", - "ÿ": "yÿiîŷýỳ", - "Ÿ": "YŸIÎŶÝỲ", - "ŷ": "yÿiîŷýỳ", - "Ŷ": "YŸIÎŶÝỲ", - "ý": "yÿiîŷýỳ", - "Ý": "YŸIÎŶÝỲ", - "ỳ": "yÿiîŷýỳ", - "Ỳ": "YŸIÎŶÝỲ", - - "z": "zs", - "Z": "ZS", -} - -d1toX = { - "æ": ("ae",), - "Æ": ("AE",), - "b": ("bb",), - "B": ("BB",), - "c": ("cc", "ss", "qu", "ch"), - "C": ("CC", "SS", "QU", "CH"), - "d": ("dd",), - "D": ("DD",), - "é": ("ai", "ei"), - "É": ("AI", "EI"), - "f": ("ff", "ph"), - "F": ("FF", "PH"), - "g": ("gu", "ge", "gg", "gh"), - "G": ("GU", "GE", "GG", "GH"), - "j": ("jj", "dj"), - "J": ("JJ", "DJ"), - "k": ("qu", "ck", "ch", "cu", "kk", "kh"), - "K": ("QU", "CK", "CH", "CU", "KK", "KH"), - "l": ("ll",), - "L": ("LL",), - "m": ("mm", "mn"), - 
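# [Editor's illustration; not part of the original file] These tables feed the
# suggestion engine in ibdawg.py: d1to1 maps a char to the single chars that may
# replace it, d1toX (this table) maps a char to multi-char replacements. For example:
#   d1to1["o"]  is  "oôóòöœ"      (any of them may stand in for "o" during the DAWG walk)
#   d1toX["f"]  is  ("ff", "ph")  (so a typed "f" is also tried as "ff" or "ph")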
"M": ("MM", "MN"), - "n": ("nn", "nm", "mn"), - "N": ("NN", "NM", "MN"), - "o": ("au", "eau"), - "O": ("AU", "EAU"), - "œ": ("oe", "eu"), - "Œ": ("OE", "EU"), - "p": ("pp", "ph"), - "P": ("PP", "PH"), - "q": ("qu", "ch", "cq", "ck", "kk"), - "Q": ("QU", "CH", "CQ", "CK", "KK"), - "r": ("rr",), - "R": ("RR",), - "s": ("ss", "sh"), - "S": ("SS", "SH"), - "t": ("tt", "th"), - "T": ("TT", "TH"), - "x": ("cc", "ct", "xx"), - "X": ("CC", "CT", "XX"), - "z": ("ss", "zh"), - "Z": ("SS", "ZH"), -} - - -def get1toXReplacement (cPrev, cCur, cNext): - if cCur in aConsonant and (cPrev in aConsonant or cNext in aConsonant): - return () - return d1toX.get(cCur, ()) - - -d2toX = { - "am": ("an", "en", "em"), - "AM": ("AN", "EN", "EM"), - "an": ("am", "en", "em"), - "AN": ("AM", "EN", "EM"), - "au": ("eau", "o", "ô"), - "AU": ("EAU", "O", "Ô"), - "em": ("an", "am", "en"), - "EM": ("AN", "AM", "EN"), - "en": ("an", "am", "em"), - "EN": ("AN", "AM", "EM"), - "ai": ("ei", "é", "è", "ê", "ë"), - "AI": ("EI", "É", "È", "Ê", "Ë"), - "ei": ("ai", "é", "è", "ê", "ë"), - "EI": ("AI", "É", "È", "Ê", "Ë"), - "ch": ("sh", "c", "ss"), - "CH": ("SH", "C", "SS"), - "ct": ("x", "cc"), - "CT": ("X", "CC"), - "oa": ("oi",), - "OA": ("OI",), - "oi": ("oa", "oie"), - "OI": ("OA", "OIE"), - "ph": ("f",), - "PH": ("F",), - "qu": ("q", "cq", "ck", "c", "k"), - "QU": ("Q", "CQ", "CK", "C", "K"), - "ss": ("c", "ç"), - "SS": ("C", "Ç"), - "un": ("ein",), - "UN": ("EIN",), -} - - -# End of word - -dFinal1 = { - "a": ("as", "at", "ant", "ah"), - "A": ("AS", "AT", "ANT", "AH"), - "c": ("ch",), - "C": ("CH",), - "e": ("et", "er", "ets", "ée", "ez", "ai", "ais", "ait", "ent", "eh"), - "E": ("ET", "ER", "ETS", "ÉE", "EZ", "AI", "AIS", "AIT", "ENT", "EH"), - "é": ("et", "er", "ets", "ée", "ez", "ai", "ais", "ait"), - "É": ("ET", "ER", "ETS", "ÉE", "EZ", "AI", "AIS", "AIT"), - "è": ("et", "er", "ets", "ée", "ez", "ai", "ais", "ait"), - "È": ("ET", "ER", "ETS", "ÉE", "EZ", "AI", "AIS", "AIT"), - "ê": ("et", "er", "ets", "ée", "ez", "ai", "ais", "ait"), - "Ê": ("ET", "ER", "ETS", "ÉE", "EZ", "AI", "AIS", "AIT"), - "ë": ("et", "er", "ets", "ée", "ez", "ai", "ais", "ait"), - "Ë": ("ET", "ER", "ETS", "ÉE", "EZ", "AI", "AIS", "AIT"), - "g": ("gh",), - "G": ("GH",), - "i": ("is", "it", "ie", "in"), - "I": ("IS", "IT", "IE", "IN"), - "n": ("nt", "nd", "ns", "nh"), - "N": ("NT", "ND", "NS", "NH"), - "o": ("aut", "ot", "os"), - "O": ("AUT", "OT", "OS"), - "ô": ("aut", "ot", "os"), - "Ô": ("AUT", "OT", "OS"), - "ö": ("aut", "ot", "os"), - "Ö": ("AUT", "OT", "OS"), - "p": ("ph",), - "P": ("PH",), - "s": ("sh",), - "S": ("SH",), - "t": ("th",), - "T": ("TH",), - "u": ("ut", "us", "uh"), - "U": ("UT", "US", "UH"), -} - -dFinal2 = { - "ai": ("aient", "ais", "et"), - "AI": ("AIENT", "AIS", "ET"), - "an": ("ant", "ent"), - "AN": ("ANT", "ENT"), - "en": ("ent", "ant"), - "EN": ("ENT", "ANT"), - "ei": ("ait", "ais"), - "EI": ("AIT", "AIS"), - "on": ("ons", "ont"), - "ON": ("ONS", "ONT"), - "oi": ("ois", "oit", "oix"), - "OI": ("OIS", "OIT", "OIX"), -} - - -# Préfixes et suffixes - -aPfx1 = frozenset([ - "anti", "archi", "contre", "hyper", "mé", "méta", "im", "in", "ir", "par", "proto", - "pseudo", "pré", "re", "ré", "sans", "sous", "supra", "sur", "ultra" -]) -aPfx2 = frozenset([ - "belgo", "franco", "génito", "gynéco", "médico", "russo" -]) - - -_zMotAvecPronom = re.compile("^(?i)(\\w+)(-(?:t-|)(?:ils?|elles?|on|je|tu|nous|vous))$") - -def cut (sWord): - "returns a tuple of strings (prefix, trimed_word, suffix)" - m = _zMotAvecPronom.search(sWord) - if m: - 
return ("", m.group(1), m.group(2)) - return ("", sWord, "") - - -# Other functions - -def filterSugg (aSugg): - "exclude suggestions" - return filter(lambda sSugg: not sSugg.endswith(("è", "È")), aSugg) DELETED gc_core/py/dawg.py Index: gc_core/py/dawg.py ================================================================== --- gc_core/py/dawg.py +++ gc_core/py/dawg.py @@ -1,775 +0,0 @@ -#!python3 - -# FSA DICTIONARY BUILDER -# -# by Olivier R. -# License: MPL 2 -# -# This tool encodes lexicon into an indexable binary dictionary -# Input files MUST be encoded in UTF-8. - - -import sys -import os -import collections - -from . import str_transform as st -from .progressbar import ProgressBar - - - -def readFile (spf): - print(" < Read lexicon: " + spf) - if os.path.isfile(spf): - with open(spf, "r", encoding="utf-8") as hSrc: - for sLine in hSrc: - sLine = sLine.strip() - if sLine and not sLine.startswith("#"): - yield sLine - else: - raise OSError("# Error. File not found or not loadable: " + spf) - - -def getElemsFromFile (spf): - "returns tuple of (flexion, stem, tags) from lexicon file" - nErr = 0 - if not spf.endswith(".clex"): - for sLine in readFile(spf): - try: - sFlex, sStem, sTag = sLine.split("\t") - yield (sFlex, sStem, sTag) - except: - nErr += 1 - else: - sTag = "_" # neutral tag - sTag2 = "" - for sLine in readFile(spf): - if sLine.startswith("[") and sLine.endswith("]"): - # tag line - if "-->" in sLine: - try: - sTag, sSfxCode, sTag2 = sLine[1:-1].split(" --> ") - except: - nErr += 1 - continue - sTag = sTag.strip() - sSfxCode = sSfxCode.strip() - sTag2 = sTag2.strip() - else: - sTag = sLine[1:-1] - sTag2 = "" - else: - # entry line - if "\t" in sLine: - if sLine.count("\t") > 1: - nErr += 1 - continue - sFlex, sStem = sLine.split("\t") - else: - sFlex = sStem = sLine - #print(sFlex, sStem, sTag) - yield (sFlex, sStem, sTag) - if sTag2: - sFlex2 = st.changeWordWithSuffixCode(sFlex, sSfxCode) - #print(sFlex2, sStem, sTag2) - yield (sFlex2, sStem, sTag2) - if nErr: - print(" # Lines ignored: {:>10}".format(nErr)) - - - -class DAWG: - """DIRECT ACYCLIC WORD GRAPH""" - # This code is inspired from Steve Hanov’s DAWG, 2011. (http://stevehanov.ca/blog/index.php?id=115) - # We store suffix/affix codes and tags within the graph after the “real” word. - # A word is a list of numbers [ c1, c2, c3 . . . cN, iAffix, iTags] - # Each arc is an index in self.lArcVal, where are stored characters, suffix/affix codes for stemming and tags. - # Important: As usual, the last node (after ‘iTags’) is tagged final, AND the node after ‘cN’ is ALSO tagged final. - - def __init__ (self, spfSrc, sLangName, cStemming): - print("===== Direct Acyclic Word Graph - Minimal Acyclic Finite State Automaton =====") - cStemming = cStemming.upper() - if cStemming == "A": - funcStemmingGen = st.defineAffixCode - elif cStemming == "S": - funcStemmingGen = st.defineSuffixCode - elif cStemming == "N": - funcStemmingGen = st.noStemming - else: - raise ValueError("# Error. 
Unknown stemming code: {}".format(cStemming)) - - lEntry = [] - lChar = ['']; dChar = {}; nChar = 1; dCharOccur = {} - lAff = []; dAff = {}; nAff = 0; dAffOccur = {} - lTag = []; dTag = {}; nTag = 0; dTagOccur = {} - nErr = 0 - - # read lexicon - for sFlex, sStem, sTag in getElemsFromFile(spfSrc): - addWordToCharDict(sFlex) - # chars - for c in sFlex: - if c not in dChar: - dChar[c] = nChar - lChar.append(c) - nChar += 1 - dCharOccur[c] = dCharOccur.get(c, 0) + 1 - # affixes to find stem from flexion - aff = funcStemmingGen(sFlex, sStem) - if aff not in dAff: - dAff[aff] = nAff - lAff.append(aff) - nAff += 1 - dAffOccur[aff] = dAffOccur.get(aff, 0) + 1 - # tags - if sTag not in dTag: - dTag[sTag] = nTag - lTag.append(sTag) - nTag += 1 - dTagOccur[sTag] = dTagOccur.get(sTag, 0) + 1 - lEntry.append((sFlex, dAff[aff], dTag[sTag])) - if not lEntry: - raise ValueError("# Error. Empty lexicon") - - # Preparing DAWG - print(" > Preparing list of words") - lVal = lChar + lAff + lTag - lWord = [ [dChar[c] for c in sFlex] + [iAff+nChar] + [iTag+nChar+nAff] for sFlex, iAff, iTag in lEntry ] - lEntry = None - - # Dictionary of arc value occurrences, used to sort the arcs of each node - dValOccur = dict( [ (dChar[c], dCharOccur[c]) for c in dChar ] \ - + [ (dAff[aff]+nChar, dAffOccur[aff]) for aff in dAff ] \ - + [ (dTag[tag]+nChar+nAff, dTagOccur[tag]) for tag in dTag ] ) - #with open(spfSrc[:-8]+".valuesfreq.txt", 'w', encoding='utf-8') as hFreqDst: # DEBUG - # for iKey, nOcc in sorted(dValOccur.items(), key=lambda t: t[1], reverse=True): - # hFreqDst.write("{}: {}\n".format(lVal[iKey], nOcc)) - # hFreqDst.close() - - self.sFile = spfSrc - self.sLang = sLangName - self.nEntry = len(lWord) - self.aPreviousEntry = [] - DawgNode.resetNextId() - self.oRoot = DawgNode() - self.lUncheckedNodes = [] # list of nodes that have not been checked for duplication. - self.lMinimizedNodes = {} # dictionary (used as a set) of unique nodes that have been checked for duplication. - self.lSortedNodes = [] # version 2 and 3 - self.nNode = 0 - self.nArc = 0 - self.dChar = dChar - self.nChar = len(dChar) - self.nAff = nAff - self.lArcVal = lVal - self.nArcVal = len(lVal) - self.nTag = self.nArcVal - self.nChar - nAff - self.cStemming = cStemming - if cStemming == "A": - self.funcStemming = st.changeWordWithAffixCode - elif cStemming == "S": - self.funcStemming = st.changeWordWithSuffixCode - else: - self.funcStemming = st.noStemming - - # build - lWord.sort() - oProgBar = ProgressBar(0, len(lWord)) - for aEntry in lWord: - self.insert(aEntry) - oProgBar.increment(1) - oProgBar.done() - self.finish() - self.countNodes() - self.countArcs() - self.sortNodes() - self.sortNodeArcs(dValOccur) - #self.sortNodeArcs2 (self.oRoot, "") - self.displayInfo() - - # BUILD DAWG - def insert (self, aEntry): - if aEntry < self.aPreviousEntry: - sys.exit("# Error: Words must be inserted in alphabetical order.") - - # find common prefix between word and previous word - nCommonPrefix = 0 - for i in range(min(len(aEntry), len(self.aPreviousEntry))): - if aEntry[i] != self.aPreviousEntry[i]: - break - nCommonPrefix += 1 - - # Check lUncheckedNodes for redundant nodes, proceeding from the last - # one down to the common prefix size, then truncate the list at that point.
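# [Editor's illustration; not part of the original file] With sorted input, only the
# tail not shared with the previous word can still change. E.g. after inserting "tape",
# inserting "tapis" yields nCommonPrefix == 3 ("tap"), so _minimize(3) folds the nodes
# created for the "e" tail into lMinimizedNodes before the "is" tail is appended.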
- self._minimize(nCommonPrefix) - - # add the suffix, starting from the correct node mid-way through the graph - if len(self.lUncheckedNodes) == 0: - oNode = self.oRoot - else: - oNode = self.lUncheckedNodes[-1][2] - - iChar = nCommonPrefix - for c in aEntry[nCommonPrefix:]: - oNextNode = DawgNode() - oNode.arcs[c] = oNextNode - self.lUncheckedNodes.append((oNode, c, oNextNode)) - if iChar == (len(aEntry) - 2): - oNode.final = True - iChar += 1 - oNode = oNextNode - oNode.final = True - self.aPreviousEntry = aEntry - - def finish (self): - "minimize unchecked nodes" - self._minimize(0) - - def _minimize (self, downTo): - # proceed from the leaf up to a certain point - for i in range( len(self.lUncheckedNodes)-1, downTo-1, -1 ): - oNode, char, oChildNode = self.lUncheckedNodes[i] - if oChildNode in self.lMinimizedNodes: - # replace the child with the previously encountered one - oNode.arcs[char] = self.lMinimizedNodes[oChildNode] - else: - # add the state to the minimized nodes. - self.lMinimizedNodes[oChildNode] = oChildNode - self.lUncheckedNodes.pop() - - def countNodes (self): - self.nNode = len(self.lMinimizedNodes) - - def countArcs (self): - self.nArc = 0 - for oNode in self.lMinimizedNodes: - self.nArc += len(oNode.arcs) - - def sortNodeArcs (self, dValOccur): - print(" > Sort node arcs") - self.oRoot.sortArcs(dValOccur) - for oNode in self.lMinimizedNodes: - oNode.sortArcs(dValOccur) - - def sortNodeArcs2 (self, oNode, cPrevious=""): - # recursive function - dCharOccur = getCharOrderAfterChar(cPrevious) - if dCharOccur: - oNode.sortArcs2(dCharOccur, self.lArcVal) - for nArcVal, oNextNode in oNode.arcs.items(): - self.sortNodeArcs2(oNextNode, self.lArcVal[nArcVal]) - - def sortNodes (self): - print(" > Sort nodes") - for oNode in self.oRoot.arcs.values(): - self._parseNodes(oNode) - - def _parseNodes (self, oNode): - # Warning: recursive method - if oNode.pos > 0: - return - oNode.setPos() - self.lSortedNodes.append(oNode) - for oNextNode in oNode.arcs.values(): - self._parseNodes(oNextNode) - - def lookup (self, sWord): - oNode = self.oRoot - for c in sWord: - if self.dChar.get(c, '') not in oNode.arcs: - return False - oNode = oNode.arcs[self.dChar[c]] - return oNode.final - - def morph (self, sWord): - oNode = self.oRoot - for c in sWord: - if self.dChar.get(c, '') not in oNode.arcs: - return '' - oNode = oNode.arcs[self.dChar[c]] - if oNode.final: - s = "* " - for arc in oNode.arcs: - if arc >= self.nChar: - s += " [" + self.funcStemming(sWord, self.lArcVal[arc]) - oNode2 = oNode.arcs[arc] - for arc2 in oNode2.arcs: - s += " / " + self.lArcVal[arc2] - s += "]" - return s - return '' - - def displayInfo (self): - print(" * {:<12} {:>16,}".format("Entries:", self.nEntry)) - print(" * {:<12} {:>16,}".format("Characters:", self.nChar)) - print(" * {:<12} {:>16,}".format("Affixes:", self.nAff)) - print(" * {:<12} {:>16,}".format("Tags:", self.nTag)) - print(" * {:<12} {:>16,}".format("Arc values:", self.nArcVal)) - print(" * {:<12} {:>16,}".format("Nodes:", self.nNode)) - print(" * {:<12} {:>16,}".format("Arcs:", self.nArc)) - print(" * {:<12} {:>16}".format("Stemming:", self.cStemming + "FX")) - - def getArcStats (self): - d = {} - for oNode in self.lMinimizedNodes: - n = len(oNode.arcs) - d[n] = d.get(n, 0) + 1 - s = " * Nodes:\n" - for n in d: - s = s + " {:>9} nodes have {:>3} arcs\n".format(d[n], n) - return s - - def writeInfo (self, sPathFile): - print(" > Write informations") - with open(sPathFile, 'w', encoding='utf-8', newline="\n") as hDst: - hDst.write(self.getArcStats()) 
- hDst.write("\n * Values:\n") - for i, s in enumerate(self.lArcVal): - hDst.write(" {:>6}. {}\n".format(i, s)) - hDst.close() - - # BINARY CONVERSION - def createBinary (self, sPathFile, nMethod, bDebug=False): - print(" > Write DAWG as an indexable binary dictionary [method: %d]" % nMethod) - if nMethod == 1: - self.nBytesArc = ( (self.nArcVal.bit_length() + 2) // 8 ) + 1 # We add 2 bits. See DawgNode.convToBytes1() - self._calcNumBytesNodeAddress() - self._calcNodesAddress1() - elif nMethod == 2: - self.nBytesArc = ( (self.nArcVal.bit_length() + 3) // 8 ) + 1 # We add 3 bits. See DawgNode.convToBytes2() - self._calcNumBytesNodeAddress() - self._calcNodesAddress2() - elif nMethod == 3: - self.nBytesArc = ( (self.nArcVal.bit_length() + 3) // 8 ) + 1 # We add 3 bits. See DawgNode.convToBytes3() - self.nBytesOffset = 1 - self.nMaxOffset = (2 ** (self.nBytesOffset * 8)) - 1 - self._calcNumBytesNodeAddress() - self._calcNodesAddress3() - else: - print(" # Error: unknown compression method") - print(" Arc values (chars, affixes and tags): {} -> {} bytes".format( self.nArcVal, len("\t".join(self.lArcVal).encode("utf-8")) )) - print(" Arc size: {} bytes, Address size: {} bytes -> {} * {} = {} bytes".format( self.nBytesArc, self.nBytesNodeAddress, \ - self.nBytesArc+self.nBytesNodeAddress, self.nArc, \ - (self.nBytesArc+self.nBytesNodeAddress)*self.nArc )) - self._writeBinary(sPathFile, nMethod) - if bDebug: - self._writeNodes(sPathFile, nMethod) - - def _calcNumBytesNodeAddress (self): - "how many bytes needed to store all nodes/arcs in the binary dictionary" - self.nBytesNodeAddress = 1 - while ((self.nBytesArc + self.nBytesNodeAddress) * self.nArc) > (2 ** (self.nBytesNodeAddress * 8)): - self.nBytesNodeAddress += 1 - - def _calcNodesAddress1 (self): - nBytesNode = self.nBytesArc + self.nBytesNodeAddress - iAddr = len(self.oRoot.arcs) * nBytesNode - for oNode in self.lMinimizedNodes: - oNode.addr = iAddr - iAddr += max(len(oNode.arcs), 1) * nBytesNode - - def _calcNodesAddress2 (self): - nBytesNode = self.nBytesArc + self.nBytesNodeAddress - iAddr = len(self.oRoot.arcs) * nBytesNode - for oNode in self.lSortedNodes: - oNode.addr = iAddr - iAddr += max(len(oNode.arcs), 1) * nBytesNode - for oNextNode in oNode.arcs.values(): - if (oNode.pos + 1) == oNextNode.pos: - iAddr -= self.nBytesNodeAddress - #break - - def _calcNodesAddress3 (self): - nBytesNode = self.nBytesArc + self.nBytesNodeAddress - # theorical nodes size if only addresses and no offset - self.oRoot.size = len(self.oRoot.arcs) * nBytesNode - for oNode in self.lSortedNodes: - oNode.size = max(len(oNode.arcs), 1) * nBytesNode - # rewind and calculate dropdown from the end, several times - nDiff = self.nBytesNodeAddress - self.nBytesOffset - bEnd = False - while not bEnd: - bEnd = True - # recalculate addresses - iAddr = self.oRoot.size - for oNode in self.lSortedNodes: - oNode.addr = iAddr - iAddr += oNode.size - # rewind and calculate dropdown from the end, several times - for i in range(self.nNode-1, -1, -1): - nSize = max(len(self.lSortedNodes[i].arcs), 1) * nBytesNode - for oNextNode in self.lSortedNodes[i].arcs.values(): - if 1 < (oNextNode.addr - self.lSortedNodes[i].addr) < self.nMaxOffset: - nSize -= nDiff - if self.lSortedNodes[i].size != nSize: - self.lSortedNodes[i].size = nSize - bEnd = False - - def _writeBinary (self, sPathFile, nMethod): - """ - Format of the binary indexable dictionary: - Each section is separated with 4 bytes of \0 - - - Section Header: - /pyfsa/[version] - * version is an ASCII string - - - Section 
Informations: - /[tag_lang] - /[number of chars] - /[number of bytes for each arc] - /[number of bytes for each address node] - /[number of entries] - /[number of nodes] - /[number of arcs] - /[number of affixes] - * each field is a ASCII string - /[stemming code] - * "S" means stems are generated by /suffix_code/, "A" means they are generated by /affix_code/ - See defineSuffixCode() and defineAffixCode() for details. - "N" means no stemming - - - Section Values: - * a list of strings encoded in binary from utf-8, each value separated with a tabulation - - - Section Word Graph (nodes / arcs) - * A list of nodes which are a list of arcs with an address of the next node. - See DawgNode.convToBytes() for details. - """ - if not sPathFile.endswith(".bdic"): - sPathFile += "."+str(nMethod)+".bdic" - with open(sPathFile, 'wb') as hDst: - # header - hDst.write("/pyfsa/{}/".format(nMethod).encode("utf-8")) - hDst.write(b"\0\0\0\0") - # infos - hDst.write("{}/{}/{}/{}/{}/{}/{}/{}/{}".format(self.sLang, self.nChar, self.nBytesArc, self.nBytesNodeAddress, \ - self.nEntry, self.nNode, self.nArc, self.nAff, self.cStemming).encode("utf-8")) - hDst.write(b"\0\0\0\0") - # lArcVal - hDst.write("\t".join(self.lArcVal).encode("utf-8")) - hDst.write(b"\0\0\0\0") - # DAWG: nodes / arcs - if nMethod == 1: - hDst.write(self.oRoot.convToBytes1(self.nBytesArc, self.nBytesNodeAddress)) - for oNode in self.lMinimizedNodes: - hDst.write(oNode.convToBytes1(self.nBytesArc, self.nBytesNodeAddress)) - elif nMethod == 2: - hDst.write(self.oRoot.convToBytes2(self.nBytesArc, self.nBytesNodeAddress)) - for oNode in self.lSortedNodes: - hDst.write(oNode.convToBytes2(self.nBytesArc, self.nBytesNodeAddress)) - elif nMethod == 3: - hDst.write(self.oRoot.convToBytes3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset)) - for oNode in self.lSortedNodes: - hDst.write(oNode.convToBytes3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset)) - hDst.close() - - def _writeNodes (self, sPathFile, nMethod): - "for debugging only" - print(" > Write nodes") - with open(sPathFile+".nodes."+str(nMethod)+".txt", 'w', encoding='utf-8', newline="\n") as hDst: - if nMethod == 1: - hDst.write(self.oRoot.getTxtRepr1(self.nBytesArc, self.nBytesNodeAddress, self.lArcVal)+"\n") - #hDst.write( ''.join( [ "%02X " % z for z in self.oRoot.convToBytes1(self.nBytesArc, self.nBytesNodeAddress) ] ).strip() ) - for oNode in self.lMinimizedNodes: - hDst.write(oNode.getTxtRepr1(self.nBytesArc, self.nBytesNodeAddress, self.lArcVal)+"\n") - if nMethod == 2: - hDst.write(self.oRoot.getTxtRepr2(self.nBytesArc, self.nBytesNodeAddress, self.lArcVal)+"\n") - for oNode in self.lSortedNodes: - hDst.write(oNode.getTxtRepr2(self.nBytesArc, self.nBytesNodeAddress, self.lArcVal)+"\n") - if nMethod == 3: - hDst.write(self.oRoot.getTxtRepr3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset, self.lArcVal)+"\n") - #hDst.write( ''.join( [ "%02X " % z for z in self.oRoot.convToBytes3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset) ] ).strip() ) - for oNode in self.lSortedNodes: - hDst.write(oNode.getTxtRepr3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset, self.lArcVal)+"\n") - hDst.close() - - def writeResults (self, sPathFile): - bFileExits = os.path.isfile("_lexicons.res.txt") - with open("_lexicons.res.txt", "a", encoding='utf-8', newline="\n") as hDst: - sFormat1 = "{:<12} {:>12} {:>5} {:>8} {:>8} {:>6} {:>8} {:>9} {:>9} {:>15} {:>12} {:>12}\n" - sFormat2 = "{:<12} {:>12,} {:>5,} {:>8,} {:>8} {:>6,} {:>8,} {:>9,} {:>9,} {:>15,} 
{:>12,} {:>12,}\n" - if not bFileExits: - hDst.write(sFormat1.format("Lexicon", "Entries", "Chars", "Affixes", "Stemming", "Tags", "Values", "Nodes", "Arcs", "Lexicon (Kb)", "Dict (Kb)", "LT Dict (Kb)")) - hDst.write(sFormat2.format(self.sLang, self.nEntry, self.nChar, self.nAff, self.cStemming + "FX", self.nTag, self.nArcVal, \ - self.nNode, self.nArc, os.path.getsize(self.sFile), os.path.getsize(sPathFile), \ - os.path.getsize("cfsa/dict/{}.dict".format(self.sLang)) if os.path.isfile("cfsa/dict/{}.dict".format(self.sLang)) else 0)) - hDst.close() - - - -class DawgNode: - NextId = 0 - NextPos = 1 # (version 2) - - def __init__ (self): - self.i = DawgNode.NextId - DawgNode.NextId += 1 - self.final = False - self.arcs = {} # key: arc value; value: a node - self.addr = 0 # address in the binary dictionary - self.pos = 0 # position in the binary dictionary (version 2) - self.size = 0 # size of node in bytes (version 3) - - @classmethod - def resetNextId (cls): - cls.NextId = 0 - - def setPos (self): # version 2 - self.pos = DawgNode.NextPos - DawgNode.NextPos += 1 - - def __str__ (self): - # Caution! this function is used for hashing and comparison! - l = [] - if self.final: - l.append("1") - else: - l.append("0") - for (key, node) in self.arcs.items(): - l.append(str(key)) - l.append(str(node.i)) - return "_".join(l) - - def __hash__ (self): - # Used as a key in a python dictionary. - return self.__str__().__hash__() - - def __eq__ (self, other): - # Used as a key in a python dictionary. - # Nodes are equivalent if they have identical arcs, and each identical arc leads to identical states. - return self.__str__() == other.__str__() - - def sortArcs (self, dValOccur): - self.arcs = collections.OrderedDict(sorted(self.arcs.items(), key=lambda t: dValOccur.get(t[0], 0), reverse=True)) - - def sortArcs2 (self, dValOccur, lArcVal): - self.arcs = collections.OrderedDict(sorted(self.arcs.items(), key=lambda t: dValOccur.get(lArcVal[t[0]], 0), reverse=True)) - - # VERSION 1 ===================================================================================================== - def convToBytes1 (self, nBytesArc, nBytesNodeAddress): - """ - Node scheme: - - Arc length is defined by nBytesArc - - Address length is defined by nBytesNodeAddress - - | Arc | Address of next node | - | | | - /---------------\ /---------------\ /---------------\ /---------------\ /---------------\ /---------------\ - | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - \---------------/ \---------------/ \---------------/ \---------------/ \---------------/ \---------------/ - [...] 
- /---------------\ /---------------\ /---------------\ /---------------\ /---------------\ /---------------\ - | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - \---------------/ \---------------/ \---------------/ \---------------/ \---------------/ \---------------/ - ^ ^ - | | - | | - | \___ if 1, last arc of this node - \_____ if 1, this node is final (only on the first arc) - """ - nArc = len(self.arcs) - nFinalNodeMask = 1 << ((nBytesArc*8)-1) - nFinalArcMask = 1 << ((nBytesArc*8)-2) - if len(self.arcs) == 0: - val = nFinalNodeMask | nFinalArcMask - by = val.to_bytes(nBytesArc, byteorder='big') - by += (0).to_bytes(nBytesNodeAddress, byteorder='big') - return by - by = b"" - for i, arc in enumerate(self.arcs, 1): - val = arc - if i == 1 and self.final: - val = val | nFinalNodeMask - if i == nArc: - val = val | nFinalArcMask - by += val.to_bytes(nBytesArc, byteorder='big') - by += self.arcs[arc].addr.to_bytes(nBytesNodeAddress, byteorder='big') - return by - - def getTxtRepr1 (self, nBytesArc, nBytesNodeAddress, lVal): - nArc = len(self.arcs) - nFinalNodeMask = 1 << ((nBytesArc*8)-1) - nFinalArcMask = 1 << ((nBytesArc*8)-2) - s = "i{:_>10} -- #{:_>10}\n".format(self.i, self.addr) - if len(self.arcs) == 0: - s += " {:<20} {:0>16} i{:_>10} #{:_>10}\n".format("", bin(nFinalNodeMask | nFinalArcMask)[2:], "0", "0") - return s - for i, arc in enumerate(self.arcs, 1): - val = arc - if i == 1 and self.final: - val = val | nFinalNodeMask - if i == nArc: - val = val | nFinalArcMask - s += " {:<20} {:0>16} i{:_>10} #{:_>10}\n".format(lVal[arc], bin(val)[2:], self.arcs[arc].i, self.arcs[arc].addr) - return s - - # VERSION 2 ===================================================================================================== - def convToBytes2 (self, nBytesArc, nBytesNodeAddress): - """ - Node scheme: - - Arc length is defined by nBytesArc - - Address length is defined by nBytesNodeAddress - - | Arc | Address of next node | - | | | - /---------------\ /---------------\ /---------------\ /---------------\ /---------------\ /---------------\ - | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - \---------------/ \---------------/ \---------------/ \---------------/ \---------------/ \---------------/ - [...] 
- /---------------\ /---------------\ /---------------\ /---------------\ /---------------\ /---------------\ - | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - \---------------/ \---------------/ \---------------/ \---------------/ \---------------/ \---------------/ - ^ ^ ^ - | | | - | | \_ if 1, caution, no address: next node is the following node - | \___ if 1, last arc of this node - \_____ if 1, this node is final (only on the first arc) - """ - nArc = len(self.arcs) - nFinalNodeMask = 1 << ((nBytesArc*8)-1) - nFinalArcMask = 1 << ((nBytesArc*8)-2) - nNextNodeMask = 1 << ((nBytesArc*8)-3) - if len(self.arcs) == 0: - val = nFinalNodeMask | nFinalArcMask - by = val.to_bytes(nBytesArc, byteorder='big') - by += (0).to_bytes(nBytesNodeAddress, byteorder='big') - return by - by = b"" - for i, arc in enumerate(self.arcs, 1): - val = arc - if i == 1 and self.final: - val = val | nFinalNodeMask - if i == nArc: - val = val | nFinalArcMask - if (self.pos + 1) == self.arcs[arc].pos and self.i != 0: - val = val | nNextNodeMask - by += val.to_bytes(nBytesArc, byteorder='big') - else: - by += val.to_bytes(nBytesArc, byteorder='big') - by += self.arcs[arc].addr.to_bytes(nBytesNodeAddress, byteorder='big') - return by - - def getTxtRepr2 (self, nBytesArc, nBytesNodeAddress, lVal): - nArc = len(self.arcs) - nFinalNodeMask = 1 << ((nBytesArc*8)-1) - nFinalArcMask = 1 << ((nBytesArc*8)-2) - nNextNodeMask = 1 << ((nBytesArc*8)-3) - s = "i{:_>10} -- #{:_>10}\n".format(self.i, self.addr) - if nArc == 0: - s += " {:<20} {:0>16} i{:_>10} #{:_>10}\n".format("", bin(nFinalNodeMask | nFinalArcMask)[2:], "0", "0") - return s - for i, arc in enumerate(self.arcs, 1): - val = arc - if i == 1 and self.final: - val = val | nFinalNodeMask - if i == nArc: - val = val | nFinalArcMask - if (self.pos + 1) == self.arcs[arc].pos and self.i != 0: - val = val | nNextNodeMask - s += " {:<20} {:0>16}\n".format(lVal[arc], bin(val)[2:], "") - else: - s += " {:<20} {:0>16} i{:_>10} #{:_>10}\n".format(lVal[arc], bin(val)[2:], self.arcs[arc].i, self.arcs[arc].addr) - return s - - # VERSION 3 ===================================================================================================== - def convToBytes3 (self, nBytesArc, nBytesNodeAddress, nBytesOffset): - """ - Node scheme: - - Arc length is defined by nBytesArc - - Address length is defined by nBytesNodeAddress - - Offset length is defined by nBytesOffset - - | Arc | Address of next node or offset to next node | - | | | - /---------------\ /---------------\ /---------------\ /---------------\ /---------------\ /---------------\ - |1|0|0| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - \---------------/ \---------------/ \---------------/ \---------------/ \---------------/ \---------------/ - [...] 
- /---------------\ /---------------\ /---------------\ - |0|0|1| | | | | | | | | | | | | | | | | | | | | | | | Offsets are shorter than addresses - \---------------/ \---------------/ \---------------/ - /---------------\ /---------------\ /---------------\ /---------------\ /---------------\ /---------------\ - |0|1|0| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - \---------------/ \---------------/ \---------------/ \---------------/ \---------------/ \---------------/ - - ^ ^ ^ - | | | - | | \_ if 1, offset instead of address of next node - | \___ if 1, last arc of this node - \_____ if 1, this node is final (only on the first arc) - """ - nArc = len(self.arcs) - nFinalNodeMask = 1 << ((nBytesArc*8)-1) - nFinalArcMask = 1 << ((nBytesArc*8)-2) - nNextNodeMask = 1 << ((nBytesArc*8)-3) - nMaxOffset = (2 ** (nBytesOffset * 8)) - 1 - if nArc == 0: - val = nFinalNodeMask | nFinalArcMask - by = val.to_bytes(nBytesArc, byteorder='big') - by += (0).to_bytes(nBytesNodeAddress, byteorder='big') - return by - by = b"" - for i, arc in enumerate(self.arcs, 1): - val = arc - if i == 1 and self.final: - val = val | nFinalNodeMask - if i == nArc: - val = val | nFinalArcMask - if 1 < (self.arcs[arc].addr - self.addr) < nMaxOffset and self.i != 0: - val = val | nNextNodeMask - by += val.to_bytes(nBytesArc, byteorder='big') - by += (self.arcs[arc].addr-self.addr).to_bytes(nBytesOffset, byteorder='big') - else: - by += val.to_bytes(nBytesArc, byteorder='big') - by += self.arcs[arc].addr.to_bytes(nBytesNodeAddress, byteorder='big') - return by - - def getTxtRepr3 (self, nBytesArc, nBytesNodeAddress, nBytesOffset, lVal): - nArc = len(self.arcs) - nFinalNodeMask = 1 << ((nBytesArc*8)-1) - nFinalArcMask = 1 << ((nBytesArc*8)-2) - nNextNodeMask = 1 << ((nBytesArc*8)-3) - nMaxOffset = (2 ** (nBytesOffset * 8)) - 1 - s = "i{:_>10} -- #{:_>10} ({})\n".format(self.i, self.addr, self.size) - if nArc == 0: - s += " {:<20} {:0>16} i{:_>10} #{:_>10}\n".format("", bin(nFinalNodeMask | nFinalArcMask)[2:], "0", "0") - return s - for i, arc in enumerate(self.arcs, 1): - val = arc - if i == 1 and self.final: - val = val | nFinalNodeMask - if i == nArc: - val = val | nFinalArcMask - if 1 < (self.arcs[arc].addr - self.addr) < nMaxOffset and self.i != 0: - val = val | nNextNodeMask - s += " {:<20} {:0>16} i{:_>10} +{:_>10}\n".format(lVal[arc], bin(val)[2:], self.arcs[arc].i, self.arcs[arc].addr - self.addr) - else: - s += " {:<20} {:0>16} i{:_>10} #{:_>10}\n".format(lVal[arc], bin(val)[2:], self.arcs[arc].i, self.arcs[arc].addr) - return s - - - -# Another attempt to sort node arcs - -_dCharOrder = { - # key: previous char, value: dictionary of chars {c: nValue} - "": {} -} - - -def addWordToCharDict (sWord): - cPrevious = "" - for cChar in sWord: - if cPrevious not in _dCharOrder: - _dCharOrder[cPrevious] = {} - _dCharOrder[cPrevious][cChar] = _dCharOrder[cPrevious].get(cChar, 0) + 1 - cPrevious = cChar - - -def getCharOrderAfterChar (cChar): - return _dCharOrder.get(cChar, None) - - -def displayCharOrder (): - for key, value in _dCharOrder.items(): - print("[" + key + "]: ", ", ".join([ c+":"+str(n) for c, n in sorted(value.items(), key=lambda t: t[1], reverse=True) ])) DELETED gc_core/py/echo.py Index: gc_core/py/echo.py ================================================================== --- gc_core/py/echo.py +++ gc_core/py/echo.py @@ -1,29 +0,0 @@ -#!python3 - -# The most boring yet indispensable function: print! 
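# [Editor's sketch; not part of the original files] Typical driver code for the DAWG
# builder above (file names and the import path are illustrative; the lexicon is a
# UTF-8 file with "flexion<TAB>stem<TAB>tags" lines):
#
#   from dawg import DAWG
#   oDAWG = DAWG("french.lex", "fr", "S")   # "S": stems encoded as suffix codes
#   oDAWG.writeInfo("french.info.txt")      # arc statistics and the value list
#   oDAWG.createBinary("french.bdic", 2)    # method 2 can omit the address of a directly following node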
- - -import sys - - -_CHARMAP = str.maketrans({ 'œ': 'ö', 'Œ': 'Ö', 'ʳ': "r", 'ᵉ': "e", '…': "_", \ - '“': '"', '”': '"', '„': '"', '‘': "'", '’': "'", \ - 'ā': 'â', 'Ā': 'Â', 'ē': 'ê', 'Ē': 'Ê', 'ī': 'î', 'Ī': 'Î', \ - 'ō': 'ô', 'Ō': 'Ô', 'ū': 'û', 'Ū': 'Û', 'Ÿ': 'Y', \ - 'ś': 's', 'ŝ': 's', \ - '—': '-', '–': '-' - }) - - -def echo (obj, sep=' ', end='\n', file=sys.stdout, flush=False): - """ Print for Windows to avoid Python crashes. - Encoding depends on Windows locale. No useful standard. - Always returns True (useful for debugging).""" - if sys.platform != "win32": - print(obj, sep=sep, end=end, file=file, flush=flush) - return True - try: - print(str(obj).translate(_CHARMAP), sep=sep, end=end, file=file, flush=flush) - except: - print(str(obj).encode('ascii', 'replace').decode('ascii', 'replace'), sep=sep, end=end, file=file, flush=flush) - return True DELETED gc_core/py/ibdawg.py Index: gc_core/py/ibdawg.py ================================================================== --- gc_core/py/ibdawg.py +++ gc_core/py/ibdawg.py @@ -1,720 +0,0 @@ -#!python3 - -import os -import traceback -import pkgutil -import re -from functools import wraps -import time - -#import logging -#logging.basicConfig(filename="suggestions.log", level=logging.DEBUG) - -from . import str_transform as st -from . import char_player as cp -from .echo import echo - - -def timethis (func): - "decorator for the execution time" - @wraps(func) - def wrapper (*args, **kwargs): - fStart = time.time() - result = func(*args, **kwargs) - fEnd = time.time() - print(func.__name__, fEnd - fStart) - return result - return wrapper - - -class SuggResult: - """Structure for storing, classifying and filtering suggestions""" - - def __init__ (self, sWord, nDistLimit=-1): - self.sWord = sWord - self.sSimplifiedWord = cp.simplifyWord(sWord) - self.nDistLimit = nDistLimit if nDistLimit >= 0 else (len(sWord) // 3) + 1 - self.nMinDist = 1000 - self.aSugg = set() - self.dSugg = { 0: [], 1: [], 2: [] } - - def addSugg (self, sSugg, nDeep=0): - "add a suggestion" - #logging.info((nDeep * " ") + "__" + sSugg + "__") - if sSugg not in self.aSugg: - nDist = st.distanceDamerauLevenshtein(self.sSimplifiedWord, cp.simplifyWord(sSugg)) - if nDist <= self.nDistLimit: - if nDist not in self.dSugg: - self.dSugg[nDist] = [] - self.dSugg[nDist].append(sSugg) - self.aSugg.add(sSugg) - if nDist < self.nMinDist: - self.nMinDist = nDist - self.nDistLimit = min(self.nDistLimit, self.nMinDist+2) - - def getSuggestions (self, nSuggLimit=10, nDistLimit=-1): - "return a list of suggestions" - lRes = [] - if self.dSugg[0]: - # we sort the better results with the original word - self.dSugg[0].sort(key=lambda sSugg: st.distanceDamerauLevenshtein(self.sWord, sSugg)) - for lSugg in self.dSugg.values(): - lRes.extend(lSugg) - if len(lRes) > nSuggLimit: - break - lRes = list(cp.filterSugg(lRes)) - if self.sWord.istitle(): - lRes = list(map(lambda sSugg: sSugg.title(), lRes)) - elif self.sWord.isupper(): - lRes = list(map(lambda sSugg: sSugg.upper(), lRes)) - return lRes[:nSuggLimit] - - def reset (self): - self.aSugg.clear() - self.dSugg.clear() - - -class IBDAWG: - """INDEXABLE BINARY DIRECT ACYCLIC WORD GRAPH""" - - def __init__ (self, sDicName): - self.by = pkgutil.get_data(__package__, "_dictionaries/" + sDicName) - if not self.by: - raise OSError("# Error. File not found or not loadable: "+sDicName) - - if self.by[0:7] != b"/pyfsa/": - raise TypeError("# Error. Not a pyfsa binary dictionary. 
Header: {}".format(self.by[0:9])) - if not(self.by[7:8] == b"1" or self.by[7:8] == b"2" or self.by[7:8] == b"3"): - raise ValueError("# Error. Unknown dictionary version: {}".format(self.by[7:8])) - try: - header, info, values, bdic = self.by.split(b"\0\0\0\0", 3) - except Exception: - raise Exception - - self.sName = sDicName - self.nVersion = int(self.by[7:8].decode("utf-8")) - self.sHeader = header.decode("utf-8") - self.lArcVal = values.decode("utf-8").split("\t") - self.nArcVal = len(self.lArcVal) - self.byDic = bdic - - l = info.decode("utf-8").split("/") - self.sLang = l[0] - self.nChar = int(l[1]) - self.nBytesArc = int(l[2]) - self.nBytesNodeAddress = int(l[3]) - self.nEntries = int(l[4]) - self.nNode = int(l[5]) - self.nArc = int(l[6]) - self.nAff = int(l[7]) - self.cStemming = l[8] - if self.cStemming == "S": - self.funcStemming = st.changeWordWithSuffixCode - elif self.cStemming == "A": - self.funcStemming = st.changeWordWithAffixCode - else: - self.funcStemming = st.noStemming - self.nTag = self.nArcVal - self.nChar - self.nAff - # to get the value of an arc, to get the char of an arc with its value - self.dChar = {} - for i in range(1, self.nChar): - self.dChar[self.lArcVal[i]] = i - self.dCharVal = { v: k for k, v in self.dChar.items() } - - self._arcMask = (2 ** ((self.nBytesArc * 8) - 3)) - 1 - self._finalNodeMask = 1 << ((self.nBytesArc * 8) - 1) - self._lastArcMask = 1 << ((self.nBytesArc * 8) - 2) - self._addrBitMask = 1 << ((self.nBytesArc * 8) - 3) # version 2 - - self.nBytesOffset = 1 # version 3 - - # Configuring DAWG functions according to nVersion - if self.nVersion == 1: - self.morph = self._morph1 - self.stem = self._stem1 - self._lookupArcNode = self._lookupArcNode1 - self._getArcs = self._getArcs1 - self._writeNodes = self._writeNodes1 - elif self.nVersion == 2: - self.morph = self._morph2 - self.stem = self._stem2 - self._lookupArcNode = self._lookupArcNode2 - self._getArcs = self._getArcs2 - self._writeNodes = self._writeNodes2 - elif self.nVersion == 3: - self.morph = self._morph3 - self.stem = self._stem3 - self._lookupArcNode = self._lookupArcNode3 - self._getArcs = self._getArcs3 - self._writeNodes = self._writeNodes3 - else: - raise ValueError(" # Error: unknown code: {}".format(self.nVersion)) - - self.bOptNumSigle = False - self.bOptNumAtLast = False - - def getInfo (self): - return " Language: {0.sLang:>10} Version: {0.nVersion:>2} Stemming: {0.cStemming}FX\n" \ - " Arcs values: {0.nArcVal:>10,} = {0.nChar:>5,} characters, {0.nAff:>6,} affixes, {0.nTag:>6,} tags\n" \ - " Dictionary: {0.nEntries:>12,} entries, {0.nNode:>11,} nodes, {0.nArc:>11,} arcs\n" \ - " Address size: {0.nBytesNodeAddress:>1} bytes, Arc size: {0.nBytesArc:>1} bytes\n".format(self) - - def writeAsJSObject (self, spfDest, bInJSModule=False, bBinaryDictAsHexString=False): - "write IBDAWG as a JavaScript object in a JavaScript module" - import json - with open(spfDest, "w", encoding="utf-8", newline="\n") as hDst: - if bInJSModule: - hDst.write('// JavaScript\n// Generated data (do not edit)\n\n"use strict";\n\nconst dictionary = ') - hDst.write(json.dumps({ - "sName": self.sName, - "nVersion": self.nVersion, - "sHeader": self.sHeader, - "lArcVal": self.lArcVal, - "nArcVal": self.nArcVal, - # JavaScript is a pile of shit, so Mozilla’s JS parser don’t like file bigger than 4 Mb! - # So, if necessary, we use an hexadecimal string, that we will convert later in Firefox’s extension. 
- # https://github.com/mozilla/addons-linter/issues/1361 - "byDic": self.byDic.hex() if bBinaryDictAsHexString else [ e for e in self.byDic ], - "sLang": self.sLang, - "nChar": self.nChar, - "nBytesArc": self.nBytesArc, - "nBytesNodeAddress": self.nBytesNodeAddress, - "nEntries": self.nEntries, - "nNode": self.nNode, - "nArc": self.nArc, - "nAff": self.nAff, - "cStemming": self.cStemming, - "nTag": self.nTag, - "dChar": self.dChar, - "_arcMask": self._arcMask, - "_finalNodeMask": self._finalNodeMask, - "_lastArcMask": self._lastArcMask, - "_addrBitMask": self._addrBitMask, - "nBytesOffset": self.nBytesOffset - }, ensure_ascii=False)) - if bInJSModule: - hDst.write(";\n\nexports.dictionary = dictionary;\n") - - def isValidToken (self, sToken): - "checks if <sToken> is valid (if there are hyphens in <sToken>, it is split and each part is checked)" - if self.isValid(sToken): - return True - if "-" in sToken: - if sToken.count("-") > 4: - return True - return all(self.isValid(sWord) for sWord in sToken.split("-")) - return False - - def isValid (self, sWord): - "checks if <sWord> is valid (different casing tested if the first letter is a capital)" - if not sWord: - return None - if "’" in sWord: # ugly hack - sWord = sWord.replace("’", "'") - if self.lookup(sWord): - return True - if sWord[0:1].isupper(): - if len(sWord) > 1: - if sWord.istitle(): - return self.lookup(sWord.lower()) - if sWord.isupper(): - if self.bOptNumSigle: - return True - return self.lookup(sWord.lower()) or self.lookup(sWord.capitalize()) - return self.lookup(sWord[:1].lower() + sWord[1:]) - else: - return self.lookup(sWord.lower()) - return False - - def lookup (self, sWord): - "returns True if <sWord> is in the dictionary (strict verification)" - iAddr = 0 - for c in sWord: - if c not in self.dChar: - return False - iAddr = self._lookupArcNode(self.dChar[c], iAddr) - if iAddr is None: - return False - return bool(int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask) - - def getMorph (self, sWord): - "retrieves the list of morphologies of <sWord>, different casing allowed" - l = self.morph(sWord) - if sWord[0:1].isupper(): - l.extend(self.morph(sWord.lower())) - if sWord.isupper() and len(sWord) > 1: - l.extend(self.morph(sWord.capitalize())) - return l - - #@timethis - def suggest (self, sWord, nSuggLimit=10): - "returns a list of suggestions for <sWord>" - sPfx, sWord, sSfx = cp.cut(sWord) - nMaxSwitch = max(len(sWord) // 3, 1) - nMaxDel = len(sWord) // 5 - nMaxHardRepl = max((len(sWord) - 5) // 4, 1) - oSuggResult = SuggResult(sWord) - self._suggest(oSuggResult, sWord, nMaxSwitch=nMaxSwitch, nMaxDel=nMaxDel, nMaxHardRepl=nMaxHardRepl) - if sWord.istitle(): - self._suggest(oSuggResult, sWord.lower(), nMaxSwitch=nMaxSwitch, nMaxDel=nMaxDel, nMaxHardRepl=nMaxHardRepl) - elif sWord.islower(): - self._suggest(oSuggResult, sWord.title(), nMaxSwitch=nMaxSwitch, nMaxDel=nMaxDel, nMaxHardRepl=nMaxHardRepl) - aSugg = oSuggResult.getSuggestions(nSuggLimit) - if sSfx or sPfx: - # we add back what we removed - return list(map(lambda sSug: sPfx + sSug + sSfx, aSugg)) - return aSugg - - def _suggest (self, oSuggResult, sRemain, nMaxSwitch=0, nMaxDel=0, nMaxHardRepl=0, nDeep=0, iAddr=0, sNewWord="", bAvoidLoop=False): - # recursive function - #logging.info((nDeep * " ") + sNewWord + ":" + sRemain) - if not sRemain: - if int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask: - oSuggResult.addSugg(sNewWord, nDeep) - for sTail in self._getTails(iAddr): - oSuggResult.addSugg(sNewWord+sTail, nDeep) - return - cCurrent = sRemain[0:1] -
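# [Editor's summary; not part of the original file] The recursion below tries the
# classic edit operations, each under its own budget:
#   * substitution: follow arcs whose char is similar to cCurrent (cp.d1to1), or any
#     arc at all while nMaxHardRepl lasts;
#   * deduplication: drop cCurrent when the next char is identical;
#   * transposition: swap the next two chars while nMaxSwitch lasts;
#   * deletion: drop cCurrent while nMaxDel lasts;
#   * phonetic rewrites: cp.get1toXReplacement and cp.d2toX, then cp.dFinal1/cp.dFinal2
#     when the end of the word is reached.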
for cChar, jAddr in self._getCharArcs(iAddr): - if cChar in cp.d1to1.get(cCurrent, cCurrent): - self._suggest(oSuggResult, sRemain[1:], nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, jAddr, sNewWord+cChar) - elif not bAvoidLoop and nMaxHardRepl: - self._suggest(oSuggResult, sRemain[1:], nMaxSwitch, nMaxDel, nMaxHardRepl-1, nDeep+1, jAddr, sNewWord+cChar, True) - if not bAvoidLoop: # avoid infinite loop - if len(sRemain) > 1: - if cCurrent == sRemain[1:2]: - # same char, we remove 1 char without adding 1 to <sNewWord> - self._suggest(oSuggResult, sRemain[1:], nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord) - else: - # switching chars - if nMaxSwitch: - self._suggest(oSuggResult, sRemain[1:2]+sRemain[0:1]+sRemain[2:], nMaxSwitch-1, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, True) - # delete char - if nMaxDel: - self._suggest(oSuggResult, sRemain[1:], nMaxSwitch, nMaxDel-1, nMaxHardRepl, nDeep+1, iAddr, sNewWord, True) - # Phonetic replacements - for sRepl in cp.get1toXReplacement(sNewWord[-1:], cCurrent, sRemain[1:2]): - self._suggest(oSuggResult, sRepl + sRemain[1:], nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, True) - for sRepl in cp.d2toX.get(sRemain[0:2], ()): - self._suggest(oSuggResult, sRepl + sRemain[2:], nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, True) - # end of word - if len(sRemain) == 2: - for sRepl in cp.dFinal2.get(sRemain, ()): - self._suggest(oSuggResult, sRepl, nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, True) - elif len(sRemain) == 1: - self._suggest(oSuggResult, "", nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, True) # remove last char and go on - for sRepl in cp.dFinal1.get(sRemain, ()): - self._suggest(oSuggResult, sRepl, nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, True) - - #@timethis - def suggest2 (self, sWord, nMaxSugg=10): - "returns a list of suggestions for <sWord>" - sPfx, sWord, sSfx = cp.cut(sWord) - oSuggResult = SuggResult(sWord) - self._suggest2(oSuggResult) - aSugg = oSuggResult.getSuggestions() - if sSfx or sPfx: - # we add back what we removed - return list(map(lambda sSug: sPfx + sSug + sSfx, aSugg)) - return aSugg - - def _suggest2 (self, oSuggResult, nDeep=0, iAddr=0, sNewWord=""): - # recursive function - #logging.info((nDeep * " ") + sNewWord) - if nDeep >= oSuggResult.nDistLimit: - sCleanNewWord = cp.simplifyWord(sNewWord) - if st.distanceSift4(oSuggResult.sSimplifiedWord[:len(sCleanNewWord)], sCleanNewWord) > oSuggResult.nDistLimit: - return - if int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask: - oSuggResult.addSugg(sNewWord, nDeep) - for cChar, jAddr in self._getCharArcsWithPriority(iAddr, oSuggResult.sWord[nDeep:nDeep+1]): - self._suggest2(oSuggResult, nDeep+1, jAddr, sNewWord+cChar) - return - - def _getCharArcs (self, iAddr): - "generator: yield all chars and addresses from the node at address <iAddr>" - for nVal, jAddr in self._getArcs(iAddr): - if nVal < self.nChar: - yield (self.dCharVal[nVal], jAddr) - - def _getSimilarCharArcs (self, cChar, iAddr): - "generator: yield chars similar to <cChar> and the addresses of the following nodes" - for c in cp.d1to1.get(cChar, [cChar]): - if c in self.dChar: - jAddr = self._lookupArcNode(self.dChar[c], iAddr) - if jAddr: - yield (c, jAddr) - - def _getCharArcsWithPriority (self, iAddr, cChar): - if not cChar: - yield from self._getCharArcs(iAddr) - return - lTuple = list(self._getCharArcs(iAddr)) - lTuple.sort(key=lambda t: 0 if t[0] in cp.d1to1.get(cChar, cChar) else 1) - yield from lTuple - - def _getTails (self, iAddr,
sTail="", n=2): - "return a list of suffixes ending at a distance of from " - aTails = set() - for nVal, jAddr in self._getArcs(iAddr): - if nVal < self.nChar: - if int.from_bytes(self.byDic[jAddr:jAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask: - aTails.add(sTail + self.dCharVal[nVal]) - if n and not aTails: - aTails.update(self._getTails(jAddr, sTail+self.dCharVal[nVal], n-1)) - return aTails - - def drawPath (self, sWord, iAddr=0): - "show the path taken by in the graph" - c1 = sWord[0:1] if sWord else " " - iPos = -1 - n = 0 - print(c1 + ": ", end="") - for c2, jAddr in self._getCharArcs(iAddr): - print(c2, end="") - if c2 == sWord[0:1]: - iNextNodeAddr = jAddr - iPos = n - n += 1 - if not sWord: - return - if iPos >= 0: - print("\n "+ " " * iPos + "|") - self.drawPath(sWord[1:], iNextNodeAddr) - - def select (self, sPattern=""): - "generator: returns all entries which morphology fits " - zPattern = None - try: - zPattern = re.compile(sPattern) - except: - print("# Error in regex pattern") - traceback.print_exc() - yield from self._select1(zPattern, 0, "") - - # def morph (self, sWord): - # is defined in __init__ - - # VERSION 1 - def _select1 (self, zPattern, iAddr, sWord): - # recursive generator - for nVal, jAddr in self._getArcs1(iAddr): - if nVal < self.nChar: - # simple character - yield from self._select1(zPattern, jAddr, sWord + self.lArcVal[nVal]) - else: - sEntry = sWord + "\t" + self.funcStemming(sWord, self.lArcVal[nVal]) - for nMorphVal, _ in self._getArcs1(jAddr): - if not zPattern or zPattern.search(self.lArcVal[nMorphVal]): - yield sEntry + "\t" + self.lArcVal[nMorphVal] - - def _morph1 (self, sWord): - "returns morphologies of " - iAddr = 0 - for c in sWord: - if c not in self.dChar: - return [] - iAddr = self._lookupArcNode(self.dChar[c], iAddr) - if iAddr == None: - return [] - if (int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask): - l = [] - nRawArc = 0 - while not (nRawArc & self._lastArcMask): - iEndArcAddr = iAddr + self.nBytesArc - nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') - nArc = nRawArc & self._arcMask - if nArc >= self.nChar: - # This value is not a char, this is a stemming code - sStem = ">" + self.funcStemming(sWord, self.lArcVal[nArc]) - # Now , we go to the next node and retrieve all following arcs values, all of them are tags - iAddr2 = int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big') - nRawArc2 = 0 - while not (nRawArc2 & self._lastArcMask): - iEndArcAddr2 = iAddr2 + self.nBytesArc - nRawArc2 = int.from_bytes(self.byDic[iAddr2:iEndArcAddr2], byteorder='big') - l.append(sStem + " " + self.lArcVal[nRawArc2 & self._arcMask]) - iAddr2 = iEndArcAddr2+self.nBytesNodeAddress - iAddr = iEndArcAddr+self.nBytesNodeAddress - return l - return [] - - def _stem1 (self, sWord): - "returns stems list of " - iAddr = 0 - for c in sWord: - if c not in self.dChar: - return [] - iAddr = self._lookupArcNode(self.dChar[c], iAddr) - if iAddr == None: - return [] - if (int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask): - l = [] - nRawArc = 0 - while not (nRawArc & self._lastArcMask): - iEndArcAddr = iAddr + self.nBytesArc - nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') - nArc = nRawArc & self._arcMask - if nArc >= self.nChar: - # This value is not a char, this is a stemming code - l.append(self.funcStemming(sWord, self.lArcVal[nArc])) - iAddr = 
iEndArcAddr+self.nBytesNodeAddress - return l - return [] - - def _lookupArcNode1 (self, nVal, iAddr): - "looks if is an arc at the node at , if yes, returns address of next node else None" - while True: - iEndArcAddr = iAddr+self.nBytesArc - nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') - if nVal == (nRawArc & self._arcMask): - # the value we are looking for - # we return the address of the next node - return int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big') - else: - # value not found - if (nRawArc & self._lastArcMask): - return None - iAddr = iEndArcAddr+self.nBytesNodeAddress - - def _getArcs1 (self, iAddr): - "generator: return all arcs at as tuples of (nVal, iAddr)" - while True: - iEndArcAddr = iAddr+self.nBytesArc - nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') - yield (nRawArc & self._arcMask, int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big')) - if (nRawArc & self._lastArcMask): - break - iAddr = iEndArcAddr+self.nBytesNodeAddress - - def _writeNodes1 (self, spfDest): - "for debugging only" - print(" > Write binary nodes") - with codecs.open(spfDest, 'w', 'utf-8', newline="\n") as hDst: - iAddr = 0 - hDst.write("i{:_>10} -- #{:_>10}\n".format("0", iAddr)) - while iAddr < len(self.byDic): - iEndArcAddr = iAddr+self.nBytesArc - nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') - nArc = nRawArc & self._arcMask - hDst.write(" {:<20} {:0>16} i{:>10} #{:_>10}\n".format(self.lArcVal[nArc], bin(nRawArc)[2:], "?", \ - int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], \ - byteorder='big'))) - iAddr = iEndArcAddr+self.nBytesNodeAddress - if (nRawArc & self._lastArcMask) and iAddr < len(self.byDic): - hDst.write("\ni{:_>10} -- #{:_>10}\n".format("?", iAddr)) - hDst.close() - - # VERSION 2 - def _morph2 (self, sWord): - "returns morphologies of " - iAddr = 0 - for c in sWord: - if c not in self.dChar: - return [] - iAddr = self._lookupArcNode(self.dChar[c], iAddr) - if iAddr == None: - return [] - if (int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask): - l = [] - nRawArc = 0 - while not (nRawArc & self._lastArcMask): - iEndArcAddr = iAddr + self.nBytesArc - nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') - nArc = nRawArc & self._arcMask - if nArc >= self.nChar: - # This value is not a char, this is a stemming code - sStem = ">" + self.funcStemming(sWord, self.lArcVal[nArc]) - # Now , we go to the next node and retrieve all following arcs values, all of them are tags - if not (nRawArc & self._addrBitMask): - iAddr2 = int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big') - else: - # we go to the end of the node - iAddr2 = iEndArcAddr - while not (nRawArc & self._lastArcMask): - nRawArc = int.from_bytes(self.byDic[iAddr2:iAddr2+self.nBytesArc], byteorder='big') - iAddr2 += self.nBytesArc + self.nBytesNodeAddress - nRawArc2 = 0 - while not (nRawArc2 & self._lastArcMask): - iEndArcAddr2 = iAddr2 + self.nBytesArc - nRawArc2 = int.from_bytes(self.byDic[iAddr2:iEndArcAddr2], byteorder='big') - l.append(sStem + " " + self.lArcVal[nRawArc2 & self._arcMask]) - iAddr2 = iEndArcAddr2+self.nBytesNodeAddress if not (nRawArc2 & self._addrBitMask) else iEndArcAddr2 - iAddr = iEndArcAddr+self.nBytesNodeAddress if not (nRawArc & self._addrBitMask) else iEndArcAddr - return l - return [] - - def _stem2 (self, sWord): - 
"returns stems list of " - iAddr = 0 - for c in sWord: - if c not in self.dChar: - return [] - iAddr = self._lookupArcNode(self.dChar[c], iAddr) - if iAddr == None: - return [] - if (int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask): - l = [] - nRawArc = 0 - while not (nRawArc & self._lastArcMask): - iEndArcAddr = iAddr + self.nBytesArc - nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') - nArc = nRawArc & self._arcMask - if nArc >= self.nChar: - # This value is not a char, this is a stemming code - l.append(self.funcStemming(sWord, self.lArcVal[nArc])) - # Now , we go to the next node - if not (nRawArc & self._addrBitMask): - iAddr2 = int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big') - else: - # we go to the end of the node - iAddr2 = iEndArcAddr - while not (nRawArc & self._lastArcMask): - nRawArc = int.from_bytes(self.byDic[iAddr2:iAddr2+self.nBytesArc], byteorder='big') - iAddr2 += self.nBytesArc + self.nBytesNodeAddress - iAddr = iEndArcAddr+self.nBytesNodeAddress if not (nRawArc & self._addrBitMask) else iEndArcAddr - return l - return [] - - def _lookupArcNode2 (self, nVal, iAddr): - "looks if is an arc at the node at , if yes, returns address of next node else None" - while True: - iEndArcAddr = iAddr+self.nBytesArc - nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') - if nVal == (nRawArc & self._arcMask): - # the value we are looking for - if not (nRawArc & self._addrBitMask): - # we return the address of the next node - return int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big') - else: - # we go to the end of the node - iAddr = iEndArcAddr - while not (nRawArc & self._lastArcMask): - nRawArc = int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') - iAddr += self.nBytesArc + self.nBytesNodeAddress if not (nRawArc & self._addrBitMask) else self.nBytesArc - return iAddr - else: - # value not found - if (nRawArc & self._lastArcMask): - return None - iAddr = iEndArcAddr+self.nBytesNodeAddress if not (nRawArc & self._addrBitMask) else iEndArcAddr - - def _writeNodes2 (self, spfDest): - "for debugging only" - print(" > Write binary nodes") - with codecs.open(spfDest, 'w', 'utf-8', newline="\n") as hDst: - iAddr = 0 - hDst.write("i{:_>10} -- #{:_>10}\n".format("0", iAddr)) - while iAddr < len(self.byDic): - iEndArcAddr = iAddr+self.nBytesArc - nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') - nArc = nRawArc & self._arcMask - if not (nRawArc & self._addrBitMask): - iNextNodeAddr = int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big') - hDst.write(" {:<20} {:0>16} i{:>10} #{:_>10}\n".format(self.lArcVal[nArc], bin(nRawArc)[2:], "?", iNextNodeAddr)) - iAddr = iEndArcAddr+self.nBytesNodeAddress - else: - hDst.write(" {:<20} {:0>16}\n".format(self.lArcVal[nArc], bin(nRawArc)[2:])) - iAddr = iEndArcAddr - if (nRawArc & self._lastArcMask): - hDst.write("\ni{:_>10} -- #{:_>10}\n".format("?", iAddr)) - hDst.close() - - # VERSION 3 - def _morph3 (self, sWord): - "returns morphologies of " - iAddr = 0 - for c in sWord: - if c not in self.dChar: - return [] - iAddr = self._lookupArcNode(self.dChar[c], iAddr) - if iAddr == None: - return [] - if (int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask): - l = [] - nRawArc = 0 - iAddrNode = iAddr - while not (nRawArc & self._lastArcMask): - iEndArcAddr = iAddr + 
self.nBytesArc - nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') - nArc = nRawArc & self._arcMask - if nArc >= self.nChar: - # This value is not a char, this is a stemming code - sStem = ">" + self.funcStemming(sWord, self.lArcVal[nArc]) - # Now , we go to the next node and retrieve all following arcs values, all of them are tags - if not (nRawArc & self._addrBitMask): - iAddr2 = int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big') - else: - iAddr2 = iAddrNode + int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesOffset], byteorder='big') - nRawArc2 = 0 - while not (nRawArc2 & self._lastArcMask): - iEndArcAddr2 = iAddr2 + self.nBytesArc - nRawArc2 = int.from_bytes(self.byDic[iAddr2:iEndArcAddr2], byteorder='big') - l.append(sStem + " " + self.lArcVal[nRawArc2 & self._arcMask]) - iAddr2 = iEndArcAddr2+self.nBytesNodeAddress if not (nRawArc2 & self._addrBitMask) else iEndArcAddr2+self.nBytesOffset - iAddr = iEndArcAddr+self.nBytesNodeAddress if not (nRawArc & self._addrBitMask) else iEndArcAddr+self.nBytesOffset - return l - return [] - - def _stem3 (self, sWord): - "returns stems list of " - iAddr = 0 - for c in sWord: - if c not in self.dChar: - return [] - iAddr = self._lookupArcNode(self.dChar[c], iAddr) - if iAddr == None: - return [] - if (int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask): - l = [] - nRawArc = 0 - iAddrNode = iAddr - while not (nRawArc & self._lastArcMask): - iEndArcAddr = iAddr + self.nBytesArc - nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') - nArc = nRawArc & self._arcMask - if nArc >= self.nChar: - # This value is not a char, this is a stemming code - l.append(self.funcStemming(sWord, self.lArcVal[nArc])) - iAddr = iEndArcAddr+self.nBytesNodeAddress if not (nRawArc & self._addrBitMask) else iEndArcAddr+self.nBytesOffset - return l - return [] - - def _lookupArcNode3 (self, nVal, iAddr): - "looks if is an arc at the node at , if yes, returns address of next node else None" - iAddrNode = iAddr - while True: - iEndArcAddr = iAddr+self.nBytesArc - nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') - if nVal == (nRawArc & self._arcMask): - # the value we are looking for - if not (nRawArc & self._addrBitMask): - return int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big') - else: - return iAddrNode + int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesOffset], byteorder='big') - else: - # value not found - if (nRawArc & self._lastArcMask): - return None - iAddr = iEndArcAddr+self.nBytesNodeAddress if not (nRawArc & self._addrBitMask) else iEndArcAddr+self.nBytesOffset - - def _writeNodes3 (self, spfDest): - "for debugging only" - print(" > Write binary nodes") - with codecs.open(spfDest, 'w', 'utf-8', newline="\n") as hDst: - iAddr = 0 - hDst.write("i{:_>10} -- #{:_>10}\n".format("0", iAddr)) - while iAddr < len(self.byDic): - iEndArcAddr = iAddr+self.nBytesArc - nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') - nArc = nRawArc & self._arcMask - if not (nRawArc & self._addrBitMask): - iNextNodeAddr = int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big') - hDst.write(" {:<20} {:0>16} i{:>10} #{:_>10}\n".format(self.lArcVal[nArc], bin(nRawArc)[2:], "?", iNextNodeAddr)) - iAddr = iEndArcAddr+self.nBytesNodeAddress - else: - iNextNodeAddr = 
int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesOffset], byteorder='big') - hDst.write(" {:<20} {:0>16} i{:>10} +{:_>10}\n".format(self.lArcVal[nArc], bin(nRawArc)[2:], "?", iNextNodeAddr)) - iAddr = iEndArcAddr+self.nBytesOffset - if (nRawArc & self._lastArcMask): - hDst.write("\ni{:_>10} -- #{:_>10}\n".format("?", iAddr)) - hDst.close() DELETED gc_core/py/keyboard_chars_proximity.py Index: gc_core/py/keyboard_chars_proximity.py ================================================================== --- gc_core/py/keyboard_chars_proximity.py +++ gc_core/py/keyboard_chars_proximity.py @@ -1,220 +0,0 @@ -# Keyboard chars proximity - - -def getKeyboardMap (sKeyboard): - return _dKeyboardMap.get(sKeyboard.lower(), {}) - - -def getKeyboardList (): - return _dKeyboardMap.keys() - - -_dKeyboardMap = { - # keyboards by alphabetical order - # bépo, colemak and dvorak users are assumed to do less typing errors. - "azerty": { - # fr - # line 1 - "é": "az", - "è": "yu", - "ç": "àio", - "à": "op", - # line 2 - "a": "zéq", - "z": "aesq", - "e": "zrds", - "r": "etfd", - "t": "rygf", - "y": "tuhg", - "u": "yijh", - "i": "uokj", - "o": "iplk", - "p": "oml", - # line 3 - "q": "sawz", - "s": "qdzwxe", - "d": "sfexcr", - "f": "dgrcvt", - "g": "fhtvby", - "h": "gjybnu", - "j": "hkuni", - "k": "jlio", - "l": "kmop", - "m": "lùp", - "ù": "m", - # line 4 - "w": "xqs", - "x": "wcsd", - "c": "xvdf", - "v": "cbfg", - "b": "vngh", - "n": "bhj", - }, - "bépo": { - # fr - # line 2 - "b": "éa", - "é": "bpu", - "p": "éoi", - "o": "pèe", - "è": "o", - "v": "dt", - "d": "vls", - "l": "djr", - "j": "lzn", - "z": "jmw", - # line 3 - "a": "ubà", - "u": "aiéy", - "i": "uepx", - "e": "io", - "c": "t", - "t": "csvq", - "s": "trdg", - "r": "snlh", - "n": "rmjf", - "m": "nzç", - # line 4 - "à": "yêa", - "y": "àxu", - "x": "ywi", - "w": "z", - "k": "c", - "q": "gt", - "g": "qhs", - "h": "gfr", - "f": "hçn", - "ç": "fm", - }, - "colemak": { - # en, us, intl - # line 2 - "q": "wa", - "w": "qfr", - "f": "wps", - "p": "fgt", - "g": "pjd", - "j": "glh", - "l": "jun", - "u": "lye", - "y": "ui", - # line 3 - "a": "rqz", - "r": "aswx", - "s": "rtfc", - "t": "sdpv", - "d": "thgb", - "h": "dnjk", - "n": "helm", - "e": "niu", - "i": "eoy", - "o": "i", - # line 4 - "z": "xa", - "x": "zcr", - "c": "xvs", - "v": "cbt", - "b": "vkd", - "k": "bmh", - "m": "kn", - }, - "dvorak": { - # en, us, intl - # line 2 - "p": "yu", - "y": "pfi", - "f": "ygd", - "g": "fch", - "c": "grt", - "r": "cln", - "l": "rs", - # line 3 - "a": "o", - "o": "aeq", - "e": "ouj", - "u": "eipk", - "i": "udyx", - "d": "ihfb", - "h": "dtgm", - "t": "hncw", - "n": "tsrv", - "s": "nlz", - # line 4 - "q": "jo", - "j": "qke", - "k": "jxu", - "x": "kbi", - "b": "xmd", - "m": "bwh", - "w": "mvt", - "v": "wzn", - "z": "vs", - }, - "qwerty": { - # en, us, intl - # line 2 - "q": "wa", - "w": "qeas", - "e": "wrds", - "r": "etfd", - "t": "rygf", - "y": "tuhg", - "u": "yijh", - "i": "uokj", - "o": "iplk", - "p": "ol", - # line 3 - "a": "sqzw", - "s": "adwzxe", - "d": "sfexcr", - "f": "dgrcvt", - "g": "fhtvby", - "h": "gjybnu", - "j": "hkunmi", - "k": "jlimo", - "l": "kop", - # line 4 - "z": "xas", - "x": "zcsd", - "c": "xvdf", - "v": "cbfg", - "b": "vngh", - "n": "bmhj", - "m": "njk", - }, - "qwertz": { - # ge, au - # line 2 - "q": "wa", - "w": "qeas", - "e": "wrds", - "r": "etfd", - "t": "rzgf", - "z": "tuhg", - "u": "zijh", - "i": "uokj", - "o": "iplk", - "p": "oüöl", - "ü": "päö", - # line 3 - "a": "sqyw", - "s": "adwyxe", - "d": "sfexcr", - "f": "dgrcvt", - "g": "fhtvbz", - 
"h": "gjzbnu", - "j": "hkunmi", - "k": "jlimo", - "l": "köop", - "ö": "läpü", - "ä": "öü", - # line 4 - "y": "xas", - "x": "ycsd", - "c": "xvdf", - "v": "cbfg", - "b": "vngh", - "n": "bmhj", - "m": "njk", - } -} DELETED gc_core/py/progressbar.py Index: gc_core/py/progressbar.py ================================================================== --- gc_core/py/progressbar.py +++ gc_core/py/progressbar.py @@ -1,35 +0,0 @@ -# Textual progressbar -# by Olivier R. -# License: MPL 2 - -import time - -class ProgressBar: - "Textual progressbar" - - def __init__ (self, nMin=0, nMax=100, nWidth=78): - "initiate with minimum nMin to maximum nMax" - self.nMin = nMin - self.nMax = nMax - self.nSpan = nMax - nMin - self.nWidth = nWidth-9 - self.nAdvance = -1 - self.nCurVal = nMin - self.startTime = time.time() - self._update() - - def _update (self): - fDone = ((self.nCurVal - self.nMin) / self.nSpan) - nAdvance = int(fDone * self.nWidth) - if (nAdvance > self.nAdvance): - self.nAdvance = nAdvance - print("\r[ {}{} {}% ] ".format('>'*nAdvance, ' '*(self.nWidth-nAdvance), round(fDone*100)), end="") - - def increment (self, n=1): - "increment value by n (1 by default)" - self.nCurVal += n - self._update() - - def done (self): - "to call when it’s finished" - print("\r[ task done in {:.1f} s ] ".format(time.time() - self.startTime)) DELETED gc_core/py/spellchecker.py Index: gc_core/py/spellchecker.py ================================================================== --- gc_core/py/spellchecker.py +++ gc_core/py/spellchecker.py @@ -1,134 +0,0 @@ -# Spellchecker -# Wrapper for the IBDAWG class. -# Useful to check several dictionaries at once. - -from . import ibdawg - - -dDictionaries = { - "fr": "French.bdic", - "en": "English.bdic" -} - - -class Spellchecker (): - - def __init__ (self, sLangCode): - self.sLangCode = sLangCode - self.oMainDic = None - if sLangCode in dDictionaries: - self.oMainDic = ibdawg.IBDAWG(dDictionaries[sLangCode]) - self.lOtherDic = [] - return bool(self.oMainDic) - - - def setMainDictionary (self, sDicName): - try: - self.oMainDic = ibdawg.IBDAWG(sDicName) - return True - except: - print("Error: <" + sDicName + "> not set as main dictionary.") - return False - - def addDictionary (self, sDicName): - try: - self.lOtherDic.append(ibdawg.IBDAWG(sDicName)) - return True - except: - print("Error: <" + sDicName + "> not added to the list.") - return False - - # Return codes: - # 0: invalid - # 1: correct in main dictionary - # 2+: correct in foreign dictionaries - - - # check in the main dictionary only - - def isValidToken (self, sToken): - "(in main dictionary) checks if sToken is valid (if there is hyphens in sToken, sToken is split, each part is checked)" - if self.oMainDic.isValidToken(sToken): - return 1 - return 0 - - def isValid (self, sWord): - "(in main dictionary) checks if sWord is valid (different casing tested if the first letter is a capital)" - if self.oMainDic.isValid(sWord): - return 1 - return 0 - - def lookup (self, sWord): - "(in main dictionary) checks if sWord is in dictionary as is (strict verification)" - if self.oMainDic.lookup(sWord): - return 1 - return 0 - - - # check in all dictionaries - - def isValidTokenAll (self, sToken): - "(in all dictionaries) checks if sToken is valid (if there is hyphens in sToken, sToken is split, each part is checked)" - if self.oMainDic.isValidToken(sToken): - return 1 - for i, oDic in enumerate(self.lOtherDic, 2): - if oDic.isValidToken(sToken): - return i - return 0 - - def isValidAll (self, sWord): - "(in all dictionaries) 
checks if sWord is valid (different casing tested if the first letter is a capital)" - if self.oMainDic.isValid(sWord): - return 1 - for i, oDic in enumerate(self.lOtherDic, 2): - if oDic.isValid(sWord): - return i - return 0 - - def lookupAll (self, sWord): - "(in all dictionaries) checks if sWord is in dictionary as is (strict verification)" - if self.oMainDic.lookup(sWord): - return 1 - for i, oDic in enumerate(self.lOtherDic, 2): - if oDic.lookup(sWord): - return i - return 0 - - - # check in dictionaries up to level n - - def isValidTokenLevel (self, sToken, nLevel): - "(in dictionaries up to level n) checks if sToken is valid (if there are hyphens in sToken, sToken is split, each part is checked)" - if self.oMainDic.isValidToken(sToken): - return 1 - if nLevel >= 2: - for i, oDic in enumerate(self.lOtherDic, 2): - if oDic.isValidToken(sToken): - return i - if i == nLevel: - break - return 0 - - def isValidLevel (self, sWord, nLevel): - "(in dictionaries up to level n) checks if sWord is valid (different casing tested if the first letter is a capital)" - if self.oMainDic.isValid(sWord): - return 1 - if nLevel >= 2: - for i, oDic in enumerate(self.lOtherDic, 2): - if oDic.isValid(sWord): - return i - if i == nLevel: - break - return 0 - - def lookupLevel (self, sWord, nLevel): - "(in dictionaries up to level n) checks if sWord is in dictionary as is (strict verification)" - if self.oMainDic.lookup(sWord): - return 1 - if nLevel >= 2: - for i, oDic in enumerate(self.lOtherDic, 2): - if oDic.lookup(sWord): - return i - if i == nLevel: - break - return 0 DELETED gc_core/py/str_transform.py Index: gc_core/py/str_transform.py ================================================================== --- gc_core/py/str_transform.py +++ gc_core/py/str_transform.py @@ -1,203 +0,0 @@ -#!python3 - - -#### DISTANCE CALCULATIONS - -def longestCommonSubstring (s1, s2): - # http://en.wikipedia.org/wiki/Longest_common_substring_problem - # http://en.wikibooks.org/wiki/Algorithm_implementation/Strings/Longest_common_substring - M = [ [0]*(1+len(s2)) for i in range(1+len(s1)) ] - longest, x_longest = 0, 0 - for x in range(1, 1+len(s1)): - for y in range(1, 1+len(s2)): - if s1[x-1] == s2[y-1]: - M[x][y] = M[x-1][y-1] + 1 - if M[x][y] > longest: - longest = M[x][y] - x_longest = x - else: - M[x][y] = 0 - return s1[x_longest-longest : x_longest] - - -def distanceDamerauLevenshtein (s1, s2): - "Damerau-Levenshtein distance between <s1> and <s2>" - # https://fr.wikipedia.org/wiki/Distance_de_Damerau-Levenshtein - d = {} - nLen1 = len(s1) - nLen2 = len(s2) - for i in range(-1, nLen1+1): - d[i, -1] = i + 1 - for j in range(-1, nLen2+1): - d[-1, j] = j + 1 - for i in range(nLen1): - for j in range(nLen2): - nCost = 0 if s1[i] == s2[j] else 1 - d[i, j] = min( - d[i-1, j] + 1, # Deletion - d[i, j-1] + 1, # Insertion - d[i-1, j-1] + nCost, # Substitution - ) - if i and j and s1[i] == s2[j-1] and s1[i-1] == s2[j]: - d[i, j] = min(d[i, j], d[i-2, j-2] + nCost) # Transposition - return d[nLen1-1, nLen2-1] - - -def distanceSift4 (s1, s2, nMaxOffset=5): - "implementation of general Sift4."
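# Hand-checked examples for the distance helpers above (illustrative only):
#     longestCommonSubstring("chanter", "champion")   # -> "cha"
#     distanceDamerauLevenshtein("plume", "pluem")    # -> 1 (one transposition)
#     distanceDamerauLevenshtein("plume", "prune")    # -> 2 (two substitutions)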
- # https://siderite.blogspot.com/2014/11/super-fast-and-accurate-string-distance.html - if not s1: - return len(s2) - if not s2: - return len(s1) - nLen1, nLen2 = len(s1), len(s2) - i1, i2 = 0, 0 # Cursors for each string - nLargestCS = 0 # Largest common substring - nLocalCS = 0 # Local common substring - nTrans = 0 # Number of transpositions ('ab' vs 'ba') - lOffset = [] # Offset pair array, for computing the transpositions - - while i1 < nLen1 and i2 < nLen2: - if s1[i1] == s2[i2]: - nLocalCS += 1 - # Check if current match is a transposition - bTrans = False - i = 0 - while i < len(lOffset): - t = lOffset[i] - if i1 <= t[0] or i2 <= t[1]: - bTrans = abs(i2-i1) >= abs(t[1] - t[0]) - if bTrans: - nTrans += 1 - elif not t[2]: - t[2] = True - nTrans += 1 - break - elif i1 > t[1] and i2 > t[0]: - del lOffset[i] - else: - i += 1 - lOffset.append([i1, i2, bTrans]) - else: - nLargestCS += nLocalCS - nLocalCS = 0 - if i1 != i2: - i1 = i2 = min(i1, i2) - for i in range(nMaxOffset): - if i1 + i >= nLen1 and i2 + i >= nLen2: - break - elif i1 + i < nLen1 and s1[i1+i] == s2[i2]: - i1 += i - 1 - i2 -= 1 - break - elif i2 + i < nLen2 and s1[i1] == s2[i2+i]: - i2 += i - 1 - i1 -= 1 - break - i1 += 1 - i2 += 1 - if i1 >= nLen1 or i2 >= nLen2: - nLargestCS += nLocalCS - nLocalCS = 0 - i1 = i2 = min(i1, i2) - nLargestCS += nLocalCS - return round(max(nLen1, nLen2) - nLargestCS + nTrans) - - -def showDistance (s1, s2): - print("Damerau-Levenshtein: " + s1 + "/" + s2 + " = " + str(distanceDamerauLevenshtein(s1, s2))) - print("Sift4: " + s1 + "/" + s2 + " = " + str(distanceSift4(s1, s2))) - - - - -#### STEMMING OPERATIONS - -## No stemming - -def noStemming (sFlex, sStem): - return sStem - -def rebuildWord (sFlex, cmd1, cmd2): - if cmd1 == "_": - return sFlex - n, c = cmd1.split(":") - s = sFlex[:int(n)] + c + sFlex[int(n):] - if cmd2 == "_": - return s - n, c = cmd2.split(":") - return s[:int(n)] + c + s[int(n):] - - -## Define affixes for stemming - -# Note: 48 is the ASCII code for "0" - - -# Suffix only -def defineSuffixCode (sFlex, sStem): - """ Returns a string defining how to get stem from flexion - "n(sfx)" - with n: a char with numeric meaning, "0" = 0, "1" = 1, ... ":" = 10, etc. (See ASCII table.) Says how many letters to strip from flexion. - sfx [optional]: string to add on flexion - Examples: - "0": strips nothing, adds nothing - "1er": strips 1 letter, adds "er" - "2": strips 2 letters, adds nothing - """ - if sFlex == sStem: - return "0" - jSfx = 0 - for i in range(min(len(sFlex), len(sStem))): - if sFlex[i] != sStem[i]: - break - jSfx += 1 - return chr(len(sFlex)-jSfx+48) + sStem[jSfx:] - - -def changeWordWithSuffixCode (sWord, sSfxCode): - if sSfxCode == "0": - return sWord - return sWord[:-(ord(sSfxCode[0])-48)] + sSfxCode[1:] if sSfxCode[0] != '0' else sWord + sSfxCode[1:] - - -# Prefix and suffix - -def defineAffixCode (sFlex, sStem): - """ Returns a string defining how to get stem from flexion. Examples: - "0" if stem = flexion - "stem" if no common substring - "n(pfx)/m(sfx)" - with n and m: chars with numeric meaning, "0" = 0, "1" = 1, ... ":" = 10, etc. (See ASCII table.) Says how many letters to strip from flexion. - pfx [optional]: string to add before the flexion - sfx [optional]: string to add after the flexion - """ - if sFlex == sStem: - return "0" - # is stem a substring of flexion?
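# Worked example (hand-checked): "connu" occurs in "inconnu" at position 2
# with 0 trailing chars, so:
#     defineAffixCode("inconnu", "connu")          # -> "2/0"
#     changeWordWithAffixCode("inconnu", "2/0")    # -> "connu"  (defined below)
#     defineSuffixCode("chevaux", "cheval")        # -> "2l" (strip "ux", add "l")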
- n = sFlex.find(sStem) - if n >= 0: - return "{}/{}".format(chr(n+48), chr(len(sFlex)-(len(sStem)+n)+48)) - # no, so we are looking for common substring - sSubs = longestCommonSubstring(sFlex, sStem) - if len(sSubs) > 1: - iPos = sStem.find(sSubs) - sPfx = sStem[:iPos] - sSfx = sStem[iPos+len(sSubs):] - n = sFlex.find(sSubs) - m = len(sFlex) - (len(sSubs)+n) - sAff = "{}/".format(chr(n+48)) if not sPfx else "{}{}/".format(chr(n+48), sPfx) - sAff += chr(m+48) if not sSfx else "{}{}".format(chr(m+48), sSfx) - return sAff - return sStem - - -def changeWordWithAffixCode (sWord, sAffCode): - if sAffCode == "0": - return sWord - if '/' not in sAffCode: - return "# error #" - sPfxCode, sSfxCode = sAffCode.split('/') - sWord = sPfxCode[1:] + sWord[(ord(sPfxCode[0])-48):] - return sWord[:-(ord(sSfxCode[0])-48)] + sSfxCode[1:] if sSfxCode[0] != '0' else sWord + sSfxCode[1:] - DELETED gc_core/py/tokenizer.py Index: gc_core/py/tokenizer.py ================================================================== --- gc_core/py/tokenizer.py +++ gc_core/py/tokenizer.py @@ -1,49 +0,0 @@ -# Very simple tokenizer - -import re - -_PATTERNS = { - "default": - ( - r'(?P/(?:bin|boot|dev|etc|home|lib|mnt|opt|root|sbin|tmp|usr|var|Bureau|Documents|Images|Musique|Public|Téléchargements|Vidéos)(?:/[\w.()-]+)*)', - r'(?P[a-zA-Z]:\\(?:Program Files(?: [(]x86[)]|)|[\w.()]+)(?:\\[\w.()-]+)*)', - r'(?P[.,?!:;…«»“”"()/·]+)', - r'(?P[A-Z][.][A-Z][.](?:[A-Z][.])*)', - r'(?P(?:https?://|www[.]|\w+[@.]\w\w+[@.])\w[\w./?&!%=+*"\'@$#-]+)', - r'(?P[#@][\w-]+)', - r'(?P<\w+.*?>|)', - r'(?P\[/?\w+\])', - r'(?P\d\d?h\d\d\b)', - r'(?P-?\d+(?:[.,]\d+))', - r"(?P\w+(?:[’'`-]\w+)*)" - ), - "fr": - ( - r'(?P/(?:bin|boot|dev|etc|home|lib|mnt|opt|root|sbin|tmp|usr|var|Bureau|Documents|Images|Musique|Public|Téléchargements|Vidéos)(?:/[\w.()-]+)*)', - r'(?P[a-zA-Z]:\\(?:Program Files(?: [(]x86[)]|)|[\w.()]+)(?:\\[\w.()-]+)*)', - r'(?P[.,?!:;…«»“”"()/·]+)', - r'(?P[A-Z][.][A-Z][.](?:[A-Z][.])*)', - r'(?P(?:https?://|www[.]|\w+[@.]\w\w+[@.])\w[\w./?&!%=+*"\'@$#-]+)', - r'(?P[#@][\w-]+)', - r'(?P<\w+.*?>|)', - r'(?P\[/?\w+\])', - r"(?P(?:l|d|n|m|t|s|j|c|ç|lorsqu|puisqu|jusqu|quoiqu|qu)['’`])", - r'(?P\d+(?:er|nd|e|de|ième|ème|eme)\b)', - r'(?P\d\d?h\d\d\b)', - r'(?P-?\d+(?:[.,]\d+|))', - r"(?P\w+(?:[’'`-]\w+)*)" - ) -} - - -class Tokenizer: - - def __init__ (self, sLang): - self.sLang = sLang - if sLang not in _PATTERNS: - self.sLang = "default" - self.zToken = re.compile( "(?i)" + '|'.join(sRegex for sRegex in _PATTERNS[sLang]) ) - - def genTokens (self, sText): - for m in self.zToken.finditer(sText): - yield { "sType": m.lastgroup, "sValue": m.group(), "nStart": m.start(), "nEnd": m.end() } ADDED graphspell/char_player.py Index: graphspell/char_player.py ================================================================== --- graphspell/char_player.py +++ graphspell/char_player.py @@ -0,0 +1,324 @@ +# list of similar chars +# useful for suggestion mechanism + +import re + + +_xTransChars = str.maketrans({ + 'à': 'a', 'é': 'e', 'î': 'i', 'ô': 'o', 'û': 'u', 'ÿ': 'i', "y": "i", + 'â': 'a', 'è': 'e', 'ï': 'i', 'ö': 'o', 'ù': 'u', 'ŷ': 'i', + 'ä': 'a', 'ê': 'e', 'í': 'i', 'ó': 'o', 'ü': 'u', 'ý': 'i', + 'á': 'a', 'ë': 'e', 'ì': 'i', 'ò': 'o', 'ú': 'u', 'ỳ': 'i', + 'ā': 'a', 'ē': 'e', 'ī': 'i', 'ō': 'o', 'ū': 'u', 'ȳ': 'i', + 'ñ': 'n', 'k': 'q', 'w': 'v', + 'œ': 'oe', 'æ': 'ae', +}) + +def simplifyWord (sWord): + "word simplication before calculating distance between words" + sWord = sWord.lower().translate(_xTransChars) + sNewWord = "" + for i, c in 
enumerate(sWord, 1): + if c != sWord[i:i+1]: + sNewWord += c + return sNewWord.replace("eau", "o").replace("au", "o").replace("ai", "e").replace("ei", "e").replace("ph", "f") + + +aVowel = set("aáàâäāeéèêëēiíìîïīoóòôöōuúùûüūyýỳŷÿȳœæAÁÀÂÄĀEÉÈÊËĒIÍÌÎÏĪOÓÒÔÖŌUÚÙÛÜŪYÝỲŶŸȲŒÆ") +aConsonant = set("bcçdfghjklmnñpqrstvwxzBCÇDFGHJKLMNÑPQRSTVWXZ") +aDouble = set("bcdfjklmnprstzBCDFJKLMNPRSTZ") # letters that may be used twice successively + + +# Similar chars + +d1to1 = { + "1": "liîLIÎ", + "2": "zZ", + "3": "eéèêEÉÈÊ", + "4": "aàâAÀÂ", + "5": "sgSG", + "6": "bdgBDG", + "7": "ltLT", + "8": "bB", + "9": "gbdGBD", + "0": "oôOÔ", + + "a": "aàâáäæ", + "A": "AÀÂÁÄÆ", + "à": "aàâáäæ", + "À": "AÀÂÁÄÆ", + "â": "aàâáäæ", + "Â": "AÀÂÁÄÆ", + "á": "aàâáäæ", + "Á": "AÀÂÁÄÆ", + "ä": "aàâáäæ", + "Ä": "AÀÂÁÄÆ", + + "æ": "æéa", + "Æ": "ÆÉA", + + "c": "cçskqśŝ", + "C": "CÇSKQŚŜ", + "ç": "cçskqśŝ", + "Ç": "CÇSKQŚŜ", + + "e": "eéèêëœ", + "E": "EÉÈÊËŒ", + "é": "eéèêëœ", + "É": "EÉÈÊËŒ", + "ê": "eéèêëœ", + "Ê": "EÉÈÊËŒ", + "è": "eéèêëœ", + "È": "EÉÈÊËŒ", + "ë": "eéèêëœ", + "Ë": "EÉÈÊËŒ", + + "g": "gj", + "G": "GJ", + + "i": "iîïyíìÿ", + "I": "IÎÏYÍÌŸ", + "î": "iîïyíìÿ", + "Î": "IÎÏYÍÌŸ", + "ï": "iîïyíìÿ", + "Ï": "IÎÏYÍÌŸ", + "í": "iîïyíìÿ", + "Í": "IÎÏYÍÌŸ", + "ì": "iîïyíìÿ", + "Ì": "IÎÏYÍÌŸ", + + "j": "jg", + "J": "JG", + + "k": "kcq", + "K": "KCQ", + + "n": "nñ", + "N": "NÑ", + + "o": "oôóòöœ", + "O": "OÔÓÒÖŒ", + "ô": "oôóòöœ", + "Ô": "OÔÓÒÖŒ", + "ó": "oôóòöœ", + "Ó": "OÔÓÒÖŒ", + "ò": "oôóòöœ", + "Ò": "OÔÓÒÖŒ", + "ö": "oôóòöœ", + "Ö": "OÔÓÒÖŒ", + + "œ": "œoôeéèêë", + "Œ": "ŒOÔEÉÈÊË", + + "q": "qck", + "Q": "QCK", + + "s": "sśŝcç", + "S": "SŚŜCÇ", + "ś": "sśŝcç", + "Ś": "SŚŜCÇ", + "ŝ": "sśŝcç", + "Ŝ": "SŚŜCÇ", + + "u": "uûùüú", + "U": "UÛÙÜÚ", + "û": "uûùüú", + "Û": "UÛÙÜÚ", + "ù": "uûùüú", + "Ù": "UÛÙÜÚ", + "ü": "uûùüú", + "Ü": "UÛÙÜÚ", + "ú": "uûùüú", + "Ú": "UÛÙÜÚ", + + "v": "vw", + "V": "VW", + + "w": "wv", + "W": "WV", + + "x": "xck", + "X": "XCK", + + "y": "yÿiîŷýỳ", + "Y": "YŸIÎŶÝỲ", + "ÿ": "yÿiîŷýỳ", + "Ÿ": "YŸIÎŶÝỲ", + "ŷ": "yÿiîŷýỳ", + "Ŷ": "YŸIÎŶÝỲ", + "ý": "yÿiîŷýỳ", + "Ý": "YŸIÎŶÝỲ", + "ỳ": "yÿiîŷýỳ", + "Ỳ": "YŸIÎŶÝỲ", + + "z": "zs", + "Z": "ZS", +} + +d1toX = { + "æ": ("ae",), + "Æ": ("AE",), + "b": ("bb",), + "B": ("BB",), + "c": ("cc", "ss", "qu", "ch"), + "C": ("CC", "SS", "QU", "CH"), + "d": ("dd",), + "D": ("DD",), + "é": ("ai", "ei"), + "É": ("AI", "EI"), + "f": ("ff", "ph"), + "F": ("FF", "PH"), + "g": ("gu", "ge", "gg", "gh"), + "G": ("GU", "GE", "GG", "GH"), + "j": ("jj", "dj"), + "J": ("JJ", "DJ"), + "k": ("qu", "ck", "ch", "cu", "kk", "kh"), + "K": ("QU", "CK", "CH", "CU", "KK", "KH"), + "l": ("ll",), + "L": ("LL",), + "m": ("mm", "mn"), + "M": ("MM", "MN"), + "n": ("nn", "nm", "mn"), + "N": ("NN", "NM", "MN"), + "o": ("au", "eau"), + "O": ("AU", "EAU"), + "œ": ("oe", "eu"), + "Œ": ("OE", "EU"), + "p": ("pp", "ph"), + "P": ("PP", "PH"), + "q": ("qu", "ch", "cq", "ck", "kk"), + "Q": ("QU", "CH", "CQ", "CK", "KK"), + "r": ("rr",), + "R": ("RR",), + "s": ("ss", "sh"), + "S": ("SS", "SH"), + "t": ("tt", "th"), + "T": ("TT", "TH"), + "x": ("cc", "ct", "xx"), + "X": ("CC", "CT", "XX"), + "z": ("ss", "zh"), + "Z": ("SS", "ZH"), +} + + +def get1toXReplacement (cPrev, cCur, cNext): + if cCur in aConsonant and (cPrev in aConsonant or cNext in aConsonant): + return () + return d1toX.get(cCur, ()) + + +d2toX = { + "am": ("an", "en", "em"), + "AM": ("AN", "EN", "EM"), + "an": ("am", "en", "em"), + "AN": ("AM", "EN", "EM"), + "au": ("eau", "o", "ô"), + "AU": ("EAU", "O", "Ô"), + "em": ("an", "am", "en"), + 
"EM": ("AN", "AM", "EN"), + "en": ("an", "am", "em"), + "EN": ("AN", "AM", "EM"), + "ai": ("ei", "é", "è", "ê", "ë"), + "AI": ("EI", "É", "È", "Ê", "Ë"), + "ei": ("ai", "é", "è", "ê", "ë"), + "EI": ("AI", "É", "È", "Ê", "Ë"), + "ch": ("sh", "c", "ss"), + "CH": ("SH", "C", "SS"), + "ct": ("x", "cc"), + "CT": ("X", "CC"), + "oa": ("oi",), + "OA": ("OI",), + "oi": ("oa", "oie"), + "OI": ("OA", "OIE"), + "ph": ("f",), + "PH": ("F",), + "qu": ("q", "cq", "ck", "c", "k"), + "QU": ("Q", "CQ", "CK", "C", "K"), + "ss": ("c", "ç"), + "SS": ("C", "Ç"), + "un": ("ein",), + "UN": ("EIN",), +} + + +# End of word + +dFinal1 = { + "a": ("as", "at", "ant", "ah"), + "A": ("AS", "AT", "ANT", "AH"), + "c": ("ch",), + "C": ("CH",), + "e": ("et", "er", "ets", "ée", "ez", "ai", "ais", "ait", "ent", "eh"), + "E": ("ET", "ER", "ETS", "ÉE", "EZ", "AI", "AIS", "AIT", "ENT", "EH"), + "é": ("et", "er", "ets", "ée", "ez", "ai", "ais", "ait"), + "É": ("ET", "ER", "ETS", "ÉE", "EZ", "AI", "AIS", "AIT"), + "è": ("et", "er", "ets", "ée", "ez", "ai", "ais", "ait"), + "È": ("ET", "ER", "ETS", "ÉE", "EZ", "AI", "AIS", "AIT"), + "ê": ("et", "er", "ets", "ée", "ez", "ai", "ais", "ait"), + "Ê": ("ET", "ER", "ETS", "ÉE", "EZ", "AI", "AIS", "AIT"), + "ë": ("et", "er", "ets", "ée", "ez", "ai", "ais", "ait"), + "Ë": ("ET", "ER", "ETS", "ÉE", "EZ", "AI", "AIS", "AIT"), + "g": ("gh",), + "G": ("GH",), + "i": ("is", "it", "ie", "in"), + "I": ("IS", "IT", "IE", "IN"), + "n": ("nt", "nd", "ns", "nh"), + "N": ("NT", "ND", "NS", "NH"), + "o": ("aut", "ot", "os"), + "O": ("AUT", "OT", "OS"), + "ô": ("aut", "ot", "os"), + "Ô": ("AUT", "OT", "OS"), + "ö": ("aut", "ot", "os"), + "Ö": ("AUT", "OT", "OS"), + "p": ("ph",), + "P": ("PH",), + "s": ("sh",), + "S": ("SH",), + "t": ("th",), + "T": ("TH",), + "u": ("ut", "us", "uh"), + "U": ("UT", "US", "UH"), +} + +dFinal2 = { + "ai": ("aient", "ais", "et"), + "AI": ("AIENT", "AIS", "ET"), + "an": ("ant", "ent"), + "AN": ("ANT", "ENT"), + "en": ("ent", "ant"), + "EN": ("ENT", "ANT"), + "ei": ("ait", "ais"), + "EI": ("AIT", "AIS"), + "on": ("ons", "ont"), + "ON": ("ONS", "ONT"), + "oi": ("ois", "oit", "oix"), + "OI": ("OIS", "OIT", "OIX"), +} + + +# Préfixes et suffixes + +aPfx1 = frozenset([ + "anti", "archi", "contre", "hyper", "mé", "méta", "im", "in", "ir", "par", "proto", + "pseudo", "pré", "re", "ré", "sans", "sous", "supra", "sur", "ultra" +]) +aPfx2 = frozenset([ + "belgo", "franco", "génito", "gynéco", "médico", "russo" +]) + + +_zMotAvecPronom = re.compile("^(?i)(\\w+)(-(?:t-|)(?:ils?|elles?|on|je|tu|nous|vous))$") + +def cut (sWord): + "returns a tuple of strings (prefix, trimed_word, suffix)" + m = _zMotAvecPronom.search(sWord) + if m: + return ("", m.group(1), m.group(2)) + return ("", sWord, "") + + +# Other functions + +def filterSugg (aSugg): + "exclude suggestions" + return filter(lambda sSugg: not sSugg.endswith(("è", "È")), aSugg) ADDED graphspell/dawg.py Index: graphspell/dawg.py ================================================================== --- graphspell/dawg.py +++ graphspell/dawg.py @@ -0,0 +1,775 @@ +#!python3 + +# FSA DICTIONARY BUILDER +# +# by Olivier R. +# License: MPL 2 +# +# This tool encodes lexicon into an indexable binary dictionary +# Input files MUST be encoded in UTF-8. + + +import sys +import os +import collections + +from . 
import str_transform as st +from .progressbar import ProgressBar + + + +def readFile (spf): + print(" < Read lexicon: " + spf) + if os.path.isfile(spf): + with open(spf, "r", encoding="utf-8") as hSrc: + for sLine in hSrc: + sLine = sLine.strip() + if sLine and not sLine.startswith("#"): + yield sLine + else: + raise OSError("# Error. File not found or not loadable: " + spf) + + +def getElemsFromFile (spf): + "generator: yields tuples of (flexion, stem, tags) from the lexicon file" + nErr = 0 + if not spf.endswith(".clex"): + for sLine in readFile(spf): + try: + sFlex, sStem, sTag = sLine.split("\t") + yield (sFlex, sStem, sTag) + except ValueError: + nErr += 1 + else: + sTag = "_" # neutral tag + sTag2 = "" + for sLine in readFile(spf): + if sLine.startswith("[") and sLine.endswith("]"): + # tag line + if "-->" in sLine: + try: + sTag, sSfxCode, sTag2 = sLine[1:-1].split(" --> ") + except ValueError: + nErr += 1 + continue + sTag = sTag.strip() + sSfxCode = sSfxCode.strip() + sTag2 = sTag2.strip() + else: + sTag = sLine[1:-1] + sTag2 = "" + else: + # entry line + if "\t" in sLine: + if sLine.count("\t") > 1: + nErr += 1 + continue + sFlex, sStem = sLine.split("\t") + else: + sFlex = sStem = sLine + #print(sFlex, sStem, sTag) + yield (sFlex, sStem, sTag) + if sTag2: + sFlex2 = st.changeWordWithSuffixCode(sFlex, sSfxCode) + #print(sFlex2, sStem, sTag2) + yield (sFlex2, sStem, sTag2) + if nErr: + print(" # Lines ignored: {:>10}".format(nErr)) + + + +class DAWG: + """DIRECT ACYCLIC WORD GRAPH""" + # This code is inspired by Steve Hanov’s DAWG, 2011. (http://stevehanov.ca/blog/index.php?id=115) + # We store suffix/affix codes and tags within the graph after the “real” word. + # A word is a list of numbers [ c1, c2, c3 . . . cN, iAffix, iTags] + # Each arc is an index in self.lArcVal, where are stored characters, suffix/affix codes for stemming and tags. + # Important: As usual, the last node (after ‘iTags’) is tagged final, AND the node after ‘cN’ is ALSO tagged final. + + def __init__ (self, spfSrc, sLangName, cStemming): + print("===== Direct Acyclic Word Graph - Minimal Acyclic Finite State Automaton =====") + cStemming = cStemming.upper() + if cStemming == "A": + funcStemmingGen = st.defineAffixCode + elif cStemming == "S": + funcStemmingGen = st.defineSuffixCode + elif cStemming == "N": + funcStemmingGen = st.noStemming + else: + raise ValueError("# Error. Unknown stemming code: {}".format(cStemming)) + + lEntry = [] + lChar = ['']; dChar = {}; nChar = 1; dCharOccur = {} + lAff = []; dAff = {}; nAff = 0; dAffOccur = {} + lTag = []; dTag = {}; nTag = 0; dTagOccur = {} + nErr = 0 + + # read lexicon + for sFlex, sStem, sTag in getElemsFromFile(spfSrc): + addWordToCharDict(sFlex) + # chars + for c in sFlex: + if c not in dChar: + dChar[c] = nChar + lChar.append(c) + nChar += 1 + dCharOccur[c] = dCharOccur.get(c, 0) + 1 + # affixes to find stem from flexion + aff = funcStemmingGen(sFlex, sStem) + if aff not in dAff: + dAff[aff] = nAff + lAff.append(aff) + nAff += 1 + dAffOccur[aff] = dAffOccur.get(aff, 0) + 1 + # tags + if sTag not in dTag: + dTag[sTag] = nTag + lTag.append(sTag) + nTag += 1 + dTagOccur[sTag] = dTagOccur.get(sTag, 0) + 1 + lEntry.append((sFlex, dAff[aff], dTag[sTag])) + if not lEntry: + raise ValueError("# Error.
Empty lexicon") + + # Preparing DAWG + print(" > Preparing list of words") + lVal = lChar + lAff + lTag + lWord = [ [dChar[c] for c in sFlex] + [iAff+nChar] + [iTag+nChar+nAff] for sFlex, iAff, iTag in lEntry ] + lEntry = None + + # Dictionary of arc values occurrency, to sort arcs of each node + dValOccur = dict( [ (dChar[c], dCharOccur[c]) for c in dChar ] \ + + [ (dAff[aff]+nChar, dAffOccur[aff]) for aff in dAff ] \ + + [ (dTag[tag]+nChar+nAff, dTagOccur[tag]) for tag in dTag ] ) + #with open(spfSrc[:-8]+".valuesfreq.txt", 'w', encoding='utf-8') as hFreqDst: # DEBUG + # for iKey, nOcc in sorted(dValOccur.items(), key=lambda t: t[1], reverse=True): + # hFreqDst.write("{}: {}\n".format(lVal[iKey], nOcc)) + # hFreqDst.close() + + self.sFile = spfSrc + self.sLang = sLangName + self.nEntry = len(lWord) + self.aPreviousEntry = [] + DawgNode.resetNextId() + self.oRoot = DawgNode() + self.lUncheckedNodes = [] # list of nodes that have not been checked for duplication. + self.lMinimizedNodes = {} # list of unique nodes that have been checked for duplication. + self.lSortedNodes = [] # version 2 and 3 + self.nNode = 0 + self.nArc = 0 + self.dChar = dChar + self.nChar = len(dChar) + self.nAff = nAff + self.lArcVal = lVal + self.nArcVal = len(lVal) + self.nTag = self.nArcVal - self.nChar - nAff + self.cStemming = cStemming + if cStemming == "A": + self.funcStemming = st.changeWordWithAffixCode + elif cStemming == "S": + self.funcStemming = st.changeWordWithSuffixCode + else: + self.funcStemming = st.noStemming + + # build + lWord.sort() + oProgBar = ProgressBar(0, len(lWord)) + for aEntry in lWord: + self.insert(aEntry) + oProgBar.increment(1) + oProgBar.done() + self.finish() + self.countNodes() + self.countArcs() + self.sortNodes() + self.sortNodeArcs(dValOccur) + #self.sortNodeArcs2 (self.oRoot, "") + self.displayInfo() + + # BUILD DAWG + def insert (self, aEntry): + if aEntry < self.aPreviousEntry: + sys.exit("# Error: Words must be inserted in alphabetical order.") + + # find common prefix between word and previous word + nCommonPrefix = 0 + for i in range(min(len(aEntry), len(self.aPreviousEntry))): + if aEntry[i] != self.aPreviousEntry[i]: + break + nCommonPrefix += 1 + + # Check the lUncheckedNodes for redundant nodes, proceeding from last + # one down to the common prefix size. Then truncate the list at that point. + self._minimize(nCommonPrefix) + + # add the suffix, starting from the correct node mid-way through the graph + if len(self.lUncheckedNodes) == 0: + oNode = self.oRoot + else: + oNode = self.lUncheckedNodes[-1][2] + + iChar = nCommonPrefix + for c in aEntry[nCommonPrefix:]: + oNextNode = DawgNode() + oNode.arcs[c] = oNextNode + self.lUncheckedNodes.append((oNode, c, oNextNode)) + if iChar == (len(aEntry) - 2): + oNode.final = True + iChar += 1 + oNode = oNextNode + oNode.final = True + self.aPreviousEntry = aEntry + + def finish (self): + "minimize unchecked nodes" + self._minimize(0) + + def _minimize (self, downTo): + # proceed from the leaf up to a certain point + for i in range( len(self.lUncheckedNodes)-1, downTo-1, -1 ): + oNode, char, oChildNode = self.lUncheckedNodes[i] + if oChildNode in self.lMinimizedNodes: + # replace the child with the previously encountered one + oNode.arcs[char] = self.lMinimizedNodes[oChildNode] + else: + # add the state to the minimized nodes. 
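# (Node equivalence here relies on DawgNode.__hash__/__eq__ defined below:
# two nodes compare equal when they carry the same final flag and identical
# arcs to identical children, so shared suffixes, e.g. the "-ment" of
# "rapidement" and "lentement", are stored only once in the graph.)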
+ self.lMinimizedNodes[oChildNode] = oChildNode + self.lUncheckedNodes.pop() + + def countNodes (self): + self.nNode = len(self.lMinimizedNodes) + + def countArcs (self): + self.nArc = 0 + for oNode in self.lMinimizedNodes: + self.nArc += len(oNode.arcs) + + def sortNodeArcs (self, dValOccur): + print(" > Sort node arcs") + self.oRoot.sortArcs(dValOccur) + for oNode in self.lMinimizedNodes: + oNode.sortArcs(dValOccur) + + def sortNodeArcs2 (self, oNode, cPrevious=""): + # recursive function + dCharOccur = getCharOrderAfterChar(cPrevious) + if dCharOccur: + oNode.sortArcs2(dCharOccur, self.lArcVal) + for nArcVal, oNextNode in oNode.arcs.items(): + self.sortNodeArcs2(oNextNode, self.lArcVal[nArcVal]) + + def sortNodes (self): + print(" > Sort nodes") + for oNode in self.oRoot.arcs.values(): + self._parseNodes(oNode) + + def _parseNodes (self, oNode): + # Warning: recursive method + if oNode.pos > 0: + return + oNode.setPos() + self.lSortedNodes.append(oNode) + for oNextNode in oNode.arcs.values(): + self._parseNodes(oNextNode) + + def lookup (self, sWord): + oNode = self.oRoot + for c in sWord: + if self.dChar.get(c, '') not in oNode.arcs: + return False + oNode = oNode.arcs[self.dChar[c]] + return oNode.final + + def morph (self, sWord): + oNode = self.oRoot + for c in sWord: + if self.dChar.get(c, '') not in oNode.arcs: + return '' + oNode = oNode.arcs[self.dChar[c]] + if oNode.final: + s = "* " + for arc in oNode.arcs: + if arc >= self.nChar: + s += " [" + self.funcStemming(sWord, self.lArcVal[arc]) + oNode2 = oNode.arcs[arc] + for arc2 in oNode2.arcs: + s += " / " + self.lArcVal[arc2] + s += "]" + return s + return '' + + def displayInfo (self): + print(" * {:<12} {:>16,}".format("Entries:", self.nEntry)) + print(" * {:<12} {:>16,}".format("Characters:", self.nChar)) + print(" * {:<12} {:>16,}".format("Affixes:", self.nAff)) + print(" * {:<12} {:>16,}".format("Tags:", self.nTag)) + print(" * {:<12} {:>16,}".format("Arc values:", self.nArcVal)) + print(" * {:<12} {:>16,}".format("Nodes:", self.nNode)) + print(" * {:<12} {:>16,}".format("Arcs:", self.nArc)) + print(" * {:<12} {:>16}".format("Stemming:", self.cStemming + "FX")) + + def getArcStats (self): + d = {} + for oNode in self.lMinimizedNodes: + n = len(oNode.arcs) + d[n] = d.get(n, 0) + 1 + s = " * Nodes:\n" + for n in d: + s = s + " {:>9} nodes have {:>3} arcs\n".format(d[n], n) + return s + + def writeInfo (self, sPathFile): + print(" > Write informations") + with open(sPathFile, 'w', encoding='utf-8', newline="\n") as hDst: + hDst.write(self.getArcStats()) + hDst.write("\n * Values:\n") + for i, s in enumerate(self.lArcVal): + hDst.write(" {:>6}. {}\n".format(i, s)) + hDst.close() + + # BINARY CONVERSION + def createBinary (self, sPathFile, nMethod, bDebug=False): + print(" > Write DAWG as an indexable binary dictionary [method: %d]" % nMethod) + if nMethod == 1: + self.nBytesArc = ( (self.nArcVal.bit_length() + 2) // 8 ) + 1 # We add 2 bits. See DawgNode.convToBytes1() + self._calcNumBytesNodeAddress() + self._calcNodesAddress1() + elif nMethod == 2: + self.nBytesArc = ( (self.nArcVal.bit_length() + 3) // 8 ) + 1 # We add 3 bits. See DawgNode.convToBytes2() + self._calcNumBytesNodeAddress() + self._calcNodesAddress2() + elif nMethod == 3: + self.nBytesArc = ( (self.nArcVal.bit_length() + 3) // 8 ) + 1 # We add 3 bits. 
See DawgNode.convToBytes3() + self.nBytesOffset = 1 + self.nMaxOffset = (2 ** (self.nBytesOffset * 8)) - 1 + self._calcNumBytesNodeAddress() + self._calcNodesAddress3() + else: + print(" # Error: unknown compression method") + print(" Arc values (chars, affixes and tags): {} -> {} bytes".format( self.nArcVal, len("\t".join(self.lArcVal).encode("utf-8")) )) + print(" Arc size: {} bytes, Address size: {} bytes -> {} * {} = {} bytes".format( self.nBytesArc, self.nBytesNodeAddress, \ + self.nBytesArc+self.nBytesNodeAddress, self.nArc, \ + (self.nBytesArc+self.nBytesNodeAddress)*self.nArc )) + self._writeBinary(sPathFile, nMethod) + if bDebug: + self._writeNodes(sPathFile, nMethod) + + def _calcNumBytesNodeAddress (self): + "how many bytes needed to store all nodes/arcs in the binary dictionary" + self.nBytesNodeAddress = 1 + while ((self.nBytesArc + self.nBytesNodeAddress) * self.nArc) > (2 ** (self.nBytesNodeAddress * 8)): + self.nBytesNodeAddress += 1 + + def _calcNodesAddress1 (self): + nBytesNode = self.nBytesArc + self.nBytesNodeAddress + iAddr = len(self.oRoot.arcs) * nBytesNode + for oNode in self.lMinimizedNodes: + oNode.addr = iAddr + iAddr += max(len(oNode.arcs), 1) * nBytesNode + + def _calcNodesAddress2 (self): + nBytesNode = self.nBytesArc + self.nBytesNodeAddress + iAddr = len(self.oRoot.arcs) * nBytesNode + for oNode in self.lSortedNodes: + oNode.addr = iAddr + iAddr += max(len(oNode.arcs), 1) * nBytesNode + for oNextNode in oNode.arcs.values(): + if (oNode.pos + 1) == oNextNode.pos: + iAddr -= self.nBytesNodeAddress + #break + + def _calcNodesAddress3 (self): + nBytesNode = self.nBytesArc + self.nBytesNodeAddress + # theorical nodes size if only addresses and no offset + self.oRoot.size = len(self.oRoot.arcs) * nBytesNode + for oNode in self.lSortedNodes: + oNode.size = max(len(oNode.arcs), 1) * nBytesNode + # rewind and calculate dropdown from the end, several times + nDiff = self.nBytesNodeAddress - self.nBytesOffset + bEnd = False + while not bEnd: + bEnd = True + # recalculate addresses + iAddr = self.oRoot.size + for oNode in self.lSortedNodes: + oNode.addr = iAddr + iAddr += oNode.size + # rewind and calculate dropdown from the end, several times + for i in range(self.nNode-1, -1, -1): + nSize = max(len(self.lSortedNodes[i].arcs), 1) * nBytesNode + for oNextNode in self.lSortedNodes[i].arcs.values(): + if 1 < (oNextNode.addr - self.lSortedNodes[i].addr) < self.nMaxOffset: + nSize -= nDiff + if self.lSortedNodes[i].size != nSize: + self.lSortedNodes[i].size = nSize + bEnd = False + + def _writeBinary (self, sPathFile, nMethod): + """ + Format of the binary indexable dictionary: + Each section is separated with 4 bytes of \0 + + - Section Header: + /pyfsa/[version] + * version is an ASCII string + + - Section Informations: + /[tag_lang] + /[number of chars] + /[number of bytes for each arc] + /[number of bytes for each address node] + /[number of entries] + /[number of nodes] + /[number of arcs] + /[number of affixes] + * each field is a ASCII string + /[stemming code] + * "S" means stems are generated by /suffix_code/, "A" means they are generated by /affix_code/ + See defineSuffixCode() and defineAffixCode() for details. + "N" means no stemming + + - Section Values: + * a list of strings encoded in binary from utf-8, each value separated with a tabulation + + - Section Word Graph (nodes / arcs) + * A list of nodes which are a list of arcs with an address of the next node. + See DawgNode.convToBytes() for details. 
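        Example (a sketch; the numeric fields are invented): a French dictionary
        built with method 3 would begin with
            b"/pyfsa/3/" + b"\0\0\0\0" + b"fr/124/2/3/..." + b"\0\0\0\0"
        followed by the tab-separated arc values and the binary node section.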
+ """ + if not sPathFile.endswith(".bdic"): + sPathFile += "."+str(nMethod)+".bdic" + with open(sPathFile, 'wb') as hDst: + # header + hDst.write("/pyfsa/{}/".format(nMethod).encode("utf-8")) + hDst.write(b"\0\0\0\0") + # infos + hDst.write("{}/{}/{}/{}/{}/{}/{}/{}/{}".format(self.sLang, self.nChar, self.nBytesArc, self.nBytesNodeAddress, \ + self.nEntry, self.nNode, self.nArc, self.nAff, self.cStemming).encode("utf-8")) + hDst.write(b"\0\0\0\0") + # lArcVal + hDst.write("\t".join(self.lArcVal).encode("utf-8")) + hDst.write(b"\0\0\0\0") + # DAWG: nodes / arcs + if nMethod == 1: + hDst.write(self.oRoot.convToBytes1(self.nBytesArc, self.nBytesNodeAddress)) + for oNode in self.lMinimizedNodes: + hDst.write(oNode.convToBytes1(self.nBytesArc, self.nBytesNodeAddress)) + elif nMethod == 2: + hDst.write(self.oRoot.convToBytes2(self.nBytesArc, self.nBytesNodeAddress)) + for oNode in self.lSortedNodes: + hDst.write(oNode.convToBytes2(self.nBytesArc, self.nBytesNodeAddress)) + elif nMethod == 3: + hDst.write(self.oRoot.convToBytes3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset)) + for oNode in self.lSortedNodes: + hDst.write(oNode.convToBytes3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset)) + hDst.close() + + def _writeNodes (self, sPathFile, nMethod): + "for debugging only" + print(" > Write nodes") + with open(sPathFile+".nodes."+str(nMethod)+".txt", 'w', encoding='utf-8', newline="\n") as hDst: + if nMethod == 1: + hDst.write(self.oRoot.getTxtRepr1(self.nBytesArc, self.nBytesNodeAddress, self.lArcVal)+"\n") + #hDst.write( ''.join( [ "%02X " % z for z in self.oRoot.convToBytes1(self.nBytesArc, self.nBytesNodeAddress) ] ).strip() ) + for oNode in self.lMinimizedNodes: + hDst.write(oNode.getTxtRepr1(self.nBytesArc, self.nBytesNodeAddress, self.lArcVal)+"\n") + if nMethod == 2: + hDst.write(self.oRoot.getTxtRepr2(self.nBytesArc, self.nBytesNodeAddress, self.lArcVal)+"\n") + for oNode in self.lSortedNodes: + hDst.write(oNode.getTxtRepr2(self.nBytesArc, self.nBytesNodeAddress, self.lArcVal)+"\n") + if nMethod == 3: + hDst.write(self.oRoot.getTxtRepr3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset, self.lArcVal)+"\n") + #hDst.write( ''.join( [ "%02X " % z for z in self.oRoot.convToBytes3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset) ] ).strip() ) + for oNode in self.lSortedNodes: + hDst.write(oNode.getTxtRepr3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset, self.lArcVal)+"\n") + hDst.close() + + def writeResults (self, sPathFile): + bFileExits = os.path.isfile("_lexicons.res.txt") + with open("_lexicons.res.txt", "a", encoding='utf-8', newline="\n") as hDst: + sFormat1 = "{:<12} {:>12} {:>5} {:>8} {:>8} {:>6} {:>8} {:>9} {:>9} {:>15} {:>12} {:>12}\n" + sFormat2 = "{:<12} {:>12,} {:>5,} {:>8,} {:>8} {:>6,} {:>8,} {:>9,} {:>9,} {:>15,} {:>12,} {:>12,}\n" + if not bFileExits: + hDst.write(sFormat1.format("Lexicon", "Entries", "Chars", "Affixes", "Stemming", "Tags", "Values", "Nodes", "Arcs", "Lexicon (Kb)", "Dict (Kb)", "LT Dict (Kb)")) + hDst.write(sFormat2.format(self.sLang, self.nEntry, self.nChar, self.nAff, self.cStemming + "FX", self.nTag, self.nArcVal, \ + self.nNode, self.nArc, os.path.getsize(self.sFile), os.path.getsize(sPathFile), \ + os.path.getsize("cfsa/dict/{}.dict".format(self.sLang)) if os.path.isfile("cfsa/dict/{}.dict".format(self.sLang)) else 0)) + hDst.close() + + + +class DawgNode: + NextId = 0 + NextPos = 1 # (version 2) + + def __init__ (self): + self.i = DawgNode.NextId + DawgNode.NextId += 1 + self.final = False + self.arcs 
= {} # key: arc value; value: a node + self.addr = 0 # address in the binary dictionary + self.pos = 0 # position in the binary dictionary (version 2) + self.size = 0 # size of node in bytes (version 3) + + @classmethod + def resetNextId (cls): + cls.NextId = 0 + + def setPos (self): # version 2 + self.pos = DawgNode.NextPos + DawgNode.NextPos += 1 + + def __str__ (self): + # Caution! this function is used for hashing and comparison! + l = [] + if self.final: + l.append("1") + else: + l.append("0") + for (key, node) in self.arcs.items(): + l.append(str(key)) + l.append(str(node.i)) + return "_".join(l) + + def __hash__ (self): + # Used as a key in a python dictionary. + return self.__str__().__hash__() + + def __eq__ (self, other): + # Used as a key in a python dictionary. + # Nodes are equivalent if they have identical arcs, and each identical arc leads to identical states. + return self.__str__() == other.__str__() + + def sortArcs (self, dValOccur): + self.arcs = collections.OrderedDict(sorted(self.arcs.items(), key=lambda t: dValOccur.get(t[0], 0), reverse=True)) + + def sortArcs2 (self, dValOccur, lArcVal): + self.arcs = collections.OrderedDict(sorted(self.arcs.items(), key=lambda t: dValOccur.get(lArcVal[t[0]], 0), reverse=True)) + + # VERSION 1 ===================================================================================================== + def convToBytes1 (self, nBytesArc, nBytesNodeAddress): + """ + Node scheme: + - Arc length is defined by nBytesArc + - Address length is defined by nBytesNodeAddress + + | Arc | Address of next node | + | | | + /---------------\ /---------------\ /---------------\ /---------------\ /---------------\ /---------------\ + | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + \---------------/ \---------------/ \---------------/ \---------------/ \---------------/ \---------------/ + [...] 
+ /---------------\ /---------------\ /---------------\ /---------------\ /---------------\ /---------------\ + | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + \---------------/ \---------------/ \---------------/ \---------------/ \---------------/ \---------------/ + ^ ^ + | | + | | + | \___ if 1, last arc of this node + \_____ if 1, this node is final (only on the first arc) + """ + nArc = len(self.arcs) + nFinalNodeMask = 1 << ((nBytesArc*8)-1) + nFinalArcMask = 1 << ((nBytesArc*8)-2) + if len(self.arcs) == 0: + val = nFinalNodeMask | nFinalArcMask + by = val.to_bytes(nBytesArc, byteorder='big') + by += (0).to_bytes(nBytesNodeAddress, byteorder='big') + return by + by = b"" + for i, arc in enumerate(self.arcs, 1): + val = arc + if i == 1 and self.final: + val = val | nFinalNodeMask + if i == nArc: + val = val | nFinalArcMask + by += val.to_bytes(nBytesArc, byteorder='big') + by += self.arcs[arc].addr.to_bytes(nBytesNodeAddress, byteorder='big') + return by + + def getTxtRepr1 (self, nBytesArc, nBytesNodeAddress, lVal): + nArc = len(self.arcs) + nFinalNodeMask = 1 << ((nBytesArc*8)-1) + nFinalArcMask = 1 << ((nBytesArc*8)-2) + s = "i{:_>10} -- #{:_>10}\n".format(self.i, self.addr) + if len(self.arcs) == 0: + s += " {:<20} {:0>16} i{:_>10} #{:_>10}\n".format("", bin(nFinalNodeMask | nFinalArcMask)[2:], "0", "0") + return s + for i, arc in enumerate(self.arcs, 1): + val = arc + if i == 1 and self.final: + val = val | nFinalNodeMask + if i == nArc: + val = val | nFinalArcMask + s += " {:<20} {:0>16} i{:_>10} #{:_>10}\n".format(lVal[arc], bin(val)[2:], self.arcs[arc].i, self.arcs[arc].addr) + return s + + # VERSION 2 ===================================================================================================== + def convToBytes2 (self, nBytesArc, nBytesNodeAddress): + """ + Node scheme: + - Arc length is defined by nBytesArc + - Address length is defined by nBytesNodeAddress + + | Arc | Address of next node | + | | | + /---------------\ /---------------\ /---------------\ /---------------\ /---------------\ /---------------\ + | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + \---------------/ \---------------/ \---------------/ \---------------/ \---------------/ \---------------/ + [...] 
+ /---------------\ /---------------\ /---------------\ /---------------\ /---------------\ /---------------\ + | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + \---------------/ \---------------/ \---------------/ \---------------/ \---------------/ \---------------/ + ^ ^ ^ + | | | + | | \_ if 1, caution, no address: next node is the following node + | \___ if 1, last arc of this node + \_____ if 1, this node is final (only on the first arc) + """ + nArc = len(self.arcs) + nFinalNodeMask = 1 << ((nBytesArc*8)-1) + nFinalArcMask = 1 << ((nBytesArc*8)-2) + nNextNodeMask = 1 << ((nBytesArc*8)-3) + if len(self.arcs) == 0: + val = nFinalNodeMask | nFinalArcMask + by = val.to_bytes(nBytesArc, byteorder='big') + by += (0).to_bytes(nBytesNodeAddress, byteorder='big') + return by + by = b"" + for i, arc in enumerate(self.arcs, 1): + val = arc + if i == 1 and self.final: + val = val | nFinalNodeMask + if i == nArc: + val = val | nFinalArcMask + if (self.pos + 1) == self.arcs[arc].pos and self.i != 0: + val = val | nNextNodeMask + by += val.to_bytes(nBytesArc, byteorder='big') + else: + by += val.to_bytes(nBytesArc, byteorder='big') + by += self.arcs[arc].addr.to_bytes(nBytesNodeAddress, byteorder='big') + return by + + def getTxtRepr2 (self, nBytesArc, nBytesNodeAddress, lVal): + nArc = len(self.arcs) + nFinalNodeMask = 1 << ((nBytesArc*8)-1) + nFinalArcMask = 1 << ((nBytesArc*8)-2) + nNextNodeMask = 1 << ((nBytesArc*8)-3) + s = "i{:_>10} -- #{:_>10}\n".format(self.i, self.addr) + if nArc == 0: + s += " {:<20} {:0>16} i{:_>10} #{:_>10}\n".format("", bin(nFinalNodeMask | nFinalArcMask)[2:], "0", "0") + return s + for i, arc in enumerate(self.arcs, 1): + val = arc + if i == 1 and self.final: + val = val | nFinalNodeMask + if i == nArc: + val = val | nFinalArcMask + if (self.pos + 1) == self.arcs[arc].pos and self.i != 0: + val = val | nNextNodeMask + s += " {:<20} {:0>16}\n".format(lVal[arc], bin(val)[2:], "") + else: + s += " {:<20} {:0>16} i{:_>10} #{:_>10}\n".format(lVal[arc], bin(val)[2:], self.arcs[arc].i, self.arcs[arc].addr) + return s + + # VERSION 3 ===================================================================================================== + def convToBytes3 (self, nBytesArc, nBytesNodeAddress, nBytesOffset): + """ + Node scheme: + - Arc length is defined by nBytesArc + - Address length is defined by nBytesNodeAddress + - Offset length is defined by nBytesOffset + + | Arc | Address of next node or offset to next node | + | | | + /---------------\ /---------------\ /---------------\ /---------------\ /---------------\ /---------------\ + |1|0|0| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + \---------------/ \---------------/ \---------------/ \---------------/ \---------------/ \---------------/ + [...] 
+ /---------------\ /---------------\ /---------------\ + |0|0|1| | | | | | | | | | | | | | | | | | | | | | | | Offsets are shorter than addresses + \---------------/ \---------------/ \---------------/ + /---------------\ /---------------\ /---------------\ /---------------\ /---------------\ /---------------\ + |0|1|0| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + \---------------/ \---------------/ \---------------/ \---------------/ \---------------/ \---------------/ + + ^ ^ ^ + | | | + | | \_ if 1, offset instead of address of next node + | \___ if 1, last arc of this node + \_____ if 1, this node is final (only on the first arc) + """ + nArc = len(self.arcs) + nFinalNodeMask = 1 << ((nBytesArc*8)-1) + nFinalArcMask = 1 << ((nBytesArc*8)-2) + nNextNodeMask = 1 << ((nBytesArc*8)-3) + nMaxOffset = (2 ** (nBytesOffset * 8)) - 1 + if nArc == 0: + val = nFinalNodeMask | nFinalArcMask + by = val.to_bytes(nBytesArc, byteorder='big') + by += (0).to_bytes(nBytesNodeAddress, byteorder='big') + return by + by = b"" + for i, arc in enumerate(self.arcs, 1): + val = arc + if i == 1 and self.final: + val = val | nFinalNodeMask + if i == nArc: + val = val | nFinalArcMask + if 1 < (self.arcs[arc].addr - self.addr) < nMaxOffset and self.i != 0: + val = val | nNextNodeMask + by += val.to_bytes(nBytesArc, byteorder='big') + by += (self.arcs[arc].addr-self.addr).to_bytes(nBytesOffset, byteorder='big') + else: + by += val.to_bytes(nBytesArc, byteorder='big') + by += self.arcs[arc].addr.to_bytes(nBytesNodeAddress, byteorder='big') + return by + + def getTxtRepr3 (self, nBytesArc, nBytesNodeAddress, nBytesOffset, lVal): + nArc = len(self.arcs) + nFinalNodeMask = 1 << ((nBytesArc*8)-1) + nFinalArcMask = 1 << ((nBytesArc*8)-2) + nNextNodeMask = 1 << ((nBytesArc*8)-3) + nMaxOffset = (2 ** (nBytesOffset * 8)) - 1 + s = "i{:_>10} -- #{:_>10} ({})\n".format(self.i, self.addr, self.size) + if nArc == 0: + s += " {:<20} {:0>16} i{:_>10} #{:_>10}\n".format("", bin(nFinalNodeMask | nFinalArcMask)[2:], "0", "0") + return s + for i, arc in enumerate(self.arcs, 1): + val = arc + if i == 1 and self.final: + val = val | nFinalNodeMask + if i == nArc: + val = val | nFinalArcMask + if 1 < (self.arcs[arc].addr - self.addr) < nMaxOffset and self.i != 0: + val = val | nNextNodeMask + s += " {:<20} {:0>16} i{:_>10} +{:_>10}\n".format(lVal[arc], bin(val)[2:], self.arcs[arc].i, self.arcs[arc].addr - self.addr) + else: + s += " {:<20} {:0>16} i{:_>10} #{:_>10}\n".format(lVal[arc], bin(val)[2:], self.arcs[arc].i, self.arcs[arc].addr) + return s + + + +# Another attempt to sort node arcs + +_dCharOrder = { + # key: previous char, value: dictionary of chars {c: nValue} + "": {} +} + + +def addWordToCharDict (sWord): + cPrevious = "" + for cChar in sWord: + if cPrevious not in _dCharOrder: + _dCharOrder[cPrevious] = {} + _dCharOrder[cPrevious][cChar] = _dCharOrder[cPrevious].get(cChar, 0) + 1 + cPrevious = cChar + + +def getCharOrderAfterChar (cChar): + return _dCharOrder.get(cChar, None) + + +def displayCharOrder (): + for key, value in _dCharOrder.items(): + print("[" + key + "]: ", ", ".join([ c+":"+str(n) for c, n in sorted(value.items(), key=lambda t: t[1], reverse=True) ])) ADDED graphspell/echo.py Index: graphspell/echo.py ================================================================== --- graphspell/echo.py +++ graphspell/echo.py @@ -0,0 +1,29 @@ +#!python3 + +# The most boring yet indispensable function: print! 
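# Usage sketch (hypothetical call, shown for illustration):
#     from graphspell.echo import echo
#     echo("l'œuvre… déjà vu")   # on win32 prints "l'öuvre_ déjà vu" (via _CHARMAP below);
#                                # on other platforms the text is printed unchanged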
+
+
+import sys
+
+
+_CHARMAP = str.maketrans({ 'œ': 'ö',  'Œ': 'Ö',  'ʳ': "r",  'ᵉ': "e",  '…': "_",
+                           '“': '"',  '”': '"',  '„': '"',  '‘': "'",  '’': "'",
+                           'ā': 'â',  'Ā': 'Â',  'ē': 'ê',  'Ē': 'Ê',  'ī': 'î',  'Ī': 'Î',
+                           'ō': 'ô',  'Ō': 'Ô',  'ū': 'û',  'Ū': 'Û',  'Ÿ': 'Y',
+                           'ś': 's',  'ŝ': 's',
+                           '—': '-',  '–': '-'
+                         })
+
+
+def echo (obj, sep=' ', end='\n', file=sys.stdout, flush=False):
+    """ Print for Windows to avoid Python crashes.
+        Encoding depends on the Windows locale. No useful standard.
+        Always returns True (useful for debugging)."""
+    if sys.platform != "win32":
+        print(obj, sep=sep, end=end, file=file, flush=flush)
+        return True
+    try:
+        print(str(obj).translate(_CHARMAP), sep=sep, end=end, file=file, flush=flush)
+    except UnicodeEncodeError:
+        # last resort: strip everything ASCII cannot represent
+        print(str(obj).encode('ascii', 'replace').decode('ascii', 'replace'), sep=sep, end=end, file=file, flush=flush)
+    return True

ADDED graphspell/ibdawg.py
Index: graphspell/ibdawg.py
==================================================================
--- graphspell/ibdawg.py
+++ graphspell/ibdawg.py
@@ -0,0 +1,720 @@
+#!python3
+
+import os
+import traceback
+import pkgutil
+import re
+import codecs   # needed by the _writeNodes* debug methods below
+from functools import wraps
+import time
+
+#import logging
+#logging.basicConfig(filename="suggestions.log", level=logging.DEBUG)
+
+from . import str_transform as st
+from . import char_player as cp
+from .echo import echo
+
+
+def timethis (func):
+    "decorator for the execution time"
+    @wraps(func)
+    def wrapper (*args, **kwargs):
+        fStart = time.time()
+        result = func(*args, **kwargs)
+        fEnd = time.time()
+        print(func.__name__, fEnd - fStart)
+        return result
+    return wrapper
+
+
+class SuggResult:
+    """Structure for storing, classifying and filtering suggestions"""
+
+    def __init__ (self, sWord, nDistLimit=-1):
+        self.sWord = sWord
+        self.sSimplifiedWord = cp.simplifyWord(sWord)
+        self.nDistLimit = nDistLimit  if nDistLimit >= 0  else (len(sWord) // 3) + 1
+        self.nMinDist = 1000
+        self.aSugg = set()
+        self.dSugg = { 0: [], 1: [], 2: [] }
+
+    def addSugg (self, sSugg, nDeep=0):
+        "add a suggestion"
+        #logging.info((nDeep * "  ") + "__" + sSugg + "__")
+        if sSugg not in self.aSugg:
+            nDist = st.distanceDamerauLevenshtein(self.sSimplifiedWord, cp.simplifyWord(sSugg))
+            if nDist <= self.nDistLimit:
+                if nDist not in self.dSugg:
+                    self.dSugg[nDist] = []
+                self.dSugg[nDist].append(sSugg)
+                self.aSugg.add(sSugg)
+                if nDist < self.nMinDist:
+                    self.nMinDist = nDist
+                self.nDistLimit = min(self.nDistLimit, self.nMinDist+2)
+
+    def getSuggestions (self, nSuggLimit=10, nDistLimit=-1):
+        "return a list of suggestions"
+        lRes = []
+        if self.dSugg[0]:
+            # the best results are sorted by distance to the original word
+            self.dSugg[0].sort(key=lambda sSugg: st.distanceDamerauLevenshtein(self.sWord, sSugg))
+        for lSugg in self.dSugg.values():
+            lRes.extend(lSugg)
+            if len(lRes) > nSuggLimit:
+                break
+        lRes = list(cp.filterSugg(lRes))
+        if self.sWord.istitle():
+            lRes = list(map(lambda sSugg: sSugg.title(), lRes))
+        elif self.sWord.isupper():
+            lRes = list(map(lambda sSugg: sSugg.upper(), lRes))
+        return lRes[:nSuggLimit]
+
+    def reset (self):
+        self.aSugg.clear()
+        self.dSugg.clear()
+
+
+class IBDAWG:
+    """INDEXABLE BINARY DIRECT ACYCLIC WORD GRAPH"""
+
+    def __init__ (self, sDicName):
+        self.by = pkgutil.get_data(__package__, "_dictionaries/" + sDicName)
+        if not self.by:
+            raise OSError("# Error. File not found or not loadable: "+sDicName)
+
+        if self.by[0:7] != b"/pyfsa/":
+            raise TypeError("# Error. Not a pyfsa binary dictionary. Header: {}".format(self.by[0:9]))
+        if not(self.by[7:8] == b"1" or self.by[7:8] == b"2" or self.by[7:8] == b"3"):
+            raise ValueError("# Error. Unknown dictionary version: {}".format(self.by[7:8]))
+        try:
+            header, info, values, bdic = self.by.split(b"\0\0\0\0", 3)
+        except Exception:
+            raise Exception("# Error. Not a valid binary dictionary (expected 4 null-separated sections): " + sDicName)
+
+        self.sName = sDicName
+        self.nVersion = int(self.by[7:8].decode("utf-8"))
+        self.sHeader = header.decode("utf-8")
+        self.lArcVal = values.decode("utf-8").split("\t")
+        self.nArcVal = len(self.lArcVal)
+        self.byDic = bdic
+
+        l = info.decode("utf-8").split("/")
+        self.sLang = l[0]
+        self.nChar = int(l[1])
+        self.nBytesArc = int(l[2])
+        self.nBytesNodeAddress = int(l[3])
+        self.nEntries = int(l[4])
+        self.nNode = int(l[5])
+        self.nArc = int(l[6])
+        self.nAff = int(l[7])
+        self.cStemming = l[8]
+        if self.cStemming == "S":
+            self.funcStemming = st.changeWordWithSuffixCode
+        elif self.cStemming == "A":
+            self.funcStemming = st.changeWordWithAffixCode
+        else:
+            self.funcStemming = st.noStemming
+        self.nTag = self.nArcVal - self.nChar - self.nAff
+        # <dChar> to get the value of an arc, <dCharVal> to get the char of an arc with its value
+        self.dChar = {}
+        for i in range(1, self.nChar):
+            self.dChar[self.lArcVal[i]] = i
+        self.dCharVal = { v: k  for k, v in self.dChar.items() }
+
+        self._arcMask = (2 ** ((self.nBytesArc * 8) - 3)) - 1
+        self._finalNodeMask = 1 << ((self.nBytesArc * 8) - 1)
+        self._lastArcMask = 1 << ((self.nBytesArc * 8) - 2)
+        self._addrBitMask = 1 << ((self.nBytesArc * 8) - 3)  # version 2
+
+        self.nBytesOffset = 1  # version 3
+
+        # Configuring DAWG functions according to nVersion
+        if self.nVersion == 1:
+            self.morph = self._morph1
+            self.stem = self._stem1
+            self._lookupArcNode = self._lookupArcNode1
+            self._getArcs = self._getArcs1
+            self._writeNodes = self._writeNodes1
+        elif self.nVersion == 2:
+            self.morph = self._morph2
+            self.stem = self._stem2
+            self._lookupArcNode = self._lookupArcNode2
+            self._getArcs = self._getArcs2
+            self._writeNodes = self._writeNodes2
+        elif self.nVersion == 3:
+            self.morph = self._morph3
+            self.stem = self._stem3
+            self._lookupArcNode = self._lookupArcNode3
+            self._getArcs = self._getArcs3
+            self._writeNodes = self._writeNodes3
+        else:
+            raise ValueError("  # Error: unknown code: {}".format(self.nVersion))
+
+        self.bOptNumSigle = False
+        self.bOptNumAtLast = False
+
+    def getInfo (self):
+        return  " Language: {0.sLang:>10} Version: {0.nVersion:>2} Stemming: {0.cStemming}FX\n" \
+                " Arcs values: {0.nArcVal:>10,} = {0.nChar:>5,} characters, {0.nAff:>6,} affixes, {0.nTag:>6,} tags\n" \
+                " Dictionary: {0.nEntries:>12,} entries, {0.nNode:>11,} nodes, {0.nArc:>11,} arcs\n" \
+                " Address size: {0.nBytesNodeAddress:>1} bytes, Arc size: {0.nBytesArc:>1} bytes\n".format(self)
+
+    def writeAsJSObject (self, spfDest, bInJSModule=False, bBinaryDictAsHexString=False):
+        "write IBDAWG as a JavaScript object in a JavaScript module"
+        import json
+        with open(spfDest, "w", encoding="utf-8", newline="\n") as hDst:
+            if bInJSModule:
+                hDst.write('// JavaScript\n// Generated data (do not edit)\n\n"use strict";\n\nconst dictionary = ')
+            hDst.write(json.dumps({
+                "sName": self.sName,
+                "nVersion": self.nVersion,
+                "sHeader": self.sHeader,
+                "lArcVal": self.lArcVal,
+                "nArcVal": self.nArcVal,
+                # Mozilla’s JS parser doesn’t accept data files bigger than 4 MB, so when
+                # necessary we store the binary dictionary as a hexadecimal string and
+                # convert it back later in the Firefox extension.
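+                # (Hex encoding doubles the size of the payload, but the JSON then
+                # carries one flat string instead of a huge array of numbers.)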
+                # https://github.com/mozilla/addons-linter/issues/1361
+                "byDic": self.byDic.hex()  if bBinaryDictAsHexString  else [ e  for e in self.byDic ],
+                "sLang": self.sLang,
+                "nChar": self.nChar,
+                "nBytesArc": self.nBytesArc,
+                "nBytesNodeAddress": self.nBytesNodeAddress,
+                "nEntries": self.nEntries,
+                "nNode": self.nNode,
+                "nArc": self.nArc,
+                "nAff": self.nAff,
+                "cStemming": self.cStemming,
+                "nTag": self.nTag,
+                "dChar": self.dChar,
+                "_arcMask": self._arcMask,
+                "_finalNodeMask": self._finalNodeMask,
+                "_lastArcMask": self._lastArcMask,
+                "_addrBitMask": self._addrBitMask,
+                "nBytesOffset": self.nBytesOffset
+            }, ensure_ascii=False))
+            if bInJSModule:
+                hDst.write(";\n\nexports.dictionary = dictionary;\n")
+
+    def isValidToken (self, sToken):
+        "checks if <sToken> is valid (if there are hyphens in <sToken>, <sToken> is split, each part is checked)"
+        if self.isValid(sToken):
+            return True
+        if "-" in sToken:
+            if sToken.count("-") > 4:
+                return True
+            return all(self.isValid(sWord)  for sWord in sToken.split("-"))
+        return False
+
+    def isValid (self, sWord):
+        "checks if <sWord> is valid (different casing tested if the first letter is a capital)"
+        if not sWord:
+            return None
+        if "’" in sWord: # ugly hack
+            sWord = sWord.replace("’", "'")
+        if self.lookup(sWord):
+            return True
+        if sWord[0:1].isupper():
+            if len(sWord) > 1:
+                if sWord.istitle():
+                    return self.lookup(sWord.lower())
+                if sWord.isupper():
+                    if self.bOptNumSigle:
+                        return True
+                    return self.lookup(sWord.lower()) or self.lookup(sWord.capitalize())
+                return self.lookup(sWord[:1].lower() + sWord[1:])
+            else:
+                return self.lookup(sWord.lower())
+        return False
+
+    def lookup (self, sWord):
+        "returns True if <sWord> in dictionary (strict verification)"
+        iAddr = 0
+        for c in sWord:
+            if c not in self.dChar:
+                return False
+            iAddr = self._lookupArcNode(self.dChar[c], iAddr)
+            if iAddr is None:
+                return False
+        return bool(int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask)
+
+    def getMorph (self, sWord):
+        "retrieves morphologies list, different casing allowed"
+        l = self.morph(sWord)
+        if sWord[0:1].isupper():
+            l.extend(self.morph(sWord.lower()))
+            if sWord.isupper() and len(sWord) > 1:
+                l.extend(self.morph(sWord.capitalize()))
+        return l
+
+    #@timethis
+    def suggest (self, sWord, nSuggLimit=10):
+        "returns a set of suggestions for <sWord>"
+        sPfx, sWord, sSfx = cp.cut(sWord)
+        nMaxSwitch = max(len(sWord) // 3, 1)
+        nMaxDel = len(sWord) // 5
+        nMaxHardRepl = max((len(sWord) - 5) // 4, 1)
+        oSuggResult = SuggResult(sWord)
+        self._suggest(oSuggResult, sWord, nMaxSwitch=nMaxSwitch, nMaxDel=nMaxDel, nMaxHardRepl=nMaxHardRepl)
+        if sWord.istitle():
+            self._suggest(oSuggResult, sWord.lower(), nMaxSwitch=nMaxSwitch, nMaxDel=nMaxDel, nMaxHardRepl=nMaxHardRepl)
+        elif sWord.islower():
+            self._suggest(oSuggResult, sWord.title(), nMaxSwitch=nMaxSwitch, nMaxDel=nMaxDel, nMaxHardRepl=nMaxHardRepl)
+        aSugg = oSuggResult.getSuggestions(nSuggLimit)
+        if sSfx or sPfx:
+            # we add what we removed
+            return list(map(lambda sSug: sPfx + sSug + sSfx, aSugg))
+        return aSugg
+
+    def _suggest (self, oSuggResult, sRemain, nMaxSwitch=0, nMaxDel=0, nMaxHardRepl=0, nDeep=0, iAddr=0, sNewWord="", bAvoidLoop=False):
+        # recursive function
+        #logging.info((nDeep * "  ") + sNewWord + ":" + sRemain)
+        if not sRemain:
+            if int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask:
+                oSuggResult.addSugg(sNewWord, nDeep)
+            for sTail in self._getTails(iAddr):
+                oSuggResult.addSugg(sNewWord+sTail, nDeep)
+            return
+        cCurrent = sRemain[0:1]
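+        # Examine the arcs of the current node: follow an arc when its char
+        # matches cCurrent or one of its "similar chars" (cp.d1to1); otherwise
+        # follow it anyway as long as some hard-replacement credit remains.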
+        for cChar, jAddr in self._getCharArcs(iAddr):
+            if cChar in cp.d1to1.get(cCurrent, cCurrent):
+                self._suggest(oSuggResult, sRemain[1:], nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, jAddr, sNewWord+cChar)
+            elif not bAvoidLoop and nMaxHardRepl:
+                self._suggest(oSuggResult, sRemain[1:], nMaxSwitch, nMaxDel, nMaxHardRepl-1, nDeep+1, jAddr, sNewWord+cChar, True)
+        if not bAvoidLoop: # avoid infinite loop
+            if len(sRemain) > 1:
+                if cCurrent == sRemain[1:2]:
+                    # same char, we remove 1 char without adding 1 to <sNewWord>
+                    self._suggest(oSuggResult, sRemain[1:], nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord)
+                else:
+                    # switching chars
+                    if nMaxSwitch:
+                        self._suggest(oSuggResult, sRemain[1:2]+sRemain[0:1]+sRemain[2:], nMaxSwitch-1, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, True)
+                    # delete char
+                    if nMaxDel:
+                        self._suggest(oSuggResult, sRemain[1:], nMaxSwitch, nMaxDel-1, nMaxHardRepl, nDeep+1, iAddr, sNewWord, True)
+                # Phonetic replacements
+                for sRepl in cp.get1toXReplacement(sNewWord[-1:], cCurrent, sRemain[1:2]):
+                    self._suggest(oSuggResult, sRepl + sRemain[1:], nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, True)
+                for sRepl in cp.d2toX.get(sRemain[0:2], ()):
+                    self._suggest(oSuggResult, sRepl + sRemain[2:], nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, True)
+            # end of word
+            if len(sRemain) == 2:
+                for sRepl in cp.dFinal2.get(sRemain, ()):
+                    self._suggest(oSuggResult, sRepl, nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, True)
+            elif len(sRemain) == 1:
+                self._suggest(oSuggResult, "", nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, True) # remove last char and go on
+                for sRepl in cp.dFinal1.get(sRemain, ()):
+                    self._suggest(oSuggResult, sRepl, nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, True)
+
+    #@timethis
+    def suggest2 (self, sWord, nMaxSugg=10):
+        "returns a set of suggestions for <sWord>"
+        sPfx, sWord, sSfx = cp.cut(sWord)
+        oSuggResult = SuggResult(sWord)
+        self._suggest2(oSuggResult)
+        aSugg = oSuggResult.getSuggestions(nMaxSugg)
+        if sSfx or sPfx:
+            # we add what we removed
+            return list(map(lambda sSug: sPfx + sSug + sSfx, aSugg))
+        return aSugg
+
+    def _suggest2 (self, oSuggResult, nDeep=0, iAddr=0, sNewWord=""):
+        # recursive function
+        #logging.info((nDeep * "  ") + sNewWord)
+        if nDeep >= oSuggResult.nDistLimit:
+            sCleanNewWord = cp.simplifyWord(sNewWord)
+            # SuggResult stores the simplified form in <sSimplifiedWord>
+            if st.distanceSift4(oSuggResult.sSimplifiedWord[:len(sCleanNewWord)], sCleanNewWord) > oSuggResult.nDistLimit:
+                return
+        if int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask:
+            oSuggResult.addSugg(sNewWord, nDeep)
+        for cChar, jAddr in self._getCharArcsWithPriority(iAddr, oSuggResult.sWord[nDeep:nDeep+1]):
+            self._suggest2(oSuggResult, nDeep+1, jAddr, sNewWord+cChar)
+        return
+
+    def _getCharArcs (self, iAddr):
+        "generator: yield all chars and addresses from node at address <iAddr>"
+        for nVal, jAddr in self._getArcs(iAddr):
+            if nVal < self.nChar:
+                yield (self.dCharVal[nVal], jAddr)
+
+    def _getSimilarCharArcs (self, cChar, iAddr):
+        "generator: yield similar chars of <cChar> and addresses of the following nodes"
+        for c in cp.d1to1.get(cChar, [cChar]):
+            if c in self.dChar:
+                jAddr = self._lookupArcNode(self.dChar[c], iAddr)
+                if jAddr:
+                    yield (c, jAddr)
+
+    def _getCharArcsWithPriority (self, iAddr, cChar):
+        if not cChar:
+            yield from self._getCharArcs(iAddr)
+            return  # without this return, the arcs would be yielded twice
+        lTuple = list(self._getCharArcs(iAddr))
+        lTuple.sort(key=lambda t: 0  if t[0] in cp.d1to1.get(cChar, cChar)  else 1)
+        yield from lTuple
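+
+    # When the word to check is exhausted, _getTails collects nearby
+    # completions of the current prefix so they can be proposed as suggestions.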
sTail="", n=2): + "return a list of suffixes ending at a distance of from " + aTails = set() + for nVal, jAddr in self._getArcs(iAddr): + if nVal < self.nChar: + if int.from_bytes(self.byDic[jAddr:jAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask: + aTails.add(sTail + self.dCharVal[nVal]) + if n and not aTails: + aTails.update(self._getTails(jAddr, sTail+self.dCharVal[nVal], n-1)) + return aTails + + def drawPath (self, sWord, iAddr=0): + "show the path taken by in the graph" + c1 = sWord[0:1] if sWord else " " + iPos = -1 + n = 0 + print(c1 + ": ", end="") + for c2, jAddr in self._getCharArcs(iAddr): + print(c2, end="") + if c2 == sWord[0:1]: + iNextNodeAddr = jAddr + iPos = n + n += 1 + if not sWord: + return + if iPos >= 0: + print("\n "+ " " * iPos + "|") + self.drawPath(sWord[1:], iNextNodeAddr) + + def select (self, sPattern=""): + "generator: returns all entries which morphology fits " + zPattern = None + try: + zPattern = re.compile(sPattern) + except: + print("# Error in regex pattern") + traceback.print_exc() + yield from self._select1(zPattern, 0, "") + + # def morph (self, sWord): + # is defined in __init__ + + # VERSION 1 + def _select1 (self, zPattern, iAddr, sWord): + # recursive generator + for nVal, jAddr in self._getArcs1(iAddr): + if nVal < self.nChar: + # simple character + yield from self._select1(zPattern, jAddr, sWord + self.lArcVal[nVal]) + else: + sEntry = sWord + "\t" + self.funcStemming(sWord, self.lArcVal[nVal]) + for nMorphVal, _ in self._getArcs1(jAddr): + if not zPattern or zPattern.search(self.lArcVal[nMorphVal]): + yield sEntry + "\t" + self.lArcVal[nMorphVal] + + def _morph1 (self, sWord): + "returns morphologies of " + iAddr = 0 + for c in sWord: + if c not in self.dChar: + return [] + iAddr = self._lookupArcNode(self.dChar[c], iAddr) + if iAddr == None: + return [] + if (int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask): + l = [] + nRawArc = 0 + while not (nRawArc & self._lastArcMask): + iEndArcAddr = iAddr + self.nBytesArc + nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') + nArc = nRawArc & self._arcMask + if nArc >= self.nChar: + # This value is not a char, this is a stemming code + sStem = ">" + self.funcStemming(sWord, self.lArcVal[nArc]) + # Now , we go to the next node and retrieve all following arcs values, all of them are tags + iAddr2 = int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big') + nRawArc2 = 0 + while not (nRawArc2 & self._lastArcMask): + iEndArcAddr2 = iAddr2 + self.nBytesArc + nRawArc2 = int.from_bytes(self.byDic[iAddr2:iEndArcAddr2], byteorder='big') + l.append(sStem + " " + self.lArcVal[nRawArc2 & self._arcMask]) + iAddr2 = iEndArcAddr2+self.nBytesNodeAddress + iAddr = iEndArcAddr+self.nBytesNodeAddress + return l + return [] + + def _stem1 (self, sWord): + "returns stems list of " + iAddr = 0 + for c in sWord: + if c not in self.dChar: + return [] + iAddr = self._lookupArcNode(self.dChar[c], iAddr) + if iAddr == None: + return [] + if (int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask): + l = [] + nRawArc = 0 + while not (nRawArc & self._lastArcMask): + iEndArcAddr = iAddr + self.nBytesArc + nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') + nArc = nRawArc & self._arcMask + if nArc >= self.nChar: + # This value is not a char, this is a stemming code + l.append(self.funcStemming(sWord, self.lArcVal[nArc])) + iAddr = 
iEndArcAddr+self.nBytesNodeAddress + return l + return [] + + def _lookupArcNode1 (self, nVal, iAddr): + "looks if is an arc at the node at , if yes, returns address of next node else None" + while True: + iEndArcAddr = iAddr+self.nBytesArc + nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') + if nVal == (nRawArc & self._arcMask): + # the value we are looking for + # we return the address of the next node + return int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big') + else: + # value not found + if (nRawArc & self._lastArcMask): + return None + iAddr = iEndArcAddr+self.nBytesNodeAddress + + def _getArcs1 (self, iAddr): + "generator: return all arcs at as tuples of (nVal, iAddr)" + while True: + iEndArcAddr = iAddr+self.nBytesArc + nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') + yield (nRawArc & self._arcMask, int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big')) + if (nRawArc & self._lastArcMask): + break + iAddr = iEndArcAddr+self.nBytesNodeAddress + + def _writeNodes1 (self, spfDest): + "for debugging only" + print(" > Write binary nodes") + with codecs.open(spfDest, 'w', 'utf-8', newline="\n") as hDst: + iAddr = 0 + hDst.write("i{:_>10} -- #{:_>10}\n".format("0", iAddr)) + while iAddr < len(self.byDic): + iEndArcAddr = iAddr+self.nBytesArc + nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') + nArc = nRawArc & self._arcMask + hDst.write(" {:<20} {:0>16} i{:>10} #{:_>10}\n".format(self.lArcVal[nArc], bin(nRawArc)[2:], "?", \ + int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], \ + byteorder='big'))) + iAddr = iEndArcAddr+self.nBytesNodeAddress + if (nRawArc & self._lastArcMask) and iAddr < len(self.byDic): + hDst.write("\ni{:_>10} -- #{:_>10}\n".format("?", iAddr)) + hDst.close() + + # VERSION 2 + def _morph2 (self, sWord): + "returns morphologies of " + iAddr = 0 + for c in sWord: + if c not in self.dChar: + return [] + iAddr = self._lookupArcNode(self.dChar[c], iAddr) + if iAddr == None: + return [] + if (int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask): + l = [] + nRawArc = 0 + while not (nRawArc & self._lastArcMask): + iEndArcAddr = iAddr + self.nBytesArc + nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') + nArc = nRawArc & self._arcMask + if nArc >= self.nChar: + # This value is not a char, this is a stemming code + sStem = ">" + self.funcStemming(sWord, self.lArcVal[nArc]) + # Now , we go to the next node and retrieve all following arcs values, all of them are tags + if not (nRawArc & self._addrBitMask): + iAddr2 = int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big') + else: + # we go to the end of the node + iAddr2 = iEndArcAddr + while not (nRawArc & self._lastArcMask): + nRawArc = int.from_bytes(self.byDic[iAddr2:iAddr2+self.nBytesArc], byteorder='big') + iAddr2 += self.nBytesArc + self.nBytesNodeAddress + nRawArc2 = 0 + while not (nRawArc2 & self._lastArcMask): + iEndArcAddr2 = iAddr2 + self.nBytesArc + nRawArc2 = int.from_bytes(self.byDic[iAddr2:iEndArcAddr2], byteorder='big') + l.append(sStem + " " + self.lArcVal[nRawArc2 & self._arcMask]) + iAddr2 = iEndArcAddr2+self.nBytesNodeAddress if not (nRawArc2 & self._addrBitMask) else iEndArcAddr2 + iAddr = iEndArcAddr+self.nBytesNodeAddress if not (nRawArc & self._addrBitMask) else iEndArcAddr + return l + return [] + + def _stem2 (self, sWord): + 
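+
+    # The three binary formats store the address of the next node differently:
+    # version 1 always writes a full address after the arc; version 2 can omit
+    # it when the next node immediately follows (see _addrBitMask); version 3
+    # can replace it with a short offset (see nBytesOffset).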
"returns stems list of " + iAddr = 0 + for c in sWord: + if c not in self.dChar: + return [] + iAddr = self._lookupArcNode(self.dChar[c], iAddr) + if iAddr == None: + return [] + if (int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask): + l = [] + nRawArc = 0 + while not (nRawArc & self._lastArcMask): + iEndArcAddr = iAddr + self.nBytesArc + nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') + nArc = nRawArc & self._arcMask + if nArc >= self.nChar: + # This value is not a char, this is a stemming code + l.append(self.funcStemming(sWord, self.lArcVal[nArc])) + # Now , we go to the next node + if not (nRawArc & self._addrBitMask): + iAddr2 = int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big') + else: + # we go to the end of the node + iAddr2 = iEndArcAddr + while not (nRawArc & self._lastArcMask): + nRawArc = int.from_bytes(self.byDic[iAddr2:iAddr2+self.nBytesArc], byteorder='big') + iAddr2 += self.nBytesArc + self.nBytesNodeAddress + iAddr = iEndArcAddr+self.nBytesNodeAddress if not (nRawArc & self._addrBitMask) else iEndArcAddr + return l + return [] + + def _lookupArcNode2 (self, nVal, iAddr): + "looks if is an arc at the node at , if yes, returns address of next node else None" + while True: + iEndArcAddr = iAddr+self.nBytesArc + nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') + if nVal == (nRawArc & self._arcMask): + # the value we are looking for + if not (nRawArc & self._addrBitMask): + # we return the address of the next node + return int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big') + else: + # we go to the end of the node + iAddr = iEndArcAddr + while not (nRawArc & self._lastArcMask): + nRawArc = int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') + iAddr += self.nBytesArc + self.nBytesNodeAddress if not (nRawArc & self._addrBitMask) else self.nBytesArc + return iAddr + else: + # value not found + if (nRawArc & self._lastArcMask): + return None + iAddr = iEndArcAddr+self.nBytesNodeAddress if not (nRawArc & self._addrBitMask) else iEndArcAddr + + def _writeNodes2 (self, spfDest): + "for debugging only" + print(" > Write binary nodes") + with codecs.open(spfDest, 'w', 'utf-8', newline="\n") as hDst: + iAddr = 0 + hDst.write("i{:_>10} -- #{:_>10}\n".format("0", iAddr)) + while iAddr < len(self.byDic): + iEndArcAddr = iAddr+self.nBytesArc + nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') + nArc = nRawArc & self._arcMask + if not (nRawArc & self._addrBitMask): + iNextNodeAddr = int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big') + hDst.write(" {:<20} {:0>16} i{:>10} #{:_>10}\n".format(self.lArcVal[nArc], bin(nRawArc)[2:], "?", iNextNodeAddr)) + iAddr = iEndArcAddr+self.nBytesNodeAddress + else: + hDst.write(" {:<20} {:0>16}\n".format(self.lArcVal[nArc], bin(nRawArc)[2:])) + iAddr = iEndArcAddr + if (nRawArc & self._lastArcMask): + hDst.write("\ni{:_>10} -- #{:_>10}\n".format("?", iAddr)) + hDst.close() + + # VERSION 3 + def _morph3 (self, sWord): + "returns morphologies of " + iAddr = 0 + for c in sWord: + if c not in self.dChar: + return [] + iAddr = self._lookupArcNode(self.dChar[c], iAddr) + if iAddr == None: + return [] + if (int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask): + l = [] + nRawArc = 0 + iAddrNode = iAddr + while not (nRawArc & self._lastArcMask): + iEndArcAddr = iAddr + 
self.nBytesArc + nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') + nArc = nRawArc & self._arcMask + if nArc >= self.nChar: + # This value is not a char, this is a stemming code + sStem = ">" + self.funcStemming(sWord, self.lArcVal[nArc]) + # Now , we go to the next node and retrieve all following arcs values, all of them are tags + if not (nRawArc & self._addrBitMask): + iAddr2 = int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big') + else: + iAddr2 = iAddrNode + int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesOffset], byteorder='big') + nRawArc2 = 0 + while not (nRawArc2 & self._lastArcMask): + iEndArcAddr2 = iAddr2 + self.nBytesArc + nRawArc2 = int.from_bytes(self.byDic[iAddr2:iEndArcAddr2], byteorder='big') + l.append(sStem + " " + self.lArcVal[nRawArc2 & self._arcMask]) + iAddr2 = iEndArcAddr2+self.nBytesNodeAddress if not (nRawArc2 & self._addrBitMask) else iEndArcAddr2+self.nBytesOffset + iAddr = iEndArcAddr+self.nBytesNodeAddress if not (nRawArc & self._addrBitMask) else iEndArcAddr+self.nBytesOffset + return l + return [] + + def _stem3 (self, sWord): + "returns stems list of " + iAddr = 0 + for c in sWord: + if c not in self.dChar: + return [] + iAddr = self._lookupArcNode(self.dChar[c], iAddr) + if iAddr == None: + return [] + if (int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask): + l = [] + nRawArc = 0 + iAddrNode = iAddr + while not (nRawArc & self._lastArcMask): + iEndArcAddr = iAddr + self.nBytesArc + nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') + nArc = nRawArc & self._arcMask + if nArc >= self.nChar: + # This value is not a char, this is a stemming code + l.append(self.funcStemming(sWord, self.lArcVal[nArc])) + iAddr = iEndArcAddr+self.nBytesNodeAddress if not (nRawArc & self._addrBitMask) else iEndArcAddr+self.nBytesOffset + return l + return [] + + def _lookupArcNode3 (self, nVal, iAddr): + "looks if is an arc at the node at , if yes, returns address of next node else None" + iAddrNode = iAddr + while True: + iEndArcAddr = iAddr+self.nBytesArc + nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') + if nVal == (nRawArc & self._arcMask): + # the value we are looking for + if not (nRawArc & self._addrBitMask): + return int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big') + else: + return iAddrNode + int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesOffset], byteorder='big') + else: + # value not found + if (nRawArc & self._lastArcMask): + return None + iAddr = iEndArcAddr+self.nBytesNodeAddress if not (nRawArc & self._addrBitMask) else iEndArcAddr+self.nBytesOffset + + def _writeNodes3 (self, spfDest): + "for debugging only" + print(" > Write binary nodes") + with codecs.open(spfDest, 'w', 'utf-8', newline="\n") as hDst: + iAddr = 0 + hDst.write("i{:_>10} -- #{:_>10}\n".format("0", iAddr)) + while iAddr < len(self.byDic): + iEndArcAddr = iAddr+self.nBytesArc + nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') + nArc = nRawArc & self._arcMask + if not (nRawArc & self._addrBitMask): + iNextNodeAddr = int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big') + hDst.write(" {:<20} {:0>16} i{:>10} #{:_>10}\n".format(self.lArcVal[nArc], bin(nRawArc)[2:], "?", iNextNodeAddr)) + iAddr = iEndArcAddr+self.nBytesNodeAddress + else: + iNextNodeAddr = 
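+
+    # NB: a version 3 offset is relative to <iAddrNode>, the address where the
+    # current node begins, not to the arc being read.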
+    def _writeNodes3 (self, spfDest):
+        "for debugging only"
+        print(" > Write binary nodes")
+        with codecs.open(spfDest, 'w', 'utf-8', newline="\n") as hDst:
+            iAddr = 0
+            hDst.write("i{:_>10} -- #{:_>10}\n".format("0", iAddr))
+            while iAddr < len(self.byDic):
+                iEndArcAddr = iAddr+self.nBytesArc
+                nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big')
+                nArc = nRawArc & self._arcMask
+                if not (nRawArc & self._addrBitMask):
+                    iNextNodeAddr = int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big')
+                    hDst.write(" {:<20} {:0>16} i{:>10} #{:_>10}\n".format(self.lArcVal[nArc], bin(nRawArc)[2:], "?", iNextNodeAddr))
+                    iAddr = iEndArcAddr+self.nBytesNodeAddress
+                else:
+                    iNextNodeAddr = int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesOffset], byteorder='big')
+                    hDst.write(" {:<20} {:0>16} i{:>10} +{:_>10}\n".format(self.lArcVal[nArc], bin(nRawArc)[2:], "?", iNextNodeAddr))
+                    iAddr = iEndArcAddr+self.nBytesOffset
+                if (nRawArc & self._lastArcMask):
+                    hDst.write("\ni{:_>10} -- #{:_>10}\n".format("?", iAddr))

ADDED graphspell/keyboard_chars_proximity.py
Index: graphspell/keyboard_chars_proximity.py
==================================================================
--- graphspell/keyboard_chars_proximity.py
+++ graphspell/keyboard_chars_proximity.py
@@ -0,0 +1,220 @@
+# Keyboard chars proximity
+
+
+def getKeyboardMap (sKeyboard):
+    "returns the proximity dictionary for <sKeyboard> (an empty dict if unknown)"
+    return _dKeyboardMap.get(sKeyboard.lower(), {})
+
+
+def getKeyboardList ():
+    "returns the names of the known keyboard layouts"
+    return _dKeyboardMap.keys()
+
+
+_dKeyboardMap = {
+    # keyboards in alphabetical order
+    # bépo, colemak and dvorak users are assumed to make fewer typing errors.
+    "azerty": {
+        # fr
+        # line 1
+        "é": "az",
+        "è": "yu",
+        "ç": "àio",
+        "à": "op",
+        # line 2
+        "a": "zéq",
+        "z": "aesq",
+        "e": "zrds",
+        "r": "etfd",
+        "t": "rygf",
+        "y": "tuhg",
+        "u": "yijh",
+        "i": "uokj",
+        "o": "iplk",
+        "p": "oml",
+        # line 3
+        "q": "sawz",
+        "s": "qdzwxe",
+        "d": "sfexcr",
+        "f": "dgrcvt",
+        "g": "fhtvby",
+        "h": "gjybnu",
+        "j": "hkuni",
+        "k": "jlio",
+        "l": "kmop",
+        "m": "lùp",
+        "ù": "m",
+        # line 4
+        "w": "xqs",
+        "x": "wcsd",
+        "c": "xvdf",
+        "v": "cbfg",
+        "b": "vngh",
+        "n": "bhj",
+    },
+    "bépo": {
+        # fr
+        # line 2
+        "b": "éa",
+        "é": "bpu",
+        "p": "éoi",
+        "o": "pèe",
+        "è": "o",
+        "v": "dt",
+        "d": "vls",
+        "l": "djr",
+        "j": "lzn",
+        "z": "jmw",
+        # line 3
+        "a": "ubà",
+        "u": "aiéy",
+        "i": "uepx",
+        "e": "io",
+        "c": "t",
+        "t": "csvq",
+        "s": "trdg",
+        "r": "snlh",
+        "n": "rmjf",
+        "m": "nzç",
+        # line 4
+        "à": "yêa",
+        "y": "àxu",
+        "x": "ywi",
+        "w": "z",
+        "k": "c",
+        "q": "gt",
+        "g": "qhs",
+        "h": "gfr",
+        "f": "hçn",
+        "ç": "fm",
+    },
+    "colemak": {
+        # en, us, intl
+        # line 2
+        "q": "wa",
+        "w": "qfr",
+        "f": "wps",
+        "p": "fgt",
+        "g": "pjd",
+        "j": "glh",
+        "l": "jun",
+        "u": "lye",
+        "y": "ui",
+        # line 3
+        "a": "rqz",
+        "r": "aswx",
+        "s": "rtfc",
+        "t": "sdpv",
+        "d": "thgb",
+        "h": "dnjk",
+        "n": "helm",
+        "e": "niu",
+        "i": "eoy",
+        "o": "i",
+        # line 4
+        "z": "xa",
+        "x": "zcr",
+        "c": "xvs",
+        "v": "cbt",
+        "b": "vkd",
+        "k": "bmh",
+        "m": "kn",
+    },
+    "dvorak": {
+        # en, us, intl
+        # line 2
+        "p": "yu",
+        "y": "pfi",
+        "f": "ygd",
+        "g": "fch",
+        "c": "grt",
+        "r": "cln",
+        "l": "rs",
+        # line 3
+        "a": "o",
+        "o": "aeq",
+        "e": "ouj",
+        "u": "eipk",
+        "i": "udyx",
+        "d": "ihfb",
+        "h": "dtgm",
+        "t": "hncw",
+        "n": "tsrv",
+        "s": "nlz",
+        # line 4
+        "q": "jo",
+        "j": "qke",
+        "k": "jxu",
+        "x": "kbi",
+        "b": "xmd",
+        "m": "bwh",
+        "w": "mvt",
+        "v": "wzn",
+        "z": "vs",
+    },
+    "qwerty": {
+        # en, us, intl
+        # line 2
+        "q": "wa",
+        "w": "qeas",
+        "e": "wrds",
+        "r": "etfd",
+        "t": "rygf",
+        "y": "tuhg",
+        "u": "yijh",
+        "i": "uokj",
+        "o": "iplk",
+        "p": "ol",
+        # line 3
+        "a": "sqzw",
+        "s": "adwzxe",
+        "d": "sfexcr",
+        "f": "dgrcvt",
+        "g": "fhtvby",
+        "h": "gjybnu",
+        "j": "hkunmi",
+        "k": "jlimo",
+        "l": "kop",
+        # line 4
+        "z": "xas",
+        "x": "zcsd",
+        "c": "xvdf",
+        "v": "cbfg",
+        "b": "vngh",
+        "n": "bmhj",
+        "m": "njk",
+    },
+    "qwertz": {
+        # ge, au
+        # line 2
+        "q": "wa",
+        "w": "qeas",
+        "e": "wrds",
+        "r": "etfd",
+        "t": "rzgf",
+        "z": "tuhg",
+        "u": "zijh",
+        "i": "uokj",
+        "o": "iplk",
+        "p": "oüöl",
+        "ü": "päö",
+        # line 3
+        "a": "sqyw",
+        "s": "adwyxe",
+        "d": "sfexcr",
+        "f": "dgrcvt",
+        "g": "fhtvbz",
+        "h": "gjzbnu",
+        "j": "hkunmi",
+        "k": "jlimo",
+        "l": "köop",
+        "ö": "läpü",
+        "ä": "öü",
+        # line 4
+        "y": "xas",
+        "x": "ycsd",
+        "c": "xvdf",
+        "v": "cbfg",
+        "b": "vngh",
+        "n": "bmhj",
+        "m": "njk",
+    }
+}
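A sketch of how this proximity map is meant to be consumed by suggestion code (illustrative only, not part of this commit):

    from graphspell.keyboard_chars_proximity import getKeyboardMap

    dProx = getKeyboardMap("azerty")
    print(dProx.get("m", ""))    # -> "lùp": keys a finger may hit instead of "m"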
"gjzbnu", + "j": "hkunmi", + "k": "jlimo", + "l": "köop", + "ö": "läpü", + "ä": "öü", + # line 4 + "y": "xas", + "x": "ycsd", + "c": "xvdf", + "v": "cbfg", + "b": "vngh", + "n": "bmhj", + "m": "njk", + } +} ADDED graphspell/progressbar.py Index: graphspell/progressbar.py ================================================================== --- graphspell/progressbar.py +++ graphspell/progressbar.py @@ -0,0 +1,35 @@ +# Textual progressbar +# by Olivier R. +# License: MPL 2 + +import time + +class ProgressBar: + "Textual progressbar" + + def __init__ (self, nMin=0, nMax=100, nWidth=78): + "initiate with minimum nMin to maximum nMax" + self.nMin = nMin + self.nMax = nMax + self.nSpan = nMax - nMin + self.nWidth = nWidth-9 + self.nAdvance = -1 + self.nCurVal = nMin + self.startTime = time.time() + self._update() + + def _update (self): + fDone = ((self.nCurVal - self.nMin) / self.nSpan) + nAdvance = int(fDone * self.nWidth) + if (nAdvance > self.nAdvance): + self.nAdvance = nAdvance + print("\r[ {}{} {}% ] ".format('>'*nAdvance, ' '*(self.nWidth-nAdvance), round(fDone*100)), end="") + + def increment (self, n=1): + "increment value by n (1 by default)" + self.nCurVal += n + self._update() + + def done (self): + "to call when it’s finished" + print("\r[ task done in {:.1f} s ] ".format(time.time() - self.startTime)) ADDED graphspell/spellchecker.py Index: graphspell/spellchecker.py ================================================================== --- graphspell/spellchecker.py +++ graphspell/spellchecker.py @@ -0,0 +1,134 @@ +# Spellchecker +# Wrapper for the IBDAWG class. +# Useful to check several dictionaries at once. + +from . import ibdawg + + +dDictionaries = { + "fr": "French.bdic", + "en": "English.bdic" +} + + +class Spellchecker (): + + def __init__ (self, sLangCode): + self.sLangCode = sLangCode + self.oMainDic = None + if sLangCode in dDictionaries: + self.oMainDic = ibdawg.IBDAWG(dDictionaries[sLangCode]) + self.lOtherDic = [] + return bool(self.oMainDic) + + + def setMainDictionary (self, sDicName): + try: + self.oMainDic = ibdawg.IBDAWG(sDicName) + return True + except: + print("Error: <" + sDicName + "> not set as main dictionary.") + return False + + def addDictionary (self, sDicName): + try: + self.lOtherDic.append(ibdawg.IBDAWG(sDicName)) + return True + except: + print("Error: <" + sDicName + "> not added to the list.") + return False + + # Return codes: + # 0: invalid + # 1: correct in main dictionary + # 2+: correct in foreign dictionaries + + + # check in the main dictionary only + + def isValidToken (self, sToken): + "(in main dictionary) checks if sToken is valid (if there is hyphens in sToken, sToken is split, each part is checked)" + if self.oMainDic.isValidToken(sToken): + return 1 + return 0 + + def isValid (self, sWord): + "(in main dictionary) checks if sWord is valid (different casing tested if the first letter is a capital)" + if self.oMainDic.isValid(sWord): + return 1 + return 0 + + def lookup (self, sWord): + "(in main dictionary) checks if sWord is in dictionary as is (strict verification)" + if self.oMainDic.lookup(sWord): + return 1 + return 0 + + + # check in all dictionaries + + def isValidTokenAll (self, sToken): + "(in all dictionaries) checks if sToken is valid (if there is hyphens in sToken, sToken is split, each part is checked)" + if self.oMainDic.isValidToken(sToken): + return 1 + for i, oDic in enumerate(self.lOtherDic, 2): + if oDic.isValidToken(sToken): + return i + return 0 + + def isValidAll (self, sWord): + "(in all dictionaries) checks if 
sWord is valid (different casing tested if the first letter is a capital)" + if self.oMainDic.isValid(sToken): + return 1 + for i, oDic in enumerate(self.lOtherDic, 2): + if oDic.isValid(sToken): + return i + return 0 + + def lookupAll (self, sWord): + "(in all dictionaries) checks if sWord is in dictionary as is (strict verification)" + if self.oMainDic.lookup(sToken): + return 1 + for i, oDic in enumerate(self.lOtherDic, 2): + if oDic.lookup(sToken): + return i + return 0 + + + # check in dictionaries up to level n + + def isValidTokenLevel (self, sToken, nLevel): + "(in dictionaries up to level n) checks if sToken is valid (if there is hyphens in sToken, sToken is split, each part is checked)" + if self.oMainDic.isValidToken(sToken): + return 1 + if nLevel >= 2: + for i, oDic in enumerate(self.lOtherDic, 2): + if oDic.isValidToken(sToken): + return i + if i == nLevel: + break + return 0 + + def isValidLevel (self, sWord, nLevel): + "(in dictionaries up to level n) checks if sWord is valid (different casing tested if the first letter is a capital)" + if self.oMainDic.isValid(sToken): + return 1 + if nLevel >= 2: + for i, oDic in enumerate(self.lOtherDic, 2): + if oDic.isValid(sToken): + return i + if i == nLevel: + break + return 0 + + def lookupLevel (self, sWord, nLevel): + "(in dictionaries up to level n) checks if sWord is in dictionary as is (strict verification)" + if self.oMainDic.lookup(sToken): + return 1 + if nLevel >= 2: + for i, oDic in enumerate(self.lOtherDic, 2): + if oDic.lookup(sToken): + return i + if i == nLevel: + break + return 0 ADDED graphspell/str_transform.py Index: graphspell/str_transform.py ================================================================== --- graphspell/str_transform.py +++ graphspell/str_transform.py @@ -0,0 +1,203 @@ +#!python3 + + +#### DISTANCE CALCULATIONS + +def longestCommonSubstring (s1, s2): + # http://en.wikipedia.org/wiki/Longest_common_substring_problem + # http://en.wikibooks.org/wiki/Algorithm_implementation/Strings/Longest_common_substring + M = [ [0]*(1+len(s2)) for i in range(1+len(s1)) ] + longest, x_longest = 0, 0 + for x in range(1, 1+len(s1)): + for y in range(1, 1+len(s2)): + if s1[x-1] == s2[y-1]: + M[x][y] = M[x-1][y-1] + 1 + if M[x][y] > longest: + longest = M[x][y] + x_longest = x + else: + M[x][y] = 0 + return s1[x_longest-longest : x_longest] + + +def distanceDamerauLevenshtein (s1, s2): + "distance of Damerau-Levenshtein between and " + # https://fr.wikipedia.org/wiki/Distance_de_Damerau-Levenshtein + d = {} + nLen1 = len(s1) + nLen2 = len(s2) + for i in range(-1, nLen1+1): + d[i, -1] = i + 1 + for j in range(-1, nLen2+1): + d[-1, j] = j + 1 + for i in range(nLen1): + for j in range(nLen2): + nCost = 0 if s1[i] == s2[j] else 1 + d[i, j] = min( + d[i-1, j] + 1, # Deletion + d[i, j-1] + 1, # Insertion + d[i-1, j-1] + nCost, # Substitution + ) + if i and j and s1[i] == s2[j-1] and s1[i-1] == s2[j]: + d[i, j] = min(d[i, j], d[i-2, j-2] + nCost) # Transposition + return d[nLen1-1, nLen2-1] + + +def distanceSift4 (s1, s2, nMaxOffset=5): + "implementation of general Sift4." 
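+    # Sift4 walks both strings in a single pass with one cursor per string,
+    # summing the lengths of common substrings and counting transpositions;
+    # the result approximates edit distance in roughly linear time.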
+    # https://siderite.blogspot.com/2014/11/super-fast-and-accurate-string-distance.html
+    if not s1:
+        return len(s2)
+    if not s2:
+        return len(s1)
+    nLen1, nLen2 = len(s1), len(s2)
+    i1, i2 = 0, 0   # Cursors for each string
+    nLargestCS = 0  # Largest common substring
+    nLocalCS = 0    # Local common substring
+    nTrans = 0      # Number of transpositions ('ab' vs 'ba')
+    lOffset = []    # Offset pair array, for computing the transpositions
+
+    while i1 < nLen1 and i2 < nLen2:
+        if s1[i1] == s2[i2]:
+            nLocalCS += 1
+            # Check if current match is a transposition
+            bTrans = False
+            i = 0
+            while i < len(lOffset):
+                t = lOffset[i]
+                if i1 <= t[0] or i2 <= t[1]:
+                    bTrans = abs(i2-i1) >= abs(t[1] - t[0])
+                    if bTrans:
+                        nTrans += 1
+                    elif not t[2]:
+                        t[2] = True
+                        nTrans += 1
+                    break
+                elif i1 > t[1] and i2 > t[0]:
+                    del lOffset[i]
+                else:
+                    i += 1
+            lOffset.append([i1, i2, bTrans])
+        else:
+            nLargestCS += nLocalCS
+            nLocalCS = 0
+            if i1 != i2:
+                i1 = i2 = min(i1, i2)
+            for i in range(nMaxOffset):
+                if i1 + i >= nLen1 and i2 + i >= nLen2:
+                    break
+                elif i1 + i < nLen1 and s1[i1+i] == s2[i2]:
+                    i1 += i - 1
+                    i2 -= 1
+                    break
+                elif i2 + i < nLen2 and s1[i1] == s2[i2+i]:
+                    i2 += i - 1
+                    i1 -= 1
+                    break
+        i1 += 1
+        i2 += 1
+        if i1 >= nLen1 or i2 >= nLen2:
+            nLargestCS += nLocalCS
+            nLocalCS = 0
+            i1 = i2 = min(i1, i2)
+    nLargestCS += nLocalCS
+    return round(max(nLen1, nLen2) - nLargestCS + nTrans)
+
+
+def showDistance (s1, s2):
+    print("Damerau-Levenshtein: " + s1 + "/" + s2 + " = " + str(distanceDamerauLevenshtein(s1, s2)))
+    print("Sift4: " + s1 + "/" + s2 + " = " + str(distanceSift4(s1, s2)))
+
+
+
+
+#### STEMMING OPERATIONS
+
+## No stemming
+
+def noStemming (sFlex, sStem):
+    return sStem
+
+def rebuildWord (sFlex, cmd1, cmd2):
+    if cmd1 == "_":
+        return sFlex
+    n, c = cmd1.split(":")
+    s = sFlex[:int(n)] + c + sFlex[int(n):]     # the position arrives as a string
+    if cmd2 == "_":
+        return s
+    n, c = cmd2.split(":")
+    return s[:int(n)] + c + s[int(n):]
+
+
+## Define affixes for stemming
+
+# Note: 48 is the ASCII code for "0"
+
+
+# Suffix only
+def defineSuffixCode (sFlex, sStem):
+    """ Returns a string defining how to get stem from flexion
+            "n(sfx)"
+        with n: a char with numeric meaning, "0" = 0, "1" = 1, ... ":" = 10, etc. (See ASCII table.) Says how many letters to strip from flexion.
+             sfx [optional]: string to add on flexion
+        Examples:
+            "0": strips nothing, adds nothing
+            "1er": strips 1 letter, adds "er"
+            "2": strips 2 letters, adds nothing
+    """
+    if sFlex == sStem:
+        return "0"
+    jSfx = 0
+    for i in range(min(len(sFlex), len(sStem))):
+        if sFlex[i] != sStem[i]:
+            break
+        jSfx += 1
+    return chr(len(sFlex)-jSfx+48) + sStem[jSfx:]
+
+
+def changeWordWithSuffixCode (sWord, sSfxCode):
+    if sSfxCode == "0":
+        return sWord
+    return sWord[:-(ord(sSfxCode[0])-48)] + sSfxCode[1:]  if sSfxCode[0] != '0'  else sWord + sSfxCode[1:]
+
+
+# Prefix and suffix
+
+def defineAffixCode (sFlex, sStem):
+    """ Returns a string defining how to get stem from flexion. Examples:
+            "0" if stem = flexion
+            "stem" if no common substring
+            "n(pfx)/m(sfx)"
+        with n and m: chars with numeric meaning, "0" = 0, "1" = 1, ... ":" = 10, etc. (See ASCII table.) Says how many letters to strip from flexion.
+             pfx [optional]: string to add before the flexion
+             sfx [optional]: string to add after the flexion
+    """
+    if sFlex == sStem:
+        return "0"
+    # is stem a substring of flexion?
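+    # e.g. sFlex="désirer", sStem="désir": the stem starts at index 0 and the
+    # flexion has 2 trailing chars left over, hence the code "0/2"
+    # (chr(0+48) == "0", chr(2+48) == "2")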
+    n = sFlex.find(sStem)
+    if n >= 0:
+        return "{}/{}".format(chr(n+48), chr(len(sFlex)-(len(sStem)+n)+48))
+    # no, so we are looking for the longest common substring
+    sSubs = longestCommonSubstring(sFlex, sStem)
+    if len(sSubs) > 1:
+        iPos = sStem.find(sSubs)
+        sPfx = sStem[:iPos]
+        sSfx = sStem[iPos+len(sSubs):]
+        n = sFlex.find(sSubs)
+        m = len(sFlex) - (len(sSubs)+n)
+        sAff = "{}/".format(chr(n+48))  if not sPfx  else "{}{}/".format(chr(n+48), sPfx)
+        sAff += chr(m+48)  if not sSfx  else "{}{}".format(chr(m+48), sSfx)
+        return sAff
+    return sStem
+
+
+def changeWordWithAffixCode (sWord, sAffCode):
+    if sAffCode == "0":
+        return sWord
+    if '/' not in sAffCode:
+        return "# error #"
+    sPfxCode, sSfxCode = sAffCode.split('/')
+    sWord = sPfxCode[1:] + sWord[(ord(sPfxCode[0])-48):]
+    return sWord[:-(ord(sSfxCode[0])-48)] + sSfxCode[1:]  if sSfxCode[0] != '0'  else sWord + sSfxCode[1:]

ADDED graphspell/tokenizer.py
Index: graphspell/tokenizer.py
==================================================================
--- graphspell/tokenizer.py
+++ graphspell/tokenizer.py
@@ -0,0 +1,49 @@
+# Very simple tokenizer
+
+import re
+
+_PATTERNS = {
+    "default":
+        (
+            r'(?P<FOLDERUNIX>/(?:bin|boot|dev|etc|home|lib|mnt|opt|root|sbin|tmp|usr|var|Bureau|Documents|Images|Musique|Public|Téléchargements|Vidéos)(?:/[\w.()-]+)*)',
+            r'(?P<FOLDERWIN>[a-zA-Z]:\\(?:Program Files(?: [(]x86[)]|)|[\w.()]+)(?:\\[\w.()-]+)*)',
+            r'(?P<PUNC>[.,?!:;…«»“”"()/·]+)',
+            r'(?P<ACRONYM>[A-Z][.][A-Z][.](?:[A-Z][.])*)',
+            r'(?P<LINK>(?:https?://|www[.]|\w+[@.]\w\w+[@.])\w[\w./?&!%=+*"\'@$#-]+)',
+            r'(?P<HASHTAG>[#@][\w-]+)',
+            r'(?P<HTML><\w+.*?>|</\w+ *>)',
+            r'(?P<PSEUDOHTML>\[/?\w+\])',
+            r'(?P<HOUR>\d\d?h\d\d\b)',
+            r'(?P<NUM>-?\d+(?:[.,]\d+))',
+            r"(?P<WORD>\w+(?:[’'`-]\w+)*)"
+        ),
+    "fr":
+        (
+            r'(?P<FOLDERUNIX>/(?:bin|boot|dev|etc|home|lib|mnt|opt|root|sbin|tmp|usr|var|Bureau|Documents|Images|Musique|Public|Téléchargements|Vidéos)(?:/[\w.()-]+)*)',
+            r'(?P<FOLDERWIN>[a-zA-Z]:\\(?:Program Files(?: [(]x86[)]|)|[\w.()]+)(?:\\[\w.()-]+)*)',
+            r'(?P<PUNC>[.,?!:;…«»“”"()/·]+)',
+            r'(?P<ACRONYM>[A-Z][.][A-Z][.](?:[A-Z][.])*)',
+            r'(?P<LINK>(?:https?://|www[.]|\w+[@.]\w\w+[@.])\w[\w./?&!%=+*"\'@$#-]+)',
+            r'(?P<HASHTAG>[#@][\w-]+)',
+            r'(?P<HTML><\w+.*?>|</\w+ *>)',
+            r'(?P<PSEUDOHTML>\[/?\w+\])',
+            r"(?P<ELPFX>(?:l|d|n|m|t|s|j|c|ç|lorsqu|puisqu|jusqu|quoiqu|qu)['’`])",
+            r'(?P<ORDINAL>\d+(?:er|nd|e|de|ième|ème|eme)\b)',
+            r'(?P<HOUR>\d\d?h\d\d\b)',
+            r'(?P<NUM>-?\d+(?:[.,]\d+|))',
+            r"(?P<WORD>\w+(?:[’'`-]\w+)*)"
+        )
+}
+
+
+class Tokenizer:
+
+    def __init__ (self, sLang):
+        self.sLang = sLang
+        if sLang not in _PATTERNS:
+            self.sLang = "default"
+        self.zToken = re.compile( "(?i)" + '|'.join(sRegex  for sRegex in _PATTERNS[self.sLang]) )
+
+    def genTokens (self, sText):
+        for m in self.zToken.finditer(sText):
+            yield { "sType": m.lastgroup, "sValue": m.group(), "nStart": m.start(), "nEnd": m.end() }
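
For orientation, a minimal sketch of how the modules added above fit together. It is illustrative only, not part of the commit, and assumes a built dictionary French.bdic installed under graphspell/_dictionaries/ (as declared in spellchecker.py) and the WORD token type of the tokenizer:

    from graphspell.ibdawg import IBDAWG
    from graphspell.tokenizer import Tokenizer

    oDic = IBDAWG("French.bdic")    # raises OSError if the dictionary file is absent
    oTok = Tokenizer("fr")
    for dToken in oTok.genTokens("Quelle maizon magnifique !"):
        if dToken["sType"] == "WORD" and not oDic.isValidToken(dToken["sValue"]):
            print(dToken["sValue"], "->", oDic.suggest(dToken["sValue"]))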