Index: cli.py ================================================================== --- cli.py +++ cli.py @@ -205,13 +205,11 @@ for sMorph in oDict.getMorph(sWord): echo(" {:<32} {}".format(sMorph, oLexGraphe.formatTags(sMorph))) elif sText.startswith("!"): for sWord in sText[1:].strip().split(): if sWord: - echo("* suggestions for: " + sWord) - for sSugg in oDict.suggest(sWord): - echo(" > " + sSugg) + echo(" | ".join(oDict.suggest(sWord))) elif sText.startswith("/+ "): gce.setOptions({ opt:True for opt in sText[3:].strip().split() if opt in gce.getOptions() }) echo("done") elif sText.startswith("/- "): gce.setOptions({ opt:False for opt in sText[3:].strip().split() if opt in gce.getOptions() }) Index: gc_core/py/char_player.py ================================================================== --- gc_core/py/char_player.py +++ gc_core/py/char_player.py @@ -1,42 +1,91 @@ # list of similar chars # useful for suggestion mechanism -dSimilarChar = { - "1": "l", - - "a": "aàâáä", - "à": "aàâáä", - "â": "aàâáä", - "á": "aàâáä", - "ä": "aàâáä", - - "c": "cçsśŝ", - "ç": "cçsśŝ", - - "e": "eéêèë", - "é": "eéêèë", - "ê": "eéêèë", - "è": "eéêèë", - "ë": "eéêèë", - - "g": "j", - "j": "g", - - "i": "iîïyíìÿ", - "î": "iîïyíìÿ", - "ï": "iîïyíìÿ", - "í": "iîïyíìÿ", - "ì": "iîïyíìÿ", - - "n": "nñńǹ", - - "o": "oôóòö", - "ô": "oôóòö", - "ó": "oôóòö", - "ò": "oôóòö", - "ö": "oôóòö", + +# Method: Remove Useless Chars + +_dUselessChar = { + 'a': '', 'e': '', 'i': '', 'o': '', 'u': '', 'y': '', + 'à': '', 'é': '', 'î': '', 'ô': '', 'û': '', 'ÿ': '', + 'â': '', 'è': '', 'ï': '', 'ö': '', 'ù': '', 'ŷ': '', + 'ä': '', 'ê': '', 'í': '', 'ó': '', 'ü': '', 'ý': '', + 'á': '', 'ë': '', 'ì': '', 'ò': '', 'ú': '', 'ỳ': '', + 'ā': '', 'ē': '', 'ī': '', 'ō': '', 'ū': '', 'ȳ': '', + 'h': '', 'œ': '', 'æ': '' + } + +_CHARMAP = str.maketrans(_dUselessChar) + +aUselessChar = frozenset(_dUselessChar.keys()) + +def clearWord (sWord): + "remove vovels and h" + return sWord.translate(_CHARMAP) + + +# Similar chars + +d1to1 = { + "1": "li", + "2": "e", + "3": "e", + "4": "aà", + "5": "ge", + "6": "bd", + "7": "lt", + "8": "b", + "9": "gbd", + + "a": "aàâáäæ", + "à": "aàâáäæ", + "â": "aàâáäæ", + "á": "aàâáäæ", + "ä": "aàâáäæ", + + "æ": "éa", + + "c": "cçskqśŝ", + "ç": "cçskqśŝ", + + "e": "eéèêëœ", + "é": "eéèêëœ", + "ê": "eéèêëœ", + "è": "eéèêëœ", + "ë": "eéèêëœ", + + "f": "fv", + + "g": "gjq", + + "i": "iîïylíìÿ", + "î": "iîïylíìÿ", + "ï": "iîïylíìÿ", + "í": "iîïylíìÿ", + "ì": "iîïylíìÿ", + + "j": "jg", + + "k": "kcq", + + "l": "li", + + "n": "nñr", + + "o": "oôóòöœ", + "ô": "oôóòöœ", + "ó": "oôóòöœ", + "ò": "oôóòöœ", + "ö": "oôóòöœ", + + "œ": "œoôeéèêë", + + "p": "pb", + + "q": "ckg", + + "r": "rn", "s": "sśŝcç", "ś": "sśŝcç", "ŝ": "sśŝcç", @@ -43,12 +92,171 @@ "u": "uûùüú", "û": "uûùüú", "ù": "uûùüú", "ü": "uûùüú", "ú": "uûùüú", + + "v": "vwf", + + "w": "wv", + + "x": "xck", "y": "yÿiîŷýỳ", "ÿ": "yÿiîŷýỳ", "ŷ": "yÿiîŷýỳ", "ý": "yÿiîŷýỳ", "ỳ": "yÿiîŷýỳ", + + "z": "zs", +} + +d1toX = { + "æ": ("ae",), + "b": ("bb",), + "c": ("cc", "ss", "qu", "ch"), + "ç": ("ss", "cc", "qh", "ch"), + "d": ("dd",), + "f": ("ff", "ph"), + "g": ("gu", "ge", "gg", "gh"), + "i": ("ii",), + "j": ("jj", "dj"), + "k": ("qu", "ck", "ch", "cu", "kk", "kh"), + "l": ("ll",), + "m": ("mm", "mn"), + "n": ("nn", "nm", "mn"), + "o": ("au", "eau", "aut"), + "œ": ("oe", "eu"), + "p": ("pp", "ph"), + "q": ("qu", "ch", "cq", "ck", "kk"), + "r": ("rr",), + "s": ("ss", "sh"), + "t": ("tt", "th"), + "x": ("cc", "ct", "xx"), + "z": ("ss", "zh") +} + +d2toX = { + "an": ("en",), + "en": ("an",), + "ai": ("ei", "é", "è", "ê", "ë"), + "ei": ("ai", "ait", "ais", "é", "è", "ê", "ë"), + "ch": ("sh", "c", "ss"), + "ct": ("x", "cc"), + "oa": ("oi",), + "oi": ("oa", "oie"), + "qu": ("q", "cq", "ck", "c", "k"), +} + + +# End of word + +dFinal1 = { + "a": ("as", "at", "ant"), + + "e": ("et", "er", "ets", "ée", "ez", "ai", "ais", "ait", "ent"), + "é": ("et", "er", "ets", "ée", "ez", "ai", "ais", "ait"), + "è": ("et", "er", "ets", "ée", "ez", "ai", "ais", "ait"), + "ê": ("et", "er", "ets", "ée", "ez", "ai", "ais", "ait"), + "ë": ("et", "er", "ets", "ée", "ez", "ai", "ais", "ait"), + + "i": ("is", "it", "ie", "in"), + + "n": ("nt", "nd", "ns"), + + "o": ("aut", "ot", "os"), + "ô": ("aut", "ot", "os"), + "ö": ("aut", "ot", "os"), + + "u": ("ut", "us"), +} + +dFinal2 = { + "an": ("ant", "ent"), + "en": ("ent", "ant"), + "ei": ("ait", "ais"), + "on": ("ons", "ont"), + "oi": ("ois", "oit", "oix"), +} + + +# Préfixes + +aPfx = ("anti", "contre", "mé", "im", "in", "ir", "par", "pré", "re", "ré", "sans", "sous", "sur") + + +# Keyboards + +dBépo = { + # on présume que le bépoète est moins susceptible de faire des erreurs de frappe que l’azertyste. + # ligne 2 + "b": "éa", + "é": "bpu", + "p": "éoi", + "o": "pèe", + "è": "o", + "v": "dt", + "d": "vls", + "l": "djr", + "j": "lzn", + "z": "jmw", + # ligne 3 + "a": "ubà", + "u": "aiéy", + "i": "uepx", + "e": "io", + "c": "t", + "t": "csvq", + "s": "trdg", + "r": "snlh", + "n": "rmjf", + "m": "nzç", + # ligne 4 + "à": "yêa", + "y": "àxu", + "x": "ywi", + "w": "z", + "k": "c", + "q": "gt", + "g": "qhs", + "h": "gfr", + "f": "hçn", + "ç": "fm", +} + +dAzerty = { + # ligne 1 + "é": "az", + "è": "yu", + "ç": "àio", + "à": "op", + # ligne 2 + "a": "zéqs", + "z": "aesqd", + "e": "zrdsf", + "r": "etfdg", + "t": "rygfh", + "y": "tuhgj", + "u": "yijhk", + "i": "uokjl", + "o": "iplkm", + "p": "oml", + # ligne 3 + "q": "sawz", + "s": "qdzwxe", + "d": "sfexcr", + "f": "dgrcvt", + "g": "fhtvby", + "h": "gjybnu", + "j": "hkuni", + "k": "jlio", + "l": "kmop", + "m": "lùp", + "ù": "m", + # ligne 4 + "w": "xqs", + "x": "wcsd", + "c": "xvdf", + "v": "cbfg", + "b": "vngh", + "n": "bhj", } Index: gc_core/py/ibdawg.py ================================================================== --- gc_core/py/ibdawg.py +++ gc_core/py/ibdawg.py @@ -2,15 +2,20 @@ # -*- coding: UTF-8 -*- import os import traceback import pkgutil +from itertools import chain from . import str_transform as st from . import char_player as cp from .echo import echo + +def show (nDeep, sText): + print(nDeep * " " + sText) + class IBDAWG: """INDEXABLE BINARY DIRECT ACYCLIC WORD GRAPH""" def __init__ (self, sDicName): @@ -49,13 +54,15 @@ elif self.cStemming == "A": self.funcStemming = st.changeWordWithAffixCode else: self.funcStemming = st.noStemming self.nTag = self.nArcVal - self.nChar - self.nAff + # to get the value of an arc, to get the char of an arc with its value self.dChar = {} for i in range(1, self.nChar): self.dChar[self.lArcVal[i]] = i + self.dVal = { v: k for k, v in self.dChar.items() } self._arcMask = (2 ** ((self.nBytesArc * 8) - 3)) - 1 self._finalNodeMask = 1 << ((self.nBytesArc * 8) - 1) self._lastArcMask = 1 << ((self.nBytesArc * 8) - 2) self._addrBitMask = 1 << ((self.nBytesArc * 8) - 3) # version 2 @@ -65,19 +72,22 @@ # Configuring DAWG functions according to nVersion if self.nVersion == 1: self.morph = self._morph1 self.stem = self._stem1 self._lookupArcNode = self._lookupArcNode1 + self._getArcs = self._getArcs1 self._writeNodes = self._writeNodes1 elif self.nVersion == 2: self.morph = self._morph2 self.stem = self._stem2 self._lookupArcNode = self._lookupArcNode2 + self._getArcs = self._getArcs2 self._writeNodes = self._writeNodes2 elif self.nVersion == 3: self.morph = self._morph3 self.stem = self._stem3 + self._getArcs = self._getArcs3 self._lookupArcNode = self._lookupArcNode3 self._writeNodes = self._writeNodes3 else: raise ValueError(" # Error: unknown code: {}".format(self.nVersion)) @@ -163,25 +173,79 @@ iAddr = self._lookupArcNode(self.dChar[c], iAddr) if iAddr == None: return False return int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask - def suggest (self, sWord, iAddr=0, sNewWord=""): - "not finished" + def suggest (self, sWord): + "returns a set of similar words" + # first, we check for similar words + return set(self._suggestWithCrushedUselessChars(cp.clearWord(sWord))) + lSugg = self._suggest(sWord) + if not lSugg: + lSugg.extend(self._suggest(sWord[1:])) + lSugg.extend(self._suggest(sWord[:-1])) + lSugg.extend(self._suggest(sWord[1:-1])) + if not lSugg: + lSugg.extend(self._suggestWithCrushedUselessChars(cp.clearWord(sWord))) + return set(lSugg) + + def _suggest (self, sWord, cPrevious='', nDeep=0, iAddr=0, sNewWord="", bAvoidLoop=False): # RECURSIVE FUNCTION if not sWord: if int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask: + show(nDeep, "!!! " + sNewWord + " !!!") return [sNewWord] return [] + #show(nDeep, "<" + sWord + "> ===> " + sNewWord) lSugg = [] - for cChar, jAddr in self._getSimilarArcs(sWord[0:1], iAddr): - lSugg.extend(self.suggest(sWord[1:], jAddr, sNewWord+cChar)) + cCurrent = sWord[0:1] + for cChar, jAddr in self._getSimilarArcs(cCurrent, iAddr): + #show(nDeep, cChar) + lSugg.extend(self._suggest(sWord[1:], cCurrent, nDeep+1, jAddr, sNewWord+cChar)) + if not bAvoidLoop: # avoid infinite loop + #show(nDeep, ":no loop:") + if cPrevious == cCurrent: + # same char, we remove 1 char without adding 1 to + lSugg.extend(self._suggest(sWord[1:], cCurrent, nDeep+1, iAddr, sNewWord)) + for sRepl in cp.d1toX.get(cCurrent, ()): + #show(nDeep, sRepl) + lSugg.extend(self._suggest(sRepl + sWord[1:], cCurrent, nDeep+1, iAddr, sNewWord, True)) + if len(sWord) == 1: + #show(nDeep, ":end of word:") + # end of word + for sRepl in cp.dFinal1.get(sWord, ()): + #show(nDeep, sRepl) + lSugg.extend(self._suggest(sRepl, cCurrent, nDeep+1, iAddr, sNewWord, True)) return lSugg def _getSimilarArcs (self, cChar, iAddr): "generator: yield similar char of and address of the following node" - for c in cp.dSimilarChar.get(cChar, [cChar]): + for c in cp.d1to1.get(cChar, [cChar]): + if c in self.dChar: + jAddr = self._lookupArcNode(self.dChar[c], iAddr) + if jAddr: + yield (c, jAddr) + + def _suggestWithCrushedUselessChars (self, sWord, cPrevious='', nDeep=0, iAddr=0, sNewWord="", bAvoidLoop=False): + if not sWord: + if int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask: + show(nDeep, "!!! " + sNewWord + " !!!") + return [sNewWord] + return [] + lSugg = [] + cCurrent = sWord[0:1] + for cChar, jAddr in self._getSimilarArcsAndCrushedChars(cCurrent, iAddr): + show(nDeep, cChar) + lSugg.extend(self._suggestWithCrushedUselessChars(sWord[1:], cCurrent, nDeep+1, jAddr, sNewWord+cChar)) + return lSugg + + def _getSimilarArcsAndCrushedChars (self, cChar, iAddr): + "generator: yield similar char of and address of the following node" + for nVal, jAddr in self._getArcs(iAddr): + if self.dVal.get(nVal, "") in cp.aUselessChar: + yield (self.dVal[nVal], jAddr) + for c in cp.d1to1.get(cChar, [cChar]): if c in self.dChar: jAddr = self._lookupArcNode(self.dChar[c], iAddr) if jAddr: yield (c, jAddr) @@ -265,10 +329,20 @@ # value not found if (nRawArc & self._lastArcMask): return None iAddr = iEndArcAddr+self.nBytesNodeAddress + def _getArcs1 (self, iAddr): + "generator: return all arcs at as tuples of (nVal, iAddr)" + while True: + iEndArcAddr = iAddr+self.nBytesArc + nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') + yield (nRawArc & self._arcMask, int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big')) + if (nRawArc & self._lastArcMask): + break + iAddr = iEndArcAddr+self.nBytesNodeAddress + def _writeNodes1 (self, spfDest): "for debugging only" print(" > Write binary nodes") with codecs.open(spfDest, 'w', 'utf-8', newline="\n") as hDst: iAddr = 0