Index: gc_core/py/grammar_checker.py
==================================================================
--- gc_core/py/grammar_checker.py
+++ gc_core/py/grammar_checker.py

Index: gc_core/py/lang_core/gc_engine.py
==================================================================
--- gc_core/py/lang_core/gc_engine.py
+++ gc_core/py/lang_core/gc_engine.py
@@ -235,10 +235,11 @@
         self.sText0 = sText
         self.sSentence = ""
         self.sSentence0 = ""
         self.nOffsetWithinParagraph = 0
         self.lTokens = []
+        self.lTokens0 = []
         self.dTokenPos = {}         # {position: token}
         self.dTags = {}             # {position: tags}
         self.dError = {}            # {position: error}
         self.dSentenceError = {}    # {position: error} (for the current sentence only)
         self.dErrorPriority = {}    # {position: priority of the current error}
@@ -266,12 +267,10 @@
         # parse paragraph
         try:
             self.parseText(self.sText, self.sText0, True, 0, sCountry, dOpt, bShowRuleId, bDebug, bContext)
         except:
             raise
-        self.lTokens = None
-        self.lTokens0 = None
         if bFullInfo:
             lParagraphErrors = list(self.dError.values())
             lSentences = []
             self.dSentenceError.clear()
         # parse sentences
@@ -301,13 +300,12 @@
                 except:
                     raise
         if bFullInfo:
             # Grammar checking and sentence analysis
             return lParagraphErrors, lSentences
-        else:
-            # Grammar checking only
-            return self.dError.values() # this is a view (iterable)
+        # Grammar checking only
+        return self.dError.values() # this is a view (iterable)

     def _getCleanText (self):
         sText = self.sText
         if " " in sText:
             sText = sText.replace(" ", ' ') # nbsp
@@ -813,11 +811,11 @@
             self.lTokens[nTokenRewriteStart]["sNewValue"] = sWhat
         else:
             # several tokens
             lTokenValue = sWhat.split("|")
             if len(lTokenValue) != (nTokenRewriteEnd - nTokenRewriteStart + 1):
-                if (bDebug):
+                if bDebug:
                     echo("Error. Text processor: number of replacements != number of tokens.")
                 return
             for i, sValue in zip(range(nTokenRewriteStart, nTokenRewriteEnd+1), lTokenValue):
                 if not sValue or sValue == "*":
                     self.lTokens[i]["bToRemove"] = True
@@ -829,11 +827,10 @@
     def rewriteFromTags (self, bDebug=False):
         "rewrite the sentence, modify tokens, purge the token list"
         if bDebug:
             echo("REWRITE")
         lNewTokens = []
-        lNewTokens0 = []
         nMergeUntil = 0
         dTokenMerger = {}
         for iToken, dToken in enumerate(self.lTokens):
             bKeepToken = True
             if dToken["sType"] != "INFO":
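Note on the gc_engine.py hunks above: `parse()` no longer nulls out `self.lTokens`/`self.lTokens0` after parsing (both are now initialized in the constructor and live for the object's lifetime), and the flattened `else` branch returns `self.dError.values()` directly — a live dict view, not a copy. A minimal sketch of the view semantics, with illustrative names only:

    dError = {}                  # {position: error}, as in gc_engine
    vErrors = dError.values()    # a view: cheap, but tied to the dict
    dError[10] = "agreement error"
    print(len(vErrors))          # 1 — the view reflects later insertions
    lSnapshot = list(vErrors)    # copy it if the dict may change afterwards
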
Index: gc_core/py/lang_core/gc_functions.py
==================================================================
--- gc_core/py/lang_core/gc_functions.py
+++ gc_core/py/lang_core/gc_functions.py
@@ -6,10 +6,11 @@
 # template:
 # variables generated in

 import re
+import traceback

 from . import gc_options
 from ..graphspell.echo import echo
@@ -16,10 +17,11 @@
 _sAppContext = "Python"         # what software is running
 _oSpellChecker = None


 def load (sContext, oSpellChecker):
+    "mandatory first function to call: variables initialization"
     global _sAppContext
     global _oSpellChecker
     _sAppContext = sContext
     _oSpellChecker = oSpellChecker

Index: gc_core/py/lang_core/gc_options.py
==================================================================
--- gc_core/py/lang_core/gc_options.py
+++ gc_core/py/lang_core/gc_options.py
@@ -14,10 +14,11 @@
 _sAppContext = "Python"


 def load (sContext="Python"):
+    "mandatory first function to call: variables initialization"
     global dOptions
     global _sAppContext
     _sAppContext = sContext
     dOptions = getDefaultOptions(sContext)

Index: gc_core/py/lang_core/tests_core.py
==================================================================
--- gc_core/py/lang_core/tests_core.py
+++ gc_core/py/lang_core/tests_core.py
@@ -75,11 +75,11 @@
             return
         with open(spfParsingTest, "r", encoding="utf-8") as hSrc:
             nUnexpectedErrors = 0
             nTestWithExpectedError = 0
             nTestWithExpectedErrorAndSugg = 0
-            for i, sLine in enumerate( s for s in hSrc if not s.startswith("#") and s.strip() ):
+            for sLine in ( s for s in hSrc if not s.startswith("#") and s.strip() ):
                 sLineNum = sLine[:10].strip()
                 sLine = sLine[10:].strip()
                 sOption = None
                 m = zOption.search(sLine)
                 if m:
@@ -183,10 +183,11 @@
         return False
     return True


 def purgeMessage (sMessage):
+    "remove space after elided French words"
     for sToReplace, sReplacement in [
         ("l’ ", "l’"), ("d’ ", "d’"), ("n’ ", "n’"), ("j’ ", "j’"), ("m’ ", "m’"), ("t’ ", "t’"), ("s’ ", "s’"), ("qu’ ", "qu’"),
         ("L’ ", "L’"), ("D’ ", "D’"), ("N’ ", "N’"), ("J’ ", "J’"), ("M’ ", "M’"), ("T’ ", "T’"), ("S’ ", "S’"), ("QU’ ", "QU’")
     ]:
         sMessage = sMessage.replace(sToReplace, sReplacement)

Index: gc_lang/fr/modules-js/gce_suggestions.js
==================================================================
--- gc_lang/fr/modules-js/gce_suggestions.js
+++ gc_lang/fr/modules-js/gce_suggestions.js
@@ -170,13 +170,13 @@
                 }
             }
         }
         else {
             for (let [sTense, ] of lTenses) {
-                for (let [sWho, ] of [ ...sMorph.matchAll(/:(?:[123][sp]|P|Y)/g) ]) {
-                    if (conj.hasConj(sStem, sTense, sWho)) {
-                        aSugg.add(conj.getConj(sStem, sTense, sWho));
+                for (let [sWho2, ] of [ ...sMorph.matchAll(/:(?:[123][sp]|P|Y)/g) ]) {
+                    if (conj.hasConj(sStem, sTense, sWho2)) {
+                        aSugg.add(conj.getConj(sStem, sTense, sWho2));
                     }
                 }
             }
         }
     }
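Note on the gce_suggestions.js hunk above: the inner loop variable is renamed `sWho2` because it shadowed `sWho` from the enclosing scope. With JavaScript's `let` this is only a readability hazard, but in the Python twin of this loop (`suggVerbFrom`, further down) rebinding the name really does clobber the parameter. A minimal Python sketch — the function is illustrative, not from the codebase:

    def lastWho(sWho):
        for sWho in (":1s", ":2s"):   # rebinds the parameter on each iteration
            pass
        return sWho

    print(lastWho(":3p"))   # ':2s' — the caller's value is silently lost
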
Index: gc_lang/fr/modules/conj.py
==================================================================
--- gc_lang/fr/modules/conj.py
+++ gc_lang/fr/modules/conj.py
@@ -65,24 +65,23 @@
 def getNamesFrom (sVerb):
     "returns a list of names derivating from <sVerb>"
     if sVerb in _dVerbNames:
         # there are names derivated from the verb
         return list(_dVerbNames[sVerb])
-    else:
-        # we suggest past participles
-        tTags = _getTags(sVerb)
-        if tTags:
-            aSugg = [ _getConjWithTags(sVerb, tTags, ":Q", ":m:s") ]
-            if _hasConjWithTags(tTags, ":Q", ":f:s"):
-                aSugg.append(_getConjWithTags(sVerb, tTags, ":Q", ":f:s"))
-            if _hasConjWithTags(tTags, ":Q", ":m:p"):
-                aSugg.append(_getConjWithTags(sVerb, tTags, ":Q", ":m:p"))
-            if _hasConjWithTags(tTags, ":Q", ":f:p"):
-                aSugg.append(_getConjWithTags(sVerb, tTags, ":Q", ":f:p"))
-            # if there is only one past participle (epi inv), unreliable.
-            return aSugg if len(aSugg) > 1 else []
-        return []
+    # nothing found: we suggest past participles
+    tTags = _getTags(sVerb)
+    if tTags:
+        aSugg = [ _getConjWithTags(sVerb, tTags, ":Q", ":m:s") ]
+        if _hasConjWithTags(tTags, ":Q", ":f:s"):
+            aSugg.append(_getConjWithTags(sVerb, tTags, ":Q", ":f:s"))
+        if _hasConjWithTags(tTags, ":Q", ":m:p"):
+            aSugg.append(_getConjWithTags(sVerb, tTags, ":Q", ":m:p"))
+        if _hasConjWithTags(tTags, ":Q", ":f:p"):
+            aSugg.append(_getConjWithTags(sVerb, tTags, ":Q", ":f:p"))
+        # if there is only one past participle (epi inv), unreliable.
+        return aSugg if len(aSugg) > 1 else []
+    return []


 def getConjSimilInfiV1 (sInfi):
     "returns verbal forms phonetically similar to infinitive form <sInfi> (for verb in group 1)"
     if sInfi not in _dVerb:

Index: gc_lang/fr/modules/gce_suggestions.py
==================================================================
--- gc_lang/fr/modules/gce_suggestions.py
+++ gc_lang/fr/modules/gce_suggestions.py
@@ -39,11 +39,11 @@
                 aTense[":Is"] = ""
             elif m.group(1) == ":P":
                 aTense[":Ip"] = ""
             else:
                 aTense[m.group(1)] = ""
-    for sTense in aTense.keys():
+    for sTense in aTense:
         if sWho == ":1ś" and not conj._hasConjWithTags(tTags, sTense, ":1ś"):
             sWho = ":1s"
         if conj._hasConjWithTags(tTags, sTense, sWho):
             dSugg[conj._getConjWithTags(sStem, tTags, sTense, sWho)] = ""
     if funcSugg2:
@@ -50,16 +50,17 @@
         sSugg2 = funcSugg2(*args) if args else funcSugg2(sFlex)
         if sSugg2:
             dSugg[sSugg2] = ""
     if dSugg:
         if bVC:
-            return "|".join([ joinVerbAndSuffix(sSugg, sSfx) for sSugg in dSugg.keys() ])
-        return "|".join(dSugg.keys())
+            return "|".join([ joinVerbAndSuffix(sSugg, sSfx) for sSugg in dSugg ])
+        return "|".join(dSugg)
     return ""


 def joinVerbAndSuffix (sFlex, sSfx):
+    "join verb <sFlex> with suffix <sSfx>, modifying <sFlex> to prevent irregular forms"
     if sSfx.startswith(("-t-", "-T-")) and sFlex.endswith(("t", "d", "T", "D")):
         return sFlex + sSfx[2:]
     if sFlex.endswith(("e", "a", "c", "E", "A", "C")):
         if re.match("(?i)-(?:en|y)$", sSfx):
             return sFlex + "s" + sSfx
@@ -108,11 +109,11 @@
         else:
             dSugg[conj._getConjWithTags(sStem, tTags, ":Q", ":m:s")] = ""
     if "" in dSugg:
         del dSugg[""]
     if dSugg:
-        return "|".join(dSugg.keys())
+        return "|".join(dSugg)
     return ""


 def suggVerbTense (sFlex, sTense, sWho):
     "change <sFlex> to a verb according to <sTense> and <sWho>"
@@ -119,11 +120,11 @@
     dSugg = {}
     for sStem in _oSpellChecker.getLemma(sFlex):
         if conj.hasConj(sStem, sTense, sWho):
             dSugg[conj.getConj(sStem, sTense, sWho)] = ""
     if dSugg:
-        return "|".join(dSugg.keys())
+        return "|".join(dSugg)
     return ""


 def suggVerbFrom (sStem, sFlex, sWho=""):
     "conjugate <sStem> according to <sFlex> (and eventually <sWho>)"
@@ -134,15 +135,15 @@
         for sTense in lTenses:
             if conj.hasConj(sStem, sTense, sWho):
                 dSugg[conj.getConj(sStem, sTense, sWho)] = ""
     else:
         for sTense in lTenses:
-            for sWho in [ m.group(0) for m in re.finditer(":(?:[123][sp]|P|Y)", sMorph) ]:
-                if conj.hasConj(sStem, sTense, sWho):
-                    dSugg[conj.getConj(sStem, sTense, sWho)] = ""
+            for sWho2 in [ m.group(0) for m in re.finditer(":(?:[123][sp]|P|Y)", sMorph) ]:
+                if conj.hasConj(sStem, sTense, sWho2):
+                    dSugg[conj.getConj(sStem, sTense, sWho2)] = ""
     if dSugg:
-        return "|".join(dSugg.keys())
+        return "|".join(dSugg)
     return ""


 def suggVerbImpe (sFlex, bVC=False):
     "change <sFlex> to a verb at imperative form"
@@ -194,11 +195,11 @@
         if conj._hasConjWithTags(tTags, sTense, sWho):
             dSugg[conj._getConjWithTags(sStem, tTags, sTense, sWho)] = ""
     if sFlex in _dModeSugg:
         dSugg[_dModeSugg[sFlex]] = ""
     if dSugg:
-        return "|".join(dSugg.keys())
+        return "|".join(dSugg)
     return ""


 ## Nouns and adjectives
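Note on the repeated `"|".join(dSugg.keys())` → `"|".join(dSugg)` changes: iterating a dict yields its keys, so `.keys()` is redundant. These `dSugg` dicts, filled with `""` values (sometimes via `dict.fromkeys`), act as insertion-ordered sets — duplicates collapse and first-insertion order is preserved (guaranteed since Python 3.7), which a plain `set` would not give. A small sketch with made-up forms:

    dSugg = {}
    for sSugg in ("mange", "manges", "mange"):
        dSugg[sSugg] = ""             # dict used as an ordered set
    print(list(dSugg))                # ['mange', 'manges'] — deduplicated, ordered
    print("|".join(dSugg))            # 'mange|manges', same as joining .keys()
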
@@ -277,11 +278,11 @@
             dSugg[conj.getConj(sVerb, ":Q", ":m:s")] = ""
     if bSuggSimil:
         for e in phonet.selectSimil(sFlex, ":m:[si]"):
             dSugg[e] = ""
     if dSugg:
-        return "|".join(dSugg.keys())
+        return "|".join(dSugg)
     return ""


 def suggMasPlur (sFlex, bSuggSimil=False):
     "returns masculine plural forms"
@@ -307,11 +308,11 @@
             dSugg[sSugg] = ""
     if bSuggSimil:
         for e in phonet.selectSimil(sFlex, ":m:[pi]"):
             dSugg[e] = ""
     if dSugg:
-        return "|".join(dSugg.keys())
+        return "|".join(dSugg)
     return ""


 def suggFemSing (sFlex, bSuggSimil=False):
     "returns feminine singular forms"
@@ -332,11 +333,11 @@
             dSugg[conj.getConj(sVerb, ":Q", ":f:s")] = ""
     if bSuggSimil:
         for e in phonet.selectSimil(sFlex, ":f:[si]"):
             dSugg[e] = ""
     if dSugg:
-        return "|".join(dSugg.keys())
+        return "|".join(dSugg)
     return ""


 def suggFemPlur (sFlex, bSuggSimil=False):
     "returns feminine plural forms"
@@ -370,23 +371,23 @@
         return ""
     sGender, sNumber = cr.getGenderNumber(lMorphSrc)
     if sGender == ":m":
         if sNumber == ":s":
             return suggMasSing(sFlexDest)
-        elif sNumber == ":p":
+        if sNumber == ":p":
             return suggMasPlur(sFlexDest)
         return suggMasSing(sFlexDest)
-    elif sGender == ":f":
+    if sGender == ":f":
         if sNumber == ":s":
             return suggFemSing(sFlexDest)
-        elif sNumber == ":p":
+        if sNumber == ":p":
             return suggFemPlur(sFlexDest)
         return suggFemSing(sFlexDest)
-    elif sGender == ":e":
+    if sGender == ":e":
         if sNumber == ":s":
             return suggSing(sFlexDest)
-        elif sNumber == ":p":
+        if sNumber == ":p":
             return suggPlur(sFlexDest)
         return sFlexDest
     return ""
@@ -397,23 +398,23 @@
         return ""
     sGender, sNumber = cr.getGenderNumber(lMorphSrc)
     if sGender == ":m":
         if sNumber == ":s":
             return suggMasSing(dTokenDst["sValue"])
-        elif sNumber == ":p":
+        if sNumber == ":p":
             return suggMasPlur(dTokenDst["sValue"])
         return suggMasSing(dTokenDst["sValue"])
-    elif sGender == ":f":
+    if sGender == ":f":
         if sNumber == ":s":
             return suggFemSing(dTokenDst["sValue"])
-        elif sNumber == ":p":
+        if sNumber == ":p":
             return suggFemPlur(dTokenDst["sValue"])
         return suggFemSing(dTokenDst["sValue"])
-    elif sGender == ":e":
+    if sGender == ":e":
         if sNumber == ":s":
             return suggSing(dTokenDst["sValue"])
-        elif sNumber == ":p":
+        if sNumber == ":p":
             return suggPlur(dTokenDst["sValue"])
         return dTokenDst["sValue"]
     return ""
@@ -470,11 +471,11 @@
         if ":f" in sMorph:
             dSugg[suggMasSing(sFlex)] = ""
         elif ":m" in sMorph:
             dSugg[suggFemSing(sFlex)] = ""
     if dSugg:
-        return "|".join(dSugg.keys())
+        return "|".join(dSugg)
     return ""


 def switchPlural (sFlex):
     "return plural or singular form(s) of <sFlex>"
@@ -483,11 +484,11 @@
         if ":s" in sMorph:
             aSugg[suggPlur(sFlex)] = ""
         elif ":p" in sMorph:
             aSugg[suggSing(sFlex)] = ""
     if aSugg:
-        return "|".join(aSugg.keys())
+        return "|".join(aSugg)
     return ""


 def hasSimil (sWord, sPattern=None):
     "return True if there are words phonetically similar to <sWord> (according to <sPattern> if required)"
@@ -510,12 +511,12 @@
         else:
             dSugg.update(dict.fromkeys(conj.getNamesFrom(sInfi), ""))
             break
     if dSugg:
         if bVC:
-            return "|".join([ joinVerbAndSuffix(sSugg, sSfx) for sSugg in dSugg.keys() ])
-        return "|".join(dSugg.keys())
+            return "|".join([ joinVerbAndSuffix(sSugg, sSfx) for sSugg in dSugg ])
+        return "|".join(dSugg)
     return ""


 def suggCeOrCet (sWord):
     "suggest “ce” or “cet” or both according to the first letter of <sWord>"
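Note on the `elif` → `if` rewrites above: every preceding branch ends in `return`, so the `elif` chains were dead weight (pylint's no-else-return style); behavior is unchanged and nesting is reduced. A schematic sketch — the wrapper function is hypothetical, only `suggSing`/`suggPlur` come from this file:

    def suggByNumber(sNumber, sFlex):
        if sNumber == ":s":
            return suggSing(sFlex)
        if sNumber == ":p":           # was "elif": reachable only when the first
            return suggPlur(sFlex)    # test failed anyway, since that branch returns
        return sFlex
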
"déelirranttesss", "vallidasion", "Emilie", "exibission", "ditirembique", "jai", "email", - "fatiqué", "coeur", "trèèèèèèèèès", "vraaaaiiiimeeeeennnt", "apele", "email", "Co2", + "fatiqué", "coeur", "trèèèèèèèèès", "vraaaaiiiimeeeeennnt", "apele", "Co2", "emmppâiiiller", "testt", "apelaion", "exsepttion", "sintaxik", "ebriete", "ennormmement" ]: - aSugg = self.oSpellChecker.suggest(sWord) + for lSugg in self.oSpellChecker.suggest(sWord): + self.assertTrue(len(lSugg) > 0) #with timeblock(sWord): # aSugg = self.oSpellChecker.suggest(sWord) # print(sWord, "->", " ".join(aSugg)) def test_lemmas (self): for sWord, sInfi in [ + ("suis", "suivre"), + ("suis", "être"), + ("a", "avoir"), + ("a", "a"), + ("irai", "aller"), ("jetez", "jeter"), ("finit", "finir"), ("mangé", "manger"), ("oubliait", "oublier"), ("arrivais", "arriver"), Index: graphspell-js/dawg.js ================================================================== --- graphspell-js/dawg.js +++ graphspell-js/dawg.js @@ -140,10 +140,14 @@ } else if (cStemming == "S") { this.funcStemming = str_transform.changeWordWithSuffixCode; } else { this.funcStemming = str_transform.noStemming; } + + // binary dictionary + this.sByDic = ""; + this.lByDic = []; // build lWord.sort(); if (xProgressBarNode) { xProgressBarNode.value = 0; Index: graphspell-js/ibdawg.js ================================================================== --- graphspell-js/ibdawg.js +++ graphspell-js/ibdawg.js @@ -108,16 +108,10 @@ lRes = lRes.map((sSugg) => { return sSugg.slice(0,1).toUpperCase() + sSugg.slice(1); }); lRes = [...new Set(lRes)]; } return lRes.slice(0, this.nSuggLimit); } - - reset () { - this.dSugg.clear(); - this.dGoodSugg.clear(); - this.dBestSugg.clear(); - } } class IBDAWG { // INDEXABLE BINARY DIRECT ACYCLIC WORD GRAPH Index: graphspell/dawg.py ================================================================== --- graphspell/dawg.py +++ graphspell/dawg.py @@ -163,10 +163,14 @@ self.nBytesNodeAddress = 1 self.nBytesArc = 0 self.nBytesOffset = 0 self.nMaxOffset = 0 + # binary dictionary + self.byDic = b"" + self.lByDic = [] + # build lWord.sort() oProgBar = ProgressBar(0, len(lWord)) for aEntry in lWord: self.insert(aEntry) @@ -362,11 +366,10 @@ print(" > Write DAWG as an indexable binary dictionary") self.nBytesArc = ( (self.nArcVal.bit_length() + 2) // 8 ) + 1 # We add 2 bits. 
Index: graphspell/dawg.py
==================================================================
--- graphspell/dawg.py
+++ graphspell/dawg.py
@@ -163,10 +163,14 @@
         self.nBytesNodeAddress = 1
         self.nBytesArc = 0
         self.nBytesOffset = 0
         self.nMaxOffset = 0
+        # binary dictionary
+        self.byDic = b""
+        self.lByDic = []
+
         # build
         lWord.sort()
         oProgBar = ProgressBar(0, len(lWord))
         for aEntry in lWord:
             self.insert(aEntry)
@@ -362,11 +366,10 @@
         print(" > Write DAWG as an indexable binary dictionary")
         self.nBytesArc = ( (self.nArcVal.bit_length() + 2) // 8 ) + 1   # We add 2 bits. See DawgNode.convToBytes()
         self.nBytesOffset = 0
         self._calcNumBytesNodeAddress()
         self._calcNodesAddress()
-        self.byDic = b""
         self.byDic = self.oRoot.convToBytes(self.nBytesArc, self.nBytesNodeAddress)
         for oNode in self.lMinimizedNodes:
             self.byDic += oNode.convToBytes(self.nBytesArc, self.nBytesNodeAddress)
         print(" Arc values (chars, affixes and tags): {} -> {} bytes".format( self.nArcVal, len("\t".join(self.lArcVal).encode("utf-8")) ))
         print(" Arc size: {} bytes, Address size: {} bytes -> {} * {} = {} bytes".format( self.nBytesArc, self.nBytesNodeAddress, \

Index: graphspell/ibdawg.py
==================================================================
--- graphspell/ibdawg.py
+++ graphspell/ibdawg.py
@@ -101,15 +101,10 @@
         elif self.sWord[0:1].isupper():   # don’t use <.istitle>
             lRes = list(OrderedDict.fromkeys(map(lambda sSugg: sSugg[0:1].upper()+sSugg[1:], lRes)))   # use dict, when Python 3.6+
         return lRes[:self.nSuggLimit]

-    def reset (self):
-        "clear data"
-        self.aSugg.clear()
-        self.dSugg.clear()
-

 class IBDAWG:
     """INDEXABLE BINARY DIRECT ACYCLIC WORD GRAPH"""

     def __init__ (self, source):

Index: graphspell/lexgraph_fr.py
==================================================================
--- graphspell/lexgraph_fr.py
+++ graphspell/lexgraph_fr.py
@@ -440,14 +440,15 @@
 _zPartDemForm = re.compile("([\\w]+)-(là|ci)$")
 _zInterroVerb = re.compile("([\\w]+)(-(?:t-(?:ie?l|elle|on)|je|tu|ie?ls?|elles?|on|[nv]ous))$")
 _zImperatifVerb = re.compile("([\\w]+)(-(?:l(?:es?|a)-(?:moi|toi|lui|[nv]ous|leur)|y|en|[mts]['’ʼ‘‛´`′‵՚ꞌꞋ](?:y|en)|les?|la|[mt]oi|leur|lui))$")


 def setLabelsOnToken (dToken):
+    "create an attribute “aLabels” on <dToken> as a list of readable meanings"
     # Token: .sType, .sValue, .nStart, .nEnd, .lMorph
     try:
         if dToken["sType"] == "PUNC" or dToken["sType"] == "SIGN":
-            dToken["aLabels"] = [_dValues.get(dToken["sValue"], "signe de ponctuation divers")]
+            dToken["aLabels"] = [ _dValues.get(dToken["sValue"], "signe de ponctuation divers") ]
         elif dToken["sType"] == 'SYMBOL':
             dToken["aLabels"] = ["symbole"]
         elif dToken["sType"] == 'EMOJI':
             dToken["aLabels"] = ["émoji"]
         elif dToken["sType"] == 'NUM':
@@ -507,8 +508,9 @@
     return


 # Other functions

-def filterSugg (aSugg):
+def filterSugg (aSuggs):
     "exclude suggestions"
-    return filter(lambda sSugg: not sSugg.endswith(("è", "È")), aSugg)
+    return [ sSugg for sSugg in aSuggs if not sSugg.endswith(("è", "È")) ]
+    #return filter(lambda sSugg: not sSugg.endswith(("è", "È")), aSuggs)   # return an object filter
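Note on the `filterSugg` change above: the old version returned a lazy `filter` object, which can be consumed only once and has no length; the list comprehension is reusable and sizable, at the cost of building the list eagerly. Illustrative sketch with made-up suggestions:

    lRes = ["volontè", "volonté", "volontés"]
    oFilter = filter(lambda sSugg: not sSugg.endswith(("è", "È")), lRes)
    print(list(oFilter))      # ['volonté', 'volontés']
    print(list(oFilter))      # [] — a filter object is a one-shot iterator

    lSugg = [ sSugg for sSugg in lRes if not sSugg.endswith(("è", "È")) ]
    print(len(lSugg), lSugg)  # 2 ['volonté', 'volontés'] — re-iterable, has a length
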
Index: graphspell/str_transform.py
==================================================================
--- graphspell/str_transform.py
+++ graphspell/str_transform.py
@@ -113,10 +113,11 @@
             d[i, j] = min(d[i, j], d[i-2, j-2] + nCost)   # Transposition
     return d[nLen1-1, nLen2-1]


 def distanceJaroWinkler (sWord1, sWord2, fBoost = .666):
+    "distance of Jaro-Winkler between <sWord1> and <sWord2>, returns a float"
     # https://github.com/thsig/jaro-winkler-JS
     #if (sWord1 == sWord2): return 1.0
     nLen1 = len(sWord1)
     nLen2 = len(sWord2)
     nMax = max(nLen1, nLen2)
@@ -219,11 +220,11 @@
                         nTrans += 1
                     elif not t[2]:
                         t[2] = True
                         nTrans += 1
                     break
-                elif i1 > t[1] and i2 > t[0]:
+                if i1 > t[1] and i2 > t[0]:
                     del lOffset[i]
                 else:
                     i += 1
             lOffset.append([i1, i2, bTrans])
         else:
@@ -232,15 +233,15 @@
             if i1 != i2:
                 i1 = i2 = min(i1, i2)
                 for i in range(nMaxOffset):
                     if i1 + i >= nLen1 and i2 + i >= nLen2:
                         break
-                    elif i1 + i < nLen1 and s1[i1+i] == s2[i2]:
+                    if i1 + i < nLen1 and s1[i1+i] == s2[i2]:
                         i1 += i - 1
                         i2 -= 1
                         break
-                    elif i2 + i < nLen2 and s1[i1] == s2[i2+i]:
+                    if i2 + i < nLen2 and s1[i1] == s2[i2+i]:
                         i2 += i - 1
                         i1 -= 1
                         break
             i1 += 1
             i2 += 1

Index: pylintrc
==================================================================
--- pylintrc
+++ pylintrc
@@ -16,11 +16,11 @@
 # Python code to execute, usually for sys.path manipulation such as
 # pygtk.require().
 #init-hook=

 # Use multiple processes to speed up Pylint.
-jobs=4
+jobs=1

 # List of plugins (as comma separated values of python modules names) to load,
 # usually to register additional checkers.
 load-plugins=
@@ -118,25 +118,23 @@
         rdiv-method,
         exception-message-attribute,
         invalid-str-codec,
         sys-max-int,
         bad-python3-import,
-        deprecated-string-function,
-        deprecated-str-translate-call,
-        deprecated-itertools-function,
         deprecated-types-field,
+        missing-format-attribute,
         next-method-defined,
         dict-items-not-iterating,
         dict-keys-not-iterating,
         dict-values-not-iterating,
         deprecated-operator-function,
         deprecated-urllib-function,
         xreadlines-attribute,
-        deprecated-sys-function,
         exception-escape,
         comprehension-escape,
         bad-whitespace,
+        consider-using-ternary,
         line-too-long

 # Enable the message, report, category or checker with the given id(s). You can
 # either give multiple identifier separated by comma (,) or put this option
 # multiple time (only on the command line, not in the configuration file where
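Note on the pylintrc hunks: `jobs=1` trades speed for a deterministic single-process run; deleting the four `deprecated-*-function` entries from the disable list re-enables those checks, while `missing-format-attribute` and `consider-using-ternary` are newly silenced. One way to try the configuration locally (the target paths are assumptions — adjust to the checkout):

    pylint --rcfile=pylintrc graphspell gc_core/py/lang_core
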