Index: compile_rules.py ================================================================== --- compile_rules.py +++ compile_rules.py @@ -458,11 +458,11 @@ dDEF["{"+m.group(1)+"}"] = m.group(2) else: print("Error in definition: ", end="") print(sLine.strip()) elif sLine.startswith("TEST:"): - lTest.append("{:<8}".format(i) + " " + sLine[5:].strip()) + lTest.append("r{:<7}".format(i) + " " + sLine[5:].strip()) elif sLine.startswith("TODO:"): pass elif sLine.startswith(("OPTGROUP/", "OPTSOFTWARE:", "OPT/", "OPTLANG/", "OPTDEFAULTUILANG:", "OPTLABEL/", "OPTPRIORITY/")): lOpt.append(sLine) elif re.match("[  \t]*$", sLine): @@ -542,14 +542,14 @@ print("Unnamed rules: " + str(nRULEWITHOUTNAME)) d = { "callables": sPyCallables, "callablesJS": sJSCallables, - "gctests": sGCTests, - "gctestsJS": sGCTestsJS, + "regex_gctests": sGCTests, + "regex_gctestsJS": sGCTestsJS, "paragraph_rules": mergeRulesByOption(lParagraphRules), "sentence_rules": mergeRulesByOption(lSentenceRules), "paragraph_rules_JS": jsconv.writeRulesToJSArray(mergeRulesByOption(lParagraphRulesJS)), "sentence_rules_JS": jsconv.writeRulesToJSArray(mergeRulesByOption(lSentenceRulesJS)) } d.update(dOptions) return d ADDED compile_rules_graph.py Index: compile_rules_graph.py ================================================================== --- /dev/null +++ compile_rules_graph.py @@ -0,0 +1,348 @@ +# Create a Direct Acyclic Rule Graph (DARG) + +import re +import traceback +import json +import darg + + +dDEF = {} +dACTIONS = {} +lFUNCTIONS = [] + + +def prepareFunction (s): + s = s.replace("__also__", "bCondMemo") + s = s.replace("__else__", "not bCondMemo") + s = re.sub(r"isStart *\(\)", 'before(["", ","])', s) + s = re.sub(r"isRealStart *\(\)", 'before([""])', s) + s = re.sub(r"isStart0 *\(\)", 'before0(["", ","])', s) + s = re.sub(r"isRealStart0 *\(\)", 'before0([""])', s) + s = re.sub(r"isEnd *\(\)", 'after(["", ","])', s) + s = re.sub(r"isRealEnd *\(\)", 'after([""])', s) + s = re.sub(r"isEnd0 *\(\)", 'after0(["", ","])', s) + s = re.sub(r"isRealEnd0 *\(\)", 'after0([""])', s) + s = re.sub(r"(select|exclude)[(][\\](\d+)", '\\1(lToken[\\2]', s) + s = re.sub(r"define[(][\\](\d+)", 'define(lToken[\\1]', s) + s = re.sub(r"(morph|morphex|displayInfo)[(]\\(\d+)", '\\1(lToken[\\2]', s) + s = re.sub(r"token\(\s*(\d)", 'nextToken(\\1', s) # token(n) + s = re.sub(r"token\(\s*-(\d)", 'prevToken(\\1', s) # token(-n) + s = re.sub(r"before\(\s*", 'look(s[:m.start()], ', s) # before(s) + s = re.sub(r"after\(\s*", 'look(s[m.end():], ', s) # after(s) + s = re.sub(r"textarea\(\s*", 'look(s, ', s) # textarea(s) + s = re.sub(r"before_chk1\(\s*", 'look_chk1(dDA, s[:m.start()], 0, ', s) # before_chk1(s) + s = re.sub(r"after_chk1\(\s*", 'look_chk1(dDA, s[m.end():], m.end(), ', s) # after_chk1(s) + s = re.sub(r"textarea_chk1\(\s*", 'look_chk1(dDA, s, 0, ', s) # textarea_chk1(s) + s = re.sub(r"isEndOfNG\(\s*\)", 'isEndOfNG(dDA, s[m.end():], m.end())', s) # isEndOfNG(s) + s = re.sub(r"isNextNotCOD\(\s*\)", 'isNextNotCOD(dDA, s[m.end():], m.end())', s) # isNextNotCOD(s) + s = re.sub(r"isNextVerb\(\s*\)", 'isNextVerb(dDA, s[m.end():], m.end())', s) # isNextVerb(s) + s = re.sub(r"\bspell *[(]", '_oSpellChecker.isValid(', s) + s = re.sub(r"[\\](\d+)", 'lToken[\\1]', s) + return s + + +def changeReferenceToken (s, dPos): + for i in range(len(dPos), 0, -1): + s = s.replace("\\"+str(i), "\\"+str(dPos[i])) + return s + + +def genTokenRules (sTokenLine): + lToken = sTokenLine.split() + lTokenRules = None + for i, sToken in enumerate(lToken): + if sToken.startswith("{") and sToken.endswith("}") and sToken in dDEF: + lToken[i] = dDEF[sToken] + if ( (sToken.startswith("[") and sToken.endswith("]")) or (sToken.startswith("([") and sToken.endswith("])")) ): + bSelectedGroup = sToken.startswith("(") and sToken.endswith(")") + if bSelectedGroup: + sToken = sToken[1:-1] + # multiple token + if not lTokenRules: + lTokenRules = [ [s] for s in sToken[1:-1].split("|") ] + else: + lNewTemp = [] + for aRule in lTokenRules: + lElem = sToken[1:-1].split("|") + sElem1 = lElem.pop(0) + if bSelectedGroup: + sElem1 = "(" + sElem1 + ")" + for sElem in lElem: + if bSelectedGroup: + sElem = "(" + sElem + ")" + aNew = list(aRule) + aNew.append(sElem) + lNewTemp.append(aNew) + aRule.append(sElem1) + lTokenRules.extend(lNewTemp) + else: + # simple token + if not lTokenRules: + lTokenRules = [[sToken]] + else: + for aRule in lTokenRules: + aRule.append(sToken) + for aRule in lTokenRules: + yield aRule + + +def createRule (iLine, sRuleName, sTokenLine, sActions, nPriority): + # print(iLine, "//", sRuleName, "//", sTokenLine, "//", sActions, "//", nPriority) + for lToken in genTokenRules(sTokenLine): + # Calculate positions + dPos = {} + nGroup = 0 + for i, sToken in enumerate(lToken): + if sToken.startswith("(") and sToken.endswith(")"): + lToken[i] = sToken[1:-1] + nGroup += 1 + dPos[nGroup] = i + + # Parse actions + for nAction, sAction in enumerate(sActions.split(" <<- ")): + if sAction.strip(): + sActionId = sRuleName + "_a" + str(nAction) + aAction = createAction(sActionId, sAction, nGroup, nPriority, dPos) + if aAction: + dACTIONS[sActionId] = aAction + lResult = list(lToken) + lResult.extend(["##"+str(iLine), sActionId]) + yield lResult + + +def createAction (sIdAction, sAction, nGroup, nPriority, dPos): + m = re.search("([-~=])(\\d+|)(:\\d+|)>> ", sAction) + if not m: + print(" # Error. No action found at: ", sIdAction) + print(" ==", sAction, "==") + return None + # Condition + sCondition = sAction[:m.start()].strip() + if sCondition: + sCondition = prepareFunction(sCondition) + sCondition = changeReferenceToken(sCondition, dPos) + lFUNCTIONS.append(("g_c_"+sIdAction, sCondition)) + sCondition = "g_c_"+sIdAction + else: + sCondition = "" + # Action + cAction = m.group(1) + sAction = sAction[m.end():].strip() + sAction = changeReferenceToken(sAction, dPos) + iStartAction = int(m.group(2)) if m.group(2) else 0 + iEndAction = int(m.group(3)[1:]) if m.group(3) else iStartAction + if nGroup: + try: + iStartAction = dPos[iStartAction] + iEndAction = dPos[iEndAction] + except: + print("# Error. Wrong groups in: " + sIdAction) + + if cAction == "-": + ## error + iMsg = sAction.find(" # ") + if iMsg == -1: + sMsg = "# Error. Error message not found." + sURL = "" + print(sMsg + " Action id: " + sIdAction) + else: + sMsg = sAction[iMsg+3:].strip() + sAction = sAction[:iMsg].strip() + sURL = "" + mURL = re.search("[|] *(https?://.*)", sMsg) + if mURL: + sURL = mURL.group(1).strip() + sMsg = sMsg[:mURL.start(0)].strip() + if sMsg[0:1] == "=": + sMsg = prepareFunction(sMsg[1:]) + lFUNCTIONS.append(("g_m_"+sIdAction, sMsg)) + for x in re.finditer("group[(](\\d+)[)]", sMsg): + if int(x.group(1)) > nGroup: + print("# Error in groups in message at line " + sIdAction + " ("+str(nGroup)+" groups only)") + sMsg = "=g_m_"+sIdAction + else: + for x in re.finditer(r"\\(\d+)", sMsg): + if int(x.group(1)) > nGroup: + print("# Error in groups in message at line " + sIdAction + " ("+str(nGroup)+" groups only)") + if re.search("[.]\\w+[(]", sMsg): + print("# Error in message at line " + sIdAction + ": This message looks like code. Line should begin with =") + + if sAction[0:1] == "=" or cAction == "=": + if "define" in sAction and not re.search(r"define\(\\\d+ *, *\[.*\] *\)", sAction): + print("# Error in action at line " + sIdAction + ": second argument for define must be a list of strings") + sAction = prepareFunction(sAction) + for x in re.finditer("group[(](\\d+)[)]", sAction): + if int(x.group(1)) > nGroup: + print("# Error in groups in replacement at line " + sIdAction + " ("+str(nGroup)+" groups only)") + else: + for x in re.finditer(r"\\(\d+)", sAction): + if int(x.group(1)) > nGroup: + print("# Error in groups in replacement at line " + sIdAction + " ("+str(nGroup)+" groups only)") + if re.search("[.]\\w+[(]|sugg\\w+[(]", sAction): + print("# Error in action at line " + sIdAction + ": This action looks like code. Line should begin with =") + + if cAction == "-": + ## error detected --> suggestion + if not sAction: + print("# Error in action at line " + sIdAction + ": This action is empty.") + if sAction[0:1] == "=": + lFUNCTIONS.append(("g_s_"+sIdAction, sAction[1:])) + sAction = "=g_s_"+sIdAction + elif sAction.startswith('"') and sAction.endswith('"'): + sAction = sAction[1:-1] + if not sMsg: + print("# Error in action at line " + sIdAction + ": The message is empty.") + return [sCondition, cAction, sAction, iStartAction, iEndAction, nPriority, sMsg, sURL] + elif cAction == "~": + ## text processor + if not sAction: + print("# Error in action at line " + sIdAction + ": This action is empty.") + if sAction[0:1] == "=": + lFUNCTIONS.append(("g_p_"+sIdAction, sAction[1:])) + sAction = "=g_p_"+sIdAction + elif sAction.startswith('"') and sAction.endswith('"'): + sAction = sAction[1:-1] + return [sCondition, cAction, sAction, iStartAction, iEndAction] + elif cAction == "=": + ## disambiguator + if sAction[0:1] == "=": + sAction = sAction[1:] + if not sAction: + print("# Error in action at line " + sIdAction + ": This action is empty.") + lFUNCTIONS.append(("g_d_"+sIdAction, sAction)) + sAction = "g_d_"+sIdAction + return [sCondition, cAction, sAction] + elif cAction == ">": + ## no action, break loop if condition is False + return [sCondition, cAction, ""] + else: + print("# Unknown action at line " + sIdAction) + return None + + +def make (spLang, sLang, bJavaScript): + "compile rules, returns a dictionary of values" + # for clarity purpose, don’t create any file here + + print("> read graph rules file...") + try: + lRules = open(spLang + "/rules_graph.grx", 'r', encoding="utf-8").readlines() + except: + print("Error. Rules file in project [" + sLang + "] not found.") + exit() + + # removing comments, zeroing empty lines, creating definitions, storing tests, merging rule lines + print(" parsing rules...") + global dDEF + lLine = [] + lRuleLine = [] + lTest = [] + lOpt = [] + lTokenLine = [] + sActions = "" + nPriority = 4 + + for i, sLine in enumerate(lRules, 1): + sLine = sLine.rstrip() + if "\t" in sLine: + print("Error. Tabulation at line: ", i) + break + if sLine.startswith('#END'): + printBookmark(0, "BREAK BY #END", i) + break + elif sLine.startswith("#"): + pass + elif sLine.startswith("DEF:"): + m = re.match("DEF: +([a-zA-Z_][a-zA-Z_0-9]*) +(.+)$", sLine.strip()) + if m: + dDEF["{"+m.group(1)+"}"] = m.group(2) + else: + print("Error in definition: ", end="") + print(sLine.strip()) + elif sLine.startswith("TEST:"): + lTest.append("g{:<7}".format(i) + " " + sLine[5:].strip()) + elif sLine.startswith("TODO:"): + pass + elif sLine.startswith("!!"): + m = re.search("^!!+", sLine) + nExMk = len(m.group(0)) + if sLine[nExMk:].strip(): + printBookmark(nExMk-2, sLine[nExMk:].strip(), i) + elif sLine.startswith("__") and sLine.endswith("__"): + # new rule group + m = re.match("__(\\w+)(!\\d|)__", sLine) + if m: + sRuleName = m.group(1) + nPriority = int(m.group(2)[1:]) if m.group(2) else 4 + else: + print("Error at rule group: ", sLine, " -- line:", i) + break + elif re.match("[  ]*$", sLine): + # empty line to end merging + for i, sTokenLine in lTokenLine: + lRuleLine.append((i, sRuleName, sTokenLine, sActions, nPriority)) + lTokenLine = [] + sActions = "" + sRuleName = "" + nPriority = 4 + elif sLine.startswith((" ")): + # actions + sActions += " " + sLine.strip() + else: + lTokenLine.append([i, sLine.strip()]) + + # tests + print(" list tests...") + sGCTests = "\n".join(lTest) + sGCTestsJS = '{ "aData2": ' + json.dumps(lTest, ensure_ascii=False) + " }\n" + + # processing rules + print(" preparing rules...") + lPreparedRule = [] + for i, sRuleGroup, sTokenLine, sActions, nPriority in lRuleLine: + for lRule in createRule(i, sRuleGroup, sTokenLine, sActions, nPriority): + lPreparedRule.append(lRule) + + # Graph creation + for e in lPreparedRule: + print(e) + + oDARG = darg.DARG(lPreparedRule, sLang) + oRuleGraph = oDARG.createGraph() + + # creating file with all functions callable by rules + print(" creating callables...") + sPyCallables = "# generated code, do not edit\n" + #sJSCallables = "// generated code, do not edit\nconst oEvalFunc = {\n" + for sFuncName, sReturn in lFUNCTIONS: + if sFuncName.startswith("g_c_"): # condition + sParams = "lToken, sCountry, bCondMemo" + elif sFuncName.startswith("g_m_"): # message + sParams = "lToken" + elif sFuncName.startswith("g_s_"): # suggestion + sParams = "lToken" + elif sFuncName.startswith("g_p_"): # preprocessor + sParams = "lToken" + elif sFuncName.startswith("g_d_"): # disambiguator + sParams = "lToken" + else: + print("# Unknown function type in [" + sFuncName + "]") + continue + sPyCallables += "def {} ({}):\n".format(sFuncName, sParams) + sPyCallables += " return " + sReturn + "\n" + #sJSCallables += " {}: function ({})".format(sFuncName, sParams) + " {\n" + #sJSCallables += " return " + jsconv.py2js(sReturn) + ";\n" + #sJSCallables += " },\n" + #sJSCallables += "}\n" + + # Result + d = { + "graph_callables": sPyCallables, + "graph_gctests": sGCTests, + "rules_graph": oRuleGraph, + "rules_actions": dACTIONS + } + + return d + + ADDED darg.py Index: darg.py ================================================================== --- /dev/null +++ darg.py @@ -0,0 +1,185 @@ +#!python3 + +# RULE GRAPH BUILDER +# +# by Olivier R. +# License: MPL 2 + + +import json +import time +import traceback + +from graphspell.progressbar import ProgressBar + + + +class DARG: + """DIRECT ACYCLIC RULE GRAPH""" + # This code is inspired from Steve Hanov’s DAWG, 2011. (http://stevehanov.ca/blog/index.php?id=115) + + def __init__ (self, lRule, sLangCode): + print("===== Direct Acyclic Rule Graph - Minimal Acyclic Finite State Automaton =====") + + # Preparing DARG + print(" > Preparing list of tokens") + self.sLangCode = sLangCode + self.nRule = len(lRule) + self.aPreviousRule = [] + Node.resetNextId() + self.oRoot = Node() + self.lUncheckedNodes = [] # list of nodes that have not been checked for duplication. + self.lMinimizedNodes = {} # list of unique nodes that have been checked for duplication. + self.nNode = 0 + self.nArc = 0 + + # build + lRule.sort() + oProgBar = ProgressBar(0, len(lRule)) + for aRule in lRule: + self.insert(aRule) + oProgBar.increment(1) + oProgBar.done() + self.finish() + self.countNodes() + self.countArcs() + self.displayInfo() + + # BUILD DARG + def insert (self, aRule): + if aRule < self.aPreviousRule: + sys.exit("# Error: tokens must be inserted in order.") + + # find common prefix between word and previous word + nCommonPrefix = 0 + for i in range(min(len(aRule), len(self.aPreviousRule))): + if aRule[i] != self.aPreviousRule[i]: + break + nCommonPrefix += 1 + + # Check the lUncheckedNodes for redundant nodes, proceeding from last + # one down to the common prefix size. Then truncate the list at that point. + self._minimize(nCommonPrefix) + + # add the suffix, starting from the correct node mid-way through the graph + if len(self.lUncheckedNodes) == 0: + oNode = self.oRoot + else: + oNode = self.lUncheckedNodes[-1][2] + + iToken = nCommonPrefix + for sToken in aRule[nCommonPrefix:]: + oNextNode = Node() + oNode.dArcs[sToken] = oNextNode + self.lUncheckedNodes.append((oNode, sToken, oNextNode)) + if iToken == (len(aRule) - 2): + oNode.bFinal = True + iToken += 1 + oNode = oNextNode + oNode.bFinal = True + self.aPreviousRule = aRule + + def finish (self): + "minimize unchecked nodes" + self._minimize(0) + + def _minimize (self, downTo): + # proceed from the leaf up to a certain point + for i in range( len(self.lUncheckedNodes)-1, downTo-1, -1 ): + oNode, sToken, oChildNode = self.lUncheckedNodes[i] + if oChildNode in self.lMinimizedNodes: + # replace the child with the previously encountered one + oNode.dArcs[sToken] = self.lMinimizedNodes[oChildNode] + else: + # add the state to the minimized nodes. + self.lMinimizedNodes[oChildNode] = oChildNode + self.lUncheckedNodes.pop() + + def countNodes (self): + self.nNode = len(self.lMinimizedNodes) + + def countArcs (self): + self.nArc = 0 + for oNode in self.lMinimizedNodes: + self.nArc += len(oNode.dArcs) + + def displayInfo (self): + print(" * {:<12} {:>16,}".format("Rules:", self.nRule)) + print(" * {:<12} {:>16,}".format("Nodes:", self.nNode)) + print(" * {:<12} {:>16,}".format("Arcs:", self.nArc)) + + def createGraph (self): + dGraph = { 0: self.oRoot.getNodeAsDict() } + print(0, "\t", self.oRoot.getNodeAsDict()) + for oNode in self.lMinimizedNodes: + sHashId = oNode.__hash__() + if sHashId not in dGraph: + dGraph[sHashId] = oNode.getNodeAsDict() + print(sHashId, "\t", dGraph[sHashId]) + else: + print("Error. Double node… same id: ", sHashId) + print(str(oNode.getNodeAsDict())) + return dGraph + + + +class Node: + NextId = 0 + + def __init__ (self): + self.i = Node.NextId + Node.NextId += 1 + self.bFinal = False + self.dArcs = {} # key: arc value; value: a node + + @classmethod + def resetNextId (cls): + cls.NextId = 0 + + def __str__ (self): + # Caution! this function is used for hashing and comparison! + cFinal = "1" if self.bFinal else "0" + l = [cFinal] + for (key, oNode) in self.dArcs.items(): + l.append(str(key)) + l.append(str(oNode.i)) + return "_".join(l) + + def __hash__ (self): + # Used as a key in a python dictionary. + return self.__str__().__hash__() + + def __eq__ (self, other): + # Used as a key in a python dictionary. + # Nodes are equivalent if they have identical arcs, and each identical arc leads to identical states. + return self.__str__() == other.__str__() + + def getNodeAsDict (self): + "returns the node as a dictionary structure" + dNode = {} + dReValue = {} + dReMorph = {} + dRules = {} + dLemmas = {} + for sArc, oNode in self.dArcs.items(): + if sArc.startswith("~~") and len(sArc) > 2: + dReMorph[sArc[1:]] = oNode.__hash__() + elif sArc.startswith("~") and len(sArc) > 1: + dReValue[sArc[1:]] = oNode.__hash__() + elif sArc.startswith(">") and len(sArc) > 1: + dLemmas[sArc[1:]] = oNode.__hash__() + elif sArc.startswith("##"): + dRules[sArc[1:]] = oNode.__hash__() + else: + dNode[sArc] = oNode.__hash__() + if dReValue: + dNode[""] = dReValue + if dReMorph: + dNode[""] = dReMorph + if dLemmas: + dNode[""] = dLemmas + if dRules: + dNode[""] = dRules + #if self.bFinal: + # dNode[""] = 1 + return dNode Index: gc_core/js/lang_core/gc_engine.js ================================================================== --- gc_core/js/lang_core/gc_engine.js +++ gc_core/js/lang_core/gc_engine.js @@ -37,11 +37,10 @@ // data let _sAppContext = ""; // what software is running let _dOptions = null; let _aIgnoredRules = new Set(); let _oSpellChecker = null; -let _dAnalyses = new Map(); // cache for data from dictionary var gc_engine = { //// Informations @@ -327,10 +326,11 @@ } else { _oSpellChecker = new SpellChecker("${lang}", sPath, "${dic_main_filename_js}", "${dic_extended_filename_js}", "${dic_community_filename_js}", "${dic_personal_filename_js}"); } _sAppContext = sContext; _dOptions = gc_options.getOptions(sContext).gl_shallowCopy(); // duplication necessary, to be able to reset to default + _oSpellChecker.activateStorage(); } catch (e) { helpers.logerror(e); } }, @@ -376,39 +376,30 @@ // for debugging: info of word if (!aWord) { helpers.echo("> nothing to find"); return true; } - if (!_dAnalyses.has(aWord[1]) && !_storeMorphFromFSA(aWord[1])) { - helpers.echo("> not in FSA"); + let lMorph = _oSpellChecker.getMorph(aWord[1]); + if (lMorph.length === 0) { + helpers.echo("> not in dictionary"); return true; } if (dDA.has(aWord[0])) { helpers.echo("DA: " + dDA.get(aWord[0])); } - helpers.echo("FSA: " + _dAnalyses.get(aWord[1])); + helpers.echo("FSA: " + lMorph); return true; } -function _storeMorphFromFSA (sWord) { - // retrieves morphologies list from _oSpellChecker -> _dAnalyses - //helpers.echo("register: "+sWord + " " + _oSpellChecker.getMorph(sWord).toString()) - _dAnalyses.set(sWord, _oSpellChecker.getMorph(sWord)); - return !!_dAnalyses.get(sWord); -} - function morph (dDA, aWord, sPattern, bStrict=true, bNoWord=false) { // analyse a tuple (position, word), return true if sPattern in morphologies (disambiguation on) if (!aWord) { //helpers.echo("morph: noword, returns " + bNoWord); return bNoWord; } //helpers.echo("aWord: "+aWord.toString()); - if (!_dAnalyses.has(aWord[1]) && !_storeMorphFromFSA(aWord[1])) { - return false; - } - let lMorph = dDA.has(aWord[0]) ? dDA.get(aWord[0]) : _dAnalyses.get(aWord[1]); + let lMorph = dDA.has(aWord[0]) ? dDA.get(aWord[0]) : _oSpellChecker.getMorph(aWord[1]); //helpers.echo("lMorph: "+lMorph.toString()); if (lMorph.length === 0) { return false; } //helpers.echo("***"); @@ -423,14 +414,11 @@ if (!aWord) { //helpers.echo("morph: noword, returns " + bNoWord); return bNoWord; } //helpers.echo("aWord: "+aWord.toString()); - if (!_dAnalyses.has(aWord[1]) && !_storeMorphFromFSA(aWord[1])) { - return false; - } - let lMorph = dDA.has(aWord[0]) ? dDA.get(aWord[0]) : _dAnalyses.get(aWord[1]); + let lMorph = dDA.has(aWord[0]) ? dDA.get(aWord[0]) : _oSpellChecker.getMorph(aWord[1]); //helpers.echo("lMorph: "+lMorph.toString()); if (lMorph.length === 0) { return false; } //helpers.echo("***"); @@ -442,41 +430,32 @@ return lMorph.some(s => (s.search(sPattern) !== -1)); } function analyse (sWord, sPattern, bStrict=true) { // analyse a word, return true if sPattern in morphologies (disambiguation off) - if (!_dAnalyses.has(sWord) && !_storeMorphFromFSA(sWord)) { + let lMorph = _oSpellChecker.getMorph(sWord); + if (lMorph.length === 0) { return false; } if (bStrict) { - return _dAnalyses.get(sWord).every(s => (s.search(sPattern) !== -1)); + return lMorph.every(s => (s.search(sPattern) !== -1)); } - return _dAnalyses.get(sWord).some(s => (s.search(sPattern) !== -1)); + return lMorph.some(s => (s.search(sPattern) !== -1)); } function analysex (sWord, sPattern, sNegPattern) { // analyse a word, returns True if not sNegPattern in word morphologies and sPattern in word morphologies (disambiguation off) - if (!_dAnalyses.has(sWord) && !_storeMorphFromFSA(sWord)) { + let lMorph = _oSpellChecker.getMorph(sWord); + if (lMorph.length === 0) { return false; } // check negative condition - if (_dAnalyses.get(sWord).some(s => (s.search(sNegPattern) !== -1))) { + if (lMorph.some(s => (s.search(sNegPattern) !== -1))) { return false; } // search sPattern - return _dAnalyses.get(sWord).some(s => (s.search(sPattern) !== -1)); -} - -function stem (sWord) { - // returns a list of sWord's stems - if (!sWord) { - return []; - } - if (!_dAnalyses.has(sWord) && !_storeMorphFromFSA(sWord)) { - return []; - } - return _dAnalyses.get(sWord).map( s => s.slice(1, s.indexOf(" ")) ); + return lMorph.some(s => (s.search(sPattern) !== -1)); } //// functions to get text outside pattern scope @@ -565,19 +544,17 @@ return true; } if (dDA.has(nPos)) { return true; } - if (!_dAnalyses.has(sWord) && !_storeMorphFromFSA(sWord)) { + let lMorph = _oSpellChecker.getMorph(sWord); + if (lMorph.length === 0 || lMorph.length === 1) { return true; } - if (_dAnalyses.get(sWord).length === 1) { - return true; - } - let lSelect = _dAnalyses.get(sWord).filter( sMorph => sMorph.search(sPattern) !== -1 ); + let lSelect = lMorph.filter( sMorph => sMorph.search(sPattern) !== -1 ); if (lSelect.length > 0) { - if (lSelect.length != _dAnalyses.get(sWord).length) { + if (lSelect.length != lMorph.length) { dDA.set(nPos, lSelect); } } else if (lDefault) { dDA.set(nPos, lDefaul); } @@ -589,19 +566,17 @@ return true; } if (dDA.has(nPos)) { return true; } - if (!_dAnalyses.has(sWord) && !_storeMorphFromFSA(sWord)) { + let lMorph = _oSpellChecker.getMorph(sWord); + if (lMorph.length === 0 || lMorph.length === 1) { return true; } - if (_dAnalyses.get(sWord).length === 1) { - return true; - } - let lSelect = _dAnalyses.get(sWord).filter( sMorph => sMorph.search(sPattern) === -1 ); + let lSelect = lMorph.filter( sMorph => sMorph.search(sPattern) === -1 ); if (lSelect.length > 0) { - if (lSelect.length != _dAnalyses.get(sWord).length) { + if (lSelect.length != lMorph.length) { dDA.set(nPos, lSelect); } } else if (lDefault) { dDA.set(nPos, lDefault); } Index: gc_core/py/lang_core/gc_engine.py ================================================================== --- gc_core/py/lang_core/gc_engine.py +++ gc_core/py/lang_core/gc_engine.py @@ -10,10 +10,13 @@ from ..graphspell.spellchecker import SpellChecker from ..graphspell.echo import echo from . import gc_options +from ..graphspell.tokenizer import Tokenizer +from .gc_rules_graph import dGraph, dRule + __all__ = [ "lang", "locales", "pkg", "name", "version", "author", \ "load", "parse", "getSpellChecker", \ "setOption", "setOptions", "getOptions", "getDefaultOptions", "getOptionsLabels", "resetOptions", "displayOptions", \ "ignoreRule", "resetIgnoreRules", "reactivateRule", "listRules", "displayRules" ] @@ -33,28 +36,28 @@ # data _sAppContext = "" # what software is running _dOptions = None _aIgnoredRules = set() _oSpellChecker = None -_dAnalyses = {} # cache for data from dictionary - +_oTokenizer = None #### Parsing def parse (sText, sCountry="${country_default}", bDebug=False, dOptions=None, bContext=False): "analyses the paragraph sText and returns list of errors" #sText = unicodedata.normalize("NFC", sText) aErrors = None - sAlt = sText + sRealText = sText dDA = {} # Disambiguisator. Key = position; value = list of morphologies dPriority = {} # Key = position; value = priority dOpt = _dOptions if not dOptions else dOptions + bShowRuleId = option('idrule') # parse paragraph try: - sNew, aErrors = _proofread(sText, sAlt, 0, True, dDA, dPriority, sCountry, dOpt, bDebug, bContext) + sNew, aErrors = _proofread(sText, sRealText, 0, True, dDA, dPriority, sCountry, dOpt, bShowRuleId, bDebug, bContext) if sNew: sText = sNew except: raise @@ -71,11 +74,16 @@ # parse sentences for iStart, iEnd in _getSentenceBoundaries(sText): if 4 < (iEnd - iStart) < 2000: dDA.clear() try: - _, errs = _proofread(sText[iStart:iEnd], sAlt[iStart:iEnd], iStart, False, dDA, dPriority, sCountry, dOpt, bDebug, bContext) + # regex parser + _, errs = _proofread(sText[iStart:iEnd], sRealText[iStart:iEnd], iStart, False, dDA, dPriority, sCountry, dOpt, bShowRuleId, bDebug, bContext) + aErrors.update(errs) + # token parser + oSentence = TokenSentence(sText[iStart:iEnd], sRealText[iStart:iEnd], iStart) + _, errs = oSentence.parse(dPriority, sCountry, dOpt, bShowRuleId, bDebug, bContext) aErrors.update(errs) except: raise return aErrors.values() # this is a view (iterable) @@ -85,15 +93,13 @@ for m in _zEndOfSentence.finditer(sText): yield (iStart, m.end()) iStart = m.end() -def _proofread (s, sx, nOffset, bParagraph, dDA, dPriority, sCountry, dOptions, bDebug, bContext): +def _proofread (s, sx, nOffset, bParagraph, dDA, dPriority, sCountry, dOptions, bShowRuleId, bDebug, bContext): dErrs = {} bChange = False - bIdRule = option('idrule') - for sOption, lRuleGroup in _getRules(bParagraph): if not sOption or dOptions.get(sOption, False): for zRegex, bUppercase, sLineId, sRuleId, nPriority, lActions in lRuleGroup: if sRuleId not in _aIgnoredRules: for m in zRegex.finditer(s): @@ -105,11 +111,11 @@ if bCondMemo: if cActionType == "-": # grammar error nErrorStart = nOffset + m.start(eAct[0]) if nErrorStart not in dErrs or nPriority > dPriority[nErrorStart]: - dErrs[nErrorStart] = _createError(s, sx, sWhat, nOffset, m, eAct[0], sLineId, sRuleId, bUppercase, eAct[1], eAct[2], bIdRule, sOption, bContext) + dErrs[nErrorStart] = _createRegexError(s, sx, sWhat, nOffset, m, eAct[0], sLineId, sRuleId, bUppercase, eAct[1], eAct[2], bShowRuleId, sOption, bContext) dPriority[nErrorStart] = nPriority elif cActionType == "~": # text processor s = _rewrite(s, sWhat, eAct[0], m, bUppercase) bChange = True @@ -132,11 +138,11 @@ if bChange: return (s, dErrs) return (False, dErrs) -def _createWriterError (s, sx, sRepl, nOffset, m, iGroup, sLineId, sRuleId, bUppercase, sMsg, sURL, bIdRule, sOption, bContext): +def _createRegexWriterError (s, sx, sRepl, nOffset, m, iGroup, sLineId, sRuleId, bUppercase, sMsg, sURL, bShowRuleId, sOption, bContext): "error for Writer (LO/OO)" xErr = SingleProofreadingError() #xErr = uno.createUnoStruct( "com.sun.star.linguistic2.SingleProofreadingError" ) xErr.nErrorStart = nOffset + m.start(iGroup) xErr.nErrorLength = m.end(iGroup) - m.start(iGroup) @@ -164,11 +170,11 @@ sMessage = globals()[sMsg[1:]](s, m) else: sMessage = m.expand(sMsg) xErr.aShortComment = sMessage # sMessage.split("|")[0] # in context menu xErr.aFullComment = sMessage # sMessage.split("|")[-1] # in dialog - if bIdRule: + if bShowRuleId: xErr.aShortComment += " # " + sLineId + " # " + sRuleId # URL if sURL: p = PropertyValue() p.Name = "FullCommentURL" @@ -177,11 +183,11 @@ else: xErr.aProperties = () return xErr -def _createDictError (s, sx, sRepl, nOffset, m, iGroup, sLineId, sRuleId, bUppercase, sMsg, sURL, bIdRule, sOption, bContext): +def _createRegexDictError (s, sx, sRepl, nOffset, m, iGroup, sLineId, sRuleId, bUppercase, sMsg, sURL, bShowRuleId, sOption, bContext): "error as a dictionary" dErr = {} dErr["nStart"] = nOffset + m.start(iGroup) dErr["nEnd"] = nOffset + m.end(iGroup) dErr["sLineId"] = sLineId @@ -194,13 +200,13 @@ if bUppercase and m.group(iGroup)[0:1].isupper(): dErr["aSuggestions"] = list(map(str.capitalize, sugg.split("|"))) else: dErr["aSuggestions"] = sugg.split("|") else: - dErr["aSuggestions"] = () + dErr["aSuggestions"] = [] elif sRepl == "_": - dErr["aSuggestions"] = () + dErr["aSuggestions"] = [] else: if bUppercase and m.group(iGroup)[0:1].isupper(): dErr["aSuggestions"] = list(map(str.capitalize, m.expand(sRepl).split("|"))) else: dErr["aSuggestions"] = m.expand(sRepl).split("|") @@ -208,11 +214,11 @@ if sMsg[0:1] == "=": sMessage = globals()[sMsg[1:]](s, m) else: sMessage = m.expand(sMsg) dErr["sMessage"] = sMessage - if bIdRule: + if bShowRuleId: dErr["sMessage"] += " # " + sLineId + " # " + sRuleId # URL dErr["URL"] = sURL if sURL else "" # Context if bContext: @@ -219,10 +225,98 @@ dErr['sUnderlined'] = sx[m.start(iGroup):m.end(iGroup)] dErr['sBefore'] = sx[max(0,m.start(iGroup)-80):m.start(iGroup)] dErr['sAfter'] = sx[m.end(iGroup):m.end(iGroup)+80] return dErr + +def _createTokenWriterError (lToken, sSentence, sSentence0, sRepl, iFirstToken, nStart, nEnd, sLineId, sRuleId, bUppercase, sMsg, sURL, bShowRuleId, sOption, bContext): + "error for Writer (LO/OO)" + xErr = SingleProofreadingError() + #xErr = uno.createUnoStruct( "com.sun.star.linguistic2.SingleProofreadingError" ) + xErr.nErrorStart = nStart + xErr.nErrorLength = nEnd - nStart + xErr.nErrorType = PROOFREADING + xErr.aRuleIdentifier = sRuleId + # suggestions + if sRepl[0:1] == "=": + sSugg = globals()[sRepl[1:]](lToken) + if sSugg: + if bUppercase and lToken[iFirstToken]["sValue"][0:1].isupper(): + xErr.aSuggestions = tuple(map(str.capitalize, sSugg.split("|"))) + else: + xErr.aSuggestions = tuple(sSugg.split("|")) + else: + xErr.aSuggestions = () + elif sRepl == "_": + xErr.aSuggestions = () + else: + if bUppercase and lToken[iFirstToken]["sValue"][0:1].isupper(): + xErr.aSuggestions = tuple(map(str.capitalize, sRepl.split("|"))) + else: + xErr.aSuggestions = tuple(sRepl.split("|")) + # Message + if sMsg[0:1] == "=": + sMessage = globals()[sMsg[1:]](lToken) + else: + sMessage = sMsg + xErr.aShortComment = sMessage # sMessage.split("|")[0] # in context menu + xErr.aFullComment = sMessage # sMessage.split("|")[-1] # in dialog + if bShowRuleId: + xErr.aShortComment += " " + sLineId + " # " + sRuleId + # URL + if sURL: + p = PropertyValue() + p.Name = "FullCommentURL" + p.Value = sURL + xErr.aProperties = (p,) + else: + xErr.aProperties = () + return xErr + + +def _createTokenDictError (lToken, sSentence, sSentence0, sRepl, iFirstToken, nStart, nEnd, sLineId, sRuleId, bUppercase, sMsg, sURL, bShowRuleId, sOption, bContext): + "error as a dictionary" + dErr = {} + dErr["nStart"] = nStart + dErr["nEnd"] = nEnd + dErr["sLineId"] = sLineId + dErr["sRuleId"] = sRuleId + dErr["sType"] = sOption if sOption else "notype" + # suggestions + if sRepl[0:1] == "=": + sugg = globals()[sRepl[1:]](lToken) + if sugg: + if bUppercase and lToken[iFirstToken]["sValue"][0:1].isupper(): + dErr["aSuggestions"] = list(map(str.capitalize, sugg.split("|"))) + else: + dErr["aSuggestions"] = sugg.split("|") + else: + dErr["aSuggestions"] = [] + elif sRepl == "_": + dErr["aSuggestions"] = [] + else: + if bUppercase and lToken[iFirstToken]["sValue"][0:1].isupper(): + dErr["aSuggestions"] = list(map(str.capitalize, sRepl.split("|"))) + else: + dErr["aSuggestions"] = sRepl.split("|") + # Message + if sMsg[0:1] == "=": + sMessage = globals()[sMsg[1:]](lToken) + else: + sMessage = sMsg + dErr["sMessage"] = sMessage + if bShowRuleId: + dErr["sMessage"] += " " + sLineId + " # " + sRuleId + # URL + dErr["URL"] = sURL if sURL else "" + # Context + if bContext: + dErr['sUnderlined'] = sSentence0[dErr["nStart"]:dErr["nEnd"]] + dErr['sBefore'] = sSentence0[max(0,dErr["nStart"]-80):dErr["nStart"]] + dErr['sAfter'] = sSentence0[dErr["nEnd"]:dErr["nEnd"]+80] + return dErr + def _rewrite (s, sRepl, iGroup, m, bUppercase): "text processor: write sRepl in s at iGroup position" nLen = m.end(iGroup) - m.start(iGroup) if sRepl == "*": @@ -280,23 +374,28 @@ # LibreOffice / OpenOffice from com.sun.star.linguistic2 import SingleProofreadingError from com.sun.star.text.TextMarkupType import PROOFREADING from com.sun.star.beans import PropertyValue #import lightproof_handler_${implname} as opt - _createError = _createWriterError + _createRegexError = _createRegexWriterError + _createTokenError = _createTokenWriterError except ImportError: - _createError = _createDictError + _createRegexError = _createRegexDictError + _createTokenError = _createTokenDictError def load (sContext="Python"): global _oSpellChecker global _sAppContext global _dOptions + global _oTokenizer try: _oSpellChecker = SpellChecker("${lang}", "${dic_main_filename_py}", "${dic_extended_filename_py}", "${dic_community_filename_py}", "${dic_personal_filename_py}") _sAppContext = sContext _dOptions = dict(gc_options.getOptions(sContext)) # duplication necessary, to be able to reset to default + _oTokenizer = _oSpellChecker.getTokenizer() + _oSpellChecker.activateStorage() except: traceback.print_exc() def setOption (sOpt, bVal): @@ -369,15 +468,15 @@ #### common functions # common regexes -_zEndOfSentence = re.compile('([.?!:;…][ .?!… »”")]*|.$)') -_zBeginOfParagraph = re.compile("^\W*") -_zEndOfParagraph = re.compile("\W*$") -_zNextWord = re.compile(" +(\w[\w-]*)") -_zPrevWord = re.compile("(\w[\w-]*) +$") +_zEndOfSentence = re.compile(r'([.?!:;…][ .?!… »”")]*|.$)') +_zBeginOfParagraph = re.compile(r"^\W*") +_zEndOfParagraph = re.compile(r"\W*$") +_zNextWord = re.compile(r" +(\w[\w-]*)") +_zPrevWord = re.compile(r"(\w[\w-]*) +$") def option (sOpt): "return True if option sOpt is active" return _dOptions.get(sOpt, False) @@ -386,33 +485,25 @@ def displayInfo (dDA, tWord): "for debugging: retrieve info of word" if not tWord: echo("> nothing to find") return True - if tWord[1] not in _dAnalyses and not _storeMorphFromFSA(tWord[1]): - echo("> not in FSA") + lMorph = _oSpellChecker.getMorph(tWord[1]) + if not lMorph: + echo("> not in dictionary") return True if tWord[0] in dDA: echo("DA: " + str(dDA[tWord[0]])) - echo("FSA: " + str(_dAnalyses[tWord[1]])) + echo("FSA: " + str(lMorph)) return True - -def _storeMorphFromFSA (sWord): - "retrieves morphologies list from _oSpellChecker -> _dAnalyses" - global _dAnalyses - _dAnalyses[sWord] = _oSpellChecker.getMorph(sWord) - return True if _dAnalyses[sWord] else False - def morph (dDA, tWord, sPattern, bStrict=True, bNoWord=False): "analyse a tuple (position, word), return True if sPattern in morphologies (disambiguation on)" if not tWord: return bNoWord - if tWord[1] not in _dAnalyses and not _storeMorphFromFSA(tWord[1]): - return False - lMorph = dDA[tWord[0]] if tWord[0] in dDA else _dAnalyses[tWord[1]] + lMorph = dDA[tWord[0]] if tWord[0] in dDA else _oSpellChecker.getMorph(tWord[1]) if not lMorph: return False p = re.compile(sPattern) if bStrict: return all(p.search(s) for s in lMorph) @@ -421,13 +512,13 @@ def morphex (dDA, tWord, sPattern, sNegPattern, bNoWord=False): "analyse a tuple (position, word), returns True if not sNegPattern in word morphologies and sPattern in word morphologies (disambiguation on)" if not tWord: return bNoWord - if tWord[1] not in _dAnalyses and not _storeMorphFromFSA(tWord[1]): + lMorph = dDA[tWord[0]] if tWord[0] in dDA else _oSpellChecker.getMorph(tWord[1]) + if not lMorph: return False - lMorph = dDA[tWord[0]] if tWord[0] in dDA else _dAnalyses[tWord[1]] # check negative condition np = re.compile(sNegPattern) if any(np.search(s) for s in lMorph): return False # search sPattern @@ -435,40 +526,32 @@ return any(p.search(s) for s in lMorph) def analyse (sWord, sPattern, bStrict=True): "analyse a word, return True if sPattern in morphologies (disambiguation off)" - if sWord not in _dAnalyses and not _storeMorphFromFSA(sWord): - return False - if not _dAnalyses[sWord]: + lMorph = _oSpellChecker.getMorph(sWord) + if not lMorph: return False p = re.compile(sPattern) if bStrict: - return all(p.search(s) for s in _dAnalyses[sWord]) - return any(p.search(s) for s in _dAnalyses[sWord]) + return all(p.search(s) for s in lMorph) + return any(p.search(s) for s in lMorph) def analysex (sWord, sPattern, sNegPattern): "analyse a word, returns True if not sNegPattern in word morphologies and sPattern in word morphologies (disambiguation off)" - if sWord not in _dAnalyses and not _storeMorphFromFSA(sWord): + lMorph = _oSpellChecker.getMorph(sWord) + if not lMorph: return False # check negative condition np = re.compile(sNegPattern) - if any(np.search(s) for s in _dAnalyses[sWord]): + if any(np.search(s) for s in lMorph): return False # search sPattern p = re.compile(sPattern) - return any(p.search(s) for s in _dAnalyses[sWord]) - - -def stem (sWord): - "returns a list of sWord's stems" - if not sWord: - return [] - if sWord not in _dAnalyses and not _storeMorphFromFSA(sWord): - return [] - return [ s[1:s.find(" ")] for s in _dAnalyses[sWord] ] + return any(p.search(s) for s in lMorph) + ## functions to get text outside pattern scope # warning: check compile_rules.py to understand how it works @@ -534,52 +617,302 @@ def select (dDA, nPos, sWord, sPattern, lDefault=None): if not sWord: return True if nPos in dDA: return True - if sWord not in _dAnalyses and not _storeMorphFromFSA(sWord): + lMorph = _oSpellChecker.getMorph(sWord) + if not lMorph or len(lMorph) == 1: return True - if len(_dAnalyses[sWord]) == 1: - return True - lSelect = [ sMorph for sMorph in _dAnalyses[sWord] if re.search(sPattern, sMorph) ] + lSelect = [ sMorph for sMorph in lMorph if re.search(sPattern, sMorph) ] if lSelect: - if len(lSelect) != len(_dAnalyses[sWord]): + if len(lSelect) != len(lMorph): dDA[nPos] = lSelect - #echo("= "+sWord+" "+str(dDA.get(nPos, "null"))) elif lDefault: dDA[nPos] = lDefault - #echo("= "+sWord+" "+str(dDA.get(nPos, "null"))) return True def exclude (dDA, nPos, sWord, sPattern, lDefault=None): if not sWord: return True if nPos in dDA: return True - if sWord not in _dAnalyses and not _storeMorphFromFSA(sWord): - return True - if len(_dAnalyses[sWord]) == 1: - return True - lSelect = [ sMorph for sMorph in _dAnalyses[sWord] if not re.search(sPattern, sMorph) ] - if lSelect: - if len(lSelect) != len(_dAnalyses[sWord]): - dDA[nPos] = lSelect - #echo("= "+sWord+" "+str(dDA.get(nPos, "null"))) + lMorph = _oSpellChecker.getMorph(sWord) + if not lMorph or len(lMorph) == 1: + return True + lSelect = [ sMorph for sMorph in lMorph if not re.search(sPattern, sMorph) ] + if lSelect: + if len(lSelect) != len(lMorph): + dDA[nPos] = lSelect elif lDefault: dDA[nPos] = lDefault - #echo("= "+sWord+" "+str(dDA.get(nPos, "null"))) return True def define (dDA, nPos, lMorph): dDA[nPos] = lMorph - #echo("= "+str(nPos)+" "+str(dDA[nPos])) return True #### GRAMMAR CHECKER PLUGINS ${plugins} +#### CALLABLES (generated code) + ${callables} + + + +#### TOKEN SENTENCE CHECKER + +class TokenSentence: + + def __init__ (self, sSentence, sSentence0, iStart): + self.sSentence = sSentence + self.sSentence0 = sSentence0 + self.iStart = iStart + self.lToken = list(_oTokenizer.genTokens(sSentence, True)) + + def _getNextMatchingNodes (self, dToken, dNode): + "generator: return nodes where “values” match arcs" + # token value + if dToken["sValue"] in dNode: + #print("value found: ", dToken["sValue"]) + yield dGraph[dNode[dToken["sValue"]]] + # token lemmas + if "" in dNode: + for sLemma in _oSpellChecker.getLemma(dToken["sValue"]): + if sLemma in dNode[""]: + #print("lemma found: ", sLemma) + yield dGraph[dNode[""][sLemma]] + # universal arc + if "*" in dNode: + #print("generic arc") + yield dGraph[dNode["*"]] + # regex value arcs + if "" in dNode: + for sRegex in dNode[""]: + if re.search(sRegex, dToken["sValue"]): + #print("value regex matching: ", sRegex) + yield dGraph[dNode[""][sRegex]] + # regex morph arcs + if "" in dNode: + for sRegex in dNode[""]: + for sMorph in _oSpellChecker.getMorph(dToken["sValue"]): + if re.search(sRegex, sMorph): + #print("morph regex matching: ", sRegex) + yield dGraph[dNode[""][sRegex]] + + def parse (self, dPriority, sCountry="${country_default}", dOptions=None, bShowRuleId=False, bDebug=False, bContext=False): + dErr = {} + dPriority = {} # Key = position; value = priority + dOpt = _dOptions if not dOptions else dOptions + lPointer = [] + bChange = False + for dToken in self.lToken: + # check arcs for each existing pointer + lNewPointer = [] + for i, dPointer in enumerate(lPointer): + bValid = False + bFirst = True + for dNode in self._getNextMatchingNodes(dToken, dPointer["dNode"]): + if bFirst: + dPointer["dNode"] = dNode + else: + lNewPointer.append({"nOffset": dPointer["nOffset"], "dNode": dNode}) + bFirst = False + bValid = True + if not bValid: + del lPointer[i] + lPointer.extend(lNewPointer) + # check arcs of first nodes + for dNode in self._getNextMatchingNodes(dToken, dGraph[0]): + lPointer.append({"nOffset": dToken["i"], "dNode": dNode}) + # check if there is rules to check for each pointer + for dPointer in lPointer: + if "" in dPointer["dNode"]: + bHasChanged, errs = self._executeActions(dPointer["dNode"][""], dPointer["nOffset"], dPriority, dOpt, bShowRuleId, bContext) + dErr.update(errs) + if bHasChanged: + bChange = True + if dErr: + print(dErr) + return (bChange, dErr) + + def _executeActions (self, dNode, nTokenOffset, dPriority, dOpt, bShowRuleId, bContext): + #print(locals()) + dErrs = {} + bChange = False + for sLineId, nextNodeKey in dNode.items(): + for sRuleId in dGraph[nextNodeKey]: + print(sRuleId) + bCondMemo = None + sFuncCond, cActionType, sWhat, *eAct = dRule[sRuleId] + # action in lActions: [ condition, action type, replacement/suggestion/action[, iTokenStart, iTokenEnd[, nPriority, message, URL]] ] + try: + bCondMemo = not sFuncCond or globals()[sFuncCond](self, sCountry, bCondMemo) + if bCondMemo: + if cActionType == "-": + # grammar error + print("-") + nTokenErrorStart = nTokenOffset + eAct[0] + nTokenErrorEnd = nTokenOffset + eAct[1] + nErrorStart = self.iStart + self.lToken[nTokenErrorStart]["nStart"] + nErrorEnd = self.iStart + self.lToken[nTokenErrorEnd]["nEnd"] + if nErrorStart not in dErrs or eAct[2] > dPriority[nErrorStart]: + dErrs[nErrorStart] = _createTokenError(self.lToken, self.sSentence, self.sSentence0, sWhat, nTokenErrorStart, nErrorStart, nErrorEnd, sLineId, sRuleId, True, eAct[3], eAct[4], bShowRuleId, "notype", bContext) + dPriority[nErrorStart] = eAct[2] + elif cActionType == "~": + # text processor + print("~") + self._rewrite(sWhat, nErrorStart, nErrorEnd) + elif cActionType == "@": + # jump + print("@") + self._jump(sWhat) + elif cActionType == "=": + # disambiguation + print("=") + globals()[sWhat](self.lToken) + elif cActionType == ">": + # we do nothing, this test is just a condition to apply all following actions + print(">") + pass + else: + print("# error: unknown action at " + sLineId) + elif cActionType == ">": + break + except Exception as e: + raise Exception(str(e), sLineId) + return bChange, dErrs + + def _rewrite (self, sWhat, nErrorStart, nErrorEnd): + "text processor: rewrite tokens between and position" + lTokenValue = sWhat.split("|") + if len(lTokenValue) != (nErrorEnd - nErrorStart + 1): + print("Error. Text processor: number of replacements != number of tokens.") + return + for i, sValue in zip(range(nErrorStart, nErrorEnd+1), lTokenValue): + self.lToken[i]["sValue"] = sValue + + def _jump (self, sWhat): + try: + nFrom, nTo = sWhat.split(">") + self.lToken[int(nFrom)]["iJump"] = int(nTo) + except: + print("# Error. Jump failed: ", sWhat) + traceback.print_exc() + return + + +#### Analyse tokens + +def g_morph (dToken, sPattern, bStrict=True): + "analyse a token, return True if in morphologies" + if "lMorph" in dToken: + lMorph = dToken["lMorph"] + else: + lMorph = _oSpellChecker.getMorph(dToken["sValue"]) + if not lMorph: + return False + zPattern = re.compile(sPattern) + if bStrict: + return all(zPattern.search(sMorph) for sMorph in lMorph) + return any(zPattern.search(sMorph) for sMorph in lMorph) + +def g_morphex (dToken, sPattern, sNegPattern): + "analyse a token, return True if not in morphologies and in morphologies" + if "lMorph" in dToken: + lMorph = dToken["lMorph"] + else: + lMorph = _oSpellChecker.getMorph(dToken["sValue"]) + if not lMorph: + return False + # check negative condition + zNegPattern = re.compile(sNegPattern) + if any(zNegPattern.search(sMorph) for sMorph in lMorph): + return False + # search sPattern + zPattern = re.compile(sPattern) + return any(zPattern.search(sMorph) for sMorph in lMorph) + +def g_analyse (dToken, sPattern, bStrict=True): + "analyse a token, return True if in morphologies (disambiguation off)" + lMorph = _oSpellChecker.getMorph(dToken["sValue"]) + if not lMorph: + return False + zPattern = re.compile(sPattern) + if bStrict: + return all(zPattern.search(sMorph) for sMorph in lMorph) + return any(zPattern.search(sMorph) for sMorph in lMorph) + + +def g_analysex (dToken, sPattern, sNegPattern): + "analyse a token, return True if not in morphologies and in morphologies (disambiguation off)" + lMorph = _oSpellChecker.getMorph(dToken["sValue"]) + if not lMorph: + return False + # check negative condition + zNegPattern = re.compile(sNegPattern) + if any(zNegPattern.search(sMorph) for sMorph in lMorph): + return False + # search sPattern + zPattern = re.compile(sPattern) + return any(zPattern.search(sMorph) for sMorph in lMorph) + + +#### Go outside the rule scope + +def g_nextToken (i): + pass + +def g_prevToken (i): + pass + +def g_look (): + pass + +def g_lookAndCheck (): + pass + + +#### Disambiguator + +def g_select (dToken, sPattern, lDefault=None): + "select morphologies for according to , always return True" + lMorph = dToken["lMorph"] if "lMorph" in dToken else _oSpellChecker.getMorph(dToken["sValue"]) + if not lMorph or len(lMorph) == 1: + return True + lSelect = [ sMorph for sMorph in lMorph if re.search(sPattern, sMorph) ] + if lSelect: + if len(lSelect) != len(lMorph): + dToken["lMorph"] = lSelect + elif lDefault: + dToken["lMorph"] = lDefault + return True + + +def g_exclude (dToken, sPattern, lDefault=None): + "select morphologies for according to , always return True" + lMorph = dToken["lMorph"] if "lMorph" in dToken else _oSpellChecker.getMorph(dToken["sValue"]) + if not lMorph or len(lMorph) == 1: + return True + lSelect = [ sMorph for sMorph in lMorph if not re.search(sPattern, sMorph) ] + if lSelect: + if len(lSelect) != len(lMorph): + dToken["lMorph"] = lSelect + elif lDefault: + dToken["lMorph"] = lDefault + return True + + +def g_define (dToken, lMorph): + "set morphologies of , always return True" + dToken["lMorph"] = lMorph + return True + + +#### CALLABLES (generated code) + +${graph_callables} ADDED gc_core/py/lang_core/gc_rules_graph.py Index: gc_core/py/lang_core/gc_rules_graph.py ================================================================== --- /dev/null +++ gc_core/py/lang_core/gc_rules_graph.py @@ -0,0 +1,5 @@ +# generated code, do not edit + +dGraph = ${rules_graph} + +dRule = ${rules_actions} ADDED gc_core/py/lang_core/gc_sentence.py Index: gc_core/py/lang_core/gc_sentence.py ================================================================== --- /dev/null +++ gc_core/py/lang_core/gc_sentence.py @@ -0,0 +1,237 @@ +# Sentence checker + +from ..graphspell.tokenizer import Tokenizer +from .gc_rules_graph import dGraph + + +oTokenizer = Tokenizer("${lang}") + + +class TokenSentence: + + def __init__ (self, sSentence, sSentence0, nOffset): + self.sSentence = sSentence + self.sSentence0 = sSentence0 + self.nOffset = nOffset + self.lToken = list(oTokenizer.genTokens()) + + def parse (self): + dErr = {} + lPointer = [] + for dToken in self.lToken: + for i, dPointer in enumerate(lPointer): + bValid = False + for dNode in self._getNextMatchingNodes(dToken, dPointer["dNode"]): + dPointer["nOffset"] = dToken["i"] + dPointer["dNode"] = dNode + bValid = True + if not bValid: + del lPointer[i] + for dNode in self._getNextMatchingNodes(dToken, dGraph): + lPointer.append({"nOffset": 0, "dNode": dNode}) + for dPointer in lPointer: + if "" in dPointer["dNode"]: + for dNode in dGraph[dPointer["dNode"][""]]: + dErr = self._executeActions(dNode, nOffset) + return dErr + + def _getNextMatchingNodes (self, dToken, dNode): + # token value + if dToken["sValue"] in dNode: + yield dGraph[dNode[dToken["sValue"]]] + # token lemmas + for sLemma in dToken["lLemma"]: + if sLemma in dNode: + yield dGraph[dNode[sLemma]] + # universal arc + if "*" in dNode: + yield dGraph[dNode["*"]] + # regex arcs + if "~" in dNode: + for sRegex in dNode["~"]: + for sMorph in dToken["lMorph"]: + if re.search(sRegex, sMorph): + yield dGraph[dNode["~"][sRegex]] + + def _executeActions (self, dNode, nOffset): + for sLineId, nextNodeKey in dNode.items(): + for sArc in dGraph[nextNodeKey]: + bCondMemo = None + sFuncCond, cActionType, sWhat, *eAct = dRule[sArc] + # action in lActions: [ condition, action type, replacement/suggestion/action[, iGroupStart, iGroupEnd[, message, URL]] ] + try: + bCondMemo = not sFuncCond or globals()[sFuncCond](self, sCountry, bCondMemo) + if bCondMemo: + if cActionType == "-": + # grammar error + nErrorStart = nSentenceOffset + m.start(eAct[0]) + nErrorEnd = nSentenceOffset + m.start(eAct[1]) + if nErrorStart not in dErrs or nPriority > dPriority[nErrorStart]: + dErrs[nErrorStart] = _createError(self, sWhat, nErrorStart, nErrorEnd, sLineId, bUppercase, eAct[2], eAct[3], bIdRule, sOption, bContext) + dPriority[nErrorStart] = nPriority + elif cActionType == "~": + # text processor + self._rewrite(sWhat, nErrorStart, nErrorEnd) + elif cActionType == "@": + # jump + self._jump(sWhat) + elif cActionType == "=": + # disambiguation + globals()[sWhat](self.lToken) + elif cActionType == ">": + # we do nothing, this test is just a condition to apply all following actions + pass + else: + print("# error: unknown action at " + sLineId) + elif cActionType == ">": + break + except Exception as e: + raise Exception(str(e), "# " + sLineId + " # " + sRuleId) + + def _createWriterError (self): + d = {} + return d + + def _createDictError (self): + d = {} + return d + + def _rewrite (self, sWhat, nErrorStart, nErrorEnd): + "text processor: rewrite tokens between and position" + lTokenValue = sWhat.split("|") + if len(lTokenValue) != (nErrorEnd - nErrorStart + 1): + print("Error. Text processor: number of replacements != number of tokens.") + return + for i, sValue in zip(range(nErrorStart, nErrorEnd+1), lTokenValue): + self.lToken[i]["sValue"] = sValue + + def _jump (self, sWhat): + try: + nFrom, nTo = sWhat.split(">") + self.lToken[int(nFrom)]["iJump"] = int(nTo) + except: + print("# Error. Jump failed: ", sWhat) + traceback.print_exc() + return + + +#### Analyse tokens + +def g_morph (dToken, sPattern, bStrict=True): + "analyse a token, return True if in morphologies" + if "lMorph" in dToken: + lMorph = dToken["lMorph"] + else: + if dToken["sValue"] not in _dAnalyses and not _storeMorphFromFSA(dToken["sValue"]): + return False + if not _dAnalyses[dToken["sValue"]]: + return False + lMorph = _dAnalyses[dToken["sValue"]] + zPattern = re.compile(sPattern) + if bStrict: + return all(zPattern.search(sMorph) for sMorph in lMorph) + return any(zPattern.search(sMorph) for sMorph in lMorph) + +def g_morphex (dToken, sPattern, sNegPattern): + "analyse a token, return True if not in morphologies and in morphologies" + if "lMorph" in dToken: + lMorph = dToken["lMorph"] + else: + if dToken["sValue"] not in _dAnalyses and not _storeMorphFromFSA(dToken["sValue"]): + return False + if not _dAnalyses[dToken["sValue"]]: + return False + lMorph = _dAnalyses[dToken["sValue"]] + # check negative condition + zNegPattern = re.compile(sNegPattern) + if any(zNegPattern.search(sMorph) for sMorph in lMorph): + return False + # search sPattern + zPattern = re.compile(sPattern) + return any(zPattern.search(sMorph) for sMorph in lMorph) + +def g_analyse (dToken, sPattern, bStrict=True): + "analyse a token, return True if in morphologies (disambiguation off)" + if dToken["sValue"] not in _dAnalyses and not _storeMorphFromFSA(dToken["sValue"]): + return False + if not _dAnalyses[dToken["sValue"]]: + return False + zPattern = re.compile(sPattern) + if bStrict: + return all(zPattern.search(sMorph) for sMorph in _dAnalyses[dToken["sValue"]]) + return any(zPattern.search(sMorph) for sMorph in _dAnalyses[dToken["sValue"]]) + + +def g_analysex (dToken, sPattern, sNegPattern): + "analyse a token, return True if not in morphologies and in morphologies (disambiguation off)" + if dToken["sValue"] not in _dAnalyses and not _storeMorphFromFSA(dToken["sValue"]): + return False + if not _dAnalyses[dToken["sValue"]]: + return False + # check negative condition + zNegPattern = re.compile(sNegPattern) + if any(zNegPattern.search(sMorph) for sMorph in _dAnalyses[dToken["sValue"]]): + return False + # search sPattern + zPattern = re.compile(sPattern) + return any(zPattern.search(sMorph) for sMorph in _dAnalyses[dToken["sValue"]]) + + +#### Go outside the rule scope + +def g_nextToken (i): + pass + +def g_prevToken (i): + pass + +def g_look (): + pass + +def g_lookAndCheck (): + pass + + +#### Disambiguator + +def g_select (dToken, sPattern, lDefault=None): + "select morphologies for according to , always return True" + if dToken["sValue"] not in _dAnalyses and not _storeMorphFromFSA(dToken["sValue"]): + return True + if len(_dAnalyses[dToken["sValue"]]) == 1: + return True + lMorph = dToken["lMorph"] or _dAnalyses[dToken["sValue"]] + lSelect = [ sMorph for sMorph in lMorph if re.search(sPattern, sMorph) ] + if lSelect: + if len(lSelect) != len(lMorph): + dToken["lMorph"] = lSelect + elif lDefault: + dToken["lMorph"] = lDefault + return True + + +def g_exclude (dToken, sPattern, lDefault=None): + "select morphologies for according to , always return True" + if dToken["sValue"] not in _dAnalyses and not _storeMorphFromFSA(dToken["sValue"]): + return True + if len(_dAnalyses[dToken["sValue"]]) == 1: + return True + lMorph = dToken["lMorph"] or _dAnalyses[dToken["sValue"]] + lSelect = [ sMorph for sMorph in lMorph if not re.search(sPattern, sMorph) ] + if lSelect: + if len(lSelect) != len(lMorph): + dToken["lMorph"] = lSelect + elif lDefault: + dToken["lMorph"] = lDefault + return True + + +def g_define (dToken, lMorph): + "set morphologies of , always return True" + dToken["lMorph"] = lMorph + return True + + +#### CALLABLES (generated code) + +${graph_callables} Index: gc_lang/fr/modules-js/gce_analyseur.js ================================================================== --- gc_lang/fr/modules-js/gce_analyseur.js +++ gc_lang/fr/modules-js/gce_analyseur.js @@ -20,12 +20,11 @@ } if (s2 == "eux") { return "ils"; } if (s2 == "elle" || s2 == "elles") { - // We don’t check if word exists in _dAnalyses, for it is assumed it has been done before - if (cregex.mbNprMasNotFem(_dAnalyses.gl_get(s1, ""))) { + if (cregex.mbNprMasNotFem(_oSpellChecker.getMorph(s1))) { return "ils"; } // si épicène, indéterminable, mais OSEF, le féminin l’emporte return "elles"; } @@ -32,41 +31,40 @@ return s1 + " et " + s2; } function apposition (sWord1, sWord2) { // returns true if nom + nom (no agreement required) - // We don’t check if word exists in _dAnalyses, for it is assumed it has been done before - return cregex.mbNomNotAdj(_dAnalyses.gl_get(sWord2, "")) && cregex.mbPpasNomNotAdj(_dAnalyses.gl_get(sWord1, "")); + return cregex.mbNomNotAdj(_oSpellChecker.getMorph(sWord2)) && cregex.mbPpasNomNotAdj(_oSpellChecker.getMorph(sWord1)); } function isAmbiguousNAV (sWord) { // words which are nom|adj and verb are ambiguous (except être and avoir) - if (!_dAnalyses.has(sWord) && !_storeMorphFromFSA(sWord)) { + let lMorph = _oSpellChecker.getMorph(sWord); + if (lMorph.length === 0) { return false; } - if (!cregex.mbNomAdj(_dAnalyses.gl_get(sWord, "")) || sWord == "est") { + if (!cregex.mbNomAdj(lMorph) || sWord == "est") { return false; } - if (cregex.mbVconj(_dAnalyses.gl_get(sWord, "")) && !cregex.mbMG(_dAnalyses.gl_get(sWord, ""))) { + if (cregex.mbVconj(lMorph) && !cregex.mbMG(lMorph)) { return true; } return false; } function isAmbiguousAndWrong (sWord1, sWord2, sReqMorphNA, sReqMorphConj) { //// use it if sWord1 won’t be a verb; word2 is assumed to be true via isAmbiguousNAV - // We don’t check if word exists in _dAnalyses, for it is assumed it has been done before - let a2 = _dAnalyses.gl_get(sWord2, null); - if (!a2 || a2.length === 0) { + let a2 = _oSpellChecker.getMorph(sWord2); + if (a2.length === 0) { return false; } if (cregex.checkConjVerb(a2, sReqMorphConj)) { // verb word2 is ok return false; } - let a1 = _dAnalyses.gl_get(sWord1, null); - if (!a1 || a1.length === 0) { + let a1 = _oSpellChecker.getMorph(sWord1); + if (a1.length === 0) { return false; } if (cregex.checkAgreement(a1, a2) && (cregex.mbAdj(a2) || cregex.mbAdj(a1))) { return false; } @@ -73,21 +71,20 @@ return true; } function isVeryAmbiguousAndWrong (sWord1, sWord2, sReqMorphNA, sReqMorphConj, bLastHopeCond) { //// use it if sWord1 can be also a verb; word2 is assumed to be true via isAmbiguousNAV - // We don’t check if word exists in _dAnalyses, for it is assumed it has been done before - let a2 = _dAnalyses.gl_get(sWord2, null); - if (!a2 || a2.length === 0) { + let a2 = _oSpellChecker.getMorph(sWord2); + if (a2.length === 0) { return false; } if (cregex.checkConjVerb(a2, sReqMorphConj)) { // verb word2 is ok return false; } - let a1 = _dAnalyses.gl_get(sWord1, null); - if (!a1 || a1.length === 0) { + let a1 = _oSpellChecker.getMorph(sWord1); + if (a1.length === 0) { return false; } if (cregex.checkAgreement(a1, a2) && (cregex.mbAdj(a2) || cregex.mbAdjNb(a1))) { return false; } @@ -101,17 +98,16 @@ } return false; } function checkAgreement (sWord1, sWord2) { - // We don’t check if word exists in _dAnalyses, for it is assumed it has been done before - let a2 = _dAnalyses.gl_get(sWord2, null); - if (!a2 || a2.length === 0) { + let a2 = _oSpellChecker.getMorph(sWord2); + if (a2.length === 0) { return true; } - let a1 = _dAnalyses.gl_get(sWord1, null); - if (!a1 || a1.length === 0) { + let a1 = _oSpellChecker.getMorph(sWord1); + if (a1.length === 0) { return true; } return cregex.checkAgreement(a1, a2); } Index: gc_lang/fr/modules-js/gce_suggestions.js ================================================================== --- gc_lang/fr/modules-js/gce_suggestions.js +++ gc_lang/fr/modules-js/gce_suggestions.js @@ -10,18 +10,17 @@ //// verbs function suggVerb (sFlex, sWho, funcSugg2=null) { - // we don’t check if word exists in _dAnalyses, for it is assumed it has been done before let aSugg = new Set(); - for (let sStem of stem(sFlex)) { + for (let sStem of _oSpellChecker.getLemma(sFlex)) { let tTags = conj._getTags(sStem); if (tTags) { // we get the tense let aTense = new Set(); - for (let sMorph of _dAnalyses.gl_get(sFlex, [])) { + for (let sMorph of _oSpellChecker.getMorph(sFlex)) { let m; let zVerb = new RegExp (">"+sStem+" .*?(:(?:Y|I[pqsf]|S[pq]|K))", "g"); while ((m = zVerb.exec(sMorph)) !== null) { // stem must be used in regex to prevent confusion between different verbs (e.g. sauras has 2 stems: savoir and saurer) if (m) { @@ -59,11 +58,11 @@ return ""; } function suggVerbPpas (sFlex, sWhat=null) { let aSugg = new Set(); - for (let sStem of stem(sFlex)) { + for (let sStem of _oSpellChecker.getLemma(sFlex)) { let tTags = conj._getTags(sStem); if (tTags) { if (!sWhat) { aSugg.add(conj._getConjWithTags(sStem, tTags, ":PQ", ":Q1")); aSugg.add(conj._getConjWithTags(sStem, tTags, ":PQ", ":Q2")); @@ -109,11 +108,11 @@ return ""; } function suggVerbTense (sFlex, sTense, sWho) { let aSugg = new Set(); - for (let sStem of stem(sFlex)) { + for (let sStem of _oSpellChecker.getLemma(sFlex)) { if (conj.hasConj(sStem, sTense, sWho)) { aSugg.add(conj.getConj(sStem, sTense, sWho)); } } if (aSugg.size > 0) { @@ -122,11 +121,11 @@ return ""; } function suggVerbImpe (sFlex) { let aSugg = new Set(); - for (let sStem of stem(sFlex)) { + for (let sStem of _oSpellChecker.getLemma(sFlex)) { let tTags = conj._getTags(sStem); if (tTags) { if (conj._hasConjWithTags(tTags, ":E", ":2s")) { aSugg.add(conj._getConjWithTags(sStem, tTags, ":E", ":2s")); } @@ -143,11 +142,11 @@ } return ""; } function suggVerbInfi (sFlex) { - return stem(sFlex).filter(sStem => conj.isVerb(sStem)).join("|"); + return _oSpellChecker.getLemma(sFlex).filter(sStem => conj.isVerb(sStem)).join("|"); } const _dQuiEst = new Map ([ ["je", ":1s"], ["j’", ":1s"], ["j’en", ":1s"], ["j’y", ":1s"], @@ -174,11 +173,11 @@ return ""; } sWho = ":3s"; } let aSugg = new Set(); - for (let sStem of stem(sFlex)) { + for (let sStem of _oSpellChecker.getLemma(sFlex)) { let tTags = conj._getTags(sStem); if (tTags) { for (let sTense of lMode) { if (conj._hasConjWithTags(tTags, sTense, sWho)) { aSugg.add(conj._getConjWithTags(sStem, tTags, sTense, sWho)); @@ -195,14 +194,15 @@ //// Nouns and adjectives function suggPlur (sFlex, sWordToAgree=null) { // returns plural forms assuming sFlex is singular if (sWordToAgree) { - if (!_dAnalyses.has(sWordToAgree) && !_storeMorphFromFSA(sWordToAgree)) { + let lMorph = _oSpellChecker.getMorph(sWordToAgree); + if (lMorph.length === 0) { return ""; } - let sGender = cregex.getGender(_dAnalyses.gl_get(sWordToAgree, [])); + let sGender = cregex.getGender(lMorph); if (sGender == ":m") { return suggMasPlur(sFlex); } else if (sGender == ":f") { return suggFemPlur(sFlex); } @@ -256,13 +256,12 @@ return ""; } function suggMasSing (sFlex, bSuggSimil=false) { // returns masculine singular forms - // we don’t check if word exists in _dAnalyses, for it is assumed it has been done before let aSugg = new Set(); - for (let sMorph of _dAnalyses.gl_get(sFlex, [])) { + for (let sMorph of _oSpellChecker.getMorph(sFlex)) { if (!sMorph.includes(":V")) { // not a verb if (sMorph.includes(":m") || sMorph.includes(":e")) { aSugg.add(suggSing(sFlex)); } else { @@ -292,13 +291,12 @@ return ""; } function suggMasPlur (sFlex, bSuggSimil=false) { // returns masculine plural forms - // we don’t check if word exists in _dAnalyses, for it is assumed it has been done before let aSugg = new Set(); - for (let sMorph of _dAnalyses.gl_get(sFlex, [])) { + for (let sMorph of _oSpellChecker.getMorph(sFlex)) { if (!sMorph.includes(":V")) { // not a verb if (sMorph.includes(":m") || sMorph.includes(":e")) { aSugg.add(suggPlur(sFlex)); } else { @@ -333,13 +331,12 @@ } function suggFemSing (sFlex, bSuggSimil=false) { // returns feminine singular forms - // we don’t check if word exists in _dAnalyses, for it is assumed it has been done before let aSugg = new Set(); - for (let sMorph of _dAnalyses.gl_get(sFlex, [])) { + for (let sMorph of _oSpellChecker.getMorph(sFlex)) { if (!sMorph.includes(":V")) { // not a verb if (sMorph.includes(":f") || sMorph.includes(":e")) { aSugg.add(suggSing(sFlex)); } else { @@ -367,13 +364,12 @@ return ""; } function suggFemPlur (sFlex, bSuggSimil=false) { // returns feminine plural forms - // we don’t check if word exists in _dAnalyses, for it is assumed it has been done before let aSugg = new Set(); - for (let sMorph of _dAnalyses.gl_get(sFlex, [])) { + for (let sMorph of _oSpellChecker.getMorph(sFlex)) { if (!sMorph.includes(":V")) { // not a verb if (sMorph.includes(":f") || sMorph.includes(":e")) { aSugg.add(suggPlur(sFlex)); } else { @@ -400,11 +396,11 @@ } return ""; } function hasFemForm (sFlex) { - for (let sStem of stem(sFlex)) { + for (let sStem of _oSpellChecker.getLemma(sFlex)) { if (mfsp.isFemForm(sStem) || conj.hasConj(sStem, ":PQ", ":Q3")) { return true; } } if (phonet.hasSimil(sFlex, ":f")) { @@ -412,11 +408,11 @@ } return false; } function hasMasForm (sFlex) { - for (let sStem of stem(sFlex)) { + for (let sStem of _oSpellChecker.getLemma(sFlex)) { if (mfsp.isFemForm(sStem) || conj.hasConj(sStem, ":PQ", ":Q1")) { // what has a feminine form also has a masculine form return true; } } @@ -425,14 +421,13 @@ } return false; } function switchGender (sFlex, bPlur=null) { - // we don’t check if word exists in _dAnalyses, for it is assumed it has been done before let aSugg = new Set(); if (bPlur === null) { - for (let sMorph of _dAnalyses.gl_get(sFlex, [])) { + for (let sMorph of _oSpellChecker.getMorph(sFlex)) { if (sMorph.includes(":f")) { if (sMorph.includes(":s")) { aSugg.add(suggMasSing(sFlex)); } else if (sMorph.includes(":p")) { aSugg.add(suggMasPlur(sFlex)); @@ -447,19 +442,19 @@ aSugg.add(suggFemPlur(sFlex)); } } } } else if (bPlur) { - for (let sMorph of _dAnalyses.gl_get(sFlex, [])) { + for (let sMorph of _oSpellChecker.getMorph(sFlex)) { if (sMorph.includes(":f")) { aSugg.add(suggMasPlur(sFlex)); } else if (sMorph.includes(":m")) { aSugg.add(suggFemPlur(sFlex)); } } } else { - for (let sMorph of _dAnalyses.gl_get(sFlex, [])) { + for (let sMorph of _oSpellChecker.getMorph(sFlex)) { if (sMorph.includes(":f")) { aSugg.add(suggMasSing(sFlex)); } else if (sMorph.includes(":m")) { aSugg.add(suggFemSing(sFlex)); } @@ -471,11 +466,11 @@ return ""; } function switchPlural (sFlex) { let aSugg = new Set(); - for (let sMorph of _dAnalyses.gl_get(sFlex, [])) { // we don’t check if word exists in _dAnalyses, for it is assumed it has been done before + for (let sMorph of _oSpellChecker.getMorph(sFlex)) { if (sMorph.includes(":s")) { aSugg.add(suggPlur(sFlex)); } else if (sMorph.includes(":p")) { aSugg.add(suggSing(sFlex)); } @@ -491,11 +486,11 @@ } function suggSimil (sWord, sPattern=null, bSubst=false) { // return list of words phonetically similar to sWord and whom POS is matching sPattern let aSugg = phonet.selectSimil(sWord, sPattern); - for (let sMorph of _dAnalyses.gl_get(sWord, [])) { + for (let sMorph of _oSpellChecker.getMorph(sWord)) { for (let e of conj.getSimil(sWord, sMorph, bSubst)) { aSugg.add(e); } } if (aSugg.size > 0) { @@ -513,12 +508,11 @@ } return "ce"; } function suggLesLa (sWord) { - // we don’t check if word exists in _dAnalyses, for it is assumed it has been done before - if (_dAnalyses.gl_get(sWord, []).some(s => s.includes(":p"))) { + if (_oSpellChecker.getMorph(sWord).some(s => s.includes(":p"))) { return "les|la"; } return "la"; } Index: gc_lang/fr/modules-js/tests_data.json ================================================================== --- gc_lang/fr/modules-js/tests_data.json +++ gc_lang/fr/modules-js/tests_data.json @@ -1,1 +1,1 @@ -${gctestsJS} +${regex_gctestsJS} Index: gc_lang/fr/modules/gce_analyseur.py ================================================================== --- gc_lang/fr/modules/gce_analyseur.py +++ gc_lang/fr/modules/gce_analyseur.py @@ -15,63 +15,58 @@ return "nous" if s2 == "vous": return "vous" if s2 == "eux": return "ils" - if s2 == "elle" or s2 == "elles": - # We don’t check if word exists in _dAnalyses, for it is assumed it has been done before - if cr.mbNprMasNotFem(_dAnalyses.get(s1, False)): + if s2 == "elle" or s2 == "elles": + if cr.mbNprMasNotFem(_oSpellChecker.getMorph(s1)): return "ils" # si épicène, indéterminable, mais OSEF, le féminin l’emporte return "elles" return s1 + " et " + s2 def apposition (sWord1, sWord2): "returns True if nom + nom (no agreement required)" - # We don’t check if word exists in _dAnalyses, for it is assumed it has been done before - return cr.mbNomNotAdj(_dAnalyses.get(sWord2, False)) and cr.mbPpasNomNotAdj(_dAnalyses.get(sWord1, False)) + return cr.mbNomNotAdj(_oSpellChecker.getMorph(sWord2)) and cr.mbPpasNomNotAdj(_oSpellChecker.getMorph(sWord1)) def isAmbiguousNAV (sWord): "words which are nom|adj and verb are ambiguous (except être and avoir)" - if sWord not in _dAnalyses and not _storeMorphFromFSA(sWord): + lMorph = _oSpellChecker.getMorph(sWord) + if not cr.mbNomAdj(lMorph) or sWord == "est": return False - if not cr.mbNomAdj(_dAnalyses[sWord]) or sWord == "est": - return False - if cr.mbVconj(_dAnalyses[sWord]) and not cr.mbMG(_dAnalyses[sWord]): + if cr.mbVconj(lMorph) and not cr.mbMG(lMorph): return True return False def isAmbiguousAndWrong (sWord1, sWord2, sReqMorphNA, sReqMorphConj): "use it if sWord1 won’t be a verb; word2 is assumed to be True via isAmbiguousNAV" - # We don’t check if word exists in _dAnalyses, for it is assumed it has been done before - a2 = _dAnalyses.get(sWord2, None) + a2 = _oSpellChecker.getMorph(sWord2) if not a2: return False if cr.checkConjVerb(a2, sReqMorphConj): # verb word2 is ok return False - a1 = _dAnalyses.get(sWord1, None) + a1 = _oSpellChecker.getMorph(sWord1) if not a1: return False if cr.checkAgreement(a1, a2) and (cr.mbAdj(a2) or cr.mbAdj(a1)): return False return True def isVeryAmbiguousAndWrong (sWord1, sWord2, sReqMorphNA, sReqMorphConj, bLastHopeCond): "use it if sWord1 can be also a verb; word2 is assumed to be True via isAmbiguousNAV" - # We don’t check if word exists in _dAnalyses, for it is assumed it has been done before - a2 = _dAnalyses.get(sWord2, None) + a2 = _oSpellChecker.getMorph(sWord2) if not a2: return False if cr.checkConjVerb(a2, sReqMorphConj): # verb word2 is ok return False - a1 = _dAnalyses.get(sWord1, None) + a1 = _oSpellChecker.getMorph(sWord1) if not a1: return False if cr.checkAgreement(a1, a2) and (cr.mbAdj(a2) or cr.mbAdjNb(a1)): return False # now, we know there no agreement, and conjugation is also wrong @@ -82,15 +77,14 @@ return True return False def checkAgreement (sWord1, sWord2): - # We don’t check if word exists in _dAnalyses, for it is assumed it has been done before - a2 = _dAnalyses.get(sWord2, None) + a2 = _oSpellChecker.getMorph(sWord2) if not a2: return True - a1 = _dAnalyses.get(sWord1, None) + a1 = _oSpellChecker.getMorph(sWord1) if not a1: return True return cr.checkAgreement(a1, a2) Index: gc_lang/fr/modules/gce_suggestions.py ================================================================== --- gc_lang/fr/modules/gce_suggestions.py +++ gc_lang/fr/modules/gce_suggestions.py @@ -7,16 +7,16 @@ ## Verbs def suggVerb (sFlex, sWho, funcSugg2=None): aSugg = set() - for sStem in stem(sFlex): + for sStem in _oSpellChecker.getLemma(sFlex): tTags = conj._getTags(sStem) if tTags: # we get the tense aTense = set() - for sMorph in _dAnalyses.get(sFlex, []): # we don’t check if word exists in _dAnalyses, for it is assumed it has been done before + for sMorph in _oSpellChecker.getMorph(sFlex): for m in re.finditer(">"+sStem+" .*?(:(?:Y|I[pqsf]|S[pq]|K|P))", sMorph): # stem must be used in regex to prevent confusion between different verbs (e.g. sauras has 2 stems: savoir and saurer) if m: if m.group(1) == ":Y": aTense.add(":Ip") @@ -40,11 +40,11 @@ return "" def suggVerbPpas (sFlex, sWhat=None): aSugg = set() - for sStem in stem(sFlex): + for sStem in _oSpellChecker.getLemma(sFlex): tTags = conj._getTags(sStem) if tTags: if not sWhat: aSugg.add(conj._getConjWithTags(sStem, tTags, ":PQ", ":Q1")) aSugg.add(conj._getConjWithTags(sStem, tTags, ":PQ", ":Q2")) @@ -83,21 +83,21 @@ return "" def suggVerbTense (sFlex, sTense, sWho): aSugg = set() - for sStem in stem(sFlex): + for sStem in _oSpellChecker.getLemma(sFlex): if conj.hasConj(sStem, sTense, sWho): aSugg.add(conj.getConj(sStem, sTense, sWho)) if aSugg: return "|".join(aSugg) return "" def suggVerbImpe (sFlex): aSugg = set() - for sStem in stem(sFlex): + for sStem in _oSpellChecker.getLemma(sFlex): tTags = conj._getTags(sStem) if tTags: if conj._hasConjWithTags(tTags, ":E", ":2s"): aSugg.add(conj._getConjWithTags(sStem, tTags, ":E", ":2s")) if conj._hasConjWithTags(tTags, ":E", ":1p"): @@ -108,11 +108,11 @@ return "|".join(aSugg) return "" def suggVerbInfi (sFlex): - return "|".join([ sStem for sStem in stem(sFlex) if conj.isVerb(sStem) ]) + return "|".join([ sStem for sStem in _oSpellChecker.getLemma(sFlex) if conj.isVerb(sStem) ]) _dQuiEst = { "je": ":1s", "j’": ":1s", "j’en": ":1s", "j’y": ":1s", \ "tu": ":2s", "il": ":3s", "on": ":3s", "elle": ":3s", "nous": ":1p", "vous": ":2p", "ils": ":3p", "elles": ":3p" } _lIndicatif = [":Ip", ":Iq", ":Is", ":If"] @@ -131,11 +131,11 @@ if not sWho: if sSuj[0:1].islower(): # pas un pronom, ni un nom propre return "" sWho = ":3s" aSugg = set() - for sStem in stem(sFlex): + for sStem in _oSpellChecker.getLemma(sFlex): tTags = conj._getTags(sStem) if tTags: for sTense in lMode: if conj._hasConjWithTags(tTags, sTense, sWho): aSugg.add(conj._getConjWithTags(sStem, tTags, sTense, sWho)) @@ -147,13 +147,14 @@ ## Nouns and adjectives def suggPlur (sFlex, sWordToAgree=None): "returns plural forms assuming sFlex is singular" if sWordToAgree: - if sWordToAgree not in _dAnalyses and not _storeMorphFromFSA(sWordToAgree): + lMorph = _oSpellChecker.getMorph(sFlex) + if not lMorph: return "" - sGender = cr.getGender(_dAnalyses.get(sWordToAgree, [])) + sGender = cr.getGender(lMorph) if sGender == ":m": return suggMasPlur(sFlex) elif sGender == ":f": return suggFemPlur(sFlex) aSugg = set() @@ -191,13 +192,12 @@ return "" def suggMasSing (sFlex, bSuggSimil=False): "returns masculine singular forms" - # we don’t check if word exists in _dAnalyses, for it is assumed it has been done before aSugg = set() - for sMorph in _dAnalyses.get(sFlex, []): + for sMorph in _oSpellChecker.getMorph(sFlex): if not ":V" in sMorph: # not a verb if ":m" in sMorph or ":e" in sMorph: aSugg.add(suggSing(sFlex)) else: @@ -219,13 +219,12 @@ return "" def suggMasPlur (sFlex, bSuggSimil=False): "returns masculine plural forms" - # we don’t check if word exists in _dAnalyses, for it is assumed it has been done before aSugg = set() - for sMorph in _dAnalyses.get(sFlex, []): + for sMorph in _oSpellChecker.getMorph(sFlex): if not ":V" in sMorph: # not a verb if ":m" in sMorph or ":e" in sMorph: aSugg.add(suggPlur(sFlex)) else: @@ -250,13 +249,12 @@ return "" def suggFemSing (sFlex, bSuggSimil=False): "returns feminine singular forms" - # we don’t check if word exists in _dAnalyses, for it is assumed it has been done before aSugg = set() - for sMorph in _dAnalyses.get(sFlex, []): + for sMorph in _oSpellChecker.getMorph(sFlex): if not ":V" in sMorph: # not a verb if ":f" in sMorph or ":e" in sMorph: aSugg.add(suggSing(sFlex)) else: @@ -276,13 +274,12 @@ return "" def suggFemPlur (sFlex, bSuggSimil=False): "returns feminine plural forms" - # we don’t check if word exists in _dAnalyses, for it is assumed it has been done before aSugg = set() - for sMorph in _dAnalyses.get(sFlex, []): + for sMorph in _oSpellChecker.getMorph(sFlex): if not ":V" in sMorph: # not a verb if ":f" in sMorph or ":e" in sMorph: aSugg.add(suggPlur(sFlex)) else: @@ -301,33 +298,32 @@ return "|".join(aSugg) return "" def hasFemForm (sFlex): - for sStem in stem(sFlex): + for sStem in _oSpellChecker.getLemma(sFlex): if mfsp.isFemForm(sStem) or conj.hasConj(sStem, ":PQ", ":Q3"): return True if phonet.hasSimil(sFlex, ":f"): return True return False def hasMasForm (sFlex): - for sStem in stem(sFlex): + for sStem in _oSpellChecker.getLemma(sFlex): if mfsp.isFemForm(sStem) or conj.hasConj(sStem, ":PQ", ":Q1"): # what has a feminine form also has a masculine form return True if phonet.hasSimil(sFlex, ":m"): return True return False def switchGender (sFlex, bPlur=None): - # we don’t check if word exists in _dAnalyses, for it is assumed it has been done before aSugg = set() if bPlur == None: - for sMorph in _dAnalyses.get(sFlex, []): + for sMorph in _oSpellChecker.getMorph(sFlex): if ":f" in sMorph: if ":s" in sMorph: aSugg.add(suggMasSing(sFlex)) elif ":p" in sMorph: aSugg.add(suggMasPlur(sFlex)) @@ -338,17 +334,17 @@ aSugg.add(suggFemPlur(sFlex)) else: aSugg.add(suggFemSing(sFlex)) aSugg.add(suggFemPlur(sFlex)) elif bPlur: - for sMorph in _dAnalyses.get(sFlex, []): + for sMorph in _oSpellChecker.getMorph(sFlex): if ":f" in sMorph: aSugg.add(suggMasPlur(sFlex)) elif ":m" in sMorph: aSugg.add(suggFemPlur(sFlex)) else: - for sMorph in _dAnalyses.get(sFlex, []): + for sMorph in _oSpellChecker.getMorph(sFlex): if ":f" in sMorph: aSugg.add(suggMasSing(sFlex)) elif ":m" in sMorph: aSugg.add(suggFemSing(sFlex)) if aSugg: @@ -355,13 +351,12 @@ return "|".join(aSugg) return "" def switchPlural (sFlex): - # we don’t check if word exists in _dAnalyses, for it is assumed it has been done before aSugg = set() - for sMorph in _dAnalyses.get(sFlex, []): + for sMorph in _oSpellChecker.getMorph(sFlex): if ":s" in sMorph: aSugg.add(suggPlur(sFlex)) elif ":p" in sMorph: aSugg.add(suggSing(sFlex)) if aSugg: @@ -373,13 +368,12 @@ return phonet.hasSimil(sWord, sPattern) def suggSimil (sWord, sPattern=None, bSubst=False): "return list of words phonetically similar to sWord and whom POS is matching sPattern" - # we don’t check if word exists in _dAnalyses, for it is assumed it has been done before aSugg = phonet.selectSimil(sWord, sPattern) - for sMorph in _dAnalyses.get(sWord, []): + for sMorph in _oSpellChecker.getMorph(sWord): aSugg.update(conj.getSimil(sWord, sMorph, bSubst)) break if aSugg: return "|".join(aSugg) return "" @@ -392,12 +386,11 @@ return "ce|cet" return "ce" def suggLesLa (sWord): - # we don’t check if word exists in _dAnalyses, for it is assumed it has been done before - if any( ":p" in sMorph for sMorph in _dAnalyses.get(sWord, []) ): + if any( ":p" in sMorph for sMorph in _oSpellChecker.getMorph(sWord) ): return "les|la" return "la" _zBinary = re.compile("^[01]+$") ADDED gc_lang/fr/rules_graph.grx Index: gc_lang/fr/rules_graph.grx ================================================================== --- /dev/null +++ gc_lang/fr/rules_graph.grx @@ -0,0 +1,62 @@ +# +# RÈGLES DE GRAMMAIRE FRANÇAISE POUR GRAMMALECTE +# par Olivier R. +# +# Copyright © 2011-2017. +# +# This file is part of Grammalecte. +# +# Grammalecte is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Grammalecte is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Grammalecte. If not, see +# + +# RÈGLES POUR LE GRAPHE DE TOKENS + +# DOCUMENTATION +# Expressions régulières en Python : http://docs.python.org/library/re.html + +# [++] : séparateur des règles pour le paragraphe et des règles pour la phrase. + +# Types d’action: +# ->> erreur +# ~>> préprocesseur de texte +# =>> désambiguïsateur + + +# Fin d’interprétation du fichier avec une ligne commençant par #END + +# ERREURS COURANTES +# http://fr.wikipedia.org/wiki/Wikip%C3%A9dia:Fautes_d%27orthographe/Courantes + + +__avoir_confiance_en__ + >avoir confiance (dans) [moi|toi|soi|lui|elle|nous|vous|eux|elles] + <<- -1>> en # Avoir confiance en quelqu’un ou quelque chose.|http://grammalecte.net + +TEST: Elle avait confiance {{dans}} lui. + + +__code_legacy__ + legacy code + code legacy + <<- -1:2>> code hérité|code reliquat # Anglicisme superflu. + +TEST: c’est du {{legacy code}}. +TEST: ce {{code legacy}} est un cauchemar + + +__être_en_xxxx__ + [>être|>rester|>demeurer] an [désaccord|accord] + <<- -2>> en # Confusion. Un an = une année. + +TEST: Je suis {{an}} désaccord avec lui. Index: graphspell-js/spellchecker.js ================================================================== --- graphspell-js/spellchecker.js +++ graphspell-js/spellchecker.js @@ -41,10 +41,14 @@ this.oPersonalDic = this._loadDictionary(personalDic, sPath); this.bExtendedDic = Boolean(this.oExtendedDic); this.bCommunityDic = Boolean(this.oCommunityDic); this.bPersonalDic = Boolean(this.oPersonalDic); this.oTokenizer = null; + // storage + this.bStorage = false; + this._dMorphologies = new Map(); // key: flexion, value: list of morphologies + this._dLemmas = new Map(); // key: flexion, value: list of lemmas } _loadDictionary (dictionary, sPath="", bNecessary=false) { // returns an IBDAWG object if (!dictionary) { @@ -132,10 +136,26 @@ deactivatePersonalDictionary () { this.bPersonalDic = false; } + + // Storage + + activateStorage () { + this.bStorage = true; + } + + deactivateStorage () { + this.bStorage = false; + } + + clearStorage () { + this._dLemmas.clear(); + this._dMorphologies.clear(); + } + // parse text functions parseParagraph (sText) { if (!this.oTokenizer) { @@ -203,21 +223,40 @@ return false; } getMorph (sWord) { // retrieves morphologies list, different casing allowed - let lResult = this.oMainDic.getMorph(sWord); + if (this.bStorage && this._dMorphologies.has(sWord)) { + return this._dMorphologies.get(sWord); + } + let lMorph = this.oMainDic.getMorph(sWord); if (this.bExtendedDic) { - lResult.push(...this.oExtendedDic.getMorph(sWord)); + lMorph.push(...this.oExtendedDic.getMorph(sWord)); } if (this.bCommunityDic) { - lResult.push(...this.oCommunityDic.getMorph(sWord)); + lMorph.push(...this.oCommunityDic.getMorph(sWord)); } if (this.bPersonalDic) { - lResult.push(...this.oPersonalDic.getMorph(sWord)); + lMorph.push(...this.oPersonalDic.getMorph(sWord)); + } + if (this.bStorage) { + this._dMorphologies.set(sWord, lMorph); + this._dLemmas.set(sWord, Array.from(new Set(this.getMorph(sWord).map((sMorph) => { return sMorph.slice(1, sMorph.indexOf(" ")); })))); + //console.log(sWord, this._dLemmas.get(sWord)); + } + return lMorph; + } + + getLemma (sWord) { + // retrieves lemmas + if (this.bStorage) { + if (!this._dLemmas.has(sWord)) { + this.getMorph(sWord); + } + return this._dLemmas.get(sWord); } - return lResult; + return Array.from(new Set(this.getMorph(sWord).map((sMorph) => { return sMorph.slice(1, sMorph.indexOf(" ")); }))); } * suggest (sWord, nSuggLimit=10) { // generator: returns 1, 2 or 3 lists of suggestions yield this.oMainDic.suggest(sWord, nSuggLimit); Index: graphspell-js/tokenizer.js ================================================================== --- graphspell-js/tokenizer.js +++ graphspell-js/tokenizer.js @@ -16,11 +16,11 @@ "default": [ [/^[   \t]+/, 'SPACE'], [/^\/(?:~|bin|boot|dev|etc|home|lib|mnt|opt|root|sbin|tmp|usr|var|Bureau|Documents|Images|Musique|Public|Téléchargements|Vidéos)(?:\/[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_.()-]+)*/, 'FOLDERUNIX'], [/^[a-zA-Z]:\\(?:Program Files(?: \(x86\)|)|[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st.()]+)(?:\\[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_.()-]+)*/, 'FOLDERWIN'], - [/^[,.;:!?…«»“”‘’"(){}\[\]/·–—]+/, 'SEPARATOR'], + [/^[,.;:!?…«»“”‘’"(){}\[\]/·–—]/, 'SEPARATOR'], [/^[A-Z][.][A-Z][.](?:[A-Z][.])*/, 'ACRONYM'], [/^(?:https?:\/\/|www[.]|[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_-]+[@.][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_-]{2,}[@.])[a-zA-Z0-9][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_.\/?&!%=+*"'@$#-]+/, 'LINK'], [/^[#@][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_-]+/, 'TAG'], [/^<[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+.*?>|<\/[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+ *>/, 'HTML'], [/^\[\/?[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+\]/, 'PSEUDOHTML'], @@ -32,11 +32,11 @@ "fr": [ [/^[   \t]+/, 'SPACE'], [/^\/(?:~|bin|boot|dev|etc|home|lib|mnt|opt|root|sbin|tmp|usr|var|Bureau|Documents|Images|Musique|Public|Téléchargements|Vidéos)(?:\/[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_.()-]+)*/, 'FOLDERUNIX'], [/^[a-zA-Z]:\\(?:Program Files(?: \(x86\)|)|[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st.()]+)(?:\\[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_.()-]+)*/, 'FOLDERWIN'], - [/^[,.;:!?…«»“”‘’"(){}\[\]/·–—]+/, 'SEPARATOR'], + [/^[,.;:!?…«»“”‘’"(){}\[\]/·–—]/, 'SEPARATOR'], [/^[A-Z][.][A-Z][.](?:[A-Z][.])*/, 'ACRONYM'], [/^(?:https?:\/\/|www[.]|[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_-]+[@.][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_-]{2,}[@.])[a-zA-Z0-9][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_.\/?&!%=+*"'@$#-]+/, 'LINK'], [/^[#@][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_-]+/, 'TAG'], [/^<[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+.*?>|<\/[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+ *>/, 'HTML'], [/^\[\/?[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+\]/, 'PSEUDOHTML'], @@ -60,36 +60,32 @@ this.aRules = aTkzPatterns[this.sLang]; } * genTokens (sText) { let m; - let i = 0; + let iNext = 0; while (sText) { - let nCut = 1; + let iCut = 1; + let iToken = 0; for (let [zRegex, sType] of this.aRules) { try { if ((m = zRegex.exec(sText)) !== null) { - if (sType == 'SEPARATOR') { - for (let c of m[0]) { - yield { "sType": sType, "sValue": c, "nStart": i, "nEnd": i + m[0].length } - } - } else { - yield { "sType": sType, "sValue": m[0], "nStart": i, "nEnd": i + m[0].length } - } - nCut = m[0].length; + iToken += 1; + yield { "i": iToken, "sType": sType, "sValue": m[0], "nStart": iNext, "nEnd": iNext + m[0].length } + iCut = m[0].length; break; } } catch (e) { helpers.logerror(e); } } - i += nCut; - sText = sText.slice(nCut); + iNext += iCut; + sText = sText.slice(iCut); } } } if (typeof(exports) !== 'undefined') { exports.Tokenizer = Tokenizer; } ADDED graphspell/fr.py Index: graphspell/fr.py ================================================================== --- /dev/null +++ graphspell/fr.py @@ -0,0 +1,6 @@ +# French language + +dSugg = { + "parce-que": "parce que", + "Parce-que": "Parce que" +} Index: graphspell/spellchecker.py ================================================================== --- graphspell/spellchecker.py +++ graphspell/spellchecker.py @@ -6,11 +6,11 @@ # - the main dictionary, bundled with the package # - the extended dictionary # - the community dictionary, added by an organization # - the personal dictionary, created by the user for its own convenience - +import importlib import traceback from . import ibdawg from . import tokenizer @@ -34,10 +34,17 @@ self.oPersonalDic = self._loadDictionary(sfPersonalDic) self.bExtendedDic = bool(self.oExtendedDic) self.bCommunityDic = bool(self.oCommunityDic) self.bPersonalDic = bool(self.oPersonalDic) self.oTokenizer = None + # Default suggestions + self.dDefaultSugg = None + self.loadSuggestions(sLangCode) + # storage + self.bStorage = False + self._dMorphologies = {} # key: flexion, value: list of morphologies + self._dLemmas = {} # key: flexion, value: list of lemmas def _loadDictionary (self, source, bNecessary=False): "returns an IBDAWG object" if not source: return None @@ -97,10 +104,34 @@ self.bCommunityDic = False def deactivatePersonalDictionary (self): self.bPersonalDic = False + + # Default suggestions + + def loadSuggestions (self, sLangCode): + try: + suggest_module = importlib.import_module("."+sLangCode, "graphspell") + except: + print("No suggestion module for language <"+sLangCode+">") + return + self.dDefaultSugg = suggest_module.dSugg + + + # Storage + + def activateStorage (self): + self.bStorage = True + + def deactivateStorage (self): + self.bStorage = False + + def clearStorage (self): + self._dLemmas.clear() + self._dMorphologies.clear() + # parse text functions def parseParagraph (self, sText, bSpellSugg=False): if not self.oTokenizer: @@ -169,25 +200,38 @@ return True return False def getMorph (self, sWord): "retrieves morphologies list, different casing allowed" - lResult = self.oMainDic.getMorph(sWord) + if self.bStorage and sWord in self._dMorphologies: + return self._dMorphologies[sWord] + lMorph = self.oMainDic.getMorph(sWord) if self.bExtendedDic: - lResult.extend(self.oExtendedDic.getMorph(sWord)) + lMorph.extend(self.oExtendedDic.getMorph(sWord)) if self.bCommunityDic: - lResult.extend(self.oCommunityDic.getMorph(sWord)) + lMorph.extend(self.oCommunityDic.getMorph(sWord)) if self.bPersonalDic: - lResult.extend(self.oPersonalDic.getMorph(sWord)) - return lResult + lMorph.extend(self.oPersonalDic.getMorph(sWord)) + if self.bStorage: + self._dMorphologies[sWord] = lMorph + self._dLemmas[sWord] = set([ s[1:s.find(" ")] for s in lMorph ]) + return lMorph def getLemma (self, sWord): + "retrieves lemmas" + if self.bStorage: + if sWord not in self._dLemmas: + self.getMorph(sWord) + return self._dLemmas[sWord] return set([ s[1:s.find(" ")] for s in self.getMorph(sWord) ]) def suggest (self, sWord, nSuggLimit=10): "generator: returns 1, 2 or 3 lists of suggestions" - yield self.oMainDic.suggest(sWord, nSuggLimit) + if self.dDefaultSugg and sWord in self.dDefaultSugg: + yield self.dDefaultSugg[sWord].split("|") + else: + yield self.oMainDic.suggest(sWord, nSuggLimit) if self.bExtendedDic: yield self.oExtendedDic.suggest(sWord, nSuggLimit) if self.bCommunityDic: yield self.oCommunityDic.suggest(sWord, nSuggLimit) if self.bPersonalDic: Index: graphspell/tokenizer.py ================================================================== --- graphspell/tokenizer.py +++ graphspell/tokenizer.py @@ -5,11 +5,11 @@ _PATTERNS = { "default": ( r'(?P/(?:bin|boot|dev|etc|home|lib|mnt|opt|root|sbin|tmp|usr|var|Bureau|Documents|Images|Musique|Public|Téléchargements|Vidéos)(?:/[\w.()-]+)*)', r'(?P[a-zA-Z]:\\(?:Program Files(?: [(]x86[)]|)|[\w.()]+)(?:\\[\w.()-]+)*)', - r'(?P[.,?!:;…«»“”"()/·]+)', + r'(?P[][,.;:!?…«»“”‘’"(){}/·–—])', r'(?P[A-Z][.][A-Z][.](?:[A-Z][.])*)', r'(?P(?:https?://|www[.]|\w+[@.]\w\w+[@.])\w[\w./?&!%=+*"\'@$#-]+)', r'(?P[#@][\w-]+)', r'(?P<\w+.*?>|)', r'(?P\[/?\w+\])', @@ -19,11 +19,11 @@ ), "fr": ( r'(?P/(?:bin|boot|dev|etc|home|lib|mnt|opt|root|sbin|tmp|usr|var|Bureau|Documents|Images|Musique|Public|Téléchargements|Vidéos)(?:/[\w.()-]+)*)', r'(?P[a-zA-Z]:\\(?:Program Files(?: [(]x86[)]|)|[\w.()]+)(?:\\[\w.()-]+)*)', - r'(?P[.,?!:;…«»“”"()/·]+)', + r'(?P[][,.;:!?…«»“”‘’"(){}/·–—])', r'(?P[A-Z][.][A-Z][.](?:[A-Z][.])*)', r'(?P(?:https?://|www[.]|\w+[@.]\w\w+[@.])\w[\w./?&!%=+*"\'@$#-]+)', r'(?P[#@][\w-]+)', r'(?P<\w+.*?>|)', r'(?P\[/?\w+\])', @@ -42,8 +42,13 @@ self.sLang = sLang if sLang not in _PATTERNS: self.sLang = "default" self.zToken = re.compile( "(?i)" + '|'.join(sRegex for sRegex in _PATTERNS[sLang]) ) - def genTokens (self, sText): - for m in self.zToken.finditer(sText): - yield { "sType": m.lastgroup, "sValue": m.group(), "nStart": m.start(), "nEnd": m.end() } + def genTokens (self, sText, bStartEndToken=False): + if bStartEndToken: + yield { "i": 0, "sType": "INFO", "sValue": "", "nStart": 0, "nEnd": 0 } + for i, m in enumerate(self.zToken.finditer(sText), 1): + yield { "i": i, "sType": m.lastgroup, "sValue": m.group(), "nStart": m.start(), "nEnd": m.end() } + if bStartEndToken: + iEnd = len(sText) + yield { "i": i+1, "sType": "INFO", "sValue": "", "nStart": iEnd, "nEnd": iEnd } Index: make.py ================================================================== --- make.py +++ make.py @@ -17,10 +17,11 @@ from distutils import dir_util, file_util import dialog_bundled import compile_rules +import compile_rules_graph import helpers import lex_build sWarningMessage = "The content of this folder is generated by code and replaced at each build.\n" @@ -191,12 +192,15 @@ dVars = xConfig._sections['args'] dVars['locales'] = dVars["locales"].replace("_", "-") dVars['loc'] = str(dict([ [s, [s[0:2], s[3:5], ""]] for s in dVars["locales"].split(" ") ])) ## COMPILE RULES - dResult = compile_rules.make(spLang, dVars['lang'], bJavaScript) - dVars.update(dResult) + dResultRegex = compile_rules.make(spLang, dVars['lang'], bJavaScript) + dVars.update(dResultRegex) + + dResultGraph = compile_rules_graph.make(spLang, dVars['lang'], bJavaScript) + dVars.update(dResultGraph) ## READ GRAMMAR CHECKER PLUGINS print("PYTHON:") print("+ Plugins: ", end="") sCodePlugins = "" @@ -227,11 +231,15 @@ print() # TEST FILES with open("grammalecte/"+sLang+"/gc_test.txt", "w", encoding="utf-8", newline="\n") as hDstPy: hDstPy.write("# TESTS FOR LANG [" + sLang + "]\n\n") - hDstPy.write(dVars['gctests']) + hDstPy.write("# REGEX RULES\n\n") + hDstPy.write(dVars['regex_gctests']) + hDstPy.write("\n\n\n# GRAPH RULES\n\n") + hDstPy.write(dVars['graph_gctests']) + hDstPy.write("\n") createOXT(spLang, dVars, xConfig._sections['oxt'], spLangPack, bInstallOXT) createServerOptions(sLang, dVars) createPackageZip(sLang, dVars, spLangPack)