ADDED compile_rules_graph.py
Index: compile_rules_graph.py
==================================================================
--- /dev/null
+++ compile_rules_graph.py
@@ -0,0 +1,281 @@
+# Create a Direct Acyclic Rule Graph (DARG)
+
+import re
+import sys
+import traceback
+import json
+import datg
+
+
+dDEF = {}           # definitions: "{name}" -> replacement text
+dACTIONS = {}       # action id -> action tuple
+lFUNCTIONS = []     # (generated function name, Python source) pairs built from conditions/actions
+
+
+def prepareFunction (s):
+    "convert rule DSL code <s> into Python source code"
+    s = s.replace("__also__", "bCondMemo")
+    s = s.replace("__else__", "not bCondMemo")
+    # NOTE(review): the "<start>"/"<end>" marker tokens below were lost in the dump
+    # (angle-bracketed tokens stripped); restored — confirm against upstream.
+    s = re.sub(r"isStart *\(\)", 'before(["<start>", ","])', s)
+    s = re.sub(r"isRealStart *\(\)", 'before(["<start>"])', s)
+    s = re.sub(r"isStart0 *\(\)", 'before0(["<start>", ","])', s)
+    s = re.sub(r"isRealStart0 *\(\)", 'before0(["<start>"])', s)
+    s = re.sub(r"isEnd *\(\)", 'after(["<end>", ","])', s)
+    s = re.sub(r"isRealEnd *\(\)", 'after(["<end>"])', s)
+    s = re.sub(r"isEnd0 *\(\)", 'after0(["<end>", ","])', s)
+    s = re.sub(r"isRealEnd0 *\(\)", 'after0(["<end>"])', s)
+    s = re.sub(r"(select|exclude)[(][\\](\d+)", '\\1(lToken[\\2]', s)
+    s = re.sub(r"define[(][\\](\d+)", 'define(lToken[\\1]', s)
+    # no trailing ")" here: the call’s own closing paren (or further args) follows
+    # in the source text; a hard-coded ")" broke multi-argument calls such as
+    # morph(\2, ":[WAR]", False).
+    s = re.sub(r"(morph|morphex|displayInfo)[(][\\](\d+)", '\\1(lToken[\\2]', s)
+    s = re.sub(r"token\(\s*(\d)", 'nextToken(\\1', s)                                   # token(n)
+    s = re.sub(r"token\(\s*-(\d)", 'prevToken(\\1', s)                                  # token(-n)
+    s = re.sub(r"before\(\s*", 'look(s[:m.start()], ', s)                               # before(s)
+    s = re.sub(r"after\(\s*", 'look(s[m.end():], ', s)                                  # after(s)
+    s = re.sub(r"textarea\(\s*", 'look(s, ', s)                                         # textarea(s)
+    s = re.sub(r"before_chk1\(\s*", 'look_chk1(dDA, s[:m.start()], 0, ', s)             # before_chk1(s)
+    s = re.sub(r"after_chk1\(\s*", 'look_chk1(dDA, s[m.end():], m.end(), ', s)          # after_chk1(s)
+    s = re.sub(r"textarea_chk1\(\s*", 'look_chk1(dDA, s, 0, ', s)                       # textarea_chk1(s)
+    s = re.sub(r"isEndOfNG\(\s*\)", 'isEndOfNG(dDA, s[m.end():], m.end())', s)          # isEndOfNG(s)
+    s = re.sub(r"isNextNotCOD\(\s*\)", 'isNextNotCOD(dDA, s[m.end():], m.end())', s)    # isNextNotCOD(s)
+    s = re.sub(r"isNextVerb\(\s*\)", 'isNextVerb(dDA, s[m.end():], m.end())', s)        # isNextVerb(s)
+    s = re.sub(r"\bspell *[(]", '_oSpellChecker.isValid(', s)
+    s = re.sub(r"[\\](\d+)", 'lToken[\\1]', s)
+    return s
+
+
+def changeReferenceToken (s, dPos):
+    "rewrite group references \\1…\\n in <s> into the token positions stored in <dPos>"
+    for i in range(len(dPos), 0, -1):
+        # dPos values are ints: convert before concatenating (was a TypeError)
+        s = s.replace("\\"+str(i), "\\"+str(dPos[i]))
+    return s
+
+
+def printBookmark (nLevel, sComment, nLine):
+    "display a bookmark found in the rules file"
+    # added: called below but was defined nowhere in this file (NameError at runtime)
+    print("  {:>6}:  {}".format(nLine, "  " * nLevel + sComment))
+
+
+def createRule (iLine, sRuleName, sTokenLine, sActions, nPriority):
+    "generator: yields one prepared rule (tokens + 4 metadata fields) per action"
+    # print(iLine, "//", sRuleName, "//", sTokenLine, "//", sActions, "//", nPriority)
+    lToken = sTokenLine.split()
+
+    # Calculate positions: tokens wrapped in parentheses are capturing groups
+    dPos = {}   # group number -> token index
+    nGroup = 0
+    for i, sToken in enumerate(lToken):
+        if sToken.startswith("(") and sToken.endswith(")"):
+            lToken[i] = sToken[1:-1]
+            nGroup += 1
+            dPos[nGroup] = i
+
+    # Parse actions
+    for nAction, sAction in enumerate(sActions.split(" <<- ")):
+        if sAction.strip():
+            sActionId = sRuleName + "_a" + str(nAction)
+            sCondition, tAction = createAction(sActionId, sAction, nGroup, nPriority, dPos)
+            if tAction:
+                dACTIONS[sActionId] = tAction
+                lResult = list(lToken)
+                # trailing metadata: line number, rule name, condition, action id
+                lResult.extend([iLine, sRuleName, sCondition, sActionId])
+                yield lResult
+
+
+def createAction (sIdAction, sAction, nGroup, nPriority, dPos):
+    """Parse one action; returns (condition function name or "", action tuple),
+    or (None, None) if the action can’t be parsed."""
+    # ">" added to the class: the "elif cAction == '>'" branch below was unreachable
+    m = re.search(r"([-~=>])(\d+|)(:\d+|)>> ", sAction)
+    if not m:
+        print(" # Error. No action found at: ", sIdAction)
+        print("   ==", sAction, "==")
+        return None, None
+    # Condition
+    sCondition = sAction[:m.start()].strip()
+    if sCondition:
+        sCondition = prepareFunction(sCondition)
+        sCondition = changeReferenceToken(sCondition, dPos)
+        lFUNCTIONS.append(("gc_"+sIdAction, sCondition))
+        sCondition = "gc_"+sIdAction
+    else:
+        sCondition = ""
+    # Action
+    cAction = m.group(1)
+    sAction = sAction[m.end():].strip()
+    sAction = changeReferenceToken(sAction, dPos)
+    iStartAction = int(m.group(2)) if m.group(2) else 0
+    iEndAction = int(m.group(3)[1:]) if m.group(3) else iStartAction
+    if nGroup:
+        # .get: index 0 (whole match) has no entry in dPos — was a KeyError
+        iStartAction = dPos.get(iStartAction, iStartAction)
+        iEndAction = dPos.get(iEndAction, iEndAction)
+
+    if cAction == "-":
+        ## error: split "suggestion # message|URL"
+        iMsg = sAction.find(" # ")
+        if iMsg == -1:
+            sMsg = "# Error. Error message not found."
+            sURL = ""
+            print(sMsg + " Action id: " + sIdAction)
+        else:
+            sMsg = sAction[iMsg+3:].strip()
+            sAction = sAction[:iMsg].strip()
+            sURL = ""
+            mURL = re.search("[|] *(https?://.*)", sMsg)
+            if mURL:
+                sURL = mURL.group(1).strip()
+                sMsg = sMsg[:mURL.start(0)].strip()
+        if sMsg[0:1] == "=":
+            sMsg = prepareFunction(sMsg[1:])
+            lFUNCTIONS.append(("gm_"+sIdAction, sMsg))
+            for x in re.finditer(r"group[(](\d+)[)]", sMsg):
+                if int(x.group(1)) > nGroup:
+                    print("# Error in groups in message at line " + sIdAction + " ("+str(nGroup)+" groups only)")
+            # was "=m_…": must match the "gm_…" name registered just above
+            sMsg = "=gm_"+sIdAction
+        else:
+            for x in re.finditer(r"\\(\d+)", sMsg):
+                if int(x.group(1)) > nGroup:
+                    print("# Error in groups in message at line " + sIdAction + " ("+str(nGroup)+" groups only)")
+            if re.search(r"[.]\w+[(]", sMsg):
+                print("# Error in message at line " + sIdAction + ": This message looks like code. Line should begin with =")
+
+    if sAction[0:1] == "=" or cAction == "=":
+        if "define" in sAction and not re.search(r"define\(\\\d+ *, *\[.*\] *\)", sAction):
+            print("# Error in action at line " + sIdAction + ": second argument for define must be a list of strings")
+        sAction = prepareFunction(sAction)
+        for x in re.finditer(r"group[(](\d+)[)]", sAction):
+            if int(x.group(1)) > nGroup:
+                print("# Error in groups in replacement at line " + sIdAction + " ("+str(nGroup)+" groups only)")
+    else:
+        for x in re.finditer(r"\\(\d+)", sAction):
+            if int(x.group(1)) > nGroup:
+                print("# Error in groups in replacement at line " + sIdAction + " ("+str(nGroup)+" groups only)")
+        if re.search(r"[.]\w+[(]|sugg\w+[(]", sAction):
+            print("# Error in action at line " + sIdAction + ": This action looks like code. Line should begin with =")
+
+    if cAction == "-":
+        ## error detected --> suggestion
+        if not sAction:
+            print("# Error in action at line " + sIdAction + ": This action is empty.")
+        if sAction[0:1] == "=":
+            lFUNCTIONS.append(("gs_"+sIdAction, sAction[1:]))
+            sAction = "=gs_"+sIdAction
+        elif sAction.startswith('"') and sAction.endswith('"'):
+            sAction = sAction[1:-1]
+        if not sMsg:
+            print("# Error in action at line " + sIdAction + ": The message is empty.")
+        return [sCondition, (cAction, sAction, iStartAction, iEndAction, nPriority, sMsg, sURL)]
+    elif cAction == "~":
+        ## text processor
+        if not sAction:
+            print("# Error in action at line " + sIdAction + ": This action is empty.")
+        if sAction[0:1] == "=":
+            lFUNCTIONS.append(("gp_"+sIdAction, sAction[1:]))
+            sAction = "=gp_"+sIdAction
+        elif sAction.startswith('"') and sAction.endswith('"'):
+            sAction = sAction[1:-1]
+        return [sCondition, (cAction, sAction, iStartAction, iEndAction)]
+    elif cAction == "=":
+        ## disambiguator
+        if sAction[0:1] == "=":
+            sAction = sAction[1:]
+        if not sAction:
+            print("# Error in action at line " + sIdAction + ": This action is empty.")
+        lFUNCTIONS.append(("gd_"+sIdAction, sAction))
+        sAction = "gd_"+sIdAction
+        return [sCondition, (cAction, sAction)]
+    elif cAction == ">":
+        ## no action, break loop if condition is False
+        return [sCondition, (cAction, "")]
+    else:
+        print("# Unknown action at line " + sIdAction)
+        # was "return None": the caller unpacks two values
+        return None, None
+
+
+def make (spLang, sLang, bJavaScript):
+    "compile rules, returns a dictionary of values"
+    # for clarity purpose, don’t create any file here
+
+    print("> read graph rules file...")
+    try:
+        with open(spLang + "/rules_graph.grx", 'r', encoding="utf-8") as hSrc:
+            lRules = hSrc.readlines()
+    except OSError:
+        print("Error. Rules file in project [" + sLang + "] not found.")
+        sys.exit(1)
+
+    # removing comments, zeroing empty lines, creating definitions, storing tests, merging rule lines
+    print("  parsing rules...")
+    global dDEF
+    lRuleLine = []
+    lTest = []
+    lTokenLine = []
+    sActions = ""
+    sRuleName = ""      # initialized: a blank line may occur before any rule group
+    nPriority = 4
+
+    for i, sLine in enumerate(lRules, 1):
+        sLine = sLine.rstrip()
+        if "\t" in sLine:
+            print("Error. Tabulation at line: ", i)
+            break
+        if sLine.startswith('#END'):
+            # stop interpreting the file
+            printBookmark(0, "BREAK BY #END", i)
+            break
+        elif sLine.startswith("#"):
+            # comment
+            pass
+        elif sLine.startswith("DEF:"):
+            # definition
+            m = re.match("DEF: +([a-zA-Z_][a-zA-Z_0-9]*) +(.+)$", sLine.strip())
+            if m:
+                dDEF["{"+m.group(1)+"}"] = m.group(2)
+            else:
+                print("Error in definition: ", end="")
+                print(sLine.strip())
+        elif sLine.startswith("TEST:"):
+            # test sentence
+            lTest.append("{:<8}".format(i) + " " + sLine[5:].strip())
+        elif sLine.startswith("TODO:"):
+            pass
+        elif sLine.startswith("!!"):
+            # bookmark
+            m = re.search("^!!+", sLine)
+            nExMk = len(m.group(0))
+            if sLine[nExMk:].strip():
+                printBookmark(nExMk-2, sLine[nExMk:].strip(), i)
+        elif sLine.startswith("__") and sLine.endswith("__"):
+            # new rule group
+            m = re.match("__(\\w+)(!\\d|)__", sLine)
+            if m:
+                sRuleName = m.group(1)
+                nPriority = int(m.group(2)[1:]) if m.group(2) else 4
+            else:
+                print("Error at rule group: ", sLine, " -- line:", i)
+                break
+        elif re.match("[  ]*$", sLine):
+            # empty line to end merging
+            # (loop variable renamed: it shadowed the outer line counter <i>)
+            for iTokenLine, sTokenLine in lTokenLine:
+                lRuleLine.append((iTokenLine, sRuleName, sTokenLine, sActions, nPriority))
+            lTokenLine = []
+            sActions = ""
+            sRuleName = ""
+            nPriority = 4
+        elif sLine.startswith((" ")):
+            # actions
+            sActions += " " + sLine.strip()
+        else:
+            # token line
+            lTokenLine.append([i, sLine.strip()])
+
+    # tests
+    print("  list tests...")
+    sGCTests = "\n".join(lTest)                                                 # TODO: not returned yet
+    sGCTestsJS = '{ "aData2": ' + json.dumps(lTest, ensure_ascii=False) + " }\n"    # TODO: not returned yet
+
+    # processing rules
+    print("  preparing rules...")
+    lPreparedRule = []
+    for i, sRuleGroup, sTokenLine, sActions, nPriority in lRuleLine:
+        for lRule in createRule(i, sRuleGroup, sTokenLine, sActions, nPriority):
+            lPreparedRule.append(lRule)
+
+    # Graph creation
+    for e in lPreparedRule:
+        print(e)
+
+    oDATG = datg.DATG(lPreparedRule, sLang)
+    oRuleGraph = oDATG.createGraph()
+
+    # Result (still experimental: values not filled in yet)
+    d = {
+        "g_callables": None,
+        "g_gctests": None,
+        "graph_rules": None,
+    }
+
+    return d

ADDED datg.py
Index: datg.py
==================================================================
--- /dev/null
+++ datg.py
@@ -0,0 +1,183 @@
+#!python3
+
+# RULE GRAPH BUILDER
+#
+# by Olivier R.
+# License: MPL 2
+
+
+import sys
+import json
+import time
+import traceback
+
+from graphspell.progressbar import ProgressBar
+
+
+
+class DATG:
+    """DIRECT ACYCLIC TOKEN GRAPH"""
+    # This code is inspired from Steve Hanov’s DAWG, 2011. (http://stevehanov.ca/blog/index.php?id=115)
+
+    def __init__ (self, lRule, sLangCode):
+        print("===== Direct Acyclic Token Graph - Minimal Acyclic Finite State Automaton =====")
+
+        # Preparing DATG
+        print(" > Preparing list of tokens")
+        self.sLangCode = sLangCode
+        self.nRule = len(lRule)
+        self.aPreviousRule = []
+        Node.resetNextId()
+        self.oRoot = Node()
+        self.lUncheckedNodes = []       # nodes that have not been checked for duplication
+        self.lMinimizedNodes = {}       # unique nodes that have been checked for duplication
+        self.nNode = 0
+        self.nArc = 0
+
+        # build
+        lRule.sort()
+        oProgBar = ProgressBar(0, len(lRule))
+        for aRule in lRule:
+            self.insert(aRule)
+            oProgBar.increment(1)
+        oProgBar.done()
+        self.finish()
+        self.countNodes()
+        self.countArcs()
+        self.displayInfo()
+
+    # BUILD DATG
+    def insert (self, aRule):
+        "insert one rule; rules must be inserted in lexicographic order"
+        if aRule < self.aPreviousRule:
+            sys.exit("# Error: tokens must be inserted in order.")
+
+        # find common prefix between word and previous word
+        nCommonPrefix = 0
+        for i in range(min(len(aRule), len(self.aPreviousRule))):
+            if aRule[i] != self.aPreviousRule[i]:
+                break
+            nCommonPrefix += 1
+
+        # Check the lUncheckedNodes for redundant nodes, proceeding from last
+        # one down to the common prefix size. Then truncate the list at that point.
+        self._minimize(nCommonPrefix)
+
+        # add the suffix, starting from the correct node mid-way through the graph
+        if len(self.lUncheckedNodes) == 0:
+            oNode = self.oRoot
+        else:
+            oNode = self.lUncheckedNodes[-1][2]
+
+        iToken = nCommonPrefix
+        for token in aRule[nCommonPrefix:]:
+            oNextNode = Node()
+            oNode.dArcs[token] = oNextNode
+            self.lUncheckedNodes.append((oNode, token, oNextNode))
+            if iToken == (len(aRule) - 4):
+                # each rule carries 4 trailing metadata fields (line, name,
+                # condition, action id): the node after the last real token
+                # is final, and the metadata chain starts here
+                oNode.bFinal = True
+                oNextNode.bInfo = True
+            iToken += 1
+            oNode = oNextNode
+        oNode.bFinal = True
+        self.aPreviousRule = aRule
+
+    def finish (self):
+        "minimize unchecked nodes"
+        self._minimize(0)
+
+    def _minimize (self, downTo):
+        # proceed from the leaf up to a certain point
+        for i in range( len(self.lUncheckedNodes)-1, downTo-1, -1 ):
+            oNode, token, oChildNode = self.lUncheckedNodes[i]
+            if oChildNode in self.lMinimizedNodes:
+                # replace the child with the previously encountered one
+                oNode.dArcs[token] = self.lMinimizedNodes[oChildNode]
+            else:
+                # add the state to the minimized nodes
+                self.lMinimizedNodes[oChildNode] = oChildNode
+            self.lUncheckedNodes.pop()
+
+    def countNodes (self):
+        self.nNode = len(self.lMinimizedNodes)
+
+    def countArcs (self):
+        self.nArc = 0
+        for oNode in self.lMinimizedNodes:
+            self.nArc += len(oNode.dArcs)
+
+    def lookup (self, sWord):
+        oNode = self.oRoot
+        for c in sWord:
+            if c not in oNode.dArcs:
+                return False
+            oNode = oNode.dArcs[c]
+        return oNode.bFinal
+
+    def displayInfo (self):
+        print(" * {:<12} {:>16,}".format("Rules:", self.nRule))
+        print(" * {:<12} {:>16,}".format("Nodes:", self.nNode))
+        print(" * {:<12} {:>16,}".format("Arcs:", self.nArc))
+
+    def createGraph (self):
+        "returns the graph as a dictionary: node hash -> node dictionary"
+        dGraph = { 0: self.oRoot.getNodeAsDict() }
+        print(0, "\t", self.oRoot.getNodeAsDict())
+        for oNode in self.lMinimizedNodes:
+            sHashId = oNode.__hash__()
+            if sHashId not in dGraph:
+                dGraph[sHashId] = oNode.getNodeAsDict()
+                print(sHashId, "\t", dGraph[sHashId])
+            else:
+                print("Error. Double node… same id: ", sHashId)
+                print(str(oNode.getNodeAsDict()))
+        return dGraph
+
+
+
+class Node:
+    NextId = 0
+
+    def __init__ (self):
+        self.i = Node.NextId
+        Node.NextId += 1
+        self.bFinal = False
+        self.bInfo = False
+        self.dArcs = {}         # key: arc value; value: a node
+
+    @classmethod
+    def resetNextId (cls):
+        cls.NextId = 0
+
+    def __str__ (self):
+        # Caution! this function is used for hashing and comparison!
+        cFinal = "1" if self.bFinal else "0"
+        cInfo = "1" if self.bInfo else "0"
+        l = [cFinal, cInfo]
+        for (key, oNode) in self.dArcs.items():
+            l.append(str(key))
+            l.append(str(oNode.i))
+        return "_".join(l)
+
+    def __hash__ (self):
+        # Used as a key in a python dictionary.
+        return self.__str__().__hash__()
+
+    def __eq__ (self, other):
+        # Used as a key in a python dictionary.
+        # Nodes are equivalent if they have identical arcs, and each identical arc leads to identical states.
+        return self.__str__() == other.__str__()
+
+    def getNodeAsDict (self):
+        "returns the node as a dictionary structure"
+        dNode = {}
+        dRegex = {}
+        for arc, oNode in self.dArcs.items():
+            if type(arc) == str and arc.startswith("~"):
+                dRegex[arc[1:]] = oNode.__hash__()
+            else:
+                dNode[arc] = oNode.__hash__()
+        # NOTE(review): the special keys below were lost in the dump (angle-bracketed
+        # tokens stripped, leaving dNode[""] overwritten three times); restored as
+        # distinct markers — confirm exact spellings against upstream.
+        if dRegex:
+            dNode["<re>"] = dRegex
+        if self.bFinal:
+            dNode["<final>"] = ""
+        if self.bInfo:
+            dNode["<info>"] = ""
+        return dNode

ADDED gc_lang/fr/rules_graph.grx
Index: gc_lang/fr/rules_graph.grx
==================================================================
--- /dev/null
+++ gc_lang/fr/rules_graph.grx
@@ -0,0 +1,60 @@
+#
+# RÈGLES DE GRAMMAIRE FRANÇAISE POUR GRAMMALECTE
+# par Olivier R.
+#
+# Copyright © 2011-2017.
+#
+# This file is part of Grammalecte.
+#
+# Grammalecte is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# Grammalecte is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with Grammalecte. If not, see <http://www.gnu.org/licenses/>
+#
+
+# RÈGLES POUR LE GRAPHE DE TOKENS
+
+# DOCUMENTATION
+# Expressions régulières en Python : http://docs.python.org/library/re.html
+
+# [++] : séparateur des règles pour le paragraphe et des règles pour la phrase.
+
+# Types d’action:
+#   ->> erreur
+#   ~>> préprocesseur de texte
+#   =>> désambiguïsateur
+
+
+# Fin d’interprétation du fichier avec une ligne commençant par #END
+
+# ERREURS COURANTES
+# http://fr.wikipedia.org/wiki/Wikip%C3%A9dia:Fautes_d%27orthographe/Courantes
+
+
+__rule1__
+les ~:N:.:s
+des ~:N:.:s
+ces ~:N:.:s
+    <<- -1>> acquit                                         # Message0|http://test.grammalecte.net
+
+__rule2__
+ci important que soi
+ci vraiment il y a
+ci pour ça
+    <<- morph(\2, ":[WAR]", False) -1>> si                  # Message1|http://test.grammalecte.net
+
+__rule3__
+contre nature
+contre pétrie
+contre action
+    <<- morph(\1, "xxxx") -1:2>> =$area.replace(" ", "")    # Message2|http://test.grammalecte.org
+    <<- ~>> =$area.replace(" ", "")

Index: make.py
==================================================================
--- make.py
+++ make.py
@@ -17,10 +17,11 @@
 
 from distutils import dir_util, file_util
 
 import dialog_bundled
 import compile_rules
+import compile_rules_graph
 import helpers
 import lex_build
 
 sWarningMessage = "The content of this folder is generated by code and replaced at each build.\n"
@@ -191,12 +192,14 @@
     dVars = xConfig._sections['args']
     dVars['locales'] = dVars["locales"].replace("_", "-")
     dVars['loc'] = str(dict([ [s, [s[0:2], s[3:5], ""]] for s in dVars["locales"].split(" ") ]))
 
     ## COMPILE RULES
-    dResult = compile_rules.make(spLang, dVars['lang'], bJavaScript)
-    dVars.update(dResult)
+    dResultRegex = compile_rules.make(spLang, dVars['lang'], bJavaScript)
+    dVars.update(dResultRegex)
+
+    dResultGraph = compile_rules_graph.make(spLang, dVars['lang'], bJavaScript)
 
     ## READ GRAMMAR CHECKER PLUGINS
     print("PYTHON:")
     print("+ Plugins: ", end="")
     sCodePlugins = ""