Index: compile_rules_graph.py
==================================================================
--- compile_rules_graph.py
+++ compile_rules_graph.py
@@ -1,11 +1,11 @@
 # Create a Direct Acyclic Rule Graph (DARG)
 
 import re
 import traceback
 import json
-import datg
+import darg
 
 
 dDEF = {}
 dACTIONS = {}
 lFUNCTIONS = []
@@ -81,12 +81,12 @@
     # Condition
     sCondition = sAction[:m.start()].strip()
     if sCondition:
         sCondition = prepareFunction(sCondition)
         sCondition = changeReferenceToken(sCondition, dPos)
-        lFUNCTIONS.append(("gc_"+sIdAction, sCondition))
-        sCondition = "gc_"+sIdAction
+        lFUNCTIONS.append(("g_c_"+sIdAction, sCondition))
+        sCondition = "g_c_"+sIdAction
     else:
         sCondition = ""
     # Action
     cAction = m.group(1)
     sAction = sAction[m.end():].strip()
@@ -112,15 +112,15 @@
         if mURL:
             sURL = mURL.group(1).strip()
             sMsg = sMsg[:mURL.start(0)].strip()
         if sMsg[0:1] == "=":
             sMsg = prepareFunction(sMsg[1:])
-            lFUNCTIONS.append(("gm_"+sIdAction, sMsg))
+            lFUNCTIONS.append(("g_m_"+sIdAction, sMsg))
             for x in re.finditer("group[(](\d+)[)]", sMsg):
                 if int(x.group(1)) > nGroup:
                     print("# Error in groups in message at line " + sIdAction + " ("+str(nGroup)+" groups only)")
-            sMsg = "=m_"+sIdAction
+            sMsg = "=g_m_"+sIdAction
         else:
             for x in re.finditer(r"\\(\d+)", sMsg):
                 if int(x.group(1)) > nGroup:
                     print("# Error in groups in message at line " + sIdAction + " ("+str(nGroup)+" groups only)")
             if re.search("[.]\\w+[(]", sMsg):
@@ -143,12 +143,12 @@
     if cAction == "-":
         ## error detected --> suggestion
         if not sAction:
             print("# Error in action at line " + sIdAction + ": This action is empty.")
         if sAction[0:1] == "=":
-            lFUNCTIONS.append(("gs_"+sIdAction, sAction[1:]))
-            sAction = "=gs_"+sIdAction
+            lFUNCTIONS.append(("g_s_"+sIdAction, sAction[1:]))
+            sAction = "=g_s_"+sIdAction
         elif sAction.startswith('"') and sAction.endswith('"'):
             sAction = sAction[1:-1]
         if not sMsg:
             print("# Error in action at line " + sIdAction + ": The message is empty.")
         return [sCondition, cAction, sAction, iStartAction, iEndAction, nPriority, sMsg, sURL]
@@ -155,23 +155,23 @@
     elif cAction == "~":
         ## text processor
         if not sAction:
             print("# Error in action at line " + sIdAction + ": This action is empty.")
         if sAction[0:1] == "=":
-            lFUNCTIONS.append(("gp_"+sIdAction, sAction[1:]))
-            sAction = "=gp_"+sIdAction
+            lFUNCTIONS.append(("g_p_"+sIdAction, sAction[1:]))
+            sAction = "=g_p_"+sIdAction
         elif sAction.startswith('"') and sAction.endswith('"'):
             sAction = sAction[1:-1]
         return [sCondition, cAction, sAction, iStartAction, iEndAction]
     elif cAction == "=":
         ## disambiguator
         if sAction[0:1] == "=":
             sAction = sAction[1:]
         if not sAction:
             print("# Error in action at line " + sIdAction + ": This action is empty.")
-        lFUNCTIONS.append(("gd_"+sIdAction, sAction))
-        sAction = "gd_"+sIdAction
+        lFUNCTIONS.append(("g_d_"+sIdAction, sAction))
+        sAction = "g_d_"+sIdAction
         return [sCondition, cAction, sAction]
     elif cAction == ">":
         ## no action, break loop if condition is False
         return [sCondition, cAction, ""]
     else:
@@ -264,18 +264,19 @@
 
     # Graph creation
     for e in lPreparedRule:
         print(e)
 
-    oDATG = datg.DATG(lPreparedRule, sLang)
-    oRuleGraph = oDATG.createGraph()
+    oDARG = darg.DARG(lPreparedRule, sLang)
+    oRuleGraph = oDARG.createGraph()
 
     # Result
     d = {
-        "g_callables": None,
-        "g_gctests": None,
-        "graph_rules": None,
+        "graph_callables": None,
+        "graph_gctests": None,
+        "rules_graph": oRuleGraph,
+        "rules_actions": dACTIONS
     }
 
     return d
 
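A note on the renamed module: compile_rules_graph.make() now feeds its prepared rules to darg.DARG and ships the resulting graph out through the returned dict. A minimal driver sketch, assuming darg.py and its graphspell.progressbar dependency are on the import path; the token lists are toy data, not real prepared rules:

    import darg

    # Toy "prepared rules": flat lists of tokens, which is what DARG.insert()
    # walks. Regex arcs are prefixed with "~", rule references with "##".
    lPreparedRule = [
        ["ils", "~:V", "##test_1"],
        ["elles", "~:V", "##test_1"],
    ]

    oDARG = darg.DARG(lPreparedRule, "fr")     # sorts and inserts the rules, then minimizes
    dGraph = oDARG.createGraph()               # {node_hash: {arc: node_hash, ...}, ...}, root at key 0
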
ADDED   darg.py
Index: darg.py
==================================================================
--- darg.py
+++ darg.py
@@ -0,0 +1,184 @@
+#!python3
+
+# RULE GRAPH BUILDER
+#
+# by Olivier R.
+# License: MPL 2
+
+
+import sys
+import json
+import time
+import traceback
+
+from graphspell.progressbar import ProgressBar
+
+
+
+class DARG:
+    """DIRECT ACYCLIC RULE GRAPH"""
+    # This code is inspired by Steve Hanov’s DAWG, 2011. (http://stevehanov.ca/blog/index.php?id=115)
+
+    def __init__ (self, lRule, sLangCode):
+        print("===== Direct Acyclic Rule Graph - Minimal Acyclic Finite State Automaton =====")
+
+        # Preparing DARG
+        print(" > Preparing list of tokens")
+        self.sLangCode = sLangCode
+        self.nRule = len(lRule)
+        self.aPreviousRule = []
+        Node.resetNextId()
+        self.oRoot = Node()
+        self.lUncheckedNodes = []    # list of nodes that have not been checked for duplication.
+        self.lMinimizedNodes = {}    # dict of unique nodes that have been checked for duplication (used as a set).
+        self.nNode = 0
+        self.nArc = 0
+
+        # build
+        lRule.sort()
+        oProgBar = ProgressBar(0, len(lRule))
+        for aRule in lRule:
+            self.insert(aRule)
+            oProgBar.increment(1)
+        oProgBar.done()
+        self.finish()
+        self.countNodes()
+        self.countArcs()
+        self.displayInfo()
+
+    # BUILD DARG
+    def insert (self, aRule):
+        if aRule < self.aPreviousRule:
+            sys.exit("# Error: tokens must be inserted in order.")
+
+        # find common prefix between the new rule and the previous one
+        nCommonPrefix = 0
+        for i in range(min(len(aRule), len(self.aPreviousRule))):
+            if aRule[i] != self.aPreviousRule[i]:
+                break
+            nCommonPrefix += 1
+
+        # Check the lUncheckedNodes for redundant nodes, proceeding from last
+        # one down to the common prefix size. Then truncate the list at that point.
+        self._minimize(nCommonPrefix)
+
+        # add the suffix, starting from the correct node mid-way through the graph
+        if len(self.lUncheckedNodes) == 0:
+            oNode = self.oRoot
+        else:
+            oNode = self.lUncheckedNodes[-1][2]
+
+        iToken = nCommonPrefix
+        for token in aRule[nCommonPrefix:]:
+            oNextNode = Node()
+            oNode.dArcs[token] = oNextNode
+            self.lUncheckedNodes.append((oNode, token, oNextNode))
+            if iToken == (len(aRule) - 2):
+                oNode.bFinal = True
+            iToken += 1
+            oNode = oNextNode
+        oNode.bFinal = True
+        self.aPreviousRule = aRule
+
+    def finish (self):
+        "minimize unchecked nodes"
+        self._minimize(0)
+
+    def _minimize (self, downTo):
+        # proceed from the leaf up to a certain point
+        for i in range( len(self.lUncheckedNodes)-1, downTo-1, -1 ):
+            oNode, token, oChildNode = self.lUncheckedNodes[i]
+            if oChildNode in self.lMinimizedNodes:
+                # replace the child with the previously encountered one
+                oNode.dArcs[token] = self.lMinimizedNodes[oChildNode]
+            else:
+                # add the state to the minimized nodes.
+                self.lMinimizedNodes[oChildNode] = oChildNode
+            self.lUncheckedNodes.pop()
+
+    def countNodes (self):
+        self.nNode = len(self.lMinimizedNodes)
+
+    def countArcs (self):
+        self.nArc = 0
+        for oNode in self.lMinimizedNodes:
+            self.nArc += len(oNode.dArcs)
+
+    def lookup (self, sWord):
+        oNode = self.oRoot
+        for c in sWord:
+            if c not in oNode.dArcs:
+                return False
+            oNode = oNode.dArcs[c]
+        return oNode.bFinal
+
+    def displayInfo (self):
+        print(" * {:<12} {:>16,}".format("Rules:", self.nRule))
+        print(" * {:<12} {:>16,}".format("Nodes:", self.nNode))
+        print(" * {:<12} {:>16,}".format("Arcs:", self.nArc))
+
+    def createGraph (self):
+        dGraph = { 0: self.oRoot.getNodeAsDict() }
+        print(0, "\t", self.oRoot.getNodeAsDict())
+        for oNode in self.lMinimizedNodes:
+            sHashId = oNode.__hash__()
+            if sHashId not in dGraph:
+                dGraph[sHashId] = oNode.getNodeAsDict()
+                print(sHashId, "\t", dGraph[sHashId])
+            else:
+                print("Error. Double node… same id: ", sHashId)
+                print(str(oNode.getNodeAsDict()))
+        return dGraph
+
+
+
+class Node:
+    NextId = 0
+
+    def __init__ (self):
+        self.i = Node.NextId
+        Node.NextId += 1
+        self.bFinal = False
+        self.dArcs = {}    # key: arc value; value: a node
+
+    @classmethod
+    def resetNextId (cls):
+        cls.NextId = 0
+
+    def __str__ (self):
+        # Caution! this function is used for hashing and comparison!
+        cFinal = "1"  if self.bFinal  else "0"
+        l = [cFinal]
+        for (key, oNode) in self.dArcs.items():
+            l.append(str(key))
+            l.append(str(oNode.i))
+        return "_".join(l)
+
+    def __hash__ (self):
+        # Used as a key in a python dictionary.
+        return self.__str__().__hash__()
+
+    def __eq__ (self, other):
+        # Used as a key in a python dictionary.
+        # Nodes are equivalent if they have identical arcs, and each identical arc leads to identical states.
+        return self.__str__() == other.__str__()
+
+    def getNodeAsDict (self):
+        "returns the node as a dictionary structure"
+        dNode = {}
+        dRegex = {}
+        dRules = {}
+        for arc, oNode in self.dArcs.items():
+            if type(arc) == str and arc.startswith("~"):
+                dRegex[arc[1:]] = oNode.__hash__()
+            elif type(arc) == str and arc.startswith("##"):
+                dRules[arc[1:]] = oNode.__hash__()
+            else:
+                dNode[arc] = oNode.__hash__()
+        if dRegex:
+            dNode["~"] = dRegex          # key read back by the sentence parser
+        if dRules:
+            dNode["<rules>"] = dRules    # sentinel key (reconstructed); must match gc_sentence.py
+        #if self.bFinal:
+        #    dNode["<final>"] = 1
+        return dNode
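The payoff of the minimization pass is that shared suffixes are stored once. A self-contained toy check, under the same assumption as above that darg.py and graphspell.progressbar are importable:

    import darg

    # Both rules share the prefix "a" and the suffix "cat"; after minimization
    # the "cat" tail exists only once in the graph.
    oDARG = darg.DARG([["a", "big", "cat"], ["a", "fat", "cat"]], "en")
    print(oDARG.nNode, oDARG.nArc)            # 3 nodes / 3 arcs (root excluded), vs. 5/5 for a plain trie
    assert oDARG.lookup(["a", "big", "cat"])  # full rule path reaches a final node
    assert not oDARG.lookup(["a", "big", "dog"])
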
DELETED datg.py
Index: datg.py
==================================================================
--- datg.py
+++ datg.py
@@ -1,183 +0,0 @@
-#!python3
-
-# RULE GRAPH BUILDER
-#
-# by Olivier R.
-# License: MPL 2
-
-
-import json
-import time
-import traceback
-
-from graphspell.progressbar import ProgressBar
-
-
-
-class DATG:
-    """DIRECT ACYCLIC TOKEN GRAPH"""
-    # This code is inspired from Steve Hanov’s DAWG, 2011. (http://stevehanov.ca/blog/index.php?id=115)
-
-    def __init__ (self, lRule, sLangCode):
-        print("===== Direct Acyclic Token Graph - Minimal Acyclic Finite State Automaton =====")
-
-        # Preparing DATG
-        print(" > Preparing list of tokens")
-        self.sLangCode = sLangCode
-        self.nRule = len(lRule)
-        self.aPreviousRule = []
-        Node.resetNextId()
-        self.oRoot = Node()
-        self.lUncheckedNodes = []    # list of nodes that have not been checked for duplication.
-        self.lMinimizedNodes = {}    # list of unique nodes that have been checked for duplication.
-        self.nNode = 0
-        self.nArc = 0
-
-        # build
-        lRule.sort()
-        oProgBar = ProgressBar(0, len(lRule))
-        for aRule in lRule:
-            self.insert(aRule)
-            oProgBar.increment(1)
-        oProgBar.done()
-        self.finish()
-        self.countNodes()
-        self.countArcs()
-        self.displayInfo()
-
-    # BUILD DATG
-    def insert (self, aRule):
-        if aRule < self.aPreviousRule:
-            sys.exit("# Error: tokens must be inserted in order.")
-
-        # find common prefix between word and previous word
-        nCommonPrefix = 0
-        for i in range(min(len(aRule), len(self.aPreviousRule))):
-            if aRule[i] != self.aPreviousRule[i]:
-                break
-            nCommonPrefix += 1
-
-        # Check the lUncheckedNodes for redundant nodes, proceeding from last
-        # one down to the common prefix size. Then truncate the list at that point.
-        self._minimize(nCommonPrefix)
-
-        # add the suffix, starting from the correct node mid-way through the graph
-        if len(self.lUncheckedNodes) == 0:
-            oNode = self.oRoot
-        else:
-            oNode = self.lUncheckedNodes[-1][2]
-
-        iToken = nCommonPrefix
-        for token in aRule[nCommonPrefix:]:
-            oNextNode = Node()
-            oNode.dArcs[token] = oNextNode
-            self.lUncheckedNodes.append((oNode, token, oNextNode))
-            if iToken == (len(aRule) - 2):
-                oNode.bFinal = True
-            iToken += 1
-            oNode = oNextNode
-        oNode.bFinal = True
-        self.aPreviousRule = aRule
-
-    def finish (self):
-        "minimize unchecked nodes"
-        self._minimize(0)
-
-    def _minimize (self, downTo):
-        # proceed from the leaf up to a certain point
-        for i in range( len(self.lUncheckedNodes)-1, downTo-1, -1 ):
-            oNode, token, oChildNode = self.lUncheckedNodes[i]
-            if oChildNode in self.lMinimizedNodes:
-                # replace the child with the previously encountered one
-                oNode.dArcs[token] = self.lMinimizedNodes[oChildNode]
-            else:
-                # add the state to the minimized nodes.
-                self.lMinimizedNodes[oChildNode] = oChildNode
-            self.lUncheckedNodes.pop()
-
-    def countNodes (self):
-        self.nNode = len(self.lMinimizedNodes)
-
-    def countArcs (self):
-        self.nArc = 0
-        for oNode in self.lMinimizedNodes:
-            self.nArc += len(oNode.dArcs)
-
-    def lookup (self, sWord):
-        oNode = self.oRoot
-        for c in sWord:
-            if c not in oNode.dArcs:
-                return False
-            oNode = oNode.dArcs[c]
-        return oNode.bFinal
-
-    def displayInfo (self):
-        print(" * {:<12} {:>16,}".format("Rules:", self.nRule))
-        print(" * {:<12} {:>16,}".format("Nodes:", self.nNode))
-        print(" * {:<12} {:>16,}".format("Arcs:", self.nArc))
-
-    def createGraph (self):
-        dGraph = { 0: self.oRoot.getNodeAsDict() }
-        print(0, "\t", self.oRoot.getNodeAsDict())
-        for oNode in self.lMinimizedNodes:
-            sHashId = oNode.__hash__()
-            if sHashId not in dGraph:
-                dGraph[sHashId] = oNode.getNodeAsDict()
-                print(sHashId, "\t", dGraph[sHashId])
-            else:
-                print("Error. Double node… same id: ", sHashId)
-                print(str(oNode.getNodeAsDict()))
-        return dGraph
-
-
-
-class Node:
-    NextId = 0
-
-    def __init__ (self):
-        self.i = Node.NextId
-        Node.NextId += 1
-        self.bFinal = False
-        self.dArcs = {}    # key: arc value; value: a node
-
-    @classmethod
-    def resetNextId (cls):
-        cls.NextId = 0
-
-    def __str__ (self):
-        # Caution! this function is used for hashing and comparison!
-        cFinal = "1"  if self.bFinal  else "0"
-        l = [cFinal]
-        for (key, oNode) in self.dArcs.items():
-            l.append(str(key))
-            l.append(str(oNode.i))
-        return "_".join(l)
-
-    def __hash__ (self):
-        # Used as a key in a python dictionary.
-        return self.__str__().__hash__()
-
-    def __eq__ (self, other):
-        # Used as a key in a python dictionary.
-        # Nodes are equivalent if they have identical arcs, and each identical arc leads to identical states.
-        return self.__str__() == other.__str__()
-
-    def getNodeAsDict (self):
-        "returns the node as a dictionary structure"
-        dNode = {}
-        dRegex = {}
-        dRules = {}
-        for arc, oNode in self.dArcs.items():
-            if type(arc) == str and arc.startswith("~"):
-                dRegex[arc[1:]] = oNode.__hash__()
-            elif arc.startswith("##"):
-                dRules[arc[1:]] = oNode.__hash__()
-            else:
-                dNode[arc] = oNode.__hash__()
-        if dRegex:
-            dNode["~"] = dRegex
-        if dRules:
-            dNode["<rules>"] = dRules
-        #if self.bFinal:
-        #    dNode["<final>"] = 1
-        return dNode
ADDED   gc_core/py/lang_core/gc_rules_graph.py
Index: gc_core/py/lang_core/gc_rules_graph.py
==================================================================
--- gc_core/py/lang_core/gc_rules_graph.py
+++ gc_core/py/lang_core/gc_rules_graph.py
@@ -0,0 +1,5 @@
+# generated code, do not edit
+
+dGraph = ${rules_graph}
+
+dRule = ${rules_actions}
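To make the template concrete: after substitution, the module carries literal dict sources. The values below are invented for illustration; only the shapes are taken from darg.Node.getNodeAsDict() and from the action lists built by compile_rules_graph.py:

    # Hypothetical content of a generated gc_rules_graph.py (invented ids and values).
    dGraph = {
        0: { "ils": -3712, "elles": -3712 },        # root node: plain token arcs
        -3712: { "~": { ":V": -8145 } },            # regex arcs, matched on morphology
        -8145: { "<rules>": { "test_1": -2209 } },  # rule ids reachable at this node
        -2209: {},
    }

    dRule = {
        # for a "-" action: [condition, type, suggestion, iStart, iEnd, priority, message, URL]
        "test_1": ["g_c_test_1", "-", "allaient", 0, 1, 4, "Possible confusion.", ""],
    }
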
ADDED   gc_core/py/lang_core/gc_sentence.py
Index: gc_core/py/lang_core/gc_sentence.py
==================================================================
--- gc_core/py/lang_core/gc_sentence.py
+++ gc_core/py/lang_core/gc_sentence.py
@@ -0,0 +1,96 @@
+# Sentence checker
+
+import re
+
+from ..graphspell.tokenizer import Tokenizer
+from .gc_rules_graph import dGraph, dRule
+
+
+oTokenizer = Tokenizer("${lang}")
+
+
+class Sentence:
+
+    def __init__ (self, sSentence, sSentence0, nOffset):
+        self.sSentence = sSentence
+        self.sSentence0 = sSentence0
+        self.nOffset = nOffset
+        self.lToken = list(oTokenizer.genTokens(sSentence))
+
+    def parse (self):
+        dErr = {}
+        lPointer = []
+        for dToken in self.lToken:
+            # advance each pointer along matching arcs; pointers without a match are dropped
+            lNextPointer = []
+            for dPointer in lPointer:
+                for dNode in self._getNextMatchingNodes(dToken, dPointer["dNode"]):
+                    lNextPointer.append({"nOffset": dPointer["nOffset"] + 1, "dNode": dNode})
+            lPointer = lNextPointer
+            # try to start a new match from the root on the current token
+            for dNode in self._getNextMatchingNodes(dToken, dGraph[0]):
+                lPointer.append({"nOffset": 0, "dNode": dNode})
+        for dPointer in lPointer:
+            if "<rules>" in dPointer["dNode"]:
+                dErr.update(self._executeActions(dPointer["dNode"]["<rules>"]))
+        return dErr
+
+    def _getNextMatchingNodes (self, dToken, dNode):
+        # arc for the token value
+        if dToken["sValue"] in dNode:
+            yield dGraph[dNode[dToken["sValue"]]]
+        # arcs for the token lemmas
+        for sLemma in dToken["sLemma"]:
+            if sLemma in dNode:
+                yield dGraph[dNode[sLemma]]
+        # regex arcs, matched against the token morphologies
+        if "~" in dNode:
+            for sRegex in dNode["~"]:
+                for sMorph in dToken["lMorph"]:
+                    if re.search(sRegex, sMorph):
+                        yield dGraph[dNode["~"][sRegex]]
+
+    def _executeActions (self, dNode):
+        # Draft transplanted from the regex engine: m, dDA, dPriority, nPriority,
+        # _createError, _rewrite and echo are not defined in this module yet.
+        dErrs = {}
+        for sLineId, nextNodeKey in dNode.items():
+            for sArc in dGraph[nextNodeKey]:
+                bCondMemo = None
+                sFuncCond, cActionType, sWhat, *eAct = dRule[sArc]
+                # action in lActions: [ condition, action type, replacement/suggestion/action[, iGroupStart, iGroupEnd[, message, URL]] ]
+                try:
+                    bCondMemo = not sFuncCond or globals()[sFuncCond](s, sx, m, dDA, sCountry, bCondMemo)
+                    if bCondMemo:
+                        if cActionType == "-":
+                            # grammar error
+                            nErrorStart = nOffset + m.start(eAct[0])
+                            nErrorEnd = nOffset + m.end(eAct[1])
+                            if nErrorStart not in dErrs or nPriority > dPriority[nErrorStart]:
+                                dErrs[nErrorStart] = _createError(self.lToken, self.sSentence0, sWhat, nOffset, m, nErrorStart, nErrorEnd, sLineId, bUppercase, eAct[2], eAct[3], bIdRule, sOption, bContext)
+                                dPriority[nErrorStart] = nPriority
+                        elif cActionType == "~":
+                            # text processor
+                            self.lToken = _rewrite(self.lToken, sWhat, bUppercase)
+                            bChange = True
+                        elif cActionType == "=":
+                            # disambiguation
+                            globals()[sWhat](self.lToken, dDA)
+                        elif cActionType == ">":
+                            # we do nothing, this test is just a condition to apply all following actions
+                            pass
+                        else:
+                            echo("# error: unknown action at " + sLineId)
+                    elif cActionType == ">":
+                        break
+                except Exception as e:
+                    raise Exception(str(e), "# " + sLineId + " # " + sArc)
+        return dErrs
+
+    def _createWriterError (self):
+        d = {}
+        return d
+
+    def _createDictError (self):
+        d = {}
+        return d
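The pointer walk in parse() can be exercised without the engine's helpers. A self-contained toy, with a handwritten dGraph in the shape createGraph() emits (node ids invented):

    # Standalone toy of the parse() pointer walk over token-value arcs only.
    dGraph = {
        0: {"ils": 1},                     # root
        1: {"vont": 2},
        2: {"<rules>": {"rule_42": 3}},    # a rule matches after "ils vont"
        3: {},
    }

    def match (lTokens):
        "return the ids of rules whose token path is matched while scanning lTokens"
        lRuleIds = []
        lPointers = []
        for sToken in lTokens:
            lNext = []
            for dNode in lPointers:
                if sToken in dNode:                  # advance existing matches
                    lNext.append(dGraph[dNode[sToken]])
            if sToken in dGraph[0]:                  # try a new match starting here
                lNext.append(dGraph[dGraph[0][sToken]])
            lPointers = lNext
            for dNode in lPointers:
                if "<rules>" in dNode:
                    lRuleIds.extend(dNode["<rules>"].keys())
        return lRuleIds

    print(match("ils vont".split()))    # ['rule_42']
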
Index: make.py
==================================================================
--- make.py
+++ make.py
@@ -196,10 +196,11 @@
     ## COMPILE RULES
     dResultRegex = compile_rules.make(spLang, dVars['lang'], bJavaScript)
     dVars.update(dResultRegex)
 
     dResultGraph = compile_rules_graph.make(spLang, dVars['lang'], bJavaScript)
+    dVars.update(dResultGraph)
 
     ## READ GRAMMAR CHECKER PLUGINS
     print("PYTHON:")
     print("+ Plugins: ", end="")
     sCodePlugins = ""
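For reference, the ${…} placeholders in the generated files follow Python's string.Template syntax. A sketch of the substitution step that make.py performs with dVars — the helper below is illustrative, not the project's actual template code:

    # Illustrative placeholder expansion: dVars["rules_graph"] and
    # dVars["rules_actions"] come from compile_rules_graph.make().
    from string import Template

    sTemplate = "# generated code, do not edit\n\ndGraph = ${rules_graph}\n\ndRule = ${rules_actions}\n"
    dVars = { "rules_graph": repr({0: {}}), "rules_actions": repr({}) }

    sCode = Template(sTemplate).safe_substitute(dVars)   # leaves unknown ${…} intact
    print(sCode)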