ADDED compile_rules_graph.py Index: compile_rules_graph.py ================================================================== --- /dev/null +++ compile_rules_graph.py @@ -0,0 +1,282 @@ +# Create a Direct Acyclic Rule Graph (DARG) + +import re +import traceback +import json +import darg + + +dDEF = {} +dACTIONS = {} +lFUNCTIONS = [] + + +def prepareFunction (s): + s = s.replace("__also__", "bCondMemo") + s = s.replace("__else__", "not bCondMemo") + s = re.sub(r"isStart *", 'before(["", ","])', s) + s = re.sub(r"isRealStart *", 'before([""])', s) + s = re.sub(r"isStart0 *", 'before0(["", ","])', s) + s = re.sub(r"isRealStart0 *", 'before0([""])', s) + s = re.sub(r"isEnd *", 'after(["", ","])', s) + s = re.sub(r"isRealEnd *", 'after([""])', s) + s = re.sub(r"isEnd0 *", 'after0(["", ","])', s) + s = re.sub(r"isRealEnd0 *", 'after0([""])', s) + s = re.sub(r"(select|exclude)[(][\\](\d+)", '\\1(lToken[\\2]', s) + s = re.sub(r"define[(][\\](\d+)", 'define(lToken[\\1]', s) + s = re.sub(r"(morph|morphex|displayInfo)[(][\\](\d+)", '\\1(lToken[\\2])', s) + s = re.sub(r"token$\s*(\d)", 'nextToken(\\1', s) # token(n) + s = re.sub(r"token\(\s*-(\d)", 'prevToken(\\1', s) # token(-n) + s = re.sub(r"before\(\s*", 'look(s[:m.start()], ', s) # before(s) + s = re.sub(r"after\(\s*", 'look(s[m.end():], ', s) # after(s) + s = re.sub(r"textarea\(\s*", 'look(s, ', s) # textarea(s) + s = re.sub(r"before_chk1\(\s*", 'look_chk1(dDA, s[:m.start()], 0, ', s) # before_chk1(s) + s = re.sub(r"after_chk1\(\s*", 'look_chk1(dDA, s[m.end():], m.end(), ', s) # after_chk1(s) + s = re.sub(r"textarea_chk1\(\s*", 'look_chk1(dDA, s, 0, ', s) # textarea_chk1(s) + s = re.sub(r"isEndOfNG\(\s*$", 'isEndOfNG(dDA, s[m.end():], m.end())', s) # isEndOfNG(s) + s = re.sub(r"isNextNotCOD$\s*$", 'isNextNotCOD(dDA, s[m.end():], m.end())', s) # isNextNotCOD(s) + s = re.sub(r"isNextVerb$\s*$", 'isNextVerb(dDA, s[m.end():], m.end())', s) # isNextVerb(s) + s = re.sub(r"\bspell *[(]", '_oSpellChecker.isValid(', s) + s = re.sub(r"[\\](\d+)", 'lToken[\\1]', s) + return s + + +def changeReferenceToken (s, dPos): + for i in range(len(dPos), 0, -1): + s = s.replace("\\"+str(i), "\\"+dPos[i]) + return s + + +def createRule (iLine, sRuleName, sTokenLine, sActions, nPriority): + # print(iLine, "//", sRuleName, "//", sTokenLine, "//", sActions, "//", nPriority) + lToken = sTokenLine.split() + + # Calculate positions + dPos = {} + nGroup = 0 + for i, sToken in enumerate(lToken): + if sToken.startswith("(") and sToken.endswith(")"): + lToken[i] = sToken[1:-1] + nGroup += 1 + dPos[nGroup] = i + + # Parse actions + for nAction, sAction in enumerate(sActions.split(" <<- ")): + if sAction.strip(): + sActionId = sRuleName + "_a" + str(nAction) + aAction = createAction(sActionId, sAction, nGroup, nPriority, dPos) + if aAction: + dACTIONS[sActionId] = aAction + lResult = list(lToken) + lResult.extend(["##"+str(iLine), sActionId]) + yield lResult + + +def createAction (sIdAction, sAction, nGroup, nPriority, dPos): + m = re.search("([-~=])(\\d+|)(:\\d+|)>> ", sAction) + if not m: + print(" # Error. No action found at: ", sIdAction) + print(" ==", sAction, "==") + return None + # Condition + sCondition = sAction[:m.start()].strip() + if sCondition: + sCondition = prepareFunction(sCondition) + sCondition = changeReferenceToken(sCondition, dPos) + lFUNCTIONS.append(("g_c_"+sIdAction, sCondition)) + sCondition = "g_c_"+sIdAction + else: + sCondition = "" + # Action + cAction = m.group(1) + sAction = sAction[m.end():].strip() + sAction = changeReferenceToken(sAction, dPos) + iStartAction = int(m.group(2)) if m.group(2) else 0 + iEndAction = int(m.group(3)[1:]) if m.group(3) else iStartAction + if nGroup: + iStartAction = dPos[iStartAction] + iEndAction = dPos[iEndAction] + + if cAction == "-": + ## error + iMsg = sAction.find(" # ") + if iMsg == -1: + sMsg = "# Error. Error message not found." + sURL = "" + print(sMsg + " Action id: " + sIdAction) + else: + sMsg = sAction[iMsg+3:].strip() + sAction = sAction[:iMsg].strip() + sURL = "" + mURL = re.search("[|] *(https?://.*)", sMsg) + if mURL: + sURL = mURL.group(1).strip() + sMsg = sMsg[:mURL.start(0)].strip() + if sMsg[0:1] == "=": + sMsg = prepareFunction(sMsg[1:]) + lFUNCTIONS.append(("g_m_"+sIdAction, sMsg)) + for x in re.finditer("group[(](\d+)[)]", sMsg): + if int(x.group(1)) > nGroup: + print("# Error in groups in message at line " + sIdAction + " ("+str(nGroup)+" groups only)") + sMsg = "=g_m_"+sIdAction + else: + for x in re.finditer(r"\$\d+)", sMsg): + if int(x.group(1)) > nGroup: + print("# Error in groups in message at line " + sIdAction + " ("+str(nGroup)+" groups only)") + if re.search("[.]\\w+[(]", sMsg): + print("# Error in message at line " + sIdAction + ": This message looks like code. Line should begin with =") + + if sAction[0:1] == "=" or cAction == "=": + if "define" in sAction and not re.search(r"define\(\\\d+ *, *\[.*\] *$", sAction): + print("# Error in action at line " + sIdAction + ": second argument for define must be a list of strings") + sAction = prepareFunction(sAction) + for x in re.finditer("group[(](\d+)[)]", sAction): + if int(x.group(1)) > nGroup: + print("# Error in groups in replacement at line " + sIdAction + " ("+str(nGroup)+" groups only)") + else: + for x in re.finditer(r"\\(\d+)", sAction): + if int(x.group(1)) > nGroup: + print("# Error in groups in replacement at line " + sIdAction + " ("+str(nGroup)+" groups only)") + if re.search("[.]\\w+[(]|sugg\\w+[(]", sAction): + print("# Error in action at line " + sIdAction + ": This action looks like code. Line should begin with =") + + if cAction == "-": + ## error detected --> suggestion + if not sAction: + print("# Error in action at line " + sIdAction + ": This action is empty.") + if sAction[0:1] == "=": + lFUNCTIONS.append(("g_s_"+sIdAction, sAction[1:])) + sAction = "=g_s_"+sIdAction + elif sAction.startswith('"') and sAction.endswith('"'): + sAction = sAction[1:-1] + if not sMsg: + print("# Error in action at line " + sIdAction + ": The message is empty.") + return [sCondition, cAction, sAction, iStartAction, iEndAction, nPriority, sMsg, sURL] + elif cAction == "~": + ## text processor + if not sAction: + print("# Error in action at line " + sIdAction + ": This action is empty.") + if sAction[0:1] == "=": + lFUNCTIONS.append(("g_p_"+sIdAction, sAction[1:])) + sAction = "=g_p_"+sIdAction + elif sAction.startswith('"') and sAction.endswith('"'): + sAction = sAction[1:-1] + return [sCondition, cAction, sAction, iStartAction, iEndAction] + elif cAction == "=": + ## disambiguator + if sAction[0:1] == "=": + sAction = sAction[1:] + if not sAction: + print("# Error in action at line " + sIdAction + ": This action is empty.") + lFUNCTIONS.append(("g_d_"+sIdAction, sAction)) + sAction = "g_d_"+sIdAction + return [sCondition, cAction, sAction] + elif cAction == ">": + ## no action, break loop if condition is False + return [sCondition, cAction, ""] + else: + print("# Unknown action at line " + sIdAction) + return None + + +def make (spLang, sLang, bJavaScript): + "compile rules, returns a dictionary of values" + # for clarity purpose, don’t create any file here + + print("> read graph rules file...") + try: + lRules = open(spLang + "/rules_graph.grx", 'r', encoding="utf-8").readlines() + except: + print("Error. Rules file in project [" + sLang + "] not found.") + exit() + + # removing comments, zeroing empty lines, creating definitions, storing tests, merging rule lines + print(" parsing rules...") + global dDEF + lLine = [] + lRuleLine = [] + lTest = [] + lOpt = [] + lTokenLine = [] + sActions = "" + nPriority = 4 + + for i, sLine in enumerate(lRules, 1): + sLine = sLine.rstrip() + if "\t" in sLine: + print("Error. Tabulation at line: ", i) + break + if sLine.startswith('#END'): + printBookmark(0, "BREAK BY #END", i) + break + elif sLine.startswith("#"): + pass + elif sLine.startswith("DEF:"): + m = re.match("DEF: +([a-zA-Z_][a-zA-Z_0-9]*) +(.+)$", sLine.strip()) + if m: + dDEF["{"+m.group(1)+"}"] = m.group(2) + else: + print("Error in definition: ", end="") + print(sLine.strip()) + elif sLine.startswith("TEST:"): + lTest.append("{:<8}".format(i) + " " + sLine[5:].strip()) + elif sLine.startswith("TODO:"): + pass + elif sLine.startswith("!!"): + m = re.search("^!!+", sLine) + nExMk = len(m.group(0)) + if sLine[nExMk:].strip(): + printBookmark(nExMk-2, sLine[nExMk:].strip(), i) + elif sLine.startswith("__") and sLine.endswith("__"): + # new rule group + m = re.match("__(\\w+)(!\\d|)__", sLine) + if m: + sRuleName = m.group(1) + nPriority = int(m.group(2)[1:]) if m.group(2) else 4 + else: + print("Error at rule group: ", sLine, " -- line:", i) + break + elif re.match("[ ]*$", sLine): + # empty line to end merging + for i, sTokenLine in lTokenLine: + lRuleLine.append((i, sRuleName, sTokenLine, sActions, nPriority)) + lTokenLine = [] + sActions = "" + sRuleName = "" + nPriority = 4 + elif sLine.startswith((" ")): + # actions + sActions += " " + sLine.strip() + else: + lTokenLine.append([i, sLine.strip()]) + + # tests + print(" list tests...") + sGCTests = "\n".join(lTest) + sGCTestsJS = '{ "aData2": ' + json.dumps(lTest, ensure_ascii=False) + " }\n" + + # processing rules + print(" preparing rules...") + lPreparedRule = [] + for i, sRuleGroup, sTokenLine, sActions, nPriority in lRuleLine: + for lRule in createRule(i, sRuleGroup, sTokenLine, sActions, nPriority): + lPreparedRule.append(lRule) + + # Graph creation + for e in lPreparedRule: + print(e) + + oDARG = darg.DARG(lPreparedRule, sLang) + oRuleGraph = oDARG.createGraph() + + # Result + d = { + "graph_callables": None, + "graph_gctests": None, + "rules_graph": oRuleGraph, + "rules_actions": dACTIONS + } + + return d + + ADDED darg.py Index: darg.py ================================================================== --- /dev/null +++ darg.py @@ -0,0 +1,175 @@ +#!python3 + +# RULE GRAPH BUILDER +# +# by Olivier R. +# License: MPL 2 + + +import json +import time +import traceback + +from graphspell.progressbar import ProgressBar + + + +class DARG: + """DIRECT ACYCLIC RULE GRAPH""" + # This code is inspired from Steve Hanov’s DAWG, 2011. (http://stevehanov.ca/blog/index.php?id=115) + + def __init__ (self, lRule, sLangCode): + print("===== Direct Acyclic Rule Graph - Minimal Acyclic Finite State Automaton =====") + + # Preparing DARG + print(" > Preparing list of tokens") + self.sLangCode = sLangCode + self.nRule = len(lRule) + self.aPreviousRule = [] + Node.resetNextId() + self.oRoot = Node() + self.lUncheckedNodes = [] # list of nodes that have not been checked for duplication. + self.lMinimizedNodes = {} # list of unique nodes that have been checked for duplication. + self.nNode = 0 + self.nArc = 0 + + # build + lRule.sort() + oProgBar = ProgressBar(0, len(lRule)) + for aRule in lRule: + self.insert(aRule) + oProgBar.increment(1) + oProgBar.done() + self.finish() + self.countNodes() + self.countArcs() + self.displayInfo() + + # BUILD DARG + def insert (self, aRule): + if aRule < self.aPreviousRule: + sys.exit("# Error: tokens must be inserted in order.") + + # find common prefix between word and previous word + nCommonPrefix = 0 + for i in range(min(len(aRule), len(self.aPreviousRule))): + if aRule[i] != self.aPreviousRule[i]: + break + nCommonPrefix += 1 + + # Check the lUncheckedNodes for redundant nodes, proceeding from last + # one down to the common prefix size. Then truncate the list at that point. + self._minimize(nCommonPrefix) + + # add the suffix, starting from the correct node mid-way through the graph + if len(self.lUncheckedNodes) == 0: + oNode = self.oRoot + else: + oNode = self.lUncheckedNodes[-1][2] + + iToken = nCommonPrefix + for sToken in aRule[nCommonPrefix:]: + oNextNode = Node() + oNode.dArcs[sToken] = oNextNode + self.lUncheckedNodes.append((oNode, sToken, oNextNode)) + if iToken == (len(aRule) - 2): + oNode.bFinal = True + iToken += 1 + oNode = oNextNode + oNode.bFinal = True + self.aPreviousRule = aRule + + def finish (self): + "minimize unchecked nodes" + self._minimize(0) + + def _minimize (self, downTo): + # proceed from the leaf up to a certain point + for i in range( len(self.lUncheckedNodes)-1, downTo-1, -1 ): + oNode, sToken, oChildNode = self.lUncheckedNodes[i] + if oChildNode in self.lMinimizedNodes: + # replace the child with the previously encountered one + oNode.dArcs[sToken] = self.lMinimizedNodes[oChildNode] + else: + # add the state to the minimized nodes. + self.lMinimizedNodes[oChildNode] = oChildNode + self.lUncheckedNodes.pop() + + def countNodes (self): + self.nNode = len(self.lMinimizedNodes) + + def countArcs (self): + self.nArc = 0 + for oNode in self.lMinimizedNodes: + self.nArc += len(oNode.dArcs) + + def displayInfo (self): + print(" * {:<12} {:>16,}".format("Rules:", self.nRule)) + print(" * {:<12} {:>16,}".format("Nodes:", self.nNode)) + print(" * {:<12} {:>16,}".format("Arcs:", self.nArc)) + + def createGraph (self): + dGraph = { 0: self.oRoot.getNodeAsDict() } + print(0, "\t", self.oRoot.getNodeAsDict()) + for oNode in self.lMinimizedNodes: + sHashId = oNode.__hash__() + if sHashId not in dGraph: + dGraph[sHashId] = oNode.getNodeAsDict() + print(sHashId, "\t", dGraph[sHashId]) + else: + print("Error. Double node… same id: ", sHashId) + print(str(oNode.getNodeAsDict())) + return dGraph + + + +class Node: + NextId = 0 + + def __init__ (self): + self.i = Node.NextId + Node.NextId += 1 + self.bFinal = False + self.dArcs = {} # key: arc value; value: a node + + @classmethod + def resetNextId (cls): + cls.NextId = 0 + + def __str__ (self): + # Caution! this function is used for hashing and comparison! + cFinal = "1" if self.bFinal else "0" + l = [cFinal] + for (key, oNode) in self.dArcs.items(): + l.append(str(key)) + l.append(str(oNode.i)) + return "_".join(l) + + def __hash__ (self): + # Used as a key in a python dictionary. + return self.__str__().__hash__() + + def __eq__ (self, other): + # Used as a key in a python dictionary. + # Nodes are equivalent if they have identical arcs, and each identical arc leads to identical states. + return self.__str__() == other.__str__() + + def getNodeAsDict (self): + "returns the node as a dictionary structure" + dNode = {} + dRegex = {} + dRules = {} + for arc, oNode in self.dArcs.items(): + if type(arc) == str and arc.startswith("~"): + dRegex[arc[1:]] = oNode.__hash__() + elif arc.startswith("##"): + dRules[arc[1:]] = oNode.__hash__() + else: + dNode[arc] = oNode.__hash__() + if dRegex: + dNode[""] = dRegex + if dRules: + dNode[""] = dRules + #if self.bFinal: + # dNode[""] = 1 + return dNode ADDED gc_core/py/lang_core/gc_rules_graph.py Index: gc_core/py/lang_core/gc_rules_graph.py ================================================================== --- /dev/null +++ gc_core/py/lang_core/gc_rules_graph.py @@ -0,0 +1,5 @@ +# generated code, do not edit + +dGraph = ${rules_graph} + +dRule = ${rules_actions} ADDED gc_core/py/lang_core/gc_sentence.py Index: gc_core/py/lang_core/gc_sentence.py ================================================================== --- /dev/null +++ gc_core/py/lang_core/gc_sentence.py @@ -0,0 +1,141 @@ +# Sentence checker + +from ..graphspell.tokenizer import Tokenizer +from .gc_graph import dGraph + + +oTokenizer = Tokenizer("${lang}") + + +class Sentence: + + def __init__ (self, sSentence, sSentence0, nOffset): + self.sSentence = sSentence + self.sSentence0 = sSentence0 + self.nOffset = nOffset + self.lToken = list(oTokenizer.genTokens()) + + def parse (self): + dErr = {} + lPointer = [] + for dToken in self.lToken: + for i, dPointer in enumerate(lPointer): + bValid = False + for dNode in self._getNextMatchingNodes(dToken, dPointer["dNode"]): + dPointer["nOffset"] = dToken["i"] + dPointer["dNode"] = dNode + bValid = True + if not bValid: + del lPointer[i] + for dNode in self._getNextMatchingNodes(dToken, dGraph): + lPointer.append({"nOffset": 0, "dNode": dNode}) + for dPointer in lPointer: + if "" in dPointer["dNode"]: + for dNode in dGraph[dPointer["dNode"][""]]: + dErr = self._executeActions(dNode) + return dErr + + def _getNextMatchingNodes (self, dToken, dNode): + if dToken["sValue"] in dNode: + yield dGraph[dNode[dToken["sValue"]]] + for sLemma in dToken["sLemma"]: + if sLemma in dNode: + yield dGraph[dNode[dToken["sValue"]]] + if "~" in dNode: + for sRegex in dNode["~"]: + for sMorph in dToken["lMorph"]: + if re.search(sRegex, sMorph): + yield dGraph[dNode["~"][sRegex]] + + def _executeActions (self, dNode): + for sLineId, nextNodeKey in dNode.items(): + for sArc in dGraph[nextNodeKey]: + bCondMemo = None + sFuncCond, cActionType, sWhat, *eAct = dRule[sArc] + # action in lActions: [ condition, action type, replacement/suggestion/action[, iGroupStart, iGroupEnd[, message, URL]] ] + try: + bCondMemo = not sFuncCond or globals()[sFuncCond](self, dDA, sCountry, bCondMemo) + if bCondMemo: + if cActionType == "-": + # grammar error + nErrorStart = nSentenceOffset + m.start(eAct[0]) + nErrorEnd = nSentenceOffset + m.start(eAct[1]) + if nErrorStart not in dErrs or nPriority > dPriority[nErrorStart]: + dErrs[nErrorStart] = _createError(self, sWhat, nErrorStart, nErrorEnd, sLineId, bUppercase, eAct[2], eAct[3], bIdRule, sOption, bContext) + dPriority[nErrorStart] = nPriority + elif cActionType == "~": + # text processor + self.lToken = _rewrite(self, sWhat, nErrorStart, nErrorEnd, bUppercase) + bChange = True + elif cActionType == "@": + # text processor + self.lToken = _rewrite(self, sWhat, nErrorStart, nErrorEnd, bUppercase) + bChange = True + elif cActionType == "=": + # disambiguation + globals()[sWhat](self, dDA) + elif cActionType == ">": + # we do nothing, this test is just a condition to apply all following actions + pass + else: + echo("# error: unknown action at " + sLineId) + elif cActionType == ">": + break + except Exception as e: + raise Exception(str(e), "# " + sLineId + " # " + sRuleId) + + def _createWriterError (self): + d = {} + return d + + def _createDictError (self): + d = {} + return d + + +#### Common functions + +def option (): + pass + + +#### Analyse tokens + +def morph (): + pass + +def morphex (): + pass + +def analyse (): + pass + +def analysex (): + pass + + +#### Go outside scope + +def nextToken (): + pass + +def prevToken (): + pass + +def look (): + pass + +def lookAndCheck (): + pass + + +#### Disambiguator + +def select (): + pass + +def exclude (): + pass + +def define (): + pass ADDED gc_lang/fr/rules_graph.grx Index: gc_lang/fr/rules_graph.grx ================================================================== --- /dev/null +++ gc_lang/fr/rules_graph.grx @@ -0,0 +1,60 @@ +# +# RÈGLES DE GRAMMAIRE FRANÇAISE POUR GRAMMALECTE +# par Olivier R. +# +# Copyright © 2011-2017. +# +# This file is part of Grammalecte. +# +# Grammalecte is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Grammalecte is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Grammalecte. If not, see +# + +# RÈGLES POUR LE GRAPHE DE TOKENS + +# DOCUMENTATION +# Expressions régulières en Python : http://docs.python.org/library/re.html + +# [++] : séparateur des règles pour le paragraphe et des règles pour la phrase. + +# Types d’action: +# ->> erreur +# ~>> préprocesseur de texte +# =>> désambiguïsateur + + +# Fin d’interprétation du fichier avec une ligne commençant par #END + +# ERREURS COURANTES +# http://fr.wikipedia.org/wiki/Wikip%C3%A9dia:Fautes_d%27orthographe/Courantes + + +__rule1__ + les ~:N:.:s + des ~:N:.:s + ces ~:N:.:s + <<- -1>> acquit # Message0|http://test.grammalecte.net + +__rule2__ + ci important que soi + ci vraiment il y a + ci pour ça + <<- morph(\2, ":[WAR]", False) -1>> si # Message1|http://test.grammalecte.net + +__rule3__ + contre nature + contre pétrie + contre action + <<- morph(\1, "xxxx") -1:2>> =$area.replace(" ", "") # Message2|http://test.grammalecte.org + <<- ~>> =$area.replace(" ", "") + Index: graphspell-js/tokenizer.js ================================================================== --- graphspell-js/tokenizer.js +++ graphspell-js/tokenizer.js @@ -16,11 +16,11 @@ "default": [ [/^[  \t]+/, 'SPACE'], [/^\/(?:~|bin|boot|dev|etc|home|lib|mnt|opt|root|sbin|tmp|usr|var|Bureau|Documents|Images|Musique|Public|Téléchargements|Vidéos)(?:\/[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯﬁ-ﬆ_.()-]+)*/, 'FOLDERUNIX'], [/^[a-zA-Z]:\$?:Program Files(?: \(x86$|)|[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯﬁ-ﬆ.()]+)(?:\\[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯﬁ-ﬆ_.()-]+)*/, 'FOLDERWIN'], - [/^[,.;:!?…«»“”‘’"(){}\[\]/·–—]+/, 'SEPARATOR'], + [/^[,.;:!?…«»“”‘’"(){}\[\]/·–—]/, 'SEPARATOR'], [/^[A-Z][.][A-Z][.](?:[A-Z][.])*/, 'ACRONYM'], [/^(?:https?:\/\/|www[.]|[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯﬁ-ﬆ_-]+[@.][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯﬁ-ﬆ_-]{2,}[@.])[a-zA-Z0-9][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯﬁ-ﬆ_.\/?&!%=+*"'@$#-]+/, 'LINK'], [/^[#@][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯﬁ-ﬆ_-]+/, 'TAG'], [/^<[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯﬁ-ﬆ]+.*?>|<\/[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯﬁ-ﬆ]+ *>/, 'HTML'], [/^\[\/?[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯﬁ-ﬆ]+\]/, 'PSEUDOHTML'], @@ -32,11 +32,11 @@ "fr": [ [/^[  \t]+/, 'SPACE'], [/^\/(?:~|bin|boot|dev|etc|home|lib|mnt|opt|root|sbin|tmp|usr|var|Bureau|Documents|Images|Musique|Public|Téléchargements|Vidéos)(?:\/[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯﬁ-ﬆ_.()-]+)*/, 'FOLDERUNIX'], [/^[a-zA-Z]:\$?:Program Files(?: \(x86$|)|[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯﬁ-ﬆ.()]+)(?:\\[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯﬁ-ﬆ_.()-]+)*/, 'FOLDERWIN'], - [/^[,.;:!?…«»“”‘’"(){}\[\]/·–—]+/, 'SEPARATOR'], + [/^[,.;:!?…«»“”‘’"(){}\[\]/·–—]/, 'SEPARATOR'], [/^[A-Z][.][A-Z][.](?:[A-Z][.])*/, 'ACRONYM'], [/^(?:https?:\/\/|www[.]|[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯﬁ-ﬆ_-]+[@.][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯﬁ-ﬆ_-]{2,}[@.])[a-zA-Z0-9][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯﬁ-ﬆ_.\/?&!%=+*"'@$#-]+/, 'LINK'], [/^[#@][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯﬁ-ﬆ_-]+/, 'TAG'], [/^<[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯﬁ-ﬆ]+.*?>|<\/[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯﬁ-ﬆ]+ *>/, 'HTML'], [/^\[\/?[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯﬁ-ﬆ]+\]/, 'PSEUDOHTML'], @@ -60,36 +60,32 @@ this.aRules = aTkzPatterns[this.sLang]; } * genTokens (sText) { let m; - let i = 0; + let iNext = 0; while (sText) { - let nCut = 1; + let iCut = 1; + let iToken = 0; for (let [zRegex, sType] of this.aRules) { try { if ((m = zRegex.exec(sText)) !== null) { - if (sType == 'SEPARATOR') { - for (let c of m[0]) { - yield { "sType": sType, "sValue": c, "nStart": i, "nEnd": i + m[0].length } - } - } else { - yield { "sType": sType, "sValue": m[0], "nStart": i, "nEnd": i + m[0].length } - } - nCut = m[0].length; + iToken += 1; + yield { "i": iToken, "sType": sType, "sValue": m[0], "nStart": iNext, "nEnd": iNext + m[0].length } + iCut = m[0].length; break; } } catch (e) { helpers.logerror(e); } } - i += nCut; - sText = sText.slice(nCut); + iNext += iCut; + sText = sText.slice(iCut); } } } if (typeof(exports) !== 'undefined') { exports.Tokenizer = Tokenizer; } Index: graphspell/tokenizer.py ================================================================== --- graphspell/tokenizer.py +++ graphspell/tokenizer.py @@ -5,11 +5,11 @@ _PATTERNS = { "default": ( r'(?P/(?:bin|boot|dev|etc|home|lib|mnt|opt|root|sbin|tmp|usr|var|Bureau|Documents|Images|Musique|Public|Téléchargements|Vidéos)(?:/[\w.()-]+)*)', r'(?P[a-zA-Z]:\\(?:Program Files(?: [(]x86[)]|)|[\w.()]+)(?:\\[\w.()-]+)*)', - r'(?P[.,?!:;…«»“”"()/·]+)', + r'(?P[][,.;:!?…«»“”‘’"(){}/·–—])', r'(?P[A-Z][.][A-Z][.](?:[A-Z][.])*)', r'(?P(?:https?://|www[.]|\w+[@.]\w\w+[@.])\w[\w./?&!%=+*"\'@$#-]+)', r'(?P[#@][\w-]+)', r'(?P<\w+.*?>|)', r'(?P\[/?\w+\])', @@ -19,11 +19,11 @@ ), "fr": ( r'(?P/(?:bin|boot|dev|etc|home|lib|mnt|opt|root|sbin|tmp|usr|var|Bureau|Documents|Images|Musique|Public|Téléchargements|Vidéos)(?:/[\w.()-]+)*)', r'(?P[a-zA-Z]:\\(?:Program Files(?: [(]x86[)]|)|[\w.()]+)(?:\\[\w.()-]+)*)', - r'(?P[.,?!:;…«»“”"()/·]+)', + r'(?P[][,.;:!?…«»“”‘’"(){}/·–—])', r'(?P[A-Z][.][A-Z][.](?:[A-Z][.])*)', r'(?P(?:https?://|www[.]|\w+[@.]\w\w+[@.])\w[\w./?&!%=+*"\'@$#-]+)', r'(?P[#@][\w-]+)', r'(?P<\w+.*?>|)', r'(?P\[/?\w+\])', @@ -43,7 +43,7 @@ if sLang not in _PATTERNS: self.sLang = "default" self.zToken = re.compile( "(?i)" + '|'.join(sRegex for sRegex in _PATTERNS[sLang]) ) def genTokens (self, sText): - for m in self.zToken.finditer(sText): - yield { "sType": m.lastgroup, "sValue": m.group(), "nStart": m.start(), "nEnd": m.end() } + for i, m in enumerate(self.zToken.finditer(sText), 1): + yield { "i": i, "sType": m.lastgroup, "sValue": m.group(), "nStart": m.start(), "nEnd": m.end() } Index: make.py ================================================================== --- make.py +++ make.py @@ -17,10 +17,11 @@ from distutils import dir_util, file_util import dialog_bundled import compile_rules +import compile_rules_graph import helpers import lex_build sWarningMessage = "The content of this folder is generated by code and replaced at each build.\n" @@ -191,12 +192,15 @@ dVars = xConfig._sections['args'] dVars['locales'] = dVars["locales"].replace("_", "-") dVars['loc'] = str(dict([ [s, [s[0:2], s[3:5], ""]] for s in dVars["locales"].split(" ") ])) ## COMPILE RULES - dResult = compile_rules.make(spLang, dVars['lang'], bJavaScript) - dVars.update(dResult) + dResultRegex = compile_rules.make(spLang, dVars['lang'], bJavaScript) + dVars.update(dResultRegex) + + dResultGraph = compile_rules_graph.make(spLang, dVars['lang'], bJavaScript) + dVars.update(dResultGraph) ## READ GRAMMAR CHECKER PLUGINS print("PYTHON:") print("+ Plugins: ", end="") sCodePlugins = ""