ADDED compile_rules_graph.py Index: compile_rules_graph.py ================================================================== --- /dev/null +++ compile_rules_graph.py @@ -0,0 +1,282 @@ +# Create a Direct Acyclic Rule Graph (DARG) + +import re +import traceback +import json +import darg + + +dDEF = {} +dACTIONS = {} +lFUNCTIONS = [] + + +def prepareFunction (s): + s = s.replace("__also__", "bCondMemo") + s = s.replace("__else__", "not bCondMemo") + s = re.sub(r"isStart *", 'before(["", ","])', s) + s = re.sub(r"isRealStart *", 'before([""])', s) + s = re.sub(r"isStart0 *", 'before0(["", ","])', s) + s = re.sub(r"isRealStart0 *", 'before0([""])', s) + s = re.sub(r"isEnd *", 'after(["", ","])', s) + s = re.sub(r"isRealEnd *", 'after([""])', s) + s = re.sub(r"isEnd0 *", 'after0(["", ","])', s) + s = re.sub(r"isRealEnd0 *", 'after0([""])', s) + s = re.sub(r"(select|exclude)[(][\\](\d+)", '\\1(lToken[\\2]', s) + s = re.sub(r"define[(][\\](\d+)", 'define(lToken[\\1]', s) + s = re.sub(r"(morph|morphex|displayInfo)[(][\\](\d+)", '\\1(lToken[\\2])', s) + s = re.sub(r"token$\s*(\d)", 'nextToken(\\1', s) # token(n) + s = re.sub(r"token\(\s*-(\d)", 'prevToken(\\1', s) # token(-n) + s = re.sub(r"before\(\s*", 'look(s[:m.start()], ', s) # before(s) + s = re.sub(r"after\(\s*", 'look(s[m.end():], ', s) # after(s) + s = re.sub(r"textarea\(\s*", 'look(s, ', s) # textarea(s) + s = re.sub(r"before_chk1\(\s*", 'look_chk1(dDA, s[:m.start()], 0, ', s) # before_chk1(s) + s = re.sub(r"after_chk1\(\s*", 'look_chk1(dDA, s[m.end():], m.end(), ', s) # after_chk1(s) + s = re.sub(r"textarea_chk1\(\s*", 'look_chk1(dDA, s, 0, ', s) # textarea_chk1(s) + s = re.sub(r"isEndOfNG\(\s*$", 'isEndOfNG(dDA, s[m.end():], m.end())', s) # isEndOfNG(s) + s = re.sub(r"isNextNotCOD$\s*$", 'isNextNotCOD(dDA, s[m.end():], m.end())', s) # isNextNotCOD(s) + s = re.sub(r"isNextVerb$\s*$", 'isNextVerb(dDA, s[m.end():], m.end())', s) # isNextVerb(s) + s = re.sub(r"\bspell *[(]", '_oSpellChecker.isValid(', s) + s = re.sub(r"[\\](\d+)", 'lToken[\\1]', s) + return s + + +def changeReferenceToken (s, dPos): + for i in range(len(dPos), 0, -1): + s = s.replace("\\"+str(i), "\\"+dPos[i]) + return s + + +def createRule (iLine, sRuleName, sTokenLine, sActions, nPriority): + # print(iLine, "//", sRuleName, "//", sTokenLine, "//", sActions, "//", nPriority) + lToken = sTokenLine.split() + + # Calculate positions + dPos = {} + nGroup = 0 + for i, sToken in enumerate(lToken): + if sToken.startswith("(") and sToken.endswith(")"): + lToken[i] = sToken[1:-1] + nGroup += 1 + dPos[nGroup] = i + + # Parse actions + for nAction, sAction in enumerate(sActions.split(" <<- ")): + if sAction.strip(): + sActionId = sRuleName + "_a" + str(nAction) + aAction = createAction(sActionId, sAction, nGroup, nPriority, dPos) + if aAction: + dACTIONS[sActionId] = aAction + lResult = list(lToken) + lResult.extend(["##"+str(iLine), sActionId]) + yield lResult + + +def createAction (sIdAction, sAction, nGroup, nPriority, dPos): + m = re.search("([-~=])(\\d+|)(:\\d+|)>> ", sAction) + if not m: + print(" # Error. No action found at: ", sIdAction) + print(" ==", sAction, "==") + return None + # Condition + sCondition = sAction[:m.start()].strip() + if sCondition: + sCondition = prepareFunction(sCondition) + sCondition = changeReferenceToken(sCondition, dPos) + lFUNCTIONS.append(("g_c_"+sIdAction, sCondition)) + sCondition = "g_c_"+sIdAction + else: + sCondition = "" + # Action + cAction = m.group(1) + sAction = sAction[m.end():].strip() + sAction = changeReferenceToken(sAction, dPos) + iStartAction = int(m.group(2)) if m.group(2) else 0 + iEndAction = int(m.group(3)[1:]) if m.group(3) else iStartAction + if nGroup: + iStartAction = dPos[iStartAction] + iEndAction = dPos[iEndAction] + + if cAction == "-": + ## error + iMsg = sAction.find(" # ") + if iMsg == -1: + sMsg = "# Error. Error message not found." + sURL = "" + print(sMsg + " Action id: " + sIdAction) + else: + sMsg = sAction[iMsg+3:].strip() + sAction = sAction[:iMsg].strip() + sURL = "" + mURL = re.search("[|] *(https?://.*)", sMsg) + if mURL: + sURL = mURL.group(1).strip() + sMsg = sMsg[:mURL.start(0)].strip() + if sMsg[0:1] == "=": + sMsg = prepareFunction(sMsg[1:]) + lFUNCTIONS.append(("g_m_"+sIdAction, sMsg)) + for x in re.finditer("group[(](\d+)[)]", sMsg): + if int(x.group(1)) > nGroup: + print("# Error in groups in message at line " + sIdAction + " ("+str(nGroup)+" groups only)") + sMsg = "=g_m_"+sIdAction + else: + for x in re.finditer(r"\$\d+)", sMsg): + if int(x.group(1)) > nGroup: + print("# Error in groups in message at line " + sIdAction + " ("+str(nGroup)+" groups only)") + if re.search("[.]\\w+[(]", sMsg): + print("# Error in message at line " + sIdAction + ": This message looks like code. Line should begin with =") + + if sAction[0:1] == "=" or cAction == "=": + if "define" in sAction and not re.search(r"define\(\\\d+ *, *\[.*\] *$", sAction): + print("# Error in action at line " + sIdAction + ": second argument for define must be a list of strings") + sAction = prepareFunction(sAction) + for x in re.finditer("group[(](\d+)[)]", sAction): + if int(x.group(1)) > nGroup: + print("# Error in groups in replacement at line " + sIdAction + " ("+str(nGroup)+" groups only)") + else: + for x in re.finditer(r"\\(\d+)", sAction): + if int(x.group(1)) > nGroup: + print("# Error in groups in replacement at line " + sIdAction + " ("+str(nGroup)+" groups only)") + if re.search("[.]\\w+[(]|sugg\\w+[(]", sAction): + print("# Error in action at line " + sIdAction + ": This action looks like code. Line should begin with =") + + if cAction == "-": + ## error detected --> suggestion + if not sAction: + print("# Error in action at line " + sIdAction + ": This action is empty.") + if sAction[0:1] == "=": + lFUNCTIONS.append(("g_s_"+sIdAction, sAction[1:])) + sAction = "=g_s_"+sIdAction + elif sAction.startswith('"') and sAction.endswith('"'): + sAction = sAction[1:-1] + if not sMsg: + print("# Error in action at line " + sIdAction + ": The message is empty.") + return [sCondition, cAction, sAction, iStartAction, iEndAction, nPriority, sMsg, sURL] + elif cAction == "~": + ## text processor + if not sAction: + print("# Error in action at line " + sIdAction + ": This action is empty.") + if sAction[0:1] == "=": + lFUNCTIONS.append(("g_p_"+sIdAction, sAction[1:])) + sAction = "=g_p_"+sIdAction + elif sAction.startswith('"') and sAction.endswith('"'): + sAction = sAction[1:-1] + return [sCondition, cAction, sAction, iStartAction, iEndAction] + elif cAction == "=": + ## disambiguator + if sAction[0:1] == "=": + sAction = sAction[1:] + if not sAction: + print("# Error in action at line " + sIdAction + ": This action is empty.") + lFUNCTIONS.append(("g_d_"+sIdAction, sAction)) + sAction = "g_d_"+sIdAction + return [sCondition, cAction, sAction] + elif cAction == ">": + ## no action, break loop if condition is False + return [sCondition, cAction, ""] + else: + print("# Unknown action at line " + sIdAction) + return None + + +def make (spLang, sLang, bJavaScript): + "compile rules, returns a dictionary of values" + # for clarity purpose, don’t create any file here + + print("> read graph rules file...") + try: + lRules = open(spLang + "/rules_graph.grx", 'r', encoding="utf-8").readlines() + except: + print("Error. Rules file in project [" + sLang + "] not found.") + exit() + + # removing comments, zeroing empty lines, creating definitions, storing tests, merging rule lines + print(" parsing rules...") + global dDEF + lLine = [] + lRuleLine = [] + lTest = [] + lOpt = [] + lTokenLine = [] + sActions = "" + nPriority = 4 + + for i, sLine in enumerate(lRules, 1): + sLine = sLine.rstrip() + if "\t" in sLine: + print("Error. Tabulation at line: ", i) + break + if sLine.startswith('#END'): + printBookmark(0, "BREAK BY #END", i) + break + elif sLine.startswith("#"): + pass + elif sLine.startswith("DEF:"): + m = re.match("DEF: +([a-zA-Z_][a-zA-Z_0-9]*) +(.+)$", sLine.strip()) + if m: + dDEF["{"+m.group(1)+"}"] = m.group(2) + else: + print("Error in definition: ", end="") + print(sLine.strip()) + elif sLine.startswith("TEST:"): + lTest.append("{:<8}".format(i) + " " + sLine[5:].strip()) + elif sLine.startswith("TODO:"): + pass + elif sLine.startswith("!!"): + m = re.search("^!!+", sLine) + nExMk = len(m.group(0)) + if sLine[nExMk:].strip(): + printBookmark(nExMk-2, sLine[nExMk:].strip(), i) + elif sLine.startswith("__") and sLine.endswith("__"): + # new rule group + m = re.match("__(\\w+)(!\\d|)__", sLine) + if m: + sRuleName = m.group(1) + nPriority = int(m.group(2)[1:]) if m.group(2) else 4 + else: + print("Error at rule group: ", sLine, " -- line:", i) + break + elif re.match("[ ]*$", sLine): + # empty line to end merging + for i, sTokenLine in lTokenLine: + lRuleLine.append((i, sRuleName, sTokenLine, sActions, nPriority)) + lTokenLine = [] + sActions = "" + sRuleName = "" + nPriority = 4 + elif sLine.startswith((" ")): + # actions + sActions += " " + sLine.strip() + else: + lTokenLine.append([i, sLine.strip()]) + + # tests + print(" list tests...") + sGCTests = "\n".join(lTest) + sGCTestsJS = '{ "aData2": ' + json.dumps(lTest, ensure_ascii=False) + " }\n" + + # processing rules + print(" preparing rules...") + lPreparedRule = [] + for i, sRuleGroup, sTokenLine, sActions, nPriority in lRuleLine: + for lRule in createRule(i, sRuleGroup, sTokenLine, sActions, nPriority): + lPreparedRule.append(lRule) + + # Graph creation + for e in lPreparedRule: + print(e) + + oDARG = darg.DARG(lPreparedRule, sLang) + oRuleGraph = oDARG.createGraph() + + # Result + d = { + "graph_callables": None, + "graph_gctests": None, + "rules_graph": oRuleGraph, + "rules_actions": dACTIONS + } + + return d + + ADDED darg.py Index: darg.py ================================================================== --- /dev/null +++ darg.py @@ -0,0 +1,175 @@ +#!python3 + +# RULE GRAPH BUILDER +# +# by Olivier R. +# License: MPL 2 + + +import json +import time +import traceback + +from graphspell.progressbar import ProgressBar + + + +class DARG: + """DIRECT ACYCLIC RULE GRAPH""" + # This code is inspired from Steve Hanov’s DAWG, 2011. (http://stevehanov.ca/blog/index.php?id=115) + + def __init__ (self, lRule, sLangCode): + print("===== Direct Acyclic Rule Graph - Minimal Acyclic Finite State Automaton =====") + + # Preparing DARG + print(" > Preparing list of tokens") + self.sLangCode = sLangCode + self.nRule = len(lRule) + self.aPreviousRule = [] + Node.resetNextId() + self.oRoot = Node() + self.lUncheckedNodes = [] # list of nodes that have not been checked for duplication. + self.lMinimizedNodes = {} # list of unique nodes that have been checked for duplication. + self.nNode = 0 + self.nArc = 0 + + # build + lRule.sort() + oProgBar = ProgressBar(0, len(lRule)) + for aRule in lRule: + self.insert(aRule) + oProgBar.increment(1) + oProgBar.done() + self.finish() + self.countNodes() + self.countArcs() + self.displayInfo() + + # BUILD DARG + def insert (self, aRule): + if aRule < self.aPreviousRule: + sys.exit("# Error: tokens must be inserted in order.") + + # find common prefix between word and previous word + nCommonPrefix = 0 + for i in range(min(len(aRule), len(self.aPreviousRule))): + if aRule[i] != self.aPreviousRule[i]: + break + nCommonPrefix += 1 + + # Check the lUncheckedNodes for redundant nodes, proceeding from last + # one down to the common prefix size. Then truncate the list at that point. + self._minimize(nCommonPrefix) + + # add the suffix, starting from the correct node mid-way through the graph + if len(self.lUncheckedNodes) == 0: + oNode = self.oRoot + else: + oNode = self.lUncheckedNodes[-1][2] + + iToken = nCommonPrefix + for sToken in aRule[nCommonPrefix:]: + oNextNode = Node() + oNode.dArcs[sToken] = oNextNode + self.lUncheckedNodes.append((oNode, sToken, oNextNode)) + if iToken == (len(aRule) - 2): + oNode.bFinal = True + iToken += 1 + oNode = oNextNode + oNode.bFinal = True + self.aPreviousRule = aRule + + def finish (self): + "minimize unchecked nodes" + self._minimize(0) + + def _minimize (self, downTo): + # proceed from the leaf up to a certain point + for i in range( len(self.lUncheckedNodes)-1, downTo-1, -1 ): + oNode, sToken, oChildNode = self.lUncheckedNodes[i] + if oChildNode in self.lMinimizedNodes: + # replace the child with the previously encountered one + oNode.dArcs[sToken] = self.lMinimizedNodes[oChildNode] + else: + # add the state to the minimized nodes. + self.lMinimizedNodes[oChildNode] = oChildNode + self.lUncheckedNodes.pop() + + def countNodes (self): + self.nNode = len(self.lMinimizedNodes) + + def countArcs (self): + self.nArc = 0 + for oNode in self.lMinimizedNodes: + self.nArc += len(oNode.dArcs) + + def displayInfo (self): + print(" * {:<12} {:>16,}".format("Rules:", self.nRule)) + print(" * {:<12} {:>16,}".format("Nodes:", self.nNode)) + print(" * {:<12} {:>16,}".format("Arcs:", self.nArc)) + + def createGraph (self): + dGraph = { 0: self.oRoot.getNodeAsDict() } + print(0, "\t", self.oRoot.getNodeAsDict()) + for oNode in self.lMinimizedNodes: + sHashId = oNode.__hash__() + if sHashId not in dGraph: + dGraph[sHashId] = oNode.getNodeAsDict() + print(sHashId, "\t", dGraph[sHashId]) + else: + print("Error. Double node… same id: ", sHashId) + print(str(oNode.getNodeAsDict())) + return dGraph + + + +class Node: + NextId = 0 + + def __init__ (self): + self.i = Node.NextId + Node.NextId += 1 + self.bFinal = False + self.dArcs = {} # key: arc value; value: a node + + @classmethod + def resetNextId (cls): + cls.NextId = 0 + + def __str__ (self): + # Caution! this function is used for hashing and comparison! + cFinal = "1" if self.bFinal else "0" + l = [cFinal] + for (key, oNode) in self.dArcs.items(): + l.append(str(key)) + l.append(str(oNode.i)) + return "_".join(l) + + def __hash__ (self): + # Used as a key in a python dictionary. + return self.__str__().__hash__() + + def __eq__ (self, other): + # Used as a key in a python dictionary. + # Nodes are equivalent if they have identical arcs, and each identical arc leads to identical states. + return self.__str__() == other.__str__() + + def getNodeAsDict (self): + "returns the node as a dictionary structure" + dNode = {} + dRegex = {} + dRules = {} + for arc, oNode in self.dArcs.items(): + if type(arc) == str and arc.startswith("~"): + dRegex[arc[1:]] = oNode.__hash__() + elif arc.startswith("##"): + dRules[arc[1:]] = oNode.__hash__() + else: + dNode[arc] = oNode.__hash__() + if dRegex: + dNode[""] = dRegex + if dRules: + dNode[""] = dRules + #if self.bFinal: + # dNode[""] = 1 + return dNode Index: gc_core/py/lang_core/gc_engine.py ================================================================== --- gc_core/py/lang_core/gc_engine.py +++ gc_core/py/lang_core/gc_engine.py @@ -10,10 +10,13 @@ from ..graphspell.spellchecker import SpellChecker from ..graphspell.echo import echo from . import gc_options +from ..graphspell.tokenizer import Tokenizer +from .gc_rules_graph import dGraph + __all__ = [ "lang", "locales", "pkg", "name", "version", "author", \ "load", "parse", "getSpellChecker", \ "setOption", "setOptions", "getOptions", "getDefaultOptions", "getOptionsLabels", "resetOptions", "displayOptions", \ "ignoreRule", "resetIgnoreRules", "reactivateRule", "listRules", "displayRules" ] @@ -33,28 +36,27 @@ # data _sAppContext = "" # what software is running _dOptions = None _aIgnoredRules = set() _oSpellChecker = None -_dAnalyses = {} # cache for data from dictionary - +_oTokenizer = None #### Parsing def parse (sText, sCountry="${country_default}", bDebug=False, dOptions=None, bContext=False): "analyses the paragraph sText and returns list of errors" #sText = unicodedata.normalize("NFC", sText) aErrors = None - sAlt = sText + sRealText = sText dDA = {} # Disambiguisator. Key = position; value = list of morphologies dPriority = {} # Key = position; value = priority dOpt = _dOptions if not dOptions else dOptions # parse paragraph try: - sNew, aErrors = _proofread(sText, sAlt, 0, True, dDA, dPriority, sCountry, dOpt, bDebug, bContext) + sNew, aErrors = _proofread(sText, sRealText, 0, True, dDA, dPriority, sCountry, dOpt, bDebug, bContext) if sNew: sText = sNew except: raise @@ -71,11 +73,12 @@ # parse sentences for iStart, iEnd in _getSentenceBoundaries(sText): if 4 < (iEnd - iStart) < 2000: dDA.clear() try: - _, errs = _proofread(sText[iStart:iEnd], sAlt[iStart:iEnd], iStart, False, dDA, dPriority, sCountry, dOpt, bDebug, bContext) + # regex parser + _, errs = _proofread(sText[iStart:iEnd], sRealText[iStart:iEnd], iStart, False, dDA, dPriority, sCountry, dOpt, bDebug, bContext) aErrors.update(errs) except: raise return aErrors.values() # this is a view (iterable) @@ -289,14 +292,17 @@ def load (sContext="Python"): global _oSpellChecker global _sAppContext global _dOptions + global _oTokenizer try: _oSpellChecker = SpellChecker("${lang}", "${dic_main_filename_py}", "${dic_extended_filename_py}", "${dic_community_filename_py}", "${dic_personal_filename_py}") _sAppContext = sContext _dOptions = dict(gc_options.getOptions(sContext)) # duplication necessary, to be able to reset to default + _oTokenizer = _oSpellChecker.getTokenizer() + _oSpellChecker.activateStorage() except: traceback.print_exc() def setOption (sOpt, bVal): @@ -369,15 +375,15 @@ #### common functions # common regexes -_zEndOfSentence = re.compile('([.?!:;…][ .?!… »”")]*|.$)') -_zBeginOfParagraph = re.compile("^\W*") -_zEndOfParagraph = re.compile("\W*$") -_zNextWord = re.compile(" +(\w[\w-]*)") -_zPrevWord = re.compile("(\w[\w-]*) +$") +_zEndOfSentence = re.compile(r'([.?!:;…][ .?!… »”")]*|.$)') +_zBeginOfParagraph = re.compile(r"^\W*") +_zEndOfParagraph = re.compile(r"\W*$") +_zNextWord = re.compile(r" +(\w[\w-]*)") +_zPrevWord = re.compile(r"(\w[\w-]*) +$") def option (sOpt): "return True if option sOpt is active" return _dOptions.get(sOpt, False) @@ -386,33 +392,25 @@ def displayInfo (dDA, tWord): "for debugging: retrieve info of word" if not tWord: echo("> nothing to find") return True - if tWord[1] not in _dAnalyses and not _storeMorphFromFSA(tWord[1]): - echo("> not in FSA") + lMorph = _oSpellChecker.getMorph(tWord[1]) + if not lMorph: + echo("> not in dictionary") return True if tWord[0] in dDA: echo("DA: " + str(dDA[tWord[0]])) - echo("FSA: " + str(_dAnalyses[tWord[1]])) + echo("FSA: " + str(lMorph)) return True - -def _storeMorphFromFSA (sWord): - "retrieves morphologies list from _oSpellChecker -> _dAnalyses" - global _dAnalyses - _dAnalyses[sWord] = _oSpellChecker.getMorph(sWord) - return True if _dAnalyses[sWord] else False - def morph (dDA, tWord, sPattern, bStrict=True, bNoWord=False): "analyse a tuple (position, word), return True if sPattern in morphologies (disambiguation on)" if not tWord: return bNoWord - if tWord[1] not in _dAnalyses and not _storeMorphFromFSA(tWord[1]): - return False - lMorph = dDA[tWord[0]] if tWord[0] in dDA else _dAnalyses[tWord[1]] + lMorph = dDA[tWord[0]] if tWord[0] in dDA else _oSpellChecker.getMorph(tWord[1]) if not lMorph: return False p = re.compile(sPattern) if bStrict: return all(p.search(s) for s in lMorph) @@ -421,13 +419,13 @@ def morphex (dDA, tWord, sPattern, sNegPattern, bNoWord=False): "analyse a tuple (position, word), returns True if not sNegPattern in word morphologies and sPattern in word morphologies (disambiguation on)" if not tWord: return bNoWord - if tWord[1] not in _dAnalyses and not _storeMorphFromFSA(tWord[1]): + lMorph = dDA[tWord[0]] if tWord[0] in dDA else _oSpellChecker.getMorph(tWord[1]) + if not lMorph: return False - lMorph = dDA[tWord[0]] if tWord[0] in dDA else _dAnalyses[tWord[1]] # check negative condition np = re.compile(sNegPattern) if any(np.search(s) for s in lMorph): return False # search sPattern @@ -435,40 +433,32 @@ return any(p.search(s) for s in lMorph) def analyse (sWord, sPattern, bStrict=True): "analyse a word, return True if sPattern in morphologies (disambiguation off)" - if sWord not in _dAnalyses and not _storeMorphFromFSA(sWord): - return False - if not _dAnalyses[sWord]: + lMorph = _oSpellChecker.getMorph(sWord) + if not lMorph: return False p = re.compile(sPattern) if bStrict: - return all(p.search(s) for s in _dAnalyses[sWord]) - return any(p.search(s) for s in _dAnalyses[sWord]) + return all(p.search(s) for s in lMorph) + return any(p.search(s) for s in lMorph) def analysex (sWord, sPattern, sNegPattern): "analyse a word, returns True if not sNegPattern in word morphologies and sPattern in word morphologies (disambiguation off)" - if sWord not in _dAnalyses and not _storeMorphFromFSA(sWord): + lMorph = _oSpellChecker.getMorph(sWord) + if not lMorph: return False # check negative condition np = re.compile(sNegPattern) - if any(np.search(s) for s in _dAnalyses[sWord]): + if any(np.search(s) for s in lMorph): return False # search sPattern p = re.compile(sPattern) - return any(p.search(s) for s in _dAnalyses[sWord]) - - -def stem (sWord): - "returns a list of sWord's stems" - if not sWord: - return [] - if sWord not in _dAnalyses and not _storeMorphFromFSA(sWord): - return [] - return [ s[1:s.find(" ")] for s in _dAnalyses[sWord] ] + return any(p.search(s) for s in lMorph) + ## functions to get text outside pattern scope # warning: check compile_rules.py to understand how it works @@ -534,52 +524,47 @@ def select (dDA, nPos, sWord, sPattern, lDefault=None): if not sWord: return True if nPos in dDA: return True - if sWord not in _dAnalyses and not _storeMorphFromFSA(sWord): - return True - if len(_dAnalyses[sWord]) == 1: - return True - lSelect = [ sMorph for sMorph in _dAnalyses[sWord] if re.search(sPattern, sMorph) ] - if lSelect: - if len(lSelect) != len(_dAnalyses[sWord]): - dDA[nPos] = lSelect - #echo("= "+sWord+" "+str(dDA.get(nPos, "null"))) + lMorph = _oSpellChecker.getMorph(sWord) + if not lMorph or len(lMorph) == 1: + return True + lSelect = [ sMorph for sMorph in lMorph if re.search(sPattern, sMorph) ] + if lSelect: + if len(lSelect) != len(lMorph): + dDA[nPos] = lSelect elif lDefault: dDA[nPos] = lDefault - #echo("= "+sWord+" "+str(dDA.get(nPos, "null"))) return True def exclude (dDA, nPos, sWord, sPattern, lDefault=None): if not sWord: return True if nPos in dDA: return True - if sWord not in _dAnalyses and not _storeMorphFromFSA(sWord): - return True - if len(_dAnalyses[sWord]) == 1: - return True - lSelect = [ sMorph for sMorph in _dAnalyses[sWord] if not re.search(sPattern, sMorph) ] - if lSelect: - if len(lSelect) != len(_dAnalyses[sWord]): - dDA[nPos] = lSelect - #echo("= "+sWord+" "+str(dDA.get(nPos, "null"))) + lMorph = _oSpellChecker.getMorph(sWord) + if not lMorph or len(lMorph) == 1: + return True + lSelect = [ sMorph for sMorph in lMorph if not re.search(sPattern, sMorph) ] + if lSelect: + if len(lSelect) != len(lMorph): + dDA[nPos] = lSelect elif lDefault: dDA[nPos] = lDefault - #echo("= "+sWord+" "+str(dDA.get(nPos, "null"))) return True def define (dDA, nPos, lMorph): dDA[nPos] = lMorph - #echo("= "+str(nPos)+" "+str(dDA[nPos])) return True #### GRAMMAR CHECKER PLUGINS ${plugins} + +#### CALLABLES (generated code) ${callables} ADDED gc_core/py/lang_core/gc_rules_graph.py Index: gc_core/py/lang_core/gc_rules_graph.py ================================================================== --- /dev/null +++ gc_core/py/lang_core/gc_rules_graph.py @@ -0,0 +1,5 @@ +# generated code, do not edit + +dGraph = ${rules_graph} + +dRule = ${rules_actions} ADDED gc_core/py/lang_core/gc_sentence.py Index: gc_core/py/lang_core/gc_sentence.py ================================================================== --- /dev/null +++ gc_core/py/lang_core/gc_sentence.py @@ -0,0 +1,141 @@ +# Sentence checker + +from ..graphspell.tokenizer import Tokenizer +from .gc_graph import dGraph + + +oTokenizer = Tokenizer("${lang}") + + +class Sentence: + + def __init__ (self, sSentence, sSentence0, nOffset): + self.sSentence = sSentence + self.sSentence0 = sSentence0 + self.nOffset = nOffset + self.lToken = list(oTokenizer.genTokens()) + + def parse (self): + dErr = {} + lPointer = [] + for dToken in self.lToken: + for i, dPointer in enumerate(lPointer): + bValid = False + for dNode in self._getNextMatchingNodes(dToken, dPointer["dNode"]): + dPointer["nOffset"] = dToken["i"] + dPointer["dNode"] = dNode + bValid = True + if not bValid: + del lPointer[i] + for dNode in self._getNextMatchingNodes(dToken, dGraph): + lPointer.append({"nOffset": 0, "dNode": dNode}) + for dPointer in lPointer: + if "" in dPointer["dNode"]: + for dNode in dGraph[dPointer["dNode"][""]]: + dErr = self._executeActions(dNode) + return dErr + + def _getNextMatchingNodes (self, dToken, dNode): + if dToken["sValue"] in dNode: + yield dGraph[dNode[dToken["sValue"]]] + for sLemma in dToken["sLemma"]: + if sLemma in dNode: + yield dGraph[dNode[dToken["sValue"]]] + if "~" in dNode: + for sRegex in dNode["~"]: + for sMorph in dToken["lMorph"]: + if re.search(sRegex, sMorph): + yield dGraph[dNode["~"][sRegex]] + + def _executeActions (self, dNode): + for sLineId, nextNodeKey in dNode.items(): + for sArc in dGraph[nextNodeKey]: + bCondMemo = None + sFuncCond, cActionType, sWhat, *eAct = dRule[sArc] + # action in lActions: [ condition, action type, replacement/suggestion/action[, iGroupStart, iGroupEnd[, message, URL]] ] + try: + bCondMemo = not sFuncCond or globals()[sFuncCond](self, dDA, sCountry, bCondMemo) + if bCondMemo: + if cActionType == "-": + # grammar error + nErrorStart = nSentenceOffset + m.start(eAct[0]) + nErrorEnd = nSentenceOffset + m.start(eAct[1]) + if nErrorStart not in dErrs or nPriority > dPriority[nErrorStart]: + dErrs[nErrorStart] = _createError(self, sWhat, nErrorStart, nErrorEnd, sLineId, bUppercase, eAct[2], eAct[3], bIdRule, sOption, bContext) + dPriority[nErrorStart] = nPriority + elif cActionType == "~": + # text processor + self.lToken = _rewrite(self, sWhat, nErrorStart, nErrorEnd, bUppercase) + bChange = True + elif cActionType == "@": + # text processor + self.lToken = _rewrite(self, sWhat, nErrorStart, nErrorEnd, bUppercase) + bChange = True + elif cActionType == "=": + # disambiguation + globals()[sWhat](self, dDA) + elif cActionType == ">": + # we do nothing, this test is just a condition to apply all following actions + pass + else: + echo("# error: unknown action at " + sLineId) + elif cActionType == ">": + break + except Exception as e: + raise Exception(str(e), "# " + sLineId + " # " + sRuleId) + + def _createWriterError (self): + d = {} + return d + + def _createDictError (self): + d = {} + return d + + +#### Common functions + +def option (): + pass + + +#### Analyse tokens + +def morph (): + pass + +def morphex (): + pass + +def analyse (): + pass + +def analysex (): + pass + + +#### Go outside scope + +def nextToken (): + pass + +def prevToken (): + pass + +def look (): + pass + +def lookAndCheck (): + pass + + +#### Disambiguator + +def select (): + pass + +def exclude (): + pass + +def define (): + pass Index: gc_lang/fr/modules/gce_analyseur.py ================================================================== --- gc_lang/fr/modules/gce_analyseur.py +++ gc_lang/fr/modules/gce_analyseur.py @@ -15,63 +15,58 @@ return "nous" if s2 == "vous": return "vous" if s2 == "eux": return "ils" - if s2 == "elle" or s2 == "elles": - # We don’t check if word exists in _dAnalyses, for it is assumed it has been done before - if cr.mbNprMasNotFem(_dAnalyses.get(s1, False)): + if s2 == "elle" or s2 == "elles": + if cr.mbNprMasNotFem(_oSpellChecker.getMorph(s1)): return "ils" # si épicène, indéterminable, mais OSEF, le féminin l’emporte return "elles" return s1 + " et " + s2 def apposition (sWord1, sWord2): "returns True if nom + nom (no agreement required)" - # We don’t check if word exists in _dAnalyses, for it is assumed it has been done before - return cr.mbNomNotAdj(_dAnalyses.get(sWord2, False)) and cr.mbPpasNomNotAdj(_dAnalyses.get(sWord1, False)) + return cr.mbNomNotAdj(_oSpellChecker.getMorph(sWord2)) and cr.mbPpasNomNotAdj(_oSpellChecker.getMorph(sWord1)) def isAmbiguousNAV (sWord): "words which are nom|adj and verb are ambiguous (except être and avoir)" - if sWord not in _dAnalyses and not _storeMorphFromFSA(sWord): + lMorph = _oSpellChecker.getMorph(sWord) + if not cr.mbNomAdj(lMorph) or sWord == "est": return False - if not cr.mbNomAdj(_dAnalyses[sWord]) or sWord == "est": - return False - if cr.mbVconj(_dAnalyses[sWord]) and not cr.mbMG(_dAnalyses[sWord]): + if cr.mbVconj(lMorph) and not cr.mbMG(lMorph): return True return False def isAmbiguousAndWrong (sWord1, sWord2, sReqMorphNA, sReqMorphConj): "use it if sWord1 won’t be a verb; word2 is assumed to be True via isAmbiguousNAV" - # We don’t check if word exists in _dAnalyses, for it is assumed it has been done before - a2 = _dAnalyses.get(sWord2, None) + a2 = _oSpellChecker.getMorph(sWord2) if not a2: return False if cr.checkConjVerb(a2, sReqMorphConj): # verb word2 is ok return False - a1 = _dAnalyses.get(sWord1, None) + a1 = _oSpellChecker.getMorph(sWord1) if not a1: return False if cr.checkAgreement(a1, a2) and (cr.mbAdj(a2) or cr.mbAdj(a1)): return False return True def isVeryAmbiguousAndWrong (sWord1, sWord2, sReqMorphNA, sReqMorphConj, bLastHopeCond): "use it if sWord1 can be also a verb; word2 is assumed to be True via isAmbiguousNAV" - # We don’t check if word exists in _dAnalyses, for it is assumed it has been done before - a2 = _dAnalyses.get(sWord2, None) + a2 = _oSpellChecker.getMorph(sWord2) if not a2: return False if cr.checkConjVerb(a2, sReqMorphConj): # verb word2 is ok return False - a1 = _dAnalyses.get(sWord1, None) + a1 = _oSpellChecker.getMorph(sWord1) if not a1: return False if cr.checkAgreement(a1, a2) and (cr.mbAdj(a2) or cr.mbAdjNb(a1)): return False # now, we know there no agreement, and conjugation is also wrong @@ -82,15 +77,14 @@ return True return False def checkAgreement (sWord1, sWord2): - # We don’t check if word exists in _dAnalyses, for it is assumed it has been done before - a2 = _dAnalyses.get(sWord2, None) + a2 = _oSpellChecker.getMorph(sWord2) if not a2: return True - a1 = _dAnalyses.get(sWord1, None) + a1 = _oSpellChecker.getMorph(sWord1) if not a1: return True return cr.checkAgreement(a1, a2) Index: gc_lang/fr/modules/gce_suggestions.py ================================================================== --- gc_lang/fr/modules/gce_suggestions.py +++ gc_lang/fr/modules/gce_suggestions.py @@ -7,16 +7,16 @@ ## Verbs def suggVerb (sFlex, sWho, funcSugg2=None): aSugg = set() - for sStem in stem(sFlex): + for sStem in _oSpellChecker.getLemma(sFlex): tTags = conj._getTags(sStem) if tTags: # we get the tense aTense = set() - for sMorph in _dAnalyses.get(sFlex, []): # we don’t check if word exists in _dAnalyses, for it is assumed it has been done before + for sMorph in _oSpellChecker.getMorph(sFlex): for m in re.finditer(">"+sStem+" .*?(:(?:Y|I[pqsf]|S[pq]|K|P))", sMorph): # stem must be used in regex to prevent confusion between different verbs (e.g. sauras has 2 stems: savoir and saurer) if m: if m.group(1) == ":Y": aTense.add(":Ip") @@ -40,11 +40,11 @@ return "" def suggVerbPpas (sFlex, sWhat=None): aSugg = set() - for sStem in stem(sFlex): + for sStem in _oSpellChecker.getLemma(sFlex): tTags = conj._getTags(sStem) if tTags: if not sWhat: aSugg.add(conj._getConjWithTags(sStem, tTags, ":PQ", ":Q1")) aSugg.add(conj._getConjWithTags(sStem, tTags, ":PQ", ":Q2")) @@ -83,21 +83,21 @@ return "" def suggVerbTense (sFlex, sTense, sWho): aSugg = set() - for sStem in stem(sFlex): + for sStem in _oSpellChecker.getLemma(sFlex): if conj.hasConj(sStem, sTense, sWho): aSugg.add(conj.getConj(sStem, sTense, sWho)) if aSugg: return "|".join(aSugg) return "" def suggVerbImpe (sFlex): aSugg = set() - for sStem in stem(sFlex): + for sStem in _oSpellChecker.getLemma(sFlex): tTags = conj._getTags(sStem) if tTags: if conj._hasConjWithTags(tTags, ":E", ":2s"): aSugg.add(conj._getConjWithTags(sStem, tTags, ":E", ":2s")) if conj._hasConjWithTags(tTags, ":E", ":1p"): @@ -108,11 +108,11 @@ return "|".join(aSugg) return "" def suggVerbInfi (sFlex): - return "|".join([ sStem for sStem in stem(sFlex) if conj.isVerb(sStem) ]) + return "|".join([ sStem for sStem in _oSpellChecker.getLemma(sFlex) if conj.isVerb(sStem) ]) _dQuiEst = { "je": ":1s", "j’": ":1s", "j’en": ":1s", "j’y": ":1s", \ "tu": ":2s", "il": ":3s", "on": ":3s", "elle": ":3s", "nous": ":1p", "vous": ":2p", "ils": ":3p", "elles": ":3p" } _lIndicatif = [":Ip", ":Iq", ":Is", ":If"] @@ -131,11 +131,11 @@ if not sWho: if sSuj[0:1].islower(): # pas un pronom, ni un nom propre return "" sWho = ":3s" aSugg = set() - for sStem in stem(sFlex): + for sStem in _oSpellChecker.getLemma(sFlex): tTags = conj._getTags(sStem) if tTags: for sTense in lMode: if conj._hasConjWithTags(tTags, sTense, sWho): aSugg.add(conj._getConjWithTags(sStem, tTags, sTense, sWho)) @@ -147,13 +147,14 @@ ## Nouns and adjectives def suggPlur (sFlex, sWordToAgree=None): "returns plural forms assuming sFlex is singular" if sWordToAgree: - if sWordToAgree not in _dAnalyses and not _storeMorphFromFSA(sWordToAgree): + lMorph = _oSpellChecker.getMorph(sFlex) + if not lMorph: return "" - sGender = cr.getGender(_dAnalyses.get(sWordToAgree, [])) + sGender = cr.getGender(lMorph) if sGender == ":m": return suggMasPlur(sFlex) elif sGender == ":f": return suggFemPlur(sFlex) aSugg = set() @@ -191,13 +192,12 @@ return "" def suggMasSing (sFlex, bSuggSimil=False): "returns masculine singular forms" - # we don’t check if word exists in _dAnalyses, for it is assumed it has been done before aSugg = set() - for sMorph in _dAnalyses.get(sFlex, []): + for sMorph in _oSpellChecker.getMorph(sFlex): if not ":V" in sMorph: # not a verb if ":m" in sMorph or ":e" in sMorph: aSugg.add(suggSing(sFlex)) else: @@ -219,13 +219,12 @@ return "" def suggMasPlur (sFlex, bSuggSimil=False): "returns masculine plural forms" - # we don’t check if word exists in _dAnalyses, for it is assumed it has been done before aSugg = set() - for sMorph in _dAnalyses.get(sFlex, []): + for sMorph in _oSpellChecker.getMorph(sFlex): if not ":V" in sMorph: # not a verb if ":m" in sMorph or ":e" in sMorph: aSugg.add(suggPlur(sFlex)) else: @@ -250,13 +249,12 @@ return "" def suggFemSing (sFlex, bSuggSimil=False): "returns feminine singular forms" - # we don’t check if word exists in _dAnalyses, for it is assumed it has been done before aSugg = set() - for sMorph in _dAnalyses.get(sFlex, []): + for sMorph in _oSpellChecker.getMorph(sFlex): if not ":V" in sMorph: # not a verb if ":f" in sMorph or ":e" in sMorph: aSugg.add(suggSing(sFlex)) else: @@ -276,13 +274,12 @@ return "" def suggFemPlur (sFlex, bSuggSimil=False): "returns feminine plural forms" - # we don’t check if word exists in _dAnalyses, for it is assumed it has been done before aSugg = set() - for sMorph in _dAnalyses.get(sFlex, []): + for sMorph in _oSpellChecker.getMorph(sFlex): if not ":V" in sMorph: # not a verb if ":f" in sMorph or ":e" in sMorph: aSugg.add(suggPlur(sFlex)) else: @@ -301,33 +298,32 @@ return "|".join(aSugg) return "" def hasFemForm (sFlex): - for sStem in stem(sFlex): + for sStem in _oSpellChecker.getLemma(sFlex): if mfsp.isFemForm(sStem) or conj.hasConj(sStem, ":PQ", ":Q3"): return True if phonet.hasSimil(sFlex, ":f"): return True return False def hasMasForm (sFlex): - for sStem in stem(sFlex): + for sStem in _oSpellChecker.getLemma(sFlex): if mfsp.isFemForm(sStem) or conj.hasConj(sStem, ":PQ", ":Q1"): # what has a feminine form also has a masculine form return True if phonet.hasSimil(sFlex, ":m"): return True return False def switchGender (sFlex, bPlur=None): - # we don’t check if word exists in _dAnalyses, for it is assumed it has been done before aSugg = set() if bPlur == None: - for sMorph in _dAnalyses.get(sFlex, []): + for sMorph in _oSpellChecker.getMorph(sFlex): if ":f" in sMorph: if ":s" in sMorph: aSugg.add(suggMasSing(sFlex)) elif ":p" in sMorph: aSugg.add(suggMasPlur(sFlex)) @@ -338,17 +334,17 @@ aSugg.add(suggFemPlur(sFlex)) else: aSugg.add(suggFemSing(sFlex)) aSugg.add(suggFemPlur(sFlex)) elif bPlur: - for sMorph in _dAnalyses.get(sFlex, []): + for sMorph in _oSpellChecker.getMorph(sFlex): if ":f" in sMorph: aSugg.add(suggMasPlur(sFlex)) elif ":m" in sMorph: aSugg.add(suggFemPlur(sFlex)) else: - for sMorph in _dAnalyses.get(sFlex, []): + for sMorph in _oSpellChecker.getMorph(sFlex): if ":f" in sMorph: aSugg.add(suggMasSing(sFlex)) elif ":m" in sMorph: aSugg.add(suggFemSing(sFlex)) if aSugg: @@ -355,13 +351,12 @@ return "|".join(aSugg) return "" def switchPlural (sFlex): - # we don’t check if word exists in _dAnalyses, for it is assumed it has been done before aSugg = set() - for sMorph in _dAnalyses.get(sFlex, []): + for sMorph in _oSpellChecker.getMorph(sFlex): if ":s" in sMorph: aSugg.add(suggPlur(sFlex)) elif ":p" in sMorph: aSugg.add(suggSing(sFlex)) if aSugg: @@ -373,13 +368,12 @@ return phonet.hasSimil(sWord, sPattern) def suggSimil (sWord, sPattern=None, bSubst=False): "return list of words phonetically similar to sWord and whom POS is matching sPattern" - # we don’t check if word exists in _dAnalyses, for it is assumed it has been done before aSugg = phonet.selectSimil(sWord, sPattern) - for sMorph in _dAnalyses.get(sWord, []): + for sMorph in _oSpellChecker.getMorph(sWord): aSugg.update(conj.getSimil(sWord, sMorph, bSubst)) break if aSugg: return "|".join(aSugg) return "" @@ -392,12 +386,11 @@ return "ce|cet" return "ce" def suggLesLa (sWord): - # we don’t check if word exists in _dAnalyses, for it is assumed it has been done before - if any( ":p" in sMorph for sMorph in _dAnalyses.get(sWord, []) ): + if any( ":p" in sMorph for sMorph in _oSpellChecker.getMorph(sWord) ): return "les|la" return "la" _zBinary = re.compile("^[01]+$") ADDED gc_lang/fr/rules_graph.grx Index: gc_lang/fr/rules_graph.grx ================================================================== --- /dev/null +++ gc_lang/fr/rules_graph.grx @@ -0,0 +1,60 @@ +# +# RÈGLES DE GRAMMAIRE FRANÇAISE POUR GRAMMALECTE +# par Olivier R. +# +# Copyright © 2011-2017. +# +# This file is part of Grammalecte. +# +# Grammalecte is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Grammalecte is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Grammalecte. If not, see +# + +# RÈGLES POUR LE GRAPHE DE TOKENS + +# DOCUMENTATION +# Expressions régulières en Python : http://docs.python.org/library/re.html + +# [++] : séparateur des règles pour le paragraphe et des règles pour la phrase. + +# Types d’action: +# ->> erreur +# ~>> préprocesseur de texte +# =>> désambiguïsateur + + +# Fin d’interprétation du fichier avec une ligne commençant par #END + +# ERREURS COURANTES +# http://fr.wikipedia.org/wiki/Wikip%C3%A9dia:Fautes_d%27orthographe/Courantes + + +__rule1__ + les ~:N:.:s + des ~:N:.:s + ces ~:N:.:s + <<- -1>> acquit # Message0|http://test.grammalecte.net + +__rule2__ + ci important que soi + ci vraiment il y a + ci pour ça + <<- morph(\2, ":[WAR]", False) -1>> si # Message1|http://test.grammalecte.net + +__rule3__ + contre nature + contre pétrie + contre action + <<- morph(\1, "xxxx") -1:2>> =$area.replace(" ", "") # Message2|http://test.grammalecte.org + <<- ~>> =$area.replace(" ", "") + Index: graphspell-js/spellchecker.js ================================================================== --- graphspell-js/spellchecker.js +++ graphspell-js/spellchecker.js @@ -41,10 +41,14 @@ this.oPersonalDic = this._loadDictionary(personalDic, sPath); this.bExtendedDic = Boolean(this.oExtendedDic); this.bCommunityDic = Boolean(this.oCommunityDic); this.bPersonalDic = Boolean(this.oPersonalDic); this.oTokenizer = null; + // storage + this.bStorage = false; + this._dMorphologies = new Map(); // key: flexion, value: list of morphologies + this._dLemmas = new Map(); // key: flexion, value: list of lemmas } _loadDictionary (dictionary, sPath="", bNecessary=false) { // returns an IBDAWG object if (!dictionary) { @@ -132,10 +136,26 @@ deactivatePersonalDictionary () { this.bPersonalDic = false; } + + // Storage + + activateStorage () { + this.bStorage = true; + } + + deactivateStorage () { + this.bStorage = false; + } + + clearStorage () { + this._dLemmas.clear(); + this._dMorphologies.clear(); + } + // parse text functions parseParagraph (sText) { if (!this.oTokenizer) { @@ -203,21 +223,39 @@ return false; } getMorph (sWord) { // retrieves morphologies list, different casing allowed - let lResult = this.oMainDic.getMorph(sWord); + if (this.bStorage && this._dMorphologies.has(sWord)) { + return this._dMorphologies.get(sWord); + } + let lMorph = this.oMainDic.getMorph(sWord); if (this.bExtendedDic) { - lResult.push(...this.oExtendedDic.getMorph(sWord)); + lMorph.push(...this.oExtendedDic.getMorph(sWord)); } if (this.bCommunityDic) { - lResult.push(...this.oCommunityDic.getMorph(sWord)); + lMorph.push(...this.oCommunityDic.getMorph(sWord)); } if (this.bPersonalDic) { - lResult.push(...this.oPersonalDic.getMorph(sWord)); + lMorph.push(...this.oPersonalDic.getMorph(sWord)); + } + if (this.bStorage) { + this._dMorphologies.set(sWord, lMorph); + this._dLemmas.set(sWord, new Set(this.getMorph(sWord).map((sMorph) => { return sMorph.slice(0, sMorph.indexOf(" ")); }))); + } + return lMorph; + } + + getLemma (sWord) { + // retrieves lemmas + if (this.bStorage) { + if (!this._dLemmas.has(sWord)) { + this.getMorph(sWord); + } + return this._dLemmas.get(sWord); } - return lResult; + return new Set(this.getMorph(sWord).map((sMorph) => { return sMorph.slice(0, sMorph.indexOf(" ")); })); } * suggest (sWord, nSuggLimit=10) { // generator: returns 1, 2 or 3 lists of suggestions yield this.oMainDic.suggest(sWord, nSuggLimit); Index: graphspell-js/tokenizer.js ================================================================== --- graphspell-js/tokenizer.js +++ graphspell-js/tokenizer.js @@ -16,11 +16,11 @@ "default": [ [/^[  \t]+/, 'SPACE'], [/^\/(?:~|bin|boot|dev|etc|home|lib|mnt|opt|root|sbin|tmp|usr|var|Bureau|Documents|Images|Musique|Public|Téléchargements|Vidéos)(?:\/[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯﬁ-ﬆ_.()-]+)*/, 'FOLDERUNIX'], [/^[a-zA-Z]:\$?:Program Files(?: \(x86$|)|[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯﬁ-ﬆ.()]+)(?:\\[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯﬁ-ﬆ_.()-]+)*/, 'FOLDERWIN'], - [/^[,.;:!?…«»“”‘’"(){}\[\]/·–—]+/, 'SEPARATOR'], + [/^[,.;:!?…«»“”‘’"(){}\[\]/·–—]/, 'SEPARATOR'], [/^[A-Z][.][A-Z][.](?:[A-Z][.])*/, 'ACRONYM'], [/^(?:https?:\/\/|www[.]|[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯﬁ-ﬆ_-]+[@.][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯﬁ-ﬆ_-]{2,}[@.])[a-zA-Z0-9][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯﬁ-ﬆ_.\/?&!%=+*"'@$#-]+/, 'LINK'], [/^[#@][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯﬁ-ﬆ_-]+/, 'TAG'], [/^<[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯﬁ-ﬆ]+.*?>|<\/[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯﬁ-ﬆ]+ *>/, 'HTML'], [/^\[\/?[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯﬁ-ﬆ]+\]/, 'PSEUDOHTML'], @@ -32,11 +32,11 @@ "fr": [ [/^[  \t]+/, 'SPACE'], [/^\/(?:~|bin|boot|dev|etc|home|lib|mnt|opt|root|sbin|tmp|usr|var|Bureau|Documents|Images|Musique|Public|Téléchargements|Vidéos)(?:\/[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯﬁ-ﬆ_.()-]+)*/, 'FOLDERUNIX'], [/^[a-zA-Z]:\$?:Program Files(?: \(x86$|)|[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯﬁ-ﬆ.()]+)(?:\\[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯﬁ-ﬆ_.()-]+)*/, 'FOLDERWIN'], - [/^[,.;:!?…«»“”‘’"(){}\[\]/·–—]+/, 'SEPARATOR'], + [/^[,.;:!?…«»“”‘’"(){}\[\]/·–—]/, 'SEPARATOR'], [/^[A-Z][.][A-Z][.](?:[A-Z][.])*/, 'ACRONYM'], [/^(?:https?:\/\/|www[.]|[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯﬁ-ﬆ_-]+[@.][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯﬁ-ﬆ_-]{2,}[@.])[a-zA-Z0-9][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯﬁ-ﬆ_.\/?&!%=+*"'@$#-]+/, 'LINK'], [/^[#@][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯﬁ-ﬆ_-]+/, 'TAG'], [/^<[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯﬁ-ﬆ]+.*?>|<\/[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯﬁ-ﬆ]+ *>/, 'HTML'], [/^\[\/?[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯﬁ-ﬆ]+\]/, 'PSEUDOHTML'], @@ -60,36 +60,32 @@ this.aRules = aTkzPatterns[this.sLang]; } * genTokens (sText) { let m; - let i = 0; + let iNext = 0; while (sText) { - let nCut = 1; + let iCut = 1; + let iToken = 0; for (let [zRegex, sType] of this.aRules) { try { if ((m = zRegex.exec(sText)) !== null) { - if (sType == 'SEPARATOR') { - for (let c of m[0]) { - yield { "sType": sType, "sValue": c, "nStart": i, "nEnd": i + m[0].length } - } - } else { - yield { "sType": sType, "sValue": m[0], "nStart": i, "nEnd": i + m[0].length } - } - nCut = m[0].length; + iToken += 1; + yield { "i": iToken, "sType": sType, "sValue": m[0], "nStart": iNext, "nEnd": iNext + m[0].length } + iCut = m[0].length; break; } } catch (e) { helpers.logerror(e); } } - i += nCut; - sText = sText.slice(nCut); + iNext += iCut; + sText = sText.slice(iCut); } } } if (typeof(exports) !== 'undefined') { exports.Tokenizer = Tokenizer; } Index: graphspell/spellchecker.py ================================================================== --- graphspell/spellchecker.py +++ graphspell/spellchecker.py @@ -34,10 +34,14 @@ self.oPersonalDic = self._loadDictionary(sfPersonalDic) self.bExtendedDic = bool(self.oExtendedDic) self.bCommunityDic = bool(self.oCommunityDic) self.bPersonalDic = bool(self.oPersonalDic) self.oTokenizer = None + # storage + self.bStorage = False + self._dMorphologies = {} # key: flexion, value: list of morphologies + self._dLemmas = {} # key: flexion, value: list of lemmas def _loadDictionary (self, source, bNecessary=False): "returns an IBDAWG object" if not source: return None @@ -97,10 +101,23 @@ self.bCommunityDic = False def deactivatePersonalDictionary (self): self.bPersonalDic = False + + # Storage + + def activateStorage (self): + self.bStorage = True + + def deactivateStorage (self): + self.bStorage = False + + def clearStorage (self): + self._dLemmas.clear() + self._dMorphologies.clear() + # parse text functions def parseParagraph (self, sText, bSpellSugg=False): if not self.oTokenizer: @@ -169,20 +186,30 @@ return True return False def getMorph (self, sWord): "retrieves morphologies list, different casing allowed" - lResult = self.oMainDic.getMorph(sWord) + if self.bStorage and sWord in self._dMorphologies: + return self._dMorphologies[sWord] + lMorph = self.oMainDic.getMorph(sWord) if self.bExtendedDic: - lResult.extend(self.oExtendedDic.getMorph(sWord)) + lMorph.extend(self.oExtendedDic.getMorph(sWord)) if self.bCommunityDic: - lResult.extend(self.oCommunityDic.getMorph(sWord)) + lMorph.extend(self.oCommunityDic.getMorph(sWord)) if self.bPersonalDic: - lResult.extend(self.oPersonalDic.getMorph(sWord)) - return lResult + lMorph.extend(self.oPersonalDic.getMorph(sWord)) + if self.bStorage: + self._dMorphologies[sWord] = lMorph + self._dLemmas[sWord] = set([ s[1:s.find(" ")] for s in lMorph ]) + return lMorph def getLemma (self, sWord): + "retrieves lemmas" + if self.bStorage: + if sWord not in self._dLemmas: + self.getMorph(sWord) + return self._dLemmas[sWord] return set([ s[1:s.find(" ")] for s in self.getMorph(sWord) ]) def suggest (self, sWord, nSuggLimit=10): "generator: returns 1, 2 or 3 lists of suggestions" yield self.oMainDic.suggest(sWord, nSuggLimit) Index: graphspell/tokenizer.py ================================================================== --- graphspell/tokenizer.py +++ graphspell/tokenizer.py @@ -5,11 +5,11 @@ _PATTERNS = { "default": ( r'(?P/(?:bin|boot|dev|etc|home|lib|mnt|opt|root|sbin|tmp|usr|var|Bureau|Documents|Images|Musique|Public|Téléchargements|Vidéos)(?:/[\w.()-]+)*)', r'(?P[a-zA-Z]:\\(?:Program Files(?: [(]x86[)]|)|[\w.()]+)(?:\\[\w.()-]+)*)', - r'(?P[.,?!:;…«»“”"()/·]+)', + r'(?P[][,.;:!?…«»“”‘’"(){}/·–—])', r'(?P[A-Z][.][A-Z][.](?:[A-Z][.])*)', r'(?P(?:https?://|www[.]|\w+[@.]\w\w+[@.])\w[\w./?&!%=+*"\'@$#-]+)', r'(?P[#@][\w-]+)', r'(?P<\w+.*?>|)', r'(?P\[/?\w+\])', @@ -19,11 +19,11 @@ ), "fr": ( r'(?P/(?:bin|boot|dev|etc|home|lib|mnt|opt|root|sbin|tmp|usr|var|Bureau|Documents|Images|Musique|Public|Téléchargements|Vidéos)(?:/[\w.()-]+)*)', r'(?P[a-zA-Z]:\\(?:Program Files(?: [(]x86[)]|)|[\w.()]+)(?:\\[\w.()-]+)*)', - r'(?P[.,?!:;…«»“”"()/·]+)', + r'(?P[][,.;:!?…«»“”‘’"(){}/·–—])', r'(?P[A-Z][.][A-Z][.](?:[A-Z][.])*)', r'(?P(?:https?://|www[.]|\w+[@.]\w\w+[@.])\w[\w./?&!%=+*"\'@$#-]+)', r'(?P[#@][\w-]+)', r'(?P<\w+.*?>|)', r'(?P\[/?\w+\])', @@ -43,7 +43,7 @@ if sLang not in _PATTERNS: self.sLang = "default" self.zToken = re.compile( "(?i)" + '|'.join(sRegex for sRegex in _PATTERNS[sLang]) ) def genTokens (self, sText): - for m in self.zToken.finditer(sText): - yield { "sType": m.lastgroup, "sValue": m.group(), "nStart": m.start(), "nEnd": m.end() } + for i, m in enumerate(self.zToken.finditer(sText), 1): + yield { "i": i, "sType": m.lastgroup, "sValue": m.group(), "nStart": m.start(), "nEnd": m.end() } Index: make.py ================================================================== --- make.py +++ make.py @@ -17,10 +17,11 @@ from distutils import dir_util, file_util import dialog_bundled import compile_rules +import compile_rules_graph import helpers import lex_build sWarningMessage = "The content of this folder is generated by code and replaced at each build.\n" @@ -191,12 +192,15 @@ dVars = xConfig._sections['args'] dVars['locales'] = dVars["locales"].replace("_", "-") dVars['loc'] = str(dict([ [s, [s[0:2], s[3:5], ""]] for s in dVars["locales"].split(" ") ])) ## COMPILE RULES - dResult = compile_rules.make(spLang, dVars['lang'], bJavaScript) - dVars.update(dResult) + dResultRegex = compile_rules.make(spLang, dVars['lang'], bJavaScript) + dVars.update(dResultRegex) + + dResultGraph = compile_rules_graph.make(spLang, dVars['lang'], bJavaScript) + dVars.update(dResultGraph) ## READ GRAMMAR CHECKER PLUGINS print("PYTHON:") print("+ Plugins: ", end="") sCodePlugins = ""