import re
import sys
import traceback
import json
from distutils import file_util
import compile_rules_js_convert as jsconv
dDEF = {}
lFUNCTIONS = []
aRULESET = set() # set of rule-ids to check if there is several rules with the same id
nRULEWITHOUTNAME = 0
dJSREGEXES = {}
sWORDLIMITLEFT = r"(?<![\w.,–-])" # r"(?<![-.,—])\b" seems slower
sWORDLIMITRIGHT = r"(?![\w–-])" # r"\b(?!-—)" seems slower
def prepareFunction (s):
s = s.replace("__also__", "bCondMemo")
s = s.replace("__else__", "not bCondMemo")
s = re.sub(r"isStart *\(\)", 'before("^ *$|, *$")', s)
s = re.sub(r"isRealStart *\(\)", 'before("^ *$")', s)
s = re.sub(r"isStart0 *\(\)", 'before0("^ *$|, *$")', s)
s = re.sub(r"isRealStart0 *\(\)", 'before0("^ *$")', s)
s = re.sub(r"isEnd *\(\)", 'after("^ *$|^,")', s)
s = re.sub(r"isRealEnd *\(\)", 'after("^ *$")', s)
s = re.sub(r"isEnd0 *\(\)", 'after0("^ *$|^,")', s)
s = re.sub(r"isRealEnd0 *\(\)", 'after0("^ *$")', s)
s = re.sub(r"(select|exclude)[(][\\](\d+)", '\\1(dDA, m.start(\\2), m.group(\\2)', s)
s = re.sub(r"define[(][\\](\d+)", 'define(dDA, m.start(\\1)', s)
s = re.sub(r"(morph|morphex|displayInfo)[(][\\](\d+)", '\\1((m.start(\\2), m.group(\\2))', s)
s = re.sub(r"(morph|morphex|displayInfo)[(]", '\\1(dDA, ', s)
s = re.sub(r"(sugg\w+|switch\w+)\(@", '\\1(m.group(i[4])', s)
s = re.sub(r"word\(\s*1\b", 'nextword1(s, m.end()', s) # word(1)
s = re.sub(r"word\(\s*-1\b", 'prevword1(s, m.start()', s) # word(-1)
s = re.sub(r"word\(\s*(\d)", 'nextword(s, m.end(), \\1', s) # word(n)
s = re.sub(r"word\(\s*-(\d)", 'prevword(s, m.start(), \\1', s) # word(-n)
s = re.sub(r"before\(\s*", 'look(s[:m.start()], ', s) # before(s)
s = re.sub(r"after\(\s*", 'look(s[m.end():], ', s) # after(s)
s = re.sub(r"textarea\(\s*", 'look(s, ', s) # textarea(s)
s = re.sub(r"before_chk1\(\s*", 'look_chk1(dDA, s[:m.start()], 0, ', s) # before_chk1(s)
s = re.sub(r"after_chk1\(\s*", 'look_chk1(dDA, s[m.end():], m.end(), ', s) # after_chk1(s)
s = re.sub(r"textarea_chk1\(\s*", 'look_chk1(dDA, s, 0, ', s) # textarea_chk1(s)
s = re.sub(r"/0", 'sx[m.start():m.end()]', s) # /0
s = re.sub(r"before0\(\s*", 'look(sx[:m.start()], ', s) # before0(s)
s = re.sub(r"after0\(\s*", 'look(sx[m.end():], ', s) # after0(s)
s = re.sub(r"textarea0\(\s*", 'look(sx, ', s) # textarea0(s)
s = re.sub(r"before0_chk1\(\s*", 'look_chk1(dDA, sx[:m.start()], 0, ', s) # before0_chk1(s)
s = re.sub(r"after0_chk1\(\s*", 'look_chk1(dDA, sx[m.end():], m.end(), ', s) # after0_chk1(s)
s = re.sub(r"textarea0_chk1\(\s*", 'look_chk1(dDA, sx, 0, ', s) # textarea0_chk1(s)
s = re.sub(r"isEndOfNG\(\s*\)", 'isEndOfNG(dDA, s[m.end():], m.end())', s) # isEndOfNG(s)
s = re.sub(r"isNextNotCOD\(\s*\)", 'isNextNotCOD(dDA, s[m.end():], m.end())', s) # isNextNotCOD(s)
s = re.sub(r"isNextVerb\(\s*\)", 'isNextVerb(dDA, s[m.end():], m.end())', s) # isNextVerb(s)
s = re.sub(r"\bspell *[(]", '_oSpellChecker.isValid(', s)
s = re.sub(r"[\\](\d+)", 'm.group(\\1)', s)
return s
def uppercase (s, sLang):
"(flag i is not enough): converts regex to uppercase regex: 'foo' becomes '[Ff][Oo][Oo]', but 'Bar' becomes 'B[Aa][Rr]'."
sUp = ""
nState = 0
for i in range(0, len(s)):
c = s[i]
if c == "[":
nState = 1
if nState == 1 and c == "]":
nState = 0
if c == "<" and i > 3 and s[i-3:i] == "(?P":
nState = 2
if nState == 2 and c == ">":
nState = 0
if c == "?" and i > 0 and s[i-1:i] == "(" and s[i+1:i+2] != ":":
nState = 5
if nState == 5 and c == ")":
nState = 0
if c.isalpha() and c.islower() and nState == 0:
if c == "i" and (sLang == "tr" or sLang == "az"):
sUp += "[İ" + c + "]"
else:
sUp += "[" + c.upper() + c + "]"
elif c.isalpha() and c.islower() and nState == 1 and s[i+1:i+2] != "-":
if s[i-1:i] == "-" and s[i-2:i-1].islower(): # [a-z] -> [a-zA-Z]
sUp += c + s[i-2:i-1].upper() + "-" + c.upper()
elif c == "i" and (sLang == "tr" or sLang == "az"):
sUp += "İ" + c
else:
sUp += c.upper() + c
else:
sUp += c
if c == "\\":
nState = 4
elif nState == 4:
nState = 0
return sUp
def countGroupInRegex (sRegex):
try:
return re.compile(sRegex).groups
except:
traceback.print_exc()
print(sRegex)
return 0
def createRule (s, nIdLine, sLang, bParagraph, dOptPriority):
"returns rule as list [option name, regex, bCaseInsensitive, identifier, list of actions]"
global dJSREGEXES
global nRULEWITHOUTNAME
#### OPTIONS
sLineId = str(nIdLine) + ("p" if bParagraph else "s")
sRuleId = sLineId
sOption = False # False or [a-z0-9]+ name
nPriority = 4 # Default is 4, value must be between 0 and 9
tGroups = None # code for groups positioning (only useful for JavaScript)
cCaseMode = 'i' # i: case insensitive, s: case sensitive, u: uppercasing allowed
cWordLimitLeft = '[' # [: word limit, <: no specific limit
cWordLimitRight = ']' # ]: word limit, >: no specific limit
m = re.match("^__(?P<borders_and_case>[[<]\\w[]>])(?P<option>/[a-zA-Z0-9]+|)(?P<ruleid>\\(\\w+\\)|)(?P<priority>![0-9]|)__ *", s)
if m:
cWordLimitLeft = m.group('borders_and_case')[0]
cCaseMode = m.group('borders_and_case')[1]
cWordLimitRight = m.group('borders_and_case')[2]
sOption = m.group('option')[1:] if m.group('option') else False
if m.group('ruleid'):
sRuleId = m.group('ruleid')[1:-1]
if sRuleId in aRULESET:
print("# Error. Several rules have the same id: " + sRuleId)
exit()
aRULESET.add(sRuleId)
else:
nRULEWITHOUTNAME += 1
nPriority = dOptPriority.get(sOption, 4)
if m.group('priority'):
nPriority = int(m.group('priority')[1:])
s = s[m.end(0):]
else:
print("# Warning. No option defined at line: " + sLineId)
#### REGEX TRIGGER
i = s.find(" <<-")
if i == -1:
print("# Error: no condition at line " + sLineId)
return None
sRegex = s[:i].strip()
s = s[i+4:]
# JS groups positioning codes
m = re.search("@@\\S+", sRegex)
if m:
tGroups = jsconv.groupsPositioningCodeToList(sRegex[m.start()+2:])
sRegex = sRegex[:m.start()].strip()
# JS regex
m = re.search("<js>.+</js>i?", sRegex)
if m:
dJSREGEXES[sLineId] = m.group(0)
sRegex = sRegex[:m.start()].strip()
if "<js>" in sRegex or "</js>" in sRegex:
print("# Error: JavaScript regex not delimited at line " + sLineId)
return None
# quotes ?
if sRegex.startswith('"') and sRegex.endswith('"'):
sRegex = sRegex[1:-1]
## definitions
for sDef, sRepl in dDEF.items():
sRegex = sRegex.replace(sDef, sRepl)
## count number of groups (must be done before modifying the regex)
nGroup = countGroupInRegex(sRegex)
if nGroup > 0:
if not tGroups:
print("# Warning: groups positioning code for JavaScript should be defined at line " + sLineId)
else:
if nGroup != len(tGroups):
print("# Error: groups positioning code irrelevant at line " + sLineId)
## word limit
if cWordLimitLeft == '[' and not sRegex.startswith(("^", '’', "'", ",")):
sRegex = sWORDLIMITLEFT + sRegex
if cWordLimitRight == ']' and not sRegex.endswith(("$", '’', "'", ",")):
sRegex = sRegex + sWORDLIMITRIGHT
## casing mode
if cCaseMode == "i":
bCaseInsensitive = True
if not sRegex.startswith("(?i)"):
sRegex = "(?i)" + sRegex
elif cCaseMode == "s":
bCaseInsensitive = False
sRegex = sRegex.replace("(?i)", "")
elif cCaseMode == "u":
bCaseInsensitive = False
sRegex = sRegex.replace("(?i)", "")
sRegex = uppercase(sRegex, sLang)
else:
print("# Unknown case mode [" + cCaseMode + "] at line " + sLineId)
## check regex
try:
z = re.compile(sRegex)
except:
print("# Regex error at line ", nIdLine)
print(sRegex)
traceback.print_exc()
return None
## groups in non grouping parenthesis
for x in re.finditer("\(\?:[^)]*\([[\w -]", sRegex):
print("# Warning: groups inside non grouping parenthesis in regex at line " + sLineId)
#### PARSE ACTIONS
lActions = []
nAction = 1
for sAction in s.split(" <<- "):
t = createAction(sRuleId + "_" + str(nAction), sAction, nGroup)
nAction += 1
if t:
lActions.append(t)
if not lActions:
return None
return [sOption, sRegex, bCaseInsensitive, sLineId, sRuleId, nPriority, lActions, tGroups]
def createAction (sIdAction, sAction, nGroup):
"returns an action to perform as a tuple (condition, action type, action[, iGroup [, message, URL ]])"
global lFUNCTIONS
m = re.search(r"([-~=>])(\d*|)>>", sAction)
if not m:
print("# No action at line " + sIdAction)
return None
#### CONDITION
sCondition = sAction[:m.start()].strip()
if sCondition:
sCondition = prepareFunction(sCondition)
lFUNCTIONS.append(("c_"+sIdAction, sCondition))
for x in re.finditer("[.](?:group|start|end)[(](\d+)[)]", sCondition):
if int(x.group(1)) > nGroup:
print("# Error in groups in condition at line " + sIdAction + " ("+str(nGroup)+" groups only)")
if ".match" in sCondition:
print("# Error. JS compatibility. Don't use .match() in condition, use .search()")
sCondition = "c_"+sIdAction
else:
sCondition = None
#### iGroup / positioning
iGroup = int(m.group(2)) if m.group(2) else 0
if iGroup > nGroup:
print("# Selected group > group number in regex at line " + sIdAction)
#### ACTION
sAction = sAction[m.end():].strip()
cAction = m.group(1)
if cAction == "-":
## error
iMsg = sAction.find(" # ")
if iMsg == -1:
sMsg = "# Error. Error message not found."
sURL = ""
print(sMsg + " Action id: " + sIdAction)
else:
sMsg = sAction[iMsg+3:].strip()
sAction = sAction[:iMsg].strip()
sURL = ""
mURL = re.search("[|] *(https?://.*)", sMsg)
if mURL:
sURL = mURL.group(1).strip()
sMsg = sMsg[:mURL.start(0)].strip()
if sMsg[0:1] == "=":
sMsg = prepareFunction(sMsg[1:])
lFUNCTIONS.append(("m_"+sIdAction, sMsg))
for x in re.finditer("group[(](\d+)[)]", sMsg):
if int(x.group(1)) > nGroup:
print("# Error in groups in message at line " + sIdAction + " ("+str(nGroup)+" groups only)")
sMsg = "=m_"+sIdAction
else:
for x in re.finditer(r"\\(\d+)", sMsg):
if int(x.group(1)) > nGroup:
print("# Error in groups in message at line " + sIdAction + " ("+str(nGroup)+" groups only)")
if re.search("[.]\\w+[(]", sMsg):
print("# Error in message at line " + sIdAction + ": This message looks like code. Line should begin with =")
if sAction[0:1] == "=" or cAction == "=":
if "define" in sAction and not re.search(r"define\(\\\d+ *, *\[.*\] *\)", sAction):
print("# Error in action at line " + sIdAction + ": second argument for define must be a list of strings")
sAction = prepareFunction(sAction)
sAction = sAction.replace("m.group(i[4])", "m.group("+str(iGroup)+")")
for x in re.finditer("group[(](\d+)[)]", sAction):
if int(x.group(1)) > nGroup:
print("# Error in groups in replacement at line " + sIdAction + " ("+str(nGroup)+" groups only)")
else:
for x in re.finditer(r"\\(\d+)", sAction):
if int(x.group(1)) > nGroup:
print("# Error in groups in replacement at line " + sIdAction + " ("+str(nGroup)+" groups only)")
if re.search("[.]\\w+[(]|sugg\\w+[(]", sAction):
print("# Error in action at line " + sIdAction + ": This action looks like code. Line should begin with =")
if cAction == "-":
## error detected --> suggestion
if not sAction:
print("# Error in action at line " + sIdAction + ": This action is empty.")
if sAction[0:1] == "=":
lFUNCTIONS.append(("s_"+sIdAction, sAction[1:]))
sAction = "=s_"+sIdAction
elif sAction.startswith('"') and sAction.endswith('"'):
sAction = sAction[1:-1]
if not sMsg:
print("# Error in action at line " + sIdAction + ": the message is empty.")
return [sCondition, cAction, sAction, iGroup, sMsg, sURL]
elif cAction == "~":
## text processor
if not sAction:
print("# Error in action at line " + sIdAction + ": This action is empty.")
if sAction[0:1] == "=":
lFUNCTIONS.append(("p_"+sIdAction, sAction[1:]))
sAction = "=p_"+sIdAction
elif sAction.startswith('"') and sAction.endswith('"'):
sAction = sAction[1:-1]
return [sCondition, cAction, sAction, iGroup]
elif cAction == "=":
## disambiguator
if sAction[0:1] == "=":
sAction = sAction[1:]
if not sAction:
print("# Error in action at line " + sIdAction + ": This action is empty.")
lFUNCTIONS.append(("d_"+sIdAction, sAction))
sAction = "d_"+sIdAction
return [sCondition, cAction, sAction]
elif cAction == ">":
## no action, break loop if condition is False
return [sCondition, cAction, ""]
else:
print("# Unknown action at line " + sIdAction)
return None
def _calcRulesStats (lRules):
d = {'=':0, '~': 0, '-': 0, '>': 0}
for aRule in lRules:
for aAction in aRule[6]:
d[aAction[1]] = d[aAction[1]] + 1
return (d, len(lRules))
def displayStats (lParagraphRules, lSentenceRules):
print(" {:>18} {:>18} {:>18} {:>18}".format("DISAMBIGUATOR", "TEXT PROCESSOR", "GRAMMAR CHECKING", "REGEX"))
d, nRule = _calcRulesStats(lParagraphRules)
print("§ {:>10} actions {:>10} actions {:>10} actions in {:>8} rules".format(d['='], d['~'], d['-'], nRule))
d, nRule = _calcRulesStats(lSentenceRules)
print("s {:>10} actions {:>10} actions {:>10} actions in {:>8} rules".format(d['='], d['~'], d['-'], nRule))
def mergeRulesByOption (lRules):
"returns a list of tuples [option, list of rules] keeping the rules order"
lFinal = []
lTemp = []
sOption = None
for aRule in lRules:
if aRule[0] != sOption:
if sOption != None:
lFinal.append([sOption, lTemp])
# new tuple
sOption = aRule[0]
lTemp = []
lTemp.append(aRule[1:])
lFinal.append([sOption, lTemp])
return lFinal
def prepareOptions (lOptionLines):
"returns a dictionary with data about options"
sLang = ""
sDefaultUILang = ""
lStructOpt = []
lOpt = []
dOptLabel = {}
dOptPriority = {}
for sLine in lOptionLines:
sLine = sLine.strip()
if sLine.startswith("OPTGROUP/"):
m = re.match("OPTGROUP/([a-z0-9]+):(.+)$", sLine)
lStructOpt.append( (m.group(1), list(map(str.split, m.group(2).split(",")))) )
elif sLine.startswith("OPTSOFTWARE:"):
lOpt = [ [s, {}] for s in sLine[12:].strip().split() ] # don’t use tuples (s, {}), because unknown to JS
elif sLine.startswith("OPT/"):
m = re.match("OPT/([a-z0-9]+):(.+)$", sLine)
for i, sOpt in enumerate(m.group(2).split()):
lOpt[i][1][m.group(1)] = eval(sOpt)
elif sLine.startswith("OPTPRIORITY/"):
m = re.match("OPTPRIORITY/([a-z0-9]+): *([0-9])$", sLine)
dOptPriority[m.group(1)] = int(m.group(2))
elif sLine.startswith("OPTLANG/"):
m = re.match("OPTLANG/([a-z][a-z](?:_[A-Z][A-Z]|)):(.+)$", sLine)
sLang = m.group(1)[:2]
dOptLabel[sLang] = { "__optiontitle__": m.group(2).strip() }
elif sLine.startswith("OPTDEFAULTUILANG:"):
m = re.match("OPTDEFAULTUILANG: *([a-z][a-z](?:_[A-Z][A-Z]|))$", sLine)
sDefaultUILang = m.group(1)[:2]
elif sLine.startswith("OPTLABEL/"):
m = re.match("OPTLABEL/([a-z0-9]+):(.+)$", sLine)
dOptLabel[sLang][m.group(1)] = list(map(str.strip, m.group(2).split("|"))) if "|" in m.group(2) else [m.group(2).strip(), ""]
else:
print("# Error. Wrong option line in:\n ")
print(sLine)
print(" options defined for: " + ", ".join([ t[0] for t in lOpt ]))
dOptions = { "lStructOpt": lStructOpt, "dOptLabel": dOptLabel, "sDefaultUILang": sDefaultUILang }
dOptions.update({ "dOpt"+k: v for k, v in lOpt })
return dOptions, dOptPriority
def printBookmark (nLevel, sComment, nLine):
print(" {:>6}: {}".format(nLine, " " * nLevel + sComment))
def make (lRules, sLang, bJavaScript):
"compile rules, returns a dictionary of values"
# for clarity purpose, don’t create any file here
# removing comments, zeroing empty lines, creating definitions, storing tests, merging rule lines
print(" parsing rules...")
global dDEF
lLine = []
lRuleLine = []
lTest = []
lOpt = []
zBookmark = re.compile("^!!+")
zGraphLink = re.compile(r"^@@@@GRAPHLINK>(\w+)@@@@")
for i, sLine in enumerate(lRules, 1):
if sLine.startswith('#END'):
printBookmark(0, "BREAK BY #END", i)
break
elif sLine.startswith("#"):
pass
elif sLine.startswith("@@@@"):
m = re.match(r"^@@@@GRAPHLINK>(\w+)@@@@", sLine.strip())
if m:
#lRuleLine.append(["@GRAPHLINK", m.group(1)])
printBookmark(1, "@GRAPHLINK: " + m.group(1), i)
elif sLine.startswith("DEF:"):
m = re.match("DEF: +([a-zA-Z_][a-zA-Z_0-9]*) +(.+)$", sLine.strip())
if m:
dDEF["{"+m.group(1)+"}"] = m.group(2)
else:
print("Error in definition: ", end="")
print(sLine.strip())
elif sLine.startswith("TEST:"):
lTest.append("{:<8}".format(i) + " " + sLine[5:].strip())
elif sLine.startswith("TODO:"):
pass
elif sLine.startswith(("OPTGROUP/", "OPTSOFTWARE:", "OPT/", "OPTLANG/", "OPTDEFAULTUILANG:", "OPTLABEL/", "OPTPRIORITY/")):
lOpt.append(sLine)
elif re.match("[ \t]*$", sLine):
pass
elif sLine.startswith("!!"):
m = zBookmark.search(sLine)
nExMk = len(m.group(0))
if sLine[nExMk:].strip():
printBookmark(nExMk-2, sLine[nExMk:].strip(), i)
elif sLine.startswith((" ", "\t")):
lRuleLine[len(lRuleLine)-1][1] += " " + sLine.strip()
else:
lRuleLine.append([i, sLine.strip()])
# generating options files
print(" parsing options...")
try:
dOptions, dOptPriority = prepareOptions(lOpt)
except:
traceback.print_exc()
exit()
# tests
print(" list tests...")
sGCTests = "\n".join(lTest)
sGCTestsJS = '{ "aData": ' + json.dumps(lTest, ensure_ascii=False) + " }\n"
# processing
print(" preparing rules...")
bParagraph = True
lParagraphRules = []
lSentenceRules = []
lParagraphRulesJS = []
lSentenceRulesJS = []
for nLine, sLine in lRuleLine:
if sLine:
if sLine == "[++]":
bParagraph = False
else:
aRule = createRule(sLine, nLine, sLang, bParagraph, dOptPriority)
if aRule:
if bParagraph:
lParagraphRules.append(aRule)
lParagraphRulesJS.append(jsconv.pyRuleToJS(aRule, dJSREGEXES, sWORDLIMITLEFT))
else:
lSentenceRules.append(aRule)
lSentenceRulesJS.append(jsconv.pyRuleToJS(aRule, dJSREGEXES, sWORDLIMITLEFT))
# creating file with all functions callable by rules
print(" creating callables...")
sPyCallables = "# generated code, do not edit\n"
sJSCallables = "// generated code, do not edit\nconst oEvalFunc = {\n"
for sFuncName, sReturn in lFUNCTIONS:
cType = sFuncName[0:1]
if cType == "c": # condition
sParams = "s, sx, m, dDA, sCountry, bCondMemo"
elif cType == "m": # message
sParams = "s, m"
elif cType == "s": # suggestion
sParams = "s, m"
elif cType == "p": # preprocessor
sParams = "s, m"
elif cType == "d": # disambiguator
sParams = "s, m, dDA"
else:
print("# Unknown function type in [" + sFuncName + "]")
continue
sPyCallables += "def {} ({}):\n".format(sFuncName, sParams)
sPyCallables += " return " + sReturn + "\n"
sJSCallables += " {}: function ({})".format(sFuncName, sParams) + " {\n"
sJSCallables += " return " + jsconv.py2js(sReturn) + ";\n"
sJSCallables += " },\n"
sJSCallables += "}\n"
displayStats(lParagraphRules, lSentenceRules)
print("Unnamed rules: " + str(nRULEWITHOUTNAME))
d = { "callables": sPyCallables,
"callablesJS": sJSCallables,
"gctests": sGCTests,
"gctestsJS": sGCTestsJS,
"paragraph_rules": mergeRulesByOption(lParagraphRules),
"sentence_rules": mergeRulesByOption(lSentenceRules),
"paragraph_rules_JS": jsconv.writeRulesToJSArray(mergeRulesByOption(lParagraphRulesJS)),
"sentence_rules_JS": jsconv.writeRulesToJSArray(mergeRulesByOption(lSentenceRulesJS)) }
d.update(dOptions)
return d