Grammalecte: Check-in [18191569f4]

Overview

Comment:	[build] compile rules: code cleaning (pylint)
Downloads:	Tarball | ZIP archive | SQL archive
Timelines:	family | ancestors | descendants | both | build | rg
Files:	files | file ages | folders
SHA3-256:	18191569f40606a692b0397f1b35893588a120aa9b77afca5e4da3f7e03d0cb5
User & Date:	olr on 2018-06-24 17:51:48
Other Links:	branch diff | manifest | tags

Context

2018-06-24
18:45		[build] compile rules: code clarification check-in: 4ff036a562 user: olr tags: build, rg
17:51		[build] compile rules: code cleaning (pylint) check-in: 18191569f4 user: olr tags: build, rg
16:45		[server] code cleaning (pylint) check-in: 9e6790402a user: olr tags: server, rg

Changes

Hide Diffs Side-by-Side Diffs Ignore Whitespace Patch

Modified compile_rules.py from [2382a8be81] to [30d53476f8].

Modified compile_rules_graph.py from [ef1f63d249] to [c4702b3e5a].

Modified compile_rules_js_convert.py from [f2cc9f3e39] to [9aa0239064].

Modified make.py from [b5066cbbba] to [47003996f5].




1 2 3 4 5 6 7	1 2 3 4 5 6 7 8 9 10	+ + +	""" Grammalecte: compile rules """ import re import traceback import json import compile_rules_js_convert as jsconv import compile_rules_graph as crg
︙
16 17 18 19 20 21 22 23 24 25 26 27 28 29	19 20 21 22 23 24 25 26 27 28 29 30 31 32 33	+	dJSREGEXES = {} sWORDLIMITLEFT = r"(?<![\w.,–-])" # r"(?<![-.,—])\b" seems slower sWORDLIMITRIGHT = r"(?![\w–-])" # r"\b(?!-—)" seems slower def prepareFunction (s): "convert simple rule syntax to a string of Python code" s = s.replace("__also__", "bCondMemo") s = s.replace("__else__", "not bCondMemo") s = re.sub(r"isStart \(\)", 'before("^ $\|, $")', s) s = re.sub(r"isRealStart \(\)", 'before("^ $")', s) s = re.sub(r"isStart0 \(\)", 'before0("^ $\|, $")', s) s = re.sub(r"isRealStart0 \(\)", 'before0("^ $")', s) s = re.sub(r"isEnd \(\)", 'after("^ $\|^,")', s)
︙
96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 ~~122~~ 123 124 125 126 127 128 129	100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134	+ - +	nState = 4 elif nState == 4: nState = 0 return sUp def countGroupInRegex (sRegex): "returns the number of groups in <sRegex>" try: return re.compile(sRegex).groups except: traceback.print_exc() print(sRegex) return 0 def createRule (s, nIdLine, sLang, bParagraph, dOptPriority): "returns rule as list [option name, regex, bCaseInsensitive, identifier, list of actions]" global dJSREGEXES global nRULEWITHOUTNAME sLineId = str(nIdLine) + ("p" if bParagraph else "s") sRuleId = sLineId #### GRAPH CALL if s.startswith("@@@@"): if bParagraph: ~~print("Error. Graph call can’t be made only after the first pass (sentence by sentence)")~~ print("Error. Graph call can be made only after the first pass (sentence by sentence)") exit() return ["@@@@", s[4:], sLineId] #### OPTIONS sOption = False # False or [a-z0-9]+ name nPriority = 4 # Default is 4, value must be between 0 and 9 tGroups = None # code for groups positioning (only useful for JavaScript)
︙
209 210 211 212 213 214 215 ~~216~~ 217 218 219 220 221 222 ~~223~~ 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 ~~242 243~~ 244 245 246 247 248 249 250 251 252 253 ~~254~~ 255 256 257 258 259 260 261	214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264	- + - + - - - +	sRegex = sRegex.replace("(?i)", "") sRegex = uppercase(sRegex, sLang) else: print("# Unknown case mode [" + cCaseMode + "] at line " + sLineId) ## check regex try: ~~~~z =~~ re.compile(sRegex)~~ re.compile(sRegex) except: print("# Regex error at line ", nIdLine) print(sRegex) traceback.print_exc() return None ## groups in non grouping parenthesis ~~for x in re.finditer("\(\?:[^)]\([[\w -]", sRegex):~~ for x in re.finditer(r"\(\?:[^)]\([[\w -]", sRegex): print("# Warning: groups inside non grouping parenthesis in regex at line " + sLineId) #### PARSE ACTIONS lActions = [] nAction = 1 for sAction in s.split(" <<- "): t = createAction(sRuleId + "_" + str(nAction), sAction, nGroup) nAction += 1 if t: lActions.append(t) if not lActions: return None return [sOption, sRegex, bCaseInsensitive, sLineId, sRuleId, nPriority, lActions, tGroups] def createAction (sIdAction, sAction, nGroup): "returns an action to perform as a tuple (condition, action type, action[, iGroup [, message, URL ]])" ~~global lFUNCTIONS~~ m = re.search(r"([-~=>])(\d*\|)>>", sAction) if not m: print("# No action at line " + sIdAction) return None #### CONDITION sCondition = sAction[:m.start()].strip() if sCondition: sCondition = prepareFunction(sCondition) lFUNCTIONS.append(("_c_"+sIdAction, sCondition)) ~~for x in re.finditer("[.](?:group\|start\|end)[(](\d+)[)]", sCondition):~~ for x in re.finditer(r"[.](?:group\|start\|end)[(](\d+)[)]", sCondition): if int(x.group(1)) > nGroup: print("# Error in groups in condition at line " + sIdAction + " ("+str(nGroup)+" 
groups only)") if ".match" in sCondition: print("# Error. JS compatibility. Don't use .match() in condition, use .search()") sCondition = "_c_"+sIdAction else: sCondition = None
︙
282 283 284 285 286 287 288 ~~289~~ 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 ~~305~~ 306 307 308 309 310 311 312	285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315	- + - +	mURL = re.search("[\|] (https?://.)", sMsg) if mURL: sURL = mURL.group(1).strip() sMsg = sMsg[:mURL.start(0)].strip() if sMsg[0:1] == "=": sMsg = prepareFunction(sMsg[1:]) lFUNCTIONS.append(("_m_"+sIdAction, sMsg)) ~~for x in re.finditer("group[(](\d+)[)]", sMsg):~~ for x in re.finditer(r"group[(](\d+)[)]", sMsg): if int(x.group(1)) > nGroup: print("# Error in groups in message at line " + sIdAction + " ("+str(nGroup)+" groups only)") sMsg = "=_m_"+sIdAction else: for x in re.finditer(r"\\(\d+)", sMsg): if int(x.group(1)) > nGroup: print("# Error in groups in message at line " + sIdAction + " ("+str(nGroup)+" groups only)") if re.search("[.]\\w+[(]", sMsg): print("# Error in message at line " + sIdAction + ": This message looks like code. Line should begin with =") if sAction[0:1] == "=" or cAction == "=": if "define" in sAction and not re.search(r"define\(\\\d+ , \[.\] \)", sAction): print("# Error in action at line " + sIdAction + ": second argument for define must be a list of strings") sAction = prepareFunction(sAction) sAction = sAction.replace("m.group(i[4])", "m.group("+str(iGroup)+")") ~~for x in re.finditer("group[(](\d+)[)]", sAction):~~ for x in re.finditer(r"group[(](\d+)[)]", sAction): if int(x.group(1)) > nGroup: print("# Error in groups in replacement at line " + sIdAction + " ("+str(nGroup)+" groups only)") else: for x in re.finditer(r"\\(\d+)", sAction): if int(x.group(1)) > nGroup: print("# Error in groups in replacement at line " + sIdAction + " ("+str(nGroup)+" groups only)") if re.search("[.]\\w+[(]\|sugg\\w+[(]", sAction):
︙
348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370	351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375	+ +	return [sCondition, cAction, ""] else: print("# Unknown action at line " + sIdAction) return None def _calcRulesStats (lRules): "count rules and actions" d = {'=':0, '~': 0, '-': 0, '>': 0} for aRule in lRules: if aRule[0] != "@@@@": for aAction in aRule[6]: d[aAction[1]] = d[aAction[1]] + 1 return (d, len(lRules)) def displayStats (lParagraphRules, lSentenceRules): "display rules numbers" print(" {:>18} {:>18} {:>18} {:>18}".format("DISAMBIGUATOR", "TEXT PROCESSOR", "GRAMMAR CHECKING", "REGEX")) d, nRule = _calcRulesStats(lParagraphRules) print("§ {:>10} actions {:>10} actions {:>10} actions in {:>8} rules".format(d['='], d['~'], d['-'], nRule)) d, nRule = _calcRulesStats(lSentenceRules) print("s {:>10} actions {:>10} actions {:>10} actions in {:>8} rules".format(d['='], d['~'], d['-'], nRule))
︙
399 400 401 402 403 404 405 ~~406~~ 407 408 409 410 411 412 413	404 405 406 407 408 409 410 411 412 413 414 415 416 417 418	- +	m = re.match("OPTGROUP/([a-z0-9]+):(.+)$", sLine) lStructOpt.append( (m.group(1), list(map(str.split, m.group(2).split(",")))) ) elif sLine.startswith("OPTSOFTWARE:"): lOpt = [ [s, {}] for s in sLine[12:].strip().split() ] # don’t use tuples (s, {}), because unknown to JS elif sLine.startswith("OPT/"): m = re.match("OPT/([a-z0-9]+):(.+)$", sLine) for i, sOpt in enumerate(m.group(2).split()): ~~lOpt[i][1][m.group(1)] = eval(sOpt)~~ lOpt[i][1][m.group(1)] = eval(sOpt) elif sLine.startswith("OPTPRIORITY/"): m = re.match("OPTPRIORITY/([a-z0-9]+): *([0-9])$", sLine) dOptPriority[m.group(1)] = int(m.group(2)) elif sLine.startswith("OPTLANG/"): m = re.match("OPTLANG/([a-z][a-z](?:_[A-Z][A-Z]\|)):(.+)$", sLine) sLang = m.group(1)[:2] dOptLabel[sLang] = { "__optiontitle__": m.group(2).strip() }
︙
423 424 425 426 427 428 429 430 431 432 433 434 435 436	428 429 430 431 432 433 434 435 436 437 438 439 440 441 442	+	print(" options defined for: " + ", ".join([ t[0] for t in lOpt ])) dOptions = { "lStructOpt": lStructOpt, "dOptLabel": dOptLabel, "sDefaultUILang": sDefaultUILang } dOptions.update({ "dOpt"+k: v for k, v in lOpt }) return dOptions, dOptPriority def printBookmark (nLevel, sComment, nLine): "print bookmark within the rules file" print(" {:>6}: {}".format(nLine, " " * nLevel + sComment)) def make (spLang, sLang, bJavaScript): "compile rules, returns a dictionary of values" # for clarity purpose, don’t create any file here
︙
569 570 571 572 573 574 575 ~~576 577 578 579 580 581 582 583 584~~ 585 586 ~~587 588~~ 589 ~~590~~	575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596	- - - - - - - - - + + + + + + + + + - - + + - +	sJSCallables += " },\n" sJSCallables += "}\n" displayStats(lParagraphRules, lSentenceRules) print("Unnamed rules: " + str(nRULEWITHOUTNAME)) d ~~= {~~ "callables": sPyCallables, "callablesJS": sJSCallables, "gctests": sGCTests, "gctestsJS": sGCTestsJS, "paragraph_rules": mergeRulesByOption(lParagraphRules), "sentence_rules": mergeRulesByOption(lSentenceRules), "paragraph_rules_JS": jsconv.writeRulesToJSArray(mergeRulesByOption(lParagraphRulesJS)), "sentence_rules_JS": jsconv.writeRulesToJSArray(mergeRulesByOption(lSentenceRulesJS)) } d.update(dOptions) dVars = { "callables": sPyCallables, "callablesJS": sJSCallables, "gctests": sGCTests, "gctestsJS": sGCTestsJS, "paragraph_rules": mergeRulesByOption(lParagraphRules), "sentence_rules": mergeRulesByOption(lSentenceRules), "paragraph_rules_JS": jsconv.writeRulesToJSArray(mergeRulesByOption(lParagraphRulesJS)), "sentence_rules_JS": jsconv.writeRulesToJSArray(mergeRulesByOption(lSentenceRulesJS)) } dVars.update(dOptions) # compile graph rules ~~d2 = crg.make(lGraphRule, dDEF, sLang, bJavaScript) d.update(d2)~~ dVars2 = crg.make(lGraphRule, dDEF, sLang, bJavaScript) dVars.update(dVars2) ~~return d~~ return dVars



1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21	1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25	+ + - + + +	""" Grammalecte: compile rules ~~# Create a Direct Acyclic Rule Graph (DARG)~~ Create a Direct Acyclic Rule Graphs (DARGs) """ import re import traceback import json import darg dACTIONS = {} dFUNCTIONS = {} def prepareFunction (s, bTokenValue=False): "convert simple rule syntax to a string of Python code" s = s.replace("__also__", "bCondMemo") s = s.replace("__else__", "not bCondMemo") s = re.sub(r"(morph\|analyse\|displayInfo)[(]\\(\d+)", 'g_\\1(lToken[\\2+nTokenOffset]', s) s = re.sub(r"(select\|exclude\|define)[(][\\](\d+)", 'g_\\1(lToken[\\2+nTokenOffset], dTags', s) s = re.sub(r"(tag_before\|tag_after)[(][\\](\d+)", 'g_\\1(lToken[\\2+nTokenOffset], dTags', s) s = re.sub(r"(switchGender\|has(?:Mas\|Fem)Form)[(]\\(\d+)", '\\1(lToken[\\2+nTokenOffset]["sValue"]', s) s = re.sub(r"(morph\|analyse)\(>1", 'g_\\1(lToken[nLastToken+1]', s) # next token
︙
35 36 37 38 39 40 41 42 43 44 45 46 47 48 49	39 40 41 42 43 44 45 46 47 48 49 50 51 52 53	- +	return s def genTokenLines (sTokenLine, dDef): "tokenize a string and return a list of lines of tokens" lToken = sTokenLine.split() lTokenLines = None ~~for i, sToken in ~~enumerate(~~lToken):~~ for sToken in lToken: # optional token? bNullPossible = sToken.startswith("?") and sToken.endswith("¿") if bNullPossible: sToken = sToken[1:-1] # token with definition? if sToken.startswith("({") and sToken.endswith("})") and sToken[1:-1] in dDef: sToken = "(" + dDef[sToken[1:-1]] + ")"
︙
92 93 94 95 96 97 98 99 100 101 102 103 104 105	96 97 98 99 100 101 102 103 104 105 106 107 108 109 110	+	for aRule in lTokenLines: aRule.append(sToken) for aRule in lTokenLines: yield aRule def createRule (iLine, sRuleName, sTokenLine, iActionBlock, sActions, nPriority, dDef): "generator: create rule as list" # print(iLine, "//", sRuleName, "//", sTokenLine, "//", sActions, "//", nPriority) for lToken in genTokenLines(sTokenLine, dDef): # Calculate positions dPos = {} # key: iGroup, value: iToken iGroup = 0 for i, sToken in enumerate(lToken): if sToken.startswith("(") and sToken.endswith(")"):
︙
117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149	122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158	+ + + +	dACTIONS[sActionId] = aAction lResult = list(lToken) lResult.extend(["##"+str(iLine), sActionId]) yield lResult def changeReferenceToken (sText, dPos): "change group reference in <sText> with values in <dPos>" for i in range(len(dPos), 0, -1): sText = sText.replace("\\"+str(i), "\\"+str(dPos[i])) return sText def checkTokenNumbers (sText, sActionId, nToken): "check if token references in <sText> greater than <nToken> (debugging)" for x in re.finditer(r"\\(\d+)", sText): if int(x.group(1)) > nToken: print("# Error in token index at line " + sActionId + " ("+str(nToken)+" tokens only)") print(sText) def checkIfThereIsCode (sText, sActionId): "check if there is code in <sText> (debugging)" if re.search("[.]\\w+[(]\|sugg\\w+[(]\|\\([0-9]\|\\[[0-9]", sText): print("# Warning at line " + sActionId + ": This message looks like code. Line should probably begin with =") print(sText) def createAction (sActionId, sAction, nPriority, nToken, dPos): "create action rule as a list" # Option sOption = False m = re.match("/(\\w+)/", sAction) if m: sOption = m.group(1) sAction = sAction[m.end():].strip() # valid action?
︙
365 366 367 368 369 370 371 ~~372~~ 373 374 375 376 ~~377~~	374 375 376 377 378 379 380 381 382 383 384 385	- + -	print("\nActions:") for sActionName, aAction in dACTIONS.items(): print(sActionName, aAction) print("\nFunctions:") print(sPyCallables) # Result ~~~~d =~~ {~~ return { "graph_callables": sPyCallables, "rules_graphs": dAllGraph, "rules_actions": dACTIONS } ~~return d~~


1 2 3 4 5 6 7 8	1 2 3 4 5 6 7 8 9 10	+ - + +	""" ~~# Convert Python code to JavaScript code~~ Convert Python code and regexes to JavaScript code """ import copy import re import json def py2js (sCode):
︙
114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144	116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148	+ +	sRegex = sRegex + "i" if not lNegLookBeforeRegex: lNegLookBeforeRegex = None return (sRegex, lNegLookBeforeRegex) def pyRuleToJS (lRule, dJSREGEXES, sWORDLIMITLEFT): "modify Python rules -> JS rules" lRuleJS = copy.deepcopy(lRule) # graph rules if lRuleJS[0] == "@@@@": return lRuleJS del lRule[-1] # tGroups positioning codes are useless for Python # error messages for aAction in lRuleJS[6]: if aAction[1] == "-": aAction[2] = aAction[2].replace(" ", " ") # nbsp --> nnbsp aAction[4] = aAction[4].replace("« ", "« ").replace(" »", " »").replace(" :", " :").replace(" :", " :") # js regexes lRuleJS[1], lNegLookBehindRegex = regex2js(dJSREGEXES.get(lRuleJS[3], lRuleJS[1]), sWORDLIMITLEFT) lRuleJS.append(lNegLookBehindRegex) return lRuleJS def writeRulesToJSArray (lRules): "create rules as a string of arrays (to be bundled in a JSON string)" sArray = "[\n" for sOption, aRuleGroup in lRules: if sOption != "@@@@": sArray += ' ["' + sOption + '", [\n' if sOption else " [false, [\n" for sRegex, bCaseInsensitive, sLineId, sRuleId, nPriority, lActions, aGroups, aNegLookBehindRegex in aRuleGroup: sArray += ' [' + sRegex + ", " sArray += "true, " if bCaseInsensitive else "false, "
︙
155 156 157 158 159 160 161 162 163 164 165	159 160 161 162 163 164 165 166 167 168 169 170	+	sArray += ' ["' + sGraphName + '", "' + sLineId + '"],\n"' sArray += " ]],\n" sArray += "]" return sArray def groupsPositioningCodeToList (sGroupsPositioningCode): "convert <sGroupsPositioningCode> to a list of codes (numbers or strings)" if not sGroupsPositioningCode: return None return [ int(sCode) if sCode.isdigit() or (sCode[0:1] == "-" and sCode[1:].isdigit()) else sCode \ for sCode in sGroupsPositioningCode.split(",") ]

Grammalecte Check-in [18191569f4]