1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
|
-
-
+
+
-
+
-
+
-
-
+
+
|
import re
import sys
import traceback
import copy
import json
from distutils import file_util
# Module-level state filled while the rules file is compiled.
# NOTE(review): each container below appears twice under two names
# (DEF/dDEF, FUNCTIONS/lFUNCTIONS, RULESET/aRULESET, JSREGEXES/dJSREGEXES,
# WORDLIMIT*/sWORDLIMIT*) and both are written in parallel in the visible
# code -- this looks like an old/new naming pair; confirm which is current.

# Definitions from "DEF:" lines: key is "{name}", value is the replacement text.
DEF = {}
# (function name, code) tuples collected for conditions, messages and actions.
FUNCTIONS = []
dDEF = {}
lFUNCTIONS = []
RULESET = set() # set of rule ids, used to detect several rules sharing the same id
aRULESET = set() # set of rule ids, used to detect several rules sharing the same id
# JavaScript-specific regexes ("<js>...</js>" parts), keyed by rule line id.
JSREGEXES = {}
dJSREGEXES = {}
# Lookaround assertions prepended/appended to a rule regex when the rule
# requests word limits ('[' on the left, ']' on the right).
WORDLIMITLEFT = r"(?<![\w.,–-])" # r"(?<![-.,—])\b" seems slower
WORDLIMITRIGHT = r"(?![\w–-])" # r"\b(?!-—)" seems slower
sWORDLIMITLEFT = r"(?<![\w.,–-])" # r"(?<![-.,—])\b" seems slower
sWORDLIMITRIGHT = r"(?![\w–-])" # r"\b(?!-—)" seems slower
def prepareFunction (s):
s = s.replace("__also__", "bCondMemo")
s = s.replace("__else__", "not bCondMemo")
s = re.sub(r"(select|exclude)[(][\\](\d+)", '\\1(dDA, m.start(\\2), m.group(\\2)', s)
s = re.sub(r"define[(][\\](\d+)", 'define(dDA, m.start(\\1)', s)
|
︙ | | |
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
|
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
|
-
+
-
+
-
+
|
traceback.print_exc()
print(sRegex)
return 0
def createRule (s, nIdLine, sLang, bParagraph, dOptPriority):
"returns rule as list [option name, regex, bCaseInsensitive, identifier, list of actions]"
global JSREGEXES
global dJSREGEXES
#### OPTIONS
sLineId = str(nIdLine) + ("p" if bParagraph else "s")
sRuleId = sLineId
sOption = False # False or [a-z0-9]+ name
nPriority = 4 # Default is 4, value must be between 0 and 9
tGroups = None # code for groups positioning (only useful for JavaScript)
cCaseMode = 'i' # i: case insensitive, s: case sensitive, u: uppercasing allowed
cWordLimitLeft = '[' # [: word limit, <: no specific limit
cWordLimitRight = ']' # ]: word limit, >: no specific limit
m = re.match("^__(?P<borders_and_case>[[<]\\w[]>])(?P<option>/[a-zA-Z0-9]+|)(?P<ruleid>\\(\\w+\\)|)(?P<priority>![0-9]|)__ *", s)
if m:
cWordLimitLeft = m.group('borders_and_case')[0]
cCaseMode = m.group('borders_and_case')[1]
cWordLimitRight = m.group('borders_and_case')[2]
sOption = m.group('option')[1:] if m.group('option') else False
if m.group('ruleid'):
sRuleId = m.group('ruleid')[1:-1]
if sRuleId in RULESET:
if sRuleId in aRULESET:
print("# Error. Several rules have the same id: " + sRuleId)
exit()
RULESET.add(sRuleId)
aRULESET.add(sRuleId)
nPriority = dOptPriority.get(sOption, 4)
if m.group('priority'):
nPriority = int(m.group('priority')[1:])
s = s[m.end(0):]
else:
print("# Warning. No option defined at line: " + sLineId)
|
︙ | | |
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
|
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
|
-
+
-
+
-
+
-
+
|
m = re.search("@@\\S+", sRegex)
if m:
tGroups = groupsPositioningCodeToList(sRegex[m.start()+2:])
sRegex = sRegex[:m.start()].strip()
# JS regex
m = re.search("<js>.+</js>i?", sRegex)
if m:
JSREGEXES[sLineId] = m.group(0)
dJSREGEXES[sLineId] = m.group(0)
sRegex = sRegex[:m.start()].strip()
if "<js>" in sRegex or "</js>" in sRegex:
print("# Error: JavaScript regex not delimited at line " + sLineId)
return None
# quotes ?
if sRegex.startswith('"') and sRegex.endswith('"'):
sRegex = sRegex[1:-1]
## definitions
for sDef, sRepl in DEF.items():
for sDef, sRepl in dDEF.items():
sRegex = sRegex.replace(sDef, sRepl)
## count number of groups (must be done before modifying the regex)
nGroup = countGroupInRegex(sRegex)
if nGroup > 0:
if not tGroups:
print("# Warning: groups positioning code for JavaScript should be defined at line " + sLineId)
else:
if nGroup != len(tGroups):
print("# Error: groups positioning code irrelevant at line " + sLineId)
## word limit
if cWordLimitLeft == '[' and not sRegex.startswith(("^", '’', "'", ",")):
sRegex = WORDLIMITLEFT + sRegex
sRegex = sWORDLIMITLEFT + sRegex
if cWordLimitRight == ']' and not sRegex.endswith(("$", '’', "'", ",")):
sRegex = sRegex + WORDLIMITRIGHT
sRegex = sRegex + sWORDLIMITRIGHT
## casing mode
if cCaseMode == "i":
bCaseInsensitive = True
if not sRegex.startswith("(?i)"):
sRegex = "(?i)" + sRegex
elif cCaseMode == "s":
|
︙ | | |
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
|
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
|
-
+
-
+
|
return None
return [sOption, sRegex, bCaseInsensitive, sLineId, sRuleId, nPriority, lActions, tGroups]
def createAction (sIdAction, sAction, nGroup):
"returns an action to perform as a tuple (condition, action type, action[, iGroup [, message, URL ]])"
global FUNCTIONS
global lFUNCTIONS
m = re.search(r"([-~=>])(\d*|)>>", sAction)
if not m:
print("# No action at line " + sIdAction)
return None
#### CONDITION
sCondition = sAction[:m.start()].strip()
if sCondition:
sCondition = prepareFunction(sCondition)
FUNCTIONS.append(("c_"+sIdAction, sCondition))
lFUNCTIONS.append(("c_"+sIdAction, sCondition))
for x in re.finditer("[.](?:group|start|end)[(](\d+)[)]", sCondition):
if int(x.group(1)) > nGroup:
print("# Error in groups in condition at line " + sIdAction + " ("+str(nGroup)+" groups only)")
if ".match" in sCondition:
print("# Error. JS compatibility. Don't use .match() in condition, use .search()")
sCondition = "c_"+sIdAction
else:
|
︙ | | |
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
|
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
|
-
+
|
sURL = ""
mURL = re.search("[|] *(https?://.*)", sMsg)
if mURL:
sURL = mURL.group(1).strip()
sMsg = sMsg[:mURL.start(0)].strip()
if sMsg[0:1] == "=":
sMsg = prepareFunction(sMsg[1:])
FUNCTIONS.append(("m_"+sIdAction, sMsg))
lFUNCTIONS.append(("m_"+sIdAction, sMsg))
for x in re.finditer("group[(](\d+)[)]", sMsg):
if int(x.group(1)) > nGroup:
print("# Error in groups in message at line " + sIdAction + " ("+str(nGroup)+" groups only)")
sMsg = "=m_"+sIdAction
else:
for x in re.finditer(r"\\(\d+)", sMsg):
if int(x.group(1)) > nGroup:
|
︙ | | |
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
|
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
|
-
+
-
+
-
+
|
print("# Error in action at line " + sIdAction + ": This action looks like code. Line should begin with =")
if cAction == "-":
## error detected --> suggestion
if not sAction:
print("# Error in action at line " + sIdAction + ": This action is empty.")
if sAction[0:1] == "=":
FUNCTIONS.append(("s_"+sIdAction, sAction[1:]))
lFUNCTIONS.append(("s_"+sIdAction, sAction[1:]))
sAction = "=s_"+sIdAction
elif sAction.startswith('"') and sAction.endswith('"'):
sAction = sAction[1:-1]
if not sMsg:
print("# Error in action at line " + sIdAction + ": the message is empty.")
return [sCondition, cAction, sAction, iGroup, sMsg, sURL]
elif cAction == "~":
## text processor
if not sAction:
print("# Error in action at line " + sIdAction + ": This action is empty.")
if sAction[0:1] == "=":
FUNCTIONS.append(("p_"+sIdAction, sAction[1:]))
lFUNCTIONS.append(("p_"+sIdAction, sAction[1:]))
sAction = "=p_"+sIdAction
elif sAction.startswith('"') and sAction.endswith('"'):
sAction = sAction[1:-1]
return [sCondition, cAction, sAction, iGroup]
elif cAction == "=":
## disambiguator
if sAction[0:1] == "=":
sAction = sAction[1:]
if not sAction:
print("# Error in action at line " + sIdAction + ": This action is empty.")
FUNCTIONS.append(("d_"+sIdAction, sAction))
lFUNCTIONS.append(("d_"+sIdAction, sAction))
sAction = "d_"+sIdAction
return [sCondition, cAction, sAction]
elif cAction == ">":
## no action, break loop if condition is False
return [sCondition, cAction, ""]
else:
print("# Unknown action at line " + sIdAction)
|
︙ | | |
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
|
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
|
-
-
+
+
|
# Ā-ʯ 0100-02AF (mixed)
# -> a-zA-Zà-öÀ-Ö0-9_ø-ÿØ-ßĀ-ʯ
bCaseInsensitive = False
if "(?i)" in sRegex:
sRegex = sRegex.replace("(?i)", "")
bCaseInsensitive = True
lNegLookBeforeRegex = []
if WORDLIMITLEFT in sRegex:
sRegex = sRegex.replace(WORDLIMITLEFT, "")
if sWORDLIMITLEFT in sRegex:
sRegex = sRegex.replace(sWORDLIMITLEFT, "")
lNegLookBeforeRegex = ["[a-zA-Zà-öÀ-Ö0-9_ø-ÿØ-ßĀ-ʯ.,–-]$"]
sRegex = sRegex.replace("[\\w", "[a-zA-Zà-öÀ-Ö0-9_ø-ÿØ-ßĀ-ʯ")
sRegex = sRegex.replace("\\w", "[a-zA-Zà-öÀ-Ö0-9_ø-ÿØ-ßĀ-ʯ]")
sRegex = sRegex.replace("[.]", r"\.")
if not sRegex.startswith("<js>"):
sRegex = sRegex.replace("/", r"\/")
m = re.search(r"\(\?<!([^()]+)\)", sRegex) # Negative lookbefore assertion should always be at the beginning of regex
|
︙ | | |
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
|
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
|
-
+
|
lRuleJS = copy.deepcopy(lRule)
del lRule[-1] # tGroups positioning codes are useless for Python
# error messages
for aAction in lRuleJS[6]:
if aAction[1] == "-":
aAction[4] = aAction[4].replace("« ", "« ").replace(" »", " »")
# js regexes
lRuleJS[1], lNegLookBehindRegex = regex2js( JSREGEXES.get(lRuleJS[3], lRuleJS[1]) )
lRuleJS[1], lNegLookBehindRegex = regex2js( dJSREGEXES.get(lRuleJS[3], lRuleJS[1]) )
lRuleJS.append(lNegLookBehindRegex)
return lRuleJS
def writeRulesToJSArray (lRules):
sArray = "[\n"
for sOption, aRuleGroup in lRules:
|
︙ | | |
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
|
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
|
-
+
-
+
|
def make (lRules, sLang, bJavaScript):
"compile rules, returns a dictionary of values"
# for clarity purpose, don’t create any file here
# removing comments, zeroing empty lines, creating definitions, storing tests, merging rule lines
print(" parsing rules...")
global DEF
global dDEF
lLine = []
lRuleLine = []
lTest = []
lOpt = []
for i, sLine in enumerate(lRules, 1):
if sLine.startswith('#END'):
break
elif sLine.startswith("#"):
pass
elif sLine.startswith("DEF:"):
m = re.match("DEF: +([a-zA-Z_][a-zA-Z_0-9]*) +(.+)$", sLine.strip())
if m:
DEF["{"+m.group(1)+"}"] = m.group(2)
dDEF["{"+m.group(1)+"}"] = m.group(2)
else:
print("Error in definition: ", end="")
print(sLine.strip())
elif sLine.startswith("TEST:"):
lTest.append("{:<8}".format(i) + " " + sLine[5:].strip())
elif sLine.startswith("TODO:"):
pass
|
︙ | | |
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
|
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
|
-
+
|
lSentenceRules.append(aRule)
lSentenceRulesJS.append(pyRuleToJS(aRule))
# creating file with all functions callable by rules
print(" creating callables...")
sPyCallables = "# generated code, do not edit\n"
sJSCallables = "// generated code, do not edit\nconst oEvalFunc = {\n"
for sFuncName, sReturn in FUNCTIONS:
for sFuncName, sReturn in lFUNCTIONS:
cType = sFuncName[0:1]
if cType == "c": # condition
sParams = "s, sx, m, dDA, sCountry, bCondMemo"
elif cType == "m": # message
sParams = "s, m"
elif cType == "s": # suggestion
sParams = "s, m"
|
︙ | | |