87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
|
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
|
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
-
+
-
-
-
-
-
+
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
-
+
+
|
def _loadRules ():
    """Load the rules module and compile the regex of every regex rule in place.

    Stores the imported gc_rules module in the module-level _rules, then walks
    the (option, rule group) pairs of its lParagraphRules and lSentenceRules
    and replaces item 0 of each regex rule with its compiled pattern.
    """
    from . import gc_rules
    global _rules
    _rules = gc_rules
    # compile rules regex
    for sOption, lRuleGroup in chain(_rules.lParagraphRules, _rules.lSentenceRules):
        if sOption == "@@@@":
            # graph-rule group: its entries are not regex rules (_proofread reads
            # them as (graph name, line id) pairs), so there is nothing to compile
            continue
        for aRule in lRuleGroup:
            try:
                aRule[0] = re.compile(aRule[0])
            except Exception:
                # best effort: report the faulty rule and keep loading the others
                echo("Bad regular expression in # " + str(aRule[2]))
                # the slot must stay a compiled pattern: _proofread calls
                # .finditer() on it, which a plain string does not provide
                aRule[0] = re.compile("(?i)<Grammalecte>")
#### Parsing
def parse (sText, sCountry="${country_default}", bDebug=False, dOptions=None, bContext=False):
    """Analyse the paragraph sText and return the errors found.

    The paragraph is first checked as a whole, then sentence by sentence.

    Parameters:
        sText     paragraph to analyse
        sCountry  forwarded unchanged to _proofread
        bDebug    forwarded unchanged to _proofread
        dOptions  options to use; when None or empty, the module-level
                  _dOptions is used instead
        bContext  forwarded unchanged to _proofread

    Returns the values view of the dictionary in which the errors reported
    by _proofread have been collected.
    """
    #sText = unicodedata.normalize("NFC", sText)
    sRealText = sText   # untouched copy, passed along with the modified text
    dDA = {}            # Disambiguator. Key = position; value = list of morphologies
    dPriority = {}      # Key = position; value = priority
    dOpt = dOptions if dOptions else _dOptions
    bShowRuleId = option('idrule')

    # parse paragraph (no sentence object at this level)
    sNew, aErrors = _proofread(None, sText, sRealText, 0, True, dDA, dPriority, sCountry, dOpt, bShowRuleId, bDebug, bContext)
    if sNew:
        sText = sNew

    # cleanup
    # Every replacement swaps one character for one character, so offsets in
    # sText remain valid offsets in sRealText.
    if "\u00a0" in sText:
        sText = sText.replace("\u00a0", ' ') # nbsp
    if "\u202f" in sText:
        sText = sText.replace("\u202f", ' ') # nnbsp
    if "'" in sText:
        sText = sText.replace("'", "’")
    if "\u2011" in sText:
        sText = sText.replace("\u2011", "-") # nobreakdash

    # parse sentences
    for iStart, iEnd in _getSentenceBoundaries(sText):
        # only spans longer than 4 and shorter than 2000 characters are checked
        if 4 < (iEnd - iStart) < 2000:
            dDA.clear()
            sSentence = sText[iStart:iEnd]
            sRealSentence = sRealText[iStart:iEnd]
            oSentence = TokenSentence(sSentence, sRealSentence, iStart)
            # _proofread runs the regex rules and, through oSentence, the graph rules
            _, errs = _proofread(oSentence, sSentence, sRealSentence, iStart, False, dDA, dPriority, sCountry, dOpt, bShowRuleId, bDebug, bContext)
            aErrors.update(errs)
    return aErrors.values() # this is a view (iterable)
_zEndOfSentence = re.compile(r'([.?!:;…][ .?!… »”")]*|.$)')
_zBeginOfParagraph = re.compile(r"^\W*")
_zEndOfParagraph = re.compile(r"\W*$")
def _getSentenceBoundaries (sText):
iStart = _zBeginOfParagraph.match(sText).end()
for m in _zEndOfSentence.finditer(sText):
yield (iStart, m.end())
iStart = m.end()
def _proofread (s, sx, nOffset, bParagraph, dDA, dPriority, sCountry, dOptions, bShowRuleId, bDebug, bContext):
def _proofread (oSentence, s, sx, nOffset, bParagraph, dDA, dPriority, sCountry, dOptions, bShowRuleId, bDebug, bContext):
dErrs = {}
bChange = False
for sOption, lRuleGroup in _getRules(bParagraph):
if sOption == "@@@@":
# graph rules
for sGraphName, sLineId in lRuleGroup:
if bDebug:
print(sGraphName, sLineId)
bChange, errs = oSentence.parse(dAllGraph[sGraphName], dPriority, sCountry, dOptions, bShowRuleId, bDebug, bContext)
dErrs.update(errs)
if bChange:
oSentence.rewrite()
if bDebug:
print("~", oSentence.sSentence)
if not sOption or dOptions.get(sOption, False):
elif not sOption or dOptions.get(sOption, False):
# regex rules
for zRegex, bUppercase, sLineId, sRuleId, nPriority, lActions in lRuleGroup:
if sRuleId not in _aIgnoredRules:
for m in zRegex.finditer(s):
bCondMemo = None
for sFuncCond, cActionType, sWhat, *eAct in lActions:
# action in lActions: [ condition, action type, replacement/suggestion/action[, iGroup[, message, URL]] ]
try:
|
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
|
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
|
+
-
-
-
+
+
+
|
if sFilter:
try:
zFilter = re.compile(sFilter)
except:
echo("# Error. List rules: wrong regex.")
sFilter = None
for sOption, lRuleGroup in chain(_getRules(True), _getRules(False)):
if sOption != "@@@@":
for _, _, sLineId, sRuleId, _, _ in lRuleGroup:
if not sFilter or zFilter.search(sRuleId):
yield (sOption, sLineId, sRuleId)
for _, _, sLineId, sRuleId, _, _ in lRuleGroup:
if not sFilter or zFilter.search(sRuleId):
yield (sOption, sLineId, sRuleId)
def displayRules (sFilter=None):
    """Echo the rules yielded by listRules(sFilter), one per line.

    A header line showing the filter is echoed first; each following line
    shows the option, the line id and the rule id, the first two left-aligned
    in columns of width 10.
    """
    echo("List of rules. Filter: << " + str(sFilter) + " >>")
    for sOption, sLineId, sRuleId in listRules(sFilter):
        echo("{:<10} {:<10} {}".format(sOption, sLineId, sRuleId))
|