@@ -190,109 +190,90 @@
"set options to default values"
global _dOptions
_dOptions = getDefaultOptions()
#### Parsing
def parse (sText, sCountry="${country_default}", bDebug=False, dOptions=None, bContext=False):
"init point to analyze a text"
def parse (sText, sCountry="${country_default}", bDebug=False, dOptions=None, bContext=False, bFullInfo=False):
"init point to analyse <sText> and returns an iterable of errors or (with option <bFullInfo>) a list of sentences with tokens and errors"
oText = TextParser(sText)
return oText.parse(sCountry, bDebug, dOptions, bContext)
return oText.parse(sCountry, bDebug, dOptions, bContext, bFullInfo)
#### TEXT PARSER
class TextParser:
"Text parser"
def __init__ (self, sText):
self.sText = sText
self.sText0 = sText
self.sSentence = ""
self.sSentence0 = ""
self.nOffsetWithinParagraph = 0
self.lToken = []
self.dTokenPos = {}
self.dTags = {}
self.dError = {}
self.dErrorPriority = {} # Key = position; value = priority
self.dTokenPos = {} # {position: token}
self.dTags = {} # {position: tags}
self.dError = {} # {position: error}
self.dSentenceError = {} # {position: error} (for the current sentence only)
self.dErrorPriority = {} # {position: priority of the current error}
def __str__ (self):
s = "===== TEXT =====\n"
s += "sentence: " + self.sSentence0 + "\n"
s += "now: " + self.sSentence + "\n"
for dToken in self.lToken:
s += '#{i}\t{nStart}:{nEnd}\t{sValue}\t{sType}'.format(**dToken)
if "lMorph" in dToken:
s += "\t" + str(dToken["lMorph"])
if "aTags" in dToken:
s += "\t" + str(dToken["aTags"])
s += "\n"
#for nPos, dToken in self.dTokenPos.items():
# s += "{}\t{}\n".format(nPos, dToken)
return s

-    def parse (self, sCountry="${country_default}", bDebug=False, dOptions=None, bContext=False):
-        "analyses sText and returns an iterable of errors"
+    def parse (self, sCountry="${country_default}", bDebug=False, dOptions=None, bContext=False, bFullInfo=False):
+        "analyses <sText> and returns an iterable of errors or (with option <bFullInfo>) a list of sentences with tokens and errors"
         #sText = unicodedata.normalize("NFC", sText)
         dOpt = dOptions or _dOptions
         bShowRuleId = option('idrule')
         # parse paragraph
         try:
             self.parseText(self.sText, self.sText0, True, 0, sCountry, dOpt, bShowRuleId, bDebug, bContext)
         except:
             raise
         # parse sentences
         sText = self._getCleanText()
+        lSentences = []
         for iStart, iEnd in text.getSentenceBoundaries(sText):
             if 4 < (iEnd - iStart) < 2000:
                 try:
                     self.sSentence = sText[iStart:iEnd]
                     self.sSentence0 = self.sText0[iStart:iEnd]
                     self.nOffsetWithinParagraph = iStart
                     self.lToken = list(_oTokenizer.genTokens(self.sSentence, True))
                     self.dTokenPos = { dToken["nStart"]: dToken for dToken in self.lToken if dToken["sType"] != "INFO" }
+                    if bFullInfo:
+                        dSentence = { "nStart": iStart, "nEnd": iEnd, "sSentence": self.sSentence, "lToken": list(self.lToken) }
+                        # the token list is duplicated, as tokens may be deleted during analysis
                     self.parseText(self.sSentence, self.sSentence0, False, iStart, sCountry, dOpt, bShowRuleId, bDebug, bContext)
+                    if bFullInfo:
+                        dSentence["aGrammarErrors"] = list(self.dSentenceError.values())
+                        lSentences.append(dSentence)
+                        self.dSentenceError.clear()
                 except:
                     raise
-        return self.dError.values() # this is a view (iterable)
-
-    def parseAndGetSentences (self, sCountry="${country_default}", bDebug=False):
-        "analyses sText and returns a list of sentences with their tokens"
-        #sText = unicodedata.normalize("NFC", sText)
-        # parse paragraph
-        try:
-            self.parseText(self.sText, self.sText0, True, 0, sCountry, dOptions, bShowRuleId, bDebug, bContext)
-        except:
-            raise
-        # parse sentences
-        sText = self._getCleanText()
-        lSentence = []
-        i = 0
-        for iStart, iEnd in text.getSentenceBoundaries(sText):
-            try:
-                self.sSentence = sText[iStart:iEnd]
-                self.sSentence0 = self.sText0[iStart:iEnd]
-                self.nOffsetWithinParagraph = iStart
-                self.lToken = list(_oTokenizer.genTokens(self.sSentence, True))
-                self.dTokenPos = { dToken["nStart"]: dToken for dToken in self.lToken if dToken["sType"] != "INFO" }
-                i += 1
-                lSentence.append({
-                    "i": i,
-                    "iStart": iStart,
-                    "iEnd": iEnd,
-                    "sSentence": self.sSentence,
-                    "sSentence0": self.sSentence0,
-                    "lToken": list(lToken) # this is a copy
-                })
-                self.parseText(self.sSentence, self.sSentence0, False, iStart, sCountry, dOptions, False, False, False)
-            except:
-                raise
-        return lSentence
+        if bFullInfo:
+            # Grammar checking and sentence analysis
+            return lSentences
+        else:
+            # Grammar checking only
+            return self.dError.values() # this is a view (iterable)

     def _getCleanText (self):
         sText = self.sText
         if " " in sText:
             sText = sText.replace(" ", ' ') # nbsp
         if " " in sText:
             sText = sText.replace(" ", ' ') # nnbsp
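The practical effect of this hunk: the single parse() entry point now serves both plain grammar checking and full sentence analysis, selected by the new bFullInfo flag, and the separate parseAndGetSentences() method disappears. A minimal usage sketch follows; the module name gc_engine and the load() initialisation step are assumptions about the host setup, while the parse() signature and the per-sentence keys (nStart, nEnd, sSentence, lToken, aGrammarErrors) come from the hunk above.

    # illustrative sketch, not part of the changeset
    import gc_engine                       # assumed import path of the module shown above
    gc_engine.load()                       # assumed initialisation step; adapt to the host application

    sText = "Quelle belle maisons !"

    # grammar checking only (default behaviour): an iterable of error dictionaries
    for dError in gc_engine.parse(sText):
        print(dError)

    # grammar checking and sentence analysis (new behaviour)
    for dSentence in gc_engine.parse(sText, bFullInfo=True):
        print(dSentence["nStart"], dSentence["nEnd"], dSentence["sSentence"])
        for dToken in dSentence["lToken"]:
            print("  token:", dToken["sType"], dToken["sValue"])
        for dError in dSentence["aGrammarErrors"]:
            print("  error:", dError)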
@@ -333,14 +314,15 @@
echo("RULE: " + sLineId)
if cActionType == "-":
# grammar error
nErrorStart = nOffset + m.start(eAct[0])
if nErrorStart not in self.dError or nPriority > self.dErrorPriority.get(nErrorStart, -1):
self.dError[nErrorStart] = self._createErrorFromRegex(sText, sText0, sWhat, nOffset, m, eAct[0], sLineId, sRuleId, bUppercase, eAct[1], eAct[2], bShowRuleId, sOption, bContext)
self.dErrorPriority[nErrorStart] = nPriority
self.dSentenceError[nErrorStart] = self.dError[nErrorStart]
elif cActionType == "~":
# text processor
sText = self.rewriteText(sText, sWhat, eAct[0], m, bUppercase)
bChange = True
if bDebug:
echo("~ " + sText + " -- " + m.group(eAct[0]) + " # " + sLineId)
elif cActionType == "=":
|
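The only change in this hunk is the added self.dSentenceError line: a regex-rule error recorded in the paragraph-level dError is now also mirrored into the per-sentence dictionary that parse() drains after each sentence. The guard above it keeps only the highest-priority error for a given start position. A self-contained sketch of that bookkeeping, with a hypothetical registerError() helper and made-up error dictionaries, looks like this (the same line is added to the token-rule path in the next hunk):

    def registerError (dError, dErrorPriority, dSentenceError, nErrorStart, dErr, nPriority):
        "hypothetical helper mimicking the pattern above: keep the highest-priority error per start position"
        if nErrorStart not in dError or nPriority > dErrorPriority.get(nErrorStart, -1):
            dError[nErrorStart] = dErr
            dErrorPriority[nErrorStart] = nPriority
            dSentenceError[nErrorStart] = dErr     # mirror into the per-sentence dictionary

    dError, dErrorPriority, dSentenceError = {}, {}, {}
    registerError(dError, dErrorPriority, dSentenceError, 10, {"sRuleId": "rule_a"}, 4)
    registerError(dError, dErrorPriority, dSentenceError, 10, {"sRuleId": "rule_b"}, 2)   # lower priority: ignored
    print(dError[10]["sRuleId"])               # rule_a
    print(dSentenceError[10] is dError[10])    # True: both dictionaries hold the same error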
@@ -556,14 +538,15 @@
if "bImmune" not in self.lToken[nTokenErrorStart]:
nTokenErrorEnd = nTokenOffset + iTokenEnd if iTokenEnd > 0 else nLastToken + iTokenEnd
nErrorStart = self.nOffsetWithinParagraph + (self.lToken[nTokenErrorStart]["nStart"] if cStartLimit == "<" else self.lToken[nTokenErrorStart]["nEnd"])
nErrorEnd = self.nOffsetWithinParagraph + (self.lToken[nTokenErrorEnd]["nEnd"] if cEndLimit == ">" else self.lToken[nTokenErrorEnd]["nStart"])
if nErrorStart not in self.dError or nPriority > self.dErrorPriority.get(nErrorStart, -1):
self.dError[nErrorStart] = self._createErrorFromTokens(sWhat, nTokenOffset, nLastToken, nTokenErrorStart, nErrorStart, nErrorEnd, sLineId, sRuleId, bCaseSvty, sMessage, sURL, bShowRuleId, sOption, bContext)
self.dErrorPriority[nErrorStart] = nPriority
self.dSentenceError[nErrorStart] = self.dError[nErrorStart]
if bDebug:
echo(" NEW_ERROR: {}".format(self.dError[nErrorStart]))
elif cActionType == "~":
# text processor
nTokenStart = nTokenOffset + eAct[0] if eAct[0] > 0 else nLastToken + eAct[0]
nTokenEnd = nTokenOffset + eAct[1] if eAct[1] > 0 else nLastToken + eAct[1]
self._tagAndPrepareTokenForRewriting(sWhat, nTokenStart, nTokenEnd, nTokenOffset, nLastToken, eAct[2], bDebug)
|
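Same one-line addition as in the regex-rule path, so errors found by token rules reach dSentenceError as well. Read together with the first hunk, the per-sentence bookkeeping is a simple collect-and-clear cycle: parseText() fills dError and dSentenceError, then parse() snapshots dSentenceError into aGrammarErrors and clears it before the next sentence, while dError keeps accumulating for the whole paragraph. A reduced sketch with canned positions and errors (nothing here calls the real engine):

    dError, dSentenceError = {}, {}
    lSentences = []
    for (iStart, iEnd), lFound in [((0, 22), [(3, {"sRuleId": "rule_a"})]), ((23, 41), [])]:
        # parseText() would fill both dictionaries here; simulated with canned results
        for nPos, dErr in lFound:
            dError[iStart + nPos] = dErr
            dSentenceError[iStart + nPos] = dErr
        dSentence = { "nStart": iStart, "nEnd": iEnd, "aGrammarErrors": list(dSentenceError.values()) }
        lSentences.append(dSentence)
        dSentenceError.clear()     # reset for the next sentence; dError keeps the paragraph-level view

    print(len(lSentences[0]["aGrammarErrors"]))   # 1
    print(len(lSentences[1]["aGrammarErrors"]))   # 0
    print(len(dError))                            # 1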