191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
|
global _dOptions
_dOptions = getDefaultOptions()
#### Parsing
def parse (sText, sCountry="${country_default}", bDebug=False, dOptions=None, bContext=False, bFullInfo=False):
"init point to analyse <sText> and returns an iterable of errors or (with option <bFullInfo>) a list of sentences with tokens and errors"
oText = TextParser(sText)
return oText.parse(sCountry, bDebug, dOptions, bContext, bFullInfo)
#### TEXT PARSER
class TextParser:
|
|
|
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
|
global _dOptions
_dOptions = getDefaultOptions()
#### Parsing
def parse (sText, sCountry="${country_default}", bDebug=False, dOptions=None, bContext=False, bFullInfo=False):
"init point to analyse <sText> and returns an iterable of errors or (with option <bFullInfo>) paragraphs errors and sentences with tokens and errors"
oText = TextParser(sText)
return oText.parse(sCountry, bDebug, dOptions, bContext, bFullInfo)
#### TEXT PARSER
class TextParser:
|
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
|
s += "\t" + str(dToken["aTags"])
s += "\n"
#for nPos, dToken in self.dTokenPos.items():
# s += "{}\t{}\n".format(nPos, dToken)
return s
def parse (self, sCountry="${country_default}", bDebug=False, dOptions=None, bContext=False, bFullInfo=False):
"analyses <sText> and returns an iterable of errors or (with option <bFullInfo>) a list of sentences with tokens and errors"
#sText = unicodedata.normalize("NFC", sText)
dOpt = dOptions or _dOptions
bShowRuleId = option('idrule')
# parse paragraph
try:
self.parseText(self.sText, self.sText0, True, 0, sCountry, dOpt, bShowRuleId, bDebug, bContext)
except:
raise
# parse sentences
sText = self._getCleanText()
lSentences = []
for iStart, iEnd in text.getSentenceBoundaries(sText):
if 4 < (iEnd - iStart) < 2000:
try:
self.sSentence = sText[iStart:iEnd]
self.sSentence0 = self.sText0[iStart:iEnd]
self.nOffsetWithinParagraph = iStart
self.lToken = list(_oTokenizer.genTokens(self.sSentence, True))
self.dTokenPos = { dToken["nStart"]: dToken for dToken in self.lToken if dToken["sType"] != "INFO" }
if bFullInfo:
dSentence = { "nStart": iStart, "nEnd": iEnd, "sSentence": self.sSentence, "lToken": list(self.lToken) }
# the list of tokens is duplicated, to keep all tokens from being deleted when analysis
self.parseText(self.sSentence, self.sSentence0, False, iStart, sCountry, dOpt, bShowRuleId, bDebug, bContext)
if bFullInfo:
dSentence["aGrammarErrors"] = list(self.dSentenceError.values())
lSentences.append(dSentence)
self.dSentenceError.clear()
except:
raise
if bFullInfo:
# Grammar checking and sentence analysis
return lSentences
else:
# Grammar checking only
return self.dError.values() # this is a view (iterable)
def _getCleanText (self):
sText = self.sText
if " " in sText:
|
|
>
>
>
>
<
>
>
>
|
|
|
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
|
s += "\t" + str(dToken["aTags"])
s += "\n"
#for nPos, dToken in self.dTokenPos.items():
# s += "{}\t{}\n".format(nPos, dToken)
return s
def parse (self, sCountry="${country_default}", bDebug=False, dOptions=None, bContext=False, bFullInfo=False):
"analyses <sText> and returns an iterable of errors or (with option <bFullInfo>) paragraphs errors and sentences with tokens and errors"
#sText = unicodedata.normalize("NFC", sText)
dOpt = dOptions or _dOptions
bShowRuleId = option('idrule')
# parse paragraph
try:
self.parseText(self.sText, self.sText0, True, 0, sCountry, dOpt, bShowRuleId, bDebug, bContext)
except:
raise
if bFullInfo:
lParagraphErrors = list(self.dError.values())
lSentences = []
self.dSentenceError.clear()
# parse sentences
sText = self._getCleanText()
for iStart, iEnd in text.getSentenceBoundaries(sText):
if 4 < (iEnd - iStart) < 2000:
try:
self.sSentence = sText[iStart:iEnd]
self.sSentence0 = self.sText0[iStart:iEnd]
self.nOffsetWithinParagraph = iStart
self.lToken = list(_oTokenizer.genTokens(self.sSentence, True))
self.dTokenPos = { dToken["nStart"]: dToken for dToken in self.lToken if dToken["sType"] != "INFO" }
if bFullInfo:
dSentence = { "nStart": iStart, "nEnd": iEnd, "sSentence": self.sSentence, "lToken": list(self.lToken) }
for dToken in dSentence["lToken"]:
if dToken["sType"] == "WORD":
dToken["bValidToken"] = _oSpellChecker.isValidToken(dToken["sValue"])
# the list of tokens is duplicated, to keep all tokens from being deleted when analysis
self.parseText(self.sSentence, self.sSentence0, False, iStart, sCountry, dOpt, bShowRuleId, bDebug, bContext)
if bFullInfo:
dSentence["lGrammarErrors"] = list(self.dSentenceError.values())
lSentences.append(dSentence)
self.dSentenceError.clear()
except:
raise
if bFullInfo:
# Grammar checking and sentence analysis
return lParagraphErrors, lSentences
else:
# Grammar checking only
return self.dError.values() # this is a view (iterable)
def _getCleanText (self):
sText = self.sText
if " " in sText:
|