@@ -139,15 +139,15 @@
     if "‑" in sText:
         sText = sText.replace("‑", "-") # nobreakdash
     # parse sentences
     for iStart, iEnd in _getSentenceBoundaries(sText):
         if 4 < (iEnd - iStart) < 2000:
             try:
-                oSentence = TokenSentence(sText[iStart:iEnd], sRealText[iStart:iEnd], iStart)
+                oSentence = TextParser(sText[iStart:iEnd], sRealText[iStart:iEnd], iStart)
                 _, dErrors = _proofread(oSentence, sText[iStart:iEnd], sRealText[iStart:iEnd], iStart, False, dErrors, dPriority, sCountry, dOpt, bShowRuleId, bDebug, bContext)
             except:
                 raise
     return dErrors.values() # this is a view (iterable)


 def _proofread (oSentence, s, sx, nOffset, bParagraph, dErrors, dPriority, sCountry, dOptions, bShowRuleId, bDebug, bContext):
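Reviewer note: the only change here is the call site picking up the TokenSentence → TextParser rename. For context, sText is the normalized copy (the non-breaking hyphen U+2011 replaced by a plain hyphen, per the first context lines) while sRealText keeps the characters the user typed; since the replacement swaps one character for one character, both strings keep the same length and the iStart/iEnd slices stay aligned. A minimal sketch of that invariant (sample string invented for illustration):

    sRealText = "avant‑hier"                 # non-breaking hyphen, as typed
    sText = sRealText.replace("‑", "-")      # normalized copy used for matching
    assert len(sText) == len(sRealText)      # offsets computed on sText remain valid in sRealText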
@@ -561,30 +561,31 @@
         return True
     dTokenPos[nPos]["lMorph"] = lMorph
     return True


-#### TOKEN SENTENCE CHECKER
+#### TEXT PARSER

-class TokenSentence:
+class TextParser:
+    "Text parser"

     def __init__ (self, sSentence, sSentence0, nOffset):
         self.sSentence = sSentence
         self.sSentence0 = sSentence0
         self.nOffsetWithinParagraph = nOffset
         self.lToken = list(_oTokenizer.genTokens(sSentence, True))
         self.dTokenPos = { dToken["nStart"]: dToken for dToken in self.lToken if dToken["sType"] != "INFO" }
         self.dTags = {}
         self.dError = {}

     def __str__ (self):
         s = "TEXT ==========\n"
-        s = "sentence: " + self.sSentence0 + "\n"
+        s += "sentence: " + self.sSentence0 + "\n"
         s += "now: " + self.sSentence + "\n"
         for dToken in self.lToken:
             s += f'{dToken["nStart"]}\t{dToken["nEnd"]}\t{dToken["sValue"]}'
             if "lMorph" in dToken:
                 s += "\t" + str(dToken["lMorph"])
             s += "\n"
         for nPos, dToken in self.dTokenPos.items():
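Reviewer note: besides the rename and the new docstring, this hunk fixes a real bug in __str__: the second assignment used "=" instead of "+=", so the "TEXT ==========" header built on the previous line was discarded. A minimal standalone repro of the before/after behavior (sample text invented):

    s = "TEXT ==========\n"
    s = "sentence: abc\n"                    # old code: rebinding drops the header
    assert s == "sentence: abc\n"

    s = "TEXT ==========\n"
    s += "sentence: abc\n"                   # fixed code: header is kept
    assert s == "TEXT ==========\nsentence: abc\n"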
@@ -680,16 +681,16 @@
                     continue
             if not sPattern or any(re.search(sPattern, sMorph) for sMorph in _oSpellChecker.getMorph(dToken["sValue"])):
                 if bDebug:
                     print(" MATCH: @" + sRegex)
                 yield dGraph[dNode["<re_morph>"][sRegex]]
         # token tags
         if "tags" in dToken and "<tags>" in dNode:
-            for sTag in dNode["<tags>"]:
-                if sTag in dToken["tags"]:
+            for sTag in dToken["tags"]:
+                if sTag in dNode["<tags>"]:
                     if bDebug:
                         print(" MATCH: /" + sTag)
                     yield dGraph[dNode["<tags>"][sTag]]
         # meta arc (for token type)
         if "<meta>" in dNode:
             for sMeta in dNode["<meta>"]:
                 # not regex here, we just search if <dNode["sType"]> exists within <sMeta>
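Reviewer note: both loop directions yield the same matching arcs, i.e. the intersection of the token's tags and the node's <tags> arcs; the swap iterates the token's tag set, which is typically much smaller than a node's arc table, and probes the dict for membership instead. A minimal sketch of the lookup with invented sample data (tag names and node ids are hypothetical):

    dNodeTags = {"neg": 12, "interro": 13, "ppas": 14}   # node arcs: tag -> next node id
    dTokenTags = {"neg"}                                 # tags carried by this token

    for sTag in dTokenTags:                              # new direction: few iterations
        if sTag in dNodeTags:                            # O(1) dict membership test
            print("MATCH:", sTag, "->", dNodeTags[sTag])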
@@ -785,15 +786,15 @@
                     if bDebug:
                         print(" COND_OK")
                     pass
                 elif cActionType == "/":
                     if bDebug:
                         print(" SEMANTIC_TAG:\n ", dRule[sRuleId])
                     nTokenStart = nTokenOffset + eAct[0]
-                    nTokenEnd = nTokenOffset + eAct[1]
+                    nTokenEnd = nTokenOffset + (eAct[1] if eAct[1] else eAct[0])
                     for i in range(nTokenStart, nTokenEnd+1):
                         if "tags" in self.lToken[i]:
                             self.lToken[i]["tags"].update(sWhat.split("|"))
                         else:
                             self.lToken[i]["tags"] = set(sWhat.split("|"))
                 elif cActionType == "%":
                     # sentence tags
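Reviewer note: the new guard handles semantic-tag actions whose end index is falsy (my reading: a single-token action encoded with eAct[1] == 0 or None); with the old code the end position then landed before the start, so the range tagged nothing. Falling back to eAct[0] tags exactly the start token. A standalone sketch with invented values:

    nTokenOffset = 5
    eAct = (2, 0)                                        # hypothetical single-token action
    nTokenStart = nTokenOffset + eAct[0]                 # 7
    nTokenEnd = nTokenOffset + (eAct[1] if eAct[1] else eAct[0])
    assert list(range(nTokenStart, nTokenEnd + 1)) == [7]  # only token 7 is tagged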