Grammalecte: Diff

Differences From Artifact [d2b4f44276]:

To Artifact [903e91d939]:


@@ -139,15 +139,15 @@
     if "‑" in sText:
         sText = sText.replace("‑", "-") # nobreakdash
 
     # parse sentences
     for iStart, iEnd in _getSentenceBoundaries(sText):
         if 4 < (iEnd - iStart) < 2000:
             try:
-                oSentence = TokenSentence(sText[iStart:iEnd], sRealText[iStart:iEnd], iStart)
+                oSentence = TextParser(sText[iStart:iEnd], sRealText[iStart:iEnd], iStart)
                 _, dErrors = _proofread(oSentence, sText[iStart:iEnd], sRealText[iStart:iEnd], iStart, False, dErrors, dPriority, sCountry, dOpt, bShowRuleId, bDebug, bContext)
             except:
                 raise
     return dErrors.values() # this is a view (iterable)
 
 
 def _proofread (oSentence, s, sx, nOffset, bParagraph, dErrors, dPriority, sCountry, dOptions, bShowRuleId, bDebug, bContext):
@@ -561,30 +561,31 @@
         return True
     dTokenPos[nPos]["lMorph"] = lMorph
     return True
 
 
 
 
-#### TOKEN SENTENCE CHECKER
+#### TEXT PARSER
 
-class TokenSentence:
+class TextParser:
+    "Text parser"
 
     def __init__ (self, sSentence, sSentence0, nOffset):
         self.sSentence = sSentence
         self.sSentence0 = sSentence0
         self.nOffsetWithinParagraph = nOffset
         self.lToken = list(_oTokenizer.genTokens(sSentence, True))
         self.dTokenPos = { dToken["nStart"]: dToken  for dToken in self.lToken  if dToken["sType"] != "INFO" }
         self.dTags = {}
         self.dError = {}
 
     def __str__ (self):
         s = "TEXT ==========\n"
-        s = "sentence: " + self.sSentence0 + "\n"
+        s += "sentence: " + self.sSentence0 + "\n"
         s += "now:      " + self.sSentence  + "\n"
         for dToken in self.lToken:
             s += f'{dToken["nStart"]}\t{dToken["nEnd"]}\t{dToken["sValue"]}'
             if "lMorph" in dToken:
                 s += "\t" + str(dToken["lMorph"])
             s += "\n"
         for nPos, dToken in self.dTokenPos.items():
@@ -680,16 +681,16 @@
                                 continue
                             if not sPattern or any(re.search(sPattern, sMorph)  for sMorph in _oSpellChecker.getMorph(dToken["sValue"])):
                                 if bDebug:
                                     print("  MATCH: @" + sRegex)
                                 yield dGraph[dNode["<re_morph>"][sRegex]]
         # token tags
         if "tags" in dToken and "<tags>" in dNode:
-            for sTag in dNode["<tags>"]:
-                if sTag in dToken["tags"]:
+            for sTag in dToken["tags"]:
+                if sTag in dNode["<tags>"]:
                     if bDebug:
                         print("  MATCH: /" + sTag)
                     yield dGraph[dNode["<tags>"][sTag]]
         # meta arc (for token type)
         if "<meta>" in dNode:
             for sMeta in dNode["<meta>"]:
                 # not regex here, we just search if <dNode["sType"]> exists within <sMeta>
@@ -785,15 +786,15 @@
                                 if bDebug:
                                     print("  COND_OK")
                                 pass
                             elif cActionType == "/":
                                 if bDebug:
                                     print("  SEMANTIC_TAG:\n  ", dRule[sRuleId])
                                 nTokenStart = nTokenOffset + eAct[0]
-                                nTokenEnd = nTokenOffset + eAct[1]
+                                nTokenEnd = nTokenOffset + (eAct[1]  if eAct[1]  else eAct[0])
                                 for i in range(nTokenStart, nTokenEnd+1):
                                     if "tags" in self.lToken[i]:
                                         self.lToken[i]["tags"].update(sWhat.split("|"))
                                     else:
                                         self.lToken[i]["tags"] = set(sWhat.split("|"))
                             elif cActionType == "%":
                                 # sentence tags