Grammalecte Check-in [5fc6d41650]

Overview
Comment: [core] DARG: text processing
SHA3-256: 5fc6d416505febdcd0257332fbc8902b0b846ab936dcf5b4727bc48e2057efc5
User & Date: olr on 2018-06-06 15:24:46
Context
2018-06-06
16:55
[build][core] tokens auto selection check-in: e0f39d31ba user: olr tags: core, build, rg
15:24
[core] DARG: text processing check-in: 5fc6d41650 user: olr tags: core, rg
10:36
[core][fr] end of lemma is now a slash instead of a space check-in: 023f83bc15 user: olr tags: fr, core, rg
Changes

Modified gc_core/py/lang_core/gc_engine.py from [f0f3202267] to [894e8606f6].

            dDA.clear()
            try:
                # regex parser
                _, errs = _proofread(sText[iStart:iEnd], sRealText[iStart:iEnd], iStart, False, dDA, dPriority, sCountry, dOpt, bShowRuleId, bDebug, bContext)
                aErrors.update(errs)
                # token parser
                oSentence = TokenSentence(sText[iStart:iEnd], sRealText[iStart:iEnd], iStart)
-                _, errs = oSentence.parse(dPriority, sCountry, dOpt, bShowRuleId, bDebug, bContext)
+                bChange, errs = oSentence.parse(dPriority, sCountry, dOpt, bShowRuleId, bDebug, bContext)
                aErrors.update(errs)
+                if bChange:
+                    oSentence.rewrite()
            except:
                raise
    return aErrors.values() # this is a view (iterable)


def _getSentenceBoundaries (sText):
    iStart = _zBeginOfParagraph.match(sText).end()
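
The change above is the point of this check-in: TokenSentence.parse() now reports whether a text-processor action modified any token, and the caller rewrites the sentence before moving on. Below is a minimal sketch of that control flow; MiniSentence and its methods are illustrative stand-ins, not the real API.

# Sketch only: mirrors the parse-then-rewrite loop added above, with the
# tokenizer, rule graph and error handling stubbed out.
class MiniSentence:
    def __init__ (self, sSentence, nOffset):
        self.sSentence = sSentence
        self.nOffset = nOffset
    def parse (self):
        # pretend a "~" rule fired and tagged a span for rewriting
        bChange = "marre" in self.sSentence
        return bChange, {}
    def rewrite (self):
        # blank out the tagged span, keeping the sentence length stable
        self.sSentence = self.sSentence.replace("marre", " " * len("marre"))

aErrors = {}
oSentence = MiniSentence("J’en ai marre de ces gens-là.", 0)
bChange, dErrs = oSentence.parse()
aErrors.update(dErrs)
if bChange:
    oSentence.rewrite()       # later passes see the simplified sentence
print(repr(oSentence.sSentence))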
#### TOKEN SENTENCE CHECKER

class TokenSentence:

-    def __init__ (self, sSentence, sSentence0, iStart):
+    def __init__ (self, sSentence, sSentence0, nOffset):
        self.sSentence = sSentence
        self.sSentence0 = sSentence0
-        self.iStart = iStart
+        self.nOffset = nOffset
        self.lToken = list(_oTokenizer.genTokens(sSentence, True))

    def _getNextMatchingNodes (self, dToken, dNode):
        "generator: return nodes where <dToken> “values” match <dNode> arcs"
        # token value
        if dToken["sValue"] in dNode:
            #print("value found: ", dToken["sValue"])
                    if bHasChanged:
                        bChange = True
        if dErr:
            print(dErr)
        return (bChange, dErr)

    def _executeActions (self, dNode, nTokenOffset, dPriority, dOpt, sCountry, bShowRuleId, bContext):
-        #print(locals())
+        "execute actions found in the DARG"
        dErrs = {}
        bChange = False
        for sLineId, nextNodeKey in dNode.items():
            for sRuleId in dGraph[nextNodeKey]:
                print(sRuleId)
                bCondMemo = None
                sFuncCond, cActionType, sWhat, *eAct = dRule[sRuleId]
                # action in lActions: [ condition, action type, replacement/suggestion/action[, iTokenStart, iTokenEnd[, nPriority, message, URL]] ]
                try:
                    bCondMemo = not sFuncCond or globals()[sFuncCond](self.lToken, nTokenOffset, sCountry, bCondMemo)
                    if bCondMemo:
                        if cActionType == "-":
                            # grammar error
                            print("-")
                            nTokenErrorStart = nTokenOffset + eAct[0]
                            nTokenErrorEnd = nTokenOffset + eAct[1]
-                            nErrorStart = self.iStart + self.lToken[nTokenErrorStart]["nStart"]
-                            nErrorEnd = self.iStart + self.lToken[nTokenErrorEnd]["nEnd"]
+                            nErrorStart = self.nOffset + self.lToken[nTokenErrorStart]["nStart"]
+                            nErrorEnd = self.nOffset + self.lToken[nTokenErrorEnd]["nEnd"]
                            if nErrorStart not in dErrs or eAct[2] > dPriority[nErrorStart]:
                                dErrs[nErrorStart] = _createTokenError(self.lToken, self.sSentence, self.sSentence0, sWhat, nTokenErrorStart, nErrorStart, nErrorEnd, sLineId, sRuleId, True, eAct[3], eAct[4], bShowRuleId, "notype", bContext)
                                dPriority[nErrorStart] = eAct[2]
                        elif cActionType == "~":
                            # text processor
                            print("~")
-                            self._rewrite(sWhat, nErrorStart, nErrorEnd)
-                        elif cActionType == "@":
-                            # jump
-                            print("@")
-                            self._jump(sWhat)
+                            self._tagAndPrepareTokenForRewriting(sWhat, nTokenOffset + eAct[0], nTokenOffset + eAct[1])
+                            bChange = True
                        elif cActionType == "=":
                            # disambiguation
                            print("=")
                            globals()[sWhat](self.lToken)
                        elif cActionType == ">":
                            # we do nothing, this test is just a condition to apply all following actions
                            print(">")
                            pass
                        else:
                            print("# error: unknown action at " + sLineId)
                    elif cActionType == ">":
                        break
                except Exception as e:
                    raise Exception(str(e), sLineId)
        return bChange, dErrs

-    def _rewrite (self, sWhat, nErrorStart, nErrorEnd):
-        "text processor: rewrite tokens between <nErrorStart> and <nErrorEnd> position"
-        lTokenValue = sWhat.split("|")
-        if len(lTokenValue) != (nErrorEnd - nErrorStart + 1):
-            print("Error. Text processor: number of replacements != number of tokens.")
-            return
-        for i, sValue in zip(range(nErrorStart, nErrorEnd+1), lTokenValue):
-            self.lToken[i]["sValue"] = sValue
+    def _tagAndPrepareTokenForRewriting (self, sWhat, nTokenRewriteStart, nTokenRewriteEnd, bUppercase=True):
+        "text processor: rewrite tokens between <nTokenRewriteStart> and <nTokenRewriteEnd> position"
+        if sWhat == "*":
+            # purge text
+            if nTokenRewriteEnd - nTokenRewriteStart == 0:
+                self.lToken[nTokenRewriteStart]["bToRemove"] = True
+            else:
+                for i in range(nTokenRewriteStart, nTokenRewriteEnd+1):
+                    self.lToken[i]["bToRemove"] = True
+        else:
+            if sWhat.startswith("="):
+                sWhat = globals()[sWhat[1:]](self.lToken)
+            bUppercase = bUppercase and self.lToken[nTokenRewriteStart]["sValue"][0:1].isupper()
+            if nTokenRewriteEnd - nTokenRewriteStart == 0:
+                sWhat = sWhat + " " * (len(self.lToken[nTokenRewriteStart]["sValue"])-len(sWhat))
+                if bUppercase:
+                    sWhat = sWhat[0:1].upper() + sWhat[1:]
+                self.lToken[nTokenRewriteStart]["sNewValue"] = sWhat
+            else:
+                lTokenValue = sWhat.split("|")
+                if len(lTokenValue) != (nTokenRewriteEnd - nTokenRewriteStart + 1):
+                    print("Error. Text processor: number of replacements != number of tokens.")
+                    return
+                for i, sValue in zip(range(nTokenRewriteStart, nTokenRewriteEnd+1), lTokenValue):
+                    if bUppercase:
+                        sValue = sValue[0:1].upper() + sValue[1:]
+                    self.lToken[i]["sNewValue"] = sValue

-    def _jump (self, sWhat):
-        try:
-            nFrom, nTo = sWhat.split(">")
-            self.lToken[int(nFrom)]["iJump"] = int(nTo)
-        except:
-            print("# Error. Jump failed: ", sWhat)
-            traceback.print_exc()
-            return
+    def rewrite (self):
+        "rewrite the sentence, modify tokens, purge the token list"
+        lNewToken = []
+        for i, dToken in enumerate(self.lToken):
+            if "bToRemove" in dToken:
+                # remove useless token
+                self.sSentence = self.sSentence[:self.nOffset+dToken["nStart"]] + " " * (dToken["nEnd"] - dToken["nStart"]) + self.sSentence[self.nOffset+dToken["nEnd"]:]
+                #print("removed:", dToken["sValue"])
+            else:
+                lNewToken.append(dToken)
+                if "sNewValue" in dToken:
+                    # rewrite token and sentence
+                    print(dToken["sValue"], "->", dToken["sNewValue"])
+                    dToken["sRealValue"] = dToken["sValue"]
+                    dToken["sValue"] = dToken["sNewValue"]
+                    nDiffLen = len(dToken["sRealValue"]) - len(dToken["sNewValue"])
+                    sNewRepl = (dToken["sNewValue"] + " " * nDiffLen)  if nDiffLen >= 0  else dToken["sNewValue"][:len(dToken["sRealValue"])]
+                    self.sSentence = self.sSentence[:self.nOffset+dToken["nStart"]] + sNewRepl + self.sSentence[self.nOffset+dToken["nEnd"]:]
+                    del dToken["sNewValue"]
+        print(self.sSentence)
+        self.lToken.clear()
+        self.lToken = lNewToken


#### Analyse tokens

def g_morph (dToken, sPattern, sNegPattern=""):
    "analyse a token, return True if <sNegPattern> not in morphologies and <sPattern> in morphologies"
    if "lMorph" in dToken:

Modified gc_lang/fr/rules_graph.grx from [02356176f2] to [747aec6cb3].

# Fin d’interprétation du fichier avec une ligne commençant par #END

# ERREURS COURANTES
# http://fr.wikipedia.org/wiki/Wikip%C3%A9dia:Fautes_d%27orthographe/Courantes


+__pp__
+    >avoir  marre  [d’|des|du|de]
+        <<- ~1:3>> *
+
+TEST: J’en ai marre de ces gens-là.
+
+
+__pp2__
+    il ne pense qu’ à sa gueule
+        <<- ~4:7>> que|Z|a|perdu
+
+TEST: il ne pense qu’à sa gueule.


__avoir_confiance_en__
    >avoir confiance (dans) [moi|toi|soi|lui|elle|nous|vous|eux|elles]
        <<-  -1>> en                                                                                # Avoir confiance en quelqu’un ou quelque chose.|http://grammalecte.net

TEST: Elle avait confiance {{dans}} lui.
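
Reading the two rules added above: <<- ~1:3>> * appears to purge tokens 1 through 3, and <<- ~4:7>> que|Z|a|perdu to replace tokens 4 through 7 with the pipe-separated values; that is, they feed the bToRemove/sNewValue tags consumed by TokenSentence.rewrite(). A hedged sketch of that mapping; the function name and token numbering are illustrative, not from the source:

# Sketch of how a "~" action might drive _tagAndPrepareTokenForRewriting();
# illustrative only, token numbering simplified.
lToken = [{"sValue": s} for s in ["J’", "en", "ai", "marre", "de"]]

def tagForRewriting (lToken, sWhat, nStart, nEnd):
    if sWhat == "*":
        # "<<- ~1:3>> *": purge the span
        for i in range(nStart, nEnd+1):
            lToken[i]["bToRemove"] = True
    else:
        # "<<- ~4:7>> a|b|c|d": one replacement per token
        for i, sValue in zip(range(nStart, nEnd+1), sWhat.split("|")):
            lToken[i]["sNewValue"] = sValue

tagForRewriting(lToken, "*", 2, 3)
print([dToken for dToken in lToken if "bToRemove" in dToken])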

TEST: Je suis {{an}} désaccord avec lui.


__faire_plaisir__
    >faire plaisirs
        <<- -2>> plaisir                                                                            # Faire plaisir : dans cette locution, “plaisir” doit être au singulier.
+        <<- ~2>> *

TEST: Ça me fait {{plaisirs}}.


__test__
    je  ~préf[éè]r  [que|qu’]  @(?::Os|:M)¬:X  @:I
        <<- morph(\1, ":V") and morph(\4, ":Os|:M", ":X") -5>> SUBJONCTIF                  # SUBJONCTIF.

TEST: je préférerais qu’Isabelle {{est}} partie.
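
The __test__ rule above gates its action on morphology lookups such as morph(\1, ":V"). The g_morph() function, shown truncated at the end of the gc_engine.py diff, performs that check against a token's stored analyses; here is a minimal stand-in where the "lMorph" stub replaces the real dictionary lookup:

import re

# Stand-in for a morphology gate; the real code fetches lMorph from the
# dictionary, here it is stubbed directly on the token.
def morph (dToken, sPattern, sNegPattern=""):
    lMorph = dToken.get("lMorph", [])
    if sNegPattern and any(re.search(sNegPattern, sMorph) for sMorph in lMorph):
        return False
    return any(re.search(sPattern, sMorph) for sMorph in lMorph)

dToken = {"sValue": "préférerais", "lMorph": [">préférer/:V:1s"]}
print(morph(dToken, ":V"))          # True: analysed as a verb
print(morph(dToken, ":V", ":X"))    # True: a verb and not tagged :X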