Grammalecte  Check-in [746cb57e54]

Overview
Comment:[core] gc engine: generator yield pointers instead of nodes
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | core | rg
Files: files | file ages | folders
SHA3-256: 746cb57e5483b1f5ffba18645d7e00cfb53abf9ebd22f0c25e91de06e77fe62f
User & Date: olr on 2018-07-31 09:28:29
Other Links: branch diff | manifest | tags
Context
2018-07-31
09:57
[core] gc engine: jump action to keep pointers until token found check-in: 043d1fdd77 user: olr tags: core, rg
09:28
[core] gc engine: generator yield pointers instead of nodes check-in: 746cb57e54 user: olr tags: core, rg
05:25
[fr] conversion: regex rules -> graph rules check-in: 655aa0f725 user: olr tags: fr, rg
Changes

Modified gc_core/py/lang_core/gc_engine.py from [d9f8c3ac90] to [e17d8732c9].

601
602
603
604
605
606
607
608

609
610
611
612
613
614

615
616
617
618
619
620

621
622
623
624
625
626

627
628
629
630
631

632
633
634
635
636
637
638
639
640

641
642
643
644
645
646
647
648
649

650
651
652
653
654
655
656
657
658

659
660
661
662
663
664
665
666
667
668

669
670
671
672
673
674
675
676
677
678
679

680
681
682
683
684
685
686
687

688
689
690
691
692
693
694

695
696
697
698

699
700
701
702

703
704
705
706
707

708
709
710
711

712
713
714
715
716
717
718

719
720
721
722
723
724

725
726
727
728

729
730
731
732
733
734
735

736
737
738
739
740
741
742
601
602
603
604
605
606
607

608
609
610
611
612
613

614
615
616
617
618
619

620
621
622
623
624
625

626
627
628
629
630

631
632
633
634
635
636
637
638
639

640
641
642
643
644
645
646
647
648

649
650
651
652
653
654
655
656
657

658
659
660
661
662
663
664
665
666
667

668
669
670
671
672
673
674
675
676
677
678

679
680
681
682
683
684
685
686

687
688
689
690
691
692
693

694
695
696
697

698
699
700
701

702
703
704
705
706

707
708
709
710

711
712
713
714
715
716
717

718
719
720
721
722
723

724

725
726

727

728
729
730
731
732

733
734
735
736
737
738
739
740







-
+





-
+





-
+





-
+




-
+








-
+








-
+








-
+









-
+










-
+







-
+






-
+



-
+



-
+




-
+



-
+






-
+





-
+
-


-
+
-





-
+







                dToken["lMorph"] = self.dTokenPos[dToken["nStart"]]["lMorph"]
        self.lToken = lNewToken
        self.dTokenPos = { dToken["nStart"]: dToken  for dToken in self.lToken  if dToken["sType"] != "INFO" }
        if bDebug:
            print("UPDATE:")
            print(self)

    def _getNextMatchingNodes (self, dToken, dGraph, dNode, bDebug=False):
    def _getNextMatchingNodes (self, dToken, dGraph, iNode1, dNode, bDebug=False):
        "generator: return nodes where <dToken> “values” match <dNode> arcs"
        # token value
        if dToken["sValue"] in dNode:
            if bDebug:
                print("  MATCH:", dToken["sValue"])
            yield dGraph[dNode[dToken["sValue"]]]
            yield { "iNode1": iNode1, "dNode": dGraph[dNode[dToken["sValue"]]] }
        if dToken["sValue"][0:2].istitle(): # we test only 2 first chars, to make valid words such as "Laissez-les", "Passe-partout".
            sValue = dToken["sValue"].lower()
            if sValue in dNode:
                if bDebug:
                    print("  MATCH:", sValue)
                yield dGraph[dNode[sValue]]
                yield { "iNode1": iNode1, "dNode": dGraph[dNode[sValue]] }
        elif dToken["sValue"].isupper():
            sValue = dToken["sValue"].lower()
            if sValue in dNode:
                if bDebug:
                    print("  MATCH:", sValue)
                yield dGraph[dNode[sValue]]
                yield { "iNode1": iNode1, "dNode": dGraph[dNode[sValue]] }
            sValue = dToken["sValue"].capitalize()
            if sValue in dNode:
                if bDebug:
                    print("  MATCH:", sValue)
                yield dGraph[dNode[sValue]]
                yield { "iNode1": iNode1, "dNode": dGraph[dNode[sValue]] }
        # regex value arcs
        if "<re_value>" in dNode:
            for sRegex in dNode["<re_value>"]:
                if "¬" not in sRegex:
                    # no anti-pattern
                    if re.search(sRegex, dToken["sValue"]):
                        if bDebug:
                            print("  MATCH: ~" + sRegex)
                        yield dGraph[dNode["<re_value>"][sRegex]]
                        yield { "iNode1": iNode1, "dNode": dGraph[dNode["<re_value>"][sRegex]] }
                else:
                    # there is an anti-pattern
                    sPattern, sNegPattern = sRegex.split("¬", 1)
                    if sNegPattern and re.search(sNegPattern, dToken["sValue"]):
                        continue
                    if not sPattern or re.search(sPattern, dToken["sValue"]):
                        if bDebug:
                            print("  MATCH: ~" + sRegex)
                        yield dGraph[dNode["<re_value>"][sRegex]]
                        yield { "iNode1": iNode1, "dNode": dGraph[dNode["<re_value>"][sRegex]] }
        # analysable tokens
        if dToken["sType"][0:4] == "WORD":
            # token lemmas
            if "<lemmas>" in dNode:
                for sLemma in _oSpellChecker.getLemma(dToken["sValue"]):
                    if sLemma in dNode["<lemmas>"]:
                        if bDebug:
                            print("  MATCH: >" + sLemma)
                        yield dGraph[dNode["<lemmas>"][sLemma]]
                        yield { "iNode1": iNode1, "dNode": dGraph[dNode["<lemmas>"][sLemma]] }
            # regex morph arcs
            if "<re_morph>" in dNode:
                for sRegex in dNode["<re_morph>"]:
                    if "¬" not in sRegex:
                        # no anti-pattern
                        lMorph = dToken.get("lMorph", _oSpellChecker.getMorph(dToken["sValue"]))
                        if any(re.search(sRegex, sMorph)  for sMorph in lMorph):
                            if bDebug:
                                print("  MATCH: @" + sRegex)
                            yield dGraph[dNode["<re_morph>"][sRegex]]
                            yield { "iNode1": iNode1, "dNode": dGraph[dNode["<re_morph>"][sRegex]] }
                    else:
                        # there is an anti-pattern
                        sPattern, sNegPattern = sRegex.split("¬", 1)
                        if sNegPattern == "*":
                            # all morphologies must match with <sPattern>
                            if sPattern:
                                lMorph = dToken.get("lMorph", _oSpellChecker.getMorph(dToken["sValue"]))
                                if lMorph and all(re.search(sPattern, sMorph)  for sMorph in lMorph):
                                    if bDebug:
                                        print("  MATCH: @" + sRegex)
                                    yield dGraph[dNode["<re_morph>"][sRegex]]
                                    yield { "iNode1": iNode1, "dNode": dGraph[dNode["<re_morph>"][sRegex]] }
                        else:
                            lMorph = dToken.get("lMorph", _oSpellChecker.getMorph(dToken["sValue"]))
                            if sNegPattern and any(re.search(sNegPattern, sMorph)  for sMorph in lMorph):
                                continue
                            if not sPattern or any(re.search(sPattern, sMorph)  for sMorph in lMorph):
                                if bDebug:
                                    print("  MATCH: @" + sRegex)
                                yield dGraph[dNode["<re_morph>"][sRegex]]
                                yield { "iNode1": iNode1, "dNode": dGraph[dNode["<re_morph>"][sRegex]] }
        # token tags
        if "tags" in dToken and "<tags>" in dNode:
            for sTag in dToken["tags"]:
                if sTag in dNode["<tags>"]:
                    if bDebug:
                        print("  MATCH: /" + sTag)
                    yield dGraph[dNode["<tags>"][sTag]]
                    yield { "iNode1": iNode1, "dNode": dGraph[dNode["<tags>"][sTag]] }
        # meta arc (for token type)
        if "<meta>" in dNode:
            for sMeta in dNode["<meta>"]:
                # not regex here, we just search if <dNode["sType"]> exists within <sMeta>
                # no regex here, we just search if <dNode["sType"]> exists within <sMeta>
                if sMeta == "*":
                    if bDebug:
                        print("  MATCH: *" + sMeta)
                    yield dGraph[dNode["<meta>"]["*"]]
                    yield { "iNode1": iNode1, "dNode": dGraph[dNode["<meta>"]["*"]] }
                elif "¬" in sMeta:
                    if dToken["sType"] not in sMeta:
                        if bDebug:
                            print("  MATCH: *" + sMeta)
                        yield dGraph[dNode["<meta>"][sMeta]]
                        yield { "iNode1": iNode1, "dNode": dGraph[dNode["<meta>"][sMeta]] }
                elif dToken["sType"] in sMeta:
                    if bDebug:
                        print("  MATCH: *" + sMeta)
                    yield dGraph[dNode["<meta>"][sMeta]]
                    yield { "iNode1": iNode1, "dNode": dGraph[dNode["<meta>"][sMeta]] }

    def parse (self, dGraph, dPriority, sCountry="${country_default}", dOptions=None, bShowRuleId=False, bDebug=False, bContext=False):
        "parse tokens from the text and execute actions encountered"
        dOpt = _dOptions  if not dOptions  else dOptions
        lPointer = []
        bTagAndRewrite = False
        for i, dToken in enumerate(self.lToken):
        for iToken, dToken in enumerate(self.lToken):
            if bDebug:
                print("TOKEN:", dToken["sValue"])
            # check arcs for each existing pointer
            lNextPointer = []
            for dPointer in lPointer:
                for dNode in self._getNextMatchingNodes(dToken, dGraph, dPointer["dNode"], bDebug):
                lNextPointer.extend(self._getNextPointers(dToken, dGraph, dPointer["iNode1"], dPointer["dNode"], bDebug))
                    lNextPointer.append({"iToken": dPointer["iToken"], "dNode": dNode})
            lPointer = lNextPointer
            # check arcs of first nodes
            for dNode in self._getNextMatchingNodes(dToken, dGraph, dGraph[0], bDebug):
            lPointer.extend(self._getNextPointers(dToken, dGraph, iToken, dGraph[0], bDebug))
                lPointer.append({"iToken": i, "dNode": dNode})
            # check whether there are rules to execute for each pointer
            for dPointer in lPointer:
                #if bDebug:
                #    print("+", dPointer)
                if "<rules>" in dPointer["dNode"]:
                    bChange = self._executeActions(dGraph, dPointer["dNode"]["<rules>"], dPointer["iToken"]-1, i, dPriority, dOpt, sCountry, bShowRuleId, bDebug, bContext)
                    bChange = self._executeActions(dGraph, dPointer["dNode"]["<rules>"], dPointer["iNode1"]-1, iToken, dPriority, dOpt, sCountry, bShowRuleId, bDebug, bContext)
                    if bChange:
                        bTagAndRewrite = True
        if bTagAndRewrite:
            self.rewrite(bDebug)
        if bDebug:
            print(self)
        return (bTagAndRewrite, self.sSentence)