601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
|
dToken["lMorph"] = self.dTokenPos[dToken["nStart"]]["lMorph"]
self.lToken = lNewToken
self.dTokenPos = { dToken["nStart"]: dToken for dToken in self.lToken if dToken["sType"] != "INFO" }
if bDebug:
print("UPDATE:")
print(self)
def _getNextMatchingNodes (self, dToken, dGraph, iNode1, dNode, bDebug=False):
    """generator: yield one pointer for each arc of <dNode> that <dToken> matches

    Arcs are tested in this order: the token value itself, its lowercased / capitalized
    variants, the <re_value> arcs, the <lemmas> and <re_morph> arcs (only for tokens whose
    type starts with "WORD"), the <tags> arcs and finally the <meta> arcs.

    In <re_value> and <re_morph> keys, the text after "¬" is an anti-pattern: when it matches,
    the arc is rejected. For <re_morph>, the anti-pattern "*" means that every morphology
    of the token must match the pattern.

    Each yielded pointer is { "iNode1": <iNode1>, "dNode": <node of dGraph reached by the arc> }.
    If <bDebug> is True, each match is printed before its pointer is yielded.
    """
    def _follow (dArcs, sKey, *tDebug):
        "print the match if debugging, then build the pointer reached through <dArcs[sKey]>"
        if bDebug:
            print(*tDebug)
        return { "iNode1": iNode1, "dNode": dGraph[dArcs[sKey]] }

    def _readMorphs ():
        "morphologies stored in the token if any, else those of the spell checker (always queried)"
        return dToken.get("lMorph", _oSpellChecker.getMorph(dToken["sValue"]))

    sToken = dToken["sValue"]

    # literal value arc
    if sToken in dNode:
        yield _follow(dNode, sToken, " MATCH:", sToken)

    # case variants of the value
    if sToken[0:2].istitle():
        # only the 2 first chars are tested, so that words such as "Laissez-les", "Passe-partout" are accepted
        tVariant = (sToken.lower(),)
    elif sToken.isupper():
        tVariant = (sToken.lower(), sToken.capitalize())
    else:
        tVariant = ()
    for sVariant in tVariant:
        if sVariant in dNode:
            yield _follow(dNode, sVariant, " MATCH:", sVariant)

    # regex arcs on the value
    dValueArcs = dNode.get("<re_value>", {})
    for sRegex in dValueArcs:
        # without "¬", <sPattern> is the whole regex and <sSep>, <sNegPattern> are empty
        sPattern, sSep, sNegPattern = sRegex.partition("¬")
        if sSep and sNegPattern and re.search(sNegPattern, sToken):
            continue
        if (sSep and not sPattern) or re.search(sPattern, sToken):
            yield _follow(dValueArcs, sRegex, " MATCH: ~" + sRegex)

    # arcs which need a lexical analysis of the token
    if dToken["sType"][0:4] == "WORD":
        # lemma arcs
        if "<lemmas>" in dNode:
            dLemmaArcs = dNode["<lemmas>"]
            for sLemma in _oSpellChecker.getLemma(sToken):
                if sLemma in dLemmaArcs:
                    yield _follow(dLemmaArcs, sLemma, " MATCH: >" + sLemma)
        # regex arcs on morphologies
        dMorphArcs = dNode.get("<re_morph>", {})
        for sRegex in dMorphArcs:
            sPattern, sSep, sNegPattern = sRegex.partition("¬")
            if not sSep:
                # no anti-pattern: one matching morphology is enough
                bMatch = any(re.search(sRegex, sMorph)  for sMorph in _readMorphs())
            elif sNegPattern == "*":
                # every morphology must match <sPattern>
                if not sPattern:
                    continue
                lMorph = _readMorphs()
                bMatch = bool(lMorph) and all(re.search(sPattern, sMorph)  for sMorph in lMorph)
            else:
                lMorph = _readMorphs()
                if sNegPattern and any(re.search(sNegPattern, sMorph)  for sMorph in lMorph):
                    continue
                bMatch = not sPattern or any(re.search(sPattern, sMorph)  for sMorph in lMorph)
            if bMatch:
                yield _follow(dMorphArcs, sRegex, " MATCH: @" + sRegex)

    # tag arcs
    if "tags" in dToken and "<tags>" in dNode:
        dTagArcs = dNode["<tags>"]
        for sTag in dToken["tags"]:
            if sTag in dTagArcs:
                yield _follow(dTagArcs, sTag, " MATCH: /" + sTag)

    # meta arcs (token type): no regex, the token type is merely searched within <sMeta>
    dMetaArcs = dNode.get("<meta>", {})
    for sMeta in dMetaArcs:
        # "*" accepts any type; with "¬" the type must be absent from <sMeta>, else it must be present
        if sMeta == "*" or (dToken["sType"] in sMeta) != ("¬" in sMeta):
            yield _follow(dMetaArcs, sMeta, " MATCH: *" + sMeta)
def parse (self, dGraph, dPriority, sCountry="${country_default}", dOptions=None, bShowRuleId=False, bDebug=False, bContext=False):
"parse tokens from the text and execute actions encountered"
dOpt = _dOptions if not dOptions else dOptions
lPointer = []
bTagAndRewrite = False
for iToken, dToken in enumerate(self.lToken):
if bDebug:
print("TOKEN:", dToken["sValue"])
# check arcs for each existing pointer
lNextPointer = []
for dPointer in lPointer:
lNextPointer.extend(self._getNextPointers(dToken, dGraph, dPointer["iNode1"], dPointer["dNode"], bDebug))
lPointer = lNextPointer
# check arcs of first nodes
lPointer.extend(self._getNextPointers(dToken, dGraph, iToken, dGraph[0], bDebug))
# check if there is rules to check for each pointer
for dPointer in lPointer:
#if bDebug:
# print("+", dPointer)
if "<rules>" in dPointer["dNode"]:
bChange = self._executeActions(dGraph, dPointer["dNode"]["<rules>"], dPointer["iNode1"]-1, iToken, dPriority, dOpt, sCountry, bShowRuleId, bDebug, bContext)
if bChange:
|
|
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
|
|
|
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
|
dToken["lMorph"] = self.dTokenPos[dToken["nStart"]]["lMorph"]
self.lToken = lNewToken
self.dTokenPos = { dToken["nStart"]: dToken for dToken in self.lToken if dToken["sType"] != "INFO" }
if bDebug:
print("UPDATE:")
print(self)
def _getNextPointers (self, dToken, dGraph, iNode1, dNode, bKeep=False, bDebug=False):
    """generator: yield one pointer for each arc of <dNode> that <dToken> matches

    Arguments:
        dToken: token dictionary; "sValue" and "sType" are read, "lMorph" and "tags" are used if present
        dGraph: graph of nodes, indexed by node identifier
        iNode1: value copied unchanged in the "iNode1" field of every yielded pointer
        dNode:  node whose arcs are tested against the token
        bKeep:  if True and no arc matches, a pointer staying on <dNode> is yielded (flagged "bKeep")
        bDebug: if True, each match is printed before its pointer is yielded

    Arcs are tested in this order: the token value itself, its lowercased / capitalized
    variants, the <re_value> arcs, the <lemmas> and <re_morph> arcs (only for tokens whose
    type starts with "WORD"), the <tags> arcs, the <meta> arcs, then the jump arc "<>".

    In <re_value> and <re_morph> keys, the text after "¬" is an anti-pattern: when it matches,
    the arc is rejected. For <re_morph>, the anti-pattern "*" means that every morphology
    of the token must match the pattern.

    Yields dictionaries { "iNode1": ..., "dNode": ... }, with "bKeep": True added for kept pointers.
    """
    bTokenFound = False
    # token value
    if dToken["sValue"] in dNode:
        if bDebug:
            print(" MATCH:", dToken["sValue"])
        yield { "iNode1": iNode1, "dNode": dGraph[dNode[dToken["sValue"]]] }
        bTokenFound = True
    if dToken["sValue"][0:2].istitle(): # we test only 2 first chars, to make valid words such as "Laissez-les", "Passe-partout".
        sValue = dToken["sValue"].lower()
        if sValue in dNode:
            if bDebug:
                print(" MATCH:", sValue)
            yield { "iNode1": iNode1, "dNode": dGraph[dNode[sValue]] }
            bTokenFound = True
    elif dToken["sValue"].isupper():
        sValue = dToken["sValue"].lower()
        if sValue in dNode:
            if bDebug:
                print(" MATCH:", sValue)
            yield { "iNode1": iNode1, "dNode": dGraph[dNode[sValue]] }
            bTokenFound = True
        sValue = dToken["sValue"].capitalize()
        if sValue in dNode:
            if bDebug:
                print(" MATCH:", sValue)
            yield { "iNode1": iNode1, "dNode": dGraph[dNode[sValue]] }
            bTokenFound = True
    # regex value arcs
    if "<re_value>" in dNode:
        for sRegex in dNode["<re_value>"]:
            if "¬" not in sRegex:
                # no anti-pattern
                if re.search(sRegex, dToken["sValue"]):
                    if bDebug:
                        print(" MATCH: ~" + sRegex)
                    yield { "iNode1": iNode1, "dNode": dGraph[dNode["<re_value>"][sRegex]] }
                    bTokenFound = True
            else:
                # there is an anti-pattern
                sPattern, sNegPattern = sRegex.split("¬", 1)
                if sNegPattern and re.search(sNegPattern, dToken["sValue"]):
                    continue
                if not sPattern or re.search(sPattern, dToken["sValue"]):
                    if bDebug:
                        print(" MATCH: ~" + sRegex)
                    yield { "iNode1": iNode1, "dNode": dGraph[dNode["<re_value>"][sRegex]] }
                    bTokenFound = True
    # analysable tokens
    if dToken["sType"][0:4] == "WORD":
        # token lemmas
        if "<lemmas>" in dNode:
            for sLemma in _oSpellChecker.getLemma(dToken["sValue"]):
                if sLemma in dNode["<lemmas>"]:
                    if bDebug:
                        print(" MATCH: >" + sLemma)
                    yield { "iNode1": iNode1, "dNode": dGraph[dNode["<lemmas>"][sLemma]] }
                    bTokenFound = True
        # regex morph arcs
        if "<re_morph>" in dNode:
            # morphologies are fetched once for all arcs; the spell checker is queried
            # only when the token does not already carry them
            if "lMorph" in dToken:
                lMorph = dToken["lMorph"]
            else:
                lMorph = _oSpellChecker.getMorph(dToken["sValue"])
            for sRegex in dNode["<re_morph>"]:
                if "¬" not in sRegex:
                    # no anti-pattern
                    if any(re.search(sRegex, sMorph)  for sMorph in lMorph):
                        if bDebug:
                            print(" MATCH: @" + sRegex)
                        yield { "iNode1": iNode1, "dNode": dGraph[dNode["<re_morph>"][sRegex]] }
                        bTokenFound = True
                else:
                    # there is an anti-pattern
                    sPattern, sNegPattern = sRegex.split("¬", 1)
                    if sNegPattern == "*":
                        # all morphologies must match with <sPattern>
                        if sPattern:
                            if lMorph and all(re.search(sPattern, sMorph)  for sMorph in lMorph):
                                if bDebug:
                                    print(" MATCH: @" + sRegex)
                                yield { "iNode1": iNode1, "dNode": dGraph[dNode["<re_morph>"][sRegex]] }
                                bTokenFound = True
                    else:
                        if sNegPattern and any(re.search(sNegPattern, sMorph)  for sMorph in lMorph):
                            continue
                        if not sPattern or any(re.search(sPattern, sMorph)  for sMorph in lMorph):
                            if bDebug:
                                print(" MATCH: @" + sRegex)
                            yield { "iNode1": iNode1, "dNode": dGraph[dNode["<re_morph>"][sRegex]] }
                            bTokenFound = True
    # token tags
    if "tags" in dToken and "<tags>" in dNode:
        for sTag in dToken["tags"]:
            if sTag in dNode["<tags>"]:
                if bDebug:
                    print(" MATCH: /" + sTag)
                yield { "iNode1": iNode1, "dNode": dGraph[dNode["<tags>"][sTag]] }
                bTokenFound = True
    # meta arc (for token type)
    if "<meta>" in dNode:
        for sMeta in dNode["<meta>"]:
            # no regex here, we just search if <dToken["sType"]> exists within <sMeta>
            if sMeta == "*":
                if bDebug:
                    print(" MATCH: *" + sMeta)
                yield { "iNode1": iNode1, "dNode": dGraph[dNode["<meta>"]["*"]] }
                bTokenFound = True
            elif "¬" in sMeta:
                if dToken["sType"] not in sMeta:
                    if bDebug:
                        print(" MATCH: *" + sMeta)
                    yield { "iNode1": iNode1, "dNode": dGraph[dNode["<meta>"][sMeta]] }
                    bTokenFound = True
            elif dToken["sType"] in sMeta:
                if bDebug:
                    print(" MATCH: *" + sMeta)
                yield { "iNode1": iNode1, "dNode": dGraph[dNode["<meta>"][sMeta]] }
                bTokenFound = True
    # nothing matched: the pointer stays on this node if requested
    if bKeep and not bTokenFound:
        yield { "iNode1": iNode1, "dNode": dNode, "bKeep": True }
    # JUMP
    # Warning! Recursion!
    if "<>" in dNode:
        # the method is already bound: <self> must not be passed again,
        # otherwise every argument is shifted and the call raises a TypeError
        yield from self._getNextPointers(dToken, dGraph, iNode1, dGraph[dNode["<>"]], True, bDebug)
def parse (self, dGraph, dPriority, sCountry="${country_default}", dOptions=None, bShowRuleId=False, bDebug=False, bContext=False):
"parse tokens from the text and execute actions encountered"
dOpt = _dOptions if not dOptions else dOptions
lPointer = []
bTagAndRewrite = False
for iToken, dToken in enumerate(self.lToken):
if bDebug:
print("TOKEN:", dToken["sValue"])
# check arcs for each existing pointer
lNextPointer = []
for dPointer in lPointer:
lNextPointer.extend(self._getNextPointers(dToken, dGraph, dPointer["iNode1"], dPointer["dNode"], dPointer.get("bKeep", False), bDebug))
lPointer = lNextPointer
# check arcs of first nodes
lPointer.extend(self._getNextPointers(dToken, dGraph, iToken, dGraph[0], False, bDebug))
# check if there is rules to check for each pointer
for dPointer in lPointer:
#if bDebug:
# print("+", dPointer)
if "<rules>" in dPointer["dNode"]:
bChange = self._executeActions(dGraph, dPointer["dNode"]["<rules>"], dPointer["iNode1"]-1, iToken, dPriority, dOpt, sCountry, bShowRuleId, bDebug, bContext)
if bChange:
|