338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
|
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
|
-
+
-
+
-
+
-
+
|
sText = self.parseGraph(_rules_graph.dAllGraph[sGraphName], sCountry, dOptions, bShowRuleId, bDebug, bContext)
elif not sOption or dOptions.get(sOption, False):
# regex rules
for zRegex, bUppercase, sLineId, sRuleId, nPriority, lActions in lRuleGroup:
if sRuleId not in _aIgnoredRules:
for m in zRegex.finditer(sText):
bCondMemo = None
for sFuncCond, cActionType, sWhat, *eAct in lActions:
for sFuncCond, cActionType, sAction, *eAct in lActions:
# action in lActions: [ condition, action type, replacement/suggestion/action[, iGroup[, message, URL]] ]
try:
bCondMemo = not sFuncCond or getattr(gc_functions, sFuncCond)(sText, sText0, m, self.dTokenPos, sCountry, bCondMemo)
if bCondMemo:
if bDebug:
echo("RULE: " + sLineId)
if cActionType == "-":
# grammar error
nErrorStart = nOffset + m.start(eAct[0])
if nErrorStart not in self.dError or nPriority > self.dErrorPriority.get(nErrorStart, -1):
self.dError[nErrorStart] = self._createErrorFromRegex(sText, sText0, sWhat, nOffset, m, eAct[0], sLineId, sRuleId, bUppercase, eAct[1], eAct[2], bShowRuleId, sOption, bContext)
self.dError[nErrorStart] = self._createErrorFromRegex(sText, sText0, sAction, nOffset, m, eAct[0], sLineId, sRuleId, bUppercase, eAct[1], eAct[2], bShowRuleId, sOption, bContext)
self.dErrorPriority[nErrorStart] = nPriority
self.dSentenceError[nErrorStart] = self.dError[nErrorStart]
elif cActionType == "~":
# text processor
sText = self.rewriteText(sText, sWhat, eAct[0], m, bUppercase)
sText = self.rewriteText(sText, sAction, eAct[0], m, bUppercase)
bChange = True
if bDebug:
echo("~ " + sText + " -- " + m.group(eAct[0]) + " # " + sLineId)
elif cActionType == "=":
# disambiguation
if not bParagraph:
getattr(gc_functions, sWhat)(sText, m, self.dTokenPos)
getattr(gc_functions, sAction)(sText, m, self.dTokenPos)
if bDebug:
echo("= " + m.group(0) + " # " + sLineId)
elif cActionType == ">":
# we do nothing, this test is just a condition to apply all following actions
pass
else:
echo("# error: unknown action at " + sLineId)
|
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
|
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
|
-
+
-
+
-
-
+
+
-
+
-
-
-
+
+
+
-
-
-
-
+
+
+
+
-
-
+
+
-
-
-
+
+
+
-
+
|
yield ("*", sMeta, dNode["<meta>"][sMeta])
bTokenFound = True
if not bTokenFound and bKeep:
yield (None, "", -1)
# JUMP
# Warning! Recursion!
if "<>" in dNode:
yield from self._getMatches(dGraph, dToken, dGraph[dNode["<>"]], bKeep=True)
yield from self._getNextNodes(dGraph, dToken, dGraph[dNode["<>"]], bKeep=True)
def parseGraph (self, dGraph, sCountry="${country_default}", dOptions=None, bShowRuleId=False, bDebug=False, bContext=False):
"parse graph with tokens from the text and execute actions encountered"
lPointer = []
lPointers = []
bTagAndRewrite = False
for iToken, dToken in enumerate(self.lTokens):
if bDebug:
echo("TOKEN: " + dToken["sValue"])
# check arcs for each existing pointer
lNextPointer = []
for dPointer in lPointer:
lNextPointers = []
for dPointer in lPointers:
if dPointer["nMultiEnd"] != -1:
if dToken["i"] <= dPointer["nMultiEnd"]:
lNextPointer.append(dPointer)
lNextPointers.append(dPointer)
if dToken["i"] != dPointer["nMultiEnd"]:
continue
for cActionType, sMatch, iNode in self._getMatches(dGraph, dToken, dGraph[dPointer["iNode"]]):
if cActionType is None:
lNextPointer.append(dPointer)
for cNodeType, sMatch, iNode in self._getNextNodes(dGraph, dToken, dGraph[dPointer["iNode"]]):
if cNodeType is None:
lNextPointers.append(dPointer)
continue
if bDebug:
echo(" MATCH: " + cActionType + sMatch)
nMultiEnd = -1 if cActionType != "&" else dToken["nMultiStartTo"]
lNextPointer.append({ "iToken1": dPointer["iToken1"], "iNode": iNode, "nMultiEnd": nMultiEnd })
lPointer = lNextPointer
echo(" MATCH: " + cNodeType + sMatch)
nMultiEnd = -1 if cNodeType != "&" else dToken["nMultiStartTo"]
lNextPointers.append({ "iToken1": dPointer["iToken1"], "iNode": iNode, "nMultiEnd": nMultiEnd })
lPointers = lNextPointers
# check arcs of first nodes
for cActionType, sMatch, iNode in self._getMatches(dGraph, dToken, dGraph[0]):
if cActionType is None:
for cNodeType, sMatch, iNode in self._getNextNodes(dGraph, dToken, dGraph[0]):
if cNodeType is None:
continue
if bDebug:
echo(" MATCH: " + cActionType + sMatch)
nMultiEnd = -1 if cActionType != "&" else dToken["nMultiStartTo"]
lPointer.append({ "iToken1": iToken, "iNode": iNode, "nMultiEnd": nMultiEnd })
echo(" MATCH: " + cNodeType + sMatch)
nMultiEnd = -1 if cNodeType != "&" else dToken["nMultiStartTo"]
lPointers.append({ "iToken1": iToken, "iNode": iNode, "nMultiEnd": nMultiEnd })
# check if there is rules to check for each pointer
for dPointer in lPointer:
for dPointer in lPointers:
if dPointer["nMultiEnd"] != -1:
if dToken["i"] < dPointer["nMultiEnd"]:
continue
if dToken["i"] == dPointer["nMultiEnd"]:
dPointer["nMultiEnd"] = -1
if "<rules>" in dGraph[dPointer["iNode"]]:
bChange = self._executeActions(dGraph, dGraph[dPointer["iNode"]]["<rules>"], dPointer["iToken1"]-1, iToken, dOptions, sCountry, bShowRuleId, bDebug, bContext)
|
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
|
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
|
-
+
-
+
-
+
-
+
-
+
-
+
-
+
-
+
-
-
+
+
-
+
-
+
|
bChange = False
for sLineId, nextNodeKey in dNode.items():
bCondMemo = None
for sRuleId in dGraph[nextNodeKey]:
try:
if bDebug:
echo(" >TRY: " + sRuleId + " " + sLineId)
_, sOption, sFuncCond, cActionType, sWhat, *eAct = _rules_graph.dRule[sRuleId]
_, sOption, sFuncCond, cActionType, sAction, *eAct = _rules_graph.dRule[sRuleId]
# Suggestion [ option, condition, "-", replacement/suggestion/action, iTokenStart, iTokenEnd, cStartLimit, cEndLimit, bCaseSvty, nPriority, sMessage, iURL ]
# TextProcessor [ option, condition, "~", replacement/suggestion/action, iTokenStart, iTokenEnd, bCaseSvty ]
# Disambiguator [ option, condition, "=", replacement/suggestion/action ]
# Tag [ option, condition, "/", replacement/suggestion/action, iTokenStart, iTokenEnd ]
# Immunity [ option, condition, "!", option, iTokenStart, iTokenEnd ]
# Multi-token [ option, condition, "&", morphologies, iTokenStart, iTokenEnd ]
# Test [ option, condition, ">", "" ]
if not sOption or dOptions.get(sOption, False):
bCondMemo = not sFuncCond or getattr(gc_functions, sFuncCond)(self.lTokens, nTokenOffset, nLastToken, sCountry, bCondMemo, self.dTags, self.sSentence, self.sSentence0)
if bCondMemo:
if cActionType == "-":
# grammar error
iTokenStart, iTokenEnd, cStartLimit, cEndLimit, bCaseSvty, nPriority, sMessage, iURL = eAct
nTokenErrorStart = nTokenOffset + iTokenStart if iTokenStart > 0 else nLastToken + iTokenStart
if "sImmunity" not in self.lTokens[nTokenErrorStart] or (self.lTokens[nTokenErrorStart]["sImmunity"] != "*" and sOption not in self.lTokens[nTokenErrorStart]["sImmunity"]):
nTokenErrorEnd = nTokenOffset + iTokenEnd if iTokenEnd > 0 else nLastToken + iTokenEnd
nErrorStart = self.nOffsetWithinParagraph + (self.lTokens[nTokenErrorStart]["nStart"] if cStartLimit == "<" else self.lTokens[nTokenErrorStart]["nEnd"])
nErrorEnd = self.nOffsetWithinParagraph + (self.lTokens[nTokenErrorEnd]["nEnd"] if cEndLimit == ">" else self.lTokens[nTokenErrorEnd]["nStart"])
if nErrorStart not in self.dError or nPriority > self.dErrorPriority.get(nErrorStart, -1):
self.dError[nErrorStart] = self._createErrorFromTokens(sWhat, nTokenOffset, nLastToken, nTokenErrorStart, nErrorStart, nErrorEnd, sLineId, sRuleId, bCaseSvty, \
self.dError[nErrorStart] = self._createErrorFromTokens(sAction, nTokenOffset, nLastToken, nTokenErrorStart, nErrorStart, nErrorEnd, sLineId, sRuleId, bCaseSvty, \
sMessage, _rules_graph.dURL.get(iURL, ""), bShowRuleId, sOption, bContext)
self.dErrorPriority[nErrorStart] = nPriority
self.dSentenceError[nErrorStart] = self.dError[nErrorStart]
if bDebug:
echo(" NEW_ERROR: {}".format(self.dError[nErrorStart]))
elif cActionType == "~":
# text processor
nTokenStart = nTokenOffset + eAct[0] if eAct[0] > 0 else nLastToken + eAct[0]
nTokenEnd = nTokenOffset + eAct[1] if eAct[1] > 0 else nLastToken + eAct[1]
self._tagAndPrepareTokenForRewriting(sWhat, nTokenStart, nTokenEnd, nTokenOffset, nLastToken, eAct[2], bDebug)
self._tagAndPrepareTokenForRewriting(sAction, nTokenStart, nTokenEnd, nTokenOffset, nLastToken, eAct[2], bDebug)
bChange = True
if bDebug:
echo(" TEXT_PROCESSOR: [{}:{}] > {}".format(self.lTokens[nTokenStart]["sValue"], self.lTokens[nTokenEnd]["sValue"], sWhat))
echo(" TEXT_PROCESSOR: [{}:{}] > {}".format(self.lTokens[nTokenStart]["sValue"], self.lTokens[nTokenEnd]["sValue"], sAction))
elif cActionType == "=":
# disambiguation
getattr(gc_functions, sWhat)(self.lTokens, nTokenOffset, nLastToken)
getattr(gc_functions, sAction)(self.lTokens, nTokenOffset, nLastToken)
if bDebug:
echo(" DISAMBIGUATOR: ({}) [{}:{}]".format(sWhat, self.lTokens[nTokenOffset+1]["sValue"], self.lTokens[nLastToken]["sValue"]))
echo(" DISAMBIGUATOR: ({}) [{}:{}]".format(sAction, self.lTokens[nTokenOffset+1]["sValue"], self.lTokens[nLastToken]["sValue"]))
elif cActionType == ">":
# we do nothing, this test is just a condition to apply all following actions
if bDebug:
echo(" COND_OK")
elif cActionType == "/":
# Tag
nTokenStart = nTokenOffset + eAct[0] if eAct[0] > 0 else nLastToken + eAct[0]
nTokenEnd = nTokenOffset + eAct[1] if eAct[1] > 0 else nLastToken + eAct[1]
for i in range(nTokenStart, nTokenEnd+1):
if "aTags" in self.lTokens[i]:
self.lTokens[i]["aTags"].update(sWhat.split("|"))
self.lTokens[i]["aTags"].update(sAction.split("|"))
else:
self.lTokens[i]["aTags"] = set(sWhat.split("|"))
self.lTokens[i]["aTags"] = set(sAction.split("|"))
if bDebug:
echo(" TAG: {} > [{}:{}]".format(sWhat, self.lTokens[nTokenStart]["sValue"], self.lTokens[nTokenEnd]["sValue"]))
for sTag in sWhat.split("|"):
echo(" TAG: {} > [{}:{}]".format(sAction, self.lTokens[nTokenStart]["sValue"], self.lTokens[nTokenEnd]["sValue"]))
for sTag in sAction.split("|"):
if sTag not in self.dTags:
self.dTags[sTag] = [nTokenStart, nTokenEnd]
else:
self.dTags[sTag][0] = min(nTokenStart, self.dTags[sTag][0])
self.dTags[sTag][1] = max(nTokenEnd, self.dTags[sTag][1])
elif cActionType == "!":
# immunity
if bDebug:
echo(" IMMUNITY: " + sLineId + " / " + sRuleId)
nTokenStart = nTokenOffset + eAct[0] if eAct[0] > 0 else nLastToken + eAct[0]
nTokenEnd = nTokenOffset + eAct[1] if eAct[1] > 0 else nLastToken + eAct[1]
sImmunity = sWhat or "*"
sImmunity = sAction or "*"
if nTokenEnd - nTokenStart == 0:
self.lTokens[nTokenStart]["sImmunity"] = sImmunity
nErrorStart = self.nOffsetWithinParagraph + self.lTokens[nTokenStart]["nStart"]
if nErrorStart in self.dError:
del self.dError[nErrorStart]
else:
for i in range(nTokenStart, nTokenEnd+1):
self.lTokens[i]["sImmunity"] = sImmunity
nErrorStart = self.nOffsetWithinParagraph + self.lTokens[i]["nStart"]
if nErrorStart in self.dError:
del self.dError[nErrorStart]
elif cActionType == "&":
# multi-tokens
nTokenStart = nTokenOffset + eAct[0] if eAct[0] > 0 else nLastToken + eAct[0]
nTokenEnd = nTokenOffset + eAct[1] if eAct[1] > 0 else nLastToken + eAct[1]
dMultiToken = {
"nTokenStart": nTokenStart,
"nTokenEnd": nTokenEnd,
"lTokens": self.lTokens[nTokenStart:nTokenEnd+1],
"lMorph": sWhat.split("|") if sWhat else [":HM"]
"lMorph": sAction.split("|") if sAction else [":HM"]
}
self.lTokens[nTokenStart]["nMultiStartTo"] = nTokenEnd
self.lTokens[nTokenEnd]["nMultiEndFrom"] = nTokenStart
self.lTokens[nTokenStart]["dMultiToken"] = dMultiToken
self.lTokens[nTokenEnd]["dMultiToken"] = dMultiToken
print(dMultiToken)
else:
|
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
|
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
|
-
+
-
+
-
+
-
-
+
+
-
-
+
+
-
-
+
+
-
+
-
-
+
+
-
+
|
if bUppercase and m.group(iGroup)[0:1].isupper():
sNew = sNew.capitalize()
else:
sNew = m.expand(sRepl)
sNew = sNew + " " * (nLen-len(sNew))
return sText[0:m.start(iGroup)] + sNew + sText[m.end(iGroup):]
def _tagAndPrepareTokenForRewriting (self, sWhat, nTokenRewriteStart, nTokenRewriteEnd, nTokenOffset, nLastToken, bCaseSvty, bDebug):
def _tagAndPrepareTokenForRewriting (self, sAction, nTokenRewriteStart, nTokenRewriteEnd, nTokenOffset, nLastToken, bCaseSvty, bDebug):
"text processor: rewrite tokens between <nTokenRewriteStart> and <nTokenRewriteEnd> position"
if sWhat == "*":
if sAction == "*":
# purge text
if nTokenRewriteEnd - nTokenRewriteStart == 0:
self.lTokens[nTokenRewriteStart]["bToRemove"] = True
else:
for i in range(nTokenRewriteStart, nTokenRewriteEnd+1):
self.lTokens[i]["bToRemove"] = True
elif sWhat == "␣":
elif sAction == "␣":
# merge tokens
self.lTokens[nTokenRewriteStart]["nMergeUntil"] = nTokenRewriteEnd
elif sWhat.startswith("␣"):
sWhat = self._expand(sWhat, nTokenOffset, nLastToken)
elif sAction.startswith("␣"):
sAction = self._expand(sAction, nTokenOffset, nLastToken)
self.lTokens[nTokenRewriteStart]["nMergeUntil"] = nTokenRewriteEnd
self.lTokens[nTokenRewriteStart]["sMergedValue"] = sWhat[1:]
elif sWhat == "_":
self.lTokens[nTokenRewriteStart]["sMergedValue"] = sAction[1:]
elif sAction == "_":
# neutralized token
if nTokenRewriteEnd - nTokenRewriteStart == 0:
self.lTokens[nTokenRewriteStart]["sNewValue"] = "_"
else:
for i in range(nTokenRewriteStart, nTokenRewriteEnd+1):
self.lTokens[i]["sNewValue"] = "_"
else:
if sWhat.startswith("="):
sWhat = getattr(gc_functions, sWhat[1:])(self.lTokens, nTokenOffset, nLastToken)
if sAction.startswith("="):
sAction = getattr(gc_functions, sAction[1:])(self.lTokens, nTokenOffset, nLastToken)
else:
sWhat = self._expand(sWhat, nTokenOffset, nLastToken)
sAction = self._expand(sAction, nTokenOffset, nLastToken)
bUppercase = bCaseSvty and self.lTokens[nTokenRewriteStart]["sValue"][0:1].isupper()
if nTokenRewriteEnd - nTokenRewriteStart == 0:
# one token
if bUppercase:
sWhat = sWhat[0:1].upper() + sWhat[1:]
self.lTokens[nTokenRewriteStart]["sNewValue"] = sWhat
sAction = sAction[0:1].upper() + sAction[1:]
self.lTokens[nTokenRewriteStart]["sNewValue"] = sAction
else:
# several tokens
lTokenValue = sWhat.split("|")
lTokenValue = sAction.split("|")
if len(lTokenValue) != (nTokenRewriteEnd - nTokenRewriteStart + 1):
if bDebug:
echo("Error. Text processor: number of replacements != number of tokens.")
return
for i, sValue in zip(range(nTokenRewriteStart, nTokenRewriteEnd+1), lTokenValue):
if not sValue or sValue == "*":
self.lTokens[i]["bToRemove"] = True
|