338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
|
sText = self.parseGraph(_rules_graph.dAllGraph[sGraphName], sCountry, dOptions, bShowRuleId, bDebug, bContext)
elif not sOption or dOptions.get(sOption, False):
# regex rules
for zRegex, bUppercase, sLineId, sRuleId, nPriority, lActions in lRuleGroup:
if sRuleId not in _aIgnoredRules:
for m in zRegex.finditer(sText):
bCondMemo = None
for sFuncCond, cActionType, sWhat, *eAct in lActions:
# action in lActions: [ condition, action type, replacement/suggestion/action[, iGroup[, message, URL]] ]
try:
bCondMemo = not sFuncCond or getattr(gc_functions, sFuncCond)(sText, sText0, m, self.dTokenPos, sCountry, bCondMemo)
if bCondMemo:
if bDebug:
echo("RULE: " + sLineId)
if cActionType == "-":
# grammar error
nErrorStart = nOffset + m.start(eAct[0])
if nErrorStart not in self.dError or nPriority > self.dErrorPriority.get(nErrorStart, -1):
self.dError[nErrorStart] = self._createErrorFromRegex(sText, sText0, sWhat, nOffset, m, eAct[0], sLineId, sRuleId, bUppercase, eAct[1], eAct[2], bShowRuleId, sOption, bContext)
self.dErrorPriority[nErrorStart] = nPriority
self.dSentenceError[nErrorStart] = self.dError[nErrorStart]
elif cActionType == "~":
# text processor
sText = self.rewriteText(sText, sWhat, eAct[0], m, bUppercase)
bChange = True
if bDebug:
echo("~ " + sText + " -- " + m.group(eAct[0]) + " # " + sLineId)
elif cActionType == "=":
# disambiguation
if not bParagraph:
getattr(gc_functions, sWhat)(sText, m, self.dTokenPos)
if bDebug:
echo("= " + m.group(0) + " # " + sLineId)
elif cActionType == ">":
# we do nothing, this test is just a condition to apply all following actions
pass
else:
echo("# error: unknown action at " + sLineId)
|
|
|
|
|
|
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
|
sText = self.parseGraph(_rules_graph.dAllGraph[sGraphName], sCountry, dOptions, bShowRuleId, bDebug, bContext)
elif not sOption or dOptions.get(sOption, False):
# regex rules
for zRegex, bUppercase, sLineId, sRuleId, nPriority, lActions in lRuleGroup:
if sRuleId not in _aIgnoredRules:
for m in zRegex.finditer(sText):
bCondMemo = None
for sFuncCond, cActionType, sAction, *eAct in lActions:
# action in lActions: [ condition, action type, replacement/suggestion/action[, iGroup[, message, URL]] ]
try:
bCondMemo = not sFuncCond or getattr(gc_functions, sFuncCond)(sText, sText0, m, self.dTokenPos, sCountry, bCondMemo)
if bCondMemo:
if bDebug:
echo("RULE: " + sLineId)
if cActionType == "-":
# grammar error
nErrorStart = nOffset + m.start(eAct[0])
if nErrorStart not in self.dError or nPriority > self.dErrorPriority.get(nErrorStart, -1):
self.dError[nErrorStart] = self._createErrorFromRegex(sText, sText0, sAction, nOffset, m, eAct[0], sLineId, sRuleId, bUppercase, eAct[1], eAct[2], bShowRuleId, sOption, bContext)
self.dErrorPriority[nErrorStart] = nPriority
self.dSentenceError[nErrorStart] = self.dError[nErrorStart]
elif cActionType == "~":
# text processor
sText = self.rewriteText(sText, sAction, eAct[0], m, bUppercase)
bChange = True
if bDebug:
echo("~ " + sText + " -- " + m.group(eAct[0]) + " # " + sLineId)
elif cActionType == "=":
# disambiguation
if not bParagraph:
getattr(gc_functions, sAction)(sText, m, self.dTokenPos)
if bDebug:
echo("= " + m.group(0) + " # " + sLineId)
elif cActionType == ">":
# we do nothing, this test is just a condition to apply all following actions
pass
else:
echo("# error: unknown action at " + sLineId)
|
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
|
yield ("*", sMeta, dNode["<meta>"][sMeta])
bTokenFound = True
if not bTokenFound and bKeep:
yield (None, "", -1)
# JUMP
# Warning! Recursion!
if "<>" in dNode:
yield from self._getMatches(dGraph, dToken, dGraph[dNode["<>"]], bKeep=True)
def parseGraph (self, dGraph, sCountry="${country_default}", dOptions=None, bShowRuleId=False, bDebug=False, bContext=False):
"parse graph with tokens from the text and execute actions encountered"
lPointer = []
bTagAndRewrite = False
for iToken, dToken in enumerate(self.lTokens):
if bDebug:
echo("TOKEN: " + dToken["sValue"])
# check arcs for each existing pointer
lNextPointer = []
for dPointer in lPointer:
for cActionType, sMatch, iNode in self._getMatches(dGraph, dToken, dGraph[dPointer["iNode"]]):
if cActionType is None:
lNextPointer.append(dPointer)
continue
if bDebug:
echo(" MATCH: " + cActionType + sMatch)
lNextPointer.append({ "iToken1": dPointer["iToken1"], "iNode": iNode })
lPointer = lNextPointer
# check arcs of first nodes
for cActionType, sMatch, iNode in self._getMatches(dGraph, dToken, dGraph[0]):
if cActionType is None:
continue
if bDebug:
echo(" MATCH: " + cActionType + sMatch)
lPointer.append({ "iToken1": iToken, "iNode": iNode })
# check if there is rules to check for each pointer
for dPointer in lPointer:
#if bDebug:
# echo("+", dPointer)
if "<rules>" in dGraph[dPointer["iNode"]]:
bChange = self._executeActions(dGraph, dGraph[dPointer["iNode"]]["<rules>"], dPointer["iToken1"]-1, iToken, dOptions, sCountry, bShowRuleId, bDebug, bContext)
if bChange:
bTagAndRewrite = True
if bTagAndRewrite:
self.rewriteFromTags(bDebug)
if bDebug:
echo(self)
return self.sSentence
def _executeActions (self, dGraph, dNode, nTokenOffset, nLastToken, dOptions, sCountry, bShowRuleId, bDebug, bContext):
"execute actions found in the DARG"
bChange = False
for sLineId, nextNodeKey in dNode.items():
bCondMemo = None
for sRuleId in dGraph[nextNodeKey]:
try:
if bDebug:
echo(" >TRY: " + sRuleId + " " + sLineId)
_, sOption, sFuncCond, cActionType, sWhat, *eAct = _rules_graph.dRule[sRuleId]
# Suggestion [ option, condition, "-", replacement/suggestion/action, iTokenStart, iTokenEnd, cStartLimit, cEndLimit, bCaseSvty, nPriority, sMessage, iURL ]
# TextProcessor [ option, condition, "~", replacement/suggestion/action, iTokenStart, iTokenEnd, bCaseSvty ]
# Disambiguator [ option, condition, "=", replacement/suggestion/action ]
# Tag [ option, condition, "/", replacement/suggestion/action, iTokenStart, iTokenEnd ]
# Immunity [ option, condition, "!", option, iTokenStart, iTokenEnd ]
# Test [ option, condition, ">", "" ]
if not sOption or dOptions.get(sOption, False):
bCondMemo = not sFuncCond or getattr(gc_functions, sFuncCond)(self.lTokens, nTokenOffset, nLastToken, sCountry, bCondMemo, self.dTags, self.sSentence, self.sSentence0)
if bCondMemo:
if cActionType == "-":
# grammar error
iTokenStart, iTokenEnd, cStartLimit, cEndLimit, bCaseSvty, nPriority, sMessage, iURL = eAct
nTokenErrorStart = nTokenOffset + iTokenStart if iTokenStart > 0 else nLastToken + iTokenStart
if "sImmunity" not in self.lTokens[nTokenErrorStart] or (self.lTokens[nTokenErrorStart]["sImmunity"] != "*" and sOption not in self.lTokens[nTokenErrorStart]["sImmunity"]):
nTokenErrorEnd = nTokenOffset + iTokenEnd if iTokenEnd > 0 else nLastToken + iTokenEnd
nErrorStart = self.nOffsetWithinParagraph + (self.lTokens[nTokenErrorStart]["nStart"] if cStartLimit == "<" else self.lTokens[nTokenErrorStart]["nEnd"])
nErrorEnd = self.nOffsetWithinParagraph + (self.lTokens[nTokenErrorEnd]["nEnd"] if cEndLimit == ">" else self.lTokens[nTokenErrorEnd]["nStart"])
if nErrorStart not in self.dError or nPriority > self.dErrorPriority.get(nErrorStart, -1):
self.dError[nErrorStart] = self._createErrorFromTokens(sWhat, nTokenOffset, nLastToken, nTokenErrorStart, nErrorStart, nErrorEnd, sLineId, sRuleId, bCaseSvty, \
sMessage, _rules_graph.dURL.get(iURL, ""), bShowRuleId, sOption, bContext)
self.dErrorPriority[nErrorStart] = nPriority
self.dSentenceError[nErrorStart] = self.dError[nErrorStart]
if bDebug:
echo(" NEW_ERROR: {}".format(self.dError[nErrorStart]))
elif cActionType == "~":
# text processor
nTokenStart = nTokenOffset + eAct[0] if eAct[0] > 0 else nLastToken + eAct[0]
nTokenEnd = nTokenOffset + eAct[1] if eAct[1] > 0 else nLastToken + eAct[1]
self._tagAndPrepareTokenForRewriting(sWhat, nTokenStart, nTokenEnd, nTokenOffset, nLastToken, eAct[2], bDebug)
bChange = True
if bDebug:
echo(" TEXT_PROCESSOR: [{}:{}] > {}".format(self.lTokens[nTokenStart]["sValue"], self.lTokens[nTokenEnd]["sValue"], sWhat))
elif cActionType == "=":
# disambiguation
getattr(gc_functions, sWhat)(self.lTokens, nTokenOffset, nLastToken)
if bDebug:
echo(" DISAMBIGUATOR: ({}) [{}:{}]".format(sWhat, self.lTokens[nTokenOffset+1]["sValue"], self.lTokens[nLastToken]["sValue"]))
elif cActionType == ">":
# we do nothing, this test is just a condition to apply all following actions
if bDebug:
echo(" COND_OK")
elif cActionType == "/":
# Tag
nTokenStart = nTokenOffset + eAct[0] if eAct[0] > 0 else nLastToken + eAct[0]
nTokenEnd = nTokenOffset + eAct[1] if eAct[1] > 0 else nLastToken + eAct[1]
for i in range(nTokenStart, nTokenEnd+1):
if "aTags" in self.lTokens[i]:
self.lTokens[i]["aTags"].update(sWhat.split("|"))
else:
self.lTokens[i]["aTags"] = set(sWhat.split("|"))
if bDebug:
echo(" TAG: {} > [{}:{}]".format(sWhat, self.lTokens[nTokenStart]["sValue"], self.lTokens[nTokenEnd]["sValue"]))
for sTag in sWhat.split("|"):
if sTag not in self.dTags:
self.dTags[sTag] = [nTokenStart, nTokenEnd]
else:
self.dTags[sTag][0] = min(nTokenStart, self.dTags[sTag][0])
self.dTags[sTag][1] = max(nTokenEnd, self.dTags[sTag][1])
elif cActionType == "!":
# immunity
if bDebug:
echo(" IMMUNITY: " + sLineId + " / " + sRuleId)
nTokenStart = nTokenOffset + eAct[0] if eAct[0] > 0 else nLastToken + eAct[0]
nTokenEnd = nTokenOffset + eAct[1] if eAct[1] > 0 else nLastToken + eAct[1]
sImmunity = sWhat or "*"
if nTokenEnd - nTokenStart == 0:
self.lTokens[nTokenStart]["sImmunity"] = sImmunity
nErrorStart = self.nOffsetWithinParagraph + self.lTokens[nTokenStart]["nStart"]
if nErrorStart in self.dError:
del self.dError[nErrorStart]
else:
for i in range(nTokenStart, nTokenEnd+1):
self.lTokens[i]["sImmunity"] = sImmunity
nErrorStart = self.nOffsetWithinParagraph + self.lTokens[i]["nStart"]
if nErrorStart in self.dError:
del self.dError[nErrorStart]
else:
echo("# error: unknown action at " + sLineId)
elif cActionType == ">":
if bDebug:
echo(" COND_BREAK")
break
except Exception as e:
|
|
|
|
|
>
>
>
>
>
|
|
|
|
>
|
|
|
|
|
>
|
|
|
>
>
>
|
|
>
|
|
|
|
|
|
|
|
|
|
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
|
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
|
yield ("*", sMeta, dNode["<meta>"][sMeta])
bTokenFound = True
if not bTokenFound and bKeep:
yield (None, "", -1)
# JUMP
# Warning! Recursion!
if "<>" in dNode:
yield from self._getNextNodes(dGraph, dToken, dGraph[dNode["<>"]], bKeep=True)
def parseGraph (self, dGraph, sCountry="${country_default}", dOptions=None, bShowRuleId=False, bDebug=False, bContext=False):
"parse graph with tokens from the text and execute actions encountered"
lPointers = []
bTagAndRewrite = False
for iToken, dToken in enumerate(self.lTokens):
if bDebug:
echo("TOKEN: " + dToken["sValue"])
# check arcs for each existing pointer
lNextPointers = []
for dPointer in lPointers:
if dPointer["nMultiEnd"] != -1:
if dToken["i"] <= dPointer["nMultiEnd"]:
lNextPointers.append(dPointer)
if dToken["i"] != dPointer["nMultiEnd"]:
continue
for cNodeType, sMatch, iNode in self._getNextNodes(dGraph, dToken, dGraph[dPointer["iNode"]]):
if cNodeType is None:
lNextPointers.append(dPointer)
continue
if bDebug:
echo(" MATCH: " + cNodeType + sMatch)
nMultiEnd = -1 if cNodeType != "&" else dToken["nMultiStartTo"]
lNextPointers.append({ "iToken1": dPointer["iToken1"], "iNode": iNode, "nMultiEnd": nMultiEnd })
lPointers = lNextPointers
# check arcs of first nodes
for cNodeType, sMatch, iNode in self._getNextNodes(dGraph, dToken, dGraph[0]):
if cNodeType is None:
continue
if bDebug:
echo(" MATCH: " + cNodeType + sMatch)
nMultiEnd = -1 if cNodeType != "&" else dToken["nMultiStartTo"]
lPointers.append({ "iToken1": iToken, "iNode": iNode, "nMultiEnd": nMultiEnd })
# check if there is rules to check for each pointer
for dPointer in lPointers:
if dPointer["nMultiEnd"] != -1:
if dToken["i"] < dPointer["nMultiEnd"]:
continue
if dToken["i"] == dPointer["nMultiEnd"]:
dPointer["nMultiEnd"] = -1
if "<rules>" in dGraph[dPointer["iNode"]]:
bChange = self._executeActions(dGraph, dGraph[dPointer["iNode"]]["<rules>"], dPointer["iToken1"]-1, iToken, dOptions, sCountry, bShowRuleId, bDebug, bContext)
if bChange:
bTagAndRewrite = True
if bTagAndRewrite:
self.rewriteFromTags(bDebug)
if bDebug:
echo(self)
return self.sSentence
def _executeActions (self, dGraph, dNode, nTokenOffset, nLastToken, dOptions, sCountry, bShowRuleId, bDebug, bContext):
"execute actions found in the DARG"
bChange = False
for sLineId, nextNodeKey in dNode.items():
bCondMemo = None
for sRuleId in dGraph[nextNodeKey]:
try:
if bDebug:
echo(" >TRY: " + sRuleId + " " + sLineId)
_, sOption, sFuncCond, cActionType, sAction, *eAct = _rules_graph.dRule[sRuleId]
# Suggestion [ option, condition, "-", replacement/suggestion/action, iTokenStart, iTokenEnd, cStartLimit, cEndLimit, bCaseSvty, nPriority, sMessage, iURL ]
# TextProcessor [ option, condition, "~", replacement/suggestion/action, iTokenStart, iTokenEnd, bCaseSvty ]
# Disambiguator [ option, condition, "=", replacement/suggestion/action ]
# Tag [ option, condition, "/", replacement/suggestion/action, iTokenStart, iTokenEnd ]
# Immunity [ option, condition, "!", option, iTokenStart, iTokenEnd ]
# Multi-token [ option, condition, "&", morphologies, iTokenStart, iTokenEnd ]
# Test [ option, condition, ">", "" ]
if not sOption or dOptions.get(sOption, False):
bCondMemo = not sFuncCond or getattr(gc_functions, sFuncCond)(self.lTokens, nTokenOffset, nLastToken, sCountry, bCondMemo, self.dTags, self.sSentence, self.sSentence0)
if bCondMemo:
if cActionType == "-":
# grammar error
iTokenStart, iTokenEnd, cStartLimit, cEndLimit, bCaseSvty, nPriority, sMessage, iURL = eAct
nTokenErrorStart = nTokenOffset + iTokenStart if iTokenStart > 0 else nLastToken + iTokenStart
if "sImmunity" not in self.lTokens[nTokenErrorStart] or (self.lTokens[nTokenErrorStart]["sImmunity"] != "*" and sOption not in self.lTokens[nTokenErrorStart]["sImmunity"]):
nTokenErrorEnd = nTokenOffset + iTokenEnd if iTokenEnd > 0 else nLastToken + iTokenEnd
nErrorStart = self.nOffsetWithinParagraph + (self.lTokens[nTokenErrorStart]["nStart"] if cStartLimit == "<" else self.lTokens[nTokenErrorStart]["nEnd"])
nErrorEnd = self.nOffsetWithinParagraph + (self.lTokens[nTokenErrorEnd]["nEnd"] if cEndLimit == ">" else self.lTokens[nTokenErrorEnd]["nStart"])
if nErrorStart not in self.dError or nPriority > self.dErrorPriority.get(nErrorStart, -1):
self.dError[nErrorStart] = self._createErrorFromTokens(sAction, nTokenOffset, nLastToken, nTokenErrorStart, nErrorStart, nErrorEnd, sLineId, sRuleId, bCaseSvty, \
sMessage, _rules_graph.dURL.get(iURL, ""), bShowRuleId, sOption, bContext)
self.dErrorPriority[nErrorStart] = nPriority
self.dSentenceError[nErrorStart] = self.dError[nErrorStart]
if bDebug:
echo(" NEW_ERROR: {}".format(self.dError[nErrorStart]))
elif cActionType == "~":
# text processor
nTokenStart = nTokenOffset + eAct[0] if eAct[0] > 0 else nLastToken + eAct[0]
nTokenEnd = nTokenOffset + eAct[1] if eAct[1] > 0 else nLastToken + eAct[1]
self._tagAndPrepareTokenForRewriting(sAction, nTokenStart, nTokenEnd, nTokenOffset, nLastToken, eAct[2], bDebug)
bChange = True
if bDebug:
echo(" TEXT_PROCESSOR: [{}:{}] > {}".format(self.lTokens[nTokenStart]["sValue"], self.lTokens[nTokenEnd]["sValue"], sAction))
elif cActionType == "=":
# disambiguation
getattr(gc_functions, sAction)(self.lTokens, nTokenOffset, nLastToken)
if bDebug:
echo(" DISAMBIGUATOR: ({}) [{}:{}]".format(sAction, self.lTokens[nTokenOffset+1]["sValue"], self.lTokens[nLastToken]["sValue"]))
elif cActionType == ">":
# we do nothing, this test is just a condition to apply all following actions
if bDebug:
echo(" COND_OK")
elif cActionType == "/":
# Tag
nTokenStart = nTokenOffset + eAct[0] if eAct[0] > 0 else nLastToken + eAct[0]
nTokenEnd = nTokenOffset + eAct[1] if eAct[1] > 0 else nLastToken + eAct[1]
for i in range(nTokenStart, nTokenEnd+1):
if "aTags" in self.lTokens[i]:
self.lTokens[i]["aTags"].update(sAction.split("|"))
else:
self.lTokens[i]["aTags"] = set(sAction.split("|"))
if bDebug:
echo(" TAG: {} > [{}:{}]".format(sAction, self.lTokens[nTokenStart]["sValue"], self.lTokens[nTokenEnd]["sValue"]))
for sTag in sAction.split("|"):
if sTag not in self.dTags:
self.dTags[sTag] = [nTokenStart, nTokenEnd]
else:
self.dTags[sTag][0] = min(nTokenStart, self.dTags[sTag][0])
self.dTags[sTag][1] = max(nTokenEnd, self.dTags[sTag][1])
elif cActionType == "!":
# immunity
if bDebug:
echo(" IMMUNITY: " + sLineId + " / " + sRuleId)
nTokenStart = nTokenOffset + eAct[0] if eAct[0] > 0 else nLastToken + eAct[0]
nTokenEnd = nTokenOffset + eAct[1] if eAct[1] > 0 else nLastToken + eAct[1]
sImmunity = sAction or "*"
if nTokenEnd - nTokenStart == 0:
self.lTokens[nTokenStart]["sImmunity"] = sImmunity
nErrorStart = self.nOffsetWithinParagraph + self.lTokens[nTokenStart]["nStart"]
if nErrorStart in self.dError:
del self.dError[nErrorStart]
else:
for i in range(nTokenStart, nTokenEnd+1):
self.lTokens[i]["sImmunity"] = sImmunity
nErrorStart = self.nOffsetWithinParagraph + self.lTokens[i]["nStart"]
if nErrorStart in self.dError:
del self.dError[nErrorStart]
elif cActionType == "&":
# multi-tokens
nTokenStart = nTokenOffset + eAct[0] if eAct[0] > 0 else nLastToken + eAct[0]
nTokenEnd = nTokenOffset + eAct[1] if eAct[1] > 0 else nLastToken + eAct[1]
dMultiToken = {
"nTokenStart": nTokenStart,
"nTokenEnd": nTokenEnd,
"lTokens": self.lTokens[nTokenStart:nTokenEnd+1],
"lMorph": sAction.split("|") if sAction else [":HM"]
}
self.lTokens[nTokenStart]["nMultiStartTo"] = nTokenEnd
self.lTokens[nTokenEnd]["nMultiEndFrom"] = nTokenStart
self.lTokens[nTokenStart]["dMultiToken"] = dMultiToken
self.lTokens[nTokenEnd]["dMultiToken"] = dMultiToken
print(dMultiToken)
else:
echo("# error: unknown action at " + sLineId)
elif cActionType == ">":
if bDebug:
echo(" COND_BREAK")
break
except Exception as e:
|