156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
|
bParagraphChange = False
bSentenceChange = False
dTokenPos = oSentence.dTokenPos if oSentence else {}
for sOption, lRuleGroup in _getRules(bParagraph):
if sOption == "@@@@":
# graph rules
if not bParagraph and bSentenceChange:
oSentence.update(s)
bSentenceChange = False
for sGraphName, sLineId in lRuleGroup:
if bDebug:
print("\n>>>> GRAPH:", sGraphName, sLineId)
bParagraphChange, s = oSentence.parse(dAllGraph[sGraphName], dPriority, sCountry, dOptions, bShowRuleId, bDebug, bContext)
dErrs.update(oSentence.dError)
elif not sOption or dOptions.get(sOption, False):
# regex rules
for zRegex, bUppercase, sLineId, sRuleId, nPriority, lActions in lRuleGroup:
if sRuleId not in _aIgnoredRules:
for m in zRegex.finditer(s):
bCondMemo = None
for sFuncCond, cActionType, sWhat, *eAct in lActions:
# action in lActions: [ condition, action type, replacement/suggestion/action[, iGroup[, message, URL]] ]
try:
bCondMemo = not sFuncCond or globals()[sFuncCond](s, sx, m, dTokenPos, sCountry, bCondMemo)
if bCondMemo:
if cActionType == "-":
# grammar error
nErrorStart = nOffset + m.start(eAct[0])
if nErrorStart not in dErrs or nPriority > dPriority.get(nErrorStart, -1):
dErrs[nErrorStart] = _createError(s, sx, sWhat, nOffset, m, eAct[0], sLineId, sRuleId, bUppercase, eAct[1], eAct[2], bShowRuleId, sOption, bContext)
dPriority[nErrorStart] = nPriority
elif cActionType == "~":
|
|
>
>
>
|
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
|
bParagraphChange = False
bSentenceChange = False
dTokenPos = oSentence.dTokenPos if oSentence else {}
for sOption, lRuleGroup in _getRules(bParagraph):
if sOption == "@@@@":
# graph rules
if not bParagraph and bSentenceChange:
oSentence.update(s, bDebug)
bSentenceChange = False
for sGraphName, sLineId in lRuleGroup:
if bDebug:
print("\n>>>> GRAPH:", sGraphName, sLineId)
bParagraphChange, s = oSentence.parse(dAllGraph[sGraphName], dPriority, sCountry, dOptions, bShowRuleId, bDebug, bContext)
dErrs.update(oSentence.dError)
dTokenPos = oSentence.dTokenPos
elif not sOption or dOptions.get(sOption, False):
# regex rules
for zRegex, bUppercase, sLineId, sRuleId, nPriority, lActions in lRuleGroup:
if sRuleId not in _aIgnoredRules:
for m in zRegex.finditer(s):
bCondMemo = None
for sFuncCond, cActionType, sWhat, *eAct in lActions:
# action in lActions: [ condition, action type, replacement/suggestion/action[, iGroup[, message, URL]] ]
try:
bCondMemo = not sFuncCond or globals()[sFuncCond](s, sx, m, dTokenPos, sCountry, bCondMemo)
if bCondMemo:
if bDebug:
print("RULE:", sLineId)
if cActionType == "-":
# grammar error
nErrorStart = nOffset + m.start(eAct[0])
if nErrorStart not in dErrs or nPriority > dPriority.get(nErrorStart, -1):
dErrs[nErrorStart] = _createError(s, sx, sWhat, nOffset, m, eAct[0], sLineId, sRuleId, bUppercase, eAct[1], eAct[2], bShowRuleId, sOption, bContext)
dPriority[nErrorStart] = nPriority
elif cActionType == "~":
|
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
|
return any(zPattern.search(s) for s in lMorph)
def morphex (dTokenPos, tWord, sPattern, sNegPattern, bNoWord=False):
    """Analyse a tuple (position, word) against two morphology patterns (disambiguation on).

    The morphologies come from the token stored in <dTokenPos> at position <tWord[0]>
    when that token has an "lMorph" entry; otherwise they are requested with
    _oSpellChecker.getMorph(<tWord[1]>).

    Returns <bNoWord> when <tWord> is empty, False when no morphology is available or
    when any morphology matches <sNegPattern>, and otherwise True if at least one
    morphology matches <sPattern>.
    """
    if not tWord:
        return bNoWord
    lMorph = dTokenPos[tWord[0]]["lMorph"] if tWord[0] in dTokenPos and "lMorph" in dTokenPos[tWord[0]] else _oSpellChecker.getMorph(tWord[1])
    if not lMorph:
        return False
    # check negative condition: one forbidden morphology is enough to reject the word
    zNegPattern = re.compile(sNegPattern)
    if any(zNegPattern.search(s) for s in lMorph):
        return False
    # search sPattern
    zPattern = re.compile(sPattern)
    return any(zPattern.search(s) for s in lMorph)
|
<
<
<
<
<
|
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
|
return any(zPattern.search(s) for s in lMorph)
def morphex (dTokenPos, tWord, sPattern, sNegPattern, bNoWord=False):
    """Tell whether the word in <tWord> (position, word) has an accepted morphology (disambiguation on).

    Morphologies are read from the "lMorph" entry of the token indexed at <tWord[0]> in
    <dTokenPos> when there is one, else obtained via _oSpellChecker.getMorph(<tWord[1]>).

    Result: <bNoWord> for an empty <tWord>; False if there is no morphology or if one of
    them matches <sNegPattern>; True if one of them matches <sPattern>; False otherwise.
    """
    if not tWord:
        return bNoWord
    nPos = tWord[0]
    if nPos in dTokenPos and "lMorph" in dTokenPos[nPos]:
        lMorph = dTokenPos[nPos]["lMorph"]
    else:
        lMorph = _oSpellChecker.getMorph(tWord[1])
    if not lMorph:
        return False
    # negative condition first: any hit vetoes the word
    zExcluded = re.compile(sNegPattern)
    for sMorph in lMorph:
        if zExcluded.search(sMorph):
            return False
    # then look for at least one morphology matching the wanted pattern
    zWanted = re.compile(sPattern)
    for sMorph in lMorph:
        if zWanted.search(sMorph):
            return True
    return False
|
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
|
"Text parser"
def __init__ (self, sSentence, sSentence0, nOffset):
    """Store the sentence texts and offset, tokenize <sSentence> and index the tokens.

    <sSentence> is the text that gets tokenized; <sSentence0> is kept alongside it
    (presumably the untouched original text -- confirm against callers).
    <nOffset> is stored as the offset of the sentence within the paragraph.
    """
    self.sSentence = sSentence
    self.sSentence0 = sSentence0
    self.nOffsetWithinParagraph = nOffset
    self.lToken = list(_oTokenizer.genTokens(sSentence, True))
    # index of tokens by start position; a later token with the same start replaces an earlier one
    dIndex = {}
    for dToken in self.lToken:
        dIndex[dToken["nStart"]] = dToken
    self.dTokenPos = dIndex
    self.dTags = {}
    self.dError = {}
def __str__ (self):
    """Return a multi-line dump of the sentence for debugging.

    The dump holds the two sentence texts, then one tab-separated line per token of
    <self.lToken> (start, end, value, plus morphologies when the token has "lMorph"),
    then one line per entry of <self.dTokenPos>.
    """
    lChunk = [
        "sentence: " + self.sSentence0 + "\n",
        "now: " + self.sSentence + "\n",
    ]
    for dToken in self.lToken:
        sRow = f'{dToken["nStart"]}\t{dToken["nEnd"]}\t{dToken["sValue"]}'
        if "lMorph" in dToken:
            sRow += "\t" + str(dToken["lMorph"])
        lChunk.append(sRow + "\n")
    lChunk.extend(f"{nPos}\t{dToken}\n" for nPos, dToken in self.dTokenPos.items())
    return "".join(lChunk)
def update (self, sSentence):
    """Update <sSentence>, retokenize it and re-index the resulting tokens.

    The position index <self.dTokenPos> is rebuilt from the new tokens so that it never
    refers to token objects that are no longer in <self.lToken>. Morphologies ("lMorph")
    already attached to the previously indexed token starting at the same position are
    carried over to the new token.
    """
    self.sSentence = sSentence
    lNewToken = list(_oTokenizer.genTokens(sSentence, True))
    # keep the morphologies recorded on the old token that started at the same offset
    for dToken in lNewToken:
        dOldToken = self.dTokenPos.get(dToken["nStart"], {})
        if "lMorph" in dOldToken:
            dToken["lMorph"] = dOldToken["lMorph"]
    self.lToken = lNewToken
    # one entry per token keyed by its start position, as built at construction time
    self.dTokenPos = { dToken["nStart"]: dToken for dToken in self.lToken }
def _getNextMatchingNodes (self, dToken, dGraph, dNode, bDebug=False):
"generator: return nodes where <dToken> “values” match <dNode> arcs"
# token value
if dToken["sValue"] in dNode:
if bDebug:
print(" MATCH:", dToken["sValue"])
|
|
|
|
>
>
>
>
>
>
>
>
|
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
|
"Text parser"
def __init__ (self, sSentence, sSentence0, nOffset):
    """Store the sentence texts and offset, tokenize <sSentence> and index the tokens.

    <sSentence> is the text that gets tokenized; <sSentence0> is kept alongside it
    (presumably the untouched original text -- confirm against callers).
    <nOffset> is stored as the offset of the sentence within the paragraph.
    Tokens whose "sType" is "INFO" stay in <self.lToken> but are left out of the
    position index <self.dTokenPos>.
    """
    self.sSentence = sSentence
    self.sSentence0 = sSentence0
    self.nOffsetWithinParagraph = nOffset
    self.lToken = list(_oTokenizer.genTokens(sSentence, True))
    # index of non-INFO tokens by start position
    dIndex = {}
    for dToken in self.lToken:
        if dToken["sType"] != "INFO":
            dIndex[dToken["nStart"]] = dToken
    self.dTokenPos = dIndex
    self.dTags = {}
    self.dError = {}
def __str__ (self):
    """Return a multi-line dump of the sentence for debugging.

    Layout: the two sentence texts, one tab-separated line per token of <self.lToken>
    (start, end, value, and the morphologies when "lMorph" is present), then one line
    per (position, token) pair of <self.dTokenPos>.
    """
    lOut = []
    lOut.append("sentence: " + self.sSentence0 + "\n")
    lOut.append("now: " + self.sSentence + "\n")
    for dToken in self.lToken:
        sTokenLine = f'{dToken["nStart"]}\t{dToken["nEnd"]}\t{dToken["sValue"]}'
        if "lMorph" in dToken:
            sTokenLine = sTokenLine + "\t" + str(dToken["lMorph"])
        lOut.append(sTokenLine + "\n")
    for nPos, dToken in self.dTokenPos.items():
        lOut.append(f"{nPos}\t{dToken}\n")
    return "".join(lOut)
def update (self, sSentence, bDebug=False):
    """Update <sSentence>, retokenize it and rebuild the token position index.

    Each new token inherits the "lMorph" entry of the previously indexed token that
    starts at the same position, when there is one. Tokens whose "sType" is "INFO"
    are kept in <self.lToken> but excluded from <self.dTokenPos>.
    With <bDebug>, the updated object is printed.
    """
    self.sSentence = sSentence
    lFreshToken = list(_oTokenizer.genTokens(sSentence, True))
    # carry over morphologies from the old token found at the same start offset
    for dToken in lFreshToken:
        dFormer = self.dTokenPos.get(dToken["nStart"], {})
        if "lMorph" in dFormer:
            dToken["lMorph"] = dFormer["lMorph"]
    self.lToken = lFreshToken
    # re-index the new tokens by start position, INFO tokens excluded
    dIndex = {}
    for dToken in self.lToken:
        if dToken["sType"] != "INFO":
            dIndex[dToken["nStart"]] = dToken
    self.dTokenPos = dIndex
    if bDebug:
        print("UPDATE:")
        print(self)
def _getNextMatchingNodes (self, dToken, dGraph, dNode, bDebug=False):
"generator: return nodes where <dToken> “values” match <dNode> arcs"
# token value
if dToken["sValue"] in dNode:
if bDebug:
print(" MATCH:", dToken["sValue"])
|
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
|
if bDebug:
print("REWRITE")
lNewToken = []
nMergeUntil = 0
dTokenMerger = None
for dToken in self.lToken:
bKeepToken = True
if "bImmune" in dToken:
nErrorStart = self.nOffsetWithinParagraph + dToken["nStart"]
if nErrorStart in self.dError:
if bDebug:
print("immunity -> error removed:", self.dError[nErrorStart])
del self.dError[nErrorStart]
if nMergeUntil and dToken["i"] <= nMergeUntil:
dTokenMerger["sValue"] += " " * (dToken["nStart"] - dTokenMerger["nEnd"]) + dToken["sValue"]
dTokenMerger["nEnd"] = dToken["nEnd"]
if bDebug:
print(" MERGED TOKEN:", dTokenMerger["sValue"])
bKeepToken = False
if "nMergeUntil" in dToken:
if dToken["i"] > nMergeUntil: # this token is not already merged with a previous token
dTokenMerger = dToken
if dToken["nMergeUntil"] > nMergeUntil:
nMergeUntil = dToken["nMergeUntil"]
del dToken["nMergeUntil"]
elif "bToRemove" in dToken:
if bDebug:
print(" REMOVED:", dToken["sValue"])
self.sSentence = self.sSentence[:dToken["nStart"]] + " " * (dToken["nEnd"] - dToken["nStart"]) + self.sSentence[dToken["nEnd"]:]
bKeepToken = False
#
if bKeepToken:
lNewToken.append(dToken)
if "sNewValue" in dToken:
# rewrite token and sentence
if bDebug:
print(dToken["sValue"], "->", dToken["sNewValue"])
dToken["sRealValue"] = dToken["sValue"]
dToken["sValue"] = dToken["sNewValue"]
nDiffLen = len(dToken["sRealValue"]) - len(dToken["sNewValue"])
sNewRepl = (dToken["sNewValue"] + " " * nDiffLen) if nDiffLen >= 0 else dToken["sNewValue"][:len(dToken["sRealValue"])]
self.sSentence = self.sSentence[:dToken["nStart"]] + sNewRepl + self.sSentence[dToken["nEnd"]:]
del dToken["sNewValue"]
else:
del self.dTokenPos[dToken["nStart"]]
if bDebug:
print(" REWRITED:", self.sSentence)
self.lToken.clear()
self.lToken = lNewToken
|
>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
>
|
>
>
>
>
|
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
|
if bDebug:
print("REWRITE")
lNewToken = []
nMergeUntil = 0
dTokenMerger = None
for dToken in self.lToken:
bKeepToken = True
if dToken["sType"] != "INFO":
if "bImmune" in dToken:
nErrorStart = self.nOffsetWithinParagraph + dToken["nStart"]
if nErrorStart in self.dError:
if bDebug:
print("immunity -> error removed:", self.dError[nErrorStart])
del self.dError[nErrorStart]
if nMergeUntil and dToken["i"] <= nMergeUntil:
dTokenMerger["sValue"] += " " * (dToken["nStart"] - dTokenMerger["nEnd"]) + dToken["sValue"]
dTokenMerger["nEnd"] = dToken["nEnd"]
if bDebug:
print(" MERGED TOKEN:", dTokenMerger["sValue"])
bKeepToken = False
if "nMergeUntil" in dToken:
if dToken["i"] > nMergeUntil: # this token is not already merged with a previous token
dTokenMerger = dToken
if dToken["nMergeUntil"] > nMergeUntil:
nMergeUntil = dToken["nMergeUntil"]
del dToken["nMergeUntil"]
elif "bToRemove" in dToken:
if bDebug:
print(" REMOVED:", dToken["sValue"])
self.sSentence = self.sSentence[:dToken["nStart"]] + " " * (dToken["nEnd"] - dToken["nStart"]) + self.sSentence[dToken["nEnd"]:]
bKeepToken = False
#
if bKeepToken:
lNewToken.append(dToken)
if "sNewValue" in dToken:
# rewrite token and sentence
if bDebug:
print(dToken["sValue"], "->", dToken["sNewValue"])
dToken["sRealValue"] = dToken["sValue"]
dToken["sValue"] = dToken["sNewValue"]
nDiffLen = len(dToken["sRealValue"]) - len(dToken["sNewValue"])
sNewRepl = (dToken["sNewValue"] + " " * nDiffLen) if nDiffLen >= 0 else dToken["sNewValue"][:len(dToken["sRealValue"])]
self.sSentence = self.sSentence[:dToken["nStart"]] + sNewRepl + self.sSentence[dToken["nEnd"]:]
del dToken["sNewValue"]
else:
try:
del self.dTokenPos[dToken["nStart"]]
except:
print(self)
print(dToken)
exit()
if bDebug:
print(" REWRITED:", self.sSentence)
self.lToken.clear()
self.lToken = lNewToken
|