Overview
| Comment: | [core] DARG: text processing |
|---|---|
| Downloads: | Tarball | ZIP archive | SQL archive |
| Timelines: | family | ancestors | descendants | both | core | rg |
| Files: | files | file ages | folders |
| SHA3-256: |
5fc6d416505febdcd0257332fbc8902b |
| User & Date: | olr on 2018-06-06 15:24:46 |
| Other Links: | branch diff | manifest | tags |
Context
|
2018-06-06
| ||
| 16:55 | [build][core] tokens auto selection check-in: e0f39d31ba user: olr tags: core, build, rg | |
| 15:24 | [core] DARG: text processing check-in: 5fc6d41650 user: olr tags: core, rg | |
| 10:36 | [core][fr] end of lemma is now a slash instead of a space check-in: 023f83bc15 user: olr tags: fr, core, rg | |
Changes
Modified gc_core/py/lang_core/gc_engine.py from [f0f3202267] to [894e8606f6].
| ︙ | ︙ | |||
77 78 79 80 81 82 83 |
dDA.clear()
try:
# regex parser
_, errs = _proofread(sText[iStart:iEnd], sRealText[iStart:iEnd], iStart, False, dDA, dPriority, sCountry, dOpt, bShowRuleId, bDebug, bContext)
aErrors.update(errs)
# token parser
oSentence = TokenSentence(sText[iStart:iEnd], sRealText[iStart:iEnd], iStart)
| | > > | 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 |
dDA.clear()
try:
# regex parser
_, errs = _proofread(sText[iStart:iEnd], sRealText[iStart:iEnd], iStart, False, dDA, dPriority, sCountry, dOpt, bShowRuleId, bDebug, bContext)
aErrors.update(errs)
# token parser
oSentence = TokenSentence(sText[iStart:iEnd], sRealText[iStart:iEnd], iStart)
bChange, errs = oSentence.parse(dPriority, sCountry, dOpt, bShowRuleId, bDebug, bContext)
aErrors.update(errs)
if bChange:
oSentence.rewrite()
except:
raise
return aErrors.values() # this is a view (iterable)
def _getSentenceBoundaries (sText):
iStart = _zBeginOfParagraph.match(sText).end()
|
| ︙ | ︙ | |||
662 663 664 665 666 667 668 | #### TOKEN SENTENCE CHECKER class TokenSentence: | | | | 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 |
#### TOKEN SENTENCE CHECKER
class TokenSentence:
def __init__ (self, sSentence, sSentence0, nOffset):
    """Token-based sentence checker.

    sSentence:  the sentence text to analyse (may be modified by the text processor)
    sSentence0: the original, untouched sentence text (kept for error reporting)
    nOffset:    position of the sentence within the whole paragraph/text,
                used to convert token positions to absolute positions
    """
    self.sSentence = sSentence
    self.sSentence0 = sSentence0
    self.nOffset = nOffset
    # tokenize once; presumably the second argument asks the tokenizer to
    # yield start/end positions — TODO confirm against _oTokenizer.genTokens
    self.lToken = list(_oTokenizer.genTokens(sSentence, True))
def _getNextMatchingNodes (self, dToken, dNode):
"generator: return nodes where <dToken> “values” match <dNode> arcs"
# token value
if dToken["sValue"] in dNode:
#print("value found: ", dToken["sValue"])
|
| ︙ | ︙ | |||
748 749 750 751 752 753 754 |
if bHasChanged:
bChange = True
if dErr:
print(dErr)
return (bChange, dErr)
def _executeActions (self, dNode, nTokenOffset, dPriority, dOpt, sCountry, bShowRuleId, bContext):
| | | | | | < < < | | > > > > > > > > > > > > > > > > > | | | | | > > | | < > | | > > > > | > > > | > > > > > > | < > > | 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 |
if bHasChanged:
bChange = True
if dErr:
print(dErr)
return (bChange, dErr)
def _executeActions (self, dNode, nTokenOffset, dPriority, dOpt, sCountry, bShowRuleId, bContext):
"execute actions found in the DARG"
dErrs = {}
bChange = False
for sLineId, nextNodeKey in dNode.items():
for sRuleId in dGraph[nextNodeKey]:
print(sRuleId)
bCondMemo = None
sFuncCond, cActionType, sWhat, *eAct = dRule[sRuleId]
# action in lActions: [ condition, action type, replacement/suggestion/action[, iTokenStart, iTokenEnd[, nPriority, message, URL]] ]
try:
bCondMemo = not sFuncCond or globals()[sFuncCond](self.lToken, nTokenOffset, sCountry, bCondMemo)
if bCondMemo:
if cActionType == "-":
# grammar error
print("-")
nTokenErrorStart = nTokenOffset + eAct[0]
nTokenErrorEnd = nTokenOffset + eAct[1]
nErrorStart = self.nOffset + self.lToken[nTokenErrorStart]["nStart"]
nErrorEnd = self.nOffset + self.lToken[nTokenErrorEnd]["nEnd"]
if nErrorStart not in dErrs or eAct[2] > dPriority[nErrorStart]:
dErrs[nErrorStart] = _createTokenError(self.lToken, self.sSentence, self.sSentence0, sWhat, nTokenErrorStart, nErrorStart, nErrorEnd, sLineId, sRuleId, True, eAct[3], eAct[4], bShowRuleId, "notype", bContext)
dPriority[nErrorStart] = eAct[2]
elif cActionType == "~":
# text processor
print("~")
self._tagAndPrepareTokenForRewriting(sWhat, nTokenOffset + eAct[0], nTokenOffset + eAct[1])
bChange = True
elif cActionType == "=":
# disambiguation
print("=")
globals()[sWhat](self.lToken)
elif cActionType == ">":
# we do nothing, this test is just a condition to apply all following actions
print(">")
pass
else:
print("# error: unknown action at " + sLineId)
elif cActionType == ">":
break
except Exception as e:
raise Exception(str(e), sLineId)
return bChange, dErrs
def _tagAndPrepareTokenForRewriting (self, sWhat, nTokenRewriteStart, nTokenRewriteEnd, bUppercase=True):
"text processor: rewrite tokens between <nTokenRewriteStart> and <nTokenRewriteEnd> position"
if sWhat == "*":
# purge text
if nTokenRewriteEnd - nTokenRewriteStart == 0:
self.lToken[nTokenRewriteStart]["bToRemove"] = True
else:
for i in range(nTokenRewriteStart, nTokenRewriteEnd+1):
self.lToken[i]["bToRemove"] = True
else:
if sWhat.startswith("="):
sWhat = globals()[sWhat[1:]](self.lToken)
bUppercase = bUppercase and self.lToken[nTokenRewriteStart]["sValue"][0:1].isupper()
if nTokenRewriteEnd - nTokenRewriteStart == 0:
sWhat = sWhat + " " * (len(self.lToken[nTokenRewriteStart]["sValue"])-len(sWhat))
if bUppercase:
sWhat = sWhat[0:1].upper() + sWhat[1:]
self.lToken[nTokenRewriteStart]["sNewValue"] = sWhat
else:
lTokenValue = sWhat.split("|")
if len(lTokenValue) != (nTokenRewriteEnd - nTokenRewriteStart + 1):
print("Error. Text processor: number of replacements != number of tokens.")
return
for i, sValue in zip(range(nTokenRewriteStart, nTokenRewriteEnd+1), lTokenValue):
if bUppercase:
sValue = sValue[0:1].upper() + sValue[1:]
self.lToken[i]["sNewValue"] = sValue
def rewrite (self):
"rewrite the sentence, modify tokens, purge the token list"
lNewToken = []
for i, dToken in enumerate(self.lToken):
if "bToRemove" in dToken:
# remove useless token
self.sSentence = self.sSentence[:self.nOffset+dToken["nStart"]] + " " * (dToken["nEnd"] - dToken["nStart"]) + self.sSentence[self.nOffset+dToken["nEnd"]:]
#print("removed:", dToken["sValue"])
else:
lNewToken.append(dToken)
if "sNewValue" in dToken:
# rewrite token and sentence
print(dToken["sValue"], "->", dToken["sNewValue"])
dToken["sRealValue"] = dToken["sValue"]
dToken["sValue"] = dToken["sNewValue"]
nDiffLen = len(dToken["sRealValue"]) - len(dToken["sNewValue"])
sNewRepl = (dToken["sNewValue"] + " " * nDiffLen) if nDiffLen >= 0 else dToken["sNewValue"][:len(dToken["sRealValue"])]
self.sSentence = self.sSentence[:self.nOffset+dToken["nStart"]] + sNewRepl + self.sSentence[self.nOffset+dToken["nEnd"]:]
del dToken["sNewValue"]
print(self.sSentence)
self.lToken.clear()
self.lToken = lNewToken
#### Analyse tokens
def g_morph (dToken, sPattern, sNegPattern=""):
"analyse a token, return True if <sNegPattern> not in morphologies and <sPattern> in morphologies"
if "lMorph" in dToken:
|
| ︙ | ︙ |
Modified gc_lang/fr/rules_graph.grx from [02356176f2] to [747aec6cb3].
| ︙ | ︙ | |||
34 35 36 37 38 39 40 41 42 43 44 45 46 47 |
# Fin d’interprétation du fichier avec une ligne commençant par #END
# ERREURS COURANTES
# http://fr.wikipedia.org/wiki/Wikip%C3%A9dia:Fautes_d%27orthographe/Courantes
__avoir_confiance_en__
>avoir confiance (dans) [moi|toi|soi|lui|elle|nous|vous|eux|elles]
<<- -1>> en # Avoir confiance en quelqu’un ou quelque chose.|http://grammalecte.net
TEST: Elle avait confiance {{dans}} lui.
| > > > > > > > > > > > > > > | 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 |
# Fin d’interprétation du fichier avec une ligne commençant par #END
# ERREURS COURANTES
# http://fr.wikipedia.org/wiki/Wikip%C3%A9dia:Fautes_d%27orthographe/Courantes
__pp__
>avoir marre [d’|des|du|de]
<<- ~1:3>> *
TEST: J’en ai marre de ces gens-là.
__pp2__
il ne pense qu’ à sa gueule
<<- ~4:7>> que|Z|a|perdu
TEST: il ne pense qu’à sa gueule.
__avoir_confiance_en__
>avoir confiance (dans) [moi|toi|soi|lui|elle|nous|vous|eux|elles]
<<- -1>> en # Avoir confiance en quelqu’un ou quelque chose.|http://grammalecte.net
TEST: Elle avait confiance {{dans}} lui.
|
| ︙ | ︙ | |||
61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 |
TEST: Je suis {{an}} désaccord avec lui.
__faire_plaisir__
>faire plaisirs
<<- -2>> plaisir # Faire plaisir : dans cette locution, “plaisir” doit être au singulier.
TEST: Ça me fait {{plaisirs}}.
__test__
je ~préf[éè]r [que|qu’] @(?::Os|:M)¬:X @:I
<<- morph(\1, ":V") and morph(\4, ":Os|:M", ":X") -5>> SUBJONCTIF # SUBJONCTIF.
TEST: je préférerais qu’Isabelle {{est}} partie.
| > | 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 |
TEST: Je suis {{an}} désaccord avec lui.
__faire_plaisir__
>faire plaisirs
<<- -2>> plaisir # Faire plaisir : dans cette locution, “plaisir” doit être au singulier.
<<- ~2>> *
TEST: Ça me fait {{plaisirs}}.
__test__
je ~préf[éè]r [que|qu’] @(?::Os|:M)¬:X @:I
<<- morph(\1, ":V") and morph(\4, ":Os|:M", ":X") -5>> SUBJONCTIF # SUBJONCTIF.
TEST: je préférerais qu’Isabelle {{est}} partie.
|