Overview
Comment: [core] DARG: text processing
SHA3-256: 5fc6d416505febdcd0257332fbc8902b
User & Date: olr on 2018-06-06 15:24:46
Context
2018-06-06
16:55  [build][core] tokens auto selection (check-in: e0f39d31ba, user: olr, tags: core, build, rg)
15:24  [core] DARG: text processing (check-in: 5fc6d41650, user: olr, tags: core, rg)
10:36  [core][fr] end of lemma is now a slash instead of a space (check-in: 023f83bc15, user: olr, tags: fr, core, rg)
Changes
Modified gc_core/py/lang_core/gc_engine.py from [f0f3202267] to [894e8606f6].
︙
Hunk (old lines 77-83, new lines 77-94):

        dDA.clear()
        try:
            # regex parser
            _, errs = _proofread(sText[iStart:iEnd], sRealText[iStart:iEnd], iStart, False, dDA, dPriority, sCountry, dOpt, bShowRuleId, bDebug, bContext)
            aErrors.update(errs)
            # token parser
            oSentence = TokenSentence(sText[iStart:iEnd], sRealText[iStart:iEnd], iStart)
            bChange, errs = oSentence.parse(dPriority, sCountry, dOpt, bShowRuleId, bDebug, bContext)
            aErrors.update(errs)
            if bChange:
                oSentence.rewrite()
        except:
            raise
    return aErrors.values()   # this is a view (iterable)


def _getSentenceBoundaries (sText):
    iStart = _zBeginOfParagraph.match(sText).end()
︙
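The point of the hunk above is the new bChange flag: parse() now reports whether any DARG text-processor action marked tokens, and the engine calls oSentence.rewrite() so the sentence string and the token list stay in sync before error offsets are reused. A minimal sketch of that handshake, reusing only names visible in the hunk (the enclosing sentence loop is assumed):

    # hypothetical driver, assuming parse() returns (bChange, dErrs)
    oSentence = TokenSentence(sText[iStart:iEnd], sRealText[iStart:iEnd], iStart)
    bChange, errs = oSentence.parse(dPriority, sCountry, dOpt, bShowRuleId, bDebug, bContext)
    aErrors.update(errs)
    if bChange:
        oSentence.rewrite()   # flush pending token removals/replacements into self.sSentence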
Hunk (old lines 662-668, new lines 664-681):

#### TOKEN SENTENCE CHECKER

class TokenSentence:

    def __init__ (self, sSentence, sSentence0, nOffset):
        self.sSentence = sSentence
        self.sSentence0 = sSentence0
        self.nOffset = nOffset
        self.lToken = list(_oTokenizer.genTokens(sSentence, True))

    def _getNextMatchingNodes (self, dToken, dNode):
        "generator: return nodes where <dToken> “values” match <dNode> arcs"
        # token value
        if dToken["sValue"] in dNode:
            #print("value found: ", dToken["sValue"])
︙
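_getNextMatchingNodes above is the core of the DARG traversal: a node is a dict whose keys are arcs, and a token advances through the graph when one of its “values” matches an arc. A hypothetical miniature of that idea (the real dGraph also carries lemma and morphology arcs, and its node layout differs):

    # toy graph: arcs are raw token values; an arc's target is a node key in dGraph
    dGraph = {
        0: {"avoir": 1},
        1: {"confiance": 2},
        2: {"<rule>": "avoir_confiance_en"},
    }

    def genNextNodes (dToken, dNode):
        "yield nodes reachable from <dNode> through an arc equal to the token value"
        if dToken["sValue"] in dNode:
            yield dGraph[dNode[dToken["sValue"]]]

    lNodes = [dGraph[0]]
    for dToken in [{"sValue": "avoir"}, {"sValue": "confiance"}]:
        lNodes = [dNext for dNode in lNodes for dNext in genNextNodes(dToken, dNode)]
    print(lNodes)   # [{'<rule>': 'avoir_confiance_en'}] -> a rule node was reached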
Hunk (old lines 748-754, new lines 750-856):

            if bHasChanged:
                bChange = True
        if dErr:
            print(dErr)
        return (bChange, dErr)

    def _executeActions (self, dNode, nTokenOffset, dPriority, dOpt, sCountry, bShowRuleId, bContext):
        "execute actions found in the DARG"
        dErrs = {}
        bChange = False
        for sLineId, nextNodeKey in dNode.items():
            for sRuleId in dGraph[nextNodeKey]:
                print(sRuleId)
                bCondMemo = None
                sFuncCond, cActionType, sWhat, *eAct = dRule[sRuleId]
                # action in lActions: [ condition, action type, replacement/suggestion/action[, iTokenStart, iTokenEnd[, nPriority, message, URL]] ]
                try:
                    bCondMemo = not sFuncCond or globals()[sFuncCond](self.lToken, nTokenOffset, sCountry, bCondMemo)
                    if bCondMemo:
                        if cActionType == "-":
                            # grammar error
                            print("-")
                            nTokenErrorStart = nTokenOffset + eAct[0]
                            nTokenErrorEnd = nTokenOffset + eAct[1]
                            nErrorStart = self.nOffset + self.lToken[nTokenErrorStart]["nStart"]
                            nErrorEnd = self.nOffset + self.lToken[nTokenErrorEnd]["nEnd"]
                            if nErrorStart not in dErrs or eAct[2] > dPriority[nErrorStart]:
                                dErrs[nErrorStart] = _createTokenError(self.lToken, self.sSentence, self.sSentence0, sWhat, nTokenErrorStart, nErrorStart, nErrorEnd, sLineId, sRuleId, True, eAct[3], eAct[4], bShowRuleId, "notype", bContext)
                                dPriority[nErrorStart] = eAct[2]
                        elif cActionType == "~":
                            # text processor
                            print("~")
                            self._tagAndPrepareTokenForRewriting(sWhat, nTokenOffset + eAct[0], nTokenOffset + eAct[1])
                            bChange = True
                        elif cActionType == "=":
                            # disambiguation
                            print("=")
                            globals()[sWhat](self.lToken)
                        elif cActionType == ">":
                            # we do nothing, this test is just a condition to apply all following actions
                            print(">")
                            pass
                        else:
                            print("# error: unknown action at " + sLineId)
                    elif cActionType == ">":
                        break
                except Exception as e:
                    raise Exception(str(e), sLineId)
        return bChange, dErrs

    def _tagAndPrepareTokenForRewriting (self, sWhat, nTokenRewriteStart, nTokenRewriteEnd, bUppercase=True):
        "text processor: rewrite tokens between <nTokenRewriteStart> and <nTokenRewriteEnd> position"
        if sWhat == "*":
            # purge text
            if nTokenRewriteEnd - nTokenRewriteStart == 0:
                self.lToken[nTokenRewriteStart]["bToRemove"] = True
            else:
                for i in range(nTokenRewriteStart, nTokenRewriteEnd+1):
                    self.lToken[i]["bToRemove"] = True
        else:
            if sWhat.startswith("="):
                sWhat = globals()[sWhat[1:]](self.lToken)
            bUppercase = bUppercase and self.lToken[nTokenRewriteStart]["sValue"][0:1].isupper()
            if nTokenRewriteEnd - nTokenRewriteStart == 0:
                sWhat = sWhat + " " * (len(self.lToken[nTokenRewriteStart]["sValue"])-len(sWhat))
                if bUppercase:
                    sWhat = sWhat[0:1].upper() + sWhat[1:]
                self.lToken[nTokenRewriteStart]["sNewValue"] = sWhat
            else:
                lTokenValue = sWhat.split("|")
                if len(lTokenValue) != (nTokenRewriteEnd - nTokenRewriteStart + 1):
                    print("Error. Text processor: number of replacements != number of tokens.")
                    return
                for i, sValue in zip(range(nTokenRewriteStart, nTokenRewriteEnd+1), lTokenValue):
                    if bUppercase:
                        sValue = sValue[0:1].upper() + sValue[1:]
                    self.lToken[i]["sNewValue"] = sValue

    def rewrite (self):
        "rewrite the sentence, modify tokens, purge the token list"
        lNewToken = []
        for i, dToken in enumerate(self.lToken):
            if "bToRemove" in dToken:
                # remove useless token
                self.sSentence = self.sSentence[:self.nOffset+dToken["nStart"]] + " " * (dToken["nEnd"] - dToken["nStart"]) + self.sSentence[self.nOffset+dToken["nEnd"]:]
                #print("removed:", dToken["sValue"])
            else:
                lNewToken.append(dToken)
                if "sNewValue" in dToken:
                    # rewrite token and sentence
                    print(dToken["sValue"], "->", dToken["sNewValue"])
                    dToken["sRealValue"] = dToken["sValue"]
                    dToken["sValue"] = dToken["sNewValue"]
                    nDiffLen = len(dToken["sRealValue"]) - len(dToken["sNewValue"])
                    sNewRepl = (dToken["sNewValue"] + " " * nDiffLen) if nDiffLen >= 0 else dToken["sNewValue"][:len(dToken["sRealValue"])]
                    self.sSentence = self.sSentence[:self.nOffset+dToken["nStart"]] + sNewRepl + self.sSentence[self.nOffset+dToken["nEnd"]:]
                    del dToken["sNewValue"]
        print(self.sSentence)
        self.lToken.clear()
        self.lToken = lNewToken


#### Analyse tokens

def g_morph (dToken, sPattern, sNegPattern=""):
    "analyse a token, return True if <sNegPattern> not in morphologies and <sPattern> in morphologies"
    if "lMorph" in dToken:
︙
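A detail worth noting in _tagAndPrepareTokenForRewriting and rewrite above: the sentence string never changes length. Purged tokens are overwritten with spaces, and a replacement is padded with spaces (or truncated) to the width of the original token, so the nStart/nEnd offsets stored in the remaining tokens stay valid without recomputation. A standalone sketch of that invariant (function name hypothetical):

    def replaceKeepOffsets (sSentence, nStart, nEnd, sNew):
        "replace sSentence[nStart:nEnd] by sNew, padded or truncated so the length never changes"
        nDiffLen = (nEnd - nStart) - len(sNew)
        sRepl = (sNew + " " * nDiffLen) if nDiffLen >= 0 else sNew[:nEnd - nStart]
        return sSentence[:nStart] + sRepl + sSentence[nEnd:]

    s = "Ça me fait plaisirs."
    s = replaceKeepOffsets(s, 11, 19, "plaisir")   # "plaisirs" -> "plaisir " (padded)
    print(repr(s))   # 'Ça me fait plaisir .' -> same length, later offsets intact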
Modified gc_lang/fr/rules_graph.grx from [02356176f2] to [747aec6cb3].
︙
Hunk (old lines 34-47, new lines 34-61):

# Fin d’interprétation du fichier avec une ligne commençant par #END


# ERREURS COURANTES
# http://fr.wikipedia.org/wiki/Wikip%C3%A9dia:Fautes_d%27orthographe/Courantes

__pp__
    >avoir marre [d’|des|du|de]
        <<- ~1:3>> *

TEST: J’en ai marre de ces gens-là.


__pp2__
    il ne pense qu’ à sa gueule
        <<- ~4:7>> que|Z|a|perdu

TEST: il ne pense qu’à sa gueule.


__avoir_confiance_en__
    >avoir confiance (dans) [moi|toi|soi|lui|elle|nous|vous|eux|elles]
        <<- -1>> en                                 # Avoir confiance en quelqu’un ou quelque chose.|http://grammalecte.net

TEST: Elle avait confiance {{dans}} lui.
︙
Hunk (old lines 61-76, new lines 75-91):

TEST: Je suis {{an}} désaccord avec lui.


__faire_plaisir__
    >faire plaisirs
        <<- -2>> plaisir                            # Faire plaisir : dans cette locution, “plaisir” doit être au singulier.
        <<- ~2>> *

TEST: Ça me fait {{plaisirs}}.


__test__
    je ~préf[éè]r [que|qu’] @(?::Os|:M)¬:X @:I
        <<- morph(\1, ":V") and morph(\4, ":Os|:M", ":X") -5>> SUBJONCTIF   # SUBJONCTIF.

TEST: je préférerais qu’Isabelle {{est}} partie.
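For reference, the `<<- ~...>>` lines added above are text-processor actions, handled by _tagAndPrepareTokenForRewriting in the Python hunk: reading `~1:3` as the token span 1 to 3, `<<- ~1:3>> *` marks those tokens for removal, while `<<- ~4:7>> que|Z|a|perdu` rewrites tokens 4 to 7 with the pipe-separated values, one per token. A toy of that mapping (token indices hypothetical):

    sWhat = "que|Z|a|perdu"                 # replacement list from <<- ~4:7>>
    nStart, nEnd = 4, 7                     # targeted token span
    lTokenValue = sWhat.split("|")
    if len(lTokenValue) != nEnd - nStart + 1:
        print("Error. Text processor: number of replacements != number of tokens.")
    else:
        for i, sValue in zip(range(nStart, nEnd + 1), lTokenValue):
            print("token", i, "->", sValue)   # token 4 -> que ... token 7 -> perdu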