Grammalecte  Diff

Differences From Artifact [42d702f828]:

To Artifact [417d2bdb9e]:


8
9
10
11
12
13
14

15
16
17
18
19


20
21
22
23
24
25
26
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29







+





+
+







import os
import sys
import re
import collections
import zipfile
import math
import argparse
import tags
from enum import Enum

from distutils import dir_util
from distutils import file_util
from string import Template

import tags

import metagraphe
import metaphone2
import thes_build


# Dictionnaire des caractères pour le tri naturel.
642
643
644
645
646
647
648
649

650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666

667
668
669
670
671
672
673
645
646
647
648
649
650
651

652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668

669
670
671
672
673
674
675
676







-
+
















-
+








class Entree:
    def __init__ (self, sLine):
        self.lemma = ''
        self.flags = ''
        # champs morphologiques Hunspell
        self.po = ''
        self.iz = ''
        self.iz = '' # hunspell attribute: is
        self.ds = ''
        self.ts = ''
        self.ip = ''
        self.dp = ''
        self.tp = ''
        self.sp = ''
        self.pa = ''
        self.st = ''
        self.al = ''
        self.ph = ''
        # champs annexes
        self.lx = ''
        self.se = ''
        self.et = ''
        self.di = '*'
        self.fq = ''
        self.iD = '0'
        self.iD = '0' # hunspell attribute: id

        # autres
        self.comment = ''
        self.err = ''
        self.nFlexions = 0
        self.lFlexions = []
        self.sStem = ''
687
688
689
690
691
692
693
694
695



696
697
698
699
700
701





702
703

704
705

706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728





729
730



731
732

733
734
735
736
737

738
739
740
741
742
743
744
745
746
747

748
749
750
751
752
753
754
755

756
757

758
759

760
761

762
763

764
765
766
767
768
769
770
690
691
692
693
694
695
696


697
698
699






700
701
702
703
704


705


706























707
708
709
710
711


712
713
714
715

716
717
718
719
720

721
722
723
724
725
726
727
728
729
730

731
732
733
734
735
736
737
738

739
740

741
742

743
744

745
746

747
748
749
750
751
752
753
754







-
-
+
+
+
-
-
-
-
-
-
+
+
+
+
+
-
-
+
-
-
+
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
-
-
+
+
+

-
+




-
+









-
+







-
+

-
+

-
+

-
+

-
+







        # lemme et drapeaux
        firstElems = elems[0].split('/') if elems[0] != "/"  else elems[0]
        self.lemma = firstElems[0]
        self.flags = firstElems[1]  if len(firstElems) > 1  else ''
        # morph
        for i in range(1, nElems):
            if len(elems[i]) > 3 and elems[i][2] == ':':
                fields = elems[i].split(':', 1)
                if fields[0] == 'po':
                sAttr, sContent = elems[i].split(':', 1)
                if sAttr in {"po", "is", "ds", "ts", "ip", "dp", "tp", "sp", "pa", "st", "al", "ph", "lx", "se", "et", "di", "fq", "id"}:
                    # vérification
                    self.po = fields[1]  if self.po == ''  else self.po + ' ' + fields[1]
                elif fields[0] == 'is':
                    self.iz = fields[1]  if self.iz == ''  else self.iz + ' ' + fields[1]
                elif fields[0] == 'ds':
                    self.ds = fields[1]  if self.ds == ''  else self.ds + ' ' + fields[1]
                elif fields[0] == 'ts':
                    if sAttr in {"po", "is", "lx", "se", "et"} \
                        and ( sContent not in tags.dTags[sAttr] and not (sAttr == "po" and re.match("v[0123][ea_][ix_][tx_][nx_][pqrex_][mx_][eaz_]", sContent)) ):
                        echo("  ## Étiquette inconnue pour le tag <{}>: {} @ {}/{}".format(sAttr, sContent, self.lemma, self.flags))
                    # renommage des attributs
                    if sAttr == "is":
                    self.ts = fields[1]  if self.ts == ''  else self.ts + ' ' + fields[1]
                elif fields[0] == 'ip':
                        sAttr = "iz"
                    self.ip = fields[1]  if self.ip == ''  else self.ip + ' ' + fields[1]
                elif fields[0] == 'dp':
                    if sAttr == "id":
                    self.dp = fields[1]  if self.dp == ''  else self.dp + ' ' + fields[1]
                elif fields[0] == 'tp':
                    self.tp = fields[1]  if self.tp == ''  else self.tp + ' ' + fields[1]
                elif fields[0] == 'sp':
                    self.sp = fields[1]  if self.sp == ''  else self.sp + ' ' + fields[1]
                elif fields[0] == 'pa':
                    self.pa = fields[1]  if self.pa == ''  else self.pa + ' ' + fields[1]
                elif fields[0] == 'st':
                    self.st = fields[1]  if self.st == ''  else self.st + ' ' + fields[1]
                elif fields[0] == 'al':
                    self.al = fields[1]  if self.al == ''  else self.al + ' ' + fields[1]
                elif fields[0] == 'ph':
                    self.ph = fields[1]  if self.ph == ''  else self.ph + ' ' + fields[1]
                elif fields[0] == 'lx':
                    self.lx = fields[1]  if self.lx == ''  else self.lx + ' ' + fields[1]
                elif fields[0] == 'se':
                    self.se = fields[1]  if self.se == ''  else self.se + ' ' + fields[1]
                elif fields[0] == 'et':
                    self.et = fields[1]  if self.et == ''  else self.et + ' ' + fields[1]
                elif fields[0] == 'di':
                    self.di = fields[1]
                elif fields[0] == 'fq':
                    self.fq = fields[1]
                        sAttr = "iD"
                    # modification
                    try:
                        if sAttr in {"po", "iz", "ds", "ts", "ip", "dp", "tp", "sp", "pa", "st", "al", "ph", "lx", "se", "et"}:
                            sContent = getattr(self, sAttr) + " " + sContent
                elif fields[0] == 'id':
                    self.iD = fields[1]
                        setattr(self, sAttr, sContent.strip())
                    except:
                        echo('  ## Erreur. Attribut non attribuable: {}  @  {}/{}'.format(sAttr, self.lemma, self.flags))
                else:
                    echo('  ## Champ inconnu: {}  dans  {}/{}'.format(fields[0], self.lemma, self.flags))
                    echo('  ## Champ inconnu: {} @ {}/{}'.format(sAttr, self.lemma, self.flags))
            else:
                self.err = self.err + elems[i]
        if self.err:
            echo("\n## Erreur dans le dictionnaire : {}".format(self.err))
            echo("   dans : " + self.lemma)
            echo("   @ : " + self.lemma)

    def __str__ (self):
        """Return the entry as text: lemma, a slash, the flags, a space, then getMorph(2)."""
        sMorph = self.getMorph(2)
        return "{}/{} {}".format(self.lemma, self.flags, sMorph)

    def check (self):
        sErr = ''
        if self.lemma == '':
            sErr += 'lemme vide'
        if re.match(r"^\s", self.lemma):
            sErr += 'premier caractère un espace dans <' + self.lemma + '>'
            sErr += 'premier caractère un espace @ <' + self.lemma + '>'
        if re.search(r"\s$", self.lemma):
            sErr += 'espace en fin de lemme'
        if re.match(r"v[0123]", self.po) and not re.match(r"[eas_][ix_][tx_][nx_][pqreuvx_][mx_][ex_z][ax_z]\b", self.po[2:]):
            sErr += 'verbe inconnu: ' + self.po
        if (re.match(r"S[.]", self.flags) and re.search("[sxz]$", self.lemma)) or (re.match(r"X[.]", self.flags) and not re.search("[ul]$", self.lemma)):
            sErr += 'drapeau inutile'
        if self.iz == '' and re.match(r"[SXAI](?!=)", self.flags) and self.po:
            sErr += '[is] vide'
            sErr += '<is> vide'
        if re.match(r"pl|sg|inv", self.iz):
            sErr += '[is] incomplet'
            sErr += '<is> incomplet'
        if re.match(r"[FW]", self.flags) and re.search(r"epi|mas|fem|inv|sg|pl", self.iz):
            sErr += '[is] incohérent'
            sErr += '<is> incohérent'
        if re.search(r"pl|sg|inv", self.iz) and re.match(r"[SXAIFW](?!=)", self.flags):
            sErr += '[is] incohérent'
            sErr += '<is> incohérent'
        if self.iz.endswith(("mas", "fem", "epi")) and (not self.flags or not self.flags.startswith(("S", "X", "F", "W", "A", "I", "U"))):
            sErr += '[is] incomplet'
            sErr += '<is> incomplet'
        if self.flags.startswith(("a0", "b0", "c0", "d0")) and not self.lemma.endswith("er"):
            sErr += "drapeau pour verbe du 1ᵉʳ groupe sur un lemme non conforme"
        if self.flags.startswith("f") and not self.lemma.endswith(("ir", "ïr")):
            sErr += "drapeau pour verbe du 2ᵉ groupe sur un lemme non conforme"
        if sErr:
            echo('   error -  id: ' + self.iD, end = "")
            echo('  ' + sErr + '  in  ' + self.__str__())
952
953
954
955
956
957
958
959

960
961
962
963
964
965
966
967
968
969
970
971
972
973
974

975
976
977
978
979
980
981
936
937
938
939
940
941
942

943
944
945
946
947
948
949
950
951
952
953
954




955
956
957
958
959
960
961
962







-
+











-
-
-
-
+







        if self.tp: txt += self.tp + ' '
        if self.sp: txt += self.sp + ' '
        return txt

    def getConjugation (self):
        """Return the conjugation block of this entry as tab-separated text.

        The first line is the lemma, a tab, and characters 1 to 9 of <po>.
        Each flexion then produces one or more lines "_<TAB>tags<TAB>flexion",
        and the block is closed by a line holding "$".
        """
        lLines = [self.lemma + "\t" + self.po[1:10] + "\n"]
        for oFlex in self.lFlexions:
            # Skip the first 11 characters of the morphology (presumably the verb
            # code and its separator -- TODO confirm), then drop trailing "!",
            # the "adj" marker after "ppas", and any "1jsg" tag.
            sMorph = oFlex.sMorph[11:].rstrip("!").replace("ppas adj", "ppas").replace("1jsg", "").strip()
            if not sMorph.startswith("ppas") and sMorph.find(" ") > 1:
                # complex tags: one output line per morphology returned by getVerbMultiMorph
                for s in getVerbMultiMorph(sMorph):
                    lLines.append("_\t" + s + "\t" + oFlex.sFlexion + "\n")
            else:
                lLines.append("_\t" + sMorph + "\t" + oFlex.sFlexion + "\n")
        lLines.append("$\n")
        return "".join(lLines)

    def getDeclination (self):
        """Return the declension block of this entry as tab-separated text.

        The first line is the lemma, a tab, and the flags. Every flexion whose
        morphology contains "adj" or "nom" then produces one line
        "_<TAB>morphology<TAB>flexion" with the morphology left unchanged;
        other flexions are skipped. The block is closed by a line holding "$".
        """
        lLines = [self.lemma + "\t" + self.flags + "\n"]
        for oFlex in self.lFlexions:
            if "adj" in oFlex.sMorph or "nom" in oFlex.sMorph:
                lLines.append("_\t" + oFlex.sMorph + "\t" + oFlex.sFlexion + "\n")
        lLines.append("$\n")
        return "".join(lLines)

    def calcOccurFromFlexions (self):
        """Set self.nOccur to the total of nOccur over all flexions (0 when there are none)."""
        self.nOccur = sum(oFlex.nOccur for oFlex in self.lFlexions)
1121
1122
1123
1124
1125
1126
1127
1128

1129
1130
1131
1132
1133
1134
1135
1102
1103
1104
1105
1106
1107
1108

1109
1110
1111
1112
1113
1114
1115
1116







-
+








    def getGrammarCheckerRepr (self):
        """Return one line: flexion, lemma of the owning entry, and simple tags, tab-separated and newline-terminated."""
        sTags = self._getSimpleTags()
        return "{}\t{}\t{}\n".format(self.sFlexion, self.oEntry.lemma, sTags)

    _dTagReplacement = {
        # POS
        "nom": ":N", "adj": ":A", "adv": ":W", "negadv": ":X", "mg": ":G", "nb": ":B", "nbro": ":Br",
        "loc.nom": ":ÉN", "loc.adj": ":ÉA", "loc.adv": ":ÉW", "loc.verb": ":ÉV",
        "loc.nom": ":ÉN", "loc.adj": ":ÉA", "loc.adv": ":ÉW",
        "interj": ":J", "loc.interj": ":ÉJ", "titr": ":T",
        "mas": ":m", "fem": ":f", "epi": ":e", "sg": ":s", "pl": ":p", "inv": ":i",
        "infi": ":Y",
        "ppre": ":P", "ppas": ":Q",
        "ipre": ":Ip", "iimp": ":Iq", "ipsi": ":Is", "ifut": ":If",
        "spre": ":Sp", "simp": ":Sq", "cond": ":K", "impe": ":E",
        "1sg": ":1s", "1isg": ":1ś", "1jsg": ":1ŝ", "2sg": ":2s", "3sg": ":3s", "1pl": ":1p", "2pl": ":2p", "3pl": ":3p", "3pl!": ":3p!",
1313
1314
1315
1316
1317
1318
1319
1320

1321
1322
1323
1324
1325
1326
1327
1294
1295
1296
1297
1298
1299
1300

1301
1302
1303
1304
1305
1306
1307
1308







-
+







                elif fields[0] == 'ph':
                    self.ph = fields[1]  if self.pa == ''  else self.pa + ' ' + fields[1]
                elif fields[0] == 'lx':
                    self.lx = fields[1]  if self.lx == ''  else self.lx + ' ' + fields[1]
                elif fields[0] == 'di':
                    self.di = fields[1]
                else:
                    echo('Champ inconnu: {}  dans  {}'.format(fields[0], self.sFlagName))
                    echo('Champ inconnu: {} @ {}'.format(fields[0], self.sFlagName))
            else:
                echo("  # Erreur affixe : {}".format(line))

    def isReplicationRule (self):
        """Tell whether this rule is used for replication of a virtual lemma.

        True only when the rule carries no flags and either adds back exactly
        what it cuts, or cuts nothing ("0") and adds nothing.
        """
        if self.flags != "":
            return False
        if self.cut == self.add:
            return True
        return self.cut == "0" and self.add == ""