Grammalecte  Diff

Differences From Artifact [42d702f828]:

To Artifact [417d2bdb9e]:


8
9
10
11
12
13
14

15
16
17
18
19


20
21
22
23
24
25
26
import os
import sys
import re
import collections
import zipfile
import math
import argparse

from enum import Enum

from distutils import dir_util
from distutils import file_util
from string import Template



import metagraphe
import metaphone2
import thes_build


# Dictionnaire des caractères pour le tri naturel.







>





>
>







8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
import os
import sys
import re
import collections
import zipfile
import math
import argparse
import tags
from enum import Enum

from distutils import dir_util
from distutils import file_util
from string import Template

import tags

import metagraphe
import metaphone2
import thes_build


# Dictionnaire des caractères pour le tri naturel.
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673

class Entree:
    def __init__ (self, sLine):
        self.lemma = ''
        self.flags = ''
        # champs morphologiques Hunspell
        self.po = ''
        self.iz = ''
        self.ds = ''
        self.ts = ''
        self.ip = ''
        self.dp = ''
        self.tp = ''
        self.sp = ''
        self.pa = ''
        self.st = ''
        self.al = ''
        self.ph = ''
        # champs annexes
        self.lx = ''
        self.se = ''
        self.et = ''
        self.di = '*'
        self.fq = ''
        self.iD = '0'

        # autres
        self.comment = ''
        self.err = ''
        self.nFlexions = 0
        self.lFlexions = []
        self.sStem = ''







|
















|







645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676

class Entree:
    def __init__ (self, sLine):
        self.lemma = ''
        self.flags = ''
        # champs morphologiques Hunspell
        self.po = ''
        self.iz = '' # hunspell attribute: is
        self.ds = ''
        self.ts = ''
        self.ip = ''
        self.dp = ''
        self.tp = ''
        self.sp = ''
        self.pa = ''
        self.st = ''
        self.al = ''
        self.ph = ''
        # champs annexes
        self.lx = ''
        self.se = ''
        self.et = ''
        self.di = '*'
        self.fq = ''
        self.iD = '0' # hunspell attribute: id

        # autres
        self.comment = ''
        self.err = ''
        self.nFlexions = 0
        self.lFlexions = []
        self.sStem = ''
687
688
689
690
691
692
693
694

695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727


728
729
730


731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
        # lemme et drapeaux
        firstElems = elems[0].split('/') if elems[0] != "/"  else elems[0]
        self.lemma = firstElems[0]
        self.flags = firstElems[1]  if len(firstElems) > 1  else ''
        # morph
        for i in range(1, nElems):
            if len(elems[i]) > 3 and elems[i][2] == ':':
                fields = elems[i].split(':', 1)

                if fields[0] == 'po':
                    self.po = fields[1]  if self.po == ''  else self.po + ' ' + fields[1]
                elif fields[0] == 'is':
                    self.iz = fields[1]  if self.iz == ''  else self.iz + ' ' + fields[1]
                elif fields[0] == 'ds':
                    self.ds = fields[1]  if self.ds == ''  else self.ds + ' ' + fields[1]
                elif fields[0] == 'ts':
                    self.ts = fields[1]  if self.ts == ''  else self.ts + ' ' + fields[1]
                elif fields[0] == 'ip':
                    self.ip = fields[1]  if self.ip == ''  else self.ip + ' ' + fields[1]
                elif fields[0] == 'dp':
                    self.dp = fields[1]  if self.dp == ''  else self.dp + ' ' + fields[1]
                elif fields[0] == 'tp':
                    self.tp = fields[1]  if self.tp == ''  else self.tp + ' ' + fields[1]
                elif fields[0] == 'sp':
                    self.sp = fields[1]  if self.sp == ''  else self.sp + ' ' + fields[1]
                elif fields[0] == 'pa':
                    self.pa = fields[1]  if self.pa == ''  else self.pa + ' ' + fields[1]
                elif fields[0] == 'st':
                    self.st = fields[1]  if self.st == ''  else self.st + ' ' + fields[1]
                elif fields[0] == 'al':
                    self.al = fields[1]  if self.al == ''  else self.al + ' ' + fields[1]
                elif fields[0] == 'ph':
                    self.ph = fields[1]  if self.ph == ''  else self.ph + ' ' + fields[1]
                elif fields[0] == 'lx':
                    self.lx = fields[1]  if self.lx == ''  else self.lx + ' ' + fields[1]
                elif fields[0] == 'se':
                    self.se = fields[1]  if self.se == ''  else self.se + ' ' + fields[1]
                elif fields[0] == 'et':
                    self.et = fields[1]  if self.et == ''  else self.et + ' ' + fields[1]
                elif fields[0] == 'di':
                    self.di = fields[1]
                elif fields[0] == 'fq':


                    self.fq = fields[1]
                elif fields[0] == 'id':
                    self.iD = fields[1]


                else:
                    echo('  ## Champ inconnu: {}  dans  {}/{}'.format(fields[0], self.lemma, self.flags))
            else:
                self.err = self.err + elems[i]
        if self.err:
            echo("\n## Erreur dans le dictionnaire : {}".format(self.err))
            echo("   dans : " + self.lemma)

    def __str__ (self):
        return "{0.lemma}/{0.flags} {1}".format(self, self.getMorph(2))

    def check (self):
        sErr = ''
        if self.lemma == '':
            sErr += 'lemme vide'
        if re.match(r"^\s", self.lemma):
            sErr += 'premier caractère un espace dans <' + self.lemma + '>'
        if re.search(r"\s$", self.lemma):
            sErr += 'espace en fin de lemme'
        if re.match(r"v[0123]", self.po) and not re.match(r"[eas_][ix_][tx_][nx_][pqreuvx_][mx_][ex_z][ax_z]\b", self.po[2:]):
            sErr += 'verbe inconnu: ' + self.po
        if (re.match(r"S[.]", self.flags) and re.search("[sxz]$", self.lemma)) or (re.match(r"X[.]", self.flags) and not re.search("[ul]$", self.lemma)):
            sErr += 'drapeau inutile'
        if self.iz == '' and re.match(r"[SXAI](?!=)", self.flags) and self.po:
            sErr += '[is] vide'
        if re.match(r"pl|sg|inv", self.iz):
            sErr += '[is] incomplet'
        if re.match(r"[FW]", self.flags) and re.search(r"epi|mas|fem|inv|sg|pl", self.iz):
            sErr += '[is] incohérent'
        if re.search(r"pl|sg|inv", self.iz) and re.match(r"[SXAIFW](?!=)", self.flags):
            sErr += '[is] incohérent'
        if self.iz.endswith(("mas", "fem", "epi")) and (not self.flags or not self.flags.startswith(("S", "X", "F", "W", "A", "I", "U"))):
            sErr += '[is] incomplet'
        if self.flags.startswith(("a0", "b0", "c0", "d0")) and not self.lemma.endswith("er"):
            sErr += "drapeau pour verbe du 1ᵉʳ groupe sur un lemme non conforme"
        if self.flags.startswith("f") and not self.lemma.endswith(("ir", "ïr")):
            sErr += "drapeau pour verbe du 2ᵉ groupe sur un lemme non conforme"
        if sErr:
            echo('   error -  id: ' + self.iD, end = "")
            echo('  ' + sErr + '  in  ' + self.__str__())







|
>
|
<
|
|
|
|
|
<
|
<
|
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
|
|
>
>
|
<
|
>
>

|




|









|







|

|

|

|

|







690
691
692
693
694
695
696
697
698
699

700
701
702
703
704

705

706




















707
708
709
710
711

712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
        # lemme et drapeaux
        firstElems = elems[0].split('/') if elems[0] != "/"  else elems[0]
        self.lemma = firstElems[0]
        self.flags = firstElems[1]  if len(firstElems) > 1  else ''
        # morph
        for i in range(1, nElems):
            if len(elems[i]) > 3 and elems[i][2] == ':':
                sAttr, sContent = elems[i].split(':', 1)
                if sAttr in {"po", "is", "ds", "ts", "ip", "dp", "tp", "sp", "pa", "st", "al", "ph", "lx", "se", "et", "di", "fq", "id"}:
                    # vérification

                    if sAttr in {"po", "is", "lx", "se", "et"} \
                        and ( sContent not in tags.dTags[sAttr] and not (sAttr == "po" and re.match("v[0123][ea_][ix_][tx_][nx_][pqrex_][mx_][eaz_]", sContent)) ):
                        echo("  ## Étiquette inconnue pour le tag <{}>: {} @ {}/{}".format(sAttr, sContent, self.lemma, self.flags))
                    # renommage des attributs
                    if sAttr == "is":

                        sAttr = "iz"

                    if sAttr == "id":




















                        sAttr = "iD"
                    # modification
                    try:
                        if sAttr in {"po", "iz", "ds", "ts", "ip", "dp", "tp", "sp", "pa", "st", "al", "ph", "lx", "se", "et"}:
                            sContent = getattr(self, sAttr) + " " + sContent

                        setattr(self, sAttr, sContent.strip())
                    except:
                        echo('  ## Erreur. Attribut non attribuable: {}  @  {}/{}'.format(sAttr, self.lemma, self.flags))
                else:
                    echo('  ## Champ inconnu: {} @ {}/{}'.format(sAttr, self.lemma, self.flags))
            else:
                self.err = self.err + elems[i]
        if self.err:
            echo("\n## Erreur dans le dictionnaire : {}".format(self.err))
            echo("   @ : " + self.lemma)

    def __str__ (self):
        return "{0.lemma}/{0.flags} {1}".format(self, self.getMorph(2))

    def check (self):
        sErr = ''
        if self.lemma == '':
            sErr += 'lemme vide'
        if re.match(r"^\s", self.lemma):
            sErr += 'premier caractère un espace @ <' + self.lemma + '>'
        if re.search(r"\s$", self.lemma):
            sErr += 'espace en fin de lemme'
        if re.match(r"v[0123]", self.po) and not re.match(r"[eas_][ix_][tx_][nx_][pqreuvx_][mx_][ex_z][ax_z]\b", self.po[2:]):
            sErr += 'verbe inconnu: ' + self.po
        if (re.match(r"S[.]", self.flags) and re.search("[sxz]$", self.lemma)) or (re.match(r"X[.]", self.flags) and not re.search("[ul]$", self.lemma)):
            sErr += 'drapeau inutile'
        if self.iz == '' and re.match(r"[SXAI](?!=)", self.flags) and self.po:
            sErr += '<is> vide'
        if re.match(r"pl|sg|inv", self.iz):
            sErr += '<is> incomplet'
        if re.match(r"[FW]", self.flags) and re.search(r"epi|mas|fem|inv|sg|pl", self.iz):
            sErr += '<is> incohérent'
        if re.search(r"pl|sg|inv", self.iz) and re.match(r"[SXAIFW](?!=)", self.flags):
            sErr += '<is> incohérent'
        if self.iz.endswith(("mas", "fem", "epi")) and (not self.flags or not self.flags.startswith(("S", "X", "F", "W", "A", "I", "U"))):
            sErr += '<is> incomplet'
        if self.flags.startswith(("a0", "b0", "c0", "d0")) and not self.lemma.endswith("er"):
            sErr += "drapeau pour verbe du 1ᵉʳ groupe sur un lemme non conforme"
        if self.flags.startswith("f") and not self.lemma.endswith(("ir", "ïr")):
            sErr += "drapeau pour verbe du 2ᵉ groupe sur un lemme non conforme"
        if sErr:
            echo('   error -  id: ' + self.iD, end = "")
            echo('  ' + sErr + '  in  ' + self.__str__())
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
        if self.tp: txt += self.tp + ' '
        if self.sp: txt += self.sp + ' '
        return txt

    def getConjugation (self):
        sRes = self.lemma + "\t" + self.po[1:10] + "\n"
        for oFlex in self.lFlexions:
            sMorph = oFlex.sMorph[11:].rstrip("!").replace("ppas adj", "ppas").replace("ppas 1jsg", "ppas")
            if not sMorph.startswith("ppas") and sMorph.find(" ") > 1:
                # complex tags
                for s in getVerbMultiMorph(sMorph):
                    sRes += "_\t" + s + "\t" + oFlex.sFlexion + "\n"
            else:
                sRes += "_\t" + sMorph + "\t" + oFlex.sFlexion + "\n"
        return sRes + "$\n"

    def getDeclination (self):
        sRes = self.lemma + "\t" + self.flags + "\n"
        for oFlex in self.lFlexions:
            if "ppas" in oFlex.sMorph:
                sMorph = oFlex.sMorph.replace("ppas adj", "adj").replace("ppas 1jsg", "adj")
                sRes += "_\t" + sMorph + "\t" + oFlex.sFlexion + "\n"
            elif "adj" in oFlex.sMorph or "nom" in oFlex.sMorph:
                sRes += "_\t" + oFlex.sMorph + "\t" + oFlex.sFlexion + "\n"
        return sRes + "$\n"

    def calcOccurFromFlexions (self):
        self.nOccur = 0
        for o in self.lFlexions:
            self.nOccur += o.nOccur







|











<
<
<
|







936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954



955
956
957
958
959
960
961
962
        if self.tp: txt += self.tp + ' '
        if self.sp: txt += self.sp + ' '
        return txt

    def getConjugation (self):
        sRes = self.lemma + "\t" + self.po[1:10] + "\n"
        for oFlex in self.lFlexions:
            sMorph = oFlex.sMorph[11:].rstrip("!").replace("ppas adj", "ppas").replace("1jsg", "").strip()
            if not sMorph.startswith("ppas") and sMorph.find(" ") > 1:
                # complex tags
                for s in getVerbMultiMorph(sMorph):
                    sRes += "_\t" + s + "\t" + oFlex.sFlexion + "\n"
            else:
                sRes += "_\t" + sMorph + "\t" + oFlex.sFlexion + "\n"
        return sRes + "$\n"

    def getDeclination (self):
        sRes = self.lemma + "\t" + self.flags + "\n"
        for oFlex in self.lFlexions:



            if "adj" in oFlex.sMorph or "nom" in oFlex.sMorph:
                sRes += "_\t" + oFlex.sMorph + "\t" + oFlex.sFlexion + "\n"
        return sRes + "$\n"

    def calcOccurFromFlexions (self):
        self.nOccur = 0
        for o in self.lFlexions:
            self.nOccur += o.nOccur
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135

    def getGrammarCheckerRepr (self):
        return "{0.sFlexion}\t{0.oEntry.lemma}\t{1}\n".format(self, self._getSimpleTags())

    _dTagReplacement = {
        # POS
        "nom": ":N", "adj": ":A", "adv": ":W", "negadv": ":X", "mg": ":G", "nb": ":B", "nbro": ":Br",
        "loc.nom": ":ÉN", "loc.adj": ":ÉA", "loc.adv": ":ÉW", "loc.verb": ":ÉV",
        "interj": ":J", "loc.interj": ":ÉJ", "titr": ":T",
        "mas": ":m", "fem": ":f", "epi": ":e", "sg": ":s", "pl": ":p", "inv": ":i",
        "infi": ":Y",
        "ppre": ":P", "ppas": ":Q",
        "ipre": ":Ip", "iimp": ":Iq", "ipsi": ":Is", "ifut": ":If",
        "spre": ":Sp", "simp": ":Sq", "cond": ":K", "impe": ":E",
        "1sg": ":1s", "1isg": ":1ś", "1jsg": ":1ŝ", "2sg": ":2s", "3sg": ":3s", "1pl": ":1p", "2pl": ":2p", "3pl": ":3p", "3pl!": ":3p!",







|







1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116

    def getGrammarCheckerRepr (self):
        return "{0.sFlexion}\t{0.oEntry.lemma}\t{1}\n".format(self, self._getSimpleTags())

    _dTagReplacement = {
        # POS
        "nom": ":N", "adj": ":A", "adv": ":W", "negadv": ":X", "mg": ":G", "nb": ":B", "nbro": ":Br",
        "loc.nom": ":ÉN", "loc.adj": ":ÉA", "loc.adv": ":ÉW",
        "interj": ":J", "loc.interj": ":ÉJ", "titr": ":T",
        "mas": ":m", "fem": ":f", "epi": ":e", "sg": ":s", "pl": ":p", "inv": ":i",
        "infi": ":Y",
        "ppre": ":P", "ppas": ":Q",
        "ipre": ":Ip", "iimp": ":Iq", "ipsi": ":Is", "ifut": ":If",
        "spre": ":Sp", "simp": ":Sq", "cond": ":K", "impe": ":E",
        "1sg": ":1s", "1isg": ":1ś", "1jsg": ":1ŝ", "2sg": ":2s", "3sg": ":3s", "1pl": ":1p", "2pl": ":2p", "3pl": ":3p", "3pl!": ":3p!",
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
                elif fields[0] == 'ph':
                    self.ph = fields[1]  if self.pa == ''  else self.pa + ' ' + fields[1]
                elif fields[0] == 'lx':
                    self.lx = fields[1]  if self.lx == ''  else self.lx + ' ' + fields[1]
                elif fields[0] == 'di':
                    self.di = fields[1]
                else:
                    echo('Champ inconnu: {}  dans  {}'.format(fields[0], self.sFlagName))
            else:
                echo("  # Erreur affixe : {}".format(line))

    def isReplicationRule (self):
        "is this rule used for replication of a virtual lemma"
        return self.flags == "" and ((self.cut == "0" and self.add == "") or self.cut == self.add)








|







1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
                elif fields[0] == 'ph':
                    self.ph = fields[1]  if self.pa == ''  else self.pa + ' ' + fields[1]
                elif fields[0] == 'lx':
                    self.lx = fields[1]  if self.lx == ''  else self.lx + ' ' + fields[1]
                elif fields[0] == 'di':
                    self.di = fields[1]
                else:
                    echo('Champ inconnu: {} @ {}'.format(fields[0], self.sFlagName))
            else:
                echo("  # Erreur affixe : {}".format(line))

    def isReplicationRule (self):
        "is this rule used for replication of a virtual lemma"
        return self.flags == "" and ((self.cut == "0" and self.add == "") or self.cut == self.add)