Grammalecte  Diff

Differences From Artifact [642a6504a5]:

To Artifact [877bd56310]:


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
1
2
3
4
5
6
7
8
9

10
11
12
13
14
15
16









-







#!python3

__author__ = "Olivier R."
__license__ = "MPL 2"



import os
import sys
import time
import re
import collections
import zipfile
import math
import argparse
from enum import Enum

41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
40
41
42
43
44
45
46

47
48








49
50
51
52
53
54
55







-


-
-
-
-
-
-
-
-








# Dictionaries
# dSUBDIC maps a one-character sub-dictionary code to its name.
dSUBDIC = {
    '*': 'Commun',
    'R': 'Réforme1990',
    'M': 'Moderne',
    'C': 'Classique',
    'A': 'Annexe',
    'P': 'Multimots',
    'X': 'Contributeurs',
}

# Description of the “Moderne” dictionary.
# Each character of 'subDicts' is a key of dSUBDIC.
dMODERNE = {
    'name': 'DICTIONNAIRE ORTHOGRAPHIQUE FRANÇAIS “MODERNE”',
    'shortname': '“Moderne”',
    'asciiName': 'fr-moderne',
    'mozAsciiName': 'fr-FR-modern',
    'subDicts': '*MX',
    'mozId': 'fr-dicollecte-moderne',
    'description': "Dictionnaire français “Moderne”",
}

# Description of the “Classique” dictionary.
# Each character of 'subDicts' is a key of dSUBDIC.
dCLASSIQUE = {
    'name': 'DICTIONNAIRE ORTHOGRAPHIQUE FRANÇAIS “CLASSIQUE”',
    'shortname': '“Classique”',
    'asciiName': 'fr-classique',
    'mozAsciiName': 'fr-FR-classic',
    'subDicts': '*MCX',
    'mozId': 'fr-dicollecte-classique',
    'description': "Dictionnaire français “Classique”",
}
84
85
86
87
88
89
90
91
92


93
94
95
96
97
98
99
74
75
76
77
78
79
80


81
82
83
84
85
86
87
88
89







-
-
+
+







            'mozId': 'fr-dicollecte',
            'description': "Dictionnaire orthographique de la langue française" }


# Root folder of the build; a per-version sub-folder is created inside it.
BUILD_PATH = '_build'
# Name prefixes: the version string is appended to build the final names.
PREFIX_DICT_PATH = 'hunspell-french-dictionaries-v'
EXT_PREFIX_OOO = 'lo-oo-ressources-linguistiques-fr-v'
# EXT_PREFIX_MOZ and LEX_PREFIX were each assigned twice in a row; only the
# second assignment was ever effective, so the shadowed ones are removed.
EXT_PREFIX_MOZ = 'moz-hunspell-fr-v'
LEX_PREFIX = 'lexique-grammalecte-fr-v'
STATS_NAME = 'statistiques-v'

# License notice written at the top of generated text files.
MPLHEADER = (
    "# This Source Code Form is subject to the terms of the Mozilla Public\n"
    "# License, v. 2.0. If a copy of the MPL was not distributed with this\n"
    "# file, You can obtain one at http://mozilla.org/MPL/2.0/.\n\n"
)


311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
301
302
303
304
305
306
307

308
309
310
311
312
313
314







-







        "Écrire le fichier des affixes (.aff)"
        echo(' * Dictionnaire >> [ {}.aff ]'.format(dTplVars['asciiName']))
        info = "# This Source Code Form is subject to the terms of the Mozilla Public\n" + \
               "# License, v. 2.0. If a copy of the MPL was not distributed with this\n" + \
               "# file, You can obtain one at http://mozilla.org/MPL/2.0/.\n\n" + \
               "# AFFIXES DU {} v{}\n".format(dTplVars['name'], self.sVersion) + \
               "# par Olivier R. -- licence MPL 2.0\n" + \
               "# Généré le " + time.strftime("%d-%m-%Y à %H:%M") + "\n" \
               "# Pour améliorer le dictionnaire, allez sur https://grammalecte.net/\n\n"

        with open(spDst+'/'+dTplVars['asciiName']+'.aff', 'w', encoding='utf-8', newline="\n") as hDst:
            hDst.write(info)
            hDst.write(self.sSettings + "\n")
            if self.bShortenTags:
                hDst.write("AM {}\n".format(len(self.dAM)))
512
513
514
515
516
517
518
519

520
521
522
523
524
525
526
501
502
503
504
505
506
507

508
509
510
511
512
513
514
515







-
+







                    hDst.write(str(dRefW[key]))
                    hDst.write("\n")

    def writeLexicon (self, spfDst, version, oStatsLex):
        """Write the lexicon of inflected forms to the text file <spfDst>.

        The file starts with the MPL notice, a title line carrying <version>,
        the text returned by oStatsLex.getInfo() and the column header given
        by Flexion.header(); then one record per item of self.lFlexions.
        """
        echo(' * Lexique >> [ {} ] '.format(spfDst))
        with open(spfDst, 'w', encoding='utf-8', newline="\n") as hDst:
            hDst.write(MPLHEADER)
            # a single title line: the former "Dicollecte" title was written
            # in addition to this one, duplicating the header with a stale name
            hDst.write("# Lexique des formes fléchies du français - Grammalecte v{}\n# Licence : MPL v2.0\n\n".format(version))
            hDst.write(oStatsLex.getInfo())
            hDst.write(Flexion.header(oStatsLex))
            for oFlex in self.lFlexions:
                hDst.write(oFlex.__str__(oStatsLex))

    def writeGrammarCheckerLexicon (self, spfDst, version):
        echo(' * Lexique simplifié >> [ {} ] '.format(spfDst))
586
587
588
589
590
591
592
593

594
595
596
597
598
599
600
575
576
577
578
579
580
581

582
583
584
585
586
587
588
589







-
+







    def createMozillaExtensions (self, spBuild, dTplVars, lDictVars, spDestGL=""):
        # Mozilla extension 1
        echo(" * Dictionnaire >> extension pour Mozilla")
        dTplVars['version'] = self.sVersion
        sExtensionName = EXT_PREFIX_MOZ + self.sVersion
        spExt = spBuild + '/' + sExtensionName
        dir_util.mkpath(spExt+'/dictionaries')
        copyTemplate('_templates/moz', spExt, 'install.rdf', dTplVars)
        copyTemplate('_templates/moz', spExt, 'manifest.json', dTplVars)
        spDict = spBuild + '/' + PREFIX_DICT_PATH + self.sVersion
        file_util.copy_file(spDict+'/fr-classique.dic', spExt+'/dictionaries/fr-classic.dic')
        file_util.copy_file(spDict+'/fr-classique.aff', spExt+'/dictionaries/fr-classic.aff')
        copyTemplate('orthographe', spExt, 'README_dict_fr.txt', dTplVars)
        createZipFiles(spExt, spBuild, sExtensionName + '.xpi')
        # Grammalecte
        if spDestGL:
612
613
614
615
616
617
618
619

620
621
622
623
624
625
626
601
602
603
604
605
606
607

608
609
610
611
612
613
614
615







-
+







                    hDiff.write("{0.iD}\t{0.fq}\n".format(oEntry))
                    hNotes.write("{0.lemma}/{0.flags}\t{0.oldFq} > {0.fq}\n".format(oEntry))

    def createLexiconPackages (self, spBuild, version, oStatsLex, spDestGL=""):
        sLexName = LEX_PREFIX + version
        spLex = spBuild + '/' + sLexName
        dir_util.mkpath(spLex)
        # write Dicollecte lexicon
        # write lexicon
        self.sortLexiconByFreq()
        self.writeLexicon(spLex + '/' + sLexName + '.txt', version, oStatsLex)
        self.writeGrammarCheckerLexicon(spBuild + '/' + sLexName + '.lex', version)
        copyTemplate('lexique', spLex, 'README_lexique.txt', {'version': version})
        # zip
        createZipFiles(spLex, spBuild, sLexName + '.zip')
        # copy GC lexicon to Grammalecte
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
632
633
634
635
636
637
638






















































639
640
641
642
643
644
645







-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-







        with open(spBuild+'/dictDecl.txt', 'w', encoding='utf-8', newline="\n") as hDst:
            for oEntry in self.lEntry:
                if re.match("[SXFWIA]", oEntry.flags) and (oEntry.po.startswith("nom") or oEntry.po.startswith("adj")):
                    hDst.write(oEntry.getDeclination())
        if spDestGL:
            echo("   Fichier de déclinaison copié dans Grammalecte...")
            file_util.copy_file(spBuild+'/dictDecl.txt', spDestGL)

    def generateSpellVariants (self, nReq, spBuild):
        """Write the deletion variants of every inflected form to a text file.

        <nReq> is forced into the range 1..2. The output file is
        <spBuild>/dictSpellVariants-<nReq>.txt. For each distinct item of
        self.lFlexions, the form itself is written first (with "_" in the
        two other columns), then one line per variant returned by
        _generateDeleteVariants().
        """
        if nReq < 1:
            nReq = 1
        elif nReq > 2:
            nReq = 2
        echo(" * Lexique >> variantes par suppression... n = " + str(nReq))
        spfDst = spBuild+'/dictSpellVariants-'+str(nReq)+'.txt'
        with open(spfDst, 'w', encoding='utf-8', newline="\n") as hDst:
            for oFlex in frozenset(self.lFlexions):
                sFlexion = oFlex.sFlexion
                hDst.write(sFlexion+"\t_\t_\n")
                # number of deletions allowed depends on the length of the form
                nLen = len(sFlexion)
                if nLen <= 2:
                    nDel = 0
                elif nLen <= 5:
                    nDel = 1
                else:
                    nDel = nReq
                for sVariant, sCode in self._generateDeleteVariants(sFlexion, sFlexion, nDel):
                    # pad the code so that every line has the same number of columns
                    if "\t" not in sCode:
                        sCode = sCode+"\t_"
                    hDst.write(sVariant+"\t"+sCode+"\n")

    # Pairs of strings read by _generatePhonetVariants(), which unpacks each
    # pair as (string to replace, replacement).
    _lTupPhonet = [ ("ph", "f"), ("qu", "k"), ("ss", "c"), ("ss", "ç"), ("ct", "x"),
        ("oe", "œ"), ("ae", "æ"), ("ei", "é"), ("ai", "é"), ("au", "o"), ("eau", "o"),
    ]

    def _generatePhonetVariants (self, s):
        l = []
        for torep, rep in self._lTupPhonet():
            for m in torep.finditer(s):
                l.append( (s[:m.start(0)] + rep + s[m.end(0):], str(m.start(0))+":"+str(m.start(0)+len(rep))+">"+torep) )
        return l

    def _generateDeleteVariants (self, sWord0, sWordCur, n):
        "renvoie une liste de tuples : (forme dégradée de sWord, code de genèse de sWord)"
        # caution: recursive function
        if n == 0:
            return []
        lTup = []
        for i in range(len(sWordCur)):
            sNew = sWordCur[0:i]+sWordCur[i+1:]
            lTup.append( ( sNew, self._generateAddCode(sWord0, sNew) ) )
            lTup += self._generateDeleteVariants(sWord0, sNew, n-1)
        return lTup

    def _generateAddCode (self, sWord, sCrippled):
        "returns addCode to generate sWord from sCrippled"
        sAdd = ""
        for i in range(len(sWord)):
            if sWord[i] != sCrippled[i:i+1]:
                sCrippled = sCrippled[:i] + sWord[i] + sCrippled[i:]
                if sAdd:
                    sAdd += "\t"
                sAdd += str(i)+"+"+sWord[i]
        return sAdd  if sAdd  else "0"



class Entree:
    def __init__ (self, sLine):
        self.lemma = ''
        self.flags = ''
        # champs morphologiques Hunspell
808
809
810
811
812
813
814
815

816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832

833
834
835
836
837
838
839
743
744
745
746
747
748
749

750
751
752
753
754
755
756
757


758
759



760
761

762
763
764
765
766
767
768
769







-
+







-
-


-
-
-


-
+







            sErr += 'lemme vide'
        if re.match(r"^\s", self.lemma):
            sErr += 'premier caractère un espace dans <' + self.lemma + '>'
        if re.search(r"\s$", self.lemma):
            sErr += 'espace en fin de lemme'
        if re.match(r"v[0123]", self.po) and not re.match(r"[eas_][ix_][tx_][nx_][pqreuvx_][mx_][ex_z][ax_z]\b", self.po[2:]):
            sErr += 'verbe inconnu: ' + self.po
        if (re.match(r"S[*.]", self.flags) and re.search("[sxz]$", self.lemma)) or (re.match(r"X[*.]", self.flags) and not re.search("[ul]$", self.lemma)):
        if (re.match(r"S[.]", self.flags) and re.search("[sxz]$", self.lemma)) or (re.match(r"X[.]", self.flags) and not re.search("[ul]$", self.lemma)):
            sErr += 'drapeau inutile'
        if self.iz == '' and re.match(r"[SXAI](?!=)", self.flags) and self.po:
            sErr += '[is] vide'
        if re.match(r"pl|sg|inv", self.iz):
            sErr += '[is] incomplet'
        if re.match(r"[FW]", self.flags) and re.search(r"epi|mas|fem|inv|sg|pl", self.iz):
            sErr += '[is] incohérent'
        if re.match(r".\*", self.flags) and re.match(r"[bcdfgjklmnpqrstvwxz]", self.lemma):
            sErr += 'drapeau pour lemme commençant par une voyelle'
        if re.search(r"pl|sg|inv", self.iz) and re.match(r"[SXAIFW](?!=)", self.flags):
            sErr += '[is] incohérent'
        if re.search(r"nom|adj", self.po) and re.match(r"(?i)[aâàäáeéèêëiîïíìoôöóòuûüúù]", self.lemma) and re.match("[SFWXAI][.]", self.flags) \
           and "pel" not in self.lx:
            sErr += 'le drapeau derait finir avec *'
        if self.iz.endswith(("mas", "fem", "epi")) and (not self.flags or not self.flags.startswith(("S", "X", "F", "W", "A", "I", "U"))):
            sErr += '[is] incomplet'
        if self.flags.startswith(("a", "b", "c", "d")) and not self.lemma.endswith("er"):
        if self.flags.startswith(("a0", "b0", "c0", "d0")) and not self.lemma.endswith("er"):
            sErr += "drapeau pour verbe du 1ᵉʳ groupe sur un lemme non conforme"
        if self.flags.startswith("f") and not self.lemma.endswith(("ir", "ïr")):
            sErr += "drapeau pour verbe du 2ᵉ groupe sur un lemme non conforme"
        if sErr:
            echo('   error -  id: ' + self.iD, end = "")
            echo('  ' + sErr + '  in  ' + self.__str__())

938
939
940
941
942
943
944
945

946
947
948
949
950
951
952
868
869
870
871
872
873
874

875
876
877
878
879
880
881
882







-
+







            morph = self.lexMorph()
        lFlexions = [(self.lemma, morph, self.di)]  if iPR == 0 and not self.flags.endswith('()')  else []
        lFlexPrefix = []
        lFlexSuffix = []
        for sFlag in makeLongFlags(self.flags):
            if sFlag not in dFlags:
                if sFlag not in ['**', '()', '||', '--']:
                    lFlexions.append( (self.lemma, '[unknown flag: {}]'.format(sFlag)) )
                    lFlexions.append( (self.lemma, '[unknown flag: {}]'.format(sFlag), self.di) )
                    echo("ERROR: "  + self.lemma + ' - unknown flag: ' + sFlag)
            else:
                oFlag = dFlags[sFlag]
                if not oFlag.bSfx:
                    # cas des préfixes
                    for oRule in oFlag.lRules:
                        if oRule.motif.search(self.lemma):
1189
1190
1191
1192
1193
1194
1195
1196
1197


1198
1199
1200
1201
1202
1203
1204

1205
1206
1207
1208
1209
1210


1211
1212
1213
1214
1215
1216

1217
1218
1219
1220
1221
1222
1223
1119
1120
1121
1122
1123
1124
1125


1126
1127
1128
1129
1130
1131
1132
1133

1134
1135
1136
1137
1138


1139
1140
1141
1142
1143
1144
1145

1146
1147
1148
1149
1150
1151
1152
1153







-
-
+
+






-
+




-
-
+
+





-
+








    def getGrammarCheckerRepr (self):
        """Return one tab-separated line: inflected form, lemma of the entry, simplified tags."""
        sTags = self._getSimpleTags()
        return "{0.sFlexion}\t{0.oEntry.lemma}\t{1}\n".format(self, sTags)

    _dTagReplacement = {
        # POS
        "nom": ":N", "adj": ":A", "adv": ":W", "negadv": ":X", "mg": ":G", "nb": ":B", "nbro": ":Br",
        "loc.nom": ":Ñ", "loc.adj": ":Â", "loc.adv": ":Ŵ", "loc.verb": ":",
        "interj": ":J", "loc.interj": ":Ĵ", "titr": ":T",
        "loc.nom": ":ÉN", "loc.adj": ":ÉA", "loc.adv": ":ÉW", "loc.verb": ":ÉV",
        "interj": ":J", "loc.interj": ":ÉJ", "titr": ":T",
        "mas": ":m", "fem": ":f", "epi": ":e", "sg": ":s", "pl": ":p", "inv": ":i",
        "infi": ":Y",
        "ppre": ":P", "ppas": ":Q",
        "ipre": ":Ip", "iimp": ":Iq", "ipsi": ":Is", "ifut": ":If",
        "spre": ":Sp", "simp": ":Sq", "cond": ":K", "impe": ":E",
        "1sg": ":1s", "1isg": ":1ś", "1jsg": ":1ŝ", "2sg": ":2s", "3sg": ":3s", "1pl": ":1p", "2pl": ":2p", "3pl": ":3p", "3pl!": ":3p!",
        "prepv": ":Rv", "prep": ":R", "loc.prep": ":Ŕ", "loc.prepv": "Ŕv",
        "prepv": ":Rv", "prep": ":R", "loc.prep": ":ÉR", "loc.prepv": "ÉRv",
        "detpos": ":Dp", "detdem": ":Dd", "detind": ":Di", "detneg": ":Dn", "detex": ":De", "det": ":D",
        "advint": ":U",
        "prodem": ":Od", "proind": ":Oi", "proint": ":Ot", "proneg": ":On", "prorel": ":Or", "proadv": ":Ow",
        "properobj": ":Oo", "propersuj": ":Os", "1pe": ":O1", "2pe": ":O2", "3pe": ":O3", "preverb": ":Ov",
        "cjco": ":Cc", "cjsub": ":Cs", "cj": ":C", "loc.cj": ":Ĉ", "loc.cjsub": ":Ĉs",
        "prn": ":M1", "patr": ":M2", "loc.patr": ":2", "npr": ":MP", "nompr": ":NM",
        "cjco": ":Cc", "cjsub": ":Cs", "cj": ":C", "loc.cj": ":ÉC", "loc.cjsub": ":ÉCs",
        "prn": ":M1", "patr": ":M2", "loc.patr": ":ÉM2", "npr": ":MP", "nompr": ":NM",
        "pfx": ":Zp", "sfx": ":Zs",
        "div": ":H",
        "err": ":F",
        "ponc": ":@p", "sign": ":@s",
        # LEX
        "symb": ";S"
        "symb": ";S", "unit": ";U"
    }

    def _getSimpleTags (self):
        s = ""
        # POS
        for sTag in self.sMorph.split():
            if sTag.startswith("v"):
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1435
1436
1437
1438
1439
1440
1441

1442
1443
1444
1445
1446
1447
1448







-








def main ():
    xParser = argparse.ArgumentParser()
    xParser.add_argument("-v", "--verdic", help="set dictionary version, i.e. 5.4", type=str, default="X.Y.z")
    xParser.add_argument("-m", "--mode", help="0: no tags,  1: Hunspell tags (default),  2: All tags", type=int, choices=[0, 1, 2], default=1)
    xParser.add_argument("-u", "--uncompress", help="do not use Hunspell compression", action="store_true")
    xParser.add_argument("-s", "--simplify", help="no virtual lemmas", action="store_true")
    xParser.add_argument("-sv", "--spellvariants", help="generate spell variants", action="store_true")
    xParser.add_argument("-gl", "--grammalecte", help="copy generated files to Grammalecte folders", action="store_true")
    xArgs = xParser.parse_args()

    if xArgs.simplify:
        xArgs.mode = 0
        xArgs.uncompress = True

1527
1528
1529
1530
1531
1532
1533
1534

1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1456
1457
1458
1459
1460
1461
1462

1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474


1475
1476
1477
1478
1479
1480
1481







-
+











-
-







    spBuild = BUILD_PATH + '/' + xArgs.verdic
    dir_util.mkpath(spBuild)

    ### Lecture des fichiers et création du dictionnaire
    oFrenchDict = Dictionnaire(xArgs.verdic, "French dictionary")
    for sFile in ['orthographe/FRANCAIS.dic']:
        oFrenchDict.readDictionary(sFile)
    oFrenchDict.readAffixes('orthographe/FRANCAIS_5.aff')
    oFrenchDict.readAffixes('orthographe/FRANCAIS_7.aff')

    ### Contrôle
    oFrenchDict.sortEntriesNatural()
    oFrenchDict.checkEntries()

    ### Lexique
    oFrenchDict.generateFlexions()
    oFrenchDict.calcMetagraphe()
    oFrenchDict.calcMetaphone2()

    #oFrenchDict.createNgrams(spBuild, 3)
    if xArgs.spellvariants:
        oFrenchDict.generateSpellVariants(1, spBuild)

    ### Statistiques
    spfStats = spBuild+'/'+STATS_NAME+xArgs.verdic+'.txt'
    oStatsLex = StatsLex(oFrenchDict)
    oStatsLex.addLexFromFile('lexique/corpus_data/stats_google_ngram_1.txt', 'G', 'Google 1-grams')
    oStatsLex.addLexFromFile('lexique/corpus_data/stats_frwiki.txt', 'W', 'Wikipédia')
    oStatsLex.addLexFromFile('lexique/corpus_data/stats_frwikisource.txt', 'S', 'Wikisource')