Grammalecte  Diff

Differences From Artifact [21ee33ebdc]:

To Artifact [4630ffd3ad]:


21
22
23
24
25
26
27
28

29
30
31
32
33
34
35
21
22
23
24
25
26
27

28
29
30
31
32
33
34
35







-
+








import metagraphe
import metaphone2


# Dictionnaire des caractères pour le tri naturel.
# Ordre souhaitable, mais pose problème pour la recherche, car engendre des égalités de lemmes différents.
# Il faut donc travailler sur un dictionnaire trié *numériquement* et le sauvegarder selon le tri *naturel*             
# Il faut donc travailler sur un dictionnaire trié *numériquement* et le sauvegarder selon le tri *naturel*
CHARMAP = str.maketrans({ 'à': 'a',  'À': 'A',  'â': 'a',  'Â': 'A',  'ä': 'a',  'Ä': 'A',  'å': 'a',  'Å': 'A',  'ā': 'a',  'Ā': 'A',
                          'ç': 'c',  'Ç': 'C',
                          'é': 'e',  'É': 'E',  'è': 'e',  'È': 'E',  'ê': 'e',  'Ê': 'E',  'ë': 'e',  'Ë': 'E',  'ē': 'e',  'Ē': 'E',
                          'î': 'i',  'Î': 'I',  'ï': 'i',  'Ï': 'I',  'ī': 'i',  'Ī': 'I',
                          'ñ': 'n',
                          'ô': 'o',  'Ô': 'O',  'ö': 'o',  'Ö': 'O',  'ō': 'o',  'Ō': 'O',
                          'ù': 'u',  'Ù': 'U',  'û': 'u',  'Û': 'U',  'ü': 'u',  'Ü': 'U',  'ū': 'u',  'Ū': 'U',
215
216
217
218
219
220
221
222

223
224
225
226

227
228
229
230
231
232
233
215
216
217
218
219
220
221

222
223
224
225

226
227
228
229
230
231
232
233







-
+



-
+







        # Affixes
        self.sSettings = '' # enregistre tout avant la ligne # END
        self.dFlags = collections.OrderedDict()
        self.bShortenTags = False
        self.dAM = collections.OrderedDict() # étiquettes morphologiques
        self.dAF = collections.OrderedDict() # étiquettes drapeaux
        # Flexions
        self.lFlexions = []           # liste des flexions avec lemme, morphologie et occurrences 
        self.lFlexions = []           # liste des flexions avec lemme, morphologie et occurrences
        self.lStatsLex = []
        self.nTotOccurRecognizedWords = 0
        self.aFlexions = None
    

    def readDictionary (self, spf):
        "Lecture du dictionnaire"
        echo('Dictionnaire << [ {} ]'.format(spf), end=' ')
        for sLine in readfile(spf):
            sLine = sLine.strip()
            if not sLine.isdigit() and not sLine.startswith("#"):
                self.lEntry.append(Entree(sLine))
278
279
280
281
282
283
284
285

286
287
288
289
290
291
292
278
279
280
281
282
283
284

285
286
287
288
289
290
291
292







-
+







                dAF[oEntry.flags] = dAF.get(oEntry.flags, 0) + 1
            sMorph = oEntry.getMorph(nMode).strip()
            if sMorph:
                dAM[sMorph] = dAM.get(sMorph, 0) + 1

        lAF = sorted(dAF.items(), key = lambda x: (x[1], x[0]), reverse=True)
        lAM = sorted(dAM.items(), key = lambda x: (x[1], x[0]), reverse=True)
        

        with open(spDst, 'a', encoding='utf-8', newline="\n") as hDst:
            hDst.write("\n\nDrapeaux :\n")
            for nAF, elem in enumerate(lAF, 1):
                self.dAF[elem[0]] = str(nAF)
                hDst.write("  > {0[1]:>8} : {0[0]}\n".format(elem))
            hDst.write("\n\nMorphologies :\n")
            for nAM, elem in enumerate(lAM, 1):
301
302
303
304
305
306
307
308

309
310
311
312
313
314
315
316
317
318
319

320
321
322
323
324
325
326
301
302
303
304
305
306
307

308
309
310
311
312
313
314
315
316
317
318

319
320
321
322
323
324
325
326







-
+










-
+







            if oEntry.di in dTplVars['subDicts']:
                nEntry += 1
        with open(spDst+'/'+dTplVars['asciiName']+'.dic', 'w', encoding='utf-8', newline="\n") as hDst:
            hDst.write(str(nEntry)+"\n")
            for oEntry in self.lEntry:
                if oEntry.di in dTplVars['subDicts']:
                    hDst.write(oEntry.getEntryLine(self, nMode, bSimplified))
    

    def writeAffixes (self, spDst, dTplVars, nMode, bSimplified):
        "Écrire le fichier des affixes (.aff)"
        echo(' * Dictionnaire >> [ {}.aff ]'.format(dTplVars['asciiName']))
        info = "# This Source Code Form is subject to the terms of the Mozilla Public\n" + \
               "# License, v. 2.0. If a copy of the MPL was not distributed with this\n" + \
               "# file, You can obtain one at http://mozilla.org/MPL/2.0/.\n\n" + \
               "# AFFIXES DU {} v{}\n".format(dTplVars['name'], self.sVersion) + \
               "# par Olivier R. -- licence MPL 2.0\n" + \
               "# Généré le " + time.strftime("%d-%m-%Y à %H:%M") + "\n" \
               "# Pour améliorer le dictionnaire, allez sur http://www.dicollecte.org/\n\n"
               

        with open(spDst+'/'+dTplVars['asciiName']+'.aff', 'w', encoding='utf-8', newline="\n") as hDst:
            hDst.write(info)
            hDst.write(self.sSettings + "\n")
            if self.bShortenTags:
                hDst.write("AM {}\n".format(len(self.dAM)))
                for item in self.dAM.items():
                    hDst.write("AM {}\n".format(item[0]))
334
335
336
337
338
339
340
341

342
343
344
345
346
347
348
334
335
336
337
338
339
340

341
342
343
344
345
346
347
348







-
+








    def sortEntriesNatural (self):
        "Replace self.lEntry with a new list sorted with the key Entree.keyTriNat."
        echo(' * Dictionnaire - Tri naturel des entrées...')
        lSorted = list(self.lEntry)
        lSorted.sort(key=Entree.keyTriNat)
        self.lEntry = lSorted

    def sortEntriesNumerical (self):
        "Replace self.lEntry with a new list sorted with the key Entree.keyTriNum."
        echo(' * Dictionnaire - Tri numérique des entrées...')
        # The sort is done once: repeating it with the same key cannot change the result.
        self.lEntry = sorted(self.lEntry, key=Entree.keyTriNum)

    def sortLexiconByFlexion (self):
        "Replace self.lFlexions with a new list sorted with the key Flexion.keyFlexion."
        echo(' * Dictionnaire - tri du lexique (par flexion)...')
        lSorted = list(self.lFlexions)
        lSorted.sort(key=Flexion.keyFlexion)
        self.lFlexions = lSorted

    def sortLexiconByFreq (self):
        echo(' * Dictionnaire - tri du lexique (par fréquence)...')
372
373
374
375
376
377
378
379

380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399

400
401
402
403
404
405
406
407
408
409
410
411
412
413

414
415
416
417
418
419
420

421
422
423
424
425
426
427
428
429
430

431
432
433
434
435
436
437
438
439

440
441
442
443
444
445
446

447
448
449
450
451
452
453
372
373
374
375
376
377
378

379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398

399
400
401
402
403
404
405
406
407
408
409
410
411
412

413
414
415
416
417
418
419

420
421
422
423
424
425
426
427
428
429

430
431
432
433
434
435
436
437
438

439
440
441
442
443
444
445

446
447
448
449
450
451
452
453







-
+



















-
+













-
+






-
+









-
+








-
+






-
+







                d[oFlex.sFlexion] = [oFlex.oEntry]
        for oFlex in self.lFlexions:
            oFlex.lMulti = list(d[oFlex.sFlexion])
            oFlex.nMulti = len(oFlex.lMulti)
        for oFlex in self.lFlexions:
            oFlex.lMulti.remove(oFlex.oEntry)
            oFlex.nMulti -= 1
        

    def setTagsFrom (self, other):
        """Copy tags from the entries of another dictionary.

        Each of the first self.nEntry entries of self.lEntry calls its own
        setTagsFrom() with every entry of other.lEntry that has the same lemma
        and the same flags, in the order of other.lEntry.
        """
        echo(' * Dictionnaire - copie des tags...')
        # Index the other dictionary once by (lemma, flags) instead of
        # rescanning it entirely for each entry of this dictionary.
        # NOTE(review): assumes lemma and flags are hashable (strings) and are
        # not modified by Entree.setTagsFrom() -- confirm.
        dOther = {}
        for oEntry in other.lEntry:
            dOther.setdefault((oEntry.lemma, oEntry.flags), []).append(oEntry)
        for i in range(self.nEntry):
            oMine = self.lEntry[i]
            for oEntry in dOther.get((oMine.lemma, oMine.flags), ()):
                oMine.setTagsFrom(oEntry)

    def calculateStats (self, oStatsLex, spfDst):
        echo(" * Dictionnaire - calculs...")
        with open(spfDst, 'w', encoding='utf-8', newline="\n") as hDst:
            # Occurrences brutes des formes fléchies
            echo("   comptage des occurrences...")
            hDst.write(oStatsLex.getInfo())
            for oFlex in self.lFlexions:
                oFlex.setOccur(oStatsLex.getFlexionOccur(oFlex.sFlexion))
            self.nTotOccurRecognizedWords = 0
            for oFlex in self.lFlexions:
                oFlex.calcOccur()
                self.nTotOccurRecognizedWords += oFlex.nOccur
            

            # Report des occurrences
            echo("   report des occurrences des formes fléchies multiples...")
            hDst.write("Report des occurrences des formes fléchies multiples :\n")
            hDst.write("  Légende :\n")
            hDst.write("    >>   le nombre d’occurrences de la flexion est ramené à la moyenne.\n")
            hDst.write("    +>   le nombre d’occurrences de la flexion est augmenté avec le surplus d’occurrences des flexions ramenées à la moyenne.\n")
            hDst.write("    %>   le nombre d’occurrences de la flexion est pondéré avec le poids de la moyenne de l’entrée.\n\n")

            for oEntry in self.lEntry:
                oEntry.calcOccurFromFlexions()
                oEntry.calcAverageKnownOccurrence()
                oEntry.solveOccurMultipleFlexions(hDst, oStatsLex)
                oEntry.calcOccurFromFlexions()
            

            # Fréquences
            echo("   calcul des fréquences et indices de fréquence...")
            for oFlex in self.lFlexions:
                oFlex.calcFreq(self.nTotOccurRecognizedWords)
            for oEntry in self.lEntry:
                oEntry.calcFreq(self.nTotOccurRecognizedWords)
            

            # Entrées, statistiques
            echo("   statistiques...")
            hDst.write("\n\nNatures grammaticales :\n")
            d = {}
            for oEntry in self.lEntry:
                po = re.sub("(?<=v[0-3])[itnpqrmaezx_]+", "", oEntry.po)
                d[po] = d.get(po, 0) + 1
            for e in sorted(d.items(), key = lambda x: (x[1], x[0]), reverse=True):
                hDst.write(" * {0[1]:<15} : {0[0]}\n".format(e))
            

            hDst.write("\n\nVentilation des entrées par indice de fréquence :\n")
            d1 = {}
            d2 = {}
            for oEntry in self.lEntry:
                d1[oEntry.fq] = d1.get(oEntry.fq, 0) + 1
                d2[oEntry.fq] = d2.get(oEntry.fq, 0) + oEntry.fFreq
            for k in sorted(d1.keys()):
                hDst.write(" * {} : {} entrées ({:.2f} %)  → {:.9f} %\n".format(k, d1[k], (d1[k]*100)/self.nEntry, d2[k]))
                    

            hDst.write("\n\nRépartition des entrées par sous-dictionnaire :\n")
            d = {}
            for oEntry in self.lEntry:
                d[oEntry.di] = d.get(oEntry.di, 0) + 1
            for sKey, nVal in d.items():
                hDst.write(" * {0:<15} : {1} entrées ({2:.2f} %)\n".format(dSUBDIC[sKey], nVal, (nVal*100)/self.nEntry))
            

            # Occurrences des lettres
            echo("   occurrences des lettres...")
            d = {}
            for oFlex in self.lFlexions:
                for c in oFlex.sFlexion:
                    d[c] = d.get(c, 0) + oFlex.nOccur
            nTot = 0
472
473
474
475
476
477
478
479

480
481
482
483
484

485
486
487
488
489
490
491
472
473
474
475
476
477
478

479
480
481
482
483

484
485
486
487
488
489
490
491







-
+




-
+







            hDst.write("\n\nNombre de formes fléchies : {}\n".format(len(self.lFlexions)))
            hDst.write("\n\nNombre de graphies : {}\n".format(len(self.aFlexions)))

    def calcMetagraphe (self):
        "Call calcMetagraphe() on every flexion of self.lFlexions."
        echo(" * Lexique - Metagraphe")
        for oFlexion in self.lFlexions:
            oFlexion.calcMetagraphe()
    

    def calcMetaphone2 (self):
        "Call calcMetaphone2() on every flexion of self.lFlexions."
        echo(" * Lexique - Metaphone 2")
        for oFlexion in self.lFlexions:
            oFlexion.calcMetaphone2()
    

    def createNgrams (self, spDest, n):
        echo(" * Lexique - Ngrams " + str(n))
        if n < 2:
            echo("erreur: n = " + str(n))
            return
        dOccur = {} # ngram:n
        dRefW = {} # ngram:set(idx)
560
561
562
563
564
565
566
567

568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588

589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608

609
610
611
612
613
614
615
616
617

618
619
620
621
622
623
624
560
561
562
563
564
565
566

567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587

588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607

608
609
610
611
612
613
614
615
616

617
618
619
620
621
622
623
624







-
+




















-
+



















-
+








-
+







        file_util.copy_file('_templates/ooo/french_flag.png', spExt)
        file_util.copy_file('_templates/ooo/french_flag_16.bmp', spExt+'/ui')
        copyTemplate('_templates/ooo', spExt, 'description.xml', dTplVars)
        copyTemplate('_templates/ooo', spExt, 'dictionaries.xcu', dTplVars)
        #file_util.copy_file('_templates/ooo/dictionaries.xcu.tpl.xml', spExt)
        copyTemplate('_templates/ooo', spExt, 'package-description.txt', dTplVars)
        for dVars in lDictVars:
            dicPath = spBuild + '/' + PREFIX_DICT_PATH + self.sVersion 
            dicPath = spBuild + '/' + PREFIX_DICT_PATH + self.sVersion
            file_util.copy_file(dicPath+'/'+dVars['asciiName']+'.dic', spExt+'/dictionaries/'+dVars['asciiName']+'.dic')
            file_util.copy_file(dicPath+'/'+dVars['asciiName']+'.aff', spExt+'/dictionaries/'+dVars['asciiName']+'.aff')
        copyTemplate('orthographe', spExt+'/dictionaries', 'README_dict_fr.txt', dTplVars)
        # thesaurus
        file_util.copy_file('thesaurus/thes_fr.dat', spExt+'/dictionaries')
        file_util.copy_file('thesaurus/thes_fr.idx', spExt+'/dictionaries')
        file_util.copy_file('thesaurus/README_thes_fr.txt', spExt+'/dictionaries')
        # hyphenation
        file_util.copy_file('césures/hyph_fr.dic', spExt+'/dictionaries')
        file_util.copy_file('césures/hyph_fr.iso8859-1.dic', spExt+'/dictionaries')
        file_util.copy_file('césures/frhyph.tex', spExt+'/dictionaries')
        file_util.copy_file('césures/hyph-fr.tex', spExt+'/dictionaries')
        file_util.copy_file('césures/README_hyph_fr-3.0.txt', spExt+'/dictionaries')
        file_util.copy_file('césures/README_hyph_fr-2.9.txt', spExt+'/dictionaries')
        # zip
        createZipFiles(spExt, spBuild, sExtensionName + '.oxt')
        # copy to Grammalecte Project
        if spDestGL:
            echo("   extension copiée dans Grammalecte...")
            dir_util.copy_tree(spExt+'/dictionaries', spDestGL)
    

    def createMozillaExtensions (self, spBuild, dTplVars, lDictVars, spDestGL=""):
        """Assemble the Mozilla extension folder and hand it to createZipFiles().

        spBuild   -- build folder; the extension folder is created inside it
        dTplVars  -- template variables; the 'version' key is overwritten with self.sVersion
        lDictVars -- dictionary descriptions; the 'asciiName' and 'mozAsciiName' keys are read
        spDestGL  -- Grammalecte destination folder; when empty, no dictionary is copied there
        """
        # Mozilla extension 1
        echo(" * Dictionnaire >> extension pour Mozilla")
        dTplVars['version'] = self.sVersion
        sExtensionName = EXT_PREFIX_MOZ + self.sVersion
        spExt = spBuild + '/' + sExtensionName
        dir_util.mkpath(spExt+'/dictionaries')
        copyTemplate('_templates/moz', spExt, 'install.rdf', dTplVars)
        spDict = spBuild + '/' + PREFIX_DICT_PATH + self.sVersion
        # the "classique" dictionary is shipped under the name "classic"
        for sSrc, sDst in ( ('/fr-classique.dic', '/dictionaries/fr-classic.dic'),
                            ('/fr-classique.aff', '/dictionaries/fr-classic.aff') ):
            file_util.copy_file(spDict+sSrc, spExt+sDst)
        copyTemplate('orthographe', spExt, 'README_dict_fr.txt', dTplVars)
        createZipFiles(spExt, spBuild, sExtensionName + '.xpi')
        # Grammalecte
        if not spDestGL:
            return
        echo(" * Dictionnaire >> copie des dicos dans Grammalecte")
        for dVars in lDictVars:
            spSrc = spDict+'/'+dVars['asciiName']
            spDst = spDestGL+'/'+dVars['mozAsciiName']+"/"+dVars['mozAsciiName']
            file_util.copy_file(spSrc+'.dic', spDst+'.dic')
            file_util.copy_file(spSrc+'.aff', spDst+'.aff')
    

    def createFileIfqForDB (self, spBuild):
        """Write the entries whose frequency index changed to two files in spBuild.

        dictIdxIfq-<version>.diff.txt receives "<iD> TAB <fq>" lines and
        dictIdxIfq-<version>.notes.txt receives "<lemma>/<flags> TAB <oldFq> > <fq>" lines.
        Entries with fq equal to oldFq are left out of both files.
        """
        echo(" * Dictionnaire >> indices de fréquence pour la DB...")
        spfBase = spBuild+'/dictIdxIfq-'+self.sVersion
        with open(spfBase+'.diff.txt', 'w', encoding='utf-8', newline="\n") as hDiff, \
             open(spfBase+'.notes.txt', 'w', encoding='utf-8', newline="\n") as hNotes:
            for oEntry in self.lEntry:
                if oEntry.fq == oEntry.oldFq:
                    continue
                hDiff.write("{0.iD}\t{0.fq}\n".format(oEntry))
                hNotes.write("{0.lemma}/{0.flags}\t{0.oldFq} > {0.fq}\n".format(oEntry))
        

    def createLexiconPackages (self, spBuild, version, oStatsLex, spDestGL=""):
        sLexName = LEX_PREFIX + version
        spLex = spBuild + '/' + sLexName
        dir_util.mkpath(spLex)
        # write Dicollecte lexicon
        self.sortLexiconByFreq()
        self.writeLexicon(spLex + '/' + sLexName + '.txt', version, oStatsLex)
737
738
739
740
741
742
743
744

745
746
747
748
749
750
751
737
738
739
740
741
742
743

744
745
746
747
748
749
750
751







-
+







        self.nFlexions = 0
        self.lFlexions = []
        self.sRadical = ''
        self.nOccur = 0
        self.nAKO = -1   # Average known occurrences
        self.fFreq = 0
        self.oldFq = ''
        

        sLine = sLine.rstrip(" \n")
        # commentaire
        if '#' in sLine:
            sLine, comment = sLine.split('#', 1)
            self.comment = comment.strip()
        # éléments de la ligne
        elems = sLine.split()
797
798
799
800
801
802
803
804

805
806
807
808
809
810
811
797
798
799
800
801
802
803

804
805
806
807
808
809
810
811







-
+







                else:
                    echo('  ## Champ inconnu: {}  dans  {}/{}'.format(fields[0], self.lemma, self.flags))
            else:
                self.err = self.err + elems[i]
        if self.err:
            echo("\n## Erreur dans le dictionnaire : {}".format(self.err))
            echo("   dans : " + self.lemma)
                

    def __str__ (self):
        "Return '<lemma>/<flags> <morphology>', the morphology being self.getMorph(2)."
        sMorph = self.getMorph(2)
        return "{}/{} {}".format(self.lemma, self.flags, sMorph)

    def check (self):
        sErr = ''
        if self.lemma == '':
            sErr += 'lemme vide'
863
864
865
866
867
868
869
870

871
872
873
874
875
876
877
863
864
865
866
867
868
869

870
871
872
873
874
875
876
877







-
+








    def keyTriNat (self):
        "Sort key: (lemma translated through CHARMAP, flags, po)."
        sLemma = self.lemma.translate(CHARMAP)
        return (sLemma, self.flags, self.po)

    def keyTriNum (self):
        "Sort key: (lemma, flags, po), with the lemma left untranslated."
        tKey = (self.lemma, self.flags, self.po)
        return tKey

    def getEntryLine (self, oDict, nMode, bSimplified=False):    
    def getEntryLine (self, oDict, nMode, bSimplified=False):
        sLine = self.lemma
        if self.flags:
            sLine += '/'
            sLine += self.flags  if not oDict.bShortenTags or bSimplified  else oDict.dAF[self.flags]
        if bSimplified:
            return sLine.replace("()", "") + "\n"
        if nMode > 0:
926
927
928
929
930
931
932
933

934
935
936
937
938
939
940
926
927
928
929
930
931
932

933
934
935
936
937
938
939
940







-
+







                    #echo(sFlex + " " + sMorph + ", ")
                    pass
        # Drapeaux dont le lemme féminin doit être remplacé par le masculin dans la gestion des formes fléchies
        if self.flags.startswith(("F.", "F*", "W.", "W*")):
            # recherche de la forme masculine
            for t in lTuples:
                sMorph = self.clean(t[1])
                if sMorph.endswith('mas') or sMorph.endswith('mas sg') or sMorph.endswith('mas inv'): 
                if sMorph.endswith('mas') or sMorph.endswith('mas sg') or sMorph.endswith('mas inv'):
                    self.sRadical = t[0]
        else:
            self.sRadical = self.lemma
        # Tag duplicates
        d = {}
        for oFlex in self.lFlexions:
            d[oFlex.sFlexion] = d.get(oFlex.sFlexion, 0) + 1
971
972
973
974
975
976
977
978

979
980
981
982
983
984
985
971
972
973
974
975
976
977

978
979
980
981
982
983
984
985







-
+







                                        lFlexions.append( (oRule.add+flex[0], flex[1]+ruleMorph) )
                                else:
                                    lFlexions.append(flexion)
                            else:
                                flexion = (self.lemma.replace(oRule.cut, oRule.add, 1), ruleMorph+morph, oRule.di)
                                if oFlag.bMix:
                                    lFlexPrefix.append(flexion)
                                    for flex in lFlexSuffix: 
                                    for flex in lFlexSuffix:
                                        lFlexions.append( (flex[0].replace(oRule.cut, oRule.add, 1), flex[1]+ruleMorph) )
                                else:
                                    lFlexions.append(flexion)
                            if oRule.flags != '' and oRule.flags != '**':
                                lFlexions.extend(Entree(flexion[0]+'/'+oRule.flags)._flechir(dFlags, flexion[1], iPR+1))
                else:
                    # cas des suffixes
1058
1059
1060
1061
1062
1063
1064
1065

1066
1067
1068
1069
1070
1071
1072
1073
1074

1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088

1089
1090
1091
1092
1093
1094
1095
1058
1059
1060
1061
1062
1063
1064

1065
1066
1067
1068
1069
1070
1071
1072
1073

1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087

1088
1089
1090
1091
1092
1093
1094
1095







-
+








-
+













-
+







    def calcOccurFromFlexions (self):
        "Set self.nOccur to the sum of the nOccur of every flexion in self.lFlexions (0 if there is none)."
        self.nOccur = sum(oFlex.nOccur  for oFlex in self.lFlexions)

    def calcAverageKnownOccurrence (self):
        """Set self.nAKO to the average occurrence count of the unshared flexions.

        Only the flexions with nMulti == 0 (no equivalent in the other entries)
        are taken into account. The average is rounded up; nAKO is -1 when
        there is no such flexion.
        """
        lOccur = [ oFlex.nOccur  for oFlex in self.lFlexions  if oFlex.nMulti == 0 ]
        if lOccur:
            self.nAKO = math.ceil(sum(lOccur) / len(lOccur))
        else:
            self.nAKO = -1
    

    def solveOccurMultipleFlexions (self, hDst, oStatsLex):
        sBlank = "           "
        if self.nAKO >= 0:
            for oFlex in self.lFlexions:
                if oFlex.nMulti > 0 and not oFlex.bBlocked:
                    # on trie les entrées avec AKO et sans AKO
                    lEntWithAKO = []
                    lEntNoAKO = []
                    for oEntry in oFlex.lMulti:
                        if oEntry.nAKO >= 0:
                            lEntWithAKO.append(oEntry)
                        else:
                            lEntNoAKO.append(oEntry)
                    

                    if lEntNoAKO:
                        # on calcule la différence totale occasionnée par du passage des flexions appartenant à des entrées avec AKO au niveau AKO
                        nDiff = (oFlex.nOccur - self.nAKO) * oFlex.nDup
                        for oEntry in lEntWithAKO:
                            for oFlexM in oEntry.lFlexions:
                                if oFlex.sFlexion == oFlexM.sFlexion:
                                    nDiff += oFlexM.nOccur - oEntry.nAKO
1117
1118
1119
1120
1121
1122
1123
1124

1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138

1139
1140
1141
1142
1143
1144
1145
1117
1118
1119
1120
1121
1122
1123

1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137

1138
1139
1140
1141
1142
1143
1144
1145







-
+













-
+







                                        oFlexM.setOccurAndBlock(nNewOccur)
                    else:
                        # Toutes les entrées sont avec AKO : on pondère
                        nFlexOccur = oStatsLex.getFlexionOccur(oFlex.sFlexion)
                        nTotAKO = self.nAKO
                        for oEnt in oFlex.lMulti:
                            nTotAKO += oEnt.nAKO
                        

                        hDst.write(" = {0.sFlexion}\n".format(oFlex))
                        hDst.write("       moyennes connues\n")
                        for oFlexD in self.lFlexions:
                            if oFlex.sFlexion == oFlexD.sFlexion:
                                nNewOccur = math.ceil((nFlexOccur * (self.nAKO / nTotAKO)) / oFlexD.nDup)  if nTotAKO  else 0
                                hDst.write(sBlank + "{2:<30} {0.sMorph:<30}  {0.nOccur:>10}  %> {1:>10}\n".format(oFlexD, nNewOccur, self.getShortDescr()))
                                oFlexD.setOccurAndBlock(nNewOccur)
                        for oEntry in oFlex.lMulti:
                            for oFlexM in oEntry.lFlexions:
                                if oFlex.sFlexion == oFlexM.sFlexion:
                                    nNewOccur = math.ceil((nFlexOccur * (oEntry.nAKO / nTotAKO)) / oFlexM.nDup)  if nTotAKO  else 0
                                    hDst.write(sBlank + "{2:<30} {0.sMorph:<30}  {0.nOccur:>10}  %> {1:>10}\n".format(oFlexM, nNewOccur, oEntry.getShortDescr()))
                                    oFlexM.setOccurAndBlock(nNewOccur)
        

    def calcFreq (self, nTot):
        """Compute the frequency of the entry as a percentage of nTot.

        The previous frequency index is saved in self.oldFq before self.fq is
        replaced with getIfq() of the new frequency.
        """
        fPercent = (self.nOccur * 100) / nTot
        self.fFreq = fPercent
        self.oldFq = self.fq
        self.fq = getIfq(fPercent)



1154
1155
1156
1157
1158
1159
1160
1161

1162
1163
1164
1165
1166
1167
1168
1169
1170
1171

1172
1173
1174
1175

1176
1177
1178
1179
1180
1181
1182
1154
1155
1156
1157
1158
1159
1160

1161
1162
1163
1164
1165
1166
1167
1168
1169
1170

1171
1172
1173
1174

1175
1176
1177
1178
1179
1180
1181
1182







-
+









-
+



-
+







        self.nDup    = 0    # duplicates in the same entry
        self.nMulti  = 0    # duplicates with other entries
        self.lMulti  = []   # list of similar flexions
        self.fFreq   = 0
        self.cFq     = ''
        self.metagfx = ''   # métagraphe
        self.metaph2 = ''   # métaphone 2
    

    def setOccur (self, n):
        "Store n as the occurrence count (self.nOccur)."
        nNewOccur = n
        self.nOccur = nNewOccur

    def setOccurAndBlock (self, n):
        "Store n as the occurrence count and set self.bBlocked to True."
        self.bBlocked = True
        self.nOccur = n

    def calcOccur (self):
        """Divide self.nOccur by (nMulti + 1), then by nDup, and round the result up.

        Raises ZeroDivisionError if self.nDup is 0.
        """
        fPerEntry = self.nOccur / (self.nMulti+1)
        fPerFlexion = fPerEntry / self.nDup
        self.nOccur = math.ceil(fPerFlexion)
    

    def calcFreq (self, nTot):
        "Compute the frequency as a percentage of nTot (self.fFreq) and store getIfq() of it in self.cFq."
        fPercent = (self.nOccur * 100) / nTot
        self.fFreq = fPercent
        self.cFq = getIfq(fPercent)
    

    def calcMetagraphe (self):
        """Store in self.metagfx the result of metagraphe.getMetagraphe() for this flexion.

        The first element of the result is kept alone when the second one is
        empty; otherwise both are joined with "/".
        """
        tCodes = metagraphe.getMetagraphe(self.sFlexion, self.sMorph)
        if tCodes[1]:
            self.metagfx = tCodes[0]+"/"+tCodes[1]
        else:
            self.metagfx = tCodes[0]

    def calcMetaphone2 (self):
        """Store in self.metaph2 the result of metaphone2.dm() for this flexion.

        The first element of the result is kept alone when the second one is
        empty; otherwise both are joined with "/".
        """
        tCodes = metaphone2.dm(self.sFlexion)
        if tCodes[1]:
            self.metaph2 = tCodes[0]+"/"+tCodes[1]
        else:
            self.metaph2 = tCodes[0]
1258
1259
1260
1261
1262
1263
1264
1265

1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282

1283
1284
1285
1286
1287
1288
1289
1258
1259
1260
1261
1262
1263
1264

1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281

1282
1283
1284
1285
1286
1287
1288
1289







-
+
















-
+







        return (self.sFlexion.translate(CHARMAP), self.sMorph)

    def keyFreq (self):
        "Sort key: (100 - frequency, radical of the entry, flexion), so higher frequencies sort first."
        fRank = 100-self.fFreq
        return (fRank, self.oEntry.sRadical, self.sFlexion)

    def keyOcc (self):
        "Sort key: (occurrence count, radical of the entry, flexion)."
        sRadical = self.oEntry.sRadical
        return (self.nOccur, sRadical, self.sFlexion)
        

    def keyIdx (self):
        "Sort key: the iD of the entry this flexion belongs to."
        oEntry = self.oEntry
        return oEntry.iD

    def keyFlexion (self):
        "Sort key: the flexion itself."
        sKey = self.sFlexion
        return sKey



class Flag:
    def __init__ (self, sFlagType, sFlagName, sMix):
        self.sFlagName = sFlagName
        self.bSfx = True  if sFlagType == 'SFX'  else False
        self.bMix = True  if sMix == 'Y'  else False
        self.lRules = []
        self.nRules = 0
        self.nOccur = 0
        

    def addAffixRule (self, line):
        "Add to the flag the rule built from <line> and increment the rule counter."
        self.lRules.append(AffixRule(line))
        self.nRules += 1

    def getFlag (self, subDicts, oDict, nMode, bSimplified):
1331
1332
1333
1334
1335
1336
1337
1338

1339
1340
1341
1342
1343
1344
1345
1331
1332
1333
1334
1335
1336
1337

1338
1339
1340
1341
1342
1343
1344
1345







-
+







        # champs de Dicollecte
        self.lx = ''
        self.di = '*'
        # erreurs
        self.err = ''
        # autres champs
        self.nOccur = 0
        

        sLine = sLine.rstrip(" \n")
        # commentaire
        if '#' in sLine:
            sLine, comment = sLine.split('#', 1)
            self.comment = comment.strip()
        # éléments de la ligne
        elems = sLine.split()
1389
1390
1391
1392
1393
1394
1395
1396

1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416

1417
1418
1419
1420
1421
1422
1423
1389
1390
1391
1392
1393
1394
1395

1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415

1416
1417
1418
1419
1420
1421
1422
1423







-
+



















-
+







                    self.lx = fields[1]  if self.lx == ''  else self.lx + ' ' + fields[1]
                elif fields[0] == 'di':
                    self.di = fields[1]
                else:
                    echo('Champ inconnu: {}  dans  {}'.format(fields[0], self.sFlagName))
            else:
                echo("  # Erreur affixe : {}".format(line))
    

    def isReplicationRule (self):
        "is this rule used for replication of a virtual lemma"
        # a replication rule carries no flags...
        if self.flags != "":
            return False
        # ...and either cuts nothing and adds nothing, or adds back exactly what it cuts
        if self.cut == "0" and self.add == "":
            return True
        return self.cut == self.add

    def getRuleLine (self, oDict, nMode, bSimplified=False):
        """Return the line of this rule for the affix file, ending with a newline.

        oDict       -- dictionary object; bShortenTags, dAF and dAM are read from it
        nMode       -- the morphology (self.getMorph(nMode)) is appended only when nMode > 0
        bSimplified -- when true: flags are never abbreviated, "()" is removed from the
                       line when the rule has flags, and no morphology is appended
        """
        lParts = [ 'SFX'  if self.bSfx  else 'PFX',  ' ', self.sFlagName, ' ', self.cut, ' ',  self.add or '0' ]
        if self.flags != '':
            lParts.append('/')
            if not oDict.bShortenTags or bSimplified:
                lParts.append(self.flags)
            else:
                # abbreviated flags are looked up in the dictionary table
                lParts.append(oDict.dAF[self.flags])
        sLine = "".join(lParts)
        if self.flags != '' and bSimplified:
            sLine = sLine.replace("()", "")
        sLine += ' ' + self.cond
        if nMode > 0 and not bSimplified:
            sMorph = self.getMorph(nMode)
            if sMorph:
                # bSimplified is false here, so only bShortenTags decides
                if oDict.bShortenTags:
                    sLine += ' ' + oDict.dAM[sMorph.strip()]
                else:
                    sLine += sMorph
        return sLine + "\n"
    

    def getMorph (self, nMode):
        # morphology for Hunspell
        txt = ''
        if self.po: txt += fieldToHunspell('po', self.po)
        if self.iz: txt += fieldToHunspell('is', self.iz)
        if self.ds: txt += fieldToHunspell('ds', self.ds)
        if self.ts: txt += fieldToHunspell('ts', self.ts)
1448
1449
1450
1451
1452
1453
1454
1455

1456
1457
1458
1459
1460
1461
1462
1448
1449
1450
1451
1452
1453
1454

1455
1456
1457
1458
1459
1460
1461
1462







-
+









class StatsLex:
    def __init__ (self, oDict):
        "Prepare one empty list per distinct flexion of oDict.lFlexions, and an empty lexicon list."
        echo("Lexique statistique")
        dFlexions = {}
        for oFlex in oDict.lFlexions:
            # a new list for each key: the lists must not be shared
            dFlexions[oFlex.sFlexion] = []
        self.dFlexions = dFlexions
        self.lLex = []
        

    def addLexFromFile (self, sPathFile, cLexID, sLexName):
        if not os.path.isfile(sPathFile):
            echo(' * Lexique statistique - fichier {} introuvable'.format(sPathFile))
            return None
        if len(cLexID) != 1:
            echo(' * Lexique statistique - fichier {} - identifiant incorrect, 1 caractère requis'.format(sPathFile))
            return None
1516
1517
1518
1519
1520
1521
1522
1523

1524
1525
1526
1527

1528
1529
1530
1531
1532
1533

1534
1535
1536
1537

1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556

1557
1558
1559
1560
1561
1562

1563
1564
1565
1566
1567
1568
1569
1516
1517
1518
1519
1520
1521
1522

1523
1524
1525
1526

1527
1528
1529
1530
1531
1532

1533
1534
1535
1536

1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555

1556
1557
1558
1559
1560
1561

1562
1563
1564
1565
1566
1567
1568
1569







-
+



-
+





-
+



-
+


















-
+





-
+







        xArgs.uncompress = True

    echo("Python: " + sys.version)
    echo("Version: " + xArgs.verdic)
    echo("Simplify: " + str(xArgs.simplify))
    echo("Mode: " + str(xArgs.mode))
    echo("Compression: " + str(not(xArgs.uncompress)))
    

    ### création du répertoire
    spBuild = BUILD_PATH + '/' + xArgs.verdic
    dir_util.mkpath(spBuild)
    

    ### Lecture des fichiers et création du dictionnaire
    oFrenchDict = Dictionnaire(xArgs.verdic, "French dictionary")
    for sFile in ['orthographe/FRANCAIS.dic']:
        oFrenchDict.readDictionary(sFile)
    oFrenchDict.readAffixes('orthographe/FRANCAIS_5.aff')
    

    ### Contrôle
    oFrenchDict.sortEntriesNatural()
    oFrenchDict.checkEntries()
    

    ### Lexique
    oFrenchDict.generateFlexions()
    oFrenchDict.calcMetagraphe()
    oFrenchDict.calcMetaphone2()

    #oFrenchDict.createNgrams(spBuild, 3)
    if xArgs.spellvariants:
        oFrenchDict.generateSpellVariants(1, spBuild)

    ### Statistiques
    spfStats = spBuild+'/'+STATS_NAME+xArgs.verdic+'.txt'
    oStatsLex = StatsLex(oFrenchDict)
    oStatsLex.addLexFromFile('lexique/corpus_data/stats_google_ngram_1.txt', 'G', 'Google 1-grams')
    oStatsLex.addLexFromFile('lexique/corpus_data/stats_frwiki.txt', 'W', 'Wikipédia')
    oStatsLex.addLexFromFile('lexique/corpus_data/stats_frwikisource.txt', 'S', 'Wikisource')
    oStatsLex.addLexFromFile('lexique/corpus_data/stats_litterature.txt', 'L', 'Littérature')
    oStatsLex.write(spBuild+'/test_lex.txt')
    oFrenchDict.calculateStats(oStatsLex, spfStats)
    

    ### écriture des paquets
    echo("Création des paquets...")

    spLexiconDestGL = "../../../lexicons"  if xArgs.grammalecte  else ""
    spLibreOfficeExtDestGL = "../oxt/Dictionnaires/dictionaries"  if xArgs.grammalecte  else ""
    spMozillaExtDestGL = "../xpi/data/dictionaries"  if xArgs.grammalecte  else ""
    spMozillaExtDestGL = ""  # les dictionnaires pour Hunspell ne sont plus utilisés pour l’instant dans Firefox / Thunderbird
    spDataDestGL = "../data"  if xArgs.grammalecte  else ""

    if not xArgs.uncompress:
        oFrenchDict.defineAbreviatedTags(xArgs.mode, spfStats)
    oFrenchDict.createFiles(spBuild, [dMODERNE, dTOUTESVAR, dCLASSIQUE, dREFORME1990], xArgs.mode, xArgs.simplify)
    oFrenchDict.createLexiconPackages(spBuild, xArgs.verdic, oStatsLex, spLexiconDestGL)
    oFrenchDict.createFileIfqForDB(spBuild)