Grammalecte  Check-in [656e8f3fad]

Overview
Comment:[fr] gendicfr: apostrophe typographique remplacée par apostrophe droit pour les dictionnaires Hunspell
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | fr | rg
Files: files | file ages | folders
SHA3-256: 656e8f3fade04639ca7b3d4568ca2c653e49ebfdaae40b7135d8d6ef811ce973
User & Date: olr on 2018-08-26 09:17:00
Other Links: branch diff | manifest | tags
Context
2018-08-26
09:37
[fr] gendicfr: entrées avec espace non incluses dans les dictionnaires Hunspell check-in: 85dde78b00 user: olr tags: fr, rg
09:17
[fr] gendicfr: apostrophe typographique remplacée par apostrophe droit pour les dictionnaires Hunspell check-in: 656e8f3fad user: olr tags: fr, rg
08:57
[fr] conversion: clarification et corrections de bugs check-in: a12aee51d6 user: olr tags: fr, rg
Changes

Modified gc_lang/fr/dictionnaire/genfrdic.py from [21ee33ebdc] to [1e7a4b144b].

21
22
23
24
25
26
27
28
29
30
31
32
33
34
35

import metagraphe
import metaphone2


# Dictionnaire des caractères pour le tri naturel.
# Ordre souhaitable, mais pose problème pour la recherche, car engendre des égalités de lemmes différents.
# Il faut donc travailler sur un dictionnaire trié *numériquement* et le sauvegarder selon le tri *naturel*             
CHARMAP = str.maketrans({ 'à': 'a',  'À': 'A',  'â': 'a',  'Â': 'A',  'ä': 'a',  'Ä': 'A',  'å': 'a',  'Å': 'A',  'ā': 'a',  'Ā': 'A',
                          'ç': 'c',  'Ç': 'C',
                          'é': 'e',  'É': 'E',  'è': 'e',  'È': 'E',  'ê': 'e',  'Ê': 'E',  'ë': 'e',  'Ë': 'E',  'ē': 'e',  'Ē': 'E',
                          'î': 'i',  'Î': 'I',  'ï': 'i',  'Ï': 'I',  'ī': 'i',  'Ī': 'I',
                          'ñ': 'n',
                          'ô': 'o',  'Ô': 'O',  'ö': 'o',  'Ö': 'O',  'ō': 'o',  'Ō': 'O',
                          'ù': 'u',  'Ù': 'U',  'û': 'u',  'Û': 'U',  'ü': 'u',  'Ü': 'U',  'ū': 'u',  'Ū': 'U',







|







21
22
23
24
25
26
27
28
29
30
31
32
33
34
35

import metagraphe
import metaphone2


# Dictionnaire des caractères pour le tri naturel.
# Ordre souhaitable, mais pose problème pour la recherche, car engendre des égalités de lemmes différents.
# Il faut donc travailler sur un dictionnaire trié *numériquement* et le sauvegarder selon le tri *naturel*
CHARMAP = str.maketrans({ 'à': 'a',  'À': 'A',  'â': 'a',  'Â': 'A',  'ä': 'a',  'Ä': 'A',  'å': 'a',  'Å': 'A',  'ā': 'a',  'Ā': 'A',
                          'ç': 'c',  'Ç': 'C',
                          'é': 'e',  'É': 'E',  'è': 'e',  'È': 'E',  'ê': 'e',  'Ê': 'E',  'ë': 'e',  'Ë': 'E',  'ē': 'e',  'Ē': 'E',
                          'î': 'i',  'Î': 'I',  'ï': 'i',  'Ï': 'I',  'ī': 'i',  'Ī': 'I',
                          'ñ': 'n',
                          'ô': 'o',  'Ô': 'O',  'ö': 'o',  'Ö': 'O',  'ō': 'o',  'Ō': 'O',
                          'ù': 'u',  'Ù': 'U',  'û': 'u',  'Û': 'U',  'ü': 'u',  'Ü': 'U',  'ū': 'u',  'Ū': 'U',
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
        # Affixes
        self.sSettings = '' # enregistre tout avant la ligne # END
        self.dFlags = collections.OrderedDict()
        self.bShortenTags = False
        self.dAM = collections.OrderedDict() # étiquettes morphologiques
        self.dAF = collections.OrderedDict() # étiquettes drapeaux
        # Flexions
        self.lFlexions = []           # liste des flexions avec lemme, morphologie et occurrences 
        self.lStatsLex = []
        self.nTotOccurRecognizedWords = 0
        self.aFlexions = None
    
    def readDictionary (self, spf):
        "Lecture du dictionnaire"
        echo('Dictionnaire << [ {} ]'.format(spf), end=' ')
        for sLine in readfile(spf):
            sLine = sLine.strip()
            if not sLine.isdigit() and not sLine.startswith("#"):
                self.lEntry.append(Entree(sLine))







|



|







215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
        # Affixes
        self.sSettings = '' # enregistre tout avant la ligne # END
        self.dFlags = collections.OrderedDict()
        self.bShortenTags = False
        self.dAM = collections.OrderedDict() # étiquettes morphologiques
        self.dAF = collections.OrderedDict() # étiquettes drapeaux
        # Flexions
        self.lFlexions = []           # liste des flexions avec lemme, morphologie et occurrences
        self.lStatsLex = []
        self.nTotOccurRecognizedWords = 0
        self.aFlexions = None

    def readDictionary (self, spf):
        "Lecture du dictionnaire"
        echo('Dictionnaire << [ {} ]'.format(spf), end=' ')
        for sLine in readfile(spf):
            sLine = sLine.strip()
            if not sLine.isdigit() and not sLine.startswith("#"):
                self.lEntry.append(Entree(sLine))
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
                dAF[oEntry.flags] = dAF.get(oEntry.flags, 0) + 1
            sMorph = oEntry.getMorph(nMode).strip()
            if sMorph:
                dAM[sMorph] = dAM.get(sMorph, 0) + 1

        lAF = sorted(dAF.items(), key = lambda x: (x[1], x[0]), reverse=True)
        lAM = sorted(dAM.items(), key = lambda x: (x[1], x[0]), reverse=True)
        
        with open(spDst, 'a', encoding='utf-8', newline="\n") as hDst:
            hDst.write("\n\nDrapeaux :\n")
            for nAF, elem in enumerate(lAF, 1):
                self.dAF[elem[0]] = str(nAF)
                hDst.write("  > {0[1]:>8} : {0[0]}\n".format(elem))
            hDst.write("\n\nMorphologies :\n")
            for nAM, elem in enumerate(lAM, 1):







|







278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
                dAF[oEntry.flags] = dAF.get(oEntry.flags, 0) + 1
            sMorph = oEntry.getMorph(nMode).strip()
            if sMorph:
                dAM[sMorph] = dAM.get(sMorph, 0) + 1

        lAF = sorted(dAF.items(), key = lambda x: (x[1], x[0]), reverse=True)
        lAM = sorted(dAM.items(), key = lambda x: (x[1], x[0]), reverse=True)

        with open(spDst, 'a', encoding='utf-8', newline="\n") as hDst:
            hDst.write("\n\nDrapeaux :\n")
            for nAF, elem in enumerate(lAF, 1):
                self.dAF[elem[0]] = str(nAF)
                hDst.write("  > {0[1]:>8} : {0[0]}\n".format(elem))
            hDst.write("\n\nMorphologies :\n")
            for nAM, elem in enumerate(lAM, 1):
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
            if oEntry.di in dTplVars['subDicts']:
                nEntry += 1
        with open(spDst+'/'+dTplVars['asciiName']+'.dic', 'w', encoding='utf-8', newline="\n") as hDst:
            hDst.write(str(nEntry)+"\n")
            for oEntry in self.lEntry:
                if oEntry.di in dTplVars['subDicts']:
                    hDst.write(oEntry.getEntryLine(self, nMode, bSimplified))
    
    def writeAffixes (self, spDst, dTplVars, nMode, bSimplified):
        "Écrire le fichier des affixes (.aff)"
        echo(' * Dictionnaire >> [ {}.aff ]'.format(dTplVars['asciiName']))
        info = "# This Source Code Form is subject to the terms of the Mozilla Public\n" + \
               "# License, v. 2.0. If a copy of the MPL was not distributed with this\n" + \
               "# file, You can obtain one at http://mozilla.org/MPL/2.0/.\n\n" + \
               "# AFFIXES DU {} v{}\n".format(dTplVars['name'], self.sVersion) + \
               "# par Olivier R. -- licence MPL 2.0\n" + \
               "# Généré le " + time.strftime("%d-%m-%Y à %H:%M") + "\n" \
               "# Pour améliorer le dictionnaire, allez sur http://www.dicollecte.org/\n\n"
               
        with open(spDst+'/'+dTplVars['asciiName']+'.aff', 'w', encoding='utf-8', newline="\n") as hDst:
            hDst.write(info)
            hDst.write(self.sSettings + "\n")
            if self.bShortenTags:
                hDst.write("AM {}\n".format(len(self.dAM)))
                for item in self.dAM.items():
                    hDst.write("AM {}\n".format(item[0]))







|










|







301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
            if oEntry.di in dTplVars['subDicts']:
                nEntry += 1
        with open(spDst+'/'+dTplVars['asciiName']+'.dic', 'w', encoding='utf-8', newline="\n") as hDst:
            hDst.write(str(nEntry)+"\n")
            for oEntry in self.lEntry:
                if oEntry.di in dTplVars['subDicts']:
                    hDst.write(oEntry.getEntryLine(self, nMode, bSimplified))

    def writeAffixes (self, spDst, dTplVars, nMode, bSimplified):
        "Écrire le fichier des affixes (.aff)"
        echo(' * Dictionnaire >> [ {}.aff ]'.format(dTplVars['asciiName']))
        info = "# This Source Code Form is subject to the terms of the Mozilla Public\n" + \
               "# License, v. 2.0. If a copy of the MPL was not distributed with this\n" + \
               "# file, You can obtain one at http://mozilla.org/MPL/2.0/.\n\n" + \
               "# AFFIXES DU {} v{}\n".format(dTplVars['name'], self.sVersion) + \
               "# par Olivier R. -- licence MPL 2.0\n" + \
               "# Généré le " + time.strftime("%d-%m-%Y à %H:%M") + "\n" \
               "# Pour améliorer le dictionnaire, allez sur http://www.dicollecte.org/\n\n"

        with open(spDst+'/'+dTplVars['asciiName']+'.aff', 'w', encoding='utf-8', newline="\n") as hDst:
            hDst.write(info)
            hDst.write(self.sSettings + "\n")
            if self.bShortenTags:
                hDst.write("AM {}\n".format(len(self.dAM)))
                for item in self.dAM.items():
                    hDst.write("AM {}\n".format(item[0]))
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348

    def sortEntriesNatural (self):
        echo(' * Dictionnaire - Tri naturel des entrées...')
        self.lEntry = sorted(self.lEntry, key=Entree.keyTriNat)

    def sortEntriesNumerical (self):
        echo(' * Dictionnaire - Tri numérique des entrées...')
        self.lEntry = sorted(self.lEntry, key=Entree.keyTriNum)        

    def sortLexiconByFlexion (self):
        echo(' * Dictionnaire - tri du lexique (par flexion)...')
        self.lFlexions = sorted(self.lFlexions, key=Flexion.keyFlexion)

    def sortLexiconByFreq (self):
        echo(' * Dictionnaire - tri du lexique (par fréquence)...')







|







334
335
336
337
338
339
340
341
342
343
344
345
346
347
348

    def sortEntriesNatural (self):
        echo(' * Dictionnaire - Tri naturel des entrées...')
        self.lEntry = sorted(self.lEntry, key=Entree.keyTriNat)

    def sortEntriesNumerical (self):
        echo(' * Dictionnaire - Tri numérique des entrées...')
        self.lEntry = sorted(self.lEntry, key=Entree.keyTriNum)

    def sortLexiconByFlexion (self):
        echo(' * Dictionnaire - tri du lexique (par flexion)...')
        self.lFlexions = sorted(self.lFlexions, key=Flexion.keyFlexion)

    def sortLexiconByFreq (self):
        echo(' * Dictionnaire - tri du lexique (par fréquence)...')
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
                d[oFlex.sFlexion] = [oFlex.oEntry]
        for oFlex in self.lFlexions:
            oFlex.lMulti = list(d[oFlex.sFlexion])
            oFlex.nMulti = len(oFlex.lMulti)
        for oFlex in self.lFlexions:
            oFlex.lMulti.remove(oFlex.oEntry)
            oFlex.nMulti -= 1
        
    def setTagsFrom (self, other):
        echo(' * Dictionnaire - copie des tags...')
        for i in range(self.nEntry):
            for oEntry in other.lEntry:
                if self.lEntry[i].lemma == oEntry.lemma and self.lEntry[i].flags == oEntry.flags:
                    self.lEntry[i].setTagsFrom(oEntry)

    def calculateStats (self, oStatsLex, spfDst):
        echo(" * Dictionnaire - calculs...")
        with open(spfDst, 'w', encoding='utf-8', newline="\n") as hDst:
            # Occurrences brutes des formes fléchies
            echo("   comptage des occurrences...")
            hDst.write(oStatsLex.getInfo())
            for oFlex in self.lFlexions:
                oFlex.setOccur(oStatsLex.getFlexionOccur(oFlex.sFlexion))
            self.nTotOccurRecognizedWords = 0
            for oFlex in self.lFlexions:
                oFlex.calcOccur()
                self.nTotOccurRecognizedWords += oFlex.nOccur
            
            # Report des occurrences
            echo("   report des occurrences des formes fléchies multiples...")
            hDst.write("Report des occurrences des formes fléchies multiples :\n")
            hDst.write("  Légende :\n")
            hDst.write("    >>   le nombre d’occurrences de la flexion est ramené à la moyenne.\n")
            hDst.write("    +>   le nombre d’occurrences de la flexion est augmenté avec le surplus d’occurrences des flexions ramenées à la moyenne.\n")
            hDst.write("    %>   le nombre d’occurrences de la flexion est pondéré avec le poids de la moyenne de l’entrée.\n\n")

            for oEntry in self.lEntry:
                oEntry.calcOccurFromFlexions()
                oEntry.calcAverageKnownOccurrence()
                oEntry.solveOccurMultipleFlexions(hDst, oStatsLex)
                oEntry.calcOccurFromFlexions()
            
            # Fréquences
            echo("   calcul des fréquences et indices de fréquence...")
            for oFlex in self.lFlexions:
                oFlex.calcFreq(self.nTotOccurRecognizedWords)
            for oEntry in self.lEntry:
                oEntry.calcFreq(self.nTotOccurRecognizedWords)
            
            # Entrées, statistiques
            echo("   statistiques...")
            hDst.write("\n\nNatures grammaticales :\n")
            d = {}
            for oEntry in self.lEntry:
                po = re.sub("(?<=v[0-3])[itnpqrmaezx_]+", "", oEntry.po)
                d[po] = d.get(po, 0) + 1
            for e in sorted(d.items(), key = lambda x: (x[1], x[0]), reverse=True):
                hDst.write(" * {0[1]:<15} : {0[0]}\n".format(e))
            
            hDst.write("\n\nVentilation des entrées par indice de fréquence :\n")
            d1 = {}
            d2 = {}
            for oEntry in self.lEntry:
                d1[oEntry.fq] = d1.get(oEntry.fq, 0) + 1
                d2[oEntry.fq] = d2.get(oEntry.fq, 0) + oEntry.fFreq
            for k in sorted(d1.keys()):
                hDst.write(" * {} : {} entrées ({:.2f} %)  → {:.9f} %\n".format(k, d1[k], (d1[k]*100)/self.nEntry, d2[k]))
                    
            hDst.write("\n\nRépartition des entrées par sous-dictionnaire :\n")
            d = {}
            for oEntry in self.lEntry:
                d[oEntry.di] = d.get(oEntry.di, 0) + 1
            for sKey, nVal in d.items():
                hDst.write(" * {0:<15} : {1} entrées ({2:.2f} %)\n".format(dSUBDIC[sKey], nVal, (nVal*100)/self.nEntry))
            
            # Occurrences des lettres
            echo("   occurrences des lettres...")
            d = {}
            for oFlex in self.lFlexions:
                for c in oFlex.sFlexion:
                    d[c] = d.get(c, 0) + oFlex.nOccur
            nTot = 0







|



















|













|






|









|








|






|







372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
                d[oFlex.sFlexion] = [oFlex.oEntry]
        for oFlex in self.lFlexions:
            oFlex.lMulti = list(d[oFlex.sFlexion])
            oFlex.nMulti = len(oFlex.lMulti)
        for oFlex in self.lFlexions:
            oFlex.lMulti.remove(oFlex.oEntry)
            oFlex.nMulti -= 1

    def setTagsFrom (self, other):
        echo(' * Dictionnaire - copie des tags...')
        for i in range(self.nEntry):
            for oEntry in other.lEntry:
                if self.lEntry[i].lemma == oEntry.lemma and self.lEntry[i].flags == oEntry.flags:
                    self.lEntry[i].setTagsFrom(oEntry)

    def calculateStats (self, oStatsLex, spfDst):
        echo(" * Dictionnaire - calculs...")
        with open(spfDst, 'w', encoding='utf-8', newline="\n") as hDst:
            # Occurrences brutes des formes fléchies
            echo("   comptage des occurrences...")
            hDst.write(oStatsLex.getInfo())
            for oFlex in self.lFlexions:
                oFlex.setOccur(oStatsLex.getFlexionOccur(oFlex.sFlexion))
            self.nTotOccurRecognizedWords = 0
            for oFlex in self.lFlexions:
                oFlex.calcOccur()
                self.nTotOccurRecognizedWords += oFlex.nOccur

            # Report des occurrences
            echo("   report des occurrences des formes fléchies multiples...")
            hDst.write("Report des occurrences des formes fléchies multiples :\n")
            hDst.write("  Légende :\n")
            hDst.write("    >>   le nombre d’occurrences de la flexion est ramené à la moyenne.\n")
            hDst.write("    +>   le nombre d’occurrences de la flexion est augmenté avec le surplus d’occurrences des flexions ramenées à la moyenne.\n")
            hDst.write("    %>   le nombre d’occurrences de la flexion est pondéré avec le poids de la moyenne de l’entrée.\n\n")

            for oEntry in self.lEntry:
                oEntry.calcOccurFromFlexions()
                oEntry.calcAverageKnownOccurrence()
                oEntry.solveOccurMultipleFlexions(hDst, oStatsLex)
                oEntry.calcOccurFromFlexions()

            # Fréquences
            echo("   calcul des fréquences et indices de fréquence...")
            for oFlex in self.lFlexions:
                oFlex.calcFreq(self.nTotOccurRecognizedWords)
            for oEntry in self.lEntry:
                oEntry.calcFreq(self.nTotOccurRecognizedWords)

            # Entrées, statistiques
            echo("   statistiques...")
            hDst.write("\n\nNatures grammaticales :\n")
            d = {}
            for oEntry in self.lEntry:
                po = re.sub("(?<=v[0-3])[itnpqrmaezx_]+", "", oEntry.po)
                d[po] = d.get(po, 0) + 1
            for e in sorted(d.items(), key = lambda x: (x[1], x[0]), reverse=True):
                hDst.write(" * {0[1]:<15} : {0[0]}\n".format(e))

            hDst.write("\n\nVentilation des entrées par indice de fréquence :\n")
            d1 = {}
            d2 = {}
            for oEntry in self.lEntry:
                d1[oEntry.fq] = d1.get(oEntry.fq, 0) + 1
                d2[oEntry.fq] = d2.get(oEntry.fq, 0) + oEntry.fFreq
            for k in sorted(d1.keys()):
                hDst.write(" * {} : {} entrées ({:.2f} %)  → {:.9f} %\n".format(k, d1[k], (d1[k]*100)/self.nEntry, d2[k]))

            hDst.write("\n\nRépartition des entrées par sous-dictionnaire :\n")
            d = {}
            for oEntry in self.lEntry:
                d[oEntry.di] = d.get(oEntry.di, 0) + 1
            for sKey, nVal in d.items():
                hDst.write(" * {0:<15} : {1} entrées ({2:.2f} %)\n".format(dSUBDIC[sKey], nVal, (nVal*100)/self.nEntry))

            # Occurrences des lettres
            echo("   occurrences des lettres...")
            d = {}
            for oFlex in self.lFlexions:
                for c in oFlex.sFlexion:
                    d[c] = d.get(c, 0) + oFlex.nOccur
            nTot = 0
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
            hDst.write("\n\nNombre de formes fléchies : {}\n".format(len(self.lFlexions)))
            hDst.write("\n\nNombre de graphies : {}\n".format(len(self.aFlexions)))

    def calcMetagraphe (self):
        echo(" * Lexique - Metagraphe")
        for oFlex in self.lFlexions:
            oFlex.calcMetagraphe()
    
    def calcMetaphone2 (self):
        echo(" * Lexique - Metaphone 2")
        for oFlex in self.lFlexions:
            oFlex.calcMetaphone2()
    
    def createNgrams (self, spDest, n):
        echo(" * Lexique - Ngrams " + str(n))
        if n < 2:
            echo("erreur: n = " + str(n))
            return
        dOccur = {} # ngram:n
        dRefW = {} # ngram:set(idx)







|




|







472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
            hDst.write("\n\nNombre de formes fléchies : {}\n".format(len(self.lFlexions)))
            hDst.write("\n\nNombre de graphies : {}\n".format(len(self.aFlexions)))

    def calcMetagraphe (self):
        echo(" * Lexique - Metagraphe")
        for oFlex in self.lFlexions:
            oFlex.calcMetagraphe()

    def calcMetaphone2 (self):
        echo(" * Lexique - Metaphone 2")
        for oFlex in self.lFlexions:
            oFlex.calcMetaphone2()

    def createNgrams (self, spDest, n):
        echo(" * Lexique - Ngrams " + str(n))
        if n < 2:
            echo("erreur: n = " + str(n))
            return
        dOccur = {} # ngram:n
        dRefW = {} # ngram:set(idx)
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
        file_util.copy_file('_templates/ooo/french_flag.png', spExt)
        file_util.copy_file('_templates/ooo/french_flag_16.bmp', spExt+'/ui')
        copyTemplate('_templates/ooo', spExt, 'description.xml', dTplVars)
        copyTemplate('_templates/ooo', spExt, 'dictionaries.xcu', dTplVars)
        #file_util.copy_file('_templates/ooo/dictionaries.xcu.tpl.xml', spExt)
        copyTemplate('_templates/ooo', spExt, 'package-description.txt', dTplVars)
        for dVars in lDictVars:
            dicPath = spBuild + '/' + PREFIX_DICT_PATH + self.sVersion 
            file_util.copy_file(dicPath+'/'+dVars['asciiName']+'.dic', spExt+'/dictionaries/'+dVars['asciiName']+'.dic')
            file_util.copy_file(dicPath+'/'+dVars['asciiName']+'.aff', spExt+'/dictionaries/'+dVars['asciiName']+'.aff')
        copyTemplate('orthographe', spExt+'/dictionaries', 'README_dict_fr.txt', dTplVars)
        # thesaurus
        file_util.copy_file('thesaurus/thes_fr.dat', spExt+'/dictionaries')
        file_util.copy_file('thesaurus/thes_fr.idx', spExt+'/dictionaries')
        file_util.copy_file('thesaurus/README_thes_fr.txt', spExt+'/dictionaries')
        # hyphenation
        file_util.copy_file('césures/hyph_fr.dic', spExt+'/dictionaries')
        file_util.copy_file('césures/hyph_fr.iso8859-1.dic', spExt+'/dictionaries')
        file_util.copy_file('césures/frhyph.tex', spExt+'/dictionaries')
        file_util.copy_file('césures/hyph-fr.tex', spExt+'/dictionaries')
        file_util.copy_file('césures/README_hyph_fr-3.0.txt', spExt+'/dictionaries')
        file_util.copy_file('césures/README_hyph_fr-2.9.txt', spExt+'/dictionaries')
        # zip
        createZipFiles(spExt, spBuild, sExtensionName + '.oxt')
        # copy to Grammalecte Project
        if spDestGL:
            echo("   extension copiée dans Grammalecte...")
            dir_util.copy_tree(spExt+'/dictionaries', spDestGL)
    
    def createMozillaExtensions (self, spBuild, dTplVars, lDictVars, spDestGL=""):
        # Mozilla extension 1
        echo(" * Dictionnaire >> extension pour Mozilla")
        dTplVars['version'] = self.sVersion
        sExtensionName = EXT_PREFIX_MOZ + self.sVersion
        spExt = spBuild + '/' + sExtensionName
        dir_util.mkpath(spExt+'/dictionaries')
        copyTemplate('_templates/moz', spExt, 'install.rdf', dTplVars)
        spDict = spBuild + '/' + PREFIX_DICT_PATH + self.sVersion
        file_util.copy_file(spDict+'/fr-classique.dic', spExt+'/dictionaries/fr-classic.dic')
        file_util.copy_file(spDict+'/fr-classique.aff', spExt+'/dictionaries/fr-classic.aff')
        copyTemplate('orthographe', spExt, 'README_dict_fr.txt', dTplVars)
        createZipFiles(spExt, spBuild, sExtensionName + '.xpi')
        # Grammalecte
        if spDestGL:
            echo(" * Dictionnaire >> copie des dicos dans Grammalecte")
            for dVars in lDictVars:
                file_util.copy_file(spDict+'/'+dVars['asciiName']+'.dic', spDestGL+'/'+dVars['mozAsciiName']+"/"+dVars['mozAsciiName']+'.dic')
                file_util.copy_file(spDict+'/'+dVars['asciiName']+'.aff', spDestGL+'/'+dVars['mozAsciiName']+"/"+dVars['mozAsciiName']+'.aff')
    
    def createFileIfqForDB (self, spBuild):
        echo(" * Dictionnaire >> indices de fréquence pour la DB...")
        with open(spBuild+'/dictIdxIfq-'+self.sVersion+'.diff.txt', 'w', encoding='utf-8', newline="\n") as hDiff, \
             open(spBuild+'/dictIdxIfq-'+self.sVersion+'.notes.txt', 'w', encoding='utf-8', newline="\n") as hNotes:
            for oEntry in self.lEntry:
                if oEntry.fq != oEntry.oldFq:
                    hDiff.write("{0.iD}\t{0.fq}\n".format(oEntry))
                    hNotes.write("{0.lemma}/{0.flags}\t{0.oldFq} > {0.fq}\n".format(oEntry))
        
    def createLexiconPackages (self, spBuild, version, oStatsLex, spDestGL=""):
        sLexName = LEX_PREFIX + version
        spLex = spBuild + '/' + sLexName
        dir_util.mkpath(spLex)
        # write Dicollecte lexicon
        self.sortLexiconByFreq()
        self.writeLexicon(spLex + '/' + sLexName + '.txt', version, oStatsLex)







|




















|



















|








|







560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
        file_util.copy_file('_templates/ooo/french_flag.png', spExt)
        file_util.copy_file('_templates/ooo/french_flag_16.bmp', spExt+'/ui')
        copyTemplate('_templates/ooo', spExt, 'description.xml', dTplVars)
        copyTemplate('_templates/ooo', spExt, 'dictionaries.xcu', dTplVars)
        #file_util.copy_file('_templates/ooo/dictionaries.xcu.tpl.xml', spExt)
        copyTemplate('_templates/ooo', spExt, 'package-description.txt', dTplVars)
        for dVars in lDictVars:
            dicPath = spBuild + '/' + PREFIX_DICT_PATH + self.sVersion
            file_util.copy_file(dicPath+'/'+dVars['asciiName']+'.dic', spExt+'/dictionaries/'+dVars['asciiName']+'.dic')
            file_util.copy_file(dicPath+'/'+dVars['asciiName']+'.aff', spExt+'/dictionaries/'+dVars['asciiName']+'.aff')
        copyTemplate('orthographe', spExt+'/dictionaries', 'README_dict_fr.txt', dTplVars)
        # thesaurus
        file_util.copy_file('thesaurus/thes_fr.dat', spExt+'/dictionaries')
        file_util.copy_file('thesaurus/thes_fr.idx', spExt+'/dictionaries')
        file_util.copy_file('thesaurus/README_thes_fr.txt', spExt+'/dictionaries')
        # hyphenation
        file_util.copy_file('césures/hyph_fr.dic', spExt+'/dictionaries')
        file_util.copy_file('césures/hyph_fr.iso8859-1.dic', spExt+'/dictionaries')
        file_util.copy_file('césures/frhyph.tex', spExt+'/dictionaries')
        file_util.copy_file('césures/hyph-fr.tex', spExt+'/dictionaries')
        file_util.copy_file('césures/README_hyph_fr-3.0.txt', spExt+'/dictionaries')
        file_util.copy_file('césures/README_hyph_fr-2.9.txt', spExt+'/dictionaries')
        # zip
        createZipFiles(spExt, spBuild, sExtensionName + '.oxt')
        # copy to Grammalecte Project
        if spDestGL:
            echo("   extension copiée dans Grammalecte...")
            dir_util.copy_tree(spExt+'/dictionaries', spDestGL)

    def createMozillaExtensions (self, spBuild, dTplVars, lDictVars, spDestGL=""):
        # Mozilla extension 1
        echo(" * Dictionnaire >> extension pour Mozilla")
        dTplVars['version'] = self.sVersion
        sExtensionName = EXT_PREFIX_MOZ + self.sVersion
        spExt = spBuild + '/' + sExtensionName
        dir_util.mkpath(spExt+'/dictionaries')
        copyTemplate('_templates/moz', spExt, 'install.rdf', dTplVars)
        spDict = spBuild + '/' + PREFIX_DICT_PATH + self.sVersion
        file_util.copy_file(spDict+'/fr-classique.dic', spExt+'/dictionaries/fr-classic.dic')
        file_util.copy_file(spDict+'/fr-classique.aff', spExt+'/dictionaries/fr-classic.aff')
        copyTemplate('orthographe', spExt, 'README_dict_fr.txt', dTplVars)
        createZipFiles(spExt, spBuild, sExtensionName + '.xpi')
        # Grammalecte
        if spDestGL:
            echo(" * Dictionnaire >> copie des dicos dans Grammalecte")
            for dVars in lDictVars:
                file_util.copy_file(spDict+'/'+dVars['asciiName']+'.dic', spDestGL+'/'+dVars['mozAsciiName']+"/"+dVars['mozAsciiName']+'.dic')
                file_util.copy_file(spDict+'/'+dVars['asciiName']+'.aff', spDestGL+'/'+dVars['mozAsciiName']+"/"+dVars['mozAsciiName']+'.aff')

    def createFileIfqForDB (self, spBuild):
        echo(" * Dictionnaire >> indices de fréquence pour la DB...")
        with open(spBuild+'/dictIdxIfq-'+self.sVersion+'.diff.txt', 'w', encoding='utf-8', newline="\n") as hDiff, \
             open(spBuild+'/dictIdxIfq-'+self.sVersion+'.notes.txt', 'w', encoding='utf-8', newline="\n") as hNotes:
            for oEntry in self.lEntry:
                if oEntry.fq != oEntry.oldFq:
                    hDiff.write("{0.iD}\t{0.fq}\n".format(oEntry))
                    hNotes.write("{0.lemma}/{0.flags}\t{0.oldFq} > {0.fq}\n".format(oEntry))

    def createLexiconPackages (self, spBuild, version, oStatsLex, spDestGL=""):
        sLexName = LEX_PREFIX + version
        spLex = spBuild + '/' + sLexName
        dir_util.mkpath(spLex)
        # write Dicollecte lexicon
        self.sortLexiconByFreq()
        self.writeLexicon(spLex + '/' + sLexName + '.txt', version, oStatsLex)
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
        self.nFlexions = 0
        self.lFlexions = []
        self.sRadical = ''
        self.nOccur = 0
        self.nAKO = -1   # Average known occurrences
        self.fFreq = 0
        self.oldFq = ''
        
        sLine = sLine.rstrip(" \n")
        # commentaire
        if '#' in sLine:
            sLine, comment = sLine.split('#', 1)
            self.comment = comment.strip()
        # éléments de la ligne
        elems = sLine.split()







|







737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
        self.nFlexions = 0
        self.lFlexions = []
        self.sRadical = ''
        self.nOccur = 0
        self.nAKO = -1   # Average known occurrences
        self.fFreq = 0
        self.oldFq = ''

        sLine = sLine.rstrip(" \n")
        # commentaire
        if '#' in sLine:
            sLine, comment = sLine.split('#', 1)
            self.comment = comment.strip()
        # éléments de la ligne
        elems = sLine.split()
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
                else:
                    echo('  ## Champ inconnu: {}  dans  {}/{}'.format(fields[0], self.lemma, self.flags))
            else:
                self.err = self.err + elems[i]
        if self.err:
            echo("\n## Erreur dans le dictionnaire : {}".format(self.err))
            echo("   dans : " + self.lemma)
                
    def __str__ (self):
        return "{0.lemma}/{0.flags} {1}".format(self, self.getMorph(2))

    def check (self):
        sErr = ''
        if self.lemma == '':
            sErr += 'lemme vide'







|







797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
                else:
                    echo('  ## Champ inconnu: {}  dans  {}/{}'.format(fields[0], self.lemma, self.flags))
            else:
                self.err = self.err + elems[i]
        if self.err:
            echo("\n## Erreur dans le dictionnaire : {}".format(self.err))
            echo("   dans : " + self.lemma)

    def __str__ (self):
        return "{0.lemma}/{0.flags} {1}".format(self, self.getMorph(2))

    def check (self):
        sErr = ''
        if self.lemma == '':
            sErr += 'lemme vide'
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878

    def keyTriNat (self):
        return (self.lemma.translate(CHARMAP), self.flags, self.po)

    def keyTriNum (self):
        return (self.lemma, self.flags, self.po)

    def getEntryLine (self, oDict, nMode, bSimplified=False):    
        sLine = self.lemma
        if self.flags:
            sLine += '/'
            sLine += self.flags  if not oDict.bShortenTags or bSimplified  else oDict.dAF[self.flags]
        if bSimplified:
            return sLine.replace("()", "") + "\n"
        if nMode > 0:
            sMorph = self.getMorph(nMode)







|
|







863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878

    def keyTriNat (self):
        return (self.lemma.translate(CHARMAP), self.flags, self.po)

    def keyTriNum (self):
        return (self.lemma, self.flags, self.po)

    def getEntryLine (self, oDict, nMode, bSimplified=False):
        sLine = self.lemma.replace("’", "'")
        if self.flags:
            sLine += '/'
            sLine += self.flags  if not oDict.bShortenTags or bSimplified  else oDict.dAF[self.flags]
        if bSimplified:
            return sLine.replace("()", "") + "\n"
        if nMode > 0:
            sMorph = self.getMorph(nMode)
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
                    #echo(sFlex + " " + sMorph + ", ")
                    pass
        # Drapeaux dont le lemme féminin doit être remplacé par le masculin dans la gestion des formes fléchies
        if self.flags.startswith(("F.", "F*", "W.", "W*")):
            # recherche de la forme masculine
            for t in lTuples:
                sMorph = self.clean(t[1])
                if sMorph.endswith('mas') or sMorph.endswith('mas sg') or sMorph.endswith('mas inv'): 
                    self.sRadical = t[0]
        else:
            self.sRadical = self.lemma
        # Tag duplicates
        d = {}
        for oFlex in self.lFlexions:
            d[oFlex.sFlexion] = d.get(oFlex.sFlexion, 0) + 1







|







926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
                    #echo(sFlex + " " + sMorph + ", ")
                    pass
        # Drapeaux dont le lemme féminin doit être remplacé par le masculin dans la gestion des formes fléchies
        if self.flags.startswith(("F.", "F*", "W.", "W*")):
            # recherche de la forme masculine
            for t in lTuples:
                sMorph = self.clean(t[1])
                if sMorph.endswith('mas') or sMorph.endswith('mas sg') or sMorph.endswith('mas inv'):
                    self.sRadical = t[0]
        else:
            self.sRadical = self.lemma
        # Tag duplicates
        d = {}
        for oFlex in self.lFlexions:
            d[oFlex.sFlexion] = d.get(oFlex.sFlexion, 0) + 1
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
                                        lFlexions.append( (oRule.add+flex[0], flex[1]+ruleMorph) )
                                else:
                                    lFlexions.append(flexion)
                            else:
                                flexion = (self.lemma.replace(oRule.cut, oRule.add, 1), ruleMorph+morph, oRule.di)
                                if oFlag.bMix:
                                    lFlexPrefix.append(flexion)
                                    for flex in lFlexSuffix: 
                                        lFlexions.append( (flex[0].replace(oRule.cut, oRule.add, 1), flex[1]+ruleMorph) )
                                else:
                                    lFlexions.append(flexion)
                            if oRule.flags != '' and oRule.flags != '**':
                                lFlexions.extend(Entree(flexion[0]+'/'+oRule.flags)._flechir(dFlags, flexion[1], iPR+1))
                else:
                    # cas des suffixes







|







971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
                                        lFlexions.append( (oRule.add+flex[0], flex[1]+ruleMorph) )
                                else:
                                    lFlexions.append(flexion)
                            else:
                                flexion = (self.lemma.replace(oRule.cut, oRule.add, 1), ruleMorph+morph, oRule.di)
                                if oFlag.bMix:
                                    lFlexPrefix.append(flexion)
                                    for flex in lFlexSuffix:
                                        lFlexions.append( (flex[0].replace(oRule.cut, oRule.add, 1), flex[1]+ruleMorph) )
                                else:
                                    lFlexions.append(flexion)
                            if oRule.flags != '' and oRule.flags != '**':
                                lFlexions.extend(Entree(flexion[0]+'/'+oRule.flags)._flechir(dFlags, flexion[1], iPR+1))
                else:
                    # cas des suffixes
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
    def calcOccurFromFlexions (self):
        self.nOccur = 0
        for o in self.lFlexions:
            self.nOccur += o.nOccur

    def calcAverageKnownOccurrence (self):
        # nous calculons la moyenne des occurrences des formes fléchies
        # qui n’ont pas d’équivalent dans les autres entrées (nMulti = 0) 
        nOccur = 0
        nFlex = 0
        for oFlex in self.lFlexions:
            if oFlex.nMulti == 0:
                nOccur += oFlex.nOccur
                nFlex += 1
        # moyenne des formes fléchies sans équivalent ou -1
        self.nAKO = math.ceil(nOccur / nFlex)  if nFlex > 0  else -1
    
    def solveOccurMultipleFlexions (self, hDst, oStatsLex):
        sBlank = "           "
        if self.nAKO >= 0:
            for oFlex in self.lFlexions:
                if oFlex.nMulti > 0 and not oFlex.bBlocked:
                    # on trie les entrées avec AKO et sans AKO
                    lEntWithAKO = []
                    lEntNoAKO = []
                    for oEntry in oFlex.lMulti:
                        if oEntry.nAKO >= 0:
                            lEntWithAKO.append(oEntry)
                        else:
                            lEntNoAKO.append(oEntry)
                    
                    if lEntNoAKO:
                        # on calcule la différence totale occasionnée par du passage des flexions appartenant à des entrées avec AKO au niveau AKO
                        nDiff = (oFlex.nOccur - self.nAKO) * oFlex.nDup
                        for oEntry in lEntWithAKO:
                            for oFlexM in oEntry.lFlexions:
                                if oFlex.sFlexion == oFlexM.sFlexion:
                                    nDiff += oFlexM.nOccur - oEntry.nAKO







|








|













|







1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
    def calcOccurFromFlexions (self):
        self.nOccur = 0
        for o in self.lFlexions:
            self.nOccur += o.nOccur

    def calcAverageKnownOccurrence (self):
        # nous calculons la moyenne des occurrences des formes fléchies
        # qui n’ont pas d’équivalent dans les autres entrées (nMulti = 0)
        nOccur = 0
        nFlex = 0
        for oFlex in self.lFlexions:
            if oFlex.nMulti == 0:
                nOccur += oFlex.nOccur
                nFlex += 1
        # moyenne des formes fléchies sans équivalent ou -1
        self.nAKO = math.ceil(nOccur / nFlex)  if nFlex > 0  else -1

    def solveOccurMultipleFlexions (self, hDst, oStatsLex):
        sBlank = "           "
        if self.nAKO >= 0:
            for oFlex in self.lFlexions:
                if oFlex.nMulti > 0 and not oFlex.bBlocked:
                    # on trie les entrées avec AKO et sans AKO
                    lEntWithAKO = []
                    lEntNoAKO = []
                    for oEntry in oFlex.lMulti:
                        if oEntry.nAKO >= 0:
                            lEntWithAKO.append(oEntry)
                        else:
                            lEntNoAKO.append(oEntry)

                    if lEntNoAKO:
                        # on calcule la différence totale occasionnée par du passage des flexions appartenant à des entrées avec AKO au niveau AKO
                        nDiff = (oFlex.nOccur - self.nAKO) * oFlex.nDup
                        for oEntry in lEntWithAKO:
                            for oFlexM in oEntry.lFlexions:
                                if oFlex.sFlexion == oFlexM.sFlexion:
                                    nDiff += oFlexM.nOccur - oEntry.nAKO
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
                                        oFlexM.setOccurAndBlock(nNewOccur)
                    else:
                        # Toutes les entrées sont avec AKO : on pondère
                        nFlexOccur = oStatsLex.getFlexionOccur(oFlex.sFlexion)
                        nTotAKO = self.nAKO
                        for oEnt in oFlex.lMulti:
                            nTotAKO += oEnt.nAKO
                        
                        hDst.write(" = {0.sFlexion}\n".format(oFlex))
                        hDst.write("       moyennes connues\n")
                        for oFlexD in self.lFlexions:
                            if oFlex.sFlexion == oFlexD.sFlexion:
                                nNewOccur = math.ceil((nFlexOccur * (self.nAKO / nTotAKO)) / oFlexD.nDup)  if nTotAKO  else 0
                                hDst.write(sBlank + "{2:<30} {0.sMorph:<30}  {0.nOccur:>10}  %> {1:>10}\n".format(oFlexD, nNewOccur, self.getShortDescr()))
                                oFlexD.setOccurAndBlock(nNewOccur)
                        for oEntry in oFlex.lMulti:
                            for oFlexM in oEntry.lFlexions:
                                if oFlex.sFlexion == oFlexM.sFlexion:
                                    nNewOccur = math.ceil((nFlexOccur * (oEntry.nAKO / nTotAKO)) / oFlexM.nDup)  if nTotAKO  else 0
                                    hDst.write(sBlank + "{2:<30} {0.sMorph:<30}  {0.nOccur:>10}  %> {1:>10}\n".format(oFlexM, nNewOccur, oEntry.getShortDescr()))
                                    oFlexM.setOccurAndBlock(nNewOccur)
        
    def calcFreq (self, nTot):
        self.fFreq = (self.nOccur * 100) / nTot
        self.oldFq = self.fq
        self.fq = getIfq(self.fFreq)










|













|







1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
                                        oFlexM.setOccurAndBlock(nNewOccur)
                    else:
                        # Toutes les entrées sont avec AKO : on pondère
                        nFlexOccur = oStatsLex.getFlexionOccur(oFlex.sFlexion)
                        nTotAKO = self.nAKO
                        for oEnt in oFlex.lMulti:
                            nTotAKO += oEnt.nAKO

                        hDst.write(" = {0.sFlexion}\n".format(oFlex))
                        hDst.write("       moyennes connues\n")
                        for oFlexD in self.lFlexions:
                            if oFlex.sFlexion == oFlexD.sFlexion:
                                nNewOccur = math.ceil((nFlexOccur * (self.nAKO / nTotAKO)) / oFlexD.nDup)  if nTotAKO  else 0
                                hDst.write(sBlank + "{2:<30} {0.sMorph:<30}  {0.nOccur:>10}  %> {1:>10}\n".format(oFlexD, nNewOccur, self.getShortDescr()))
                                oFlexD.setOccurAndBlock(nNewOccur)
                        for oEntry in oFlex.lMulti:
                            for oFlexM in oEntry.lFlexions:
                                if oFlex.sFlexion == oFlexM.sFlexion:
                                    nNewOccur = math.ceil((nFlexOccur * (oEntry.nAKO / nTotAKO)) / oFlexM.nDup)  if nTotAKO  else 0
                                    hDst.write(sBlank + "{2:<30} {0.sMorph:<30}  {0.nOccur:>10}  %> {1:>10}\n".format(oFlexM, nNewOccur, oEntry.getShortDescr()))
                                    oFlexM.setOccurAndBlock(nNewOccur)

    def calcFreq (self, nTot):
        self.fFreq = (self.nOccur * 100) / nTot
        self.oldFq = self.fq
        self.fq = getIfq(self.fFreq)



1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
        self.nDup    = 0    # duplicates in the same entry
        self.nMulti  = 0    # duplicates with other entries
        self.lMulti  = []   # list of similar flexions
        self.fFreq   = 0
        self.cFq     = ''
        self.metagfx = ''   # métagraphe
        self.metaph2 = ''   # métaphone 2
    
    def setOccur (self, n):
        self.nOccur = n

    def setOccurAndBlock (self, n):
        self.nOccur = n
        self.bBlocked = True

    def calcOccur (self):
        self.nOccur = math.ceil((self.nOccur / (self.nMulti+1)) / self.nDup)
    
    def calcFreq (self, nTot):
        self.fFreq = (self.nOccur * 100) / nTot
        self.cFq = getIfq(self.fFreq)
    
    def calcMetagraphe (self):
        t = metagraphe.getMetagraphe(self.sFlexion, self.sMorph)
        self.metagfx = t[0]  if not t[1]  else t[0]+"/"+t[1]

    def calcMetaphone2 (self):
        t = metaphone2.dm(self.sFlexion)
        self.metaph2 = t[0]  if not t[1]  else t[0]+"/"+t[1]







|









|



|







1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
        self.nDup    = 0    # duplicates in the same entry
        self.nMulti  = 0    # duplicates with other entries
        self.lMulti  = []   # list of similar flexions
        self.fFreq   = 0
        self.cFq     = ''
        self.metagfx = ''   # métagraphe
        self.metaph2 = ''   # métaphone 2

    def setOccur (self, n):
        self.nOccur = n

    def setOccurAndBlock (self, n):
        self.nOccur = n
        self.bBlocked = True

    def calcOccur (self):
        self.nOccur = math.ceil((self.nOccur / (self.nMulti+1)) / self.nDup)

    def calcFreq (self, nTot):
        self.fFreq = (self.nOccur * 100) / nTot
        self.cFq = getIfq(self.fFreq)

    def calcMetagraphe (self):
        t = metagraphe.getMetagraphe(self.sFlexion, self.sMorph)
        self.metagfx = t[0]  if not t[1]  else t[0]+"/"+t[1]

    def calcMetaphone2 (self):
        t = metaphone2.dm(self.sFlexion)
        self.metaph2 = t[0]  if not t[1]  else t[0]+"/"+t[1]
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
        return (self.sFlexion.translate(CHARMAP), self.sMorph)

    def keyFreq (self):
        return (100-self.fFreq, self.oEntry.sRadical, self.sFlexion)

    def keyOcc (self):
        return (self.nOccur, self.oEntry.sRadical, self.sFlexion)
        
    def keyIdx (self):
        return self.oEntry.iD

    def keyFlexion (self):
        return self.sFlexion



class Flag:
    def __init__ (self, sFlagType, sFlagName, sMix):
        self.sFlagName = sFlagName
        self.bSfx = True  if sFlagType == 'SFX'  else False
        self.bMix = True  if sMix == 'Y'  else False
        self.lRules = []
        self.nRules = 0
        self.nOccur = 0
        
    def addAffixRule (self, line):
        "ajoute une règle au drapeau"
        oRule = AffixRule(line)
        self.lRules.append(oRule)
        self.nRules += 1

    def getFlag (self, subDicts, oDict, nMode, bSimplified):







|
















|







1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
        return (self.sFlexion.translate(CHARMAP), self.sMorph)

    def keyFreq (self):
        return (100-self.fFreq, self.oEntry.sRadical, self.sFlexion)

    def keyOcc (self):
        return (self.nOccur, self.oEntry.sRadical, self.sFlexion)

    def keyIdx (self):
        return self.oEntry.iD

    def keyFlexion (self):
        return self.sFlexion



class Flag:
    def __init__ (self, sFlagType, sFlagName, sMix):
        self.sFlagName = sFlagName
        self.bSfx = True  if sFlagType == 'SFX'  else False
        self.bMix = True  if sMix == 'Y'  else False
        self.lRules = []
        self.nRules = 0
        self.nOccur = 0

    def addAffixRule (self, line):
        "ajoute une règle au drapeau"
        oRule = AffixRule(line)
        self.lRules.append(oRule)
        self.nRules += 1

    def getFlag (self, subDicts, oDict, nMode, bSimplified):
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
        # champs de Dicollecte
        self.lx = ''
        self.di = '*'
        # erreurs
        self.err = ''
        # autres champs
        self.nOccur = 0
        
        sLine = sLine.rstrip(" \n")
        # commentaire
        if '#' in sLine:
            sLine, comment = sLine.split('#', 1)
            self.comment = comment.strip()
        # éléments de la ligne
        elems = sLine.split()







|







1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
        # champs de Dicollecte
        self.lx = ''
        self.di = '*'
        # erreurs
        self.err = ''
        # autres champs
        self.nOccur = 0

        sLine = sLine.rstrip(" \n")
        # commentaire
        if '#' in sLine:
            sLine, comment = sLine.split('#', 1)
            self.comment = comment.strip()
        # éléments de la ligne
        elems = sLine.split()
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
                    self.lx = fields[1]  if self.lx == ''  else self.lx + ' ' + fields[1]
                elif fields[0] == 'di':
                    self.di = fields[1]
                else:
                    echo('Champ inconnu: {}  dans  {}'.format(fields[0], self.sFlagName))
            else:
                echo("  # Erreur affixe : {}".format(line))
    
    def isReplicationRule (self):
        "is this rule used for replication of a virtual lemma"
        return self.flags == "" and ((self.cut == "0" and self.add == "") or self.cut == self.add)

    def getRuleLine (self, oDict, nMode, bSimplified=False):
        sLine = 'SFX'  if self.bSfx  else 'PFX'
        sLine += ' ' + self.sFlagName + ' ' + self.cut + ' '
        sLine += self.add  if self.add  else '0'
        if self.flags != '':
            sLine += '/'
            sLine += self.flags  if not oDict.bShortenTags or bSimplified  else oDict.dAF[self.flags]
            if bSimplified:
                sLine = sLine.replace("()", "")
        sLine += ' ' + self.cond
        if not bSimplified and nMode > 0:
            sMorph = self.getMorph(nMode)
            if sMorph:
                sLine += sMorph  if not oDict.bShortenTags or bSimplified  else ' ' + oDict.dAM[sMorph.strip()]
        return sLine + "\n"
    
    def getMorph (self, nMode):
        # morphology for Hunspell
        txt = ''
        if self.po: txt += fieldToHunspell('po', self.po)
        if self.iz: txt += fieldToHunspell('is', self.iz)
        if self.ds: txt += fieldToHunspell('ds', self.ds)
        if self.ts: txt += fieldToHunspell('ts', self.ts)







|



















|







1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
                    self.lx = fields[1]  if self.lx == ''  else self.lx + ' ' + fields[1]
                elif fields[0] == 'di':
                    self.di = fields[1]
                else:
                    echo('Champ inconnu: {}  dans  {}'.format(fields[0], self.sFlagName))
            else:
                echo("  # Erreur affixe : {}".format(line))

    def isReplicationRule (self):
        "is this rule used for replication of a virtual lemma"
        return self.flags == "" and ((self.cut == "0" and self.add == "") or self.cut == self.add)

    def getRuleLine (self, oDict, nMode, bSimplified=False):
        sLine = 'SFX'  if self.bSfx  else 'PFX'
        sLine += ' ' + self.sFlagName + ' ' + self.cut + ' '
        sLine += self.add  if self.add  else '0'
        if self.flags != '':
            sLine += '/'
            sLine += self.flags  if not oDict.bShortenTags or bSimplified  else oDict.dAF[self.flags]
            if bSimplified:
                sLine = sLine.replace("()", "")
        sLine += ' ' + self.cond
        if not bSimplified and nMode > 0:
            sMorph = self.getMorph(nMode)
            if sMorph:
                sLine += sMorph  if not oDict.bShortenTags or bSimplified  else ' ' + oDict.dAM[sMorph.strip()]
        return sLine + "\n"

    def getMorph (self, nMode):
        # morphology for Hunspell
        txt = ''
        if self.po: txt += fieldToHunspell('po', self.po)
        if self.iz: txt += fieldToHunspell('is', self.iz)
        if self.ds: txt += fieldToHunspell('ds', self.ds)
        if self.ts: txt += fieldToHunspell('ts', self.ts)
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462


class StatsLex:
    def __init__ (self, oDict):
        echo("Lexique statistique")
        self.dFlexions = { oFlex.sFlexion: []  for oFlex in oDict.lFlexions }
        self.lLex = []
        
    def addLexFromFile (self, sPathFile, cLexID, sLexName):
        if not os.path.isfile(sPathFile):
            echo(' * Lexique statistique - fichier {} introuvable'.format(sPathFile))
            return None
        if len(cLexID) != 1:
            echo(' * Lexique statistique - fichier {} - identifiant incorrect, 1 caractère requis'.format(sPathFile))
            return None







|







1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462


class StatsLex:
    def __init__ (self, oDict):
        echo("Lexique statistique")
        self.dFlexions = { oFlex.sFlexion: []  for oFlex in oDict.lFlexions }
        self.lLex = []

    def addLexFromFile (self, sPathFile, cLexID, sLexName):
        if not os.path.isfile(sPathFile):
            echo(' * Lexique statistique - fichier {} introuvable'.format(sPathFile))
            return None
        if len(cLexID) != 1:
            echo(' * Lexique statistique - fichier {} - identifiant incorrect, 1 caractère requis'.format(sPathFile))
            return None
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
        xArgs.uncompress = True

    echo("Python: " + sys.version)
    echo("Version: " + xArgs.verdic)
    echo("Simplify: " + str(xArgs.simplify))
    echo("Mode: " + str(xArgs.mode))
    echo("Compression: " + str(not(xArgs.uncompress)))
    
    ### création du répertoire
    spBuild = BUILD_PATH + '/' + xArgs.verdic
    dir_util.mkpath(spBuild)
    
    ### Lecture des fichiers et création du dictionnaire
    oFrenchDict = Dictionnaire(xArgs.verdic, "French dictionary")
    for sFile in ['orthographe/FRANCAIS.dic']:
        oFrenchDict.readDictionary(sFile)
    oFrenchDict.readAffixes('orthographe/FRANCAIS_5.aff')
    
    ### Contrôle
    oFrenchDict.sortEntriesNatural()
    oFrenchDict.checkEntries()
    
    ### Lexique
    oFrenchDict.generateFlexions()
    oFrenchDict.calcMetagraphe()
    oFrenchDict.calcMetaphone2()

    #oFrenchDict.createNgrams(spBuild, 3)
    if xArgs.spellvariants:
        oFrenchDict.generateSpellVariants(1, spBuild)

    ### Statistiques
    spfStats = spBuild+'/'+STATS_NAME+xArgs.verdic+'.txt'
    oStatsLex = StatsLex(oFrenchDict)
    oStatsLex.addLexFromFile('lexique/corpus_data/stats_google_ngram_1.txt', 'G', 'Google 1-grams')
    oStatsLex.addLexFromFile('lexique/corpus_data/stats_frwiki.txt', 'W', 'Wikipédia')
    oStatsLex.addLexFromFile('lexique/corpus_data/stats_frwikisource.txt', 'S', 'Wikisource')
    oStatsLex.addLexFromFile('lexique/corpus_data/stats_litterature.txt', 'L', 'Littérature')
    oStatsLex.write(spBuild+'/test_lex.txt')
    oFrenchDict.calculateStats(oStatsLex, spfStats)
    
    ### écriture des paquets
    echo("Création des paquets...")

    spLexiconDestGL = "../../../lexicons"  if xArgs.grammalecte  else ""
    spLibreOfficeExtDestGL = "../oxt/Dictionnaires/dictionaries"  if xArgs.grammalecte  else ""
    spMozillaExtDestGL = "../xpi/data/dictionaries"  if xArgs.grammalecte  else ""
    spDataDestGL = "../data"  if xArgs.grammalecte  else ""







|



|





|



|


















|







1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
        xArgs.uncompress = True

    echo("Python: " + sys.version)
    echo("Version: " + xArgs.verdic)
    echo("Simplify: " + str(xArgs.simplify))
    echo("Mode: " + str(xArgs.mode))
    echo("Compression: " + str(not(xArgs.uncompress)))

    ### création du répertoire
    spBuild = BUILD_PATH + '/' + xArgs.verdic
    dir_util.mkpath(spBuild)

    ### Lecture des fichiers et création du dictionnaire
    oFrenchDict = Dictionnaire(xArgs.verdic, "French dictionary")
    for sFile in ['orthographe/FRANCAIS.dic']:
        oFrenchDict.readDictionary(sFile)
    oFrenchDict.readAffixes('orthographe/FRANCAIS_5.aff')

    ### Contrôle
    oFrenchDict.sortEntriesNatural()
    oFrenchDict.checkEntries()

    ### Lexique
    oFrenchDict.generateFlexions()
    oFrenchDict.calcMetagraphe()
    oFrenchDict.calcMetaphone2()

    #oFrenchDict.createNgrams(spBuild, 3)
    if xArgs.spellvariants:
        oFrenchDict.generateSpellVariants(1, spBuild)

    ### Statistiques
    spfStats = spBuild+'/'+STATS_NAME+xArgs.verdic+'.txt'
    oStatsLex = StatsLex(oFrenchDict)
    oStatsLex.addLexFromFile('lexique/corpus_data/stats_google_ngram_1.txt', 'G', 'Google 1-grams')
    oStatsLex.addLexFromFile('lexique/corpus_data/stats_frwiki.txt', 'W', 'Wikipédia')
    oStatsLex.addLexFromFile('lexique/corpus_data/stats_frwikisource.txt', 'S', 'Wikisource')
    oStatsLex.addLexFromFile('lexique/corpus_data/stats_litterature.txt', 'L', 'Littérature')
    oStatsLex.write(spBuild+'/test_lex.txt')
    oFrenchDict.calculateStats(oStatsLex, spfStats)

    ### écriture des paquets
    echo("Création des paquets...")

    spLexiconDestGL = "../../../lexicons"  if xArgs.grammalecte  else ""
    spLibreOfficeExtDestGL = "../oxt/Dictionnaires/dictionaries"  if xArgs.grammalecte  else ""
    spMozillaExtDestGL = "../xpi/data/dictionaries"  if xArgs.grammalecte  else ""
    spDataDestGL = "../data"  if xArgs.grammalecte  else ""