Grammalecte: genfrdic.py at [246965c90e]

File gc_lang/fr/dictionnaire/genfrdic.py artifact 877bd56310 part of check-in 246965c90e

#!python3

__author__ = "Olivier R."
__license__ = "MPL 2"



import os
import sys
import re
import collections
import zipfile
import math
import argparse
from enum import Enum

from distutils import dir_util
from distutils import file_util
from string import Template

import metagraphe
import metaphone2
import thes_build


# Character map for natural sorting.
# This ordering is desirable, but it is a problem for lookups because it makes different lemmas compare equal.
# One must therefore work on a dictionary sorted *numerically* and save it in *natural* order.
# The table maps accented letters to their unaccented form, ligatures to two letters,
# and deletes hyphens, periods and straight apostrophes. It is also used by echo() as a fallback on Windows.
CHARMAP = str.maketrans({ 'à': 'a',  'À': 'A',  'â': 'a',  'Â': 'A',  'ä': 'a',  'Ä': 'A',  'å': 'a',  'Å': 'A',  'ā': 'a',  'Ā': 'A',
                          'ç': 'c',  'Ç': 'C',
                          'é': 'e',  'É': 'E',  'è': 'e',  'È': 'E',  'ê': 'e',  'Ê': 'E',  'ë': 'e',  'Ë': 'E',  'ē': 'e',  'Ē': 'E',
                          'î': 'i',  'Î': 'I',  'ï': 'i',  'Ï': 'I',  'ī': 'i',  'Ī': 'I',
                          'ñ': 'n',
                          'ô': 'o',  'Ô': 'O',  'ö': 'o',  'Ö': 'O',  'ō': 'o',  'Ō': 'O',
                          'ù': 'u',  'Ù': 'U',  'û': 'u',  'Û': 'U',  'ü': 'u',  'Ü': 'U',  'ū': 'u',  'Ū': 'U',
                          'ÿ': 'y',
                          'æ': 'ae', 'Æ': 'AE', 'œ':'oe', 'Œ': 'OE',
                          '-': None, '.': None, "'": None })


# The dictionaries
# dSUBDIC maps the one-character sub-dictionary code of an entry to a readable name
# (used when reporting the distribution of entries per sub-dictionary).
dSUBDIC = { '*': 'Commun',
            'R': 'Réforme1990',
            'M': 'Moderne',
            'C': 'Classique',
            'A': 'Annexe',
            'X': 'Contributeurs' }

# Template variables of each generated dictionary.
# 'subDicts' is the set of sub-dictionary codes (keys of dSUBDIC) whose entries are written to that dictionary;
# 'asciiName' is the base name of the generated .dic/.aff files.
dCLASSIQUE = { 'name': 'DICTIONNAIRE ORTHOGRAPHIQUE FRANÇAIS “CLASSIQUE”',
               'shortname': '“Classique”',
               'asciiName': 'fr-classique',
               'mozAsciiName': 'fr-FR-classic',
               'subDicts': '*MCX',
               'mozId': 'fr-dicollecte-classique',
               'description': "Dictionnaire français “Classique”" }

dREFORME1990 = { 'name': 'DICTIONNAIRE ORTHOGRAPHIQUE FRANÇAIS “RÉFORME 1990”',
                 'shortname': '“Réforme 1990”',
                 'asciiName': 'fr-reforme1990',
                 'mozAsciiName': 'fr-FR-reform',
                 'subDicts': '*RX',
                 'mozId': 'fr-dicollecte-reforme1990',
                 'description': "Dictionnaire français “Réforme 1990”" }

dTOUTESVAR = { 'name': 'DICTIONNAIRE ORTHOGRAPHIQUE FRANÇAIS “TOUTES VARIANTES”',
               'shortname': '“Toutes variantes”',
               'asciiName': 'fr-toutesvariantes',
               'mozAsciiName': 'fr-FR-classic-reform',
               'subDicts': '*MCRAX',
               'mozId': 'fr-dicollecte-toutesvariantes',
               'description': "Dictionnaire français “Toutes variantes”" }

# Template variables for the extension templates (no 'subDicts': not a dictionary by itself).
# NOTE(review): presumably passed as dTplVars to the create*Extension methods -- confirm at the call site.
dMOZEXT = { 'name': 'DICTIONNAIRE ORTHOGRAPHIQUE FRANÇAIS',
            'mozId': 'fr-dicollecte',
            'description': "Dictionnaire orthographique de la langue française" }


# Build folder name.
# NOTE(review): not referenced in the visible part of the file -- presumably used by the build entry point.
BUILD_PATH = '_build'
# Name prefixes of the generated folders and packages; the version string is appended to each of them.
PREFIX_DICT_PATH = 'hunspell-french-dictionaries-v'
EXT_PREFIX_OOO = 'lo-oo-ressources-linguistiques-fr-v'
EXT_PREFIX_MOZ = 'moz-hunspell-fr-v'
LEX_PREFIX = 'lexique-grammalecte-fr-v'
STATS_NAME = 'statistiques-v'

# License header written at the top of the generated lexicon files.
MPLHEADER = "# This Source Code Form is subject to the terms of the Mozilla Public\n" + \
            "# License, v. 2.0. If a copy of the MPL was not distributed with this\n" + \
            "# file, You can obtain one at http://mozilla.org/MPL/2.0/.\n\n"


def echo (obj, sep=' ', end='\n', file=sys.stdout, flush=False):
    """ Print for Windows to avoid Python crashes.
        Encoding depends on Windows locale. No useful standard.

        On other platforms, <obj> is printed as is.
        On Windows, three attempts are made, each one more lossy than the previous:
          1. the text with “œ” replaced by “oe”,
          2. the text translated with CHARMAP (diacritics removed),
          3. the text reduced to ASCII (unknown characters replaced).

        Always returns True (useful for debugging)."""
    if sys.platform != "win32":
        print(obj, sep=sep, end=end, file=file, flush=flush)
        return True
    try:
        print(str(obj).replace("œ", "oe"), sep=sep, end=end, file=file, flush=flush)
    except Exception:
        # Exception (not a bare except) so that KeyboardInterrupt and SystemExit still propagate
        try:
            print(str(obj).translate(CHARMAP), sep=sep, end=end, file=file, flush=flush)
        except Exception:
            print(str(obj).encode('ascii', 'replace').decode('ascii', 'replace'), sep=sep, end=end, file=file, flush=flush)
    return True


def makeLongFlags (sFlags):
    "return the list of two-character flags built from the string <sFlags>"
    if len(sFlags) % 2:
        # odd length: report it and pad with a space so that the last flag has two characters
        echo(">| erreur: %s" % sFlags)
        sFlags += ' '
    lFlags = []
    for iStart in range(0, len(sFlags), 2):
        lFlags.append(sFlags[iStart:iStart+2])
    return lFlags


def makeNumFlags (sFlags):
    "return the list of flags found in the comma-separated string <sFlags>"
    lFlags = sFlags.split(',')
    return lFlags


def makeOneCharFlags (sFlags):
    "return the list of flags of <sFlags>, one flag per character"
    return [ cFlag  for cFlag in sFlags ]


def fieldToHunspell (sFieldName, sFieldValue):
    "return the Hunspell text of a field: each space-separated value is prefixed with “ <sFieldName>:”"
    sPrefix = ' ' + sFieldName + ':'
    lValues = sFieldValue.split(' ')
    return sPrefix + sPrefix.join(lValues)


def getListNgrams (sWord, n):
    """return the list of all n-grams of <sWord>: every contiguous substring of <n> characters,
    from left to right.

    A word of length L contains L-n+1 n-grams; the list is empty if the word is shorter than <n>
    or if <n> is lower than 1."""
    if n < 1:
        return []
    # the last n-gram starts at index len(sWord)-n, hence the +1 for the (exclusive) upper bound
    return [ sWord[i:i+n]  for i in range(len(sWord)-n+1) ]


def createZipFiles (spSrc, spDst, zipFileName):
    """Create the archive <spDst>/<zipFileName> (deflated) with the content of the folder <spSrc>.

    Files are stored with their path relative to <spSrc>; sub-folders are walked recursively
    and keep their full relative path in the archive, whatever their depth.
    Folders are not stored as such, so empty folders are not included."""
    echo(' > Zip  [ {} ]'.format(spSrc))
    def _addDir (_spSrc, _subPath, _zipFile):
        # add the content of folder <_spSrc> to <_zipFile>, under the archive path <_subPath>
        for _fileToZip in os.listdir(_spSrc):
            _spf = _spSrc+'/'+_fileToZip
            _sArcName = _subPath+'/'+_fileToZip
            if os.path.isdir(_spf):
                # keep the parent path, so that nested folders are not flattened
                _addDir(_spf, _sArcName, _zipFile)
            else:
                _zipFile.write(_spf, _sArcName)
    #
    # the context manager closes the archive even if an error occurs while adding files
    with zipfile.ZipFile(spDst+'/'+zipFileName, 'w', zipfile.ZIP_DEFLATED) as zipFile:
        for fileToZip in os.listdir(spSrc):
            if os.path.isdir(spSrc+'/'+fileToZip):
                _addDir(spSrc+'/'+fileToZip, fileToZip, zipFile)
            else:
                zipFile.write(spSrc+'/'+fileToZip, fileToZip)


def copyTemplate (spSrc, spDst, spf, dVars):
    """Read the template <spSrc>/<spf>, substitute the variables of <dVars> and write the result to <spDst>/<spf>.

    Substitution is done with string.Template.safe_substitute(): unknown placeholders are left as is.
    If the file name ends with “xml” or “rdf”, “&” is escaped as “&amp;” in string values.
    The escaping is done on a copy: <dVars> is never modified, so the same dictionary can be used
    for several templates without escaping the values twice or polluting plain-text templates."""
    if spf.endswith(('xml', 'rdf')):
        dVars = { key: (value.replace('&', '&amp;')  if isinstance(value, str)  else value)  for key, value in dVars.items() }
    with open(spSrc+'/'+spf, 'r', encoding='utf-8') as hSrc:
        xTemplate = Template(hSrc.read())
    with open(spDst+'/'+spf, 'w', encoding='utf-8', newline="\n") as hDst:
        hDst.write(xTemplate.safe_substitute(dVars))


def getIfq (f):
    "return the frequency index (one character, from '0' to '9') of the frequency <f>"
    if f == 0:
        return '0'
    # (index, exclusive upper bound) in ascending order: the first bound above <f> gives the index
    tBounds = ( ('1', 0.00000001),
                ('2', 0.0000001),
                ('3', 0.000001),
                ('4', 0.00001),
                ('5', 0.0001),
                ('6', 0.001),
                ('7', 0.01),
                ('8', 0.1) )
    for cIfq, fBound in tBounds:
        if f < fBound:
            return cIfq
    return '9'


def getVerbMultiMorph (s):
    """return the list of merged morphologies:
    each leading tag of <s> (tags found before the first tag beginning with a digit)
    is combined with every following tag beginning with a digit"""
    lTags = s.split()
    lMerged = []
    for iTag, sHead in enumerate(lTags):
        if sHead[0].isdigit():
            # first tag beginning with a digit: no more leading tags
            break
        lMerged.extend(sHead + " " + sTail  for sTail in lTags[iTag+1:]  if sTail[0].isdigit())
    return lMerged


def readfile (spf):
    "generator: yields the lines of the UTF-8 file <spf>; prints an error and yields nothing if the file does not exist"
    if not os.path.isfile(spf):
        print("# Error: file not found.")
        return
    with open(spf, "r", encoding="utf-8") as hSrc:
        yield from hSrc



class Dictionnaire:
    def __init__ (self, version, name):
        # Dictionary
        self.sName = name
        self.lEntry = []
        self.nEntry = 0
        self.sVersion = version
        # Affixes
        self.sSettings = '' # enregistre tout avant la ligne # END
        self.dFlags = collections.OrderedDict()
        self.bShortenTags = False
        self.dAM = collections.OrderedDict() # étiquettes morphologiques
        self.dAF = collections.OrderedDict() # étiquettes drapeaux
        # Flexions
        self.lFlexions = []           # liste des flexions avec lemme, morphologie et occurrences
        self.lStatsLex = []
        self.nTotOccurRecognizedWords = 0
        self.aFlexions = None

    def readDictionary (self, spf):
        "Read the dictionary file <spf>: one Entree is appended to self.lEntry for each relevant line"
        echo('Dictionnaire << [ {} ]'.format(spf), end=' ')
        for sRawLine in readfile(spf):
            sEntryLine = sRawLine.strip()
            if sEntryLine.isdigit() or sEntryLine.startswith("#"):
                # lines made of digits only (presumably the entry count) and comments are skipped
                continue
            self.lEntry.append(Entree(sEntryLine))
        self.nEntry = len(self.lEntry)
        echo('- {} entrées'.format(self.nEntry))

    def readAffixes (self, spf):
        """Read the affix file <spf>.

        - Lines beginning with PFX or SFX: the first line seen for a flag creates the Flag object
          (stored in self.dFlags), the following lines with the same flag are added to it as rules.
          A rule is always added to the flag named on its own line, even if the rules of several flags
          are interleaved or if the flag was created by a previous call.
        - Other lines which are not comments and are found before the line “# END” are appended
          to self.sSettings."""
        echo("Dictionnaire << [ {} ]".format(spf))
        bSettings = True
        for sLine in readfile(spf):
            if sLine.startswith("# END"):
                bSettings = False
            elif sLine.startswith("#"):
                pass
            elif sLine.startswith(("PFX", "SFX")):
                # remove end-of-line comment
                sLine = re.sub(" *#.*$", "", sLine.rstrip(" \n"))
                lElem = sLine.split()
                if len(lElem) >= 4:
                    sFlagName = lElem[1]
                    if sFlagName not in self.dFlags:
                        # new flag
                        self.dFlags[sFlagName] = Flag(lElem[0], sFlagName, lElem[2])
                    else:
                        # new rule, for the flag named on this line
                        self.dFlags[sFlagName].addAffixRule(sLine)
                else:
                    echo("  # erreur de lecture: {}".format(sLine))
            elif bSettings:
                # every non-comment line before “# END” is stored in self.sSettings
                self.sSettings += sLine

    def defineAbreviatedTags (self, nMode, spDst):
        """Build the abbreviation tables of flags (self.dAF) and morphologies (self.dAM).

        Flags fields and morphologies of all affix rules and all entries are counted,
        then numbered from 1 by decreasing number of occurrences.
        The counts are appended to the file <spDst>."""
        echo(" * Dictionnaire - compression Hunspell... ")
        self.bShortenTags = True
        dFlagsCount = {}
        dMorphCount = {}
        # affix rules first, then entries: both provide .flags and .getMorph()
        lTagged = [ oRule  for oFlag in self.dFlags.values()  for oRule in oFlag.lRules ]
        lTagged.extend(self.lEntry)
        for oItem in lTagged:
            if oItem.flags:
                dFlagsCount[oItem.flags] = dFlagsCount.get(oItem.flags, 0) + 1
            sMorph = oItem.getMorph(nMode).strip()
            if sMorph:
                dMorphCount[sMorph] = dMorphCount.get(sMorph, 0) + 1

        # most frequent first; ties are broken by the label, in reverse order
        lFlagsCount = sorted(dFlagsCount.items(), key = lambda t: (t[1], t[0]), reverse=True)
        lMorphCount = sorted(dMorphCount.items(), key = lambda t: (t[1], t[0]), reverse=True)

        with open(spDst, 'a', encoding='utf-8', newline="\n") as hDst:
            for sTitle, lCount, dAbbrev in ( ("\n\nDrapeaux :\n", lFlagsCount, self.dAF),
                                             ("\n\nMorphologies :\n", lMorphCount, self.dAM) ):
                hDst.write(sTitle)
                for nRank, tCount in enumerate(lCount, 1):
                    dAbbrev[tCount[0]] = str(nRank)
                    hDst.write("  > {0[1]:>8} : {0[0]}\n".format(tCount))

    def writeDictionary (self, spDst, dTplVars, nMode, bSimplified):
        """Write the dictionary file (.dic) in the folder <spDst>.

        The file is named after dTplVars['asciiName'].
        An entry is written if its sub-dictionary code is in dTplVars['subDicts'] and if its lemma
        contains only word characters, apostrophes and hyphens.
        The first line is the number of entries actually written: the same filter is used for counting and writing."""
        echo(' * Dictionnaire >> [ {}.dic ] ({})'.format(dTplVars['asciiName'], dTplVars['subDicts']))
        sSubDicts = dTplVars['subDicts']
        zLemma = re.compile(r"^[\w’'-]+$")
        lSelected = [ oEntry  for oEntry in self.lEntry  if oEntry.di in sSubDicts and zLemma.search(oEntry.lemma) ]
        with open(spDst+'/'+dTplVars['asciiName']+'.dic', 'w', encoding='utf-8', newline="\n") as hDst:
            hDst.write(str(len(lSelected))+"\n")
            for oEntry in lSelected:
                hDst.write(oEntry.getHunspellLine(self, nMode, bSimplified))

    def writeAffixes (self, spDst, dTplVars, nMode, bSimplified):
        """Write the affix file (.aff) in the folder <spDst>.

        The file is named after dTplVars['asciiName']. Content, in this order: license and title header,
        the settings (self.sSettings), the AM and AF tables if the tags are abbreviated, then all the flags."""
        echo(' * Dictionnaire >> [ {}.aff ]'.format(dTplVars['asciiName']))
        lHeader = [ "# This Source Code Form is subject to the terms of the Mozilla Public\n",
                    "# License, v. 2.0. If a copy of the MPL was not distributed with this\n",
                    "# file, You can obtain one at http://mozilla.org/MPL/2.0/.\n\n",
                    "# AFFIXES DU {} v{}\n".format(dTplVars['name'], self.sVersion),
                    "# par Olivier R. -- licence MPL 2.0\n",
                    "# Pour améliorer le dictionnaire, allez sur https://grammalecte.net/\n\n" ]
        sHeader = "".join(lHeader)

        with open(spDst+'/'+dTplVars['asciiName']+'.aff', 'w', encoding='utf-8', newline="\n") as hDst:
            hDst.write(sHeader)
            hDst.write(self.sSettings + "\n")
            if self.bShortenTags:
                # each table: a line with the number of labels, then one line per label
                for sLineFormat, dLabels in ( ("AM {}\n", self.dAM), ("AF {}\n", self.dAF) ):
                    hDst.write(sLineFormat.format(len(dLabels)))
                    for sLabel in dLabels:
                        hDst.write(sLineFormat.format(sLabel))
                    hDst.write("\n")
            for oFlag in self.dFlags.values():
                hDst.write(oFlag.getFlag(dTplVars['subDicts'], self, nMode, bSimplified))

    def sortEntriesNatural (self):
        "Replace self.lEntry with a new list sorted with the key Entree.keyTriNat"
        echo(' * Dictionnaire - Tri naturel des entrées...')
        lSorted = sorted(self.lEntry, key=Entree.keyTriNat)
        self.lEntry = lSorted

    def sortEntriesNumerical (self):
        "Replace self.lEntry with a new list sorted with the key Entree.keyTriNum"
        echo(' * Dictionnaire - Tri numérique des entrées...')
        lSorted = sorted(self.lEntry, key=Entree.keyTriNum)
        self.lEntry = lSorted

    def sortLexiconByFlexion (self):
        "Replace self.lFlexions with a new list sorted with the key Flexion.keyFlexion"
        echo(' * Dictionnaire - tri du lexique (par flexion)...')
        lSorted = sorted(self.lFlexions, key=Flexion.keyFlexion)
        self.lFlexions = lSorted

    def sortLexiconByFreq (self):
        "Replace self.lFlexions with a new list sorted with the key Flexion.keyFreq"
        echo(' * Dictionnaire - tri du lexique (par fréquence)...')
        lSorted = sorted(self.lFlexions, key=Flexion.keyFreq)
        self.lFlexions = lSorted

    def sortLexiconByIdx (self):
        "Replace self.lFlexions with a new list sorted with the key Flexion.keyIdx"
        echo(' * Dictionnaire - tri du lexique (par index)...')
        lSorted = sorted(self.lFlexions, key=Flexion.keyIdx)
        self.lFlexions = lSorted

    def checkEntries (self):
        "Call check() on every entry of the dictionary"
        echo(' * Dictionnaire - contrôle des entrées...')
        for oEntry in self.lEntry:
            oEntry.check()

    def generateFlexions (self):
        """Generate the inflected forms of every entry and gather them in self.lFlexions.

        Then, for each inflected form, lMulti is set to the list of the other entries producing the same
        spelling, and nMulti to the length of this list."""
        echo(' * Lexique - genèse des formes fléchies...')
        for oEntry in self.lEntry:
            oEntry.generateFlexions(self.dFlags)
            self.lFlexions.extend(oEntry.lFlexions)
        # spelling -> entries producing it (each entry listed once, in order of appearance)
        dProducers = {}
        for oFlex in self.lFlexions:
            lProducers = dProducers.setdefault(oFlex.sFlexion, [])
            if oFlex.oEntry not in lProducers:
                lProducers.append(oFlex.oEntry)
        for oFlex in self.lFlexions:
            # own copy of the list, then the entry of the form itself is withdrawn
            oFlex.lMulti = list(dProducers[oFlex.sFlexion])
            oFlex.nMulti = len(oFlex.lMulti)
            oFlex.lMulti.remove(oFlex.oEntry)
            oFlex.nMulti -= 1

    def setTagsFrom (self, other):
        "For each entry of this dictionary, call its setTagsFrom() with every entry of <other> having the same lemma and the same flags"
        echo(' * Dictionnaire - copie des tags...')
        for iEntry in range(self.nEntry):
            for oOtherEntry in other.lEntry:
                oMyEntry = self.lEntry[iEntry]
                if oMyEntry.lemma != oOtherEntry.lemma or oMyEntry.flags != oOtherEntry.flags:
                    continue
                oMyEntry.setTagsFrom(oOtherEntry)

    def calculateStats (self, oStatsLex, spfDst):
        """Compute occurrences and frequencies of inflected forms and entries, and write a report to <spfDst>.

        <oStatsLex> provides getInfo() (text written at the top of the report) and getFlexionOccur()
        (value given to each inflected form with setOccur()). It is also passed to
        Entree.solveOccurMultipleFlexions().
        The report file is overwritten.
        NOTE(review): several percentages are divided by self.nEntry or by the total of letter occurrences:
        ZeroDivisionError is raised if the corresponding sections have lines to write while the divisor is 0."""
        echo(" * Dictionnaire - calculs...")
        with open(spfDst, 'w', encoding='utf-8', newline="\n") as hDst:
            # Raw occurrences of inflected forms
            echo("   comptage des occurrences...")
            hDst.write(oStatsLex.getInfo())
            for oFlex in self.lFlexions:
                oFlex.setOccur(oStatsLex.getFlexionOccur(oFlex.sFlexion))
            self.nTotOccurRecognizedWords = 0
            for oFlex in self.lFlexions:
                oFlex.calcOccur()
                self.nTotOccurRecognizedWords += oFlex.nOccur

            # Redistribution of occurrences (the legend of the markers is written in the report)
            echo("   report des occurrences des formes fléchies multiples...")
            hDst.write("Report des occurrences des formes fléchies multiples :\n")
            hDst.write("  Légende :\n")
            hDst.write("    >>   le nombre d’occurrences de la flexion est ramené à la moyenne.\n")
            hDst.write("    +>   le nombre d’occurrences de la flexion est augmenté avec le surplus d’occurrences des flexions ramenées à la moyenne.\n")
            hDst.write("    %>   le nombre d’occurrences de la flexion est pondéré avec le poids de la moyenne de l’entrée.\n\n")

            for oEntry in self.lEntry:
                oEntry.calcOccurFromFlexions()
                oEntry.calcAverageKnownOccurrence()
                oEntry.solveOccurMultipleFlexions(hDst, oStatsLex)
                # computed again after the redistribution
                oEntry.calcOccurFromFlexions()

            # Frequencies
            echo("   calcul des fréquences et indices de fréquence...")
            for oFlex in self.lFlexions:
                oFlex.calcFreq(self.nTotOccurRecognizedWords)
            for oEntry in self.lEntry:
                oEntry.calcFreq(self.nTotOccurRecognizedWords)

            # Entries, statistics
            echo("   statistiques...")
            hDst.write("\n\nNatures grammaticales :\n")
            d = {}
            for oEntry in self.lEntry:
                # the letters following “v0”…“v3” are removed, so that such tags are counted together
                po = re.sub("(?<=v[0-3])[itnpqrmaezx_]+", "", oEntry.po)
                d[po] = d.get(po, 0) + 1
            for e in sorted(d.items(), key = lambda x: (x[1], x[0]), reverse=True):
                hDst.write(" * {0[1]:<15} : {0[0]}\n".format(e))

            hDst.write("\n\nVentilation des entrées par indice de fréquence :\n")
            d1 = {}     # frequency index -> number of entries
            d2 = {}     # frequency index -> sum of the fFreq of entries
            for oEntry in self.lEntry:
                d1[oEntry.fq] = d1.get(oEntry.fq, 0) + 1
                d2[oEntry.fq] = d2.get(oEntry.fq, 0) + oEntry.fFreq
            for k in sorted(d1.keys()):
                hDst.write(" * {} : {} entrées ({:.2f} %)  → {:.9f} %\n".format(k, d1[k], (d1[k]*100)/self.nEntry, d2[k]))

            hDst.write("\n\nRépartition des entrées par sous-dictionnaire :\n")
            d = {}
            for oEntry in self.lEntry:
                d[oEntry.di] = d.get(oEntry.di, 0) + 1
            for sKey, nVal in d.items():
                # KeyError if an entry has a sub-dictionary code unknown to dSUBDIC
                hDst.write(" * {0:<15} : {1} entrées ({2:.2f} %)\n".format(dSUBDIC[sKey], nVal, (nVal*100)/self.nEntry))

            # Occurrences of letters
            echo("   occurrences des lettres...")
            d = {}
            for oFlex in self.lFlexions:
                # each character is weighted with the number of occurrences of the inflected form
                for c in oFlex.sFlexion:
                    d[c] = d.get(c, 0) + oFlex.nOccur
            nTot = 0
            for k in d:
                nTot += d[k]
            hDst.write("\n\nOccurrences des lettres dans le corpus :\n")
            for sKey, nVal in sorted(d.items(), key = lambda x: (x[1], x[0]), reverse=True):
                hDst.write("   {} : {:>16,.0f}  /  {:.8f} %\n".format(sKey, nVal, nVal*100/nTot))

            # Words by number of letters
            echo("   Nombre de lettres dans les mots...")
            if not self.aFlexions:
                # set of distinct spellings, built once
                self.aFlexions = set([e.sFlexion for e in self.lFlexions])
            d = {}
            for sFlex in self.aFlexions:
                n = len(sFlex)
                d[n] = d.get(n, 0) + 1
            hDst.write("\n\nNombre de lettres dans les graphies :\n")
            for sKey, nVal in sorted(d.items()):
                hDst.write("   {:>2} lettres : {:>8} graphies\n".format(sKey, nVal))

            hDst.write("\n\nNombre de formes fléchies : {}\n".format(len(self.lFlexions)))
            hDst.write("\n\nNombre de graphies : {}\n".format(len(self.aFlexions)))

    def calcMetagraphe (self):
        "Call calcMetagraphe() on every inflected form"
        echo(" * Lexique - Metagraphe")
        for oFlexion in self.lFlexions:
            oFlexion.calcMetagraphe()

    def calcMetaphone2 (self):
        "Call calcMetaphone2() on every inflected form"
        echo(" * Lexique - Metaphone 2")
        for oFlexion in self.lFlexions:
            oFlexion.calcMetaphone2()

    def createNgrams (self, spDest, n):
        """Write the file <spDest>/ngrams-<n>.txt.

        The file lists the indexed words (only words having at least one n-gram are indexed),
        then every n-gram counted more than once, with its count and the set of indexes of the words containing it.
        Nothing is written if <n> is lower than 2."""
        echo(" * Lexique - Ngrams " + str(n))
        if n < 2:
            echo("erreur: n = " + str(n))
            return
        dNgramCount = {}    # ngram -> number of occurrences
        dNgramWords = {}    # ngram -> set of word indexes
        dWordIndex = {}     # word -> index
        for oFlex in self.lFlexions:
            sWord = oFlex.sFlexion
            for sNgram in getListNgrams(sWord, n):
                # the word gets the next free index the first time it is met
                nIdx = dWordIndex.setdefault(sWord, len(dWordIndex))
                dNgramCount[sNgram] = dNgramCount.get(sNgram, 0) + 1
                dNgramWords.setdefault(sNgram, set()).add(nIdx)
        with open(spDest+"/ngrams-%d.txt"%n, 'w', encoding='utf-8', newline="\n") as hDst:
            for sWord, nIdx in dWordIndex.items():
                hDst.write("%d: %s\n"% (nIdx, sWord))
            for sNgram, nCount in dNgramCount.items():
                if nCount <= 1:
                    continue
                hDst.write("%s: %d  --  "% (sNgram, nCount))
                hDst.write(str(dNgramWords[sNgram]))
                hDst.write("\n")

    def writeLexicon (self, spfDst, version, oStatsLex):
        "Write the lexicon of inflected forms to the file <spfDst>: license, title, information about <oStatsLex>, header, then one text per inflected form"
        echo(' * Lexique >> [ {} ] '.format(spfDst))
        sTitle = "# Lexique des formes fléchies du français - Grammalecte v{}\n# Licence : MPL v2.0\n\n".format(version)
        with open(spfDst, 'w', encoding='utf-8', newline="\n") as hDst:
            hDst.write(MPLHEADER)
            hDst.write(sTitle)
            hDst.write(oStatsLex.getInfo())
            hDst.write(Flexion.header(oStatsLex))
            for oFlexion in self.lFlexions:
                hDst.write(oFlexion.__str__(oStatsLex))

    def writeGrammarCheckerLexicon (self, spfDst, version):
        "Write the simplified lexicon. The file written is <spfDst> with its last four characters replaced by “.lex”"
        echo(' * Lexique simplifié >> [ {} ] '.format(spfDst))
        spfLex = spfDst[:-4]+".lex"
        sTitle = "# Lexique simplifié pour Grammalecte v{}\n# Licence : MPL v2.0\n\n".format(version)
        with open(spfLex, 'w', encoding='utf-8', newline="\n") as hDst:
            hDst.write(MPLHEADER)
            hDst.write(sTitle)
            hDst.write(Flexion.simpleHeader())
            for oFlexion in self.lFlexions:
                hDst.write(oFlexion.getGrammarCheckerRepr())

    def createFiles (self, spDst, lDictVars, nMode, bSimplified):
        """Create the folder of Hunspell dictionaries in <spDst> and zip it.

        For each dictionary of template variables in <lDictVars>, the key 'version' is set,
        then the .aff and .dic files are written. The README is copied with the variables
        of the last dictionary of <lDictVars>."""
        sDicName = PREFIX_DICT_PATH + self.sVersion
        spDic = spDst + '/' + sDicName
        dir_util.mkpath(spDic)
        for dVars in lDictVars:
            # template vars
            dVars['version'] = self.sVersion
            # Dictionaries files (.aff) (.dic)
            self.writeAffixes(spDic, dVars, nMode, bSimplified)
            self.writeDictionary(spDic, dVars, nMode, bSimplified)
        # <dVars> is the loop variable: variables of the last dictionary
        copyTemplate('orthographe', spDic, 'README_dict_fr.txt', dVars)
        sZipName = sDicName + '.zip'
        createZipFiles(spDic, spDst, sZipName)

    def createLibreOfficeExtension (self, spBuild, dTplVars, lDictVars, spDestGL=""):
        """Build the LibreOffice extension (.oxt) in the folder <spBuild>.

        The extension folder receives the static files and the templates of _templates/ooo, the .dic and .aff files
        of each dictionary of <lDictVars> (read from the folder of Hunspell dictionaries), the README and
        the hyphenation files. The folder is then zipped.
        If <spDestGL> is given, the folder “dictionaries” of the extension is copied into it.
        The key 'version' of <dTplVars> is set."""
        # LibreOffice extension
        echo(" * Dictionnaire >> extension pour LibreOffice")
        dTplVars['version'] = self.sVersion
        sExtensionName = EXT_PREFIX_OOO + self.sVersion
        spExt = spBuild + '/' + sExtensionName
        spExtDict = spExt+'/dictionaries'
        for sSubFolder in ('/META-INF', '/ui', '/dictionaries', '/pythonpath'):
            dir_util.mkpath(spExt+sSubFolder)
        # static files: (source file, destination folder)
        for spfSrc, spTarget in ( ('_templates/ooo/manifest.xml', spExt+'/META-INF'),
                                  ('_templates/ooo/DictionarySwitcher.py', spExt),
                                  ('_templates/ooo/ds_strings.py', spExt+'/pythonpath'),
                                  ('_templates/ooo/addons.xcu', spExt+'/ui'),
                                  ('_templates/ooo/french_flag.png', spExt),
                                  ('_templates/ooo/french_flag_16.bmp', spExt+'/ui') ):
            file_util.copy_file(spfSrc, spTarget)
        # templates
        #file_util.copy_file('_templates/ooo/dictionaries.xcu.tpl.xml', spExt)
        for sTemplate in ('description.xml', 'dictionaries.xcu', 'package-description.txt'):
            copyTemplate('_templates/ooo', spExt, sTemplate, dTplVars)
        # dictionaries
        dicPath = spBuild + '/' + PREFIX_DICT_PATH + self.sVersion
        for dVars in lDictVars:
            for sExt in ('.dic', '.aff'):
                file_util.copy_file(dicPath+'/'+dVars['asciiName']+sExt, spExt+'/dictionaries/'+dVars['asciiName']+sExt)
        copyTemplate('orthographe', spExtDict, 'README_dict_fr.txt', dTplVars)
        # hyphenation
        for spfHyph in ( 'césures/hyph_fr.dic',
                         'césures/hyph_fr.iso8859-1.dic',
                         'césures/frhyph.tex',
                         'césures/hyph-fr.tex',
                         'césures/README_hyph_fr-3.0.txt',
                         'césures/README_hyph_fr-2.9.txt' ):
            file_util.copy_file(spfHyph, spExtDict)
        # zip
        createZipFiles(spExt, spBuild, sExtensionName + '.oxt')
        # copy to Grammalecte Project
        if spDestGL:
            echo("   Dictionnaires Hunspell copiés dans Grammalecte pour LibreOffice...")
            dir_util.copy_tree(spExtDict, spDestGL)

    def createMozillaExtensions (self, spBuild, dTplVars, lDictVars, spDestGL=""):
        """Build the Mozilla extension (.xpi) in the folder <spBuild>.

        The extension contains the manifest, the README and the files fr-classique.dic/.aff
        (read from the folder of Hunspell dictionaries and renamed fr-classic).
        If <spDestGL> is given, the .dic and .aff files of each dictionary of <lDictVars> are copied to
        <spDestGL>/<mozAsciiName>/, renamed with 'mozAsciiName'.
        The key 'version' of <dTplVars> is set."""
        # Mozilla extension 1
        echo(" * Dictionnaire >> extension pour Mozilla")
        dTplVars['version'] = self.sVersion
        sExtensionName = EXT_PREFIX_MOZ + self.sVersion
        spExt = spBuild + '/' + sExtensionName
        dir_util.mkpath(spExt+'/dictionaries')
        copyTemplate('_templates/moz', spExt, 'manifest.json', dTplVars)
        spDict = spBuild + '/' + PREFIX_DICT_PATH + self.sVersion
        file_util.copy_file(spDict+'/fr-classique.dic', spExt+'/dictionaries/fr-classic.dic')
        file_util.copy_file(spDict+'/fr-classique.aff', spExt+'/dictionaries/fr-classic.aff')
        copyTemplate('orthographe', spExt, 'README_dict_fr.txt', dTplVars)
        createZipFiles(spExt, spBuild, sExtensionName + '.xpi')
        # Grammalecte
        if not spDestGL:
            return
        echo("   Dictionnaires Hunspell copiés dans Grammalecte pour Mozilla")
        for dVars in lDictVars:
            for sExt in ('.dic', '.aff'):
                file_util.copy_file(spDict+'/'+dVars['asciiName']+sExt, spDestGL+'/'+dVars['mozAsciiName']+"/"+dVars['mozAsciiName']+sExt)

    def createFileIfqForDB (self, spBuild):
        """Write two files in <spBuild> about the entries whose frequency index (fq) differs from oldFq:
        dictIdxIfq-<version>.diff.txt (identifier and new index) and
        dictIdxIfq-<version>.notes.txt (lemma/flags, old and new indexes)."""
        echo(" * Dictionnaire >> indices de fréquence pour la DB...")
        spfDiff = spBuild+'/dictIdxIfq-'+self.sVersion+'.diff.txt'
        spfNotes = spBuild+'/dictIdxIfq-'+self.sVersion+'.notes.txt'
        with open(spfDiff, 'w', encoding='utf-8', newline="\n") as hDiff, \
             open(spfNotes, 'w', encoding='utf-8', newline="\n") as hNotes:
            for oEntry in self.lEntry:
                if oEntry.fq == oEntry.oldFq:
                    continue
                hDiff.write("{0.iD}\t{0.fq}\n".format(oEntry))
                hNotes.write("{0.lemma}/{0.flags}\t{0.oldFq} > {0.fq}\n".format(oEntry))

    def createLexiconPackages (self, spBuild, version, oStatsLex, spDestGL=""):
        """Create the lexicon package in <spBuild>.

        The lexicon is sorted by frequency, written in its own folder with its README, and the folder is zipped.
        The simplified lexicon (.lex) is written directly in <spBuild>.
        If <spDestGL> is given, the simplified lexicon (as French.lex) and the tagset file are copied into it."""
        sLexName = LEX_PREFIX + version
        spLex = spBuild + '/' + sLexName
        spfGCLex = spBuild + '/' + sLexName + '.lex'
        dir_util.mkpath(spLex)
        # write lexicon
        self.sortLexiconByFreq()
        self.writeLexicon(spLex + '/' + sLexName + '.txt', version, oStatsLex)
        self.writeGrammarCheckerLexicon(spfGCLex, version)
        copyTemplate('lexique', spLex, 'README_lexique.txt', {'version': version})
        # zip
        createZipFiles(spLex, spBuild, sLexName + '.zip')
        # copy GC lexicon to Grammalecte
        if not spDestGL:
            return
        file_util.copy_file(spfGCLex, spDestGL + '/French.lex')
        file_util.copy_file('lexique/French.tagset.txt', spDestGL)

    def createDictConj (self, spBuild, spDestGL=""):
        "Write <spBuild>/dictConj.txt with the conjugation of every entry whose field po begins with “v”, and copy the file to <spDestGL> if given"
        echo(" * Dictionnaire >> fichier de conjugaison...")
        spfConj = spBuild+'/dictConj.txt'
        with open(spfConj, 'w', encoding='utf-8', newline="\n") as hDst:
            for oEntry in self.lEntry:
                if not oEntry.po.startswith("v"):
                    continue
                hDst.write(oEntry.getConjugation())
        if spDestGL:
            echo("   Fichier de conjugaison copié dans Grammalecte...")
            file_util.copy_file(spfConj, spDestGL)

    def createDictDecl (self, spBuild, spDestGL=""):
        """Write <spBuild>/dictDecl.txt with the declension of every entry whose flags begin with one of
        the letters S, X, F, W, I, A and whose field po begins with “nom” or “adj”.
        The file is copied to <spDestGL> if given."""
        echo(" * Dictionnaire >> fichier de déclinaison...")
        spfDecl = spBuild+'/dictDecl.txt'
        with open(spfDecl, 'w', encoding='utf-8', newline="\n") as hDst:
            for oEntry in self.lEntry:
                if not re.match("[SXFWIA]", oEntry.flags):
                    continue
                if oEntry.po.startswith("nom") or oEntry.po.startswith("adj"):
                    hDst.write(oEntry.getDeclination())
        if spDestGL:
            echo("   Fichier de déclinaison copié dans Grammalecte...")
            file_util.copy_file(spfDecl, spDestGL)


class Entree:
    def __init__ (self, sLine):
        self.lemma = ''
        self.flags = ''
        # champs morphologiques Hunspell
        self.po = ''
        self.iz = ''
        self.ds = ''
        self.ts = ''
        self.ip = ''
        self.dp = ''
        self.tp = ''
        self.sp = ''
        self.pa = ''
        self.st = ''
        self.al = ''
        self.ph = ''
        # champs annexes
        self.lx = ''
        self.se = ''
        self.et = ''
        self.di = '*'
        self.fq = ''
        self.iD = '0'

        # autres
        self.comment = ''
        self.err = ''
        self.nFlexions = 0
        self.lFlexions = []
        self.sStem = ''
        self.nOccur = 0
        self.nAKO = -1   # Average known occurrences
        self.fFreq = 0
        self.oldFq = ''

        sLine = sLine.rstrip(" \n")
        # commentaire
        if '#' in sLine:
            sLine, comment = sLine.split('#', 1)
            self.comment = comment.strip()
        # éléments de la ligne
        elems = sLine.split("\t")
        nElems = len(elems)
        # lemme et drapeaux
        firstElems = elems[0].split('/') if elems[0] != "/"  else elems[0]
        self.lemma = firstElems[0]
        self.flags = firstElems[1]  if len(firstElems) > 1  else ''
        # morph
        for i in range(1, nElems):
            if len(elems[i]) > 3 and elems[i][2] == ':':
                fields = elems[i].split(':', 1)
                if fields[0] == 'po':
                    self.po = fields[1]  if self.po == ''  else self.po + ' ' + fields[1]
                elif fields[0] == 'is':
                    self.iz = fields[1]  if self.iz == ''  else self.iz + ' ' + fields[1]
                elif fields[0] == 'ds':
                    self.ds = fields[1]  if self.ds == ''  else self.ds + ' ' + fields[1]
                elif fields[0] == 'ts':
                    self.ts = fields[1]  if self.ts == ''  else self.ts + ' ' + fields[1]
                elif fields[0] == 'ip':
                    self.ip = fields[1]  if self.ip == ''  else self.ip + ' ' + fields[1]
                elif fields[0] == 'dp':
                    self.dp = fields[1]  if self.dp == ''  else self.dp + ' ' + fields[1]
                elif fields[0] == 'tp':
                    self.tp = fields[1]  if self.tp == ''  else self.tp + ' ' + fields[1]
                elif fields[0] == 'sp':
                    self.sp = fields[1]  if self.sp == ''  else self.sp + ' ' + fields[1]
                elif fields[0] == 'pa':
                    self.pa = fields[1]  if self.pa == ''  else self.pa + ' ' + fields[1]
                elif fields[0] == 'st':
                    self.st = fields[1]  if self.st == ''  else self.st + ' ' + fields[1]
                elif fields[0] == 'al':
                    self.al = fields[1]  if self.al == ''  else self.al + ' ' + fields[1]
                elif fields[0] == 'ph':
                    self.ph = fields[1]  if self.ph == ''  else self.ph + ' ' + fields[1]
                elif fields[0] == 'lx':
                    self.lx = fields[1]  if self.lx == ''  else self.lx + ' ' + fields[1]
                elif fields[0] == 'se':
                    self.se = fields[1]  if self.se == ''  else self.se + ' ' + fields[1]
                elif fields[0] == 'et':
                    self.et = fields[1]  if self.et == ''  else self.et + ' ' + fields[1]
                elif fields[0] == 'di':
                    self.di = fields[1]
                elif fields[0] == 'fq':
                    self.fq = fields[1]
                elif fields[0] == 'id':
                    self.iD = fields[1]
                else:
                    echo('  ## Champ inconnu: {}  dans  {}/{}'.format(fields[0], self.lemma, self.flags))
            else:
                self.err = self.err + elems[i]
        if self.err:
            echo("\n## Erreur dans le dictionnaire : {}".format(self.err))
            echo("   dans : " + self.lemma)

    def __str__ (self):
        return "{0.lemma}/{0.flags} {1}".format(self, self.getMorph(2))

    def check (self):
        sErr = ''
        if self.lemma == '':
            sErr += 'lemme vide'
        if re.match(r"^\s", self.lemma):
            sErr += 'premier caractère un espace dans <' + self.lemma + '>'
        if re.search(r"\s$", self.lemma):
            sErr += 'espace en fin de lemme'
        if re.match(r"v[0123]", self.po) and not re.match(r"[eas_][ix_][tx_][nx_][pqreuvx_][mx_][ex_z][ax_z]\b", self.po[2:]):
            sErr += 'verbe inconnu: ' + self.po
        if (re.match(r"S[.]", self.flags) and re.search("[sxz]$", self.lemma)) or (re.match(r"X[.]", self.flags) and not re.search("[ul]$", self.lemma)):
            sErr += 'drapeau inutile'
        if self.iz == '' and re.match(r"[SXAI](?!=)", self.flags) and self.po:
            sErr += '[is] vide'
        if re.match(r"pl|sg|inv", self.iz):
            sErr += '[is] incomplet'
        if re.match(r"[FW]", self.flags) and re.search(r"epi|mas|fem|inv|sg|pl", self.iz):
            sErr += '[is] incohérent'
        if re.search(r"pl|sg|inv", self.iz) and re.match(r"[SXAIFW](?!=)", self.flags):
            sErr += '[is] incohérent'
        if self.iz.endswith(("mas", "fem", "epi")) and (not self.flags or not self.flags.startswith(("S", "X", "F", "W", "A", "I", "U"))):
            sErr += '[is] incomplet'
        if self.flags.startswith(("a0", "b0", "c0", "d0")) and not self.lemma.endswith("er"):
            sErr += "drapeau pour verbe du 1ᵉʳ groupe sur un lemme non conforme"
        if self.flags.startswith("f") and not self.lemma.endswith(("ir", "ïr")):
            sErr += "drapeau pour verbe du 2ᵉ groupe sur un lemme non conforme"
        if sErr:
            echo('   error -  id: ' + self.iD, end = "")
            echo('  ' + sErr + '  in  ' + self.__str__())

    def setTagsFrom (self, oEnt):
        self.po = oEnt.po
        self.iz = oEnt.iz
        self.ds = oEnt.ds
        self.ts = oEnt.ts
        self.ip = oEnt.ip
        self.dp = oEnt.dp
        self.tp = oEnt.tp
        self.sp = oEnt.sp
        self.pa = oEnt.pa
        self.st = oEnt.st
        self.al = oEnt.al
        self.ph = oEnt.ph
        self.lx = oEnt.lx
        self.se = oEnt.se
        self.et = oEnt.et
        self.di = oEnt.di
        self.fq = oEnt.fq

    def keyTriNat (self):
        "sort key: (lemma translated through CHARMAP, flags, po)"
        sNatural = self.lemma.translate(CHARMAP)
        return (sNatural, self.flags, self.po)

    def keyTriNum (self):
        return (self.lemma, self.flags, self.po)

    def getHunspellLine (self, oDict, nMode, bSimplified=False):
        sLine = self.lemma.replace("’", "'")
        if self.flags:
            sLine += '/'
            sLine += self.flags  if not oDict.bShortenTags or bSimplified  else oDict.dAF[self.flags]
        if bSimplified:
            return sLine.replace("()", "") + "\n"
        if nMode > 0:
            sMorph = self.getMorph(nMode)
            if sMorph:
                sLine += sMorph  if not oDict.bShortenTags  else "\t" + oDict.dAM[sMorph.strip()]
        return sLine + "\n"

    def getMorph (self, nMode):
        txt = ''
        if self.po: txt += fieldToHunspell('po', self.po)
        if self.iz: txt += fieldToHunspell('is', self.iz)
        if self.ds: txt += fieldToHunspell('ds', self.ds)
        if self.ts: txt += fieldToHunspell('ts', self.ts)
        if self.ip: txt += fieldToHunspell('ip', self.ip)
        if self.dp: txt += fieldToHunspell('dp', self.dp)
        if self.tp: txt += fieldToHunspell('tp', self.tp)
        if self.sp: txt += fieldToHunspell('sp', self.sp)
        if self.pa: txt += fieldToHunspell('pa', self.pa)
        if self.al: txt += fieldToHunspell('al', self.al)
        if self.st: txt += fieldToHunspell('st', self.st)
        if self.ph: txt += fieldToHunspell('ph', self.ph)
        if nMode > 1:
            if self.lx: txt += fieldToHunspell('lx', self.lx)
            if self.se: txt += fieldToHunspell('se', self.se)
            if self.et: txt += fieldToHunspell('et', self.et)
            if self.fq: txt += ' fq:' + self.fq
            if self.di != '*': txt += ' di:' + self.di
        return txt

    def getShortDescr (self):
        txt = self.lemma
        if self.flags:
            txt += '/' + self.flags
        if self.di != '*':
            txt += ' di:' + self.di
        return txt

    def generateFlexions (self, dFlags):
        """Build self.lFlexions from the affix flags of the entry.

        The tuples returned by _flechir() are turned into Flexion objects,
        except those whose morphology contains "+" and those whose cleaned
        morphology ends with " mas", " fem" or " epi". The stem (self.sStem) is
        the [st] field if set, the lemma otherwise. Each flexion finally gets
        nDup, the number of flexions of this entry with the same form.
        """
        lTuples = self._flechir(dFlags)
        # creation of the flexion objects
        # NOTE(review): the counter is stored in self.nFlexion whereas __init__
        # initialises self.nFlexions -- confirm which name the rest of the file reads.
        self.nFlexion = 0
        self.lFlexions = []
        for sFlex, sMorph, sDic in lTuples:
            if '+' in sMorph:
                continue
            sMorph = self.clean(sMorph)
            if sMorph.endswith((" mas", " fem", " epi")):
                # incomplete morphology (gender without number): rejected
                continue
            self.nFlexion += 1
            self.lFlexions.append( Flexion(self, sFlex, sMorph, sDic, self.nFlexion) )
        # stem
        self.sStem = self.st or self.lemma
        # tag duplicates
        dCount = collections.Counter(oFlex.sFlexion  for oFlex in self.lFlexions)
        for oFlex in self.lFlexions:
            oFlex.nDup = dCount[oFlex.sFlexion]

    def _flechir (self, dFlags, morph='', iPR=0):
        """Return the list of inflected forms of the lemma, as tuples.

        Recursive function.
        <dFlags> maps flag names to flag objects (attributes read here: bSfx,
        bMix, lRules). <morph> is the morphology inherited from the caller;
        <iPR> is the recursion depth: at depth 0 the morphology is taken from
        self.lexMorph(), at depth 2 an empty list is returned.

        Most tuples are (form, morphology, subdictionary).
        NOTE(review): the tuples appended when combining prefix and suffix
        flexions of "mix" flags only have 2 elements (form, morphology), whereas
        generateFlexions() unpacks 3 values -- confirm whether this path is
        ever reached with the real affix data.
        """
        if iPR == 2:
            return []
        if iPR == 0:
            morph = self.lexMorph()
        # the lemma itself is a form, unless the flags end with "()"
        lFlexions = [(self.lemma, morph, self.di)]  if iPR == 0 and not self.flags.endswith('()')  else []
        # flexions produced by "mix" flags, kept apart to be combined with each other
        lFlexPrefix = []
        lFlexSuffix = []
        for sFlag in makeLongFlags(self.flags):
            if sFlag not in dFlags:
                # these four pseudo-flags are ignored silently; any other unknown flag is reported
                if sFlag not in ['**', '()', '||', '--']:
                    lFlexions.append( (self.lemma, '[unknown flag: {}]'.format(sFlag), self.di) )
                    echo("ERROR: "  + self.lemma + ' - unknown flag: ' + sFlag)
            else:
                oFlag = dFlags[sFlag]
                if not oFlag.bSfx:
                    # prefixes
                    for oRule in oFlag.lRules:
                        if oRule.motif.search(self.lemma):
                            ruleMorph = oRule.lexMorph()
                            if oRule.cut == '0':
                                # nothing to strip: the prefix is simply prepended
                                flexion = (oRule.add+self.lemma, ruleMorph+morph, oRule.di)
                                if oFlag.bMix:
                                    lFlexPrefix.append(flexion)
                                    # combine with the suffixed forms collected so far
                                    for flex in lFlexSuffix:
                                        lFlexions.append( (oRule.add+flex[0], flex[1]+ruleMorph) )
                                else:
                                    lFlexions.append(flexion)
                            else:
                                # the first occurrence of <cut> is replaced by <add>
                                flexion = (self.lemma.replace(oRule.cut, oRule.add, 1), ruleMorph+morph, oRule.di)
                                if oFlag.bMix:
                                    lFlexPrefix.append(flexion)
                                    # combine with the suffixed forms collected so far
                                    for flex in lFlexSuffix:
                                        lFlexions.append( (flex[0].replace(oRule.cut, oRule.add, 1), flex[1]+ruleMorph) )
                                else:
                                    lFlexions.append(flexion)
                            # the rule carries its own flags: inflect the new form one level deeper
                            if oRule.flags != '' and oRule.flags != '**':
                                lFlexions.extend(Entree(flexion[0]+'/'+oRule.flags)._flechir(dFlags, flexion[1], iPR+1))
                else:
                    # suffixes
                    for oRule in oFlag.lRules:
                        if oRule.motif.search(self.lemma):
                            ruleMorph = oRule.lexMorph()
                            if not oRule.flags.endswith('**') or oRule.flags == '**':
                                # ordinary rule, no circumfix
                                if oRule.cut == '0':
                                    # nothing to strip: the suffix is simply appended
                                    flexion = (self.lemma+oRule.add, morph+ruleMorph, oRule.di)
                                    if oFlag.bMix:
                                        lFlexSuffix.append(flexion)
                                        # combine with the prefixed forms collected so far
                                        for flex in lFlexPrefix:
                                            lFlexions.append( (flex[0]+oRule.add, flex[1]+ruleMorph) )
                                    else:
                                        lFlexions.append(flexion)
                                else:
                                    # as many trailing characters as <cut> has are removed before appending <add>
                                    nCut = len(oRule.cut)
                                    flexion = (self.lemma[:-nCut]+oRule.add, morph+ruleMorph, oRule.di)
                                    if oFlag.bMix:
                                        lFlexSuffix.append(flexion)
                                        # combine with the prefixed forms collected so far
                                        for flex in lFlexPrefix:
                                            lFlexions.append( (flex[0][:-nCut]+oRule.add, flex[1]+ruleMorph) )
                                    else:
                                        lFlexions.append(flexion)
                                # the rule carries its own flags: inflect the new form one level deeper
                                if oRule.flags != '' and oRule.flags != '**':
                                    lFlexions.extend(Entree(flexion[0]+'/'+oRule.flags)._flechir(dFlags, flexion[1], iPR+1))
                            else:
                                # the rule requires a circumfix: the suffixed form alone is not added,
                                # only the forms produced by inflecting it with the rule flags
                                if oRule.cut == '0':
                                    flexion = (self.lemma+oRule.add, morph+ruleMorph, oRule.di)
                                else:
                                    flexion = (self.lemma[:-len(oRule.cut)]+oRule.add, morph+ruleMorph, oRule.di)
                                lFlexions.extend(Entree(flexion[0]+'/'+oRule.flags)._flechir(dFlags, flexion[1], iPR+1))
        lFlexions = lFlexions + lFlexPrefix + lFlexSuffix
        return lFlexions

    def clean (self, s):
        return s.replace('  ', ' ').strip(' ')

    def lexMorph (self):
        # morphology for lexicon
        txt = ' '
        if self.po: txt += self.po + ' '
        if self.iz: txt += self.iz + ' '
        if self.ds: txt += self.ds + ' '
        if self.ts: txt += self.ts + ' '
        if self.ip: txt += self.ip + ' '
        if self.dp: txt += self.dp + ' '
        if self.tp: txt += self.tp + ' '
        if self.sp: txt += self.sp + ' '
        return txt

    def getConjugation (self):
        sRes = self.lemma + "\t" + self.po[1:10] + "\n"
        for oFlex in self.lFlexions:
            sMorph = oFlex.sMorph[11:].rstrip("!").replace("ppas adj", "ppas").replace("ppas 1jsg", "ppas")
            if not sMorph.startswith("ppas") and sMorph.find(" ") > 1:
                # complex tags
                for s in getVerbMultiMorph(sMorph):
                    sRes += "_\t" + s + "\t" + oFlex.sFlexion + "\n"
            else:
                sRes += "_\t" + sMorph + "\t" + oFlex.sFlexion + "\n"
        return sRes + "$\n"

    def getDeclination (self):
        sRes = self.lemma + "\t" + self.flags + "\n"
        for oFlex in self.lFlexions:
            if "ppas" in oFlex.sMorph:
                sMorph = oFlex.sMorph.replace("ppas adj", "adj").replace("ppas 1jsg", "adj")
                sRes += "_\t" + sMorph + "\t" + oFlex.sFlexion + "\n"
            elif "adj" in oFlex.sMorph or "nom" in oFlex.sMorph:
                sRes += "_\t" + oFlex.sMorph + "\t" + oFlex.sFlexion + "\n"
        return sRes + "$\n"

    def calcOccurFromFlexions (self):
        self.nOccur = 0
        for o in self.lFlexions:
            self.nOccur += o.nOccur

    def calcAverageKnownOccurrence (self):
        # nous calculons la moyenne des occurrences des formes fléchies
        # qui n’ont pas d’équivalent dans les autres entrées (nMulti = 0)
        nOccur = 0
        nFlex = 0
        for oFlex in self.lFlexions:
            if oFlex.nMulti == 0:
                nOccur += oFlex.nOccur
                nFlex += 1
        # moyenne des formes fléchies sans équivalent ou -1
        self.nAKO = math.ceil(nOccur / nFlex)  if nFlex > 0  else -1

    def solveOccurMultipleFlexions (self, hDst, oStatsLex):
        sBlank = "           "
        if self.nAKO >= 0:
            for oFlex in self.lFlexions:
                if oFlex.nMulti > 0 and not oFlex.bBlocked:
                    # on trie les entrées avec AKO et sans AKO
                    lEntWithAKO = []
                    lEntNoAKO = []
                    for oEntry in oFlex.lMulti:
                        if oEntry.nAKO >= 0:
                            lEntWithAKO.append(oEntry)
                        else:
                            lEntNoAKO.append(oEntry)

                    if lEntNoAKO:
                        # on calcule la différence totale occasionnée par du passage des flexions appartenant à des entrées avec AKO au niveau AKO
                        nDiff = (oFlex.nOccur - self.nAKO) * oFlex.nDup
                        for oEntry in lEntWithAKO:
                            for oFlexM in oEntry.lFlexions:
                                if oFlex.sFlexion == oFlexM.sFlexion:
                                    nDiff += oFlexM.nOccur - oEntry.nAKO
                        if nDiff > 0:
                            # on peut passer à les formes fléchies à AKO
                            hDst.write(" * {0.sFlexion}\n".format(oFlex))
                            hDst.write("       moyenne connue\n")
                            for oFlexD in self.lFlexions:
                                if oFlex.sFlexion == oFlexD.sFlexion:
                                    hDst.write(sBlank + "{2:<30} {0.sMorph:<30}  {0.nOccur:>10}  >> {1:>10}\n".format(oFlexD, self.nAKO, self.getShortDescr()))
                                    oFlexD.setOccurAndBlock(self.nAKO)
                            for oEntry in lEntWithAKO:
                                hDst.write("       moyenne connue\n")
                                for oFlexM in oEntry.lFlexions:
                                    if oFlex.sFlexion == oFlexM.sFlexion:
                                        hDst.write(sBlank + "{2:<30} {0.sMorph:<30}  {0.nOccur:>10}  >> {1:>10}\n".format(oFlexM, oEntry.nAKO, oEntry.getShortDescr()))
                                        oFlexM.setOccurAndBlock(oEntry.nAKO)
                            # on répercute nDiff sur les flexions sans AKO
                            for oEntry in lEntNoAKO:
                                hDst.write("       sans moyenne connue\n")
                                for oFlexM in oEntry.lFlexions:
                                    if oFlex.sFlexion == oFlexM.sFlexion:
                                        nNewOccur = oFlexM.nOccur + math.ceil((nDiff / len(lEntNoAKO)) / oFlexM.nDup)
                                        hDst.write(sBlank + "{2:<30} {0.sMorph:<30}  {0.nOccur:>10}  +> {1:>10}\n".format(oFlexM, nNewOccur, oEntry.getShortDescr()))
                                        oFlexM.setOccurAndBlock(nNewOccur)
                    else:
                        # Toutes les entrées sont avec AKO : on pondère
                        nFlexOccur = oStatsLex.getFlexionOccur(oFlex.sFlexion)
                        nTotAKO = self.nAKO
                        for oEnt in oFlex.lMulti:
                            nTotAKO += oEnt.nAKO

                        hDst.write(" = {0.sFlexion}\n".format(oFlex))
                        hDst.write("       moyennes connues\n")
                        for oFlexD in self.lFlexions:
                            if oFlex.sFlexion == oFlexD.sFlexion:
                                nNewOccur = math.ceil((nFlexOccur * (self.nAKO / nTotAKO)) / oFlexD.nDup)  if nTotAKO  else 0
                                hDst.write(sBlank + "{2:<30} {0.sMorph:<30}  {0.nOccur:>10}  %> {1:>10}\n".format(oFlexD, nNewOccur, self.getShortDescr()))
                                oFlexD.setOccurAndBlock(nNewOccur)
                        for oEntry in oFlex.lMulti:
                            for oFlexM in oEntry.lFlexions:
                                if oFlex.sFlexion == oFlexM.sFlexion:
                                    nNewOccur = math.ceil((nFlexOccur * (oEntry.nAKO / nTotAKO)) / oFlexM.nDup)  if nTotAKO  else 0
                                    hDst.write(sBlank + "{2:<30} {0.sMorph:<30}  {0.nOccur:>10}  %> {1:>10}\n".format(oFlexM, nNewOccur, oEntry.getShortDescr()))
                                    oFlexM.setOccurAndBlock(nNewOccur)

    def calcFreq (self, nTot):
        "computes the frequency of the entry as a percentage of <nTot>, saves the previous [fq] in oldFq and sets [fq] to the value returned by getIfq()"
        fFreq = (self.nOccur * 100) / nTot
        self.fFreq = fFreq
        self.oldFq = self.fq
        self.fq = getIfq(fFreq)



class Flexion:
    def __init__ (self, oEntry, sFlex='', sMorph='', cDic='', nFlexId=0):
        self.oEntry = oEntry
        self.sFlexion = sFlex
        self.sMorph = sMorph
        self.cDic    = cDic
        self.nFlexId = nFlexId
        self.nOccur  = 0
        self.bBlocked  = False
        self.nDup    = 0    # duplicates in the same entry
        self.nMulti  = 0    # duplicates with other entries
        self.lMulti  = []   # list of similar flexions
        self.fFreq   = 0
        self.cFq     = ''
        self.metagfx = ''   # métagraphe
        self.metaph2 = ''   # métaphone 2

    def setOccur (self, n):
        self.nOccur = n

    def setOccurAndBlock (self, n):
        self.nOccur = n
        self.bBlocked = True

    def calcOccur (self):
        self.nOccur = math.ceil((self.nOccur / (self.nMulti+1)) / self.nDup)

    def calcFreq (self, nTot):
        "computes the frequency of the flexion as a percentage of <nTot> and sets cFq to the value returned by getIfq()"
        fFreq = (self.nOccur * 100) / nTot
        self.fFreq = fFreq
        self.cFq = getIfq(fFreq)

    def calcMetagraphe (self):
        "stores in metagfx the first value returned by metagraphe.getMetagraphe(), followed by '/' and the second value when the latter is not empty"
        tKeys = metagraphe.getMetagraphe(self.sFlexion, self.sMorph)
        if tKeys[1]:
            self.metagfx = tKeys[0]+"/"+tKeys[1]
        else:
            self.metagfx = tKeys[0]

    def calcMetaphone2 (self):
        "stores in metaph2 the first value returned by metaphone2.dm(), followed by '/' and the second value when the latter is not empty"
        tKeys = metaphone2.dm(self.sFlexion)
        if tKeys[1]:
            self.metaph2 = tKeys[0]+"/"+tKeys[1]
        else:
            self.metaph2 = tKeys[0]

    @classmethod
    def header (cls, oStatsLex):
        sOccurs = ''
        for t in oStatsLex.lLex:
            sOccurs += t[1] + "\t"
        return "id\tfid\tFlexion\tLemme\tÉtiquettes\tMétagraphe (β)\tMetaphone2\tNotes\tSémantique\tÉtymologie\tSous-dictionnaire\t" + sOccurs + "Total occurrences\tDoublons\tMultiples\tFréquence\tIndice de fréquence\n"

    def __str__ (self, oStatsLex):
        sOccurs = ''
        for v in oStatsLex.dFlexions[self.sFlexion]:
            sOccurs += str(v) + "\t"
        return "{0.oEntry.iD}\t{0.nFlexId}\t{0.sFlexion}\t{0.oEntry.sStem}\t{0.sMorph}\t{0.metagfx}\t{0.metaph2}\t{0.oEntry.lx}\t{0.oEntry.se}\t{0.oEntry.et}\t{0.oEntry.di}{2}\t{1}{0.nOccur}\t{0.nDup}\t{0.nMulti}\t{0.fFreq:.15f}\t{0.cFq}\n".format(self, sOccurs, "/"+self.cDic if self.cDic != "*" else "")

    @classmethod
    def simpleHeader (cls):
        return "# :POS ;LEX ~SEM =FQ /DIC\n"

    def getGrammarCheckerRepr (self):
        return "{0.sFlexion}\t{0.oEntry.lemma}\t{1}\n".format(self, self._getSimpleTags())

    # Conversion table from the dictionary tags to the tags of the grammar checker lexicon.
    # Part-of-speech tags all begin with ":" and lexical tags with ";": as _getSimpleTags()
    # concatenates the converted tags without separator, the prefix is what delimits them.
    # ("loc.prepv" used to be mapped to "ÉRv" without ":", which merged it with the previous tag.)
    _dTagReplacement = {
        # POS
        "nom": ":N", "adj": ":A", "adv": ":W", "negadv": ":X", "mg": ":G", "nb": ":B", "nbro": ":Br",
        "loc.nom": ":ÉN", "loc.adj": ":ÉA", "loc.adv": ":ÉW", "loc.verb": ":ÉV",
        "interj": ":J", "loc.interj": ":ÉJ", "titr": ":T",
        "mas": ":m", "fem": ":f", "epi": ":e", "sg": ":s", "pl": ":p", "inv": ":i",
        "infi": ":Y",
        "ppre": ":P", "ppas": ":Q",
        "ipre": ":Ip", "iimp": ":Iq", "ipsi": ":Is", "ifut": ":If",
        "spre": ":Sp", "simp": ":Sq", "cond": ":K", "impe": ":E",
        "1sg": ":1s", "1isg": ":1ś", "1jsg": ":1ŝ", "2sg": ":2s", "3sg": ":3s", "1pl": ":1p", "2pl": ":2p", "3pl": ":3p", "3pl!": ":3p!",
        "prepv": ":Rv", "prep": ":R", "loc.prep": ":ÉR", "loc.prepv": ":ÉRv",
        "detpos": ":Dp", "detdem": ":Dd", "detind": ":Di", "detneg": ":Dn", "detex": ":De", "det": ":D",
        "advint": ":U",
        "prodem": ":Od", "proind": ":Oi", "proint": ":Ot", "proneg": ":On", "prorel": ":Or", "proadv": ":Ow",
        "properobj": ":Oo", "propersuj": ":Os", "1pe": ":O1", "2pe": ":O2", "3pe": ":O3", "preverb": ":Ov",
        "cjco": ":Cc", "cjsub": ":Cs", "cj": ":C", "loc.cj": ":ÉC", "loc.cjsub": ":ÉCs",
        "prn": ":M1", "patr": ":M2", "loc.patr": ":ÉM2", "npr": ":MP", "nompr": ":NM",
        "pfx": ":Zp", "sfx": ":Zs",
        "div": ":H",
        "err": ":F",
        "ponc": ":@p", "sign": ":@s",
        # LEX
        "symb": ";S", "unit": ";U"
    }

    def _getSimpleTags (self):
        s = ""
        # POS
        for sTag in self.sMorph.split():
            if sTag.startswith("v"):
                s += ":V" + sTag[1:]
            else:
                if sTag in self._dTagReplacement:
                    s += self._dTagReplacement[sTag]
                else:
                    echo(" # unknown tag: " + sTag + "  on: " + self.oEntry.lemma)
        # LEX
        for sTag in self.oEntry.lx.split():
            if sTag in self._dTagReplacement:
                s += self._dTagReplacement[sTag]
        # SEM
        #s += "~" + self.oEntry.se  if self.oEntry.se and self.oEntry.se != "@"  else ""
        # ETY
        #s += "<" + self.oEntry.et  if self.oEntry.et and self.oEntry.et != "@"  else ""
        # IFQ
        #s += "=" + self.cFq
        # DIC
        if self.oEntry.di == "*" and self.cDic != "*":
            s += "/" + self.cDic
        else:
            s += "/" + self.oEntry.di
        return s

    def keyTriNat (self):
        "sort key: (form translated through CHARMAP, morphology)"
        sNatural = self.sFlexion.translate(CHARMAP)
        return (sNatural, self.sMorph)

    def keyFreq (self):
        return (100-self.fFreq, self.oEntry.sStem, self.sFlexion)

    def keyOcc (self):
        return (self.nOccur, self.oEntry.sStem, self.sFlexion)

    def keyIdx (self):
        return self.oEntry.iD

    def keyFlexion (self):
        return self.sFlexion



class Flag:
    "An affix flag: its type (suffix or prefix), its name, its cross-product mark and its rules."

    def __init__ (self, sFlagType, sFlagName, sMix):
        "<sFlagType> 'SFX' means suffix, anything else prefix; <sMix> 'Y' marks the flag as mixable"
        self.sFlagName = sFlagName
        self.bSfx = (sFlagType == 'SFX')
        self.bMix = (sMix == 'Y')
        self.lRules = []
        self.nRules = 0
        self.nOccur = 0

    def addAffixRule (self, line):
        "adds to the flag the rule described by <line>"
        self.lRules.append(AffixRule(line))
        self.nRules += 1

    def getFlag (self, subDicts, oDict, nMode, bSimplified):
        """Return the flag block for an affix file: a blank line, the header
        line (type, name, Y/N, number of rules) and the rule lines.

        Only rules whose subdictionary is in <subDicts> are written; with
        <bSimplified>, replication rules are skipped too.
        An empty string is returned if no rule is left.
        """
        lRuleLines = [ oRule.getRuleLine(oDict, nMode, bSimplified)
                       for oRule in self.lRules
                       if oRule.di in subDicts and not (bSimplified and oRule.isReplicationRule()) ]
        if not lRuleLines:
            return ''
        sType = 'SFX'  if self.bSfx  else 'PFX'
        sMix = 'Y'  if self.bMix  else 'N'
        sHeader = "\n" + sType + ' ' + self.sFlagName + ' ' + sMix + ' ' + str(len(lRuleLines)) + "\n"
        return sHeader + "".join(lRuleLines)


class AffixRule:
    def __init__ (self, sLine):
        self.sFlagName = ''
        self.bSfx = True
        self.comment = ''
        # Règle
        self.cut = ''
        self.add = ''
        self.flags = ''
        self.cond = ''
        self.motif = ''
        # champs morphologiques de Hunspell
        self.po = ''
        self.iz = ''
        self.ds = ''
        self.ts = ''
        self.ip = ''
        self.dp = ''
        self.tp = ''
        self.sp = ''
        self.pa = ''
        self.ph = ''
        # champs de Dicollecte
        self.lx = ''
        self.di = '*'
        # erreurs
        self.err = ''
        # autres champs
        self.nOccur = 0

        sLine = sLine.rstrip(" \n")
        # commentaire
        if '#' in sLine:
            sLine, comment = sLine.split('#', 1)
            self.comment = comment.strip()
        # éléments de la ligne
        elems = sLine.split()
        nElems = len(elems)
        # type et nom
        self.bSfx = True  if elems[0] == "SFX"  else False
        self.sFlagName = elems[1]
        # lemme et drapeaux
        self.cut = elems[2]
        if '/' in elems[3]:
            self.add, self.flags = elems[3].split('/')
        else:
            self.add = elems[3]
            self.flags = ''
        if self.add == '0':
            self.add = ''
        self.cond = elems[4]
        try:
            self.motif = re.compile(self.cond+'$')  if self.bSfx  else re.compile('^'+self.cond)
        except:
            echo("error:"+self.cond)
        # morph
        for i in range(5, nElems):
            if len(elems[i]) > 3 and elems[i][2] == ':':
                fields = elems[i].split(':',1)
                if fields[0] == 'po':
                    self.po = fields[1]  if self.po == ''  else self.po + ' ' + fields[1]
                elif fields[0] == 'is':
                    self.iz = fields[1]  if self.iz == ''  else self.iz + ' ' + fields[1]
                elif fields[0] == 'ds':
                    self.ds = fields[1]  if self.ds == ''  else self.ds + ' ' + fields[1]
                elif fields[0] == 'ts':
                    self.ts = fields[1]  if self.ts == ''  else self.ts + ' ' + fields[1]
                elif fields[0] == 'ip':
                    self.ip = fields[1]  if self.ip == ''  else self.ip + ' ' + fields[1]
                elif fields[0] == 'dp':
                    self.dp = fields[1]  if self.dp == ''  else self.dp + ' ' + fields[1]
                elif fields[0] == 'tp':
                    self.tp = fields[1]  if self.tp == ''  else self.tp + ' ' + fields[1]
                elif fields[0] == 'sp':
                    self.sp = fields[1]  if self.sp == ''  else self.sp + ' ' + fields[1]
                elif fields[0] == 'pa':
                    self.pa = fields[1]  if self.pa == ''  else self.pa + ' ' + fields[1]
                elif fields[0] == 'ph':
                    self.ph = fields[1]  if self.pa == ''  else self.pa + ' ' + fields[1]
                elif fields[0] == 'lx':
                    self.lx = fields[1]  if self.lx == ''  else self.lx + ' ' + fields[1]
                elif fields[0] == 'di':
                    self.di = fields[1]
                else:
                    echo('Champ inconnu: {}  dans  {}'.format(fields[0], self.sFlagName))
            else:
                echo("  # Erreur affixe : {}".format(line))

    def isReplicationRule (self):
        "is this rule used for replication of a virtual lemma"
        return self.flags == "" and ((self.cut == "0" and self.add == "") or self.cut == self.add)

    def getRuleLine (self, oDict, nMode, bSimplified=False):
        sLine = 'SFX'  if self.bSfx  else 'PFX'
        sLine += ' ' + self.sFlagName + ' ' + self.cut + ' '
        sLine += self.add  if self.add  else '0'
        if self.flags != '':
            sLine += '/'
            sLine += self.flags  if not oDict.bShortenTags or bSimplified  else oDict.dAF[self.flags]
            if bSimplified:
                sLine = sLine.replace("()", "")
        sLine += ' ' + self.cond
        if not bSimplified and nMode > 0:
            sMorph = self.getMorph(nMode)
            if sMorph:
                sLine += sMorph  if not oDict.bShortenTags or bSimplified  else ' ' + oDict.dAM[sMorph.strip()]
        return sLine + "\n"

    def getMorph (self, nMode):
        # morphology for Hunspell
        txt = ''
        if self.po: txt += fieldToHunspell('po', self.po)
        if self.iz: txt += fieldToHunspell('is', self.iz)
        if self.ds: txt += fieldToHunspell('ds', self.ds)
        if self.ts: txt += fieldToHunspell('ts', self.ts)
        if self.ip: txt += fieldToHunspell('ip', self.ip)
        if self.dp: txt += fieldToHunspell('dp', self.dp)
        if self.tp: txt += fieldToHunspell('tp', self.tp)
        if self.sp: txt += fieldToHunspell('sp', self.sp)
        if self.pa: txt += fieldToHunspell('pa', self.pa)
        if self.ph: txt += fieldToHunspell('ph', self.ph)
        if nMode > 1:
            if self.lx: txt += fieldToHunspell('lx', self.lx)
            if self.di != '*': txt += ' di:' + self.di
        return txt

    def lexMorph (self):
        # morphology for lexicon
        txt = ' '
        if self.po: txt += self.po + ' '
        if self.iz: txt += self.iz + ' '
        if self.ds: txt += self.ds + ' '
        if self.ts: txt += self.ts + ' '
        if self.ip: txt += self.ip + ' '
        if self.dp: txt += self.dp + ' '
        if self.tp: txt += self.tp + ' '
        if self.sp: txt += self.sp + ' '
        return txt



class StatsLex:
    """Occurrence counts of the dictionary's inflected forms in several corpora.

    Attributes:
        dFlexions -- maps each inflected form (sFlexion of the entries of
                     oDict.lFlexions) to a list of occurrence counts, one value
                     appended per successfully loaded lexicon
        lLex      -- one tuple (ID, name, known occurrences, total occurrences)
                     per successfully loaded lexicon
    """

    def __init__ (self, oDict):
        "Create an empty count list for every inflected form found in oDict.lFlexions."
        echo("Lexique statistique")
        self.lLex = []
        self.dFlexions = {}
        for oFlex in oDict.lFlexions:
            self.dFlexions[oFlex.sFlexion] = []

    def addLexFromFile (self, sPathFile, cLexID, sLexName):
        """Load one corpus file and append its counts to the known inflected forms.

        sPathFile -- file read through readfile(); each line must split into
                     exactly two whitespace-separated items: a word and an integer
        cLexID    -- one-character identifier of the lexicon
        sLexName  -- display name of the lexicon

        Nothing is loaded (and None is returned) when the file does not exist or
        when cLexID is not exactly one character long.
        """
        if not os.path.isfile(sPathFile):
            echo(' * Lexique statistique - fichier {} introuvable'.format(sPathFile))
            return None
        if len(cLexID) != 1:
            echo(' * Lexique statistique - fichier {} - identifiant incorrect, 1 caractère requis'.format(sPathFile))
            return None
        echo(" * Lexique statistique << [ {} ]".format(sPathFile))
        dCounts = self.dFlexions
        nKnownOccur = 0
        nAllOccur = 0
        for sLine in readfile(sPathFile):
            sWord, sVal = sLine.rstrip().split()
            nOccur = int(sVal)
            nAllOccur += nOccur
            lCounts = dCounts.get(sWord)
            if lCounts is not None:
                # NOTE(review): a word listed twice in the same file gets two values here,
                # which would shift the columns of later lexicons -- confirm the files have no duplicates
                lCounts.append(nOccur)
                nKnownOccur += nOccur
        self.lLex.append((cLexID, sLexName, nKnownOccur, nAllOccur))
        # forms absent from this file get a 0, so that each list has one value per lexicon
        nLex = len(self.lLex)
        for lCounts in dCounts.values():
            if len(lCounts) < nLex:
                lCounts.append(0)

    def getFlexionOccur (self, sFlex):
        "Return the sum of the counts of sFlex over all lexicons (KeyError if sFlex is unknown)."
        lCounts = self.dFlexions[sFlex]
        return sum(lCounts)

    def getInfo (self):
        "Return a text summary: known and total occurrences for each lexicon, then the grand totals."
        sRow = " * {:<20} -> {:>18,} mots reconnus / {:>18,}\n"
        lRows = ["Corpus :\n"]
        nKnownTot = 0
        nTot = 0
        for _, sLexName, nKnownOccur, nAllOccur in self.lLex:
            lRows.append(sRow.format(sLexName, nKnownOccur, nAllOccur))
            nKnownTot += nKnownOccur
            nTot += nAllOccur
        lRows.append("\n" + sRow.format('TOTAL', nKnownTot, nTot) + "\n")
        return "".join(lRows)

    def write (self, sPathFile):
        "Write the lexicon tuples, then one '<form> - <counts>' line per inflected form, to sPathFile (UTF-8, LF)."
        with open(sPathFile, 'w', encoding='utf-8', newline="\n") as hDst:
            hDst.writelines(str(tLex) + "\n"  for tLex in self.lLex)
            for sFlex, lCounts in self.dFlexions.items():
                hDst.write("{} - {}\n".format(sFlex, lCounts))


def createThesaurusPackage (spBuild, sVersion, spCopy=""):
    """Build the thesaurus folder and optionally copy its files elsewhere.

    spBuild  -- build folder; the thesaurus goes into <spBuild>/thesaurus-v<sVersion>
    sVersion -- thesaurus version, used in the folder name
    spCopy   -- if not empty, folder receiving a copy of thes_fr.dat, thes_fr.idx
                and README_thes_fr.txt (the LibreOffice extension package)
    """
    print(" * Création du thésaurus")
    spThesaurus = "/".join([spBuild, "thesaurus-v" + sVersion])
    dir_util.mkpath(spThesaurus)
    # presumably generates thes_fr.dat and thes_fr.idx in spThesaurus, since both are copied from there below
    thes_build.build("thesaurus/thes_fr.dat", "thesaurus/synsets_fr.dat", spThesaurus)
    file_util.copy_file('thesaurus/README_thes_fr.txt', spThesaurus)
    if not spCopy:
        return
    # copy in libreoffice extension package
    print("   Copie du thésaurus dans:", spCopy)
    for sFileName in ('thes_fr.dat', 'thes_fr.idx', 'README_thes_fr.txt'):
        file_util.copy_file(spThesaurus + '/' + sFileName, spCopy)


def main ():
    """Command-line entry point: build the French dictionary and its packages.

    Parses the options, loads 'orthographe/FRANCAIS.dic' and
    'orthographe/FRANCAIS_7.aff' into a Dictionnaire object, runs the checks,
    generates the inflected forms and the corpus statistics, then calls every
    package writer with the build folder BUILD_PATH/<version>.
    With --grammalecte, destination folders of the Grammalecte tree are also
    passed to the writers; otherwise these destinations are empty strings.
    """
    xParser = argparse.ArgumentParser()
    xParser.add_argument("-v", "--verdic", help="set dictionary version, i.e. 5.4", type=str, default="X.Y.z")
    xParser.add_argument("-m", "--mode", help="0: no tags,  1: Hunspell tags (default),  2: All tags", type=int, choices=[0, 1, 2], default=1)
    xParser.add_argument("-u", "--uncompress", help="do not use Hunspell compression", action="store_true")
    xParser.add_argument("-s", "--simplify", help="no virtual lemmas", action="store_true")
    xParser.add_argument("-gl", "--grammalecte", help="copy generated files to Grammalecte folders", action="store_true")
    xArgs = xParser.parse_args()

    if xArgs.simplify:
        # --simplify overrides --mode and --uncompress: no tags, no compression
        xArgs.mode = 0
        xArgs.uncompress = True

    sVersion = xArgs.verdic
    nMode = xArgs.mode

    echo("Python: " + sys.version)
    echo("Version: " + sVersion)
    echo("Simplify: " + str(xArgs.simplify))
    echo("Mode: " + str(nMode))
    echo("Compression: " + str(not xArgs.uncompress))

    ### build folder creation
    spBuild = BUILD_PATH + '/' + sVersion
    dir_util.mkpath(spBuild)

    ### reading of the source files and creation of the dictionary
    oFrenchDict = Dictionnaire(sVersion, "French dictionary")
    oFrenchDict.readDictionary('orthographe/FRANCAIS.dic')
    oFrenchDict.readAffixes('orthographe/FRANCAIS_7.aff')

    ### checks
    oFrenchDict.sortEntriesNatural()
    oFrenchDict.checkEntries()

    ### lexicon
    oFrenchDict.generateFlexions()
    oFrenchDict.calcMetagraphe()
    oFrenchDict.calcMetaphone2()

    # n-grams generation is currently disabled:
    #oFrenchDict.createNgrams(spBuild, 3)

    ### statistics
    spfStats = spBuild + '/' + STATS_NAME + sVersion + '.txt'
    oStatsLex = StatsLex(oFrenchDict)
    lCorpora = (
        ('lexique/corpus_data/stats_google_ngram_1.txt', 'G', 'Google 1-grams'),
        ('lexique/corpus_data/stats_frwiki.txt', 'W', 'Wikipédia'),
        ('lexique/corpus_data/stats_frwikisource.txt', 'S', 'Wikisource'),
        ('lexique/corpus_data/stats_litterature.txt', 'L', 'Littérature')
    )
    for spfCorpus, cLexID, sLexName in lCorpora:
        oStatsLex.addLexFromFile(spfCorpus, cLexID, sLexName)
    oStatsLex.write(spBuild+'/test_lex.txt')
    oFrenchDict.calculateStats(oStatsLex, spfStats)

    ### writing of the packages
    echo("Création des paquets...")

    if xArgs.grammalecte:
        spLexiconDestGL = "../../../lexicons"
        spLibreOfficeExtDestGL = "../oxt/Dictionnaires/dictionaries"
        spDataDestGL = "../data"
    else:
        spLexiconDestGL = ""
        spLibreOfficeExtDestGL = ""
        spDataDestGL = ""
    # no more Hunspell dictionaries in Mozilla extensions for now: empty in both cases
    spMozillaExtDestGL = ""

    ### dictionaries
    if not xArgs.uncompress:
        oFrenchDict.defineAbreviatedTags(nMode, spfStats)
    # a fresh list is built for each call, as each call originally received its own list
    oFrenchDict.createFiles(spBuild, [dTOUTESVAR, dCLASSIQUE, dREFORME1990], nMode, xArgs.simplify)
    oFrenchDict.createLexiconPackages(spBuild, sVersion, oStatsLex, spLexiconDestGL)
    oFrenchDict.createFileIfqForDB(spBuild)
    createThesaurusPackage(spBuild, "2.4", spLibreOfficeExtDestGL)
    oFrenchDict.createLibreOfficeExtension(spBuild, dMOZEXT, [dTOUTESVAR, dCLASSIQUE, dREFORME1990], spLibreOfficeExtDestGL)
    oFrenchDict.createMozillaExtensions(spBuild, dMOZEXT, [dTOUTESVAR, dCLASSIQUE, dREFORME1990], spMozillaExtDestGL)
    oFrenchDict.createDictConj(spBuild, spDataDestGL)
    oFrenchDict.createDictDecl(spBuild, spDataDestGL)


# Script entry point: main() runs only when this file is executed directly, not when it is imported.
if __name__ == '__main__':
    main()
Grammalecte genfrdic.py at [246965c90e]

File gc_lang/fr/dictionnaire/genfrdic.py artifact 877bd56310 part of check-in 246965c90e