#!python3
__author__ = "Olivier R."
__license__ = "MPL 2"
import os
import sys
import re
import collections
import zipfile
import math
import argparse
import tags
import shutil
from enum import Enum
from string import Template
import tags
import metagraphe
import metaphone2
import thes_build
# Dictionnaire des caractères pour le tri naturel.
# Ordre souhaitable, mais pose problème pour la recherche, car engendre des égalités de lemmes différents.
# Il faut donc travailler sur un dictionnaire trié *numériquement* et le sauvegarder selon le tri *naturel*
CHARMAP = str.maketrans({ 'à': 'a', 'À': 'A', 'â': 'a', 'Â': 'A', 'ä': 'a', 'Ä': 'A', 'å': 'a', 'Å': 'A', 'ā': 'a', 'Ā': 'A',
'ç': 'c', 'Ç': 'C',
'é': 'e', 'É': 'E', 'è': 'e', 'È': 'E', 'ê': 'e', 'Ê': 'E', 'ë': 'e', 'Ë': 'E', 'ē': 'e', 'Ē': 'E',
'î': 'i', 'Î': 'I', 'ï': 'i', 'Ï': 'I', 'ī': 'i', 'Ī': 'I',
'ñ': 'n',
'ô': 'o', 'Ô': 'O', 'ö': 'o', 'Ö': 'O', 'ō': 'o', 'Ō': 'O',
'ù': 'u', 'Ù': 'U', 'û': 'u', 'Û': 'U', 'ü': 'u', 'Ü': 'U', 'ū': 'u', 'Ū': 'U',
'ÿ': 'y',
'æ': 'ae', 'Æ': 'AE', 'œ':'oe', 'Œ': 'OE',
'-': None, '.': None, "'": None })
# Les dictionnaires
dSUBDIC = { '*': 'Commun',
'R': 'Réforme1990',
'M': 'Moderne',
'C': 'Classique',
'A': 'Annexe',
'X': 'Contributeurs' }
dCLASSIQUE = { 'name': 'DICTIONNAIRE ORTHOGRAPHIQUE FRANÇAIS “CLASSIQUE”',
'shortname': '“Classique”',
'asciiName': 'fr-classique',
'mozAsciiName': 'fr-FR-classic',
'subDicts': '*MCX',
'mozId': 'fr-dicollecte-classique',
'description': "Dictionnaire français “Classique”" }
dREFORME1990 = { 'name': 'DICTIONNAIRE ORTHOGRAPHIQUE FRANÇAIS “RÉFORME 1990”',
'shortname': '“Réforme 1990”',
'asciiName': 'fr-reforme1990',
'mozAsciiName': 'fr-FR-reform',
'subDicts': '*RX',
'mozId': 'fr-dicollecte-reforme1990',
'description': "Dictionnaire français “Réforme 1990”" }
dTOUTESVAR = { 'name': 'DICTIONNAIRE ORTHOGRAPHIQUE FRANÇAIS “TOUTES VARIANTES”',
'shortname': '“Toutes variantes”',
'asciiName': 'fr-toutesvariantes',
'mozAsciiName': 'fr-FR-classic-reform',
'subDicts': '*MCRAX',
'mozId': 'fr-dicollecte-toutesvariantes',
'description': "Dictionnaire français “Toutes variantes”" }
dMOZEXT = { 'name': 'DICTIONNAIRE ORTHOGRAPHIQUE FRANÇAIS',
'mozId': 'fr-dicollecte',
'description': "Dictionnaire orthographique de la langue française" }
BUILD_PATH = '_build'
PREFIX_DICT_PATH = 'hunspell-french-dictionaries-v'
EXT_PREFIX_OOO = 'lo-oo-ressources-linguistiques-fr-v'
EXT_PREFIX_MOZ = 'moz-hunspell-fr-v'
LEX_PREFIX = 'lexique-grammalecte-fr-v'
STATS_NAME = 'statistiques-v'
MPLHEADER = "# This Source Code Form is subject to the terms of the Mozilla Public\n" + \
"# License, v. 2.0. If a copy of the MPL was not distributed with this\n" + \
"# file, You can obtain one at http://mozilla.org/MPL/2.0/.\n\n"
def echo (obj, sep=' ', end='\n', file=sys.stdout, flush=False):
""" Print for Windows to avoid Python crashes.
Encoding depends on Windows locale. No useful standard.
Always returns True (useful for debugging)."""
if sys.platform != "win32":
print(obj, sep=sep, end=end, file=file, flush=flush)
return True
try:
print(str(obj).replace("œ", "oe"), sep=sep, end=end, file=file, flush=flush)
except:
try:
print(str(obj).translate(CHARMAP), sep=sep, end=end, file=file, flush=flush)
except:
print(str(obj).encode('ascii', 'replace').decode('ascii', 'replace'), sep=sep, end=end, file=file, flush=flush)
return True
def makeLongFlags (sFlags):
"renvoie la liste des drapeaux, créés à partir de la chaîne"
if len(sFlags) % 2 != 0:
echo(">| erreur: %s" % sFlags)
sFlags = sFlags + ' '
return [ sFlags[i:i+2] for i in range(0, len(sFlags), 2) ]
def makeNumFlags (sFlags):
return sFlags.split(',')
def makeOneCharFlags (sFlags):
return list(sFlags)
def fieldToHunspell (sFieldName, sFieldValue):
"renvoie le texte pour Hunspell de la valeur d’un champ"
sSep = ' ' + sFieldName + ':'
return sSep + sFieldValue.replace(' ', sSep)
def getListNgrams (sWord, n):
return [ sWord[i:i+n] for i in range(len(sWord)-n-1) ]
def createZipFiles (spSrc, spDst, zipFileName):
echo(' > Zip [ {} ]'.format(spSrc))
def _addDir (_spSrc, _subPath, _zipFile):
for _fileToZip in os.listdir(_spSrc):
if os.path.isdir(_spSrc+'/'+_fileToZip):
_addDir(_spSrc+'/'+_fileToZip, _fileToZip, _zipFile)
else:
zipFile.write(_spSrc+'/'+_fileToZip, _subPath+'/'+_fileToZip)
#
zipFile = zipfile.ZipFile(spDst+'/'+zipFileName, 'w', zipfile.ZIP_DEFLATED)
for fileToZip in os.listdir(spSrc):
if os.path.isdir(spSrc+'/'+fileToZip):
_addDir(spSrc+'/'+fileToZip, fileToZip, zipFileName)
else:
zipFile.write(spSrc+'/'+fileToZip, fileToZip)
zipFile.close()
def copyTemplate (spSrc, spDst, spf, dVars):
if spf.endswith('xml') or spf.endswith('rdf'):
for key in dVars:
dVars[key] = dVars[key].replace('&', '&')
xTemplate = Template( open(spSrc+'/'+spf, 'r', encoding='utf-8').read() )
open(spDst+'/'+spf, 'w', encoding='utf-8', newline="\n").write(xTemplate.safe_substitute(dVars))
def getIfq (f):
"renvoie l’indice de fréquence (un caractère)"
if f == 0: return '0'
if f < 0.00000001: return '1'
if f < 0.0000001: return '2'
if f < 0.000001: return '3'
if f < 0.00001: return '4'
if f < 0.0001: return '5'
if f < 0.001: return '6'
if f < 0.01: return '7'
if f < 0.1: return '8'
return '9'
def getVerbMultiMorph (s):
"renvoie la liste des morphologies fusionnées"
lTag = s.split()
lRes = []
for n, sTag in enumerate(lTag, 1):
if not sTag[0].isdigit():
sMorph = sTag
for sTag2 in lTag[n:]:
if sTag2[0].isdigit():
lRes.append(sMorph + " " + sTag2)
else:
break
return lRes
def readfile (spf):
"generator: returns file line by line"
if os.path.isfile(spf):
with open(spf, "r", encoding="utf-8") as hSrc:
for sLine in hSrc:
yield sLine
else:
print("# Error: file not found.")
def createFolder (sp):
"make a folder if it doesn’t exist; don’t change anything if it exists"
if not os.path.exists(sp):
os.makedirs(sp, exist_ok=True)
class Dictionnaire:
def __init__ (self, version, name):
# Dictionary
self.sName = name
self.lEntry = []
self.nEntry = 0
self.sVersion = version
# Affixes
self.sSettings = '' # enregistre tout avant la ligne # END
self.dFlags = collections.OrderedDict()
self.bShortenTags = False
self.dAM = collections.OrderedDict() # étiquettes morphologiques
self.dAF = collections.OrderedDict() # étiquettes drapeaux
# Flexions
self.lFlexions = [] # liste des flexions avec lemme, morphologie et occurrences
self.lStatsLex = []
self.nTotOccurRecognizedWords = 0
self.aFlexions = None
def readDictionary (self, spf):
"Lecture du dictionnaire"
echo('Dictionnaire << [ {} ]'.format(spf), end=' ')
for sLine in readfile(spf):
sLine = sLine.strip()
if not sLine.isdigit() and not sLine.startswith("#"):
self.lEntry.append(Entree(sLine))
self.nEntry = len(self.lEntry)
echo('- {} entrées'.format(self.nEntry))
def readAffixes (self, spf):
"Lecture du fichier des affixes"
echo("Dictionnaire << [ {} ]".format(spf))
bSettings = True
for sLine in readfile(spf):
if sLine.startswith("# END"):
bSettings = False
elif sLine.startswith("#"):
pass
elif sLine.startswith(("PFX", "SFX")):
sLine = re.sub(" *#.*$", "", sLine.rstrip(" \n"))
lElem = sLine.split()
if len(lElem) >= 4:
if lElem[1] not in self.dFlags:
# nouveau drapeau
oFlag = Flag(lElem[0], lElem[1], lElem[2])
self.dFlags[lElem[1]] = oFlag
else:
# nouvelle règle
oFlag.addAffixRule(sLine)
else:
echo(" # erreur de lecture: {}".format(sLine))
elif bSettings:
# toutes les lignes non-commentaires avant # END sont enregistrées dans self.sSettings
self.sSettings += sLine
def defineAbreviatedTags (self, nMode, spDst):
"Abrégé des étiquettes grammaticales et des drapeaux"
echo(" * Dictionnaire - compression Hunspell... ")
self.bShortenTags = True
dAF = {}
dAM = {}
for oFlag in self.dFlags.values():
for oRule in oFlag.lRules:
if oRule.flags:
dAF[oRule.flags] = dAF.get(oRule.flags, 0) + 1
sMorph = oRule.getMorph(nMode).strip()
if sMorph:
dAM[sMorph] = dAM.get(sMorph, 0) + 1
for oEntry in self.lEntry:
if oEntry.flags:
dAF[oEntry.flags] = dAF.get(oEntry.flags, 0) + 1
sMorph = oEntry.getMorph(nMode).strip()
if sMorph:
dAM[sMorph] = dAM.get(sMorph, 0) + 1
lAF = sorted(dAF.items(), key = lambda x: (x[1], x[0]), reverse=True)
lAM = sorted(dAM.items(), key = lambda x: (x[1], x[0]), reverse=True)
with open(spDst, 'a', encoding='utf-8', newline="\n") as hDst:
hDst.write("\n\nDrapeaux :\n")
for nAF, elem in enumerate(lAF, 1):
self.dAF[elem[0]] = str(nAF)
hDst.write(" > {0[1]:>8} : {0[0]}\n".format(elem))
hDst.write("\n\nMorphologies :\n")
for nAM, elem in enumerate(lAM, 1):
self.dAM[elem[0]] = str(nAM)
hDst.write(" > {0[1]:>8} : {0[0]}\n".format(elem))
def writeDictionary (self, spDst, dTplVars, nMode, bSimplified):
"Écrire le fichier dictionnaire (.dic)"
echo(' * Dictionnaire >> [ {}.dic ] ({})'.format(dTplVars['asciiName'], dTplVars['subDicts']))
nEntry = 0
for oEntry in self.lEntry:
if oEntry.di in dTplVars['subDicts'] and " " not in oEntry.lemma:
nEntry += 1
with open(spDst+'/'+dTplVars['asciiName']+'.dic', 'w', encoding='utf-8', newline="\n") as hDst:
hDst.write(str(nEntry)+"\n")
for oEntry in self.lEntry:
if oEntry.di in dTplVars['subDicts'] and re.search(r"^[\w’'-]+$", oEntry.lemma):
hDst.write(oEntry.getHunspellLine(self, nMode, bSimplified))
def writeAffixes (self, spDst, dTplVars, nMode, bSimplified):
"Écrire le fichier des affixes (.aff)"
echo(' * Dictionnaire >> [ {}.aff ]'.format(dTplVars['asciiName']))
info = "# This Source Code Form is subject to the terms of the Mozilla Public\n" + \
"# License, v. 2.0. If a copy of the MPL was not distributed with this\n" + \
"# file, You can obtain one at http://mozilla.org/MPL/2.0/.\n\n" + \
"# AFFIXES DU {} v{}\n".format(dTplVars['name'], self.sVersion) + \
"# par Olivier R. -- licence MPL 2.0\n" + \
"# Pour améliorer le dictionnaire, allez sur https://grammalecte.net/\n\n"
with open(spDst+'/'+dTplVars['asciiName']+'.aff', 'w', encoding='utf-8', newline="\n") as hDst:
hDst.write(info)
hDst.write(self.sSettings + "\n")
if self.bShortenTags:
hDst.write("AM {}\n".format(len(self.dAM)))
for item in self.dAM.items():
hDst.write("AM {}\n".format(item[0]))
hDst.write("\n")
hDst.write("AF {}\n".format(len(self.dAF)))
for item in self.dAF.items():
hDst.write("AF {}\n".format(item[0]))
hDst.write("\n")
for oFlag in self.dFlags.values():
hDst.write(oFlag.getFlag(dTplVars['subDicts'], self, nMode, bSimplified))
def sortEntriesNatural (self):
echo(' * Dictionnaire - Tri naturel des entrées...')
self.lEntry = sorted(self.lEntry, key=Entree.keyTriNat)
def sortEntriesNumerical (self):
echo(' * Dictionnaire - Tri numérique des entrées...')
self.lEntry = sorted(self.lEntry, key=Entree.keyTriNum)
def sortLexiconByFlexion (self):
echo(' * Dictionnaire - tri du lexique (par flexion)...')
self.lFlexions = sorted(self.lFlexions, key=Flexion.keyFlexion)
def sortLexiconByFreq (self):
echo(' * Dictionnaire - tri du lexique (par fréquence)...')
self.lFlexions = sorted(self.lFlexions, key=Flexion.keyFreq)
def sortLexiconByIdx (self):
echo(' * Dictionnaire - tri du lexique (par index)...')
self.lFlexions = sorted(self.lFlexions, key=Flexion.keyIdx)
def checkEntries (self):
echo(' * Dictionnaire - contrôle des entrées...')
for e in self.lEntry:
e.check()
def generateFlexions (self):
echo(' * Lexique - genèse des formes fléchies...')
for oEntry in self.lEntry:
oEntry.generateFlexions(self.dFlags)
self.lFlexions.extend(oEntry.lFlexions)
# Count flexions in multiple entries
d = {}
for oFlex in self.lFlexions:
if oFlex.sFlexion in d:
if oFlex.oEntry not in d[oFlex.sFlexion]:
d[oFlex.sFlexion].append(oFlex.oEntry)
else:
d[oFlex.sFlexion] = [oFlex.oEntry]
for oFlex in self.lFlexions:
oFlex.lMulti = list(d[oFlex.sFlexion])
oFlex.nMulti = len(oFlex.lMulti)
for oFlex in self.lFlexions:
oFlex.lMulti.remove(oFlex.oEntry)
oFlex.nMulti -= 1
def setTagsFrom (self, other):
echo(' * Dictionnaire - copie des tags...')
for i in range(self.nEntry):
for oEntry in other.lEntry:
if self.lEntry[i].lemma == oEntry.lemma and self.lEntry[i].flags == oEntry.flags:
self.lEntry[i].setTagsFrom(oEntry)
def calculateStats (self, oStatsLex, spfDst):
echo(" * Dictionnaire - calculs...")
with open(spfDst, 'w', encoding='utf-8', newline="\n") as hDst:
# Occurrences brutes des formes fléchies
echo(" comptage des occurrences...")
hDst.write(oStatsLex.getInfo())
for oFlex in self.lFlexions:
oFlex.setOccur(oStatsLex.getFlexionOccur(oFlex.sFlexion))
self.nTotOccurRecognizedWords = 0
for oFlex in self.lFlexions:
oFlex.calcOccur()
self.nTotOccurRecognizedWords += oFlex.nOccur
# Report des occurrences
echo(" report des occurrences des formes fléchies multiples...")
hDst.write("Report des occurrences des formes fléchies multiples :\n")
hDst.write(" Légende :\n")
hDst.write(" >> le nombre d’occurrences de la flexion est ramené à la moyenne.\n")
hDst.write(" +> le nombre d’occurrences de la flexion est augmenté avec le surplus d’occurrences des flexions ramenées à la moyenne.\n")
hDst.write(" %> le nombre d’occurrences de la flexion est pondéré avec le poids de la moyenne de l’entrée.\n\n")
for oEntry in self.lEntry:
oEntry.calcOccurFromFlexions()
oEntry.calcAverageKnownOccurrence()
oEntry.solveOccurMultipleFlexions(hDst, oStatsLex)
oEntry.calcOccurFromFlexions()
# Fréquences
echo(" calcul des fréquences et indices de fréquence...")
for oFlex in self.lFlexions:
oFlex.calcFreq(self.nTotOccurRecognizedWords)
for oEntry in self.lEntry:
oEntry.calcFreq(self.nTotOccurRecognizedWords)
# Entrées, statistiques
echo(" statistiques...")
hDst.write("\n\nNatures grammaticales :\n")
d = {}
for oEntry in self.lEntry:
po = re.sub("(?<=v[0-3])[itnpqrmaezx_]+", "", oEntry.po)
d[po] = d.get(po, 0) + 1
for e in sorted(d.items(), key = lambda x: (x[1], x[0]), reverse=True):
hDst.write(" * {0[1]:<15} : {0[0]}\n".format(e))
hDst.write("\n\nVentilation des entrées par indice de fréquence :\n")
d1 = {}
d2 = {}
for oEntry in self.lEntry:
d1[oEntry.fq] = d1.get(oEntry.fq, 0) + 1
d2[oEntry.fq] = d2.get(oEntry.fq, 0) + oEntry.fFreq
for k in sorted(d1.keys()):
hDst.write(" * {} : {} entrées ({:.2f} %) → {:.9f} %\n".format(k, d1[k], (d1[k]*100)/self.nEntry, d2[k]))
hDst.write("\n\nRépartition des entrées par sous-dictionnaire :\n")
d = {}
for oEntry in self.lEntry:
d[oEntry.di] = d.get(oEntry.di, 0) + 1
for sKey, nVal in d.items():
hDst.write(" * {0:<15} : {1} entrées ({2:.2f} %)\n".format(dSUBDIC[sKey], nVal, (nVal*100)/self.nEntry))
# Occurrences des lettres
echo(" occurrences des lettres...")
d = {}
for oFlex in self.lFlexions:
for c in oFlex.sFlexion:
d[c] = d.get(c, 0) + oFlex.nOccur
nTot = 0
for k in d:
nTot += d[k]
if nTot:
hDst.write("\n\nOccurrences des lettres dans le corpus :\n")
for sKey, nVal in sorted(d.items(), key = lambda x: (x[1], x[0]), reverse=True):
hDst.write(" {} : {:>16,.0f} / {:.8f} %\n".format(sKey, nVal, nVal*100/nTot))
# Mots par nombre de lettres
echo(" Nombre de lettres dans les mots...")
if not self.aFlexions:
self.aFlexions = set([e.sFlexion for e in self.lFlexions])
d = {}
for sFlex in self.aFlexions:
n = len(sFlex)
d[n] = d.get(n, 0) + 1
hDst.write("\n\nNombre de lettres dans les graphies :\n")
for sKey, nVal in sorted(d.items()):
hDst.write(" {:>2} lettres : {:>8} graphies\n".format(sKey, nVal))
hDst.write("\n\nNombre de formes fléchies : {}\n".format(len(self.lFlexions)))
hDst.write("\n\nNombre de graphies : {}\n".format(len(self.aFlexions)))
def calcMetagraphe (self):
echo(" * Lexique - Metagraphe")
for oFlex in self.lFlexions:
oFlex.calcMetagraphe()
def calcMetaphone2 (self):
echo(" * Lexique - Metaphone 2")
for oFlex in self.lFlexions:
oFlex.calcMetaphone2()
def createNgrams (self, spDest, n):
echo(" * Lexique - Ngrams " + str(n))
if n < 2:
echo("erreur: n = " + str(n))
return
dOccur = {} # ngram:n
dRefW = {} # ngram:set(idx)
dWords = {} # word:idx
for oFlex in self.lFlexions:
for sNgram in getListNgrams(oFlex.sFlexion, n):
# words list
if oFlex.sFlexion not in dWords:
dWords[oFlex.sFlexion] = len(dWords)
idx = dWords[oFlex.sFlexion]
# ngram occurrence
dOccur[sNgram] = dOccur.get(sNgram, 0) + 1
if sNgram not in dRefW:
dRefW[sNgram] = set()
# ngram word reference
dRefW[sNgram].add(idx)
with open(spDest+"/ngrams-%d.txt"%n, 'w', encoding='utf-8', newline="\n") as hDst:
for key, value in dWords.items():
hDst.write("%d: %s\n"% (value, key))
for key, value in dOccur.items():
if value > 1:
hDst.write("%s: %d -- "% (key, value))
hDst.write(str(dRefW[key]))
hDst.write("\n")
def writeLexicon (self, spfDst, version, oStatsLex):
echo(' * Lexique >> [ {} ] '.format(spfDst))
with open(spfDst, 'w', encoding='utf-8', newline="\n") as hDst:
hDst.write(MPLHEADER)
hDst.write("# Lexique des formes fléchies du français - Grammalecte v{}\n# Licence : MPL v2.0\n\n".format(version))
hDst.write(oStatsLex.getInfo())
hDst.write(Flexion.header(oStatsLex))
for oFlex in self.lFlexions:
hDst.write(oFlex.__str__(oStatsLex))
def writeGrammarCheckerLexicon (self, spfDst, version):
echo(' * Lexique simplifié >> [ {} ] '.format(spfDst))
with open(spfDst[:-4]+".lex", 'w', encoding='utf-8', newline="\n") as hDst:
hDst.write(MPLHEADER)
hDst.write("# Lexique simplifié pour Grammalecte v{}\n# Licence : MPL v2.0\n\n".format(version))
hDst.write(Flexion.simpleHeader())
for oFlex in self.lFlexions:
hDst.write(oFlex.getGrammarCheckerRepr())
def createFiles (self, spDst, lDictVars, nMode, bSimplified):
sDicName = PREFIX_DICT_PATH + self.sVersion
spDic = spDst + '/' + sDicName
createFolder(spDic)
for dVars in lDictVars:
# template vars
dVars['version'] = self.sVersion
# Dictionaries files (.dic) (.aff)
self.writeAffixes(spDic, dVars, nMode, bSimplified)
self.writeDictionary(spDic, dVars, nMode, bSimplified)
copyTemplate('orthographe', spDic, 'README_dict_fr.txt', dVars)
createZipFiles(spDic, spDst, sDicName + '.zip')
def createLibreOfficeExtension (self, spBuild, dTplVars, lDictVars, sThesVer, spDestGL=""):
# LibreOffice extension
echo(" * Dictionnaire >> extension pour LibreOffice")
dTplVars['version'] = self.sVersion
sExtensionName = EXT_PREFIX_OOO + self.sVersion
spExt = spBuild + '/' + sExtensionName
createFolder(spExt+'/META-INF')
createFolder(spExt+'/ui')
createFolder(spExt+'/dictionaries')
createFolder(spExt+'/pythonpath')
shutil.copy2('_templates/ooo/manifest.xml', spExt+'/META-INF')
shutil.copy2('_templates/ooo/DictionarySwitcher.py', spExt)
shutil.copy2('_templates/ooo/ds_strings.py', spExt+'/pythonpath')
shutil.copy2('_templates/ooo/addons.xcu', spExt+'/ui')
shutil.copy2('_templates/ooo/french_flag.png', spExt)
shutil.copy2('_templates/ooo/french_flag_16.bmp', spExt+'/ui')
copyTemplate('_templates/ooo', spExt, 'description.xml', dTplVars)
copyTemplate('_templates/ooo', spExt, 'dictionaries.xcu', dTplVars)
#shutil.copy2('_templates/ooo/dictionaries.xcu.tpl.xml', spExt)
copyTemplate('_templates/ooo', spExt, 'package-description.txt', dTplVars)
for dVars in lDictVars:
dicPath = spBuild + '/' + PREFIX_DICT_PATH + self.sVersion
shutil.copy2(dicPath+'/'+dVars['asciiName']+'.dic', spExt+'/dictionaries/'+dVars['asciiName']+'.dic')
shutil.copy2(dicPath+'/'+dVars['asciiName']+'.aff', spExt+'/dictionaries/'+dVars['asciiName']+'.aff')
copyTemplate('orthographe', spExt+'/dictionaries', 'README_dict_fr.txt', dTplVars)
# hyphenation
shutil.copy2('césures/hyph_fr.dic', spExt+'/dictionaries')
shutil.copy2('césures/hyph_fr.iso8859-1.dic', spExt+'/dictionaries')
shutil.copy2('césures/frhyph.tex', spExt+'/dictionaries')
shutil.copy2('césures/hyph-fr.tex', spExt+'/dictionaries')
shutil.copy2('césures/README_hyph_fr-3.0.txt', spExt+'/dictionaries')
shutil.copy2('césures/README_hyph_fr-2.9.txt', spExt+'/dictionaries')
# thesaurus
shutil.copy2(spBuild+"/thesaurus-v"+sThesVer+'/thes_fr.dat', spExt+"/dictionaries")
shutil.copy2(spBuild+"/thesaurus-v"+sThesVer+'/thes_fr.idx', spExt+"/dictionaries")
shutil.copy2(spBuild+"/thesaurus-v"+sThesVer+'/README_thes_fr.txt', spExt+"/dictionaries")
# zip
createZipFiles(spExt, spBuild, sExtensionName + '.oxt')
# copy to Grammalecte Project
if spDestGL:
echo(" Dictionnaires Hunspell copiés dans Grammalecte pour LibreOffice...")
shutil.copytree(spExt+'/dictionaries', spDestGL, dirs_exist_ok=True)
def createMozillaExtensions (self, spBuild, dTplVars, lDictVars, spDestGL=""):
# Mozilla extension 1
echo(" * Dictionnaire >> extension pour Mozilla")
dTplVars['version'] = self.sVersion
sExtensionName = EXT_PREFIX_MOZ + self.sVersion
spExt = spBuild + '/' + sExtensionName
createFolder(spExt+'/dictionaries')
copyTemplate('_templates/moz', spExt, 'manifest.json', dTplVars)
spDict = spBuild + '/' + PREFIX_DICT_PATH + self.sVersion
shutil.copy2(spDict+'/fr-classique.dic', spExt+'/dictionaries/fr-classic.dic')
shutil.copy2(spDict+'/fr-classique.aff', spExt+'/dictionaries/fr-classic.aff')
copyTemplate('orthographe', spExt, 'README_dict_fr.txt', dTplVars)
createZipFiles(spExt, spBuild, sExtensionName + '.xpi')
# Grammalecte
if spDestGL:
echo(" Dictionnaires Hunspell copiés dans Grammalecte pour Mozilla")
for dVars in lDictVars:
shutil.copy2(spDict+'/'+dVars['asciiName']+'.dic', spDestGL+'/'+dVars['mozAsciiName']+"/"+dVars['mozAsciiName']+'.dic')
shutil.copy2(spDict+'/'+dVars['asciiName']+'.aff', spDestGL+'/'+dVars['mozAsciiName']+"/"+dVars['mozAsciiName']+'.aff')
def createFileIfqForDB (self, spBuild):
echo(" * Dictionnaire >> indices de fréquence pour la DB...")
with open(spBuild+'/dictIdxIfq-'+self.sVersion+'.diff.txt', 'w', encoding='utf-8', newline="\n") as hDiff, \
open(spBuild+'/dictIdxIfq-'+self.sVersion+'.notes.txt', 'w', encoding='utf-8', newline="\n") as hNotes:
for oEntry in self.lEntry:
if oEntry.fq != oEntry.oldFq:
hDiff.write("{0.iD}\t{0.fq}\n".format(oEntry))
hNotes.write("{0.lemma}/{0.flags}\t{0.oldFq} > {0.fq}\n".format(oEntry))
def createLexiconPackages (self, spBuild, version, oStatsLex, spDestGL=""):
sLexName = LEX_PREFIX + version
spLex = spBuild + '/' + sLexName
createFolder(spLex)
# write lexicon
self.sortLexiconByFreq()
self.writeLexicon(spLex + '/' + sLexName + '.txt', version, oStatsLex)
self.writeGrammarCheckerLexicon(spBuild + '/' + sLexName + '.lex', version)
copyTemplate('lexique', spLex, 'README_lexique.txt', {'version': version})
# zip
createZipFiles(spLex, spBuild, sLexName + '.zip')
# copy GC lexicon to Grammalecte
if spDestGL:
shutil.copy2(spBuild + '/' + sLexName + '.lex', spDestGL + '/French.lex')
shutil.copy2('lexique/French.tagset.txt', spDestGL)
def createDictConj (self, spBuild, spDestGL=""):
echo(" * Dictionnaire >> fichier de conjugaison...")
with open(spBuild+'/dictConj.txt', 'w', encoding='utf-8', newline="\n") as hDst:
for oEntry in self.lEntry:
if oEntry.po.startswith("v"):
hDst.write(oEntry.getConjugation())
if spDestGL:
echo(" Fichier de conjugaison copié dans Grammalecte...")
shutil.copy2(spBuild+'/dictConj.txt', spDestGL)
def createDictDecl (self, spBuild, spDestGL=""):
echo(" * Dictionnaire >> fichier de déclinaison...")
with open(spBuild+'/dictDecl.txt', 'w', encoding='utf-8', newline="\n") as hDst:
for oEntry in self.lEntry:
if re.match("[SXFWIA]", oEntry.flags) and (oEntry.po.startswith("nom") or oEntry.po.startswith("adj")):
hDst.write(oEntry.getDeclination())
if spDestGL:
echo(" Fichier de déclinaison copié dans Grammalecte...")
shutil.copy2(spBuild+'/dictDecl.txt', spDestGL)
class Entree:
def __init__ (self, sLine):
self.lemma = ''
self.flags = ''
# champs morphologiques Hunspell
self.po = ''
self.iz = '' # hunspell attribute: is
self.ds = ''
self.ts = ''
self.ip = ''
self.dp = ''
self.tp = ''
self.sp = ''
self.pa = ''
self.st = ''
self.al = ''
self.ph = ''
# champs annexes
self.lx = ''
self.se = ''
self.et = ''
self.di = '*'
self.fq = ''
self.iD = '0' # hunspell attribute: id
# autres
self.comment = ''
self.err = ''
self.nFlexions = 0
self.lFlexions = []
self.sStem = ''
self.nOccur = 0
self.nAKO = -1 # Average known occurrences
self.fFreq = 0
self.oldFq = ''
sLine = sLine.rstrip(" \n")
# commentaire
if '#' in sLine:
sLine, comment = sLine.split('#', 1)
self.comment = comment.strip()
# éléments de la ligne
elems = sLine.split("\t")
nElems = len(elems)
# lemme et drapeaux
firstElems = elems[0].split('/') if elems[0] != "/" else elems[0]
self.lemma = firstElems[0]
self.flags = firstElems[1] if len(firstElems) > 1 else ''
# morph
for i in range(1, nElems):
if len(elems[i]) > 3 and elems[i][2] == ':':
sAttr, sContent = elems[i].split(':', 1)
if sAttr in {"po", "is", "ds", "ts", "ip", "dp", "tp", "sp", "pa", "st", "al", "ph", "lx", "se", "et", "di", "fq", "id"}:
# vérification
if sAttr in {"po", "is", "lx", "se", "et"} \
and ( sContent not in tags.dTags[sAttr] and not (sAttr == "po" and re.match("v[0123][ea_][ix_][tx_][nx_][pqrex_][mx_][eaz_]", sContent)) ):
echo(" ## Étiquette inconnue pour le tag <{}>: {} @ {}/{}".format(sAttr, sContent, self.lemma, self.flags))
# renommage des attributs
if sAttr == "is":
sAttr = "iz"
if sAttr == "id":
sAttr = "iD"
# modification
try:
if sAttr in {"po", "iz", "ds", "ts", "ip", "dp", "tp", "sp", "pa", "st", "al", "ph", "lx", "se", "et"}:
sContent = getattr(self, sAttr) + " " + sContent
setattr(self, sAttr, sContent.strip())
except:
echo(' ## Erreur. Attribut non attribuable: {} @ {}/{}'.format(sAttr, self.lemma, self.flags))
else:
echo(' ## Champ inconnu: {} @ {}/{}'.format(sAttr, self.lemma, self.flags))
else:
self.err = self.err + elems[i]
if self.err:
echo("\n## Erreur dans le dictionnaire : {}".format(self.err))
echo(" @ : " + self.lemma)
def __str__ (self):
return "{0.lemma}/{0.flags} {1}".format(self, self.getMorph(2))
def check (self):
sErr = ''
if self.lemma == '':
sErr += 'lemme vide'
if re.match(r"^\s", self.lemma):
sErr += 'premier caractère un espace @ <' + self.lemma + '>'
if re.search(r"\s$", self.lemma):
sErr += 'espace en fin de lemme'
if re.match(r"v[0123]", self.po) and not re.match(r"[eas_][ix_][tx_][nx_][pqreuvx_][mx_][ex_z][ax_z]\b", self.po[2:]):
sErr += 'verbe inconnu: ' + self.po
if (re.match(r"S[.]", self.flags) and re.search("[sxz]$", self.lemma)) or (re.match(r"X[.]", self.flags) and not re.search("[ul]$", self.lemma)):
sErr += 'drapeau inutile'
if self.iz == '' and re.match(r"[SXAI](?!=)", self.flags) and self.po:
sErr += '<is> vide'
if re.match(r"pl|sg|inv", self.iz):
sErr += '<is> incomplet'
if re.match(r"[FW]", self.flags) and re.search(r"epi|mas|fem|inv|sg|pl", self.iz):
sErr += '<is> incohérent'
if re.search(r"pl|sg|inv", self.iz) and re.match(r"[SXAIFW](?!=)", self.flags):
sErr += '<is> incohérent'
if self.iz.endswith(("mas", "fem", "epi")) and (not self.flags or not self.flags.startswith(("S", "X", "F", "W", "A", "I", "U"))):
sErr += '<is> incomplet'
if self.flags.startswith(("a0", "b0", "c0", "d0")) and not self.lemma.endswith("er"):
sErr += "drapeau pour verbe du 1ᵉʳ groupe sur un lemme non conforme"
if self.flags.startswith("f") and not self.lemma.endswith(("ir", "ïr")):
sErr += "drapeau pour verbe du 2ᵉ groupe sur un lemme non conforme"
if sErr:
echo(' error - id: ' + self.iD, end = "")
echo(' ' + sErr + ' in ' + self.__str__())
def setTagsFrom (self, oEnt):
self.po = oEnt.po
self.iz = oEnt.iz
self.ds = oEnt.ds
self.ts = oEnt.ts
self.ip = oEnt.ip
self.dp = oEnt.dp
self.tp = oEnt.tp
self.sp = oEnt.sp
self.pa = oEnt.pa
self.st = oEnt.st
self.al = oEnt.al
self.ph = oEnt.ph
self.lx = oEnt.lx
self.se = oEnt.se
self.et = oEnt.et
self.di = oEnt.di
self.fq = oEnt.fq
def keyTriNat (self):
return (self.lemma.translate(CHARMAP), self.flags, self.po)
def keyTriNum (self):
return (self.lemma, self.flags, self.po)
def getHunspellLine (self, oDict, nMode, bSimplified=False):
sLine = self.lemma.replace("’", "'")
if self.flags:
sLine += '/'
sLine += self.flags if not oDict.bShortenTags or bSimplified else oDict.dAF[self.flags]
if bSimplified:
return sLine.replace("()", "") + "\n"
if nMode > 0:
sMorph = self.getMorph(nMode)
if sMorph:
sLine += sMorph if not oDict.bShortenTags else "\t" + oDict.dAM[sMorph.strip()]
return sLine + "\n"
def getMorph (self, nMode):
txt = ''
if self.po: txt += fieldToHunspell('po', self.po)
if self.iz: txt += fieldToHunspell('is', self.iz)
if self.ds: txt += fieldToHunspell('ds', self.ds)
if self.ts: txt += fieldToHunspell('ts', self.ts)
if self.ip: txt += fieldToHunspell('ip', self.ip)
if self.dp: txt += fieldToHunspell('dp', self.dp)
if self.tp: txt += fieldToHunspell('tp', self.tp)
if self.sp: txt += fieldToHunspell('sp', self.sp)
if self.pa: txt += fieldToHunspell('pa', self.pa)
if self.al: txt += fieldToHunspell('al', self.al)
if self.st: txt += fieldToHunspell('st', self.st)
if self.ph: txt += fieldToHunspell('ph', self.ph)
if nMode > 1:
if self.lx: txt += fieldToHunspell('lx', self.lx)
if self.se: txt += fieldToHunspell('se', self.se)
if self.et: txt += fieldToHunspell('et', self.et)
if self.fq: txt += ' fq:' + self.fq
if self.di != '*': txt += ' di:' + self.di
return txt
def getShortDescr (self):
txt = self.lemma
if self.flags:
txt += '/' + self.flags
if self.di != '*':
txt += ' di:' + self.di
return txt
def generateFlexions (self, dFlags):
lTuples = self._flechir(dFlags)
# création des objects flexions
self.nFlexion = 0
self.lFlexions = []
sReject = ""
for sFlex, sMorph, sDic in lTuples:
if '+' not in sMorph:
sMorph = self.clean(sMorph)
if not sMorph.endswith((" mas", " fem", " epi")):
self.nFlexion += 1
self.lFlexions.append( Flexion(self, sFlex, sMorph, sDic, self.nFlexion) )
else:
#echo(sFlex + " " + sMorph + ", ")
pass
# Lemme
self.sStem = self.st if self.st else self.lemma
# Tag duplicates
d = {}
for oFlex in self.lFlexions:
d[oFlex.sFlexion] = d.get(oFlex.sFlexion, 0) + 1
for oFlex in self.lFlexions:
oFlex.nDup = d[oFlex.sFlexion]
def _flechir (self, dFlags, morph='', iPR=0):
# recursive function!
"renvoie une liste de tuples (déclinaisons, morphologie), formes fléchies du lemme"
if iPR == 2:
return []
if iPR == 0:
morph = self.lexMorph()
lFlexions = [(self.lemma, morph, self.di)] if iPR == 0 and not self.flags.endswith('()') else []
lFlexPrefix = []
lFlexSuffix = []
for sFlag in makeLongFlags(self.flags):
if sFlag not in dFlags:
if sFlag not in ['**', '()', '||', '--']:
lFlexions.append( (self.lemma, '[unknown flag: {}]'.format(sFlag), self.di) )
echo("ERROR: " + self.lemma + ' - unknown flag: ' + sFlag)
else:
oFlag = dFlags[sFlag]
if not oFlag.bSfx:
# cas des préfixes
for oRule in oFlag.lRules:
if oRule.motif.search(self.lemma):
ruleMorph = oRule.lexMorph()
if oRule.cut == '0':
flexion = (oRule.add+self.lemma, ruleMorph+morph, oRule.di)
if oFlag.bMix:
lFlexPrefix.append(flexion)
for flex in lFlexSuffix:
lFlexions.append( (oRule.add+flex[0], flex[1]+ruleMorph) )
else:
lFlexions.append(flexion)
else:
flexion = (self.lemma.replace(oRule.cut, oRule.add, 1), ruleMorph+morph, oRule.di)
if oFlag.bMix:
lFlexPrefix.append(flexion)
for flex in lFlexSuffix:
lFlexions.append( (flex[0].replace(oRule.cut, oRule.add, 1), flex[1]+ruleMorph) )
else:
lFlexions.append(flexion)
if oRule.flags != '' and oRule.flags != '**':
lFlexions.extend(Entree(flexion[0]+'/'+oRule.flags)._flechir(dFlags, flexion[1], iPR+1))
else:
# cas des suffixes
for oRule in oFlag.lRules:
if oRule.motif.search(self.lemma):
ruleMorph = oRule.lexMorph()
if not oRule.flags.endswith('**') or oRule.flags == '**':
# règle ordinaire, pas de circumfix
if oRule.cut == '0':
flexion = (self.lemma+oRule.add, morph+ruleMorph, oRule.di)
if oFlag.bMix:
lFlexSuffix.append(flexion)
for flex in lFlexPrefix:
lFlexions.append( (flex[0]+oRule.add, flex[1]+ruleMorph) )
else:
lFlexions.append(flexion)
else:
nCut = len(oRule.cut)
flexion = (self.lemma[:-nCut]+oRule.add, morph+ruleMorph, oRule.di)
if oFlag.bMix:
lFlexSuffix.append(flexion)
for flex in lFlexPrefix:
lFlexions.append( (flex[0][:-nCut]+oRule.add, flex[1]+ruleMorph) )
else:
lFlexions.append(flexion)
if oRule.flags != '' and oRule.flags != '**':
lFlexions.extend(Entree(flexion[0]+'/'+oRule.flags)._flechir(dFlags, flexion[1], iPR+1))
else:
# la règle impose un circumfix
if oRule.cut == '0':
flexion = (self.lemma+oRule.add, morph+ruleMorph, oRule.di)
else:
flexion = (self.lemma[:-len(oRule.cut)]+oRule.add, morph+ruleMorph, oRule.di)
lFlexions.extend(Entree(flexion[0]+'/'+oRule.flags)._flechir(dFlags, flexion[1], iPR+1))
lFlexions = lFlexions + lFlexPrefix + lFlexSuffix
return lFlexions
def clean (self, s):
return s.replace(' ', ' ').strip(' ')
def lexMorph (self):
# morphology for lexicon
txt = ' '
if self.po: txt += self.po + ' '
if self.iz: txt += self.iz + ' '
if self.ds: txt += self.ds + ' '
if self.ts: txt += self.ts + ' '
if self.ip: txt += self.ip + ' '
if self.dp: txt += self.dp + ' '
if self.tp: txt += self.tp + ' '
if self.sp: txt += self.sp + ' '
return txt
def getConjugation (self):
sRes = self.lemma + "\t" + self.po[1:10] + "\n"
for oFlex in self.lFlexions:
sMorph = oFlex.sMorph[11:].rstrip("!").replace("ppas adj", "ppas").replace("1jsg", "").strip()
if not sMorph.startswith("ppas") and sMorph.find(" ") > 1:
# complex tags
for s in getVerbMultiMorph(sMorph):
sRes += "_\t" + s + "\t" + oFlex.sFlexion + "\n"
else:
sRes += "_\t" + sMorph + "\t" + oFlex.sFlexion + "\n"
return sRes + "$\n"
def getDeclination (self):
sRes = self.lemma + "\t" + self.flags + "\n"
for oFlex in self.lFlexions:
if "adj" in oFlex.sMorph or "nom" in oFlex.sMorph:
sRes += "_\t" + oFlex.sMorph + "\t" + oFlex.sFlexion + "\n"
return sRes + "$\n"
def calcOccurFromFlexions (self):
self.nOccur = 0
for o in self.lFlexions:
self.nOccur += o.nOccur
def calcAverageKnownOccurrence (self):
# nous calculons la moyenne des occurrences des formes fléchies
# qui n’ont pas d’équivalent dans les autres entrées (nMulti = 0)
nOccur = 0
nFlex = 0
for oFlex in self.lFlexions:
if oFlex.nMulti == 0:
nOccur += oFlex.nOccur
nFlex += 1
# moyenne des formes fléchies sans équivalent ou -1
self.nAKO = math.ceil(nOccur / nFlex) if nFlex > 0 else -1
def solveOccurMultipleFlexions (self, hDst, oStatsLex):
sBlank = " "
if self.nAKO >= 0:
for oFlex in self.lFlexions:
if oFlex.nMulti > 0 and not oFlex.bBlocked:
# on trie les entrées avec AKO et sans AKO
lEntWithAKO = []
lEntNoAKO = []
for oEntry in oFlex.lMulti:
if oEntry.nAKO >= 0:
lEntWithAKO.append(oEntry)
else:
lEntNoAKO.append(oEntry)
if lEntNoAKO:
# on calcule la différence totale occasionnée par du passage des flexions appartenant à des entrées avec AKO au niveau AKO
nDiff = (oFlex.nOccur - self.nAKO) * oFlex.nDup
for oEntry in lEntWithAKO:
for oFlexM in oEntry.lFlexions:
if oFlex.sFlexion == oFlexM.sFlexion:
nDiff += oFlexM.nOccur - oEntry.nAKO
if nDiff > 0:
# on peut passer à les formes fléchies à AKO
hDst.write(" * {0.sFlexion}\n".format(oFlex))
hDst.write(" moyenne connue\n")
for oFlexD in self.lFlexions:
if oFlex.sFlexion == oFlexD.sFlexion:
hDst.write(sBlank + "{2:<30} {0.sMorph:<30} {0.nOccur:>10} >> {1:>10}\n".format(oFlexD, self.nAKO, self.getShortDescr()))
oFlexD.setOccurAndBlock(self.nAKO)
for oEntry in lEntWithAKO:
hDst.write(" moyenne connue\n")
for oFlexM in oEntry.lFlexions:
if oFlex.sFlexion == oFlexM.sFlexion:
hDst.write(sBlank + "{2:<30} {0.sMorph:<30} {0.nOccur:>10} >> {1:>10}\n".format(oFlexM, oEntry.nAKO, oEntry.getShortDescr()))
oFlexM.setOccurAndBlock(oEntry.nAKO)
# on répercute nDiff sur les flexions sans AKO
for oEntry in lEntNoAKO:
hDst.write(" sans moyenne connue\n")
for oFlexM in oEntry.lFlexions:
if oFlex.sFlexion == oFlexM.sFlexion:
nNewOccur = oFlexM.nOccur + math.ceil((nDiff / len(lEntNoAKO)) / oFlexM.nDup)
hDst.write(sBlank + "{2:<30} {0.sMorph:<30} {0.nOccur:>10} +> {1:>10}\n".format(oFlexM, nNewOccur, oEntry.getShortDescr()))
oFlexM.setOccurAndBlock(nNewOccur)
else:
# Toutes les entrées sont avec AKO : on pondère
nFlexOccur = oStatsLex.getFlexionOccur(oFlex.sFlexion)
nTotAKO = self.nAKO
for oEnt in oFlex.lMulti:
nTotAKO += oEnt.nAKO
hDst.write(" = {0.sFlexion}\n".format(oFlex))
hDst.write(" moyennes connues\n")
for oFlexD in self.lFlexions:
if oFlex.sFlexion == oFlexD.sFlexion:
nNewOccur = math.ceil((nFlexOccur * (self.nAKO / nTotAKO)) / oFlexD.nDup) if nTotAKO else 0
hDst.write(sBlank + "{2:<30} {0.sMorph:<30} {0.nOccur:>10} %> {1:>10}\n".format(oFlexD, nNewOccur, self.getShortDescr()))
oFlexD.setOccurAndBlock(nNewOccur)
for oEntry in oFlex.lMulti:
for oFlexM in oEntry.lFlexions:
if oFlex.sFlexion == oFlexM.sFlexion:
nNewOccur = math.ceil((nFlexOccur * (oEntry.nAKO / nTotAKO)) / oFlexM.nDup) if nTotAKO else 0
hDst.write(sBlank + "{2:<30} {0.sMorph:<30} {0.nOccur:>10} %> {1:>10}\n".format(oFlexM, nNewOccur, oEntry.getShortDescr()))
oFlexM.setOccurAndBlock(nNewOccur)
def calcFreq (self, nTot):
self.fFreq = (self.nOccur * 100) / nTot if nTot else 0
self.oldFq = self.fq
self.fq = getIfq(self.fFreq)
class Flexion:
def __init__ (self, oEntry, sFlex='', sMorph='', cDic='', nFlexId=0):
self.oEntry = oEntry
self.sFlexion = sFlex
self.sMorph = sMorph
self.cDic = cDic
self.nFlexId = nFlexId
self.nOccur = 0
self.bBlocked = False
self.nDup = 0 # duplicates in the same entry
self.nMulti = 0 # duplicates with other entries
self.lMulti = [] # list of similar flexions
self.fFreq = 0
self.cFq = ''
self.metagfx = '' # métagraphe
self.metaph2 = '' # métaphone 2
def setOccur (self, n):
self.nOccur = n
def setOccurAndBlock (self, n):
self.nOccur = n
self.bBlocked = True
def calcOccur (self):
self.nOccur = math.ceil((self.nOccur / (self.nMulti+1)) / self.nDup)
def calcFreq (self, nTot):
self.fFreq = (self.nOccur * 100) / nTot if nTot else 0
self.cFq = getIfq(self.fFreq)
def calcMetagraphe (self):
t = metagraphe.getMetagraphe(self.sFlexion, self.sMorph)
self.metagfx = t[0] if not t[1] else t[0]+"/"+t[1]
def calcMetaphone2 (self):
t = metaphone2.dm(self.sFlexion)
self.metaph2 = t[0] if not t[1] else t[0]+"/"+t[1]
@classmethod
def header (cls, oStatsLex):
sOccurs = ''
for t in oStatsLex.lLex:
sOccurs += t[1] + "\t"
return "id\tfid\tFlexion\tLemme\tÉtiquettes\tMétagraphe (β)\tMetaphone2\tNotes\tSémantique\tÉtymologie\tSous-dictionnaire\t" + sOccurs + "Total occurrences\tDoublons\tMultiples\tFréquence\tIndice de fréquence\n"
def __str__ (self, oStatsLex):
sOccurs = ''
for v in oStatsLex.dFlexions[self.sFlexion]:
sOccurs += str(v) + "\t"
return "{0.oEntry.iD}\t{0.nFlexId}\t{0.sFlexion}\t{0.oEntry.sStem}\t{0.sMorph}\t{0.metagfx}\t{0.metaph2}\t{0.oEntry.lx}\t{0.oEntry.se}\t{0.oEntry.et}\t{0.oEntry.di}{2}\t{1}{0.nOccur}\t{0.nDup}\t{0.nMulti}\t{0.fFreq:.15f}\t{0.cFq}\n".format(self, sOccurs, "/"+self.cDic if self.cDic != "*" else "")
@classmethod
def simpleHeader (cls):
return "# :POS ;LEX ~SEM =FQ /DIC\n"
def getGrammarCheckerRepr (self):
return "{0.sFlexion}\t{0.oEntry.lemma}\t{1}\n".format(self, self._getSimpleTags())
_dTagReplacement = {
# POS
"nom": ":N", "adj": ":A", "adv": ":W", "negadv": ":X", "mg": ":G", "nb": ":B", "nbro": ":Br",
"loc.nom": ":ÉN", "loc.adj": ":ÉA", "loc.adv": ":ÉW",
"interj": ":J", "loc.interj": ":ÉJ", "titr": ":T",
"mas": ":m", "fem": ":f", "epi": ":e", "sg": ":s", "pl": ":p", "inv": ":i",
"infi": ":Y",
"ppre": ":P", "ppas": ":Q",
"ipre": ":Ip", "iimp": ":Iq", "ipsi": ":Is", "ifut": ":If",
"spre": ":Sp", "simp": ":Sq", "cond": ":K", "impe": ":E",
"1sg": ":1s", "1isg": ":1ś", "1jsg": ":1ŝ", "2sg": ":2s", "3sg": ":3s", "1pl": ":1p", "2pl": ":2p", "3pl": ":3p", "3pl!": ":3p!",
"prepv": ":Rv", "prep": ":R", "loc.prep": ":ÉR", "loc.prepv": ":ÉRv",
"detpos": ":Dp", "detdem": ":Dd", "detind": ":Di", "detneg": ":Dn", "detex": ":De", "det": ":D",
"advint": ":U",
"prodem": ":Od", "proind": ":Oi", "proint": ":Ot", "proneg": ":On", "prorel": ":Or", "proadv": ":Ow",
"properobj": ":Oo", "propersuj": ":Os", "1pe": ":O1", "2pe": ":O2", "3pe": ":O3", "preverb": ":Ov",
"cjco": ":Cc", "cjsub": ":Cs", "cj": ":C", "loc.cj": ":ÉC", "loc.cjsub": ":ÉCs",
"prn": ":M1", "patr": ":M2", "loc.patr": ":ÉM2", "npr": ":MP", "nompr": ":NM",
"pfx": ":Zp", "sfx": ":Zs",
"div": ":H",
"err": ":F",
"ponc": ":@p", "sign": ":@s",
# LEX
"symb": ";S", "unit": ";U", "col": ";C", "hm": ";É", "pel": ";é",
# SEM
"gent": "#G", "lang": "#L", "cité": "#C", "pays": "#P"
}
def _getSimpleTags (self):
s = ""
# POS
for sTag in self.sMorph.split():
if sTag.startswith("v"):
s += ":V" + sTag[1:]
else:
if sTag in self._dTagReplacement:
s += self._dTagReplacement[sTag]
else:
echo(" # unknown tag: " + sTag + " on: " + self.oEntry.lemma)
# LEX
for sTag in self.oEntry.lx.split():
if sTag in self._dTagReplacement:
s += self._dTagReplacement[sTag]
# SEM
for sTag in self.oEntry.se.split():
if sTag in self._dTagReplacement:
s += self._dTagReplacement[sTag]
#s += "~" + self.oEntry.se if self.oEntry.se and self.oEntry.se != "@" else ""
# ETY
#s += "<" + self.oEntry.et if self.oEntry.et and self.oEntry.et != "@" else ""
# IFQ
#s += "=" + self.cFq
# DIC
if self.oEntry.di == "*" and self.cDic != "*":
s += "/" + self.cDic
else:
s += "/" + self.oEntry.di
return s
def keyTriNat (self):
return (self.sFlexion.translate(CHARMAP), self.sMorph)
def keyFreq (self):
return (100-self.fFreq, self.oEntry.sStem, self.sFlexion)
def keyOcc (self):
return (self.nOccur, self.oEntry.sStem, self.sFlexion)
def keyIdx (self):
return self.oEntry.iD
def keyFlexion (self):
return self.sFlexion
class Flag:
def __init__ (self, sFlagType, sFlagName, sMix):
self.sFlagName = sFlagName
self.bSfx = True if sFlagType == 'SFX' else False
self.bMix = True if sMix == 'Y' else False
self.lRules = []
self.nRules = 0
self.nOccur = 0
def addAffixRule (self, line):
"ajoute une règle au drapeau"
oRule = AffixRule(line)
self.lRules.append(oRule)
self.nRules += 1
def getFlag (self, subDicts, oDict, nMode, bSimplified):
nRules = 0
sRules = ''
for oRule in self.lRules:
if oRule.di in subDicts:
if not (bSimplified and oRule.isReplicationRule()):
sRules += oRule.getRuleLine(oDict, nMode, bSimplified)
nRules += 1
if nRules:
txt = "\n"
txt += 'SFX' if self.bSfx else 'PFX'
txt += ' ' + self.sFlagName + ' '
txt += 'Y' if self.bMix else 'N'
txt += ' ' + str(nRules) + "\n"
txt += sRules
return txt
else:
return ''
class AffixRule:
def __init__ (self, sLine):
self.sFlagName = ''
self.bSfx = True
self.comment = ''
# Règle
self.cut = ''
self.add = ''
self.flags = ''
self.cond = ''
self.motif = ''
# champs morphologiques de Hunspell
self.po = ''
self.iz = ''
self.ds = ''
self.ts = ''
self.ip = ''
self.dp = ''
self.tp = ''
self.sp = ''
self.pa = ''
self.ph = ''
# champs de Dicollecte
self.lx = ''
self.di = '*'
# erreurs
self.err = ''
# autres champs
self.nOccur = 0
sLine = sLine.rstrip(" \n")
# commentaire
if '#' in sLine:
sLine, comment = sLine.split('#', 1)
self.comment = comment.strip()
# éléments de la ligne
elems = sLine.split()
nElems = len(elems)
# type et nom
self.bSfx = True if elems[0] == "SFX" else False
self.sFlagName = elems[1]
# lemme et drapeaux
self.cut = elems[2]
if '/' in elems[3]:
self.add, self.flags = elems[3].split('/')
else:
self.add = elems[3]
self.flags = ''
if self.add == '0':
self.add = ''
self.cond = elems[4]
try:
self.motif = re.compile(self.cond+'$') if self.bSfx else re.compile('^'+self.cond)
except:
echo("error:"+self.cond)
# morph
for i in range(5, nElems):
if len(elems[i]) > 3 and elems[i][2] == ':':
fields = elems[i].split(':',1)
if fields[0] == 'po':
self.po = fields[1] if self.po == '' else self.po + ' ' + fields[1]
elif fields[0] == 'is':
self.iz = fields[1] if self.iz == '' else self.iz + ' ' + fields[1]
elif fields[0] == 'ds':
self.ds = fields[1] if self.ds == '' else self.ds + ' ' + fields[1]
elif fields[0] == 'ts':
self.ts = fields[1] if self.ts == '' else self.ts + ' ' + fields[1]
elif fields[0] == 'ip':
self.ip = fields[1] if self.ip == '' else self.ip + ' ' + fields[1]
elif fields[0] == 'dp':
self.dp = fields[1] if self.dp == '' else self.dp + ' ' + fields[1]
elif fields[0] == 'tp':
self.tp = fields[1] if self.tp == '' else self.tp + ' ' + fields[1]
elif fields[0] == 'sp':
self.sp = fields[1] if self.sp == '' else self.sp + ' ' + fields[1]
elif fields[0] == 'pa':
self.pa = fields[1] if self.pa == '' else self.pa + ' ' + fields[1]
elif fields[0] == 'ph':
self.ph = fields[1] if self.pa == '' else self.pa + ' ' + fields[1]
elif fields[0] == 'lx':
self.lx = fields[1] if self.lx == '' else self.lx + ' ' + fields[1]
elif fields[0] == 'di':
self.di = fields[1]
else:
echo('Champ inconnu: {} @ {}'.format(fields[0], self.sFlagName))
else:
echo(" # Erreur affixe : {}".format(line))
def isReplicationRule (self):
"is this rule used for replication of a virtual lemma"
return self.flags == "" and ((self.cut == "0" and self.add == "") or self.cut == self.add)
def getRuleLine (self, oDict, nMode, bSimplified=False):
sLine = 'SFX' if self.bSfx else 'PFX'
sLine += ' ' + self.sFlagName + ' ' + self.cut + ' '
sLine += self.add if self.add else '0'
if self.flags != '':
sLine += '/'
sLine += self.flags if not oDict.bShortenTags or bSimplified else oDict.dAF[self.flags]
if bSimplified:
sLine = sLine.replace("()", "")
sLine += ' ' + self.cond
if not bSimplified and nMode > 0:
sMorph = self.getMorph(nMode)
if sMorph:
sLine += sMorph if not oDict.bShortenTags or bSimplified else ' ' + oDict.dAM[sMorph.strip()]
return sLine + "\n"
def getMorph (self, nMode):
# morphology for Hunspell
txt = ''
if self.po: txt += fieldToHunspell('po', self.po)
if self.iz: txt += fieldToHunspell('is', self.iz)
if self.ds: txt += fieldToHunspell('ds', self.ds)
if self.ts: txt += fieldToHunspell('ts', self.ts)
if self.ip: txt += fieldToHunspell('ip', self.ip)
if self.dp: txt += fieldToHunspell('dp', self.dp)
if self.tp: txt += fieldToHunspell('tp', self.tp)
if self.sp: txt += fieldToHunspell('sp', self.sp)
if self.pa: txt += fieldToHunspell('pa', self.pa)
if self.ph: txt += fieldToHunspell('ph', self.ph)
if nMode > 1:
if self.lx: txt += fieldToHunspell('lx', self.lx)
if self.di != '*': txt += ' di:' + self.di
return txt
def lexMorph (self):
# morphology for lexicon
txt = ' '
if self.po: txt += self.po + ' '
if self.iz: txt += self.iz + ' '
if self.ds: txt += self.ds + ' '
if self.ts: txt += self.ts + ' '
if self.ip: txt += self.ip + ' '
if self.dp: txt += self.dp + ' '
if self.tp: txt += self.tp + ' '
if self.sp: txt += self.sp + ' '
return txt
class StatsLex:
def __init__ (self, oDict):
echo("Lexique statistique")
self.dFlexions = { oFlex.sFlexion: [] for oFlex in oDict.lFlexions }
self.lLex = []
def addLexFromFile (self, sPathFile, cLexID, sLexName):
if not os.path.isfile(sPathFile):
echo(' * Lexique statistique - fichier {} introuvable'.format(sPathFile))
return None
if len(cLexID) != 1:
echo(' * Lexique statistique - fichier {} - identifiant incorrect, 1 caractère requis'.format(sPathFile))
return None
echo(" * Lexique statistique << [ {} ]".format(sPathFile))
nTotKnownOccur = 0
nTotOccur = 0
for sLine in readfile(sPathFile):
sWord, sVal = sLine.rstrip().split()
n = int(sVal)
if sWord in self.dFlexions:
self.dFlexions[sWord].append(n)
nTotKnownOccur += n
nTotOccur += n
self.lLex.append((cLexID, sLexName, nTotKnownOccur, nTotOccur))
# we fill gaps
nLex = len(self.lLex)
for sFlex in self.dFlexions:
if len(self.dFlexions[sFlex]) < nLex:
self.dFlexions[sFlex].append(0)
def getFlexionOccur (self, sFlex):
return sum(self.dFlexions[sFlex])
def getInfo (self):
nKnownTot = 0
nTot = 0
s = "Corpus :\n"
for t in self.lLex:
s += " * {:<20} -> {:>18,} mots reconnus / {:>18,}\n".format(t[1], t[2], t[3])
nKnownTot += t[2]
nTot += t[3]
s += "\n * {:<20} -> {:>18,} mots reconnus / {:>18,}\n\n".format('TOTAL', nKnownTot, nTot)
return s
def write (self, sPathFile):
with open(sPathFile, 'w', encoding='utf-8', newline="\n") as hDst:
for t in self.lLex:
hDst.write(str(t)+"\n")
for e in self.dFlexions.items():
hDst.write("{} - {}\n".format(e[0], e[1]))
def createThesaurusPackage (spBuild, sVersion, spCopy="", spDataDestGL=""):
print(" * Création du thésaurus")
spThesaurus = spBuild+"/thesaurus-v"+sVersion
createFolder(spThesaurus)
thes_build.build("thesaurus/thes_fr.dat", "thesaurus/synsets_fr.dat", spThesaurus)
shutil.copy2('thesaurus/README_thes_fr.txt', spThesaurus)
if spCopy:
# copy in libreoffice extension package
print(" Copie du thésaurus dans:", spCopy)
shutil.copy2(spThesaurus+'/thes_fr.dat', spCopy)
shutil.copy2(spThesaurus+'/thes_fr.idx', spCopy)
shutil.copy2(spThesaurus+'/README_thes_fr.txt', spCopy)
if spDataDestGL:
# copy in data source folder of Grammalecte
shutil.copy2(spThesaurus+'/thes_fr.json', spDataDestGL)
def main ():
xParser = argparse.ArgumentParser()
xParser.add_argument("-v", "--verdic", help="set dictionary version, i.e. 5.4", type=str, default="X.Y.z")
xParser.add_argument("-m", "--mode", help="0: no tags, 1: Hunspell tags (default), 2: All tags", type=int, choices=[0, 1, 2], default=1)
xParser.add_argument("-u", "--uncompress", help="do not use Hunspell compression", action="store_true")
xParser.add_argument("-s", "--simplify", help="no virtual lemmas", action="store_true")
xParser.add_argument("-gl", "--grammalecte", help="copy generated files to Grammalecte folders", action="store_true")
xArgs = xParser.parse_args()
if xArgs.simplify:
xArgs.mode = 0
xArgs.uncompress = True
echo("Python: " + sys.version)
echo("Version: " + xArgs.verdic)
echo("Simplify: " + str(xArgs.simplify))
echo("Mode: " + str(xArgs.mode))
echo("Compression: " + str(not(xArgs.uncompress)))
### création du répertoire
spBuild = BUILD_PATH + '/' + xArgs.verdic
createFolder(spBuild)
### Lecture des fichiers et création du dictionnaire
oFrenchDict = Dictionnaire(xArgs.verdic, "French dictionary")
for sFile in ['orthographe/FRANCAIS.dic']:
oFrenchDict.readDictionary(sFile)
oFrenchDict.readAffixes('orthographe/FRANCAIS_7.aff')
### Contrôle
oFrenchDict.sortEntriesNatural()
oFrenchDict.checkEntries()
### Lexique
oFrenchDict.generateFlexions()
oFrenchDict.calcMetagraphe()
oFrenchDict.calcMetaphone2()
#oFrenchDict.createNgrams(spBuild, 3)
### Statistiques
spfStats = spBuild+'/'+STATS_NAME+xArgs.verdic+'.txt'
oStatsLex = StatsLex(oFrenchDict)
oStatsLex.addLexFromFile('lexique/corpus_data/stats_google_ngram_1.txt', 'G', 'Google 1-grams')
oStatsLex.addLexFromFile('lexique/corpus_data/stats_frwiki.txt', 'W', 'Wikipédia')
oStatsLex.addLexFromFile('lexique/corpus_data/stats_frwikisource.txt', 'S', 'Wikisource')
oStatsLex.addLexFromFile('lexique/corpus_data/stats_litterature.txt', 'L', 'Littérature')
oStatsLex.write(spBuild+'/test_lex.txt')
oFrenchDict.calculateStats(oStatsLex, spfStats)
### Écriture des paquets
echo("Création des paquets...")
sThesaurusVersion = "3.0"
spLexiconDestGL = "../../../lexicons" if xArgs.grammalecte else ""
spLibreOfficeExtDestGL = "../oxt/Dictionnaires/dictionaries" if xArgs.grammalecte else ""
spMozillaExtDestGL = "" if xArgs.grammalecte else "" # no more Hunspell dictionaries in Mozilla extensions for now
spDataDestGL = "../data" if xArgs.grammalecte else ""
### dictionnaires
if not xArgs.uncompress:
oFrenchDict.defineAbreviatedTags(xArgs.mode, spfStats)
oFrenchDict.createFiles(spBuild, [dTOUTESVAR, dCLASSIQUE, dREFORME1990], xArgs.mode, xArgs.simplify)
oFrenchDict.createLexiconPackages(spBuild, xArgs.verdic, oStatsLex, spLexiconDestGL)
oFrenchDict.createFileIfqForDB(spBuild)
createThesaurusPackage(spBuild, sThesaurusVersion, spLibreOfficeExtDestGL, spDataDestGL)
oFrenchDict.createLibreOfficeExtension(spBuild, dMOZEXT, [dTOUTESVAR, dCLASSIQUE, dREFORME1990], sThesaurusVersion, spLibreOfficeExtDestGL)
oFrenchDict.createMozillaExtensions(spBuild, dMOZEXT, [dTOUTESVAR, dCLASSIQUE, dREFORME1990], spMozillaExtDestGL)
oFrenchDict.createDictConj(spBuild, spDataDestGL)
oFrenchDict.createDictDecl(spBuild, spDataDestGL)
if __name__ == '__main__':
main()