Index: gc_lang/fr/dictionnaire/genfrdic.py ================================================================== --- gc_lang/fr/dictionnaire/genfrdic.py +++ gc_lang/fr/dictionnaire/genfrdic.py @@ -23,11 +23,11 @@ import metaphone2 # Dictionnaire des caractères pour le tri naturel. # Ordre souhaitable, mais pose problème pour la recherche, car engendre des égalités de lemmes différents. -# Il faut donc travailler sur un dictionnaire trié *numériquement* et le sauvegarder selon le tri *naturel* +# Il faut donc travailler sur un dictionnaire trié *numériquement* et le sauvegarder selon le tri *naturel* CHARMAP = str.maketrans({ 'à': 'a', 'À': 'A', 'â': 'a', 'Â': 'A', 'ä': 'a', 'Ä': 'A', 'å': 'a', 'Å': 'A', 'ā': 'a', 'Ā': 'A', 'ç': 'c', 'Ç': 'C', 'é': 'e', 'É': 'E', 'è': 'e', 'È': 'E', 'ê': 'e', 'Ê': 'E', 'ë': 'e', 'Ë': 'E', 'ē': 'e', 'Ē': 'E', 'î': 'i', 'Î': 'I', 'ï': 'i', 'Ï': 'I', 'ī': 'i', 'Ī': 'I', 'ñ': 'n', @@ -217,15 +217,15 @@ self.dFlags = collections.OrderedDict() self.bShortenTags = False self.dAM = collections.OrderedDict() # étiquettes morphologiques self.dAF = collections.OrderedDict() # étiquettes drapeaux # Flexions - self.lFlexions = [] # liste des flexions avec lemme, morphologie et occurrences + self.lFlexions = [] # liste des flexions avec lemme, morphologie et occurrences self.lStatsLex = [] self.nTotOccurRecognizedWords = 0 self.aFlexions = None - + def readDictionary (self, spf): "Lecture du dictionnaire" echo('Dictionnaire << [ {} ]'.format(spf), end=' ') for sLine in readfile(spf): sLine = sLine.strip() @@ -280,11 +280,11 @@ if sMorph: dAM[sMorph] = dAM.get(sMorph, 0) + 1 lAF = sorted(dAF.items(), key = lambda x: (x[1], x[0]), reverse=True) lAM = sorted(dAM.items(), key = lambda x: (x[1], x[0]), reverse=True) - + with open(spDst, 'a', encoding='utf-8', newline="\n") as hDst: hDst.write("\n\nDrapeaux :\n") for nAF, elem in enumerate(lAF, 1): self.dAF[elem[0]] = str(nAF) hDst.write(" > {0[1]:>8} : {0[0]}\n".format(elem)) @@ -303,11 +303,11 @@ with open(spDst+'/'+dTplVars['asciiName']+'.dic', 'w', encoding='utf-8', newline="\n") as hDst: hDst.write(str(nEntry)+"\n") for oEntry in self.lEntry: if oEntry.di in dTplVars['subDicts']: hDst.write(oEntry.getEntryLine(self, nMode, bSimplified)) - + def writeAffixes (self, spDst, dTplVars, nMode, bSimplified): "Écrire le fichier des affixes (.aff)" echo(' * Dictionnaire >> [ {}.aff ]'.format(dTplVars['asciiName'])) info = "# This Source Code Form is subject to the terms of the Mozilla Public\n" + \ "# License, v. 2.0. If a copy of the MPL was not distributed with this\n" + \ @@ -314,11 +314,11 @@ "# file, You can obtain one at http://mozilla.org/MPL/2.0/.\n\n" + \ "# AFFIXES DU {} v{}\n".format(dTplVars['name'], self.sVersion) + \ "# par Olivier R. -- licence MPL 2.0\n" + \ "# Généré le " + time.strftime("%d-%m-%Y à %H:%M") + "\n" \ "# Pour améliorer le dictionnaire, allez sur http://www.dicollecte.org/\n\n" - + with open(spDst+'/'+dTplVars['asciiName']+'.aff', 'w', encoding='utf-8', newline="\n") as hDst: hDst.write(info) hDst.write(self.sSettings + "\n") if self.bShortenTags: hDst.write("AM {}\n".format(len(self.dAM))) @@ -336,11 +336,11 @@ echo(' * Dictionnaire - Tri naturel des entrées...') self.lEntry = sorted(self.lEntry, key=Entree.keyTriNat) def sortEntriesNumerical (self): echo(' * Dictionnaire - Tri numérique des entrées...') - self.lEntry = sorted(self.lEntry, key=Entree.keyTriNum) + self.lEntry = sorted(self.lEntry, key=Entree.keyTriNum) def sortLexiconByFlexion (self): echo(' * Dictionnaire - tri du lexique (par flexion)...') self.lFlexions = sorted(self.lFlexions, key=Flexion.keyFlexion) @@ -374,11 +374,11 @@ oFlex.lMulti = list(d[oFlex.sFlexion]) oFlex.nMulti = len(oFlex.lMulti) for oFlex in self.lFlexions: oFlex.lMulti.remove(oFlex.oEntry) oFlex.nMulti -= 1 - + def setTagsFrom (self, other): echo(' * Dictionnaire - copie des tags...') for i in range(self.nEntry): for oEntry in other.lEntry: if self.lEntry[i].lemma == oEntry.lemma and self.lEntry[i].flags == oEntry.flags: @@ -394,11 +394,11 @@ oFlex.setOccur(oStatsLex.getFlexionOccur(oFlex.sFlexion)) self.nTotOccurRecognizedWords = 0 for oFlex in self.lFlexions: oFlex.calcOccur() self.nTotOccurRecognizedWords += oFlex.nOccur - + # Report des occurrences echo(" report des occurrences des formes fléchies multiples...") hDst.write("Report des occurrences des formes fléchies multiples :\n") hDst.write(" Légende :\n") hDst.write(" >> le nombre d’occurrences de la flexion est ramené à la moyenne.\n") @@ -408,44 +408,44 @@ for oEntry in self.lEntry: oEntry.calcOccurFromFlexions() oEntry.calcAverageKnownOccurrence() oEntry.solveOccurMultipleFlexions(hDst, oStatsLex) oEntry.calcOccurFromFlexions() - + # Fréquences echo(" calcul des fréquences et indices de fréquence...") for oFlex in self.lFlexions: oFlex.calcFreq(self.nTotOccurRecognizedWords) for oEntry in self.lEntry: oEntry.calcFreq(self.nTotOccurRecognizedWords) - + # Entrées, statistiques echo(" statistiques...") hDst.write("\n\nNatures grammaticales :\n") d = {} for oEntry in self.lEntry: po = re.sub("(?<=v[0-3])[itnpqrmaezx_]+", "", oEntry.po) d[po] = d.get(po, 0) + 1 for e in sorted(d.items(), key = lambda x: (x[1], x[0]), reverse=True): hDst.write(" * {0[1]:<15} : {0[0]}\n".format(e)) - + hDst.write("\n\nVentilation des entrées par indice de fréquence :\n") d1 = {} d2 = {} for oEntry in self.lEntry: d1[oEntry.fq] = d1.get(oEntry.fq, 0) + 1 d2[oEntry.fq] = d2.get(oEntry.fq, 0) + oEntry.fFreq for k in sorted(d1.keys()): hDst.write(" * {} : {} entrées ({:.2f} %) → {:.9f} %\n".format(k, d1[k], (d1[k]*100)/self.nEntry, d2[k])) - + hDst.write("\n\nRépartition des entrées par sous-dictionnaire :\n") d = {} for oEntry in self.lEntry: d[oEntry.di] = d.get(oEntry.di, 0) + 1 for sKey, nVal in d.items(): hDst.write(" * {0:<15} : {1} entrées ({2:.2f} %)\n".format(dSUBDIC[sKey], nVal, (nVal*100)/self.nEntry)) - + # Occurrences des lettres echo(" occurrences des lettres...") d = {} for oFlex in self.lFlexions: for c in oFlex.sFlexion: @@ -474,16 +474,16 @@ def calcMetagraphe (self): echo(" * Lexique - Metagraphe") for oFlex in self.lFlexions: oFlex.calcMetagraphe() - + def calcMetaphone2 (self): echo(" * Lexique - Metaphone 2") for oFlex in self.lFlexions: oFlex.calcMetaphone2() - + def createNgrams (self, spDest, n): echo(" * Lexique - Ngrams " + str(n)) if n < 2: echo("erreur: n = " + str(n)) return @@ -562,11 +562,11 @@ copyTemplate('_templates/ooo', spExt, 'description.xml', dTplVars) copyTemplate('_templates/ooo', spExt, 'dictionaries.xcu', dTplVars) #file_util.copy_file('_templates/ooo/dictionaries.xcu.tpl.xml', spExt) copyTemplate('_templates/ooo', spExt, 'package-description.txt', dTplVars) for dVars in lDictVars: - dicPath = spBuild + '/' + PREFIX_DICT_PATH + self.sVersion + dicPath = spBuild + '/' + PREFIX_DICT_PATH + self.sVersion file_util.copy_file(dicPath+'/'+dVars['asciiName']+'.dic', spExt+'/dictionaries/'+dVars['asciiName']+'.dic') file_util.copy_file(dicPath+'/'+dVars['asciiName']+'.aff', spExt+'/dictionaries/'+dVars['asciiName']+'.aff') copyTemplate('orthographe', spExt+'/dictionaries', 'README_dict_fr.txt', dTplVars) # thesaurus file_util.copy_file('thesaurus/thes_fr.dat', spExt+'/dictionaries') @@ -583,11 +583,11 @@ createZipFiles(spExt, spBuild, sExtensionName + '.oxt') # copy to Grammalecte Project if spDestGL: echo(" extension copiée dans Grammalecte...") dir_util.copy_tree(spExt+'/dictionaries', spDestGL) - + def createMozillaExtensions (self, spBuild, dTplVars, lDictVars, spDestGL=""): # Mozilla extension 1 echo(" * Dictionnaire >> extension pour Mozilla") dTplVars['version'] = self.sVersion sExtensionName = EXT_PREFIX_MOZ + self.sVersion @@ -603,20 +603,20 @@ if spDestGL: echo(" * Dictionnaire >> copie des dicos dans Grammalecte") for dVars in lDictVars: file_util.copy_file(spDict+'/'+dVars['asciiName']+'.dic', spDestGL+'/'+dVars['mozAsciiName']+"/"+dVars['mozAsciiName']+'.dic') file_util.copy_file(spDict+'/'+dVars['asciiName']+'.aff', spDestGL+'/'+dVars['mozAsciiName']+"/"+dVars['mozAsciiName']+'.aff') - + def createFileIfqForDB (self, spBuild): echo(" * Dictionnaire >> indices de fréquence pour la DB...") with open(spBuild+'/dictIdxIfq-'+self.sVersion+'.diff.txt', 'w', encoding='utf-8', newline="\n") as hDiff, \ open(spBuild+'/dictIdxIfq-'+self.sVersion+'.notes.txt', 'w', encoding='utf-8', newline="\n") as hNotes: for oEntry in self.lEntry: if oEntry.fq != oEntry.oldFq: hDiff.write("{0.iD}\t{0.fq}\n".format(oEntry)) hNotes.write("{0.lemma}/{0.flags}\t{0.oldFq} > {0.fq}\n".format(oEntry)) - + def createLexiconPackages (self, spBuild, version, oStatsLex, spDestGL=""): sLexName = LEX_PREFIX + version spLex = spBuild + '/' + sLexName dir_util.mkpath(spLex) # write Dicollecte lexicon @@ -739,11 +739,11 @@ self.sRadical = '' self.nOccur = 0 self.nAKO = -1 # Average known occurrences self.fFreq = 0 self.oldFq = '' - + sLine = sLine.rstrip(" \n") # commentaire if '#' in sLine: sLine, comment = sLine.split('#', 1) self.comment = comment.strip() @@ -799,11 +799,11 @@ else: self.err = self.err + elems[i] if self.err: echo("\n## Erreur dans le dictionnaire : {}".format(self.err)) echo(" dans : " + self.lemma) - + def __str__ (self): return "{0.lemma}/{0.flags} {1}".format(self, self.getMorph(2)) def check (self): sErr = '' @@ -865,11 +865,11 @@ return (self.lemma.translate(CHARMAP), self.flags, self.po) def keyTriNum (self): return (self.lemma, self.flags, self.po) - def getEntryLine (self, oDict, nMode, bSimplified=False): + def getEntryLine (self, oDict, nMode, bSimplified=False): sLine = self.lemma if self.flags: sLine += '/' sLine += self.flags if not oDict.bShortenTags or bSimplified else oDict.dAF[self.flags] if bSimplified: @@ -928,11 +928,11 @@ # Drapeaux dont le lemme féminin doit être remplacé par le masculin dans la gestion des formes fléchies if self.flags.startswith(("F.", "F*", "W.", "W*")): # recherche de la forme masculine for t in lTuples: sMorph = self.clean(t[1]) - if sMorph.endswith('mas') or sMorph.endswith('mas sg') or sMorph.endswith('mas inv'): + if sMorph.endswith('mas') or sMorph.endswith('mas sg') or sMorph.endswith('mas inv'): self.sRadical = t[0] else: self.sRadical = self.lemma # Tag duplicates d = {} @@ -973,11 +973,11 @@ lFlexions.append(flexion) else: flexion = (self.lemma.replace(oRule.cut, oRule.add, 1), ruleMorph+morph, oRule.di) if oFlag.bMix: lFlexPrefix.append(flexion) - for flex in lFlexSuffix: + for flex in lFlexSuffix: lFlexions.append( (flex[0].replace(oRule.cut, oRule.add, 1), flex[1]+ruleMorph) ) else: lFlexions.append(flexion) if oRule.flags != '' and oRule.flags != '**': lFlexions.extend(Entree(flexion[0]+'/'+oRule.flags)._flechir(dFlags, flexion[1], iPR+1)) @@ -1060,20 +1060,20 @@ for o in self.lFlexions: self.nOccur += o.nOccur def calcAverageKnownOccurrence (self): # nous calculons la moyenne des occurrences des formes fléchies - # qui n’ont pas d’équivalent dans les autres entrées (nMulti = 0) + # qui n’ont pas d’équivalent dans les autres entrées (nMulti = 0) nOccur = 0 nFlex = 0 for oFlex in self.lFlexions: if oFlex.nMulti == 0: nOccur += oFlex.nOccur nFlex += 1 # moyenne des formes fléchies sans équivalent ou -1 self.nAKO = math.ceil(nOccur / nFlex) if nFlex > 0 else -1 - + def solveOccurMultipleFlexions (self, hDst, oStatsLex): sBlank = " " if self.nAKO >= 0: for oFlex in self.lFlexions: if oFlex.nMulti > 0 and not oFlex.bBlocked: @@ -1083,11 +1083,11 @@ for oEntry in oFlex.lMulti: if oEntry.nAKO >= 0: lEntWithAKO.append(oEntry) else: lEntNoAKO.append(oEntry) - + if lEntNoAKO: # on calcule la différence totale occasionnée par du passage des flexions appartenant à des entrées avec AKO au niveau AKO nDiff = (oFlex.nOccur - self.nAKO) * oFlex.nDup for oEntry in lEntWithAKO: for oFlexM in oEntry.lFlexions: @@ -1119,11 +1119,11 @@ # Toutes les entrées sont avec AKO : on pondère nFlexOccur = oStatsLex.getFlexionOccur(oFlex.sFlexion) nTotAKO = self.nAKO for oEnt in oFlex.lMulti: nTotAKO += oEnt.nAKO - + hDst.write(" = {0.sFlexion}\n".format(oFlex)) hDst.write(" moyennes connues\n") for oFlexD in self.lFlexions: if oFlex.sFlexion == oFlexD.sFlexion: nNewOccur = math.ceil((nFlexOccur * (self.nAKO / nTotAKO)) / oFlexD.nDup) if nTotAKO else 0 @@ -1133,11 +1133,11 @@ for oFlexM in oEntry.lFlexions: if oFlex.sFlexion == oFlexM.sFlexion: nNewOccur = math.ceil((nFlexOccur * (oEntry.nAKO / nTotAKO)) / oFlexM.nDup) if nTotAKO else 0 hDst.write(sBlank + "{2:<30} {0.sMorph:<30} {0.nOccur:>10} %> {1:>10}\n".format(oFlexM, nNewOccur, oEntry.getShortDescr())) oFlexM.setOccurAndBlock(nNewOccur) - + def calcFreq (self, nTot): self.fFreq = (self.nOccur * 100) / nTot self.oldFq = self.fq self.fq = getIfq(self.fFreq) @@ -1156,25 +1156,25 @@ self.lMulti = [] # list of similar flexions self.fFreq = 0 self.cFq = '' self.metagfx = '' # métagraphe self.metaph2 = '' # métaphone 2 - + def setOccur (self, n): self.nOccur = n def setOccurAndBlock (self, n): self.nOccur = n self.bBlocked = True def calcOccur (self): self.nOccur = math.ceil((self.nOccur / (self.nMulti+1)) / self.nDup) - + def calcFreq (self, nTot): self.fFreq = (self.nOccur * 100) / nTot self.cFq = getIfq(self.fFreq) - + def calcMetagraphe (self): t = metagraphe.getMetagraphe(self.sFlexion, self.sMorph) self.metagfx = t[0] if not t[1] else t[0]+"/"+t[1] def calcMetaphone2 (self): @@ -1260,11 +1260,11 @@ def keyFreq (self): return (100-self.fFreq, self.oEntry.sRadical, self.sFlexion) def keyOcc (self): return (self.nOccur, self.oEntry.sRadical, self.sFlexion) - + def keyIdx (self): return self.oEntry.iD def keyFlexion (self): return self.sFlexion @@ -1277,11 +1277,11 @@ self.bSfx = True if sFlagType == 'SFX' else False self.bMix = True if sMix == 'Y' else False self.lRules = [] self.nRules = 0 self.nOccur = 0 - + def addAffixRule (self, line): "ajoute une règle au drapeau" oRule = AffixRule(line) self.lRules.append(oRule) self.nRules += 1 @@ -1333,11 +1333,11 @@ self.di = '*' # erreurs self.err = '' # autres champs self.nOccur = 0 - + sLine = sLine.rstrip(" \n") # commentaire if '#' in sLine: sLine, comment = sLine.split('#', 1) self.comment = comment.strip() @@ -1391,11 +1391,11 @@ self.di = fields[1] else: echo('Champ inconnu: {} dans {}'.format(fields[0], self.sFlagName)) else: echo(" # Erreur affixe : {}".format(line)) - + def isReplicationRule (self): "is this rule used for replication of a virtual lemma" return self.flags == "" and ((self.cut == "0" and self.add == "") or self.cut == self.add) def getRuleLine (self, oDict, nMode, bSimplified=False): @@ -1411,11 +1411,11 @@ if not bSimplified and nMode > 0: sMorph = self.getMorph(nMode) if sMorph: sLine += sMorph if not oDict.bShortenTags or bSimplified else ' ' + oDict.dAM[sMorph.strip()] return sLine + "\n" - + def getMorph (self, nMode): # morphology for Hunspell txt = '' if self.po: txt += fieldToHunspell('po', self.po) if self.iz: txt += fieldToHunspell('is', self.iz) @@ -1450,11 +1450,11 @@ class StatsLex: def __init__ (self, oDict): echo("Lexique statistique") self.dFlexions = { oFlex.sFlexion: [] for oFlex in oDict.lFlexions } self.lLex = [] - + def addLexFromFile (self, sPathFile, cLexID, sLexName): if not os.path.isfile(sPathFile): echo(' * Lexique statistique - fichier {} introuvable'.format(sPathFile)) return None if len(cLexID) != 1: @@ -1518,25 +1518,25 @@ echo("Python: " + sys.version) echo("Version: " + xArgs.verdic) echo("Simplify: " + str(xArgs.simplify)) echo("Mode: " + str(xArgs.mode)) echo("Compression: " + str(not(xArgs.uncompress))) - + ### création du répertoire spBuild = BUILD_PATH + '/' + xArgs.verdic dir_util.mkpath(spBuild) - + ### Lecture des fichiers et création du dictionnaire oFrenchDict = Dictionnaire(xArgs.verdic, "French dictionary") for sFile in ['orthographe/FRANCAIS.dic']: oFrenchDict.readDictionary(sFile) oFrenchDict.readAffixes('orthographe/FRANCAIS_5.aff') - + ### Contrôle oFrenchDict.sortEntriesNatural() oFrenchDict.checkEntries() - + ### Lexique oFrenchDict.generateFlexions() oFrenchDict.calcMetagraphe() oFrenchDict.calcMetaphone2() @@ -1551,17 +1551,17 @@ oStatsLex.addLexFromFile('lexique/corpus_data/stats_frwiki.txt', 'W', 'Wikipédia') oStatsLex.addLexFromFile('lexique/corpus_data/stats_frwikisource.txt', 'S', 'Wikisource') oStatsLex.addLexFromFile('lexique/corpus_data/stats_litterature.txt', 'L', 'Littérature') oStatsLex.write(spBuild+'/test_lex.txt') oFrenchDict.calculateStats(oStatsLex, spfStats) - + ### écriture des paquets echo("Création des paquets...") spLexiconDestGL = "../../../lexicons" if xArgs.grammalecte else "" spLibreOfficeExtDestGL = "../oxt/Dictionnaires/dictionaries" if xArgs.grammalecte else "" - spMozillaExtDestGL = "../xpi/data/dictionaries" if xArgs.grammalecte else "" + spMozillaExtDestGL = "" # les dictionnaires pour Hunspell ne sont plus utilisés pour l’instant dans Firefox / Thunderbird spDataDestGL = "../data" if xArgs.grammalecte else "" if not xArgs.uncompress: oFrenchDict.defineAbreviatedTags(xArgs.mode, spfStats) oFrenchDict.createFiles(spBuild, [dMODERNE, dTOUTESVAR, dCLASSIQUE, dREFORME1990], xArgs.mode, xArgs.simplify)