Grammalecte: Diff

Differences From Artifact [642a6504a5]:

File gc_lang/fr/dictionnaire/genfrdic.py — part of check-in [e6ddc02081] at 2020-04-29 11:43:11 on branch trunk — [fr] gendicfr: étiquette pour prépositions verbales (user: olr, size: 70581) [annotate] [blame] [check-ins using] [more...]

To Artifact [877bd56310]:

File gc_lang/fr/dictionnaire/genfrdic.py — part of check-in [de4afcc57a] at 2020-05-31 08:02:38 on branch trunk — [fr] gendicfr.py: don’t include date in affixes file -> deterministic build (user: olr, size: 67222) [annotate] [blame] [check-ins using] [more...]

1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17	1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16	-	#!python3 __author__ = "Olivier R." __license__ = "MPL 2" import os import sys ~~import time~~ import re import collections import zipfile import math import argparse from enum import Enum
︙
41 42 43 44 45 46 47 48 49 50 ~~51 52 53 54 55 56 57 58~~ 59 60 61 62 63 64 65	40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55	- - - - - - - - -	# Les dictionnaires dSUBDIC = { '': 'Commun', 'R': 'Réforme1990', 'M': 'Moderne', 'C': 'Classique', 'A': 'Annexe', ~~'P': 'Multimots',~~ 'X': 'Contributeurs' } ~~dMODERNE = { 'name': 'DICTIONNAIRE ORTHOGRAPHIQUE FRANÇAIS “MODERNE”',~~ ~~'shortname': '“Moderne”',~~ ~~'asciiName': 'fr-moderne',~~ ~~'mozAsciiName': 'fr-FR-modern',~~ ~~'subDicts': 'MX',~~ ~~'mozId': 'fr-dicollecte-moderne',~~ ~~'description': "Dictionnaire français “Moderne”" }~~ dCLASSIQUE = { 'name': 'DICTIONNAIRE ORTHOGRAPHIQUE FRANÇAIS “CLASSIQUE”', 'shortname': '“Classique”', 'asciiName': 'fr-classique', 'mozAsciiName': 'fr-FR-classic', 'subDicts': '*MCX', 'mozId': 'fr-dicollecte-classique', 'description': "Dictionnaire français “Classique”" }
︙
84 85 86 87 88 89 90 ~~91 92~~ 93 94 95 96 97 98 99	74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89	- - + +	'mozId': 'fr-dicollecte', 'description': "Dictionnaire orthographique de la langue française" } BUILD_PATH = '_build' PREFIX_DICT_PATH = 'hunspell-french-dictionaries-v' EXT_PREFIX_OOO = 'lo-oo-ressources-linguistiques-fr-v' ~~EXT_PREFIX_MOZ = 'moz-hunspell-fr~~-dicollecte~~-v' LEX_PREFIX = 'lexique-~~dicol~~lecte-fr-v'~~ EXT_PREFIX_MOZ = 'moz-hunspell-fr-v' LEX_PREFIX = 'lexique-grammalecte-fr-v' STATS_NAME = 'statistiques-v' MPLHEADER = "# This Source Code Form is subject to the terms of the Mozilla Public\n" + \ "# License, v. 2.0. If a copy of the MPL was not distributed with this\n" + \ "# file, You can obtain one at http://mozilla.org/MPL/2.0/.\n\n"
︙
311 312 313 314 315 316 317 ~~318~~ 319 320 321 322 323 324 325	301 302 303 304 305 306 307 308 309 310 311 312 313 314	-	"Écrire le fichier des affixes (.aff)" echo(' * Dictionnaire >> [ {}.aff ]'.format(dTplVars['asciiName'])) info = "# This Source Code Form is subject to the terms of the Mozilla Public\n" + \ "# License, v. 2.0. If a copy of the MPL was not distributed with this\n" + \ "# file, You can obtain one at http://mozilla.org/MPL/2.0/.\n\n" + \ "# AFFIXES DU {} v{}\n".format(dTplVars['name'], self.sVersion) + \ "# par Olivier R. -- licence MPL 2.0\n" + \ ~~"# Généré le " + time.strftime("%d-%m-%Y à %H:%M") + "\n" \~~ "# Pour améliorer le dictionnaire, allez sur https://grammalecte.net/\n\n" with open(spDst+'/'+dTplVars['asciiName']+'.aff', 'w', encoding='utf-8', newline="\n") as hDst: hDst.write(info) hDst.write(self.sSettings + "\n") if self.bShortenTags: hDst.write("AM {}\n".format(len(self.dAM)))
︙
512 513 514 515 516 517 518 ~~519~~ 520 521 522 523 524 525 526	501 502 503 504 505 506 507 508 509 510 511 512 513 514 515	- +	hDst.write(str(dRefW[key])) hDst.write("\n") def writeLexicon (self, spfDst, version, oStatsLex): echo(' * Lexique >> [ {} ] '.format(spfDst)) with open(spfDst, 'w', encoding='utf-8', newline="\n") as hDst: hDst.write(MPLHEADER) ~~hDst.write("# Lexique des formes fléchies du français - ~~Dicol~~lecte v{}\n# Licence : MPL v2.0\n\n".format(version))~~ hDst.write("# Lexique des formes fléchies du français - Grammalecte v{}\n# Licence : MPL v2.0\n\n".format(version)) hDst.write(oStatsLex.getInfo()) hDst.write(Flexion.header(oStatsLex)) for oFlex in self.lFlexions: hDst.write(oFlex.__str__(oStatsLex)) def writeGrammarCheckerLexicon (self, spfDst, version): echo(' * Lexique simplifié >> [ {} ] '.format(spfDst))
︙
586 587 588 589 590 591 592 ~~593~~ 594 595 596 597 598 599 600	575 576 577 578 579 580 581 582 583 584 585 586 587 588 589	- +	def createMozillaExtensions (self, spBuild, dTplVars, lDictVars, spDestGL=""): # Mozilla extension 1 echo(" * Dictionnaire >> extension pour Mozilla") dTplVars['version'] = self.sVersion sExtensionName = EXT_PREFIX_MOZ + self.sVersion spExt = spBuild + '/' + sExtensionName dir_util.mkpath(spExt+'/dictionaries') ~~copyTemplate('_templates/moz', spExt, 'inst~~all.rdf~~', dTplVars)~~ copyTemplate('_templates/moz', spExt, 'manifest.json', dTplVars) spDict = spBuild + '/' + PREFIX_DICT_PATH + self.sVersion file_util.copy_file(spDict+'/fr-classique.dic', spExt+'/dictionaries/fr-classic.dic') file_util.copy_file(spDict+'/fr-classique.aff', spExt+'/dictionaries/fr-classic.aff') copyTemplate('orthographe', spExt, 'README_dict_fr.txt', dTplVars) createZipFiles(spExt, spBuild, sExtensionName + '.xpi') # Grammalecte if spDestGL:
︙
612 613 614 615 616 617 618 ~~619~~ 620 621 622 623 624 625 626	601 602 603 604 605 606 607 608 609 610 611 612 613 614 615	- +	hDiff.write("{0.iD}\t{0.fq}\n".format(oEntry)) hNotes.write("{0.lemma}/{0.flags}\t{0.oldFq} > {0.fq}\n".format(oEntry)) def createLexiconPackages (self, spBuild, version, oStatsLex, spDestGL=""): sLexName = LEX_PREFIX + version spLex = spBuild + '/' + sLexName dir_util.mkpath(spLex) ~~# write ~~Dicollecte~~ lexicon~~ # write lexicon self.sortLexiconByFreq() self.writeLexicon(spLex + '/' + sLexName + '.txt', version, oStatsLex) self.writeGrammarCheckerLexicon(spBuild + '/' + sLexName + '.lex', version) copyTemplate('lexique', spLex, 'README_lexique.txt', {'version': version}) # zip createZipFiles(spLex, spBuild, sLexName + '.zip') # copy GC lexicon to Grammalecte
︙
643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710	632 633 634 635 636 637 638 639 640 641 642 643 644 645	- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -	with open(spBuild+'/dictDecl.txt', 'w', encoding='utf-8', newline="\n") as hDst: for oEntry in self.lEntry: if re.match("[SXFWIA]", oEntry.flags) and (oEntry.po.startswith("nom") or oEntry.po.startswith("adj")): hDst.write(oEntry.getDeclination()) if spDestGL: echo(" Fichier de déclinaison copié dans Grammalecte...") file_util.copy_file(spBuild+'/dictDecl.txt', spDestGL) ~~def generateSpellVariants (self, nReq, spBuild):~~ ~~if nReq < 1: nReq = 1~~ ~~if nReq > 2: nReq = 2~~ ~~echo(" * Lexique >> variantes par suppression... n = " + str(nReq))~~ ~~with open(spBuild+'/dictSpellVariants-'+str(nReq)+'.txt', 'w', encoding='utf-8', newline="\n") as hDst:~~ ~~for oFlex in frozenset(self.lFlexions):~~ ~~hDst.write(oFlex.sFlexion+"\t_\t_\n")~~ ~~if len(oFlex.sFlexion) <= 2:~~ ~~n = 0~~ ~~elif len(oFlex.sFlexion) <= 5:~~ ~~n = 1~~ ~~else:~~ ~~n = nReq~~ ~~#lTup = self._generatePhonetVariants(oFlex.sFlexion)~~ ~~lTup = self._generateDeleteVariants(oFlex.sFlexion, oFlex.sFlexion, n)~~ ~~for t in lTup:~~ ~~sTag = t[1] if "\t" in t[1] else t[1]+"\t_"~~ ~~hDst.write(t[0]+"\t"+sTag+"\n")~~ ~~_lTupPhonet = [ ("ph", "f"), ("qu", "k"), ("ss", "c"), ("ss", "ç"), ("ct", "x"),~~ ~~("oe", "œ"), ("ae", "æ"), ("ei", "é"), ("ai", "é"), ("au", "o"), ("eau", "o"),~~ ] ~~def _generatePhonetVariants (self, s):~~ ~~l = []~~ ~~for torep, rep in self._lTupPhonet():~~ ~~for m in torep.finditer(s):~~ ~~l.append( (s[:m.start(0)] + rep + s[m.end(0):], str(m.start(0))+":"+str(m.start(0)+len(rep))+">"+torep) )~~ ~~return l~~ ~~def _generateDeleteVariants (self, sWord0, sWordCur, n):~~ ~~"renvoie une liste de tuples : (forme dégradée de sWord, code de genèse de sWord)"~~ ~~# caution: recursive function~~ ~~if n == 0:~~ ~~return []~~ ~~lTup = []~~ ~~for i in range(len(sWordCur)):~~ ~~sNew = sWordCur[0:i]+sWordCur[i+1:]~~ ~~lTup.append( ( sNew, self._generateAddCode(sWord0, sNew) ) )~~ ~~lTup += self._generateDeleteVariants(sWord0, sNew, n-1)~~ ~~return lTup~~ ~~def _generateAddCode (self, sWord, sCrippled):~~ ~~"returns addCode to generate sWord from sCrippled"~~ ~~sAdd = ""~~ ~~for i in range(len(sWord)):~~ ~~if sWord[i] != sCrippled[i:i+1]:~~ ~~sCrippled = sCrippled[:i] + sWord[i] + sCrippled[i:]~~ ~~if sAdd:~~ ~~sAdd += "\t"~~ ~~sAdd += str(i)+"+"+sWord[i]~~ ~~return sAdd if sAdd else "0"~~ class Entree: def __init__ (self, sLine): self.lemma = '' self.flags = '' # champs morphologiques Hunspell
︙
808 809 810 811 812 813 814 ~~815~~ 816 817 818 819 820 821 822 ~~823 824~~ 825 826 ~~827 828 829~~ 830 831 ~~832~~ 833 834 835 836 837 838 839	743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769	- + - - - - - - +	sErr += 'lemme vide' if re.match(r"^\s", self.lemma): sErr += 'premier caractère un espace dans <' + self.lemma + '>' if re.search(r"\s$", self.lemma): sErr += 'espace en fin de lemme' if re.match(r"v[0123]", self.po) and not re.match(r"[eas_][ix_][tx_][nx_][pqreuvx_][mx_][ex_z][ax_z]\b", self.po[2:]): sErr += 'verbe inconnu: ' + self.po ~~if (re.match(r"S[.]", self.flags) and re.search("[sxz]$", self.lemma)) or (re.match(r"X[.]", self.flags) and not re.search("[ul]$", self.lemma)):~~ if (re.match(r"S[.]", self.flags) and re.search("[sxz]$", self.lemma)) or (re.match(r"X[.]", self.flags) and not re.search("[ul]$", self.lemma)): sErr += 'drapeau inutile' if self.iz == '' and re.match(r"[SXAI](?!=)", self.flags) and self.po: sErr += '[is] vide' if re.match(r"pl\|sg\|inv", self.iz): sErr += '[is] incomplet' if re.match(r"[FW]", self.flags) and re.search(r"epi\|mas\|fem\|inv\|sg\|pl", self.iz): sErr += '[is] incohérent' ~~if re.match(r".\", self.flags) and re.match(r"[bcdfgjklmnpqrstvwxz]", self.lemma):~~ ~~sErr += 'drapeau pour lemme commençant par une voyelle'~~ if re.search(r"pl\|sg\|inv", self.iz) and re.match(r"[SXAIFW](?!=)", self.flags): sErr += '[is] incohérent' ~~if re.search(r"nom\|adj", self.po) and re.match(r"(?i)[aâàäáeéèêëiîïíìoôöóòuûüúù]", self.lemma) and re.match("[SFWXAI][.]", self.flags) \~~ ~~and "pel" not in self.lx:~~ ~~sErr += 'le drapeau derait finir avec '~~ if self.iz.endswith(("mas", "fem", "epi")) and (not self.flags or not self.flags.startswith(("S", "X", "F", "W", "A", "I", "U"))): sErr += '[is] incomplet' ~~if self.flags.startswith(("a", "b", "c~~", "d~~")) and not self.lemma.endswith("er"):~~ if self.flags.startswith(("a0", "b0", "c0", "d0")) and not self.lemma.endswith("er"): sErr += "drapeau pour verbe du 1ᵉʳ groupe sur un lemme non conforme" if self.flags.startswith("f") and not self.lemma.endswith(("ir", "ïr")): sErr += "drapeau pour verbe du 2ᵉ groupe sur un lemme non conforme" if sErr: echo(' error - id: ' + self.iD, end = "") echo(' ' + sErr + ' in ' + self.__str__())
︙
938 939 940 941 942 943 944 ~~945~~ 946 947 948 949 950 951 952	868 869 870 871 872 873 874 875 876 877 878 879 880 881 882	- +	morph = self.lexMorph() lFlexions = [(self.lemma, morph, self.di)] if iPR == 0 and not self.flags.endswith('()') else [] lFlexPrefix = [] lFlexSuffix = [] for sFlag in makeLongFlags(self.flags): if sFlag not in dFlags: if sFlag not in ['**', '()', '\|\|', '--']: ~~lFlexions.append( (self.lemma, '[unknown flag: {}]'.format(sFlag)) )~~ lFlexions.append( (self.lemma, '[unknown flag: {}]'.format(sFlag), self.di) ) echo("ERROR: " + self.lemma + ' - unknown flag: ' + sFlag) else: oFlag = dFlags[sFlag] if not oFlag.bSfx: # cas des préfixes for oRule in oFlag.lRules: if oRule.motif.search(self.lemma):
︙
1189 1190 1191 1192 1193 1194 1195 ~~1196 1197~~ 1198 1199 1200 1201 1202 1203 ~~1204~~ 1205 1206 1207 1208 ~~1209 1210~~ 1211 1212 1213 1214 1215 ~~1216~~ 1217 1218 1219 1220 1221 1222 1223	1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153	- - + + - + - - + + - +	def getGrammarCheckerRepr (self): return "{0.sFlexion}\t{0.oEntry.lemma}\t{1}\n".format(self, self._getSimpleTags()) _dTagReplacement = { # POS "nom": ":N", "adj": ":A", "adv": ":W", "negadv": ":X", "mg": ":G", "nb": ":B", "nbro": ":Br", ~~"loc.nom": ":Ñ", "loc.adj": ":Â", "loc.adv": ":Ŵ", "loc.verb": ":Ṽ", "interj": ":J", "loc.interj": ":Ĵ", "titr": ":T",~~ "loc.nom": ":ÉN", "loc.adj": ":ÉA", "loc.adv": ":ÉW", "loc.verb": ":ÉV", "interj": ":J", "loc.interj": ":ÉJ", "titr": ":T", "mas": ":m", "fem": ":f", "epi": ":e", "sg": ":s", "pl": ":p", "inv": ":i", "infi": ":Y", "ppre": ":P", "ppas": ":Q", "ipre": ":Ip", "iimp": ":Iq", "ipsi": ":Is", "ifut": ":If", "spre": ":Sp", "simp": ":Sq", "cond": ":K", "impe": ":E", "1sg": ":1s", "1isg": ":1ś", "1jsg": ":1ŝ", "2sg": ":2s", "3sg": ":3s", "1pl": ":1p", "2pl": ":2p", "3pl": ":3p", "3pl!": ":3p!", ~~"prepv": ":Rv", "prep": ":R", "loc.prep": ":Ŕ", "loc.prepv": "Ŕv",~~ "prepv": ":Rv", "prep": ":R", "loc.prep": ":ÉR", "loc.prepv": "ÉRv", "detpos": ":Dp", "detdem": ":Dd", "detind": ":Di", "detneg": ":Dn", "detex": ":De", "det": ":D", "advint": ":U", "prodem": ":Od", "proind": ":Oi", "proint": ":Ot", "proneg": ":On", "prorel": ":Or", "proadv": ":Ow", "properobj": ":Oo", "propersuj": ":Os", "1pe": ":O1", "2pe": ":O2", "3pe": ":O3", "preverb": ":Ov", ~~"cjco": ":Cc", "cjsub": ":Cs", "cj": ":C", "loc.cj": ":Ĉ", "loc.cjsub": ":Ĉs", "prn": ":M1", "patr": ":M2", "loc.patr": ":Ḿ2", "npr": ":MP", "nompr": ":NM",~~ "cjco": ":Cc", "cjsub": ":Cs", "cj": ":C", "loc.cj": ":ÉC", "loc.cjsub": ":ÉCs", "prn": ":M1", "patr": ":M2", "loc.patr": ":ÉM2", "npr": ":MP", "nompr": ":NM", "pfx": ":Zp", "sfx": ":Zs", "div": ":H", "err": ":F", "ponc": ":@p", "sign": ":@s", # LEX ~~"symb": ";S"~~ "symb": ";S", "unit": ";U" } def _getSimpleTags (self): s = "" # POS for sTag in self.sMorph.split(): if sTag.startswith("v"):
︙
1505 1506 1507 1508 1509 1510 1511 ~~1512~~ 1513 1514 1515 1516 1517 1518 1519	1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448	-	def main (): xParser = argparse.ArgumentParser() xParser.add_argument("-v", "--verdic", help="set dictionary version, i.e. 5.4", type=str, default="X.Y.z") xParser.add_argument("-m", "--mode", help="0: no tags, 1: Hunspell tags (default), 2: All tags", type=int, choices=[0, 1, 2], default=1) xParser.add_argument("-u", "--uncompress", help="do not use Hunspell compression", action="store_true") xParser.add_argument("-s", "--simplify", help="no virtual lemmas", action="store_true") ~~xParser.add_argument("-sv", "--spellvariants", help="generate spell variants", action="store_true")~~ xParser.add_argument("-gl", "--grammalecte", help="copy generated files to Grammalecte folders", action="store_true") xArgs = xParser.parse_args() if xArgs.simplify: xArgs.mode = 0 xArgs.uncompress = True
︙
1527 1528 1529 1530 1531 1532 1533 ~~1534~~ 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 ~~1546 1547~~ 1548 1549 1550 1551 1552 1553 1554	1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481	- + - -	spBuild = BUILD_PATH + '/' + xArgs.verdic dir_util.mkpath(spBuild) ### Lecture des fichiers et création du dictionnaire oFrenchDict = Dictionnaire(xArgs.verdic, "French dictionary") for sFile in ['orthographe/FRANCAIS.dic']: oFrenchDict.readDictionary(sFile) ~~oFrenchDict.readAffixes('orthographe/FRANCAIS_5.aff')~~ oFrenchDict.readAffixes('orthographe/FRANCAIS_7.aff') ### Contrôle oFrenchDict.sortEntriesNatural() oFrenchDict.checkEntries() ### Lexique oFrenchDict.generateFlexions() oFrenchDict.calcMetagraphe() oFrenchDict.calcMetaphone2() #oFrenchDict.createNgrams(spBuild, 3) ~~if xArgs.spellvariants:~~ ~~oFrenchDict.generateSpellVariants(1, spBuild)~~ ### Statistiques spfStats = spBuild+'/'+STATS_NAME+xArgs.verdic+'.txt' oStatsLex = StatsLex(oFrenchDict) oStatsLex.addLexFromFile('lexique/corpus_data/stats_google_ngram_1.txt', 'G', 'Google 1-grams') oStatsLex.addLexFromFile('lexique/corpus_data/stats_frwiki.txt', 'W', 'Wikipédia') oStatsLex.addLexFromFile('lexique/corpus_data/stats_frwikisource.txt', 'S', 'Wikisource')
︙

Grammalecte Diff

Differences From Artifact [642a6504a5]:

To Artifact [877bd56310]: