1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
|
#!python3
__author__ = "Olivier R."
__license__ = "MPL 2"
import os
import sys
import time
import re
import collections
import zipfile
import math
import argparse
from enum import Enum
|
<
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
|
#!python3
__author__ = "Olivier R."
__license__ = "MPL 2"
import os
import sys
import re
import collections
import zipfile
import math
import argparse
from enum import Enum
|
︙ | | | ︙ | |
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
|
# The dictionaries
# dSUBDIC maps each one-character sub-dictionary code to its display name.
dSUBDIC = { '*': 'Commun',
'R': 'Réforme1990',
'M': 'Moderne',
'C': 'Classique',
'A': 'Annexe',
'P': 'Multimots',
'X': 'Contributeurs' }
# Description of the "Moderne" dictionary variant (names, identifiers, description).
# Every character of 'subDicts' is a key of dSUBDIC.
dMODERNE = { 'name': 'DICTIONNAIRE ORTHOGRAPHIQUE FRANÇAIS “MODERNE”',
'shortname': '“Moderne”',
'asciiName': 'fr-moderne',
'mozAsciiName': 'fr-FR-modern',
'subDicts': '*MX',
'mozId': 'fr-dicollecte-moderne',
'description': "Dictionnaire français “Moderne”" }
# Description of the "Classique" dictionary variant; same keys as dMODERNE.
dCLASSIQUE = { 'name': 'DICTIONNAIRE ORTHOGRAPHIQUE FRANÇAIS “CLASSIQUE”',
'shortname': '“Classique”',
'asciiName': 'fr-classique',
'mozAsciiName': 'fr-FR-classic',
'subDicts': '*MCX',
'mozId': 'fr-dicollecte-classique',
'description': "Dictionnaire français “Classique”" }
|
<
<
<
<
<
<
<
<
<
|
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
|
# The dictionaries
# dSUBDIC maps each one-character sub-dictionary code to its display name.
dSUBDIC = { '*': 'Commun',
'R': 'Réforme1990',
'M': 'Moderne',
'C': 'Classique',
'A': 'Annexe',
'X': 'Contributeurs' }
# Description of the "Classique" dictionary variant (names, identifiers, description).
# Every character of 'subDicts' is a key of dSUBDIC.
dCLASSIQUE = { 'name': 'DICTIONNAIRE ORTHOGRAPHIQUE FRANÇAIS “CLASSIQUE”',
'shortname': '“Classique”',
'asciiName': 'fr-classique',
'mozAsciiName': 'fr-FR-classic',
'subDicts': '*MCX',
'mozId': 'fr-dicollecte-classique',
'description': "Dictionnaire français “Classique”" }
|
︙ | | | ︙ | |
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
|
'mozId': 'fr-dicollecte',
'description': "Dictionnaire orthographique de la langue française" }
# Root output folder; main() appends '/' and the dictionary version to it.
BUILD_PATH = '_build'
# Name prefixes of the generated folders/archives; the version string is appended to each.
PREFIX_DICT_PATH = 'hunspell-french-dictionaries-v'
# NOTE(review): presumably the LibreOffice/OpenOffice extension prefix -- its use is not visible here.
EXT_PREFIX_OOO = 'lo-oo-ressources-linguistiques-fr-v'
EXT_PREFIX_MOZ = 'moz-hunspell-fr-dicollecte-v'
LEX_PREFIX = 'lexique-dicollecte-fr-v'
STATS_NAME = 'statistiques-v'
# MPL 2.0 notice written at the top of generated text files (see writeLexicon).
MPLHEADER = "# This Source Code Form is subject to the terms of the Mozilla Public\n" + \
"# License, v. 2.0. If a copy of the MPL was not distributed with this\n" + \
"# file, You can obtain one at http://mozilla.org/MPL/2.0/.\n\n"
|
|
|
|
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
|
'mozId': 'fr-dicollecte',
'description': "Dictionnaire orthographique de la langue française" }
# Root output folder; main() appends '/' and the dictionary version to it.
BUILD_PATH = '_build'
# Name prefixes of the generated folders/archives; the version string is appended to each.
PREFIX_DICT_PATH = 'hunspell-french-dictionaries-v'
# NOTE(review): presumably the LibreOffice/OpenOffice extension prefix -- its use is not visible here.
EXT_PREFIX_OOO = 'lo-oo-ressources-linguistiques-fr-v'
EXT_PREFIX_MOZ = 'moz-hunspell-fr-v'
LEX_PREFIX = 'lexique-grammalecte-fr-v'
STATS_NAME = 'statistiques-v'
# MPL 2.0 notice written at the top of generated text files (see writeLexicon).
MPLHEADER = "# This Source Code Form is subject to the terms of the Mozilla Public\n" + \
"# License, v. 2.0. If a copy of the MPL was not distributed with this\n" + \
"# file, You can obtain one at http://mozilla.org/MPL/2.0/.\n\n"
|
︙ | | | ︙ | |
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
|
"Écrire le fichier des affixes (.aff)"
echo(' * Dictionnaire >> [ {}.aff ]'.format(dTplVars['asciiName']))
info = "# This Source Code Form is subject to the terms of the Mozilla Public\n" + \
"# License, v. 2.0. If a copy of the MPL was not distributed with this\n" + \
"# file, You can obtain one at http://mozilla.org/MPL/2.0/.\n\n" + \
"# AFFIXES DU {} v{}\n".format(dTplVars['name'], self.sVersion) + \
"# par Olivier R. -- licence MPL 2.0\n" + \
"# Généré le " + time.strftime("%d-%m-%Y à %H:%M") + "\n" \
"# Pour améliorer le dictionnaire, allez sur https://grammalecte.net/\n\n"
with open(spDst+'/'+dTplVars['asciiName']+'.aff', 'w', encoding='utf-8', newline="\n") as hDst:
hDst.write(info)
hDst.write(self.sSettings + "\n")
if self.bShortenTags:
hDst.write("AM {}\n".format(len(self.dAM)))
|
<
|
301
302
303
304
305
306
307
308
309
310
311
312
313
314
|
"Écrire le fichier des affixes (.aff)"
echo(' * Dictionnaire >> [ {}.aff ]'.format(dTplVars['asciiName']))
info = "# This Source Code Form is subject to the terms of the Mozilla Public\n" + \
"# License, v. 2.0. If a copy of the MPL was not distributed with this\n" + \
"# file, You can obtain one at http://mozilla.org/MPL/2.0/.\n\n" + \
"# AFFIXES DU {} v{}\n".format(dTplVars['name'], self.sVersion) + \
"# par Olivier R. -- licence MPL 2.0\n" + \
"# Pour améliorer le dictionnaire, allez sur https://grammalecte.net/\n\n"
with open(spDst+'/'+dTplVars['asciiName']+'.aff', 'w', encoding='utf-8', newline="\n") as hDst:
hDst.write(info)
hDst.write(self.sSettings + "\n")
if self.bShortenTags:
hDst.write("AM {}\n".format(len(self.dAM)))
|
︙ | | | ︙ | |
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
|
hDst.write(str(dRefW[key]))
hDst.write("\n")
def writeLexicon (self, spfDst, version, oStatsLex):
    """Write the full lexicon of inflected forms to the text file spfDst.

    The file starts with the MPL header, a title line carrying `version`,
    the statistics summary returned by oStatsLex.getInfo() and the column
    header, followed by one rendered entry per element of self.lFlexions.
    """
    echo(' * Lexique >> [ {} ] '.format(spfDst))
    sTitle = "# Lexique des formes fléchies du français - Dicollecte v{}\n# Licence : MPL v2.0\n\n".format(version)
    with open(spfDst, 'w', encoding='utf-8', newline="\n") as hDst:
        for sPart in (MPLHEADER, sTitle, oStatsLex.getInfo(), Flexion.header(oStatsLex)):
            hDst.write(sPart)
        # each flexion renders itself with the help of the statistics object
        hDst.writelines(oFlex.__str__(oStatsLex) for oFlex in self.lFlexions)
def writeGrammarCheckerLexicon (self, spfDst, version):
echo(' * Lexique simplifié >> [ {} ] '.format(spfDst))
|
|
|
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
|
hDst.write(str(dRefW[key]))
hDst.write("\n")
def writeLexicon (self, spfDst, version, oStatsLex):
    """Write the full lexicon of inflected forms to the text file spfDst.

    The file starts with the MPL header, a title line carrying `version`,
    the statistics summary returned by oStatsLex.getInfo() and the column
    header, followed by one rendered entry per element of self.lFlexions.
    """
    echo(' * Lexique >> [ {} ] '.format(spfDst))
    sTitle = "# Lexique des formes fléchies du français - Grammalecte v{}\n# Licence : MPL v2.0\n\n".format(version)
    with open(spfDst, 'w', encoding='utf-8', newline="\n") as hDst:
        for sPart in (MPLHEADER, sTitle, oStatsLex.getInfo(), Flexion.header(oStatsLex)):
            hDst.write(sPart)
        # each flexion renders itself with the help of the statistics object
        hDst.writelines(oFlex.__str__(oStatsLex) for oFlex in self.lFlexions)
def writeGrammarCheckerLexicon (self, spfDst, version):
echo(' * Lexique simplifié >> [ {} ] '.format(spfDst))
|
︙ | | | ︙ | |
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
|
def createMozillaExtensions (self, spBuild, dTplVars, lDictVars, spDestGL=""):
# Mozilla extension 1
echo(" * Dictionnaire >> extension pour Mozilla")
dTplVars['version'] = self.sVersion
sExtensionName = EXT_PREFIX_MOZ + self.sVersion
spExt = spBuild + '/' + sExtensionName
dir_util.mkpath(spExt+'/dictionaries')
copyTemplate('_templates/moz', spExt, 'install.rdf', dTplVars)
spDict = spBuild + '/' + PREFIX_DICT_PATH + self.sVersion
file_util.copy_file(spDict+'/fr-classique.dic', spExt+'/dictionaries/fr-classic.dic')
file_util.copy_file(spDict+'/fr-classique.aff', spExt+'/dictionaries/fr-classic.aff')
copyTemplate('orthographe', spExt, 'README_dict_fr.txt', dTplVars)
createZipFiles(spExt, spBuild, sExtensionName + '.xpi')
# Grammalecte
if spDestGL:
|
|
|
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
|
def createMozillaExtensions (self, spBuild, dTplVars, lDictVars, spDestGL=""):
# Mozilla extension 1
echo(" * Dictionnaire >> extension pour Mozilla")
dTplVars['version'] = self.sVersion
sExtensionName = EXT_PREFIX_MOZ + self.sVersion
spExt = spBuild + '/' + sExtensionName
dir_util.mkpath(spExt+'/dictionaries')
copyTemplate('_templates/moz', spExt, 'manifest.json', dTplVars)
spDict = spBuild + '/' + PREFIX_DICT_PATH + self.sVersion
file_util.copy_file(spDict+'/fr-classique.dic', spExt+'/dictionaries/fr-classic.dic')
file_util.copy_file(spDict+'/fr-classique.aff', spExt+'/dictionaries/fr-classic.aff')
copyTemplate('orthographe', spExt, 'README_dict_fr.txt', dTplVars)
createZipFiles(spExt, spBuild, sExtensionName + '.xpi')
# Grammalecte
if spDestGL:
|
︙ | | | ︙ | |
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
|
hDiff.write("{0.iD}\t{0.fq}\n".format(oEntry))
hNotes.write("{0.lemma}/{0.flags}\t{0.oldFq} > {0.fq}\n".format(oEntry))
def createLexiconPackages (self, spBuild, version, oStatsLex, spDestGL=""):
sLexName = LEX_PREFIX + version
spLex = spBuild + '/' + sLexName
dir_util.mkpath(spLex)
# write Dicollecte lexicon
self.sortLexiconByFreq()
self.writeLexicon(spLex + '/' + sLexName + '.txt', version, oStatsLex)
self.writeGrammarCheckerLexicon(spBuild + '/' + sLexName + '.lex', version)
copyTemplate('lexique', spLex, 'README_lexique.txt', {'version': version})
# zip
createZipFiles(spLex, spBuild, sLexName + '.zip')
# copy GC lexicon to Grammalecte
|
|
|
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
|
hDiff.write("{0.iD}\t{0.fq}\n".format(oEntry))
hNotes.write("{0.lemma}/{0.flags}\t{0.oldFq} > {0.fq}\n".format(oEntry))
def createLexiconPackages (self, spBuild, version, oStatsLex, spDestGL=""):
sLexName = LEX_PREFIX + version
spLex = spBuild + '/' + sLexName
dir_util.mkpath(spLex)
# write lexicon
self.sortLexiconByFreq()
self.writeLexicon(spLex + '/' + sLexName + '.txt', version, oStatsLex)
self.writeGrammarCheckerLexicon(spBuild + '/' + sLexName + '.lex', version)
copyTemplate('lexique', spLex, 'README_lexique.txt', {'version': version})
# zip
createZipFiles(spLex, spBuild, sLexName + '.zip')
# copy GC lexicon to Grammalecte
|
︙ | | | ︙ | |
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
|
with open(spBuild+'/dictDecl.txt', 'w', encoding='utf-8', newline="\n") as hDst:
for oEntry in self.lEntry:
if re.match("[SXFWIA]", oEntry.flags) and (oEntry.po.startswith("nom") or oEntry.po.startswith("adj")):
hDst.write(oEntry.getDeclination())
if spDestGL:
echo(" Fichier de déclinaison copié dans Grammalecte...")
file_util.copy_file(spBuild+'/dictDecl.txt', spDestGL)
def generateSpellVariants (self, nReq, spBuild):
    """Write every flexion and its deletion variants to a tab-separated file.

    nReq is the requested maximum number of deleted characters; it is clamped
    to the range 1..2. The output goes to
    <spBuild>/dictSpellVariants-<nReq>.txt, one line per flexion (tagged
    "_\t_") followed by one line per degraded form with its add-code.
    """
    nReq = 1 if nReq < 1 else (2 if nReq > 2 else nReq)
    echo(" * Lexique >> variantes par suppression... n = " + str(nReq))
    spfDst = spBuild+'/dictSpellVariants-'+str(nReq)+'.txt'
    with open(spfDst, 'w', encoding='utf-8', newline="\n") as hDst:
        for oFlex in frozenset(self.lFlexions):
            sWord = oFlex.sFlexion
            hDst.write(sWord+"\t_\t_\n")
            # short words tolerate fewer deletions: none up to 2 chars, one up to 5
            nLen = len(sWord)
            nDepth = 0 if nLen <= 2 else (1 if nLen <= 5 else nReq)
            for sVariant, sCode in self._generateDeleteVariants(sWord, sWord, nDepth):
                # pad single-step codes so every line has the same number of columns
                if "\t" not in sCode:
                    sCode = sCode+"\t_"
                hDst.write(sVariant+"\t"+sCode+"\n")
# Pairs of strings that _generatePhonetVariants unpacks as (torep, rep):
# each occurrence of the first string is replaced by the second one.
_lTupPhonet = [ ("ph", "f"), ("qu", "k"), ("ss", "c"), ("ss", "ç"), ("ct", "x"),
("oe", "œ"), ("ae", "æ"), ("ei", "é"), ("ai", "é"), ("au", "o"), ("eau", "o"),
]
def _generatePhonetVariants (self, s):
    """Return the phonetic variants of s as a list of (variant, code) tuples.

    For every (sToRep, sRep) pair of self._lTupPhonet and every occurrence of
    sToRep in s, one variant is produced where that occurrence is replaced by
    sRep. The code is "<start>:<end>>sToRep": the span of the variant to
    replace with sToRep to get s back.
    """
    lVariants = []
    # _lTupPhonet is a plain list (it must not be called), and its items are
    # literal strings, hence re.escape() before searching with re.finditer().
    for sToRep, sRep in self._lTupPhonet:
        for m in re.finditer(re.escape(sToRep), s):
            nStart = m.start(0)
            sVariant = s[:nStart] + sRep + s[m.end(0):]
            sCode = str(nStart)+":"+str(nStart+len(sRep))+">"+sToRep
            lVariants.append( (sVariant, sCode) )
    return lVariants
def _generateDeleteVariants (self, sWord0, sWordCur, n):
    """Return a list of tuples (degraded form of sWordCur, code to rebuild sWord0).

    Every form obtained by deleting one character of sWordCur is listed,
    immediately followed by the forms obtained by deleting up to n-1 more
    characters from it. Duplicates are not removed.
    """
    # caution: recursive function
    lResult = []
    if n != 0:
        for iPos, _ in enumerate(sWordCur):
            sShorter = sWordCur[:iPos] + sWordCur[iPos+1:]
            lResult.append( (sShorter, self._generateAddCode(sWord0, sShorter)) )
            lResult.extend(self._generateDeleteVariants(sWord0, sShorter, n-1))
    return lResult
def _generateAddCode (self, sWord, sCrippled):
    """Return the add-code that rebuilds sWord from sCrippled.

    The code is a tab-separated list of "<index>+<char>" steps, one per
    character that has to be inserted, or "0" when nothing is missing.
    """
    lSteps = []
    for iPos, cChar in enumerate(sWord):
        if sCrippled[iPos:iPos+1] == cChar:
            continue
        # re-insert the missing character so that later positions line up
        sCrippled = sCrippled[:iPos] + cChar + sCrippled[iPos:]
        lSteps.append(str(iPos)+"+"+cChar)
    return "\t".join(lSteps) or "0"
class Entree:
def __init__ (self, sLine):
self.lemma = ''
self.flags = ''
# champs morphologiques Hunspell
|
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
|
632
633
634
635
636
637
638
639
640
641
642
643
644
645
|
with open(spBuild+'/dictDecl.txt', 'w', encoding='utf-8', newline="\n") as hDst:
for oEntry in self.lEntry:
if re.match("[SXFWIA]", oEntry.flags) and (oEntry.po.startswith("nom") or oEntry.po.startswith("adj")):
hDst.write(oEntry.getDeclination())
if spDestGL:
echo(" Fichier de déclinaison copié dans Grammalecte...")
file_util.copy_file(spBuild+'/dictDecl.txt', spDestGL)
class Entree:
def __init__ (self, sLine):
self.lemma = ''
self.flags = ''
# champs morphologiques Hunspell
|
︙ | | | ︙ | |
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
|
sErr += 'lemme vide'
if re.match(r"^\s", self.lemma):
sErr += 'premier caractère un espace dans <' + self.lemma + '>'
if re.search(r"\s$", self.lemma):
sErr += 'espace en fin de lemme'
if re.match(r"v[0123]", self.po) and not re.match(r"[eas_][ix_][tx_][nx_][pqreuvx_][mx_][ex_z][ax_z]\b", self.po[2:]):
sErr += 'verbe inconnu: ' + self.po
if (re.match(r"S[*.]", self.flags) and re.search("[sxz]$", self.lemma)) or (re.match(r"X[*.]", self.flags) and not re.search("[ul]$", self.lemma)):
sErr += 'drapeau inutile'
if self.iz == '' and re.match(r"[SXAI](?!=)", self.flags) and self.po:
sErr += '[is] vide'
if re.match(r"pl|sg|inv", self.iz):
sErr += '[is] incomplet'
if re.match(r"[FW]", self.flags) and re.search(r"epi|mas|fem|inv|sg|pl", self.iz):
sErr += '[is] incohérent'
if re.match(r".\*", self.flags) and re.match(r"[bcdfgjklmnpqrstvwxz]", self.lemma):
sErr += 'drapeau pour lemme commençant par une voyelle'
if re.search(r"pl|sg|inv", self.iz) and re.match(r"[SXAIFW](?!=)", self.flags):
sErr += '[is] incohérent'
if re.search(r"nom|adj", self.po) and re.match(r"(?i)[aâàäáeéèêëiîïíìoôöóòuûüúù]", self.lemma) and re.match("[SFWXAI][.]", self.flags) \
and "pel" not in self.lx:
sErr += 'le drapeau derait finir avec *'
if self.iz.endswith(("mas", "fem", "epi")) and (not self.flags or not self.flags.startswith(("S", "X", "F", "W", "A", "I", "U"))):
sErr += '[is] incomplet'
if self.flags.startswith(("a", "b", "c", "d")) and not self.lemma.endswith("er"):
sErr += "drapeau pour verbe du 1ᵉʳ groupe sur un lemme non conforme"
if self.flags.startswith("f") and not self.lemma.endswith(("ir", "ïr")):
sErr += "drapeau pour verbe du 2ᵉ groupe sur un lemme non conforme"
if sErr:
echo(' error - id: ' + self.iD, end = "")
echo(' ' + sErr + ' in ' + self.__str__())
|
|
<
<
<
<
<
|
|
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
|
sErr += 'lemme vide'
if re.match(r"^\s", self.lemma):
sErr += 'premier caractère un espace dans <' + self.lemma + '>'
if re.search(r"\s$", self.lemma):
sErr += 'espace en fin de lemme'
if re.match(r"v[0123]", self.po) and not re.match(r"[eas_][ix_][tx_][nx_][pqreuvx_][mx_][ex_z][ax_z]\b", self.po[2:]):
sErr += 'verbe inconnu: ' + self.po
if (re.match(r"S[.]", self.flags) and re.search("[sxz]$", self.lemma)) or (re.match(r"X[.]", self.flags) and not re.search("[ul]$", self.lemma)):
sErr += 'drapeau inutile'
if self.iz == '' and re.match(r"[SXAI](?!=)", self.flags) and self.po:
sErr += '[is] vide'
if re.match(r"pl|sg|inv", self.iz):
sErr += '[is] incomplet'
if re.match(r"[FW]", self.flags) and re.search(r"epi|mas|fem|inv|sg|pl", self.iz):
sErr += '[is] incohérent'
if re.search(r"pl|sg|inv", self.iz) and re.match(r"[SXAIFW](?!=)", self.flags):
sErr += '[is] incohérent'
if self.iz.endswith(("mas", "fem", "epi")) and (not self.flags or not self.flags.startswith(("S", "X", "F", "W", "A", "I", "U"))):
sErr += '[is] incomplet'
if self.flags.startswith(("a0", "b0", "c0", "d0")) and not self.lemma.endswith("er"):
sErr += "drapeau pour verbe du 1ᵉʳ groupe sur un lemme non conforme"
if self.flags.startswith("f") and not self.lemma.endswith(("ir", "ïr")):
sErr += "drapeau pour verbe du 2ᵉ groupe sur un lemme non conforme"
if sErr:
echo(' error - id: ' + self.iD, end = "")
echo(' ' + sErr + ' in ' + self.__str__())
|
︙ | | | ︙ | |
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
|
morph = self.lexMorph()
lFlexions = [(self.lemma, morph, self.di)] if iPR == 0 and not self.flags.endswith('()') else []
lFlexPrefix = []
lFlexSuffix = []
for sFlag in makeLongFlags(self.flags):
if sFlag not in dFlags:
if sFlag not in ['**', '()', '||', '--']:
lFlexions.append( (self.lemma, '[unknown flag: {}]'.format(sFlag)) )
echo("ERROR: " + self.lemma + ' - unknown flag: ' + sFlag)
else:
oFlag = dFlags[sFlag]
if not oFlag.bSfx:
# cas des préfixes
for oRule in oFlag.lRules:
if oRule.motif.search(self.lemma):
|
|
|
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
|
morph = self.lexMorph()
lFlexions = [(self.lemma, morph, self.di)] if iPR == 0 and not self.flags.endswith('()') else []
lFlexPrefix = []
lFlexSuffix = []
for sFlag in makeLongFlags(self.flags):
if sFlag not in dFlags:
if sFlag not in ['**', '()', '||', '--']:
lFlexions.append( (self.lemma, '[unknown flag: {}]'.format(sFlag), self.di) )
echo("ERROR: " + self.lemma + ' - unknown flag: ' + sFlag)
else:
oFlag = dFlags[sFlag]
if not oFlag.bSfx:
# cas des préfixes
for oRule in oFlag.lRules:
if oRule.motif.search(self.lemma):
|
︙ | | | ︙ | |
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
|
def getGrammarCheckerRepr (self):
    """Return the line "flexion<TAB>lemma<TAB>simplified tags" ending with a newline."""
    sTags = self._getSimpleTags()
    return "{}\t{}\t{}\n".format(self.sFlexion, self.oEntry.lemma, sTags)
_dTagReplacement = {
# POS
"nom": ":N", "adj": ":A", "adv": ":W", "negadv": ":X", "mg": ":G", "nb": ":B", "nbro": ":Br",
"loc.nom": ":Ñ", "loc.adj": ":Â", "loc.adv": ":Ŵ", "loc.verb": ":Ṽ",
"interj": ":J", "loc.interj": ":Ĵ", "titr": ":T",
"mas": ":m", "fem": ":f", "epi": ":e", "sg": ":s", "pl": ":p", "inv": ":i",
"infi": ":Y",
"ppre": ":P", "ppas": ":Q",
"ipre": ":Ip", "iimp": ":Iq", "ipsi": ":Is", "ifut": ":If",
"spre": ":Sp", "simp": ":Sq", "cond": ":K", "impe": ":E",
"1sg": ":1s", "1isg": ":1ś", "1jsg": ":1ŝ", "2sg": ":2s", "3sg": ":3s", "1pl": ":1p", "2pl": ":2p", "3pl": ":3p", "3pl!": ":3p!",
"prepv": ":Rv", "prep": ":R", "loc.prep": ":Ŕ", "loc.prepv": "Ŕv",
"detpos": ":Dp", "detdem": ":Dd", "detind": ":Di", "detneg": ":Dn", "detex": ":De", "det": ":D",
"advint": ":U",
"prodem": ":Od", "proind": ":Oi", "proint": ":Ot", "proneg": ":On", "prorel": ":Or", "proadv": ":Ow",
"properobj": ":Oo", "propersuj": ":Os", "1pe": ":O1", "2pe": ":O2", "3pe": ":O3", "preverb": ":Ov",
"cjco": ":Cc", "cjsub": ":Cs", "cj": ":C", "loc.cj": ":Ĉ", "loc.cjsub": ":Ĉs",
"prn": ":M1", "patr": ":M2", "loc.patr": ":Ḿ2", "npr": ":MP", "nompr": ":NM",
"pfx": ":Zp", "sfx": ":Zs",
"div": ":H",
"err": ":F",
"ponc": ":@p", "sign": ":@s",
# LEX
"symb": ";S"
}
def _getSimpleTags (self):
s = ""
# POS
for sTag in self.sMorph.split():
if sTag.startswith("v"):
|
|
|
|
|
|
|
|
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
|
def getGrammarCheckerRepr (self):
    """Return the line "flexion<TAB>lemma<TAB>simplified tags" ending with a newline."""
    sTags = self._getSimpleTags()
    return "{}\t{}\t{}\n".format(self.sFlexion, self.oEntry.lemma, sTags)
_dTagReplacement = {
# POS
"nom": ":N", "adj": ":A", "adv": ":W", "negadv": ":X", "mg": ":G", "nb": ":B", "nbro": ":Br",
"loc.nom": ":ÉN", "loc.adj": ":ÉA", "loc.adv": ":ÉW", "loc.verb": ":ÉV",
"interj": ":J", "loc.interj": ":ÉJ", "titr": ":T",
"mas": ":m", "fem": ":f", "epi": ":e", "sg": ":s", "pl": ":p", "inv": ":i",
"infi": ":Y",
"ppre": ":P", "ppas": ":Q",
"ipre": ":Ip", "iimp": ":Iq", "ipsi": ":Is", "ifut": ":If",
"spre": ":Sp", "simp": ":Sq", "cond": ":K", "impe": ":E",
"1sg": ":1s", "1isg": ":1ś", "1jsg": ":1ŝ", "2sg": ":2s", "3sg": ":3s", "1pl": ":1p", "2pl": ":2p", "3pl": ":3p", "3pl!": ":3p!",
"prepv": ":Rv", "prep": ":R", "loc.prep": ":ÉR", "loc.prepv": "ÉRv",
"detpos": ":Dp", "detdem": ":Dd", "detind": ":Di", "detneg": ":Dn", "detex": ":De", "det": ":D",
"advint": ":U",
"prodem": ":Od", "proind": ":Oi", "proint": ":Ot", "proneg": ":On", "prorel": ":Or", "proadv": ":Ow",
"properobj": ":Oo", "propersuj": ":Os", "1pe": ":O1", "2pe": ":O2", "3pe": ":O3", "preverb": ":Ov",
"cjco": ":Cc", "cjsub": ":Cs", "cj": ":C", "loc.cj": ":ÉC", "loc.cjsub": ":ÉCs",
"prn": ":M1", "patr": ":M2", "loc.patr": ":ÉM2", "npr": ":MP", "nompr": ":NM",
"pfx": ":Zp", "sfx": ":Zs",
"div": ":H",
"err": ":F",
"ponc": ":@p", "sign": ":@s",
# LEX
"symb": ";S", "unit": ";U"
}
def _getSimpleTags (self):
s = ""
# POS
for sTag in self.sMorph.split():
if sTag.startswith("v"):
|
︙ | | | ︙ | |
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
|
def main ():
xParser = argparse.ArgumentParser()
xParser.add_argument("-v", "--verdic", help="set dictionary version, i.e. 5.4", type=str, default="X.Y.z")
xParser.add_argument("-m", "--mode", help="0: no tags, 1: Hunspell tags (default), 2: All tags", type=int, choices=[0, 1, 2], default=1)
xParser.add_argument("-u", "--uncompress", help="do not use Hunspell compression", action="store_true")
xParser.add_argument("-s", "--simplify", help="no virtual lemmas", action="store_true")
xParser.add_argument("-sv", "--spellvariants", help="generate spell variants", action="store_true")
xParser.add_argument("-gl", "--grammalecte", help="copy generated files to Grammalecte folders", action="store_true")
xArgs = xParser.parse_args()
if xArgs.simplify:
xArgs.mode = 0
xArgs.uncompress = True
|
<
|
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
|
def main ():
xParser = argparse.ArgumentParser()
xParser.add_argument("-v", "--verdic", help="set dictionary version, i.e. 5.4", type=str, default="X.Y.z")
xParser.add_argument("-m", "--mode", help="0: no tags, 1: Hunspell tags (default), 2: All tags", type=int, choices=[0, 1, 2], default=1)
xParser.add_argument("-u", "--uncompress", help="do not use Hunspell compression", action="store_true")
xParser.add_argument("-s", "--simplify", help="no virtual lemmas", action="store_true")
xParser.add_argument("-gl", "--grammalecte", help="copy generated files to Grammalecte folders", action="store_true")
xArgs = xParser.parse_args()
if xArgs.simplify:
xArgs.mode = 0
xArgs.uncompress = True
|
︙ | | | ︙ | |
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
|
spBuild = BUILD_PATH + '/' + xArgs.verdic
dir_util.mkpath(spBuild)
### Lecture des fichiers et création du dictionnaire
oFrenchDict = Dictionnaire(xArgs.verdic, "French dictionary")
for sFile in ['orthographe/FRANCAIS.dic']:
oFrenchDict.readDictionary(sFile)
oFrenchDict.readAffixes('orthographe/FRANCAIS_5.aff')
### Contrôle
oFrenchDict.sortEntriesNatural()
oFrenchDict.checkEntries()
### Lexique
oFrenchDict.generateFlexions()
oFrenchDict.calcMetagraphe()
oFrenchDict.calcMetaphone2()
#oFrenchDict.createNgrams(spBuild, 3)
if xArgs.spellvariants:
oFrenchDict.generateSpellVariants(1, spBuild)
### Statistiques
spfStats = spBuild+'/'+STATS_NAME+xArgs.verdic+'.txt'
oStatsLex = StatsLex(oFrenchDict)
oStatsLex.addLexFromFile('lexique/corpus_data/stats_google_ngram_1.txt', 'G', 'Google 1-grams')
oStatsLex.addLexFromFile('lexique/corpus_data/stats_frwiki.txt', 'W', 'Wikipédia')
oStatsLex.addLexFromFile('lexique/corpus_data/stats_frwikisource.txt', 'S', 'Wikisource')
|
|
<
<
|
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
|
spBuild = BUILD_PATH + '/' + xArgs.verdic
dir_util.mkpath(spBuild)
### Lecture des fichiers et création du dictionnaire
oFrenchDict = Dictionnaire(xArgs.verdic, "French dictionary")
for sFile in ['orthographe/FRANCAIS.dic']:
oFrenchDict.readDictionary(sFile)
oFrenchDict.readAffixes('orthographe/FRANCAIS_7.aff')
### Contrôle
oFrenchDict.sortEntriesNatural()
oFrenchDict.checkEntries()
### Lexique
oFrenchDict.generateFlexions()
oFrenchDict.calcMetagraphe()
oFrenchDict.calcMetaphone2()
#oFrenchDict.createNgrams(spBuild, 3)
### Statistiques
spfStats = spBuild+'/'+STATS_NAME+xArgs.verdic+'.txt'
oStatsLex = StatsLex(oFrenchDict)
oStatsLex.addLexFromFile('lexique/corpus_data/stats_google_ngram_1.txt', 'G', 'Google 1-grams')
oStatsLex.addLexFromFile('lexique/corpus_data/stats_frwiki.txt', 'W', 'Wikipédia')
oStatsLex.addLexFromFile('lexique/corpus_data/stats_frwikisource.txt', 'S', 'Wikisource')
|
︙ | | | ︙ | |