︙ | | |
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
|
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
|
-
+
-
-
+
+
|
hDst.write(" > {0[1]:>8} : {0[0]}\n".format(elem))
def writeDictionary (self, spDst, dTplVars, nMode, bSimplified):
    """Write the dictionary file (.dic).

    The file is created at ``spDst/<asciiName>.dic`` (UTF-8, "\\n" line
    endings).  Its first line is the number of entries written; each
    following chunk is the text returned by ``oEntry.getHunspellLine``.

    Only entries whose ``di`` is listed in ``dTplVars['subDicts']`` and whose
    lemma contains no space are written.

    Parameters:
        spDst: destination folder path (string).
        dTplVars: mapping providing at least 'asciiName' and 'subDicts'.
        nMode: forwarded unchanged to ``getHunspellLine``.
        bSimplified: forwarded unchanged to ``getHunspellLine``.
    """
    echo(' * Dictionnaire >> [ {}.dic ] ({})'.format(dTplVars['asciiName'], dTplVars['subDicts']))
    # Select the entries once, so that the count written on the first line
    # can never disagree with the number of entries actually written.
    lSelected = [
        oEntry
        for oEntry in self.lEntry
        if oEntry.di in dTplVars['subDicts'] and " " not in oEntry.lemma
    ]
    with open(spDst+'/'+dTplVars['asciiName']+'.dic', 'w', encoding='utf-8', newline="\n") as hDst:
        hDst.write(str(len(lSelected))+"\n")
        for oEntry in lSelected:
            hDst.write(oEntry.getHunspellLine(self, nMode, bSimplified))
def writeAffixes (self, spDst, dTplVars, nMode, bSimplified):
"Écrire le fichier des affixes (.aff)"
echo(' * Dictionnaire >> [ {}.aff ]'.format(dTplVars['asciiName']))
info = "# This Source Code Form is subject to the terms of the Mozilla Public\n" + \
"# License, v. 2.0. If a copy of the MPL was not distributed with this\n" + \
"# file, You can obtain one at http://mozilla.org/MPL/2.0/.\n\n" + \
|
︙ | | |
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
|
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
|
-
+
-
+
|
self.iD = '0'
# autres
self.comment = ''
self.err = ''
self.nFlexions = 0
self.lFlexions = []
self.sRadical = ''
self.sStem = ''
self.nOccur = 0
self.nAKO = -1 # Average known occurrences
self.fFreq = 0
self.oldFq = ''
sLine = sLine.rstrip(" \n")
# commentaire
if '#' in sLine:
sLine, comment = sLine.split('#', 1)
self.comment = comment.strip()
# éléments de la ligne
elems = sLine.split()
elems = sLine.split("\t")
nElems = len(elems)
# lemme et drapeaux
firstElems = elems[0].split('/')
self.lemma = firstElems[0]
self.flags = firstElems[1] if len(firstElems) > 1 else ''
# morph
for i in range(1, nElems):
|
︙ | | |
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
|
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
|
-
+
-
+
-
+
-
+
|
if re.search(r"\s$", self.lemma):
sErr += 'espace en fin de lemme'
if re.match(r"v[0123]", self.po) and not re.match(r"[eas_][ix_][tx_][nx_][pqreuvx_][mx_][ex_z][ax_z]\b", self.po[2:]):
sErr += 'verbe inconnu: ' + self.po
if (re.match(r"S[*.]", self.flags) and re.search("[sxz]$", self.lemma)) or (re.match(r"X[*.]", self.flags) and not re.search("[ul]$", self.lemma)):
sErr += 'drapeau inutile'
if self.iz == '' and re.match(r"[SXAI](?!=)", self.flags) and self.po:
sErr += '[is]'
sErr += '[is] vide'
if re.match(r"pl|sg|inv", self.iz):
sErr += '[is]'
sErr += '[is] incomplet'
if re.match(r"[FW]", self.flags) and re.search(r"epi|mas|fem|inv|sg|pl", self.iz):
sErr += '[is]'
sErr += '[is] incohérent'
if re.match(r"[FW]", self.flags) and re.search(r"[^eë]$", self.lemma):
sErr += "fin de lemme inapproprié"
if re.match(r".\*", self.flags) and re.match(r"[bcdfgjklmnpqrstvwxz]", self.lemma):
sErr += 'drapeau pour lemme commençant par une voyelle'
if re.search(r"pl|sg|inv", self.iz) and re.match(r"[SXAIFW](?!=)", self.flags):
sErr += '[is]'
sErr += '[is] incohérent'
if re.search(r"nom|adj", self.po) and re.match(r"(?i)[aâàäáeéèêëiîïíìoôöóòuûüúù]", self.lemma) and re.match("[SFWXAI][.]", self.flags) \
and "pel" not in self.lx:
sErr += 'le drapeau derait finir avec *'
if not self.flags and self.iz.endswith(("mas", "fem", "epi")):
sErr += '[is] incomplet'
if self.flags.startswith(("a", "b", "c", "d")) and not self.lemma.endswith("er"):
sErr += "drapeau pour verbe du 1ᵉʳ groupe sur un lemme non conforme"
|
︙ | | |
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
|
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
|
-
-
+
+
|
def keyTriNat (self):
    """Return a sort key: (lemma translated through CHARMAP, flags, po).

    CHARMAP is a module-level translation table defined outside this block;
    presumably it folds accented characters for collation -- verify at its
    definition.
    """
    sFolded = self.lemma.translate(CHARMAP)
    return (sFolded, self.flags, self.po)
def keyTriNum (self):
    """Return a sort key built from the raw values: (lemma, flags, po).

    Unlike keyTriNat, the lemma is used as is, so ordering follows plain
    string comparison.
    """
    return (self.lemma, self.flags, self.po)
def getEntryLine (self, oDict, nMode, bSimplified=False):
sLine = self.lemma
def getHunspellLine (self, oDict, nMode, bSimplified=False):
sLine = self.lemma.replace("’", "'")
if self.flags:
sLine += '/'
sLine += self.flags if not oDict.bShortenTags or bSimplified else oDict.dAF[self.flags]
if bSimplified:
return sLine.replace("()", "") + "\n"
if nMode > 0:
sMorph = self.getMorph(nMode)
|
︙ | | |
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
|
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
|
+
+
+
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
|
if not sMorph.endswith((" mas", " fem", " epi")):
self.lFlexions.append( Flexion(self, sFlex, sMorph, sDic) )
self.nFlexions += 1
else:
#echo(sFlex + " " + sMorph + ", ")
pass
# Drapeaux dont le lemme féminin doit être remplacé par le masculin dans la gestion des formes fléchies
if self.st:
self.sStem = self.st
else:
if self.flags.startswith(("F.", "F*", "W.", "W*")):
# recherche de la forme masculine
for t in lTuples:
sMorph = self.clean(t[1])
if sMorph.endswith('mas') or sMorph.endswith('mas sg') or sMorph.endswith('mas inv'):
self.sRadical = t[0]
else:
self.sRadical = self.lemma
if self.flags.startswith(("F.", "F*", "W.", "W*")):
# recherche de la forme masculine
for t in lTuples:
sMorph = self.clean(t[1])
if sMorph.endswith(('mas', 'mas sg', 'mas inv')):
self.sStem = t[0]
else:
self.sStem = self.lemma
# Tag duplicates
d = {}
for oFlex in self.lFlexions:
d[oFlex.sFlexion] = d.get(oFlex.sFlexion, 0) + 1
for oFlex in self.lFlexions:
oFlex.nDup = d[oFlex.sFlexion]
|
︙ | | |
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
|
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
|
-
+
|
sOccurs += t[1] + "\t"
return "id\tFlexion\tLemme\tÉtiquettes\tMétagraphe (β)\tMetaphone2\tNotes\tSémantique\tÉtymologie\tSous-dictionnaire\t" + sOccurs + "Total occurrences\tDoublons\tMultiples\tFréquence\tIndice de fréquence\n"
def __str__ (self, oStatsLex):
    """Return this flexion as one tab-separated record ending with a newline.

    NOTE: the extra required parameter means ``str(obj)`` cannot be used;
    the method has to be called explicitly with the statistics object.

    Parameters:
        oStatsLex: object whose ``dFlexions`` mapping gives, for
            ``self.sFlexion``, an iterable of values; each one is emitted as
            its own column.

    Columns, in order: entry id, flexion, entry stem, morphology, metagfx,
    metaph2, entry lx, se, et, entry di (followed by "/<cDic>" unless cDic
    is "*"), the per-source values, nOccur, nDup, nMulti, fFreq with
    15 decimals, cFq.
    """
    sOccurs = "".join(str(v) + "\t"  for v in oStatsLex.dFlexions[self.sFlexion])
    sDicSuffix = "/" + self.cDic  if self.cDic != "*"  else ""
    # The stem column reads oEntry.sStem: it is the attribute assigned on
    # every path (including when the entry carries an explicit `st`).
    return "{0.oEntry.iD}\t{0.sFlexion}\t{0.oEntry.sStem}\t{0.sMorph}\t{0.metagfx}\t{0.metaph2}\t{0.oEntry.lx}\t{0.oEntry.se}\t{0.oEntry.et}\t{0.oEntry.di}{2}\t{1}{0.nOccur}\t{0.nDup}\t{0.nMulti}\t{0.fFreq:.15f}\t{0.cFq}\n".format(self, sOccurs, sDicSuffix)
@classmethod
def simpleHeader (cls):
    """Return the one-line comment header naming the tag prefixes.

    The returned text is ``# :POS ;LEX ~SEM =FQ /DIC`` followed by a newline.
    """
    return "# :POS ;LEX ~SEM =FQ /DIC\n"
def getGrammarCheckerRepr (self):
    """Return "<flexion>\\t<lemma>\\t<tags>\\n" for this flexion.

    The lemma comes from the owning entry (``self.oEntry.lemma``) and the
    tags are whatever ``self._getSimpleTags()`` returns.
    """
    sTags = self._getSimpleTags()
    return "{0.sFlexion}\t{0.oEntry.lemma}\t{1}\n".format(self, sTags)
|
︙ | | |
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
|
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
|
-
+
|
"ipre": ":Ip", "iimp": ":Iq", "ipsi": ":Is", "ifut": ":If",
"spre": ":Sp", "simp": ":Sq", "cond": ":K", "impe": ":E",
"1sg": ":1s", "1isg": ":1ś", "1jsg": ":1ŝ", "2sg": ":2s", "3sg": ":3s", "1pl": ":1p", "2pl": ":2p", "3pl": ":3p", "3pl!": ":3p!",
"prepv": ":Rv", "prep": ":R", "loc.prep": ":Ŕ",
"detpos": ":Dp", "detdem": ":Dd", "detind": ":Di", "detneg": ":Dn", "detex": ":De", "det": ":D",
"advint": ":U",
"prodem": ":Od", "proind": ":Oi", "proint": ":Ot", "proneg": ":On", "prorel": ":Or", "proadv": ":Ow",
"properobj": ":Oo", "propersuj": ":Os", "1pe": ":O1", "2pe": ":O2", "3pe": ":O3",
"properobj": ":Oo", "propersuj": ":Os", "1pe": ":O1", "2pe": ":O2", "3pe": ":O3", "preverb": ":Ov",
"cjco": ":Cc", "cjsub": ":Cs", "cj": ":C", "loc.cj": ":Ĉ", "loc.cjsub": ":Ĉs",
"prn": ":M1", "patr": ":M2", "loc.patr": ":Ḿ2", "npr": ":MP", "nompr": ":NM",
"pfx": ":Zp", "sfx": ":Zs",
"div": ":H",
"err": ":#",
# LEX
"symb": ";S"
|
︙ | | |
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
|
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
|
-
+
-
+
|
s += "/" + self.oEntry.di
return s
def keyTriNat (self):
    """Return a sort key: (flexion translated through CHARMAP, morphology).

    CHARMAP is a module-level translation table defined outside this block;
    presumably it folds accented characters for collation -- verify at its
    definition.
    """
    sFolded = self.sFlexion.translate(CHARMAP)
    return (sFolded, self.sMorph)
def keyFreq (self):
    """Return a sort key: (100 - fFreq, entry stem, flexion).

    Subtracting the frequency from 100 makes higher frequencies sort first
    in ascending order; ties are broken by the entry's stem, then by the
    flexion itself.
    """
    # Use oEntry.sStem: it is the attribute assigned on every path, whereas
    # sRadical is left empty when the entry carries an explicit `st`.
    return (100-self.fFreq, self.oEntry.sStem, self.sFlexion)
def keyOcc (self):
    """Return a sort key: (occurrence count, entry stem, flexion).

    Ties on the occurrence count are broken by the entry's stem, then by
    the flexion itself.
    """
    # Use oEntry.sStem: it is the attribute assigned on every path, whereas
    # sRadical is left empty when the entry carries an explicit `st`.
    return (self.nOccur, self.oEntry.sStem, self.sFlexion)
def keyIdx (self):
    """Return the owning entry's identifier (``self.oEntry.iD``) as sort key."""
    return self.oEntry.iD
def keyFlexion (self):
    """Return the flexion string itself (``self.sFlexion``) as sort key."""
    return self.sFlexion
|
︙ | | |
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
|
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
|
-
|
hDst.write(str(t)+"\n")
for e in self.dFlexions.items():
hDst.write("{} - {}\n".format(e[0], e[1]))
def main ():
xParser = argparse.ArgumentParser()
xParser.add_argument("-v", "--verdic", help="set dictionary version, i.e. 5.4", type=str, default="X.Y.z")
xParser.add_argument("-m", "--mode", help="0: no tags, 1: Hunspell tags (default), 2: All tags", type=int, choices=[0, 1, 2], default=1)
xParser.add_argument("-u", "--uncompress", help="do not use Hunspell compression", action="store_true")
xParser.add_argument("-s", "--simplify", help="no virtual lemmas", action="store_true")
xParser.add_argument("-sv", "--spellvariants", help="generate spell variants", action="store_true")
xParser.add_argument("-gl", "--grammalecte", help="copy generated files to Grammalecte folders", action="store_true")
|
︙ | | |
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
|
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
|
-
+
|
oFrenchDict.calculateStats(oStatsLex, spfStats)
### écriture des paquets
echo("Création des paquets...")
spLexiconDestGL = "../../../lexicons" if xArgs.grammalecte else ""
spLibreOfficeExtDestGL = "../oxt/Dictionnaires/dictionaries" if xArgs.grammalecte else ""
spMozillaExtDestGL = "" # les dictionnaires pour Hunspell ne sont plus utilisés pour l’instant dans Firefox / Thunderbird
spMozillaExtDestGL = "" if xArgs.grammalecte else "" # no more Hunspell dictionaries in Mozilla extensions for now
spDataDestGL = "../data" if xArgs.grammalecte else ""
if not xArgs.uncompress:
oFrenchDict.defineAbreviatedTags(xArgs.mode, spfStats)
oFrenchDict.createFiles(spBuild, [dMODERNE, dTOUTESVAR, dCLASSIQUE, dREFORME1990], xArgs.mode, xArgs.simplify)
oFrenchDict.createLexiconPackages(spBuild, xArgs.verdic, oStatsLex, spLexiconDestGL)
oFrenchDict.createFileIfqForDB(spBuild)
|
︙ | | |