︙ | | |
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
|
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
|
-
+
|
dVars['version'] = self.sVersion
# Dictionaries files (.dic) (.aff)
self.writeAffixes(spDic, dVars, nMode, bSimplified)
self.writeDictionary(spDic, dVars, nMode, bSimplified)
copyTemplate('orthographe', spDic, 'README_dict_fr.txt', dVars)
createZipFiles(spDic, spDst, sDicName + '.zip')
def createLibreOfficeExtension (self, spBuild, dTplVars, lDictVars, spGL):
def createLibreOfficeExtension (self, spBuild, dTplVars, lDictVars, spDestGL=""):
# LibreOffice extension
echo(" * Dictionnaire >> extension pour LibreOffice")
dTplVars['version'] = self.sVersion
sExtensionName = EXT_PREFIX_OOO + self.sVersion
spExt = spBuild + '/' + sExtensionName
dir_util.mkpath(spExt+'/META-INF')
dir_util.mkpath(spExt+'/ui')
|
︙ | | |
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
|
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
|
-
+
-
+
-
+
+
-
-
-
-
+
+
+
+
-
+
+
-
-
+
+
-
+
+
-
-
+
+
-
+
+
-
-
+
+
|
file_util.copy_file('césures/frhyph.tex', spExt+'/dictionaries')
file_util.copy_file('césures/hyph-fr.tex', spExt+'/dictionaries')
file_util.copy_file('césures/README_hyph_fr-3.0.txt', spExt+'/dictionaries')
file_util.copy_file('césures/README_hyph_fr-2.9.txt', spExt+'/dictionaries')
# zip
createZipFiles(spExt, spBuild, sExtensionName + '.oxt')
# copy to Grammalecte Project
if spGL:
if spDestGL:
echo(" extension copiée dans Grammalecte...")
dir_util.copy_tree(spExt+'/dictionaries', spGL)
dir_util.copy_tree(spExt+'/dictionaries', spDestGL)
def createMozillaExtensions (self, spBuild, dTplVars, lDictVars, spDestGL):
def createMozillaExtensions (self, spBuild, dTplVars, lDictVars, spDestGL=""):
# Mozilla extension 1
echo(" * Dictionnaire >> extension pour Mozilla")
dTplVars['version'] = self.sVersion
sExtensionName = EXT_PREFIX_MOZ + self.sVersion
spExt = spBuild + '/' + sExtensionName
dir_util.mkpath(spExt+'/dictionaries')
copyTemplate('_templates/moz', spExt, 'install.rdf', dTplVars)
spDict = spBuild + '/' + PREFIX_DICT_PATH + self.sVersion
file_util.copy_file(spDict+'/fr-classique.dic', spExt+'/dictionaries/fr-classic.dic')
file_util.copy_file(spDict+'/fr-classique.aff', spExt+'/dictionaries/fr-classic.aff')
copyTemplate('orthographe', spExt, 'README_dict_fr.txt', dTplVars)
createZipFiles(spExt, spBuild, sExtensionName + '.xpi')
# Grammalecte
if spDestGL:
echo(" * Dictionnaire >> copie des dicos dans Grammalecte")
for dVars in lDictVars:
file_util.copy_file(spDict+'/'+dVars['asciiName']+'.dic', spDestGL+'/'+dVars['mozAsciiName']+"/"+dVars['mozAsciiName']+'.dic')
file_util.copy_file(spDict+'/'+dVars['asciiName']+'.aff', spDestGL+'/'+dVars['mozAsciiName']+"/"+dVars['mozAsciiName']+'.aff')
echo(" * Dictionnaire >> copie des dicos dans Grammalecte")
for dVars in lDictVars:
file_util.copy_file(spDict+'/'+dVars['asciiName']+'.dic', spDestGL+'/'+dVars['mozAsciiName']+"/"+dVars['mozAsciiName']+'.dic')
file_util.copy_file(spDict+'/'+dVars['asciiName']+'.aff', spDestGL+'/'+dVars['mozAsciiName']+"/"+dVars['mozAsciiName']+'.aff')
def createFileIfqForDB (self, spBuild):
echo(" * Dictionnaire >> indices de fréquence pour la DB...")
with open(spBuild+'/dictIdxIfq-'+self.sVersion+'.diff.txt', 'w', encoding='utf-8', newline="\n") as hDiff, \
open(spBuild+'/dictIdxIfq-'+self.sVersion+'.notes.txt', 'w', encoding='utf-8', newline="\n") as hNotes:
for oEntry in self.lEntry:
if oEntry.fq != oEntry.oldFq:
hDiff.write("{0.iD}\t{0.fq}\n".format(oEntry))
hNotes.write("{0.lemma}/{0.flags}\t{0.oldFq} > {0.fq}\n".format(oEntry))
def createLexiconPackages (self, spBuild, version, oStatsLex, spLexGL):
def createLexiconPackages (self, spBuild, version, oStatsLex, spDestGL=""):
sLexName = LEX_PREFIX + version
spLex = spBuild + '/' + sLexName
dir_util.mkpath(spLex)
# write Dicollecte lexicon
self.sortLexiconByFreq()
self.writeLexicon(spLex + '/' + sLexName + '.txt', version, oStatsLex)
self.writeGrammarCheckerLexicon(spBuild + '/' + sLexName + '.lex', version)
copyTemplate('lexique', spLex, 'README_lexique.txt', {'version': version})
# zip
createZipFiles(spLex, spBuild, sLexName + '.zip')
# copy GC lexicon to Grammalecte
if spDestGL:
file_util.copy_file(spBuild + '/' + sLexName + '.lex', spLexGL + '/French.lex')
file_util.copy_file('lexique/French.tagset.txt', spLexGL)
file_util.copy_file(spBuild + '/' + sLexName + '.lex', spDestGL + '/French.lex')
file_util.copy_file('lexique/French.tagset.txt', spDestGL)
def createDictConj (self, spBuild, spCopy):
def createDictConj (self, spBuild, spDestGL=""):
echo(" * Dictionnaire >> fichier de conjugaison...")
with open(spBuild+'/dictConj.txt', 'w', encoding='utf-8', newline="\n") as hDst:
for oEntry in self.lEntry:
if oEntry.po.startswith("v"):
hDst.write(oEntry.getConjugation())
if spDestGL:
echo(" Fichier de conjugaison copié dans Grammalecte...")
file_util.copy_file(spBuild+'/dictConj.txt', spCopy)
echo(" Fichier de conjugaison copié dans Grammalecte...")
file_util.copy_file(spBuild+'/dictConj.txt', spDestGL)
def createDictDecl (self, spBuild, spCopy):
def createDictDecl (self, spBuild, spDestGL=""):
echo(" * Dictionnaire >> fichier de déclinaison...")
with open(spBuild+'/dictDecl.txt', 'w', encoding='utf-8', newline="\n") as hDst:
for oEntry in self.lEntry:
if re.match("[SXFWIA]", oEntry.flags) and (oEntry.po.startswith("nom") or oEntry.po.startswith("adj")):
hDst.write(oEntry.getDeclination())
if spDestGL:
echo(" Fichier de déclinaison copié dans Grammalecte...")
file_util.copy_file(spBuild+'/dictDecl.txt', spCopy)
echo(" Fichier de déclinaison copié dans Grammalecte...")
file_util.copy_file(spBuild+'/dictDecl.txt', spDestGL)
def generateSpellVariants (self, nReq, spBuild):
if nReq < 1: nReq = 1
if nReq > 2: nReq = 2
echo(" * Lexique >> variantes par suppression... n = " + str(nReq))
with open(spBuild+'/dictSpellVariants-'+str(nReq)+'.txt', 'w', encoding='utf-8', newline="\n") as hDst:
for oFlex in frozenset(self.lFlexions):
|
︙ | | |
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
|
809
810
811
812
813
814
815
816
817
818
819
820
821
822
|
-
-
-
|
if self.err:
echo("\n## Erreur dans le dictionnaire : {}".format(self.err))
echo(" dans : " + self.lemma)
def __str__ (self):
return "{0.lemma}/{0.flags} {1}".format(self, self.getMorph(2))
def display (self):
echo(self.__str__())
def check (self):
sErr = ''
if self.lemma == '':
sErr += 'lemme vide'
if not re.match(r"[a-zA-ZéÉôÔàâÂîÎïèÈêÊÜœŒæÆçÇ0-9µåÅΩ&αβγδεζηθικλμνξοπρστυφχψωΔℓΩ_]", self.lemma):
sErr += 'premier caractère inconnu: ' + self.lemma[0]
if re.search(r"\s$", self.lemma):
|
︙ | | |
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
|
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
|
-
+
|
# moyenne des formes fléchies sans équivalent ou -1
self.nAKO = math.ceil(nOccur / nFlex) if nFlex > 0 else -1
def solveOccurMultipleFlexions (self, hDst, oStatsLex):
sBlank = " "
if self.nAKO >= 0:
for oFlex in self.lFlexions:
if oFlex.nMulti > 0 and not oFlex.bFixed:
if oFlex.nMulti > 0 and not oFlex.bBlocked:
# on trie les entrées avec AKO et sans AKO
lEntWithAKO = []
lEntNoAKO = []
for oEntry in oFlex.lMulti:
if oEntry.nAKO >= 0:
lEntWithAKO.append(oEntry)
else:
|
︙ | | |
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
|
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
|
-
+
-
+
-
+
-
+
-
+
-
+
-
+
+
+
+
-
+
|
if nDiff > 0:
# on peut passer à les formes fléchies à AKO
hDst.write(" * {0.sFlexion}\n".format(oFlex))
hDst.write(" moyenne connue\n")
for oFlexD in self.lFlexions:
if oFlex.sFlexion == oFlexD.sFlexion:
hDst.write(sBlank + "{2:<30} {0.sMorph:<30} {0.nOccur:>10} >> {1:>10}\n".format(oFlexD, self.nAKO, self.getShortDescr()))
oFlexD.setOccur(self.nAKO)
oFlexD.setOccurAndBlock(self.nAKO)
for oEntry in lEntWithAKO:
hDst.write(" moyenne connue\n")
for oFlexM in oEntry.lFlexions:
if oFlex.sFlexion == oFlexM.sFlexion:
hDst.write(sBlank + "{2:<30} {0.sMorph:<30} {0.nOccur:>10} >> {1:>10}\n".format(oFlexM, oEntry.nAKO, oEntry.getShortDescr()))
oFlexM.setOccur(oEntry.nAKO)
oFlexM.setOccurAndBlock(oEntry.nAKO)
# on répercute nDiff sur les flexions sans AKO
for oEntry in lEntNoAKO:
hDst.write(" sans moyenne connue\n")
for oFlexM in oEntry.lFlexions:
if oFlex.sFlexion == oFlexM.sFlexion:
nNewOccur = oFlexM.nOccur + math.ceil((nDiff / len(lEntNoAKO)) / oFlexM.nDup)
hDst.write(sBlank + "{2:<30} {0.sMorph:<30} {0.nOccur:>10} +> {1:>10}\n".format(oFlexM, nNewOccur, oEntry.getShortDescr()))
oFlexM.setOccur(nNewOccur)
oFlexM.setOccurAndBlock(nNewOccur)
else:
# Toutes les entrées sont avec AKO : on pondère
nFlexOccur = oStatsLex.getFlexionOccur(oFlex.sFlexion)
nTotAKO = self.nAKO
for oEnt in oFlex.lMulti:
nTotAKO += oEnt.nAKO
hDst.write(" = {0.sFlexion}\n".format(oFlex))
hDst.write(" moyennes connues\n")
for oFlexD in self.lFlexions:
if oFlex.sFlexion == oFlexD.sFlexion:
nNewOccur = math.ceil((nFlexOccur * (self.nAKO / nTotAKO)) / oFlexD.nDup) if nTotAKO else 0
hDst.write(sBlank + "{2:<30} {0.sMorph:<30} {0.nOccur:>10} %> {1:>10}\n".format(oFlexD, nNewOccur, self.getShortDescr()))
oFlexD.setOccur(nNewOccur)
oFlexD.setOccurAndBlock(nNewOccur)
for oEntry in oFlex.lMulti:
for oFlexM in oEntry.lFlexions:
if oFlex.sFlexion == oFlexM.sFlexion:
nNewOccur = math.ceil((nFlexOccur * (oEntry.nAKO / nTotAKO)) / oFlexM.nDup) if nTotAKO else 0
hDst.write(sBlank + "{2:<30} {0.sMorph:<30} {0.nOccur:>10} %> {1:>10}\n".format(oFlexM, nNewOccur, oEntry.getShortDescr()))
oFlexM.setOccur(nNewOccur)
oFlexM.setOccurAndBlock(nNewOccur)
def calcFreq (self, nTot):
self.fFreq = (self.nOccur * 100) / nTot
self.oldFq = self.fq
self.fq = getIfq(self.fFreq)
class Flexion:
def __init__ (self, oEntry, sFlex='', sMorph='', cDic=''):
self.oEntry = oEntry
self.sFlexion = sFlex
self.sMorph = sMorph
self.cDic = cDic
self.nOccur = 0
self.bFixed = False
self.bBlocked = False
self.nDup = 0 # duplicates in the same entry
self.nMulti = 0 # duplicates with other entries
self.lMulti = [] # list of similar flexions
self.fFreq = 0
self.cFq = ''
self.metagfx = '' # métagraphe
self.metaph2 = '' # métaphone 2
def setOccur (self, n):
self.nOccur = n
def setOccurAndBlock (self, n):
self.nOccur = n
self.bFixed = True
self.bBlocked = True
def calcOccur (self):
self.nOccur = math.ceil((self.nOccur / (self.nMulti+1)) / self.nDup)
def calcFreq (self, nTot):
self.fFreq = (self.nOccur * 100) / nTot
self.cFq = getIfq(self.fFreq)
|
︙ | | |
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
|
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
|
-
-
-
|
def __str__ (self, oStatsLex):
sOccurs = ''
for v in oStatsLex.dFlexions[self.sFlexion]:
sOccurs += str(v) + "\t"
return "{0.oEntry.iD}\t{0.sFlexion}\t{0.oEntry.sRadical}\t{0.sMorph}\t{0.metagfx}\t{0.metaph2}\t{0.oEntry.lx}\t{0.oEntry.se}\t{0.oEntry.et}\t{0.oEntry.di}{2}\t{1}{0.nOccur}\t{0.nDup}\t{0.nMulti}\t{0.fFreq:.15f}\t{0.cFq}\n".format(self, sOccurs, "/"+self.cDic if self.cDic != "*" else "")
def display (self):
echo(self.__str__())
@classmethod
def simpleHeader (cls):
return "# :POS ;LEX ~SEM =FQ /DIC\n"
def getGrammarCheckerRepr (self):
return "{0.sFlexion}\t{0.oEntry.lemma}\t{1}\n".format(self, self._getSimpleTags())
|
︙ | | |
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
|
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
|
+
|
xParser = argparse.ArgumentParser()
xParser.add_argument("-v", "--verdic", help="set dictionary version, i.e. 5.4", type=str, default="X.Y.z")
xParser.add_argument("-m", "--mode", help="0: no tags, 1: Hunspell tags (default), 2: All tags", type=int, choices=[0, 1, 2], default=1)
xParser.add_argument("-u", "--uncompress", help="do not use Hunspell compression", action="store_true")
xParser.add_argument("-s", "--simplify", help="no virtual lemmas", action="store_true")
xParser.add_argument("-sv", "--spellvariants", help="generate spell variants", action="store_true")
xParser.add_argument("-gl", "--grammalecte", help="copy generated files to Grammalecte folders", action="store_true")
xArgs = xParser.parse_args()
if xArgs.simplify:
xArgs.mode = 0
xArgs.uncompress = True
echo("Python: " + sys.version)
|
︙ | | |
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
|
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
|
+
+
+
+
+
+
-
-
-
+
+
+
-
-
+
+
|
oStatsLex.addLexFromFile('lexique/corpus_data/stats_frwikisource.txt', 'S', 'Wikisource')
oStatsLex.addLexFromFile('lexique/corpus_data/stats_litterature.txt', 'L', 'Littérature')
oStatsLex.write(spBuild+'/test_lex.txt')
oFrenchDict.calculateStats(oStatsLex, spfStats)
### écriture des paquets
echo("Création des paquets...")
spLexiconDestGL = "../../../lexicons" if xArgs.grammalecte else ""
spLibreOfficeExtDestGL = "../oxt/Dictionnaires/dictionaries" if xArgs.grammalecte else ""
spMozillaExtDestGL = "../xpi/data/dictionaries" if xArgs.grammalecte else ""
spDataDestGL = "../data" if xArgs.grammalecte else ""
if not xArgs.uncompress:
oFrenchDict.defineAbreviatedTags(xArgs.mode, spfStats)
oFrenchDict.createFiles(spBuild, [dMODERNE, dTOUTESVAR, dCLASSIQUE, dREFORME1990], xArgs.mode, xArgs.simplify)
oFrenchDict.createLibreOfficeExtension(spBuild, dMOZEXT, [dMODERNE, dTOUTESVAR, dCLASSIQUE, dREFORME1990], "../oxt/Dictionnaires/dictionaries")
oFrenchDict.createMozillaExtensions(spBuild, dMOZEXT, [dMODERNE, dTOUTESVAR, dCLASSIQUE, dREFORME1990], "../xpi/data/dictionaries")
oFrenchDict.createLexiconPackages(spBuild, xArgs.verdic, oStatsLex, "../../../lexicons")
oFrenchDict.createLexiconPackages(spBuild, xArgs.verdic, oStatsLex, spLexiconDestGL)
oFrenchDict.createFileIfqForDB(spBuild)
oFrenchDict.createLibreOfficeExtension(spBuild, dMOZEXT, [dMODERNE, dTOUTESVAR, dCLASSIQUE, dREFORME1990], spLibreOfficeExtDestGL)
oFrenchDict.createMozillaExtensions(spBuild, dMOZEXT, [dMODERNE, dTOUTESVAR, dCLASSIQUE, dREFORME1990], spMozillaExtDestGL)
oFrenchDict.createDictConj(spBuild, "../data")
oFrenchDict.createDictDecl(spBuild, "../data")
oFrenchDict.createDictConj(spBuild, spDataDestGL)
oFrenchDict.createDictDecl(spBuild, spDataDestGL)
if __name__ == '__main__':
main()
|