1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
|
#!python3
# FRENCH DATA BUILDER
#
# by Olivier R.
# License: MPL 2
import json
import os
import grammalecte.ibdawg as ibdawg
from grammalecte.echo import echo
from grammalecte.str_transform import defineSuffixCode
import grammalecte.fr.conj as conj
class cd:
"""Context manager for changing the current working directory"""
def __init__ (self, newPath):
self.newPath = os.path.expanduser(newPath)
def __enter__ (self):
self.savedPath = os.getcwd()
os.chdir(self.newPath)
def __exit__ (self, etype, value, traceback):
os.chdir(self.savedPath)
def readFile (spf):
if os.path.isfile(spf):
with open(spf, "r", encoding="utf-8") as hSrc:
for sLine in hSrc:
sLine = sLine.strip()
if sLine and not sLine.startswith("#"):
yield sLine
else:
raise OSError("# Error. File not found or not loadable: " + spf)
def makeDictionaries (sp, sVersion):
|
>
>
>
>
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
|
#!python3
# FRENCH DATA BUILDER
#
# by Olivier R.
# License: MPL 2
import json
import os
import itertools
import grammalecte.ibdawg as ibdawg
from grammalecte.echo import echo
from grammalecte.str_transform import defineSuffixCode
import grammalecte.fr.conj as conj
import grammalecte.tokenizer as tkz
class cd:
"""Context manager for changing the current working directory"""
def __init__ (self, newPath):
self.newPath = os.path.expanduser(newPath)
def __enter__ (self):
self.savedPath = os.getcwd()
os.chdir(self.newPath)
def __exit__ (self, etype, value, traceback):
os.chdir(self.savedPath)
def readFile (spf):
if os.path.isfile(spf):
with open(spf, "r", encoding="utf-8") as hSrc:
for sLine in hSrc:
sLine = sLine.strip()
if sLine == "__END__":
break
if sLine and not sLine.startswith("#"):
yield sLine
else:
raise OSError("# Error. File not found or not loadable: " + spf)
def makeDictionaries (sp, sVersion):
|
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
|
lTagMasForm = []
lTagMiscPlur = []
dMiscPlur = {}
dMasForm = {}
lTag = []
lTagMasPl = []
for n, sLine in enumerate(readFile(sp+"/data/dictDecl.txt")):
sLine = sLine.strip()
nTab = sLine.count("\t")
if nTab == 1:
# new entry
lTag.clear()
lTagMasPl.clear()
sLemma, sFlags = sLine.split("\t")
if sFlags.startswith("S"):
|
<
|
183
184
185
186
187
188
189
190
191
192
193
194
195
196
|
lTagMasForm = []
lTagMiscPlur = []
dMiscPlur = {}
dMasForm = {}
lTag = []
lTagMasPl = []
for n, sLine in enumerate(readFile(sp+"/data/dictDecl.txt")):
nTab = sLine.count("\t")
if nTab == 1:
# new entry
lTag.clear()
lTagMasPl.clear()
sLemma, sFlags = sLine.split("\t")
if sFlags.startswith("S"):
|
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
|
open(sp+"/modules-js/phonet_data.json", "w", encoding="utf-8", newline="\n").write(sCode)
def makeLocutions (sp, bJS=False):
"compile list of locutions in JSON"
print("> Locutions ", end="")
print("(Python et JavaScript)" if bJS else "(Python seulement)")
dLocutions = {}
sVal = ":H"
for sLine in readFile(sp+"/data/locutions.txt"):
if sLine.startswith("[") and sLine.endswith("]"):
sLabel, sVal = sLine[1:-1].split("|", 1)
continue
lElem = sLine.split()
dCur = dLocutions
for sWord in lElem:
if sWord not in dCur:
dCur[sWord] = {}
dCur = dCur[sWord]
dCur[":"] = sVal
sCode = "# generated data (do not edit)\n\n" + \
"dLocutions = " + str(dLocutions) + "\n"
open(sp+"/modules/locutions_data.py", "w", encoding="utf-8", newline="\n").write(sCode)
if bJS:
open(sp+"/modules-js/locutions_data.json", "w", encoding="utf-8", newline="\n").write(json.dumps(dLocutions, ensure_ascii=False))
def before (spLaunch, dVars, bJS=False):
print("========== Build Hunspell dictionaries ==========")
makeDictionaries(spLaunch, dVars['oxt_version'])
def after (spLaunch, dVars, bJS=False):
print("========== Build French data ==========")
makeMfsp(spLaunch, bJS)
makeConj(spLaunch, bJS)
makePhonetTable(spLaunch, bJS)
makeLocutions(spLaunch, bJS)
|
|
|
>
|
<
|
|
|
>
>
>
|
>
>
|
|
|
|
|
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
|
open(sp+"/modules-js/phonet_data.json", "w", encoding="utf-8", newline="\n").write(sCode)
def makeLocutions (sp, bJS=False):
"compile list of locutions in JSON"
print("> Locutions ", end="")
print("(Python et JavaScript)" if bJS else "(Python seulement)")
dLocGraph = {}
oTokenizer = tkz.Tokenizer("fr")
for sLine in itertools.chain(readFile(sp+"/data/locutions_adverbiales.txt"), \
readFile(sp+"/data/locutions_prépositives.txt"), \
readFile(sp+"/data/locutions_conjonctives.txt"), \
readFile(sp+"/data/locutions_pronominales.txt"), \
readFile(sp+"/data/locutions_adjectivales.txt"), \
readFile(sp+"/data/locutions_interjectives.txt"), \
readFile(sp+"/data/locutions_nominales.txt"), \
readFile(sp+"/data/locutions_verbales.txt")):
dCur = dLocGraph
sLoc, sTag = sLine.split("\t")
for oToken in oTokenizer.genTokens(sLoc.strip()):
sWord = oToken["sValue"]
if sWord not in dCur:
dCur[sWord] = {}
dCur = dCur[sWord]
dCur["_:_"] = sTag
sCode = "# generated data (do not edit)\n\n" + \
"dLocutions = " + str(dLocGraph) + "\n"
open(sp+"/modules/locutions_data.py", "w", encoding="utf-8", newline="\n").write(sCode)
if bJS:
open(sp+"/modules-js/locutions_data.json", "w", encoding="utf-8", newline="\n").write(json.dumps(dLocGraph, ensure_ascii=False))
def before (spLaunch, dVars, bJS=False):
print("========== Build Hunspell dictionaries ==========")
makeDictionaries(spLaunch, dVars['oxt_version'])
def after (spLaunch, dVars, bJS=False):
print("========== Build French data ==========")
makeMfsp(spLaunch, bJS)
makeConj(spLaunch, bJS)
makePhonetTable(spLaunch, bJS)
makeLocutions(spLaunch, bJS)
|