12
13
14
15
16
17
18
19
20
21
22
23
24
25
|
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
|
+
+
|
import platform
import graphspell.ibdawg as ibdawg
from graphspell.echo import echo
from graphspell.str_transform import defineSuffixCode
import graphspell.tokenizer as tkz
import gc_lang.fr.modules.conj as conj
oDict = None
class cd:
"""Context manager for changing the current working directory"""
def __init__ (self, newPath):
|
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
|
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
|
-
-
-
-
+
-
-
+
+
+
+
+
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
-
+
|
open(sp+"/modules-js/mfsp_data.json", "w", encoding="utf-8", newline="\n").write(sCode)
def makePhonetTable (sp, bJS=False):
print("> Correspondances phonétiques ", end="")
print("(Python et JavaScript)" if bJS else "(Python seulement)")
import gc_lang.fr.modules.conj as conj
loadDictionary()
# set of homophonic words
lSet = []
for sLine in readFile(sp+"/data/phonet_simil.txt"):
lWord = sLine.split()
aMore = set()
for sWord in lWord:
if sWord.endswith("er") and conj.isVerb(sWord):
aMore = aMore.union(conj.getConjSimilInfiV1(sWord))
lWord.extend(conj.getConjSimilInfiV1(sWord))
lWord.extend(list(aMore))
lSet.append(sorted(set(lWord)))
lSet.append(set(lWord))
# dictionary of words
dWord = {}
aMultiSetWord = set()
lNewSet = []
nAppend = 0
for i, aSet in enumerate(lSet):
for sWord in aSet:
if oDict.lookup(sWord):
if sWord not in dWord:
dWord[sWord] = i # warning, what if word in several sets?
else:
echo("Mot inconnu : " + sWord)
dWord[sWord] = i
else:
# word in several set
aMultiSetWord.add(sWord)
iSet = dWord[sWord]
lNewSet.append(lSet[iSet].union(aSet))
dWord[sWord] = len(lSet) + nAppend
nAppend += 1
else:
echo(f" Mot inconnu : <{sWord}>")
lSet.extend(lNewSet)
lSet = [ sorted(aSet) for aSet in lSet ]
print(" Mots appartenant à plusieurs ensembles: ", ", ".join(aMultiSetWord))
# dictionary of morphologies
dMorph = {}
for sWord in dWord:
dMorph[sWord] = oDict.getMorph(sWord)
# write file for Python
sCode = "# generated data (do not edit)\n\n" + \
sCode = "# generated data built in build_data.py (do not edit)\n\n" + \
"dWord = " + str(dWord) + "\n\n" + \
"lSet = " + str(lSet) + "\n\n" + \
"dMorph = " + str(dMorph) + "\n"
open(sp+"/modules/phonet_data.py", "w", encoding="utf-8", newline="\n").write(sCode)
if bJS:
## write file for JavaScript
|
363
364
365
366
367
368
369
370
371
372
|
376
377
378
379
380
381
382
383
384
385
|
-
-
+
+
|
print("========== Build Hunspell dictionaries ==========")
makeDictionaries(spLaunch, dVars['oxt_version'])
def after (spLaunch, dVars, bJS=False):
print("========== Build French data ==========")
makeMfsp(spLaunch, bJS)
makeConj(spLaunch, bJS)
makePhonetTable(spLaunch, bJS)
makePhonetTable(spLaunch, bJS)
makeConj(spLaunch, bJS)
makeLocutions(spLaunch, bJS)
|