Grammalecte  Check-in [f82c3ce70e]

Overview
Comment:[build][fr] phonet simil: merge sets if words belong to several sets
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | trunk | fr | build
Files: files | file ages | folders
SHA3-256: f82c3ce70e793ad3e001a615db95560f4e4e1a9df5f4894ba9612cc2ba79c110
User & Date: olr on 2020-04-29 17:16:40
Other Links: manifest | tags
Context
2020-04-29
19:05
[build][fr] build_data.py: fix build for JS check-in: 6bc8dab4c2 user: olr tags: trunk, fr, build
17:16
[build][fr] phonet simil: merge sets if words belong to several sets check-in: f82c3ce70e user: olr tags: trunk, fr, build
17:08
[fr] phonet_simil.txt update check-in: 582bf42669 user: olr tags: trunk, fr
Changes

Modified gc_lang/fr/build_data.py from [c910fde1c7] to [ce4f084f4e].

12
13
14
15
16
17
18


19
20
21
22
23
24
25
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27







+
+







import platform

import graphspell.ibdawg as ibdawg
from graphspell.echo import echo
from graphspell.str_transform import defineSuffixCode
import graphspell.tokenizer as tkz

import gc_lang.fr.modules.conj as conj


oDict = None


class cd:
    """Context manager for changing the current working directory"""
    def __init__ (self, newPath):
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299

300
301

302
303
304



305
306
307

308
309
310













311
312
313
314
315
316
317

318
319
320
321
322
323
324
283
284
285
286
287
288
289


290
291
292
293
294
295

296
297

298


299
300
301
302
303
304
305
306
307
308
309



310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328

329
330
331
332
333
334
335
336







-
-






-


-
+
-
-
+



+
+
+



+
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+






-
+







        open(sp+"/modules-js/mfsp_data.json", "w", encoding="utf-8", newline="\n").write(sCode)


def makePhonetTable (sp, bJS=False):
    print("> Correspondances phonétiques ", end="")
    print("(Python et JavaScript)"  if bJS  else "(Python seulement)")

    import gc_lang.fr.modules.conj as conj

    loadDictionary()

    # set of homophonic words
    lSet = []
    for sLine in readFile(sp+"/data/phonet_simil.txt"):
        lWord = sLine.split()
        aMore = set()
        for sWord in lWord:
            if sWord.endswith("er") and conj.isVerb(sWord):
                aMore = aMore.union(conj.getConjSimilInfiV1(sWord))
                lWord.extend(conj.getConjSimilInfiV1(sWord))
        lWord.extend(list(aMore))
        lSet.append(sorted(set(lWord)))
        lSet.append(set(lWord))

    # dictionary of words
    dWord = {}
    aMultiSetWord = set()
    lNewSet = []
    nAppend = 0
    for i, aSet in enumerate(lSet):
        for sWord in aSet:
            if oDict.lookup(sWord):
                if sWord not in dWord:
                dWord[sWord] = i  # warning, what if word in several sets?
            else:
                echo("Mot inconnu : " + sWord)
                    dWord[sWord] = i
                else:
                    # word in several set
                    aMultiSetWord.add(sWord)
                    iSet = dWord[sWord]
                    lNewSet.append(lSet[iSet].union(aSet))
                    dWord[sWord] = len(lSet) + nAppend
                    nAppend += 1
            else:
                echo(f"  Mot inconnu : <{sWord}>")
    lSet.extend(lNewSet)
    print("  Mots appartenant à plusieurs ensembles: ", ", ".join(aMultiSetWord))

    # dictionary of morphologies
    dMorph = {}
    for sWord in dWord:
        dMorph[sWord] = oDict.getMorph(sWord)

    # write file for Python
    sCode = "# generated data (do not edit)\n\n" + \
    sCode = "# generated data built in build_data.py (do not edit)\n\n" + \
            "dWord = " + str(dWord) + "\n\n" + \
            "lSet = " + str(lSet) + "\n\n" + \
            "dMorph = " + str(dMorph) + "\n"
    open(sp+"/modules/phonet_data.py", "w", encoding="utf-8", newline="\n").write(sCode)

    if bJS:
        ## write file for JavaScript
363
364
365
366
367
368
369
370
371


372
375
376
377
378
379
380
381


382
383
384







-
-
+
+

    print("========== Build Hunspell dictionaries ==========")
    makeDictionaries(spLaunch, dVars['oxt_version'])


def after (spLaunch, dVars, bJS=False):
    """Print the "Build French data" banner, then run the data builders.

    Every builder called below receives spLaunch and bJS unchanged.
    dVars is accepted but is not read by any of the lines shown here.
    """
    # NOTE(review): makeConj and makePhonetTable each appear twice below, in
    # opposite order. This listing comes from a rendered diff, so one pair is
    # presumably the removed lines and the other the added lines -- confirm
    # the intended call order against the real build_data.py.
    print("========== Build French data ==========")
    makeMfsp(spLaunch, bJS)
    makeConj(spLaunch, bJS)
    makePhonetTable(spLaunch, bJS)
    makePhonetTable(spLaunch, bJS)
    makeConj(spLaunch, bJS)
    makeLocutions(spLaunch, bJS)

Modified gc_lang/fr/modules/conj_data.py from [24f905ee0b] to [348618f642].

cannot compute difference between binary files

Modified gc_lang/fr/modules/phonet_data.py from [fcf5178674] to [30d2b2eb01].

cannot compute difference between binary files