Grammalecte  Check-in [f8019de85c]

Overview
Comment:[fr] lexicographe: restructuration des données, réduction de l’échantillon de test, élisions dorénavant acceptées
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | fr | Lexicographe
Files: files | file ages | folders
SHA3-256: f8019de85cad93654fd7bd83f676fc058689536b77021e51d6eb989b8faf1c20
User & Date: olr on 2017-10-30 13:48:49
Other Links: branch diff | manifest | tags
Context
2017-10-30
16:44
[fr] lexicographe: minor changes check-in: 4e84b8a6cf user: olr tags: fr, Lexicographe
13:48
[fr] lexicographe: restructuration des données, réduction de l’échantillon de test, élisions dorénavant acceptées check-in: f8019de85c user: olr tags: fr, Lexicographe
12:00
[fr] lexicographe: clarification de code check-in: 83fbc36b7a user: olr tags: fr, Lexicographe
Changes

Modified gc_lang/fr/build_data.py from [b7abe812ee] to [f2198525f4].

8
9
10
11
12
13
14

15
16
17
18
19
20
21
import json
import os

import grammalecte.ibdawg as ibdawg
from grammalecte.echo import echo
from grammalecte.str_transform import defineSuffixCode
import grammalecte.fr.conj as conj



class cd:
    """Context manager for changing the current working directory"""
    def __init__ (self, newPath):
        self.newPath = os.path.expanduser(newPath)








>







8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
import json
import os

import grammalecte.ibdawg as ibdawg
from grammalecte.echo import echo
from grammalecte.str_transform import defineSuffixCode
import grammalecte.fr.conj as conj
import grammalecte.tokenizer as tkz


class cd:
    """Context manager for changing the current working directory"""
    def __init__ (self, newPath):
        self.newPath = os.path.expanduser(newPath)

308
309
310
311
312
313
314
315

316
317
318
319


320
321
322
323
324
325

326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344


def makeLocutions (sp, bJS=False):
    """Compile the list of locutions into a nested dictionary and save it.

    Reads <sp>/data/locutions.txt, skipping blank lines and lines that start
    with "#". Every other line is split on whitespace and its elements are
    chained as nested keys. An element starting with ":" is stored with an
    empty string as value; any other element is stored with a sub-dictionary.

    The result is written as Python source to <sp>/modules/locutions_data.py
    and, when bJS is true, as JSON to <sp>/modules-js/locutions_data.json.
    """
    print("> Locutions ", end="")
    print("(Python et JavaScript)"  if bJS  else "(Python seulement)")
    dLocutions = {}
    with open(sp+"/data/locutions.txt", 'r', encoding='utf-8') as hSrc:
        for sLine in hSrc:
            if not sLine.startswith("#") and sLine.strip():
                # each line restarts from the root of the graph
                dCur = dLocutions
                for sWord in sLine.split():
                    if sWord not in dCur:
                        dCur[sWord] = ''  if sWord.startswith(":")  else {}
                    dCur = dCur[sWord]

    sCode = "# generated data (do not edit)\n\n" + \
            "dLocutions = " + str(dLocutions) + "\n"
    # output files are written inside context managers so that the handles
    # are flushed and closed deterministically
    with open(sp+"/modules/locutions_data.py", "w", encoding="utf-8", newline="\n") as hDst:
        hDst.write(sCode)
    if bJS:
        with open(sp+"/modules-js/locutions_data.json", "w", encoding="utf-8", newline="\n") as hDst:
            hDst.write(json.dumps(dLocutions, ensure_ascii=False))


def before (spLaunch, dVars, bJS=False):
    "print a banner, then call makeDictionaries with dVars['oxt_version'] (bJS is not used)"
    print("========== Build Hunspell dictionaries ==========")
    sVersion = dVars['oxt_version']
    makeDictionaries(spLaunch, sVersion)


def after (spLaunch, dVars, bJS=False):
    "print a banner, then call the four data builders in order (dVars is not used)"
    print("========== Build French data ==========")
    # every builder takes the same two arguments
    for fBuild in (makeMfsp, makeConj, makePhonetTable, makeLocutions):
        fBuild(spLaunch, bJS)







|
>


<
|
>
>
|
|

<
<

>


|


|













309
310
311
312
313
314
315
316
317
318
319

320
321
322
323
324
325


326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346


def makeLocutions (sp, bJS=False):
    """Compile the list of locutions into a token graph and save it.

    Reads <sp>/data/locutions.txt, skipping blank lines and lines that start
    with "#". Every other line must hold exactly two tab-separated fields,
    the locution and its tag (otherwise the unpacking raises ValueError).
    The locution is tokenized with the "fr" tokenizer; the value of each
    token becomes a nested key, and the tag is stored under ":" in the node
    reached after the last token.

    The result is written as Python source to <sp>/modules/locutions_data.py
    and, when bJS is true, as JSON to <sp>/modules-js/locutions_data.json.
    """
    print("> Locutions ", end="")
    print("(Python et JavaScript)"  if bJS  else "(Python seulement)")
    with open(sp+"/data/locutions.txt", 'r', encoding='utf-8') as hSrc:
        dLocGraph = {}
        oTokenizer = tkz.Tokenizer("fr")
        for sLine in hSrc:
            if not sLine.startswith("#") and sLine.strip():
                sLoc, sTag = sLine.strip().split("\t")
                # each locution restarts from the root of the graph
                dCur = dLocGraph
                for oToken in oTokenizer.genTokens(sLoc.strip()):
                    dCur = dCur.setdefault(oToken["sValue"], {})
                dCur[":"] = sTag

    sCode = "# generated data (do not edit)\n\n" + \
            "dLocutions = " + str(dLocGraph) + "\n"
    # output files are written inside context managers so that the handles
    # are flushed and closed deterministically
    with open(sp+"/modules/locutions_data.py", "w", encoding="utf-8", newline="\n") as hDst:
        hDst.write(sCode)
    if bJS:
        with open(sp+"/modules-js/locutions_data.json", "w", encoding="utf-8", newline="\n") as hDst:
            hDst.write(json.dumps(dLocGraph, ensure_ascii=False))


def before (spLaunch, dVars, bJS=False):
    "print a banner, then call makeDictionaries with dVars['oxt_version'] (bJS is not used)"
    print("========== Build Hunspell dictionaries ==========")
    sVersion = dVars['oxt_version']
    makeDictionaries(spLaunch, sVersion)


def after (spLaunch, dVars, bJS=False):
    "print a banner, then call the four data builders in order (dVars is not used)"
    print("========== Build French data ==========")
    # every builder takes the same two arguments
    for fBuild in (makeMfsp, makeConj, makePhonetTable, makeLocutions):
        fBuild(spLaunch, bJS)

Modified gc_lang/fr/data/locutions.txt from [443f1274df] to [e32c77973b].

more than 10,000 changes

Modified gc_lang/fr/modules-js/lexicographe.js from [57cce4f9d0] to [ffb5515826].

346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
                    aElem.push(oToken);
                }
            }
        }
        return aElem;
    }

    _unifyStr (sWord){
        return sWord.replace('’', 'e').toLowerCase();
    }

    getListOfTokensReduc (sText, bInfo = true) {
        let aTokenList = this.getListOfTokens(sText.replace("'", "’").trim(), false);
        let iKey = 0;
        let aElem = [];
        do {
            let oToken = aTokenList[iKey];
            let sTokenTmpKey = '';
            let aTokenTempList = [oToken];
            if (oToken.sType == "WORD" || oToken.sType == "ELPFX"){
                let iKeyTree = iKey + 1;
                let oLocNode = this.oLocGraph[this._unifyStr(oToken.sValue)];
                while (oLocNode) {
                    let oTokenNext = aTokenList[iKeyTree];
                    iKeyTree++;
                    if (oTokenNext) {
                        oLocNode = oLocNode[this._unifyStr(oTokenNext.sValue)];
                    }
                    if (oLocNode && iKeyTree <= aTokenList.length) {
                        sTokenTmpKey = Object.keys(oLocNode)[0];
                        aTokenTempList.push(oTokenNext);
                    } else {
                        break;
                    }
                }
            }

            if (sTokenTmpKey.substring(0, 1) == ':') {
                let sWord = '';
                for (let oTokenWord of aTokenTempList) {
                    sWord += oTokenWord.sValue+' ';
                }
                iKey = iKey + aTokenTempList.length-1;
                let oTokenLocution = {
                    'nEnd': aTokenTempList[aTokenTempList.length-1].nEnd,
                    'nStart': aTokenTempList[0].nStart,
                    'sType': "LOC",
                    'sValue': sWord.replace('’ ','’').trim()
                };
                if (bInfo) {
                    let aFormatedTag = [];
                    for (let sTagMulti of sTokenTmpKey.split('|') ){
                        aFormatedTag.push( this._formatTags(sTagMulti).replace(/( \(él.\))/g,'') );
                    }
                    aElem.push({
                        sType: oTokenLocution.sType,
                        sValue: oTokenLocution.sValue,
                        aLabel: aFormatedTag
                    });







<
<
<
<






|



|




|


|







|













|







346
347
348
349
350
351
352




353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
                    aElem.push(oToken);
                }
            }
        }
        return aElem;
    }





    getListOfTokensReduc (sText, bInfo = true) {
        let aTokenList = this.getListOfTokens(sText.replace("'", "’").trim(), false);
        let iKey = 0;
        let aElem = [];
        do {
            let oToken = aTokenList[iKey];
            let sMorphLoc = '';
            let aTokenTempList = [oToken];
            if (oToken.sType == "WORD" || oToken.sType == "ELPFX"){
                let iKeyTree = iKey + 1;
                let oLocNode = this.oLocGraph[oToken.sValue.toLowerCase()];
                while (oLocNode) {
                    let oTokenNext = aTokenList[iKeyTree];
                    iKeyTree++;
                    if (oTokenNext) {
                        oLocNode = oLocNode[oTokenNext.sValue.toLowerCase()];
                    }
                    if (oLocNode && iKeyTree <= aTokenList.length) {
                        sMorphLoc = oLocNode[":"];
                        aTokenTempList.push(oTokenNext);
                    } else {
                        break;
                    }
                }
            }

            if (sMorphLoc) {
                let sWord = '';
                for (let oTokenWord of aTokenTempList) {
                    sWord += oTokenWord.sValue+' ';
                }
                iKey = iKey + aTokenTempList.length-1;
                let oTokenLocution = {
                    'nEnd': aTokenTempList[aTokenTempList.length-1].nEnd,
                    'nStart': aTokenTempList[0].nStart,
                    'sType': "LOC",
                    'sValue': sWord.replace('’ ','’').trim()
                };
                if (bInfo) {
                    let aFormatedTag = [];
                    for (let sTagMulti of sMorphLoc.split('|') ){
                        aFormatedTag.push( this._formatTags(sTagMulti).replace(/( \(él.\))/g,'') );
                    }
                    aElem.push({
                        sType: oTokenLocution.sType,
                        sValue: oTokenLocution.sValue,
                        aLabel: aFormatedTag
                    });

Modified gc_lang/fr/modules-js/locutions_data.json from [249fef4e46] to [18410d013e].

cannot compute difference between binary files

Modified gc_lang/fr/modules-js/phonet_data.json from [45669375f1] to [63815e6d96].

cannot compute difference between binary files

Modified gc_lang/fr/modules/locutions_data.py from [cbeadb1ff5] to [02f99ec70a].

cannot compute difference between binary files

Modified gc_lang/fr/modules/phonet_data.py from [497cbd30f5] to [be2bb20c17].

cannot compute difference between binary files