Grammalecte  Check-in [9377402874]

Overview
Comment:[graphspell] lexicographer: better readability for past participle
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | trunk | graphspell
Files: files | file ages | folders
SHA3-256: 9377402874b68007e8d5f2c6c77387747356ad8c76cc69411d5eb80f48f810c3
User & Date: olr on 2020-11-19 18:40:57
Other Links: manifest | tags
Context
2020-11-19
23:48
[fr] ajustements check-in: 9b10c609d3 user: olr tags: trunk, fr
18:40
[graphspell] lexicographer: better readability for past participle check-in: 9377402874 user: olr tags: trunk, graphspell
18:38
[fr] ajustements check-in: 38b9862aab user: olr tags: trunk, fr
Changes

Modified graphspell-js/lexgraph_fr.js from [d137f983fd] to [56d17d9958].

365
366
367
368
369
370
371
372

373
374
375
376
377
378
379
365
366
367
368
369
370
371

372
373
374
375
376
377
378
379







-
+







            ['‰', "signe pour mille"],
        ]),

    _zPartDemForm: new RegExp("([a-zA-Zà-ö0-9À-Öø-ÿØ-ßĀ-ʯ]+)-(là|ci)$", "i"),
    _aPartDemExceptList: new Set(["celui", "celle", "ceux", "celles", "de", "jusque", "par", "marie-couche-toi"]),
    _zInterroVerb: new RegExp("([a-zA-Zà-ö0-9À-Öø-ÿØ-ßĀ-ʯ]+)(-(?:t-(?:ie?l|elle|on)|je|tu|ie?ls?|elles?|on|[nv]ous))$", "i"),
    _zImperatifVerb: new RegExp("([a-zA-Zà-ö0-9À-Öø-ÿØ-ßĀ-ʯ]+)(-(?:l(?:es?|a)-(?:moi|toi|lui|[nv]ous|leur)|y|en|[mts]['’ʼ‘‛´`′‵՚ꞌꞋ](?:y|en)|les?|la|[mt]oi|leur|lui))$", "i"),
    _zTag: new RegExp("[:;/][a-zA-Z0-9ÑÂĴĈŔÔṼŴ!][^:;/]*", "g"),
    _zTag: new RegExp("[:;/][a-zA-Z0-9É@*!][^:;/]*", "g"),

    split: function (sWord) {
        // returns an array of strings (prefix, trimmed_word, suffix)
        let sPrefix = "";
        let sSuffix = "";
        // préfixe élidé
        let m = /^([ldmtsnjcç]|lorsqu|presqu|jusqu|puisqu|quoiqu|quelqu|qu)['’ʼ‘‛´`′‵՚ꞌꞋ]([a-zA-Zà-öÀ-Ö0-9_ø-ÿØ-ßĀ-ʯfi-st-]+)/i.exec(sWord);
401
402
403
404
405
406
407


408







409
410
411
412

413
414
415

416
417
418
419







420
421
422
423
424







425
426
427
428
429
430
431
401
402
403
404
405
406
407
408
409

410
411
412
413
414
415
416
417
418


419



420




421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446







+
+
-
+
+
+
+
+
+
+


-
-
+
-
-
-
+
-
-
-
-
+
+
+
+
+
+
+





+
+
+
+
+
+
+







    },

    readableMorph: function (sMorph) {
        if (!sMorph) {
            return " mot inconnu";
        }
        let sRes = "";
        let sVType = "";
        if (sMorph.includes(":V")) {
        sMorph = sMorph.replace(/:V([0-3][ea_])[itpqnmr_eaxz]+/, ":V$1");
            sMorph = sMorph.replace(/:V([0-3][ea_])[itpqnmr_eaxz]+/, ":V$1");
        }
        if (sMorph.includes(":Q")) {
            let nVerbTag = sMorph.indexOf(":V")
            sVType = sMorph.slice(nVerbTag, nVerbTag+4);
            sMorph = sMorph.replace(/:V[0123]./, "").replace(/:1[ŝś]/, "");
        }
        let m;
        while ((m = this._zTag.exec(sMorph)) !== null) {
            if (this.dTag.has(m[0])) {
                sRes += this.dTag.get(m[0])[0];
            sRes += this._readableTag(m[0]);
            } else {
                sRes += " [" + m[0] + "]?";
            }
        }
        }
        if (sRes.startsWith(" verbe") && !sRes.includes("infinitif")) {
            sRes += " [" + sMorph.slice(1, sMorph.indexOf("/")) + "]";
        }
        if ((sRes.startsWith(" verbe") && !sRes.includes("infinitif")) || sRes.startsWith(" participe")) {
            if (sVType) {
                sRes += " [" + sMorph.slice(1, sMorph.indexOf("/")) + " : " + this._readableTag(sVType).gl_trimRight(",") + "]";
            } else {
                sRes += " [" + sMorph.slice(1, sMorph.indexOf("/")) + "]";
            }
        }
        if (!sRes) {
            return " [" + sMorph + "]: étiquettes inconnues";
        }
        return sRes.gl_trimRight(",");
    },

    _readableTag: function (sTag) {
        if (this.dTag.has(sTag)) {
            return this.dTag.get(sTag)[0];
        }
        return " [" + sTag + "]?";
    },

    setLabelsOnToken (oToken) {
        // Token: .sType, .sValue, .nStart, .nEnd, .lMorph
        let m = null;
        try {
            switch (oToken.sType) {
                case 'PUNC':

Modified graphspell/lexgraph_fr.py from [24bc88a28a] to [851691ac43].

369
370
371
372
373
374
375
376

377
378
379
380
381
382
383
369
370
371
372
373
374
375

376
377
378
379
380
381
382
383







-
+







    '%': "signe de pourcentage",
    '‰': "signe pour mille"
}


# Elided prefix (l, d, qu, lorsqu, ...) + one apostrophe variant + the rest of the word
# (group 1 = prefix, group 2 = word); case-insensitive.
_zElidedPrefix = re.compile("(?i)^([ldmtsnjcç]|lorsqu|presqu|jusqu|puisqu|quoiqu|quelqu|qu)[’'‘`ʼ]([\\w-]+)")
# Word followed by hyphenated pronoun(s) at end of string
# (group 1 = word, group 2 = suffix including the leading hyphen); case-insensitive.
_zCompoundWord = re.compile("(?i)(\\w+)(-(?:(?:les?|la)-(?:moi|toi|lui|[nv]ous|leur)|t-(?:il|elle|on)|y|en|[mts]’(?:y|en)|les?|l[aà]|[mt]oi|leur|lui|je|tu|ils?|elles?|on|[nv]ous|ce))$")
# One morphological tag: a separator (":", ";" or "/"), then a word character, "@", "*" or "!",
# then everything up to the next separator. Assigned once only: an earlier assignment
# to the same name would be dead code.
_zTag = re.compile("[:;/][\\w@*!][^:;/]*")

def split (sWord):
    "split word in 3 parts: prefix, root, suffix"
    sPrefix = ""
    sSuffix = ""
    # préfixe élidé
    m = _zElidedPrefix.match(sWord)
401
402
403
404
405
406
407


408





409
410
411




412
413
414
415

416
417
418
419






420
421
422
423
424
425
426
401
402
403
404
405
406
407
408
409

410
411
412
413
414
415


416
417
418
419
420



421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438







+
+
-
+
+
+
+
+

-
-
+
+
+
+

-
-
-
+




+
+
+
+
+
+









def readableMorph (sMorph):
    """returns string: readable tags

    sMorph is a morphology string; the lemma is read as the text between the
    first character and the first "/", the tags are the chunks matched by _zTag.
    Returns "mot inconnu" for an empty value, and
    " [<sMorph>]: étiquettes inconnues" when no tag is found.
    """
    if not sMorph:
        return "mot inconnu"
    sRes = ""
    sVType = ""
    if ":V" in sMorph:
        # keep only "V" + group digit + auxiliary letter; drop the flags that follow
        sMorph = re.sub("(?<=V[0123][ea_])[itpqnmr_eaxz]+", "", sMorph)
    if ":Q" in sMorph:
        # past participle: remember the 4-character verb tag, then remove that tag
        # (not the first four characters of the string, which belong to the lemma)
        # so the description starts with the participle label
        nVerbTag = sMorph.find(":V")
        sVType = sMorph[nVerbTag:nVerbTag+4]
        sMorph = re.sub(":V[0123].", "", sMorph, count=1).replace(":1ŝ", "").replace(":1ś", "")
    for m in _zTag.finditer(sMorph):
        sRes += _readableTag(m.group(0))
    # NOTE(review): graphspell-js/lexgraph_fr.js tests `includes("infinitif")` here; confirm `endswith` is intended
    if sRes.startswith((" verbe", " participe")) and not sRes.endswith("infinitif"):
        # append the lemma, with the readable verb type when one was extracted above
        if sVType:
            sRes += " [" + sMorph[1:sMorph.find("/")] + " : " + _readableTag(sVType).rstrip(",") + "]"
        else:
            sRes += " [" + sMorph[1:sMorph.find("/")] + "]"
    if not sRes:
        return " [" + sMorph + "]: étiquettes inconnues"
    return sRes.rstrip(",")

def _readableTag (sTag):
    "returns string: label stored for <sTag> in _dTAGS, or the tag itself flagged as unknown"
    if sTag not in _dTAGS:
        # unknown tag: echo it back, bracketed and followed by "?"
        return " [" + sTag + "]?"
    return _dTAGS[sTag][0]


# Word followed by "-là" or "-ci" at end of string (group 1 = word, group 2 = particle).
# No flag is passed, so the match is case-sensitive as written.
_zPartDemForm = re.compile("([\\w]+)-(là|ci)$")
# Word followed by a hyphenated subject pronoun at end of string
# (group 1 = word, group 2 = suffix including the leading hyphen).
_zInterroVerb = re.compile("([\\w]+)(-(?:t-(?:ie?l|elle|on)|je|tu|ie?ls?|elles?|on|[nv]ous))$")
# Word followed by hyphenated object pronoun(s) at end of string
# (group 1 = word, group 2 = suffix including the leading hyphen).
_zImperatifVerb = re.compile("([\\w]+)(-(?:l(?:es?|a)-(?:moi|toi|lui|[nv]ous|leur)|y|en|[mts]['’ʼ‘‛´`′‵՚ꞌꞋ](?:y|en)|les?|la|[mt]oi|leur|lui))$")

def setLabelsOnToken (dToken):
    # Token: .sType, .sValue, .nStart, .nEnd, .lMorph