Grammalecte  Check-in [b3448ac17f]

Overview
Comment:[graphspell][fx] update tokenizer and lexicographer: add symbols and emojis
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | trunk | fx | graphspell
Files: files | file ages | folders
SHA3-256: b3448ac17f5057b4c4bfdbd977ef76ad4bb84295fa88d01ca87620931a64fb13
User & Date: olr on 2020-11-30 15:15:42
Other Links: manifest | tags
Context
2020-12-02
07:56
[misc] sublime text syntax update check-in: 02626b166a user: olr tags: trunk, misc
2020-11-30
15:15
[graphspell][fx] update tokenizer and lexicographer: add symbols and emojis check-in: b3448ac17f user: olr tags: trunk, fx, graphspell
11:12
[fr] ajustements check-in: ad1f902c97 user: olr tags: trunk, fr
Changes

Modified gc_lang/fr/webext/content_scripts/panel_lxg.css from [c6d8057ee1] to [ef239ab156].

146
147
148
149
150
151
152




153
154
155
156
157
158
159
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163







+
+
+
+







div.grammalecte_lxg_token_COMPLEX {
    background-color: hsla(60, 50%, 50%, 1);
}
div.grammalecte_lxg_token_PUNC {
    background-color: hsla(210, 50%, 50%, 1);
}
div.grammalecte_lxg_token_SIGN {
    background-color: hsla(210, 50%, 50%, 1);
}
div.grammalecte_lxg_token_SYMBOL,
div.grammalecte_lxg_token_EMOJI {
    background-color: hsla(300, 50%, 50%, 1);
}
div.grammalecte_lxg_token_LINK {
    background-color: hsla(270, 50%, 50%, 1);
}
div.grammalecte_lxg_token_HTML,
div.grammalecte_lxg_token_PSEUDO_HTML {

Modified graphspell-js/lexgraph_fr.js from [95daec96bc] to [7a8c8a0a55].

447
448
449
450
451
452
453






454
455
456
457
458
459
460
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466







+
+
+
+
+
+







        let m = null;
        try {
            switch (oToken.sType) {
                case 'PUNC':
                case 'SIGN':
                    oToken["aLabels"] = [this.dValues.gl_get(oToken["sValue"], "signe de ponctuation divers")];
                    break;
                case 'SYMB':
                    oToken["aLabels"] = ["symbole"];
                    break;
                case 'EMOJI':
                    oToken["aLabels"] = ["émoji"];
                    break;
                case 'NUM':
                    oToken["aLabels"] = ["nombre"];
                    break;
                case 'LINK':
                    oToken["aLabels"] = ["hyperlien"];
                    break;
                case 'TAG':

Modified graphspell-js/tokenizer.js from [9c02b80583] to [8e6d24c94a].

44
45
46
47
48
49
50


51

52
53
54
55
56
57
58
44
45
46
47
48
49
50
51
52

53
54
55
56
57
58
59
60







+
+
-
+







            [/^(?:l|d|n|m|t|s|j|c|ç|lorsqu|puisqu|jusqu|quoiqu|qu|presqu|quelqu)['’ʼ‘‛´`′‵՚ꞌꞋ]/i, 'WORDELD'],
            [/^\d\d?[h:]\d\d(?:[m:]\d\ds?|)\b/, 'HOUR'],
            [/^\d+(?:ers?\b|res?\b|è[rm]es?\b|i[èe][mr]es?\b|de?s?\b|nde?s?\b|ès?\b|es?\b|ᵉʳˢ?|ʳᵉˢ?|ᵈᵉ?ˢ?|ⁿᵈᵉ?ˢ?|ᵉˢ?)/, 'WORDORD'],
            [/^\d+(?:[.,]\d+|)/, 'NUM'],
            [/^[&%‰€$+±=*/<>⩾⩽#|×¥£§¢¬÷@-]/, 'SIGN'],
            [/^[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯff-stᴀ-ᶿ\u0300-\u036fᵉʳˢⁿᵈ]+(?:[’'`-][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯff-stᴀ-ᶿ\u0300-\u036fᵉʳˢⁿᵈ]+)*/, 'WORD'],
            [/^_+/, 'UNDERSCORE'],
            [/^[\u2600-\u26ff\u2700-\u27bf\u1f650-\u1f67f\u1f700-\u1f77f\u1f780-\u1f7ff\u1f800-\u1f8ff]/, 'SYMBOL'],
            [/^[\u1f300-\u1f5ff\u1f600-\u1f64f\u1f680-\u1f6ff\u1f900-\u1f9ff]+/u, "EMOJI"],
            [/^\S/, 'OTHER'],
            [/^\S/u, 'OTHER'],
        ]
};


class Tokenizer {

    constructor (sLang) {

Modified graphspell/lexgraph_fr.py from [3b5f42b556] to [8b4d1fbe84].

439
440
441
442
443
444
445




446
447
448
449
450
451
452
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456







+
+
+
+







# Matches a word followed by a hyphenated pronoun suffix, anchored at the end ($).
# Group 1 is the run of word characters; group 2 is the suffix including its leading hyphen
# (e.g. "-le", "-moi", "-les-lui", "-t'en"; several apostrophe variants are accepted).
# NOTE(review): presumably used to split imperative verb forms from their clitic pronouns, per the name — confirm against callers.
_zImperatifVerb = re.compile("([\\w]+)(-(?:l(?:es?|a)-(?:moi|toi|lui|[nv]ous|leur)|y|en|[mts]['’ʼ‘‛´`′‵՚ꞌꞋ](?:y|en)|les?|la|[mt]oi|leur|lui))$")

def setLabelsOnToken (dToken):
    # Token: .sType, .sValue, .nStart, .nEnd, .lMorph
    try:
        if dToken["sType"] == "PUNC" or dToken["sType"] == "SIGN":
            dToken["aLabels"] = [_dValues.get(dToken["sValue"], "signe de ponctuation divers")]
        elif dToken["sType"] == 'SYMBOL':
            dToken["aLabels"] = ["symbole"]
        elif dToken["sType"] == 'EMOJI':
            dToken["aLabels"] = ["émoji"]
        elif dToken["sType"] == 'NUM':
            dToken["aLabels"] = ["nombre"]
        elif dToken["sType"] == 'LINK':
            dToken["aLabels"] = ["hyperlien"]
        elif dToken["sType"] == 'TAG':
            dToken["aLabels"] = ["étiquette (hashtag)"]
        elif dToken["sType"] == 'HTML':

Modified graphspell/tokenizer.py from [84d5574a19] to [1e4c6dee79].

38
39
40
41
42
43
44


45
46
47
48
49
50
51
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53







+
+







            r"(?P<WORDELD>(?:l|d|n|m|t|s|j|c|ç|lorsqu|puisqu|jusqu|quoiqu|qu|presqu|quelqu)['’ʼ‘‛´`′‵՚ꞌꞋ])",
            r'(?P<WORDORD>\d+(?:ers?|res?|è[rm]es?|i[èe][mr]es?|de?s?|nde?s?|ès?|es?|ᵉʳˢ?|ʳᵉˢ?|ᵈᵉ?ˢ?|ⁿᵈᵉ?ˢ?|ᵉˢ?)\b)',
            r'(?P<HOUR>\d\d?[h:]\d\d(?:[m:]\d\ds?|)\b)',
            r'(?P<NUM>\d+(?:[.,]\d+|))',
            r'(?P<SIGN>[&%‰€$+±=*/<>⩾⩽#|×¥£¢§¬÷@-])',
            r"(?P<WORD>(?:(?!_)[\w\u0300-\u036f])+(?:[’'`-](?:(?!_)[\w\u0300-\u036f])+)*)",        # with combining diacritics
            r"(?P<UNDERSCORE>_+)",
            r"(?P<SYMBOL>[\u2600-\u26ff\u2700-\u27bf\U0001f650-\U0001f67f\U0001f700-\U0001f77f\U0001f780-\U0001f7ff\U0001f800-\U0001f8ff])",
            r"(?P<EMOJI>[\U0001f300-\U0001f5ff\U0001f600-\U0001f64f\U0001f680-\U0001f6ff\U0001f900-\U0001f9ff]+)",
            r"(?P<OTHER>\S)"
        )
}


class Tokenizer:
    "Tokenizer: transforms a text in a list of tokens"