Overview
Comment: | [graphspell][fx] update tokenizer and lexicographer: add symbols and emojis |
---|---|
Downloads: | Tarball | ZIP archive | SQL archive |
Timelines: | family | ancestors | descendants | both | trunk | fx | graphspell |
Files: | files | file ages | folders |
SHA3-256: |
b3448ac17f5057b4c4bfdbd977ef76ad |
User & Date: | olr on 2020-11-30 15:15:42 |
Other Links: | manifest | tags |
Context
2020-12-02
| ||
07:56 | [misc] sublime text syntax update check-in: 02626b166a user: olr tags: trunk, misc | |
2020-11-30
| ||
15:15 | [graphspell][fx] update tokenizer and lexicographer: add symbols and emojis check-in: b3448ac17f user: olr tags: trunk, fx, graphspell | |
11:12 | [fr] ajustements check-in: ad1f902c97 user: olr tags: trunk, fr | |
Changes
Modified gc_lang/fr/webext/content_scripts/panel_lxg.css from [c6d8057ee1] to [ef239ab156].
︙ | ︙ | |||
146 147 148 149 150 151 152 153 154 155 156 157 158 159 | div.grammalecte_lxg_token_COMPLEX { background-color: hsla(60, 50%, 50%, 1); } div.grammalecte_lxg_token_PUNC { background-color: hsla(210, 50%, 50%, 1); } div.grammalecte_lxg_token_SIGN { background-color: hsla(300, 50%, 50%, 1); } div.grammalecte_lxg_token_LINK { background-color: hsla(270, 50%, 50%, 1); } div.grammalecte_lxg_token_HTML, div.grammalecte_lxg_token_PSEUDO_HTML { | > > > > | 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 | div.grammalecte_lxg_token_COMPLEX { background-color: hsla(60, 50%, 50%, 1); } div.grammalecte_lxg_token_PUNC { background-color: hsla(210, 50%, 50%, 1); } div.grammalecte_lxg_token_SIGN { background-color: hsla(210, 50%, 50%, 1); } div.grammalecte_lxg_token_SYMBOL, div.grammalecte_lxg_token_EMOJI { background-color: hsla(300, 50%, 50%, 1); } div.grammalecte_lxg_token_LINK { background-color: hsla(270, 50%, 50%, 1); } div.grammalecte_lxg_token_HTML, div.grammalecte_lxg_token_PSEUDO_HTML { |
︙ | ︙ |
Modified graphspell-js/lexgraph_fr.js from [95daec96bc] to [7a8c8a0a55].
︙ | ︙ | |||
447 448 449 450 451 452 453 454 455 456 457 458 459 460 | let m = null; try { switch (oToken.sType) { case 'PUNC': case 'SIGN': oToken["aLabels"] = [this.dValues.gl_get(oToken["sValue"], "signe de ponctuation divers")]; break; case 'NUM': oToken["aLabels"] = ["nombre"]; break; case 'LINK': oToken["aLabels"] = ["hyperlien"]; break; case 'TAG': | > > > > > > | 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 | let m = null; try { switch (oToken.sType) { case 'PUNC': case 'SIGN': oToken["aLabels"] = [this.dValues.gl_get(oToken["sValue"], "signe de ponctuation divers")]; break; case 'SYMB': oToken["aLabels"] = ["symbole"]; break; case 'EMOJI': oToken["aLabels"] = ["émoji"]; break; case 'NUM': oToken["aLabels"] = ["nombre"]; break; case 'LINK': oToken["aLabels"] = ["hyperlien"]; break; case 'TAG': |
︙ | ︙ |
Modified graphspell-js/tokenizer.js from [9c02b80583] to [8e6d24c94a].
︙ | ︙ | |||
44 45 46 47 48 49 50 | [/^(?:l|d|n|m|t|s|j|c|ç|lorsqu|puisqu|jusqu|quoiqu|qu|presqu|quelqu)['’ʼ‘‛´`′‵՚ꞌꞋ]/i, 'WORDELD'], [/^\d\d?[h:]\d\d(?:[m:]\d\ds?|)\b/, 'HOUR'], [/^\d+(?:ers?\b|res?\b|è[rm]es?\b|i[èe][mr]es?\b|de?s?\b|nde?s?\b|ès?\b|es?\b|ᵉʳˢ?|ʳᵉˢ?|ᵈᵉ?ˢ?|ⁿᵈᵉ?ˢ?|ᵉˢ?)/, 'WORDORD'], [/^\d+(?:[.,]\d+|)/, 'NUM'], [/^[&%‰€$+±=*/<>⩾⩽#|×¥£§¢¬÷@-]/, 'SIGN'], [/^[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯff-stᴀ-ᶿ\u0300-\u036fᵉʳˢⁿᵈ]+(?:[’'`-][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯff-stᴀ-ᶿ\u0300-\u036fᵉʳˢⁿᵈ]+)*/, 'WORD'], [/^_+/, 'UNDERSCORE'], | > > | | 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 | [/^(?:l|d|n|m|t|s|j|c|ç|lorsqu|puisqu|jusqu|quoiqu|qu|presqu|quelqu)['’ʼ‘‛´`′‵՚ꞌꞋ]/i, 'WORDELD'], [/^\d\d?[h:]\d\d(?:[m:]\d\ds?|)\b/, 'HOUR'], [/^\d+(?:ers?\b|res?\b|è[rm]es?\b|i[èe][mr]es?\b|de?s?\b|nde?s?\b|ès?\b|es?\b|ᵉʳˢ?|ʳᵉˢ?|ᵈᵉ?ˢ?|ⁿᵈᵉ?ˢ?|ᵉˢ?)/, 'WORDORD'], [/^\d+(?:[.,]\d+|)/, 'NUM'], [/^[&%‰€$+±=*/<>⩾⩽#|×¥£§¢¬÷@-]/, 'SIGN'], [/^[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯff-stᴀ-ᶿ\u0300-\u036fᵉʳˢⁿᵈ]+(?:[’'`-][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯff-stᴀ-ᶿ\u0300-\u036fᵉʳˢⁿᵈ]+)*/, 'WORD'], [/^_+/, 'UNDERSCORE'], [/^[\u2600-\u26ff\u2700-\u27bf\u1f650-\u1f67f\u1f700-\u1f77f\u1f780-\u1f7ff\u1f800-\u1f8ff]/, 'SYMBOL'], [/^[\u1f300-\u1f5ff\u1f600-\u1f64f\u1f680-\u1f6ff\u1f900-\u1f9ff]+/u, "EMOJI"], [/^\S/u, 'OTHER'], ] }; class Tokenizer { constructor (sLang) { |
︙ | ︙ |
Modified graphspell/lexgraph_fr.py from [3b5f42b556] to [8b4d1fbe84].
︙ | ︙ | |||
439 440 441 442 443 444 445 446 447 448 449 450 451 452 | _zImperatifVerb = re.compile("([\\w]+)(-(?:l(?:es?|a)-(?:moi|toi|lui|[nv]ous|leur)|y|en|[mts]['’ʼ‘‛´`′‵՚ꞌꞋ](?:y|en)|les?|la|[mt]oi|leur|lui))$") def setLabelsOnToken (dToken): # Token: .sType, .sValue, .nStart, .nEnd, .lMorph try: if dToken["sType"] == "PUNC" or dToken["sType"] == "SIGN": dToken["aLabels"] = [_dValues.get(dToken["sValue"], "signe de ponctuation divers")] elif dToken["sType"] == 'NUM': dToken["aLabels"] = ["nombre"] elif dToken["sType"] == 'LINK': dToken["aLabels"] = ["hyperlien"] elif dToken["sType"] == 'TAG': dToken["aLabels"] = ["étiquette (hashtag)"] elif dToken["sType"] == 'HTML': | > > > > | 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 | _zImperatifVerb = re.compile("([\\w]+)(-(?:l(?:es?|a)-(?:moi|toi|lui|[nv]ous|leur)|y|en|[mts]['’ʼ‘‛´`′‵՚ꞌꞋ](?:y|en)|les?|la|[mt]oi|leur|lui))$") def setLabelsOnToken (dToken): # Token: .sType, .sValue, .nStart, .nEnd, .lMorph try: if dToken["sType"] == "PUNC" or dToken["sType"] == "SIGN": dToken["aLabels"] = [_dValues.get(dToken["sValue"], "signe de ponctuation divers")] elif dToken["sType"] == 'SYMBOL': dToken["aLabels"] = ["symbole"] elif dToken["sType"] == 'EMOJI': dToken["aLabels"] = ["émoji"] elif dToken["sType"] == 'NUM': dToken["aLabels"] = ["nombre"] elif dToken["sType"] == 'LINK': dToken["aLabels"] = ["hyperlien"] elif dToken["sType"] == 'TAG': dToken["aLabels"] = ["étiquette (hashtag)"] elif dToken["sType"] == 'HTML': |
︙ | ︙ |
Modified graphspell/tokenizer.py from [84d5574a19] to [1e4c6dee79].
︙ | ︙ | |||
38 39 40 41 42 43 44 45 46 47 48 49 50 51 | r"(?P<WORDELD>(?:l|d|n|m|t|s|j|c|ç|lorsqu|puisqu|jusqu|quoiqu|qu|presqu|quelqu)['’ʼ‘‛´`′‵՚ꞌꞋ])", r'(?P<WORDORD>\d+(?:ers?|res?|è[rm]es?|i[èe][mr]es?|de?s?|nde?s?|ès?|es?|ᵉʳˢ?|ʳᵉˢ?|ᵈᵉ?ˢ?|ⁿᵈᵉ?ˢ?|ᵉˢ?)\b)', r'(?P<HOUR>\d\d?[h:]\d\d(?:[m:]\d\ds?|)\b)', r'(?P<NUM>\d+(?:[.,]\d+|))', r'(?P<SIGN>[&%‰€$+±=*/<>⩾⩽#|×¥£¢§¬÷@-])', r"(?P<WORD>(?:(?!_)[\w\u0300-\u036f])+(?:[’'`-](?:(?!_)[\w\u0300-\u036f])+)*)", # with combining diacritics r"(?P<UNDERSCORE>_+)", r"(?P<OTHER>\S)" ) } class Tokenizer: "Tokenizer: transforms a text in a list of tokens" | > > | 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 | r"(?P<WORDELD>(?:l|d|n|m|t|s|j|c|ç|lorsqu|puisqu|jusqu|quoiqu|qu|presqu|quelqu)['’ʼ‘‛´`′‵՚ꞌꞋ])", r'(?P<WORDORD>\d+(?:ers?|res?|è[rm]es?|i[èe][mr]es?|de?s?|nde?s?|ès?|es?|ᵉʳˢ?|ʳᵉˢ?|ᵈᵉ?ˢ?|ⁿᵈᵉ?ˢ?|ᵉˢ?)\b)', r'(?P<HOUR>\d\d?[h:]\d\d(?:[m:]\d\ds?|)\b)', r'(?P<NUM>\d+(?:[.,]\d+|))', r'(?P<SIGN>[&%‰€$+±=*/<>⩾⩽#|×¥£¢§¬÷@-])', r"(?P<WORD>(?:(?!_)[\w\u0300-\u036f])+(?:[’'`-](?:(?!_)[\w\u0300-\u036f])+)*)", # with combining diacritics r"(?P<UNDERSCORE>_+)", r"(?P<SYMBOL>[\u2600-\u26ff\u2700-\u27bf\U0001f650-\U0001f67f\U0001f700-\U0001f77f\U0001f780-\U0001f7ff\U0001f800-\U0001f8ff])", r"(?P<EMOJI>[\U0001f300-\U0001f5ff\U0001f600-\U0001f64f\U0001f680-\U0001f6ff\U0001f900-\U0001f9ff]+)", r"(?P<OTHER>\S)" ) } class Tokenizer: "Tokenizer: transforms a text in a list of tokens" |
︙ | ︙ |