Grammalecte  Check-in [98cbf77aef]

Overview
Comment:[build][core][fr][misc] phonet token, new syntax
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | trunk | fr | core | build | misc
Files: files | file ages | folders
SHA3-256: 98cbf77aeff1786b8832a40f9b8e00963ed3cb803e1b6b37db20c5c220604ebd
User & Date: olr on 2020-11-14 11:08:39
Other Links: manifest | tags
Context
2020-11-14
13:46
[core][fr] phonet: better code for isSimilAs() check-in: 5f68edd979 user: olr tags: trunk, fr, core
11:08
[build][core][fr][misc] phonet token, new syntax check-in: 98cbf77aef user: olr tags: trunk, fr, core, build, misc
2020-11-13
19:28
[fr] affixes: màj check-in: 45248d2762 user: olr tags: trunk, fr
Changes

Modified darg.py from [9b17f8d6af] to [f98928fa4d].

216
217
218
219
220
221
222

223
224
225
226
227
228
229
230
231
232
233
234


235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250


251
252
253
254
255
256
257
258
259
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264







+












+
+
















+
+









    def getNodeAsDict (self):
        "returns the node as a dictionary structure"
        # (arc prefix, key of the sub-dictionary in the result), in the order the sub-dictionaries are emitted
        lSection = [
            ("~", "<re_value>"),    # regex for token values
            ("@", "<re_morph>"),    # regex for morph
            ("$", "<morph>"),       # simple search in morph
            (">", "<lemmas>"),
            ("%", "<phonet>"),
            ("/", "<tags>"),
            ("*", "<meta>")
        ]
        dByPrefix = { sPrefix: {}  for sPrefix, _ in lSection }
        dRule = {}
        dNode = {}
        for sArc, oNode in self.dArcs.items():
            nHash = oNode.__hash__()
            if len(sArc) > 1 and sArc[0] in dByPrefix:
                # prefixed arc: stored without its prefix in the matching sub-dictionary
                dByPrefix[sArc[0]][sArc[1:]] = nHash
            elif sArc.startswith("##"):
                # rule arc: only the first "#" is removed from the key
                dRule[sArc[1:]] = nHash
            else:
                # plain arc (including a prefix character alone): stored as is
                dNode[sArc] = nHash
        # only non-empty sub-dictionaries are added to the node
        for sPrefix, sKey in lSection:
            if dByPrefix[sPrefix]:
                dNode[sKey] = dByPrefix[sPrefix]
        if dRule:
            dNode["<rules>"] = dRule
        #if self.bFinal:
        #    dNode["<final>"] = 1
        return dNode

Modified gc_core/js/lang_core/gc_engine.js from [2769c1001e] to [2d8b7e8dc3].

474
475
476
477
478
479
480


























481
482
483
484
485
486
487
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513







+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+







                            if (bDebug) {
                                console.log("  MATCH: >" + sLemma);
                            }
                            yield { "iToken1": iToken1, "iNode": oNode["<lemmas>"][sLemma] };
                            bTokenFound = true;
                        }
                    }
                }
                // phonetic similarity
                if (oNode.hasOwnProperty("<phonet>")) {
                    for (let sPhonet in oNode["<phonet>"]) {
                        if (sPhonet.endsWith("!")) {
                            let sPhon = sPhonet.slice(0,-1);
                            if (oToken["sValue"] == sPhon) {
                                continue;
                            }
                            if (oToken["sValue"].slice(0,1).gl_isUpperCase()) {
                                if (oToken["sValue"].toLowerCase() == sPhon) {
                                    continue;
                                }
                                if (oToken["sValue"].gl_isUpperCase() && oToken["sValue"].gl_toCapitalize() == sPhon) {
                                    continue;
                                }
                            }
                        }
                        if (phonet.isSimilAs(oToken["sValue"], sPhonet.gl_trimRight("!"))) {
                            if (bDebug) {
                                console.log("  MATCH: %" + sPhonet);
                            }
                            yield { "iToken1": iToken1, "iNode": oNode["<phonet>"][sPhonet] };
                            bTokenFound = true;
                        }
                    }
                }
                // morph arcs
                if (oNode.hasOwnProperty("<morph>")) {
                    let lMorph = (oToken.hasOwnProperty("lMorph")) ? oToken["lMorph"] : gc_engine.oSpellChecker.getMorph(oToken["sValue"]);
                    if (lMorph.length > 0) {
                        for (let sSearch in oNode["<morph>"]) {
                            if (!sSearch.includes("¬")) {

Modified gc_core/py/lang_core/gc_engine.py from [957964b063] to [f9382d41bb].

11
12
13
14
15
16
17

18
19
20
21
22
23
24
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25







+







from ..graphspell.spellchecker import SpellChecker
from ..graphspell.echo import echo

from .. import text

from . import gc_functions
from . import gc_options
from . import phonet

try:
    # LibreOffice / OpenOffice
    from com.sun.star.linguistic2 import SingleProofreadingError
    from com.sun.star.text.TextMarkupType import PROOFREADING
    from com.sun.star.beans import PropertyValue
    #import lightproof_handler_${implname} as opt
457
458
459
460
461
462
463

















464
465
466
467
468
469
470
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488







+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+







            if "<lemmas>" in dNode:
                for sLemma in _oSpellChecker.getLemma(dToken["sValue"]):
                    if sLemma in dNode["<lemmas>"]:
                        if bDebug:
                            echo("  MATCH: >" + sLemma)
                        yield { "iToken1": iToken1, "iNode": dNode["<lemmas>"][sLemma] }
                        bTokenFound = True
            # phonetic similarity
            if "<phonet>" in dNode:
                for sPhonet in dNode["<phonet>"]:
                    if sPhonet.endswith("!"):
                        sPhon = sPhonet[0:-1]
                        if dToken["sValue"] == sPhon:
                            continue
                        if dToken["sValue"][0:1].isupper():
                            if dToken["sValue"].lower() == sPhon:
                                continue
                            if dToken["sValue"].isupper() and dToken["sValue"].capitalize() == sPhon:
                                continue
                    if phonet.isSimilAs(dToken["sValue"], sPhonet.rstrip("!")):
                        if bDebug:
                            echo("  MATCH: %" + sPhonet)
                        yield { "iToken1": iToken1, "iNode": dNode["<phonet>"][sPhonet] }
                        bTokenFound = True
            # morph arcs
            if "<morph>" in dNode:
                lMorph = dToken.get("lMorph", _oSpellChecker.getMorph(dToken["sValue"]))
                if lMorph:
                    for sSearch in dNode["<morph>"]:
                        if "¬" not in sSearch:
                            # no anti-pattern

Modified gc_lang/fr/config.ini from [9e7bd75963] to [4aa81ead77].

1
2
3
4
5
6
7
8
9

10
11
12
13
14
15
16
1
2
3
4
5
6
7
8

9
10
11
12
13
14
15
16








-
+







[args]
lang = fr
lang_name = French
locales = fr_FR fr_BE fr_CA fr_CH fr_LU fr_BF fr_BJ fr_CD fr_CI fr_CM fr_MA fr_ML fr_MU fr_NE fr_RE fr_SN fr_TG
country_default = FR
name = Grammalecte
implname = grammalecte
# always use 3 numbers for version: x.y.z
version = 1.12.2
version = 2.0.0
author = Olivier R.
provider = Grammalecte.net
link = https://grammalecte.net
description = Correcteur grammatical, orthographique et typographique pour le français.
extras = README_fr.txt
logo = logo.png

Modified gc_lang/fr/modules-js/phonet.js from [0562e85836] to [62385b8be1].

25
26
27
28
29
30
31
32

33
34
35
36
37
38
39
25
26
27
28
29
30
31

32
33
34
35
36
37
38
39







-
+







        }
        catch (e) {
            console.error(e);
        }
    },

    hasSimil: function (sWord, sPattern=null) {
        // return True if there is list of words phonetically similar to sWord
        // return True if there is list of words phonetically similar to <sWord>
        if (!sWord) {
            return false;
        }
        if (this._dWord.has(sWord)) {
            if (sPattern) {
                return this.getSimil(sWord).some(sSimil => this._dMorph.gl_get(sSimil, []).some(sMorph => sMorph.search(sPattern) >= 0));
            }
48
49
50
51
52
53
54
55

56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72

73
74
75
76
77
78
79
80
81
82
83
84























85
86
87
88
89
90
91
48
49
50
51
52
53
54

55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71

72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114







-
+
















-
+












+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+







                return true;
            }
        }
        return false;
    },

    getSimil: function (sWord) {
        // return list of words phonetically similar to sWord
        // return list of words phonetically similar to <sWord>
        if (!sWord) {
            return [];
        }
        if (this._dWord.has(sWord)) {
            return this._lSet[this._dWord.get(sWord)];
        }
        if (sWord.slice(0,1).gl_isUpperCase()) {
            sWord = sWord.toLowerCase();
            if (this._dWord.has(sWord)) {
                return this._lSet[this._dWord.get(sWord)];
            }
        }
        return [];
    },

    selectSimil: function (sWord, sPattern) {
        // return a set of words phonetically similar to sWord and whom POS is matching sPattern
        // return a set of words phonetically similar to <sWord> and whom POS is matching <sPattern>
        if (!sPattern) {
            return new Set(this.getSimil(sWord));
        }
        let aSelect = new Set();
        for (let sSimil of this.getSimil(sWord)) {
            for (let sMorph of this._dMorph.gl_get(sSimil, [])) {
                if (sMorph.search(sPattern) >= 0) {
                    aSelect.add(sSimil);
                }
            }
        }
        return aSelect;
    },

    isSimilAs: function (sWord, sSimil) {
        // return True if <sWord> phonetically similar to <sSimil> (<sWord> tested with several casing)
        if (!sWord) {
            return false;
        }
        let lSimils = this.getSimil(sSimil);
        if (lSimils.length == 0) {
            return false;
        }
        if (lSimils.includes(sWord)) {
            return true;
        }
        if (sWord.slice(0,1).gl_isUpperCase()) {
            if (lSimils.includes(sWord.toLowerCase())) {
                return true;
            }
            if (sWord.gl_isUpperCase() && lSimils.includes(sWord.gl_toCapitalize())) {
                return true;
            }
        }
        return false;
    }
};


// Initialization
if (!phonet.bInit && typeof(process) !== 'undefined') {
    // NodeJS

Modified gc_lang/fr/modules/phonet.py from [df9f884192] to [a7ff873dfd].

8
9
10
11
12
13
14
15

16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32

33
34
35
36
37
38
39
40
41
42
43
44
45

46
47
48
49
50
51
52
53

















8
9
10
11
12
13
14

15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31

32
33
34
35
36
37
38
39
40
41
42
43
44

45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70







-
+
















-
+












-
+








+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+

from .phonet_data import dWord as _dWord
from .phonet_data import lSet as _lSet
from .phonet_data import dMorph as _dMorph


def hasSimil (sWord, sPattern=None):
    "return True if there is a list of words phonetically similar to <sWord>"
    # If <sPattern> is given, at least one morphology of one similar word must also match it (re.search).
    if not sWord:
        return False
    if sWord not in _dWord:
        # unknown as written: retry in lowercase, but only for words beginning with an uppercase letter
        if not sWord[0:1].isupper():
            return False
        sWord = sWord.lower()
        if sWord not in _dWord:
            return False
    if not sPattern:
        return True
    return any(re.search(sPattern, sMorph)  for sSimil in getSimil(sWord)  for sMorph in _dMorph.get(sSimil, []))


def getSimil (sWord):
    "return list of words phonetically similar to <sWord>"
    if not sWord:
        return []
    # unknown as written: fall back to lowercase, but only for words beginning with an uppercase letter
    if sWord not in _dWord and sWord[0:1].isupper():
        sWord = sWord.lower()
    if sWord in _dWord:
        # _dWord gives the index in _lSet of the list of similar words
        return _lSet[_dWord[sWord]]
    return []


def selectSimil (sWord, sPattern):
    "return a set of words phonetically similar to <sWord> and whose POS is matching <sPattern>"
    lSimil = getSimil(sWord)
    if not sPattern:
        # no filter: every similar word is selected
        return set(lSimil)
    # keep a similar word as soon as one of its morphologies matches the pattern
    return { sSimil  for sSimil in lSimil  if any(re.search(sPattern, sMorph)  for sMorph in _dMorph.get(sSimil, [])) }


def isSimilAs (sWord, sSimil):
    "tell whether <sWord>, as written or with another casing, is phonetically similar to <sSimil>"
    if not sWord:
        return False
    lSimils = getSimil(sSimil)
    if not lSimils:
        return False
    # candidate spellings: as written, lowercased (if it begins with an uppercase letter),
    # capitalized (if the whole word is uppercase)
    lForm = [sWord]
    if sWord[0:1].isupper():
        lForm.append(sWord.lower())
        if sWord.isupper():
            lForm.append(sWord.capitalize())
    return any(sForm in lSimils  for sForm in lForm)

Modified gc_lang/fr/rules.grx from [0adc5875f6] to [a9ce3440c8].

12255
12256
12257
12258
12259
12260
12261
















12262
12263
12264
12265
12266
12267
12268
12255
12256
12257
12258
12259
12260
12261
12262
12263
12264
12265
12266
12267
12268
12269
12270
12271
12272
12273
12274
12275
12276
12277
12278
12279
12280
12281
12282
12283
12284







+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+







__conf_tandis_que__
    tendis [que|qu’]
        <<- /conf/ not value(<1, "|je|tu|il|elle|iel|on|ne|n’|le|la|les|l’|me|m’|te|t’|se|s’|")
        ->> tandis \2           && Confusion probable. Écrivez “tandis que” s’il s’agit bien de la locution conjonctive exprimant concomitance ou opposition.|https://fr.wiktionary.org/wiki/tandis_que

TEST: mais {{tendis que}} le policier examinait nos papiers             ->> tandis que


# tard / tare
__conf_tard_tare__
    il >être ?$:W¿ %tard!
    [se|s’] >faire %tard!
    [me|m’|te|t’|se|s’] >lever ?$:W¿ %tard!
    [quelque+s|un] temps plus %tard!
        <<- /conf/ --1>> tard                                       && Confusion. Pour dire que le temps a passé, écrivez “tard”.|https://fr.wiktionary.org/wiki/tard

TEST: il est trop {{tare}}                                  ->> tard
TEST: quelque temps plus {{tares}}                          ->> tard
TEST: s’fait {{tare}}                                       ->> tard
TEST: quelque temps plus tard
TEST: QUELQUE TEMPS PLUS TARD
TEST: Quelque Temps Plus Tard


# taule / tôle
__conf_taule_tôle1__
    [>taule] [de|d’|en] [>acier|>alu|>aluminium|>bardage|>cuivre|>étanchéité|>fer|>festonnage|inox|>laiton|>métal|>trapèze|>zinc|>éverite|>fibrociment|>fibro-ciment|>plastique|>polycarbonate|PVC]
        <<- /conf/ -1>> =\1.replace("au", "ô").replace("AU", "Ô")   && Confusion. La taule est la forme argotique pour évoquer la prison, le bordel ou toute forme d’habitation.

TEST: une {{taule}} en acier

Modified misc/grammalecte.sublime-color-scheme from [e0092a90a2] to [c24fa9f267].

1
2
3
4
5
6
7
8
9
10

11
12
13
14
15
16
17
1
2
3
4
5
6
7
8
9

10
11
12
13
14
15
16
17









-
+







{
    "name": "Grammalecte Color Scheme",
    "globals":
    {
        "background":               "hsl(210, 20%, 15%)",
        "foreground":               "hsl(210, 20%, 95%)",

        "caret":                    "hsl(210, 20%, 80%)",
        "block_caret":              "red",
        "line_highlight":           "hsl(210, 60%, 25%)",
        "line_highlight":           "hsl(210, 60%, 30%)",
        "bracket_options":          "underline bold",

        "selection":                "hsl(210, 50%, 20%)",
        "selection_border":         "hsl(210, 80%, 40%)",
        "selection_border_width":   "1",
        "selection_corner_style":   "cut",
        "selection_corner_radius":  "3",
64
65
66
67
68
69
70

71
72
73
74
75
76
77
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78







+








        {   "name": "Entity Valid",         "scope": "entity.valid",        "foreground": "hsl(150, 100%, 80%)",    "background": "hsl(150, 100%, 20%)",    "font_style": "bold",   },
        {   "name": "Entity Invalid",       "scope": "entity.invalid",      "foreground": "hsl(0, 100%, 80%)",      "background": "hsl(0, 100%, 20%)",      "font_style": "bold",   },
        {   "name": "Token meta",           "scope": "string.meta",         "foreground": "hsl(270, 100%, 90%)",    "background": "hsl(270, 100%, 40%)",  },
        {   "name": "Token token",          "scope": "string.token",        "foreground": "hsl(240, 50%, 90%)",     "background": "hsl(240, 50%, 40%)",  },
        {   "name": "Token Jumptoken",      "scope": "string.jumptoken",    "foreground": "hsl(0, 50%, 90%)",       "background": "hsl(10, 50%, 40%)",  },
        {   "name": "Token lemma",          "scope": "string.lemma",        "foreground": "hsl(210, 100%, 80%)",    "background": "hsl(210, 100%, 15%)",  },
        {   "name": "Token phonet",         "scope": "string.phonet",       "foreground": "hsl(90, 100%, 80%)",    "background": "hsl(90, 100%, 10%)",  },
        {   "name": "Token tag",            "scope": "string.tag",          "foreground": "hsl(30, 100%, 90%)",     "background": "hsl(30, 100%, 20%)",  },
        {   "name": "Token regex",          "scope": "string.regex",        "foreground": "hsl(60, 100%, 80%)",     "background": "hsl(60, 100%, 10%)",  },
        {   "name": "Token morph regex",    "scope": "string.morph.regex",  "foreground": "hsl(150, 80%, 90%)",     "background": "hsl(150, 80%, 10%)",  },
        {   "name": "Token morph negregex", "scope": "string.morph.negregex","foreground": "hsl(0, 80%, 90%)",      "background": "hsl(0, 80%, 10%)",  },


        {   "name": "Keyword Python",       "scope": "keyword.python",      "foreground": "#A0A0A0",  },

Modified misc/grammalecte.sublime-syntax from [c9d3e55815] to [90c3fa5c9d].

56
57
58
59
60
61
62
63

64
65
66
67
68
69
70
56
57
58
59
60
61
62

63
64
65
66
67
68
69
70







-
+







    # other.
    - match: '\b(?:if|else|and|or|not|in)\b'
      scope: keyword.python

    - match: '\b(?:True|False|None)\b'
      scope: constant.language

    - match: '\b(?:spell|morph|morphVC|stem|tag|value|space_after|textarea0?\w*|before0?\w*|after0?\w*|word|option|define|define_from|select|exclude|analyse\w*|tag_\w+|apposition|is[A-Z]\w+|rewriteSubject|checkD\w+|getD\w+|has[A-Z]\w+|sugg[A-Z]\w+|switch[A-Z]\w+|ceOrCet|formatN\w+|mbUnit)\b'
    - match: '\b(?:spell|morph|morphVC|stem|tag|value|space_after|textarea0?\w*|before0?\w*|after0?\w*|word|option|define|define_from|select|exclude|analyse\w*|tag_\w+|apposition|is[A-Z]\w+|checkAgreement|rewrite|checkD\w+|getD\w+|has[A-Z]\w+|sugg[A-Z]\w+|switch[A-Z]\w+|ceOrCet|formatN\w+|mbUnit)\b'
      scope: entity.name.function

    - match: '\b(?:replace|endswith|startswith|search|upper|lower|capitalize|strip|rstrip|is(?:alpha|upper|lower|digit|title))\b'
      scope: support.function

    - match: '\becho\b'
      scope: support.function.debug
149
150
151
152
153
154
155





156
157
158
159
160
161
162
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167







+
+
+
+
+







        1: entity.tag.group

    # Tokens
    - match: '(>)[\w-]+'
      scope: string.lemma
      captures:
        1: entity.valid

    - match: '(%)[\w-]+'
      scope: string.phonet
      captures:
        1: entity.valid

    - match: '(~)(?!(?:\d+(?::\d+|)|)>>)[^\s¬]*'
      scope: string.regex
      captures:
        1: entity.valid

    - match: '(@)([^@\s¬]*)'