Grammalecte  Check-in [b68161b398]

Overview
Comment:[graphspell] tokenizer and suggestion engine: other apostrophes
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | trunk | graphspell
Files: files | file ages | folders
SHA3-256: b68161b39849bb8fe242ed6d3c766a3845c248f7b76f4cc227e4b9b7421d4f2a
User & Date: olr on 2020-05-07 10:35:13
Other Links: manifest | tags
Context
2020-05-07
11:33
[fr] ajustements, +nr: iel·s +état +ppas check-in: f117d9d93a user: olr tags: trunk, fr
10:35
[graphspell] tokenizer and suggestion engine: other apostrophes check-in: b68161b398 user: olr tags: trunk, graphspell
10:34
[fr] ajustements: apostrophes check-in: c379fb8706 user: olr tags: trunk, fr
Changes

Modified graphspell-js/char_player.js from [76dbb40bcc] to [3caadd8250].

105
106
107
108
109
110
111













112
113
114
115
116
117
118
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131







+
+
+
+
+
+
+
+
+
+
+
+
+







    aConsonant: new Set("bcçdfghjklmnñpqrstvwxzBCÇDFGHJKLMNÑPQRSTVWXZ"),
    aDouble: new Set("bcdfjklmnprstzBCDFJKLMNPRSTZ"),  // letters that may be used twice successively


    // Similar chars

    d1to1: new Map([
        ["'", "'’"],    // U+0027: apostrophe droite
        ["’", "’"],     // U+2019: apostrophe typographique  (sera utilisée par défaut)
        ["ʼ", "ʼ’"],    // U+02BC: Lettre modificative apostrophe
        ["‘", "‘’"],    // U+2018: guillemet-apostrophe culbuté
        ["‛", "‛’"],    // U+201B: guillemet-virgule supérieur culbuté
        ["´", "´’"],    // U+00B4: accent aigu
        ["`", "`’"],    // U+0060: accent grave
        ["′", "′’"],    // U+2032: prime
        ["‵", "‵’"],    // U+2035: prime réfléchi
        ["՚", "՚’"],    // U+055A: apostrophe arménienne
        ["ꞌ", "ꞌ’"],    // U+A78C: latin minuscule saltillo
        ["Ꞌ", "Ꞌ’"],    // U+A78B: latin majuscule saltillo

        ["1", "1₁liîLIÎ"],
        ["2", "2₂zZ"],
        ["3", "3₃eéèêEÉÈÊ"],
        ["4", "4₄aàâAÀÂ"],
        ["5", "5₅sgSG"],
        ["6", "6₆bdgBDG"],
        ["7", "7₇ltLT"],

Modified graphspell-js/tokenizer.js from [efabea9cdf] to [0e7b889227].

35
36
37
38
39
40
41
42

43
44
45
46
47
48
49
35
36
37
38
39
40
41

42
43
44
45
46
47
48
49







-
+







            [/^[,.;:!?…«»“”‘’"(){}\[\]·–—¿¡]/, 'PUNC'],
            [/^[A-Z][.][A-Z][.](?:[A-Z][.])*/, 'WORD_ACRONYM'],
            [/^(?:https?:\/\/|www[.]|[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯff-st_-]+[@.][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯff-st_-]{2,}[@.])[a-zA-Z0-9][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯff-st_.\/?&!%=+*"'@$#-]+/, 'LINK'],
            [/^[#@][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯff-st_-]+/, 'TAG'],
            [/^<[a-zA-Z]+.*?>|^<\/[a-zA-Z]+ *>/, 'HTML'],
            [/^\[\/?[a-zA-Z]+\]/, 'PSEUDOHTML'],
            [/^&\w+;(?:\w+;|)/, 'HTMLENTITY'],
            [/^(?:l|d|n|m|t|s|j|c|ç|lorsqu|puisqu|jusqu|quoiqu|qu|presqu|quelqu)['’´‘′]/i, 'WORD_ELIDED'],
            [/^(?:l|d|n|m|t|s|j|c|ç|lorsqu|puisqu|jusqu|quoiqu|qu|presqu|quelqu)['’ʼ‛´`‵՚ꞌꞋ]/i, 'WORD_ELIDED'],
            [/^\d\d?[h:]\d\d(?:[m:]\d\ds?|)\b/, 'HOUR'],
            [/^\d+(?:ers?\b|res?\b|è[rm]es?\b|i[èe][mr]es?\b|de?s?\b|nde?s?\b|ès?\b|es?\b|ᵉʳˢ?|ʳᵉˢ?|ᵈᵉ?ˢ?|ⁿᵈᵉ?ˢ?|ᵉˢ?)/, 'WORD_ORDINAL'],
            [/^\d+(?:[.,]\d+|)/, 'NUM'],
            [/^[&%‰€$+±=*/<>⩾⩽#|×¥£§¢¬÷@-]/, 'SIGN'],
            [/^[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯff-stᴀ-ᶿ\u0300-\u036fᵉʳˢⁿᵈ_]+(?:[’'`-][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯff-stᴀ-ᶿ\u0300-\u036fᵉʳˢⁿᵈ_]+)*/, 'WORD']
        ]
};

Modified graphspell/char_player.py from [fff304a6c5] to [fa338bf2f3].

91
92
93
94
95
96
97













98
99
100
101
102
103
104
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117







+
+
+
+
+
+
+
+
+
+
+
+
+







aConsonant = set("bcçdfghjklmnñpqrstvwxzBCÇDFGHJKLMNÑPQRSTVWXZ")
aDouble = set("bcdfjklmnprstzBCDFJKLMNPRSTZ")  # letters that may be used twice successively


# Similar chars

d1to1 = {
    "'": "'’",  # U+0027: apostrophe droite
    "’": "’",   # U+2019: apostrophe typographique  (sera utilisée par défaut)
    "ʼ": "ʼ’",  # U+02BC: Lettre modificative apostrophe
    "‘": "‘’",  # U+2018: guillemet-apostrophe culbuté
    "‛": "‛’",  # U+201B: guillemet-virgule supérieur culbuté
    "´": "´’",  # U+00B4: accent aigu
    "`": "`’",  # U+0060: accent grave
    "′": "′’",  # U+2032: prime
    "‵": "‵’",  # U+2035: prime réfléchi
    "՚": "՚’",  # U+055A: apostrophe arménienne
    "ꞌ": "ꞌ’",  # U+A78C: latin minuscule saltillo
    "Ꞌ": "Ꞌ’",  # U+A78B: latin majuscule saltillo

    "1": "1₁liîLIÎ",
    "2": "2₂zZ",
    "3": "3₃eéèêEÉÈÊ",
    "4": "4₄aàâAÀÂ",
    "5": "5₅sgSG",
    "6": "6₆bdgBDG",
    "7": "7₇ltLT",

Modified graphspell/tokenizer.py from [81da836011] to [b7228e1a86].

29
30
31
32
33
34
35
36

37
38
39
40
41
42
43
29
30
31
32
33
34
35

36
37
38
39
40
41
42
43







-
+







            r'(?P<FOLDERWIN>[a-zA-Z]:\\(?:Program Files(?: [(]x86[)]|)|[\w.()]+)(?:\\[\w.()-]+)*)',
            r'(?P<PUNC>[][,.;:!?…«»“”‘’"(){}·–—¿¡])',
            r'(?P<WORD_ACRONYM>[A-Z][.][A-Z][.](?:[A-Z][.])*)',
            r'(?P<LINK>(?:https?://|www[.]|\w+[@.]\w\w+[@.])\w[\w./?&!%=+*"\'@$#-]+)',
            r'(?P<HASHTAG>[#@][\w-]+)',
            r'(?P<HTML><\w+.*?>|</\w+ *>)',
            r'(?P<PSEUDOHTML>\[/?\w+\])',
            r"(?P<WORD_ELIDED>(?:l|d|n|m|t|s|j|c|ç|lorsqu|puisqu|jusqu|quoiqu|qu|presqu|quelqu)['’´‘′])",
            r"(?P<WORD_ELIDED>(?:l|d|n|m|t|s|j|c|ç|lorsqu|puisqu|jusqu|quoiqu|qu|presqu|quelqu)['’ʼ‛´`‵՚ꞌꞋ])",
            r'(?P<WORD_ORDINAL>\d+(?:ers?|res?|è[rm]es?|i[èe][mr]es?|de?s?|nde?s?|ès?|es?|ᵉʳˢ?|ʳᵉˢ?|ᵈᵉ?ˢ?|ⁿᵈᵉ?ˢ?|ᵉˢ?)\b)',
            r'(?P<HOUR>\d\d?[h:]\d\d(?:[m:]\d\ds?|)\b)',
            r'(?P<NUM>\d+(?:[.,]\d+|))',
            r'(?P<SIGN>[&%‰€$+±=*/<>⩾⩽#|×¥£¢§¬÷@-])',
            r"(?P<WORD>[\w\u0300-\u036f]+(?:[’'`-][\w\u0300-\u036f]+)*)"
        )
}