| 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
2021
22
23
24
25
26
27
2829
30
31
32
33
34
35
3637
38
39
40
41
42
4344
4546
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
6263
6465
6667
6869
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95 | 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
 | 
-
-
-
-
-
-
-
+
+
-
+
+
-
-
+
+
-
+
-
-
+
+
+
-
+
-
+
+
+
+
+
+
-
+
+
+
-
-
-
+
+
+
-
-
-
-
-
+
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
 | 
// JavaScript
// Very simple tokenizer
/*jslint esversion: 6*/
/*global require,exports*/
"use strict";
const aTkzPatterns = {
    // All regexps must start with ^.
    "default":
        [
            [/^[   \t]+/, 'SPACE'],
            [/^\/(?:~|bin|boot|dev|etc|home|lib|mnt|opt|root|sbin|tmp|usr|var|Bureau|Documents|Images|Musique|Public|Téléchargements|Vidéos)(?:\/[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_.()-]+)*/, 'FOLDERUNIX'],
            [/^[a-zA-Z]:\\(?:Program Files(?: \(x86\)|)|[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st.()]+)(?:\\[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_.()-]+)*/, 'FOLDERWIN'],if (typeof(require) !== 'undefined') {    var helpers = require("resource://grammalecte/graphspell/helpers.js");}            [/^[,.;:!?…«»“”‘’"(){}\[\][/^[,.;:!?…«»“”‘’"(){}\[\]·–—]/, 'SEPARATOR'],
            [/^[A-Z][.][A-Z][.](?:[A-Z][.])*/, 'WORD_ACRONYM'],
            [/^(?:https?:\/\/|www[.]|[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_-]+[@.][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_-]{2,}[@.])[a-zA-Z0-9][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_.\/?&!%=+*"'@$#-]+/, 'LINK'],
            [/^[#@][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_-]+/, 'TAG'],
            [/^<[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+.*?>|<\/[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+ *>/, 'HTML'],
            [/^\[\/?[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+\]/, 'PSEUDOHTML'],
            [/^&\w+;(?:\w+;|)/, 'HTMLENTITY'],
            [/^\d\d?h\d\d\b/, 'HOUR'],/·–—]+/, 'SEPARATOR'],
            [/^[A-Z][.][A-Z][.](?:[A-Z][.])*/, 'ACRONYM'],            [/^[/^\d+(?:[.,]\d+|)/, 'NUM'],
            [/^[%‰+=*/<>⩾⩽-]/, 'SIGN'],
            [/^[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+(?:[’'`-][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+)*/, 'WORD']
        ],
    "fr":
        [
            [/^[   \t]+/, 'SPACE'],
            [/^\/(?:~|bin|boot|dev|etc|home|lib|mnt|opt|root|sbin|tmp|usr|var|Bureau|Documents|Images|Musique|Public|Téléchargements|Vidéos)(?:\/[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_.()-]+)*/, 'FOLDERUNIX'],
            [/^[a-zA-Z]:\\(?:Program Files(?: \(x86\)|)|[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st.()]+)(?:\\[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_.()-]+)*/, 'FOLDERWIN'],-?\d+(?:[.,]\d+|)/, 'NUM'],            [/^[,.;:!?…«»“”‘’"(){}\[\][/^[,.;:!?…«»“”‘’"(){}\[\]·–—]/, 'SEPARATOR'],
            [/^[A-Z][.][A-Z][.](?:[A-Z][.])*/, 'WORD_ACRONYM'],
            [/^(?:https?:\/\/|www[.]|[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_-]+[@.][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_-]{2,}[@.])[a-zA-Z0-9][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_.\/?&!%=+*"'@$#-]+/, 'LINK'],
            [/^[#@][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_-]+/, 'TAG'],
            [/^<[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+.*?>|<\/[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+ *>/, 'HTML'],
            [/^\[\/?[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+\]/, 'PSEUDOHTML'],
            [/^&\w+;(?:\w+;|)/, 'HTMLENTITY'],/·–—]+/, 'SEPARATOR'],
            [/^[A-Z][.][A-Z][.](?:[A-Z][.])*/, 'ACRONYM'],            [/^(?:l|d|n|m|t|s|j|c|ç|lorsqu|puisqu|jusqu|quoiqu|qu)['’`]/i, '[/^(?:l|d|n|m|t|s|j|c|ç|lorsqu|puisqu|jusqu|quoiqu|qu)['’`]/i, 'WORD_ELIDED'],
            [/^\d\d?[hm]\d\d\b/, 'HOUR'],ELPFX'],            [/^\d+(?:er|nd[/^\d+(?:ers?|nds?|es?|des?|ièmes?|èmes?|emes?|ᵉʳˢ?|ⁿᵈˢ?|ᵉˢ?|ᵈᵉˢ?)\b/, 'WORD_ORDINAL'],
            [/^\d+(?:[.,]\d+|)/, 'NUM'],
            [/^[%‰+=*/<>⩾⩽-]/, 'SIGN'],
            [/^[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+(?:[’'`-][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+)*/, 'WORD']
        ]
};
class Tokenizer {
    constructor (sLang) {
        this.sLang = sLang;
        if (!aTkzPatterns.hasOwnProperty(sLang)) {
            this.sLang = "default";
        }
        this.aRules = aTkzPatterns[this.sLang];
    }|e|de|ième|ème|eme)s?\b/, 'ORDINAL'],
            [/^-?\d+(?:[.,]\d+|)/, 'NUM'],    * genTokens (sText) {
* genTokens (sText, bStartEndToken=false, bWithSpaces=false) {
        let m;        let ilet iToken;
        let iNext = 0;
        let iEnd = sText.length;
        if (bStartEndToken) {
            yield { "i": 0, "sType": "INFO", "sValue": "<start>", "nStart": 0, "nEnd": 0, "lMorph": ["<start>"] };
        }
        while (sText) { = 0;            let let iCut = 1;
            let iToken = 0;
            for (let [zRegex, sType] of this.aRules) {
                if (sType !== "SPACE"  ||  bWithSpaces) {nCut = 1;                try {
                    if ((m = zRegex.exec(sText)) !== null) {
                        itry {
                        if ((m = zRegex.exec(sText)) !== null) {
                            iToken += 1;f(sType =='SEPARATOR') {yield { "i": iToken, "sType": sType, "sValue": m[0], "nStart": iNext, "nEnd": iNext + m[0].length }                            for (let c of m[0]) {                                yield { "sType": sType, "sValue": c, "nStart": i, "nEnd": i + m[0].length }                            }                        } else {yield { "sType": sType, "sValue": m[0], "nStart": i, "nEnd": i + m[0].length }iCut = m[0].length;
                            break;
                        }
                    }
                    catch (e) {
                        console.error(e);
                    }
                }
            }
            iNext += iCut;
            sText = sText.slice(iCut);
        }
        if (bStartEndToken) {
            yield { "i": iToken+1, "sType": "INFO", "sValue": "<end>", "nStart": iEnd, "nEnd": iEnd, "lMorph": ["<end>"] };
        }
    }
}
if (typeof(exports) !== 'undefined') {
    exports.Tokenizer = Tokenizer;
}                        }nCut = m[0].length;
                        break;
                    }
                }
                catch (e) {helpers.logerror(e);
                }
            }
            i += nCut;
            sText = sText.slice(nCut); |