Grammalecte  Diff

Differences From Artifact [a6594366c3]:

To Artifact [fcd058bf6a]:


1
2
3
4
5


6
7


8
9
10
11
12
13
14
15
// JavaScript
// Very simple tokenizer

"use strict";



const helpers = require("resource://grammalecte/helpers.js");



const aPatterns = {
    // All regexps must start with ^.
    "default":
        [
            [/^[   \t]+/, 'SPACE'],
            [/^[,.;:!?…«»“”‘’"(){}\[\]/·–—]+/, 'SEPARATOR'],
            [/^(?:https?:\/\/|www[.]|[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_]+[@.][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_]+[@.])[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_.\/?&!%=+*"'@$#-]+/, 'LINK'],
            [/^[#@][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_-]+/, 'TAG'],





>
>
|
|
>
>
|







1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
// JavaScript
// Very simple tokenizer

"use strict";


if (typeof(exports) !== 'undefined') {
    var helpers = require("resource://grammalecte/helpers.js");
}


const aTkzPatterns = {
    // All regexps must start with ^.
    "default":
        [
            [/^[   \t]+/, 'SPACE'],
            [/^[,.;:!?…«»“”‘’"(){}\[\]/·–—]+/, 'SEPARATOR'],
            [/^(?:https?:\/\/|www[.]|[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_]+[@.][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_]+[@.])[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_.\/?&!%=+*"'@$#-]+/, 'LINK'],
            [/^[#@][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_-]+/, 'TAG'],
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49

50
51
52
53
54
55
56
            [/^&\w+;(?:\w+;|)/, 'HTMLENTITY'],
            [/^(?:l|d|n|m|t|s|j|c|ç|lorsqu|puisqu|jusqu|quoiqu|qu)['’`]/i, 'ELPFX'],
            [/^\d\d?[hm]\d\d\b/, 'HOUR'],
            [/^\d+(?:er|nd|e|de|ième|ème|eme)s?\b/, 'ORDINAL'],
            [/^-?\d+(?:[.,]\d+|)/, 'NUM'],
            [/^[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+(?:[’'`-][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+)*/, 'WORD']
        ]
}


class Tokenizer {

    constructor (sLang) {
        this.sLang = sLang;
        if (!aPatterns.hasOwnProperty(sLang)) {
            this.sLang = "default";
        }
        this.aRules = aPatterns[this.sLang];
    };


    * genTokens (sText) {
        let m;
        let i = 0;
        while (sText) {
            let nCut = 1;
            for (let [zRegex, sType] of this.aRules) {







|






|


|
<
>







35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52

53
54
55
56
57
58
59
60
            [/^&\w+;(?:\w+;|)/, 'HTMLENTITY'],
            [/^(?:l|d|n|m|t|s|j|c|ç|lorsqu|puisqu|jusqu|quoiqu|qu)['’`]/i, 'ELPFX'],
            [/^\d\d?[hm]\d\d\b/, 'HOUR'],
            [/^\d+(?:er|nd|e|de|ième|ème|eme)s?\b/, 'ORDINAL'],
            [/^-?\d+(?:[.,]\d+|)/, 'NUM'],
            [/^[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+(?:[’'`-][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+)*/, 'WORD']
        ]
};


class Tokenizer {

    constructor (sLang) {
        this.sLang = sLang;
        if (!aTkzPatterns.hasOwnProperty(sLang)) {
            this.sLang = "default";
        }
        this.aRules = aTkzPatterns[this.sLang];

    }

    * genTokens (sText) {
        let m;
        let i = 0;
        while (sText) {
            let nCut = 1;
            for (let [zRegex, sType] of this.aRules) {
70
71
72
73
74
75
76
77

78
79
80
81
82
83
84
                catch (e) {
                    helpers.logerror(e);
                }
            }
            i += nCut;
            sText = sText.slice(nCut);
        }
    };


    getSpellingErrors (sText, oDict) {
        let aSpellErr = [];
        for (let oToken of this.genTokens(sText)) {
            if (oToken.sType === 'WORD' && !oDict.isValidToken(oToken.sValue)) {
                aSpellErr.push(oToken);
            }







<
>







74
75
76
77
78
79
80

81
82
83
84
85
86
87
88
                catch (e) {
                    helpers.logerror(e);
                }
            }
            i += nCut;
            sText = sText.slice(nCut);
        }

    }

    getSpellingErrors (sText, oDict) {
        let aSpellErr = [];
        for (let oToken of this.genTokens(sText)) {
            if (oToken.sType === 'WORD' && !oDict.isValidToken(oToken.sValue)) {
                aSpellErr.push(oToken);
            }