Grammalecte  Diff

Differences From Artifact [16e7826100]:

To Artifact [efabea9cdf]:


40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
            [/^\[\/?[a-zA-Z]+\]/, 'PSEUDOHTML'],
            [/^&\w+;(?:\w+;|)/, 'HTMLENTITY'],
            [/^(?:l|d|n|m|t|s|j|c|ç|lorsqu|puisqu|jusqu|quoiqu|qu|presqu|quelqu)['’´‘′`ʼ]/i, 'WORD_ELIDED'],
            [/^\d\d?[h:]\d\d(?:[m:]\d\ds?|)\b/, 'HOUR'],
            [/^\d+(?:ers?\b|res?\b|è[rm]es?\b|i[èe][mr]es?\b|de?s?\b|nde?s?\b|ès?\b|es?\b|ᵉʳˢ?|ʳᵉˢ?|ᵈᵉ?ˢ?|ⁿᵈᵉ?ˢ?|ᵉˢ?)/, 'WORD_ORDINAL'],
            [/^\d+(?:[.,]\d+|)/, 'NUM'],
            [/^[&%‰€$+±=*/<>⩾⩽#|×¥£§¢¬÷@-]/, 'SIGN'],
            [/^[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯff-stᴀ-ᶿᵉʳˢⁿᵈ_]+(?:[’'`-][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯff-stᴀ-ᶿᵉʳˢⁿᵈ_]+)*/, 'WORD']
        ]
};


class Tokenizer {

    constructor (sLang) {







|







40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
            [/^\[\/?[a-zA-Z]+\]/, 'PSEUDOHTML'],
            [/^&\w+;(?:\w+;|)/, 'HTMLENTITY'],
            [/^(?:l|d|n|m|t|s|j|c|ç|lorsqu|puisqu|jusqu|quoiqu|qu|presqu|quelqu)['’´‘′`ʼ]/i, 'WORD_ELIDED'],
            [/^\d\d?[h:]\d\d(?:[m:]\d\ds?|)\b/, 'HOUR'],
            [/^\d+(?:ers?\b|res?\b|è[rm]es?\b|i[èe][mr]es?\b|de?s?\b|nde?s?\b|ès?\b|es?\b|ᵉʳˢ?|ʳᵉˢ?|ᵈᵉ?ˢ?|ⁿᵈᵉ?ˢ?|ᵉˢ?)/, 'WORD_ORDINAL'],
            [/^\d+(?:[.,]\d+|)/, 'NUM'],
            [/^[&%‰€$+±=*/<>⩾⩽#|×¥£§¢¬÷@-]/, 'SIGN'],
            [/^[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯff-stᴀ-ᶿ\u0300-\u036fᵉʳˢⁿᵈ_]+(?:[’'`-][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯff-stᴀ-ᶿ\u0300-\u036fᵉʳˢⁿᵈ_]+)*/, 'WORD']
        ]
};


class Tokenizer {

    constructor (sLang) {
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
        while (sText) {
            let iCut = 1;
            for (let [zRegex, sType] of this.aRules) {
                if (sType !== "SPACE"  ||  bWithSpaces) {
                    try {
                        if ((m = zRegex.exec(sText)) !== null) {
                            iToken += 1;
                            yield { "i": iToken, "sType": sType, "sValue": m[0], "nStart": iNext, "nEnd": iNext + m[0].length };
                            iCut = m[0].length;
                            break;
                        }
                    }
                    catch (e) {
                        console.error(e);
                    }







|







70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
        while (sText) {
            let iCut = 1;
            for (let [zRegex, sType] of this.aRules) {
                if (sType !== "SPACE"  ||  bWithSpaces) {
                    try {
                        if ((m = zRegex.exec(sText)) !== null) {
                            iToken += 1;
                            yield { "i": iToken, "sType": sType, "sValue": m[0], "nStart": iNext, "nEnd": iNext + m[0].length };  // m[0].normalize("NFC") not useful at the moment
                            iCut = m[0].length;
                            break;
                        }
                    }
                    catch (e) {
                        console.error(e);
                    }