Grammalecte  Check-in [d12872816f]

Overview
Comment:[graphspell][js] tokenizer: don’t use spaces as tokens, yield information token (start/end)
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | graphspell | rg
Files: files | file ages | folders
SHA3-256: d12872816fe0c2e77014a341e1e3ad1623dd1b5acb91cc89b1564dd9b65b447a
User & Date: olr on 2018-09-11 18:55:24
Other Links: branch diff | manifest | tags
Context
2018-09-11
20:44
[core][py] don’t ask for morphologies several times uselessly check-in: 95ac5ee834 user: olr tags: core, rg
18:55
[graphspell][js] tokenizer: don’t use spaces as tokens, yield information token (start/end) check-in: d12872816f user: olr tags: graphspell, rg
18:38
[core][js] gc engine: fix several bugs check-in: e47d159953 user: olr tags: core, rg
Changes

Modified graphspell-js/tokenizer.js from [4a5b091820] to [d2289444f9].

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
1
2
3
4
5
6
7





8
9
10
11
12
13
14







-
-
-
-
-







// JavaScript
// Very simple tokenizer
/*jslint esversion: 6*/
/*global require,exports*/

"use strict";


if (typeof(require) !== 'undefined') {
    var helpers = require("resource://grammalecte/graphspell/helpers.js");
}


const aTkzPatterns = {
    // All regexps must start with ^.
    "default":
        [
            [/^[   \t]+/, 'SPACE'],
            [/^\/(?:~|bin|boot|dev|etc|home|lib|mnt|opt|root|sbin|tmp|usr|var|Bureau|Documents|Images|Musique|Public|Téléchargements|Vidéos)(?:\/[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_.()-]+)*/, 'FOLDERUNIX'],
58
59
60
61
62
63
64
65

66

67




68
69
70
71

72
73
74
75
76
77
78
79
80
81











82
83
84
85
86



87
88
89
90
91
92
93
53
54
55
56
57
58
59

60
61
62
63
64
65
66
67
68
69
70
71
72










73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98







-
+

+

+
+
+
+




+
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+





+
+
+







        this.sLang = sLang;
        if (!aTkzPatterns.hasOwnProperty(sLang)) {
            this.sLang = "default";
        }
        this.aRules = aTkzPatterns[this.sLang];
    }

    * genTokens (sText) {
    * genTokens (sText, bStartEndToken=false, bWithSpaces=false) {
        let m;
        let iToken;
        let iNext = 0;
        let iEnd = sText.length;
        if (bStartEndToken) {
            yield { "i": 0, "sType": "INFO", "sValue": "<start>", "nStart": 0, "nEnd": 0, "lMorph": ["<start>"] };
        }
        while (sText) {
            let iCut = 1;
            let iToken = 0;
            for (let [zRegex, sType] of this.aRules) {
                if (sType !== "SPACE"  ||  bWithSpaces) {
                try {
                    if ((m = zRegex.exec(sText)) !== null) {
                        iToken += 1;
                        yield { "i": iToken, "sType": sType, "sValue": m[0], "nStart": iNext, "nEnd": iNext + m[0].length }
                        iCut = m[0].length;
                        break;
                    }
                }
                catch (e) {
                    helpers.logerror(e);
                    try {
                        if ((m = zRegex.exec(sText)) !== null) {
                            iToken += 1;
                            yield { "i": iToken, "sType": sType, "sValue": m[0], "nStart": iNext, "nEnd": iNext + m[0].length }
                            iCut = m[0].length;
                            break;
                        }
                    }
                    catch (e) {
                        console.error(e);
                    }
                }
            }
            iNext += iCut;
            sText = sText.slice(iCut);
        }
        if (bStartEndToken) {
            yield { "i": iToken+1, "sType": "INFO", "sValue": "<end>", "nStart": iEnd, "nEnd": iEnd, "lMorph": ["<end>"] };
        }
    }
}


// Publish the Tokenizer class when an `exports` object exists in this environment.
if (typeof exports !== 'undefined') {
    exports.Tokenizer = Tokenizer;
}