Before (old lines 1-15):

// JavaScript
// Very simple tokenizer

"use strict";

const helpers = require("resource://grammalecte/helpers.js");

const aPatterns = {
    // All regexps must start with ^.
    "default":
        [
            [/^[ \t]+/, 'SPACE'],
            [/^[,.;:!?…«»“”‘’"(){}\[\]/·–—]+/, 'SEPARATOR'],
            [/^(?:https?:\/\/|www[.]|[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯﬁ-ﬆ_]+[@.][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯﬁ-ﬆ_]+[@.])[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯﬁ-ﬆ_.\/?&!%=+*"'@$#-]+/, 'LINK'],
            [/^[#@][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯﬁ-ﬆ_-]+/, 'TAG'],
After (new lines 1-19):

// JavaScript
// Very simple tokenizer

"use strict";

if (typeof(exports) !== 'undefined') {
    var helpers = require("resource://grammalecte/helpers.js");
}

const aTkzPatterns = {
    // All regexps must start with ^.
    "default":
        [
            [/^[ \t]+/, 'SPACE'],
            [/^[,.;:!?…«»“”‘’"(){}\[\]/·–—]+/, 'SEPARATOR'],
            [/^(?:https?:\/\/|www[.]|[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯﬁ-ﬆ_]+[@.][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯﬁ-ﬆ_]+[@.])[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯﬁ-ﬆ_.\/?&!%=+*"'@$#-]+/, 'LINK'],
            [/^[#@][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯﬁ-ﬆ_-]+/, 'TAG'],
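This hunk makes two changes: the require() call now runs only when exports is defined, so the file can also be loaded as a plain browser script (where neither exports nor require() exists), and the pattern table is renamed from aPatterns to aTkzPatterns, plausibly to avoid name collisions once the script shares a global scope with other files. The "All regexps must start with ^" comment is what makes the table usable by a left-to-right tokenizer; a quick hypothetical check (not part of the diff) illustrates it:

// Hypothetical check, not part of the diff: every rule is ^-anchored, so it
// either matches at the very start of the string it is given or not at all.
// The tokenizer relies on this to consume its input front to back.
const sSample = "https://grammalecte.net #test";
for (let [zRegex, sType] of aTkzPatterns["default"]) {
    let m = zRegex.exec(sSample);
    if (m !== null) {
        console.log(sType + ": " + m[0]);   // prints "LINK: https://grammalecte.net"
        break;
    }
}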
Before (old lines 38-55):

}


class Tokenizer {

    constructor (sLang) {
        this.sLang = sLang;
        if (!aPatterns.hasOwnProperty(sLang)) {
            this.sLang = "default";
        }
        this.aRules = aPatterns[this.sLang];
    };

    * genTokens (sText) {
        let m;
        let i = 0;
        while (sText) {
            let nCut = 1;
After (new lines 42-59):

}


class Tokenizer {

    constructor (sLang) {
        this.sLang = sLang;
        if (!aTkzPatterns.hasOwnProperty(sLang)) {
            this.sLang = "default";
        }
        this.aRules = aTkzPatterns[this.sLang];
    };

    * genTokens (sText) {
        let m;
        let i = 0;
        while (sText) {
            let nCut = 1;
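Both hunks cut off just after "let nCut = 1;", so the body of the matching loop is not shown. For orientation, here is a minimal, self-contained sketch of how such a loop typically proceeds, inferred from the visible setup (aRules, nCut, the while (sText) loop); this is an assumption for illustration, not the file's actual code:

"use strict";

// Sketch only: a stand-alone version of the matching loop implied by the
// setup above. The name genTokensSketch is invented for this example.
function* genTokensSketch (sText, aRules) {
    let i = 0;
    while (sText) {
        let nCut = 1;                       // default: skip one character
        for (let [zRegex, sType] of aRules) {
            let m = zRegex.exec(sText);     // ^-anchored: matches at 0 or not at all
            if (m !== null) {
                nCut = m[0].length;
                yield { sType: sType, sValue: m[0], nStart: i, nEnd: i + nCut };
                break;
            }
        }
        i += nCut;                          // advance the absolute offset
        sText = sText.slice(nCut);          // consume the processed prefix
    }
}

// Example run with the "default" rules above:
//   for (let oToken of genTokensSketch("#tag here", aTkzPatterns["default"])) {
//       console.log(oToken.sType, JSON.stringify(oToken.sValue));
//   }
// yields TAG "#tag" and SPACE " "; characters no shown rule matches ("h",
// "e", ...) are skipped one at a time through the nCut = 1 default.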