1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
|
// JavaScript
// Very simple tokenizer
"use strict";
const helpers = require("resource://grammalecte/helpers.js");
const aPatterns = {
// All regexps must start with ^.
"default":
[
[/^[ \t]+/, 'SPACE'],
[/^[,.;:!?…«»“”"()/·]+/, 'SEPARATOR'],
[/^(?:https?:\/\/|www[.]|[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_]+[@.][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_]+[@.])[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_.\/?&!%=+*"'@$#-]+/, 'LINK'],
[/^[#@][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_-]+/, 'TAG'],
[/^<[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+.*?>|<\/[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+ *>/, 'HTML'],
[/^\[\/?[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+\]/, 'PSEUDOHTML'],
[/^&\w+;(?:\w+;|)/, 'HTMLENTITY'],
[/^\d\d?h\d\d\b/, 'HOUR'],
[/^-?\d+(?:[.,]\d+|)/, 'NUM'],
[/^[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+(?:[’'`-][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+)*/, 'WORD']
],
"fr":
[
[/^[ \t]+/, 'SPACE'],
[/^[,.;:!?…«»“”"()/·]+/, 'SEPARATOR'],
[/^(?:https?:\/\/|www[.]|[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_]+[@.][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_]+[@.])[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_.\/?&!%=+*"'@$#-]+/, 'LINK'],
[/^[#@][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_-]+/, 'TAG'],
[/^<[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+.*?>|<\/[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+ *>/, 'HTML'],
[/^\[\/?[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+\]/, 'PSEUDOHTML'],
[/^&\w+;(?:\w+;|)/, 'HTMLENTITY'],
[/^(?:l|d|n|m|t|s|j|c|ç|lorsqu|puisqu|jusqu|quoiqu|qu)['’`]/i, 'ELPFX'],
[/^\d\d?[hm]\d\d\b/, 'HOUR'],
[/^\d+(?:er|nd|e|de|ième|ème|eme)\b/, 'ORDINAL'],
[/^-?\d+(?:[.,]\d+|)/, 'NUM'],
[/^[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+(?:[’'`-][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+)*/, 'WORD']
]
}
class Tokenizer {
constructor (sLang) {
this.sLang = sLang;
if (!aPatterns.hasOwnProperty(sLang)) {
this.sLang = "default";
}
this.aRules = aPatterns[sLang];
};
* genTokens (sText) {
let m;
let i = 0;
while (sText) {
let nCut = 1;
for (let [zRegex, sType] of this.aRules) {
try {
if ((m = zRegex.exec(sText)) !== null) {
yield { "sType": sType, "sValue": m[0], "nStart": i, "nEnd": i + m[0].length }
nCut = m[0].length;
break;
}
}
catch (e) {
helpers.logerror(e);
}
}
i += nCut;
sText = sText.slice(nCut);
}
}
}
exports.Tokenizer = Tokenizer;
|
>
>
>
>
|
|
>
|
|
|
|
|
|
|
<
>
>
>
>
>
>
|
>
|
>
>
>
>
>
|
>
>
>
>
>
>
>
|
>
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
|
// JavaScript
// Very simple tokenizer
/*jslint esversion: 6*/
/*global require,exports*/
"use strict";
if (typeof(require) !== 'undefined') {
var helpers = require("resource://grammalecte/helpers.js");
}
const aTkzPatterns = {
// All regexps must start with ^.
"default":
[
[/^[ \t]+/, 'SPACE'],
[/^[,.;:!?…«»“”‘’"(){}\[\]/·–—]+/, 'SEPARATOR'],
[/^(?:https?:\/\/|www[.]|[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_]+[@.][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_]+[@.])[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_.\/?&!%=+*"'@$#-]+/, 'LINK'],
[/^[#@][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_-]+/, 'TAG'],
[/^<[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+.*?>|<\/[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+ *>/, 'HTML'],
[/^\[\/?[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+\]/, 'PSEUDOHTML'],
[/^&\w+;(?:\w+;|)/, 'HTMLENTITY'],
[/^\d\d?h\d\d\b/, 'HOUR'],
[/^-?\d+(?:[.,]\d+|)/, 'NUM'],
[/^[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+(?:[’'`-][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+)*/, 'WORD']
],
"fr":
[
[/^[ \t]+/, 'SPACE'],
[/^[,.;:!?…«»“”‘’"(){}\[\]/·–—]+/, 'SEPARATOR'],
[/^(?:https?:\/\/|www[.]|[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_]+[@.][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_]+[@.])[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_.\/?&!%=+*"'@$#-]+/, 'LINK'],
[/^[#@][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_-]+/, 'TAG'],
[/^<[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+.*?>|<\/[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+ *>/, 'HTML'],
[/^\[\/?[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+\]/, 'PSEUDOHTML'],
[/^&\w+;(?:\w+;|)/, 'HTMLENTITY'],
[/^(?:l|d|n|m|t|s|j|c|ç|lorsqu|puisqu|jusqu|quoiqu|qu)['’`]/i, 'ELPFX'],
[/^\d\d?[hm]\d\d\b/, 'HOUR'],
[/^\d+(?:er|nd|e|de|ième|ème|eme)s?\b/, 'ORDINAL'],
[/^-?\d+(?:[.,]\d+|)/, 'NUM'],
[/^[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+(?:[’'`-][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+)*/, 'WORD']
]
};
class Tokenizer {
constructor (sLang) {
this.sLang = sLang;
if (!aTkzPatterns.hasOwnProperty(sLang)) {
this.sLang = "default";
}
this.aRules = aTkzPatterns[this.sLang];
}
* genTokens (sText) {
let m;
let i = 0;
while (sText) {
let nCut = 1;
for (let [zRegex, sType] of this.aRules) {
try {
if ((m = zRegex.exec(sText)) !== null) {
if (sType == 'SEPARATOR') {
for (let c of m[0]) {
yield { "sType": sType, "sValue": c, "nStart": i, "nEnd": i + m[0].length }
}
} else {
yield { "sType": sType, "sValue": m[0], "nStart": i, "nEnd": i + m[0].length }
}
nCut = m[0].length;
break;
}
}
catch (e) {
helpers.logerror(e);
}
}
i += nCut;
sText = sText.slice(nCut);
}
}
getSpellingErrors (sText, oDict) {
let aSpellErr = [];
for (let oToken of this.genTokens(sText)) {
if (oToken.sType === 'WORD' && !oDict.isValidToken(oToken.sValue)) {
aSpellErr.push(oToken);
}
}
return aSpellErr;
}
}
if (typeof(exports) !== 'undefined') {
exports.Tokenizer = Tokenizer;
}
|