@@ -1,15 +1,19 @@
 // JavaScript
 // Very simple tokenizer
 
 "use strict";
 
-const helpers = require("resource://grammalecte/helpers.js");
+
+if (typeof(exports) !== 'undefined') {
+    var helpers = require("resource://grammalecte/helpers.js");
+}
 
-const aPatterns = {
+
+const aTkzPatterns = {
     // All regexps must start with ^.
     "default":
         [
             [/^[ \t]+/, 'SPACE'],
             [/^[,.;:!?…«»“”‘’"(){}\[\]/·–—]+/, 'SEPARATOR'],
             [/^(?:https?:\/\/|www[.]|[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_]+[@.][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_]+[@.])[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_.\/?&!%=+*"'@$#-]+/, 'LINK'],
             [/^[#@][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_-]+/, 'TAG'],
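Two things change in this first hunk: helpers is now loaded only when exports exists, so the file no longer hard-fails in environments without require(); using var instead of const keeps the binding visible outside the if block when it does run. The pattern table is also renamed to aTkzPatterns, presumably to avoid colliding with other pattern globals. The rules themselves are untouched: each regex is anchored with ^ and tried in order against the head of the remaining text. A quick illustration of that anchoring, with invented sample strings and two regexes copied from the table:

// Illustrative only: the sample strings are invented; the regexes are
// the SPACE and TAG rules from the "default" table above.
const zSpace = /^[ \t]+/;
const zTag = /^[#@][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_-]+/;
console.log(zTag.exec("@grammalecte says hi")[0]);  // "@grammalecte"
console.log(zSpace.test("no leading space"));       // false → the next rule is tried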
@@ -31,26 +35,26 @@
             [/^&\w+;(?:\w+;|)/, 'HTMLENTITY'],
             [/^(?:l|d|n|m|t|s|j|c|ç|lorsqu|puisqu|jusqu|quoiqu|qu)['’`]/i, 'ELPFX'],
             [/^\d\d?[hm]\d\d\b/, 'HOUR'],
             [/^\d+(?:er|nd|e|de|ième|ème|eme)s?\b/, 'ORDINAL'],
             [/^-?\d+(?:[.,]\d+|)/, 'NUM'],
             [/^[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+(?:[’'`-][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+)*/, 'WORD']
         ]
-}
+};
 
 
 class Tokenizer {
 
     constructor (sLang) {
         this.sLang = sLang;
-        if (!aPatterns.hasOwnProperty(sLang)) {
+        if (!aTkzPatterns.hasOwnProperty(sLang)) {
             this.sLang = "default";
         }
-        this.aRules = aPatterns[this.sLang];
-    };
+        this.aRules = aTkzPatterns[this.sLang];
+    }
 
     * genTokens (sText) {
         let m;
         let i = 0;
         while (sText) {
             let nCut = 1;
             for (let [zRegex, sType] of this.aRules) {
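The body of the matching loop falls between this hunk and the next, but the visible scaffolding (m, nCut, the catch on helpers.logerror below, and the fields getSpellingErrors reads) implies each rule is exec'd against the head of sText, the winning match is yielded as a token, and its length advances the cursor. A usage sketch under those assumptions; an "fr" rule set is not shown in these hunks, and per the constructor an unknown language falls back to "default" anyway:

// Usage sketch, not part of the changeset: consuming the generator.
// sType and sValue are the token fields read by getSpellingErrors.
const oTokenizer = new Tokenizer("fr");
for (let oToken of oTokenizer.genTokens("Check http://example.com now!")) {
    console.log(oToken.sType, oToken.sValue);
    // expected along the lines of: WORD "Check", SPACE " ",
    // LINK "http://example.com", WORD "now", SEPARATOR "!"
}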
@@ -70,15 +74,15 @@
                 catch (e) {
                     helpers.logerror(e);
                 }
             }
             i += nCut;
             sText = sText.slice(nCut);
         }
-    };
+    }
 
     getSpellingErrors (sText, oDict) {
         let aSpellErr = [];
         for (let oToken of this.genTokens(sText)) {
             if (oToken.sType === 'WORD' && !oDict.isValidToken(oToken.sValue)) {
                 aSpellErr.push(oToken);
             }
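getSpellingErrors only requires oDict to expose isValidToken(); its return statement falls below this hunk, but it presumably hands back aSpellErr once the loop completes. A sketch with a hypothetical stub dictionary, under that assumption:

// Illustrative stub only: a real oDict is whatever dictionary object
// Grammalecte wires in; isValidToken() is the only method used here.
const oDictStub = {
    aKnown: new Set(["bonjour", "le", "monde"]),
    isValidToken (sWord) {
        return this.aKnown.has(sWord.toLowerCase());
    }
};
const oTokenizer = new Tokenizer("default");
const aErrs = oTokenizer.getSpellingErrors("Bonjour le mondde", oDictStub);
console.log(aErrs.map(oToken => oToken.sValue));  // expected: ["mondde"]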