40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
|
[/^\[\/?[a-zA-Z]+\]/, 'PSEUDOHTML'],
[/^&\w+;(?:\w+;|)/, 'HTMLENTITY'],
[/^(?:l|d|n|m|t|s|j|c|ç|lorsqu|puisqu|jusqu|quoiqu|qu|presqu|quelqu)['’´‘′`ʼ]/i, 'WORD_ELIDED'],
[/^\d\d?[h:]\d\d(?:[m:]\d\ds?|)\b/, 'HOUR'],
[/^\d+(?:ers?\b|res?\b|è[rm]es?\b|i[èe][mr]es?\b|de?s?\b|nde?s?\b|ès?\b|es?\b|ᵉʳˢ?|ʳᵉˢ?|ᵈᵉ?ˢ?|ⁿᵈᵉ?ˢ?|ᵉˢ?)/, 'WORD_ORDINAL'],
[/^\d+(?:[.,]\d+|)/, 'NUM'],
[/^[&%‰€$+±=*/<>⩾⩽#|×¥£§¢¬÷@-]/, 'SIGN'],
[/^[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯff-stᴀ-ᶿᵉʳˢⁿᵈ_]+(?:[’'`-][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯff-stᴀ-ᶿᵉʳˢⁿᵈ_]+)*/, 'WORD']
]
};
class Tokenizer {
constructor (sLang) {
|
|
|
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
|
[/^\[\/?[a-zA-Z]+\]/, 'PSEUDOHTML'],
[/^&\w+;(?:\w+;|)/, 'HTMLENTITY'],
[/^(?:l|d|n|m|t|s|j|c|ç|lorsqu|puisqu|jusqu|quoiqu|qu|presqu|quelqu)['’´‘′`ʼ]/i, 'WORD_ELIDED'],
[/^\d\d?[h:]\d\d(?:[m:]\d\ds?|)\b/, 'HOUR'],
[/^\d+(?:ers?\b|res?\b|è[rm]es?\b|i[èe][mr]es?\b|de?s?\b|nde?s?\b|ès?\b|es?\b|ᵉʳˢ?|ʳᵉˢ?|ᵈᵉ?ˢ?|ⁿᵈᵉ?ˢ?|ᵉˢ?)/, 'WORD_ORDINAL'],
[/^\d+(?:[.,]\d+|)/, 'NUM'],
[/^[&%‰€$+±=*/<>⩾⩽#|×¥£§¢¬÷@-]/, 'SIGN'],
[/^[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯff-stᴀ-ᶿ\u0300-\u036fᵉʳˢⁿᵈ_]+(?:[’'`-][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯff-stᴀ-ᶿ\u0300-\u036fᵉʳˢⁿᵈ_]+)*/, 'WORD']
]
};
class Tokenizer {
constructor (sLang) {
|
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
|
while (sText) {
let iCut = 1;
for (let [zRegex, sType] of this.aRules) {
if (sType !== "SPACE" || bWithSpaces) {
try {
if ((m = zRegex.exec(sText)) !== null) {
iToken += 1;
yield { "i": iToken, "sType": sType, "sValue": m[0], "nStart": iNext, "nEnd": iNext + m[0].length };
iCut = m[0].length;
break;
}
}
catch (e) {
console.error(e);
}
|
|
|
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
|
while (sText) {
let iCut = 1;
for (let [zRegex, sType] of this.aRules) {
if (sType !== "SPACE" || bWithSpaces) {
try {
if ((m = zRegex.exec(sText)) !== null) {
iToken += 1;
yield { "i": iToken, "sType": sType, "sValue": m[0], "nStart": iNext, "nEnd": iNext + m[0].length }; // m[0].normalize("NFC") is not useful at the moment
iCut = m[0].length;
break;
}
}
catch (e) {
console.error(e);
}
|