Overview
| Comment: | [graphspell][js] tokenizer: don’t use spaces as tokens, yield information token (start/end) |
|---|---|
| SHA3-256: | d12872816fe0c2e77014a341e1e3ad16 |
| User & Date: | olr on 2018-09-11 18:55:24 |
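For readers skimming the comment above, here is a hedged sketch of what the new token stream looks like from a caller's point of view. The sample text, the `"default"` rule-set choice, and the require path are illustrative assumptions, not part of the check-in: SPACE matches are skipped unless explicitly requested, and when `bStartEndToken` is true the stream is bracketed by two `INFO` tokens.

```js
// Illustrative only: assumes Node.js and the module path of this repository layout.
const { Tokenizer } = require("./graphspell-js/tokenizer.js");

const oTokenizer = new Tokenizer("default");

// bStartEndToken = true, bWithSpaces left at its default (false): no SPACE tokens.
for (let oToken of oTokenizer.genTokens("Salut tout le monde", true)) {
    console.log(oToken.i, oToken.sType, JSON.stringify(oToken.sValue), oToken.nStart, oToken.nEnd);
}
// Expected shape of the output (the sType of each word depends on the rule set):
//   0 INFO "<start>" 0 0
//   1 <word type> "Salut" 0 5      <- no SPACE token between words
//   ...
//   n+1 INFO "<end>" 19 19
```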
Context
2018-09-11
| 20:44 | [core][py] don’t ask for morphologies several times uselessly | check-in: 95ac5ee834 | user: olr | tags: core, rg |
| 18:55 | [graphspell][js] tokenizer: don’t use spaces as tokens, yield information token (start/end) | check-in: d12872816f | user: olr | tags: graphspell, rg |
| 18:38 | [core][js] gc engine: fix several bugs | check-in: e47d159953 | user: olr | tags: core, rg |
Changes
Modified graphspell-js/tokenizer.js from [4a5b091820] to [d2289444f9].
// JavaScript
// Very simple tokenizer
/*jslint esversion: 6*/
/*global require,exports*/
"use strict";
const aTkzPatterns = {
    // All regexps must start with ^.
    "default":
        [
            [/^[ \t]+/, 'SPACE'],
            [/^\/(?:~|bin|boot|dev|etc|home|lib|mnt|opt|root|sbin|tmp|usr|var|Bureau|Documents|Images|Musique|Public|Téléchargements|Vidéos)(?:\/[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_.()-]+)*/, 'FOLDERUNIX'],
︙
        this.sLang = sLang;
        if (!aTkzPatterns.hasOwnProperty(sLang)) {
            this.sLang = "default";
        }
        this.aRules = aTkzPatterns[this.sLang];
    }
    * genTokens (sText, bStartEndToken=false, bWithSpaces=false) {
        let m;
        let iToken = 0;         // token counter, also used for the final <end> token
        let iNext = 0;          // current position in the original text
        let iEnd = sText.length;
        if (bStartEndToken) {
            yield { "i": 0, "sType": "INFO", "sValue": "<start>", "nStart": 0, "nEnd": 0, "lMorph": ["<start>"] };
        }
        while (sText) {
            let iCut = 1;
            for (let [zRegex, sType] of this.aRules) {
                if (sType !== "SPACE" || bWithSpaces) {
                    try {
                        if ((m = zRegex.exec(sText)) !== null) {
                            iToken += 1;
                            yield { "i": iToken, "sType": sType, "sValue": m[0], "nStart": iNext, "nEnd": iNext + m[0].length };
                            iCut = m[0].length;
                            break;
                        }
                    }
                    catch (e) {
                        console.error(e);
                    }
                }
            }
            iNext += iCut;
            sText = sText.slice(iCut);
        }
        if (bStartEndToken) {
            yield { "i": iToken+1, "sType": "INFO", "sValue": "<end>", "nStart": iEnd, "nEnd": iEnd, "lMorph": ["<end>"] };
        }
    }
}
if (typeof(exports) !== 'undefined') {
    exports.Tokenizer = Tokenizer;
}
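Two consequences of the new signature are worth noting: whitespace is still reachable when a caller really needs it (pass `bWithSpaces=true`), and because `nStart`/`nEnd` index into the original string rather than the progressively sliced one, every yielded token can be checked against its source slice. A minimal sketch, under the same illustrative assumptions as above:

```js
const { Tokenizer } = require("./graphspell-js/tokenizer.js");  // path is an assumption
const oTokenizer = new Tokenizer("default");

// bStartEndToken = false, bWithSpaces = true: SPACE tokens are yielded again,
// and each token's offsets point back into the untouched input string.
const sText = "Salut  tout le monde";
for (const oToken of oTokenizer.genTokens(sText, false, true)) {
    console.assert(sText.slice(oToken.nStart, oToken.nEnd) === oToken.sValue);
}
```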