Overview
| Comment: | [core] tokenization: folders |
|---|---|
| Downloads: | Tarball | ZIP archive | SQL archive |
| Timelines: | family | ancestors | descendants | both | trunk | core |
| Files: | files | file ages | folders |
| SHA3-256: |
35c48d42a8037d5e140f6f85505a6ec0 |
| User & Date: | olr on 2017-10-24 11:00:11 |
| Other Links: | manifest | tags |
Context
|
2017-10-24
| ||
| 11:51 | [fx] CSS: text-align: left, for paragraphs check-in: 6ca9a6e85f user: olr tags: trunk, fx | |
| 11:00 | [core] tokenization: folders check-in: 35c48d42a8 user: olr tags: trunk, core | |
| 09:08 | [fr] pt: arborescence check-in: 05863229dc user: olr tags: trunk, fr | |
Changes
Modified gc_core/js/tokenizer.js from [81088a0177] to [9622b0a610].
| ︙ | ︙ | |||
12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 |
const aTkzPatterns = {
// All regexps must start with ^.
"default":
[
[/^[ \t]+/, 'SPACE'],
[/^[,.;:!?…«»“”‘’"(){}\[\]/·–—]+/, 'SEPARATOR'],
[/^(?:https?:\/\/|www[.]|[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_]+[@.][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_]+[@.])[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_.\/?&!%=+*"'@$#-]+/, 'LINK'],
[/^[#@][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_-]+/, 'TAG'],
[/^<[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+.*?>|<\/[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+ *>/, 'HTML'],
[/^\[\/?[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+\]/, 'PSEUDOHTML'],
[/^&\w+;(?:\w+;|)/, 'HTMLENTITY'],
[/^\d\d?h\d\d\b/, 'HOUR'],
[/^-?\d+(?:[.,]\d+|)/, 'NUM'],
[/^[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+(?:[’'`-][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+)*/, 'WORD']
],
"fr":
[
[/^[ \t]+/, 'SPACE'],
[/^[,.;:!?…«»“”‘’"(){}\[\]/·–—]+/, 'SEPARATOR'],
[/^(?:https?:\/\/|www[.]|[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_]+[@.][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_]+[@.])[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_.\/?&!%=+*"'@$#-]+/, 'LINK'],
[/^[#@][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_-]+/, 'TAG'],
[/^<[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+.*?>|<\/[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+ *>/, 'HTML'],
[/^\[\/?[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+\]/, 'PSEUDOHTML'],
[/^&\w+;(?:\w+;|)/, 'HTMLENTITY'],
[/^(?:l|d|n|m|t|s|j|c|ç|lorsqu|puisqu|jusqu|quoiqu|qu)['’`]/i, 'ELPFX'],
| > > > > | 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 |
const aTkzPatterns = {
// All regexps must start with ^.
"default":
[
[/^[ \t]+/, 'SPACE'],
[/^\/(?:bin|boot|dev|etc|home|lib|mnt|opt|root|sbin|tmp|usr|var|Bureau|Documents|Images|Musique|Public|Téléchargements|Vidéos)(?:\/[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st.()]+)*/, 'FOLDER'],
[/^[a-zA-Z]:\\(?:Program Files(?: \(x86\)|)|[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st.()]+)(?:\\[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st.()]+)*/, 'FOLDER'],
[/^[,.;:!?…«»“”‘’"(){}\[\]/·–—]+/, 'SEPARATOR'],
[/^(?:https?:\/\/|www[.]|[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_]+[@.][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_]+[@.])[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_.\/?&!%=+*"'@$#-]+/, 'LINK'],
[/^[#@][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_-]+/, 'TAG'],
[/^<[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+.*?>|<\/[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+ *>/, 'HTML'],
[/^\[\/?[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+\]/, 'PSEUDOHTML'],
[/^&\w+;(?:\w+;|)/, 'HTMLENTITY'],
[/^\d\d?h\d\d\b/, 'HOUR'],
[/^-?\d+(?:[.,]\d+|)/, 'NUM'],
[/^[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+(?:[’'`-][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+)*/, 'WORD']
],
"fr":
[
[/^[ \t]+/, 'SPACE'],
[/^\/(?:bin|boot|dev|etc|home|lib|mnt|opt|root|sbin|tmp|usr|var|Bureau|Documents|Images|Musique|Public|Téléchargements|Vidéos)(?:\/[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st.()]+)*/, 'FOLDER'],
[/^[a-zA-Z]:\\(?:Program Files(?: \(x86\)|)|[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st.()]+)(?:\\[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st.()]+)*/, 'FOLDER'],
[/^[,.;:!?…«»“”‘’"(){}\[\]/·–—]+/, 'SEPARATOR'],
[/^(?:https?:\/\/|www[.]|[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_]+[@.][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_]+[@.])[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_.\/?&!%=+*"'@$#-]+/, 'LINK'],
[/^[#@][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_-]+/, 'TAG'],
[/^<[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+.*?>|<\/[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+ *>/, 'HTML'],
[/^\[\/?[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+\]/, 'PSEUDOHTML'],
[/^&\w+;(?:\w+;|)/, 'HTMLENTITY'],
[/^(?:l|d|n|m|t|s|j|c|ç|lorsqu|puisqu|jusqu|quoiqu|qu)['’`]/i, 'ELPFX'],
|
| ︙ | ︙ |
Modified gc_core/py/tokenizer.py from [27b6fefad2] to [d05a70dbc3].
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 |
# Very simple tokenizer
import re
_PATTERNS = {
"default":
(
r'(?P<PUNC>[.,?!:;…«»“”"()/·]+)',
r'(?P<LINK>(?:https?://|www[.]|\w+[@.]\w+[@.])[\w./?&!%=+*"\'@$#-]+)',
r'(?P<HASHTAG>[#@][\w-]+)',
r'(?P<HTML><\w+.*?>|</\w+ *>)',
r'(?P<PSEUDOHTML>\[/?\w+\])',
r'(?P<HOUR>\d\d?h\d\d\b)',
r'(?P<NUM>-?\d+(?:[.,]\d+))',
r"(?P<WORD>\w+(?:[’'`-]\w+)*)"
),
"fr":
(
r'(?P<PUNC>[.,?!:;…«»“”"()/·]+)',
r'(?P<LINK>(?:https?://|www[.]|\w+[@.]\w+[@.])[\w./?&!%=+*"\'@$#-]+)',
r'(?P<HASHTAG>[#@][\w-]+)',
r'(?P<HTML><\w+.*?>|</\w+ *>)',
r'(?P<PSEUDOHTML>\[/?\w+\])',
r"(?P<ELPFX>(?:l|d|n|m|t|s|j|c|ç|lorsqu|puisqu|jusqu|quoiqu|qu)['’`])",
r'(?P<ORDINAL>\d+(?:er|nd|e|de|ième|ème|eme)\b)',
| > > > > | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 |
# Very simple tokenizer
import re
_PATTERNS = {
"default":
(
r'(?P<FOLDER>/(?:bin|boot|dev|etc|home|lib|mnt|opt|root|sbin|tmp|usr|var|Bureau|Documents|Images|Musique|Public|Téléchargements|Vidéos)(?:/[\w.()]+)*)',
r'(?P<FOLDER>[a-zA-Z]:\\(?:Program Files(?: [(]x86[)]|)|[\w.()]+)(?:\\[\w.()]+)*)',
r'(?P<PUNC>[.,?!:;…«»“”"()/·]+)',
r'(?P<LINK>(?:https?://|www[.]|\w+[@.]\w+[@.])[\w./?&!%=+*"\'@$#-]+)',
r'(?P<HASHTAG>[#@][\w-]+)',
r'(?P<HTML><\w+.*?>|</\w+ *>)',
r'(?P<PSEUDOHTML>\[/?\w+\])',
r'(?P<HOUR>\d\d?h\d\d\b)',
r'(?P<NUM>-?\d+(?:[.,]\d+))',
r"(?P<WORD>\w+(?:[’'`-]\w+)*)"
),
"fr":
(
r'(?P<FOLDER>/(?:bin|boot|dev|etc|home|lib|mnt|opt|root|sbin|tmp|usr|var|Bureau|Documents|Images|Musique|Public|Téléchargements|Vidéos)(?:/[\w.()]+)*)',
r'(?P<FOLDER>[a-zA-Z]:\\(?:Program Files(?: [(]x86[)]|)|[\w.()]+)(?:\\[\w.()]+)*)',
r'(?P<PUNC>[.,?!:;…«»“”"()/·]+)',
r'(?P<LINK>(?:https?://|www[.]|\w+[@.]\w+[@.])[\w./?&!%=+*"\'@$#-]+)',
r'(?P<HASHTAG>[#@][\w-]+)',
r'(?P<HTML><\w+.*?>|</\w+ *>)',
r'(?P<PSEUDOHTML>\[/?\w+\])',
r"(?P<ELPFX>(?:l|d|n|m|t|s|j|c|ç|lorsqu|puisqu|jusqu|quoiqu|qu)['’`])",
r'(?P<ORDINAL>\d+(?:er|nd|e|de|ième|ème|eme)\b)',
|
| ︙ | ︙ |
Modified gc_lang/fr/modules-js/lexicographe.js from [fd0872dfca] to [08da929c5c].
| ︙ | ︙ | |||
218 219 220 221 222 223 224 225 226 227 228 229 230 231 |
case 'LINK':
return { sType: oToken.sType, sValue: oToken.sValue.slice(0,40)+"…", aLabel: ["hyperlien"] };
break;
case 'ELPFX':
let sTemp = oToken.sValue.replace("’", "").replace("'", "").replace("`", "").toLowerCase();
return { sType: oToken.sType, sValue: oToken.sValue, aLabel: [_dPFX.gl_get(sTemp, "préfixe élidé inconnu")] };
break;
case 'WORD':
if (oToken.sValue.gl_count("-") > 4) {
return { sType: "COMPLEX", sValue: oToken.sValue, aLabel: ["élément complexe indéterminé"] };
}
else if (this.oDict.isValidToken(oToken.sValue)) {
let lMorph = this.oDict.getMorph(oToken.sValue);
let aElem = [];
| > > > | 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 |
case 'LINK':
return { sType: oToken.sType, sValue: oToken.sValue.slice(0,40)+"…", aLabel: ["hyperlien"] };
break;
case 'ELPFX':
let sTemp = oToken.sValue.replace("’", "").replace("'", "").replace("`", "").toLowerCase();
return { sType: oToken.sType, sValue: oToken.sValue, aLabel: [_dPFX.gl_get(sTemp, "préfixe élidé inconnu")] };
break;
case 'FOLDER':
return { sType: oToken.sType, sValue: oToken.sValue.slice(0,40)+"…", aLabel: ["dossier"] };
break;
case 'WORD':
if (oToken.sValue.gl_count("-") > 4) {
return { sType: "COMPLEX", sValue: oToken.sValue, aLabel: ["élément complexe indéterminé"] };
}
else if (this.oDict.isValidToken(oToken.sValue)) {
let lMorph = this.oDict.getMorph(oToken.sValue);
let aElem = [];
|
| ︙ | ︙ |
Modified gc_lang/fr/rules.grx from [6e78613320] to [c0ced6ba95].
| ︙ | ︙ | |||
464 465 466 467 468 469 470 |
__[s](p_adresse_IP)__
\d+[.:]\d+[.:]\d+[.:]\d+ <<- ~>> *
# Arborescence
__[s>(p_arborescence_Linux_Mac)__
/(?:bin|boot|dev|etc|home|lib|mnt|opt|root|sbin|tmp|usr|var|Bureau|Documents|Images|Musique|Public|Téléchargements|Vidéos)(?:/[\w.()]+)* <<- ~>> *
| | | < < | 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 |
__[s](p_adresse_IP)__
\d+[.:]\d+[.:]\d+[.:]\d+ <<- ~>> *
# Arborescence
__[s>(p_arborescence_Linux_Mac)__
/(?:bin|boot|dev|etc|home|lib|mnt|opt|root|sbin|tmp|usr|var|Bureau|Documents|Images|Musique|Public|Téléchargements|Vidéos)(?:/[\w.()]+)* <<- ~>> *
__[i>(p_arborescence_Windows)__
[a-z]:\\(?:Program Files(?: [(]x86[)]|)|[\w.()]+)(?:\\[\w.()]+)* <<- ~>> *
# mètres (m)
__[s>(p_chiffres_m)__
"\d+ (m) " @@w <<- ~1>> _
# heures
__[s](p_heures)__
|
| ︙ | ︙ |