1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
|
-
+
-
+
|
# Very simple tokenizer
import re
_PATTERNS = {
"default":
(
r'(?P<FOLDER1>/(?:bin|boot|dev|etc|home|lib|mnt|opt|root|sbin|tmp|usr|var|Bureau|Documents|Images|Musique|Public|Téléchargements|Vidéos)(?:/[\w.()]+)*)',
r'(?P<FOLDER2>[a-zA-Z]:\\(?:Program Files(?: [(]x86[)]|)|[\w.()]+)(?:\\[\w.()]+)*)',
r'(?P<PUNC>[.,?!:;…«»“”"()/·]+)',
r'(?P<LINK>(?:https?://|www[.]|\w+[@.]\w+[@.])[\w./?&!%=+*"\'@$#-]+)',
r'(?P<LINK>(?:https?://|www[.]|\w+[@.]\w+[@.])\w[\w./?&!%=+*"\'@$#-]+)',
r'(?P<HASHTAG>[#@][\w-]+)',
r'(?P<HTML><\w+.*?>|</\w+ *>)',
r'(?P<PSEUDOHTML>\[/?\w+\])',
r'(?P<HOUR>\d\d?h\d\d\b)',
r'(?P<NUM>-?\d+(?:[.,]\d+))',
r"(?P<WORD>\w+(?:[’'`-]\w+)*)"
),
"fr":
(
r'(?P<FOLDER1>/(?:bin|boot|dev|etc|home|lib|mnt|opt|root|sbin|tmp|usr|var|Bureau|Documents|Images|Musique|Public|Téléchargements|Vidéos)(?:/[\w.()]+)*)',
r'(?P<FOLDER2>[a-zA-Z]:\\(?:Program Files(?: [(]x86[)]|)|[\w.()]+)(?:\\[\w.()]+)*)',
r'(?P<PUNC>[.,?!:;…«»“”"()/·]+)',
r'(?P<LINK>(?:https?://|www[.]|\w+[@.]\w+[@.])[\w./?&!%=+*"\'@$#-]+)',
r'(?P<LINK>(?:https?://|www[.]|\w+[@.]\w+[@.])\w[\w./?&!%=+*"\'@$#-]+)',
r'(?P<HASHTAG>[#@][\w-]+)',
r'(?P<HTML><\w+.*?>|</\w+ *>)',
r'(?P<PSEUDOHTML>\[/?\w+\])',
r"(?P<ELPFX>(?:l|d|n|m|t|s|j|c|ç|lorsqu|puisqu|jusqu|quoiqu|qu)['’`])",
r'(?P<ORDINAL>\d+(?:er|nd|e|de|ième|ème|eme)\b)',
r'(?P<HOUR>\d\d?h\d\d\b)',
r'(?P<NUM>-?\d+(?:[.,]\d+|))',
|