Overview
| Comment: | [core] fix tokenizer: two similar group names in regex |
|---|---|
| Downloads: | Tarball | ZIP archive | SQL archive |
| Timelines: | family | ancestors | descendants | both | trunk | core |
| Files: | files | file ages | folders |
| SHA3-256: |
78199c400645aa907358ac2f6330566b |
| User & Date: | olr on 2017-10-24 11:59:45 |
| Other Links: | manifest | tags |
Context
|
2017-10-24
| ||
| 12:05 | [core] ibdawg: reduce hard replacements in suggestion mechanism check-in: 35abf9fb76 user: olr tags: trunk, core | |
| 11:59 | [core] fix tokenizer: two similar group names in regex check-in: 78199c4006 user: olr tags: trunk, core | |
| 11:51 | [fx] CSS: text-align: left, for paragraphs check-in: 6ca9a6e85f user: olr tags: trunk, fx | |
Changes
Modified gc_core/py/tokenizer.py from [d05a70dbc3] to [353949869b].
1 2 3 4 5 6 7 |
# Very simple tokenizer
import re
_PATTERNS = {
"default":
(
| | | | | | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 |
# Very simple tokenizer
import re
_PATTERNS = {
"default":
(
r'(?P<FOLDER1>/(?:bin|boot|dev|etc|home|lib|mnt|opt|root|sbin|tmp|usr|var|Bureau|Documents|Images|Musique|Public|Téléchargements|Vidéos)(?:/[\w.()]+)*)',
r'(?P<FOLDER2>[a-zA-Z]:\\(?:Program Files(?: [(]x86[)]|)|[\w.()]+)(?:\\[\w.()]+)*)',
r'(?P<PUNC>[.,?!:;…«»“”"()/·]+)',
r'(?P<LINK>(?:https?://|www[.]|\w+[@.]\w+[@.])[\w./?&!%=+*"\'@$#-]+)',
r'(?P<HASHTAG>[#@][\w-]+)',
r'(?P<HTML><\w+.*?>|</\w+ *>)',
r'(?P<PSEUDOHTML>\[/?\w+\])',
r'(?P<HOUR>\d\d?h\d\d\b)',
r'(?P<NUM>-?\d+(?:[.,]\d+))',
r"(?P<WORD>\w+(?:[’'`-]\w+)*)"
),
"fr":
(
r'(?P<FOLDER1>/(?:bin|boot|dev|etc|home|lib|mnt|opt|root|sbin|tmp|usr|var|Bureau|Documents|Images|Musique|Public|Téléchargements|Vidéos)(?:/[\w.()]+)*)',
r'(?P<FOLDER2>[a-zA-Z]:\\(?:Program Files(?: [(]x86[)]|)|[\w.()]+)(?:\\[\w.()]+)*)',
r'(?P<PUNC>[.,?!:;…«»“”"()/·]+)',
r'(?P<LINK>(?:https?://|www[.]|\w+[@.]\w+[@.])[\w./?&!%=+*"\'@$#-]+)',
r'(?P<HASHTAG>[#@][\w-]+)',
r'(?P<HTML><\w+.*?>|</\w+ *>)',
r'(?P<PSEUDOHTML>\[/?\w+\])',
r"(?P<ELPFX>(?:l|d|n|m|t|s|j|c|ç|lorsqu|puisqu|jusqu|quoiqu|qu)['’`])",
r'(?P<ORDINAL>\d+(?:er|nd|e|de|ième|ème|eme)\b)',
|
| ︙ | ︙ |