Index: graphspell-js/tokenizer.js ================================================================== --- graphspell-js/tokenizer.js +++ graphspell-js/tokenizer.js @@ -24,10 +24,11 @@ [/^&\w+;(?:\w+;|)/, 'HTMLENTITY'], [/^\d\d?[h:]\d\d(?:[m:]\d\ds?|)\b/, 'HOUR'], [/^\d+(?:[.,]\d+|)/, 'NUM'], [/^[&%‰€$+±=*/<>⩾⩽#|×¥£§¢¬÷@-]/, 'SIGN'], [/^[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯff-stᴀ-ᶿ\u0300-\u036fᵉʳˢⁿᵈ]+(?:[’'`-][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯff-stᴀ-ᶿ\u0300-\u036fᵉʳˢⁿᵈ]+)*/, 'WORD'], + [/^_+/, 'UNDERSCORE'], [/^\S/, 'OTHER'] ], "fr": [ [/^[   \t]+/, 'SPACE'], @@ -44,10 +45,11 @@ [/^\d\d?[h:]\d\d(?:[m:]\d\ds?|)\b/, 'HOUR'], [/^\d+(?:ers?\b|res?\b|è[rm]es?\b|i[èe][mr]es?\b|de?s?\b|nde?s?\b|ès?\b|es?\b|ᵉʳˢ?|ʳᵉˢ?|ᵈᵉ?ˢ?|ⁿᵈᵉ?ˢ?|ᵉˢ?)/, 'WORD_ORDINAL'], [/^\d+(?:[.,]\d+|)/, 'NUM'], [/^[&%‰€$+±=*/<>⩾⩽#|×¥£§¢¬÷@-]/, 'SIGN'], [/^[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯff-stᴀ-ᶿ\u0300-\u036fᵉʳˢⁿᵈ]+(?:[’'`-][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯff-stᴀ-ᶿ\u0300-\u036fᵉʳˢⁿᵈ]+)*/, 'WORD'], + [/^_+/, 'UNDERSCORE'], [/^\S/, 'OTHER'], ] }; Index: graphspell/tokenizer.py ================================================================== --- graphspell/tokenizer.py +++ graphspell/tokenizer.py @@ -20,10 +20,11 @@ r'(?P\[/?\w+\])', r'(?P\d\d?[h:]\d\d(?:[m:]\d\ds?|)\b)', r'(?P\d+(?:[.,]\d+))', r'(?P[&%‰€$+±=*/<>⩾⩽#|×¥£§¢¬÷@-])', r"(?P(?:(?!_)[\w\u0300-\u036f])+(?:[’'`-](?:(?!_)[\w\u0300-\u036f])+)*)", # with combining diacritics + r"(?P_+)", r"(?P\S)" ), "fr": ( r'(?P/(?:bin|boot|dev|etc|home|lib|mnt|opt|root|sbin|tmp|usr|var|Bureau|Documents|Images|Musique|Public|Téléchargements|Vidéos)(?:/[\w.()-]+)*)', @@ -38,10 +39,11 @@ r'(?P\d+(?:ers?|res?|è[rm]es?|i[èe][mr]es?|de?s?|nde?s?|ès?|es?|ᵉʳˢ?|ʳᵉˢ?|ᵈᵉ?ˢ?|ⁿᵈᵉ?ˢ?|ᵉˢ?)\b)', r'(?P\d\d?[h:]\d\d(?:[m:]\d\ds?|)\b)', r'(?P\d+(?:[.,]\d+|))', r'(?P[&%‰€$+±=*/<>⩾⩽#|×¥£¢§¬÷@-])', r"(?P(?:(?!_)[\w\u0300-\u036f])+(?:[’'`-](?:(?!_)[\w\u0300-\u036f])+)*)", # with combining diacritics + r"(?P_+)", r"(?P\S)" ) }