Index: graphspell-js/tokenizer.js ================================================================== --- graphspell-js/tokenizer.js +++ graphspell-js/tokenizer.js @@ -23,11 +23,12 @@ [/^\[\/?[a-zA-Z]+\]/, 'PSEUDOHTML'], [/^&\w+;(?:\w+;|)/, 'HTMLENTITY'], [/^\d\d?[h:]\d\d(?:[m:]\d\ds?|)\b/, 'HOUR'], [/^\d+(?:[.,]\d+|)/, 'NUM'], [/^[&%‰€$+±=*/<>⩾⩽#|×¥£§¢¬÷@-]/, 'SIGN'], - [/^[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯff-stᴀ-ᶿ_]+(?:[’'`-][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯff-stᴀ-ᶿ_]+)*/, 'WORD'] + [/^[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯff-stᴀ-ᶿ_]+(?:[’'`-][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯff-stᴀ-ᶿ_]+)*/, 'WORD'], + [/^\S/, 'OTHER'] ], "fr": [ [/^[   \t]+/, 'SPACE'], [/^\/(?:~|bin|boot|dev|etc|home|lib|mnt|opt|root|sbin|tmp|usr|var|Bureau|Documents|Images|Musique|Public|Téléchargements|Vidéos)(?:\/[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯff-st_.()-]+)*/, 'FOLDERUNIX'], @@ -42,11 +43,12 @@ [/^(?:l|d|n|m|t|s|j|c|ç|lorsqu|puisqu|jusqu|quoiqu|qu|presqu|quelqu)['’ʼ‘‛´`′‵՚ꞌꞋ]/i, 'WORD_ELIDED'], [/^\d\d?[h:]\d\d(?:[m:]\d\ds?|)\b/, 'HOUR'], [/^\d+(?:ers?\b|res?\b|è[rm]es?\b|i[èe][mr]es?\b|de?s?\b|nde?s?\b|ès?\b|es?\b|ᵉʳˢ?|ʳᵉˢ?|ᵈᵉ?ˢ?|ⁿᵈᵉ?ˢ?|ᵉˢ?)/, 'WORD_ORDINAL'], [/^\d+(?:[.,]\d+|)/, 'NUM'], [/^[&%‰€$+±=*/<>⩾⩽#|×¥£§¢¬÷@-]/, 'SIGN'], - [/^[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯff-stᴀ-ᶿ\u0300-\u036fᵉʳˢⁿᵈ_]+(?:[’'`-][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯff-stᴀ-ᶿ\u0300-\u036fᵉʳˢⁿᵈ_]+)*/, 'WORD'] + [/^[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯff-stᴀ-ᶿ\u0300-\u036fᵉʳˢⁿᵈ_]+(?:[’'`-][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯff-stᴀ-ᶿ\u0300-\u036fᵉʳˢⁿᵈ_]+)*/, 'WORD'], + [/^\S/, 'OTHER'], ] }; class Tokenizer { Index: graphspell/tokenizer.py ================================================================== --- graphspell/tokenizer.py +++ graphspell/tokenizer.py @@ -19,11 +19,12 @@ r'(?P<\w+.*?>|)', r'(?P\[/?\w+\])', r'(?P\d\d?[h:]\d\d(?:[m:]\d\ds?|)\b)', r'(?P\d+(?:[.,]\d+))', r'(?P[&%‰€$+±=*/<>⩾⩽#|×¥£§¢¬÷@-])', - r"(?P\w+(?:[’'`-]\w+)*)" + r"(?P[\w\u0300-\u036f]+(?:[’'`-][\w\u0300-\u036f]+)*)", # with combining diacritics + r"(?P\S)" ), "fr": ( r'(?P/(?:bin|boot|dev|etc|home|lib|mnt|opt|root|sbin|tmp|usr|var|Bureau|Documents|Images|Musique|Public|Téléchargements|Vidéos)(?:/[\w.()-]+)*)', r'(?P[a-zA-Z]:\\(?:Program Files(?: [(]x86[)]|)|[\w.()]+)(?:\\[\w.()-]+)*)', @@ -36,11 +37,12 @@ r"(?P(?:l|d|n|m|t|s|j|c|ç|lorsqu|puisqu|jusqu|quoiqu|qu|presqu|quelqu)['’ʼ‘‛´`′‵՚ꞌꞋ])", r'(?P\d+(?:ers?|res?|è[rm]es?|i[èe][mr]es?|de?s?|nde?s?|ès?|es?|ᵉʳˢ?|ʳᵉˢ?|ᵈᵉ?ˢ?|ⁿᵈᵉ?ˢ?|ᵉˢ?)\b)', r'(?P\d\d?[h:]\d\d(?:[m:]\d\ds?|)\b)', r'(?P\d+(?:[.,]\d+|))', r'(?P[&%‰€$+±=*/<>⩾⩽#|×¥£¢§¬÷@-])', - r"(?P[\w\u0300-\u036f]+(?:[’'`-][\w\u0300-\u036f]+)*)" + r"(?P[\w\u0300-\u036f]+(?:[’'`-][\w\u0300-\u036f]+)*)", # with combining diacritics + r"(?P\S)" ) } class Tokenizer: