Index: graphspell-js/char_player.js ================================================================== --- graphspell-js/char_player.js +++ graphspell-js/char_player.js @@ -107,10 +107,23 @@ // Similar chars d1to1: new Map([ + ["'", "'’"], // U+0027: apostrophe droite + ["’", "’"], // U+2019: apostrophe typographique (sera utilisée par défaut) + ["ʼ", "ʼ’"], // U+02BC: Lettre modificative apostrophe + ["‘", "‘’"], // U+2018: guillemet-apostrophe culbuté + ["‛", "‛’"], // U+201B: guillemet-virgule supérieur culbuté + ["´", "´’"], // U+00B4: accent aigu + ["`", "`’"], // U+0060: accent grave + ["′", "′’"], // U+2032: prime + ["‵", "‵’"], // U+2035: prime réfléchi + ["՚", "՚’"], // U+055A: apostrophe arménienne + ["ꞌ", "ꞌ’"], // U+A78C: latin minuscule saltillo + ["Ꞌ", "Ꞌ’"], // U+A78B: latin majuscule saltillo + ["1", "1₁liîLIÎ"], ["2", "2₂zZ"], ["3", "3₃eéèêEÉÈÊ"], ["4", "4₄aàâAÀÂ"], ["5", "5₅sgSG"], Index: graphspell-js/tokenizer.js ================================================================== --- graphspell-js/tokenizer.js +++ graphspell-js/tokenizer.js @@ -37,11 +37,11 @@ [/^(?:https?:\/\/|www[.]|[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯff-st_-]+[@.][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯff-st_-]{2,}[@.])[a-zA-Z0-9][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯff-st_.\/?&!%=+*"'@$#-]+/, 'LINK'], [/^[#@][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯff-st_-]+/, 'TAG'], [/^<[a-zA-Z]+.*?>|^<\/[a-zA-Z]+ *>/, 'HTML'], [/^\[\/?[a-zA-Z]+\]/, 'PSEUDOHTML'], [/^&\w+;(?:\w+;|)/, 'HTMLENTITY'], - [/^(?:l|d|n|m|t|s|j|c|ç|lorsqu|puisqu|jusqu|quoiqu|qu|presqu|quelqu)['’´‘′`ʼ]/i, 'WORD_ELIDED'], + [/^(?:l|d|n|m|t|s|j|c|ç|lorsqu|puisqu|jusqu|quoiqu|qu|presqu|quelqu)['’ʼ‘‛´`′‵՚ꞌꞋ]/i, 'WORD_ELIDED'], [/^\d\d?[h:]\d\d(?:[m:]\d\ds?|)\b/, 'HOUR'], [/^\d+(?:ers?\b|res?\b|è[rm]es?\b|i[èe][mr]es?\b|de?s?\b|nde?s?\b|ès?\b|es?\b|ᵉʳˢ?|ʳᵉˢ?|ᵈᵉ?ˢ?|ⁿᵈᵉ?ˢ?|ᵉˢ?)/, 'WORD_ORDINAL'], [/^\d+(?:[.,]\d+|)/, 'NUM'], [/^[&%‰€$+±=*/<>⩾⩽#|×¥£§¢¬÷@-]/, 'SIGN'], [/^[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯff-stᴀ-ᶿ\u0300-\u036fᵉʳˢⁿᵈ_]+(?:[’'`-][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯff-stᴀ-ᶿ\u0300-\u036fᵉʳˢⁿᵈ_]+)*/, 'WORD'] Index: graphspell/char_player.py ================================================================== --- graphspell/char_player.py +++ graphspell/char_player.py @@ -93,10 +93,23 @@ # Similar chars d1to1 = { + "'": "'’", # U+0027: apostrophe droite + "’": "’", # U+2019: apostrophe typographique (sera utilisée par défaut) + "ʼ": "ʼ’", # U+02BC: Lettre modificative apostrophe + "‘": "‘’", # U+2018: guillemet-apostrophe culbuté + "‛": "‛’", # U+201B: guillemet-virgule supérieur culbuté + "´": "´’", # U+00B4: accent aigu + "`": "`’", # U+0060: accent grave + "′": "′’", # U+2032: prime + "‵": "‵’", # U+2035: prime réfléchi + "՚": "՚’", # U+055A: apostrophe arménienne + "ꞌ": "ꞌ’", # U+A78C: latin minuscule saltillo + "Ꞌ": "Ꞌ’", # U+A78B: latin majuscule saltillo + "1": "1₁liîLIÎ", "2": "2₂zZ", "3": "3₃eéèêEÉÈÊ", "4": "4₄aàâAÀÂ", "5": "5₅sgSG", Index: graphspell/tokenizer.py ================================================================== --- graphspell/tokenizer.py +++ graphspell/tokenizer.py @@ -31,11 +31,11 @@ r'(?P[A-Z][.][A-Z][.](?:[A-Z][.])*)', r'(?P(?:https?://|www[.]|\w+[@.]\w\w+[@.])\w[\w./?&!%=+*"\'@$#-]+)', r'(?P[#@][\w-]+)', r'(?P<\w+.*?>|)', r'(?P\[/?\w+\])', - r"(?P(?:l|d|n|m|t|s|j|c|ç|lorsqu|puisqu|jusqu|quoiqu|qu|presqu|quelqu)['’´‘′`ʼ])", + r"(?P(?:l|d|n|m|t|s|j|c|ç|lorsqu|puisqu|jusqu|quoiqu|qu|presqu|quelqu)['’ʼ‘‛´`′‵՚ꞌꞋ])", r'(?P\d+(?:ers?|res?|è[rm]es?|i[èe][mr]es?|de?s?|nde?s?|ès?|es?|ᵉʳˢ?|ʳᵉˢ?|ᵈᵉ?ˢ?|ⁿᵈᵉ?ˢ?|ᵉˢ?)\b)', r'(?P\d\d?[h:]\d\d(?:[m:]\d\ds?|)\b)', r'(?P\d+(?:[.,]\d+|))', r'(?P[&%‰€$+±=*/<>⩾⩽#|×¥£¢§¬÷@-])', r"(?P[\w\u0300-\u036f]+(?:[’'`-][\w\u0300-\u036f]+)*)"