41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
|
# common
"nbsp_titles": [("\\bM(mes?|ᵐᵉˢ?|grs?|ᵍʳˢ?|lles?|ˡˡᵉˢ?|rs?|ʳˢ?|M\\.) ", "M\\1 "),
("\\bP(re?s?|ʳᵉ?ˢ?) ", "P\\1 "),
("\\bD(re?s?|ʳᵉ?ˢ?) ", "D\\1 "),
("\\bV(ves?|ᵛᵉˢ?) ", "V\\1 ")],
"nbsp_before_symbol": [("(\\d) ?([%‰€$£¥˚Ω℃])", "\\1 \\2")],
"nbsp_before_units": [("(?<=[0-9⁰¹²³⁴⁵⁶⁷⁸⁹]) ?([kcmµn]?(?:[slgJKΩ]|m[²³]?|Wh?|Hz|dB)|[%‰]|°C)\\b", " \\1")],
"nbsp_repair": [("(?<=[[(])[ ]([!?:;])", "\\1"),
("(https?|ftp)[ ]:(?=//)", "\\1:"),
("&([a-z]+)[ ];", "&\\1;"),
("&#([0-9]+|x[0-9a-fA-F]+)[ ];", "&#\\1;")],
## missing spaces
"add_space_after_punctuation": [("([;!…])(?=\\w)", "\\1 "),
("[?](?=[A-ZÉÈÊÂÀÎ])", "? "),
("\\.(?=[A-ZÉÈÎ][a-zA-ZàâÂéÉèÈêÊîÎïÏôÔöÖûÛüÜùÙ])", ". "),
|
|
|
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
|
# common
"nbsp_titles": [("\\bM(mes?|ᵐᵉˢ?|grs?|ᵍʳˢ?|lles?|ˡˡᵉˢ?|rs?|ʳˢ?|M\\.) ", "M\\1 "),
("\\bP(re?s?|ʳᵉ?ˢ?) ", "P\\1 "),
("\\bD(re?s?|ʳᵉ?ˢ?) ", "D\\1 "),
("\\bV(ves?|ᵛᵉˢ?) ", "V\\1 ")],
"nbsp_before_symbol": [("(\\d) ?([%‰€$£¥˚Ω℃])", "\\1 \\2")],
"nbsp_before_units": [("(?<=[0-9⁰¹²³⁴⁵⁶⁷⁸⁹]) ?([kcmµn]?(?:[slgJKΩ]|m[²³]?|Wh?|Hz|dB)|[%‰]|°C)\\b", " \\1")],
"nbsp_repair": [("(?<=[\\[(])[ ]([!?:;])", "\\1"),
("(https?|ftp)[ ]:(?=//)", "\\1:"),
("&([a-z]+)[ ];", "&\\1;"),
("&#([0-9]+|x[0-9a-fA-F]+)[ ];", "&#\\1;")],
## missing spaces
"add_space_after_punctuation": [("([;!…])(?=\\w)", "\\1 "),
("[?](?=[A-ZÉÈÊÂÀÎ])", "? "),
("\\.(?=[A-ZÉÈÎ][a-zA-ZàâÂéÉèÈêÊîÎïÏôÔöÖûÛüÜùÙ])", ". "),
|
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
|
"erase_non_breaking_hyphens": [("", "")],
## typographic signs
# ts_apostrophe: replace a straight or look-alike apostrophe (' ´ ‘ ′ `) with the
# typographic apostrophe (’) after an elided word.
#   Rule 1: a single elision letter (l, d, n, j, m, t, s, c, ç; case-insensitive)
#           at a word boundary, only when a word character follows the apostrophe.
#   Rule 2: longer elided forms (qu, jusqu, lorsqu, puisqu, ...), case-insensitive,
#           with no constraint on what follows and no word-boundary anchor.
"ts_apostrophe": [ ("(?i)\\b([ldnjmtscç])['´‘′`](?=\\w)", "\\1’"),
("(?i)(qu|jusqu|lorsqu|puisqu|quoiqu|quelqu|presqu|entr|aujourd|prud)['´‘′`]", "\\1’") ],
# ts_ellipsis: turn runs of periods into the single-character ellipsis (…).
#   Rule 1: three consecutive periods become one ellipsis.
#   Rule 2: two periods directly after an ellipsis become a second ellipsis.
#   Rule 3: one stray period after an ellipsis (not followed by another period)
#           is dropped.
# NOTE(review): rules 2 and 3 only make sense after rule 1 has run, so the rules
# are presumably applied in list order -- confirm against the consumer of this table.
"ts_ellipsis": [ ("\\.\\.\\.", "…"),
("(?<=…)[.][.]", "…"),
("…[.](?![.])", "…") ],
"ts_n_dash_middle": [ (" [-—] ", " – "),
(" [-—],", " –,") ],
"ts_m_dash_middle": [ (" [-–] ", " — "),
(" [-–],", " —,") ],
"ts_n_dash_start": [ ("^[-—][ ]", "– "),
("^– ", "– "),
("^[-–—](?=[\\w.…])", "– ") ],
"ts_m_dash_start": [ ("^[-–][ ]", "— "),
("^— ", "— "),
("^«[ ][—–-][ ]", "« — "),
("^[-–—](?=[\\w.…])", "— ") ],
"ts_quotation_marks": [ (u'"(\\w+)"', "“$1”"),
("''(\\w+)''", "“$1”"),
("'(\\w+)'", "“$1”"),
("^(?:\"|'')(?=\\w)", "« "),
(" (?:\"|'')(?=\\w)", " « "),
("\\((?:\"|'')(?=\\w)", "(« "),
("(?<=\\w)(?:\"|'')$", " »"),
("(?<=\\w)(?:\"|'')(?=[] ,.:;?!…)])", " »"),
(u'(?<=[.!?…])" ', " » "),
(u'(?<=[.!?…])"$', " »") ],
"ts_spell": [ ("coeur", "cœur"), ("Coeur", "Cœur"),
("coel(?=[aeio])", "cœl"), ("Coel(?=[aeio])", "Cœl"),
("choeur", "chœur"), ("Choeur", "Chœur"),
("foet", "fœt"), ("Foet", "Fœt"),
("oeil", "œil"), ("Oeil", "Œil"),
("oeno", "œno"), ("Oeno", "Œno"),
("oesoph", "œsoph"), ("Oesoph", "Œsoph"),
|
|
|
|
|
|
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
|
"erase_non_breaking_hyphens": [("", "")],
## typographic signs
# ts_apostrophe: replace a straight or look-alike apostrophe (' ´ ‘ ′ `) with the
# typographic apostrophe (’) after an elided word.
#   Rule 1: a single elision letter (l, d, n, j, m, t, s, c, ç; case-insensitive)
#           at a word boundary, only when a word character follows the apostrophe.
#   Rule 2: longer elided forms (qu, jusqu, lorsqu, puisqu, ...), case-insensitive,
#           with no constraint on what follows and no word-boundary anchor.
"ts_apostrophe": [ ("(?i)\\b([ldnjmtscç])['´‘′`](?=\\w)", "\\1’"),
("(?i)(qu|jusqu|lorsqu|puisqu|quoiqu|quelqu|presqu|entr|aujourd|prud)['´‘′`]", "\\1’") ],
# ts_ellipsis: turn runs of periods into the single-character ellipsis (…).
#   Rule 1: three consecutive periods become one ellipsis.
#   Rule 2: two periods directly after an ellipsis become a second ellipsis.
#   Rule 3: one stray period after an ellipsis (not followed by another period)
#           is dropped.
# NOTE(review): rules 2 and 3 only make sense after rule 1 has run, so the rules
# are presumably applied in list order -- confirm against the consumer of this table.
"ts_ellipsis": [ ("\\.\\.\\.", "…"),
("(?<=…)[.][.]", "…"),
("…[.](?![.])", "…") ],
"ts_n_dash_middle": [ (" [-—] ", " – "),
(" [-—],", " –,") ],
"ts_m_dash_middle": [ (" [-–] ", " — "),
(" [-–],", " —,") ],
"ts_n_dash_start": [ ("^[-—][ ]", "– "),
("^– ", "– "),
("^[-–—](?=[\\w.…])", "– ") ],
"ts_m_dash_start": [ ("^[-–][ ]", "— "),
("^— ", "— "),
("^«[ ][—–-][ ]", "« — "),
("^[-–—](?=[\\w.…])", "— ") ],
"ts_quotation_marks": [ ('"(\\w+)"', "“$1”"),
("''(\\w+)''", "“$1”"),
("'(\\w+)'", "“$1”"),
("^(?:\"|'')(?=\\w)", "« "),
(" (?:\"|'')(?=\\w)", " « "),
("\\((?:\"|'')(?=\\w)", "(« "),
("(?<=\\w)(?:\"|'')$", " »"),
("(?<=\\w)(?:\"|'')(?=[] ,.:;?!…)])", " »"),
('(?<=[.!?…])" ', " » "),
('(?<=[.!?…])"$', " »") ],
"ts_spell": [ ("coeur", "cœur"), ("Coeur", "Cœur"),
("coel(?=[aeio])", "cœl"), ("Coel(?=[aeio])", "Cœl"),
("choeur", "chœur"), ("Choeur", "Chœur"),
("foet", "fœt"), ("Foet", "Fœt"),
("oeil", "œil"), ("Oeil", "Œil"),
("oeno", "œno"), ("Oeno", "Œno"),
("oesoph", "œsoph"), ("Oesoph", "Œsoph"),
|