35
36
37
38
39
40
41
42
43
44
45
46
47
48
|
("[ ]+:", " :")],
"nnbsp_within_quotation_marks":[("«(?=\\w)", "« "),
("«[ ]+", "« "),
("(?<=[\\w.!?])»", " »"),
("[ ]+»", " »")],
"nnbsp_within_numbers": [("(\\d)[ ](\\d)", "\\1 \\2")],
# common
"nbsp_before_symbol": [("(\\d) ?([%‰€$£¥˚Ω℃])", "\\1 \\2")],
"nbsp_before_units": [("(?<=[0-9⁰¹²³⁴⁵⁶⁷⁸⁹]) ?([kcmµn]?(?:[slgJKΩ]|m[²³]?|Wh?|Hz|dB)|[%‰]|°C)\\b", " \\1")],
"nbsp_repair": [("(?<=[[(])[ ]([!?:;])", "\\1"),
("(https?|ftp)[ ]:(?=//)", "\\1:"),
("&([a-z]+)[ ];", "&\\1;"),
("&#([0-9]+|x[0-9a-fA-F]+)[ ];", "&#\\1;")],
## missing spaces
|
>
|
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
|
("[ ]+:", " :")],
"nnbsp_within_quotation_marks":[("«(?=\\w)", "« "),
("«[ ]+", "« "),
("(?<=[\\w.!?])»", " »"),
("[ ]+»", " »")],
"nnbsp_within_numbers": [("(\\d)[ ](\\d)", "\\1 \\2")],
# common
"nbsp_titles": [("M(mes?|ᵐᵉˢ?|grs?|ᵍʳˢ?|lles?|ˡˡᵉˢ?|rs?|ʳˢ?|M\\.) ", "M\\1 ")],
"nbsp_before_symbol": [("(\\d) ?([%‰€$£¥˚Ω℃])", "\\1 \\2")],
"nbsp_before_units": [("(?<=[0-9⁰¹²³⁴⁵⁶⁷⁸⁹]) ?([kcmµn]?(?:[slgJKΩ]|m[²³]?|Wh?|Hz|dB)|[%‰]|°C)\\b", " \\1")],
"nbsp_repair": [("(?<=[[(])[ ]([!?:;])", "\\1"),
("(https?|ftp)[ ]:(?=//)", "\\1:"),
("&([a-z]+)[ ];", "&\\1;"),
("&#([0-9]+|x[0-9a-fA-F]+)[ ];", "&#\\1;")],
## missing spaces
|
192
193
194
195
196
197
198
199
200
201
202
203
204
205
|
("within_quotation_marks", True),
("nbsp_before_punctuation", True),
("nbsp_within_quotation_marks", True),
("nbsp_within_numbers", True),
("nnbsp_before_punctuation", False),
("nnbsp_within_quotation_marks", False),
("nnbsp_within_numbers", False),
("nbsp_before_symbol", True),
("nbsp_before_units", True),
("nbsp_repair", True),
("add_space_after_punctuation", True),
("add_space_around_hyphens", True),
("add_space_repair", True),
("erase_non_breaking_hyphens", False),
|
>
|
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
|
("within_quotation_marks", True),
("nbsp_before_punctuation", True),
("nbsp_within_quotation_marks", True),
("nbsp_within_numbers", True),
("nnbsp_before_punctuation", False),
("nnbsp_within_quotation_marks", False),
("nnbsp_within_numbers", False),
("nbsp_titles", False),
("nbsp_before_symbol", True),
("nbsp_before_units", True),
("nbsp_repair", True),
("add_space_after_punctuation", True),
("add_space_around_hyphens", True),
("add_space_repair", True),
("erase_non_breaking_hyphens", False),
|