45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
|
# common
"nbsp_titles": [("\\bM(mes?|ᵐᵉˢ?|grs?|ᵍʳˢ?|lles?|ˡˡᵉˢ?|rs?|ʳˢ?|M\\.) ", "M\\1 "),
("\\bP(re?s?|ʳᵉ?ˢ?) ", "P\\1 "),
("\\bD(re?s?|ʳᵉ?ˢ?) ", "D\\1 "),
("\\bV(ves?|ᵛᵉˢ?) ", "V\\1 ")],
"nbsp_before_symbol": [("(\\d) ?([%‰€$£¥˚Ω℃])", "\\1 \\2")],
"nbsp_before_units": [("(?<=[0-9⁰¹²³⁴⁵⁶⁷⁸⁹]) ?([kcmµn]?(?:[slgJKΩ]|m[²³]?|Wh?|Hz|dB)|[%‰]|°C)\\b", " \\1")],
"nbsp_repair": [("(?<=[[(])[ ]([!?:;])", "\\1"),
("(https?|ftp)[ ]:(?=//)", "\\1:"),
("&([a-z]+)[ ];", "&\\1;"),
("&#([0-9]+|x[0-9a-fA-F]+)[ ];", "&#\\1;")],
## missing spaces
"add_space_after_punctuation": [("([;!…])(?=\\w)", "\\1 "),
("[?](?=[A-ZÉÈÊÂÀÎ])", "? "),
("\\.(?=[A-ZÉÈÎ][a-zA-ZàâÂéÉèÈêÊîÎïÏôÔöÖûÛüÜùÙ])", ". "),
|
|
|
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
|
# common
"nbsp_titles": [("\\bM(mes?|ᵐᵉˢ?|grs?|ᵍʳˢ?|lles?|ˡˡᵉˢ?|rs?|ʳˢ?|M\\.) ", "M\\1 "),
("\\bP(re?s?|ʳᵉ?ˢ?) ", "P\\1 "),
("\\bD(re?s?|ʳᵉ?ˢ?) ", "D\\1 "),
("\\bV(ves?|ᵛᵉˢ?) ", "V\\1 ")],
"nbsp_before_symbol": [("(\\d) ?([%‰€$£¥˚Ω℃])", "\\1 \\2")],
"nbsp_before_units": [("(?<=[0-9⁰¹²³⁴⁵⁶⁷⁸⁹]) ?([kcmµn]?(?:[slgJKΩ]|m[²³]?|Wh?|Hz|dB)|[%‰]|°C)\\b", " \\1")],
"nbsp_repair": [("(?<=[\\[(])[ ]([!?:;])", "\\1"),
("(https?|ftp)[ ]:(?=//)", "\\1:"),
("&([a-z]+)[ ];", "&\\1;"),
("&#([0-9]+|x[0-9a-fA-F]+)[ ];", "&#\\1;")],
## missing spaces
"add_space_after_punctuation": [("([;!…])(?=\\w)", "\\1 "),
("[?](?=[A-ZÉÈÊÂÀÎ])", "? "),
("\\.(?=[A-ZÉÈÎ][a-zA-ZàâÂéÉèÈêÊîÎïÏôÔöÖûÛüÜùÙ])", ". "),
|