Grammalecte  Diff

Differences From Artifact [8fb9ec33bf]:

To Artifact [219d3111da]:


1




2
3
4
5
6
7
8
#!python3





import re


dReplTable = {
    # surnumerary_spaces
    "start_of_paragraph":          [("^[  ]+", "")],

>
>
>
>







1
2
3
4
5
6
7
8
9
10
11
12
#!python3

"""
Text formatter
"""

import re


dReplTable = {
    # surnumerary_spaces
    "start_of_paragraph":          [("^[  ]+", "")],
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
    # common
    "nbsp_titles":                 [("\\bM(mes?|ᵐᵉˢ?|grs?|ᵍʳˢ?|lles?|ˡˡᵉˢ?|rs?|ʳˢ?|M\\.) ", "M\\1 "),
                                    ("\\bP(re?s?|ʳᵉ?ˢ?) ", "P\\1 "),
                                    ("\\bD(re?s?|ʳᵉ?ˢ?) ", "D\\1 "),
                                    ("\\bV(ves?|ᵛᵉˢ?) ", "V\\1 ")],
    "nbsp_before_symbol":          [("(\\d) ?([%‰€$£¥˚Ω℃])", "\\1 \\2")],
    "nbsp_before_units":           [("(?<=[0-9⁰¹²³⁴⁵⁶⁷⁸⁹]) ?([kcmµn]?(?:[slgJKΩ]|m[²³]?|Wh?|Hz|dB)|[%‰]|°C)\\b", " \\1")],
    "nbsp_repair":                 [("(?<=[[(])[   ]([!?:;])", "\\1"),
                                    ("(https?|ftp)[   ]:(?=//)", "\\1:"),
                                    ("&([a-z]+)[   ];", "&\\1;"),
                                    ("&#([0-9]+|x[0-9a-fA-F]+)[   ];", "&#\\1;")],
    ## missing spaces
    "add_space_after_punctuation": [("([;!…])(?=\\w)", "\\1 "),
                                    ("[?](?=[A-ZÉÈÊÂÀÎ])", "? "),
                                    ("\\.(?=[A-ZÉÈÎ][a-zA-ZàâÂéÉèÈêÊîÎïÏôÔöÖûÛüÜùÙ])", ". "),







|







45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
    # common
    "nbsp_titles":                 [("\\bM(mes?|ᵐᵉˢ?|grs?|ᵍʳˢ?|lles?|ˡˡᵉˢ?|rs?|ʳˢ?|M\\.) ", "M\\1 "),
                                    ("\\bP(re?s?|ʳᵉ?ˢ?) ", "P\\1 "),
                                    ("\\bD(re?s?|ʳᵉ?ˢ?) ", "D\\1 "),
                                    ("\\bV(ves?|ᵛᵉˢ?) ", "V\\1 ")],
    "nbsp_before_symbol":          [("(\\d) ?([%‰€$£¥˚Ω℃])", "\\1 \\2")],
    "nbsp_before_units":           [("(?<=[0-9⁰¹²³⁴⁵⁶⁷⁸⁹]) ?([kcmµn]?(?:[slgJKΩ]|m[²³]?|Wh?|Hz|dB)|[%‰]|°C)\\b", " \\1")],
    "nbsp_repair":                 [("(?<=[\\[(])[   ]([!?:;])", "\\1"),
                                    ("(https?|ftp)[   ]:(?=//)", "\\1:"),
                                    ("&([a-z]+)[   ];", "&\\1;"),
                                    ("&#([0-9]+|x[0-9a-fA-F]+)[   ];", "&#\\1;")],
    ## missing spaces
    "add_space_after_punctuation": [("([;!…])(?=\\w)", "\\1 "),
                                    ("[?](?=[A-ZÉÈÊÂÀÎ])", "? "),
                                    ("\\.(?=[A-ZÉÈÎ][a-zA-ZàâÂéÉèÈêÊîÎïÏôÔöÖûÛüÜùÙ])", ". "),
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
    "erase_non_breaking_hyphens":  [("­", "")],
    ## typographic signs
    "ts_apostrophe":          [ ("(?i)\\b([ldnjmtscç])['´‘′`](?=\\w)", "\\1’"),
                                ("(?i)(qu|jusqu|lorsqu|puisqu|quoiqu|quelqu|presqu|entr|aujourd|prud)['´‘′`]", "\\1’") ],
    "ts_ellipsis":            [ ("\\.\\.\\.", "…"),
                                ("(?<=…)[.][.]", "…"),
                                ("…[.](?![.])", "…") ],
    "ts_n_dash_middle":       [ (" [-—] ", " – "), 
                                (" [-—],", " –,") ],
    "ts_m_dash_middle":       [ (" [-–] ", " — "),
                                (" [-–],", " —,") ],
    "ts_n_dash_start":        [ ("^[-—][  ]", "– "),
                                ("^– ", "– "),
                                ("^[-–—](?=[\\w.…])", "– ") ],
    "ts_m_dash_start":        [ ("^[-–][  ]", "— "),
                                ("^— ", "— "),
                                ("^«[  ][—–-][  ]", "« — "),
                                ("^[-–—](?=[\\w.…])", "— ") ],
    "ts_quotation_marks":     [ (u'"(\\w+)"', "“$1”"),
                                ("''(\\w+)''", "“$1”"),
                                ("'(\\w+)'", "“$1”"),
                                ("^(?:\"|'')(?=\\w)", "« "),
                                (" (?:\"|'')(?=\\w)", " « "),
                                ("\\((?:\"|'')(?=\\w)", "(« "),
                                ("(?<=\\w)(?:\"|'')$", " »"),
                                ("(?<=\\w)(?:\"|'')(?=[] ,.:;?!…)])", " »"),
                                (u'(?<=[.!?…])" ', " » "),
                                (u'(?<=[.!?…])"$', " »") ],
    "ts_spell":               [ ("coeur", "cœur"), ("Coeur", "Cœur"),
                                ("coel(?=[aeio])", "cœl"), ("Coel(?=[aeio])", "Cœl"),
                                ("choeur", "chœur"), ("Choeur", "Chœur"),
                                ("foet", "fœt"), ("Foet", "Fœt"),
                                ("oeil", "œil"), ("Oeil", "Œil"),
                                ("oeno", "œno"), ("Oeno", "Œno"),
                                ("oesoph", "œsoph"), ("Oesoph", "Œsoph"),







|










|







|
|







67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
    "erase_non_breaking_hyphens":  [("­", "")],
    ## typographic signs
    "ts_apostrophe":          [ ("(?i)\\b([ldnjmtscç])['´‘′`](?=\\w)", "\\1’"),
                                ("(?i)(qu|jusqu|lorsqu|puisqu|quoiqu|quelqu|presqu|entr|aujourd|prud)['´‘′`]", "\\1’") ],
    "ts_ellipsis":            [ ("\\.\\.\\.", "…"),
                                ("(?<=…)[.][.]", "…"),
                                ("…[.](?![.])", "…") ],
    "ts_n_dash_middle":       [ (" [-—] ", " – "),
                                (" [-—],", " –,") ],
    "ts_m_dash_middle":       [ (" [-–] ", " — "),
                                (" [-–],", " —,") ],
    "ts_n_dash_start":        [ ("^[-—][  ]", "– "),
                                ("^– ", "– "),
                                ("^[-–—](?=[\\w.…])", "– ") ],
    "ts_m_dash_start":        [ ("^[-–][  ]", "— "),
                                ("^— ", "— "),
                                ("^«[  ][—–-][  ]", "« — "),
                                ("^[-–—](?=[\\w.…])", "— ") ],
    "ts_quotation_marks":     [ ('"(\\w+)"', "“$1”"),
                                ("''(\\w+)''", "“$1”"),
                                ("'(\\w+)'", "“$1”"),
                                ("^(?:\"|'')(?=\\w)", "« "),
                                (" (?:\"|'')(?=\\w)", " « "),
                                ("\\((?:\"|'')(?=\\w)", "(« "),
                                ("(?<=\\w)(?:\"|'')$", " »"),
                                ("(?<=\\w)(?:\"|'')(?=[] ,.:;?!…)])", " »"),
                                ('(?<=[.!?…])" ', " » "),
                                ('(?<=[.!?…])"$', " »") ],
    "ts_spell":               [ ("coeur", "cœur"), ("Coeur", "Cœur"),
                                ("coel(?=[aeio])", "cœl"), ("Coel(?=[aeio])", "Cœl"),
                                ("choeur", "chœur"), ("Choeur", "Chœur"),
                                ("foet", "fœt"), ("Foet", "Fœt"),
                                ("oeil", "œil"), ("Oeil", "Œil"),
                                ("oeno", "œno"), ("Oeno", "Œno"),
                                ("oesoph", "œsoph"), ("Oesoph", "Œsoph"),