"""
Text formatter
"""
import re
# Replacement rules of the text formatter.
#
# Each key is an option name (the same names are used in dDefaultOptions) and
# maps to an ordered list of (regex pattern, replacement template) pairs.
# The pairs of one option are applied one after the other, so their order
# matters.  TextFormatter.__init__ replaces every pattern string with its
# compiled form, in place.
#
# Invisible and compatibility characters are ALWAYS written as escapes, so
# that they cannot be lost or confused with an ordinary space by an editor or
# by a tool that normalises whitespace:
#   \u00a0          NO-BREAK SPACE (nbsp)
#   \u202f          NARROW NO-BREAK SPACE (nnbsp)
#   \u00ad          SOFT HYPHEN
#   \ufb00-\ufb06   Latin ligatures (ff, fi, fl, ffi, ffl, long s + t, st)
# The character class [ \u00a0\u202f] ("any space") accepts the three kinds of
# space so that a rule also normalises text that already contains one of the
# no-break variants, which keeps the rules idempotent.
dReplTable = {
    ## superfluous spaces
    "start_of_paragraph": [
        ("^[ \u00a0\u202f]+", ""),
    ],
    "end_of_paragraph": [
        ("[ \u00a0\u202f]+$", ""),
    ],
    "between_words": [
        (" \u00a0|\u00a0 ", " "),       # space + nbsp (either order) -> space
        (" {2,}", " "),                 # runs of ordinary spaces
        ("\u00a0{2,}", "\u00a0"),       # runs of no-break spaces
    ],
    "before_punctuation": [
        (" +(?=[.,…])", ""),
    ],
    "within_parenthesis": [
        ("\\([ \u00a0\u202f]+", "("),
        ("[ \u00a0\u202f]+\\)", ")"),
    ],
    "within_square_brackets": [
        ("\\[[ \u00a0\u202f]+", "["),
        ("[ \u00a0\u202f]+\\]", "]"),
    ],
    "within_quotation_marks": [
        ("“[ \u00a0\u202f]+", "“"),
        ("[ \u00a0\u202f]”", "”"),
    ],

    ## non-breaking spaces
    # no-break spaces (U+00A0)
    "nbsp_before_punctuation": [
        ("(?<=[]\\w…)»}])([:;?!])[ \u00a0\u202f…]", "\u00a0\\1 "),
        ("(?<=[]\\w…)»}])([:;?!])$", "\u00a0\\1"),
        ("[ \u00a0\u202f]+([:;?!])", "\u00a0\\1"),
    ],
    "nbsp_within_quotation_marks": [
        ("«(?=\\w)", "«\u00a0"),
        ("«[ \u00a0\u202f]+", "«\u00a0"),
        ("(?<=[\\w.!?])»", "\u00a0»"),
        ("[ \u00a0\u202f]+»", "\u00a0»"),
    ],
    "nbsp_within_numbers": [
        ("(\\d)[ \u00a0\u202f](?=\\d)", "\\1\u00a0"),
    ],
    # narrow no-break spaces (U+202F); the colon is handled by separate rules
    # because it takes a regular no-break space (U+00A0)
    "nnbsp_before_punctuation": [
        ("(?<=[]\\w…)»}])([;?!])[ \u00a0\u202f…]", "\u202f\\1 "),
        ("(?<=[]\\w…)»}])([;?!])$", "\u202f\\1"),
        ("[ \u00a0\u202f]+([;?!])", "\u202f\\1"),
        ("(?<=[]\\w…)»}]):", "\u00a0:"),
        ("[ \u00a0\u202f]+:", "\u00a0:"),
    ],
    "nnbsp_within_quotation_marks": [
        ("«(?=\\w)", "«\u202f"),
        ("«[ \u00a0\u202f]+", "«\u202f"),
        ("(?<=[\\w.!?])»", "\u202f»"),
        ("[ \u00a0\u202f]+»", "\u202f»"),
    ],
    "nnbsp_within_numbers": [
        # look-ahead (as in nbsp_within_numbers): the second digit must not be
        # consumed, otherwise every other gap of "1 2 3" is skipped
        ("(\\d)[ \u00a0\u202f](?=\\d)", "\\1\u202f"),
    ],
    # common
    "nbsp_titles": [
        ("\\bM(mes?|ᵐᵉˢ?|grs?|ᵍʳˢ?|lles?|ˡˡᵉˢ?|rs?|ʳˢ?|M\\.) ", "M\\1\u00a0"),
        ("\\bP(re?s?|ʳᵉ?ˢ?) ", "P\\1\u00a0"),
        ("\\bD(re?s?|ʳᵉ?ˢ?) ", "D\\1\u00a0"),
        ("\\bV(ves?|ᵛᵉˢ?) ", "V\\1\u00a0"),
    ],
    "nbsp_before_symbol": [
        ("(\\d) ?([%‰€$£¥˚Ω℃])", "\\1\u00a0\\2"),
    ],
    "nbsp_before_units": [
        ("(?<=[0-9⁰¹²³⁴⁵⁶⁷⁸⁹]) ?([kcmµn]?(?:[slgJKΩ]|m[²³]?|Wh?|Hz|dB)|[%‰]|°C)\\b", "\u00a0\\1"),
    ],
    # undo the spaces wrongly inserted before punctuation by the rules above:
    # after an opening bracket, in URL schemes and in HTML entities
    "nbsp_repair": [
        ("(?<=[\\[(])[ \u00a0\u202f]([!?:;])", "\\1"),
        ("(https?|ftp)[ \u00a0\u202f]:(?=//)", "\\1:"),
        ("&([a-z]+)[ \u00a0\u202f];", "&\\1;"),
        ("&#([0-9]+|x[0-9a-fA-F]+)[ \u00a0\u202f];", "&#\\1;"),
    ],

    ## missing spaces
    "add_space_after_punctuation": [
        ("([;!…])(?=\\w)", "\\1 "),
        ("[?](?=[A-ZÉÈÊÂÀÎ])", "? "),
        ("\\.(?=[A-ZÉÈÎ][a-zA-ZàâÂéÉèÈêÊîÎïÏôÔöÖûÛüÜùÙ])", ". "),
        ("\\.(?=À)", ". "),
        ("(?i)([,:])(?=[a-zàâäéèêëîïôöûüù])", "\\1 "),
        ("(?i)([a-zàâäéèêëîïôöûüù]),(?=[0-9])", "\\1, "),
    ],
    "add_space_around_hyphens": [
        (" ([-–—])(?=[a-zàâäéèêëîïôöûüù\"«“'‘])", " \\1 "),
        ("(?<=[a-zàâäéèêëîïôöûüù\"»”'’])([-–—]) ", " \\1 "),
    ],
    "add_space_repair": [
        ("DnT, ([wA])\\b", "DnT,\\1"),
    ],

    ## erase
    # NOTE(review): the pattern was empty in the previous revision (the rule
    # did nothing); the invisible character is assumed to be U+00AD SOFT
    # HYPHEN, despite the "non_breaking" in the option name -- confirm.
    "erase_non_breaking_hyphens": [
        ("\u00ad", ""),
    ],

    ## typographic signs
    "ts_apostrophe": [
        ("(?i)\\b([ldnjmtscç])['´‘′`](?=\\w)", "\\1’"),
        ("(?i)(qu|jusqu|lorsqu|puisqu|quoiqu|quelqu|presqu|entr|aujourd|prud)['´‘′`]", "\\1’"),
    ],
    "ts_ellipsis": [
        ("\\.\\.\\.", "…"),
        ("(?<=…)[.][.]", "…"),
        ("…[.](?![.])", "…"),
    ],
    "ts_n_dash_middle": [
        (" [-—] ", " – "),
        (" [-—],", " –,"),
    ],
    "ts_m_dash_middle": [
        (" [-–] ", " — "),
        (" [-–],", " —,"),
    ],
    # a dash opening a paragraph is followed by a no-break space
    "ts_n_dash_start": [
        ("^[-—][ \u00a0\u202f]", "–\u00a0"),
        ("^– ", "–\u00a0"),
        ("^[-–—](?=[\\w.…])", "–\u00a0"),
    ],
    "ts_m_dash_start": [
        ("^[-–][ \u00a0\u202f]", "—\u00a0"),
        ("^— ", "—\u00a0"),
        ("^«[ \u00a0\u202f][—–-][ \u00a0\u202f]", "«\u00a0—\u00a0"),
        ("^[-–—](?=[\\w.…])", "—\u00a0"),
    ],
    # Guillemets are emitted together with their no-break space: in
    # dDefaultOptions the nbsp_within_quotation_marks rules come before this
    # option, so they cannot fix the spacing afterwards.
    "ts_quotation_marks": [
        ('"(\\w+)"', "“\\1”"),
        ("''(\\w+)''", "“\\1”"),
        ("'(\\w+)'", "“\\1”"),
        ("^(?:\"|'')(?=\\w)", "«\u00a0"),
        (" (?:\"|'')(?=\\w)", " «\u00a0"),
        ("\\((?:\"|'')(?=\\w)", "(«\u00a0"),
        ("(?<=\\w)(?:\"|'')$", "\u00a0»"),
        ("(?<=\\w)(?:\"|'')(?=[] ,.:;?!…)])", "\u00a0»"),
        ('(?<=[.!?…])" ', "\u00a0» "),
        ('(?<=[.!?…])"$', "\u00a0»"),
    ],
    # missing ligatures (œ, æ) and missing diacritics in frequent words
    "ts_spell": [
        ("coeur", "cœur"), ("Coeur", "Cœur"),
        ("coel(?=[aeio])", "cœl"), ("Coel(?=[aeio])", "Cœl"),
        ("choeur", "chœur"), ("Choeur", "Chœur"),
        ("foet", "fœt"), ("Foet", "Fœt"),
        ("oeil", "œil"), ("Oeil", "Œil"),
        ("oeno", "œno"), ("Oeno", "Œno"),
        ("oesoph", "œsoph"), ("Oesoph", "Œsoph"),
        ("oestro", "œstro"), ("Oestro", "Œstro"),
        ("oeuf", "œuf"), ("Oeuf", "Œuf"),
        ("oeuvr", "œuvr"), ("Oeuvr", "Œuvr"),
        ("moeur", "mœur"), ("Moeur", "Mœur"),
        ("noeu", "nœu"), ("Noeu", "Nœu"),
        ("soeur", "sœur"), ("Soeur", "Sœur"),
        ("voeu", "vœu"), ("Voeu", "Vœu"),
        ("aequo", "æquo"), ("Aequo", "Æquo"),
        ("\\bCa\\b", "Ça"), (" ca\\b", " ça"),
        ("\\bdej[aà]\\b", "déjà"), ("\\bplutot\\b", "plutôt"),
        ("\\bmeme\\b", "même"), ("\\bmemes\\b", "mêmes"), ("\\bMeme\\b", "Même"),
        ("\\b([cC]e(?:ux|lles?|lui))-la\\b", "\\1-là"),
        ("\\bmalgre\\b", "malgré"), ("\\bMalgre\\b", "Malgré"),
        ("\\betre\\b", "être"), ("\\bEtre\\b", "Être"),
        ("\\btres\\b", "très"), ("\\bTres\\b", "Très"),
        ("\\bEtai([ts]|ent)\\b", "Étai\\1"),
        ("\\bE(tat|cole|crit|poque|tude|ducation|glise|conomi(?:qu|)e|videmment|lysée|tienne|thiopie|cosse|gypt(?:e|ien)|rythrée|pinal|vreux)", "É\\1"),
    ],
    # ligatures: the "_on" rules replace letter sequences with the single
    # ligature code point, the "_off" rules expand it again
    "ts_ligature_ffi_on": [("ffi", "\ufb03")],
    "ts_ligature_ffl_on": [("ffl", "\ufb04")],
    "ts_ligature_fi_on": [("fi", "\ufb01")],
    "ts_ligature_fl_on": [("fl", "\ufb02")],
    "ts_ligature_ff_on": [("ff", "\ufb00")],
    # NOTE(review): U+FB05 is the ligature of LONG S and T; it is presumably
    # used here as a look-alike for "ft" -- confirm this is intended.
    "ts_ligature_ft_on": [("ft", "\ufb05")],
    "ts_ligature_st_on": [("st", "\ufb06")],
    "ts_ligature_fi_off": [("\ufb01", "fi")],
    "ts_ligature_fl_off": [("\ufb02", "fl")],
    "ts_ligature_ff_off": [("\ufb00", "ff")],
    "ts_ligature_ffi_off": [("\ufb03", "ffi")],
    "ts_ligature_ffl_off": [("\ufb04", "ffl")],
    "ts_ligature_ft_off": [("\ufb05", "ft")],
    "ts_ligature_st_off": [("\ufb06", "st")],
    # compound units: the dot between two unit symbols becomes a middle dot
    "ts_units": [
        ("\\bN\\.([ms])\\b", "N·\\1"),      # N·m and N·m-1, N·s
        ("\\bW\\.h\\b", "W·h"),
        ("\\bPa\\.s\\b", "Pa·s"),
        ("\\bA\\.h\\b", "A·h"),
        ("\\bΩ\\.m\\b", "Ω·m"),
        ("\\bS\\.m\\b", "S·m"),
        ("\\bg\\.s(?=-1)\\b", "g·s"),
        ("\\bm\\.s(?=-[12])\\b", "m·s"),
        ("\\bg\\.m(?=2|-3)\\b", "g·m"),
        ("\\bA\\.m(?=-1)\\b", "A·m"),
        ("\\bJ\\.K(?=-1)\\b", "J·K"),
        ("\\bW\\.m(?=-2)\\b", "W·m"),
        ("\\bcd\\.m(?=-2)\\b", "cd·m"),
        ("\\bC\\.kg(?=-1)\\b", "C·kg"),
        ("\\bH\\.m(?=-1)\\b", "H·m"),
        ("\\bJ\\.kg(?=-1)\\b", "J·kg"),
        ("\\bJ\\.m(?=-3)\\b", "J·m"),
        ("\\bm[2²]\\.s\\b", "m²·s"),
        ("\\bm[3³]\\.s(?=-1)\\b", "m³·s"),
        # disabled rules for units made of three symbols:
        #("\\bJ.kg-1.K-1\\b", "J·kg-1·K-1"),
        #("\\bW.m-1.K-1\\b", "W·m-1·K-1"),
        #("\\bW.m-2.K-1\\b", "W·m-2·K-1"),
        # NOTE(review): pattern and replacement show the same omega, which
        # makes this rule a no-op; presumably one of the two was meant to be
        # U+2126 OHM SIGN -- confirm the intended direction before changing.
        ("\\b(Y|Z|E|P|T|G|M|k|h|da|d|c|m|µ|n|p|f|a|z|y)Ω\\b", "\\1Ω"),
    ],

    ## misc
    "ordinals_exponant": [
        ("\\b([0-9]+)(?:i?[èe]me|è|e)\\b", "\\1ᵉ"),
        ("\\b([XVICL]+)(?:i?[èe]me|è)\\b", "\\1ᵉ"),
        ("(?<=\\b(au|l[ea]|du) [XVICL])e\\b", "ᵉ"),
        ("(?<=\\b[XVI])e(?= siècle)", "ᵉ"),
        ("(?<=\\b[1I])er\\b", "ᵉʳ"),
        ("(?<=\\b[1I])re\\b", "ʳᵉ"),
    ],
    "ordinals_no_exponant": [
        ("\\b([0-9]+)(?:i?[èe]me|è)\\b", "\\1e"),
        ("\\b([XVICL]+)(?:i?[èe]me|è)\\b", "\\1e"),
        ("(?<=\\b[1I])ᵉʳ\\b", "er"),
        ("(?<=\\b[1I])ʳᵉ\\b", "re"),        # feminine: 1ʳᵉ -> 1re (not 1er)
    ],
    "etc": [
        ("etc(…|[.][.][.]?)", "etc."),
        ("(?<!,) etc[.]", ", etc."),
    ],

    ## missing hyphens
    "mh_interrogatives": [
        ("[ -]t[’'](?=il\\b|elle|on\\b)", "-t-"),
        (" t-(?=il|elle|on)", "-t-"),
        ("[ -]t[’'-](?=ils|elles)", "-"),
        ("(?<=[td])-t-(?=il|elle|on)", "-"),
    ],
    "mh_numbers": [
        ("dix (sept|huit|neuf)", "dix-\\1"),
        ("quatre vingt", "quatre-vingt"),
        ("(soixante|quatre-vingt) dix", "\\1-dix"),
        ("(vingt|trente|quarante|cinquante|soixante(?:-dix|)|quatre-vingt(?:-dix|)) (deux|trois|quatre|cinq|six|sept|huit|neuf)\\b", "\\1-\\2"),
    ],
    "mh_frequent_words": [
        ("(?i)ce(lles?|lui|ux) (ci|là)\\b", "ce\\1-\\2"),
        ("(?i)(?<!-)\\b(ci) (joint|desso?us|contre|devant|avant|après|incluse|g[îi]t|gisent)", "\\1-\\2"),
        ("vis à vis", "vis-à-vis"),
        ("Vis à vis", "Vis-à-vis"),
        ("week end", "week-end"),
        ("Week end", "Week-end"),
        ("(?i)(plus|moins) value", "\\1-value"),
    ],

    ## missing apostrophes
    "ma_word": [
        ("(?i)(qu|lorsqu|puisqu|quoiqu|presqu|jusqu|aujourd|entr|quelqu|prud) ", "\\1’"),
    ],
    "ma_1letter_lowercase": [
        ("\\b([ldjnmtscç]) (?=[aàeéêiîoôuyhAÀEÉÊIÎOÔUYH])", "\\1’"),
    ],
    "ma_1letter_uppercase": [
        ("\\b([LDJNMTSCÇ]) (?=[aàeéêiîoôuyhAÀEÉÊIÎOÔUYH])", "\\1’"),
    ],
}
# Default state (enabled / disabled) of every option of dReplTable.
# The insertion order is significant: TextFormatter.formatText walks this
# dictionary from top to bottom and applies the rules of each enabled option
# in that order.
dDefaultOptions = {
    # units ("ts_units" is deliberately listed first)
    "ts_units": True,

    # superfluous spaces
    "start_of_paragraph": True,
    "end_of_paragraph": True,
    "between_words": True,
    "before_punctuation": True,
    "within_parenthesis": True,
    "within_square_brackets": True,
    "within_quotation_marks": True,

    # non-breaking spaces (regular variant on, narrow variant off)
    "nbsp_before_punctuation": True,
    "nbsp_within_quotation_marks": True,
    "nbsp_within_numbers": True,
    "nnbsp_before_punctuation": False,
    "nnbsp_within_quotation_marks": False,
    "nnbsp_within_numbers": False,
    "nbsp_titles": False,
    "nbsp_before_symbol": True,
    "nbsp_before_units": True,
    "nbsp_repair": True,

    # missing spaces
    "add_space_after_punctuation": True,
    "add_space_around_hyphens": True,
    "add_space_repair": True,

    # erase
    "erase_non_breaking_hyphens": False,

    # typographic signs
    "ts_apostrophe": True,
    "ts_ellipsis": True,
    "ts_n_dash_middle": True,
    "ts_m_dash_middle": False,
    "ts_n_dash_start": False,
    "ts_m_dash_start": True,
    "ts_quotation_marks": True,
    "ts_spell": True,

    # ligatures (all off)
    "ts_ligature_ffi_on": False,
    "ts_ligature_ffl_on": False,
    "ts_ligature_fi_on": False,
    "ts_ligature_fl_on": False,
    "ts_ligature_ff_on": False,
    "ts_ligature_ft_on": False,
    "ts_ligature_st_on": False,
    "ts_ligature_fi_off": False,
    "ts_ligature_fl_off": False,
    "ts_ligature_ff_off": False,
    "ts_ligature_ffi_off": False,
    "ts_ligature_ffl_off": False,
    "ts_ligature_ft_off": False,
    "ts_ligature_st_off": False,

    # misc
    "ordinals_exponant": False,
    "ordinals_no_exponant": True,
    "etc": True,

    # missing hyphens
    "mh_interrogatives": True,
    "mh_numbers": True,
    "mh_frequent_words": True,

    # missing apostrophes
    "ma_word": True,
    "ma_1letter_lowercase": False,
    "ma_1letter_uppercase": False,
}
class TextFormatter:
    """Text Formatter: purge typographic mistakes from text.

    The rules come from the module-level table dReplTable; which of them are
    applied is decided by dDefaultOptions, optionally overridden per call.
    """

    def __init__ (self):
        """Compile the patterns of dReplTable.

        The compiled patterns are stored back into dReplTable itself, so the
        table is shared by every instance.  re.compile() returns an already
        compiled pattern unchanged, which makes creating several formatters
        harmless.
        """
        for lTup in dReplTable.values():
            # slice assignment: the list object stored in the table is kept
            lTup[:] = [ (re.compile(xPattern), sRepl)  for xPattern, sRepl in lTup ]

    def formatText (self, sText, dOpt=None):
        """Return <sText> with the rules of every enabled option applied.

        sText   text to format (typically one paragraph: several rules are
                anchored with ^ and $)
        dOpt    optional mapping {option name: bool} overriding the default
                state of the options it contains, e.g. a modified copy of
                getDefaultOptions(); options it does not mention keep their
                default state and names unknown to the formatter are ignored

        Options are always applied in the order of dDefaultOptions, whatever
        the order of <dOpt>, because later rules rely on the result of
        earlier ones.
        """
        for sOptName, bDefault in dDefaultOptions.items():
            bVal = bDefault  if dOpt is None  else dOpt.get(sOptName, bDefault)
            if bVal:
                for zRgx, sRep in dReplTable[sOptName]:
                    sText = zRgx.sub(sRep, sText)
        return sText

    def getDefaultOptions (self):
        """Return a copy of the default options.

        The copy can be modified freely and passed to formatText().
        """
        return dDefaultOptions.copy()