ADDED gc_lang/fr/oxt/TextFormatter/tf_tabrep.py Index: gc_lang/fr/oxt/TextFormatter/tf_tabrep.py ================================================================== --- gc_lang/fr/oxt/TextFormatter/tf_tabrep.py +++ gc_lang/fr/oxt/TextFormatter/tf_tabrep.py @@ -0,0 +1,423 @@ +# Regular expressions for the text formatter of LO +# working with ICU (bag of bugs) + + +# ICU: & is $0 in replacement field + +# NOTE: A LOT OF REGEX COULD BE MERGED IF ICU ENGINE WAS NOT SO BUGGY +# "([;?!…])(?=[:alnum:])" => "$1 " doesn’t work properly +# "(?<=[:alnum:])([;?!…])" => " $1 " doesn’t work properly + + + +# +# String to replace replacement regex? case sensitive? +# + +dTableRepl = { + # Restructuration + "struct1": [ + ("\\n", "\\n", True, True) # end of line => end of paragraph + ], + "struct2": [ + ("([:alpha:])- *\n([:alpha:])", "$1$2", True, False) # EOL + ], + + # espaces surnuméraires + "ssp1": [ + ("^[  ]+", "", True, True) + ], + "ssp2": [ + ("  ", " ", False, True), # espace + espace insécable -> espace + ("  ", " ", False, True), # espace insécable + espace -> espace + (" +", " ", True, True), # espaces surnuméraires + ("  +", " ", True, True) # espaces insécables surnuméraires + ], + "ssp3": [ + ("[  ]+$", "", True, True) + ], + "ssp4": [ + (" +(?=[.,…])", "", True, True) + ], + "ssp5": [ + ("\\([  ]+", "(", True, True), + ("[  ]+\\)", ")", True, True) + ], + "ssp6": [ + ("\\[[  ]+", "[", True, True), + ("[  ]+\\]", "]", True, True) + ], + "ssp7": [ + ("“[  ]+", "“", True, True), + ("[  ]”", "”", True, True) + ], + + # espaces insécables + "nbsp1": [ + ("(?<=[:alnum:]):[   ]", " : ", True, False), + ("(?<=[:alnum:]):$", " :", True, False), + ("(?<=[:alnum:]);", " ;", True, False), + ("(?<=[:alnum:])[?][   ]", " ? ", True, False), + ("(?<=[:alnum:])[?]$", " ?", True, False), + ("(?<=[:alnum:])!", " !", True, False), + ("(?<=[]…)»}]):", " :", True, False), + ("(?<=[]…)»}]);", " ;", True, False), + ("(?<=[]…)»}])[?][   ]", " ? ", True, False), + ("(?<=[]…)»}])[?]$", " ?", True, False), + ("(?<=[]…)»}])!", " !", True, False), + ("[  ]+([:;?!])", " $1", True, False) + ], + "nnbsp1": [ + ("(?<=[:alnum:]);", " ;", True, False), + ("(?<=[:alnum:])[?][   ]", " ? ", True, False), + ("(?<=[:alnum:])[?]$", " ?", True, False), + ("(?<=[:alnum:])!", " !", True, False), + ("(?<=[]…)»}]);", " ;", True, False), + ("(?<=[]…)»}])[?][   ]", " ? ", True, False), + ("(?<=[]…)»}])[?]$", " ?", True, False), + ("(?<=[]…)»}])!", " !", True, False), + ("[  ]+([;?!])", " $1", True, False), + ("(?<=[:alnum:]):[   ]", " : ", True, False), + ("(?<=[:alnum:]):$", " :", True, False), + ("(?<=[]…)»}]):", " :", True, False), + ("[  ]+:", " :", True, False) + ], + "nbsp1_fix": [ + ("([[(])[   ]([!?:;])", "$1$2", True, False), + ("(?<=http)[   ]://", "://", True, False), + ("(?<=https)[   ]://", "://", True, False), + ("(?<=ftp)[   ]://", "://", True, False), + ("(?<=&)amp[   ];", "amp;", True, False), + ("(?<=&)nbsp[   ];", "nbsp;", True, False), + ("(?<=&)lt[   ];", "lt;", True, False), + ("(?<=&)gt[   ];", "gt;", True, False), + ("(?<=&)apos[   ];", "apos;", True, False), + ("(?<=&)quot[   ];", "quot;", True, False), + ("(?<=&)thinsp[   ];", "thinsp;", True, False) + ], + "nbsp2": [ + ("«(?=[:alnum:])", "« ", True, False), + ("«[  ]+", "« ", True, False), + ("(?<=[:alnum:]|[.!?])»", " »", True, False), + ("[  ]+»", " »", True, False) + ], + "nnbsp2": [ + ("«(?=[:alnum:])", "« ", True, False), + ("«[  ]+", "« ", True, False), + ("(?<=[:alnum:]|[.!?])»", " »", True, False), + ("[  ]+»", " »", True, False) + ], + "nbsp3": [ + ("([:digit:])([%‰€$£¥˚℃])", "$1 $2", True, True), + ("([:digit:]) ([%‰€$£¥˚℃])", "$1 $2", True, True), + ], + "nbsp4": [ + ("([:digit:])[  ]([:digit:])", "$1 $2", True, True) + ], + "nnbsp4": [ + ("([:digit:])[  ]([:digit:])", "$1 $2", True, True) + ], + "nbsp5": [ + ("(?<=[0-9⁰¹²³⁴⁵⁶⁷⁸⁹]) ?([kcmµnd]?(?:[slgJKΩΩℓ]|m[²³]?|Wh?|Hz|dB)|[%‰]|°C)\\b", " $1", True, True) + ], + "nbsp6": [ + ("\\b(MM?\\.|Mlle|Mgr) ", "$1 ", True, True) + ], + + # espaces manquants + "space1": [ + (";(?=[:alnum:])", "; ", True, True), + ("\\?(?=[A-ZÉÈÊÂÀÎ])", "? ", True, True), + ("!(?=[:alnum:])", "! ", True, True), + ("…(?=[:alnum:])", "… ", True, True), + ("\\.(?=[A-ZÉÈÎ][:alpha:])", ". ", True, True), + ("\\.(?=À)", ". ", True, True), + (",(?=[:alpha:])", ", ", True, True), + ("([:alpha:]),([0-9])", "$1, $2", True, True), + (":(?=[:alpha:])", ": ", True, True) + ], + "space1_fix": [ + ("(?<=DnT), w\\b", ",w", True, True), + ("(?<=DnT), A\\b", ",A", True, True) + ], + "space2": [ + (" -(?=[:alpha:]|[\"«“'‘])", " - ", True, False), + (" –(?=[:alpha:]|[\"«“'‘])", " – ", True, False), # demi-cadratin + (" —(?=[:alpha:]|[\"«“'‘])", " — ", True, False), # cadratin + ("(?<=[:alpha:])– ", " – ", True, False), + ("(?<=[:alpha:])— ", " — ", True, False), + ("(?<=[:alpha:])- ", " - ", True, False), + ("(?<=[\"»”'’])– ", " – ", True, False), + ("(?<=[\"»”'’])— ", " — ", True, False), + ("(?<=[\"»”'’])- ", " - ", True, False) + ], + + # Suppressions + "delete1": [ + ("­", "", False, True) + ], + + # Signes typographiques + "typo1": [ + ("\\bl['´‘′`](?=[:alnum:])", "l’", True, True), + ("\\bj['´‘′`](?=[:alnum:])", "j’", True, True), + ("\\bm['´‘′`](?=[:alnum:])", "m’", True, True), + ("\\bt['´‘′`](?=[:alnum:])", "t’", True, True), + ("\\bs['´‘′`](?=[:alnum:])", "s’", True, True), + ("\\bc['´‘′`](?=[:alnum:])", "c’", True, True), + ("\\bd['´‘′`](?=[:alnum:])", "d’", True, True), + ("\\bn['´‘′`](?=[:alnum:])", "n’", True, True), + ("\\bç['´‘′`](?=[:alnum:])", "ç’", True, True), + ("\\bL['´‘′`](?=[:alnum:])", "L’", True, True), + ("\\bJ['´‘′`](?=[:alnum:])", "J’", True, True), + ("\\bM['´‘′`](?=[:alnum:])", "M’", True, True), + ("\\bT['´‘′`](?=[:alnum:])", "T’", True, True), + ("\\bS['´‘′`](?=[:alnum:])", "S’", True, True), + ("\\bC['´‘′`](?=[:alnum:])", "C’", True, True), + ("\\bD['´‘′`](?=[:alnum:])", "D’", True, True), + ("\\bN['´‘′`](?=[:alnum:])", "N’", True, True), + ("\\bÇ['´‘′`](?=[:alnum:])", "Ç’", True, True), + ("(qu|jusqu|lorsqu|puisqu|quoiqu|quelqu|presqu|entr|aujourd|prud)['´‘′`]", "$1’", True, False) + ], + "typo2": [ + ("...", "…", False, True), + ("(?<=…)[.][.]", "…", True, True), + ("…[.](?![.])", "…", True, True) + ], + "typo3a": [ # cadratin + (" - ", " — ", False, True), + (" – ", " — ", False, True), + (" -,", " —,", False, True), + (" –,", " —,", False, True) + ], + "typo3b": [ # demi-cadratin + (" - ", " – ", False, True), + (" — ", " – ", False, True), + (" -,", " –,", False, True), + (" —,", " –,", False, True) + ], + "typo4a": [ # cadratin + ("^-[  ]", "— ", True, True), + ("^–[  ]", "— ", True, True), + ("^— ", "— ", True, True), + ("^«[  ][—–-][  ]", "« — ", True, True), + ("^[-–—](?=[:alnum:])", "— ", True, False) + ], + "typo4b": [ # demin-cadratin + ("^-[  ]", "– ", True, True), + ("^—[  ]", "– ", True, True), + ("^– ", "– ", True, True), + ("^«[  ][—–-][  ]", "« – ", True, True), + ("^[-–—](?=[:alnum:])", "– ", True, False) + ], + "typo5": [ + ('"([:alpha:]+)"', "“$1”", True, False), + ("''([:alpha:]+)''", "“$1”", True, False), + ("'([:alpha:]+)'", "“$1”", True, False), + ('^"(?=[:alnum:])', "« ", True, False), + ("^''(?=[:alnum:])", "« ", True, False), + (' "(?=[:alnum:])', " « ", True, False), + (" ''(?=[:alnum:])", " « ", True, False), + ('\\("(?=[:alnum:])', "(« ", True, False), + ("\\(''(?=[:alnum:])", "(« ", True, False), + ('(?<=[:alnum:])"$', " »", True, False), + ("(?<=[:alnum:])''$", " »", True, False), + ('(?<=[:alnum:])"(?=[] ,.:;?!…)])', " »", True, False), + ("(?<=[:alnum:])''(?=[] ,.:;?!…)])", " »", True, False), + ('(?<=[.!?…])" ', " » ", True, False), + ('(?<=[.!?…])"$', " »", True, False) + ], + "typo6": [ + ("\\bN\\.([ms])\\b", "N·$1", True, True), # N·m et N·m-1, N·s + ("\\bW\\.h\\b", "W·h", True, True), + ("\\bPa\\.s\\b", "Pa·s", True, True), + ("\\bA\\.h\\b", "A·h", True, True), + ("\\bΩ\\.m\\b", "Ω·m", True, True), + ("\\bS\\.m\\b", "S·m", True, True), + ("\\bg\\.s(?=-1)\\b", "g·s", True, True), + ("\\bm\\.s(?=-[12])\\b", "m·s", True, True), + ("\\bg\\.m(?=2|-3)\\b", "g·m", True, True), + ("\\bA\\.m(?=-1)\\b", "A·m", True, True), + ("\\bJ\\.K(?=-1)\\b", "J·K", True, True), + ("\\bW\\.m(?=-2)\\b", "W·m", True, True), + ("\\bcd\\.m(?=-2)\\b", "cd·m", True, True), + ("\\bC\\.kg(?=-1)\\b", "C·kg", True, True), + ("\\bH\\.m(?=-1)\\b", "H·m", True, True), + ("\\bJ\\.kg(?=-1)\\b", "J·kg", True, True), + ("\\bJ\\.m(?=-3)\\b", "J·m", True, True), + ("\\bm[2²]\\.s\\b", "m²·s", True, True), + ("\\bm[3³]\\.s(?=-1)\\b", "m³·s", True, True), + #("\\bJ.kg-1.K-1\\b", "J·kg-1·K-1", True, True), + #("\\bW.m-1.K-1\\b", "W·m-1·K-1", True, True), + #("\\bW.m-2.K-1\\b", "W·m-2·K-1", True, True), + ("\\b(Y|Z|E|P|T|G|M|k|h|da|d|c|m|µ|n|p|f|a|z|y)Ω\\b", "$1Ω", True, True) + ], + "typo7": [ + # ligatures: pas de majuscules + ("coeur", "cœur", False, True), + ("coel([aeio])", "cœl$1", True, True), + ("choeur", "chœur", False, True), + ("foet", "fœt", False, True), + ("oeil", "œil", False, True), + ("oeno", "œno", False, True), + ("oesoph", "œsoph", False, True), + ("oestro", "œstro", False, True), + ("oeuf", "œuf", False, True), + ("oeuvr", "œuvr", False, True), + ("moeur", "mœur", False, True), + ("noeu", "nœu", False, True), + ("soeur", "sœur", False, True), + ("voeu", "vœu", False, True), + ("aequo", "æquo", False, True), + # ligatures: majuscules + ("Coeur", "Cœur", False, True), + ("Coel([aeio])", "Cœl$1", True, True), + ("Choeur", "Chœur", False, True), + ("Foet", "Fœt", False, True), + ("Oeil", "Œil", False, True), + ("Oeno", "Œno", False, True), + ("Oesoph", "Œsoph", False, True), + ("Oestro", "Œstro", False, True), + ("Oeuf", "Œuf", False, True), + ("Oeuvr", "Œuvr", False, True), + ("Moeur", "Mœur", False, True), + ("Noeu", "Nœu", False, True), + ("Soeur", "Sœur", False, True), + ("Voeu", "Vœu", False, True), + ("Aequo", "Æquo", False, True), + # mots communs avec diacritiques manquants + ("\\bCa\\b", "Ça", True, True), + (" ca\\b", " ça", True, True), + ("\\bdej[aà]\\b", "déjà", True, True), + ("\\bDej[aà]\\b", "Déjà", True, True), + ("\\bplutot\\b", "plutôt", True, True), + ("\\bPlutot\\b", "Plutôt", True, True), + ("\\b([cC]e(?:ux|lles?|lui))-la\\b", "$1-là", True, True), + ("\\bmalgre\\b", "malgré", True, True), + ("\\bMalgre\\b", "Malgré", True, True), + ("\\betre\\b", "être", True, True), + ("\\bEtre\\b", "Être", True, True), + ("\\btres\\b", "très", True, True), + ("\\bTres\\b", "Très", True, True), + ("\\bEtai([ts]|ent)\\b", "Étai$1", True, True), + ("\\bE(tat|cole|crit|poque|tude|ducation|glise|conomi(?:qu|)e|videmment|lysée|tienne|thiopie|cosse|gypt(?:e|ien)|rythrée|pinal|vreux)", "É$1", True, True) + ], + # faire ligatures + "typo_ffi_do": [ + ("ffi", "ffi", False, True) + ], + "typo_ffl_do": [ + ("ffl", "ffl", False, True) + ], + "typo_fi_do": [ + ("fi", "fi", False, True) + ], + "typo_fl_do": [ + ("fl", "fl", False, True) + ], + "typo_ff_do": [ + ("ff", "ff", False, True) + ], + "typo_ft_do": [ + ("ft", "ſt", False, True) + ], + "typo_st_do": [ + ("st", "st", False, True) + ], + # défaire ligatures + "typo_fi_undo": [ + ("fi", "fi", False, True) + ], + "typo_fl_undo": [ + ("fl", "fl", False, True) + ], + "typo_ff_undo": [ + ("ff", "ff", False, True) + ], + "typo_ff_undo": [ + ("ffi", "ffi", False, True) + ], + "typo_ff_undo": [ + ("ffl", "ffl", False, True) + ], + "typo_ft_undo": [ + ("ſt", "ft", False, True) + ], + "typo_st_undo": [ + ("st", "st", False, True) + ], + + # Divers + "misc1a": [ + ("(?<=\\b[0-9][0-9][0-9][0-9])(i?[èe]me|è|e)\\b", "ᵉ", True, False), + ("(?<=\\b[0-9][0-9][0-9])(i?[èe]me|è|e)\\b", "ᵉ", True, False), + ("(?<=\\b[0-9][0-9])(i?[èe]me|è|e)\\b", "ᵉ", True, False), + ("(?<=\\b[0-9])(i?[èe]me|è|e)\\b", "ᵉ", True, False), + ("(?<=\\b[XVICL][XVICL][XVICL][XVICL])(i?[èe]me|è|e)\\b", "ᵉ", True, True), + ("(?<=\\b[XVICL][XVICL][XVICL])(i?[èe]me|è|e)\\b", "ᵉ", True, True), + ("(?<=\\b[XVICL][XVICL])(i?[èe]me|è|e)\\b", "ᵉ", True, True), + ("(?<=\\b[XVICL])(i?[èe]me|è)\\b", "ᵉ", True, True), + ("(?<=\\b(au|l[ea]|du) [XVICL])e\\b", "ᵉ", True, True), + ("(?<=\\b[XVI])e(?= siècle)", "ᵉ", True, True), + ("(?<=\\b[1I])er\\b", "ᵉʳ", True, True), + ("(?<=\\b[1I])re\\b", "ʳᵉ", True, True) + ], + "misc1b": [ + ("(?<=\\b[0-9][0-9][0-9][0-9])(i?[èe]me|è|ᵉ)\\b", "e", True, False), + ("(?<=\\b[0-9][0-9][0-9])(i?[èe]me|è|ᵉ)\\b", "e", True, False), + ("(?<=\\b[0-9][0-9])(i?[èe]me|è|ᵉ)\\b", "e", True, False), + ("(?<=\\b[0-9])(i?[èe]me|è|ᵉ)\\b", "e", True, False), + ("(?<=\\b[XVICL][XVICL][XVICL][XVICL])(i?[èe]me|è|ᵉ)\\b", "e", True, True), + ("(?<=\\b[XVICL][XVICL][XVICL])(i?[èe]me|è|ᵉ)\\b", "e", True, True), + ("(?<=\\b[XVICL][XVICL])(i?[èe]me|è|ᵉ)\\b", "e", True, True), + ("(?<=\\b[XVICL])(i?[èe]me|è|ᵉ)\\b", "e", True, True), + ("(?<=\\b[1I])ᵉʳ\\b", "er", True, True), + ("(?<=\\b[1I])ʳᵉ\\b", "er", True, True) + ], + "misc2": [ + ("etc(…|[.][.][.]?)", "etc.", True, True), + ("(?