Grammalecte  textformatter.js at [c35321324a]

File gc_lang/fr/modules-js/textformatter.js artifact 879c3eaec1 part of check-in c35321324a


// Grammalecte - text formatter

/* jshint esversion:6, -W097 */
/* jslint esversion:6 */
/* global exports, console */

"use strict";

//!${map}


// Latin letters: http://unicode-table.com/fr/
// 0-9
// A-Z
// a-z
// À-Ö     00C0-00D6   (upper case)
// Ø-ß     00D8-00DF   (upper case)
// à-ö     00E0-00F6   (lower case)
// ø-ÿ     00F8-00FF   (lower case)
// Ā-ʯ     0100-02AF   (mixed)
// -> a-zA-Zà-ö0-9À-Öø-ÿØ-ßĀ-ʯ

// Note: JavaScript replacement strings have no "$0" pattern; the whole match is inserted with "$&".

/**
 * Replacement rules of the text formatter.
 *
 * Each key is a rule (option) name; each value is an ordered list of
 * [RegExp, replacement] pairs. TextFormatter applies the pairs of a rule one
 * after the other with String.prototype.replace, so the order of the pairs
 * inside a rule matters.
 *
 * Many patterns contain literal space, non-breaking space (U+00A0) and narrow
 * non-breaking space (U+202F) characters that look alike in most editors:
 * edit these lines with care.
 */
const oReplTable = {
    //// surnumerary_spaces
    "start_of_paragraph":         [ [/^[  ]+/gm, ""] ],
    "end_of_paragraph":           [ [/[  ]+$/gm, ""] ],
    "between_words":              [ [/  |  /g, " "],  // space + non-breaking space -> space
                                    [/  +/g, " "],    // superfluous spaces
                                    [/  +/g, " "] ],  // superfluous non-breaking spaces
    "before_punctuation":         [ [/ +(?=[.,…])/g, ""] ],
    "within_parenthesis":         [ [/\([  ]+/g, "("],
                                    [/[  ]+\)/g, ")"] ],
    "within_square_brackets":     [ [/\[[  ]+/g, "["],
                                    [/[  ]+\]/g, "]"] ],
    "within_quotation_marks":     [ [/“[  ]+/g, "“"],
                                    [/[  ]”/g, "”"] ],
    //// non-breaking spaces
    // non-breaking spaces
    "nbsp_before_punctuation":    [ [/([a-zA-Zà-ö0-9À-Öø-ÿØ-ßĀ-ʯ…)»}\]])([:;?!])[   ]/g, "$1 $2 "],
                                    [/([a-zA-Zà-ö0-9À-Öø-ÿØ-ßĀ-ʯ…)»}\]])([:;?!])$/g, "$1 $2"],
                                    [/[  ]+([:;?!])/g, " $1"] ],
    "nbsp_within_quotation_marks":[ [/«([a-zA-Zà-ö0-9À-Öø-ÿØ-ßĀ-ʯ])/g, "« $1"],
                                    [/«[  ]+/g, "« "],
                                    [/([a-zA-Zà-ö0-9À-Öø-ÿØ-ßĀ-ʯ.!?])»/g, "$1 »"],
                                    [/[  ]+»/g, " »"] ],
    "nbsp_within_numbers":        [ [/(\d)[  ](?=\d)/g, "$1 "] ],
    // narrow non-breaking spaces
    "nnbsp_before_punctuation":   [ [/([a-zA-Zà-ö0-9À-Öø-ÿØ-ßĀ-ʯ…\)»}\]])([;?!])[   ]/g, "$1 $2 "],
                                    [/([a-zA-Zà-ö0-9À-Öø-ÿØ-ßĀ-ʯ…\)»}\]])([;?!])$/g, "$1 $2"],
                                    [/[  ]+([;?!])/g, " $1"],
                                    [/([a-zA-Zà-ö0-9À-Öø-ÿØ-ßĀ-ʯ…\)»}\]]):[   ]/g, "$1 : "],
                                    [/([a-zA-Zà-ö0-9À-Öø-ÿØ-ßĀ-ʯ…\)»}\]]):$/g, "$1 :"],
                                    [/[  ]+:/g, " :"] ],
    "nnbsp_within_quotation_marks":[[/«([a-zA-Zà-ö0-9À-Öø-ÿØ-ßĀ-ʯ])/g, "« $1"],
                                    [/«[  ]+/g, "« "],
                                    [/([a-zA-Zà-ö0-9À-Öø-ÿØ-ßĀ-ʯ.!?])»/g, "$1 »"],
                                    [/[  ]+»/g, " »"] ],
    "nnbsp_within_numbers":       [ [/(\d)[  ](\d)/g, "$1 $2"] ],
    // common
    "nbsp_titles":                [ [/\bM(mes?|ᵐᵉˢ?|grs?|ᵍʳˢ?|lles?|ˡˡᵉˢ?|rs?|ʳˢ?|M[.]) /g, "M$1 "],
                                    [/\bP(re?s?|ʳᵉ?ˢ?) /g, "P$1 "],
                                    [/\bD(re?s?|ʳᵉ?ˢ?) /g, "D$1 "],
                                    [/\bV(ves?|ᵛᵉˢ?) /g, "V$1 "] ],
    "nbsp_before_symbol":         [ [/(\d) ?([%‰€$£¥˚Ω℃])/g, "$1 $2"] ],
    "nbsp_before_units":          [ [/([0-9⁰¹²³⁴⁵⁶⁷⁸⁹]) ?([kcmµn]?(?:[slgJKΩ]|m[²³]?|Wh?|Hz|dB)|[%‰]|°C)\b/g, "$1 $2"] ],
    "nbsp_repair":                [ [/([\[(])[   ]([!?:;])/g, "$1$2"],
                                    [/(https?|ftp)[   ]:\/\//g, "$1://"],
                                    [/&([a-z]+)[   ];/g, "&$1;"],
                                    [/&#([0-9]+|x[0-9a-fA-F]+)[   ];/g, "&#$1;"] ],
    //// missing spaces
    "add_space_after_punctuation":[ [/[;!…](?=[a-zA-Zà-ö0-9À-Öø-ÿØ-ßĀ-ʯ])/g, "$& "],
                                    [/[?](?=[A-ZÀ-ÖØ-ßĀ-ʯ])/g, "? "],
                                    [/\.(?=[A-ZÀ-ÖØ-ßĀ-ʯ][a-zA-Zà-ö0-9À-Öø-ÿØ-ßĀ-ʯ])/g, ". "],
                                    [/\.(?=À)/g, ". "],
                                    [/[,:](?=[a-zA-Zà-öÀ-Öø-ÿØ-ßĀ-ʯ])/g, "$& "],
                                    [/([a-zA-Zà-öÀ-Öø-ÿØ-ßĀ-ʯ]),(?=[0-9])/g, "$1, "] ],
    "add_space_around_hyphens":   [ [/ [-–—](?=[a-zA-Zà-ö0-9À-Öø-ÿØ-ßĀ-ʯ"«“'‘])/g, "$& "],
                                    [/([a-zA-Zà-ö0-9À-Öø-ÿØ-ßĀ-ʯ"»”'’])([-–—]) /g, "$1 $2 "] ],
    "add_space_repair":           [ [/DnT, ([wA])\b/g, "DnT,$1"] ],
    //// erase
    // U+00AD is the (invisible) soft hyphen; written as an escape so it cannot be lost while editing.
    "erase_non_breaking_hyphens": [ [/\u00AD/g, ""] ],
    //// typographic signs
    "ts_apostrophe":              [ [/\b([ldnjmtscç])['´‘′`ʼ](?=[a-zA-Zà-ö0-9À-Öø-ÿØ-ßĀ-ʯ])/ig, "$1’"],
                                    [/\b(qu|jusqu|lorsqu|puisqu|quoiqu|quelqu|presqu|entr|aujourd|prud)['´‘′`ʼ]/ig, "$1’"] ],
    "ts_ellipsis":                [ [/\.\.\./g, "…"],
                                    [/…\.\./g, "……"],
                                    [/…\.(?!\.)/g, "…"] ],
    "ts_n_dash_middle":           [ [/ [-—] /g, " – "],
                                    [/ [-—],/g, " –,"] ],
    "ts_m_dash_middle":           [ [/ [-–] /g, " — "],
                                    [/ [-–],/g, " —,"] ],
    "ts_n_dash_start":            [ [/^[-—][  ]/gm, "– "],
                                    [/^– /gm, "– "],
                                    [/^[-–—](?=[a-zA-Zà-ö0-9À-Öø-ÿØ-ßĀ-ʯ.…])/gm, "– "] ],
    "ts_m_dash_start":            [ [/^[-–][  ]/gm, "— "],
                                    [/^— /gm, "— "],
                                    [/^«[  ][—–-][  ]/gm, "« — "],
                                    [/^[-–—](?=[a-zA-Zà-ö0-9À-Öø-ÿØ-ßĀ-ʯ.…])/gm, "— "] ],
    "ts_quotation_marks":         [ [/"([a-zA-Zà-ö0-9À-Öø-ÿØ-ßĀ-ʯ]+)"/g, "“$1”"],
                                    [/''([a-zA-Zà-ö0-9À-Öø-ÿØ-ßĀ-ʯ]+)''/g, "“$1”"],
                                    [/'([a-zA-Zà-ö0-9À-Öø-ÿØ-ßĀ-ʯ]+)'/g, "“$1”"],
                                    [/^(?:"|'')(?=[a-zA-Zà-ö0-9À-Öø-ÿØ-ßĀ-ʯ…])/gm, "« "],
                                    [/ (?:"|'')(?=[a-zA-Zà-ö0-9À-Öø-ÿØ-ßĀ-ʯ…])/g, " « "],
                                    [/\((?:"|'')(?=[a-zA-Zà-ö0-9À-Öø-ÿØ-ßĀ-ʯ…])/g, "(« "],
                                    [/([a-zA-Zà-ö0-9À-Öø-ÿØ-ßĀ-ʯ])(?:"|'')$/gm, "$1 »"],
                                    [/([a-zA-Zà-ö0-9À-Öø-ÿØ-ßĀ-ʯ])(?:"|'')(?=[ ,.:;?!…)\]])/g, "$1 »"],
                                    [/([.!?…])" /g, "$1 » "],
                                    [/([.!?…])"$/gm, "$1 »"] ],
    "ts_spell":                   [ [/coeur/g, "cœur"], [/Coeur/g, "Cœur"],
                                    [/coel(?=[aeio])/g, "cœl"], [/Coel(?=[aeio])/g, "Cœl"],
                                    [/choeur/g, "chœur"], [/Choeur/g, "Chœur"],
                                    [/foet/g, "fœt"], [/Foet/g, "Fœt"],
                                    [/oeil/g, "œil"], [/Oeil/g, "Œil"],
                                    [/oeno/g, "œno"], [/Oeno/g, "Œno"],
                                    [/oesoph/g, "œsoph"], [/Oesoph/g, "Œsoph"],
                                    [/oestro/g, "œstro"], [/Oestro/g, "Œstro"],
                                    [/oeuf/g, "œuf"], [/Oeuf/g, "Œuf"],
                                    [/oeuvr/g, "œuvr"], [/Oeuvr/g, "Œuvr"],
                                    [/moeur/g, "mœur"], [/Moeur/g, "Mœur"],
                                    [/noeu/g, "nœu"], [/Noeu/g, "Nœu"],
                                    [/soeur/g, "sœur"], [/Soeur/g, "Sœur"],
                                    [/voeu/g, "vœu"], [/Voeu/g, "Vœu"],
                                    [/aequo/g, "æquo"], [/Aequo/g, "Æquo"],
                                    [/Ca /g, "Ça "], [/ ca([ ,.!;:])/g, " ça$1"],
                                    [/(Ce(?:ux|lles?|lui))-la /g, "$1-là "], [/ (ce(?:ux|lles?|lui))-la/g, " $1-là"],
                                    [/ malgre /g, " malgré "], [/Malgre /g, "Malgré "],
                                    [/ etre([ ,.!;:])/g, " être$1"], [/Etre([ ,.!;:])/g, "Être$1"],
                                    [/ tres([ ,.!;:])/g, " très$1"], [/Tres /g, "Très "],
                                    [/\bEtai([ts]|ent)\b/g, "Étai$1"],
                                    [/\bE(tat|cole|crit|poque|tude|ducation|glise|conomi(?:qu|)e|videmment|lysée|tienne|thiopie|cosse|gypt(?:e|ien)|rythrée|pinal|vreux)/g, "É$1"] ],
    // Ligatures are written as \uFBxx escapes (Alphabetic Presentation Forms) so that
    // the "do" rules really insert the ligature code point and the "undo" rules match it:
    // U+FB00 ff, U+FB01 fi, U+FB02 fl, U+FB03 ffi, U+FB04 ffl, U+FB05 long-s + t, U+FB06 st.
    "ts_ligature_ffi_do":         [ [/ffi/g, "\uFB03"] ],
    "ts_ligature_ffl_do":         [ [/ffl/g, "\uFB04"] ],
    "ts_ligature_fi_do":          [ [/fi/g, "\uFB01"] ],
    "ts_ligature_fl_do":          [ [/fl/g, "\uFB02"] ],
    "ts_ligature_ff_do":          [ [/ff/g, "\uFB00"] ],
    // NOTE(review): U+FB05 is LATIN SMALL LIGATURE LONG S T; Unicode has no dedicated "ft" ligature — confirm this mapping is wanted.
    "ts_ligature_ft_do":          [ [/ft/g, "\uFB05"] ],
    "ts_ligature_st_do":          [ [/st/g, "\uFB06"] ],
    "ts_ligature_fi_undo":        [ [/\uFB01/g, "fi"] ],
    "ts_ligature_fl_undo":        [ [/\uFB02/g, "fl"] ],
    "ts_ligature_ff_undo":        [ [/\uFB00/g, "ff"] ],
    "ts_ligature_ffi_undo":       [ [/\uFB03/g, "ffi"] ],
    "ts_ligature_ffl_undo":       [ [/\uFB04/g, "ffl"] ],
    "ts_ligature_ft_undo":        [ [/\uFB05/g, "ft"] ],
    "ts_ligature_st_undo":        [ [/\uFB06/g, "st"] ],
    "ts_units":                   [ [/\bN\.([ms])\b/g, "N·$1"], // N·m and N·m-1, N·s
                                    [/\bW\.h\b/g, "W·h"],
                                    [/\bPa\.s\b/g, "Pa·s"],
                                    [/\bA\.h\b/g, "A·h"],
                                    [/Ω\.m\b/g, "Ω·m"],
                                    [/\bS\.m\b/g, "S·m"],
                                    [/\bg\.s(?=-1)\b/g, "g·s"],
                                    [/\bm\.s(?=-[12])\b/g, "m·s"],
                                    [/\bg\.m(?=2|-3)\b/g, "g·m"],
                                    [/\bA\.m(?=-1)\b/g, "A·m"],
                                    [/\bJ\.K(?=-1)\b/g, "J·K"],
                                    [/\bW\.m(?=-2)\b/g, "W·m"],
                                    [/\bcd\.m(?=-2)\b/g, "cd·m"],
                                    [/\bC\.kg(?=-1)\b/g, "C·kg"],
                                    [/\bH\.m(?=-1)\b/g, "H·m"],
                                    [/\bJ\.kg(?=-1)\b/g, "J·kg"],
                                    [/\bJ\.m(?=-3)\b/g, "J·m"],
                                    [/\bm[2²]\.s\b/g, "m²·s"],
                                    [/\bm[3³]\.s(?=-1)\b/g, "m³·s"],
                                    //[/\bJ.kg-1.K-1\b/g, "J·kg-1·K-1"],
                                    //[/\bW.m-1.K-1\b/g, "W·m-1·K-1"],
                                    //[/\bW.m-2.K-1\b/g, "W·m-2·K-1"],
                                    // NOTE(review): pattern and replacement look identical here (no-op as written) — confirm which omega / ohm code points were intended.
                                    [/\b(Y|Z|E|P|T|G|M|k|h|da|d|c|m|µ|n|p|f|a|z|y)Ω/g, "$1Ω"] ],
    //// misc
    "ordinals_exponant":          [ [/\b([0-9]+)(?:i?[èe]me|è|e)\b/g, "$1ᵉ"],
                                    [/\b([XVICL]+)(?:i?[èe]me|è)\b/g, "$1ᵉ"],
                                    [/\b((?:au|l[ea]|du) [XVICL])e\b/g, "$1ᵉ"],
                                    [/\b([XVI])e(?= siècle)/g, "$1ᵉ"],
                                    [/\b([1I])er\b/g, "$1ᵉʳ"],
                                    [/\b([1I])re\b/g, "$1ʳᵉ"] ],
    // The superscript letters are not ASCII word characters, so a trailing \b would only
    // match when a letter follows; a negative lookahead expresses "end of the ordinal" instead.
    "ordinals_no_exponant":       [ [/\b([0-9]+)(?:i?[èe]me|è)\b/g, "$1e"],
                                    [/\b([XVICL]+)(?:i?[èe]me|è)\b/g, "$1e"],
                                    [/\b([1I])ᵉʳ(?![a-zA-Zà-ö0-9À-Öø-ÿØ-ßĀ-ʯ])/g, "$1er"],
                                    [/\b([1I])ʳᵉ(?![a-zA-Zà-ö0-9À-Öø-ÿØ-ßĀ-ʯ])/g, "$1re"] ],
    "etc":                        [ [/etc(…|\.\.\.?)/g, "etc."],
                                    [/\b etc\./g, ", etc."] ],
    "missing_hyphens":            [ [/[ -]t[’'](?=il\b|elle|on\b)/g, "-t-"],
                                    [/ t-(?=il|elle|on)/g, "-t-"],
                                    [/[ -]t[’'-](?=ils|elles)/g, "-"],
                                    [/([td])-t-(?=il|elle|on)/g, "$1-"],
                                    [/dix (sept|huit|neuf)/g, "dix-$1"],
                                    [/quatre vingt/g, "quatre-vingt"],
                                    [/(soixante|quatre-vingt) dix/g, "$1-dix"],
                                    [/(vingt|trente|quarante|cinquante|soixante(?:-dix|)|quatre-vingt(?:-dix|)) (deux|trois|quatre|cinq|six|sept|huit|neuf)\b/g, "$1-$2"],
                                    [/ ce(lles?|lui|ux) (ci|là)/g, "ce$1-$2"],
                                    [/Ce(lles?|lui|ux) (ci|là)/g, "Ce$1-$2"],
                                    [/^Ci (joint|desso?us|contre|devant|avant|après|incluse|g[îi]t|gisent)/gm, "Ci-$1"],
                                    [/ ci (joint|desso?us|contre|devant|avant|après|incluse|g[îi]t|gisent)/g, " ci-$1"],
                                    [/vis à vis/g, "vis-à-vis"],
                                    [/Vis à vis/g, "Vis-à-vis"],
                                    [/week end/g, "week-end"],
                                    [/Week end/g, "Week-end"],
                                    [/(plus|moins) value/ig, "$1-value"] ],
    //// missing apostrophes
    // The separator before the elided word (space, U+00A0 or U+202F) is captured and
    // re-emitted so the word is not glued to the previous one; the "^" variants use the
    // m flag so that they apply at the start of every line, not only of the whole text.
    "ma_word":                    [ [/([ \u00A0\u202F])(qu|lorsqu|puisqu|quoiqu|presqu|jusqu|aujourd|entr|quelqu|prud) /ig, "$1$2’"],
                                    [/^(qu|lorsqu|puisqu|quoiqu|presqu|jusqu|aujourd|entr|quelqu|prud) /igm, "$1’"] ],
    "ma_1letter_lowercase":       [ [/([ \u00A0\u202F])([ldjnmtscç]) (?=[aàeéêiîoôuyhAÀEÉÊIÎOÔUYH])/g, "$1$2’"] ],
    "ma_1letter_uppercase":       [ [/([ \u00A0\u202F])([LDJNMTSCÇ]) (?=[aàeéêiîoôuyhAÀEÉÊIÎOÔUYH])/g, "$1$2’"],
                                    [/^([LDJNMTSCÇ]) (?=[aàeéêiîoôuyhAÀEÉÊIÎOÔUYH])/gm, "$1’"] ]
};


// Default state (enabled / disabled) of every formatter rule.
// Insertion order is significant: TextFormatter iterates over a copy of this map
// and applies the enabled rules in that order.
const dTFDefaultOptions = new Map()
    .set("ts_units", true)
    // superfluous spaces
    .set("start_of_paragraph", true)
    .set("end_of_paragraph", true)
    .set("between_words", true)
    .set("before_punctuation", true)
    .set("within_parenthesis", true)
    .set("within_square_brackets", true)
    .set("within_quotation_marks", true)
    // non-breaking spaces
    .set("nbsp_before_punctuation", true)
    .set("nbsp_within_quotation_marks", true)
    .set("nbsp_within_numbers", true)
    .set("nnbsp_before_punctuation", false)
    .set("nnbsp_within_quotation_marks", false)
    .set("nnbsp_within_numbers", false)
    .set("nbsp_titles", false)
    .set("nbsp_before_symbol", true)
    .set("nbsp_before_units", true)
    .set("nbsp_repair", true)
    // missing spaces
    .set("add_space_after_punctuation", true)
    .set("add_space_around_hyphens", true)
    .set("add_space_repair", true)
    // erase
    .set("erase_non_breaking_hyphens", false)
    // typographic signs
    .set("ts_apostrophe", true)
    .set("ts_ellipsis", true)
    .set("ts_n_dash_middle", true)
    .set("ts_m_dash_middle", false)
    .set("ts_n_dash_start", false)
    .set("ts_m_dash_start", true)
    .set("ts_quotation_marks", true)
    .set("ts_spell", true)
    .set("ts_ligature_ffi_do", false)
    .set("ts_ligature_ffl_do", false)
    .set("ts_ligature_fi_do", false)
    .set("ts_ligature_fl_do", false)
    .set("ts_ligature_ff_do", false)
    .set("ts_ligature_ft_do", false)
    .set("ts_ligature_st_do", false)
    .set("ts_ligature_fi_undo", false)
    .set("ts_ligature_fl_undo", false)
    .set("ts_ligature_ff_undo", false)
    .set("ts_ligature_ffi_undo", false)
    .set("ts_ligature_ffl_undo", false)
    .set("ts_ligature_ft_undo", false)
    .set("ts_ligature_st_undo", false)
    // misc
    .set("ordinals_exponant", false)
    .set("ordinals_no_exponant", true)
    .set("etc", true)
    .set("missing_hyphens", true)
    // missing apostrophes
    .set("ma_word", true)
    .set("ma_1letter_lowercase", false)
    .set("ma_1letter_uppercase", false);


/**
 * Applies the replacement rules of `oReplTable` to a text, according to a set
 * of boolean options initialised from `dTFDefaultOptions`.
 *
 * The option maps rely on `gl_shallowCopy` and `gl_updateOnlyExistingKeys`,
 * two Map helpers that are not defined in this part of the file
 * (NOTE(review): presumably injected through the `//!${map}` placeholder — confirm).
 */
class TextFormatter {

    /**
     * @param {boolean} [bDebug=false] - when true, misuse is reported with console.log.
     */
    constructor (bDebug=false) {
        this.sLang = "fr";
        this.bDebug = bDebug;
        // Working copy of the defaults; not meant to be modified directly from outside
        // (use setOptions / getOptions).
        this.dOptions = dTFDefaultOptions.gl_shallowCopy();
    }

    /**
     * Tells whether a rule is defined in the replacement table.
     * Own-property check: inherited names such as "toString" are not rules.
     * Internal helper.
     * @param {string} sRuleName
     * @returns {boolean}
     */
    _hasRule (sRuleName) {
        return Object.prototype.hasOwnProperty.call(oReplTable, sRuleName);
    }

    /**
     * Applies, in order, every [RegExp, replacement] pair of one existing rule.
     * Internal helper.
     * @param {string} sText
     * @param {string} sRuleName - must be a key of the replacement table.
     * @param {boolean} bCount - when true, matches are counted before each replacement.
     * @returns {Array} [new text, number of matches (0 when bCount is false)]
     */
    _applyRule (sText, sRuleName, bCount) {
        let nCount = 0;
        for (let [zRgx, sRep] of oReplTable[sRuleName]) {
            if (bCount) {
                nCount += (sText.match(zRgx) || []).length;
            }
            sText = sText.replace(zRgx, sRep);
        }
        return [sText, nCount];
    }

    /**
     * Updates the options with dOpt (when given), then applies every enabled rule
     * in the iteration order of the options map. Internal helper.
     * @param {string} sText
     * @param {?Map} dOpt
     * @param {boolean} bCount
     * @returns {Array} [new text, number of matches (0 when bCount is false)]
     */
    _applyEnabledRules (sText, dOpt, bCount) {
        if (dOpt !== null) {
            this.dOptions.gl_updateOnlyExistingKeys(dOpt);
        }
        let nCount = 0;
        for (let [sOptName, bVal] of this.dOptions) {
            if (bVal && this._hasRule(sOptName)) {
                const aResult = this._applyRule(sText, sOptName, bCount);
                sText = aResult[0];
                nCount += aResult[1];
            }
        }
        return [sText, nCount];
    }

    /**
     * Reports an unknown rule name on the console when debugging is enabled.
     * Internal helper.
     * @param {string} sRuleName
     */
    _reportUnknownRule (sRuleName) {
        if (this.bDebug) {
            console.log("# Error. TF: there is no option “" + sRuleName+ "”.");
        }
    }

    /**
     * Formats a text with every enabled rule.
     * @param {string} sText
     * @param {?Map} [dOpt=null] - option changes; they are stored in the instance
     *                             and therefore persist after the call.
     * @returns {string} the formatted text
     */
    formatText (sText, dOpt=null) {
        return this._applyEnabledRules(sText, dOpt, false)[0];
    }

    /**
     * Same as formatText, and also counts the matches found by each pattern
     * just before it is applied.
     * @param {string} sText
     * @param {?Map} [dOpt=null]
     * @returns {Array} [formatted text, number of matches]
     */
    formatTextCount (sText, dOpt=null) {
        return this._applyEnabledRules(sText, dOpt, true);
    }

    /**
     * Applies a single rule, whatever its state in the options.
     * An unknown rule name leaves the text unchanged.
     * @param {string} sText
     * @param {string} sRuleName
     * @returns {string}
     */
    formatTextRule (sText, sRuleName) {
        if (!this._hasRule(sRuleName)) {
            this._reportUnknownRule(sRuleName);
            return sText;
        }
        return this._applyRule(sText, sRuleName, false)[0];
    }

    /**
     * Same as formatTextRule, and also counts the matches.
     * @param {string} sText
     * @param {string} sRuleName
     * @returns {Array} [formatted text, number of matches]
     */
    formatTextRuleCount (sText, sRuleName) {
        if (!this._hasRule(sRuleName)) {
            this._reportUnknownRule(sRuleName);
            return [sText, 0];
        }
        return this._applyRule(sText, sRuleName, true);
    }

    /**
     * Removes every hyphen located at the end of a line, together with the
     * whitespace that may follow it and the line feed, which joins the two lines.
     * @param {string} sText
     * @returns {string}
     */
    removeHyphenAtEndOfParagraphs (sText) {
        return this.removeHyphenAtEndOfParagraphsCount(sText)[0];
    }

    /**
     * Same as removeHyphenAtEndOfParagraphs, and also counts the removals.
     * @param {string} sText
     * @returns {Array} [new text, number of hyphens removed]
     */
    removeHyphenAtEndOfParagraphsCount (sText) {
        const zHyphenEOL = /-[  ]*\n/gm;
        const nCount = (sText.match(zHyphenEOL) || []).length;
        return [sText.replace(zHyphenEOL, ""), nCount];
    }

    /**
     * Joins consecutive non-empty lines with a single space; empty
     * (or whitespace-only) lines are kept as line breaks.
     * @param {string} sText
     * @returns {string}
     */
    mergeContiguousParagraphs (sText) {
        return this.mergeContiguousParagraphsCount(sText)[0];
    }

    /**
     * Same as mergeContiguousParagraphs; the count is the number of
     * non-empty paragraphs that were appended to the result.
     * @param {string} sText
     * @returns {Array} [new text, number of non-empty paragraphs]
     */
    mergeContiguousParagraphsCount (sText) {
        let nCount = 0;
        sText = sText.replace(/^[  ]+$/gm, ""); // clear empty paragraphs
        let s = "";
        for (let sParagraph of this.getParagraph(sText)) {
            if (sParagraph === "") {
                s += "\n";
            } else {
                s += sParagraph + " ";
                nCount += 1;
            }
        }
        // collapse the runs of spaces and drop the joining space left at each end of line
        s = s.replace(/  +/gm, " ").replace(/ $/gm, "");
        return [s, nCount];
    }

    /**
     * Generator: yields the paragraphs of a text, i.e. the pieces found
     * between occurrences of the separator (the separator is not included).
     * @param {string} sText
     * @param {string} [sSep="\n"] - separator; may be longer than one character.
     */
    * getParagraph (sText, sSep="\n") {
        if (sSep === "") {
            // an empty separator would never advance the cursor: the text is one paragraph
            yield sText;
            return;
        }
        let iStart = 0;
        let iEnd = sText.indexOf(sSep, iStart);
        while (iEnd !== -1) {
            yield sText.slice(iStart, iEnd);
            iStart = iEnd + sSep.length;  // skip the whole separator, not only its first character
            iEnd = sText.indexOf(sSep, iStart);
        }
        yield sText.slice(iStart);
    }

    /**
     * @returns {Map} a copy of the default options, so that callers cannot alter them.
     */
    getDefaultOptions () {
        return dTFDefaultOptions.gl_shallowCopy();
    }

    /**
     * @returns {Map} a copy of the current options, so that callers cannot alter them.
     */
    getOptions () {
        return this.dOptions.gl_shallowCopy();
    }

    /**
     * Updates the current options with the given ones.
     * @param {?Map} [dOpt=null] - when null, nothing changes (reported in debug mode).
     */
    setOptions (dOpt=null) {
        if (dOpt !== null) {
            this.dOptions.gl_updateOnlyExistingKeys(dOpt);
        } else if (this.bDebug){
            console.log("# Error. TF: no option to change.");
        }
    }
}


// Publish the class on `exports` when such an object is defined; do nothing otherwise.
if (typeof exports !== "undefined") {
    exports.TextFormatter = TextFormatter;
}