Grammalecte  str_transform.js at trunk

File graphspell-js/str_transform.js artifact 3da9c81f36 on branch trunk


// STRING TRANSFORMATION

/* jshint esversion:6, -W097 */
/* jslint esversion:6 */
/* global exports, console */

"use strict";


if (typeof(process) !== 'undefined') {
    var char_player = require("./char_player.js");
}



// Note: 48 is the ASCII code for "0"

var str_transform = {

    getNgrams: function (sWord, n=2) {
        let lNgrams = [];
        for (let i=0;  i <= sWord.length - n;  i++) {
            lNgrams.push(sWord.slice(i, i+n));
        }
        return lNgrams;
    },

    _xTransCharsForSpelling: new Map([
        ['ſ', 's'],  ['ffi', 'ffi'],  ['ffl', 'ffl'],  ['ff', 'ff'],  ['ſt', 'ft'],  ['fi', 'fi'],  ['fl', 'fl'],  ['st', 'st'],
        ["'", '’']
    ]),

    spellingNormalization: function (sWord) {
        let sNewWord = "";
        for (let c of sWord) {
            sNewWord += this._xTransCharsForSpelling.gl_get(c, c);
        }
        return sNewWord.normalize("NFC");
    },

    _xTransCharsForSimplification: new Map([
        ['à', 'a'],  ['é', 'é'],  ['î', 'i'],  ['ô', 'o'],  ['û', 'u'],  ['ÿ', 'y'],
        ['â', 'a'],  ['è', 'é'],  ['ï', 'i'],  ['ö', 'o'],  ['ù', 'u'],  ['ŷ', 'y'],
        ['ä', 'a'],  ['ê', 'é'],  ['í', 'i'],  ['ó', 'o'],  ['ü', 'u'],  ['ý', 'y'],
        ['á', 'a'],  ['ë', 'é'],  ['ì', 'i'],  ['ò', 'o'],  ['ú', 'u'],  ['ỳ', 'y'],
        ['ā', 'a'],  ['ē', 'é'],  ['ī', 'i'],  ['ō', 'o'],  ['ū', 'u'],  ['ȳ', 'y'],
        ['ç', 'c'],  ['ñ', 'n'],
        ['œ', 'oe'], ['æ', 'ae'],
        ['ſ', 's'],  ['ffi', 'ffi'],  ['ffl', 'ffl'],  ['ff', 'ff'],  ['ſt', 'ft'],  ['fi', 'fi'],  ['fl', 'fl'],  ['st', 'st'],
        ["⁰", "0"], ["¹", "1"], ["²", "2"], ["³", "3"], ["⁴", "4"], ["⁵", "5"], ["⁶", "6"], ["⁷", "7"], ["⁸", "8"], ["⁹", "9"],
        ["₀", "0"], ["₁", "1"], ["₂", "2"], ["₃", "3"], ["₄", "4"], ["₅", "5"], ["₆", "6"], ["₇", "7"], ["₈", "8"], ["₉", "9"],
        ["’", ""], ["'", ""], ["ʼ", ""], ["‘", ""], ["‛", ""], ["´", ""], ["`", ""], ["′", ""], ["‵", ""], ["՚", ""], ["ꞌ", ""], ["Ꞌ", ""],
        ["-", ""]
    ]),

    simplifyWord: function (sWord) {
        // word simplication before calculating distance between words
        sWord = sWord.toLowerCase();
        sWord = [...sWord].map(c => this._xTransCharsForSimplification.gl_get(c, c)).join('');
        let sNewWord = "";
        let i = 1;
        for (let c of sWord) {
            if (c != sWord.slice(i, i+1) || (c == 'e' && sWord.slice(i, i+2) != "ee")) {  // exception for <e> to avoid confusion between crée / créai
                sNewWord += c;
            }
            i++;
        }
        return sNewWord.replace(/eau/g, "o").replace(/au/g, "o").replace(/ai/g, "éi").replace(/ei/g, "é").replace(/ph/g, "f");
    },

    cleanWord: function (sWord) {
        // word clean for the user who make commun and preditive error help suggest
        // remove letters repeated more than 2 times
        return sWord.replace(/(.)\1{2,}/ig,'$1$1');
    },

    _xTransNumbersToExponent: new Map([
        ["0", "⁰"], ["1", "¹"], ["2", "²"], ["3", "³"], ["4", "⁴"], ["5", "⁵"], ["6", "⁶"], ["7", "⁷"], ["8", "⁸"], ["9", "⁹"]
    ]),

    numbersToExponent: function (sWord) {
        let sNewWord = "";
        for (let c of sWord) {
            sNewWord += this._xTransNumbersToExponent.gl_get(c, c);
        }
        return sNewWord;
    },

    longestCommonSubstring: function (string1, string2) {
        // https://en.wikibooks.org/wiki/Algorithm_Implementation/Strings/Longest_common_substring
        // untested

        // init max value
        let longestCommonSubstring = 0;
        // init 2D array with 0
        let table = [],
            len1 = string1.length,
            len2 = string2.length,
            row, col;
        for (row = 0; row <= len1; row++) {
            table[row] = [];
            for (col = 0; col <= len2; col++) {
                table[row][col] = 0;
            }
        }
        // fill table
        let i, j;
        for (i = 0;  i < len1;  i++) {
            for (j = 0;  j < len2;  j++) {
                if (string1[i] === string2[j]) {
                    if (table[i][j] === 0){
                        table[i+1][j+1] = 1;
                    } else {
                        table[i+1][j+1] = table[i][j] + 1;
                    }
                    if (table[i+1][j+1] > longestCommonSubstring) {
                        longestCommonSubstring = table[i+1][j+1];
                    }
                } else {
                    table[i+1][j+1] = 0;
                }
            }
        }
        return longestCommonSubstring;
    },

    distanceDamerauLevenshtein: function (s1, s2) {
        // distance of Damerau-Levenshtein between <s1> and <s2>
        // https://fr.wikipedia.org/wiki/Distance_de_Damerau-Levenshtein
        try {
            let nLen1 = s1.length;
            let nLen2 = s2.length;
            let matrix = [];
            for (let i = 0;  i <= nLen1+1;  i++) {
                matrix[i] = new Array(nLen2 + 2);
            }
            for (let i = 0;  i <= nLen1+1;  i++) {
                matrix[i][0] = i;
            }
            for (let j = 0;  j <= nLen2+1;  j++) {
                matrix[0][j] = j;
            }
            for (let i = 1;  i <= nLen1;  i++) {
                for (let j = 1;  j <= nLen2;  j++) {
                    //let nCost = (s1[i-1] === s2[j-1]) ? 0 : 1;
                    let nCost = char_player.distanceBetweenChars(s1[i-1], s2[j-1]);
                    matrix[i][j] = Math.min(
                        matrix[i-1][j] + 1,         // Deletion
                        matrix[i][j-1] + 1,         // Insertion
                        matrix[i-1][j-1] + nCost    // Substitution
                    );
                    if (i > 1 && j > 1 && s1[i] == s2[j-1] && s1[i-1] == s2[j]) {
                        matrix[i][j] = Math.min(matrix[i][j], matrix[i-2][j-2] + nCost);  // Transposition
                    }
                }
            }
            return matrix[nLen1][nLen2];
        }
        catch (e) {
            console.error(e);
        }
    },

    distanceJaroWinkler: function(a, b, boost = .666) {
        // https://github.com/thsig/jaro-winkler-JS
        //if (a == b) { return 1.0; }
        let a_len = a.length;
        let b_len = b.length;
        let a_flag = [];
        let b_flag = [];
        let search_range = Math.floor(Math.max(a_len, b_len) / 2) - 1;
        let minv = Math.min(a_len, b_len);

        // Looking only within the search range, count and flag the matched pairs.
        let Num_com = 0;
        let yl1 = b_len - 1;
        for (let i = 0; i < a_len; i++) {
          let lowlim = (i >= search_range) ? i - search_range : 0;
          let hilim  = ((i + search_range) <= yl1) ? (i + search_range) : yl1;
          for (let j = lowlim; j <= hilim; j++) {
            if (b_flag[j] !== 1 && a[j] === b[i]) {
              a_flag[j] = 1;
              b_flag[i] = 1;
              Num_com++;
              break;
            }
          }
        }

        // Return if no characters in common
        if (Num_com === 0) { return 0.0; }

        // Count the number of transpositions
        let k = 0;
        let N_trans = 0;
        for (let i = 0; i < a_len; i++) {
          if (a_flag[i] === 1) {
            let j;
            for (j = k; j < b_len; j++) {
              if (b_flag[j] === 1) {
                k = j + 1;
                break;
              }
            }
            if (a[i] !== b[j]) { N_trans++; }
          }
        }
        N_trans = Math.floor(N_trans / 2);

        // Adjust for similarities in nonmatched characters
        let N_simi = 0;
        let adjwt = char_player.oDistanceBetweenChars;
        if (minv > Num_com) {
          for (let i = 0; i < a_len; i++) {
            if (!a_flag[i]) {
              for (let j = 0; j < b_len; j++) {
                if (!b_flag[j]) {
                  if (adjwt[a[i]] && adjwt[a[i]][b[j]]) {
                    N_simi += adjwt[a[i]][b[j]];
                    b_flag[j] = 2;
                    break;
                  }
                }
              }
            }
          }
        }

        let Num_sim = (N_simi / 10.0) + Num_com;

        // Main weight computation
        let weight = Num_sim / a_len + Num_sim / b_len + (Num_com - N_trans) / Num_com;
        weight = weight / 3;

        // Continue to boost the weight if the strings are similar
        if (weight > boost) {
          // Adjust for having up to the first 4 characters in common
          let j = (minv >= 4) ? 4 : minv;
          let i;
          for (i = 0; (i < j) && a[i] === b[i]; i++) { }
          if (i) { weight += i * 0.1 * (1.0 - weight) };

          // Adjust for long strings.
          // After agreeing beginning chars, at least two more must agree
          // and the agreeing characters must be more than half of the
          // remaining characters.
          if (minv > 4 && Num_com > i + 1 && 2 * Num_com >= minv + i) {
            weight += (1 - weight) * ((Num_com - i - 1) / (a_len * b_len - i*2 + 2));
          }
        }

        return weight;
    },

    distanceSift4: function (s1, s2, maxOffset=5) {
        // Sift4 - simplest version : https://siderite.dev/blog/super-fast-and-accurate-string-distance.html
        // online algorithm to compute the distance between two strings in O(n)
        // maxOffset is the number of characters to search for matching letters
        if (!s1 || !s1.length) {
            if (!s2) {
                return 0;
            }
            return s2.length;
        }

        if (!s2 || !s2.length) {
            return s1.length;
        }

        var l1 = s1.length;
        var l2 = s2.length;

        var c1 = 0; //cursor for string 1
        var c2 = 0; //cursor for string 2
        var lcss = 0; //largest common subsequence
        var local_cs = 0; //local common substring
        while ((c1 < l1) && (c2 < l2)) {
            if (s1.charAt(c1) == s2.charAt(c2)) {
                local_cs++;
            } else {
                lcss += local_cs;
                local_cs = 0;
                if (c1 != c2) {
                    c1 = c2 = Math.max(c1, c2); //using max to bypass the need for computer transpositions ('ab' vs 'ba')
                }
                for (var i = 0; i < maxOffset && (c1 + i < l1 || c2 + i < l2); i++) {
                    if ((c1 + i < l1) && (s1.charAt(c1 + i) == s2.charAt(c2))) {
                        c1 += i;
                        local_cs++;
                        break;
                    }
                    if ((c2 + i < l2) && (s1.charAt(c1) == s2.charAt(c2 + i))) {
                        c2 += i;
                        local_cs++;
                        break;
                    }
                }
            }
            c1++;
            c2++;
        }
        lcss += local_cs;
        return Math.round(Math.max(l1, l2) - lcss);
    },

    showDistance: function (s1, s2) {
        console.log(`${s1} ≠ ${s2}`);
        let nDL = this.distanceDamerauLevenshtein(s1, s2);
        let nS4 = this.distanceSift4(s1, s2);
        let fJW = this.distanceJaroWinkler(s1, s2);
        console.log(`DL: ${nDL} — S4: ${nS4} — JW: ${fJW}`);
    },

    // Suffix only
    defineSuffixCode: function (sFlex, sStem) {
        /*
            Returns a string defining how to get stem from flexion
                "n(sfx)"
            with n: a char with numeric meaning, "0" = 0, "1" = 1, ... ":" = 10, etc. (See ASCII table.) Says how many letters to strip from flexion.
                 sfx [optional]: string to add on flexion
            Examples:
                "0": strips nothing, adds nothing
                "1er": strips 1 letter, adds "er"
                "2": strips 2 letters, adds nothing
        */
        if (sFlex == sStem) {
            return "0";
        }
        let jSfx = 0;
        for (let i = 0;  i < Math.min(sFlex.length, sStem.length);  i++) {
            if (sFlex[i] !== sStem[i]) {
                break;
            }
            jSfx += 1;
        }
        return String.fromCharCode(sFlex.length-jSfx+48) + sStem.slice(jSfx);
    },

    changeWordWithSuffixCode: function (sWord, sSfxCode) {
        if (sSfxCode == "0") {
            return sWord;
        }
        return sSfxCode[0] == '0' ? sWord + sSfxCode.slice(1) : sWord.slice(0, -(sSfxCode.charCodeAt(0)-48)) + sSfxCode.slice(1);
    },

    // Prefix and suffix
    defineAffixCode: function (sFlex, sStem) {
        /*
            UNTESTED!
            Returns a string defining how to get stem from flexion. Examples:
                "0" if stem = flexion
                "stem" if no common substring
                "n(pfx)/m(sfx)"
            with n and m: chars with numeric meaning, "0" = 0, "1" = 1, ... ":" = 10, etc. (See ASCII table.) Says how many letters to strip from flexion.
                pfx [optional]: string to add before the flexion
                sfx [optional]: string to add after the flexion
        */
        if (sFlex == sStem) {
            return "0";
        }
        // is stem a substring of flexion?
        let n = sFlex.indexOf(sStem);
        if (n >= 0) {
            return String.fromCharCode(n+48) + "/" + String.fromCharCode(sFlex.length-(sStem.length+n)+48);
        }
        // no, so we are looking for common substring
        let sSubs = this.longestCommonSubstring(sFlex, sStem);
        if (sSubs.length > 1) {
            let iPos = sStem.indexOf(sSubs);
            let sPfx = sStem.slice(0, iPos);
            let sSfx = sStem.slice(iPos+sSubs.length);
            let n = sFlex.indexOf(sSubs);
            let m = sFlex.length - (sSubs.length+n);
            return String.fromCharCode(n+48) + sPfx + "/" + String.fromCharCode(m+48) + sSfx;
        }
        return sStem;
    },

    changeWordWithAffixCode: function (sWord, sAffCode) {
        if (sAffCode == "0") {
            return sWord;
        }
        if (!sAffCode.includes("/")) {
            return sAffCode;
        }
        let [sPfxCode, sSfxCode] = sAffCode.split('/');
        sWord = sPfxCode.slice(1) + sWord.slice(sPfxCode.charCodeAt(0)-48);
        return sSfxCode[0] == '0' ? sWord + sSfxCode.slice(1) : sWord.slice(0, -(sSfxCode.charCodeAt(0)-48)) + sSfxCode.slice(1);
    }
};


if (typeof(exports) !== 'undefined') {
    exports.simplifyWord = str_transform.simplifyWord;
    exports.numbersToExponent = str_transform.numbersToExponent;
    exports.spellingNormalization = str_transform.spellingNormalization;
    exports.longestCommonSubstring = str_transform.longestCommonSubstring;
    exports.distanceDamerauLevenshtein = str_transform.distanceDamerauLevenshtein;
    exports.distanceJaroWinkler = str_transform.distanceJaroWinkler;
    exports.showDistance = str_transform.showDistance;
    exports.changeWordWithSuffixCode = str_transform.changeWordWithSuffixCode;
    exports.changeWordWithAffixCode = str_transform.changeWordWithAffixCode;
    exports.defineAffixCode = str_transform.defineAffixCode;
    exports.defineSuffixCode = str_transform.defineSuffixCode;
}