Grammalecte: str_transform.js at [246965c90e]

File graphspell-js/str_transform.js artifact 46f5d63858 part of check-in 246965c90e

// STRING TRANSFORMATION

/* jshint esversion:6, -W097 */
/* jslint esversion:6 */
/* global exports, console */

"use strict";

if (typeof(process) !== 'undefined') {
    var char_player = require("./char_player.js");
} else if (typeof(require) !== 'undefined') {
    var char_player = require("resource://grammalecte/graphspell/char_player.js");
}


// Note: 48 is the ASCII code for "0"

var str_transform = {

    getNgrams: function (sWord, n=2) {
        let lNgrams = [];
        for (let i=0;  i <= sWord.length - n;  i++) {
            lNgrams.push(sWord.slice(i, i+n));
        }
        return lNgrams;
    },

    _xTransCharsForSpelling: new Map([
        ['ſ', 's'],  ['ﬃ', 'ffi'],  ['ﬄ', 'ffl'],  ['ﬀ', 'ff'],  ['ﬅ', 'ft'],  ['ﬁ', 'fi'],  ['ﬂ', 'fl'],  ['ﬆ', 'st']
    ]),

    spellingNormalization: function (sWord) {
        let sNewWord = "";
        for (let c of sWord) {
            sNewWord += this._xTransCharsForSpelling.gl_get(c, c);
        }
        return sNewWord.normalize("NFC");
    },

    _xTransCharsForSimplification: new Map([
        ['à', 'a'],  ['é', 'é'],  ['î', 'i'],  ['ô', 'o'],  ['û', 'u'],  ['ÿ', 'y'],
        ['â', 'a'],  ['è', 'é'],  ['ï', 'i'],  ['ö', 'o'],  ['ù', 'u'],  ['ŷ', 'y'],
        ['ä', 'a'],  ['ê', 'é'],  ['í', 'i'],  ['ó', 'o'],  ['ü', 'u'],  ['ý', 'y'],
        ['á', 'a'],  ['ë', 'é'],  ['ì', 'i'],  ['ò', 'o'],  ['ú', 'u'],  ['ỳ', 'y'],
        ['ā', 'a'],  ['ē', 'é'],  ['ī', 'i'],  ['ō', 'o'],  ['ū', 'u'],  ['ȳ', 'y'],
        ['ç', 'c'],  ['ñ', 'n'],
        ['œ', 'oe'], ['æ', 'ae'],
        ['ſ', 's'],  ['ﬃ', 'ffi'],  ['ﬄ', 'ffl'],  ['ﬀ', 'ff'],  ['ﬅ', 'ft'],  ['ﬁ', 'fi'],  ['ﬂ', 'fl'],  ['ﬆ', 'st'],
        ["⁰", "0"], ["¹", "1"], ["²", "2"], ["³", "3"], ["⁴", "4"], ["⁵", "5"], ["⁶", "6"], ["⁷", "7"], ["⁸", "8"], ["⁹", "9"],
        ["₀", "0"], ["₁", "1"], ["₂", "2"], ["₃", "3"], ["₄", "4"], ["₅", "5"], ["₆", "6"], ["₇", "7"], ["₈", "8"], ["₉", "9"]
    ]),

    simplifyWord: function (sWord) {
        // word simplication before calculating distance between words
        sWord = sWord.toLowerCase();
        sWord = [...sWord].map(c => this._xTransCharsForSimplification.gl_get(c, c)).join('');
        let sNewWord = "";
        let i = 1;
        for (let c of sWord) {
            if (c != sWord.slice(i, i+1) || (c == 'e' && sWord.slice(i, i+2) != "ee")) {  // exception for <e> to avoid confusion between crée / créai
                sNewWord += c;
            }
            i++;
        }
        return sNewWord.replace(/eau/g, "o").replace(/au/g, "o").replace(/ai/g, "éi").replace(/ei/g, "é").replace(/ph/g, "f");
    },

    _xTransNumbersToExponent: new Map([
        ["0", "⁰"], ["1", "¹"], ["2", "²"], ["3", "³"], ["4", "⁴"], ["5", "⁵"], ["6", "⁶"], ["7", "⁷"], ["8", "⁸"], ["9", "⁹"]
    ]),

    numbersToExponent: function (sWord) {
        let sNewWord = "";
        for (let c of sWord) {
            sNewWord += this._xTransNumbersToExponent.gl_get(c, c);
        }
        return sNewWord;
    },

    longestCommonSubstring: function (string1, string2) {
        // https://en.wikibooks.org/wiki/Algorithm_Implementation/Strings/Longest_common_substring
        // untested

        // init max value
        let longestCommonSubstring = 0;
        // init 2D array with 0
        let table = [],
            len1 = string1.length,
            len2 = string2.length,
            row, col;
        for (row = 0; row <= len1; row++) {
            table[row] = [];
            for (col = 0; col <= len2; col++) {
                table[row][col] = 0;
            }
        }
        // fill table
        let i, j;
        for (i = 0;  i < len1;  i++) {
            for (j = 0;  j < len2;  j++) {
                if (string1[i] === string2[j]) {
                    if (table[i][j] === 0){
                        table[i+1][j+1] = 1;
                    } else {
                        table[i+1][j+1] = table[i][j] + 1;
                    }
                    if (table[i+1][j+1] > longestCommonSubstring) {
                        longestCommonSubstring = table[i+1][j+1];
                    }
                } else {
                    table[i+1][j+1] = 0;
                }
            }
        }
        return longestCommonSubstring;
    },

    distanceDamerauLevenshtein: function (s1, s2) {
        // distance of Damerau-Levenshtein between <s1> and <s2>
        // https://fr.wikipedia.org/wiki/Distance_de_Damerau-Levenshtein
        try {
            let nLen1 = s1.length;
            let nLen2 = s2.length;
            let matrix = [];
            for (let i = 0;  i <= nLen1+1;  i++) {
                matrix[i] = new Array(nLen2 + 2);
            }
            for (let i = 0;  i <= nLen1+1;  i++) {
                matrix[i][0] = i;
            }
            for (let j = 0;  j <= nLen2+1;  j++) {
                matrix[0][j] = j;
            }
            for (let i = 1;  i <= nLen1;  i++) {
                for (let j = 1;  j <= nLen2;  j++) {
                    //let nCost = (s1[i-1] === s2[j-1]) ? 0 : 1;
                    let nCost = char_player.distanceBetweenChars(s1[i-1], s2[j-1]);
                    matrix[i][j] = Math.min(
                        matrix[i-1][j] + 1,         // Deletion
                        matrix[i][j-1] + 1,         // Insertion
                        matrix[i-1][j-1] + nCost    // Substitution
                    );
                    if (i > 1 && j > 1 && s1[i] == s2[j-1] && s1[i-1] == s2[j]) {
                        matrix[i][j] = Math.min(matrix[i][j], matrix[i-2][j-2] + nCost);  // Transposition
                    }
                }
            }
            return Math.floor(matrix[nLen1][nLen2]);
        }
        catch (e) {
            console.error(e);
        }
    },

    showDistance (s1, s2) {
        console.log(`Distance: ${s1} / ${s2} = ${this.distanceDamerauLevenshtein(s1, s2)})`);
    },

    // Suffix only
    defineSuffixCode: function (sFlex, sStem) {
        /*
            Returns a string defining how to get stem from flexion
                "n(sfx)"
            with n: a char with numeric meaning, "0" = 0, "1" = 1, ... ":" = 10, etc. (See ASCII table.) Says how many letters to strip from flexion.
                 sfx [optional]: string to add on flexion
            Examples:
                "0": strips nothing, adds nothing
                "1er": strips 1 letter, adds "er"
                "2": strips 2 letters, adds nothing
        */
        if (sFlex == sStem) {
            return "0";
        }
        let jSfx = 0;
        for (let i = 0;  i < Math.min(sFlex.length, sStem.length);  i++) {
            if (sFlex[i] !== sStem[i]) {
                break;
            }
            jSfx += 1;
        }
        return String.fromCharCode(sFlex.length-jSfx+48) + sStem.slice(jSfx);
    },

    changeWordWithSuffixCode: function (sWord, sSfxCode) {
        if (sSfxCode == "0") {
            return sWord;
        }
        return sSfxCode[0] == '0' ? sWord + sSfxCode.slice(1) : sWord.slice(0, -(sSfxCode.charCodeAt(0)-48)) + sSfxCode.slice(1);
    },

    // Prefix and suffix
    defineAffixCode: function (sFlex, sStem) {
        /*
            UNTESTED!
            Returns a string defining how to get stem from flexion. Examples:
                "0" if stem = flexion
                "stem" if no common substring
                "n(pfx)/m(sfx)"
            with n and m: chars with numeric meaning, "0" = 0, "1" = 1, ... ":" = 10, etc. (See ASCII table.) Says how many letters to strip from flexion.
                pfx [optional]: string to add before the flexion
                sfx [optional]: string to add after the flexion
        */
        if (sFlex == sStem) {
            return "0";
        }
        // is stem a substring of flexion?
        let n = sFlex.indexOf(sStem);
        if (n >= 0) {
            return String.fromCharCode(n+48) + "/" + String.fromCharCode(sFlex.length-(sStem.length+n)+48);
        }
        // no, so we are looking for common substring
        let sSubs = this.longestCommonSubstring(sFlex, sStem);
        if (sSubs.length > 1) {
            let iPos = sStem.indexOf(sSubs);
            let sPfx = sStem.slice(0, iPos);
            let sSfx = sStem.slice(iPos+sSubs.length);
            let n = sFlex.indexOf(sSubs);
            let m = sFlex.length - (sSubs.length+n);
            return String.fromCharCode(n+48) + sPfx + "/" + String.fromCharCode(m+48) + sSfx;
        }
        return sStem;
    },

    changeWordWithAffixCode: function (sWord, sAffCode) {
        if (sAffCode == "0") {
            return sWord;
        }
        if (!sAffCode.includes("/")) {
            return sAffCode;
        }
        let [sPfxCode, sSfxCode] = sAffCode.split('/');
        sWord = sPfxCode.slice(1) + sWord.slice(sPfxCode.charCodeAt(0)-48);
        return sSfxCode[0] == '0' ? sWord + sSfxCode.slice(1) : sWord.slice(0, -(sSfxCode.charCodeAt(0)-48)) + sSfxCode.slice(1);
    }
};


if (typeof(exports) !== 'undefined') {
    exports.simplifyWord = str_transform.simplifyWord;
    exports.numbersToExponent = str_transform.numbersToExponent;
    exports.spellingNormalization = str_transform.spellingNormalization;
    exports.longestCommonSubstring = str_transform.longestCommonSubstring;
    exports.distanceDamerauLevenshtein = str_transform.distanceDamerauLevenshtein;
    exports.distanceDamerauLevenshtein2 = str_transform.distanceDamerauLevenshtein2;
    exports.showDistance = str_transform.showDistance;
    exports.changeWordWithSuffixCode = str_transform.changeWordWithSuffixCode;
    exports.changeWordWithAffixCode = str_transform.changeWordWithAffixCode;
    exports.defineAffixCode = str_transform.defineAffixCode;
    exports.defineSuffixCode = str_transform.defineSuffixCode;
}
Grammalecte str_transform.js at [246965c90e]

File graphspell-js/str_transform.js artifact 46f5d63858 part of check-in 246965c90e