Grammalecte  str_transform.js at [90178af717]

File graphspell-js/str_transform.js artifact 9baf96ac7b part of check-in 90178af717


// STRING TRANSFORMATION

/* jshint esversion:6, -W097 */
/* jslint esversion:6 */
/* global exports, console */

"use strict";

if (typeof(process) !== 'undefined') {
    var char_player = require("./char_player.js");
} else if (typeof(require) !== 'undefined') {
    var char_player = require("resource://grammalecte/graphspell/char_player.js");
}


// Note: 48 is the ASCII code for "0"

var str_transform = {

    getNgrams: function (sWord, n=2) {
        let lNgrams = [];
        for (let i=0;  i <= sWord.length - n;  i++) {
            lNgrams.push(sWord.slice(i, i+n));
        }
        return lNgrams;
    },

    longestCommonSubstring: function (string1, string2) {
        // https://en.wikibooks.org/wiki/Algorithm_Implementation/Strings/Longest_common_substring
        // untested

        // init max value
        let longestCommonSubstring = 0;
        // init 2D array with 0
        let table = [],
            len1 = string1.length,
            len2 = string2.length,
            row, col;
        for (row = 0; row <= len1; row++) {
            table[row] = [];
            for (col = 0; col <= len2; col++) {
                table[row][col] = 0;
            }
        }
        // fill table
        let i, j;
        for (i = 0;  i < len1;  i++) {
            for (j = 0;  j < len2;  j++) {
                if (string1[i] === string2[j]) {
                    if (table[i][j] === 0){
                        table[i+1][j+1] = 1;
                    } else {
                        table[i+1][j+1] = table[i][j] + 1;
                    }
                    if (table[i+1][j+1] > longestCommonSubstring) {
                        longestCommonSubstring = table[i+1][j+1];
                    }
                } else {
                    table[i+1][j+1] = 0;
                }
            }
        }
        return longestCommonSubstring;
    },

    distanceDamerauLevenshtein: function (s1, s2) {
        // distance of Damerau-Levenshtein between <s1> and <s2>
        // https://fr.wikipedia.org/wiki/Distance_de_Damerau-Levenshtein
        try {
            let nLen1 = s1.length;
            let nLen2 = s2.length;
            let matrix = [];
            for (let i = 0;  i <= nLen1+1;  i++) {
                matrix[i] = new Array(nLen2 + 2);
            }
            for (let i = 0;  i <= nLen1+1;  i++) {
                matrix[i][0] = i;
            }
            for (let j = 0;  j <= nLen2+1;  j++) {
                matrix[0][j] = j;
            }
            for (let i = 1;  i <= nLen1;  i++) {
                for (let j = 1;  j <= nLen2;  j++) {
                    //let nCost = (s1[i-1] === s2[j-1]) ? 0 : 1;
                    let nCost = char_player.distanceBetweenChars(s1[i-1], s2[j-1]);
                    matrix[i][j] = Math.min(
                        matrix[i-1][j] + 1,         // Deletion
                        matrix[i][j-1] + 1,         // Insertion
                        matrix[i-1][j-1] + nCost    // Substitution
                    );
                    if (i > 1 && j > 1 && s1[i] == s2[j-1] && s1[i-1] == s2[j]) {
                        matrix[i][j] = Math.min(matrix[i][j], matrix[i-2][j-2] + nCost);  // Transposition
                    }
                }
            }
            return Math.floor(matrix[nLen1][nLen2]);
        }
        catch (e) {
            console.error(e);
        }
    },

    showDistance (s1, s2) {
        console.log(`Distance: ${s1} / ${s2} = ${this.distanceDamerauLevenshtein(s1, s2)})`);
    },

    // Suffix only
    defineSuffixCode: function (sFlex, sStem) {
        /*
            Returns a string defining how to get stem from flexion
                "n(sfx)"
            with n: a char with numeric meaning, "0" = 0, "1" = 1, ... ":" = 10, etc. (See ASCII table.) Says how many letters to strip from flexion.
                 sfx [optional]: string to add on flexion
            Examples:
                "0": strips nothing, adds nothing
                "1er": strips 1 letter, adds "er"
                "2": strips 2 letters, adds nothing
        */
        if (sFlex == sStem) {
            return "0";
        }
        let jSfx = 0;
        for (let i = 0;  i < Math.min(sFlex.length, sStem.length);  i++) {
            if (sFlex[i] !== sStem[i]) {
                break;
            }
            jSfx += 1;
        }
        return String.fromCharCode(sFlex.length-jSfx+48) + sStem.slice(jSfx);
    },

    changeWordWithSuffixCode: function (sWord, sSfxCode) {
        if (sSfxCode == "0") {
            return sWord;
        }
        return sSfxCode[0] == '0' ? sWord + sSfxCode.slice(1) : sWord.slice(0, -(sSfxCode.charCodeAt(0)-48)) + sSfxCode.slice(1);
    },

    // Prefix and suffix
    defineAffixCode: function (sFlex, sStem) {
        /*
            UNTESTED!
            Returns a string defining how to get stem from flexion. Examples:
                "0" if stem = flexion
                "stem" if no common substring
                "n(pfx)/m(sfx)"
            with n and m: chars with numeric meaning, "0" = 0, "1" = 1, ... ":" = 10, etc. (See ASCII table.) Says how many letters to strip from flexion.
                pfx [optional]: string to add before the flexion
                sfx [optional]: string to add after the flexion
        */
        if (sFlex == sStem) {
            return "0";
        }
        // is stem a substring of flexion?
        let n = sFlex.indexOf(sStem);
        if (n >= 0) {
            return String.fromCharCode(n+48) + "/" + String.fromCharCode(sFlex.length-(sStem.length+n)+48);
        }
        // no, so we are looking for common substring
        let sSubs = this.longestCommonSubstring(sFlex, sStem);
        if (sSubs.length > 1) {
            let iPos = sStem.indexOf(sSubs);
            let sPfx = sStem.slice(0, iPos);
            let sSfx = sStem.slice(iPos+sSubs.length);
            let n = sFlex.indexOf(sSubs);
            let m = sFlex.length - (sSubs.length+n);
            return String.fromCharCode(n+48) + sPfx + "/" + String.fromCharCode(m+48) + sSfx;
        }
        return sStem;
    },

    changeWordWithAffixCode: function (sWord, sAffCode) {
        if (sAffCode == "0") {
            return sWord;
        }
        if (!sAffCode.includes("/")) {
            return sAffCode;
        }
        let [sPfxCode, sSfxCode] = sAffCode.split('/');
        sWord = sPfxCode.slice(1) + sWord.slice(sPfxCode.charCodeAt(0)-48);
        return sSfxCode[0] == '0' ? sWord + sSfxCode.slice(1) : sWord.slice(0, -(sSfxCode.charCodeAt(0)-48)) + sSfxCode.slice(1);
    }
};


if (typeof(exports) !== 'undefined') {
    exports.longestCommonSubstring = str_transform.longestCommonSubstring;
    exports.distanceDamerauLevenshtein = str_transform.distanceDamerauLevenshtein;
    exports.distanceDamerauLevenshtein2 = str_transform.distanceDamerauLevenshtein2;
    exports.showDistance = str_transform.showDistance;
    exports.changeWordWithSuffixCode = str_transform.changeWordWithSuffixCode;
    exports.changeWordWithAffixCode = str_transform.changeWordWithAffixCode;
    exports.defineAffixCode = str_transform.defineAffixCode;
    exports.defineSuffixCode = str_transform.defineSuffixCode;
}