Index: graphspell-js/char_player.js ================================================================== --- graphspell-js/char_player.js +++ graphspell-js/char_player.js @@ -18,17 +18,57 @@ for (let c of sWord) { sNewWord += this._xTransCharsForSpelling.gl_get(c, c); } return sNewWord.normalize("NFC"); }, + + oDistanceBetweenChars: { + "a": {}, + "e": {"é": 0.5}, + "é": {"e": 0.5}, + "i": {"y": 0.2}, + "o": {}, + "u": {}, + "y": {"i": 0.3}, + "b": {"d": 0.8, "h": 0.9}, + "c": {"ç": 0.1, "k": 0.5, "q": 0.5, "s": 0.5, "x": 0.5, "z": 0.8}, + "d": {"b": 0.8}, + "f": {"v": 0.8}, + "g": {"j": 0.5}, + "h": {"b": 0.9}, + "j": {"g": 0.5, "i": 0.9}, + "k": {"c": 0.5, "q": 0.1, "x": 0.5}, + "l": {"i": 0.9}, + "m": {"n": 0.8}, + "n": {"m": 0.8, "r": 0.9}, + "p": {"q": 0.9}, + "q": {"c": 0.5, "k": 0.1, "p": 0.9}, + "r": {"n": 0.9, "j": 0.9}, + "s": {"c": 0.5, "ç": 0.1, "x": 0.5, "z": 0.5}, + "t": {"d": 0.9}, + "v": {"f": 0.8, "w": 0.1}, + "w": {"v": 0.1}, + "x": {"c": 0.5, "k": 0.5, "q": 0.5, "s": 0.5}, + "z": {"s": 0.5} + }, + + distanceBetweenChars: function (c1, c2) { + if (c1 == c2) { + return 0; + } + if (this.oDistanceBetweenChars.hasOwnProperty(c1) && this.oDistanceBetweenChars[c1].hasOwnProperty(c2)) { + return this.oDistanceBetweenChars[c1][c2]; + } + return 1; + }, _xTransCharsForSimplification: new Map([ - ['à', 'a'], ['é', 'é'], ['î', 'i'], ['ô', 'o'], ['û', 'u'], ['ÿ', 'i'], ['y', 'i'], - ['â', 'a'], ['è', 'é'], ['ï', 'i'], ['ö', 'o'], ['ù', 'u'], ['ŷ', 'i'], - ['ä', 'a'], ['ê', 'é'], ['í', 'i'], ['ó', 'o'], ['ü', 'u'], ['ý', 'i'], - ['á', 'a'], ['ë', 'é'], ['ì', 'i'], ['ò', 'o'], ['ú', 'u'], ['ỳ', 'i'], - ['ā', 'a'], ['ē', 'é'], ['ī', 'i'], ['ō', 'o'], ['ū', 'u'], ['ȳ', 'i'], + ['à', 'a'], ['é', 'é'], ['î', 'i'], ['ô', 'o'], ['û', 'u'], ['ÿ', 'y'], + ['â', 'a'], ['è', 'é'], ['ï', 'i'], ['ö', 'o'], ['ù', 'u'], ['ŷ', 'y'], + ['ä', 'a'], ['ê', 'é'], ['í', 'i'], ['ó', 'o'], ['ü', 'u'], ['ý', 'y'], + ['á', 'a'], ['ë', 'é'], ['ì', 'i'], ['ò', 'o'], ['ú', 'u'], ['ỳ', 'y'], + ['ā', 'a'], ['ē', 'é'], ['ī', 'i'], ['ō', 'o'], ['ū', 'u'], ['ȳ', 'y'], ['ç', 'c'], ['ñ', 'n'], ['k', 'q'], ['w', 'v'], ['œ', 'oe'], ['æ', 'ae'], ['ſ', 's'], ['ffi', 'ffi'], ['ffl', 'ffl'], ['ff', 'ff'], ['ſt', 'ft'], ['fi', 'fi'], ['fl', 'fl'], ['st', 'st'], ["⁰", "0"], ["¹", "1"], ["²", "2"], ["³", "3"], ["⁴", "4"], ["⁵", "5"], ["⁶", "6"], ["⁷", "7"], ["⁸", "8"], ["⁹", "9"], ["₀", "0"], ["₁", "1"], ["₂", "2"], ["₃", "3"], ["₄", "4"], ["₅", "5"], ["₆", "6"], ["₇", "7"], ["₈", "8"], ["₉", "9"] Index: graphspell-js/str_transform.js ================================================================== --- graphspell-js/str_transform.js +++ graphspell-js/str_transform.js @@ -4,10 +4,16 @@ /* jslint esversion:6 */ /* global exports, console */ "use strict"; +if (typeof(process) !== 'undefined') { + var char_player = require("./char_player.js"); +} else if (typeof(require) !== 'undefined') { + var char_player = require("resource://grammalecte/graphspell/char_player.js"); +} + // Note: 48 is the ASCII code for "0" var str_transform = { @@ -55,29 +61,30 @@ } } return longestCommonSubstring; }, - distanceDamerauLevenshtein2: function (s1, s2) { + distanceDamerauLevenshtein: function (s1, s2) { // distance of Damerau-Levenshtein between and // https://fr.wikipedia.org/wiki/Distance_de_Damerau-Levenshtein try { let nLen1 = s1.length; let nLen2 = s2.length; let matrix = []; - for (let i = 0; i <= nLen1; i++) { - matrix[i] = new Array(nLen2 + 1); + for (let i = 0; i <= nLen1+1; i++) { + matrix[i] = new Array(nLen2 + 2); } - for (let i = 0; i <= nLen1; i++) { + for (let i = 0; i <= nLen1+1; i++) { matrix[i][0] = i; } - for (let j = 0; j <= nLen2; j++) { + for (let j = 0; j <= nLen2+1; j++) { matrix[0][j] = j; } for (let i = 1; i <= nLen1; i++) { for (let j = 1; j <= nLen2; j++) { - let nCost = (s1[i] === s2[j]) ? 0 : 1; + //let nCost = (s1[i-1] === s2[j-1]) ? 0 : 1; + let nCost = char_player.distanceBetweenChars(s1[i-1], s2[j-1]); matrix[i][j] = Math.min( matrix[i-1][j] + 1, // Deletion matrix[i][j-1] + 1, // Insertion matrix[i-1][j-1] + nCost // Substitution ); @@ -84,18 +91,18 @@ if (i > 1 && j > 1 && s1[i] == s2[j-1] && s1[i-1] == s2[j]) { matrix[i][j] = Math.min(matrix[i][j], matrix[i-2][j-2] + nCost); // Transposition } } } - return matrix[nLen1][nLen2]; + return Math.floor(matrix[nLen1][nLen2]); } catch (e) { console.error(e); } }, - distanceDamerauLevenshtein: function (s1, s2) { + distanceDamerauLevenshtein2: function (s1, s2) { // distance of Damerau-Levenshtein between and // https://fr.wikipedia.org/wiki/Distance_de_Damerau-Levenshtein try { let nLen1 = s1.length; let nLen2 = s2.length; Index: graphspell/char_player.py ================================================================== --- graphspell/char_player.py +++ graphspell/char_player.py @@ -13,17 +13,56 @@ def spellingNormalization (sWord): "nomalization NFC and removing ligatures" return unicodedata.normalize("NFC", sWord.translate(_xTransCharsForSpelling)) + +dDistanceBetweenChars = { + "a": {}, + "e": {"é": 0.5}, + "é": {"e": 0.5}, + "i": {"y": 0.2}, + "o": {}, + "u": {}, + "y": {"i": 0.3}, + "b": {"d": 0.8, "h": 0.9}, + "c": {"ç": 0.1, "k": 0.5, "q": 0.5, "s": 0.5, "x": 0.5, "z": 0.8}, + "d": {"b": 0.8}, + "f": {"v": 0.8}, + "g": {"j": 0.5}, + "h": {"b": 0.9}, + "j": {"g": 0.5, "i": 0.9}, + "k": {"c": 0.5, "q": 0.1, "x": 0.5}, + "l": {"i": 0.9}, + "m": {"n": 0.8}, + "n": {"m": 0.8, "r": 0.9}, + "p": {"q": 0.9}, + "q": {"c": 0.5, "k": 0.1, "p": 0.9}, + "r": {"n": 0.9, "j": 0.9}, + "s": {"c": 0.5, "ç": 0.1, "x": 0.5, "z": 0.5}, + "t": {"d": 0.9}, + "v": {"f": 0.8, "w": 0.1}, + "w": {"v": 0.1}, + "x": {"c": 0.5, "k": 0.5, "q": 0.5, "s": 0.5}, + "z": {"s": 0.5} +} + + +def distanceBetweenChars (c1, c2): + if c1 == c2: + return 0 + if c1 not in dDistanceBetweenChars: + return 1 + return dDistanceBetweenChars[c1].get(c2, 1) + _xTransCharsForSimplification = str.maketrans({ - 'à': 'a', 'é': 'é', 'î': 'i', 'ô': 'o', 'û': 'u', 'ÿ': 'i', "y": "i", - 'â': 'a', 'è': 'é', 'ï': 'i', 'ö': 'o', 'ù': 'u', 'ŷ': 'i', - 'ä': 'a', 'ê': 'é', 'í': 'i', 'ó': 'o', 'ü': 'u', 'ý': 'i', - 'á': 'a', 'ë': 'é', 'ì': 'i', 'ò': 'o', 'ú': 'u', 'ỳ': 'i', - 'ā': 'a', 'ē': 'é', 'ī': 'i', 'ō': 'o', 'ū': 'u', 'ȳ': 'i', + 'à': 'a', 'é': 'é', 'î': 'i', 'ô': 'o', 'û': 'u', 'ÿ': 'y', + 'â': 'a', 'è': 'é', 'ï': 'i', 'ö': 'o', 'ù': 'u', 'ŷ': 'y', + 'ä': 'a', 'ê': 'é', 'í': 'i', 'ó': 'o', 'ü': 'u', 'ý': 'y', + 'á': 'a', 'ë': 'é', 'ì': 'i', 'ò': 'o', 'ú': 'u', 'ỳ': 'y', + 'ā': 'a', 'ē': 'é', 'ī': 'i', 'ō': 'o', 'ū': 'u', 'ȳ': 'y', 'ç': 'c', 'ñ': 'n', 'k': 'q', 'w': 'v', 'œ': 'oe', 'æ': 'ae', 'ſ': 's', 'ffi': 'ffi', 'ffl': 'ffl', 'ff': 'ff', 'ſt': 'ft', 'fi': 'fi', 'fl': 'fl', 'st': 'st', "⁰": "0", "¹": "1", "²": "2", "³": "3", "⁴": "4", "⁵": "5", "⁶": "6", "⁷": "7", "⁸": "8", "⁹": "9", "₀": "0", "₁": "1", "₂": "2", "₃": "3", "₄": "4", "₅": "5", "₆": "6", "₇": "7", "₈": "8", "₉": "9" Index: graphspell/str_transform.py ================================================================== --- graphspell/str_transform.py +++ graphspell/str_transform.py @@ -1,10 +1,13 @@ """ Operations on strings: - calculate distance between two strings - transform strings with transformation codes """ + +from .char_player import distanceBetweenChars + #### Ngrams def getNgrams (sWord, n=2): "return a list of Ngrams strings" @@ -42,19 +45,20 @@ d[i, -1] = i + 1 for j in range(-1, nLen2+1): d[-1, j] = j + 1 for i in range(nLen1): for j in range(nLen2): - nCost = 0 if s1[i] == s2[j] else 1 + #nCost = 0 if s1[i] == s2[j] else 1 + nCost = distanceBetweenChars(s1[i], s2[j]) d[i, j] = min( d[i-1, j] + 1, # Deletion d[i, j-1] + 1, # Insertion d[i-1, j-1] + nCost, # Substitution ) if i and j and s1[i] == s2[j-1] and s1[i-1] == s2[j]: d[i, j] = min(d[i, j], d[i-2, j-2] + nCost) # Transposition - return d[nLen1-1, nLen2-1] + return int(d[nLen1-1, nLen2-1]) def distanceSift4 (s1, s2, nMaxOffset=5): "implementation of general Sift4." # https://siderite.blogspot.com/2014/11/super-fast-and-accurate-string-distance.html