Overview
Comment: | [graphspell] experiment: DamerauLevenstein distance modified by function calculating distance between chars |
---|---|
Downloads: | Tarball | ZIP archive | SQL archive |
Timelines: | family | ancestors | descendants | both | trunk | graphspell |
Files: | files | file ages | folders |
SHA3-256: |
90478790e5f24f1080fba675fedafa45 |
User & Date: | olr on 2020-05-03 09:12:35 |
Other Links: | manifest | tags |
Context
2020-05-03
| ||
09:21 | [graphspell] remove useless code check-in: 2e960183fa user: olr tags: trunk, graphspell | |
09:12 | [graphspell] experiment: DamerauLevenstein distance modified by function calculating distance between chars check-in: 90478790e5 user: olr tags: trunk, graphspell | |
06:32 | [fr] ajustements check-in: 7108ce2dc1 user: olr tags: trunk, fr | |
Changes
Modified graphspell-js/char_player.js from [60a9fdaff6] to [a12e814d2d].
︙ | ︙ | |||
16 17 18 19 20 21 22 23 24 | spellingNormalization: function (sWord) { let sNewWord = ""; for (let c of sWord) { sNewWord += this._xTransCharsForSpelling.gl_get(c, c); } return sNewWord.normalize("NFC"); }, _xTransCharsForSimplification: new Map([ | > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > | | | | | | 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 | spellingNormalization: function (sWord) { let sNewWord = ""; for (let c of sWord) { sNewWord += this._xTransCharsForSpelling.gl_get(c, c); } return sNewWord.normalize("NFC"); }, oDistanceBetweenChars: { "a": {}, "e": {"é": 0.5}, "é": {"e": 0.5}, "i": {"y": 0.2}, "o": {}, "u": {}, "y": {"i": 0.3}, "b": {"d": 0.8, "h": 0.9}, "c": {"ç": 0.1, "k": 0.5, "q": 0.5, "s": 0.5, "x": 0.5, "z": 0.8}, "d": {"b": 0.8}, "f": {"v": 0.8}, "g": {"j": 0.5}, "h": {"b": 0.9}, "j": {"g": 0.5, "i": 0.9}, "k": {"c": 0.5, "q": 0.1, "x": 0.5}, "l": {"i": 0.9}, "m": {"n": 0.8}, "n": {"m": 0.8, "r": 0.9}, "p": {"q": 0.9}, "q": {"c": 0.5, "k": 0.1, "p": 0.9}, "r": {"n": 0.9, "j": 0.9}, "s": {"c": 0.5, "ç": 0.1, "x": 0.5, "z": 0.5}, "t": {"d": 0.9}, "v": {"f": 0.8, "w": 0.1}, "w": {"v": 0.1}, "x": {"c": 0.5, "k": 0.5, "q": 0.5, "s": 0.5}, "z": {"s": 0.5} }, distanceBetweenChars: function (c1, c2) { if (c1 == c2) { return 0; } if (this.oDistanceBetweenChars.hasOwnProperty(c1) && this.oDistanceBetweenChars[c1].hasOwnProperty(c2)) { return this.oDistanceBetweenChars[c1][c2]; } return 1; }, _xTransCharsForSimplification: new Map([ ['à', 'a'], ['é', 'é'], ['î', 'i'], ['ô', 'o'], ['û', 'u'], ['ÿ', 'y'], ['â', 'a'], ['è', 'é'], ['ï', 'i'], ['ö', 'o'], ['ù', 'u'], ['ŷ', 'y'], ['ä', 'a'], ['ê', 'é'], ['í', 'i'], ['ó', 'o'], ['ü', 'u'], ['ý', 'y'], ['á', 'a'], ['ë', 'é'], ['ì', 'i'], ['ò', 'o'], ['ú', 'u'], ['ỳ', 'y'], ['ā', 'a'], ['ē', 'é'], ['ī', 'i'], ['ō', 'o'], ['ū', 'u'], ['ȳ', 'y'], ['ç', 'c'], ['ñ', 'n'], ['k', 'q'], ['w', 'v'], ['œ', 'oe'], ['æ', 'ae'], ['ſ', 's'], ['ffi', 'ffi'], ['ffl', 'ffl'], ['ff', 'ff'], ['ſt', 'ft'], ['fi', 'fi'], ['fl', 'fl'], ['st', 'st'], ["⁰", "0"], ["¹", "1"], ["²", "2"], ["³", "3"], ["⁴", "4"], ["⁵", "5"], ["⁶", "6"], ["⁷", "7"], ["⁸", "8"], ["⁹", "9"], ["₀", "0"], ["₁", "1"], ["₂", "2"], ["₃", "3"], ["₄", "4"], ["₅", "5"], ["₆", "6"], ["₇", "7"], ["₈", "8"], ["₉", "9"] ]), |
︙ | ︙ |
Modified graphspell-js/str_transform.js from [d8c3ab3e0d] to [b9cc7d5a41].
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 | // STRING TRANSFORMATION /* jshint esversion:6, -W097 */ /* jslint esversion:6 */ /* global exports, console */ "use strict"; // Note: 48 is the ASCII code for "0" var str_transform = { getNgrams: function (sWord, n=2) { let lNgrams = []; | > > > > > > | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 | // STRING TRANSFORMATION /* jshint esversion:6, -W097 */ /* jslint esversion:6 */ /* global exports, console */ "use strict"; if (typeof(process) !== 'undefined') { var char_player = require("./char_player.js"); } else if (typeof(require) !== 'undefined') { var char_player = require("resource://grammalecte/graphspell/char_player.js"); } // Note: 48 is the ASCII code for "0" var str_transform = { getNgrams: function (sWord, n=2) { let lNgrams = []; |
︙ | ︙ | |||
53 54 55 56 57 58 59 | table[i+1][j+1] = 0; } } } return longestCommonSubstring; }, | | | | | | | > | | | 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 | table[i+1][j+1] = 0; } } } return longestCommonSubstring; }, distanceDamerauLevenshtein: function (s1, s2) { // distance of Damerau-Levenshtein between <s1> and <s2> // https://fr.wikipedia.org/wiki/Distance_de_Damerau-Levenshtein try { let nLen1 = s1.length; let nLen2 = s2.length; let matrix = []; for (let i = 0; i <= nLen1+1; i++) { matrix[i] = new Array(nLen2 + 2); } for (let i = 0; i <= nLen1+1; i++) { matrix[i][0] = i; } for (let j = 0; j <= nLen2+1; j++) { matrix[0][j] = j; } for (let i = 1; i <= nLen1; i++) { for (let j = 1; j <= nLen2; j++) { //let nCost = (s1[i-1] === s2[j-1]) ? 0 : 1; let nCost = char_player.distanceBetweenChars(s1[i-1], s2[j-1]); matrix[i][j] = Math.min( matrix[i-1][j] + 1, // Deletion matrix[i][j-1] + 1, // Insertion matrix[i-1][j-1] + nCost // Substitution ); if (i > 1 && j > 1 && s1[i] == s2[j-1] && s1[i-1] == s2[j]) { matrix[i][j] = Math.min(matrix[i][j], matrix[i-2][j-2] + nCost); // Transposition } } } return Math.floor(matrix[nLen1][nLen2]); } catch (e) { console.error(e); } }, distanceDamerauLevenshtein2: function (s1, s2) { // distance of Damerau-Levenshtein between <s1> and <s2> // https://fr.wikipedia.org/wiki/Distance_de_Damerau-Levenshtein try { let nLen1 = s1.length; let nLen2 = s2.length; let INF = nLen1 + nLen2; let matrix = []; |
︙ | ︙ |
Modified graphspell/char_player.py from [d15991830e] to [72875ebb27].
︙ | ︙ | |||
11 12 13 14 15 16 17 18 19 | 'ſ': 's', 'ffi': 'ffi', 'ffl': 'ffl', 'ff': 'ff', 'ſt': 'ft', 'fi': 'fi', 'fl': 'fl', 'st': 'st' }) def spellingNormalization (sWord): "nomalization NFC and removing ligatures" return unicodedata.normalize("NFC", sWord.translate(_xTransCharsForSpelling)) _xTransCharsForSimplification = str.maketrans({ | > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > | | | | | | 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 | 'ſ': 's', 'ffi': 'ffi', 'ffl': 'ffl', 'ff': 'ff', 'ſt': 'ft', 'fi': 'fi', 'fl': 'fl', 'st': 'st' }) def spellingNormalization (sWord): "nomalization NFC and removing ligatures" return unicodedata.normalize("NFC", sWord.translate(_xTransCharsForSpelling)) dDistanceBetweenChars = { "a": {}, "e": {"é": 0.5}, "é": {"e": 0.5}, "i": {"y": 0.2}, "o": {}, "u": {}, "y": {"i": 0.3}, "b": {"d": 0.8, "h": 0.9}, "c": {"ç": 0.1, "k": 0.5, "q": 0.5, "s": 0.5, "x": 0.5, "z": 0.8}, "d": {"b": 0.8}, "f": {"v": 0.8}, "g": {"j": 0.5}, "h": {"b": 0.9}, "j": {"g": 0.5, "i": 0.9}, "k": {"c": 0.5, "q": 0.1, "x": 0.5}, "l": {"i": 0.9}, "m": {"n": 0.8}, "n": {"m": 0.8, "r": 0.9}, "p": {"q": 0.9}, "q": {"c": 0.5, "k": 0.1, "p": 0.9}, "r": {"n": 0.9, "j": 0.9}, "s": {"c": 0.5, "ç": 0.1, "x": 0.5, "z": 0.5}, "t": {"d": 0.9}, "v": {"f": 0.8, "w": 0.1}, "w": {"v": 0.1}, "x": {"c": 0.5, "k": 0.5, "q": 0.5, "s": 0.5}, "z": {"s": 0.5} } def distanceBetweenChars (c1, c2): if c1 == c2: return 0 if c1 not in dDistanceBetweenChars: return 1 return dDistanceBetweenChars[c1].get(c2, 1) _xTransCharsForSimplification = str.maketrans({ 'à': 'a', 'é': 'é', 'î': 'i', 'ô': 'o', 'û': 'u', 'ÿ': 'y', 'â': 'a', 'è': 'é', 'ï': 'i', 'ö': 'o', 'ù': 'u', 'ŷ': 'y', 'ä': 'a', 'ê': 'é', 'í': 'i', 'ó': 'o', 'ü': 'u', 'ý': 'y', 'á': 'a', 'ë': 'é', 'ì': 'i', 'ò': 'o', 'ú': 'u', 'ỳ': 'y', 'ā': 'a', 'ē': 'é', 'ī': 'i', 'ō': 'o', 'ū': 'u', 'ȳ': 'y', 'ç': 'c', 'ñ': 'n', 'k': 'q', 'w': 'v', 'œ': 'oe', 'æ': 'ae', 'ſ': 's', 'ffi': 'ffi', 'ffl': 'ffl', 'ff': 'ff', 'ſt': 'ft', 'fi': 'fi', 'fl': 'fl', 'st': 'st', "⁰": "0", "¹": "1", "²": "2", "³": "3", "⁴": "4", "⁵": "5", "⁶": "6", "⁷": "7", "⁸": "8", "⁹": "9", "₀": "0", "₁": "1", "₂": "2", "₃": "3", "₄": "4", "₅": "5", "₆": "6", "₇": "7", "₈": "8", "₉": "9" }) |
︙ | ︙ |
Modified graphspell/str_transform.py from [7dcad03ac9] to [452d0bdcef].
1 2 3 4 5 6 7 8 9 10 11 12 | """ Operations on strings: - calculate distance between two strings - transform strings with transformation codes """ #### Ngrams def getNgrams (sWord, n=2): "return a list of Ngrams strings" return [ sWord[i:i+n] for i in range(len(sWord)-n+1) ] | > > > | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 | """ Operations on strings: - calculate distance between two strings - transform strings with transformation codes """ from .char_player import distanceBetweenChars #### Ngrams def getNgrams (sWord, n=2): "return a list of Ngrams strings" return [ sWord[i:i+n] for i in range(len(sWord)-n+1) ] |
︙ | ︙ | |||
40 41 42 43 44 45 46 | nLen2 = len(s2) for i in range(-1, nLen1+1): d[i, -1] = i + 1 for j in range(-1, nLen2+1): d[-1, j] = j + 1 for i in range(nLen1): for j in range(nLen2): | | > | | 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 | nLen2 = len(s2) for i in range(-1, nLen1+1): d[i, -1] = i + 1 for j in range(-1, nLen2+1): d[-1, j] = j + 1 for i in range(nLen1): for j in range(nLen2): #nCost = 0 if s1[i] == s2[j] else 1 nCost = distanceBetweenChars(s1[i], s2[j]) d[i, j] = min( d[i-1, j] + 1, # Deletion d[i, j-1] + 1, # Insertion d[i-1, j-1] + nCost, # Substitution ) if i and j and s1[i] == s2[j-1] and s1[i-1] == s2[j]: d[i, j] = min(d[i, j], d[i-2, j-2] + nCost) # Transposition return int(d[nLen1-1, nLen2-1]) def distanceSift4 (s1, s2, nMaxOffset=5): "implementation of general Sift4." # https://siderite.blogspot.com/2014/11/super-fast-and-accurate-string-distance.html if not s1: return len(s2) |
︙ | ︙ |