Overview
Comment: | [graphspell] experiment: Damerau-Levenshtein distance modified by a function calculating the distance between chars |
---|---|
Downloads: | Tarball | ZIP archive | SQL archive |
Timelines: | family | ancestors | descendants | both | trunk | graphspell |
Files: | files | file ages | folders |
SHA3-256: |
90478790e5f24f1080fba675fedafa45 |
User & Date: | olr on 2020-05-03 09:12:35 |
Other Links: | manifest | tags |
Context
2020-05-03
| ||
09:21 | [graphspell] remove useless code check-in: 2e960183fa user: olr tags: trunk, graphspell | |
09:12 | [graphspell] experiment: Damerau-Levenshtein distance modified by a function calculating the distance between chars check-in: 90478790e5 user: olr tags: trunk, graphspell | |
06:32 | [fr] ajustements check-in: 7108ce2dc1 user: olr tags: trunk, fr | |
Changes
Modified graphspell-js/char_player.js from [60a9fdaff6] to [a12e814d2d].
︙ | |||
16 17 18 19 20 21 22 23 24 | 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 | + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - - - - - + + + + + | spellingNormalization: function (sWord) { let sNewWord = ""; for (let c of sWord) { sNewWord += this._xTransCharsForSpelling.gl_get(c, c); } return sNewWord.normalize("NFC"); }, oDistanceBetweenChars: { "a": {}, "e": {"é": 0.5}, "é": {"e": 0.5}, "i": {"y": 0.2}, "o": {}, "u": {}, "y": {"i": 0.3}, "b": {"d": 0.8, "h": 0.9}, "c": {"ç": 0.1, "k": 0.5, "q": 0.5, "s": 0.5, "x": 0.5, "z": 0.8}, "d": {"b": 0.8}, "f": {"v": 0.8}, "g": {"j": 0.5}, "h": {"b": 0.9}, "j": {"g": 0.5, "i": 0.9}, "k": {"c": 0.5, "q": 0.1, "x": 0.5}, "l": {"i": 0.9}, "m": {"n": 0.8}, "n": {"m": 0.8, "r": 0.9}, "p": {"q": 0.9}, "q": {"c": 0.5, "k": 0.1, "p": 0.9}, "r": {"n": 0.9, "j": 0.9}, "s": {"c": 0.5, "ç": 0.1, "x": 0.5, "z": 0.5}, "t": {"d": 0.9}, "v": {"f": 0.8, "w": 0.1}, "w": {"v": 0.1}, "x": {"c": 0.5, "k": 0.5, "q": 0.5, "s": 0.5}, "z": {"s": 0.5} }, distanceBetweenChars: function (c1, c2) { if (c1 == c2) { return 0; } if (this.oDistanceBetweenChars.hasOwnProperty(c1) && this.oDistanceBetweenChars[c1].hasOwnProperty(c2)) { return this.oDistanceBetweenChars[c1][c2]; } return 1; }, _xTransCharsForSimplification: new Map([ |
︙ |
Modified graphspell-js/str_transform.js from [d8c3ab3e0d] to [b9cc7d5a41].
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 | + + + + + + | // STRING TRANSFORMATION /* jshint esversion:6, -W097 */ /* jslint esversion:6 */ /* global exports, console */ "use strict"; if (typeof(process) !== 'undefined') { var char_player = require("./char_player.js"); } else if (typeof(require) !== 'undefined') { var char_player = require("resource://grammalecte/graphspell/char_player.js"); } // Note: 48 is the ASCII code for "0" var str_transform = { getNgrams: function (sWord, n=2) { let lNgrams = []; |
︙ | |||
53 54 55 56 57 58 59 | 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 | - + - - + + - + - + - + + - + - + | table[i+1][j+1] = 0; } } } return longestCommonSubstring; }, |
︙ |
Modified graphspell/char_player.py from [d15991830e] to [72875ebb27].
︙ | |||
11 12 13 14 15 16 17 18 19 | 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 | + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - - - - - + + + + + | 'ſ': 's', 'ffi': 'ffi', 'ffl': 'ffl', 'ff': 'ff', 'ſt': 'ft', 'fi': 'fi', 'fl': 'fl', 'st': 'st' }) def spellingNormalization (sWord): "nomalization NFC and removing ligatures" return unicodedata.normalize("NFC", sWord.translate(_xTransCharsForSpelling)) dDistanceBetweenChars = { "a": {}, "e": {"é": 0.5}, "é": {"e": 0.5}, "i": {"y": 0.2}, "o": {}, "u": {}, "y": {"i": 0.3}, "b": {"d": 0.8, "h": 0.9}, "c": {"ç": 0.1, "k": 0.5, "q": 0.5, "s": 0.5, "x": 0.5, "z": 0.8}, "d": {"b": 0.8}, "f": {"v": 0.8}, "g": {"j": 0.5}, "h": {"b": 0.9}, "j": {"g": 0.5, "i": 0.9}, "k": {"c": 0.5, "q": 0.1, "x": 0.5}, "l": {"i": 0.9}, "m": {"n": 0.8}, "n": {"m": 0.8, "r": 0.9}, "p": {"q": 0.9}, "q": {"c": 0.5, "k": 0.1, "p": 0.9}, "r": {"n": 0.9, "j": 0.9}, "s": {"c": 0.5, "ç": 0.1, "x": 0.5, "z": 0.5}, "t": {"d": 0.9}, "v": {"f": 0.8, "w": 0.1}, "w": {"v": 0.1}, "x": {"c": 0.5, "k": 0.5, "q": 0.5, "s": 0.5}, "z": {"s": 0.5} } def distanceBetweenChars (c1, c2): if c1 == c2: return 0 if c1 not in dDistanceBetweenChars: return 1 return dDistanceBetweenChars[c1].get(c2, 1) _xTransCharsForSimplification = str.maketrans({ |
︙ |
Modified graphspell/str_transform.py from [7dcad03ac9] to [452d0bdcef].
1 2 3 4 5 6 7 8 9 10 11 12 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 | + + + | """ Operations on strings: - calculate distance between two strings - transform strings with transformation codes """ from .char_player import distanceBetweenChars #### Ngrams def getNgrams (sWord, n=2): "return a list of Ngrams strings" return [ sWord[i:i+n] for i in range(len(sWord)-n+1) ] |
︙ | |||
40 41 42 43 44 45 46 | 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 | - + + - + | nLen2 = len(s2) for i in range(-1, nLen1+1): d[i, -1] = i + 1 for j in range(-1, nLen2+1): d[-1, j] = j + 1 for i in range(nLen1): for j in range(nLen2): |
︙ |