Overview
| Comment: | [graphspell] experiment: Damerau-Levenshtein distance modified by a function calculating the distance between chars |
|---|---|
| Downloads: | Tarball | ZIP archive | SQL archive |
| Timelines: | family | ancestors | descendants | both | trunk | graphspell |
| Files: | files | file ages | folders |
| SHA3-256: |
90478790e5f24f1080fba675fedafa45 |
| User & Date: | olr on 2020-05-03 09:12:35 |
| Other Links: | manifest | tags |
Context
|
2020-05-03
| ||
| 09:21 | [graphspell] remove useless code check-in: 2e960183fa user: olr tags: trunk, graphspell | |
| 09:12 | [graphspell] experiment: Damerau-Levenshtein distance modified by a function calculating the distance between chars check-in: 90478790e5 user: olr tags: trunk, graphspell | |
| 06:32 | [fr] ajustements check-in: 7108ce2dc1 user: olr tags: trunk, fr | |
Changes
Modified graphspell-js/char_player.js from [60a9fdaff6] to [a12e814d2d].
| ︙ | |||
16 17 18 19 20 21 22 23 24 | 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 | + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - - - - - + + + + + |
spellingNormalization: function (sWord) {
let sNewWord = "";
for (let c of sWord) {
sNewWord += this._xTransCharsForSpelling.gl_get(c, c);
}
return sNewWord.normalize("NFC");
},
// Lookup table of distances between pairs of characters.
// Outer key: first character; inner key: second character; value: distance.
// distanceBetweenChars() reads this table: it returns 0 for identical
// characters, the listed value when the pair is present, and 1 otherwise.
// All listed values lie between 0.1 and 0.9.
// An empty object (e.g. "a", "o", "u") means no pair is listed for that character.
// NOTE(review): the table is not symmetric -- e.g. "i"->"y" is 0.2 but "y"->"i"
// is 0.3, "j"->"i" is listed but "i"->"j" is not, and "ç" only appears as a
// second character. Confirm whether this is intentional.
oDistanceBetweenChars: {
"a": {},
"e": {"é": 0.5},
"é": {"e": 0.5},
"i": {"y": 0.2},
"o": {},
"u": {},
"y": {"i": 0.3},
"b": {"d": 0.8, "h": 0.9},
"c": {"ç": 0.1, "k": 0.5, "q": 0.5, "s": 0.5, "x": 0.5, "z": 0.8},
"d": {"b": 0.8},
"f": {"v": 0.8},
"g": {"j": 0.5},
"h": {"b": 0.9},
"j": {"g": 0.5, "i": 0.9},
"k": {"c": 0.5, "q": 0.1, "x": 0.5},
"l": {"i": 0.9},
"m": {"n": 0.8},
"n": {"m": 0.8, "r": 0.9},
"p": {"q": 0.9},
"q": {"c": 0.5, "k": 0.1, "p": 0.9},
"r": {"n": 0.9, "j": 0.9},
"s": {"c": 0.5, "ç": 0.1, "x": 0.5, "z": 0.5},
"t": {"d": 0.9},
"v": {"f": 0.8, "w": 0.1},
"w": {"v": 0.1},
"x": {"c": 0.5, "k": 0.5, "q": 0.5, "s": 0.5},
"z": {"s": 0.5}
},
distanceBetweenChars: function (c1, c2) {
if (c1 == c2) {
return 0;
}
if (this.oDistanceBetweenChars.hasOwnProperty(c1) && this.oDistanceBetweenChars[c1].hasOwnProperty(c2)) {
return this.oDistanceBetweenChars[c1][c2];
}
return 1;
},
_xTransCharsForSimplification: new Map([
|
| ︙ |
Modified graphspell-js/str_transform.js from [d8c3ab3e0d] to [b9cc7d5a41].
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 | + + + + + + |
// STRING TRANSFORMATION
/* jshint esversion:6, -W097 */
/* jslint esversion:6 */
/* global exports, console */
"use strict";
if (typeof(process) !== 'undefined') {
var char_player = require("./char_player.js");
} else if (typeof(require) !== 'undefined') {
var char_player = require("resource://grammalecte/graphspell/char_player.js");
}
// Note: 48 is the ASCII code for "0"
var str_transform = {
getNgrams: function (sWord, n=2) {
let lNgrams = [];
|
| ︙ | |||
53 54 55 56 57 58 59 | 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 | - + - - + + - + - + - + + - + - + |
table[i+1][j+1] = 0;
}
}
}
return longestCommonSubstring;
},
|
| ︙ |
Modified graphspell/char_player.py from [d15991830e] to [72875ebb27].
| ︙ | |||
11 12 13 14 15 16 17 18 19 | 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 | + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - - - - - + + + + + |
'ſ': 's', 'ffi': 'ffi', 'ffl': 'ffl', 'ff': 'ff', 'ſt': 'ft', 'fi': 'fi', 'fl': 'fl', 'st': 'st'
})
def spellingNormalization (sWord):
    "translate <sWord> with _xTransCharsForSpelling (ligature removal), then normalize it to NFC"
    sTranslated = sWord.translate(_xTransCharsForSpelling)
    return unicodedata.normalize("NFC", sTranslated)
# Lookup table of distances between pairs of characters.
# Outer key: first character; inner key: second character; value: distance.
# distanceBetweenChars() reads this table: it returns 0 for identical
# characters, the listed value when the pair is present, and 1 otherwise.
# All listed values lie between 0.1 and 0.9.
# An empty dict (e.g. "a", "o", "u") means no pair is listed for that character.
# NOTE(review): the table is not symmetric -- e.g. "i"->"y" is 0.2 but "y"->"i"
# is 0.3, "j"->"i" is listed but "i"->"j" is not, and "ç" only appears as a
# second character. Confirm whether this is intentional.
dDistanceBetweenChars = {
"a": {},
"e": {"é": 0.5},
"é": {"e": 0.5},
"i": {"y": 0.2},
"o": {},
"u": {},
"y": {"i": 0.3},
"b": {"d": 0.8, "h": 0.9},
"c": {"ç": 0.1, "k": 0.5, "q": 0.5, "s": 0.5, "x": 0.5, "z": 0.8},
"d": {"b": 0.8},
"f": {"v": 0.8},
"g": {"j": 0.5},
"h": {"b": 0.9},
"j": {"g": 0.5, "i": 0.9},
"k": {"c": 0.5, "q": 0.1, "x": 0.5},
"l": {"i": 0.9},
"m": {"n": 0.8},
"n": {"m": 0.8, "r": 0.9},
"p": {"q": 0.9},
"q": {"c": 0.5, "k": 0.1, "p": 0.9},
"r": {"n": 0.9, "j": 0.9},
"s": {"c": 0.5, "ç": 0.1, "x": 0.5, "z": 0.5},
"t": {"d": 0.9},
"v": {"f": 0.8, "w": 0.1},
"w": {"v": 0.1},
"x": {"c": 0.5, "k": 0.5, "q": 0.5, "s": 0.5},
"z": {"s": 0.5}
}
def distanceBetweenChars (c1, c2):
    "return 0 if <c1> equals <c2>, the value listed in dDistanceBetweenChars for the pair (c1, c2) if any, else 1"
    if c1 == c2:
        return 0
    # An unknown <c1> falls back to an empty mapping, so the inner lookup yields 1.
    return dDistanceBetweenChars.get(c1, {}).get(c2, 1)
_xTransCharsForSimplification = str.maketrans({
|
| ︙ |
Modified graphspell/str_transform.py from [7dcad03ac9] to [452d0bdcef].
1 2 3 4 5 6 7 8 9 10 11 12 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 | + + + |
"""
Operations on strings:
- calculate distance between two strings
- transform strings with transformation codes
"""
from .char_player import distanceBetweenChars
#### Ngrams
def getNgrams (sWord, n=2):
    "return the list of all slices of <sWord> of length <n>, from left to right"
    lNgrams = []
    nStarts = len(sWord) - n + 1    # number of start positions; <= 0 yields an empty list
    for iStart in range(nStarts):
        lNgrams.append(sWord[iStart:iStart+n])
    return lNgrams
|
| ︙ | |||
40 41 42 43 44 45 46 | 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 | - + + - + |
nLen2 = len(s2)
for i in range(-1, nLen1+1):
d[i, -1] = i + 1
for j in range(-1, nLen2+1):
d[-1, j] = j + 1
for i in range(nLen1):
for j in range(nLen2):
|
| ︙ |