Overview
| Comment: | [graphspell] experiment: Damerau-Levenshtein distance modified by a function that calculates the distance between characters |
|---|---|
| Downloads: | Tarball | ZIP archive | SQL archive |
| Timelines: | family | ancestors | descendants | both | trunk | graphspell |
| Files: | files | file ages | folders |
| SHA3-256: | 90478790e5f24f1080fba675fedafa45 |
| User & Date: | olr on 2020-05-03 09:12:35 |
| Other Links: | manifest | tags |
Context
2020-05-03
| 09:21 | [graphspell] remove useless code | check-in: 2e960183fa | user: olr | tags: trunk, graphspell |
| 09:12 | [graphspell] experiment: Damerau-Levenshtein distance modified by a function that calculates the distance between characters | check-in: 90478790e5 | user: olr | tags: trunk, graphspell |
| 06:32 | [fr] adjustments | check-in: 7108ce2dc1 | user: olr | tags: trunk, fr |
Changes
Modified graphspell-js/char_player.js from [60a9fdaff6] to [a12e814d2d].
| ︙ | ︙ |
spellingNormalization: function (sWord) {
let sNewWord = "";
for (let c of sWord) {
sNewWord += this._xTransCharsForSpelling.gl_get(c, c);
}
return sNewWord.normalize("NFC");
},
oDistanceBetweenChars: {
"a": {},
"e": {"é": 0.5},
"é": {"e": 0.5},
"i": {"y": 0.2},
"o": {},
"u": {},
"y": {"i": 0.3},
"b": {"d": 0.8, "h": 0.9},
"c": {"ç": 0.1, "k": 0.5, "q": 0.5, "s": 0.5, "x": 0.5, "z": 0.8},
"d": {"b": 0.8},
"f": {"v": 0.8},
"g": {"j": 0.5},
"h": {"b": 0.9},
"j": {"g": 0.5, "i": 0.9},
"k": {"c": 0.5, "q": 0.1, "x": 0.5},
"l": {"i": 0.9},
"m": {"n": 0.8},
"n": {"m": 0.8, "r": 0.9},
"p": {"q": 0.9},
"q": {"c": 0.5, "k": 0.1, "p": 0.9},
"r": {"n": 0.9, "j": 0.9},
"s": {"c": 0.5, "ç": 0.1, "x": 0.5, "z": 0.5},
"t": {"d": 0.9},
"v": {"f": 0.8, "w": 0.1},
"w": {"v": 0.1},
"x": {"c": 0.5, "k": 0.5, "q": 0.5, "s": 0.5},
"z": {"s": 0.5}
},
distanceBetweenChars: function (c1, c2) {
if (c1 == c2) {
return 0;
}
if (this.oDistanceBetweenChars.hasOwnProperty(c1) && this.oDistanceBetweenChars[c1].hasOwnProperty(c2)) {
return this.oDistanceBetweenChars[c1][c2];
}
return 1;
},
_xTransCharsForSimplification: new Map([
['à', 'a'], ['é', 'é'], ['î', 'i'], ['ô', 'o'], ['û', 'u'], ['ÿ', 'y'],
['â', 'a'], ['è', 'é'], ['ï', 'i'], ['ö', 'o'], ['ù', 'u'], ['ŷ', 'y'],
['ä', 'a'], ['ê', 'é'], ['í', 'i'], ['ó', 'o'], ['ü', 'u'], ['ý', 'y'],
['á', 'a'], ['ë', 'é'], ['ì', 'i'], ['ò', 'o'], ['ú', 'u'], ['ỳ', 'y'],
['ā', 'a'], ['ē', 'é'], ['ī', 'i'], ['ō', 'o'], ['ū', 'u'], ['ȳ', 'y'],
['ç', 'c'], ['ñ', 'n'], ['k', 'q'], ['w', 'v'],
['œ', 'oe'], ['æ', 'ae'],
['ſ', 's'], ['ﬃ', 'ffi'], ['ﬄ', 'ffl'], ['ﬀ', 'ff'], ['ﬅ', 'ft'], ['ﬁ', 'fi'], ['ﬂ', 'fl'], ['ﬆ', 'st'],
["⁰", "0"], ["¹", "1"], ["²", "2"], ["³", "3"], ["⁴", "4"], ["⁵", "5"], ["⁶", "6"], ["⁷", "7"], ["⁸", "8"], ["⁹", "9"],
["₀", "0"], ["₁", "1"], ["₂", "2"], ["₃", "3"], ["₄", "4"], ["₅", "5"], ["₆", "6"], ["₇", "7"], ["₈", "8"], ["₉", "9"]
]),
| ︙ | ︙ |
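The new oDistanceBetweenChars table encodes how easily two characters are confused: listed pairs cost less than a full edit (accents, near-homophones, look-alike letters), identical characters cost 0, and any pair absent from the table falls back to the full cost of 1. The table is sparse and not symmetric. A minimal sketch of the expected lookups, using the identical Python port added further down in this check-in (this assumes the graphspell package is importable):

from graphspell.char_player import distanceBetweenChars

print(distanceBetweenChars("a", "a"))  # 0: identical characters are free
print(distanceBetweenChars("k", "q"))  # 0.1: near-homophones are almost free
print(distanceBetweenChars("i", "y"))  # 0.2
print(distanceBetweenChars("y", "i"))  # 0.3: the table is not symmetric
print(distanceBetweenChars("j", "r"))  # 1: "r" lists "j" at 0.9, but "j" has no entry for "r"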
Modified graphspell-js/str_transform.js from [d8c3ab3e0d] to [b9cc7d5a41].
// STRING TRANSFORMATION
/* jshint esversion:6, -W097 */
/* jslint esversion:6 */
/* global exports, console */
"use strict";
if (typeof(process) !== 'undefined') {
var char_player = require("./char_player.js");
} else if (typeof(require) !== 'undefined') {
var char_player = require("resource://grammalecte/graphspell/char_player.js");
}
// Note: 48 is the ASCII code for "0"
var str_transform = {
getNgrams: function (sWord, n=2) {
let lNgrams = [];
| ︙ | ︙ |
table[i+1][j+1] = 0;
}
}
}
return longestCommonSubstring;
},
distanceDamerauLevenshtein: function (s1, s2) {
// Damerau-Levenshtein distance between <s1> and <s2>
// https://fr.wikipedia.org/wiki/Distance_de_Damerau-Levenshtein
try {
let nLen1 = s1.length;
let nLen2 = s2.length;
let matrix = [];
for (let i = 0; i <= nLen1+1; i++) {
matrix[i] = new Array(nLen2 + 2);
}
for (let i = 0; i <= nLen1+1; i++) {
matrix[i][0] = i;
}
for (let j = 0; j <= nLen2+1; j++) {
matrix[0][j] = j;
}
for (let i = 1; i <= nLen1; i++) {
for (let j = 1; j <= nLen2; j++) {
//let nCost = (s1[i-1] === s2[j-1]) ? 0 : 1;
let nCost = char_player.distanceBetweenChars(s1[i-1], s2[j-1]);
matrix[i][j] = Math.min(
matrix[i-1][j] + 1, // Deletion
matrix[i][j-1] + 1, // Insertion
matrix[i-1][j-1] + nCost // Substitution
);
if (i > 1 && j > 1 && s1[i-1] == s2[j-2] && s1[i-2] == s2[j-1]) { // strings are 0-indexed, the matrix is 1-indexed
matrix[i][j] = Math.min(matrix[i][j], matrix[i-2][j-2] + nCost); // Transposition
}
}
}
return Math.floor(matrix[nLen1][nLen2]);
}
catch (e) {
console.error(e);
}
},
distanceDamerauLevenshtein2: function (s1, s2) {
// Damerau-Levenshtein distance between <s1> and <s2>
// https://fr.wikipedia.org/wiki/Distance_de_Damerau-Levenshtein
try {
let nLen1 = s1.length;
let nLen2 = s2.length;
let INF = nLen1 + nLen2;
let matrix = [];
| ︙ | ︙ |
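Plugging distanceBetweenChars into the substitution cost means that a word pair differing only by a confusable character accumulates a fractional distance instead of a whole edit. Note that both the JavaScript and the Python versions truncate the result on return (Math.floor here, int() below), so a single cheap substitution can collapse to a distance of 0. A quick sketch against the Python port (assuming the graphspell package is importable and that the Python function carries the same name as the JavaScript one):

from graphspell.str_transform import distanceDamerauLevenshtein

# one substitution, "k" for "c", costs 0.5; int() then truncates it to 0
print(distanceDamerauLevenshtein("kap", "cap"))   # 0

# "c" for "d" is not in the table, so it costs the full 1
print(distanceDamerauLevenshtein("abc", "abd"))   # 1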
Modified graphspell/char_player.py from [d15991830e] to [72875ebb27].
| ︙ | ︙ |
'ſ': 's', 'ﬃ': 'ffi', 'ﬄ': 'ffl', 'ﬀ': 'ff', 'ﬅ': 'ft', 'ﬁ': 'fi', 'ﬂ': 'fl', 'ﬆ': 'st'
})
def spellingNormalization (sWord):
"nomalization NFC and removing ligatures"
return unicodedata.normalize("NFC", sWord.translate(_xTransCharsForSpelling))
dDistanceBetweenChars = {
"a": {},
"e": {"é": 0.5},
"é": {"e": 0.5},
"i": {"y": 0.2},
"o": {},
"u": {},
"y": {"i": 0.3},
"b": {"d": 0.8, "h": 0.9},
"c": {"ç": 0.1, "k": 0.5, "q": 0.5, "s": 0.5, "x": 0.5, "z": 0.8},
"d": {"b": 0.8},
"f": {"v": 0.8},
"g": {"j": 0.5},
"h": {"b": 0.9},
"j": {"g": 0.5, "i": 0.9},
"k": {"c": 0.5, "q": 0.1, "x": 0.5},
"l": {"i": 0.9},
"m": {"n": 0.8},
"n": {"m": 0.8, "r": 0.9},
"p": {"q": 0.9},
"q": {"c": 0.5, "k": 0.1, "p": 0.9},
"r": {"n": 0.9, "j": 0.9},
"s": {"c": 0.5, "ç": 0.1, "x": 0.5, "z": 0.5},
"t": {"d": 0.9},
"v": {"f": 0.8, "w": 0.1},
"w": {"v": 0.1},
"x": {"c": 0.5, "k": 0.5, "q": 0.5, "s": 0.5},
"z": {"s": 0.5}
}
def distanceBetweenChars (c1, c2):
if c1 == c2:
return 0
if c1 not in dDistanceBetweenChars:
return 1
return dDistanceBetweenChars[c1].get(c2, 1)
_xTransCharsForSimplification = str.maketrans({
'à': 'a', 'é': 'é', 'î': 'i', 'ô': 'o', 'û': 'u', 'ÿ': 'y',
'â': 'a', 'è': 'é', 'ï': 'i', 'ö': 'o', 'ù': 'u', 'ŷ': 'y',
'ä': 'a', 'ê': 'é', 'í': 'i', 'ó': 'o', 'ü': 'u', 'ý': 'y',
'á': 'a', 'ë': 'é', 'ì': 'i', 'ò': 'o', 'ú': 'u', 'ỳ': 'y',
'ā': 'a', 'ē': 'é', 'ī': 'i', 'ō': 'o', 'ū': 'u', 'ȳ': 'y',
'ç': 'c', 'ñ': 'n', 'k': 'q', 'w': 'v',
'œ': 'oe', 'æ': 'ae',
'ſ': 's', 'ﬃ': 'ffi', 'ﬄ': 'ffl', 'ﬀ': 'ff', 'ﬅ': 'ft', 'ﬁ': 'fi', 'ﬂ': 'fl', 'ﬆ': 'st',
"⁰": "0", "¹": "1", "²": "2", "³": "3", "⁴": "4", "⁵": "5", "⁶": "6", "⁷": "7", "⁸": "8", "⁹": "9",
"₀": "0", "₁": "1", "₂": "2", "₃": "3", "₄": "4", "₅": "5", "₆": "6", "₇": "7", "₈": "8", "₉": "9"
})
| ︙ | ︙ |
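A practical detail: dDistanceBetweenChars only lists lowercase keys, so uppercase pairs fall back to the default cost of 1; words are presumably lowercased (and NFC-normalized by spellingNormalization) before comparison. Likewise, only the pairs actually listed are cheap; for instance "è" gets no discount, which fits with _xTransCharsForSimplification folding è, ê and ë to "é" rather than to "e". A short illustration (same importability assumption as above):

from graphspell.char_player import distanceBetweenChars

print(distanceBetweenChars("e", "é"))  # 0.5: listed accent confusion
print(distanceBetweenChars("e", "è"))  # 1: "è" is not listed, only "é" is
print(distanceBetweenChars("E", "É"))  # 1: uppercase keys are not in the table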
Modified graphspell/str_transform.py from [7dcad03ac9] to [452d0bdcef].
"""
Operations on strings:
- calculate distance between two strings
- transform strings with transformation codes
"""
from .char_player import distanceBetweenChars
#### Ngrams
def getNgrams (sWord, n=2):
"return a list of Ngrams strings"
return [ sWord[i:i+n] for i in range(len(sWord)-n+1) ]
| ︙ | ︙ |
nLen2 = len(s2)
for i in range(-1, nLen1+1):
d[i, -1] = i + 1
for j in range(-1, nLen2+1):
d[-1, j] = j + 1
for i in range(nLen1):
for j in range(nLen2):
#nCost = 0 if s1[i] == s2[j] else 1
nCost = distanceBetweenChars(s1[i], s2[j])
d[i, j] = min(
d[i-1, j] + 1, # Deletion
d[i, j-1] + 1, # Insertion
d[i-1, j-1] + nCost, # Substitution
)
if i and j and s1[i] == s2[j-1] and s1[i-1] == s2[j]:
d[i, j] = min(d[i, j], d[i-2, j-2] + nCost) # Transposition
return int(d[nLen1-1, nLen2-1])
def distanceSift4 (s1, s2, nMaxOffset=5):
"implementation of general Sift4."
# https://siderite.blogspot.com/2014/11/super-fast-and-accurate-string-distance.html
if not s1:
return len(s2)
| ︙ | ︙ |
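Since the final int() (Math.floor in the JavaScript version) discards the fractional part, callers never see the accumulated sub-unit costs: two cheap typos summing to 0.7 still come back as distance 0. If the raw weighted value were wanted, for instance for finer-grained ranking of suggestions, a variant without the truncation could look like the following sketch. It is illustrative only, not part of this check-in; it omits the transposition case for brevity, uses a hypothetical name weightedLevenshtein, and assumes the graphspell package is importable:

from graphspell.char_player import distanceBetweenChars

def weightedLevenshtein (sWord1, sWord2):
    "illustrative sketch: weighted Levenshtein distance, without the final truncation"
    nLen1 = len(sWord1)
    nLen2 = len(sWord2)
    aPrev = list(range(nLen2 + 1))                  # distances for the empty prefix of sWord1
    for i in range(1, nLen1 + 1):
        aCur = [i] + [0] * nLen2
        for j in range(1, nLen2 + 1):
            nCost = distanceBetweenChars(sWord1[i-1], sWord2[j-1])
            aCur[j] = min(aPrev[j] + 1,             # deletion
                          aCur[j-1] + 1,            # insertion
                          aPrev[j-1] + nCost)       # weighted substitution
        aPrev = aCur
    return aPrev[nLen2]

print(weightedLevenshtein("kap", "cap"))      # 0.5
print(weightedLevenshtein("sirop", "cyrop"))  # 0.7  (0.5 for s/c + 0.2 for i/y)

Here "sirop" and "cyrop" differ by two plausible typos, yet the committed function would report a distance of 0 for them.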