Overview
| Comment: | [graphspell] char_player: another simplification method for sound “é” to avoid oversimplification |
|---|---|
| Downloads: | Tarball | ZIP archive | SQL archive |
| Timelines: | family | ancestors | descendants | both | trunk | graphspell |
| Files: | files | file ages | folders |
| SHA3-256: |
93a0b84b63436ae96aa4a9af492b8c0d |
| User & Date: | olr on 2018-11-23 16:05:11 |
| Other Links: | manifest | tags |
Context
|
2018-11-23
| ||
| 20:15 | [fr] ajustements et faux positifs check-in: 16642fccc3 user: olr tags: trunk, fr | |
| 16:05 | [graphspell] char_player: another simplification method for sound “é” to avoid oversimplification check-in: 93a0b84b63 user: olr tags: trunk, graphspell | |
| 15:33 | [graphspell] suggestions: split word at apostrophes and check each part check-in: 622060334c user: olr tags: trunk, graphspell | |
Changes
Modified graphspell-js/char_player.js from [98e99c3ff4] to [e1e81e7b96].
| ︙ | ︙ | |||
18 19 20 21 22 23 24 |
for (let c of sWord) {
sNewWord += this._xTransCharsForSpelling.gl_get(c, c);
}
return sNewWord.normalize("NFC");
},
_xTransCharsForSimplification: new Map([
| | | | | | | | 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 |
for (let c of sWord) {
sNewWord += this._xTransCharsForSpelling.gl_get(c, c);
}
return sNewWord.normalize("NFC");
},
_xTransCharsForSimplification: new Map([
['à', 'a'], ['é', 'é'], ['î', 'i'], ['ô', 'o'], ['û', 'u'], ['ÿ', 'i'], ['y', 'i'],
['â', 'a'], ['è', 'é'], ['ï', 'i'], ['ö', 'o'], ['ù', 'u'], ['ŷ', 'i'],
['ä', 'a'], ['ê', 'é'], ['í', 'i'], ['ó', 'o'], ['ü', 'u'], ['ý', 'i'],
['á', 'a'], ['ë', 'é'], ['ì', 'i'], ['ò', 'o'], ['ú', 'u'], ['ỳ', 'i'],
['ā', 'a'], ['ē', 'é'], ['ī', 'i'], ['ō', 'o'], ['ū', 'u'], ['ȳ', 'i'],
['ç', 'c'], ['ñ', 'n'], ['k', 'q'], ['w', 'v'],
['œ', 'oe'], ['æ', 'ae'],
['ſ', 's'], ['ffi', 'ffi'], ['ffl', 'ffl'], ['ff', 'ff'], ['ſt', 'ft'], ['fi', 'fi'], ['fl', 'fl'], ['st', 'st']
]),
simplifyWord: function (sWord) {
// word simplication before calculating distance between words
sWord = sWord.toLowerCase();
sWord = [...sWord].map(c => this._xTransCharsForSimplification.gl_get(c, c)).join('');
let sNewWord = "";
let i = 1;
for (let c of sWord) {
if (c == 'e' || c != sWord.slice(i, i+1)) { // exception for <e> to avoid confusion between crée / créai
sNewWord += c;
}
i++;
}
return sNewWord.replace(/eau/g, "o").replace(/au/g, "o").replace(/ai/g, "ê").replace(/ei/g, "ê").replace(/ph/g, "f");
},
aVowel: new Set("aáàâäāeéèêëēiíìîïīoóòôöōuúùûüūyýỳŷÿȳœæAÁÀÂÄĀEÉÈÊËĒIÍÌÎÏĪOÓÒÔÖŌUÚÙÛÜŪYÝỲŶŸȲŒÆ"),
aConsonant: new Set("bcçdfghjklmnñpqrstvwxzBCÇDFGHJKLMNÑPQRSTVWXZ"),
aDouble: new Set("bcdfjklmnprstzBCDFJKLMNPRSTZ"), // letters that may be used twice successively
|
| ︙ | ︙ |
Modified graphspell/char_player.py from [8c9fd715c3] to [a659c6f535].
| ︙ | ︙ | |||
13 14 15 16 17 18 19 |
def spellingNormalization (sWord):
"nomalization NFC and removing ligatures"
return unicodedata.normalize("NFC", sWord.translate(_xTransCharsForSpelling))
_xTransCharsForSimplification = str.maketrans({
| | | | | | | | 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 |
def spellingNormalization (sWord):
"nomalization NFC and removing ligatures"
return unicodedata.normalize("NFC", sWord.translate(_xTransCharsForSpelling))
_xTransCharsForSimplification = str.maketrans({
'à': 'a', 'é': 'é', 'î': 'i', 'ô': 'o', 'û': 'u', 'ÿ': 'i', "y": "i",
'â': 'a', 'è': 'é', 'ï': 'i', 'ö': 'o', 'ù': 'u', 'ŷ': 'i',
'ä': 'a', 'ê': 'é', 'í': 'i', 'ó': 'o', 'ü': 'u', 'ý': 'i',
'á': 'a', 'ë': 'é', 'ì': 'i', 'ò': 'o', 'ú': 'u', 'ỳ': 'i',
'ā': 'a', 'ē': 'é', 'ī': 'i', 'ō': 'o', 'ū': 'u', 'ȳ': 'i',
'ç': 'c', 'ñ': 'n', 'k': 'q', 'w': 'v',
'œ': 'oe', 'æ': 'ae',
'ſ': 's', 'ffi': 'ffi', 'ffl': 'ffl', 'ff': 'ff', 'ſt': 'ft', 'fi': 'fi', 'fl': 'fl', 'st': 'st',
})
def simplifyWord (sWord):
"word simplication before calculating distance between words"
sWord = sWord.lower().translate(_xTransCharsForSimplification)
sNewWord = ""
for i, c in enumerate(sWord, 1):
if c == 'e' or c != sWord[i:i+1]: # exception for <e> to avoid confusion between crée / créai
sNewWord += c
return sNewWord.replace("eau", "o").replace("au", "o").replace("ai", "ê").replace("ei", "ê").replace("ph", "f")
aVowel = set("aáàâäāeéèêëēiíìîïīoóòôöōuúùûüūyýỳŷÿȳœæAÁÀÂÄĀEÉÈÊËĒIÍÌÎÏĪOÓÒÔÖŌUÚÙÛÜŪYÝỲŶŸȲŒÆ")
aConsonant = set("bcçdfghjklmnñpqrstvwxzBCÇDFGHJKLMNÑPQRSTVWXZ")
aDouble = set("bcdfjklmnprstzBCDFJKLMNPRSTZ") # letters that may be used twice successively
|
| ︙ | ︙ |