Overview
Comment: | [graphspell] char_player: another simplification method for sound “é” to avoid oversimplification |
---|---|
Downloads: | Tarball | ZIP archive | SQL archive |
Timelines: | family | ancestors | descendants | both | trunk | graphspell |
Files: | files | file ages | folders |
SHA3-256: |
93a0b84b63436ae96aa4a9af492b8c0d |
User & Date: | olr on 2018-11-23 16:05:11 |
Other Links: | manifest | tags |
Context
2018-11-23
| ||
20:15 | [fr] ajustements et faux positifs check-in: 16642fccc3 user: olr tags: trunk, fr | |
16:05 | [graphspell] char_player: another simplification method for sound “é” to avoid oversimplification check-in: 93a0b84b63 user: olr tags: trunk, graphspell | |
15:33 | [graphspell] suggestions: split word at apostrophes and check each part check-in: 622060334c user: olr tags: trunk, graphspell | |
Changes
Modified graphspell-js/char_player.js from [98e99c3ff4] to [e1e81e7b96].
︙ | ︙ | |||
18 19 20 21 22 23 24 | for (let c of sWord) { sNewWord += this._xTransCharsForSpelling.gl_get(c, c); } return sNewWord.normalize("NFC"); }, _xTransCharsForSimplification: new Map([ | | | | | | | | 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 | for (let c of sWord) { sNewWord += this._xTransCharsForSpelling.gl_get(c, c); } return sNewWord.normalize("NFC"); }, _xTransCharsForSimplification: new Map([ ['à', 'a'], ['é', 'é'], ['î', 'i'], ['ô', 'o'], ['û', 'u'], ['ÿ', 'i'], ['y', 'i'], ['â', 'a'], ['è', 'é'], ['ï', 'i'], ['ö', 'o'], ['ù', 'u'], ['ŷ', 'i'], ['ä', 'a'], ['ê', 'é'], ['í', 'i'], ['ó', 'o'], ['ü', 'u'], ['ý', 'i'], ['á', 'a'], ['ë', 'é'], ['ì', 'i'], ['ò', 'o'], ['ú', 'u'], ['ỳ', 'i'], ['ā', 'a'], ['ē', 'é'], ['ī', 'i'], ['ō', 'o'], ['ū', 'u'], ['ȳ', 'i'], ['ç', 'c'], ['ñ', 'n'], ['k', 'q'], ['w', 'v'], ['œ', 'oe'], ['æ', 'ae'], ['ſ', 's'], ['ffi', 'ffi'], ['ffl', 'ffl'], ['ff', 'ff'], ['ſt', 'ft'], ['fi', 'fi'], ['fl', 'fl'], ['st', 'st'] ]), simplifyWord: function (sWord) { // word simplication before calculating distance between words sWord = sWord.toLowerCase(); sWord = [...sWord].map(c => this._xTransCharsForSimplification.gl_get(c, c)).join(''); let sNewWord = ""; let i = 1; for (let c of sWord) { if (c == 'e' || c != sWord.slice(i, i+1)) { // exception for <e> to avoid confusion between crée / créai sNewWord += c; } i++; } return sNewWord.replace(/eau/g, "o").replace(/au/g, "o").replace(/ai/g, "ê").replace(/ei/g, "ê").replace(/ph/g, "f"); }, aVowel: new Set("aáàâäāeéèêëēiíìîïīoóòôöōuúùûüūyýỳŷÿȳœæAÁÀÂÄĀEÉÈÊËĒIÍÌÎÏĪOÓÒÔÖŌUÚÙÛÜŪYÝỲŶŸȲŒÆ"), aConsonant: new Set("bcçdfghjklmnñpqrstvwxzBCÇDFGHJKLMNÑPQRSTVWXZ"), aDouble: new Set("bcdfjklmnprstzBCDFJKLMNPRSTZ"), // letters that may be used twice successively |
︙ | ︙ |
Modified graphspell/char_player.py from [8c9fd715c3] to [a659c6f535].
︙ | ︙ | |||
13 14 15 16 17 18 19 | def spellingNormalization (sWord): "nomalization NFC and removing ligatures" return unicodedata.normalize("NFC", sWord.translate(_xTransCharsForSpelling)) _xTransCharsForSimplification = str.maketrans({ | | | | | | | | 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 | def spellingNormalization (sWord): "nomalization NFC and removing ligatures" return unicodedata.normalize("NFC", sWord.translate(_xTransCharsForSpelling)) _xTransCharsForSimplification = str.maketrans({ 'à': 'a', 'é': 'é', 'î': 'i', 'ô': 'o', 'û': 'u', 'ÿ': 'i', "y": "i", 'â': 'a', 'è': 'é', 'ï': 'i', 'ö': 'o', 'ù': 'u', 'ŷ': 'i', 'ä': 'a', 'ê': 'é', 'í': 'i', 'ó': 'o', 'ü': 'u', 'ý': 'i', 'á': 'a', 'ë': 'é', 'ì': 'i', 'ò': 'o', 'ú': 'u', 'ỳ': 'i', 'ā': 'a', 'ē': 'é', 'ī': 'i', 'ō': 'o', 'ū': 'u', 'ȳ': 'i', 'ç': 'c', 'ñ': 'n', 'k': 'q', 'w': 'v', 'œ': 'oe', 'æ': 'ae', 'ſ': 's', 'ffi': 'ffi', 'ffl': 'ffl', 'ff': 'ff', 'ſt': 'ft', 'fi': 'fi', 'fl': 'fl', 'st': 'st', }) def simplifyWord (sWord): "word simplication before calculating distance between words" sWord = sWord.lower().translate(_xTransCharsForSimplification) sNewWord = "" for i, c in enumerate(sWord, 1): if c == 'e' or c != sWord[i:i+1]: # exception for <e> to avoid confusion between crée / créai sNewWord += c return sNewWord.replace("eau", "o").replace("au", "o").replace("ai", "ê").replace("ei", "ê").replace("ph", "f") aVowel = set("aáàâäāeéèêëēiíìîïīoóòôöōuúùûüūyýỳŷÿȳœæAÁÀÂÄĀEÉÈÊËĒIÍÌÎÏĪOÓÒÔÖŌUÚÙÛÜŪYÝỲŶŸȲŒÆ") aConsonant = set("bcçdfghjklmnñpqrstvwxzBCÇDFGHJKLMNÑPQRSTVWXZ") aDouble = set("bcdfjklmnprstzBCDFJKLMNPRSTZ") # letters that may be used twice successively |
︙ | ︙ |