Grammalecte  Diff

Differences From Artifact [0a316c953c]:

To Artifact [8c9fd715c3]:



1
2



3
4
5
6
7
8
9
10
11
12

13
14
15
16
17
18
19
20
21
22
23
24

25
26
27
28
29
30
31
1


2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26

27
28
29
30
31
32
33
34
+
-
-
+
+
+










+











-
+







"""
List of similar characters,
useful for the suggestion mechanism.
"""

import re
import unicodedata


_xTransCharsForSpelling = str.maketrans({
    'ſ': 's',  'ffi': 'ffi',  'ffl': 'ffl',  'ff': 'ff',  'ſt': 'ft',  'fi': 'fi',  'fl': 'fl',  'st': 'st'
})

def spellingNormalization (sWord):
    "return <sWord> translated with the spelling table, then normalized to NFC"
    # translation first, Unicode composition second (same order as before)
    sTranslated = sWord.translate(_xTransCharsForSpelling)
    return unicodedata.normalize("NFC", sTranslated)


_xTransCharsForSimplification = str.maketrans({
    'à': 'a',  'é': 'e',  'î': 'i',  'ô': 'o',  'û': 'u',  'ÿ': 'i',  "y": "i",
    'â': 'a',  'è': 'e',  'ï': 'i',  'ö': 'o',  'ù': 'u',  'ŷ': 'i',
    'ä': 'a',  'ê': 'e',  'í': 'i',  'ó': 'o',  'ü': 'u',  'ý': 'i',
    'á': 'a',  'ë': 'e',  'ì': 'i',  'ò': 'o',  'ú': 'u',  'ỳ': 'i',
    'ā': 'a',  'ē': 'e',  'ī': 'i',  'ō': 'o',  'ū': 'u',  'ȳ': 'i',
    'ç': 'c',  'ñ': 'n',  'k': 'q',  'w': 'v',
    'œ': 'oe',  'æ': 'ae',
    'ſ': 's',  'ffi': 'ffi',  'ffl': 'ffl',  'ff': 'ff',  'ſt': 'ft',  'fi': 'fi',  'fl': 'fl',  'st': 'st', 
    'ſ': 's',  'ffi': 'ffi',  'ffl': 'ffl',  'ff': 'ff',  'ſt': 'ft',  'fi': 'fi',  'fl': 'fl',  'st': 'st',
})

def simplifyWord (sWord):
    "word simplication before calculating distance between words"
    sWord = sWord.lower().translate(_xTransCharsForSimplification)
    sNewWord = ""
    for i, c in enumerate(sWord, 1):
90
91
92
93
94
95
96
97

98
99
100
101
102
103
104
93
94
95
96
97
98
99

100
101
102
103
104
105
106
107







-
+







    "Ë": "EeÉéÈèÊêËëĒēŒœ",

    "f": "fF",
    "F": "Ff",

    "g": "gGjJĵĴ",
    "G": "GgJjĴĵ",
    

    "h": "hH",
    "H": "Hh",

    "i": "iIîÎïÏyYíÍìÌīĪÿŸ",
    "I": "IiÎîÏïYyÍíÌìĪīŸÿ",
    "î": "iIîÎïÏyYíÍìÌīĪÿŸ",
    "Î": "IiÎîÏïYyÍíÌìĪīŸÿ",
235
236
237
238
239
240
241

242
243
244
245
246
247
248
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252







+







    "X": ("CC", "CT", "XX"),
    "z": ("ss", "zh"),
    "Z": ("SS", "ZH"),
}


def get1toXReplacement (cPrev, cCur, cNext):
    "return the replacements listed in d1toX for <cCur>, or an empty tuple"
    # anything that is not in aConsonant goes straight to the table lookup
    if cCur not in aConsonant:
        return d1toX.get(cCur, ())
    # <cCur> is in aConsonant: no replacement when a neighbour is in it too
    if cPrev in aConsonant  or  cNext in aConsonant:
        return ()
    return d1toX.get(cCur, ())


d2toX = {
    "am": ("an", "en", "em"),