11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
|
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
|
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
-
-
-
-
-
-
+
+
+
+
+
+
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
|
'ſ': 's', 'ffi': 'ffi', 'ffl': 'ffl', 'ff': 'ff', 'ſt': 'ft', 'fi': 'fi', 'fl': 'fl', 'st': 'st'
})
def spellingNormalization (sWord):
"nomalization NFC and removing ligatures"
return unicodedata.normalize("NFC", sWord.translate(_xTransCharsForSpelling))
dDistanceBetweenChars = {
"a": {},
"e": {"é": 0.5},
"é": {"e": 0.5},
"i": {"y": 0.2},
"o": {},
"u": {},
"y": {"i": 0.3},
"b": {"d": 0.8, "h": 0.9},
"c": {"ç": 0.1, "k": 0.5, "q": 0.5, "s": 0.5, "x": 0.5, "z": 0.8},
"d": {"b": 0.8},
"f": {"v": 0.8},
"g": {"j": 0.5},
"h": {"b": 0.9},
"j": {"g": 0.5, "i": 0.9},
"k": {"c": 0.5, "q": 0.1, "x": 0.5},
"l": {"i": 0.9},
"m": {"n": 0.8},
"n": {"m": 0.8, "r": 0.9},
"p": {"q": 0.9},
"q": {"c": 0.5, "k": 0.1, "p": 0.9},
"r": {"n": 0.9, "j": 0.9},
"s": {"c": 0.5, "ç": 0.1, "x": 0.5, "z": 0.5},
"t": {"d": 0.9},
"v": {"f": 0.8, "w": 0.1},
"w": {"v": 0.1},
"x": {"c": 0.5, "k": 0.5, "q": 0.5, "s": 0.5},
"z": {"s": 0.5}
}
def distanceBetweenChars (c1, c2):
if c1 == c2:
return 0
if c1 not in dDistanceBetweenChars:
return 1
return dDistanceBetweenChars[c1].get(c2, 1)
_xTransCharsForSimplification = str.maketrans({
'à': 'a', 'é': 'é', 'î': 'i', 'ô': 'o', 'û': 'u', 'ÿ': 'i', "y": "i",
'â': 'a', 'è': 'é', 'ï': 'i', 'ö': 'o', 'ù': 'u', 'ŷ': 'i',
'ä': 'a', 'ê': 'é', 'í': 'i', 'ó': 'o', 'ü': 'u', 'ý': 'i',
'á': 'a', 'ë': 'é', 'ì': 'i', 'ò': 'o', 'ú': 'u', 'ỳ': 'i',
'ā': 'a', 'ē': 'é', 'ī': 'i', 'ō': 'o', 'ū': 'u', 'ȳ': 'i',
'ç': 'c', 'ñ': 'n', 'k': 'q', 'w': 'v',
'à': 'a', 'é': 'é', 'î': 'i', 'ô': 'o', 'û': 'u', 'ÿ': 'y',
'â': 'a', 'è': 'é', 'ï': 'i', 'ö': 'o', 'ù': 'u', 'ŷ': 'y',
'ä': 'a', 'ê': 'é', 'í': 'i', 'ó': 'o', 'ü': 'u', 'ý': 'y',
'á': 'a', 'ë': 'é', 'ì': 'i', 'ò': 'o', 'ú': 'u', 'ỳ': 'y',
'ā': 'a', 'ē': 'é', 'ī': 'i', 'ō': 'o', 'ū': 'u', 'ȳ': 'y',
'ç': 'c', 'ñ': 'n',
'œ': 'oe', 'æ': 'ae',
'ſ': 's', 'ffi': 'ffi', 'ffl': 'ffl', 'ff': 'ff', 'ſt': 'ft', 'fi': 'fi', 'fl': 'fl', 'st': 'st',
"⁰": "0", "¹": "1", "²": "2", "³": "3", "⁴": "4", "⁵": "5", "⁶": "6", "⁷": "7", "⁸": "8", "⁹": "9",
"₀": "0", "₁": "1", "₂": "2", "₃": "3", "₄": "4", "₅": "5", "₆": "6", "₇": "7", "₈": "8", "₉": "9"
})
def simplifyWord (sWord):
"word simplication before calculating distance between words"
sWord = sWord.lower().translate(_xTransCharsForSimplification)
sNewWord = ""
for i, c in enumerate(sWord, 1):
if c == 'e' or c != sWord[i:i+1]: # exception for <e> to avoid confusion between crée / créai
sNewWord += c
return sNewWord.replace("eau", "o").replace("au", "o").replace("ai", "ẽ").replace("ei", "ẽ").replace("ph", "f")
return sNewWord.replace("eau", "o").replace("au", "o").replace("ai", "é").replace("ei", "é").replace("ph", "f")
_xTransNumbersToExponent = str.maketrans({
"0": "⁰", "1": "¹", "2": "²", "3": "³", "4": "⁴", "5": "⁵", "6": "⁶", "7": "⁷", "8": "⁸", "9": "⁹"
})
def numbersToExponent (sWord):
"convert numeral chars to exponant chars"
return sWord.translate(_xTransNumbersToExponent)
aVowel = set("aáàâäāeéèêëēiíìîïīoóòôöōuúùûüūyýỳŷÿȳœæAÁÀÂÄĀEÉÈÊËĒIÍÌÎÏĪOÓÒÔÖŌUÚÙÛÜŪYÝỲŶŸȲŒÆ")
aConsonant = set("bcçdfghjklmnñpqrstvwxzBCÇDFGHJKLMNÑPQRSTVWXZ")
aDouble = set("bcdfjklmnprstzBCDFJKLMNPRSTZ") # letters that may be used twice successively
# Similar chars
d1to1 = {
"'": "'’", # U+0027: apostrophe droite
"’": "’", # U+2019: apostrophe typographique (sera utilisée par défaut)
"ʼ": "ʼ’", # U+02BC: Lettre modificative apostrophe
"‘": "‘’", # U+2018: guillemet-apostrophe culbuté
"‛": "‛’", # U+201B: guillemet-virgule supérieur culbuté
"´": "´’", # U+00B4: accent aigu
"`": "`’", # U+0060: accent grave
"′": "′’", # U+2032: prime
"‵": "‵’", # U+2035: prime réfléchi
"՚": "՚’", # U+055A: apostrophe arménienne
"ꞌ": "ꞌ’", # U+A78C: latin minuscule saltillo
"Ꞌ": "Ꞌ’", # U+A78B: latin majuscule saltillo
"1": "1₁liîLIÎ",
"2": "2₂zZ",
"3": "3₃eéèêEÉÈÊ",
"4": "4₄aàâAÀÂ",
"5": "5₅sgSG",
"6": "6₆bdgBDG",
"7": "7₇ltLT",
|