Grammalecte  Diff

Differences From Artifact [97f69bd70d]:

To Artifact [b0152aab01]:


15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44


45
46
47
48
49
50

51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
15
16
17
18
19
20
21























22
23






24






















25
26
27
28
29
30
31







-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+
-
-
-
-
-
-
+
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-







})

def cleanWord (sWord):
    "word simplication before calculating distance between words"
    return sWord.lower().translate(_xTransChars).replace("eau", "o").replace("au", "o")


def distanceDamerauLevenshtein (s1, s2):
    "distance of Damerau-Levenshtein between <s1> and <s2>"
    # https://fr.wikipedia.org/wiki/Distance_de_Damerau-Levenshtein
    d = {}
    nLen1 = len(s1)
    nLen2 = len(s2)
    for i in range(-1, nLen1+1):
        d[i, -1] = i + 1
    for j in range(-1, nLen2+1):
        d[-1, j] = j + 1
    for i in range(nLen1):
        for j in range(nLen2):
            nCost = 0  if s1[i] == s2[j]  else 1
            d[i, j] = min(
                d[i-1, j]   + 1,        # Deletion
                d[i,   j-1] + 1,        # Insertion
                d[i-1, j-1] + nCost,    # Substitution
            )
            if i and j and s1[i] == s2[j-1] and s1[i-1] == s2[j]:
                d[i, j] = min(d[i, j], d[i-2, j-2] + nCost)     # Transposition
    return d[nLen1-1, nLen2-1]


aVowel = set("aáàâäāeéèêëēiíìîïīoóòôöōuúùûüūyýỳŷÿȳœæAÁÀÂÄĀEÉÈÊËĒIÍÌÎÏĪOÓÒÔÖŌUÚÙÛÜŪYÝỲŶŸȲŒÆ")
aConsonant = set("bcçdfghjklmnñpqrstvwxzBCÇDFGHJKLMNÑPQRSTVWXZ")
def showDistance (s1, s2):
    s1b = cleanWord(s1);
    s2b = cleanWord(s2);
    print(f"Distance: {s1} / {s2} = {distanceDamerauLevenshtein(s1, s2)}")
    print(f"Distance: {s1b} / {s2b} = {distanceDamerauLevenshtein(s1b, s2b)}")

aDouble = set("bcçdfjklmnprstzBCÇDFJKLMNPRSTZ")  # letters that may be used twice successively

# Method: Remove Useless Chars

_dVovels = {
    'a': '',  'e': '',  'i': '',  'o': '',  'u': '',  'y': '',
    'à': '',  'é': '',  'î': '',  'ô': '',  'û': '',  'ÿ': '',
    'â': '',  'è': '',  'ï': '',  'ö': '',  'ù': '',  'ŷ': '',
    'ä': '',  'ê': '',  'í': '',  'ó': '',  'ü': '',  'ý': '',
    'á': '',  'ë': '',  'ì': '',  'ò': '',  'ú': '',  'ỳ': '',
    'ā': '',  'ē': '',  'ī': '',  'ō': '',  'ū': '',  'ȳ': '',
    'h': '',  'œ': '',  'æ': ''
 }

_xTransVovels = str.maketrans(_dVovels)


aVovels = frozenset(_dVovels.keys())


def shrinkWord (sWord):
    "remove vovels and h"
    return sWord[0:1].replace("h", "") + sWord[1:].translate(_xTransVovels)


# Similar chars

d1to1 = {
    "1": "liîLIÎ",
    "2": "zZ",
201
202
203
204
205
206
207
208
209
210
211


212
213
214
215
216
217
218
219
220
221
222
223
224
153
154
155
156
157
158
159


160
161
162
163
164
165
166
167


168
169
170
171
172
173
174







-
-


+
+




-
-







d1toX = {
    "æ": ("ae",),
    "Æ": ("AE",),
    "b": ("bb",),
    "B": ("BB",),
    "c": ("cc", "ss", "qu", "ch"),
    "C": ("CC", "SS", "QU", "CH"),
    "ç": ("ss", "cc", "qh", "ch"),
    "Ç": ("SS", "CC", "QH", "CH"),
    "d": ("dd",),
    "D": ("DD",),
    "é": ("ai", "ei"),
    "É": ("AI", "EI"),
    "f": ("ff", "ph"),
    "F": ("FF", "PH"),
    "g": ("gu", "ge", "gg", "gh"),
    "G": ("GU", "GE", "GG", "GH"),
    "i": ("ii",),
    "I": ("II",),
    "j": ("jj", "dj"),
    "J": ("JJ", "DJ"),
    "k": ("qu", "ck", "ch", "cu", "kk", "kh"),
    "K": ("QU", "CK", "CH", "CU", "KK", "KH"),
    "l": ("ll",),
    "L": ("LL",),
    "m": ("mm", "mn"),
240
241
242
243
244
245
246







247
248
249
250
251
252
253
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210







+
+
+
+
+
+
+







    "t": ("tt", "th"),
    "T": ("TT", "TH"),
    "x": ("cc", "ct", "xx"),
    "X": ("CC", "CT", "XX"),
    "z": ("ss", "zh"),
    "Z": ("SS", "ZH"),
}


def get1toXReplacement (cPrev, cCur, cNext):
    if cCur in aConsonant  and  (cPrev in aConsonant  or  cNext in aConsonant):
        return ()
    return d1toX.get(cCur, ())


d2toX = {
    "an": ("en",),
    "AN": ("EN",),
    "au": ("eau", "o", "ô"),
    "AU": ("EAU", "O", "Ô"),
    "en": ("an",),