Grammalecte  Diff

Differences From Artifact [d15991830e]:

To Artifact [fa338bf2f3]:


11
12
13
14
15
16
17
18







































19
20
21
22
23
24
25






26
27
28
29
30
31
32
33
34
35
36
37
38
39

40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58













59
60
61
62
63
64
65
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58






59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77

78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117








+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+

-
-
-
-
-
-
+
+
+
+
+
+













-
+



















+
+
+
+
+
+
+
+
+
+
+
+
+







    'ſ': 's',  'ffi': 'ffi',  'ffl': 'ffl',  'ff': 'ff',  'ſt': 'ft',  'fi': 'fi',  'fl': 'fl',  'st': 'st'
})

def spellingNormalization (sWord):
    "translate <sWord> through the spelling table (ligature replacement), then normalize to NFC"
    sTranslated = sWord.translate(_xTransCharsForSpelling)
    return unicodedata.normalize("NFC", sTranslated)


# Substitution cost between two characters, read as dDistanceBetweenChars[c1][c2].
# distanceBetweenChars() returns 0 for identical characters and falls back to 1
# for any pair that is not listed here, so every listed value is a discount (< 1).
# NOTE(review): the table is directional and not symmetric, e.g. "i"→"y" is 0.2
# while "y"→"i" is 0.3, and "t"→"d" is listed but "d"→"t" is not — confirm this
# is intentional before "fixing" it.
dDistanceBetweenChars = {
    # vowels
    "a": {},
    "e": {"é": 0.5},
    "é": {"e": 0.5},
    "i": {"y": 0.2},
    "o": {},
    "u": {},
    "y": {"i": 0.3},
    # consonants
    "b": {"d": 0.8, "h": 0.9},
    "c": {"ç": 0.1, "k": 0.5, "q": 0.5, "s": 0.5, "x": 0.5, "z": 0.8},
    "d": {"b": 0.8},
    "f": {"v": 0.8},
    "g": {"j": 0.5},
    "h": {"b": 0.9},
    "j": {"g": 0.5, "i": 0.9},
    "k": {"c": 0.5, "q": 0.1, "x": 0.5},
    "l": {"i": 0.9},
    "m": {"n": 0.8},
    "n": {"m": 0.8, "r": 0.9},
    "p": {"q": 0.9},
    "q": {"c": 0.5, "k": 0.1, "p": 0.9},
    "r": {"n": 0.9, "j": 0.9},
    "s": {"c": 0.5, "ç": 0.1, "x": 0.5, "z": 0.5},
    "t": {"d": 0.9},
    "v": {"f": 0.8, "w": 0.1},
    "w": {"v": 0.1},
    "x": {"c": 0.5, "k": 0.5, "q": 0.5, "s": 0.5},
    "z": {"s": 0.5}
}


def distanceBetweenChars (c1, c2):
    "return 0 for identical chars, the cost listed in dDistanceBetweenChars[c1] for <c2>, otherwise 1"
    if c1 == c2:
        return 0
    # An unknown <c1> behaves like an entry with no neighbours: every <c2> costs 1.
    return dDistanceBetweenChars.get(c1, {}).get(c2, 1)


# Translation table applied by simplifyWord() after lower-casing:
#   - accented vowels are folded to their base letter (the whole e-family becomes 'é'),
#   - accented y stays 'y', 'ç' becomes 'c', 'ñ' becomes 'n',
#   - 'œ' / 'æ' and the typographic ligatures are expanded to plain letters,
#   - superscript and subscript digits become ASCII digits.
# Every key MUST be a single code point: str.maketrans() raises ValueError on
# longer string keys. The ligatures are therefore written as \uXXXX escapes so
# they cannot be silently expanded into several letters by an editor or converter.
_xTransCharsForSimplification = str.maketrans({
    'à': 'a',  'é': 'é',  'î': 'i',  'ô': 'o',  'û': 'u',  'ÿ': 'y',
    'â': 'a',  'è': 'é',  'ï': 'i',  'ö': 'o',  'ù': 'u',  'ŷ': 'y',
    'ä': 'a',  'ê': 'é',  'í': 'i',  'ó': 'o',  'ü': 'u',  'ý': 'y',
    'á': 'a',  'ë': 'é',  'ì': 'i',  'ò': 'o',  'ú': 'u',  'ỳ': 'y',
    'ā': 'a',  'ē': 'é',  'ī': 'i',  'ō': 'o',  'ū': 'u',  'ȳ': 'y',
    'ç': 'c',  'ñ': 'n',
    'œ': 'oe',  'æ': 'ae',
    'ſ': 's',           # U+017F: latin small letter long s
    '\uFB03': 'ffi',    # U+FB03: ligature ffi
    '\uFB04': 'ffl',    # U+FB04: ligature ffl
    '\uFB00': 'ff',     # U+FB00: ligature ff
    '\uFB05': 'ft',     # U+FB05: ligature long s + t  (NOTE(review): mapped to 'ft', not 'st' — confirm)
    '\uFB01': 'fi',     # U+FB01: ligature fi
    '\uFB02': 'fl',     # U+FB02: ligature fl
    '\uFB06': 'st',     # U+FB06: ligature st
    "⁰": "0", "¹": "1", "²": "2", "³": "3", "⁴": "4", "⁵": "5", "⁶": "6", "⁷": "7", "⁸": "8", "⁹": "9",
    "₀": "0", "₁": "1", "₂": "2", "₃": "3", "₄": "4", "₅": "5", "₆": "6", "₇": "7", "₈": "8", "₉": "9"
})

def simplifyWord (sWord):
    """word simplification before calculating distance between words

    Steps, in order:
      1. lower-case <sWord> and translate it through _xTransCharsForSimplification;
      2. drop a character when the next character is the same one, except for <e>;
      3. rewrite a few digraphs/trigraphs: eau → o, au → o, ai → é, ei → é, ph → f.

    Returns the simplified string (an empty string for an empty <sWord>).
    """
    sWord = sWord.lower().translate(_xTransCharsForSimplification)
    # <i> is the index of the character FOLLOWING <c>; the slice sWord[i:i+1] is ""
    # past the end, so the last character is always kept.
    sNewWord = "".join(
        c  for i, c in enumerate(sWord, 1)
        if c == 'e' or c != sWord[i:i+1]  # exception for <e> to avoid confusion between crée / créai
    )
    return sNewWord.replace("eau", "o").replace("au", "o").replace("ai", "é").replace("ei", "é").replace("ph", "f")


# ASCII digit → superscript digit, built from two parallel strings (same order, same length).
_xTransNumbersToExponent = str.maketrans(dict(zip("0123456789", "⁰¹²³⁴⁵⁶⁷⁸⁹")))

def numbersToExponent (sWord):
    "replace every ASCII digit of <sWord> by the matching superscript (exponent) character"
    sExponent = sWord.translate(_xTransNumbersToExponent)
    return sExponent


# Character classes, each listing lower-case then upper-case forms.
# aVowel: plain and accented vowels, <y> and its accented forms, plus the digraphs œ / æ.
aVowel = set("aáàâäāeéèêëēiíìîïīoóòôöōuúùûüūyýỳŷÿȳœæAÁÀÂÄĀEÉÈÊËĒIÍÌÎÏĪOÓÒÔÖŌUÚÙÛÜŪYÝỲŶŸȲŒÆ")
# aConsonant: the remaining latin letters, plus ç and ñ.
aConsonant = set("bcçdfghjklmnñpqrstvwxzBCÇDFGHJKLMNÑPQRSTVWXZ")
aDouble = set("bcdfjklmnprstzBCDFJKLMNPRSTZ")  # letters that may be used twice successively


# Similar chars

d1to1 = {
    "'": "'’",  # U+0027: apostrophe droite
    "’": "’",   # U+2019: apostrophe typographique  (sera utilisée par défaut)
    "ʼ": "ʼ’",  # U+02BC: Lettre modificative apostrophe
    "‘": "‘’",  # U+2018: guillemet-apostrophe culbuté
    "‛": "‛’",  # U+201B: guillemet-virgule supérieur culbuté
    "´": "´’",  # U+00B4: accent aigu
    "`": "`’",  # U+0060: accent grave
    "′": "′’",  # U+2032: prime
    "‵": "‵’",  # U+2035: prime réfléchi
    "՚": "՚’",  # U+055A: apostrophe arménienne
    "ꞌ": "ꞌ’",  # U+A78C: latin minuscule saltillo
    "Ꞌ": "Ꞌ’",  # U+A78B: latin majuscule saltillo

    "1": "1₁liîLIÎ",
    "2": "2₂zZ",
    "3": "3₃eéèêEÉÈÊ",
    "4": "4₄aàâAÀÂ",
    "5": "5₅sgSG",
    "6": "6₆bdgBDG",
    "7": "7₇ltLT",