Grammalecte  Check-in [0b7150270a]

Overview
Comment:merge trunk
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | multid
Files: files | file ages | folders
SHA3-256: 0b7150270a659e28c3cd498edd1990327ad2f219db00ca9c9c4c8eadc3d4384e
User & Date: olr on 2018-02-19 09:44:11
Other Links: branch diff | manifest | tags
Context
2018-02-19
12:37
[lo] update: helpers check-in: 12ad381687 user: olr tags: lo, multid
09:44
merge trunk check-in: 0b7150270a user: olr tags: multid
09:11
[fr] version 0.6.2 check-in: 18027d1022 user: olr tags: trunk, fr
2018-02-18
16:28
[lo] UI for dictionaries options check-in: edf22c7d52 user: olr tags: lo, multid
Changes

Modified gc_lang/fr/build_data.py from [a0d5d064eb] to [1f69de4a2f].

267
268
269
270
271
272
273
274

275
276
277
278
279
280
281
267
268
269
270
271
272
273

274
275
276
277
278
279
280
281







-
+







def makePhonetTable (sp, bJS=False):
    print("> Correspondances phonétiques ", end="")
    print("(Python et JavaScript)"  if bJS  else "(Python seulement)")
    
    import gc_lang.fr.modules.conj as conj

    try:
        oDict = ibdawg.IBDAWG("French.bdic")
        oDict = ibdawg.IBDAWG("fr.bdic")
    except:
        traceback.print_exc()
        return

    # set of homophonic words
    lSet = []
    for sLine in readFile(sp+"/data/phonet_simil.txt"):

Modified gc_lang/fr/config.ini from [08f47bce51] to [c7e11a6902].

1
2
3
4
5
6
7
8
9

10
11
12
13
14
15
16
1
2
3
4
5
6
7
8

9
10
11
12
13
14
15
16








-
+







[args]
lang = fr
lang_name = French
locales = fr_FR fr_BE fr_CA fr_CH fr_LU fr_MC fr_BF fr_CI fr_SN fr_ML fr_NE fr_TG fr_BJ
country_default = FR
name = Grammalecte
implname = grammalecte
# always use 3 numbers for version: x.y.z
version = 0.6.1
version = 0.6.2
author = Olivier R.
provider = Dicollecte
link = http://grammalecte.net
description = Correcteur grammatical pour le français.
extras = README_fr.txt
logo = logo.png

Modified gc_lang/fr/dictionnaire/genfrdic.py from [5036afecd5] to [e42dad16b6].

813
814
815
816
817
818
819
820

821
822
823
824
825
826
827
813
814
815
816
817
818
819

820
821
822
823
824
825
826
827







-
+







    def __str__ (self):
        return "{0.lemma}/{0.flags} {1}".format(self, self.getMorph(2))

    def check (self):
        sErr = ''
        if self.lemma == '':
            sErr += 'lemme vide'
        if not re.match(r"[a-zA-ZéÉôÔàâÂîÎïèÈêÊÜœŒæÆçÇ0-9µåÅΩ&αβγδεζηθικλμνξοπρστυφχψωΔℓΩ_]", self.lemma):
        if not re.match(r"[a-zA-ZéÉôÔàâáÂîÎïèÈêÊÜœŒæÆçÇ0-9µåÅΩ&αβγδεζηθικλμνξοπρστυφχψωΔℓΩ_]", self.lemma):
            sErr += 'premier caractère inconnu: ' + self.lemma[0]
        if re.search(r"\s$", self.lemma):
            sErr += 'espace en fin de lemme'
        if re.match(r"v[0123]", self.po) and not re.match(r"[eas_][ix_][tx_][nx_][pqreuvx_][mx_][ex_z][ax_z]\b", self.po[2:]):
            sErr += 'verbe inconnu: ' + self.po
        if (re.match(r"S[*.]", self.flags) and re.search("[sxz]$", self.lemma)) or (re.match(r"X[*.]", self.flags) and not re.search("[ul]$", self.lemma)):
            sErr += 'drapeau inutile'

Modified gc_lang/fr/dictionnaire/orthographe/FRANCAIS_5.aff from [078b475b37] to [a2e4f22697].

48
49
50
51
52
53
54
55

56
57
58
59
60
61
62
48
49
50
51
52
53
54

55
56
57
58
59
60
61
62







-
+







MAP tT
MAP vV
MAP wW
MAP xX
MAP zZ

# Remplacements envisagés & barbarismes
REP 84
REP 82
REP ^Ca$ Ça
REP ^l l'
REP ^d d'
REP ^n n'
REP ^s s'
REP ^j j'
REP ^m m'
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
131
132
133
134
135
136
137


138
139
140
141
142
143
144







-
-







REP faisez$ faites
REP puit puits
REP sanctionnable punissable
REP questionnable discutable
REP antitartre détartrant
REP email courriel
REP construirent construisirent
REP cad$ c’est-à-dire
REP càd$ c’est-à-dire


# Phonétique
#PHONE 69
#PHONE AN(DT)$                   @
#PHONE AILL                      AY
#PHONE AIS$                      E
322
323
324
325
326
327
328
329

330
331
332
333
334
335
336
337
338

339
340
341
342
343
344
345
320
321
322
323
324
325
326

327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344







-
+









+







# Astuce de Hunspell pour contourner la non-normalisation de l’unicode dans OOo
# http://www.openoffice.org/issues/show_bug.cgi?id=75769
# La première colonne dresse une liste de caractères écrits avec des diacritiques combinants :
# http://www.unicode.org/charts/    U0300 +
# La seconde colonne établit l’équivalent en Latin-1 étendu :
# Hunspell fait la modification pour vérifier l’orthographe. (Peut-être pas utile pour Mozilla)
# Apostrophes: U+2019, U+02BC
ICONV 41
ICONV 42
ICONV ’ '
ICONV ʼ '
ICONV ffi ffi
ICONV ffl ffl
ICONV ff ff
ICONV ſt ft
ICONV fi fi
ICONV fl fl
ICONV st st
ICONV ſ s
ICONV à à
ICONV â â
ICONV ä ä
ICONV é é
ICONV è è
ICONV ê ê
ICONV ë ë

Modified gc_lang/fr/perf_memo.txt from [cec037999d] to [15962af16c].

19
20
21
22
23
24
25

19
20
21
22
23
24
25
26







+
0.5.12      2016.10.14 18:58    4.51895     1.0843      0.772805    0.22387     0.249411    0.261593    0.628802    0.339303    0.0570326   0.00805416  
0.5.15      2017.01.22 11:44    4.85204     1.16134     0.770762    0.227874    0.244574    0.253305    0.58831     0.319987    0.0603996   0.00694786  
0.5.15      2017.01.22 11:47    4.85593     1.15248     0.762924    0.22744     0.243461    0.254609    0.586741    0.317503    0.0588827   0.00701016  (unicode normalisation NFC)
0.5.15      2017.01.31 12:06    4.88227     1.18008     0.782217    0.232617    0.247672    0.257628    0.596903    0.32169     0.0603505   0.00695196  
0.5.15      2017.02.05 10:10    4.90222     1.18444     0.786696    0.233413    0.25071     0.260214    0.602112    0.325235    0.0609932   0.00706897  
0.5.16      2017.05.12 07:41    4.92201     1.19269     0.80639     0.239147    0.257518    0.266523    0.62111     0.33359     0.0634668   0.00757178  
0.6.1       2018.02.12 09:58    5.25924     1.2649      0.878442    0.257465    0.280558    0.293903    0.686887    0.391275    0.0672474   0.00824723  
0.6.2       2018.02.19 09:06    6.20116     1.44334     1.02936     0.272956    0.311561    0.362367    0.812705    0.419061    0.0773003   0.00845671  (spelling normalization)

Modified gc_lang/fr/webext/manifest.json from [d0c2c44fc6] to [92ab4049ef].

1
2
3
4
5

6
7
8
9
10
11
12
1
2
3
4

5
6
7
8
9
10
11
12




-
+







{
  "manifest_version": 2,
  "name": "Grammalecte [fr]",
  "short_name": "Grammalecte [fr]",
  "version": "0.6.1",
  "version": "0.6.2",

  "applications": {
    "gecko": {
      "id": "French-GC@grammalecte.net",
      "strict_min_version": "56.0"
    }
  },

Modified graphspell-js/char_player.js from [c9b14a8774] to [c171c18615].

1
2
3
4
5
6
7
8
9













10
11
12
13
14
15
16

17
18
19
20
21
22
23
24
25

26
27

28
29
30
31
32
33
34
1
2
3
4
5
6
7
8

9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37

38
39

40
41
42
43
44
45
46
47








-
+
+
+
+
+
+
+
+
+
+
+
+
+







+








-
+

-
+







// list of similar chars
// useful for suggestion mechanism

${map}


var char_player = {

    _dTransChars: new Map([
    _xTransCharsForSpelling: new Map([
        ['ſ', 's'],  ['ffi', 'ffi'],  ['ffl', 'ffl'],  ['ff', 'ff'],  ['ſt', 'ft'],  ['fi', 'fi'],  ['fl', 'fl'],  ['st', 'st']
    ]),

    spellingNormalization: function (sWord) {
        let sNewWord = "";
        for (let c of sWord) {
            sNewWord += this._xTransCharsForSpelling.gl_get(c, c);
        }
        return sNewWord.normalize("NFC");
    },

    _xTransCharsForSimplification: new Map([
        ['à', 'a'],  ['é', 'e'],  ['î', 'i'],  ['ô', 'o'],  ['û', 'u'],  ['ÿ', 'i'],  ['y', 'i'],
        ['â', 'a'],  ['è', 'e'],  ['ï', 'i'],  ['ö', 'o'],  ['ù', 'u'],  ['ŷ', 'i'],
        ['ä', 'a'],  ['ê', 'e'],  ['í', 'i'],  ['ó', 'o'],  ['ü', 'u'],  ['ý', 'i'],
        ['á', 'a'],  ['ë', 'e'],  ['ì', 'i'],  ['ò', 'o'],  ['ú', 'u'],  ['ỳ', 'i'],
        ['ā', 'a'],  ['ē', 'e'],  ['ī', 'i'],  ['ō', 'o'],  ['ū', 'u'],  ['ȳ', 'i'],
        ['ñ', 'n'],  ['k', 'q'],  ['w', 'v'],
        ['œ', 'oe'], ['æ', 'ae'], 
        ['ſ', 's'],  ['ffi', 'ffi'],  ['ffl', 'ffl'],  ['ff', 'ff'],  ['ſt', 'ft'],  ['fi', 'fi'],  ['fl', 'fl'],  ['st', 'st']
    ]),

    simplifyWord: function (sWord) {
        // word simplication before calculating distance between words
        sWord = sWord.toLowerCase();
        let sNewWord = "";
        let i = 1;
        for (let c of sWord) {
            let cNew = this._dTransChars.gl_get(c, c);
            let cNew = this._xTransCharsForSimplification.gl_get(c, c);
            let cNext = sWord.slice(i, i+1)
            if (cNew != this._dTransChars.gl_get(cNext, cNext)) {
            if (cNew != this._xTransCharsForSimplification.gl_get(cNext, cNext)) {
                sNewWord += cNew;
            }
            i++;
        }
        return sNewWord.replace(/eau/g, "o").replace(/au/g, "o").replace(/ai/g, "e").replace(/ei/g, "e").replace(/ph/g, "f");
    },

Modified graphspell-js/ibdawg.js from [08ad598b63] to [73e27f350e].

206
207
208
209
210
211
212

213
214
215
216
217
218
219
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220







+







            "sByDic": this.sByDic    // binary word graph
        };
        return oJSON;
    }

    isValidToken (sToken) {
        // checks if sToken is valid (if there is hyphens in sToken, sToken is split, each part is checked)
        sToken = char_player.spellingNormalization(sToken)
        if (this.isValid(sToken)) {
            return true;
        }
        if (sToken.includes("-")) {
            if (sToken.gl_count("-") > 4) {
                return true;
            }
276
277
278
279
280
281
282

283
284
285
286
287
288
289
290
291
292
293
294

295
296
297
298
299
300
301
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304







+












+







            }
        }
        return Boolean(this._convBytesToInteger(this.byDic.slice(iAddr, iAddr+this.nBytesArc)) & this._finalNodeMask);
    }

    getMorph (sWord) {
        // retrieves morphologies list, different casing allowed
        sWord = char_player.spellingNormalization(sWord)
        let l = this.morph(sWord);
        if (sWord[0].gl_isUpperCase()) {
            l.push(...this.morph(sWord.toLowerCase()));
            if (sWord.gl_isUpperCase() && sWord.length > 1) {
                l.push(...this.morph(sWord.gl_toCapitalize()));
            }
        }
        return l;
    }

    suggest (sWord, nSuggLimit=10) {
        // returns a array of suggestions for <sWord>
        sWord = char_player.spellingNormalization(sWord)
        let sPfx = "";
        let sSfx = "";
        [sPfx, sWord, sSfx] = char_player.cut(sWord);
        let nMaxSwitch = Math.max(Math.floor(sWord.length / 3), 1);
        let nMaxDel = Math.floor(sWord.length / 5);
        let nMaxHardRepl = Math.max(Math.floor((sWord.length - 5) / 4), 1);
        let oSuggResult = new SuggResult(sWord);

Modified graphspell/char_player.py from [82e97eae54] to [e841b9211a].

1
2
3
4

5
6
7









8
9
10
11
12
13
14


15
16
17
18
19

20
21
22
23
24
25
26
1
2
3
4
5
6
7

8
9
10
11
12
13
14
15
16
17
18
19
20
21
22

23
24
25
26
27
28

29
30
31
32
33
34
35
36




+


-
+
+
+
+
+
+
+
+
+






-
+
+




-
+







# list of similar chars
# useful for suggestion mechanism

import re
import unicodedata


_xTransChars = str.maketrans({
_xTransCharsForSpelling = str.maketrans({
    'ſ': 's',  'ffi': 'ffi',  'ffl': 'ffl',  'ff': 'ff',  'ſt': 'ft',  'fi': 'fi',  'fl': 'fl',  'st': 'st'
})

def spellingNormalization (sWord):
    return unicodedata.normalize("NFC", sWord.translate(_xTransCharsForSpelling))


_xTransCharsForSimplification = str.maketrans({
    'à': 'a',  'é': 'e',  'î': 'i',  'ô': 'o',  'û': 'u',  'ÿ': 'i',  "y": "i",
    'â': 'a',  'è': 'e',  'ï': 'i',  'ö': 'o',  'ù': 'u',  'ŷ': 'i',
    'ä': 'a',  'ê': 'e',  'í': 'i',  'ó': 'o',  'ü': 'u',  'ý': 'i',
    'á': 'a',  'ë': 'e',  'ì': 'i',  'ò': 'o',  'ú': 'u',  'ỳ': 'i',
    'ā': 'a',  'ē': 'e',  'ī': 'i',  'ō': 'o',  'ū': 'u',  'ȳ': 'i',
    'ñ': 'n',  'k': 'q',  'w': 'v',
    'œ': 'oe',  'æ': 'ae', 
    'œ': 'oe',  'æ': 'ae',
    'ſ': 's',  'ffi': 'ffi',  'ffl': 'ffl',  'ff': 'ff',  'ſt': 'ft',  'fi': 'fi',  'fl': 'fl',  'st': 'st', 
})

def simplifyWord (sWord):
    "word simplication before calculating distance between words"
    sWord = sWord.lower().translate(_xTransChars)
    sWord = sWord.lower().translate(_xTransCharsForSimplification)
    sNewWord = ""
    for i, c in enumerate(sWord, 1):
        if c != sWord[i:i+1]:
            sNewWord += c
    return sNewWord.replace("eau", "o").replace("au", "o").replace("ai", "e").replace("ei", "e").replace("ph", "f")


Modified graphspell/ibdawg.py from [3bf18d8144] to [c41b426a86].

214
215
216
217
218
219
220

221
222
223
224
225
226
227
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228







+







                            "sByDic": self.byDic.hex()  if bBinaryDictAsHexString  else [ e  for e in self.byDic ]
                        }, ensure_ascii=False))
            if bInJSModule:
                hDst.write(";\n\nexports.dictionary = dictionary;\n")

    def isValidToken (self, sToken):
        "checks if <sToken> is valid (if there is hyphens in <sToken>, <sToken> is split, each part is checked)"
        sToken = cp.spellingNormalization(sToken)
        if self.isValid(sToken):
            return True
        if "-" in sToken:
            if sToken.count("-") > 4:
                return True
            return all(self.isValid(sWord)  for sWord in sToken.split("-"))
        return False
256
257
258
259
260
261
262

263
264
265
266
267
268
269
270
271
272

273
274
275
276
277
278
279
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282







+










+







            iAddr = self._lookupArcNode(self.dChar[c], iAddr)
            if iAddr == None:
                return False
        return bool(int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask)

    def getMorph (self, sWord):
        "retrieves morphologies list, different casing allowed"
        sWord = cp.spellingNormalization(sWord)
        l = self.morph(sWord)
        if sWord[0:1].isupper():
            l.extend(self.morph(sWord.lower()))
            if sWord.isupper() and len(sWord) > 1:
                l.extend(self.morph(sWord.capitalize()))
        return l

    #@timethis
    def suggest (self, sWord, nSuggLimit=10):
        "returns a set of suggestions for <sWord>"
        sWord = cp.spellingNormalization(sWord)
        sPfx, sWord, sSfx = cp.cut(sWord)
        nMaxSwitch = max(len(sWord) // 3, 1)
        nMaxDel = len(sWord) // 5
        nMaxHardRepl = max((len(sWord) - 5) // 4, 1)
        oSuggResult = SuggResult(sWord)
        self._suggest(oSuggResult, sWord, nMaxSwitch=nMaxSwitch, nMaxDel=nMaxDel, nMaxHardRepl=nMaxHardRepl)
        if sWord.istitle():
326
327
328
329
330
331
332

333
334
335
336
337
338
339
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343







+







                self._suggest(oSuggResult, "", nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, True) # remove last char and go on
                for sRepl in cp.dFinal1.get(sRemain, ()):
                    self._suggest(oSuggResult, sRepl, nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, True)

    #@timethis
    def suggest2 (self, sWord, nMaxSugg=10):
        "returns a set of suggestions for <sWord>"
        sWord = cp.spellingNormalization(sWord)
        sPfx, sWord, sSfx = cp.cut(sWord)
        oSuggResult = SuggResult(sWord)
        self._suggest2(oSuggResult)
        aSugg = oSuggResult.getSuggestions()
        if sSfx or sPfx:
            # we add what we removed
            return list(map(lambda sSug: sPfx + sSug + sSfx, aSugg))
382
383
384
385
386
387
388

389
390
391
392
393
394
395
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400







+







                    aTails.add(sTail + self.dCharVal[nVal])
                if n and not aTails:
                    aTails.update(self._getTails(jAddr, sTail+self.dCharVal[nVal], n-1))
        return aTails

    def drawPath (self, sWord, iAddr=0):
        "show the path taken by <sWord> in the graph"
        sWord = cp.spellingNormalization(sWord)
        c1 = sWord[0:1]  if sWord  else " "
        iPos = -1
        n = 0
        print(c1 + ": ", end="")
        for c2, jAddr in self._getCharArcs(iAddr):
            print(c2, end="")
            if c2 == sWord[0:1]: