Grammalecte  Check-in [0b7150270a]

Overview
Comment:merge trunk
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | multid
Files: files | file ages | folders
SHA3-256: 0b7150270a659e28c3cd498edd1990327ad2f219db00ca9c9c4c8eadc3d4384e
User & Date: olr on 2018-02-19 09:44:11
Other Links: branch diff | manifest | tags
Context
2018-02-19
12:37
[lo] update: helpers check-in: 12ad381687 user: olr tags: lo, multid
09:44
merge trunk check-in: 0b7150270a user: olr tags: multid
09:11
[fr] version 0.6.2 check-in: 18027d1022 user: olr tags: trunk, fr
2018-02-18
16:28
[lo] UI for dictionaries options check-in: edf22c7d52 user: olr tags: lo, multid
Changes

Modified gc_lang/fr/build_data.py from [a0d5d064eb] to [1f69de4a2f].

267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
def makePhonetTable (sp, bJS=False):
    print("> Correspondances phonétiques ", end="")
    print("(Python et JavaScript)"  if bJS  else "(Python seulement)")
    
    import gc_lang.fr.modules.conj as conj

    try:
        oDict = ibdawg.IBDAWG("French.bdic")
    except:
        traceback.print_exc()
        return

    # set of homophonic words
    lSet = []
    for sLine in readFile(sp+"/data/phonet_simil.txt"):







|







267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
def makePhonetTable (sp, bJS=False):
    print("> Correspondances phonétiques ", end="")
    print("(Python et JavaScript)"  if bJS  else "(Python seulement)")
    
    import gc_lang.fr.modules.conj as conj

    try:
        oDict = ibdawg.IBDAWG("fr.bdic")
    except:
        traceback.print_exc()
        return

    # set of homophonic words
    lSet = []
    for sLine in readFile(sp+"/data/phonet_simil.txt"):

Modified gc_lang/fr/config.ini from [08f47bce51] to [c7e11a6902].

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
[args]
lang = fr
lang_name = French
locales = fr_FR fr_BE fr_CA fr_CH fr_LU fr_MC fr_BF fr_CI fr_SN fr_ML fr_NE fr_TG fr_BJ
country_default = FR
name = Grammalecte
implname = grammalecte
# always use 3 numbers for version: x.y.z
version = 0.6.1
author = Olivier R.
provider = Dicollecte
link = http://grammalecte.net
description = Correcteur grammatical pour le français.
extras = README_fr.txt
logo = logo.png









|







1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
[args]
lang = fr
lang_name = French
locales = fr_FR fr_BE fr_CA fr_CH fr_LU fr_MC fr_BF fr_CI fr_SN fr_ML fr_NE fr_TG fr_BJ
country_default = FR
name = Grammalecte
implname = grammalecte
# always use 3 numbers for version: x.y.z
version = 0.6.2
author = Olivier R.
provider = Dicollecte
link = http://grammalecte.net
description = Correcteur grammatical pour le français.
extras = README_fr.txt
logo = logo.png

Modified gc_lang/fr/dictionnaire/genfrdic.py from [5036afecd5] to [e42dad16b6].

813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
    def __str__ (self):
        return "{0.lemma}/{0.flags} {1}".format(self, self.getMorph(2))

    def check (self):
        sErr = ''
        if self.lemma == '':
            sErr += 'lemme vide'
        if not re.match(r"[a-zA-ZéÉôÔàâÂîÎïèÈêÊÜœŒæÆçÇ0-9µåÅΩ&αβγδεζηθικλμνξοπρστυφχψωΔℓΩ_]", self.lemma):
            sErr += 'premier caractère inconnu: ' + self.lemma[0]
        if re.search(r"\s$", self.lemma):
            sErr += 'espace en fin de lemme'
        if re.match(r"v[0123]", self.po) and not re.match(r"[eas_][ix_][tx_][nx_][pqreuvx_][mx_][ex_z][ax_z]\b", self.po[2:]):
            sErr += 'verbe inconnu: ' + self.po
        if (re.match(r"S[*.]", self.flags) and re.search("[sxz]$", self.lemma)) or (re.match(r"X[*.]", self.flags) and not re.search("[ul]$", self.lemma)):
            sErr += 'drapeau inutile'







|







813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
    def __str__ (self):
        return "{0.lemma}/{0.flags} {1}".format(self, self.getMorph(2))

    def check (self):
        sErr = ''
        if self.lemma == '':
            sErr += 'lemme vide'
        if not re.match(r"[a-zA-ZéÉôÔàâáÂîÎïèÈêÊÜœŒæÆçÇ0-9µåÅΩ&αβγδεζηθικλμνξοπρστυφχψωΔℓΩ_]", self.lemma):
            sErr += 'premier caractère inconnu: ' + self.lemma[0]
        if re.search(r"\s$", self.lemma):
            sErr += 'espace en fin de lemme'
        if re.match(r"v[0123]", self.po) and not re.match(r"[eas_][ix_][tx_][nx_][pqreuvx_][mx_][ex_z][ax_z]\b", self.po[2:]):
            sErr += 'verbe inconnu: ' + self.po
        if (re.match(r"S[*.]", self.flags) and re.search("[sxz]$", self.lemma)) or (re.match(r"X[*.]", self.flags) and not re.search("[ul]$", self.lemma)):
            sErr += 'drapeau inutile'

Modified gc_lang/fr/dictionnaire/orthographe/FRANCAIS_5.aff from [078b475b37] to [a2e4f22697].

48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
MAP tT
MAP vV
MAP wW
MAP xX
MAP zZ

# Remplacements envisagés & barbarismes
REP 84
REP ^Ca$ Ça
REP ^l l'
REP ^d d'
REP ^n n'
REP ^s s'
REP ^j j'
REP ^m m'







|







48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
MAP tT
MAP vV
MAP wW
MAP xX
MAP zZ

# Remplacements envisagés & barbarismes
REP 82
REP ^Ca$ Ça
REP ^l l'
REP ^d d'
REP ^n n'
REP ^s s'
REP ^j j'
REP ^m m'
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
REP faisez$ faites
REP puit puits
REP sanctionnable punissable
REP questionnable discutable
REP antitartre détartrant
REP email courriel
REP construirent construisirent
REP cad$ c’est-à-dire
REP càd$ c’est-à-dire


# Phonétique
#PHONE 69
#PHONE AN(DT)$                   @
#PHONE AILL                      AY
#PHONE AIS$                      E







<
<







131
132
133
134
135
136
137


138
139
140
141
142
143
144
REP faisez$ faites
REP puit puits
REP sanctionnable punissable
REP questionnable discutable
REP antitartre détartrant
REP email courriel
REP construirent construisirent




# Phonétique
#PHONE 69
#PHONE AN(DT)$                   @
#PHONE AILL                      AY
#PHONE AIS$                      E
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338

339
340
341
342
343
344
345
# Astuce de Hunspell pour contourner la non-normalisation de l’unicode dans OOo
# http://www.openoffice.org/issues/show_bug.cgi?id=75769
# La première colonne dresse une liste de caractères écrits avec des diacritiques combinants :
# http://www.unicode.org/charts/    U0300 +
# La seconde colonne établit l’équivalent en Latin-1 étendu :
# Hunspell fait la modification pour vérifier l’orthographe. (Peut-être pas utile pour Mozilla)
# Apostrophes: U+2019, U+02BC
ICONV 41
ICONV ’ '
ICONV ʼ '
ICONV ffi ffi
ICONV ffl ffl
ICONV ff ff
ICONV ſt ft
ICONV fi fi
ICONV fl fl
ICONV st st

ICONV à à
ICONV â â
ICONV ä ä
ICONV é é
ICONV è è
ICONV ê ê
ICONV ë ë







|









>







320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
# Astuce de Hunspell pour contourner la non-normalisation de l’unicode dans OOo
# http://www.openoffice.org/issues/show_bug.cgi?id=75769
# La première colonne dresse une liste de caractères écrits avec des diacritiques combinants :
# http://www.unicode.org/charts/    U0300 +
# La seconde colonne établit l’équivalent en Latin-1 étendu :
# Hunspell fait la modification pour vérifier l’orthographe. (Peut-être pas utile pour Mozilla)
# Apostrophes: U+2019, U+02BC
ICONV 42
ICONV ’ '
ICONV ʼ '
ICONV ffi ffi
ICONV ffl ffl
ICONV ff ff
ICONV ſt ft
ICONV fi fi
ICONV fl fl
ICONV st st
ICONV ſ s
ICONV à à
ICONV â â
ICONV ä ä
ICONV é é
ICONV è è
ICONV ê ê
ICONV ë ë

Modified gc_lang/fr/perf_memo.txt from [cec037999d] to [15962af16c].

19
20
21
22
23
24
25

0.5.12      2016.10.14 18:58    4.51895     1.0843      0.772805    0.22387     0.249411    0.261593    0.628802    0.339303    0.0570326   0.00805416  
0.5.15      2017.01.22 11:44    4.85204     1.16134     0.770762    0.227874    0.244574    0.253305    0.58831     0.319987    0.0603996   0.00694786  
0.5.15      2017.01.22 11:47    4.85593     1.15248     0.762924    0.22744     0.243461    0.254609    0.586741    0.317503    0.0588827   0.00701016  (unicode normalisation NFC)
0.5.15      2017.01.31 12:06    4.88227     1.18008     0.782217    0.232617    0.247672    0.257628    0.596903    0.32169     0.0603505   0.00695196  
0.5.15      2017.02.05 10:10    4.90222     1.18444     0.786696    0.233413    0.25071     0.260214    0.602112    0.325235    0.0609932   0.00706897  
0.5.16      2017.05.12 07:41    4.92201     1.19269     0.80639     0.239147    0.257518    0.266523    0.62111     0.33359     0.0634668   0.00757178  
0.6.1       2018.02.12 09:58    5.25924     1.2649      0.878442    0.257465    0.280558    0.293903    0.686887    0.391275    0.0672474   0.00824723  








>
19
20
21
22
23
24
25
26
0.5.12      2016.10.14 18:58    4.51895     1.0843      0.772805    0.22387     0.249411    0.261593    0.628802    0.339303    0.0570326   0.00805416  
0.5.15      2017.01.22 11:44    4.85204     1.16134     0.770762    0.227874    0.244574    0.253305    0.58831     0.319987    0.0603996   0.00694786  
0.5.15      2017.01.22 11:47    4.85593     1.15248     0.762924    0.22744     0.243461    0.254609    0.586741    0.317503    0.0588827   0.00701016  (unicode normalisation NFC)
0.5.15      2017.01.31 12:06    4.88227     1.18008     0.782217    0.232617    0.247672    0.257628    0.596903    0.32169     0.0603505   0.00695196  
0.5.15      2017.02.05 10:10    4.90222     1.18444     0.786696    0.233413    0.25071     0.260214    0.602112    0.325235    0.0609932   0.00706897  
0.5.16      2017.05.12 07:41    4.92201     1.19269     0.80639     0.239147    0.257518    0.266523    0.62111     0.33359     0.0634668   0.00757178  
0.6.1       2018.02.12 09:58    5.25924     1.2649      0.878442    0.257465    0.280558    0.293903    0.686887    0.391275    0.0672474   0.00824723  
0.6.2       2018.02.19 09:06    6.20116     1.44334     1.02936     0.272956    0.311561    0.362367    0.812705    0.419061    0.0773003   0.00845671  (spelling normalization)

Modified gc_lang/fr/webext/manifest.json from [d0c2c44fc6] to [92ab4049ef].

1
2
3
4
5
6
7
8
9
10
11
12
{
  "manifest_version": 2,
  "name": "Grammalecte [fr]",
  "short_name": "Grammalecte [fr]",
  "version": "0.6.1",

  "applications": {
    "gecko": {
      "id": "French-GC@grammalecte.net",
      "strict_min_version": "56.0"
    }
  },




|







1
2
3
4
5
6
7
8
9
10
11
12
{
  "manifest_version": 2,
  "name": "Grammalecte [fr]",
  "short_name": "Grammalecte [fr]",
  "version": "0.6.2",

  "applications": {
    "gecko": {
      "id": "French-GC@grammalecte.net",
      "strict_min_version": "56.0"
    }
  },

Modified graphspell-js/char_player.js from [c9b14a8774] to [c171c18615].

1
2
3
4
5
6
7
8
9












10
11
12
13
14
15
16

17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
// list of similar chars
// useful for suggestion mechanism

${map}


var char_player = {

    _dTransChars: new Map([












        ['à', 'a'],  ['é', 'e'],  ['î', 'i'],  ['ô', 'o'],  ['û', 'u'],  ['ÿ', 'i'],  ['y', 'i'],
        ['â', 'a'],  ['è', 'e'],  ['ï', 'i'],  ['ö', 'o'],  ['ù', 'u'],  ['ŷ', 'i'],
        ['ä', 'a'],  ['ê', 'e'],  ['í', 'i'],  ['ó', 'o'],  ['ü', 'u'],  ['ý', 'i'],
        ['á', 'a'],  ['ë', 'e'],  ['ì', 'i'],  ['ò', 'o'],  ['ú', 'u'],  ['ỳ', 'i'],
        ['ā', 'a'],  ['ē', 'e'],  ['ī', 'i'],  ['ō', 'o'],  ['ū', 'u'],  ['ȳ', 'i'],
        ['ñ', 'n'],  ['k', 'q'],  ['w', 'v'],
        ['œ', 'oe'], ['æ', 'ae'], 

    ]),

    simplifyWord: function (sWord) {
        // word simplication before calculating distance between words
        sWord = sWord.toLowerCase();
        let sNewWord = "";
        let i = 1;
        for (let c of sWord) {
            let cNew = this._dTransChars.gl_get(c, c);
            let cNext = sWord.slice(i, i+1)
            if (cNew != this._dTransChars.gl_get(cNext, cNext)) {
                sNewWord += cNew;
            }
            i++;
        }
        return sNewWord.replace(/eau/g, "o").replace(/au/g, "o").replace(/ai/g, "e").replace(/ei/g, "e").replace(/ph/g, "f");
    },









|
>
>
>
>
>
>
>
>
>
>
>
>







>








|

|







1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
// list of similar chars
// useful for suggestion mechanism

${map}


var char_player = {

    _xTransCharsForSpelling: new Map([
        ['ſ', 's'],  ['ffi', 'ffi'],  ['ffl', 'ffl'],  ['ff', 'ff'],  ['ſt', 'ft'],  ['fi', 'fi'],  ['fl', 'fl'],  ['st', 'st']
    ]),

    spellingNormalization: function (sWord) {
        let sNewWord = "";
        for (let c of sWord) {
            sNewWord += this._xTransCharsForSpelling.gl_get(c, c);
        }
        return sNewWord.normalize("NFC");
    },

    _xTransCharsForSimplification: new Map([
        ['à', 'a'],  ['é', 'e'],  ['î', 'i'],  ['ô', 'o'],  ['û', 'u'],  ['ÿ', 'i'],  ['y', 'i'],
        ['â', 'a'],  ['è', 'e'],  ['ï', 'i'],  ['ö', 'o'],  ['ù', 'u'],  ['ŷ', 'i'],
        ['ä', 'a'],  ['ê', 'e'],  ['í', 'i'],  ['ó', 'o'],  ['ü', 'u'],  ['ý', 'i'],
        ['á', 'a'],  ['ë', 'e'],  ['ì', 'i'],  ['ò', 'o'],  ['ú', 'u'],  ['ỳ', 'i'],
        ['ā', 'a'],  ['ē', 'e'],  ['ī', 'i'],  ['ō', 'o'],  ['ū', 'u'],  ['ȳ', 'i'],
        ['ñ', 'n'],  ['k', 'q'],  ['w', 'v'],
        ['œ', 'oe'], ['æ', 'ae'], 
        ['ſ', 's'],  ['ffi', 'ffi'],  ['ffl', 'ffl'],  ['ff', 'ff'],  ['ſt', 'ft'],  ['fi', 'fi'],  ['fl', 'fl'],  ['st', 'st']
    ]),

    simplifyWord: function (sWord) {
        // word simplication before calculating distance between words
        sWord = sWord.toLowerCase();
        let sNewWord = "";
        let i = 1;
        for (let c of sWord) {
            let cNew = this._xTransCharsForSimplification.gl_get(c, c);
            let cNext = sWord.slice(i, i+1)
            if (cNew != this._xTransCharsForSimplification.gl_get(cNext, cNext)) {
                sNewWord += cNew;
            }
            i++;
        }
        return sNewWord.replace(/eau/g, "o").replace(/au/g, "o").replace(/ai/g, "e").replace(/ei/g, "e").replace(/ph/g, "f");
    },

Modified graphspell-js/ibdawg.js from [08ad598b63] to [73e27f350e].

206
207
208
209
210
211
212

213
214
215
216
217
218
219
            "sByDic": this.sByDic    // binary word graph
        };
        return oJSON;
    }

    isValidToken (sToken) {
        // checks if sToken is valid (if there is hyphens in sToken, sToken is split, each part is checked)

        if (this.isValid(sToken)) {
            return true;
        }
        if (sToken.includes("-")) {
            if (sToken.gl_count("-") > 4) {
                return true;
            }







>







206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
            "sByDic": this.sByDic    // binary word graph
        };
        return oJSON;
    }

    isValidToken (sToken) {
        // checks if sToken is valid (if there is hyphens in sToken, sToken is split, each part is checked)
        sToken = char_player.spellingNormalization(sToken)
        if (this.isValid(sToken)) {
            return true;
        }
        if (sToken.includes("-")) {
            if (sToken.gl_count("-") > 4) {
                return true;
            }
276
277
278
279
280
281
282

283
284
285
286
287
288
289
290
291
292
293
294

295
296
297
298
299
300
301
            }
        }
        return Boolean(this._convBytesToInteger(this.byDic.slice(iAddr, iAddr+this.nBytesArc)) & this._finalNodeMask);
    }

    getMorph (sWord) {
        // retrieves morphologies list, different casing allowed

        let l = this.morph(sWord);
        if (sWord[0].gl_isUpperCase()) {
            l.push(...this.morph(sWord.toLowerCase()));
            if (sWord.gl_isUpperCase() && sWord.length > 1) {
                l.push(...this.morph(sWord.gl_toCapitalize()));
            }
        }
        return l;
    }

    suggest (sWord, nSuggLimit=10) {
        // returns a array of suggestions for <sWord>

        let sPfx = "";
        let sSfx = "";
        [sPfx, sWord, sSfx] = char_player.cut(sWord);
        let nMaxSwitch = Math.max(Math.floor(sWord.length / 3), 1);
        let nMaxDel = Math.floor(sWord.length / 5);
        let nMaxHardRepl = Math.max(Math.floor((sWord.length - 5) / 4), 1);
        let oSuggResult = new SuggResult(sWord);







>












>







277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
            }
        }
        return Boolean(this._convBytesToInteger(this.byDic.slice(iAddr, iAddr+this.nBytesArc)) & this._finalNodeMask);
    }

    getMorph (sWord) {
        // retrieves morphologies list, different casing allowed
        sWord = char_player.spellingNormalization(sWord)
        let l = this.morph(sWord);
        if (sWord[0].gl_isUpperCase()) {
            l.push(...this.morph(sWord.toLowerCase()));
            if (sWord.gl_isUpperCase() && sWord.length > 1) {
                l.push(...this.morph(sWord.gl_toCapitalize()));
            }
        }
        return l;
    }

    suggest (sWord, nSuggLimit=10) {
        // returns a array of suggestions for <sWord>
        sWord = char_player.spellingNormalization(sWord)
        let sPfx = "";
        let sSfx = "";
        [sPfx, sWord, sSfx] = char_player.cut(sWord);
        let nMaxSwitch = Math.max(Math.floor(sWord.length / 3), 1);
        let nMaxDel = Math.floor(sWord.length / 5);
        let nMaxHardRepl = Math.max(Math.floor((sWord.length - 5) / 4), 1);
        let oSuggResult = new SuggResult(sWord);

Modified graphspell/char_player.py from [82e97eae54] to [e841b9211a].

1
2
3
4

5
6
7








8
9
10
11
12
13
14

15
16
17
18
19
20
21
22
23
24
25
26
# list of similar chars
# useful for suggestion mechanism

import re



_xTransChars = str.maketrans({








    'à': 'a',  'é': 'e',  'î': 'i',  'ô': 'o',  'û': 'u',  'ÿ': 'i',  "y": "i",
    'â': 'a',  'è': 'e',  'ï': 'i',  'ö': 'o',  'ù': 'u',  'ŷ': 'i',
    'ä': 'a',  'ê': 'e',  'í': 'i',  'ó': 'o',  'ü': 'u',  'ý': 'i',
    'á': 'a',  'ë': 'e',  'ì': 'i',  'ò': 'o',  'ú': 'u',  'ỳ': 'i',
    'ā': 'a',  'ē': 'e',  'ī': 'i',  'ō': 'o',  'ū': 'u',  'ȳ': 'i',
    'ñ': 'n',  'k': 'q',  'w': 'v',
    'œ': 'oe',  'æ': 'ae', 

})

def simplifyWord (sWord):
    "word simplication before calculating distance between words"
    sWord = sWord.lower().translate(_xTransChars)
    sNewWord = ""
    for i, c in enumerate(sWord, 1):
        if c != sWord[i:i+1]:
            sNewWord += c
    return sNewWord.replace("eau", "o").replace("au", "o").replace("ai", "e").replace("ei", "e").replace("ph", "f")






>


|
>
>
>
>
>
>
>
>






|
>




|







1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
# list of similar chars
# useful for suggestion mechanism

import re
import unicodedata


_xTransCharsForSpelling = str.maketrans({
    'ſ': 's',  'ffi': 'ffi',  'ffl': 'ffl',  'ff': 'ff',  'ſt': 'ft',  'fi': 'fi',  'fl': 'fl',  'st': 'st'
})

def spellingNormalization (sWord):
    return unicodedata.normalize("NFC", sWord.translate(_xTransCharsForSpelling))


_xTransCharsForSimplification = str.maketrans({
    'à': 'a',  'é': 'e',  'î': 'i',  'ô': 'o',  'û': 'u',  'ÿ': 'i',  "y": "i",
    'â': 'a',  'è': 'e',  'ï': 'i',  'ö': 'o',  'ù': 'u',  'ŷ': 'i',
    'ä': 'a',  'ê': 'e',  'í': 'i',  'ó': 'o',  'ü': 'u',  'ý': 'i',
    'á': 'a',  'ë': 'e',  'ì': 'i',  'ò': 'o',  'ú': 'u',  'ỳ': 'i',
    'ā': 'a',  'ē': 'e',  'ī': 'i',  'ō': 'o',  'ū': 'u',  'ȳ': 'i',
    'ñ': 'n',  'k': 'q',  'w': 'v',
    'œ': 'oe',  'æ': 'ae',
    'ſ': 's',  'ffi': 'ffi',  'ffl': 'ffl',  'ff': 'ff',  'ſt': 'ft',  'fi': 'fi',  'fl': 'fl',  'st': 'st', 
})

def simplifyWord (sWord):
    "word simplication before calculating distance between words"
    sWord = sWord.lower().translate(_xTransCharsForSimplification)
    sNewWord = ""
    for i, c in enumerate(sWord, 1):
        if c != sWord[i:i+1]:
            sNewWord += c
    return sNewWord.replace("eau", "o").replace("au", "o").replace("ai", "e").replace("ei", "e").replace("ph", "f")


Modified graphspell/ibdawg.py from [3bf18d8144] to [c41b426a86].

214
215
216
217
218
219
220

221
222
223
224
225
226
227
                            "sByDic": self.byDic.hex()  if bBinaryDictAsHexString  else [ e  for e in self.byDic ]
                        }, ensure_ascii=False))
            if bInJSModule:
                hDst.write(";\n\nexports.dictionary = dictionary;\n")

    def isValidToken (self, sToken):
        "checks if <sToken> is valid (if there is hyphens in <sToken>, <sToken> is split, each part is checked)"

        if self.isValid(sToken):
            return True
        if "-" in sToken:
            if sToken.count("-") > 4:
                return True
            return all(self.isValid(sWord)  for sWord in sToken.split("-"))
        return False







>







214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
                            "sByDic": self.byDic.hex()  if bBinaryDictAsHexString  else [ e  for e in self.byDic ]
                        }, ensure_ascii=False))
            if bInJSModule:
                hDst.write(";\n\nexports.dictionary = dictionary;\n")

    def isValidToken (self, sToken):
        "checks if <sToken> is valid (if there is hyphens in <sToken>, <sToken> is split, each part is checked)"
        sToken = cp.spellingNormalization(sToken)
        if self.isValid(sToken):
            return True
        if "-" in sToken:
            if sToken.count("-") > 4:
                return True
            return all(self.isValid(sWord)  for sWord in sToken.split("-"))
        return False
256
257
258
259
260
261
262

263
264
265
266
267
268
269
270
271
272

273
274
275
276
277
278
279
            iAddr = self._lookupArcNode(self.dChar[c], iAddr)
            if iAddr == None:
                return False
        return bool(int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask)

    def getMorph (self, sWord):
        "retrieves morphologies list, different casing allowed"

        l = self.morph(sWord)
        if sWord[0:1].isupper():
            l.extend(self.morph(sWord.lower()))
            if sWord.isupper() and len(sWord) > 1:
                l.extend(self.morph(sWord.capitalize()))
        return l

    #@timethis
    def suggest (self, sWord, nSuggLimit=10):
        "returns a set of suggestions for <sWord>"

        sPfx, sWord, sSfx = cp.cut(sWord)
        nMaxSwitch = max(len(sWord) // 3, 1)
        nMaxDel = len(sWord) // 5
        nMaxHardRepl = max((len(sWord) - 5) // 4, 1)
        oSuggResult = SuggResult(sWord)
        self._suggest(oSuggResult, sWord, nMaxSwitch=nMaxSwitch, nMaxDel=nMaxDel, nMaxHardRepl=nMaxHardRepl)
        if sWord.istitle():







>










>







257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
            iAddr = self._lookupArcNode(self.dChar[c], iAddr)
            if iAddr == None:
                return False
        return bool(int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask)

    def getMorph (self, sWord):
        "retrieves morphologies list, different casing allowed"
        sWord = cp.spellingNormalization(sWord)
        l = self.morph(sWord)
        if sWord[0:1].isupper():
            l.extend(self.morph(sWord.lower()))
            if sWord.isupper() and len(sWord) > 1:
                l.extend(self.morph(sWord.capitalize()))
        return l

    #@timethis
    def suggest (self, sWord, nSuggLimit=10):
        "returns a set of suggestions for <sWord>"
        sWord = cp.spellingNormalization(sWord)
        sPfx, sWord, sSfx = cp.cut(sWord)
        nMaxSwitch = max(len(sWord) // 3, 1)
        nMaxDel = len(sWord) // 5
        nMaxHardRepl = max((len(sWord) - 5) // 4, 1)
        oSuggResult = SuggResult(sWord)
        self._suggest(oSuggResult, sWord, nMaxSwitch=nMaxSwitch, nMaxDel=nMaxDel, nMaxHardRepl=nMaxHardRepl)
        if sWord.istitle():
326
327
328
329
330
331
332

333
334
335
336
337
338
339
                self._suggest(oSuggResult, "", nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, True) # remove last char and go on
                for sRepl in cp.dFinal1.get(sRemain, ()):
                    self._suggest(oSuggResult, sRepl, nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, True)

    #@timethis
    def suggest2 (self, sWord, nMaxSugg=10):
        "returns a set of suggestions for <sWord>"

        sPfx, sWord, sSfx = cp.cut(sWord)
        oSuggResult = SuggResult(sWord)
        self._suggest2(oSuggResult)
        aSugg = oSuggResult.getSuggestions()
        if sSfx or sPfx:
            # we add what we removed
            return list(map(lambda sSug: sPfx + sSug + sSfx, aSugg))







>







329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
                self._suggest(oSuggResult, "", nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, True) # remove last char and go on
                for sRepl in cp.dFinal1.get(sRemain, ()):
                    self._suggest(oSuggResult, sRepl, nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, True)

    #@timethis
    def suggest2 (self, sWord, nMaxSugg=10):
        "returns a set of suggestions for <sWord>"
        sWord = cp.spellingNormalization(sWord)
        sPfx, sWord, sSfx = cp.cut(sWord)
        oSuggResult = SuggResult(sWord)
        self._suggest2(oSuggResult)
        aSugg = oSuggResult.getSuggestions()
        if sSfx or sPfx:
            # we add what we removed
            return list(map(lambda sSug: sPfx + sSug + sSfx, aSugg))
382
383
384
385
386
387
388

389
390
391
392
393
394
395
                    aTails.add(sTail + self.dCharVal[nVal])
                if n and not aTails:
                    aTails.update(self._getTails(jAddr, sTail+self.dCharVal[nVal], n-1))
        return aTails

    def drawPath (self, sWord, iAddr=0):
        "show the path taken by <sWord> in the graph"

        c1 = sWord[0:1]  if sWord  else " "
        iPos = -1
        n = 0
        print(c1 + ": ", end="")
        for c2, jAddr in self._getCharArcs(iAddr):
            print(c2, end="")
            if c2 == sWord[0:1]:







>







386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
                    aTails.add(sTail + self.dCharVal[nVal])
                if n and not aTails:
                    aTails.update(self._getTails(jAddr, sTail+self.dCharVal[nVal], n-1))
        return aTails

    def drawPath (self, sWord, iAddr=0):
        "show the path taken by <sWord> in the graph"
        sWord = cp.spellingNormalization(sWord)
        c1 = sWord[0:1]  if sWord  else " "
        iPos = -1
        n = 0
        print(c1 + ": ", end="")
        for c2, jAddr in self._getCharArcs(iAddr):
            print(c2, end="")
            if c2 == sWord[0:1]: