Grammalecte  Check-in [2381f7c9ae]

Overview
Comment:[graphspell] move function from char_player to str_transform
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | trunk | graphspell
Files: files | file ages | folders
SHA3-256: 2381f7c9ae11b01adf4aaee8cfd9f4f715bd7397edc5bc7142a686e8534a03e0
User & Date: olr on 2020-08-05 06:56:15
Other Links: manifest | tags
Context
2020-08-05
09:30
[graphspell] move functions from char_player to str_transform and lexicographer check-in: 19fccd89d6 user: olr tags: trunk, graphspell
06:56
[graphspell] move function from char_player to str_transform check-in: 2381f7c9ae user: olr tags: trunk, graphspell
06:46
[graphspell][py] fix details check-in: 4617b0dfc4 user: olr tags: trunk, graphspell
Changes

Modified graphspell-js/char_player.js from [3caadd8250] to [ba78268468].

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
1
2
3
4
5
6
7
8
9
10
11












12
13
14
15
16
17
18











-
-
-
-
-
-
-
-
-
-
-
-







// list of similar chars
// useful for suggestion mechanism

/* jshint esversion:6 */
/* jslint esversion:6 */

${map}


var char_player = {

    _xTransCharsForSpelling: new Map([
        ['ſ', 's'],  ['ffi', 'ffi'],  ['ffl', 'ffl'],  ['ff', 'ff'],  ['ſt', 'ft'],  ['fi', 'fi'],  ['fl', 'fl'],  ['st', 'st']
    ]),

    spellingNormalization: function (sWord) {
        let sNewWord = "";
        for (let c of sWord) {
            sNewWord += this._xTransCharsForSpelling.gl_get(c, c);
        }
        return sNewWord.normalize("NFC");
    },

    oDistanceBetweenChars: {
        "a": {},
        "e": {"é": 0.5},
        "é": {"e": 0.5},
        "i": {"y": 0.2},
        "o": {},
        "u": {},

Modified graphspell-js/ibdawg.js from [189561924c] to [39524e288d].

234
235
236
237
238
239
240
241

242
243
244
245
246
247
248
234
235
236
237
238
239
240

241
242
243
244
245
246
247
248







-
+







            "l2grams": this.l2grams
        };
        return oJSON;
    }

    isValidToken (sToken) {
        // checks if sToken is valid (if there is hyphens in sToken, sToken is split, each part is checked)
        sToken = char_player.spellingNormalization(sToken);
        sToken = str_transform.spellingNormalization(sToken);
        if (this.isValid(sToken)) {
            return true;
        }
        if (sToken.includes("-")) {
            if (sToken.gl_count("-") > 4) {
                return true;
            }
307
308
309
310
311
312
313
314

315
316
317
318
319
320
321
322
323
324
325
326
327
328

329
330
331
332
333
334
335
307
308
309
310
311
312
313

314
315
316
317
318
319
320
321
322
323
324
325
326
327

328
329
330
331
332
333
334
335







-
+













-
+







            }
        }
        return Boolean(this._convBytesToInteger(this.byDic.slice(iAddr, iAddr+this.nBytesArc)) & this._finalNodeMask);
    }

    getMorph (sWord) {
        // retrieves morphologies list, different casing allowed
        sWord = char_player.spellingNormalization(sWord);
        sWord = str_transform.spellingNormalization(sWord);
        let l = this.morph(sWord);
        if (sWord[0].gl_isUpperCase()) {
            l.push(...this.morph(sWord.toLowerCase()));
            if (sWord.gl_isUpperCase() && sWord.length > 1) {
                l.push(...this.morph(sWord.gl_toCapitalize()));
            }
        }
        return l;
    }

    suggest (sWord, nSuggLimit=10, bSplitTrailingNumbers=false) {
        // returns a array of suggestions for <sWord>
        //console.time("Suggestions for " + sWord);
        sWord = char_player.spellingNormalization(sWord);
        sWord = str_transform.spellingNormalization(sWord);
        let sPfx = "";
        let sSfx = "";
        [sPfx, sWord, sSfx] = char_player.cut(sWord);
        let nMaxSwitch = Math.max(Math.floor(sWord.length / 3), 1);
        let nMaxDel = Math.floor(sWord.length / 5);
        let nMaxHardRepl = Math.max(Math.floor((sWord.length - 5) / 4), 1);
        let nMaxJump = Math.max(Math.floor(sWord.length / 4), 1);

Modified graphspell-js/str_transform.js from [9baf96ac7b] to [1ca4ee03ac].

20
21
22
23
24
25
26












27
28
29
30
31
32
33
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45







+
+
+
+
+
+
+
+
+
+
+
+







    getNgrams: function (sWord, n=2) {
        let lNgrams = [];
        for (let i=0;  i <= sWord.length - n;  i++) {
            lNgrams.push(sWord.slice(i, i+n));
        }
        return lNgrams;
    },

    _xTransCharsForSpelling: new Map([
        ['ſ', 's'],  ['ffi', 'ffi'],  ['ffl', 'ffl'],  ['ff', 'ff'],  ['ſt', 'ft'],  ['fi', 'fi'],  ['fl', 'fl'],  ['st', 'st']
    ]),

    spellingNormalization: function (sWord) {
        let sNewWord = "";
        for (let c of sWord) {
            sNewWord += this._xTransCharsForSpelling.gl_get(c, c);
        }
        return sNewWord.normalize("NFC");
    },

    longestCommonSubstring: function (string1, string2) {
        // https://en.wikibooks.org/wiki/Algorithm_Implementation/Strings/Longest_common_substring
        // untested

        // init max value
        let longestCommonSubstring = 0;

Modified graphspell/char_player.py from [fa338bf2f3] to [5484ce7bef].

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
1
2
3
4
5
6










7
8
9
10
11
12
13






-
-
-
-
-
-
-
-
-
-







"""
List of similar chars
useful for suggestion mechanism
"""

import re
import unicodedata


_xTransCharsForSpelling = str.maketrans({
    'ſ': 's',  'ffi': 'ffi',  'ffl': 'ffl',  'ff': 'ff',  'ſt': 'ft',  'fi': 'fi',  'fl': 'fl',  'st': 'st'
})

def spellingNormalization (sWord):
    "nomalization NFC and removing ligatures"
    return unicodedata.normalize("NFC", sWord.translate(_xTransCharsForSpelling))


dDistanceBetweenChars = {
    "a": {},
    "e": {"é": 0.5},
    "é": {"e": 0.5},
    "i": {"y": 0.2},

Modified graphspell/ibdawg.py from [f3a55a8c6e] to [78af5c9dd3].

244
245
246
247
248
249
250
251

252
253
254
255
256
257
258
244
245
246
247
248
249
250

251
252
253
254
255
256
257
258







-
+







                "l2grams": list(self.a2grams)
            }, ensure_ascii=False))
            if bInJSModule:
                hDst.write(";\n\nexports.dictionary = dictionary;\n")

    def isValidToken (self, sToken):
        "checks if <sToken> is valid (if there is hyphens in <sToken>, <sToken> is split, each part is checked)"
        sToken = cp.spellingNormalization(sToken)
        sToken = st.spellingNormalization(sToken)
        if self.isValid(sToken):
            return True
        if "-" in sToken:
            if sToken.count("-") > 4:
                return True
            return all(self.isValid(sWord)  for sWord in sToken.split("-"))
        if "." in sToken or "·" in sToken:
288
289
290
291
292
293
294
295

296
297
298
299
300
301
302
303
304
305
306
307

308
309
310
311
312
313
314
288
289
290
291
292
293
294

295
296
297
298
299
300
301
302
303
304
305
306

307
308
309
310
311
312
313
314







-
+











-
+







            iAddr = self._lookupArcNode(self.dChar[c], iAddr)
            if iAddr is None:
                return False
        return bool(int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask)

    def getMorph (self, sWord):
        "retrieves morphologies list, different casing allowed"
        sWord = cp.spellingNormalization(sWord)
        sWord = st.spellingNormalization(sWord)
        l = self.morph(sWord)
        if sWord[0:1].isupper():
            l.extend(self.morph(sWord.lower()))
            if sWord.isupper() and len(sWord) > 1:
                l.extend(self.morph(sWord.capitalize()))
        return l

    #@timethis
    def suggest (self, sWord, nSuggLimit=10, bSplitTrailingNumbers=False):
        "returns a set of suggestions for <sWord>"
        sWord = sWord.rstrip(".")   # useful for LibreOffice
        sWord = cp.spellingNormalization(sWord)
        sWord = st.spellingNormalization(sWord)
        sPfx, sWord, sSfx = cp.cut(sWord)
        nMaxSwitch = max(len(sWord) // 3, 1)
        nMaxDel = len(sWord) // 5
        nMaxHardRepl = max((len(sWord) - 5) // 4, 1)
        nMaxJump = max(len(sWord) // 4, 1)
        oSuggResult = SuggResult(sWord)
        if bSplitTrailingNumbers:
408
409
410
411
412
413
414
415

416
417
418
419
420
421
422
408
409
410
411
412
413
414

415
416
417
418
419
420
421
422







-
+







                    aTails.add(sTail + self.dCharVal[nVal])
                if n and not aTails:
                    aTails.update(self._getTails(jAddr, sTail+self.dCharVal[nVal], n-1))
        return aTails

    def drawPath (self, sWord, iAddr=0):
        "show the path taken by <sWord> in the graph"
        sWord = cp.spellingNormalization(sWord)
        sWord = st.spellingNormalization(sWord)
        c1 = sWord[0:1]  if sWord  else " "
        iPos = -1
        n = 0
        echo(c1 + ": ", end="")
        for c2, jAddr in self._getCharArcs(iAddr):
            echo(c2, end="")
            if c2 == sWord[0:1]:

Modified graphspell/str_transform.py from [452d0bdcef] to [e25e3e9b20].

1
2
3
4
5


6
7
8
9
10

11
12
13
14
15












16
17
18
19
20
21
22
1
2
3
4
5
6
7
8
9
10
11

12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36





+
+




-
+





+
+
+
+
+
+
+
+
+
+
+
+







"""
Operations on strings:
- calculate distance between two strings
- transform strings with transformation codes
"""

import unicodedata

from .char_player import distanceBetweenChars


#### Ngrams
#### N-GRAMS

def getNgrams (sWord, n=2):
    "return a list of Ngrams strings"
    return [ sWord[i:i+n]  for i in range(len(sWord)-n+1) ]



#### WORD NORMALIZATION

_xTransCharsForSpelling = str.maketrans({
    'ſ': 's',  'ffi': 'ffi',  'ffl': 'ffl',  'ff': 'ff',  'ſt': 'ft',  'fi': 'fi',  'fl': 'fl',  'st': 'st'
})

def spellingNormalization (sWord):
    "nomalization NFC and removing ligatures"
    return unicodedata.normalize("NFC", sWord.translate(_xTransCharsForSpelling))



#### DISTANCE CALCULATIONS

def longestCommonSubstring (s1, s2):
    "longest common substring"
    # http://en.wikipedia.org/wiki/Longest_common_substring_problem