Grammalecte  Diff

Differences From Artifact [452d0bdcef]:

To Artifact [e25e3e9b20]:


1
2
3
4
5


6
7
8
9
10
11
12
13
14
15












16
17
18
19
20
21
22
"""
Operations on strings:
- calculate distance between two strings
- transform strings with transformation codes
"""



from .char_player import distanceBetweenChars


#### Ngrams

def getNgrams (sWord, n=2):
    "return a list of Ngrams strings"
    return [ sWord[i:i+n]  for i in range(len(sWord)-n+1) ]















#### DISTANCE CALCULATIONS

def longestCommonSubstring (s1, s2):
    "longest common substring"
    # http://en.wikipedia.org/wiki/Longest_common_substring_problem





>
>




|





>
>
>
>
>
>
>
>
>
>
>
>







1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
"""
Operations on strings:
- calculate distance between two strings
- transform strings with transformation codes
"""

import unicodedata

from .char_player import distanceBetweenChars


#### N-GRAMS

def getNgrams (sWord, n=2):
    "return a list of Ngrams strings"
    return [ sWord[i:i+n]  for i in range(len(sWord)-n+1) ]



#### WORD NORMALIZATION

_xTransCharsForSpelling = str.maketrans({
    'ſ': 's',  'ffi': 'ffi',  'ffl': 'ffl',  'ff': 'ff',  'ſt': 'ft',  'fi': 'fi',  'fl': 'fl',  'st': 'st'
})

def spellingNormalization (sWord):
    "nomalization NFC and removing ligatures"
    return unicodedata.normalize("NFC", sWord.translate(_xTransCharsForSpelling))



#### DISTANCE CALCULATIONS

def longestCommonSubstring (s1, s2):
    "longest common substring"
    # http://en.wikipedia.org/wiki/Longest_common_substring_problem