Grammalecte  Diff

Differences From Artifact [7dcad03ac9]:

To Artifact [452d0bdcef]:


1
2
3
4
5



6
7
8
9
10
11
12
"""
Operations on strings:
- calculate distance between two strings
- transform strings with transformation codes
"""




#### Ngrams

def getNgrams (sWord, n=2):
    "return the list of every run of <n> consecutive characters of <sWord>, in order of position"
    lNgrams = []
    # last valid start index is len(sWord)-n; for n >= 1, a word shorter than <n> yields an empty list
    nStartCount = len(sWord) - n + 1
    for iStart in range(nStartCount):
        lNgrams.append(sWord[iStart:iStart+n])
    return lNgrams






>
>
>







1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
"""
Operations on strings:
- calculate distance between two strings
- transform strings with transformation codes
"""

from .char_player import distanceBetweenChars


#### Ngrams

def getNgrams (sWord, n=2):
    "return the list of every run of <n> consecutive characters of <sWord>, in order of position"
    lNgrams = []
    # last valid start index is len(sWord)-n; for n >= 1, a word shorter than <n> yields an empty list
    nStartCount = len(sWord) - n + 1
    for iStart in range(nStartCount):
        lNgrams.append(sWord[iStart:iStart+n])
    return lNgrams

40
41
42
43
44
45
46
47

48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
    nLen2 = len(s2)
    for i in range(-1, nLen1+1):
        d[i, -1] = i + 1
    for j in range(-1, nLen2+1):
        d[-1, j] = j + 1
    for i in range(nLen1):
        for j in range(nLen2):
            nCost = 0  if s1[i] == s2[j]  else 1

            d[i, j] = min(
                d[i-1, j]   + 1,        # Deletion
                d[i,   j-1] + 1,        # Insertion
                d[i-1, j-1] + nCost,    # Substitution
            )
            if i and j and s1[i] == s2[j-1] and s1[i-1] == s2[j]:
                d[i, j] = min(d[i, j], d[i-2, j-2] + nCost)     # Transposition
    return d[nLen1-1, nLen2-1]


def distanceSift4 (s1, s2, nMaxOffset=5):
    "implementation of general Sift4."
    # https://siderite.blogspot.com/2014/11/super-fast-and-accurate-string-distance.html
    if not s1:
        return len(s2)







|
>







|







43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
    nLen2 = len(s2)
    for i in range(-1, nLen1+1):
        d[i, -1] = i + 1
    for j in range(-1, nLen2+1):
        d[-1, j] = j + 1
    for i in range(nLen1):
        for j in range(nLen2):
            #nCost = 0  if s1[i] == s2[j]  else 1
            nCost = distanceBetweenChars(s1[i], s2[j])
            d[i, j] = min(
                d[i-1, j]   + 1,        # Deletion
                d[i,   j-1] + 1,        # Insertion
                d[i-1, j-1] + nCost,    # Substitution
            )
            if i and j and s1[i] == s2[j-1] and s1[i-1] == s2[j]:
                d[i, j] = min(d[i, j], d[i-2, j-2] + nCost)     # Transposition
    return int(d[nLen1-1, nLen2-1])


def distanceSift4 (s1, s2, nMaxOffset=5):
    "implementation of general Sift4."
    # https://siderite.blogspot.com/2014/11/super-fast-and-accurate-string-distance.html
    if not s1:
        return len(s2)