Grammalecte  Check-in [766f20e23c]

Overview
Comment:[core] str_transform: change functions names
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | trunk | core
Files: files | file ages | folders
SHA3-256: 766f20e23c4c92bb43d777545a47444788a877166ecc97bfeb31414b3021a599
User & Date: olr on 2017-06-23 17:25:20
Other Links: manifest | tags
Context
2017-06-23
19:23
[build] use one dictionary name instead of two check-in: cfc69abb68 user: olr tags: trunk, build
17:25
[core] str_transform: change functions names check-in: 766f20e23c user: olr tags: trunk, core
17:11
[core] dawg: compressed lexicon check-in: e5f3698eb4 user: olr tags: trunk, build, new_feature
Changes

Modified gc_core/py/dawg.py from [7e6ed7295c] to [ddd6fe1cc6].

14
15
16
17
18
19
20
21

22
23
24
25
26
27
28
14
15
16
17
18
19
20

21
22
23
24
25
26
27
28







-
+







import collections

from . import str_transform as st
from .progressbar import ProgressBar


def readFile (spf):
    print("Read lexicon: " + spf)
    print(" < Read lexicon: " + spf)
    if os.path.isfile(spf):
        with open(spf, "r", encoding="utf-8") as hSrc:
            for sLine in hSrc:
                sLine = sLine.strip()
                if sLine and not sLine.startswith("#"):
                    yield sLine
    else:
65
66
67
68
69
70
71
72

73
74
75
76
77
78
79
65
66
67
68
69
70
71

72
73
74
75
76
77
78
79







-
+







                        continue
                    sFlex, sStem = sLine.split("\t")
                else:
                    sFlex = sStem = sLine
                #print(sFlex, sStem, sTag)
                yield (sFlex, sStem, sTag)
                if sTag2:
                    sFlex2 = st.getStemFromSuffixCode(sFlex, sSfxCode)
                    sFlex2 = st.changeWordWithSuffixCode(sFlex, sSfxCode)
                    #print(sFlex2, sStem, sTag2)
                    yield (sFlex2, sStem, sTag2)
    if nErr:
        print(" # Lines ignored: {:>10}".format(nErr))



159
160
161
162
163
164
165
166

167
168

169
170
171
172
173
174
175
159
160
161
162
163
164
165

166
167

168
169
170
171
172
173
174
175







-
+

-
+







        self.nChar = len(dChar)
        self.nAff = nAff
        self.lArcVal = lVal
        self.nArcVal = len(lVal)
        self.nTag = self.nArcVal - self.nChar - nAff
        self.cStemming = cStemming
        if cStemming == "A":
            self.funcStemming = st.getStemFromAffixCode
            self.funcStemming = st.changeWordWithAffixCode
        elif cStemming == "S":    
            self.funcStemming = st.getStemFromSuffixCode
            self.funcStemming = st.changeWordWithSuffixCode
        else:
            self.funcStemming = st.noStemming
        
        # build
        lWord.sort()
        oProgBar = ProgressBar(0, len(lWord))
        for word in lWord:

Modified gc_core/py/ibdawg.py from [9ce1ce821d] to [095d971150].

40
41
42
43
44
45
46
47

48
49

50
51
52
53
54
55
56
40
41
42
43
44
45
46

47
48

49
50
51
52
53
54
55
56







-
+

-
+







        self.nBytesNodeAddress = int(l[3])
        self.nEntries = int(l[4])
        self.nNode = int(l[5])
        self.nArc = int(l[6])
        self.nAff = int(l[7])
        self.cStemming = l[8]
        if self.cStemming == "S":
            self.funcStemming = st.getStemFromSuffixCode
            self.funcStemming = st.changeWordWithSuffixCode
        elif self.cStemming == "A":
            self.funcStemming = st.getStemFromAffixCode
            self.funcStemming = st.changeWordWithAffixCode
        else:
            self.funcStemming = st.noStemming
        self.nTag = self.nArcVal - self.nChar - self.nAff
        self.dChar = {}
        for i in range(1, self.nChar):
            self.dChar[self.lArcVal[i]] = i
            

Modified gc_core/py/str_transform.py from [e86906e5ce] to [7df400eceb].

69
70
71
72
73
74
75
76

77
78
79


80
81
82
83
84
85
86
69
70
71
72
73
74
75

76
77


78
79
80
81
82
83
84
85
86







-
+

-
-
+
+







    jSfx = 0
    for i in range(min(len(sFlex), len(sStem))):
        if sFlex[i] != sStem[i]:
            break
        jSfx += 1
    return chr(len(sFlex)-jSfx+48) + sStem[jSfx:]  

# Diff hunk: getStemFromSuffixCode is renamed to changeWordWithSuffixCode and
# its first parameter sFlex is renamed to sWord. Each removed line is printed
# directly above the line that replaces it; the logic is the same in both
# versions.
#
# Applies a suffix code to a word and returns the resulting string:
#   - code "0": the word is returned unchanged;
#   - otherwise the first character of the code, read as ord(c) - 48, is the
#     number of trailing characters to drop, and the rest of the code
#     (sSfxCode[1:]) is appended;
#   - when that first character is '0', nothing is dropped and sSfxCode[1:]
#     is simply appended.
# removed: old signature
def getStemFromSuffixCode (sFlex, sSfxCode):
# added: new signature
def changeWordWithSuffixCode (sWord, sSfxCode):
    if sSfxCode == "0":
        # removed: old body (two lines, using sFlex)
        return sFlex
    return sFlex[:-(ord(sSfxCode[0])-48)] + sSfxCode[1:]  if sSfxCode[0] != '0'  else sFlex + sSfxCode[1:]
        # added: new body (same two lines, using sWord)
        return sWord
    return sWord[:-(ord(sSfxCode[0])-48)] + sSfxCode[1:]  if sSfxCode[0] != '0'  else sWord + sSfxCode[1:]


# Prefix and suffix
def defineAffixCode (sFlex, sStem):
    """ Returns a string defining how to get stem from flexion. Examples:
            "0" if stem = flexion
            "stem" if no common substring
120
121
122
123
124
125
126
127

128
129

130
131
132
133
134


135
120
121
122
123
124
125
126

127
128

129
130
131
132


133
134
135







-
+

-
+



-
-
+
+

                if M[x][y] > longest:
                    longest = M[x][y]
                    x_longest = x
            else:
                M[x][y] = 0
    return s1[x_longest-longest : x_longest]

# Diff hunk: getStemFromAffixCode is renamed to changeWordWithAffixCode and
# its first parameter sFlex is renamed to sWord. Each removed line is printed
# directly above the line that replaces it; the logic is the same in both
# versions.
#
# Applies an affix code of the form "<prefix code>/<suffix code>" to a word:
#   - code "0": the word is returned unchanged;
#   - a code without '/' yields the literal string "# error #";
#   - prefix step: drop the first ord(sPfxCode[0]) - 48 characters of the
#     word and put sPfxCode[1:] in front of what remains;
#   - suffix step: drop the last ord(sSfxCode[0]) - 48 characters and append
#     sSfxCode[1:]; when sSfxCode[0] is '0' nothing is dropped.
# removed: old signature
def getStemFromAffixCode (sFlex, sAffCode):
# added: new signature
def changeWordWithAffixCode (sWord, sAffCode):
    if sAffCode == "0":
        # removed
        return sFlex
        # added
        return sWord
    if '/' not in sAffCode:
        return "# error #"
    sPfxCode, sSfxCode = sAffCode.split('/')
    # removed: old prefix and suffix steps (using sFlex)
    sFlex = sPfxCode[1:] + sFlex[(ord(sPfxCode[0])-48):] 
    return sFlex[:-(ord(sSfxCode[0])-48)] + sSfxCode[1:]  if sSfxCode[0] != '0'  else sFlex + sSfxCode[1:]
    # added: new prefix and suffix steps (using sWord)
    sWord = sPfxCode[1:] + sWord[(ord(sPfxCode[0])-48):] 
    return sWord[:-(ord(sSfxCode[0])-48)] + sSfxCode[1:]  if sSfxCode[0] != '0'  else sWord + sSfxCode[1:]