Grammalecte  Check-in [766f20e23c]

Overview
Comment:[core] str_transform: change functions names
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | trunk | core
Files: files | file ages | folders
SHA3-256: 766f20e23c4c92bb43d777545a47444788a877166ecc97bfeb31414b3021a599
User & Date: olr on 2017-06-23 17:25:20
Other Links: manifest | tags
Context
2017-06-23
19:23
[build] use one dictionary name instead of two check-in: cfc69abb68 user: olr tags: trunk, build
17:25
[core] str_transform: change functions names check-in: 766f20e23c user: olr tags: trunk, core
17:11
[core] dawg: compressed lexicon check-in: e5f3698eb4 user: olr tags: trunk, build, new_feature
Changes

Modified gc_core/py/dawg.py from [7e6ed7295c] to [ddd6fe1cc6].

14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
import collections

from . import str_transform as st
from .progressbar import ProgressBar


def readFile (spf):
    print("Read lexicon: " + spf)
    if os.path.isfile(spf):
        with open(spf, "r", encoding="utf-8") as hSrc:
            for sLine in hSrc:
                sLine = sLine.strip()
                if sLine and not sLine.startswith("#"):
                    yield sLine
    else:







|







14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
import collections

from . import str_transform as st
from .progressbar import ProgressBar


def readFile (spf):
    print(" < Read lexicon: " + spf)
    if os.path.isfile(spf):
        with open(spf, "r", encoding="utf-8") as hSrc:
            for sLine in hSrc:
                sLine = sLine.strip()
                if sLine and not sLine.startswith("#"):
                    yield sLine
    else:
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
                        continue
                    sFlex, sStem = sLine.split("\t")
                else:
                    sFlex = sStem = sLine
                #print(sFlex, sStem, sTag)
                yield (sFlex, sStem, sTag)
                if sTag2:
                    sFlex2 = st.getStemFromSuffixCode(sFlex, sSfxCode)
                    #print(sFlex2, sStem, sTag2)
                    yield (sFlex2, sStem, sTag2)
    if nErr:
        print(" # Lines ignored: {:>10}".format(nErr))










|







65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
                        continue
                    sFlex, sStem = sLine.split("\t")
                else:
                    sFlex = sStem = sLine
                #print(sFlex, sStem, sTag)
                yield (sFlex, sStem, sTag)
                if sTag2:
                    sFlex2 = st.changeWordWithSuffixCode(sFlex, sSfxCode)
                    #print(sFlex2, sStem, sTag2)
                    yield (sFlex2, sStem, sTag2)
    if nErr:
        print(" # Lines ignored: {:>10}".format(nErr))



159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
        self.nChar = len(dChar)
        self.nAff = nAff
        self.lArcVal = lVal
        self.nArcVal = len(lVal)
        self.nTag = self.nArcVal - self.nChar - nAff
        self.cStemming = cStemming
        if cStemming == "A":
            self.funcStemming = st.getStemFromAffixCode
        elif cStemming == "S":    
            self.funcStemming = st.getStemFromSuffixCode
        else:
            self.funcStemming = st.noStemming
        
        # build
        lWord.sort()
        oProgBar = ProgressBar(0, len(lWord))
        for word in lWord:







|

|







159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
        self.nChar = len(dChar)
        self.nAff = nAff
        self.lArcVal = lVal
        self.nArcVal = len(lVal)
        self.nTag = self.nArcVal - self.nChar - nAff
        self.cStemming = cStemming
        if cStemming == "A":
            self.funcStemming = st.changeWordWithAffixCode
        elif cStemming == "S":    
            self.funcStemming = st.changeWordWithSuffixCode
        else:
            self.funcStemming = st.noStemming
        
        # build
        lWord.sort()
        oProgBar = ProgressBar(0, len(lWord))
        for word in lWord:

Modified gc_core/py/ibdawg.py from [9ce1ce821d] to [095d971150].

40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
        self.nBytesNodeAddress = int(l[3])
        self.nEntries = int(l[4])
        self.nNode = int(l[5])
        self.nArc = int(l[6])
        self.nAff = int(l[7])
        self.cStemming = l[8]
        if self.cStemming == "S":
            self.funcStemming = st.getStemFromSuffixCode
        elif self.cStemming == "A":
            self.funcStemming = st.getStemFromAffixCode
        else:
            self.funcStemming = st.noStemming
        self.nTag = self.nArcVal - self.nChar - self.nAff
        self.dChar = {}
        for i in range(1, self.nChar):
            self.dChar[self.lArcVal[i]] = i
            







|

|







40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
        self.nBytesNodeAddress = int(l[3])
        self.nEntries = int(l[4])
        self.nNode = int(l[5])
        self.nArc = int(l[6])
        self.nAff = int(l[7])
        self.cStemming = l[8]
        if self.cStemming == "S":
            self.funcStemming = st.changeWordWithSuffixCode
        elif self.cStemming == "A":
            self.funcStemming = st.changeWordWithAffixCode
        else:
            self.funcStemming = st.noStemming
        self.nTag = self.nArcVal - self.nChar - self.nAff
        self.dChar = {}
        for i in range(1, self.nChar):
            self.dChar[self.lArcVal[i]] = i
            

Modified gc_core/py/str_transform.py from [e86906e5ce] to [7df400eceb].

69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
    jSfx = 0
    for i in range(min(len(sFlex), len(sStem))):
        if sFlex[i] != sStem[i]:
            break
        jSfx += 1
    return chr(len(sFlex)-jSfx+48) + sStem[jSfx:]  

def getStemFromSuffixCode (sFlex, sSfxCode):
    if sSfxCode == "0":
        return sFlex
    return sFlex[:-(ord(sSfxCode[0])-48)] + sSfxCode[1:]  if sSfxCode[0] != '0'  else sFlex + sSfxCode[1:]


# Prefix and suffix
def defineAffixCode (sFlex, sStem):
    """ Returns a string defining how to get stem from flexion. Examples:
            "0" if stem = flexion
            "stem" if no common substring







|

|
|







69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
    jSfx = 0
    for i in range(min(len(sFlex), len(sStem))):
        if sFlex[i] != sStem[i]:
            break
        jSfx += 1
    return chr(len(sFlex)-jSfx+48) + sStem[jSfx:]  

def changeWordWithSuffixCode (sWord, sSfxCode):
    if sSfxCode == "0":
        return sWord
    return sWord[:-(ord(sSfxCode[0])-48)] + sSfxCode[1:]  if sSfxCode[0] != '0'  else sWord + sSfxCode[1:]


# Prefix and suffix
def defineAffixCode (sFlex, sStem):
    """ Returns a string defining how to get stem from flexion. Examples:
            "0" if stem = flexion
            "stem" if no common substring
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
                if M[x][y] > longest:
                    longest = M[x][y]
                    x_longest = x
            else:
                M[x][y] = 0
    return s1[x_longest-longest : x_longest]

def getStemFromAffixCode (sFlex, sAffCode):
    if sAffCode == "0":
        return sFlex
    if '/' not in sAffCode:
        return "# error #"
    sPfxCode, sSfxCode = sAffCode.split('/')
    sFlex = sPfxCode[1:] + sFlex[(ord(sPfxCode[0])-48):] 
    return sFlex[:-(ord(sSfxCode[0])-48)] + sSfxCode[1:]  if sSfxCode[0] != '0'  else sFlex + sSfxCode[1:]








|

|



|
|

120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
                if M[x][y] > longest:
                    longest = M[x][y]
                    x_longest = x
            else:
                M[x][y] = 0
    return s1[x_longest-longest : x_longest]

def changeWordWithAffixCode (sWord, sAffCode):
    if sAffCode == "0":
        return sWord
    if '/' not in sAffCode:
        return "# error #"
    sPfxCode, sSfxCode = sAffCode.split('/')
    sWord = sPfxCode[1:] + sWord[(ord(sPfxCode[0])-48):] 
    return sWord[:-(ord(sSfxCode[0])-48)] + sSfxCode[1:]  if sSfxCode[0] != '0'  else sWord + sSfxCode[1:]