25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
|
for sLine in hSrc:
sLine = sLine.strip()
if sLine and not sLine.startswith("#"):
yield sLine
else:
raise OSError("# Error. File not found or not loadable: " + spf)
def getElemsFromFile (spf):
    """Generate (flexion, stem, tag) tuples from a lexicon file.

    Two input formats are handled, selected by the file name:

    * any name not ending in ".clex": every line must hold exactly three
      tab-separated fields ``flexion<TAB>stem<TAB>tag``;
    * a name ending in ".clex": the file alternates between tag lines
      and entry lines.
        - ``[tag]`` sets the tag applied to the following entries.
        - ``[tag --> suffix_code --> tag2]`` sets the tag and, in addition,
          makes every following entry yield a second tuple whose flexion is
          ``st.changeWordWithSuffixCode(flexion, suffix_code)`` with tag2.
        - an entry line is either ``flexion<TAB>stem`` or a single word used
          as both flexion and stem.

    Lines are obtained from ``readFile(spf)``. Malformed lines are skipped
    and counted; when the generator is exhausted and at least one line was
    skipped, the count is printed.

    Args:
        spf: path of the lexicon file, passed to ``readFile``.

    Yields:
        Tuples ``(sFlex, sStem, sTag)`` of strings.
    """
    nErr = 0
    if not spf.endswith(".clex"):
        for sLine in readFile(spf):
            # Only the parsing is guarded, and only for the error that a wrong
            # number of fields produces. The yield stays outside the try so
            # that GeneratorExit (early close) or an exception thrown in by
            # the consumer is never mistaken for a malformed line.
            try:
                sFlex, sStem, sTag = sLine.split("\t")
            except ValueError:
                nErr += 1
                continue
            yield (sFlex, sStem, sTag)
    else:
        sTag = "_" # neutral tag
        sTag2 = ""
        for sLine in readFile(spf):
            if sLine.startswith("[") and sLine.endswith("]"):
                # tag line
                if "-->" in sLine:
                    # NOTE(review): when this unpacking fails, the previous
                    # sTag / sSfxCode / sTag2 stay in effect for the entries
                    # that follow -- confirm this is the intended behavior.
                    try:
                        sTag, sSfxCode, sTag2 = sLine[1:-1].split(" --> ")
                    except ValueError:
                        nErr += 1
                        continue
                    sTag = sTag.strip()
                    sSfxCode = sSfxCode.strip()
                    sTag2 = sTag2.strip()
                else:
                    sTag = sLine[1:-1]
                    sTag2 = ""
            else:
                # entry line
                if "\t" in sLine:
                    if sLine.count("\t") > 1:
                        # more than two fields: not a valid entry
                        nErr += 1
                        continue
                    sFlex, sStem = sLine.split("\t")
                else:
                    sFlex = sStem = sLine
                yield (sFlex, sStem, sTag)
                if sTag2:
                    # sTag2 is only non-empty after a successfully parsed
                    # "-->" tag line, so sSfxCode is always bound here.
                    sFlex2 = st.changeWordWithSuffixCode(sFlex, sSfxCode)
                    yield (sFlex2, sStem, sTag2)
    if nErr:
        print(" # Lines ignored: {:>10}".format(nErr))
class DAWG:
"""DIRECT ACYCLIC WORD GRAPH"""
# This code is inspired from Steve Hanov’s DAWG, 2011. (http://stevehanov.ca/blog/index.php?id=115)
# We store suffix/affix codes and tags within the graph after the “real” word.
# A word is a list of numbers [ c1, c2, c3 . . . cN, iAffix, iTags]
|
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
|
25
26
27
28
29
30
31
32
33
34
35
36
37
38
|
for sLine in hSrc:
sLine = sLine.strip()
if sLine and not sLine.startswith("#"):
yield sLine
else:
raise OSError("# Error. File not found or not loadable: " + spf)
class DAWG:
"""DIRECT ACYCLIC WORD GRAPH"""
# This code is inspired from Steve Hanov’s DAWG, 2011. (http://stevehanov.ca/blog/index.php?id=115)
# We store suffix/affix codes and tags within the graph after the “real” word.
# A word is a list of numbers [ c1, c2, c3 . . . cN, iAffix, iTags]
|
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
|
lEntry = []
lChar = ['']; dChar = {}; nChar = 1; dCharOccur = {}
lAff = []; dAff = {}; nAff = 0; dAffOccur = {}
lTag = []; dTag = {}; nTag = 0; dTagOccur = {}
nErr = 0
# read lexicon
for sFlex, sStem, sTag in getElemsFromFile(spfSrc):
addWordToCharDict(sFlex)
# chars
for c in sFlex:
if c not in dChar:
dChar[c] = nChar
lChar.append(c)
nChar += 1
|
>
|
|
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
|
lEntry = []
lChar = ['']; dChar = {}; nChar = 1; dCharOccur = {}
lAff = []; dAff = {}; nAff = 0; dAffOccur = {}
lTag = []; dTag = {}; nTag = 0; dTagOccur = {}
nErr = 0
# read lexicon
for sLine in readFile(spfSrc):
sFlex, sStem, sTag = sLine.split("\t")
addWordToCharDict(sFlex)
# chars
for c in sFlex:
if c not in dChar:
dChar[c] = nChar
lChar.append(c)
nChar += 1
|