23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
|
def readFile (spf):
print(" < Read lexicon: " + spf)
if os.path.isfile(spf):
with open(spf, "r", encoding="utf-8") as hSrc:
for sLine in hSrc:
sLine = sLine.strip()
if sLine and not sLine.startswith("#"):
yield sLine
else:
raise OSError("# Error. File not found or not loadable: " + spf)
class DAWG:
"""DIRECT ACYCLIC WORD GRAPH"""
# This code is inspired from Steve Hanov’s DAWG, 2011. (http://stevehanov.ca/blog/index.php?id=115)
# We store suffix/affix codes and tags within the graph after the “real” word.
# A word is a list of numbers [ c1, c2, c3 . . . cN, iAffix, iTags]
# Each arc is an index in self.lArcVal, where are stored characters, suffix/affix codes for stemming and tags.
# Important: As usual, the last node (after ‘iTags’) is tagged final, AND the node after ‘cN’ is ALSO tagged final.
def __init__ (self, spfSrc, cStemming, sLangCode, sLangName="", sDicName=""):
print("===== Direct Acyclic Word Graph - Minimal Acyclic Finite State Automaton =====")
cStemming = cStemming.upper()
if cStemming == "A":
funcStemmingGen = st.defineAffixCode
elif cStemming == "S":
funcStemmingGen = st.defineSuffixCode
elif cStemming == "N":
funcStemmingGen = st.noStemming
else:
raise ValueError("# Error. Unknown stemming code: {}".format(cStemming))
lEntry = []
lChar = ['']; dChar = {}; nChar = 1; dCharOccur = {}
lAff = []; dAff = {}; nAff = 0; dAffOccur = {}
lTag = []; dTag = {}; nTag = 0; dTagOccur = {}
nErr = 0
# read lexicon
for sLine in readFile(spfSrc):
sFlex, sStem, sTag = sLine.split("\t")
addWordToCharDict(sFlex)
# chars
for c in sFlex:
if c not in dChar:
dChar[c] = nChar
lChar.append(c)
nChar += 1
|
|
|
|
>
|
>
>
|
|
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
|
def readFile (spf):
print(" < Read lexicon: " + spf)
if os.path.isfile(spf):
with open(spf, "r", encoding="utf-8") as hSrc:
for sLine in hSrc:
sLine = sLine.strip()
if sLine and not sLine.startswith("#"):
yield sLine.split("\t")
else:
raise OSError("# Error. File not found or not loadable: " + spf)
class DAWG:
"""DIRECT ACYCLIC WORD GRAPH"""
# This code is inspired from Steve Hanov’s DAWG, 2011. (http://stevehanov.ca/blog/index.php?id=115)
# We store suffix/affix codes and tags within the graph after the “real” word.
# A word is a list of numbers [ c1, c2, c3 . . . cN, iAffix, iTags]
# Each arc is an index in self.lArcVal, where are stored characters, suffix/affix codes for stemming and tags.
# Important: As usual, the last node (after ‘iTags’) is tagged final, AND the node after ‘cN’ is ALSO tagged final.
def __init__ (self, src, cStemming, sLangCode, sLangName="", sDicName=""):
print("===== Direct Acyclic Word Graph - Minimal Acyclic Finite State Automaton =====")
cStemming = cStemming.upper()
if cStemming == "A":
funcStemmingGen = st.defineAffixCode
elif cStemming == "S":
funcStemmingGen = st.defineSuffixCode
elif cStemming == "N":
funcStemmingGen = st.noStemming
else:
raise ValueError("# Error. Unknown stemming code: {}".format(cStemming))
lEntry = []
lChar = ['']; dChar = {}; nChar = 1; dCharOccur = {}
lAff = []; dAff = {}; nAff = 0; dAffOccur = {}
lTag = []; dTag = {}; nTag = 0; dTagOccur = {}
nErr = 0
# read lexicon
if type(src) is str:
iterable = readFile(src)
else:
iterable = src
for sFlex, sStem, sTag in iterable:
addWordToCharDict(sFlex)
# chars
for c in sFlex:
if c not in dChar:
dChar[c] = nChar
lChar.append(c)
nChar += 1
|
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
|
lWord = [ [dChar[c] for c in sFlex] + [iAff+nChar] + [iTag+nChar+nAff] for sFlex, iAff, iTag in lEntry ]
lEntry = None
# Dictionary of arc values occurrency, to sort arcs of each node
dValOccur = dict( [ (dChar[c], dCharOccur[c]) for c in dChar ] \
+ [ (dAff[aff]+nChar, dAffOccur[aff]) for aff in dAff ] \
+ [ (dTag[tag]+nChar+nAff, dTagOccur[tag]) for tag in dTag ] )
#with open(spfSrc[:-8]+".valuesfreq.txt", 'w', encoding='utf-8') as hFreqDst: # DEBUG
# for iKey, nOcc in sorted(dValOccur.items(), key=lambda t: t[1], reverse=True):
# hFreqDst.write("{}: {}\n".format(lVal[iKey], nOcc))
# hFreqDst.close()
self.sFileName = spfSrc
self.sLangCode = sLangCode
self.sLangName = sLangName
self.sDicName = sDicName
self.nEntry = len(lWord)
self.aPreviousEntry = []
DawgNode.resetNextId()
self.oRoot = DawgNode()
|
<
<
<
<
|
|
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
|
lWord = [ [dChar[c] for c in sFlex] + [iAff+nChar] + [iTag+nChar+nAff] for sFlex, iAff, iTag in lEntry ]
lEntry = None
# Dictionary of arc values occurrency, to sort arcs of each node
dValOccur = dict( [ (dChar[c], dCharOccur[c]) for c in dChar ] \
+ [ (dAff[aff]+nChar, dAffOccur[aff]) for aff in dAff ] \
+ [ (dTag[tag]+nChar+nAff, dTagOccur[tag]) for tag in dTag ] )
self.sFileName = src if type(src) is str else "[None]"
self.sLangCode = sLangCode
self.sLangName = sLangName
self.sDicName = sDicName
self.nEntry = len(lWord)
self.aPreviousEntry = []
DawgNode.resetNextId()
self.oRoot = DawgNode()
|