Grammalecte Check-in [3916c538b5]

Overview
Comment: [core] dawg: accept personal lexicon
SHA3-256: 3916c538b58d03f2b51613bc833cdae99a4e314baa760e9df06d5009a8468214
User & Date: olr on 2017-06-23 14:43:25
Context
2017-06-23
17:11  [core] dawg: compressed lexicon  (check-in: e5f3698eb4, user: olr, tags: trunk, build, new_feature)
14:43  [core] dawg: accept personal lexicon  (check-in: 3916c538b5, user: olr, tags: trunk, core, new_feature)
13:19  [build] lex_build.py: main() + options  (check-in: e091821b50, user: olr, tags: trunk, build)
Changes

Modified gc_core/py/dawg.py from [a9c487538f] to [a30caaeab0].
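This check-in lets the DAWG builder accept a personal lexicon: two helper generators, readFile() and getElemsFromFile(), are added, and DAWG.__init__() gains a bCompressedDic parameter. In the diff below, lines prefixed with "+" are added by this check-in and lines prefixed with "-" are removed. As a quick illustration of the two input formats the new parser handles (inferred from the added code; the entries and tags are made-up examples, and <TAB> stands for a literal tab character):

    # Default format (bCompressedDic=False): one entry per line with three
    # tab-separated fields: flexion, stem, tag. Lines starting with "#" and
    # empty lines are skipped.
    vélo<TAB>vélo<TAB>:N:m:s
    vélos<TAB>vélo<TAB>:N:m:p

    # "Compressed" format (bCompressedDic=True): a [tag] line sets the tag
    # for the entries that follow; an entry is either "flexion<TAB>stem" or
    # a single word used as both flexion and stem.
    [:N:m:s]
    vélo
    [:N:m:p]
    vélos<TAB>vélo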

@@ -12,80 +12,106 @@
import sys
import os
import collections

from . import str_transform as st
from .progressbar import ProgressBar


+def readFile (spf):
+    if os.path.isfile(spf):
+        with open(spf, "r", encoding="utf-8") as hSrc:
+            for sLine in hSrc:
+                sLine = sLine.strip()
+                if sLine and not sLine.startswith("#"):
+                    yield sLine
+    else:
+        raise OSError("# Error. File not found or not loadable: " + spf)
+
+
+def getElemsFromFile (spf, bCompressedDic=False):
+    nErr = 0
+    if not bCompressedDic:
+        for sLine in readFile(spf):
+            try:
+                sFlex, sStem, sTag = sLine.split("\t")
+                yield (sFlex, sStem, sTag)
+            except:
+                nErr += 1
+    else:
+        sTag = ":_" # neutral tag
+        for sLine in readFile(spf):
+            if sLine.startswith("[") and sLine.endswith("]"):
+                sTag = sLine[1:-1]
+                continue
+            else:
+                if "\t" in sLine:
+                    if sLine.count("\t") > 1:
+                        nErr += 1
+                        continue
+                    sFlex, sStem = sLine.split("\t")
+                else:
+                    sFlex = sStem = sLine
+                yield (sFlex, sStem, sTag)
+    if nErr:
+        print(" # Lines ignored: {:>10}".format(nErr))



class DAWG:
    """DIRECT ACYCLIC WORD GRAPH"""
    # This code is inspired from Steve Hanov’s DAWG, 2011. (http://stevehanov.ca/blog/index.php?id=115)
    # We store suffix/affix codes and tags within the graph after the “real” word.
    # A word is a list of numbers [ c1, c2, c3 . . . cN, iAffix, iTags]
    # Each arc is an index in self.lArcVal, where are stored characters, suffix/affix codes for stemming and tags.
    # Important: As usual, the last node (after ‘iTags’) is tagged final, AND the node after ‘cN’ is ALSO tagged final.

-    def __init__ (self, spfSrc, sLangName, cStemming):
+    def __init__ (self, spfSrc, sLangName, cStemming, bCompressedDic=False):
        print("===== Direct Acyclic Word Graph - Minimal Acyclic Finite State Automaton =====")
        cStemming = cStemming.upper()
        if cStemming == "A":
            funcStemmingGen = st.defineAffixCode
        elif cStemming == "S":
            funcStemmingGen = st.defineSuffixCode
        elif cStemming == "N":
            funcStemmingGen = st.noStemming
        else:
-            print("# Error code: {}".format(cStemming))
+            raise ValueError("# Error. Unknown stemming code: {}".format(cStemming))
-            exit()

        lEntry = []
        lChar = ['']; dChar = {}; nChar = 1; dCharOccur = {}
        lAff  = [];   dAff  = {}; nAff  = 0; dAffOccur = {}
        lTag  = [];   dTag  = {}; nTag  = 0; dTagOccur = {}
        nErr = 0
        
        # read lexicon
-        with open(spfSrc, 'r', encoding='utf-8') as hSrc:
+        for sFlex, sStem, sTag in getElemsFromFile(spfSrc, bCompressedDic):
-            print(" > Reading lexicon: " + spfSrc + " ...")
-            for line in hSrc:
-                line = line.strip()
-                if not (line.startswith('#') or line == ''):
-                    try:
-                        flex, stem, tag = line.split("\t")
-                    except:
-                        nErr += 1
-                        continue
-                    # chars
-                    for c in flex:
-                        if c not in dChar:
-                            dChar[c] = nChar
-                            lChar.append(c)
-                            nChar += 1
-                        dCharOccur[c] = dCharOccur.get(c, 0) + 1
-                    # affixes to find stem from flexion
-                    aff = funcStemmingGen(flex, stem)
-                    if aff not in dAff:
-                        dAff[aff] = nAff
-                        lAff.append(aff)
-                        nAff += 1
-                    dAffOccur[aff] = dCharOccur.get(aff, 0) + 1
-                    # tags
-                    if tag not in dTag:
-                        dTag[tag] = nTag
-                        lTag.append(tag)
-                        nTag += 1
-                    dTagOccur[tag] = dTagOccur.get(tag, 0) + 1
-                    lEntry.append((flex, dAff[aff], dTag[tag]))
+            # chars
+            for c in sFlex:
+                if c not in dChar:
+                    dChar[c] = nChar
+                    lChar.append(c)
+                    nChar += 1
+                dCharOccur[c] = dCharOccur.get(c, 0) + 1
+            # affixes to find stem from flexion
+            aff = funcStemmingGen(sFlex, sStem)
+            if aff not in dAff:
+                dAff[aff] = nAff
+                lAff.append(aff)
+                nAff += 1
+            dAffOccur[aff] = dCharOccur.get(aff, 0) + 1
+            # tags
+            if sTag not in dTag:
+                dTag[sTag] = nTag
+                lTag.append(sTag)
+                nTag += 1
+            dTagOccur[sTag] = dTagOccur.get(sTag, 0) + 1
+            lEntry.append((sFlex, dAff[aff], dTag[sTag]))
-            hSrc.close()
-        if nErr:
-            print(" # Lines ignored: {:>10}".format(nErr))
-        if not(lEntry):
-            print(" # Empty lexicon")
+        if not lEntry:
+            raise ValueError("# Error. Empty lexicon")
-            exit()
        
        # Preparing DAWG
        print(" > Preparing list of words")
        lVal = lChar + lAff + lTag
        lWord = [ [dChar[c] for c in sFlex] + [iAff+nChar] + [iTag+nChar+nAff]  for sFlex, iAff, iTag in lEntry ]
        lEntry = None
        
@@ -111,15 +137,15 @@
        self.nArc = 0
        self.dChar = dChar
        self.nChar = len(dChar)
        self.nAff = nAff
        self.lArcVal = lVal
        self.nArcVal = len(lVal)
        self.nTag = self.nArcVal - self.nChar - nAff
-        self.cStemming = cStemming.upper()
+        self.cStemming = cStemming
        if cStemming == "A":
            self.funcStemming = st.getStemFromAffixCode
        elif cStemming == "S":    
            self.funcStemming = st.getStemFromSuffixCode
        else:
            self.funcStemming = st.noStemming
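
For context, a minimal sketch of how the modified constructor might be called with the new flag. The import path, file name and language label are placeholders and not part of this check-in; in the repository the build is normally driven by lex_build.py (see the sibling check-in above).

    # Hypothetical usage of the new bCompressedDic parameter (sketch only;
    # file name and language label are placeholders).
    # "S" selects suffix-code stemming, per the cStemming handling above.
    from dawg import DAWG   # assumed import path; dawg.py lives in gc_core/py/

    oDAWG = DAWG("my_personal_lexicon.txt", "French", "S", bCompressedDic=True)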