Grammalecte: Check-in [3916c538b5]

Overview
Comment: [core] dawg: accept personal lexicon
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | trunk | core | new_feature
Files: files | file ages | folders
SHA3-256: 3916c538b58d03f2b51613bc833cdae99a4e314baa760e9df06d5009a8468214
User & Date: olr on 2017-06-23 14:43:25
Other Links: manifest | tags
Context
2017-06-23 17:11  [core] dawg: compressed lexicon (check-in: e5f3698eb4, user: olr, tags: trunk, build, new_feature)
2017-06-23 14:43  [core] dawg: accept personal lexicon (check-in: 3916c538b5, user: olr, tags: trunk, core, new_feature)
2017-06-23 13:19  [build] lex_build.py: main() + options (check-in: e091821b50, user: olr, tags: trunk, build)
Changes

Modified gc_core/py/dawg.py from [a9c487538f] to [a30caaeab0].

Old version [a9c487538f], lines 12-91:

import sys
import os
import collections

from . import str_transform as st
from .progressbar import ProgressBar


class DAWG:
    """DIRECT ACYCLIC WORD GRAPH"""
    # This code is inspired from Steve Hanov’s DAWG, 2011. (http://stevehanov.ca/blog/index.php?id=115)
    # We store suffix/affix codes and tags within the graph after the “real” word.
    # A word is a list of numbers [ c1, c2, c3 . . . cN, iAffix, iTags]
    # Each arc is an index in self.lArcVal, where are stored characters, suffix/affix codes for stemming and tags.
    # Important: As usual, the last node (after ‘iTags’) is tagged final, AND the node after ‘cN’ is ALSO tagged final.

    def __init__ (self, spfSrc, sLangName, cStemming):
        print("===== Direct Acyclic Word Graph - Minimal Acyclic Finite State Automaton =====")
        cStemming = cStemming.upper()
        if cStemming == "A":
            funcStemmingGen = st.defineAffixCode
        elif cStemming == "S":
            funcStemmingGen = st.defineSuffixCode
        elif cStemming == "N":
            funcStemmingGen = st.noStemming
        else:
            print("# Error code: {}".format(cStemming))
            exit()

        lEntry = []
        lChar = ['']; dChar = {}; nChar = 1; dCharOccur = {}
        lAff  = [];   dAff  = {}; nAff  = 0; dAffOccur = {}
        lTag  = [];   dTag  = {}; nTag  = 0; dTagOccur = {}
        nErr = 0
        
        # read lexicon
        with open(spfSrc, 'r', encoding='utf-8') as hSrc:
            print(" > Reading lexicon: " + spfSrc + " ...")
            for line in hSrc:
                line = line.strip()
                if not (line.startswith('#') or line == ''):
                    try:
                        flex, stem, tag = line.split("\t")
                    except:
                        nErr += 1
                        continue
                    # chars
                    for c in flex:
                        if c not in dChar:
                            dChar[c] = nChar
                            lChar.append(c)
                            nChar += 1
                        dCharOccur[c] = dCharOccur.get(c, 0) + 1
                    # affixes to find stem from flexion
                    aff = funcStemmingGen(flex, stem)
                    if aff not in dAff:
                        dAff[aff] = nAff
                        lAff.append(aff)
                        nAff += 1
                    dAffOccur[aff] = dAffOccur.get(aff, 0) + 1
                    # tags
                    if tag not in dTag:
                        dTag[tag] = nTag
                        lTag.append(tag)
                        nTag += 1
                    dTagOccur[tag] = dTagOccur.get(tag, 0) + 1
                    lEntry.append((flex, dAff[aff], dTag[tag]))
            hSrc.close()
        if nErr:
            print(" # Lines ignored: {:>10}".format(nErr))
        if not(lEntry):
            print(" # Empty lexicon")
            exit()
        
        # Preparing DAWG
        print(" > Preparing list of words")
        lVal = lChar + lAff + lTag
        lWord = [ [dChar[c] for c in sFlex] + [iAff+nChar] + [iTag+nChar+nAff]  for sFlex, iAff, iTag in lEntry ]
        lEntry = None
        

New version [a30caaeab0], lines 12-117:

import sys
import os
import collections

from . import str_transform as st
from .progressbar import ProgressBar


def readFile (spf):
    if os.path.isfile(spf):
        with open(spf, "r", encoding="utf-8") as hSrc:
            for sLine in hSrc:
                sLine = sLine.strip()
                if sLine and not sLine.startswith("#"):
                    yield sLine
    else:
        raise OSError("# Error. File not found or not loadable: " + spf)


def getElemsFromFile (spf, bCompressedDic=False):
    nErr = 0
    if not bCompressedDic:
        for sLine in readFile(spf):
            try:
                sFlex, sStem, sTag = sLine.split("\t")
                yield (sFlex, sStem, sTag)
            except:
                nErr += 1
    else:
        sTag = ":_" # neutral tag
        for sLine in readFile(spf):
            if sLine.startswith("[") and sLine.endswith("]"):
                sTag = sLine[1:-1]
                continue
            else:
                if "\t" in sLine:
                    if sLine.count("\t") > 1:
                        nErr += 1
                        continue
                    sFlex, sStem = sLine.split("\t")
                else:
                    sFlex = sStem = sLine
                yield (sFlex, sStem, sTag)
    if nErr:
        print(" # Lines ignored: {:>10}".format(nErr))

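# Note on the two input formats handled above (illustrative; the tag value below
# is an invented example, not taken from the repository):
#   - bCompressedDic=False: one entry per line, as "flexion<TAB>stem<TAB>tag";
#   - bCompressedDic=True:  a bracketed line such as "[:N:m:s]" sets the current tag
#     for the entries that follow; each entry is then "flexion<TAB>stem", or just
#     "flexion" when the stem is identical to the flexion.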


class DAWG:
    """DIRECT ACYCLIC WORD GRAPH"""
    # This code is inspired from Steve Hanov’s DAWG, 2011. (http://stevehanov.ca/blog/index.php?id=115)
    # We store suffix/affix codes and tags within the graph after the “real” word.
    # A word is a list of numbers [ c1, c2, c3 . . . cN, iAffix, iTags]
    # Each arc is an index in self.lArcVal, where are stored characters, suffix/affix codes for stemming and tags.
    # Important: As usual, the last node (after ‘iTags’) is tagged final, AND the node after ‘cN’ is ALSO tagged final.
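    # Illustrative example (values invented): with dChar = {"a": 1, "b": 2}, the entry
    # ("ab", iAff=0, iTag=0) is stored as the word [1, 2, 0+nChar, 0+nChar+nAff]
    # (see the lWord comprehension in __init__ below).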

    def __init__ (self, spfSrc, sLangName, cStemming, bCompressedDic=False):
        print("===== Direct Acyclic Word Graph - Minimal Acyclic Finite State Automaton =====")
        cStemming = cStemming.upper()
        if cStemming == "A":
            funcStemmingGen = st.defineAffixCode
        elif cStemming == "S":
            funcStemmingGen = st.defineSuffixCode
        elif cStemming == "N":
            funcStemmingGen = st.noStemming
        else:
            raise ValueError("# Error. Unknown stemming code: {}".format(cStemming))


        lEntry = []
        lChar = ['']; dChar = {}; nChar = 1; dCharOccur = {}
        lAff  = [];   dAff  = {}; nAff  = 0; dAffOccur = {}
        lTag  = [];   dTag  = {}; nTag  = 0; dTagOccur = {}
        nErr = 0
        
        # read lexicon
        for sFlex, sStem, sTag in getElemsFromFile(spfSrc, bCompressedDic):
            # chars
            for c in sFlex:
                if c not in dChar:
                    dChar[c] = nChar
                    lChar.append(c)
                    nChar += 1
                dCharOccur[c] = dCharOccur.get(c, 0) + 1
            # affixes to find stem from flexion
            aff = funcStemmingGen(sFlex, sStem)
            if aff not in dAff:
                dAff[aff] = nAff
                lAff.append(aff)
                nAff += 1
            dAffOccur[aff] = dAffOccur.get(aff, 0) + 1
            # tags
            if sTag not in dTag:
                dTag[sTag] = nTag
                lTag.append(sTag)
                nTag += 1
            dTagOccur[sTag] = dTagOccur.get(sTag, 0) + 1
            lEntry.append((sFlex, dAff[aff], dTag[sTag]))
        if not lEntry:
            raise ValueError("# Error. Empty lexicon")

        
        # Preparing DAWG
        print(" > Preparing list of words")
        lVal = lChar + lAff + lTag
        lWord = [ [dChar[c] for c in sFlex] + [iAff+nChar] + [iTag+nChar+nAff]  for sFlex, iAff, iTag in lEntry ]
        lEntry = None
        

Old version [a9c487538f], lines 111-125:

        self.nArc = 0
        self.dChar = dChar
        self.nChar = len(dChar)
        self.nAff = nAff
        self.lArcVal = lVal
        self.nArcVal = len(lVal)
        self.nTag = self.nArcVal - self.nChar - nAff
        self.cStemming = cStemming.upper()
        if cStemming == "A":
            self.funcStemming = st.getStemFromAffixCode
        elif cStemming == "S":    
            self.funcStemming = st.getStemFromSuffixCode
        else:
            self.funcStemming = st.noStemming
        

New version [a30caaeab0], lines 137-151:

        self.nArc = 0
        self.dChar = dChar
        self.nChar = len(dChar)
        self.nAff = nAff
        self.lArcVal = lVal
        self.nArcVal = len(lVal)
        self.nTag = self.nArcVal - self.nChar - nAff
        self.cStemming = cStemming
        if cStemming == "A":
            self.funcStemming = st.getStemFromAffixCode
        elif cStemming == "S":    
            self.funcStemming = st.getStemFromSuffixCode
        else:
            self.funcStemming = st.noStemming
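
Usage sketch: the snippet below shows how a personal lexicon could be fed to the modified constructor. Only the DAWG(...) signature and the behaviour of getElemsFromFile() come from the code above; the import path, file name, language label, stemming code and tag value are assumptions made for illustration.

# Hypothetical sketch, not part of the check-in; adjust the import to the build layout.
from gc_core.py.dawg import DAWG

# personal_lexicon.txt, written in the "compressed" format read by getElemsFromFile():
#   [:N:m:s]
#   grammalecte
#   fossile<TAB>fossile
# Build the graph from it, with suffix-code stemming ("S"):
oDAWG = DAWG("personal_lexicon.txt", "French", "S", bCompressedDic=True)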