Grammalecte  Diff

Differences From Artifact [eb4c8506bd]:

To Artifact [1a8e51e627]:


25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
25
26
27
28
29
30
31















































32
33
34
35
36
37
38







-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-







            for sLine in hSrc:
                sLine = sLine.strip()
                if sLine and not sLine.startswith("#"):
                    yield sLine
    else:
        raise OSError("# Error. File not found or not loadable: " + spf)


def getElemsFromFile (spf):
    "returns tuple of (flexion, stem, tags) from lexicon file"
    nErr = 0
    if not spf.endswith(".clex"):
        for sLine in readFile(spf):
            try:
                sFlex, sStem, sTag = sLine.split("\t")
                yield (sFlex, sStem, sTag)
            except:
                nErr += 1
    else:
        sTag = "_" # neutral tag
        sTag2 = ""
        for sLine in readFile(spf):
            if sLine.startswith("[") and sLine.endswith("]"):
                # tag line
                if "-->" in sLine:
                    try:
                        sTag, sSfxCode, sTag2 = sLine[1:-1].split(" --> ")
                    except:
                        nErr += 1
                        continue
                    sTag = sTag.strip()
                    sSfxCode = sSfxCode.strip()
                    sTag2 = sTag2.strip()
                else:
                    sTag = sLine[1:-1]
                    sTag2 = ""
            else:
                # entry line
                if "\t" in sLine:
                    if sLine.count("\t") > 1:
                        nErr += 1
                        continue
                    sFlex, sStem = sLine.split("\t")
                else:
                    sFlex = sStem = sLine
                #print(sFlex, sStem, sTag)
                yield (sFlex, sStem, sTag)
                if sTag2:
                    sFlex2 = st.changeWordWithSuffixCode(sFlex, sSfxCode)
                    #print(sFlex2, sStem, sTag2)
                    yield (sFlex2, sStem, sTag2)
    if nErr:
        print(" # Lines ignored: {:>10}".format(nErr))



class DAWG:
    """DIRECT ACYCLIC WORD GRAPH"""
    # This code is inspired from Steve Hanov’s DAWG, 2011. (http://stevehanov.ca/blog/index.php?id=115)
    # We store suffix/affix codes and tags within the graph after the “real” word.
    # A word is a list of numbers [ c1, c2, c3 . . . cN, iAffix, iTags]
101
102
103
104
105
106
107

108

109
110
111
112
113
114
115
54
55
56
57
58
59
60
61

62
63
64
65
66
67
68
69







+
-
+







        lEntry = []
        lChar = ['']; dChar = {}; nChar = 1; dCharOccur = {}
        lAff  = [];   dAff  = {}; nAff  = 0; dAffOccur = {}
        lTag  = [];   dTag  = {}; nTag  = 0; dTagOccur = {}
        nErr = 0
        
        # read lexicon
        for sLine in readFile(spfSrc):
        for sFlex, sStem, sTag in getElemsFromFile(spfSrc):
            sFlex, sStem, sTag = sLine.split("\t")
            addWordToCharDict(sFlex)
            # chars
            for c in sFlex:
                if c not in dChar:
                    dChar[c] = nChar
                    lChar.append(c)
                    nChar += 1