Grammalecte  Diff

Differences From Artifact [63684196d2]:

To Artifact [eb988983d4]:


10
11
12
13
14
15
16


17
18
19
20
21
22
23
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25







+
+









import sys
import os
import collections
import json
import time
import re
import traceback

from . import str_transform as st
from .progressbar import ProgressBar



def readFile (spf):
37
38
39
40
41
42
43
44

45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61







62
63
64
65
66
67

68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89






















90
91
92
93
94

95
96
97
98
99
100
101
39
40
41
42
43
44
45

46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77






















78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112







-
+

















+
+
+
+
+
+
+






+
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+





+







    """DIRECT ACYCLIC WORD GRAPH"""
    # This code is inspired from Steve Hanov’s DAWG, 2011. (http://stevehanov.ca/blog/index.php?id=115)
    # We store suffix/affix codes and tags within the graph after the “real” word.
    # A word is a list of numbers [ c1, c2, c3 . . . cN, iAffix, iTags]
    # Each arc is an index in self.lArcVal, where are stored characters, suffix/affix codes for stemming and tags.
    # Important: As usual, the last node (after ‘iTags’) is tagged final, AND the node after ‘cN’ is ALSO tagged final.

    def __init__ (self, src, cStemming, sLangCode, sLangName="", sDicName=""):
    def __init__ (self, src, cStemming, sLangCode, sLangName="", sDicName="", sSelectFilterRegex=""):
        print("===== Direct Acyclic Word Graph - Minimal Acyclic Finite State Automaton =====")
        cStemming = cStemming.upper()
        if cStemming == "A":
            funcStemmingGen = st.defineAffixCode
        elif cStemming == "S":
            funcStemmingGen = st.defineSuffixCode
        elif cStemming == "N":
            funcStemmingGen = st.noStemming
        else:
            raise ValueError("# Error. Unknown stemming code: {}".format(cStemming))

        aEntry = set()
        lChar = ['']; dChar = {}; nChar = 1; dCharOccur = {}
        lAff  = [];   dAff  = {}; nAff  = 0; dAffOccur = {}
        lTag  = [];   dTag  = {}; nTag  = 0; dTagOccur = {}
        nErr = 0

        try:
            zFilter = re.compile(sSelectFilterRegex)  if sSelectFilterRegex  else None
        except:
            print(" # Error. Wrong filter regex. Filter ignored.")
            traceback.print_exc()
            zFilter = None

        # read lexicon
        if type(src) is str:
            iterable = readFile(src)
        else:
            iterable = src
        for sFlex, sStem, sTag in iterable:
            if not zFilter or zFilter.search(sTag):
            addWordToCharDict(sFlex)
            # chars
            for c in sFlex:
                if c not in dChar:
                    dChar[c] = nChar
                    lChar.append(c)
                    nChar += 1
                dCharOccur[c] = dCharOccur.get(c, 0) + 1
            # affixes to find stem from flexion
            sAff = funcStemmingGen(sFlex, sStem)
            if sAff not in dAff:
                dAff[sAff] = nAff
                lAff.append(sAff)
                nAff += 1
            dAffOccur[sAff] = dCharOccur.get(sAff, 0) + 1
            # tags
            if sTag not in dTag:
                dTag[sTag] = nTag
                lTag.append(sTag)
                nTag += 1
            dTagOccur[sTag] = dTagOccur.get(sTag, 0) + 1
            aEntry.add((sFlex, dAff[sAff], dTag[sTag]))
                addWordToCharDict(sFlex)
                # chars
                for c in sFlex:
                    if c not in dChar:
                        dChar[c] = nChar
                        lChar.append(c)
                        nChar += 1
                    dCharOccur[c] = dCharOccur.get(c, 0) + 1
                # affixes to find stem from flexion
                sAff = funcStemmingGen(sFlex, sStem)
                if sAff not in dAff:
                    dAff[sAff] = nAff
                    lAff.append(sAff)
                    nAff += 1
                dAffOccur[sAff] = dCharOccur.get(sAff, 0) + 1
                # tags
                if sTag not in dTag:
                    dTag[sTag] = nTag
                    lTag.append(sTag)
                    nTag += 1
                dTagOccur[sTag] = dTagOccur.get(sTag, 0) + 1
                aEntry.add((sFlex, dAff[sAff], dTag[sTag]))
        if not aEntry:
            raise ValueError("# Error. Empty lexicon")
        
        # Preparing DAWG
        print(" > Preparing list of words")
        print(" Filter: " + (sSelectFilterRegex or "[None]"))
        lVal = lChar + lAff + lTag
        lWord = [ [dChar[c] for c in sFlex] + [iAff+nChar] + [iTag+nChar+nAff]  for sFlex, iAff, iTag in aEntry ]
        aEntry = None
        
        # Dictionary of arc values occurrency, to sort arcs of each node
        dValOccur = dict( [ (dChar[c], dCharOccur[c])  for c in dChar ] \
                        + [ (dAff[aff]+nChar, dAffOccur[aff]) for aff in dAff ] \