Grammalecte  Check-in [4414e62e0b]

Overview
Comment:[graphspell] code consistency
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | trunk | graphspell
Files: files | file ages | folders
SHA3-256: 4414e62e0be60582dd2d0e9b415093257d721843e17bd1958e11a68eb9a64e1e
User & Date: olr on 2018-02-05 10:34:21
Original Comment: [graphspell] str_transform: code clarification
Other Links: manifest | tags
Context
2018-02-06
10:22
[graphspell][py] dawg: code clarification check-in: b181054462 user: olr tags: trunk, graphspell
2018-02-05
10:34
[graphspell] code consistency check-in: 4414e62e0b user: olr tags: trunk, graphspell
2018-02-04
11:43
[fr] faux positif: syntagme adjectival check-in: d03269c260 user: olr tags: trunk, fr
Changes

Modified graphspell-js/dawg.js from [b32ed20eaa] to [fa338df471].

57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
                    dChar.set(c, nChar);
                    lChar.push(c);
                    nChar += 1;
                }
                dCharOccur.set(c, dCharOccur.gl_get(c, 0) + 1);
            }
            // affixes to find stem from flexion
            sAff = funcStemmingGen(sFlex, sStem);
            if (!dAff.get(sAff)) {
                dAff.set(sAff, nAff);
                lAff.push(sAff);
                nAff += 1;
            }
            dAffOccur.set(sAff, dCharOccur.gl_get(sAff, 0) + 1);
            // tags







|







57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
                    dChar.set(c, nChar);
                    lChar.push(c);
                    nChar += 1;
                }
                dCharOccur.set(c, dCharOccur.gl_get(c, 0) + 1);
            }
            // affixes to find stem from flexion
            let sAff = funcStemmingGen(sFlex, sStem);
            if (!dAff.get(sAff)) {
                dAff.set(sAff, nAff);
                lAff.push(sAff);
                nAff += 1;
            }
            dAffOccur.set(sAff, dCharOccur.gl_get(sAff, 0) + 1);
            // tags
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
        lEntry.length = 0; // clear the array
        
        // Dictionary of arc values occurrency, to sort arcs of each node
        let lKeyVal = [];
        for (let c of dChar.keys()) { lKeyVal.push([dChar[c], dCharOccur[c]]); }
        for (let sAff of dAff.keys()) { lKeyVal.push([dAff[sAff]+nChar, dAffOccur[sAff]]); }
        for (let sTag in dTag.keys()) { lKeyVal.push([dTag[sTag]+nChar+nAff, dTagOccur[sTag]]); }
        dValOccur = new Map(lKeyVal);
        lKeyVal.length = 0; // clear the array

        //with open(spfSrc[:-8]+".valuesfreq.txt", 'w', encoding='utf-8') as hFreqDst:  # DEBUG
        //    for iKey, nOcc in sorted(dValOccur.entries(), key=lambda t: t[1], reverse=True):
        //        hFreqDst.write("{}: {}\n".format(lVal[iKey], nOcc))
        //    hFreqDst.close()
        







|







97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
        lEntry.length = 0; // clear the array
        
        // Dictionary of arc values occurrency, to sort arcs of each node
        let lKeyVal = [];
        for (let c of dChar.keys()) { lKeyVal.push([dChar[c], dCharOccur[c]]); }
        for (let sAff of dAff.keys()) { lKeyVal.push([dAff[sAff]+nChar, dAffOccur[sAff]]); }
        for (let sTag in dTag.keys()) { lKeyVal.push([dTag[sTag]+nChar+nAff, dTagOccur[sTag]]); }
        let dValOccur = new Map(lKeyVal);
        lKeyVal.length = 0; // clear the array

        //with open(spfSrc[:-8]+".valuesfreq.txt", 'w', encoding='utf-8') as hFreqDst:  # DEBUG
        //    for iKey, nOcc in sorted(dValOccur.entries(), key=lambda t: t[1], reverse=True):
        //        hFreqDst.write("{}: {}\n".format(lVal[iKey], nOcc))
        //    hFreqDst.close()
        
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
    }
    
    _parseNodes (oNode) {
        // Warning: recursive method
        if (oNode.pos > 0) {
            return;
        }
        oNode.setPos();
        this.lSortedNodes.append(oNode);
        for (let oNextNode of oNode.arcs.values()) {
             this._parseNodes(oNextNode);
        }
    }
        
    lookup (sWord) {
        let oNode = this.oRoot;







|
|







238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
    }
    
    _parseNodes (oNode) {
        // Warning: recursive method
        if (oNode.pos > 0) {
            return;
        }
        //oNode.setPos();  // version 2
        this.lSortedNodes.push(oNode);
        for (let oNextNode of oNode.arcs.values()) {
             this._parseNodes(oNextNode);
        }
    }
        
    lookup (sWord) {
        let oNode = this.oRoot;

Modified graphspell-js/ibdawg.js from [222e0edc2a] to [5f5404403f].

120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
        }
        // <dChar> to get the value of an arc, <dCharVal> to get the char of an arc with its value
        this.dChar = helpers.objectToMap(this.dChar);
        this.dCharVal = this.dChar.gl_reverse();
        //this.byDic = new Uint8Array(this.byDic);  // not quicker, even slower

        if (this.cStemming == "S") {
            this.funcStemming = str_transform.getStemFromSuffixCode;
        } else if (this.cStemming == "A") {
            this.funcStemming = str_transform.getStemFromAffixCode;
        } else {
            this.funcStemming = str_transform.noStemming;
        }

        // Configuring DAWG functions according to nVersion
        switch (this.nVersion) {
            case 1:







|

|







120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
        }
        // <dChar> to get the value of an arc, <dCharVal> to get the char of an arc with its value
        this.dChar = helpers.objectToMap(this.dChar);
        this.dCharVal = this.dChar.gl_reverse();
        //this.byDic = new Uint8Array(this.byDic);  // not quicker, even slower

        if (this.cStemming == "S") {
            this.funcStemming = str_transform.changeWordWithSuffixCode;
        } else if (this.cStemming == "A") {
            this.funcStemming = str_transform.changeWordWithAffixCode;
        } else {
            this.funcStemming = str_transform.noStemming;
        }

        // Configuring DAWG functions according to nVersion
        switch (this.nVersion) {
            case 1:

Modified graphspell-js/str_transform.js from [e440be0f51] to [047ef79ee5].

1
2
3



4
5
6






































7
8
9
10
11
12
13
//// STRING TRANSFORMATION
/*jslint esversion: 6*/




// Note: 48 is the ASCII code for "0"

var str_transform = {







































    distanceDamerauLevenshtein2: function (s1, s2) {
        // distance of Damerau-Levenshtein between <s1> and <s2>
        // https://fr.wikipedia.org/wiki/Distance_de_Damerau-Levenshtein
        try {
            let nLen1 = s1.length;
            let nLen2 = s2.length;



>
>
>



>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
//// STRING TRANSFORMATION
/*jslint esversion: 6*/

"use strict";


// Note: 48 is the ASCII code for "0"

var str_transform = {

    longestCommonSubstring: function (string1, string2) {
        // https://en.wikibooks.org/wiki/Algorithm_Implementation/Strings/Longest_common_substring
        // untested

        // init max value
        let longestCommonSubstring = 0;
        // init 2D array with 0
        let table = [],
            len1 = string1.length,
            len2 = string2.length,
            row, col;
        for (row = 0; row <= len1; row++) {
            table[row] = [];
            for (col = 0; col <= len2; col++) {
                table[row][col] = 0;
            }
        }
        // fill table
        let i, j;
        for (i = 0;  i < len1;  i++) {
            for (j = 0;  j < len2;  j++) {
                if (string1[i] === string2[j]) {
                    if (table[i][j] === 0){
                        table[i+1][j+1] = 1;
                    } else {
                        table[i+1][j+1] = table[i][j] + 1;
                    }
                    if (table[i+1][j+1] > longestCommonSubstring) {
                        longestCommonSubstring = table[i+1][j+1];
                    }
                } else {
                    table[i+1][j+1] = 0;
                }
            }
        }
        return longestCommonSubstring;
    },

    distanceDamerauLevenshtein2: function (s1, s2) {
        // distance of Damerau-Levenshtein between <s1> and <s2>
        // https://fr.wikipedia.org/wiki/Distance_de_Damerau-Levenshtein
        try {
            let nLen1 = s1.length;
            let nLen2 = s2.length;
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129

































130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
                break;
            }
            jSfx += 1;
        }
        return String.fromCharCode(sFlex.length-jSfx+48) + sStem.slice(jSfx);
    },

    getStemFromSuffixCode: function (sFlex, sSfxCode) {
        // Suffix only
        if (sSfxCode == "0") {
            return sFlex;
        }
        return sSfxCode[0] == '0' ? sFlex + sSfxCode.slice(1) : sFlex.slice(0, -(sSfxCode.charCodeAt(0)-48)) + sSfxCode.slice(1);
    },
    
    getStemFromAffixCode: function (sFlex, sAffCode) {
        // Prefix and suffix

































        if (sAffCode == "0") {
            return sFlex;
        }
        if (!sAffCode.includes("/")) {
            return "# error #";
        }
        let [sPfxCode, sSfxCode] = sAffCode.split('/');
        sFlex = sPfxCode.slice(1) + sFlex.slice(sPfxCode.charCodeAt(0)-48);
        return sSfxCode[0] == '0' ? sFlex + sSfxCode.slice(1) : sFlex.slice(0, -(sSfxCode.charCodeAt(0)-48)) + sSfxCode.slice(1);
    }
};


if (typeof(exports) !== 'undefined') {
    exports.getStemFromSuffixCode = str_transform.getStemFromSuffixCode;
    exports.getStemFromAffixCode = str_transform.getStemFromAffixCode;
}







|
<

|

|


<
|
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>

|


|


|
|








154
155
156
157
158
159
160
161

162
163
164
165
166
167

168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
                break;
            }
            jSfx += 1;
        }
        return String.fromCharCode(sFlex.length-jSfx+48) + sStem.slice(jSfx);
    },

    changeWordWithSuffixCode: function (sWord, sSfxCode) {

        if (sSfxCode == "0") {
            return sWord;
        }
        return sSfxCode[0] == '0' ? sWord + sSfxCode.slice(1) : sWord.slice(0, -(sSfxCode.charCodeAt(0)-48)) + sSfxCode.slice(1);
    },
    

    // Prefix and suffix
    defineAffixCode: function (sFlex, sStem) {
        /*
            UNTESTED!
            Returns a string defining how to get stem from flexion. Examples:
                "0" if stem = flexion
                "stem" if no common substring
                "n(pfx)/m(sfx)"
            with n and m: chars with numeric meaning, "0" = 0, "1" = 1, ... ":" = 10, etc. (See ASCII table.) Says how many letters to strip from flexion.
                pfx [optional]: string to add before the flexion 
                sfx [optional]: string to add after the flexion
        */
        if (sFlex == sStem) {
            return "0";
        }
        // is stem a substring of flexion?
        let n = sFlex.indexOf(sStem);
        if (n >= 0) {
            return String.fromCharCode(n+48) + "/" + String.fromCharCode(sFlex.length-(sStem.length+n)+48);
        }
        // no, so we are looking for common substring
        let sSubs = this.longestCommonSubstring(sFlex, sStem);
        if (sSubs.length > 1) {
            let iPos = sStem.indexOf(sSubs);
            let sPfx = sStem.slice(0, iPos);
            let sSfx = sStem.slice(iPos+sSubs.length);
            let n = sFlex.indexOf(sSubs);
            let m = sFlex.length - (sSubs.length+n);
            return String.fromCharCode(n+48) + sPfx + "/" + String.fromCharCode(m+48) + sSfx;
        }
        return sStem;
    },

    changeWordWithAffixCode: function (sWord, sAffCode) {
        if (sAffCode == "0") {
            return sWord;
        }
        if (!sAffCode.includes("/")) {
            return sAffCode;
        }
        let [sPfxCode, sSfxCode] = sAffCode.split('/');
        sWord = sPfxCode.slice(1) + sWord.slice(sPfxCode.charCodeAt(0)-48);
        return sSfxCode[0] == '0' ? sWord + sSfxCode.slice(1) : sWord.slice(0, -(sSfxCode.charCodeAt(0)-48)) + sSfxCode.slice(1);
    }
};


if (typeof(exports) !== 'undefined') {
    exports.getStemFromSuffixCode = str_transform.getStemFromSuffixCode;
    exports.getStemFromAffixCode = str_transform.getStemFromAffixCode;
}

Modified graphspell/str_transform.py from [646bc07929] to [9961c8cbc8].

182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
    sSubs = longestCommonSubstring(sFlex, sStem)
    if len(sSubs) > 1:
        iPos = sStem.find(sSubs)
        sPfx = sStem[:iPos]
        sSfx = sStem[iPos+len(sSubs):]
        n = sFlex.find(sSubs)
        m = len(sFlex) - (len(sSubs)+n)
        sAff = "{}/".format(chr(n+48))  if not sPfx  else "{}{}/".format(chr(n+48), sPfx)
        sAff += chr(m+48)  if not sSfx  else "{}{}".format(chr(m+48), sSfx)
        return sAff
    return sStem


def changeWordWithAffixCode (sWord, sAffCode):
    if sAffCode == "0":
        return sWord
    if '/' not in sAffCode:
        return "# error #"
    sPfxCode, sSfxCode = sAffCode.split('/')
    sWord = sPfxCode[1:] + sWord[(ord(sPfxCode[0])-48):] 
    return sWord[:-(ord(sSfxCode[0])-48)] + sSfxCode[1:]  if sSfxCode[0] != '0'  else sWord + sSfxCode[1:]








<
<
|







|




182
183
184
185
186
187
188


189
190
191
192
193
194
195
196
197
198
199
200
201
    sSubs = longestCommonSubstring(sFlex, sStem)
    if len(sSubs) > 1:
        iPos = sStem.find(sSubs)
        sPfx = sStem[:iPos]
        sSfx = sStem[iPos+len(sSubs):]
        n = sFlex.find(sSubs)
        m = len(sFlex) - (len(sSubs)+n)


        return chr(n+48) + sPfx + "/" + chr(m+48) + sSfx
    return sStem


def changeWordWithAffixCode (sWord, sAffCode):
    if sAffCode == "0":
        return sWord
    if '/' not in sAffCode:
        return sAffCode
    sPfxCode, sSfxCode = sAffCode.split('/')
    sWord = sPfxCode[1:] + sWord[(ord(sPfxCode[0])-48):] 
    return sWord[:-(ord(sSfxCode[0])-48)] + sSfxCode[1:]  if sSfxCode[0] != '0'  else sWord + sSfxCode[1:]