Grammalecte  Diff

Differences From Artifact [711ba29b0e]:

To Artifact [d94e6b7163]:


1
2
3
4
5
6
7
8

9
10
11
12
13
14
15
1
2
3
4
5
6
7

8
9
10
11
12
13
14
15







-
+







// JavaScript

// FSA DICTIONARY BUILDER
//
// by Olivier R.
// License: MPL 2
//
// This tool encodes lexicon into an indexable binary dictionary 
// This tool encodes lexicon into an indexable binary dictionary
// Input files MUST be encoded in UTF-8.

"use strict";


if (typeof(require) !== 'undefined') {
    var str_transform = require("resource://grammalecte/graphspell/str_transform.js");
37
38
39
40
41
42
43
44

45
46
47
48
49
50



51
52



53
54
55
56
57
58
59
37
38
39
40
41
42
43

44
45
46
47
48
49

50
51
52
53
54
55
56
57
58
59
60
61
62
63
64







-
+





-
+
+
+


+
+
+







            case "S":
                funcStemmingGen = str_transform.defineSuffixCode; break;
            case "N":
                funcStemmingGen = str_transform.noStemming; break;
            default:
                throw "Error. Unknown stemming code: " + cStemming;
        }
        

        let lEntry = [];
        let lChar = [''],  dChar = new Map(),  nChar = 1,  dCharOccur = new Map();
        let lAff  = [],    dAff  = new Map(),  nAff  = 0,  dAffOccur = new Map();
        let lTag  = [],    dTag  = new Map(),  nTag  = 0,  dTagOccur = new Map();
        let nErr = 0;
        

        this.a2grams = new Set();

        // read lexicon
        for (let [sFlex, sStem, sTag] of lEntrySrc) {
            for (let s2grams of str_transform.getNgrams(sFlex)) {
                this.a2grams.add(s2grams);
            }
            addWordToCharDict(sFlex);
            // chars
            for (let c of sFlex) {
                if (!dChar.get(c)) {
                    dChar.set(c, nChar);
                    lChar.push(c);
                    nChar += 1;
80
81
82
83
84
85
86
87

88
89
90
91
92
93
94
95
96
97
98
99
100
101
102

103
104
105
106
107
108
109
85
86
87
88
89
90
91

92
93
94
95
96
97
98
99
100
101
102
103
104
105
106

107
108
109
110
111
112
113
114







-
+














-
+







        if (lEntry.length == 0) {
            throw "Error. Empty lexicon";
        }

        lEntry = [...new Set(lEntry.map(e => JSON.stringify(e)))].map(s => JSON.parse(s));
        // A Set cannot detect duplicate arrays, so each entry is serialized to a JSON string before being added to the Set,
        // then every unique string is parsed back into an array.
        

        // Preparing DAWG
        console.log(" > Preparing list of words");
        let lVal = lChar.concat(lAff).concat(lTag);
        let lWord = [];
        for (let [sFlex, iAff, iTag] of lEntry) {
            let lTemp = [];
            for (let c of sFlex) {
                lTemp.push(dChar.get(c));
            }
            lTemp.push(iAff+nChar);
            lTemp.push(iTag+nChar+nAff)
            lWord.push(lTemp);
        }
        lEntry.length = 0; // clear the array
        

        // Dictionary of arc value occurrences, used to sort the arcs of each node
        let lKeyVal = [];
        for (let c of dChar.keys()) { lKeyVal.push([dChar.get(c), dCharOccur.get(c)]); }
        for (let sAff of dAff.keys()) { lKeyVal.push([dAff.get(sAff)+nChar, dAffOccur.get(sAff)]); }
        for (let sTag of dTag.keys()) { lKeyVal.push([dTag.get(sTag)+nChar+nAff, dTagOccur.get(sTag)]); }
        let dValOccur = new Map(lKeyVal);
        lKeyVal.length = 0; // clear the array
129
130
131
132
133
134
135
136

137
138
139
140
141
142
143
134
135
136
137
138
139
140

141
142
143
144
145
146
147
148







-
+







        if (cStemming == "A") {
            this.funcStemming = str_transform.changeWordWithAffixCode;
        } else if (cStemming == "S") {
            this.funcStemming = str_transform.changeWordWithSuffixCode;
        } else {
            this.funcStemming = str_transform.noStemming;
        }
        

        // build
        lWord.sort();
        if (xProgressBarNode) {
            xProgressBarNode.value = 0;
            xProgressBarNode.max = lWord.length;
        }
        let i = 1;
218
219
220
221
222
223
224
225

226
227
228
229
230
231
232
223
224
225
226
227
228
229

230
231
232
233
234
235
236
237







-
+








    countArcs () {
        this.nArc = 0;
        for (let oNode of this.dMinimizedNodes.values()) {
            this.nArc += oNode.arcs.size;
        }
    }
    

    sortNodeArcs (dValOccur) {
        console.log(" > Sort node arcs");
        this.oRoot.sortArcs(dValOccur);
        for (let oNode of this.dMinimizedNodes.values()) {
            oNode.sortArcs(dValOccur);
        }
    }
271
272
273
274
275
276
277

278
279
280
281
282
283
284
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290







+







        console.log("Entries: " + this.nEntry);
        console.log("Characters: " + this.nChar);
        console.log("Affixes: " + this.nAff);
        console.log("Tags: " + this.nTag);
        console.log("Arc values: " + this.nArcVal);
        console.log("Nodes: " + this.nNode);
        console.log("Arcs: " + this.nArc);
        console.log("2grams: " + this.a2grams.size);
        console.log("Stemming: " + this.cStemming + "FX");
    }

    getArcStats () {
        let d = new Map();
        for (let oNode of this.dMinimizedNodes.values()) {
            let n = oNode.arcs.size;
392
393
394
395
396
397
398
399


400
401
402
403
404
405
406
398
399
400
401
402
403
404

405
406
407
408
409
410
411
412
413







-
+
+







            "nArc": this.nArc,
            "lArcVal": this.lArcVal,
            "nArcVal": this.nArcVal,
            "nCompressionMethod": nCompressionMethod,
            "nBytesArc": this.nBytesArc,
            "nBytesNodeAddress": this.nBytesNodeAddress,
            "nBytesOffset": this.nBytesOffset,
            "sByDic": sByDic    // binary word graph
            "sByDic": sByDic,    // binary word graph
            "l2grams": Array.from(this.a2grams)
        };
        return oJSON;
    }

    _getDate () {
        let oDate = new Date();
        let sMonth = (oDate.getMonth() + 1).toString().padStart(2, "0"); // Month+1: Because JS always sucks somehow.
484
485
486
487
488
489
490
491

492
493
494
495
496
497
498
491
492
493
494
495
496
497

498
499
500
501
502
503
504
505







-
+








    // VERSION 1 =====================================================================================================
    convToBytes1 (nBytesArc, nBytesNodeAddress) {
        /*
            Node scheme:
            - Arc length is defined by nBytesArc
            - Address length is defined by nBytesNodeAddress
                                           

            |                Arc                |                         Address of next node                          |
            |                                   |                                                                       |
             /---------------\ /---------------\ /---------------\ /---------------\ /---------------\ /---------------\
             | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
             \---------------/ \---------------/ \---------------/ \---------------/ \---------------/ \---------------/
             [...]
             /---------------\ /---------------\ /---------------\ /---------------\ /---------------\ /---------------\