Grammalecte Diff

Differences From Artifact [711ba29b0e]:

To Artifact [d94e6b7163]:


@@ -1,15 +1,15 @@
 // JavaScript
 
 // FSA DICTIONARY BUILDER
 //
 // by Olivier R.
 // License: MPL 2
 //
-// This tool encodes lexicon into an indexable binary dictionary 
+// This tool encodes lexicon into an indexable binary dictionary
 // Input files MUST be encoded in UTF-8.
 
 "use strict";
 
 
 if (typeof(require) !== 'undefined') {
     var str_transform = require("resource://grammalecte/graphspell/str_transform.js");
@@ -37,23 +37,28 @@
             case "S":
                 funcStemmingGen = str_transform.defineSuffixCode; break;
             case "N":
                 funcStemmingGen = str_transform.noStemming; break;
             default:
                 throw "Error. Unknown stemming code: " + cStemming;
         }
-        
+
         let lEntry = [];
         let lChar = [''],  dChar = new Map(),  nChar = 1,  dCharOccur = new Map();
         let lAff  = [],    dAff  = new Map(),  nAff  = 0,  dAffOccur = new Map();
         let lTag  = [],    dTag  = new Map(),  nTag  = 0,  dTagOccur = new Map();
         let nErr = 0;
-        
+
+        this.a2grams = new Set();
+
         // read lexicon
         for (let [sFlex, sStem, sTag] of lEntrySrc) {
+            for (let s2grams of str_transform.getNgrams(sFlex)) {
+                this.a2grams.add(s2grams);
+            }
             addWordToCharDict(sFlex);
             // chars
             for (let c of sFlex) {
                 if (!dChar.get(c)) {
                     dChar.set(c, nChar);
                     lChar.push(c);
                     nChar += 1;
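
The new a2grams set records every character 2-gram seen in the inflected forms, presumably so the finished dictionary can later filter suggestion candidates. A minimal standalone sketch of that bookkeeping follows; the getNgrams helper shown here is an assumption about what str_transform.getNgrams does, which is not part of this diff.

    // Hypothetical helper mirroring what str_transform.getNgrams is assumed to do:
    // yield every substring of length n from the word.
    function* getNgrams (sWord, n=2) {
        for (let i = 0;  i <= sWord.length - n;  i++) {
            yield sWord.slice(i, i+n);
        }
    }

    // Collect the 2-grams of a small sample lexicon, as the builder now does for each sFlex.
    let a2grams = new Set();
    for (let sFlex of ["chien", "chiens", "chat"]) {
        for (let s2grams of getNgrams(sFlex)) {
            a2grams.add(s2grams);
        }
    }
    console.log(Array.from(a2grams));
    // -> ["ch", "hi", "ie", "en", "ns", "ha", "at"]
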
@@ -80,30 +85,30 @@
         if (lEntry.length == 0) {
             throw "Error. Empty lexicon";
         }
 
         lEntry = [...new Set(lEntry.map(e => JSON.stringify(e)))].map(s => JSON.parse(s));
         // Set can’t distinguish similar lists, so we transform list item in string given to the Set
         // then we transform items in list a new.
-        
+
         // Preparing DAWG
         console.log(" > Preparing list of words");
         let lVal = lChar.concat(lAff).concat(lTag);
         let lWord = [];
         for (let [sFlex, iAff, iTag] of lEntry) {
             let lTemp = [];
             for (let c of sFlex) {
                 lTemp.push(dChar.get(c));
             }
             lTemp.push(iAff+nChar);
             lTemp.push(iTag+nChar+nAff)
             lWord.push(lTemp);
         }
         lEntry.length = 0; // clear the array
-        
+
         // Dictionary of arc values occurrency, to sort arcs of each node
         let lKeyVal = [];
         for (let c of dChar.keys()) { lKeyVal.push([dChar.get(c), dCharOccur.get(c)]); }
         for (let sAff of dAff.keys()) { lKeyVal.push([dAff.get(sAff)+nChar, dAffOccur.get(sAff)]); }
         for (let sTag of dTag.keys()) { lKeyVal.push([dTag.get(sTag)+nChar+nAff, dTagOccur.get(sTag)]); }
         let dValOccur = new Map(lKeyVal);
         lKeyVal.length = 0; // clear the array
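
The deduplication of lEntry kept in this hunk relies on the fact that a Set compares arrays by reference, hence the round trip through JSON.stringify / JSON.parse described in the comment. A short illustration, not part of either artifact:

    let lEntry = [ ["chien", 2, 7], ["chat", 1, 4], ["chien", 2, 7] ];
    // A Set of arrays keeps both ["chien", 2, 7]: they are distinct objects.
    console.log(new Set(lEntry).size);   // -> 3
    // Serializing each entry gives the Set comparable strings, then we parse them back.
    lEntry = [...new Set(lEntry.map(e => JSON.stringify(e)))].map(s => JSON.parse(s));
    console.log(lEntry.length);          // -> 2
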
@@ -129,15 +134,15 @@
         if (cStemming == "A") {
             this.funcStemming = str_transform.changeWordWithAffixCode;
         } else if (cStemming == "S") {
             this.funcStemming = str_transform.changeWordWithSuffixCode;
         } else {
             this.funcStemming = str_transform.noStemming;
         }
-        
+
         // build
         lWord.sort();
         if (xProgressBarNode) {
             xProgressBarNode.value = 0;
             xProgressBarNode.max = lWord.length;
         }
         let i = 1;
@@ -218,15 +223,15 @@
 
     countArcs () {
         this.nArc = 0;
         for (let oNode of this.dMinimizedNodes.values()) {
             this.nArc += oNode.arcs.size;
         }
     }
-    
+
     sortNodeArcs (dValOccur) {
         console.log(" > Sort node arcs");
         this.oRoot.sortArcs(dValOccur);
         for (let oNode of this.dMinimizedNodes.values()) {
             oNode.sortArcs(dValOccur);
         }
     }
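
sortNodeArcs hands the occurrence map built earlier (arc value to frequency) to every node so that frequent arcs are stored first. DawgNode.sortArcs itself is outside this excerpt; a plausible sketch, assuming each node keeps its arcs in a Map keyed by arc value, might look like this:

    // Hypothetical sketch of DawgNode.sortArcs: rebuild the arcs Map so that
    // the most frequent arc values (per dValOccur) come first in iteration order.
    function sortArcs (oNode, dValOccur) {
        let lSorted = Array.from(oNode.arcs.entries())
                           .sort((a, b) => (dValOccur.get(b[0]) || 0) - (dValOccur.get(a[0]) || 0));
        oNode.arcs = new Map(lSorted);   // Map iteration follows insertion order
    }

    // Example: arc value 3 occurs 120 times in the lexicon, arc value 1 only 5 times.
    let oNode = { arcs: new Map([ [1, "nodeA"], [3, "nodeB"] ]) };
    sortArcs(oNode, new Map([ [1, 5], [3, 120] ]));
    console.log([...oNode.arcs.keys()]);   // -> [3, 1]
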
@@ -271,14 +276,15 @@
         console.log("Entries: " + this.nEntry);
         console.log("Characters: " + this.nChar);
         console.log("Affixes: " + this.nAff);
         console.log("Tags: " + this.nTag);
         console.log("Arc values: " + this.nArcVal);
         console.log("Nodes: " + this.nNode);
         console.log("Arcs: " + this.nArc);
+        console.log("2grams: " + this.a2grams.size);
         console.log("Stemming: " + this.cStemming + "FX");
     }
 
     getArcStats () {
         let d = new Map();
         for (let oNode of this.dMinimizedNodes.values()) {
             let n = oNode.arcs.size;
@@ -392,15 +398,16 @@
             "nArc": this.nArc,
             "lArcVal": this.lArcVal,
             "nArcVal": this.nArcVal,
             "nCompressionMethod": nCompressionMethod,
             "nBytesArc": this.nBytesArc,
             "nBytesNodeAddress": this.nBytesNodeAddress,
             "nBytesOffset": this.nBytesOffset,
-            "sByDic": sByDic    // binary word graph
+            "sByDic": sByDic,    // binary word graph
+            "l2grams": Array.from(this.a2grams)
         };
         return oJSON;
     }
 
     _getDate () {
         let oDate = new Date();
         let sMonth = (oDate.getMonth() + 1).toString().padStart(2, "0"); // Month+1: Because JS always sucks somehow.
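
Since a Set has no JSON representation, the collected 2-grams are exported here as a plain array (l2grams). A loader would presumably rebuild the set in one step; a hypothetical sketch, with oJSON standing in for the object produced by the builder above:

    // Hypothetical loader-side counterpart: restore the Set from the serialized dictionary.
    let oJSON = { "l2grams": ["ch", "hi", "ie", "en"] };
    let a2grams = new Set(oJSON.l2grams || []);
    console.log(a2grams.has("ch"));   // -> true
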
@@ -484,15 +491,15 @@
 
     // VERSION 1 =====================================================================================================
     convToBytes1 (nBytesArc, nBytesNodeAddress) {
         /*
             Node scheme:
             - Arc length is defined by nBytesArc
             - Address length is defined by nBytesNodeAddress
-                                           
+
             |                Arc                |                         Address of next node                          |
             |                                   |                                                                       |
              /---------------\ /---------------\ /---------------\ /---------------\ /---------------\ /---------------\
              | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
              \---------------/ \---------------/ \---------------/ \---------------/ \---------------/ \---------------/
              [...]
              /---------------\ /---------------\ /---------------\ /---------------\ /---------------\ /---------------\