1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
|
// JavaScript
// FSA DICTIONARY BUILDER
//
// by Olivier R.
// License: MPL 2
//
// This tool encodes lexicon into an indexable binary dictionary
// Input files MUST be encoded in UTF-8.
"use strict";
if (typeof(require) !== 'undefined') {
var str_transform = require("resource://grammalecte/graphspell/str_transform.js");
|
|
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
|
// JavaScript
// FSA DICTIONARY BUILDER
//
// by Olivier R.
// License: MPL 2
//
// This tool encodes lexicon into an indexable binary dictionary
// Input files MUST be encoded in UTF-8.
"use strict";
if (typeof(require) !== 'undefined') {
var str_transform = require("resource://grammalecte/graphspell/str_transform.js");
|
︙ | | | ︙ | |
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
|
case "S":
funcStemmingGen = str_transform.defineSuffixCode; break;
case "N":
funcStemmingGen = str_transform.noStemming; break;
default:
throw "Error. Unknown stemming code: " + cStemming;
}
let lEntry = [];
let lChar = [''], dChar = new Map(), nChar = 1, dCharOccur = new Map();
let lAff = [], dAff = new Map(), nAff = 0, dAffOccur = new Map();
let lTag = [], dTag = new Map(), nTag = 0, dTagOccur = new Map();
let nErr = 0;
// read lexicon
for (let [sFlex, sStem, sTag] of lEntrySrc) {
addWordToCharDict(sFlex);
// chars
for (let c of sFlex) {
if (!dChar.get(c)) {
dChar.set(c, nChar);
lChar.push(c);
nChar += 1;
|
|
|
>
>
>
>
>
|
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
|
case "S":
funcStemmingGen = str_transform.defineSuffixCode; break;
case "N":
funcStemmingGen = str_transform.noStemming; break;
default:
throw "Error. Unknown stemming code: " + cStemming;
}
let lEntry = [];
let lChar = [''], dChar = new Map(), nChar = 1, dCharOccur = new Map();
let lAff = [], dAff = new Map(), nAff = 0, dAffOccur = new Map();
let lTag = [], dTag = new Map(), nTag = 0, dTagOccur = new Map();
let nErr = 0;
this.a2grams = new Set();
// read lexicon
for (let [sFlex, sStem, sTag] of lEntrySrc) {
for (let s2grams of str_transform.getNgrams(sFlex)) {
this.a2grams.add(s2grams);
}
addWordToCharDict(sFlex);
// chars
for (let c of sFlex) {
if (!dChar.get(c)) {
dChar.set(c, nChar);
lChar.push(c);
nChar += 1;
|
︙ | | | ︙ | |
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
|
if (lEntry.length == 0) {
throw "Error. Empty lexicon";
}
lEntry = [...new Set(lEntry.map(e => JSON.stringify(e)))].map(s => JSON.parse(s));
// A Set compares arrays by reference, so it cannot detect duplicate entries directly:
// each entry is serialized to a JSON string for the Set, then the unique strings are parsed back into arrays.
// Preparing DAWG
console.log(" > Preparing list of words");
let lVal = lChar.concat(lAff).concat(lTag);
let lWord = [];
for (let [sFlex, iAff, iTag] of lEntry) {
let lTemp = [];
for (let c of sFlex) {
lTemp.push(dChar.get(c));
}
lTemp.push(iAff+nChar);
lTemp.push(iTag+nChar+nAff)
lWord.push(lTemp);
}
lEntry.length = 0; // clear the array
// Dictionary of arc value occurrences, used to sort the arcs of each node
let lKeyVal = [];
for (let c of dChar.keys()) { lKeyVal.push([dChar.get(c), dCharOccur.get(c)]); }
for (let sAff of dAff.keys()) { lKeyVal.push([dAff.get(sAff)+nChar, dAffOccur.get(sAff)]); }
for (let sTag of dTag.keys()) { lKeyVal.push([dTag.get(sTag)+nChar+nAff, dTagOccur.get(sTag)]); }
let dValOccur = new Map(lKeyVal);
lKeyVal.length = 0; // clear the array
|
|
|
|
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
|
if (lEntry.length == 0) {
throw "Error. Empty lexicon";
}
lEntry = [...new Set(lEntry.map(e => JSON.stringify(e)))].map(s => JSON.parse(s));
// A Set compares arrays by reference, so it cannot detect duplicate entries directly:
// each entry is serialized to a JSON string for the Set, then the unique strings are parsed back into arrays.
// Preparing DAWG
console.log(" > Preparing list of words");
let lVal = lChar.concat(lAff).concat(lTag);
let lWord = [];
for (let [sFlex, iAff, iTag] of lEntry) {
let lTemp = [];
for (let c of sFlex) {
lTemp.push(dChar.get(c));
}
lTemp.push(iAff+nChar);
lTemp.push(iTag+nChar+nAff)
lWord.push(lTemp);
}
lEntry.length = 0; // clear the array
// Dictionary of arc value occurrences, used to sort the arcs of each node
let lKeyVal = [];
for (let c of dChar.keys()) { lKeyVal.push([dChar.get(c), dCharOccur.get(c)]); }
for (let sAff of dAff.keys()) { lKeyVal.push([dAff.get(sAff)+nChar, dAffOccur.get(sAff)]); }
for (let sTag of dTag.keys()) { lKeyVal.push([dTag.get(sTag)+nChar+nAff, dTagOccur.get(sTag)]); }
let dValOccur = new Map(lKeyVal);
lKeyVal.length = 0; // clear the array
|
︙ | | | ︙ | |
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
|
if (cStemming == "A") {
this.funcStemming = str_transform.changeWordWithAffixCode;
} else if (cStemming == "S") {
this.funcStemming = str_transform.changeWordWithSuffixCode;
} else {
this.funcStemming = str_transform.noStemming;
}
// build
lWord.sort();
if (xProgressBarNode) {
xProgressBarNode.value = 0;
xProgressBarNode.max = lWord.length;
}
let i = 1;
|
|
|
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
|
if (cStemming == "A") {
this.funcStemming = str_transform.changeWordWithAffixCode;
} else if (cStemming == "S") {
this.funcStemming = str_transform.changeWordWithSuffixCode;
} else {
this.funcStemming = str_transform.noStemming;
}
// build
lWord.sort();
if (xProgressBarNode) {
xProgressBarNode.value = 0;
xProgressBarNode.max = lWord.length;
}
let i = 1;
|
︙ | | | ︙ | |
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
|
countArcs () {
this.nArc = 0;
for (let oNode of this.dMinimizedNodes.values()) {
this.nArc += oNode.arcs.size;
}
}
sortNodeArcs (dValOccur) {
console.log(" > Sort node arcs");
this.oRoot.sortArcs(dValOccur);
for (let oNode of this.dMinimizedNodes.values()) {
oNode.sortArcs(dValOccur);
}
}
|
|
|
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
|
countArcs () {
this.nArc = 0;
for (let oNode of this.dMinimizedNodes.values()) {
this.nArc += oNode.arcs.size;
}
}
sortNodeArcs (dValOccur) {
console.log(" > Sort node arcs");
this.oRoot.sortArcs(dValOccur);
for (let oNode of this.dMinimizedNodes.values()) {
oNode.sortArcs(dValOccur);
}
}
|
︙ | | | ︙ | |
271
272
273
274
275
276
277
278
279
280
281
282
283
284
|
console.log("Entries: " + this.nEntry);
console.log("Characters: " + this.nChar);
console.log("Affixes: " + this.nAff);
console.log("Tags: " + this.nTag);
console.log("Arc values: " + this.nArcVal);
console.log("Nodes: " + this.nNode);
console.log("Arcs: " + this.nArc);
console.log("Stemming: " + this.cStemming + "FX");
}
getArcStats () {
let d = new Map();
for (let oNode of this.dMinimizedNodes.values()) {
let n = oNode.arcs.size;
|
>
|
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
|
console.log("Entries: " + this.nEntry);
console.log("Characters: " + this.nChar);
console.log("Affixes: " + this.nAff);
console.log("Tags: " + this.nTag);
console.log("Arc values: " + this.nArcVal);
console.log("Nodes: " + this.nNode);
console.log("Arcs: " + this.nArc);
console.log("2grams: " + this.a2grams.size);
console.log("Stemming: " + this.cStemming + "FX");
}
getArcStats () {
let d = new Map();
for (let oNode of this.dMinimizedNodes.values()) {
let n = oNode.arcs.size;
|
︙ | | | ︙ | |
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
|
"nArc": this.nArc,
"lArcVal": this.lArcVal,
"nArcVal": this.nArcVal,
"nCompressionMethod": nCompressionMethod,
"nBytesArc": this.nBytesArc,
"nBytesNodeAddress": this.nBytesNodeAddress,
"nBytesOffset": this.nBytesOffset,
"sByDic": sByDic // binary word graph
};
return oJSON;
}
_getDate () {
let oDate = new Date();
let sMonth = (oDate.getMonth() + 1).toString().padStart(2, "0"); // Month+1: Because JS always sucks somehow.
|
|
>
|
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
|
"nArc": this.nArc,
"lArcVal": this.lArcVal,
"nArcVal": this.nArcVal,
"nCompressionMethod": nCompressionMethod,
"nBytesArc": this.nBytesArc,
"nBytesNodeAddress": this.nBytesNodeAddress,
"nBytesOffset": this.nBytesOffset,
"sByDic": sByDic, // binary word graph
"l2grams": Array.from(this.a2grams)
};
return oJSON;
}
_getDate () {
let oDate = new Date();
let sMonth = (oDate.getMonth() + 1).toString().padStart(2, "0"); // Month+1: Because JS always sucks somehow.
|
︙ | | | ︙ | |
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
|
// VERSION 1 =====================================================================================================
convToBytes1 (nBytesArc, nBytesNodeAddress) {
/*
Node scheme:
- Arc length is defined by nBytesArc
- Address length is defined by nBytesNodeAddress
| Arc | Address of next node |
| | |
/---------------\ /---------------\ /---------------\ /---------------\ /---------------\ /---------------\
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
\---------------/ \---------------/ \---------------/ \---------------/ \---------------/ \---------------/
[...]
/---------------\ /---------------\ /---------------\ /---------------\ /---------------\ /---------------\
|
|
|
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
|
// VERSION 1 =====================================================================================================
convToBytes1 (nBytesArc, nBytesNodeAddress) {
/*
Node scheme:
- Arc length is defined by nBytesArc
- Address length is defined by nBytesNodeAddress
| Arc | Address of next node |
| | |
/---------------\ /---------------\ /---------------\ /---------------\ /---------------\ /---------------\
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
\---------------/ \---------------/ \---------------/ \---------------/ \---------------/ \---------------/
[...]
/---------------\ /---------------\ /---------------\ /---------------\ /---------------\ /---------------\
|
︙ | | | ︙ | |