24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
|
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
|
-
+
|
This code is inspired by Steve Hanov’s DAWG, 2011 (http://stevehanov.ca/blog/index.php?id=115).
We store suffix/affix codes and tags within the graph after the “real” word.
A word is a list of numbers [ c1, c2, c3 . . . cN, iAffix, iTags]
Each arc is an index into this.lArcVal, which stores characters, suffix/affix codes for stemming, and tags.
Important: As usual, the last node (after ‘iTags’) is tagged final, AND the node after ‘cN’ is ALSO tagged final.
*/
constructor (lEntrySrc, sLang, cStemming, xProgressBarNode=null) {
constructor (lEntrySrc, sLangCode, sLangName, sDicName, cStemming, xProgressBarNode=null) {
console.log("===== Direct Acyclic Word Graph - Minimal Acyclic Finite State Automaton =====");
let funcStemmingGen = null;
switch (cStemming.toUpperCase()) {
case "A":
funcStemmingGen = str_transform.defineAffixCode; break;
case "S":
funcStemmingGen = str_transform.defineSuffixCode; break;
|
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
|
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
|
-
-
+
+
+
|
let lKeyVal = [];
for (let c of dChar.keys()) { lKeyVal.push([dChar.get(c), dCharOccur.get(c)]); }
for (let sAff of dAff.keys()) { lKeyVal.push([dAff.get(sAff)+nChar, dAffOccur.get(sAff)]); }
for (let sTag of dTag.keys()) { lKeyVal.push([dTag.get(sTag)+nChar+nAff, dTagOccur.get(sTag)]); }
let dValOccur = new Map(lKeyVal);
lKeyVal.length = 0; // clear the array
this.sHeader = "/pyfsa/";
this.sLang = sLang;
this.sLangCode = sLangCode;
this.sLangName = sLangName;
this.sDicName = sDicName;
this.nEntry = lWord.length;
this.aPreviousEntry = [];
oNodeCounter.reset();
this.oRoot = new DawgNode();
this.lUncheckedNodes = []; // list of nodes that have not been checked for duplication.
this.dMinimizedNodes = new Map(); // list of unique nodes that have been checked for duplication.
this.nNode = 0;
|
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
|
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
|
+
+
+
-
+
-
+
-
-
-
-
-
+
-
-
-
+
+
+
+
-
-
-
-
-
+
+
+
+
+
+
+
|
if (nCompressionMethod == 1) {
sByDic = this.oRoot.convToBytes1(this.nBytesArc, this.nBytesNodeAddress);
for (let oNode of this.dMinimizedNodes.values()) {
sByDic += oNode.convToBytes1(this.nBytesArc, this.nBytesNodeAddress);
}
}
let oJSON = {
"sHeader": "/pyfsa/",
"sLangCode": this.sLangCode,
"sLangName": this.sLangName,
"sName": this.sName,
"sDicName": this.sDicName,
"nCompressionMethod": nCompressionMethod,
"sFileName": "[none]",
"sDate": this._getDate(),
"sHeader": this.sHeader + nCompressionMethod + "/",
"lArcVal": this.lArcVal,
"nArcVal": this.nArcVal,
"byDic": sByDic, // binary word graph
"sLang": this.sLang,
"nEntries": this.nEntry,
"nChar": this.nChar,
"nBytesArc": this.nBytesArc,
"nBytesNodeAddress": this.nBytesNodeAddress,
"nEntries": this.nEntry,
"nAff": this.nAff,
"nTag": this.nTag,
"cStemming": this.cStemming,
"dChar": helpers.mapToObject(this.dChar),
"nNode": this.nNode,
"nArc": this.nArc,
"nAff": this.nAff,
"cStemming": this.cStemming,
"nTag": this.nTag,
"dChar": helpers.mapToObject(this.dChar),
"nBytesOffset": this.nBytesOffset
"lArcVal": this.lArcVal,
"nArcVal": this.nArcVal,
"nCompressionMethod": nCompressionMethod,
"nBytesArc": this.nBytesArc,
"nBytesNodeAddress": this.nBytesNodeAddress,
"nBytesOffset": this.nBytesOffset,
"sByDic": sByDic // binary word graph
};
return oJSON;
},
_getDate () {
let oDate = new Date();
let sMonth = (oDate.getMonth() + 1).toString().padStart(2, "0"); // Month+1: Because JS always sucks somehow.
|