
// JavaScript

// FSA DICTIONARY BUILDER
//
// by Olivier R.
// License: MPL 2
//
// This tool encodes a lexicon into an indexable binary dictionary
// Input files MUST be encoded in UTF-8.

/* jshint esversion:6, -W097 */
/* jslint esversion:6 */
/* global require, exports, console, helpers */

"use strict";

if(typeof(process) !== 'undefined') {
    var str_transform = require("./str_transform.js");
} else if (typeof(require) !== 'undefined') {
    var str_transform = require("resource://grammalecte/graphspell/str_transform.js");
}

${map}
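// Note: "${map}" above is presumably a build-time placeholder, replaced during packaging
// with the Map helper extensions (such as gl_get) that this file relies on.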


class DAWG {
    /*  DIRECTED ACYCLIC WORD GRAPH
        This code is inspired by Steve Hanov’s DAWG, 2011. (http://stevehanov.ca/blog/index.php?id=115)
        We store suffix/affix codes and tags within the graph after the “real” word.
        A word is a list of numbers [c1, c2, c3 … cN, iAffix, iTags].
        Each arc is an index in this.lArcVal, where characters, suffix/affix codes for stemming and tags are stored.
        Important: as in a standard DAWG, the last node (after ‘iTags’) is tagged final, AND the node after ‘cN’ is ALSO tagged final.
    */
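
    /*  Illustration (hypothetical indices): for an entry ["chats", "chat", "nom mas pl"]
        with dChar = {c:1, h:2, a:3, t:4, s:5}, the constructor below encodes the word as
        [1, 2, 3, 4, 5, iAff+nChar, iTag+nChar+nAff], where nChar is the length of lChar
        (the character block of lArcVal) and nAff the number of distinct affix codes,
        so that every number is a direct index into this.lArcVal.
    */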

    constructor (lEntrySrc, cStemming, sLangCode, sLangName="", sDicName="", xProgressBarNode=null) {
        console.log("===== Direct Acyclic Word Graph - Minimal Acyclic Finite State Automaton =====");
        let funcStemmingGen = null;
        switch (cStemming.toUpperCase()) {
            case "A":
                funcStemmingGen = str_transform.defineAffixCode; break;
            case "S":
                funcStemmingGen = str_transform.defineSuffixCode; break;
            case "N":
                funcStemmingGen = str_transform.noStemming; break;
            default:
                throw "Error. Unknown stemming code: " + cStemming;
        }

        let lEntry = [];
        let lChar = [''],  dChar = new Map(),  nChar = 1,  dCharOccur = new Map();
        let lAff  = [],    dAff  = new Map(),  nAff  = 0,  dAffOccur = new Map();
        let lTag  = [],    dTag  = new Map(),  nTag  = 0,  dTagOccur = new Map();
        let nErr = 0;

        this.a2grams = new Set();

        // read lexicon
        for (let [sFlex, sStem, sTag] of lEntrySrc) {
            for (let s2grams of str_transform.getNgrams(sFlex)) {
                this.a2grams.add(s2grams);
            }
            addWordToCharDict(sFlex);
            // chars
            for (let c of sFlex) {
                if (!dChar.has(c)) {
                    dChar.set(c, nChar);
                    lChar.push(c);
                    nChar += 1;
                }
                dCharOccur.set(c, dCharOccur.gl_get(c, 0) + 1);
            }
            // affixes to find stem from flexion
            let sAff = funcStemmingGen(sFlex, sStem);
            if (!dAff.has(sAff)) {
                dAff.set(sAff, nAff);
                lAff.push(sAff);
                nAff += 1;
            }
            dAffOccur.set(sAff, dAffOccur.gl_get(sAff, 0) + 1);
            // tags
            if (!dTag.has(sTag)) {
                dTag.set(sTag, nTag);
                lTag.push(sTag);
                nTag += 1;
            }
            dTagOccur.set(sTag, dTagOccur.gl_get(sTag, 0) + 1);
            lEntry.push([sFlex, dAff.get(sAff), dTag.get(sTag)]);
        }
        if (lEntry.length == 0) {
            throw "Error. Empty lexicon";
        }

        lEntry = [...new Set(lEntry.map(e => JSON.stringify(e)))].map(s => JSON.parse(s));
        // A Set can’t deduplicate arrays by value, so each entry is serialized to a string for the Set,
        // then the strings are parsed back into arrays.
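        // e.g. (illustrative) two identical source lines both yielding ["chats", 2, 7] collapse
        // into a single '["chats",2,7]' string in the Set, hence a single entry afterwards.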

        // Preparing DAWG
        console.log(" > Preparing list of words");
        let lVal = lChar.concat(lAff).concat(lTag);
        let lWord = [];
        for (let [sFlex, iAff, iTag] of lEntry) {
            let lTemp = [];
            for (let c of sFlex) {
                lTemp.push(dChar.get(c));
            }
            lTemp.push(iAff+nChar);
            lTemp.push(iTag+nChar+nAff);
            lWord.push(lTemp);
        }
        lEntry.length = 0; // clear the array

        // Dictionary of arc value occurrences, used to sort the arcs of each node
        let lKeyVal = [];
        for (let c of dChar.keys()) { lKeyVal.push([dChar.get(c), dCharOccur.get(c)]); }
        for (let sAff of dAff.keys()) { lKeyVal.push([dAff.get(sAff)+nChar, dAffOccur.get(sAff)]); }
        for (let sTag of dTag.keys()) { lKeyVal.push([dTag.get(sTag)+nChar+nAff, dTagOccur.get(sTag)]); }
        let dValOccur = new Map(lKeyVal);
        lKeyVal.length = 0; // clear the array

        this.sLangCode = sLangCode;
        this.sLangName = sLangName;
        this.sDicName = sDicName;
        this.nEntry = lWord.length;
        this.aPreviousEntry = [];
        oNodeCounter.reset();
        this.oRoot = new DawgNode();
        this.lUncheckedNodes = [];          // list of nodes that have not been checked for duplication.
        this.dMinimizedNodes = new Map();   // map of unique nodes that have been checked for duplication.
        this.nNode = 0;
        this.nArc = 0;
        this.dChar = dChar;
        this.nChar = dChar.size;
        this.nAff = nAff;
        this.lArcVal = lVal;
        this.nArcVal = lVal.length;
        this.nTag = this.nArcVal - this.nChar - nAff;
        this.cStemming = cStemming;
        if (cStemming == "A") {
            this.funcStemming = str_transform.changeWordWithAffixCode;
        } else if (cStemming == "S") {
            this.funcStemming = str_transform.changeWordWithSuffixCode;
        } else {
            this.funcStemming = str_transform.noStemming;
        }

        // build
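        // Note: the default sort compares the string form of each word (array of numbers),
        // which is the same ordering that insert() checks below (aEntry < this.aPreviousEntry).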
        lWord.sort();
        if (xProgressBarNode) {
            xProgressBarNode.value = 0;
            xProgressBarNode.max = lWord.length;
        }
        let i = 1;
        for (let aEntry of lWord) {
            this.insert(aEntry);
            if (xProgressBarNode) {
                xProgressBarNode.value = i;
                i += 1;
            }
        }
        this.finish();
        this.countNodes();
        this.countArcs();
        this.sortNodeArcs(dValOccur);
        this.displayInfo();
        this.writeInfo();
        //this.oRoot.display(0, this.lArcVal, true);
    }

    // BUILD DAWG
    insert (aEntry) {
        if (aEntry < this.aPreviousEntry) {
            throw "Error: Words must be inserted in alphabetical order.";
        }

        // find common prefix between word and previous word
        let nCommonPrefix = 0;
        for (let i = 0;  i < Math.min(aEntry.length, this.aPreviousEntry.length);  i++) {
            if (aEntry[i] != this.aPreviousEntry[i]) {
                break;
            }
            nCommonPrefix += 1;
        }
        // Check the lUncheckedNodes for redundant nodes, proceeding from last
        // one down to the common prefix size. Then truncate the list at that point.
        this._minimize(nCommonPrefix);

        // add the suffix, starting from the correct node mid-way through the graph
        let oNode = (this.lUncheckedNodes.length == 0) ? this.oRoot : this.lUncheckedNodes[this.lUncheckedNodes.length-1][2];
        let iChar = nCommonPrefix;
        for (let c of aEntry.slice(nCommonPrefix)) {
            let oNextNode = new DawgNode();
            oNode.arcs.set(c, oNextNode);
            this.lUncheckedNodes.push([oNode, c, oNextNode]);
            if (iChar == (aEntry.length - 2)) {
                oNode.final = true;
            }
            iChar += 1;
            oNode = oNextNode;
        }
        oNode.final = true;
        this.aPreviousEntry = aEntry;
    }

    finish () {
        // minimize unchecked nodes
        this._minimize(0);
    }

    _minimize (nDownTo) {
        // proceed from the leaf up to a certain point
        for (let i = this.lUncheckedNodes.length-1;  i > nDownTo-1;  i--) {
            let [oNode, char, oChildNode] = this.lUncheckedNodes[i];
            if (this.dMinimizedNodes.has(oChildNode.__hash__())) {
                // replace the child with the previously encountered one
                oNode.arcs.set(char, this.dMinimizedNodes.get(oChildNode.__hash__()));
            } else {
                // add the state to the minimized nodes.
                this.dMinimizedNodes.set(oChildNode.__hash__(), oChildNode);
            }
            this.lUncheckedNodes.pop();
        }
    }

    countNodes () {
        this.nNode = this.dMinimizedNodes.size;
    }

    countArcs () {
        this.nArc = 0;
        for (let oNode of this.dMinimizedNodes.values()) {
            this.nArc += oNode.arcs.size;
        }
    }

    sortNodeArcs (dValOccur) {
        console.log(" > Sort node arcs");
        this.oRoot.sortArcs(dValOccur);
        for (let oNode of this.dMinimizedNodes.values()) {
            oNode.sortArcs(dValOccur);
        }
    }

    lookup (sWord) {
        let oNode = this.oRoot;
        for (let c of sWord) {
            if (!oNode.arcs.has(this.dChar.gl_get(c, ''))) {
                return false;
            }
            oNode = oNode.arcs.get(this.dChar.get(c));
        }
        return oNode.final;
    }

    morph (sWord) {
        let oNode = this.oRoot;
        for (let c of sWord) {
            if (!oNode.arcs.has(this.dChar.gl_get(c, ''))) {
                return '';
            }
            oNode = oNode.arcs.get(this.dChar.get(c));
        }
        if (oNode.final) {
            let s = "* ";
            for (let arc of oNode.arcs.keys()) {
                if (arc > this.nChar) {
                    s += " [" + this.funcStemming(sWord, this.lArcVal[arc]);
                    let oNode2 = oNode.arcs.get(arc);
                    for (let arc2 of oNode2.arcs.keys()) {
                        s += " / " + this.lArcVal[arc2];
                    }
                    s += "]";
                }
            }
            return s;
        }
        return '';
    }

    displayInfo () {
        console.log("Entries: " + this.nEntry);
        console.log("Characters: " + this.nChar);
        console.log("Affixes: " + this.nAff);
        console.log("Tags: " + this.nTag);
        console.log("Arc values: " + this.nArcVal);
        console.log("Nodes: " + this.nNode);
        console.log("Arcs: " + this.nArc);
        console.log("2grams: " + this.a2grams.size);
        console.log("Stemming: " + this.cStemming + "FX");
    }

    getArcStats () {
        let d = new Map();
        for (let oNode of this.dMinimizedNodes.values()) {
            let n = oNode.arcs.size;
            d.set(n, d.gl_get(n, 0) + 1);
        }
        let s = " * Nodes:\n";
        for (let [nKey, nVal] of d.entries()) {
            s = s + " " + nVal + " nodes have " + nKey + " arcs\n";
        }
        return s;
    }

    writeInfo () {
        console.log(this.getArcStats());
        console.log("\n * Values:\n");
        let i = 0;
        for (let s of this.lArcVal) {
            console.log(i + ": " + s);
            i++;
        }
    }

    * select (sPattern="") {
        // generator: returns all entries whose morphology matches <sPattern>
        let zPattern = null;
        if (sPattern !== "") {
            try {
                zPattern = new RegExp(sPattern);
            }
            catch (e) {
                console.log("Error in regex pattern");
                console.log(e.message);
            }
        }
        yield* this._select(zPattern, this.oRoot, "");
    }
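
    /*  Usage sketch (illustrative, assuming an already built DAWG named oDawg):
            for (let sEntry of oDawg.select("nom")) {
                console.log(sEntry);    // each yielded line: "flexion\tstem\ttag"
            }
        Only entries whose tag matches the given regex pattern are yielded.
    */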

    * _select (zPattern, oNode, sWord) {
        // recursive generator
        for (let [nVal, oNextNode] of oNode.arcs.entries()) {
            if (nVal <= this.nChar) {
                // simple character
                yield* this._select(zPattern, oNextNode, sWord + this.lArcVal[nVal]);
            } else {
                let sEntry = sWord + "\t" + this.funcStemming(sWord, this.lArcVal[nVal]);
                for (let [nMorphVal, _] of oNextNode.arcs.entries()) {
                    if (!zPattern || zPattern.test(this.lArcVal[nMorphVal])) {
                        yield sEntry + "\t" + this.lArcVal[nMorphVal];
                    }
                }
            }
        }
    }

    // BINARY CONVERSION
    createBinaryJSON (nCompressionMethod) {
        console.log("Write DAWG as an indexable binary dictionary [method: "+nCompressionMethod+"]");
        if (nCompressionMethod == 1) {
            this.nBytesArc = Math.floor( (this.nArcVal.toString(2).length + 2) / 8 ) + 1;     // We add 2 bits. See DawgNode.convToBytes1()
            this.nBytesOffset = 0;
            this._calcNumBytesNodeAddress();
            this._calcNodesAddress1();
        } else {
            console.log("Error: unknown compression method");
        }
        console.log("Arc values (chars, affixes and tags): " + this.nArcVal);
        console.log("Arc size: "+this.nBytesArc+" bytes, Address size: "+this.nBytesNodeAddress+" bytes");
        console.log("-> " + this.nBytesArc+this.nBytesNodeAddress + " * " + this.nArc + " = " + (this.nBytesArc+this.nBytesNodeAddress)*this.nArc + " bytes");
        return this._createJSON(nCompressionMethod);
    }

    _calcNumBytesNodeAddress () {
        // how many bytes are needed to store all nodes/arcs in the binary dictionary
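        // Illustration: with nBytesArc == 2 and nArc == 100000, 1-byte (256) and 2-byte (65536)
        // addresses are too small for (2+n) * 100000 bytes, so nBytesNodeAddress ends up as 3.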
        this.nBytesNodeAddress = 1;
        while (((this.nBytesArc + this.nBytesNodeAddress) * this.nArc) > (2 ** (this.nBytesNodeAddress * 8))) {
            this.nBytesNodeAddress += 1;
        }
    }

    _calcNodesAddress1 () {
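        // Each node occupies (number of arcs) fixed-size records. Illustration: with
        // nBytesNode == 5 and a root of 10 arcs, the first minimized node gets addr 50,
        // and a node with 3 arcs advances the cursor by 15 bytes (arcless nodes still take one record).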
        let nBytesNode = this.nBytesArc + this.nBytesNodeAddress;
        let iAddr = this.oRoot.arcs.size * nBytesNode;
        for (let oNode of this.dMinimizedNodes.values()) {
            oNode.addr = iAddr;
            iAddr += Math.max(oNode.arcs.size, 1) * nBytesNode;
        }
    }

    _createJSON (nCompressionMethod) {
        let sByDic = "";
        if (nCompressionMethod == 1) {
            sByDic = this.oRoot.convToBytes1(this.nBytesArc, this.nBytesNodeAddress);
            for (let oNode of this.dMinimizedNodes.values()) {
                sByDic += oNode.convToBytes1(this.nBytesArc, this.nBytesNodeAddress);
            }
        }
        let oJSON = {
            "sHeader": "/grammalecte-fsa/",
            "sLangCode": this.sLangCode,
            "sLangName": this.sLangName,
            "sDicName": this.sDicName,
            "sFileName": "[none]",
            "sDate": this._getDate(),
            "nEntry": this.nEntry,
            "nChar": this.nChar,
            "nAff": this.nAff,
            "nTag": this.nTag,
            "cStemming": this.cStemming,
            "dChar": helpers.mapToObject(this.dChar),
            "nNode": this.nNode,
            "nArc": this.nArc,
            "lArcVal": this.lArcVal,
            "nArcVal": this.nArcVal,
            "nCompressionMethod": nCompressionMethod,
            "nBytesArc": this.nBytesArc,
            "nBytesNodeAddress": this.nBytesNodeAddress,
            "nBytesOffset": this.nBytesOffset,
            "sByDic": sByDic,    // binary word graph
            "l2grams": Array.from(this.a2grams)
        };
        return oJSON;
    }

    _getDate () {
        let oDate = new Date();
        let sMonth = (oDate.getMonth() + 1).toString().padStart(2, "0"); // Month+1: Because JS always sucks somehow.
        let sDay = (oDate.getDate()).toString().padStart(2, "0");
        let sHours = (oDate.getHours()).toString().padStart(2, "0");
        let sMinutes = (oDate.getMinutes()).toString().padStart(2, "0");
        return `${oDate.getFullYear()}-${sMonth}-${sDay}, ${sHours}:${sMinutes}`;
    }
}


const oNodeCounter = {
    nNextId: 0,

    getId: function () {
        this.nNextId += 1;
        return this.nNextId-1;
    },

    reset: function () {
        this.nNextId = 0;
    }
};


class DawgNode {

    constructor () {
        this.i = oNodeCounter.getId();
        this.final = false;
        this.arcs = new Map();  // key: arc value; value: a node
        this.addr = 0;          // address in the binary dictionary
    }

    __str__ () {
        // Caution: this function is used for hashing and comparison!
        let sFinalChar = (this.final) ? "1" : "0";
        let l = [sFinalChar];
        for (let [key, node] of this.arcs.entries()) {
            l.push(key.toString());
            l.push(node.i.toString());
        }
        return l.join("_");
    }

    __hash__ () {
        // Used as a key in the dMinimizedNodes map.
        return this.__str__();
    }

    __eq__ (other) {
        // Used for node comparison.
        // Nodes are equivalent if they have identical arcs, and each identical arc leads to identical states.
        return this.__str__() == other.__str__();
    }

    sortArcs (dValOccur) {
        let lTemp = Array.from(this.arcs.entries());
        lTemp.sort(function (a, b) {
            if (dValOccur.gl_get(a[0], 0) > dValOccur.gl_get(b[0], 0))
                return -1;
            if (dValOccur.gl_get(a[0], 0) < dValOccur.gl_get(b[0], 0))
                return 1;
            return 0;
        });
        this.arcs = new Map(lTemp);
    }

    display (nTab, lArcVal, bRecur=false) {
        let sResult = "    ".repeat(nTab) + "Node: " + this.i + " " + this.final + "\n";
        for (let arc of this.arcs.keys()) {
            sResult += "    ".repeat(nTab) + lArcVal[arc] + "\n";
        }
        console.log(sResult);
        if (bRecur) {
            for (let oNode of this.arcs.values()) {
                oNode.display(nTab+1, lArcVal, bRecur);
            }
        }
    }

    // VERSION 1 =====================================================================================================
    convToBytes1 (nBytesArc, nBytesNodeAddress) {
        /*
            Node scheme:
            - Arc length is defined by nBytesArc
            - Address length is defined by nBytesNodeAddress

            |                Arc                |                         Address of next node                          |
            |                                   |                                                                       |
             /---------------\ /---------------\ /---------------\ /---------------\ /---------------\ /---------------\
             | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
             \---------------/ \---------------/ \---------------/ \---------------/ \---------------/ \---------------/
             [...]
             /---------------\ /---------------\ /---------------\ /---------------\ /---------------\ /---------------\
             | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
             \---------------/ \---------------/ \---------------/ \---------------/ \---------------/ \---------------/
              ^ ^
              | |
              | |
              |  \___ if 1, last arc of this node
               \_____ if 1, this node is final (only on the first arc)
        */
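        // Illustration: with nBytesArc == 2 and nBytesNodeAddress == 3, an arc of value 5 that is
        // both the first arc of a final node and the last arc of its node is written as
        // (5 | 0x8000 | 0x4000) -> "c005", followed by its target address, e.g. 500 -> "0001f4".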
        let nArc = this.arcs.size;
        let nFinalNodeMask = 1 << ((nBytesArc*8)-1);
        let nFinalArcMask = 1 << ((nBytesArc*8)-2);
        if (this.arcs.size == 0) {
            let nVal = nFinalNodeMask | nFinalArcMask;
            let sBinary = this.convValueToHexString(nVal, nBytesArc);
            sBinary += this.convValueToHexString(0, nBytesNodeAddress);
            return sBinary;
        }
        let sBinary = "";
        let i = 1;
        for (let arc of this.arcs.keys()) {
            let nVal = arc;
            if (i == 1 && this.final) {
                nVal = nVal | nFinalNodeMask;
            }
            if (i == nArc) {
                nVal = nVal | nFinalArcMask;
            }
            i++;
            sBinary += this.convValueToHexString(nVal, nBytesArc);
            sBinary += this.convValueToHexString(this.arcs.get(arc).addr, nBytesNodeAddress);
        }
        return sBinary;
    }

    convValueToHexString (nVal, nByte) {
        // nVal: value to convert, nByte: number of bytes
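        // e.g. convValueToHexString(500, 3) -> "0001f4"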
        let sHexVal = nVal.toString(16); // conversion to hexadecimal string
        //console.log(`value: ${nVal} in ${nByte} bytes`);
        if (sHexVal.length < (nByte*2)) {
            return "0".repeat((nByte*2) - sHexVal.length) + sHexVal;
        } else if (sHexVal.length == (nByte*2)) {
            return sHexVal;
        } else {
            throw "Conversion to byte string: value bigger than allowed.";
        }
    }
}



// Another attempt to sort node arcs

const _dCharOrder = new Map([ ["", new Map()] ]);
// key: previous char, value: dictionary of chars {c: nValue}


function addWordToCharDict (sWord) {
    let cPrevious = "";
    for (let cChar of sWord) {
        if (!_dCharOrder.has(cPrevious)) {
            _dCharOrder.set(cPrevious, new Map());
        }
        _dCharOrder.get(cPrevious).set(cChar, _dCharOrder.get(cPrevious).gl_get(cChar, 0) + 1);
        cPrevious = cChar;
    }
}
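
// Illustration: after addWordToCharDict("aba"), _dCharOrder holds
// { "": {a:1}, "a": {b:1}, "b": {a:1} } (each value counts how often a char follows the key char).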

function getCharOrderAfterChar (cChar) {
    return _dCharOrder.gl_get(cChar, null);
}

function displayCharOrder () {
    for (let [key, value] of _dCharOrder.entries()) {
        let s = "[" + key + "]: ";
        let lTemp = Array.from(value.entries());
        lTemp.sort(function (a, b) {
            if (a[1] > b[1])
                return -1;
            if (a[1] < b[1])
                return 1;
            return 0;
        });
        for (let [c, n] of lTemp) {
            s += c+":"+n+", ";
        }
        console.log(s);
    }
}