ADDED   graphspell-js/dawg.js
Index: graphspell-js/dawg.js
==================================================================
--- /dev/null
+++ graphspell-js/dawg.js
@@ -0,0 +1,591 @@
+// JavaScript
+
+// FSA DICTIONARY BUILDER
+//
+// by Olivier R.
+// License: MPL 2
+//
+// This tool encodes lexicon into an indexable binary dictionary 
+// Input files MUST be encoded in UTF-8.
+
+"use strict";
+
+
+if (typeof(require) !== 'undefined') {
+    var str_transform = require("resource://grammalecte/graphspell/str_transform.js");
+}
+
+
+${map}
+
+
+class DAWG {
+    /*  DIRECT ACYCLIC WORD GRAPH
+        This code is inspired from Steve Hanov’s DAWG, 2011. (http://stevehanov.ca/blog/index.php?id=115)
+        We store suffix/affix codes and tags within the graph after the “real” word.
+        A word is a list of numbers [ c1, c2, c3 . . . cN, iAffix, iTags]
+        Each arc is an index in this.lArcVal, where are stored characters, suffix/affix codes for stemming and tags.
+        Important: As usual, the last node (after ‘iTags’) is tagged final, AND the node after ‘cN’ is ALSO tagged final.
+    */
+
+    constructor (lEntrySrc, sLangName, cStemming, xProgressBarNode=null) {
+        console.log("===== Direct Acyclic Word Graph - Minimal Acyclic Finite State Automaton =====")
+        switch (cStemming.toUpperCase()) {
+            case "A":
+                funcStemmingGen = str_transform.defineAffixCode; break;
+            case "S":
+                funcStemmingGen = str_transform.defineSuffixCode; break;
+            case "N":
+                funcStemmingGen = str_transform.noStemming; break;
+            default:
+                throw "Error. Unknown stemming code: " + cStemming;
+        }
+        
+        let lEntry = [];
+        let lChar = [''],  Char = new Map(),  nChar = 1,  dCharOccur = new Map();
+        let lAff  = [],    Aff  = new Map(),  nAff  = 0,  dAffOccur = new Map();
+        let lTag  = [],    Tag  = new Map(),  nTag  = 0,  dTagOccur = new Map();
+        let nErr = 0;
+        
+        // read lexicon
+        for (let [sFlex, sStem, sTag] of lEntrySrc) {
+            addWordToCharDict(sFlex);
+            // chars
+            for (let c of sFlex) {
+                if (!dChar.get(c)) {
+                    dChar.set(c, nChar);
+                    lChar.push(c);
+                    nChar += 1;
+                }
+                dCharOccur.set(c, dCharOccur.gl_get(c, 0) + 1);
+            }
+            // affixes to find stem from flexion
+            sAff = funcStemmingGen(sFlex, sStem)
+            if (!dAff.get(sAff)) {
+                dAff.set(sAff, nAff);
+                lAff.push(sAff);
+                nAff += 1;
+            }
+            dAffOccur.set(sAff, dCharOccur.gl_get(sAff, 0) + 1);
+            // tags
+            if (!dTag.get(sTag)) {
+                dTag.set(sTag, nTag);
+                lTag.push(sTag);
+                nTag += 1;
+            }
+            dTagOccur.set(sTag, dTagOccur.gl_get(sTag, 0) + 1);
+            lEntry.push([sFlex, dAff.get(sAff), dTag.get(sTag)]);
+        }
+        if (lEntry.length == 0) {
+            throw "Error. Empty lexicon";
+        }
+        
+        // Preparing DAWG
+        console.log(" > Preparing list of words");
+        let lVal = lChar.concat(lAff).concat(lTag);
+        let lWord = [];
+        for (let [sFlex, iAff, iTag] of lEntry) {
+            let lTemp = [];
+            for (let c of sFlex) {
+                lTemp.push(dChar.get(c));
+            }
+            lTemp.push(iAff+nChar);
+            lTemp.push(iTag+nChar+nAff)
+            lWord.push(lTemp);
+        }
+        lEntry.length = 0; // clear the array
+        
+        // Dictionary of arc values occurrency, to sort arcs of each node
+        let lKeyVal = [];
+        for (let c of dChar.keys()) { lKeyVal.push([dChar[c], dCharOccur[c]]); }
+        for (let sAff of dAff.keys()) { lKeyVal.push([dAff[sAff]+nChar, dAffOccur[sAff]]); }
+        for (let sTag in dTag.keys()) { lKeyVal.push([dTag[sTag]+nChar+nAff, dTagOccur[sTag]]); }
+        dValOccur = new Map(lKeyVal);
+        lKeyVal.length = 0; // clear the array
+
+        //with open(spfSrc[:-8]+".valuesfreq.txt", 'w', encoding='utf-8') as hFreqDst:  # DEBUG
+        //    for iKey, nOcc in sorted(dValOccur.entries(), key=lambda t: t[1], reverse=True):
+        //        hFreqDst.write("{}: {}\n".format(lVal[iKey], nOcc))
+        //    hFreqDst.close()
+        
+        this.sLang = sLangName;
+        this.nEntry = lWord.length;
+        this.aPreviousEntry = [];
+        oNodeCounter.reset();
+        this.oRoot = new DawgNode();
+        this.lUncheckedNodes = [];          // list of nodes that have not been checked for duplication.
+        this.dMinimizedNodes = new Map();   // list of unique nodes that have been checked for duplication.
+        this.lSortedNodes = [];             // version 2 and 3
+        this.nNode = 0;
+        this.nArc = 0;
+        this.dChar = dChar;
+        this.nChar = dChar.length;
+        this.nAff = nAff;
+        this.lArcVal = lVal;
+        this.nArcVal = lVal.length;
+        this.nTag = this.nArcVal - this.nChar - nAff;
+        this.cStemming = cStemming;
+        if (cStemming == "A") {
+            this.funcStemming = str_transform.changeWordWithAffixCode;
+        } else if (cStemming == "S") {
+            this.funcStemming = str_transform.changeWordWithSuffixCode;
+        } else {
+            this.funcStemming = str_transform.noStemming;
+        }
+        
+        // build
+        lWord.sort();
+        if (xProgressBarNode) {
+            xProgressBarNode.value = 0;
+            xProgressBarNode.max = lWord.length;
+        }
+        let i = 0;
+        for (let aEntry of lWord) {
+            this.insert(aEntry);
+            if (xProgressBarNode) {
+                xProgressBarNode.value = i;
+                i += 1;
+            }
+        }
+        this.finish();
+        this.countNodes();
+        this.countArcs();
+        this.sortNodes();
+        this.sortNodeArcs(dValOccur);
+        this.displayInfo();
+    }
+
+    // BUILD DAWG
+    insert (aEntry) {
+        if (aEntry < this.aPreviousEntry) {
+            throw "Error: Words must be inserted in alphabetical order.";
+        }
+        
+        // find common prefix between word and previous word
+        let nCommonPrefix = 0;
+        for (let i = 0;  i < Math.min(aEntry.length, this.aPreviousEntry.length);  i++) {
+            if (aEntry[i] != this.aPreviousEntry[i]) {
+                break;
+            }
+            nCommonPrefix += 1;
+        }
+        // Check the lUncheckedNodes for redundant nodes, proceeding from last
+        // one down to the common prefix size. Then truncate the list at that point.
+        this._minimize(nCommonPrefix);
+
+        // add the suffix, starting from the correct node mid-way through the graph
+        let oNode = (this.lUncheckedNodes.length == 0) ? this.oRoot : this.lUncheckedNodes[this.lUncheckedNodes.length-1][2];
+        let iChar = nCommonPrefix;
+        for (let c of aEntry.slice(nCommonPrefix)) {
+            let oNextNode = new DawgNode();
+            oNode.arcs.set(c, oNextNode);
+            this.lUncheckedNodes.push([oNode, c, oNextNode]);
+            if (iChar == (aEntry.length - 2)) {
+                oNode.final = true;
+            }
+            iChar += 1;
+            oNode = oNextNode;
+        }
+        oNode.final = true;
+        this.aPreviousEntry = aEntry;
+    }
+
+    finish () {
+        // minimize unchecked nodes
+        this._minimize(0);
+    }
+
+    _minimize (nDownTo) {
+        // proceed from the leaf up to a certain point
+        for (let i = this.lUncheckedNodes.length-1;  i < nDownTo-1;  i--) {
+            let [oNode, char, oChildNode] = this.lUncheckedNodes[i];
+            if (this.dMinimizedNodes.has(oChildNode.__hash__())) {
+                // replace the child with the previously encountered one
+                oNode.arcs.set(char, this.dMinimizedNodes.get(oChildNode.__hash__()));
+            } else {
+                // add the state to the minimized nodes.
+                this.dMinimizedNodes.set(oChildNode.__hash__(), oChildNode);
+            }
+            this.lUncheckedNodes.pop();
+        }
+    }
+
+    countNodes () {
+        this.nNode = this.dMinimizedNodes.size;
+    }
+
+    countArcs () {
+        this.nArc = 0;
+        for (let oNode of this.dMinimizedNodes) {
+            this.nArc += oNode.arcs.size;
+        }
+    }
+    
+    sortNodeArcs (dValOccur) {
+        console.log(" > Sort node arcs");
+        this.oRoot.sortArcs(dValOccur);
+        for (let oNode of this.dMinimizedNodes) {
+            oNode.sortArcs(dValOccur);
+        }
+    }
+
+    sortNodes () {
+        console.log(" > Sort nodes");
+        for (let oNode of this.oRoot.arcs.values()) {
+            this._parseNodes(oNode);
+        }
+    }
+    
+    _parseNodes (oNode) {
+        // Warning: recursive method
+        if (oNode.pos > 0) {
+            return;
+        }
+        oNode.setPos();
+        this.lSortedNodes.append(oNode);
+        for (let oNextNode of oNode.arcs.values()) {
+             this._parseNodes(oNextNode);
+        }
+    }
+        
+    lookup (sWord) {
+        let oNode = this.oRoot;
+        for (let c of sWord) {
+            if (!oNode.arcs.has(this.dChar.gl_get(c, ''))) {
+                return false;
+            }
+            oNode = oNode.arcs.get(this.dChar.get(c));
+        }
+        return oNode.final;
+    }
+
+    morph (sWord) {
+        let oNode = this.oRoot;
+        for (let c in sWord) {
+            if (!oNode.arcs.has(this.dChar.get(c, ''))) {
+                return '';
+            }
+            oNode = oNode.arcs.get(this.dChar.get(c));
+        }
+        if (oNode.final) {
+            let s = "* ";
+            for (let arc of oNode.arcs.keys()) {
+                if (arc >= this.nChar) {
+                    s += " [" + this.funcStemming(sWord, this.lArcVal[arc]);
+                    let oNode2 = oNode.arcs[arc]
+                    for (let arc2 of oNode2.arcs.keys()) {
+                        s += " / " + this.lArcVal[arc2];
+                    }
+                    s += "]";
+                }
+            }
+            return s;
+        }
+        return '';
+    }
+
+    displayInfo () {
+        console.log("Entries: " + this.nEntry);
+        console.log("Characters: " + this.nChar);
+        console.log("Affixes: " + this.nAff);
+        console.log("Tags: " + this.nTag);
+        console.log("Arc values: " + this.nArcVal);
+        console.log("Nodes: " + this.nNode);
+        console.log("Arcs: " + this.nArc);
+        console.log("Stemming: " + this.cStemming + "FX");
+    }
+
+    getArcStats () {
+        let d = new Map();
+        for (let oNode of this.dMinimizedNodes.values()) {
+            let n = oNode.arcs.size;
+            d.set(n, d.gl_get(n, 0) + 1);
+        }
+        let s = " * Nodes:\n";
+        for (let [nKey, nVal] of d.entries()) {
+            s = s + " " + nVal + " nodes have " + nKey + " arcs\n";
+        }
+        return s;
+    }
+
+    writeInfo () {
+        console.log(this.getArcStats());
+        console.log("\n * Values:\n");
+        let i = 0;
+        for (let s of this.lArcVal) {
+            console.log(i + ": " + s);
+        }
+    }
+
+    // BINARY CONVERSION
+    createBinary (sPathFile, nMethod) {
+        console.log("Write DAWG as an indexable binary dictionary [method: "+nMethod+"]");
+        if (nMethod == 1) {
+            this.nBytesArc = Math.floor( (this.nArcVal.toString(2).length() + 2) / 8 ) + 1;     // We add 2 bits. See DawgNode.convToBytes1()
+            this._calcNumBytesNodeAddress()
+            this._calcNodesAddress1()
+        } else {
+            console.log("Error: unknown compression method");
+        }
+        console.log("Arc values (chars, affixes and tags): " + this.nArcVal);
+        console.log("Arc size: "+this.nBytesArc+" bytes, Address size: "+this.nBytesNodeAddress+" bytes");
+        console.log("-> " + this.nBytesArc+this.nBytesNodeAddress + " * " + this.nArc + " = " + (this.nBytesArc+this.nBytesNodeAddress)*this.nArc + " bytes");
+        return this._createJSON(nMethod);
+    }
+
+    _calcNumBytesNodeAddress () {
+        // how many bytes needed to store all nodes/arcs in the binary dictionary
+        this.nBytesNodeAddress = 1;
+        while (((this.nBytesArc + this.nBytesNodeAddress) * this.nArc) > (2 ** (this.nBytesNodeAddress * 8))) {
+            this.nBytesNodeAddress += 1;
+        }
+    }
+
+    _calcNodesAddress1 () {
+        let nBytesNode = this.nBytesArc + this.nBytesNodeAddress;
+        let iAddr = this.oRoot.arcs.size * nBytesNode;
+        for (let oNode of this.dMinimizedNodes.values()) {
+            oNode.addr = iAddr;
+            iAddr += Math.max(oNode.arcs.size, 1) * nBytesNode;
+        }
+    }
+
+    _createJSON (nMethod) {
+        /*
+            Format of the binary indexable dictionary:
+            Each section is separated with 4 bytes of \0
+            
+            - Section Header:
+                /pyfsa/[version]
+                    * version is an ASCII string
+            
+            - Section Informations:
+                /[tag_lang]
+                /[number of chars]
+                /[number of bytes for each arc]
+                /[number of bytes for each address node]
+                /[number of entries]
+                /[number of nodes]
+                /[number of arcs]
+                /[number of affixes]
+                    * each field is a ASCII string
+                /[stemming code]
+                    * "S" means stems are generated by /suffix_code/, "A" means they are generated by /affix_code/
+                      See defineSuffixCode() and defineAffixCode() for details.
+                      "N" means no stemming
+            
+            - Section Values:
+                    * a list of strings encoded in binary from utf-8, each value separated with a tabulation
+            
+            - Section Word Graph (nodes / arcs)
+                    * A list of nodes which are a list of arcs with an address of the next node.
+                      See DawgNode.convToBytes() for details.
+        */
+
+
+                        
+
+
+        // DAWG: nodes / arcs
+        let sByDic = "";
+        if (nMethod == 1) {
+            sByDic = this.oRoot.convToBytes1(this.nBytesArc, this.nBytesNodeAddress);
+            for (let oNode of this.dMinimizedNodes.values()) {
+                sByDic += oNode.convToBytes1(this.nBytesArc, this.nBytesNodeAddress);
+            }
+        }
+
+        let oJSON = {
+            "sName": this.sName,
+            "nVersion": this.nMethod,
+            "sHeader": this.sHeader,
+            "lArcVal": this.lArcVal,
+            "nArcVal": this.nArcVal,
+            "byDic": oConv.toHexadecimalString(sByDic),
+            "sLang": this.sLang,
+            "nChar": this.nChar,
+            "nBytesArc": this.nBytesArc,
+            "nBytesNodeAddress": this.nBytesNodeAddress,
+            "nEntries": this.nEntry,
+            "nNode": this.nNode,
+            "nArc": this.nArc,
+            "nAff": this.nAff,
+            "cStemming": this.cStemming,
+            "nTag": this.nTag,
+            "dChar": this.dChar,
+            "_arcMask": this._arcMask,
+            "_finalNodeMask": this._finalNodeMask,
+            "_lastArcMask": this._lastArcMask,
+            "_addrBitMask": this._addrBitMask,
+            "nBytesOffset": this.nBytesOffset
+        };
+    }
+}
+
+
+const oNodeCounter = {
+    nNextId: 0,
+
+    getId: function () {
+        this.nNextId += 1;
+        return this.nNextId-1;
+    },
+
+    reset: function () {
+        this.nNextId = 0;
+    }
+}
+
+
+class DawgNode {
+
+    constructor () {
+        this.i = oNodeCounter.getId();
+        this.final = false;
+        this.arcs = new Map();  // key: arc value; value: a node
+        this.addr = 0;          // address in the binary dictionary
+        this.pos = 0;           // position in the binary dictionary (version 2)
+        this.size = 0;          // size of node in bytes (version 3)
+    }
+
+    __str__ () {
+        // Caution! this function is used for hashing and comparison!
+        let l = [];
+        if (this.final) {
+            l.push("1");
+        } else {
+            l.push("0");
+        }
+        for (let [key, node] of this.arcs.entries()) {
+            l.push(key.toString());
+            l.push(node.i.toString());
+        }
+        return l.join("_");
+    }
+
+    __hash__ () {
+        // Used as a key in a python dictionary.
+        return this.__str__();
+    }
+
+    __eq__ (other) {
+        // Used as a key in a python dictionary.
+        // Nodes are equivalent if they have identical arcs, and each identical arc leads to identical states.
+        return this.__str__() == other.__str__();
+    }
+
+    sortArcs (dValOccur) {
+        let lTemp = this.arcs.entries();
+        lTemp.sort(function (a, b) {
+            if (dValOccur.get(a[0], 0) > dValOccur.get(b[0], 0))
+                return -1;
+            if (dValOccur.get(a[0], 0) < dValOccur.get(b[0], 0))
+                return 1;
+            return 0;
+        });
+        this.arcs = new Map(lTemp);
+    }
+
+    // VERSION 1 =====================================================================================================
+    convToBytes1 (nBytesArc, nBytesNodeAddress) {
+        /*
+            Node scheme:
+            - Arc length is defined by nBytesArc
+            - Address length is defined by nBytesNodeAddress
+                                           
+            |                Arc                |                         Address of next node                          |
+            |                                   |                                                                       |
+             /---------------\ /---------------\ /---------------\ /---------------\ /---------------\ /---------------\
+             | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
+             \---------------/ \---------------/ \---------------/ \---------------/ \---------------/ \---------------/
+             [...]
+             /---------------\ /---------------\ /---------------\ /---------------\ /---------------\ /---------------\
+             | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
+             \---------------/ \---------------/ \---------------/ \---------------/ \---------------/ \---------------/
+              ^ ^
+              | |
+              | |
+              |  \___ if 1, last arc of this node
+               \_____ if 1, this node is final (only on the first arc)
+        */
+        let nArc = this.arcs.size;
+        let nFinalNodeMask = 1 << ((nBytesArc*8)-1);
+        let nFinalArcMask = 1 << ((nBytesArc*8)-2);
+        if (this.arcs.size == 0) {
+            let val = nFinalNodeMask | nFinalArcMask;
+            let by = oConv.toHexString(val, nBytesArc);
+            by += oConv.toHexString(0, nBytesNodeAddress);
+            return by;
+        }
+        let by = [];
+        let i = 1;
+        for (let arc of this.arcs.keys()) {
+            let val = arc;
+            if (i == 1 && this.final) {
+                val = val | nFinalNodeMask;
+            }
+            if (i == nArc) {
+                val = val | nFinalArcMask;
+            }
+            i++;
+            by += oConv.toHexString(val, nBytesArc);
+            by += oConv.toHexString(this.arcs.get(arc).addr, nBytesNodeAddress);
+        }
+        return by;
+    }
+}
+
+
+const oConv = {
+    toHexString: function (nVal, nByte) {
+        // nVal: value to convert, nByte: number of bytes
+        let sHexVal = nVal.toString(16); // conversion to hexadecimal string
+        if (sHexVal.length < (nByte*2)) {
+            sHexVal = "0".repeat((nByte*2) - sHexVal.length) + sHexVal;
+        } else if (sHexVal.length == (nByte*2)) {
+            return sHexVal
+        } else {
+            throw "Conversion to byte string: value bigger than allowed.";
+        }
+    }
+}
+
+
+// Another attempt to sort node arcs
+
+const _dCharOrder = new Map([ ["", new Map()] ]);
+// key: previous char, value: dictionary of chars {c: nValue}
+
+
+function addWordToCharDict (sWord) {
+    let cPrevious = "";
+    for (let cChar of sWord) {
+        if (!_dCharOrder.get(cPrevious)) {
+            _dCharOrder.set(cPrevious, new Map());
+        }
+        _dCharOrder.get(cPrevious).set(cChar, _dCharOrder.get(cPrevious).gl_get(cChar, 0) + 1);
+        cPrevious = cChar;
+    }
+}
+
+function getCharOrderAfterChar (cChar) {
+    return _dCharOrder.gl_get(cChar, null);
+}
+
+function displayCharOrder () {
+    for (let [key, value] of _dCharOrder.entries()) {
+        let s = "[" + key + "]: ";
+        let lTemp = value.entries();
+        lTemp.sort(function (a, b) {
+            if (a[1] > b[1])
+                return -1;
+            if (a[1] < b[1])
+                return 1;
+            return 0;
+        });
+        for (let [c, n] of lTemp) {
+            s += c+":"+n+", ";
+        }
+        console.log(s);
+    }
+}