DELETED gc_core/js/char_player.js Index: gc_core/js/char_player.js ================================================================== --- gc_core/js/char_player.js +++ gc_core/js/char_player.js @@ -1,330 +0,0 @@ -// list of similar chars -// useful for suggestion mechanism - -${map} - - -var char_player = { - - _dTransChars: new Map([ - ['à', 'a'], ['é', 'e'], ['î', 'i'], ['ô', 'o'], ['û', 'u'], ['ÿ', 'i'], ['y', 'i'], - ['â', 'a'], ['è', 'e'], ['ï', 'i'], ['ö', 'o'], ['ù', 'u'], ['ŷ', 'i'], - ['ä', 'a'], ['ê', 'e'], ['í', 'i'], ['ó', 'o'], ['ü', 'u'], ['ý', 'i'], - ['á', 'a'], ['ë', 'e'], ['ì', 'i'], ['ò', 'o'], ['ú', 'u'], ['ỳ', 'i'], - ['ā', 'a'], ['ē', 'e'], ['ī', 'i'], ['ō', 'o'], ['ū', 'u'], ['ȳ', 'i'], - ['ñ', 'n'], ['k', 'q'], ['w', 'v'], - ['œ', 'oe'], ['æ', 'ae'], - ]), - - simplifyWord: function (sWord) { - // word simplication before calculating distance between words - sWord = sWord.toLowerCase(); - let sNewWord = ""; - let i = 1; - for (let c of sWord) { - let cNew = this._dTransChars.gl_get(c, c); - let cNext = sWord.slice(i, i+1) - if (cNew != this._dTransChars.gl_get(cNext, cNext)) { - sNewWord += cNew; - } - i++; - } - return sNewWord.replace(/eau/g, "o").replace(/au/g, "o").replace(/ai/g, "e").replace(/ei/g, "e").replace(/ph/g, "f"); - }, - - aVowel: new Set("aáàâäāeéèêëēiíìîïīoóòôöōuúùûüūyýỳŷÿȳœæAÁÀÂÄĀEÉÈÊËĒIÍÌÎÏĪOÓÒÔÖŌUÚÙÛÜŪYÝỲŶŸȲŒÆ"), - aConsonant: new Set("bcçdfghjklmnñpqrstvwxzBCÇDFGHJKLMNÑPQRSTVWXZ"), - aDouble: new Set("bcdfjklmnprstzBCDFJKLMNPRSTZ"), // letters that may be used twice successively - - - // Similar chars - - d1to1: new Map([ - ["1", "liîLIÎ"], - ["2", "zZ"], - ["3", "eéèêEÉÈÊ"], - ["4", "aàâAÀÂ"], - ["5", "sgSG"], - ["6", "bdgBDG"], - ["7", "ltLT"], - ["8", "bB"], - ["9", "gbdGBD"], - ["0", "oôOÔ"], - - ["a", "aàâáäæ"], - ["A", "AÀÂÁÄÆ"], - ["à", "aàâáäæ"], - ["À", "AÀÂÁÄÆ"], - ["â", "aàâáäæ"], - ["Â", "AÀÂÁÄÆ"], - ["á", "aàâáäæ"], - ["Á", "AÀÂÁÄÆ"], - ["ä", "aàâáäæ"], - ["Ä", "AÀÂÁÄÆ"], - - ["æ", "æéa"], - ["Æ", "ÆÉA"], - - ["c", "cçskqśŝ"], - ["C", "CÇSKQŚŜ"], - ["ç", "cçskqśŝ"], - ["Ç", "CÇSKQŚŜ"], - - ["e", "eéèêëœ"], - ["E", "EÉÈÊËŒ"], - ["é", "eéèêëœ"], - ["É", "EÉÈÊËŒ"], - ["ê", "eéèêëœ"], - ["Ê", "EÉÈÊËŒ"], - ["è", "eéèêëœ"], - ["È", "EÉÈÊËŒ"], - ["ë", "eéèêëœ"], - ["Ë", "EÉÈÊËŒ"], - - ["g", "gj"], - ["G", "GJ"], - - ["i", "iîïyíìÿ"], - ["I", "IÎÏYÍÌŸ"], - ["î", "iîïyíìÿ"], - ["Î", "IÎÏYÍÌŸ"], - ["ï", "iîïyíìÿ"], - ["Ï", "IÎÏYÍÌŸ"], - ["í", "iîïyíìÿ"], - ["Í", "IÎÏYÍÌŸ"], - ["ì", "iîïyíìÿ"], - ["Ì", "IÎÏYÍÌŸ"], - - ["j", "jg"], - ["J", "JG"], - - ["k", "kcq"], - ["K", "KCQ"], - - ["n", "nñ"], - ["N", "NÑ"], - - ["o", "oôóòöœ"], - ["O", "OÔÓÒÖŒ"], - ["ô", "oôóòöœ"], - ["Ô", "OÔÓÒÖŒ"], - ["ó", "oôóòöœ"], - ["Ó", "OÔÓÒÖŒ"], - ["ò", "oôóòöœ"], - ["Ò", "OÔÓÒÖŒ"], - ["ö", "oôóòöœ"], - ["Ö", "OÔÓÒÖŒ"], - - ["œ", "œoôeéèêë"], - ["Œ", "ŒOÔEÉÈÊË"], - - ["q", "qck"], - ["Q", "QCK"], - - ["s", "sśŝcç"], - ["S", "SŚŜCÇ"], - ["ś", "sśŝcç"], - ["Ś", "SŚŜCÇ"], - ["ŝ", "sśŝcç"], - ["Ŝ", "SŚŜCÇ"], - - ["u", "uûùüú"], - ["U", "UÛÙÜÚ"], - ["û", "uûùüú"], - ["Û", "UÛÙÜÚ"], - ["ù", "uûùüú"], - ["Ù", "UÛÙÜÚ"], - ["ü", "uûùüú"], - ["Ü", "UÛÙÜÚ"], - ["ú", "uûùüú"], - ["Ú", "UÛÙÜÚ"], - - ["v", "vw"], - ["V", "VW"], - - ["w", "wv"], - ["W", "WV"], - - ["x", "xck"], - ["X", "XCK"], - - ["y", "yÿiîŷýỳ"], - ["Y", "YŸIÎŶÝỲ"], - ["ÿ", "yÿiîŷýỳ"], - ["Ÿ", "YŸIÎŶÝỲ"], - ["ŷ", "yÿiîŷýỳ"], - ["Ŷ", "YŸIÎŶÝỲ"], - ["ý", "yÿiîŷýỳ"], - ["Ý", "YŸIÎŶÝỲ"], - ["ỳ", "yÿiîŷýỳ"], - ["Ỳ", "YŸIÎŶÝỲ"], - - ["z", "zs"], - ["Z", "ZS"], - ]), - - d1toX: new Map([ - ["æ", ["ae",]], - ["Æ", ["AE",]], - ["b", ["bb",]], - ["B", ["BB",]], - ["c", ["cc", "ss", "qu", "ch"]], - ["C", ["CC", "SS", "QU", "CH"]], - ["d", ["dd",]], - ["D", ["DD",]], - ["é", ["ai", "ei"]], - ["É", ["AI", "EI"]], - ["f", ["ff", "ph"]], - ["F", ["FF", "PH"]], - ["g", ["gu", "ge", "gg", "gh"]], - ["G", ["GU", "GE", "GG", "GH"]], - ["j", ["jj", "dj"]], - ["J", ["JJ", "DJ"]], - ["k", ["qu", "ck", "ch", "cu", "kk", "kh"]], - ["K", ["QU", "CK", "CH", "CU", "KK", "KH"]], - ["l", ["ll",]], - ["L", ["LL",]], - ["m", ["mm", "mn"]], - ["M", ["MM", "MN"]], - ["n", ["nn", "nm", "mn"]], - ["N", ["NN", "NM", "MN"]], - ["o", ["au", "eau"]], - ["O", ["AU", "EAU"]], - ["œ", ["oe", "eu"]], - ["Œ", ["OE", "EU"]], - ["p", ["pp", "ph"]], - ["P", ["PP", "PH"]], - ["q", ["qu", "ch", "cq", "ck", "kk"]], - ["Q", ["QU", "CH", "CQ", "CK", "KK"]], - ["r", ["rr",]], - ["R", ["RR",]], - ["s", ["ss", "sh"]], - ["S", ["SS", "SH"]], - ["t", ["tt", "th"]], - ["T", ["TT", "TH"]], - ["x", ["cc", "ct", "xx"]], - ["X", ["CC", "CT", "XX"]], - ["z", ["ss", "zh"]], - ["Z", ["SS", "ZH"]], - ]), - - get1toXReplacement: function (cPrev, cCur, cNext) { - if (this.aConsonant.has(cCur) && (this.aConsonant.has(cPrev) || this.aConsonant.has(cNext))) { - return []; - } - return this.d1toX.gl_get(cCur, []); - }, - - d2toX: new Map([ - ["am", ["an", "en", "em"]], - ["AM", ["AN", "EN", "EM"]], - ["an", ["am", "en", "em"]], - ["AN", ["AM", "EN", "EM"]], - ["au", ["eau", "o", "ô"]], - ["AU", ["EAU", "O", "Ô"]], - ["em", ["an", "am", "en"]], - ["EM", ["AN", "AM", "EN"]], - ["en", ["an", "am", "em"]], - ["EN", ["AN", "AM", "EM"]], - ["ai", ["ei", "é", "è", "ê", "ë"]], - ["AI", ["EI", "É", "È", "Ê", "Ë"]], - ["ei", ["ai", "é", "è", "ê", "ë"]], - ["EI", ["AI", "É", "È", "Ê", "Ë"]], - ["ch", ["sh", "c", "ss"]], - ["CH", ["SH", "C", "SS"]], - ["ct", ["x", "cc"]], - ["CT", ["X", "CC"]], - ["oa", ["oi",]], - ["OA", ["OI",]], - ["oi", ["oa", "oie"]], - ["OI", ["OA", "OIE"]], - ["ph", ["f",]], - ["PH", ["F",]], - ["qu", ["q", "cq", "ck", "c", "k"]], - ["QU", ["Q", "CQ", "CK", "C", "K"]], - ["ss", ["c", "ç"]], - ["SS", ["C", "Ç"]], - ["un", ["ein",]], - ["UN", ["EIN",]], - ]), - - // End of word - dFinal1: new Map([ - ["a", ["as", "at", "ant", "ah"]], - ["A", ["AS", "AT", "ANT", "AH"]], - ["c", ["ch",]], - ["C", ["CH",]], - ["e", ["et", "er", "ets", "ée", "ez", "ai", "ais", "ait", "ent", "eh"]], - ["E", ["ET", "ER", "ETS", "ÉE", "EZ", "AI", "AIS", "AIT", "ENT", "EH"]], - ["é", ["et", "er", "ets", "ée", "ez", "ai", "ais", "ait"]], - ["É", ["ET", "ER", "ETS", "ÉE", "EZ", "AI", "AIS", "AIT"]], - ["è", ["et", "er", "ets", "ée", "ez", "ai", "ais", "ait"]], - ["È", ["ET", "ER", "ETS", "ÉE", "EZ", "AI", "AIS", "AIT"]], - ["ê", ["et", "er", "ets", "ée", "ez", "ai", "ais", "ait"]], - ["Ê", ["ET", "ER", "ETS", "ÉE", "EZ", "AI", "AIS", "AIT"]], - ["ë", ["et", "er", "ets", "ée", "ez", "ai", "ais", "ait"]], - ["Ë", ["ET", "ER", "ETS", "ÉE", "EZ", "AI", "AIS", "AIT"]], - ["g", ["gh",]], - ["G", ["GH",]], - ["i", ["is", "it", "ie", "in"]], - ["I", ["IS", "IT", "IE", "IN"]], - ["n", ["nt", "nd", "ns", "nh"]], - ["N", ["NT", "ND", "NS", "NH"]], - ["o", ["aut", "ot", "os"]], - ["O", ["AUT", "OT", "OS"]], - ["ô", ["aut", "ot", "os"]], - ["Ô", ["AUT", "OT", "OS"]], - ["ö", ["aut", "ot", "os"]], - ["Ö", ["AUT", "OT", "OS"]], - ["p", ["ph",]], - ["P", ["PH",]], - ["s", ["sh",]], - ["S", ["SH",]], - ["t", ["th",]], - ["T", ["TH",]], - ["u", ["ut", "us", "uh"]], - ["U", ["UT", "US", "UH"]], - ]), - - dFinal2: new Map([ - ["ai", ["aient", "ais", "et"]], - ["AI", ["AIENT", "AIS", "ET"]], - ["an", ["ant", "ent"]], - ["AN", ["ANT", "ENT"]], - ["en", ["ent", "ant"]], - ["EN", ["ENT", "ANT"]], - ["ei", ["ait", "ais"]], - ["EI", ["AIT", "AIS"]], - ["on", ["ons", "ont"]], - ["ON", ["ONS", "ONT"]], - ["oi", ["ois", "oit", "oix"]], - ["OI", ["OIS", "OIT", "OIX"]], - ]), - - - // Préfixes et suffixes - aPfx1: new Set([ - "anti", "archi", "contre", "hyper", "mé", "méta", "im", "in", "ir", "par", "proto", - "pseudo", "pré", "re", "ré", "sans", "sous", "supra", "sur", "ultra" - ]), - - aPfx2: new Set([ - "belgo", "franco", "génito", "gynéco", "médico", "russo" - ]), - - - cut: function (sWord) { - // returns an arry of strings (prefix, trimed_word, suffix) - let m = /^([a-zA-Zà-öÀ-Ö0-9_ø-ÿØ-ßĀ-ʯfi-st]+)(-(?:t-|)(?:ils?|elles|on|je|tu|nous|vous)$)/.exec(sWord); - if (m) { - return ["", m[1], m[2]]; - } - return ["", sWord, ""]; - }, - - // Other functions - filterSugg: function (aSugg) { - return aSugg.filter((sSugg) => { return !sSugg.endsWith("è") && !sSugg.endsWith("È"); }); - } - -} DELETED gc_core/js/helpers.js Index: gc_core/js/helpers.js ================================================================== --- gc_core/js/helpers.js +++ gc_core/js/helpers.js @@ -1,100 +0,0 @@ - -// HELPERS -/*jslint esversion: 6*/ -/*global console,require,exports,XMLHttpRequest*/ - -"use strict"; - -// In Firefox, there is no console.log in PromiseWorker, but there is worker.log. -// In Thunderbird, you can’t access to console directly. So it’s required to pass a log function. -let funcOutput = null; - -var helpers = { - - setLogOutput: function (func) { - funcOutput = func; - }, - - echo: function (obj) { - if (funcOutput !== null) { - funcOutput(obj); - } else { - console.log(obj); - } - return true; - }, - - logerror: function (e, bStack=false) { - let sMsg = "\n" + e.fileName + "\n" + e.name + "\nline: " + e.lineNumber + "\n" + e.message; - if (bStack) { - sMsg += "\n--- Stack ---\n" + e.stack; - } - if (funcOutput !== null) { - funcOutput(sMsg); - } else { - console.error(sMsg); - } - }, - - inspect: function (o) { - let sMsg = "__inspect__: " + typeof o; - for (let sParam in o) { - sMsg += "\n" + sParam + ": " + o.sParam; - } - sMsg += "\n" + JSON.stringify(o) + "\n__end__"; - this.echo(sMsg); - }, - - loadFile: function (spf) { - // load ressources in workers (suggested by Mozilla extensions reviewers) - // for more options have a look here: https://gist.github.com/Noitidart/ec1e6b9a593ec7e3efed - // if not in workers, use sdk/data.load() instead - try { - let xRequest; - if (typeof XMLHttpRequest !== "undefined") { - xRequest = new XMLHttpRequest(); - } else { - // JS sucks again… necessary for Thunderbird - let { Cc, Ci } = require("chrome"); - xRequest = Cc["@mozilla.org/xmlextras/xmlhttprequest;1"].createInstance(); - xRequest.QueryInterface(Ci.nsIXMLHttpRequest); - } - xRequest.open('GET', spf, false); // 3rd arg is false for synchronous, sync is acceptable in workers - xRequest.overrideMimeType('text/json'); - xRequest.send(); - return xRequest.responseText; - } - catch (e) { - this.logerror(e); - return null; - } - }, - - // conversions - objectToMap: function (obj) { - let m = new Map(); - for (let param in obj) { - m.set(param, obj[param]); - } - return m; - }, - - mapToObject: function (m) { - let obj = {}; - for (let [k, v] of m) { - obj[k] = v; - } - return obj; - } -}; - - -if (typeof(exports) !== 'undefined') { - exports.setLogOutput = helpers.setLogOutput; - exports.echo = helpers.echo; - exports.logerror = helpers.logerror; - exports.inspect = helpers.inspect; - exports.loadFile = helpers.loadFile; - exports.objectToMap = helpers.objectToMap; - exports.mapToObject = helpers.mapToObject; -} DELETED gc_core/js/ibdawg.js Index: gc_core/js/ibdawg.js ================================================================== --- gc_core/js/ibdawg.js +++ gc_core/js/ibdawg.js @@ -1,513 +0,0 @@ -//// IBDAWG -/*jslint esversion: 6*/ -/*global console,require,exports*/ - -"use strict"; - - -if (typeof(require) !== 'undefined') { - var str_transform = require("resource://grammalecte/str_transform.js"); - var helpers = require("resource://grammalecte/helpers.js"); - var char_player = require("resource://grammalecte/char_player.js"); -} - - -// Don’t remove . Necessary in TB. -${string} -${map} -${set} - - -class SuggResult { - // Structure for storing, classifying and filtering suggestions - - constructor (sWord, nDistLimit=-1) { - this.sWord = sWord; - this.sSimplifiedWord = char_player.simplifyWord(sWord); - this.nDistLimit = (nDistLimit >= 0) ? nDistLimit : Math.floor(sWord.length / 3) + 1; - this.nMinDist = 1000; - this.aSugg = new Set(); - this.dSugg = new Map([ [0, []], [1, []], [2, []] ]); - } - - addSugg (sSugg, nDeep=0) { - // add a suggestion - if (!this.aSugg.has(sSugg)) { - let nDist = str_transform.distanceDamerauLevenshtein(this.sSimplifiedWord, char_player.simplifyWord(sSugg)); - if (nDist <= this.nDistLimit) { - if (!this.dSugg.has(nDist)) { - this.dSugg.set(nDist, []); - } - this.dSugg.get(nDist).push(sSugg); - this.aSugg.add(sSugg); - if (nDist < this.nMinDist) { - this.nMinDist = nDist; - } - this.nDistLimit = Math.min(this.nDistLimit, this.nMinDist+2); - } - } - } - - getSuggestions (nSuggLimit=10, nDistLimit=-1) { - // return a list of suggestions - let lRes = []; - if (this.dSugg.get(0).length) { - // we sort the better results with the original word - let dDistTemp = new Map(); - lRes.forEach((sSugg) => { dDistTemp.set(sSugg, str_transform.distanceDamerauLevenshtein(this.sWord, sSugg)); }); - lRes = lRes.sort((sA, sB) => { return dDistTemp.get(sA) - dDistTemp.get(sB); }); - dDistTemp.clear(); - } - for (let lSugg of this.dSugg.values()) { - for (let sSugg of lSugg) { lRes.push(sSugg); } - if (lRes.length > nSuggLimit) { - break; - } - } - lRes = char_player.filterSugg(lRes); - if (this.sWord.gl_isTitle()) { - lRes = lRes.map((sSugg) => { return sSugg.gl_toCapitalize(); }); - } - else if (this.sWord.gl_isUpperCase()) { - lRes = lRes.map((sSugg) => { return sSugg.toUpperCase(); }); - } - return lRes.slice(0, nSuggLimit); - } - - reset () { - this.aSugg.clear(); - this.dSugg.clear(); - } -} - - -class IBDAWG { - // INDEXABLE BINARY DIRECT ACYCLIC WORD GRAPH - - constructor (sDicName, sPath="") { - try { - let sURL = (sPath !== "") ? sPath + "/" + sDicName : "resource://grammalecte/_dictionaries/"+sDicName; - const dict = JSON.parse(helpers.loadFile(sURL)); - Object.assign(this, dict); - } - catch (e) { - throw Error("# Error. File not found or not loadable.\n" + e.message + "\n"); - } - /* - Properties: - sName, nVersion, sHeader, lArcVal, nArcVal, byDic, sLang, nChar, nBytesArc, nBytesNodeAddress, - nEntries, nNode, nArc, nAff, cStemming, nTag, dChar, _arcMask, _finalNodeMask, _lastArcMask, _addrBitMask, nBytesOffset, - */ - - /* - Bug workaround. - Mozilla’s JS parser sucks. Can’t read file bigger than 4 Mb! - So we convert huge hexadecimal string to list of numbers… - https://github.com/mozilla/addons-linter/issues/1361 - */ - let lTemp = []; - for (let i = 0; i < this.byDic.length; i+=2) { - lTemp.push(parseInt(this.byDic.slice(i, i+2), 16)); - } - this.byDic = lTemp; - /* end of bug workaround */ - - if (!this.sHeader.startsWith("/pyfsa/")) { - throw TypeError("# Error. Not a pyfsa binary dictionary. Header: " + this.sHeader); - } - if (!(this.nVersion == "1" || this.nVersion == "2" || this.nVersion == "3")) { - throw RangeError("# Error. Unknown dictionary version: " + this.nVersion); - } - // to get the value of an arc, to get the char of an arc with its value - this.dChar = helpers.objectToMap(this.dChar); - this.dCharVal = this.dChar.gl_reverse(); - //this.byDic = new Uint8Array(this.byDic); // not quicker, even slower - - if (this.cStemming == "S") { - this.funcStemming = str_transform.getStemFromSuffixCode; - } else if (this.cStemming == "A") { - this.funcStemming = str_transform.getStemFromAffixCode; - } else { - this.funcStemming = str_transform.noStemming; - } - - // Configuring DAWG functions according to nVersion - switch (this.nVersion) { - case 1: - this.morph = this._morph1; - this.stem = this._stem1; - this._lookupArcNode = this._lookupArcNode1; - this._getArcs = this._getArcs1; - this._writeNodes = this._writeNodes1; - break; - case 2: - this.morph = this._morph2; - this.stem = this._stem2; - this._lookupArcNode = this._lookupArcNode2; - this._getArcs = this._getArcs2; - this._writeNodes = this._writeNodes2; - break; - case 3: - this.morph = this._morph3; - this.stem = this._stem3; - this._lookupArcNode = this._lookupArcNode3; - this._getArcs = this._getArcs3; - this._writeNodes = this._writeNodes3; - break; - default: - throw ValueError("# Error: unknown code: " + this.nVersion); - } - //console.log(this.getInfo()); - this.bOptNumSigle = true; - this.bOptNumAtLast = false; - } - - getInfo () { - return ` Language: ${this.sLang} Version: ${this.nVersion} Stemming: ${this.cStemming}FX\n` + - ` Arcs values: ${this.nArcVal} = ${this.nChar} characters, ${this.nAff} affixes, ${this.nTag} tags\n` + - ` Dictionary: ${this.nEntries} entries, ${this.nNode} nodes, ${this.nArc} arcs\n` + - ` Address size: ${this.nBytesNodeAddress} bytes, Arc size: ${this.nBytesArc} bytes\n`; - } - - isValidToken (sToken) { - // checks if sToken is valid (if there is hyphens in sToken, sToken is split, each part is checked) - if (this.isValid(sToken)) { - return true; - } - if (sToken.includes("-")) { - if (sToken.gl_count("-") > 4) { - return true; - } - return sToken.split("-").every(sWord => this.isValid(sWord)); - } - return false; - } - - isValid (sWord) { - // checks if sWord is valid (different casing tested if the first letter is a capital) - if (!sWord) { - return null; - } - if (sWord.includes("’")) { // ugly hack - sWord = sWord.replace("’", "'"); - } - if (this.lookup(sWord)) { - return true; - } - if (sWord.charAt(0).gl_isUpperCase()) { - if (sWord.length > 1) { - if (sWord.gl_isTitle()) { - return !!this.lookup(sWord.toLowerCase()); - } - if (sWord.gl_isUpperCase()) { - if (this.bOptNumSigle) { - return true; - } - return !!(this.lookup(sWord.toLowerCase()) || this.lookup(sWord.gl_toCapitalize())); - } - return !!this.lookup(sWord.slice(0, 1).toLowerCase() + sWord.slice(1)); - } else { - return !!this.lookup(sWord.toLowerCase()); - } - } - return false; - } - - _convBytesToInteger (aBytes) { - // Byte order = Big Endian (bigger first) - let nVal = 0; - let nWeight = (aBytes.length - 1) * 8; - for (let n of aBytes) { - nVal += n << nWeight; - nWeight = nWeight - 8; - } - return nVal; - } - - lookup (sWord) { - // returns true if sWord in dictionary (strict verification) - let iAddr = 0; - for (let c of sWord) { - if (!this.dChar.has(c)) { - return false; - } - iAddr = this._lookupArcNode(this.dChar.get(c), iAddr); - if (iAddr === null) { - return false; - } - } - return Boolean(this._convBytesToInteger(this.byDic.slice(iAddr, iAddr+this.nBytesArc)) & this._finalNodeMask); - } - - getMorph (sWord) { - // retrieves morphologies list, different casing allowed - let l = this.morph(sWord); - if (sWord[0].gl_isUpperCase()) { - l = l.concat(this.morph(sWord.toLowerCase())); - if (sWord.gl_isUpperCase() && sWord.length > 1) { - l = l.concat(this.morph(sWord.gl_toCapitalize())); - } - } - return l; - } - - suggest (sWord, nSuggLimit=10) { - // returns a array of suggestions for - let sPfx = ""; - let sSfx = ""; - [sPfx, sWord, sSfx] = char_player.cut(sWord); - let nMaxSwitch = Math.max(Math.floor(sWord.length / 3), 1); - let nMaxDel = Math.floor(sWord.length / 5); - let nMaxHardRepl = Math.max(Math.floor((sWord.length - 5) / 4), 1); - let oSuggResult = new SuggResult(sWord); - this._suggest(oSuggResult, sWord, nMaxSwitch, nMaxDel, nMaxHardRepl); - if (sWord.gl_isTitle()) { - this._suggest(oSuggResult, sWord.toLowerCase(), nMaxSwitch, nMaxDel, nMaxHardRepl); - } - else if (sWord.gl_isLowerCase()) { - this._suggest(oSuggResult, sWord.gl_toCapitalize(), nMaxSwitch, nMaxDel, nMaxHardRepl); - } - let aSugg = oSuggResult.getSuggestions(nSuggLimit); - if (sSfx || sPfx) { - // we add what we removed - return aSugg.map( (sSugg) => { return sPfx + sSugg + sSfx } ); - } - return aSugg; - } - - _suggest (oSuggResult, sRemain, nMaxSwitch=0, nMaxDel=0, nMaxHardRepl=0, nDeep=0, iAddr=0, sNewWord="", bAvoidLoop=false) { - // returns a set of suggestions - // recursive function - if (sRemain == "") { - if (this._convBytesToInteger(this.byDic.slice(iAddr, iAddr+this.nBytesArc)) & this._finalNodeMask) { - oSuggResult.addSugg(sNewWord); - } - for (let sTail of this._getTails(iAddr)) { - oSuggResult.addSugg(sNewWord+sTail); - } - return; - } - let cCurrent = sRemain.slice(0, 1); - for (let [cChar, jAddr] of this._getCharArcs(iAddr)) { - if (char_player.d1to1.gl_get(cCurrent, cCurrent).indexOf(cChar) != -1) { - this._suggest(oSuggResult, sRemain.slice(1), nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, jAddr, sNewWord+cChar); - } - else if (!bAvoidLoop && nMaxHardRepl) { - this._suggest(oSuggResult, sRemain.slice(1), nMaxSwitch, nMaxDel, nMaxHardRepl-1, nDeep+1, jAddr, sNewWord+cChar, true); - } - } - if (!bAvoidLoop) { // avoid infinite loop - if (sRemain.length > 1) { - if (cCurrent == sRemain.slice(1, 2)) { - // same char, we remove 1 char without adding 1 to - this._suggest(oSuggResult, sRemain.slice(1), nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord); - } - else { - // switching chars - if (nMaxSwitch > 0) { - this._suggest(oSuggResult, sRemain.slice(1, 2)+sRemain.slice(0, 1)+sRemain.slice(2), nMaxSwitch-1, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, true); - } - // delete char - if (nMaxDel > 0) { - this._suggest(oSuggResult, sRemain.slice(1), nMaxSwitch, nMaxDel-1, nMaxHardRepl, nDeep+1, iAddr, sNewWord, true); - } - } - // Phonetic replacements - for (let sRepl of char_player.get1toXReplacement(sNewWord.slice(-1), cCurrent, sRemain.slice(1,2))) { - this._suggest(oSuggResult, sRepl + sRemain.slice(1), nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, true); - } - for (let sRepl of char_player.d2toX.gl_get(sRemain.slice(0, 2), [])) { - this._suggest(oSuggResult, sRepl + sRemain.slice(2), nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, true); - } - } - // end of word - if (sRemain.length == 2) { - for (let sRepl of char_player.dFinal2.gl_get(sRemain, [])) { - this._suggest(oSuggResult, sRepl, nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, true); - } - } - else if (sRemain.length == 1) { - this._suggest(oSuggResult, "", nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, true); // remove last char and go on - for (let sRepl of char_player.dFinal1.gl_get(sRemain, [])) { - this._suggest(oSuggResult, sRepl, nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, true); - } - } - } - } - - * _getCharArcs (iAddr) { - // generator: yield all chars and addresses from node at address - for (let [nVal, jAddr] of this._getArcs(iAddr)) { - if (nVal < this.nChar) { - yield [this.dCharVal.get(nVal), jAddr]; - } - } - } - - * _getSimilarCharArcs (cChar, iAddr) { - // generator: yield similar char of and address of the following node - for (let c of char_player.d1to1.gl_get(cChar, [cChar])) { - if (this.dChar.has(c)) { - let jAddr = this._lookupArcNode(this.dChar.get(c), iAddr); - if (jAddr) { - yield [c, jAddr]; - } - } - } - } - - _getTails (iAddr, sTail="", n=2) { - // return a list of suffixes ending at a distance of from - let aTails = new Set(); - for (let [nVal, jAddr] of this._getArcs(iAddr)) { - if (nVal < this.nChar) { - if (this._convBytesToInteger(this.byDic.slice(jAddr, jAddr+this.nBytesArc)) & this._finalNodeMask) { - aTails.add(sTail + this.dCharVal.get(nVal)); - } - if (n && aTails.size == 0) { - aTails.gl_update(this._getTails(jAddr, sTail+this.dCharVal.get(nVal), n-1)); - } - } - } - return aTails; - } - - // morph (sWord) { - // is defined in constructor - // } - - // VERSION 1 - _morph1 (sWord) { - // returns morphologies of sWord - let iAddr = 0; - for (let c of sWord) { - if (!this.dChar.has(c)) { - return []; - } - iAddr = this._lookupArcNode(this.dChar.get(c), iAddr); - if (iAddr === null) { - return []; - } - } - if (this._convBytesToInteger(this.byDic.slice(iAddr, iAddr+this.nBytesArc)) & this._finalNodeMask) { - let l = []; - let nRawArc = 0; - while (!(nRawArc & this._lastArcMask)) { - let iEndArcAddr = iAddr + this.nBytesArc; - nRawArc = this._convBytesToInteger(this.byDic.slice(iAddr, iEndArcAddr)); - let nArc = nRawArc & this._arcMask; - if (nArc >= this.nChar) { - // This value is not a char, this is a stemming code - let sStem = ">" + this.funcStemming(sWord, this.lArcVal[nArc]); - // Now , we go to the next node and retrieve all following arcs values, all of them are tags - let iAddr2 = this._convBytesToInteger(this.byDic.slice(iEndArcAddr, iEndArcAddr+this.nBytesNodeAddress)); - let nRawArc2 = 0; - while (!(nRawArc2 & this._lastArcMask)) { - let iEndArcAddr2 = iAddr2 + this.nBytesArc; - nRawArc2 = this._convBytesToInteger(this.byDic.slice(iAddr2, iEndArcAddr2)); - l.push(sStem + " " + this.lArcVal[nRawArc2 & this._arcMask]); - iAddr2 = iEndArcAddr2+this.nBytesNodeAddress; - } - } - iAddr = iEndArcAddr + this.nBytesNodeAddress; - } - return l; - } - return []; - } - - _stem1 (sWord) { - // returns stems list of sWord - let iAddr = 0; - for (let c of sWord) { - if (!this.dChar.has(c)) { - return []; - } - iAddr = this._lookupArcNode(this.dChar.get(c), iAddr); - if (iAddr === null) { - return []; - } - } - if (this._convBytesToInteger(this.byDic.slice(iAddr, iAddr+this.nBytesArc)) & this._finalNodeMask) { - let l = []; - let nRawArc = 0; - while (!(nRawArc & this._lastArcMask)) { - let iEndArcAddr = iAddr + this.nBytesArc; - nRawArc = this._convBytesToInteger(this.byDic.slice(iAddr, iEndArcAddr)); - let nArc = nRawArc & this._arcMask; - if (nArc >= this.nChar) { - // This value is not a char, this is a stemming code - l.push(this.funcStemming(sWord, this.lArcVal[nArc])); - } - iAddr = iEndArcAddr + this.nBytesNodeAddress; - } - return l; - } - return []; - } - - _lookupArcNode1 (nVal, iAddr) { - // looks if nVal is an arc at the node at iAddr, if yes, returns address of next node else None - while (true) { - let iEndArcAddr = iAddr+this.nBytesArc; - let nRawArc = this._convBytesToInteger(this.byDic.slice(iAddr, iEndArcAddr)); - if (nVal == (nRawArc & this._arcMask)) { - // the value we are looking for - // we return the address of the next node - return this._convBytesToInteger(this.byDic.slice(iEndArcAddr, iEndArcAddr+this.nBytesNodeAddress)); - } - else { - // value not found - if (nRawArc & this._lastArcMask) { - return null; - } - iAddr = iEndArcAddr + this.nBytesNodeAddress; - } - } - } - - * _getArcs1 (iAddr) { - "generator: return all arcs at as tuples of (nVal, iAddr)" - while (true) { - let iEndArcAddr = iAddr+this.nBytesArc; - let nRawArc = this._convBytesToInteger(this.byDic.slice(iAddr, iEndArcAddr)); - yield [nRawArc & this._arcMask, this._convBytesToInteger(this.byDic.slice(iEndArcAddr, iEndArcAddr+this.nBytesNodeAddress))]; - if (nRawArc & this._lastArcMask) { - break; - } - iAddr = iEndArcAddr+this.nBytesNodeAddress; - } - } - - // VERSION 2 - _morph2 (sWord) { - // to do - } - - _stem2 (sWord) { - // to do - } - - _lookupArcNode2 (nVal, iAddr) { - // to do - } - - - // VERSION 3 - _morph3 (sWord) { - // to do - } - - _stem3 (sWord) { - // to do - } - - _lookupArcNode3 (nVal, iAddr) { - // to do - } -} - - -if (typeof(exports) !== 'undefined') { - exports.IBDAWG = IBDAWG; -} DELETED gc_core/js/str_transform.js Index: gc_core/js/str_transform.js ================================================================== --- gc_core/js/str_transform.js +++ gc_core/js/str_transform.js @@ -1,121 +0,0 @@ -//// STRING TRANSFORMATION -/*jslint esversion: 6*/ - -// Note: 48 is the ASCII code for "0" - -var str_transform = { - - distanceDamerauLevenshtein2: function (s1, s2) { - // distance of Damerau-Levenshtein between and - // https://fr.wikipedia.org/wiki/Distance_de_Damerau-Levenshtein - try { - let nLen1 = s1.length; - let nLen2 = s2.length; - let matrix = []; - for (let i = 0; i <= nLen1; i++) { - matrix[i] = new Array(nLen2 + 1); - } - for (let i = 0; i <= nLen1; i++) { - matrix[i][0] = i; - } - for (let j = 0; j <= nLen2; j++) { - matrix[0][j] = j; - } - for (let i = 1; i <= nLen1; i++) { - for (let j = 1; j <= nLen2; j++) { - let nCost = (s1[i] === s2[j]) ? 0 : 1; - matrix[i][j] = Math.min( - matrix[i-1][j] + 1, // Deletion - matrix[i][j-1] + 1, // Insertion - matrix[i-1][j-1] + nCost // Substitution - ); - if (i > 1 && j > 1 && s1[i] == s2[j-1] && s1[i-1] == s2[j]) { - matrix[i][j] = Math.min(matrix[i][j], matrix[i-2][j-2] + nCost); // Transposition - } - } - } - return matrix[nLen1][nLen2]; - } - catch (e) { - helpers.logerror(e); - } - }, - - distanceDamerauLevenshtein: function (s1, s2) { - // distance of Damerau-Levenshtein between and - // https://fr.wikipedia.org/wiki/Distance_de_Damerau-Levenshtein - try { - let nLen1 = s1.length; - let nLen2 = s2.length; - let INF = nLen1 + nLen2; - let matrix = []; - let sd = {}; - for (let i = 0; i < nLen1+2; i++) { - matrix[i] = new Array(nLen2+2); - } - matrix[0][0] = INF; - for (let i = 0; i <= nLen1; i++) { - matrix[i+1][1] = i; - matrix[i+1][0] = INF; - sd[s1[i]] = 0; - } - for (let j = 0; j <= nLen2; j++) { - matrix[1][j+1] = j; - matrix[0][j+1] = INF; - sd[s2[j]] = 0; - } - - for (let i = 1; i <= nLen1; i++) { - let DB = 0; - for (let j = 1; j <= nLen2; j++) { - let i1 = sd[s2[j-1]]; - let j1 = DB; - if (s1[i-1] === s2[j-1]) { - matrix[i+1][j+1] = matrix[i][j]; - DB = j; - } - else { - matrix[i+1][j+1] = Math.min(matrix[i][j], Math.min(matrix[i+1][j], matrix[i][j+1])) + 1; - } - matrix[i+1][j+1] = Math.min(matrix[i+1][j+1], matrix[i1] ? matrix[i1][j1] + (i-i1-1) + 1 + (j-j1-1) : Infinity); - } - sd[s1[i-1]] = i; - } - return matrix[nLen1+1][nLen2+1]; - } - catch (e) { - helpers.logerror(e); - } - }, - - showDistance (s1, s2) { - console.log(`Distance: ${s1} / ${s2} = ${this.distanceDamerauLevenshtein(s1, s2)})`); - }, - - getStemFromSuffixCode: function (sFlex, sSfxCode) { - // Suffix only - if (sSfxCode == "0") { - return sFlex; - } - return sSfxCode[0] == '0' ? sFlex + sSfxCode.slice(1) : sFlex.slice(0, -(sSfxCode.charCodeAt(0)-48)) + sSfxCode.slice(1); - }, - - getStemFromAffixCode: function (sFlex, sAffCode) { - // Prefix and suffix - if (sAffCode == "0") { - return sFlex; - } - if (!sAffCode.includes("/")) { - return "# error #"; - } - let [sPfxCode, sSfxCode] = sAffCode.split('/'); - sFlex = sPfxCode.slice(1) + sFlex.slice(sPfxCode.charCodeAt(0)-48); - return sSfxCode[0] == '0' ? sFlex + sSfxCode.slice(1) : sFlex.slice(0, -(sSfxCode.charCodeAt(0)-48)) + sSfxCode.slice(1); - } -}; - - -if (typeof(exports) !== 'undefined') { - exports.getStemFromSuffixCode = str_transform.getStemFromSuffixCode; - exports.getStemFromAffixCode = str_transform.getStemFromAffixCode; -} DELETED gc_core/js/tokenizer.js Index: gc_core/js/tokenizer.js ================================================================== --- gc_core/js/tokenizer.js +++ gc_core/js/tokenizer.js @@ -1,105 +0,0 @@ -// JavaScript -// Very simple tokenizer -/*jslint esversion: 6*/ -/*global require,exports*/ - -"use strict"; - - -if (typeof(require) !== 'undefined') { - var helpers = require("resource://grammalecte/helpers.js"); -} - - -const aTkzPatterns = { - // All regexps must start with ^. - "default": - [ - [/^[   \t]+/, 'SPACE'], - [/^\/(?:~|bin|boot|dev|etc|home|lib|mnt|opt|root|sbin|tmp|usr|var|Bureau|Documents|Images|Musique|Public|Téléchargements|Vidéos)(?:\/[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_.()-]+)*/, 'FOLDERUNIX'], - [/^[a-zA-Z]:\\(?:Program Files(?: \(x86\)|)|[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st.()]+)(?:\\[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_.()-]+)*/, 'FOLDERWIN'], - [/^[,.;:!?…«»“”‘’"(){}\[\]/·–—]+/, 'SEPARATOR'], - [/^[A-Z][.][A-Z][.](?:[A-Z][.])*/, 'ACRONYM'], - [/^(?:https?:\/\/|www[.]|[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_-]+[@.][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_-]{2,}[@.])[a-zA-Z0-9][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_.\/?&!%=+*"'@$#-]+/, 'LINK'], - [/^[#@][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_-]+/, 'TAG'], - [/^<[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+.*?>|<\/[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+ *>/, 'HTML'], - [/^\[\/?[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+\]/, 'PSEUDOHTML'], - [/^&\w+;(?:\w+;|)/, 'HTMLENTITY'], - [/^\d\d?h\d\d\b/, 'HOUR'], - [/^-?\d+(?:[.,]\d+|)/, 'NUM'], - [/^[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+(?:[’'`-][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+)*/, 'WORD'] - ], - "fr": - [ - [/^[   \t]+/, 'SPACE'], - [/^\/(?:~|bin|boot|dev|etc|home|lib|mnt|opt|root|sbin|tmp|usr|var|Bureau|Documents|Images|Musique|Public|Téléchargements|Vidéos)(?:\/[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_.()-]+)*/, 'FOLDERUNIX'], - [/^[a-zA-Z]:\\(?:Program Files(?: \(x86\)|)|[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st.()]+)(?:\\[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_.()-]+)*/, 'FOLDERWIN'], - [/^[,.;:!?…«»“”‘’"(){}\[\]/·–—]+/, 'SEPARATOR'], - [/^[A-Z][.][A-Z][.](?:[A-Z][.])*/, 'ACRONYM'], - [/^(?:https?:\/\/|www[.]|[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_-]+[@.][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_-]{2,}[@.])[a-zA-Z0-9][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_.\/?&!%=+*"'@$#-]+/, 'LINK'], - [/^[#@][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_-]+/, 'TAG'], - [/^<[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+.*?>|<\/[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+ *>/, 'HTML'], - [/^\[\/?[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+\]/, 'PSEUDOHTML'], - [/^&\w+;(?:\w+;|)/, 'HTMLENTITY'], - [/^(?:l|d|n|m|t|s|j|c|ç|lorsqu|puisqu|jusqu|quoiqu|qu)['’`]/i, 'ELPFX'], - [/^\d\d?[hm]\d\d\b/, 'HOUR'], - [/^\d+(?:er|nd|e|de|ième|ème|eme)s?\b/, 'ORDINAL'], - [/^-?\d+(?:[.,]\d+|)/, 'NUM'], - [/^[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+(?:[’'`-][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+)*/, 'WORD'] - ] -}; - - -class Tokenizer { - - constructor (sLang) { - this.sLang = sLang; - if (!aTkzPatterns.hasOwnProperty(sLang)) { - this.sLang = "default"; - } - this.aRules = aTkzPatterns[this.sLang]; - } - - * genTokens (sText) { - let m; - let i = 0; - while (sText) { - let nCut = 1; - for (let [zRegex, sType] of this.aRules) { - try { - if ((m = zRegex.exec(sText)) !== null) { - if (sType == 'SEPARATOR') { - for (let c of m[0]) { - yield { "sType": sType, "sValue": c, "nStart": i, "nEnd": i + m[0].length } - } - } else { - yield { "sType": sType, "sValue": m[0], "nStart": i, "nEnd": i + m[0].length } - } - nCut = m[0].length; - break; - } - } - catch (e) { - helpers.logerror(e); - } - } - i += nCut; - sText = sText.slice(nCut); - } - } - - getSpellingErrors (sText, oDict) { - let aSpellErr = []; - for (let oToken of this.genTokens(sText)) { - if (oToken.sType === 'WORD' && !oDict.isValidToken(oToken.sValue)) { - aSpellErr.push(oToken); - } - } - return aSpellErr; - } -} - - -if (typeof(exports) !== 'undefined') { - exports.Tokenizer = Tokenizer; -} ADDED graphspell-js/char_player.js Index: graphspell-js/char_player.js ================================================================== --- graphspell-js/char_player.js +++ graphspell-js/char_player.js @@ -0,0 +1,330 @@ +// list of similar chars +// useful for suggestion mechanism + +${map} + + +var char_player = { + + _dTransChars: new Map([ + ['à', 'a'], ['é', 'e'], ['î', 'i'], ['ô', 'o'], ['û', 'u'], ['ÿ', 'i'], ['y', 'i'], + ['â', 'a'], ['è', 'e'], ['ï', 'i'], ['ö', 'o'], ['ù', 'u'], ['ŷ', 'i'], + ['ä', 'a'], ['ê', 'e'], ['í', 'i'], ['ó', 'o'], ['ü', 'u'], ['ý', 'i'], + ['á', 'a'], ['ë', 'e'], ['ì', 'i'], ['ò', 'o'], ['ú', 'u'], ['ỳ', 'i'], + ['ā', 'a'], ['ē', 'e'], ['ī', 'i'], ['ō', 'o'], ['ū', 'u'], ['ȳ', 'i'], + ['ñ', 'n'], ['k', 'q'], ['w', 'v'], + ['œ', 'oe'], ['æ', 'ae'], + ]), + + simplifyWord: function (sWord) { + // word simplication before calculating distance between words + sWord = sWord.toLowerCase(); + let sNewWord = ""; + let i = 1; + for (let c of sWord) { + let cNew = this._dTransChars.gl_get(c, c); + let cNext = sWord.slice(i, i+1) + if (cNew != this._dTransChars.gl_get(cNext, cNext)) { + sNewWord += cNew; + } + i++; + } + return sNewWord.replace(/eau/g, "o").replace(/au/g, "o").replace(/ai/g, "e").replace(/ei/g, "e").replace(/ph/g, "f"); + }, + + aVowel: new Set("aáàâäāeéèêëēiíìîïīoóòôöōuúùûüūyýỳŷÿȳœæAÁÀÂÄĀEÉÈÊËĒIÍÌÎÏĪOÓÒÔÖŌUÚÙÛÜŪYÝỲŶŸȲŒÆ"), + aConsonant: new Set("bcçdfghjklmnñpqrstvwxzBCÇDFGHJKLMNÑPQRSTVWXZ"), + aDouble: new Set("bcdfjklmnprstzBCDFJKLMNPRSTZ"), // letters that may be used twice successively + + + // Similar chars + + d1to1: new Map([ + ["1", "liîLIÎ"], + ["2", "zZ"], + ["3", "eéèêEÉÈÊ"], + ["4", "aàâAÀÂ"], + ["5", "sgSG"], + ["6", "bdgBDG"], + ["7", "ltLT"], + ["8", "bB"], + ["9", "gbdGBD"], + ["0", "oôOÔ"], + + ["a", "aàâáäæ"], + ["A", "AÀÂÁÄÆ"], + ["à", "aàâáäæ"], + ["À", "AÀÂÁÄÆ"], + ["â", "aàâáäæ"], + ["Â", "AÀÂÁÄÆ"], + ["á", "aàâáäæ"], + ["Á", "AÀÂÁÄÆ"], + ["ä", "aàâáäæ"], + ["Ä", "AÀÂÁÄÆ"], + + ["æ", "æéa"], + ["Æ", "ÆÉA"], + + ["c", "cçskqśŝ"], + ["C", "CÇSKQŚŜ"], + ["ç", "cçskqśŝ"], + ["Ç", "CÇSKQŚŜ"], + + ["e", "eéèêëœ"], + ["E", "EÉÈÊËŒ"], + ["é", "eéèêëœ"], + ["É", "EÉÈÊËŒ"], + ["ê", "eéèêëœ"], + ["Ê", "EÉÈÊËŒ"], + ["è", "eéèêëœ"], + ["È", "EÉÈÊËŒ"], + ["ë", "eéèêëœ"], + ["Ë", "EÉÈÊËŒ"], + + ["g", "gj"], + ["G", "GJ"], + + ["i", "iîïyíìÿ"], + ["I", "IÎÏYÍÌŸ"], + ["î", "iîïyíìÿ"], + ["Î", "IÎÏYÍÌŸ"], + ["ï", "iîïyíìÿ"], + ["Ï", "IÎÏYÍÌŸ"], + ["í", "iîïyíìÿ"], + ["Í", "IÎÏYÍÌŸ"], + ["ì", "iîïyíìÿ"], + ["Ì", "IÎÏYÍÌŸ"], + + ["j", "jg"], + ["J", "JG"], + + ["k", "kcq"], + ["K", "KCQ"], + + ["n", "nñ"], + ["N", "NÑ"], + + ["o", "oôóòöœ"], + ["O", "OÔÓÒÖŒ"], + ["ô", "oôóòöœ"], + ["Ô", "OÔÓÒÖŒ"], + ["ó", "oôóòöœ"], + ["Ó", "OÔÓÒÖŒ"], + ["ò", "oôóòöœ"], + ["Ò", "OÔÓÒÖŒ"], + ["ö", "oôóòöœ"], + ["Ö", "OÔÓÒÖŒ"], + + ["œ", "œoôeéèêë"], + ["Œ", "ŒOÔEÉÈÊË"], + + ["q", "qck"], + ["Q", "QCK"], + + ["s", "sśŝcç"], + ["S", "SŚŜCÇ"], + ["ś", "sśŝcç"], + ["Ś", "SŚŜCÇ"], + ["ŝ", "sśŝcç"], + ["Ŝ", "SŚŜCÇ"], + + ["u", "uûùüú"], + ["U", "UÛÙÜÚ"], + ["û", "uûùüú"], + ["Û", "UÛÙÜÚ"], + ["ù", "uûùüú"], + ["Ù", "UÛÙÜÚ"], + ["ü", "uûùüú"], + ["Ü", "UÛÙÜÚ"], + ["ú", "uûùüú"], + ["Ú", "UÛÙÜÚ"], + + ["v", "vw"], + ["V", "VW"], + + ["w", "wv"], + ["W", "WV"], + + ["x", "xck"], + ["X", "XCK"], + + ["y", "yÿiîŷýỳ"], + ["Y", "YŸIÎŶÝỲ"], + ["ÿ", "yÿiîŷýỳ"], + ["Ÿ", "YŸIÎŶÝỲ"], + ["ŷ", "yÿiîŷýỳ"], + ["Ŷ", "YŸIÎŶÝỲ"], + ["ý", "yÿiîŷýỳ"], + ["Ý", "YŸIÎŶÝỲ"], + ["ỳ", "yÿiîŷýỳ"], + ["Ỳ", "YŸIÎŶÝỲ"], + + ["z", "zs"], + ["Z", "ZS"], + ]), + + d1toX: new Map([ + ["æ", ["ae",]], + ["Æ", ["AE",]], + ["b", ["bb",]], + ["B", ["BB",]], + ["c", ["cc", "ss", "qu", "ch"]], + ["C", ["CC", "SS", "QU", "CH"]], + ["d", ["dd",]], + ["D", ["DD",]], + ["é", ["ai", "ei"]], + ["É", ["AI", "EI"]], + ["f", ["ff", "ph"]], + ["F", ["FF", "PH"]], + ["g", ["gu", "ge", "gg", "gh"]], + ["G", ["GU", "GE", "GG", "GH"]], + ["j", ["jj", "dj"]], + ["J", ["JJ", "DJ"]], + ["k", ["qu", "ck", "ch", "cu", "kk", "kh"]], + ["K", ["QU", "CK", "CH", "CU", "KK", "KH"]], + ["l", ["ll",]], + ["L", ["LL",]], + ["m", ["mm", "mn"]], + ["M", ["MM", "MN"]], + ["n", ["nn", "nm", "mn"]], + ["N", ["NN", "NM", "MN"]], + ["o", ["au", "eau"]], + ["O", ["AU", "EAU"]], + ["œ", ["oe", "eu"]], + ["Œ", ["OE", "EU"]], + ["p", ["pp", "ph"]], + ["P", ["PP", "PH"]], + ["q", ["qu", "ch", "cq", "ck", "kk"]], + ["Q", ["QU", "CH", "CQ", "CK", "KK"]], + ["r", ["rr",]], + ["R", ["RR",]], + ["s", ["ss", "sh"]], + ["S", ["SS", "SH"]], + ["t", ["tt", "th"]], + ["T", ["TT", "TH"]], + ["x", ["cc", "ct", "xx"]], + ["X", ["CC", "CT", "XX"]], + ["z", ["ss", "zh"]], + ["Z", ["SS", "ZH"]], + ]), + + get1toXReplacement: function (cPrev, cCur, cNext) { + if (this.aConsonant.has(cCur) && (this.aConsonant.has(cPrev) || this.aConsonant.has(cNext))) { + return []; + } + return this.d1toX.gl_get(cCur, []); + }, + + d2toX: new Map([ + ["am", ["an", "en", "em"]], + ["AM", ["AN", "EN", "EM"]], + ["an", ["am", "en", "em"]], + ["AN", ["AM", "EN", "EM"]], + ["au", ["eau", "o", "ô"]], + ["AU", ["EAU", "O", "Ô"]], + ["em", ["an", "am", "en"]], + ["EM", ["AN", "AM", "EN"]], + ["en", ["an", "am", "em"]], + ["EN", ["AN", "AM", "EM"]], + ["ai", ["ei", "é", "è", "ê", "ë"]], + ["AI", ["EI", "É", "È", "Ê", "Ë"]], + ["ei", ["ai", "é", "è", "ê", "ë"]], + ["EI", ["AI", "É", "È", "Ê", "Ë"]], + ["ch", ["sh", "c", "ss"]], + ["CH", ["SH", "C", "SS"]], + ["ct", ["x", "cc"]], + ["CT", ["X", "CC"]], + ["oa", ["oi",]], + ["OA", ["OI",]], + ["oi", ["oa", "oie"]], + ["OI", ["OA", "OIE"]], + ["ph", ["f",]], + ["PH", ["F",]], + ["qu", ["q", "cq", "ck", "c", "k"]], + ["QU", ["Q", "CQ", "CK", "C", "K"]], + ["ss", ["c", "ç"]], + ["SS", ["C", "Ç"]], + ["un", ["ein",]], + ["UN", ["EIN",]], + ]), + + // End of word + dFinal1: new Map([ + ["a", ["as", "at", "ant", "ah"]], + ["A", ["AS", "AT", "ANT", "AH"]], + ["c", ["ch",]], + ["C", ["CH",]], + ["e", ["et", "er", "ets", "ée", "ez", "ai", "ais", "ait", "ent", "eh"]], + ["E", ["ET", "ER", "ETS", "ÉE", "EZ", "AI", "AIS", "AIT", "ENT", "EH"]], + ["é", ["et", "er", "ets", "ée", "ez", "ai", "ais", "ait"]], + ["É", ["ET", "ER", "ETS", "ÉE", "EZ", "AI", "AIS", "AIT"]], + ["è", ["et", "er", "ets", "ée", "ez", "ai", "ais", "ait"]], + ["È", ["ET", "ER", "ETS", "ÉE", "EZ", "AI", "AIS", "AIT"]], + ["ê", ["et", "er", "ets", "ée", "ez", "ai", "ais", "ait"]], + ["Ê", ["ET", "ER", "ETS", "ÉE", "EZ", "AI", "AIS", "AIT"]], + ["ë", ["et", "er", "ets", "ée", "ez", "ai", "ais", "ait"]], + ["Ë", ["ET", "ER", "ETS", "ÉE", "EZ", "AI", "AIS", "AIT"]], + ["g", ["gh",]], + ["G", ["GH",]], + ["i", ["is", "it", "ie", "in"]], + ["I", ["IS", "IT", "IE", "IN"]], + ["n", ["nt", "nd", "ns", "nh"]], + ["N", ["NT", "ND", "NS", "NH"]], + ["o", ["aut", "ot", "os"]], + ["O", ["AUT", "OT", "OS"]], + ["ô", ["aut", "ot", "os"]], + ["Ô", ["AUT", "OT", "OS"]], + ["ö", ["aut", "ot", "os"]], + ["Ö", ["AUT", "OT", "OS"]], + ["p", ["ph",]], + ["P", ["PH",]], + ["s", ["sh",]], + ["S", ["SH",]], + ["t", ["th",]], + ["T", ["TH",]], + ["u", ["ut", "us", "uh"]], + ["U", ["UT", "US", "UH"]], + ]), + + dFinal2: new Map([ + ["ai", ["aient", "ais", "et"]], + ["AI", ["AIENT", "AIS", "ET"]], + ["an", ["ant", "ent"]], + ["AN", ["ANT", "ENT"]], + ["en", ["ent", "ant"]], + ["EN", ["ENT", "ANT"]], + ["ei", ["ait", "ais"]], + ["EI", ["AIT", "AIS"]], + ["on", ["ons", "ont"]], + ["ON", ["ONS", "ONT"]], + ["oi", ["ois", "oit", "oix"]], + ["OI", ["OIS", "OIT", "OIX"]], + ]), + + + // Préfixes et suffixes + aPfx1: new Set([ + "anti", "archi", "contre", "hyper", "mé", "méta", "im", "in", "ir", "par", "proto", + "pseudo", "pré", "re", "ré", "sans", "sous", "supra", "sur", "ultra" + ]), + + aPfx2: new Set([ + "belgo", "franco", "génito", "gynéco", "médico", "russo" + ]), + + + cut: function (sWord) { + // returns an arry of strings (prefix, trimed_word, suffix) + let m = /^([a-zA-Zà-öÀ-Ö0-9_ø-ÿØ-ßĀ-ʯfi-st]+)(-(?:t-|)(?:ils?|elles|on|je|tu|nous|vous)$)/.exec(sWord); + if (m) { + return ["", m[1], m[2]]; + } + return ["", sWord, ""]; + }, + + // Other functions + filterSugg: function (aSugg) { + return aSugg.filter((sSugg) => { return !sSugg.endsWith("è") && !sSugg.endsWith("È"); }); + } + +} ADDED graphspell-js/helpers.js Index: graphspell-js/helpers.js ================================================================== --- graphspell-js/helpers.js +++ graphspell-js/helpers.js @@ -0,0 +1,100 @@ + +// HELPERS +/*jslint esversion: 6*/ +/*global console,require,exports,XMLHttpRequest*/ + +"use strict"; + +// In Firefox, there is no console.log in PromiseWorker, but there is worker.log. +// In Thunderbird, you can’t access to console directly. So it’s required to pass a log function. +let funcOutput = null; + +var helpers = { + + setLogOutput: function (func) { + funcOutput = func; + }, + + echo: function (obj) { + if (funcOutput !== null) { + funcOutput(obj); + } else { + console.log(obj); + } + return true; + }, + + logerror: function (e, bStack=false) { + let sMsg = "\n" + e.fileName + "\n" + e.name + "\nline: " + e.lineNumber + "\n" + e.message; + if (bStack) { + sMsg += "\n--- Stack ---\n" + e.stack; + } + if (funcOutput !== null) { + funcOutput(sMsg); + } else { + console.error(sMsg); + } + }, + + inspect: function (o) { + let sMsg = "__inspect__: " + typeof o; + for (let sParam in o) { + sMsg += "\n" + sParam + ": " + o.sParam; + } + sMsg += "\n" + JSON.stringify(o) + "\n__end__"; + this.echo(sMsg); + }, + + loadFile: function (spf) { + // load ressources in workers (suggested by Mozilla extensions reviewers) + // for more options have a look here: https://gist.github.com/Noitidart/ec1e6b9a593ec7e3efed + // if not in workers, use sdk/data.load() instead + try { + let xRequest; + if (typeof XMLHttpRequest !== "undefined") { + xRequest = new XMLHttpRequest(); + } else { + // JS sucks again… necessary for Thunderbird + let { Cc, Ci } = require("chrome"); + xRequest = Cc["@mozilla.org/xmlextras/xmlhttprequest;1"].createInstance(); + xRequest.QueryInterface(Ci.nsIXMLHttpRequest); + } + xRequest.open('GET', spf, false); // 3rd arg is false for synchronous, sync is acceptable in workers + xRequest.overrideMimeType('text/json'); + xRequest.send(); + return xRequest.responseText; + } + catch (e) { + this.logerror(e); + return null; + } + }, + + // conversions + objectToMap: function (obj) { + let m = new Map(); + for (let param in obj) { + m.set(param, obj[param]); + } + return m; + }, + + mapToObject: function (m) { + let obj = {}; + for (let [k, v] of m) { + obj[k] = v; + } + return obj; + } +}; + + +if (typeof(exports) !== 'undefined') { + exports.setLogOutput = helpers.setLogOutput; + exports.echo = helpers.echo; + exports.logerror = helpers.logerror; + exports.inspect = helpers.inspect; + exports.loadFile = helpers.loadFile; + exports.objectToMap = helpers.objectToMap; + exports.mapToObject = helpers.mapToObject; +} ADDED graphspell-js/ibdawg.js Index: graphspell-js/ibdawg.js ================================================================== --- graphspell-js/ibdawg.js +++ graphspell-js/ibdawg.js @@ -0,0 +1,513 @@ +//// IBDAWG +/*jslint esversion: 6*/ +/*global console,require,exports*/ + +"use strict"; + + +if (typeof(require) !== 'undefined') { + var str_transform = require("resource://grammalecte/str_transform.js"); + var helpers = require("resource://grammalecte/helpers.js"); + var char_player = require("resource://grammalecte/char_player.js"); +} + + +// Don’t remove . Necessary in TB. +${string} +${map} +${set} + + +class SuggResult { + // Structure for storing, classifying and filtering suggestions + + constructor (sWord, nDistLimit=-1) { + this.sWord = sWord; + this.sSimplifiedWord = char_player.simplifyWord(sWord); + this.nDistLimit = (nDistLimit >= 0) ? nDistLimit : Math.floor(sWord.length / 3) + 1; + this.nMinDist = 1000; + this.aSugg = new Set(); + this.dSugg = new Map([ [0, []], [1, []], [2, []] ]); + } + + addSugg (sSugg, nDeep=0) { + // add a suggestion + if (!this.aSugg.has(sSugg)) { + let nDist = str_transform.distanceDamerauLevenshtein(this.sSimplifiedWord, char_player.simplifyWord(sSugg)); + if (nDist <= this.nDistLimit) { + if (!this.dSugg.has(nDist)) { + this.dSugg.set(nDist, []); + } + this.dSugg.get(nDist).push(sSugg); + this.aSugg.add(sSugg); + if (nDist < this.nMinDist) { + this.nMinDist = nDist; + } + this.nDistLimit = Math.min(this.nDistLimit, this.nMinDist+2); + } + } + } + + getSuggestions (nSuggLimit=10, nDistLimit=-1) { + // return a list of suggestions + let lRes = []; + if (this.dSugg.get(0).length) { + // we sort the better results with the original word + let dDistTemp = new Map(); + lRes.forEach((sSugg) => { dDistTemp.set(sSugg, str_transform.distanceDamerauLevenshtein(this.sWord, sSugg)); }); + lRes = lRes.sort((sA, sB) => { return dDistTemp.get(sA) - dDistTemp.get(sB); }); + dDistTemp.clear(); + } + for (let lSugg of this.dSugg.values()) { + for (let sSugg of lSugg) { lRes.push(sSugg); } + if (lRes.length > nSuggLimit) { + break; + } + } + lRes = char_player.filterSugg(lRes); + if (this.sWord.gl_isTitle()) { + lRes = lRes.map((sSugg) => { return sSugg.gl_toCapitalize(); }); + } + else if (this.sWord.gl_isUpperCase()) { + lRes = lRes.map((sSugg) => { return sSugg.toUpperCase(); }); + } + return lRes.slice(0, nSuggLimit); + } + + reset () { + this.aSugg.clear(); + this.dSugg.clear(); + } +} + + +class IBDAWG { + // INDEXABLE BINARY DIRECT ACYCLIC WORD GRAPH + + constructor (sDicName, sPath="") { + try { + let sURL = (sPath !== "") ? sPath + "/" + sDicName : "resource://grammalecte/_dictionaries/"+sDicName; + const dict = JSON.parse(helpers.loadFile(sURL)); + Object.assign(this, dict); + } + catch (e) { + throw Error("# Error. File not found or not loadable.\n" + e.message + "\n"); + } + /* + Properties: + sName, nVersion, sHeader, lArcVal, nArcVal, byDic, sLang, nChar, nBytesArc, nBytesNodeAddress, + nEntries, nNode, nArc, nAff, cStemming, nTag, dChar, _arcMask, _finalNodeMask, _lastArcMask, _addrBitMask, nBytesOffset, + */ + + /* + Bug workaround. + Mozilla’s JS parser sucks. Can’t read file bigger than 4 Mb! + So we convert huge hexadecimal string to list of numbers… + https://github.com/mozilla/addons-linter/issues/1361 + */ + let lTemp = []; + for (let i = 0; i < this.byDic.length; i+=2) { + lTemp.push(parseInt(this.byDic.slice(i, i+2), 16)); + } + this.byDic = lTemp; + /* end of bug workaround */ + + if (!this.sHeader.startsWith("/pyfsa/")) { + throw TypeError("# Error. Not a pyfsa binary dictionary. Header: " + this.sHeader); + } + if (!(this.nVersion == "1" || this.nVersion == "2" || this.nVersion == "3")) { + throw RangeError("# Error. Unknown dictionary version: " + this.nVersion); + } + // to get the value of an arc, to get the char of an arc with its value + this.dChar = helpers.objectToMap(this.dChar); + this.dCharVal = this.dChar.gl_reverse(); + //this.byDic = new Uint8Array(this.byDic); // not quicker, even slower + + if (this.cStemming == "S") { + this.funcStemming = str_transform.getStemFromSuffixCode; + } else if (this.cStemming == "A") { + this.funcStemming = str_transform.getStemFromAffixCode; + } else { + this.funcStemming = str_transform.noStemming; + } + + // Configuring DAWG functions according to nVersion + switch (this.nVersion) { + case 1: + this.morph = this._morph1; + this.stem = this._stem1; + this._lookupArcNode = this._lookupArcNode1; + this._getArcs = this._getArcs1; + this._writeNodes = this._writeNodes1; + break; + case 2: + this.morph = this._morph2; + this.stem = this._stem2; + this._lookupArcNode = this._lookupArcNode2; + this._getArcs = this._getArcs2; + this._writeNodes = this._writeNodes2; + break; + case 3: + this.morph = this._morph3; + this.stem = this._stem3; + this._lookupArcNode = this._lookupArcNode3; + this._getArcs = this._getArcs3; + this._writeNodes = this._writeNodes3; + break; + default: + throw ValueError("# Error: unknown code: " + this.nVersion); + } + //console.log(this.getInfo()); + this.bOptNumSigle = true; + this.bOptNumAtLast = false; + } + + getInfo () { + return ` Language: ${this.sLang} Version: ${this.nVersion} Stemming: ${this.cStemming}FX\n` + + ` Arcs values: ${this.nArcVal} = ${this.nChar} characters, ${this.nAff} affixes, ${this.nTag} tags\n` + + ` Dictionary: ${this.nEntries} entries, ${this.nNode} nodes, ${this.nArc} arcs\n` + + ` Address size: ${this.nBytesNodeAddress} bytes, Arc size: ${this.nBytesArc} bytes\n`; + } + + isValidToken (sToken) { + // checks if sToken is valid (if there is hyphens in sToken, sToken is split, each part is checked) + if (this.isValid(sToken)) { + return true; + } + if (sToken.includes("-")) { + if (sToken.gl_count("-") > 4) { + return true; + } + return sToken.split("-").every(sWord => this.isValid(sWord)); + } + return false; + } + + isValid (sWord) { + // checks if sWord is valid (different casing tested if the first letter is a capital) + if (!sWord) { + return null; + } + if (sWord.includes("’")) { // ugly hack + sWord = sWord.replace("’", "'"); + } + if (this.lookup(sWord)) { + return true; + } + if (sWord.charAt(0).gl_isUpperCase()) { + if (sWord.length > 1) { + if (sWord.gl_isTitle()) { + return !!this.lookup(sWord.toLowerCase()); + } + if (sWord.gl_isUpperCase()) { + if (this.bOptNumSigle) { + return true; + } + return !!(this.lookup(sWord.toLowerCase()) || this.lookup(sWord.gl_toCapitalize())); + } + return !!this.lookup(sWord.slice(0, 1).toLowerCase() + sWord.slice(1)); + } else { + return !!this.lookup(sWord.toLowerCase()); + } + } + return false; + } + + _convBytesToInteger (aBytes) { + // Byte order = Big Endian (bigger first) + let nVal = 0; + let nWeight = (aBytes.length - 1) * 8; + for (let n of aBytes) { + nVal += n << nWeight; + nWeight = nWeight - 8; + } + return nVal; + } + + lookup (sWord) { + // returns true if sWord in dictionary (strict verification) + let iAddr = 0; + for (let c of sWord) { + if (!this.dChar.has(c)) { + return false; + } + iAddr = this._lookupArcNode(this.dChar.get(c), iAddr); + if (iAddr === null) { + return false; + } + } + return Boolean(this._convBytesToInteger(this.byDic.slice(iAddr, iAddr+this.nBytesArc)) & this._finalNodeMask); + } + + getMorph (sWord) { + // retrieves morphologies list, different casing allowed + let l = this.morph(sWord); + if (sWord[0].gl_isUpperCase()) { + l = l.concat(this.morph(sWord.toLowerCase())); + if (sWord.gl_isUpperCase() && sWord.length > 1) { + l = l.concat(this.morph(sWord.gl_toCapitalize())); + } + } + return l; + } + + suggest (sWord, nSuggLimit=10) { + // returns a array of suggestions for + let sPfx = ""; + let sSfx = ""; + [sPfx, sWord, sSfx] = char_player.cut(sWord); + let nMaxSwitch = Math.max(Math.floor(sWord.length / 3), 1); + let nMaxDel = Math.floor(sWord.length / 5); + let nMaxHardRepl = Math.max(Math.floor((sWord.length - 5) / 4), 1); + let oSuggResult = new SuggResult(sWord); + this._suggest(oSuggResult, sWord, nMaxSwitch, nMaxDel, nMaxHardRepl); + if (sWord.gl_isTitle()) { + this._suggest(oSuggResult, sWord.toLowerCase(), nMaxSwitch, nMaxDel, nMaxHardRepl); + } + else if (sWord.gl_isLowerCase()) { + this._suggest(oSuggResult, sWord.gl_toCapitalize(), nMaxSwitch, nMaxDel, nMaxHardRepl); + } + let aSugg = oSuggResult.getSuggestions(nSuggLimit); + if (sSfx || sPfx) { + // we add what we removed + return aSugg.map( (sSugg) => { return sPfx + sSugg + sSfx } ); + } + return aSugg; + } + + _suggest (oSuggResult, sRemain, nMaxSwitch=0, nMaxDel=0, nMaxHardRepl=0, nDeep=0, iAddr=0, sNewWord="", bAvoidLoop=false) { + // returns a set of suggestions + // recursive function + if (sRemain == "") { + if (this._convBytesToInteger(this.byDic.slice(iAddr, iAddr+this.nBytesArc)) & this._finalNodeMask) { + oSuggResult.addSugg(sNewWord); + } + for (let sTail of this._getTails(iAddr)) { + oSuggResult.addSugg(sNewWord+sTail); + } + return; + } + let cCurrent = sRemain.slice(0, 1); + for (let [cChar, jAddr] of this._getCharArcs(iAddr)) { + if (char_player.d1to1.gl_get(cCurrent, cCurrent).indexOf(cChar) != -1) { + this._suggest(oSuggResult, sRemain.slice(1), nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, jAddr, sNewWord+cChar); + } + else if (!bAvoidLoop && nMaxHardRepl) { + this._suggest(oSuggResult, sRemain.slice(1), nMaxSwitch, nMaxDel, nMaxHardRepl-1, nDeep+1, jAddr, sNewWord+cChar, true); + } + } + if (!bAvoidLoop) { // avoid infinite loop + if (sRemain.length > 1) { + if (cCurrent == sRemain.slice(1, 2)) { + // same char, we remove 1 char without adding 1 to + this._suggest(oSuggResult, sRemain.slice(1), nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord); + } + else { + // switching chars + if (nMaxSwitch > 0) { + this._suggest(oSuggResult, sRemain.slice(1, 2)+sRemain.slice(0, 1)+sRemain.slice(2), nMaxSwitch-1, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, true); + } + // delete char + if (nMaxDel > 0) { + this._suggest(oSuggResult, sRemain.slice(1), nMaxSwitch, nMaxDel-1, nMaxHardRepl, nDeep+1, iAddr, sNewWord, true); + } + } + // Phonetic replacements + for (let sRepl of char_player.get1toXReplacement(sNewWord.slice(-1), cCurrent, sRemain.slice(1,2))) { + this._suggest(oSuggResult, sRepl + sRemain.slice(1), nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, true); + } + for (let sRepl of char_player.d2toX.gl_get(sRemain.slice(0, 2), [])) { + this._suggest(oSuggResult, sRepl + sRemain.slice(2), nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, true); + } + } + // end of word + if (sRemain.length == 2) { + for (let sRepl of char_player.dFinal2.gl_get(sRemain, [])) { + this._suggest(oSuggResult, sRepl, nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, true); + } + } + else if (sRemain.length == 1) { + this._suggest(oSuggResult, "", nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, true); // remove last char and go on + for (let sRepl of char_player.dFinal1.gl_get(sRemain, [])) { + this._suggest(oSuggResult, sRepl, nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, true); + } + } + } + } + + * _getCharArcs (iAddr) { + // generator: yield all chars and addresses from node at address + for (let [nVal, jAddr] of this._getArcs(iAddr)) { + if (nVal < this.nChar) { + yield [this.dCharVal.get(nVal), jAddr]; + } + } + } + + * _getSimilarCharArcs (cChar, iAddr) { + // generator: yield similar char of and address of the following node + for (let c of char_player.d1to1.gl_get(cChar, [cChar])) { + if (this.dChar.has(c)) { + let jAddr = this._lookupArcNode(this.dChar.get(c), iAddr); + if (jAddr) { + yield [c, jAddr]; + } + } + } + } + + _getTails (iAddr, sTail="", n=2) { + // return a list of suffixes ending at a distance of from + let aTails = new Set(); + for (let [nVal, jAddr] of this._getArcs(iAddr)) { + if (nVal < this.nChar) { + if (this._convBytesToInteger(this.byDic.slice(jAddr, jAddr+this.nBytesArc)) & this._finalNodeMask) { + aTails.add(sTail + this.dCharVal.get(nVal)); + } + if (n && aTails.size == 0) { + aTails.gl_update(this._getTails(jAddr, sTail+this.dCharVal.get(nVal), n-1)); + } + } + } + return aTails; + } + + // morph (sWord) { + // is defined in constructor + // } + + // VERSION 1 + _morph1 (sWord) { + // returns morphologies of sWord + let iAddr = 0; + for (let c of sWord) { + if (!this.dChar.has(c)) { + return []; + } + iAddr = this._lookupArcNode(this.dChar.get(c), iAddr); + if (iAddr === null) { + return []; + } + } + if (this._convBytesToInteger(this.byDic.slice(iAddr, iAddr+this.nBytesArc)) & this._finalNodeMask) { + let l = []; + let nRawArc = 0; + while (!(nRawArc & this._lastArcMask)) { + let iEndArcAddr = iAddr + this.nBytesArc; + nRawArc = this._convBytesToInteger(this.byDic.slice(iAddr, iEndArcAddr)); + let nArc = nRawArc & this._arcMask; + if (nArc >= this.nChar) { + // This value is not a char, this is a stemming code + let sStem = ">" + this.funcStemming(sWord, this.lArcVal[nArc]); + // Now , we go to the next node and retrieve all following arcs values, all of them are tags + let iAddr2 = this._convBytesToInteger(this.byDic.slice(iEndArcAddr, iEndArcAddr+this.nBytesNodeAddress)); + let nRawArc2 = 0; + while (!(nRawArc2 & this._lastArcMask)) { + let iEndArcAddr2 = iAddr2 + this.nBytesArc; + nRawArc2 = this._convBytesToInteger(this.byDic.slice(iAddr2, iEndArcAddr2)); + l.push(sStem + " " + this.lArcVal[nRawArc2 & this._arcMask]); + iAddr2 = iEndArcAddr2+this.nBytesNodeAddress; + } + } + iAddr = iEndArcAddr + this.nBytesNodeAddress; + } + return l; + } + return []; + } + + _stem1 (sWord) { + // returns stems list of sWord + let iAddr = 0; + for (let c of sWord) { + if (!this.dChar.has(c)) { + return []; + } + iAddr = this._lookupArcNode(this.dChar.get(c), iAddr); + if (iAddr === null) { + return []; + } + } + if (this._convBytesToInteger(this.byDic.slice(iAddr, iAddr+this.nBytesArc)) & this._finalNodeMask) { + let l = []; + let nRawArc = 0; + while (!(nRawArc & this._lastArcMask)) { + let iEndArcAddr = iAddr + this.nBytesArc; + nRawArc = this._convBytesToInteger(this.byDic.slice(iAddr, iEndArcAddr)); + let nArc = nRawArc & this._arcMask; + if (nArc >= this.nChar) { + // This value is not a char, this is a stemming code + l.push(this.funcStemming(sWord, this.lArcVal[nArc])); + } + iAddr = iEndArcAddr + this.nBytesNodeAddress; + } + return l; + } + return []; + } + + _lookupArcNode1 (nVal, iAddr) { + // looks if nVal is an arc at the node at iAddr, if yes, returns address of next node else None + while (true) { + let iEndArcAddr = iAddr+this.nBytesArc; + let nRawArc = this._convBytesToInteger(this.byDic.slice(iAddr, iEndArcAddr)); + if (nVal == (nRawArc & this._arcMask)) { + // the value we are looking for + // we return the address of the next node + return this._convBytesToInteger(this.byDic.slice(iEndArcAddr, iEndArcAddr+this.nBytesNodeAddress)); + } + else { + // value not found + if (nRawArc & this._lastArcMask) { + return null; + } + iAddr = iEndArcAddr + this.nBytesNodeAddress; + } + } + } + + * _getArcs1 (iAddr) { + "generator: return all arcs at as tuples of (nVal, iAddr)" + while (true) { + let iEndArcAddr = iAddr+this.nBytesArc; + let nRawArc = this._convBytesToInteger(this.byDic.slice(iAddr, iEndArcAddr)); + yield [nRawArc & this._arcMask, this._convBytesToInteger(this.byDic.slice(iEndArcAddr, iEndArcAddr+this.nBytesNodeAddress))]; + if (nRawArc & this._lastArcMask) { + break; + } + iAddr = iEndArcAddr+this.nBytesNodeAddress; + } + } + + // VERSION 2 + _morph2 (sWord) { + // to do + } + + _stem2 (sWord) { + // to do + } + + _lookupArcNode2 (nVal, iAddr) { + // to do + } + + + // VERSION 3 + _morph3 (sWord) { + // to do + } + + _stem3 (sWord) { + // to do + } + + _lookupArcNode3 (nVal, iAddr) { + // to do + } +} + + +if (typeof(exports) !== 'undefined') { + exports.IBDAWG = IBDAWG; +} ADDED graphspell-js/str_transform.js Index: graphspell-js/str_transform.js ================================================================== --- graphspell-js/str_transform.js +++ graphspell-js/str_transform.js @@ -0,0 +1,121 @@ +//// STRING TRANSFORMATION +/*jslint esversion: 6*/ + +// Note: 48 is the ASCII code for "0" + +var str_transform = { + + distanceDamerauLevenshtein2: function (s1, s2) { + // distance of Damerau-Levenshtein between and + // https://fr.wikipedia.org/wiki/Distance_de_Damerau-Levenshtein + try { + let nLen1 = s1.length; + let nLen2 = s2.length; + let matrix = []; + for (let i = 0; i <= nLen1; i++) { + matrix[i] = new Array(nLen2 + 1); + } + for (let i = 0; i <= nLen1; i++) { + matrix[i][0] = i; + } + for (let j = 0; j <= nLen2; j++) { + matrix[0][j] = j; + } + for (let i = 1; i <= nLen1; i++) { + for (let j = 1; j <= nLen2; j++) { + let nCost = (s1[i] === s2[j]) ? 0 : 1; + matrix[i][j] = Math.min( + matrix[i-1][j] + 1, // Deletion + matrix[i][j-1] + 1, // Insertion + matrix[i-1][j-1] + nCost // Substitution + ); + if (i > 1 && j > 1 && s1[i] == s2[j-1] && s1[i-1] == s2[j]) { + matrix[i][j] = Math.min(matrix[i][j], matrix[i-2][j-2] + nCost); // Transposition + } + } + } + return matrix[nLen1][nLen2]; + } + catch (e) { + helpers.logerror(e); + } + }, + + distanceDamerauLevenshtein: function (s1, s2) { + // distance of Damerau-Levenshtein between and + // https://fr.wikipedia.org/wiki/Distance_de_Damerau-Levenshtein + try { + let nLen1 = s1.length; + let nLen2 = s2.length; + let INF = nLen1 + nLen2; + let matrix = []; + let sd = {}; + for (let i = 0; i < nLen1+2; i++) { + matrix[i] = new Array(nLen2+2); + } + matrix[0][0] = INF; + for (let i = 0; i <= nLen1; i++) { + matrix[i+1][1] = i; + matrix[i+1][0] = INF; + sd[s1[i]] = 0; + } + for (let j = 0; j <= nLen2; j++) { + matrix[1][j+1] = j; + matrix[0][j+1] = INF; + sd[s2[j]] = 0; + } + + for (let i = 1; i <= nLen1; i++) { + let DB = 0; + for (let j = 1; j <= nLen2; j++) { + let i1 = sd[s2[j-1]]; + let j1 = DB; + if (s1[i-1] === s2[j-1]) { + matrix[i+1][j+1] = matrix[i][j]; + DB = j; + } + else { + matrix[i+1][j+1] = Math.min(matrix[i][j], Math.min(matrix[i+1][j], matrix[i][j+1])) + 1; + } + matrix[i+1][j+1] = Math.min(matrix[i+1][j+1], matrix[i1] ? matrix[i1][j1] + (i-i1-1) + 1 + (j-j1-1) : Infinity); + } + sd[s1[i-1]] = i; + } + return matrix[nLen1+1][nLen2+1]; + } + catch (e) { + helpers.logerror(e); + } + }, + + showDistance (s1, s2) { + console.log(`Distance: ${s1} / ${s2} = ${this.distanceDamerauLevenshtein(s1, s2)})`); + }, + + getStemFromSuffixCode: function (sFlex, sSfxCode) { + // Suffix only + if (sSfxCode == "0") { + return sFlex; + } + return sSfxCode[0] == '0' ? sFlex + sSfxCode.slice(1) : sFlex.slice(0, -(sSfxCode.charCodeAt(0)-48)) + sSfxCode.slice(1); + }, + + getStemFromAffixCode: function (sFlex, sAffCode) { + // Prefix and suffix + if (sAffCode == "0") { + return sFlex; + } + if (!sAffCode.includes("/")) { + return "# error #"; + } + let [sPfxCode, sSfxCode] = sAffCode.split('/'); + sFlex = sPfxCode.slice(1) + sFlex.slice(sPfxCode.charCodeAt(0)-48); + return sSfxCode[0] == '0' ? sFlex + sSfxCode.slice(1) : sFlex.slice(0, -(sSfxCode.charCodeAt(0)-48)) + sSfxCode.slice(1); + } +}; + + +if (typeof(exports) !== 'undefined') { + exports.getStemFromSuffixCode = str_transform.getStemFromSuffixCode; + exports.getStemFromAffixCode = str_transform.getStemFromAffixCode; +} ADDED graphspell-js/tokenizer.js Index: graphspell-js/tokenizer.js ================================================================== --- graphspell-js/tokenizer.js +++ graphspell-js/tokenizer.js @@ -0,0 +1,105 @@ +// JavaScript +// Very simple tokenizer +/*jslint esversion: 6*/ +/*global require,exports*/ + +"use strict"; + + +if (typeof(require) !== 'undefined') { + var helpers = require("resource://grammalecte/helpers.js"); +} + + +const aTkzPatterns = { + // All regexps must start with ^. + "default": + [ + [/^[   \t]+/, 'SPACE'], + [/^\/(?:~|bin|boot|dev|etc|home|lib|mnt|opt|root|sbin|tmp|usr|var|Bureau|Documents|Images|Musique|Public|Téléchargements|Vidéos)(?:\/[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_.()-]+)*/, 'FOLDERUNIX'], + [/^[a-zA-Z]:\\(?:Program Files(?: \(x86\)|)|[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st.()]+)(?:\\[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_.()-]+)*/, 'FOLDERWIN'], + [/^[,.;:!?…«»“”‘’"(){}\[\]/·–—]+/, 'SEPARATOR'], + [/^[A-Z][.][A-Z][.](?:[A-Z][.])*/, 'ACRONYM'], + [/^(?:https?:\/\/|www[.]|[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_-]+[@.][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_-]{2,}[@.])[a-zA-Z0-9][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_.\/?&!%=+*"'@$#-]+/, 'LINK'], + [/^[#@][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_-]+/, 'TAG'], + [/^<[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+.*?>|<\/[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+ *>/, 'HTML'], + [/^\[\/?[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+\]/, 'PSEUDOHTML'], + [/^&\w+;(?:\w+;|)/, 'HTMLENTITY'], + [/^\d\d?h\d\d\b/, 'HOUR'], + [/^-?\d+(?:[.,]\d+|)/, 'NUM'], + [/^[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+(?:[’'`-][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+)*/, 'WORD'] + ], + "fr": + [ + [/^[   \t]+/, 'SPACE'], + [/^\/(?:~|bin|boot|dev|etc|home|lib|mnt|opt|root|sbin|tmp|usr|var|Bureau|Documents|Images|Musique|Public|Téléchargements|Vidéos)(?:\/[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_.()-]+)*/, 'FOLDERUNIX'], + [/^[a-zA-Z]:\\(?:Program Files(?: \(x86\)|)|[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st.()]+)(?:\\[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_.()-]+)*/, 'FOLDERWIN'], + [/^[,.;:!?…«»“”‘’"(){}\[\]/·–—]+/, 'SEPARATOR'], + [/^[A-Z][.][A-Z][.](?:[A-Z][.])*/, 'ACRONYM'], + [/^(?:https?:\/\/|www[.]|[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_-]+[@.][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_-]{2,}[@.])[a-zA-Z0-9][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_.\/?&!%=+*"'@$#-]+/, 'LINK'], + [/^[#@][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_-]+/, 'TAG'], + [/^<[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+.*?>|<\/[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+ *>/, 'HTML'], + [/^\[\/?[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+\]/, 'PSEUDOHTML'], + [/^&\w+;(?:\w+;|)/, 'HTMLENTITY'], + [/^(?:l|d|n|m|t|s|j|c|ç|lorsqu|puisqu|jusqu|quoiqu|qu)['’`]/i, 'ELPFX'], + [/^\d\d?[hm]\d\d\b/, 'HOUR'], + [/^\d+(?:er|nd|e|de|ième|ème|eme)s?\b/, 'ORDINAL'], + [/^-?\d+(?:[.,]\d+|)/, 'NUM'], + [/^[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+(?:[’'`-][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+)*/, 'WORD'] + ] +}; + + +class Tokenizer { + + constructor (sLang) { + this.sLang = sLang; + if (!aTkzPatterns.hasOwnProperty(sLang)) { + this.sLang = "default"; + } + this.aRules = aTkzPatterns[this.sLang]; + } + + * genTokens (sText) { + let m; + let i = 0; + while (sText) { + let nCut = 1; + for (let [zRegex, sType] of this.aRules) { + try { + if ((m = zRegex.exec(sText)) !== null) { + if (sType == 'SEPARATOR') { + for (let c of m[0]) { + yield { "sType": sType, "sValue": c, "nStart": i, "nEnd": i + m[0].length } + } + } else { + yield { "sType": sType, "sValue": m[0], "nStart": i, "nEnd": i + m[0].length } + } + nCut = m[0].length; + break; + } + } + catch (e) { + helpers.logerror(e); + } + } + i += nCut; + sText = sText.slice(nCut); + } + } + + getSpellingErrors (sText, oDict) { + let aSpellErr = []; + for (let oToken of this.genTokens(sText)) { + if (oToken.sType === 'WORD' && !oDict.isValidToken(oToken.sValue)) { + aSpellErr.push(oToken); + } + } + return aSpellErr; + } +} + + +if (typeof(exports) !== 'undefined') { + exports.Tokenizer = Tokenizer; +}