DELETED gc_core/js/char_player.js Index: gc_core/js/char_player.js ================================================================== --- gc_core/js/char_player.js +++ /dev/null @@ -1,330 +0,0 @@ -// list of similar chars -// useful for suggestion mechanism - -${map} - - -var char_player = { - - _dTransChars: new Map([ - ['à', 'a'], ['é', 'e'], ['î', 'i'], ['ô', 'o'], ['û', 'u'], ['ÿ', 'i'], ['y', 'i'], - ['â', 'a'], ['è', 'e'], ['ï', 'i'], ['ö', 'o'], ['ù', 'u'], ['ŷ', 'i'], - ['ä', 'a'], ['ê', 'e'], ['í', 'i'], ['ó', 'o'], ['ü', 'u'], ['ý', 'i'], - ['á', 'a'], ['ë', 'e'], ['ì', 'i'], ['ò', 'o'], ['ú', 'u'], ['ỳ', 'i'], - ['ā', 'a'], ['ē', 'e'], ['ī', 'i'], ['ō', 'o'], ['ū', 'u'], ['ȳ', 'i'], - ['ñ', 'n'], ['k', 'q'], ['w', 'v'], - ['œ', 'oe'], ['æ', 'ae'], - ]), - - simplifyWord: function (sWord) { - // word simplication before calculating distance between words - sWord = sWord.toLowerCase(); - let sNewWord = ""; - let i = 1; - for (let c of sWord) { - let cNew = this._dTransChars.gl_get(c, c); - let cNext = sWord.slice(i, i+1) - if (cNew != this._dTransChars.gl_get(cNext, cNext)) { - sNewWord += cNew; - } - i++; - } - return sNewWord.replace(/eau/g, "o").replace(/au/g, "o").replace(/ai/g, "e").replace(/ei/g, "e").replace(/ph/g, "f"); - }, - - aVowel: new Set("aáàâäāeéèêëēiíìîïīoóòôöōuúùûüūyýỳŷÿȳœæAÁÀÂÄĀEÉÈÊËĒIÍÌÎÏĪOÓÒÔÖŌUÚÙÛÜŪYÝỲŶŸȲŒÆ"), - aConsonant: new Set("bcçdfghjklmnñpqrstvwxzBCÇDFGHJKLMNÑPQRSTVWXZ"), - aDouble: new Set("bcdfjklmnprstzBCDFJKLMNPRSTZ"), // letters that may be used twice successively - - - // Similar chars - - d1to1: new Map([ - ["1", "liîLIÎ"], - ["2", "zZ"], - ["3", "eéèêEÉÈÊ"], - ["4", "aàâAÀÂ"], - ["5", "sgSG"], - ["6", "bdgBDG"], - ["7", "ltLT"], - ["8", "bB"], - ["9", "gbdGBD"], - ["0", "oôOÔ"], - - ["a", "aàâáäæ"], - ["A", "AÀÂÁÄÆ"], - ["à", "aàâáäæ"], - ["À", "AÀÂÁÄÆ"], - ["â", "aàâáäæ"], - ["Â", "AÀÂÁÄÆ"], - ["á", "aàâáäæ"], - ["Á", "AÀÂÁÄÆ"], - ["ä", "aàâáäæ"], - ["Ä", "AÀÂÁÄÆ"], - - ["æ", "æéa"], - ["Æ", "ÆÉA"], - - ["c", "cçskqśŝ"], - ["C", "CÇSKQŚŜ"], - ["ç", "cçskqśŝ"], - ["Ç", "CÇSKQŚŜ"], - - ["e", "eéèêëœ"], - ["E", "EÉÈÊËŒ"], - ["é", "eéèêëœ"], - ["É", "EÉÈÊËŒ"], - ["ê", "eéèêëœ"], - ["Ê", "EÉÈÊËŒ"], - ["è", "eéèêëœ"], - ["È", "EÉÈÊËŒ"], - ["ë", "eéèêëœ"], - ["Ë", "EÉÈÊËŒ"], - - ["g", "gj"], - ["G", "GJ"], - - ["i", "iîïyíìÿ"], - ["I", "IÎÏYÍÌŸ"], - ["î", "iîïyíìÿ"], - ["Î", "IÎÏYÍÌŸ"], - ["ï", "iîïyíìÿ"], - ["Ï", "IÎÏYÍÌŸ"], - ["í", "iîïyíìÿ"], - ["Í", "IÎÏYÍÌŸ"], - ["ì", "iîïyíìÿ"], - ["Ì", "IÎÏYÍÌŸ"], - - ["j", "jg"], - ["J", "JG"], - - ["k", "kcq"], - ["K", "KCQ"], - - ["n", "nñ"], - ["N", "NÑ"], - - ["o", "oôóòöœ"], - ["O", "OÔÓÒÖŒ"], - ["ô", "oôóòöœ"], - ["Ô", "OÔÓÒÖŒ"], - ["ó", "oôóòöœ"], - ["Ó", "OÔÓÒÖŒ"], - ["ò", "oôóòöœ"], - ["Ò", "OÔÓÒÖŒ"], - ["ö", "oôóòöœ"], - ["Ö", "OÔÓÒÖŒ"], - - ["œ", "œoôeéèêë"], - ["Œ", "ŒOÔEÉÈÊË"], - - ["q", "qck"], - ["Q", "QCK"], - - ["s", "sśŝcç"], - ["S", "SŚŜCÇ"], - ["ś", "sśŝcç"], - ["Ś", "SŚŜCÇ"], - ["ŝ", "sśŝcç"], - ["Ŝ", "SŚŜCÇ"], - - ["u", "uûùüú"], - ["U", "UÛÙÜÚ"], - ["û", "uûùüú"], - ["Û", "UÛÙÜÚ"], - ["ù", "uûùüú"], - ["Ù", "UÛÙÜÚ"], - ["ü", "uûùüú"], - ["Ü", "UÛÙÜÚ"], - ["ú", "uûùüú"], - ["Ú", "UÛÙÜÚ"], - - ["v", "vw"], - ["V", "VW"], - - ["w", "wv"], - ["W", "WV"], - - ["x", "xck"], - ["X", "XCK"], - - ["y", "yÿiîŷýỳ"], - ["Y", "YŸIÎŶÝỲ"], - ["ÿ", "yÿiîŷýỳ"], - ["Ÿ", "YŸIÎŶÝỲ"], - ["ŷ", "yÿiîŷýỳ"], - ["Ŷ", "YŸIÎŶÝỲ"], - ["ý", "yÿiîŷýỳ"], - ["Ý", "YŸIÎŶÝỲ"], - ["ỳ", "yÿiîŷýỳ"], - ["Ỳ", "YŸIÎŶÝỲ"], - - ["z", "zs"], - ["Z", "ZS"], - ]), - - d1toX: new Map([ - ["æ", ["ae",]], - ["Æ", ["AE",]], - ["b", 
["bb",]], - ["B", ["BB",]], - ["c", ["cc", "ss", "qu", "ch"]], - ["C", ["CC", "SS", "QU", "CH"]], - ["d", ["dd",]], - ["D", ["DD",]], - ["é", ["ai", "ei"]], - ["É", ["AI", "EI"]], - ["f", ["ff", "ph"]], - ["F", ["FF", "PH"]], - ["g", ["gu", "ge", "gg", "gh"]], - ["G", ["GU", "GE", "GG", "GH"]], - ["j", ["jj", "dj"]], - ["J", ["JJ", "DJ"]], - ["k", ["qu", "ck", "ch", "cu", "kk", "kh"]], - ["K", ["QU", "CK", "CH", "CU", "KK", "KH"]], - ["l", ["ll",]], - ["L", ["LL",]], - ["m", ["mm", "mn"]], - ["M", ["MM", "MN"]], - ["n", ["nn", "nm", "mn"]], - ["N", ["NN", "NM", "MN"]], - ["o", ["au", "eau"]], - ["O", ["AU", "EAU"]], - ["œ", ["oe", "eu"]], - ["Œ", ["OE", "EU"]], - ["p", ["pp", "ph"]], - ["P", ["PP", "PH"]], - ["q", ["qu", "ch", "cq", "ck", "kk"]], - ["Q", ["QU", "CH", "CQ", "CK", "KK"]], - ["r", ["rr",]], - ["R", ["RR",]], - ["s", ["ss", "sh"]], - ["S", ["SS", "SH"]], - ["t", ["tt", "th"]], - ["T", ["TT", "TH"]], - ["x", ["cc", "ct", "xx"]], - ["X", ["CC", "CT", "XX"]], - ["z", ["ss", "zh"]], - ["Z", ["SS", "ZH"]], - ]), - - get1toXReplacement: function (cPrev, cCur, cNext) { - if (this.aConsonant.has(cCur) && (this.aConsonant.has(cPrev) || this.aConsonant.has(cNext))) { - return []; - } - return this.d1toX.gl_get(cCur, []); - }, - - d2toX: new Map([ - ["am", ["an", "en", "em"]], - ["AM", ["AN", "EN", "EM"]], - ["an", ["am", "en", "em"]], - ["AN", ["AM", "EN", "EM"]], - ["au", ["eau", "o", "ô"]], - ["AU", ["EAU", "O", "Ô"]], - ["em", ["an", "am", "en"]], - ["EM", ["AN", "AM", "EN"]], - ["en", ["an", "am", "em"]], - ["EN", ["AN", "AM", "EM"]], - ["ai", ["ei", "é", "è", "ê", "ë"]], - ["AI", ["EI", "É", "È", "Ê", "Ë"]], - ["ei", ["ai", "é", "è", "ê", "ë"]], - ["EI", ["AI", "É", "È", "Ê", "Ë"]], - ["ch", ["sh", "c", "ss"]], - ["CH", ["SH", "C", "SS"]], - ["ct", ["x", "cc"]], - ["CT", ["X", "CC"]], - ["oa", ["oi",]], - ["OA", ["OI",]], - ["oi", ["oa", "oie"]], - ["OI", ["OA", "OIE"]], - ["ph", ["f",]], - ["PH", ["F",]], - ["qu", ["q", "cq", "ck", "c", "k"]], - ["QU", ["Q", "CQ", "CK", "C", "K"]], - ["ss", ["c", "ç"]], - ["SS", ["C", "Ç"]], - ["un", ["ein",]], - ["UN", ["EIN",]], - ]), - - // End of word - dFinal1: new Map([ - ["a", ["as", "at", "ant", "ah"]], - ["A", ["AS", "AT", "ANT", "AH"]], - ["c", ["ch",]], - ["C", ["CH",]], - ["e", ["et", "er", "ets", "ée", "ez", "ai", "ais", "ait", "ent", "eh"]], - ["E", ["ET", "ER", "ETS", "ÉE", "EZ", "AI", "AIS", "AIT", "ENT", "EH"]], - ["é", ["et", "er", "ets", "ée", "ez", "ai", "ais", "ait"]], - ["É", ["ET", "ER", "ETS", "ÉE", "EZ", "AI", "AIS", "AIT"]], - ["è", ["et", "er", "ets", "ée", "ez", "ai", "ais", "ait"]], - ["È", ["ET", "ER", "ETS", "ÉE", "EZ", "AI", "AIS", "AIT"]], - ["ê", ["et", "er", "ets", "ée", "ez", "ai", "ais", "ait"]], - ["Ê", ["ET", "ER", "ETS", "ÉE", "EZ", "AI", "AIS", "AIT"]], - ["ë", ["et", "er", "ets", "ée", "ez", "ai", "ais", "ait"]], - ["Ë", ["ET", "ER", "ETS", "ÉE", "EZ", "AI", "AIS", "AIT"]], - ["g", ["gh",]], - ["G", ["GH",]], - ["i", ["is", "it", "ie", "in"]], - ["I", ["IS", "IT", "IE", "IN"]], - ["n", ["nt", "nd", "ns", "nh"]], - ["N", ["NT", "ND", "NS", "NH"]], - ["o", ["aut", "ot", "os"]], - ["O", ["AUT", "OT", "OS"]], - ["ô", ["aut", "ot", "os"]], - ["Ô", ["AUT", "OT", "OS"]], - ["ö", ["aut", "ot", "os"]], - ["Ö", ["AUT", "OT", "OS"]], - ["p", ["ph",]], - ["P", ["PH",]], - ["s", ["sh",]], - ["S", ["SH",]], - ["t", ["th",]], - ["T", ["TH",]], - ["u", ["ut", "us", "uh"]], - ["U", ["UT", "US", "UH"]], - ]), - - dFinal2: new Map([ - ["ai", ["aient", "ais", "et"]], - ["AI", ["AIENT", "AIS", "ET"]], - ["an", ["ant", "ent"]], 
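/*
    A quick sanity sketch of the similarity machinery above (expected results,
    assuming char_player and the jsex_* extensions are loaded; not part of the
    original file):
*/
console.log(char_player.simplifyWord("photo"));              // "foto"  ("ph" → "f")
console.log(char_player.simplifyWord("bateau"));             // "bato"  ("eau" → "o")
console.log(char_player.get1toXReplacement("a", "c", "a"));  // ["cc", "ss", "qu", "ch"]
console.log(char_player.get1toXReplacement("a", "c", "t"));  // [] — "c" flanked by a consonant, multi-char respellings are skipped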
-        ["AN", ["ANT", "ENT"]],
-        ["en", ["ent", "ant"]],
-        ["EN", ["ENT", "ANT"]],
-        ["ei", ["ait", "ais"]],
-        ["EI", ["AIT", "AIS"]],
-        ["on", ["ons", "ont"]],
-        ["ON", ["ONS", "ONT"]],
-        ["oi", ["ois", "oit", "oix"]],
-        ["OI", ["OIS", "OIT", "OIX"]],
-    ]),
-
-
-    // Prefixes and suffixes
-    aPfx1: new Set([
-        "anti", "archi", "contre", "hyper", "mé", "méta", "im", "in", "ir", "par", "proto",
-        "pseudo", "pré", "re", "ré", "sans", "sous", "supra", "sur", "ultra"
-    ]),
-
-    aPfx2: new Set([
-        "belgo", "franco", "génito", "gynéco", "médico", "russo"
-    ]),
-
-
-    cut: function (sWord) {
-        // returns an array of strings (prefix, trimmed_word, suffix)
-        let m = /^([a-zA-Zà-öÀ-Ö0-9_ø-ÿØ-ßĀ-ʯfi-st]+)(-(?:t-|)(?:ils?|elles|on|je|tu|nous|vous)$)/.exec(sWord);
-        if (m) {
-            return ["", m[1], m[2]];
-        }
-        return ["", sWord, ""];
-    },
-
-    // Other functions
-    filterSugg: function (aSugg) {
-        return aSugg.filter((sSugg) => { return !sSugg.endsWith("è") && !sSugg.endsWith("È"); });
-    }
-
-}

DELETED gc_core/js/helpers.js
Index: gc_core/js/helpers.js
==================================================================
--- gc_core/js/helpers.js
+++ /dev/null
@@ -1,100 +0,0 @@
-
-// HELPERS
-/*jslint esversion: 6*/
-/*global console,require,exports,XMLHttpRequest*/
-
-"use strict";
-
-// In Firefox, there is no console.log in PromiseWorker, but there is worker.log.
-// In Thunderbird, you can’t access the console directly, so a log function must be passed in.
-let funcOutput = null;
-
-var helpers = {
-
-    setLogOutput: function (func) {
-        funcOutput = func;
-    },
-
-    echo: function (obj) {
-        if (funcOutput !== null) {
-            funcOutput(obj);
-        } else {
-            console.log(obj);
-        }
-        return true;
-    },
-
-    logerror: function (e, bStack=false) {
-        let sMsg = "\n" + e.fileName + "\n" + e.name + "\nline: " + e.lineNumber + "\n" + e.message;
-        if (bStack) {
-            sMsg += "\n--- Stack ---\n" + e.stack;
-        }
-        if (funcOutput !== null) {
-            funcOutput(sMsg);
-        } else {
-            console.error(sMsg);
-        }
-    },
-
-    inspect: function (o) {
-        let sMsg = "__inspect__: " + typeof o;
-        for (let sParam in o) {
-            sMsg += "\n" + sParam + ": " + o[sParam];
-        }
-        sMsg += "\n" + JSON.stringify(o) + "\n__end__";
-        this.echo(sMsg);
-    },
-
-    loadFile: function (spf) {
-        // load resources in workers (suggested by Mozilla extensions reviewers)
-        // for more options have a look here: https://gist.github.com/Noitidart/ec1e6b9a593ec7e3efed
-        // if not in workers, use sdk/data.load() instead
-        try {
-            let xRequest;
-            if (typeof XMLHttpRequest !== "undefined") {
-                xRequest = new XMLHttpRequest();
-            } else {
-                // JS sucks again… necessary for Thunderbird
-                let { Cc, Ci } = require("chrome");
-                xRequest = Cc["@mozilla.org/xmlextras/xmlhttprequest;1"].createInstance();
-                xRequest.QueryInterface(Ci.nsIXMLHttpRequest);
-            }
-            xRequest.open('GET', spf, false); // 3rd arg is false for synchronous; sync is acceptable in workers
-            xRequest.overrideMimeType('text/json');
-            xRequest.send();
-            return xRequest.responseText;
-        }
-        catch (e) {
-            this.logerror(e);
-            return null;
-        }
-    },
-
-    // conversions
-    objectToMap: function (obj) {
-        let m = new Map();
-        for (let param in obj) {
-            m.set(param, obj[param]);
-        }
-        return m;
-    },
-
-    mapToObject: function (m) {
-        let obj = {};
-        for (let [k, v] of m) {
-            obj[k] = v;
-        }
-        return obj;
-    }
-};
-
-
-if (typeof(exports) !== 'undefined') {
-    exports.setLogOutput = helpers.setLogOutput;
-    exports.echo = helpers.echo;
-    exports.logerror = helpers.logerror;
-    exports.inspect = helpers.inspect;
-    exports.loadFile = helpers.loadFile;
-    exports.objectToMap = helpers.objectToMap;
-    exports.mapToObject = helpers.mapToObject;
-}

DELETED gc_core/js/ibdawg.js
Index: gc_core/js/ibdawg.js
==================================================================
--- gc_core/js/ibdawg.js
+++ /dev/null
@@ -1,513 +0,0 @@
-//// IBDAWG
-/*jslint esversion: 6*/
-/*global console,require,exports*/
-
-"use strict";
-
-
-if (typeof(require) !== 'undefined') {
-    var str_transform = require("resource://grammalecte/str_transform.js");
-    var helpers = require("resource://grammalecte/helpers.js");
-    var char_player = require("resource://grammalecte/char_player.js");
-}
-
-
-// Don’t remove the lines below. Necessary in TB.
-${string}
-${map}
-${set}
-
-
-class SuggResult {
-    // Structure for storing, classifying and filtering suggestions
-
-    constructor (sWord, nDistLimit=-1) {
-        this.sWord = sWord;
-        this.sSimplifiedWord = char_player.simplifyWord(sWord);
-        this.nDistLimit = (nDistLimit >= 0) ? nDistLimit : Math.floor(sWord.length / 3) + 1;
-        this.nMinDist = 1000;
-        this.aSugg = new Set();
-        this.dSugg = new Map([ [0, []], [1, []], [2, []] ]);
-    }
-
-    addSugg (sSugg, nDeep=0) {
-        // add a suggestion
-        if (!this.aSugg.has(sSugg)) {
-            let nDist = str_transform.distanceDamerauLevenshtein(this.sSimplifiedWord, char_player.simplifyWord(sSugg));
-            if (nDist <= this.nDistLimit) {
-                if (!this.dSugg.has(nDist)) {
-                    this.dSugg.set(nDist, []);
-                }
-                this.dSugg.get(nDist).push(sSugg);
-                this.aSugg.add(sSugg);
-                if (nDist < this.nMinDist) {
-                    this.nMinDist = nDist;
-                }
-                this.nDistLimit = Math.min(this.nDistLimit, this.nMinDist+2);
-            }
-        }
-    }
-
-    getSuggestions (nSuggLimit=10, nDistLimit=-1) {
-        // return a list of suggestions
-        let lRes = [];
-        if (this.dSugg.get(0).length) {
-            // sort the best candidates by their true distance to the original word
-            let dDistTemp = new Map();
-            this.dSugg.get(0).forEach((sSugg) => { dDistTemp.set(sSugg, str_transform.distanceDamerauLevenshtein(this.sWord, sSugg)); });
-            this.dSugg.get(0).sort((sA, sB) => { return dDistTemp.get(sA) - dDistTemp.get(sB); });
-            dDistTemp.clear();
-        }
-        for (let lSugg of this.dSugg.values()) {
-            for (let sSugg of lSugg) { lRes.push(sSugg); }
-            if (lRes.length > nSuggLimit) {
-                break;
-            }
-        }
-        lRes = char_player.filterSugg(lRes);
-        if (this.sWord.gl_isTitle()) {
-            lRes = lRes.map((sSugg) => { return sSugg.gl_toCapitalize(); });
-        }
-        else if (this.sWord.gl_isUpperCase()) {
-            lRes = lRes.map((sSugg) => { return sSugg.toUpperCase(); });
-        }
-        return lRes.slice(0, nSuggLimit);
-    }
-
-    reset () {
-        this.aSugg.clear();
-        this.dSugg.clear();
-    }
-}
-
-
-class IBDAWG {
-    // INDEXABLE BINARY DIRECT ACYCLIC WORD GRAPH
-
-    constructor (sDicName, sPath="") {
-        try {
-            let sURL = (sPath !== "") ? sPath + "/" + sDicName : "resource://grammalecte/_dictionaries/"+sDicName;
-            const dict = JSON.parse(helpers.loadFile(sURL));
-            Object.assign(this, dict);
-        }
-        catch (e) {
-            throw Error("# Error. File not found or not loadable.\n" + e.message + "\n");
-        }
-        /*
-            Properties:
-            sName, nVersion, sHeader, lArcVal, nArcVal, byDic, sLang, nChar, nBytesArc, nBytesNodeAddress,
-            nEntries, nNode, nArc, nAff, cStemming, nTag, dChar, _arcMask, _finalNodeMask, _lastArcMask, _addrBitMask, nBytesOffset,
-        */
-
-        /*
-            Bug workaround.
-            Mozilla’s JS parser sucks. Can’t read files bigger than 4 MB!
- So we convert huge hexadecimal string to list of numbers… - https://github.com/mozilla/addons-linter/issues/1361 - */ - let lTemp = []; - for (let i = 0; i < this.byDic.length; i+=2) { - lTemp.push(parseInt(this.byDic.slice(i, i+2), 16)); - } - this.byDic = lTemp; - /* end of bug workaround */ - - if (!this.sHeader.startsWith("/pyfsa/")) { - throw TypeError("# Error. Not a pyfsa binary dictionary. Header: " + this.sHeader); - } - if (!(this.nVersion == "1" || this.nVersion == "2" || this.nVersion == "3")) { - throw RangeError("# Error. Unknown dictionary version: " + this.nVersion); - } - // to get the value of an arc, to get the char of an arc with its value - this.dChar = helpers.objectToMap(this.dChar); - this.dCharVal = this.dChar.gl_reverse(); - //this.byDic = new Uint8Array(this.byDic); // not quicker, even slower - - if (this.cStemming == "S") { - this.funcStemming = str_transform.getStemFromSuffixCode; - } else if (this.cStemming == "A") { - this.funcStemming = str_transform.getStemFromAffixCode; - } else { - this.funcStemming = str_transform.noStemming; - } - - // Configuring DAWG functions according to nVersion - switch (this.nVersion) { - case 1: - this.morph = this._morph1; - this.stem = this._stem1; - this._lookupArcNode = this._lookupArcNode1; - this._getArcs = this._getArcs1; - this._writeNodes = this._writeNodes1; - break; - case 2: - this.morph = this._morph2; - this.stem = this._stem2; - this._lookupArcNode = this._lookupArcNode2; - this._getArcs = this._getArcs2; - this._writeNodes = this._writeNodes2; - break; - case 3: - this.morph = this._morph3; - this.stem = this._stem3; - this._lookupArcNode = this._lookupArcNode3; - this._getArcs = this._getArcs3; - this._writeNodes = this._writeNodes3; - break; - default: - throw ValueError("# Error: unknown code: " + this.nVersion); - } - //console.log(this.getInfo()); - this.bOptNumSigle = true; - this.bOptNumAtLast = false; - } - - getInfo () { - return ` Language: ${this.sLang} Version: ${this.nVersion} Stemming: ${this.cStemming}FX\n` + - ` Arcs values: ${this.nArcVal} = ${this.nChar} characters, ${this.nAff} affixes, ${this.nTag} tags\n` + - ` Dictionary: ${this.nEntries} entries, ${this.nNode} nodes, ${this.nArc} arcs\n` + - ` Address size: ${this.nBytesNodeAddress} bytes, Arc size: ${this.nBytesArc} bytes\n`; - } - - isValidToken (sToken) { - // checks if sToken is valid (if there is hyphens in sToken, sToken is split, each part is checked) - if (this.isValid(sToken)) { - return true; - } - if (sToken.includes("-")) { - if (sToken.gl_count("-") > 4) { - return true; - } - return sToken.split("-").every(sWord => this.isValid(sWord)); - } - return false; - } - - isValid (sWord) { - // checks if sWord is valid (different casing tested if the first letter is a capital) - if (!sWord) { - return null; - } - if (sWord.includes("’")) { // ugly hack - sWord = sWord.replace("’", "'"); - } - if (this.lookup(sWord)) { - return true; - } - if (sWord.charAt(0).gl_isUpperCase()) { - if (sWord.length > 1) { - if (sWord.gl_isTitle()) { - return !!this.lookup(sWord.toLowerCase()); - } - if (sWord.gl_isUpperCase()) { - if (this.bOptNumSigle) { - return true; - } - return !!(this.lookup(sWord.toLowerCase()) || this.lookup(sWord.gl_toCapitalize())); - } - return !!this.lookup(sWord.slice(0, 1).toLowerCase() + sWord.slice(1)); - } else { - return !!this.lookup(sWord.toLowerCase()); - } - } - return false; - } - - _convBytesToInteger (aBytes) { - // Byte order = Big Endian (bigger first) - let nVal = 0; - let nWeight = (aBytes.length - 1) * 
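/*
    A worked example of the big-endian conversion defined here (illustrative
    values, not taken from a real dictionary): for aBytes = [0x01, 0x02],
    nWeight starts at (2-1)*8 = 8, so nVal = (0x01 << 8) + (0x02 << 0) = 258.
    The integers read this way are then tested against bit masks such as
    _finalNodeMask and _lastArcMask, whose values come from the loaded
    dictionary file itself.
*/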
8; - for (let n of aBytes) { - nVal += n << nWeight; - nWeight = nWeight - 8; - } - return nVal; - } - - lookup (sWord) { - // returns true if sWord in dictionary (strict verification) - let iAddr = 0; - for (let c of sWord) { - if (!this.dChar.has(c)) { - return false; - } - iAddr = this._lookupArcNode(this.dChar.get(c), iAddr); - if (iAddr === null) { - return false; - } - } - return Boolean(this._convBytesToInteger(this.byDic.slice(iAddr, iAddr+this.nBytesArc)) & this._finalNodeMask); - } - - getMorph (sWord) { - // retrieves morphologies list, different casing allowed - let l = this.morph(sWord); - if (sWord[0].gl_isUpperCase()) { - l = l.concat(this.morph(sWord.toLowerCase())); - if (sWord.gl_isUpperCase() && sWord.length > 1) { - l = l.concat(this.morph(sWord.gl_toCapitalize())); - } - } - return l; - } - - suggest (sWord, nSuggLimit=10) { - // returns a array of suggestions for - let sPfx = ""; - let sSfx = ""; - [sPfx, sWord, sSfx] = char_player.cut(sWord); - let nMaxSwitch = Math.max(Math.floor(sWord.length / 3), 1); - let nMaxDel = Math.floor(sWord.length / 5); - let nMaxHardRepl = Math.max(Math.floor((sWord.length - 5) / 4), 1); - let oSuggResult = new SuggResult(sWord); - this._suggest(oSuggResult, sWord, nMaxSwitch, nMaxDel, nMaxHardRepl); - if (sWord.gl_isTitle()) { - this._suggest(oSuggResult, sWord.toLowerCase(), nMaxSwitch, nMaxDel, nMaxHardRepl); - } - else if (sWord.gl_isLowerCase()) { - this._suggest(oSuggResult, sWord.gl_toCapitalize(), nMaxSwitch, nMaxDel, nMaxHardRepl); - } - let aSugg = oSuggResult.getSuggestions(nSuggLimit); - if (sSfx || sPfx) { - // we add what we removed - return aSugg.map( (sSugg) => { return sPfx + sSugg + sSfx } ); - } - return aSugg; - } - - _suggest (oSuggResult, sRemain, nMaxSwitch=0, nMaxDel=0, nMaxHardRepl=0, nDeep=0, iAddr=0, sNewWord="", bAvoidLoop=false) { - // returns a set of suggestions - // recursive function - if (sRemain == "") { - if (this._convBytesToInteger(this.byDic.slice(iAddr, iAddr+this.nBytesArc)) & this._finalNodeMask) { - oSuggResult.addSugg(sNewWord); - } - for (let sTail of this._getTails(iAddr)) { - oSuggResult.addSugg(sNewWord+sTail); - } - return; - } - let cCurrent = sRemain.slice(0, 1); - for (let [cChar, jAddr] of this._getCharArcs(iAddr)) { - if (char_player.d1to1.gl_get(cCurrent, cCurrent).indexOf(cChar) != -1) { - this._suggest(oSuggResult, sRemain.slice(1), nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, jAddr, sNewWord+cChar); - } - else if (!bAvoidLoop && nMaxHardRepl) { - this._suggest(oSuggResult, sRemain.slice(1), nMaxSwitch, nMaxDel, nMaxHardRepl-1, nDeep+1, jAddr, sNewWord+cChar, true); - } - } - if (!bAvoidLoop) { // avoid infinite loop - if (sRemain.length > 1) { - if (cCurrent == sRemain.slice(1, 2)) { - // same char, we remove 1 char without adding 1 to - this._suggest(oSuggResult, sRemain.slice(1), nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord); - } - else { - // switching chars - if (nMaxSwitch > 0) { - this._suggest(oSuggResult, sRemain.slice(1, 2)+sRemain.slice(0, 1)+sRemain.slice(2), nMaxSwitch-1, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, true); - } - // delete char - if (nMaxDel > 0) { - this._suggest(oSuggResult, sRemain.slice(1), nMaxSwitch, nMaxDel-1, nMaxHardRepl, nDeep+1, iAddr, sNewWord, true); - } - } - // Phonetic replacements - for (let sRepl of char_player.get1toXReplacement(sNewWord.slice(-1), cCurrent, sRemain.slice(1,2))) { - this._suggest(oSuggResult, sRepl + sRemain.slice(1), nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, true); - } - for (let 
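/*
    To get a feel for the search budget set up in suggest() above: for an
    8-letter word, nMaxSwitch = max(floor(8/3), 1) = 2 transpositions,
    nMaxDel = floor(8/5) = 1 deletion, nMaxHardRepl = max(floor((8-5)/4), 1) = 1
    arbitrary replacement, and SuggResult caps the Damerau-Levenshtein distance
    of accepted candidates at floor(8/3) + 1 = 3.
*/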
sRepl of char_player.d2toX.gl_get(sRemain.slice(0, 2), [])) { - this._suggest(oSuggResult, sRepl + sRemain.slice(2), nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, true); - } - } - // end of word - if (sRemain.length == 2) { - for (let sRepl of char_player.dFinal2.gl_get(sRemain, [])) { - this._suggest(oSuggResult, sRepl, nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, true); - } - } - else if (sRemain.length == 1) { - this._suggest(oSuggResult, "", nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, true); // remove last char and go on - for (let sRepl of char_player.dFinal1.gl_get(sRemain, [])) { - this._suggest(oSuggResult, sRepl, nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, true); - } - } - } - } - - * _getCharArcs (iAddr) { - // generator: yield all chars and addresses from node at address - for (let [nVal, jAddr] of this._getArcs(iAddr)) { - if (nVal < this.nChar) { - yield [this.dCharVal.get(nVal), jAddr]; - } - } - } - - * _getSimilarCharArcs (cChar, iAddr) { - // generator: yield similar char of and address of the following node - for (let c of char_player.d1to1.gl_get(cChar, [cChar])) { - if (this.dChar.has(c)) { - let jAddr = this._lookupArcNode(this.dChar.get(c), iAddr); - if (jAddr) { - yield [c, jAddr]; - } - } - } - } - - _getTails (iAddr, sTail="", n=2) { - // return a list of suffixes ending at a distance of from - let aTails = new Set(); - for (let [nVal, jAddr] of this._getArcs(iAddr)) { - if (nVal < this.nChar) { - if (this._convBytesToInteger(this.byDic.slice(jAddr, jAddr+this.nBytesArc)) & this._finalNodeMask) { - aTails.add(sTail + this.dCharVal.get(nVal)); - } - if (n && aTails.size == 0) { - aTails.gl_update(this._getTails(jAddr, sTail+this.dCharVal.get(nVal), n-1)); - } - } - } - return aTails; - } - - // morph (sWord) { - // is defined in constructor - // } - - // VERSION 1 - _morph1 (sWord) { - // returns morphologies of sWord - let iAddr = 0; - for (let c of sWord) { - if (!this.dChar.has(c)) { - return []; - } - iAddr = this._lookupArcNode(this.dChar.get(c), iAddr); - if (iAddr === null) { - return []; - } - } - if (this._convBytesToInteger(this.byDic.slice(iAddr, iAddr+this.nBytesArc)) & this._finalNodeMask) { - let l = []; - let nRawArc = 0; - while (!(nRawArc & this._lastArcMask)) { - let iEndArcAddr = iAddr + this.nBytesArc; - nRawArc = this._convBytesToInteger(this.byDic.slice(iAddr, iEndArcAddr)); - let nArc = nRawArc & this._arcMask; - if (nArc >= this.nChar) { - // This value is not a char, this is a stemming code - let sStem = ">" + this.funcStemming(sWord, this.lArcVal[nArc]); - // Now , we go to the next node and retrieve all following arcs values, all of them are tags - let iAddr2 = this._convBytesToInteger(this.byDic.slice(iEndArcAddr, iEndArcAddr+this.nBytesNodeAddress)); - let nRawArc2 = 0; - while (!(nRawArc2 & this._lastArcMask)) { - let iEndArcAddr2 = iAddr2 + this.nBytesArc; - nRawArc2 = this._convBytesToInteger(this.byDic.slice(iAddr2, iEndArcAddr2)); - l.push(sStem + " " + this.lArcVal[nRawArc2 & this._arcMask]); - iAddr2 = iEndArcAddr2+this.nBytesNodeAddress; - } - } - iAddr = iEndArcAddr + this.nBytesNodeAddress; - } - return l; - } - return []; - } - - _stem1 (sWord) { - // returns stems list of sWord - let iAddr = 0; - for (let c of sWord) { - if (!this.dChar.has(c)) { - return []; - } - iAddr = this._lookupArcNode(this.dChar.get(c), iAddr); - if (iAddr === null) { - return []; - } - } - if (this._convBytesToInteger(this.byDic.slice(iAddr, iAddr+this.nBytesArc)) & this._finalNodeMask) 
{ - let l = []; - let nRawArc = 0; - while (!(nRawArc & this._lastArcMask)) { - let iEndArcAddr = iAddr + this.nBytesArc; - nRawArc = this._convBytesToInteger(this.byDic.slice(iAddr, iEndArcAddr)); - let nArc = nRawArc & this._arcMask; - if (nArc >= this.nChar) { - // This value is not a char, this is a stemming code - l.push(this.funcStemming(sWord, this.lArcVal[nArc])); - } - iAddr = iEndArcAddr + this.nBytesNodeAddress; - } - return l; - } - return []; - } - - _lookupArcNode1 (nVal, iAddr) { - // looks if nVal is an arc at the node at iAddr, if yes, returns address of next node else None - while (true) { - let iEndArcAddr = iAddr+this.nBytesArc; - let nRawArc = this._convBytesToInteger(this.byDic.slice(iAddr, iEndArcAddr)); - if (nVal == (nRawArc & this._arcMask)) { - // the value we are looking for - // we return the address of the next node - return this._convBytesToInteger(this.byDic.slice(iEndArcAddr, iEndArcAddr+this.nBytesNodeAddress)); - } - else { - // value not found - if (nRawArc & this._lastArcMask) { - return null; - } - iAddr = iEndArcAddr + this.nBytesNodeAddress; - } - } - } - - * _getArcs1 (iAddr) { - "generator: return all arcs at as tuples of (nVal, iAddr)" - while (true) { - let iEndArcAddr = iAddr+this.nBytesArc; - let nRawArc = this._convBytesToInteger(this.byDic.slice(iAddr, iEndArcAddr)); - yield [nRawArc & this._arcMask, this._convBytesToInteger(this.byDic.slice(iEndArcAddr, iEndArcAddr+this.nBytesNodeAddress))]; - if (nRawArc & this._lastArcMask) { - break; - } - iAddr = iEndArcAddr+this.nBytesNodeAddress; - } - } - - // VERSION 2 - _morph2 (sWord) { - // to do - } - - _stem2 (sWord) { - // to do - } - - _lookupArcNode2 (nVal, iAddr) { - // to do - } - - - // VERSION 3 - _morph3 (sWord) { - // to do - } - - _stem3 (sWord) { - // to do - } - - _lookupArcNode3 (nVal, iAddr) { - // to do - } -} - - -if (typeof(exports) !== 'undefined') { - exports.IBDAWG = IBDAWG; -} DELETED gc_core/js/jsex_map.js Index: gc_core/js/jsex_map.js ================================================================== --- gc_core/js/jsex_map.js +++ /dev/null @@ -1,56 +0,0 @@ - -// Map -/*jslint esversion: 6*/ - -if (Map.prototype.grammalecte === undefined) { - Map.prototype.gl_shallowCopy = function () { - let oNewMap = new Map(); - for (let [key, val] of this.entries()) { - oNewMap.set(key, val); - } - return oNewMap; - }; - - Map.prototype.gl_get = function (key, defaultValue) { - let res = this.get(key); - if (res !== undefined) { - return res; - } - return defaultValue; - }; - - Map.prototype.gl_toString = function () { - // Default .toString() gives nothing useful - let sRes = "{ "; - for (let [k, v] of this.entries()) { - sRes += (typeof k === "string") ? '"' + k + '": ' : k.toString() + ": "; - sRes += (typeof v === "string") ? 
'"' + v + '", ' : v.toString() + ", "; - } - sRes = sRes.slice(0, -2) + " }"; - return sRes; - }; - - Map.prototype.gl_update = function (dDict) { - for (let [k, v] of dDict.entries()) { - this.set(k, v); - } - }; - - Map.prototype.gl_updateOnlyExistingKeys = function (dDict) { - for (let [k, v] of dDict.entries()) { - if (this.has(k)){ - this.set(k, v); - } - } - }; - - Map.prototype.gl_reverse = function () { - let dNewMap = new Map(); - this.forEach((val, key) => { - dNewMap.set(val, key); - }); - return dNewMap; - }; - - Map.prototype.grammalecte = true; -} DELETED gc_core/js/jsex_regex.js Index: gc_core/js/jsex_regex.js ================================================================== --- gc_core/js/jsex_regex.js +++ /dev/null @@ -1,90 +0,0 @@ - -// regex -/*jslint esversion: 6*/ - -if (RegExp.prototype.grammalecte === undefined) { - RegExp.prototype.gl_exec2 = function (sText, aGroupsPos, aNegLookBefore=null) { - let m; - while ((m = this.exec(sText)) !== null) { - // we have to iterate over sText here too - // because first match doesn’t imply it’s a valid match according to negative lookbefore assertions, - // and even if first match is finally invalid, it doesn’t mean the following eligible matchs would be invalid too. - if (aNegLookBefore !== null) { - // check negative look before assertions - if ( !aNegLookBefore.some(sRegEx => (RegExp.leftContext.search(sRegEx) >= 0)) ) { - break; - } - } else { - break; - } - } - if (m === null) { - return null; - } - - let codePos; - let iPos = 0; - m.start = [m.index]; - m.end = [this.lastIndex]; - try { - if (m.length > 1) { - // there is subgroup(s) - if (aGroupsPos !== null) { - // aGroupsPos is defined - for (let i = 1; i <= m.length-1; i++) { - codePos = aGroupsPos[i-1]; - if (typeof codePos === "number") { - // position as a number - m.start.push(m.index + codePos); - m.end.push(m.index + codePos + m[i].length); - } else if (codePos === "$") { - // at the end of the pattern - m.start.push(this.lastIndex - m[i].length); - m.end.push(this.lastIndex); - } else if (codePos === "w") { - // word in the middle of the pattern - iPos = m[0].search("[ ’,()«»“”]"+m[i]+"[ ,’()«»“”]") + 1 + m.index; - m.start.push(iPos); - m.end.push(iPos + m[i].length); - } else if (codePos === "*") { - // anywhere - iPos = m[0].indexOf(m[i]) + m.index; - m.start.push(iPos); - m.end.push(iPos + m[i].length); - } else if (codePos === "**") { - // anywhere after previous group - iPos = m[0].indexOf(m[i], m.end[i-1]-m.index) + m.index; - m.start.push(iPos); - m.end.push(iPos + m[i].length); - } else if (codePos.startsWith(">")) { - // >x:_ - // todo: look in substring x - iPos = m[0].indexOf(m[i]) + m.index; - m.start.push(iPos); - m.end.push(iPos + m[i].length); - } else { - console.error("# Error: unknown positioning code in regex [" + this.source + "], for group[" + i.toString() +"], code: [" + codePos + "]"); - } - } - } else { - // no aGroupsPos - for (let subm of m.slice(1)) { - iPos = m[0].indexOf(subm) + m.index; - m.start.push(iPos); - m.end.push(iPos + subm.length); - } - } - } - } - catch (e) { - if (typeof(helpers) !== "undefined") { - helpers.logerror(e); - } else { - console.error(e); - } - } - return m; - }; - - RegExp.prototype.grammalecte = true; -} DELETED gc_core/js/jsex_set.js Index: gc_core/js/jsex_set.js ================================================================== --- gc_core/js/jsex_set.js +++ /dev/null @@ -1,13 +0,0 @@ - -// Set -/*jslint esversion: 6*/ - -if (Set.prototype.grammalecte === undefined) { - Set.prototype.gl_update = 
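/*
    The Map extensions above mimic Python’s dict API. A minimal usage sketch,
    assuming jsex_map.js has been loaded (dDemo/dDemoRev are illustrative names;
    this mirrors how IBDAWG derives dCharVal from dChar via gl_reverse()):
*/
let dDemo = new Map([["a", 1], ["b", 2]]);
console.log(dDemo.gl_get("z", 0));   // 0 — a default value instead of undefined
let dDemoRev = dDemo.gl_reverse();   // Map { 1 → "a", 2 → "b" }
console.log(dDemoRev.get(1));        // "a"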
function (aSet) { - for (let elem of aSet) { - this.add(elem); - } - }; - - Set.prototype.grammalecte = true; -} DELETED gc_core/js/jsex_string.js Index: gc_core/js/jsex_string.js ================================================================== --- gc_core/js/jsex_string.js +++ /dev/null @@ -1,58 +0,0 @@ - -// String -/*jslint esversion: 6*/ - -if (String.prototype.grammalecte === undefined) { - String.prototype.gl_count = function (sSearch, bOverlapping) { - // http://jsperf.com/string-ocurrence-split-vs-match/8 - if (sSearch.length <= 0) { - return this.length + 1; - } - let nOccur = 0; - let iPos = 0; - let nStep = (bOverlapping) ? 1 : sSearch.length; - while ((iPos = this.indexOf(sSearch, iPos)) >= 0) { - nOccur++; - iPos += nStep; - } - return nOccur; - }; - String.prototype.gl_isDigit = function () { - return (this.search(/^[0-9⁰¹²³⁴⁵⁶⁷⁸⁹]+$/) !== -1); - }; - String.prototype.gl_isLowerCase = function () { - return (this.search(/^[a-zà-öø-ÿ0-9-]+$/) !== -1); - }; - String.prototype.gl_isUpperCase = function () { - return (this.search(/^[A-ZÀ-ÖØ-ߌ0-9-]+$/) !== -1); - }; - String.prototype.gl_isTitle = function () { - return (this.search(/^[A-ZÀ-ÖØ-ߌ][a-zà-öø-ÿ'’-]+$/) !== -1); - }; - String.prototype.gl_toCapitalize = function () { - return this.slice(0,1).toUpperCase() + this.slice(1).toLowerCase(); - }; - String.prototype.gl_expand = function (oMatch) { - let sNew = this; - for (let i = 0; i < oMatch.length ; i++) { - let z = new RegExp("\\\\"+parseInt(i), "g"); - sNew = sNew.replace(z, oMatch[i]); - } - return sNew; - }; - String.prototype.gl_trimRight = function (sChars) { - let z = new RegExp("["+sChars+"]+$"); - return this.replace(z, ""); - }; - String.prototype.gl_trimLeft = function (sChars) { - let z = new RegExp("^["+sChars+"]+"); - return this.replace(z, ""); - }; - String.prototype.gl_trim = function (sChars) { - let z1 = new RegExp("^["+sChars+"]+"); - let z2 = new RegExp("["+sChars+"]+$"); - return this.replace(z1, "").replace(z2, ""); - }; - - String.prototype.grammalecte = true; -} Index: gc_core/js/lang_core/gc_engine.js ================================================================== --- gc_core/js/lang_core/gc_engine.js +++ gc_core/js/lang_core/gc_engine.js @@ -320,11 +320,11 @@ //// Initialization load: function (sContext="JavaScript", sPath="") { try { if (typeof(require) !== 'undefined') { - var ibdawg = require("resource://grammalecte/ibdawg.js"); + var ibdawg = require("resource://grammalecte/graphspell/ibdawg.js"); _oDict = new ibdawg.IBDAWG("${dic_name}.json"); } else { _oDict = new IBDAWG("${dic_name}.json", sPath); } _sAppContext = sContext; DELETED gc_core/js/str_transform.js Index: gc_core/js/str_transform.js ================================================================== --- gc_core/js/str_transform.js +++ /dev/null @@ -1,121 +0,0 @@ -//// STRING TRANSFORMATION -/*jslint esversion: 6*/ - -// Note: 48 is the ASCII code for "0" - -var str_transform = { - - distanceDamerauLevenshtein2: function (s1, s2) { - // distance of Damerau-Levenshtein between and - // https://fr.wikipedia.org/wiki/Distance_de_Damerau-Levenshtein - try { - let nLen1 = s1.length; - let nLen2 = s2.length; - let matrix = []; - for (let i = 0; i <= nLen1; i++) { - matrix[i] = new Array(nLen2 + 1); - } - for (let i = 0; i <= nLen1; i++) { - matrix[i][0] = i; - } - for (let j = 0; j <= nLen2; j++) { - matrix[0][j] = j; - } - for (let i = 1; i <= nLen1; i++) { - for (let j = 1; j <= nLen2; j++) { - let nCost = (s1[i] === s2[j]) ? 
0 : 1; - matrix[i][j] = Math.min( - matrix[i-1][j] + 1, // Deletion - matrix[i][j-1] + 1, // Insertion - matrix[i-1][j-1] + nCost // Substitution - ); - if (i > 1 && j > 1 && s1[i] == s2[j-1] && s1[i-1] == s2[j]) { - matrix[i][j] = Math.min(matrix[i][j], matrix[i-2][j-2] + nCost); // Transposition - } - } - } - return matrix[nLen1][nLen2]; - } - catch (e) { - helpers.logerror(e); - } - }, - - distanceDamerauLevenshtein: function (s1, s2) { - // distance of Damerau-Levenshtein between and - // https://fr.wikipedia.org/wiki/Distance_de_Damerau-Levenshtein - try { - let nLen1 = s1.length; - let nLen2 = s2.length; - let INF = nLen1 + nLen2; - let matrix = []; - let sd = {}; - for (let i = 0; i < nLen1+2; i++) { - matrix[i] = new Array(nLen2+2); - } - matrix[0][0] = INF; - for (let i = 0; i <= nLen1; i++) { - matrix[i+1][1] = i; - matrix[i+1][0] = INF; - sd[s1[i]] = 0; - } - for (let j = 0; j <= nLen2; j++) { - matrix[1][j+1] = j; - matrix[0][j+1] = INF; - sd[s2[j]] = 0; - } - - for (let i = 1; i <= nLen1; i++) { - let DB = 0; - for (let j = 1; j <= nLen2; j++) { - let i1 = sd[s2[j-1]]; - let j1 = DB; - if (s1[i-1] === s2[j-1]) { - matrix[i+1][j+1] = matrix[i][j]; - DB = j; - } - else { - matrix[i+1][j+1] = Math.min(matrix[i][j], Math.min(matrix[i+1][j], matrix[i][j+1])) + 1; - } - matrix[i+1][j+1] = Math.min(matrix[i+1][j+1], matrix[i1] ? matrix[i1][j1] + (i-i1-1) + 1 + (j-j1-1) : Infinity); - } - sd[s1[i-1]] = i; - } - return matrix[nLen1+1][nLen2+1]; - } - catch (e) { - helpers.logerror(e); - } - }, - - showDistance (s1, s2) { - console.log(`Distance: ${s1} / ${s2} = ${this.distanceDamerauLevenshtein(s1, s2)})`); - }, - - getStemFromSuffixCode: function (sFlex, sSfxCode) { - // Suffix only - if (sSfxCode == "0") { - return sFlex; - } - return sSfxCode[0] == '0' ? sFlex + sSfxCode.slice(1) : sFlex.slice(0, -(sSfxCode.charCodeAt(0)-48)) + sSfxCode.slice(1); - }, - - getStemFromAffixCode: function (sFlex, sAffCode) { - // Prefix and suffix - if (sAffCode == "0") { - return sFlex; - } - if (!sAffCode.includes("/")) { - return "# error #"; - } - let [sPfxCode, sSfxCode] = sAffCode.split('/'); - sFlex = sPfxCode.slice(1) + sFlex.slice(sPfxCode.charCodeAt(0)-48); - return sSfxCode[0] == '0' ? sFlex + sSfxCode.slice(1) : sFlex.slice(0, -(sSfxCode.charCodeAt(0)-48)) + sSfxCode.slice(1); - } -}; - - -if (typeof(exports) !== 'undefined') { - exports.getStemFromSuffixCode = str_transform.getStemFromSuffixCode; - exports.getStemFromAffixCode = str_transform.getStemFromAffixCode; -} DELETED gc_core/js/tokenizer.js Index: gc_core/js/tokenizer.js ================================================================== --- gc_core/js/tokenizer.js +++ /dev/null @@ -1,105 +0,0 @@ -// JavaScript -// Very simple tokenizer -/*jslint esversion: 6*/ -/*global require,exports*/ - -"use strict"; - - -if (typeof(require) !== 'undefined') { - var helpers = require("resource://grammalecte/helpers.js"); -} - - -const aTkzPatterns = { - // All regexps must start with ^. 
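/*
    A worked example of the suffix codes decoded by getStemFromSuffixCode()
    above (the first character’s code minus 48, i.e. minus ASCII "0", gives
    the number of chars to strip from the end of the flexion):
        getStemFromSuffixCode("chevaux", "3al")  →  "chev" + "al" = "cheval"
        getStemFromSuffixCode("grand", "0e")     →  "grand" + "e" = "grande"  (a leading "0" only appends)
        getStemFromSuffixCode("grand", "0")      →  "grand"  (identity code)
*/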
- "default": - [ - [/^[   \t]+/, 'SPACE'], - [/^\/(?:~|bin|boot|dev|etc|home|lib|mnt|opt|root|sbin|tmp|usr|var|Bureau|Documents|Images|Musique|Public|Téléchargements|Vidéos)(?:\/[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_.()-]+)*/, 'FOLDERUNIX'], - [/^[a-zA-Z]:\\(?:Program Files(?: \(x86\)|)|[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st.()]+)(?:\\[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_.()-]+)*/, 'FOLDERWIN'], - [/^[,.;:!?…«»“”‘’"(){}\[\]/·–—]+/, 'SEPARATOR'], - [/^[A-Z][.][A-Z][.](?:[A-Z][.])*/, 'ACRONYM'], - [/^(?:https?:\/\/|www[.]|[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_-]+[@.][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_-]{2,}[@.])[a-zA-Z0-9][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_.\/?&!%=+*"'@$#-]+/, 'LINK'], - [/^[#@][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_-]+/, 'TAG'], - [/^<[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+.*?>|<\/[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+ *>/, 'HTML'], - [/^\[\/?[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+\]/, 'PSEUDOHTML'], - [/^&\w+;(?:\w+;|)/, 'HTMLENTITY'], - [/^\d\d?h\d\d\b/, 'HOUR'], - [/^-?\d+(?:[.,]\d+|)/, 'NUM'], - [/^[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+(?:[’'`-][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+)*/, 'WORD'] - ], - "fr": - [ - [/^[   \t]+/, 'SPACE'], - [/^\/(?:~|bin|boot|dev|etc|home|lib|mnt|opt|root|sbin|tmp|usr|var|Bureau|Documents|Images|Musique|Public|Téléchargements|Vidéos)(?:\/[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_.()-]+)*/, 'FOLDERUNIX'], - [/^[a-zA-Z]:\\(?:Program Files(?: \(x86\)|)|[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st.()]+)(?:\\[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_.()-]+)*/, 'FOLDERWIN'], - [/^[,.;:!?…«»“”‘’"(){}\[\]/·–—]+/, 'SEPARATOR'], - [/^[A-Z][.][A-Z][.](?:[A-Z][.])*/, 'ACRONYM'], - [/^(?:https?:\/\/|www[.]|[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_-]+[@.][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_-]{2,}[@.])[a-zA-Z0-9][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_.\/?&!%=+*"'@$#-]+/, 'LINK'], - [/^[#@][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_-]+/, 'TAG'], - [/^<[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+.*?>|<\/[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+ *>/, 'HTML'], - [/^\[\/?[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+\]/, 'PSEUDOHTML'], - [/^&\w+;(?:\w+;|)/, 'HTMLENTITY'], - [/^(?:l|d|n|m|t|s|j|c|ç|lorsqu|puisqu|jusqu|quoiqu|qu)['’`]/i, 'ELPFX'], - [/^\d\d?[hm]\d\d\b/, 'HOUR'], - [/^\d+(?:er|nd|e|de|ième|ème|eme)s?\b/, 'ORDINAL'], - [/^-?\d+(?:[.,]\d+|)/, 'NUM'], - [/^[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+(?:[’'`-][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+)*/, 'WORD'] - ] -}; - - -class Tokenizer { - - constructor (sLang) { - this.sLang = sLang; - if (!aTkzPatterns.hasOwnProperty(sLang)) { - this.sLang = "default"; - } - this.aRules = aTkzPatterns[this.sLang]; - } - - * genTokens (sText) { - let m; - let i = 0; - while (sText) { - let nCut = 1; - for (let [zRegex, sType] of this.aRules) { - try { - if ((m = zRegex.exec(sText)) !== null) { - if (sType == 'SEPARATOR') { - for (let c of m[0]) { - yield { "sType": sType, "sValue": c, "nStart": i, "nEnd": i + m[0].length } - } - } else { - yield { "sType": sType, "sValue": m[0], "nStart": i, "nEnd": i + m[0].length } - } - nCut = m[0].length; - break; - } - } - catch (e) { - helpers.logerror(e); - } - } - i += nCut; - sText = sText.slice(nCut); - } - } - - getSpellingErrors (sText, oDict) { - let aSpellErr = []; - for (let oToken of this.genTokens(sText)) { - if (oToken.sType === 'WORD' && !oDict.isValidToken(oToken.sValue)) { - aSpellErr.push(oToken); - } - } - return aSpellErr; - } -} - - -if (typeof(exports) !== 'undefined') { - exports.Tokenizer = Tokenizer; -} DELETED gc_core/py/char_player.py Index: gc_core/py/char_player.py ================================================================== --- gc_core/py/char_player.py +++ /dev/null @@ -1,324 +0,0 @@ -# list of similar chars -# useful for suggestion 
mechanism - -import re - - -_xTransChars = str.maketrans({ - 'à': 'a', 'é': 'e', 'î': 'i', 'ô': 'o', 'û': 'u', 'ÿ': 'i', "y": "i", - 'â': 'a', 'è': 'e', 'ï': 'i', 'ö': 'o', 'ù': 'u', 'ŷ': 'i', - 'ä': 'a', 'ê': 'e', 'í': 'i', 'ó': 'o', 'ü': 'u', 'ý': 'i', - 'á': 'a', 'ë': 'e', 'ì': 'i', 'ò': 'o', 'ú': 'u', 'ỳ': 'i', - 'ā': 'a', 'ē': 'e', 'ī': 'i', 'ō': 'o', 'ū': 'u', 'ȳ': 'i', - 'ñ': 'n', 'k': 'q', 'w': 'v', - 'œ': 'oe', 'æ': 'ae', -}) - -def simplifyWord (sWord): - "word simplication before calculating distance between words" - sWord = sWord.lower().translate(_xTransChars) - sNewWord = "" - for i, c in enumerate(sWord, 1): - if c != sWord[i:i+1]: - sNewWord += c - return sNewWord.replace("eau", "o").replace("au", "o").replace("ai", "e").replace("ei", "e").replace("ph", "f") - - -aVowel = set("aáàâäāeéèêëēiíìîïīoóòôöōuúùûüūyýỳŷÿȳœæAÁÀÂÄĀEÉÈÊËĒIÍÌÎÏĪOÓÒÔÖŌUÚÙÛÜŪYÝỲŶŸȲŒÆ") -aConsonant = set("bcçdfghjklmnñpqrstvwxzBCÇDFGHJKLMNÑPQRSTVWXZ") -aDouble = set("bcdfjklmnprstzBCDFJKLMNPRSTZ") # letters that may be used twice successively - - -# Similar chars - -d1to1 = { - "1": "liîLIÎ", - "2": "zZ", - "3": "eéèêEÉÈÊ", - "4": "aàâAÀÂ", - "5": "sgSG", - "6": "bdgBDG", - "7": "ltLT", - "8": "bB", - "9": "gbdGBD", - "0": "oôOÔ", - - "a": "aàâáäæ", - "A": "AÀÂÁÄÆ", - "à": "aàâáäæ", - "À": "AÀÂÁÄÆ", - "â": "aàâáäæ", - "Â": "AÀÂÁÄÆ", - "á": "aàâáäæ", - "Á": "AÀÂÁÄÆ", - "ä": "aàâáäæ", - "Ä": "AÀÂÁÄÆ", - - "æ": "æéa", - "Æ": "ÆÉA", - - "c": "cçskqśŝ", - "C": "CÇSKQŚŜ", - "ç": "cçskqśŝ", - "Ç": "CÇSKQŚŜ", - - "e": "eéèêëœ", - "E": "EÉÈÊËŒ", - "é": "eéèêëœ", - "É": "EÉÈÊËŒ", - "ê": "eéèêëœ", - "Ê": "EÉÈÊËŒ", - "è": "eéèêëœ", - "È": "EÉÈÊËŒ", - "ë": "eéèêëœ", - "Ë": "EÉÈÊËŒ", - - "g": "gj", - "G": "GJ", - - "i": "iîïyíìÿ", - "I": "IÎÏYÍÌŸ", - "î": "iîïyíìÿ", - "Î": "IÎÏYÍÌŸ", - "ï": "iîïyíìÿ", - "Ï": "IÎÏYÍÌŸ", - "í": "iîïyíìÿ", - "Í": "IÎÏYÍÌŸ", - "ì": "iîïyíìÿ", - "Ì": "IÎÏYÍÌŸ", - - "j": "jg", - "J": "JG", - - "k": "kcq", - "K": "KCQ", - - "n": "nñ", - "N": "NÑ", - - "o": "oôóòöœ", - "O": "OÔÓÒÖŒ", - "ô": "oôóòöœ", - "Ô": "OÔÓÒÖŒ", - "ó": "oôóòöœ", - "Ó": "OÔÓÒÖŒ", - "ò": "oôóòöœ", - "Ò": "OÔÓÒÖŒ", - "ö": "oôóòöœ", - "Ö": "OÔÓÒÖŒ", - - "œ": "œoôeéèêë", - "Œ": "ŒOÔEÉÈÊË", - - "q": "qck", - "Q": "QCK", - - "s": "sśŝcç", - "S": "SŚŜCÇ", - "ś": "sśŝcç", - "Ś": "SŚŜCÇ", - "ŝ": "sśŝcç", - "Ŝ": "SŚŜCÇ", - - "u": "uûùüú", - "U": "UÛÙÜÚ", - "û": "uûùüú", - "Û": "UÛÙÜÚ", - "ù": "uûùüú", - "Ù": "UÛÙÜÚ", - "ü": "uûùüú", - "Ü": "UÛÙÜÚ", - "ú": "uûùüú", - "Ú": "UÛÙÜÚ", - - "v": "vw", - "V": "VW", - - "w": "wv", - "W": "WV", - - "x": "xck", - "X": "XCK", - - "y": "yÿiîŷýỳ", - "Y": "YŸIÎŶÝỲ", - "ÿ": "yÿiîŷýỳ", - "Ÿ": "YŸIÎŶÝỲ", - "ŷ": "yÿiîŷýỳ", - "Ŷ": "YŸIÎŶÝỲ", - "ý": "yÿiîŷýỳ", - "Ý": "YŸIÎŶÝỲ", - "ỳ": "yÿiîŷýỳ", - "Ỳ": "YŸIÎŶÝỲ", - - "z": "zs", - "Z": "ZS", -} - -d1toX = { - "æ": ("ae",), - "Æ": ("AE",), - "b": ("bb",), - "B": ("BB",), - "c": ("cc", "ss", "qu", "ch"), - "C": ("CC", "SS", "QU", "CH"), - "d": ("dd",), - "D": ("DD",), - "é": ("ai", "ei"), - "É": ("AI", "EI"), - "f": ("ff", "ph"), - "F": ("FF", "PH"), - "g": ("gu", "ge", "gg", "gh"), - "G": ("GU", "GE", "GG", "GH"), - "j": ("jj", "dj"), - "J": ("JJ", "DJ"), - "k": ("qu", "ck", "ch", "cu", "kk", "kh"), - "K": ("QU", "CK", "CH", "CU", "KK", "KH"), - "l": ("ll",), - "L": ("LL",), - "m": ("mm", "mn"), - "M": ("MM", "MN"), - "n": ("nn", "nm", "mn"), - "N": ("NN", "NM", "MN"), - "o": ("au", "eau"), - "O": ("AU", "EAU"), - "œ": ("oe", "eu"), - "Œ": ("OE", "EU"), - "p": ("pp", "ph"), - "P": ("PP", "PH"), - "q": ("qu", "ch", "cq", "ck", "kk"), - "Q": ("QU", "CH", 
"CQ", "CK", "KK"), - "r": ("rr",), - "R": ("RR",), - "s": ("ss", "sh"), - "S": ("SS", "SH"), - "t": ("tt", "th"), - "T": ("TT", "TH"), - "x": ("cc", "ct", "xx"), - "X": ("CC", "CT", "XX"), - "z": ("ss", "zh"), - "Z": ("SS", "ZH"), -} - - -def get1toXReplacement (cPrev, cCur, cNext): - if cCur in aConsonant and (cPrev in aConsonant or cNext in aConsonant): - return () - return d1toX.get(cCur, ()) - - -d2toX = { - "am": ("an", "en", "em"), - "AM": ("AN", "EN", "EM"), - "an": ("am", "en", "em"), - "AN": ("AM", "EN", "EM"), - "au": ("eau", "o", "ô"), - "AU": ("EAU", "O", "Ô"), - "em": ("an", "am", "en"), - "EM": ("AN", "AM", "EN"), - "en": ("an", "am", "em"), - "EN": ("AN", "AM", "EM"), - "ai": ("ei", "é", "è", "ê", "ë"), - "AI": ("EI", "É", "È", "Ê", "Ë"), - "ei": ("ai", "é", "è", "ê", "ë"), - "EI": ("AI", "É", "È", "Ê", "Ë"), - "ch": ("sh", "c", "ss"), - "CH": ("SH", "C", "SS"), - "ct": ("x", "cc"), - "CT": ("X", "CC"), - "oa": ("oi",), - "OA": ("OI",), - "oi": ("oa", "oie"), - "OI": ("OA", "OIE"), - "ph": ("f",), - "PH": ("F",), - "qu": ("q", "cq", "ck", "c", "k"), - "QU": ("Q", "CQ", "CK", "C", "K"), - "ss": ("c", "ç"), - "SS": ("C", "Ç"), - "un": ("ein",), - "UN": ("EIN",), -} - - -# End of word - -dFinal1 = { - "a": ("as", "at", "ant", "ah"), - "A": ("AS", "AT", "ANT", "AH"), - "c": ("ch",), - "C": ("CH",), - "e": ("et", "er", "ets", "ée", "ez", "ai", "ais", "ait", "ent", "eh"), - "E": ("ET", "ER", "ETS", "ÉE", "EZ", "AI", "AIS", "AIT", "ENT", "EH"), - "é": ("et", "er", "ets", "ée", "ez", "ai", "ais", "ait"), - "É": ("ET", "ER", "ETS", "ÉE", "EZ", "AI", "AIS", "AIT"), - "è": ("et", "er", "ets", "ée", "ez", "ai", "ais", "ait"), - "È": ("ET", "ER", "ETS", "ÉE", "EZ", "AI", "AIS", "AIT"), - "ê": ("et", "er", "ets", "ée", "ez", "ai", "ais", "ait"), - "Ê": ("ET", "ER", "ETS", "ÉE", "EZ", "AI", "AIS", "AIT"), - "ë": ("et", "er", "ets", "ée", "ez", "ai", "ais", "ait"), - "Ë": ("ET", "ER", "ETS", "ÉE", "EZ", "AI", "AIS", "AIT"), - "g": ("gh",), - "G": ("GH",), - "i": ("is", "it", "ie", "in"), - "I": ("IS", "IT", "IE", "IN"), - "n": ("nt", "nd", "ns", "nh"), - "N": ("NT", "ND", "NS", "NH"), - "o": ("aut", "ot", "os"), - "O": ("AUT", "OT", "OS"), - "ô": ("aut", "ot", "os"), - "Ô": ("AUT", "OT", "OS"), - "ö": ("aut", "ot", "os"), - "Ö": ("AUT", "OT", "OS"), - "p": ("ph",), - "P": ("PH",), - "s": ("sh",), - "S": ("SH",), - "t": ("th",), - "T": ("TH",), - "u": ("ut", "us", "uh"), - "U": ("UT", "US", "UH"), -} - -dFinal2 = { - "ai": ("aient", "ais", "et"), - "AI": ("AIENT", "AIS", "ET"), - "an": ("ant", "ent"), - "AN": ("ANT", "ENT"), - "en": ("ent", "ant"), - "EN": ("ENT", "ANT"), - "ei": ("ait", "ais"), - "EI": ("AIT", "AIS"), - "on": ("ons", "ont"), - "ON": ("ONS", "ONT"), - "oi": ("ois", "oit", "oix"), - "OI": ("OIS", "OIT", "OIX"), -} - - -# Préfixes et suffixes - -aPfx1 = frozenset([ - "anti", "archi", "contre", "hyper", "mé", "méta", "im", "in", "ir", "par", "proto", - "pseudo", "pré", "re", "ré", "sans", "sous", "supra", "sur", "ultra" -]) -aPfx2 = frozenset([ - "belgo", "franco", "génito", "gynéco", "médico", "russo" -]) - - -_zMotAvecPronom = re.compile("^(?i)(\\w+)(-(?:t-|)(?:ils?|elles?|on|je|tu|nous|vous))$") - -def cut (sWord): - "returns a tuple of strings (prefix, trimed_word, suffix)" - m = _zMotAvecPronom.search(sWord) - if m: - return ("", m.group(1), m.group(2)) - return ("", sWord, "") - - -# Other functions - -def filterSugg (aSugg): - "exclude suggestions" - return filter(lambda sSugg: not sSugg.endswith(("è", "È")), aSugg) DELETED gc_core/py/dawg.py Index: gc_core/py/dawg.py 
================================================================== --- gc_core/py/dawg.py +++ /dev/null @@ -1,775 +0,0 @@ -#!python3 - -# FSA DICTIONARY BUILDER -# -# by Olivier R. -# License: MPL 2 -# -# This tool encodes lexicon into an indexable binary dictionary -# Input files MUST be encoded in UTF-8. - - -import sys -import os -import collections - -from . import str_transform as st -from .progressbar import ProgressBar - - - -def readFile (spf): - print(" < Read lexicon: " + spf) - if os.path.isfile(spf): - with open(spf, "r", encoding="utf-8") as hSrc: - for sLine in hSrc: - sLine = sLine.strip() - if sLine and not sLine.startswith("#"): - yield sLine - else: - raise OSError("# Error. File not found or not loadable: " + spf) - - -def getElemsFromFile (spf): - "returns tuple of (flexion, stem, tags) from lexicon file" - nErr = 0 - if not spf.endswith(".clex"): - for sLine in readFile(spf): - try: - sFlex, sStem, sTag = sLine.split("\t") - yield (sFlex, sStem, sTag) - except: - nErr += 1 - else: - sTag = "_" # neutral tag - sTag2 = "" - for sLine in readFile(spf): - if sLine.startswith("[") and sLine.endswith("]"): - # tag line - if "-->" in sLine: - try: - sTag, sSfxCode, sTag2 = sLine[1:-1].split(" --> ") - except: - nErr += 1 - continue - sTag = sTag.strip() - sSfxCode = sSfxCode.strip() - sTag2 = sTag2.strip() - else: - sTag = sLine[1:-1] - sTag2 = "" - else: - # entry line - if "\t" in sLine: - if sLine.count("\t") > 1: - nErr += 1 - continue - sFlex, sStem = sLine.split("\t") - else: - sFlex = sStem = sLine - #print(sFlex, sStem, sTag) - yield (sFlex, sStem, sTag) - if sTag2: - sFlex2 = st.changeWordWithSuffixCode(sFlex, sSfxCode) - #print(sFlex2, sStem, sTag2) - yield (sFlex2, sStem, sTag2) - if nErr: - print(" # Lines ignored: {:>10}".format(nErr)) - - - -class DAWG: - """DIRECT ACYCLIC WORD GRAPH""" - # This code is inspired from Steve Hanov’s DAWG, 2011. (http://stevehanov.ca/blog/index.php?id=115) - # We store suffix/affix codes and tags within the graph after the “real” word. - # A word is a list of numbers [ c1, c2, c3 . . . cN, iAffix, iTags] - # Each arc is an index in self.lArcVal, where are stored characters, suffix/affix codes for stemming and tags. - # Important: As usual, the last node (after ‘iTags’) is tagged final, AND the node after ‘cN’ is ALSO tagged final. - - def __init__ (self, spfSrc, sLangName, cStemming): - print("===== Direct Acyclic Word Graph - Minimal Acyclic Finite State Automaton =====") - cStemming = cStemming.upper() - if cStemming == "A": - funcStemmingGen = st.defineAffixCode - elif cStemming == "S": - funcStemmingGen = st.defineSuffixCode - elif cStemming == "N": - funcStemmingGen = st.noStemming - else: - raise ValueError("# Error. 
Unknown stemming code: {}".format(cStemming)) - - lEntry = [] - lChar = ['']; dChar = {}; nChar = 1; dCharOccur = {} - lAff = []; dAff = {}; nAff = 0; dAffOccur = {} - lTag = []; dTag = {}; nTag = 0; dTagOccur = {} - nErr = 0 - - # read lexicon - for sFlex, sStem, sTag in getElemsFromFile(spfSrc): - addWordToCharDict(sFlex) - # chars - for c in sFlex: - if c not in dChar: - dChar[c] = nChar - lChar.append(c) - nChar += 1 - dCharOccur[c] = dCharOccur.get(c, 0) + 1 - # affixes to find stem from flexion - aff = funcStemmingGen(sFlex, sStem) - if aff not in dAff: - dAff[aff] = nAff - lAff.append(aff) - nAff += 1 - dAffOccur[aff] = dCharOccur.get(aff, 0) + 1 - # tags - if sTag not in dTag: - dTag[sTag] = nTag - lTag.append(sTag) - nTag += 1 - dTagOccur[sTag] = dTagOccur.get(sTag, 0) + 1 - lEntry.append((sFlex, dAff[aff], dTag[sTag])) - if not lEntry: - raise ValueError("# Error. Empty lexicon") - - # Preparing DAWG - print(" > Preparing list of words") - lVal = lChar + lAff + lTag - lWord = [ [dChar[c] for c in sFlex] + [iAff+nChar] + [iTag+nChar+nAff] for sFlex, iAff, iTag in lEntry ] - lEntry = None - - # Dictionary of arc values occurrency, to sort arcs of each node - dValOccur = dict( [ (dChar[c], dCharOccur[c]) for c in dChar ] \ - + [ (dAff[aff]+nChar, dAffOccur[aff]) for aff in dAff ] \ - + [ (dTag[tag]+nChar+nAff, dTagOccur[tag]) for tag in dTag ] ) - #with open(spfSrc[:-8]+".valuesfreq.txt", 'w', encoding='utf-8') as hFreqDst: # DEBUG - # for iKey, nOcc in sorted(dValOccur.items(), key=lambda t: t[1], reverse=True): - # hFreqDst.write("{}: {}\n".format(lVal[iKey], nOcc)) - # hFreqDst.close() - - self.sFile = spfSrc - self.sLang = sLangName - self.nEntry = len(lWord) - self.aPreviousEntry = [] - DawgNode.resetNextId() - self.oRoot = DawgNode() - self.lUncheckedNodes = [] # list of nodes that have not been checked for duplication. - self.lMinimizedNodes = {} # list of unique nodes that have been checked for duplication. - self.lSortedNodes = [] # version 2 and 3 - self.nNode = 0 - self.nArc = 0 - self.dChar = dChar - self.nChar = len(dChar) - self.nAff = nAff - self.lArcVal = lVal - self.nArcVal = len(lVal) - self.nTag = self.nArcVal - self.nChar - nAff - self.cStemming = cStemming - if cStemming == "A": - self.funcStemming = st.changeWordWithAffixCode - elif cStemming == "S": - self.funcStemming = st.changeWordWithSuffixCode - else: - self.funcStemming = st.noStemming - - # build - lWord.sort() - oProgBar = ProgressBar(0, len(lWord)) - for aEntry in lWord: - self.insert(aEntry) - oProgBar.increment(1) - oProgBar.done() - self.finish() - self.countNodes() - self.countArcs() - self.sortNodes() - self.sortNodeArcs(dValOccur) - #self.sortNodeArcs2 (self.oRoot, "") - self.displayInfo() - - # BUILD DAWG - def insert (self, aEntry): - if aEntry < self.aPreviousEntry: - sys.exit("# Error: Words must be inserted in alphabetical order.") - - # find common prefix between word and previous word - nCommonPrefix = 0 - for i in range(min(len(aEntry), len(self.aPreviousEntry))): - if aEntry[i] != self.aPreviousEntry[i]: - break - nCommonPrefix += 1 - - # Check the lUncheckedNodes for redundant nodes, proceeding from last - # one down to the common prefix size. Then truncate the list at that point. 
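#       For instance, if the previous entry encoded "top" and the current one
#       encodes "toy", nCommonPrefix is 2, so _minimize(2) freezes every node
#       deeper than "to": each frozen sub-graph is looked up in lMinimizedNodes
#       and replaced by an already-registered duplicate when an identical one
#       (same arcs, same finality) exists — the incremental minimization scheme
#       from Hanov’s article cited above.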
- self._minimize(nCommonPrefix) - - # add the suffix, starting from the correct node mid-way through the graph - if len(self.lUncheckedNodes) == 0: - oNode = self.oRoot - else: - oNode = self.lUncheckedNodes[-1][2] - - iChar = nCommonPrefix - for c in aEntry[nCommonPrefix:]: - oNextNode = DawgNode() - oNode.arcs[c] = oNextNode - self.lUncheckedNodes.append((oNode, c, oNextNode)) - if iChar == (len(aEntry) - 2): - oNode.final = True - iChar += 1 - oNode = oNextNode - oNode.final = True - self.aPreviousEntry = aEntry - - def finish (self): - "minimize unchecked nodes" - self._minimize(0) - - def _minimize (self, downTo): - # proceed from the leaf up to a certain point - for i in range( len(self.lUncheckedNodes)-1, downTo-1, -1 ): - oNode, char, oChildNode = self.lUncheckedNodes[i] - if oChildNode in self.lMinimizedNodes: - # replace the child with the previously encountered one - oNode.arcs[char] = self.lMinimizedNodes[oChildNode] - else: - # add the state to the minimized nodes. - self.lMinimizedNodes[oChildNode] = oChildNode - self.lUncheckedNodes.pop() - - def countNodes (self): - self.nNode = len(self.lMinimizedNodes) - - def countArcs (self): - self.nArc = 0 - for oNode in self.lMinimizedNodes: - self.nArc += len(oNode.arcs) - - def sortNodeArcs (self, dValOccur): - print(" > Sort node arcs") - self.oRoot.sortArcs(dValOccur) - for oNode in self.lMinimizedNodes: - oNode.sortArcs(dValOccur) - - def sortNodeArcs2 (self, oNode, cPrevious=""): - # recursive function - dCharOccur = getCharOrderAfterChar(cPrevious) - if dCharOccur: - oNode.sortArcs2(dCharOccur, self.lArcVal) - for nArcVal, oNextNode in oNode.arcs.items(): - self.sortNodeArcs2(oNextNode, self.lArcVal[nArcVal]) - - def sortNodes (self): - print(" > Sort nodes") - for oNode in self.oRoot.arcs.values(): - self._parseNodes(oNode) - - def _parseNodes (self, oNode): - # Warning: recursive method - if oNode.pos > 0: - return - oNode.setPos() - self.lSortedNodes.append(oNode) - for oNextNode in oNode.arcs.values(): - self._parseNodes(oNextNode) - - def lookup (self, sWord): - oNode = self.oRoot - for c in sWord: - if self.dChar.get(c, '') not in oNode.arcs: - return False - oNode = oNode.arcs[self.dChar[c]] - return oNode.final - - def morph (self, sWord): - oNode = self.oRoot - for c in sWord: - if self.dChar.get(c, '') not in oNode.arcs: - return '' - oNode = oNode.arcs[self.dChar[c]] - if oNode.final: - s = "* " - for arc in oNode.arcs: - if arc >= self.nChar: - s += " [" + self.funcStemming(sWord, self.lArcVal[arc]) - oNode2 = oNode.arcs[arc] - for arc2 in oNode2.arcs: - s += " / " + self.lArcVal[arc2] - s += "]" - return s - return '' - - def displayInfo (self): - print(" * {:<12} {:>16,}".format("Entries:", self.nEntry)) - print(" * {:<12} {:>16,}".format("Characters:", self.nChar)) - print(" * {:<12} {:>16,}".format("Affixes:", self.nAff)) - print(" * {:<12} {:>16,}".format("Tags:", self.nTag)) - print(" * {:<12} {:>16,}".format("Arc values:", self.nArcVal)) - print(" * {:<12} {:>16,}".format("Nodes:", self.nNode)) - print(" * {:<12} {:>16,}".format("Arcs:", self.nArc)) - print(" * {:<12} {:>16}".format("Stemming:", self.cStemming + "FX")) - - def getArcStats (self): - d = {} - for oNode in self.lMinimizedNodes: - n = len(oNode.arcs) - d[n] = d.get(n, 0) + 1 - s = " * Nodes:\n" - for n in d: - s = s + " {:>9} nodes have {:>3} arcs\n".format(d[n], n) - return s - - def writeInfo (self, sPathFile): - print(" > Write informations") - with open(sPathFile, 'w', encoding='utf-8', newline="\n") as hDst: - hDst.write(self.getArcStats()) 
- hDst.write("\n * Values:\n") - for i, s in enumerate(self.lArcVal): - hDst.write(" {:>6}. {}\n".format(i, s)) - hDst.close() - - # BINARY CONVERSION - def createBinary (self, sPathFile, nMethod, bDebug=False): - print(" > Write DAWG as an indexable binary dictionary [method: %d]" % nMethod) - if nMethod == 1: - self.nBytesArc = ( (self.nArcVal.bit_length() + 2) // 8 ) + 1 # We add 2 bits. See DawgNode.convToBytes1() - self._calcNumBytesNodeAddress() - self._calcNodesAddress1() - elif nMethod == 2: - self.nBytesArc = ( (self.nArcVal.bit_length() + 3) // 8 ) + 1 # We add 3 bits. See DawgNode.convToBytes2() - self._calcNumBytesNodeAddress() - self._calcNodesAddress2() - elif nMethod == 3: - self.nBytesArc = ( (self.nArcVal.bit_length() + 3) // 8 ) + 1 # We add 3 bits. See DawgNode.convToBytes3() - self.nBytesOffset = 1 - self.nMaxOffset = (2 ** (self.nBytesOffset * 8)) - 1 - self._calcNumBytesNodeAddress() - self._calcNodesAddress3() - else: - print(" # Error: unknown compression method") - print(" Arc values (chars, affixes and tags): {} -> {} bytes".format( self.nArcVal, len("\t".join(self.lArcVal).encode("utf-8")) )) - print(" Arc size: {} bytes, Address size: {} bytes -> {} * {} = {} bytes".format( self.nBytesArc, self.nBytesNodeAddress, \ - self.nBytesArc+self.nBytesNodeAddress, self.nArc, \ - (self.nBytesArc+self.nBytesNodeAddress)*self.nArc )) - self._writeBinary(sPathFile, nMethod) - if bDebug: - self._writeNodes(sPathFile, nMethod) - - def _calcNumBytesNodeAddress (self): - "how many bytes needed to store all nodes/arcs in the binary dictionary" - self.nBytesNodeAddress = 1 - while ((self.nBytesArc + self.nBytesNodeAddress) * self.nArc) > (2 ** (self.nBytesNodeAddress * 8)): - self.nBytesNodeAddress += 1 - - def _calcNodesAddress1 (self): - nBytesNode = self.nBytesArc + self.nBytesNodeAddress - iAddr = len(self.oRoot.arcs) * nBytesNode - for oNode in self.lMinimizedNodes: - oNode.addr = iAddr - iAddr += max(len(oNode.arcs), 1) * nBytesNode - - def _calcNodesAddress2 (self): - nBytesNode = self.nBytesArc + self.nBytesNodeAddress - iAddr = len(self.oRoot.arcs) * nBytesNode - for oNode in self.lSortedNodes: - oNode.addr = iAddr - iAddr += max(len(oNode.arcs), 1) * nBytesNode - for oNextNode in oNode.arcs.values(): - if (oNode.pos + 1) == oNextNode.pos: - iAddr -= self.nBytesNodeAddress - #break - - def _calcNodesAddress3 (self): - nBytesNode = self.nBytesArc + self.nBytesNodeAddress - # theorical nodes size if only addresses and no offset - self.oRoot.size = len(self.oRoot.arcs) * nBytesNode - for oNode in self.lSortedNodes: - oNode.size = max(len(oNode.arcs), 1) * nBytesNode - # rewind and calculate dropdown from the end, several times - nDiff = self.nBytesNodeAddress - self.nBytesOffset - bEnd = False - while not bEnd: - bEnd = True - # recalculate addresses - iAddr = self.oRoot.size - for oNode in self.lSortedNodes: - oNode.addr = iAddr - iAddr += oNode.size - # rewind and calculate dropdown from the end, several times - for i in range(self.nNode-1, -1, -1): - nSize = max(len(self.lSortedNodes[i].arcs), 1) * nBytesNode - for oNextNode in self.lSortedNodes[i].arcs.values(): - if 1 < (oNextNode.addr - self.lSortedNodes[i].addr) < self.nMaxOffset: - nSize -= nDiff - if self.lSortedNodes[i].size != nSize: - self.lSortedNodes[i].size = nSize - bEnd = False - - def _writeBinary (self, sPathFile, nMethod): - """ - Format of the binary indexable dictionary: - Each section is separated with 4 bytes of \0 - - - Section Header: - /pyfsa/[version] - * version is an ASCII string - - - Section 
Informations: - /[tag_lang] - /[number of chars] - /[number of bytes for each arc] - /[number of bytes for each address node] - /[number of entries] - /[number of nodes] - /[number of arcs] - /[number of affixes] - * each field is a ASCII string - /[stemming code] - * "S" means stems are generated by /suffix_code/, "A" means they are generated by /affix_code/ - See defineSuffixCode() and defineAffixCode() for details. - "N" means no stemming - - - Section Values: - * a list of strings encoded in binary from utf-8, each value separated with a tabulation - - - Section Word Graph (nodes / arcs) - * A list of nodes which are a list of arcs with an address of the next node. - See DawgNode.convToBytes() for details. - """ - if not sPathFile.endswith(".bdic"): - sPathFile += "."+str(nMethod)+".bdic" - with open(sPathFile, 'wb') as hDst: - # header - hDst.write("/pyfsa/{}/".format(nMethod).encode("utf-8")) - hDst.write(b"\0\0\0\0") - # infos - hDst.write("{}/{}/{}/{}/{}/{}/{}/{}/{}".format(self.sLang, self.nChar, self.nBytesArc, self.nBytesNodeAddress, \ - self.nEntry, self.nNode, self.nArc, self.nAff, self.cStemming).encode("utf-8")) - hDst.write(b"\0\0\0\0") - # lArcVal - hDst.write("\t".join(self.lArcVal).encode("utf-8")) - hDst.write(b"\0\0\0\0") - # DAWG: nodes / arcs - if nMethod == 1: - hDst.write(self.oRoot.convToBytes1(self.nBytesArc, self.nBytesNodeAddress)) - for oNode in self.lMinimizedNodes: - hDst.write(oNode.convToBytes1(self.nBytesArc, self.nBytesNodeAddress)) - elif nMethod == 2: - hDst.write(self.oRoot.convToBytes2(self.nBytesArc, self.nBytesNodeAddress)) - for oNode in self.lSortedNodes: - hDst.write(oNode.convToBytes2(self.nBytesArc, self.nBytesNodeAddress)) - elif nMethod == 3: - hDst.write(self.oRoot.convToBytes3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset)) - for oNode in self.lSortedNodes: - hDst.write(oNode.convToBytes3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset)) - hDst.close() - - def _writeNodes (self, sPathFile, nMethod): - "for debugging only" - print(" > Write nodes") - with open(sPathFile+".nodes."+str(nMethod)+".txt", 'w', encoding='utf-8', newline="\n") as hDst: - if nMethod == 1: - hDst.write(self.oRoot.getTxtRepr1(self.nBytesArc, self.nBytesNodeAddress, self.lArcVal)+"\n") - #hDst.write( ''.join( [ "%02X " % z for z in self.oRoot.convToBytes1(self.nBytesArc, self.nBytesNodeAddress) ] ).strip() ) - for oNode in self.lMinimizedNodes: - hDst.write(oNode.getTxtRepr1(self.nBytesArc, self.nBytesNodeAddress, self.lArcVal)+"\n") - if nMethod == 2: - hDst.write(self.oRoot.getTxtRepr2(self.nBytesArc, self.nBytesNodeAddress, self.lArcVal)+"\n") - for oNode in self.lSortedNodes: - hDst.write(oNode.getTxtRepr2(self.nBytesArc, self.nBytesNodeAddress, self.lArcVal)+"\n") - if nMethod == 3: - hDst.write(self.oRoot.getTxtRepr3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset, self.lArcVal)+"\n") - #hDst.write( ''.join( [ "%02X " % z for z in self.oRoot.convToBytes3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset) ] ).strip() ) - for oNode in self.lSortedNodes: - hDst.write(oNode.getTxtRepr3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset, self.lArcVal)+"\n") - hDst.close() - - def writeResults (self, sPathFile): - bFileExits = os.path.isfile("_lexicons.res.txt") - with open("_lexicons.res.txt", "a", encoding='utf-8', newline="\n") as hDst: - sFormat1 = "{:<12} {:>12} {:>5} {:>8} {:>8} {:>6} {:>8} {:>9} {:>9} {:>15} {:>12} {:>12}\n" - sFormat2 = "{:<12} {:>12,} {:>5,} {:>8,} {:>8} {:>6,} {:>8,} {:>9,} {:>9,} {:>15,} 
{:>12,} {:>12,}\n" - if not bFileExits: - hDst.write(sFormat1.format("Lexicon", "Entries", "Chars", "Affixes", "Stemming", "Tags", "Values", "Nodes", "Arcs", "Lexicon (Kb)", "Dict (Kb)", "LT Dict (Kb)")) - hDst.write(sFormat2.format(self.sLang, self.nEntry, self.nChar, self.nAff, self.cStemming + "FX", self.nTag, self.nArcVal, \ - self.nNode, self.nArc, os.path.getsize(self.sFile), os.path.getsize(sPathFile), \ - os.path.getsize("cfsa/dict/{}.dict".format(self.sLang)) if os.path.isfile("cfsa/dict/{}.dict".format(self.sLang)) else 0)) - hDst.close() - - - -class DawgNode: - NextId = 0 - NextPos = 1 # (version 2) - - def __init__ (self): - self.i = DawgNode.NextId - DawgNode.NextId += 1 - self.final = False - self.arcs = {} # key: arc value; value: a node - self.addr = 0 # address in the binary dictionary - self.pos = 0 # position in the binary dictionary (version 2) - self.size = 0 # size of node in bytes (version 3) - - @classmethod - def resetNextId (cls): - cls.NextId = 0 - - def setPos (self): # version 2 - self.pos = DawgNode.NextPos - DawgNode.NextPos += 1 - - def __str__ (self): - # Caution! this function is used for hashing and comparison! - l = [] - if self.final: - l.append("1") - else: - l.append("0") - for (key, node) in self.arcs.items(): - l.append(str(key)) - l.append(str(node.i)) - return "_".join(l) - - def __hash__ (self): - # Used as a key in a python dictionary. - return self.__str__().__hash__() - - def __eq__ (self, other): - # Used as a key in a python dictionary. - # Nodes are equivalent if they have identical arcs, and each identical arc leads to identical states. - return self.__str__() == other.__str__() - - def sortArcs (self, dValOccur): - self.arcs = collections.OrderedDict(sorted(self.arcs.items(), key=lambda t: dValOccur.get(t[0], 0), reverse=True)) - - def sortArcs2 (self, dValOccur, lArcVal): - self.arcs = collections.OrderedDict(sorted(self.arcs.items(), key=lambda t: dValOccur.get(lArcVal[t[0]], 0), reverse=True)) - - # VERSION 1 ===================================================================================================== - def convToBytes1 (self, nBytesArc, nBytesNodeAddress): - """ - Node scheme: - - Arc length is defined by nBytesArc - - Address length is defined by nBytesNodeAddress - - | Arc | Address of next node | - | | | - /---------------\ /---------------\ /---------------\ /---------------\ /---------------\ /---------------\ - | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - \---------------/ \---------------/ \---------------/ \---------------/ \---------------/ \---------------/ - [...] 
- /---------------\ /---------------\ /---------------\ /---------------\ /---------------\ /---------------\ - | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - \---------------/ \---------------/ \---------------/ \---------------/ \---------------/ \---------------/ - ^ ^ - | | - | | - | \___ if 1, last arc of this node - \_____ if 1, this node is final (only on the first arc) - """ - nArc = len(self.arcs) - nFinalNodeMask = 1 << ((nBytesArc*8)-1) - nFinalArcMask = 1 << ((nBytesArc*8)-2) - if len(self.arcs) == 0: - val = nFinalNodeMask | nFinalArcMask - by = val.to_bytes(nBytesArc, byteorder='big') - by += (0).to_bytes(nBytesNodeAddress, byteorder='big') - return by - by = b"" - for i, arc in enumerate(self.arcs, 1): - val = arc - if i == 1 and self.final: - val = val | nFinalNodeMask - if i == nArc: - val = val | nFinalArcMask - by += val.to_bytes(nBytesArc, byteorder='big') - by += self.arcs[arc].addr.to_bytes(nBytesNodeAddress, byteorder='big') - return by - - def getTxtRepr1 (self, nBytesArc, nBytesNodeAddress, lVal): - nArc = len(self.arcs) - nFinalNodeMask = 1 << ((nBytesArc*8)-1) - nFinalArcMask = 1 << ((nBytesArc*8)-2) - s = "i{:_>10} -- #{:_>10}\n".format(self.i, self.addr) - if len(self.arcs) == 0: - s += " {:<20} {:0>16} i{:_>10} #{:_>10}\n".format("", bin(nFinalNodeMask | nFinalArcMask)[2:], "0", "0") - return s - for i, arc in enumerate(self.arcs, 1): - val = arc - if i == 1 and self.final: - val = val | nFinalNodeMask - if i == nArc: - val = val | nFinalArcMask - s += " {:<20} {:0>16} i{:_>10} #{:_>10}\n".format(lVal[arc], bin(val)[2:], self.arcs[arc].i, self.arcs[arc].addr) - return s - - # VERSION 2 ===================================================================================================== - def convToBytes2 (self, nBytesArc, nBytesNodeAddress): - """ - Node scheme: - - Arc length is defined by nBytesArc - - Address length is defined by nBytesNodeAddress - - | Arc | Address of next node | - | | | - /---------------\ /---------------\ /---------------\ /---------------\ /---------------\ /---------------\ - | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - \---------------/ \---------------/ \---------------/ \---------------/ \---------------/ \---------------/ - [...] 
- /---------------\ /---------------\ /---------------\ /---------------\ /---------------\ /---------------\ - | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - \---------------/ \---------------/ \---------------/ \---------------/ \---------------/ \---------------/ - ^ ^ ^ - | | | - | | \_ if 1, caution, no address: next node is the following node - | \___ if 1, last arc of this node - \_____ if 1, this node is final (only on the first arc) - """ - nArc = len(self.arcs) - nFinalNodeMask = 1 << ((nBytesArc*8)-1) - nFinalArcMask = 1 << ((nBytesArc*8)-2) - nNextNodeMask = 1 << ((nBytesArc*8)-3) - if len(self.arcs) == 0: - val = nFinalNodeMask | nFinalArcMask - by = val.to_bytes(nBytesArc, byteorder='big') - by += (0).to_bytes(nBytesNodeAddress, byteorder='big') - return by - by = b"" - for i, arc in enumerate(self.arcs, 1): - val = arc - if i == 1 and self.final: - val = val | nFinalNodeMask - if i == nArc: - val = val | nFinalArcMask - if (self.pos + 1) == self.arcs[arc].pos and self.i != 0: - val = val | nNextNodeMask - by += val.to_bytes(nBytesArc, byteorder='big') - else: - by += val.to_bytes(nBytesArc, byteorder='big') - by += self.arcs[arc].addr.to_bytes(nBytesNodeAddress, byteorder='big') - return by - - def getTxtRepr2 (self, nBytesArc, nBytesNodeAddress, lVal): - nArc = len(self.arcs) - nFinalNodeMask = 1 << ((nBytesArc*8)-1) - nFinalArcMask = 1 << ((nBytesArc*8)-2) - nNextNodeMask = 1 << ((nBytesArc*8)-3) - s = "i{:_>10} -- #{:_>10}\n".format(self.i, self.addr) - if nArc == 0: - s += " {:<20} {:0>16} i{:_>10} #{:_>10}\n".format("", bin(nFinalNodeMask | nFinalArcMask)[2:], "0", "0") - return s - for i, arc in enumerate(self.arcs, 1): - val = arc - if i == 1 and self.final: - val = val | nFinalNodeMask - if i == nArc: - val = val | nFinalArcMask - if (self.pos + 1) == self.arcs[arc].pos and self.i != 0: - val = val | nNextNodeMask - s += " {:<20} {:0>16}\n".format(lVal[arc], bin(val)[2:], "") - else: - s += " {:<20} {:0>16} i{:_>10} #{:_>10}\n".format(lVal[arc], bin(val)[2:], self.arcs[arc].i, self.arcs[arc].addr) - return s - - # VERSION 3 ===================================================================================================== - def convToBytes3 (self, nBytesArc, nBytesNodeAddress, nBytesOffset): - """ - Node scheme: - - Arc length is defined by nBytesArc - - Address length is defined by nBytesNodeAddress - - Offset length is defined by nBytesOffset - - | Arc | Address of next node or offset to next node | - | | | - /---------------\ /---------------\ /---------------\ /---------------\ /---------------\ /---------------\ - |1|0|0| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - \---------------/ \---------------/ \---------------/ \---------------/ \---------------/ \---------------/ - [...] 
- /---------------\ /---------------\ /---------------\ - |0|0|1| | | | | | | | | | | | | | | | | | | | | | | | Offsets are shorter than addresses - \---------------/ \---------------/ \---------------/ - /---------------\ /---------------\ /---------------\ /---------------\ /---------------\ /---------------\ - |0|1|0| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - \---------------/ \---------------/ \---------------/ \---------------/ \---------------/ \---------------/ - - ^ ^ ^ - | | | - | | \_ if 1, offset instead of address of next node - | \___ if 1, last arc of this node - \_____ if 1, this node is final (only on the first arc) - """ - nArc = len(self.arcs) - nFinalNodeMask = 1 << ((nBytesArc*8)-1) - nFinalArcMask = 1 << ((nBytesArc*8)-2) - nNextNodeMask = 1 << ((nBytesArc*8)-3) - nMaxOffset = (2 ** (nBytesOffset * 8)) - 1 - if nArc == 0: - val = nFinalNodeMask | nFinalArcMask - by = val.to_bytes(nBytesArc, byteorder='big') - by += (0).to_bytes(nBytesNodeAddress, byteorder='big') - return by - by = b"" - for i, arc in enumerate(self.arcs, 1): - val = arc - if i == 1 and self.final: - val = val | nFinalNodeMask - if i == nArc: - val = val | nFinalArcMask - if 1 < (self.arcs[arc].addr - self.addr) < nMaxOffset and self.i != 0: - val = val | nNextNodeMask - by += val.to_bytes(nBytesArc, byteorder='big') - by += (self.arcs[arc].addr-self.addr).to_bytes(nBytesOffset, byteorder='big') - else: - by += val.to_bytes(nBytesArc, byteorder='big') - by += self.arcs[arc].addr.to_bytes(nBytesNodeAddress, byteorder='big') - return by - - def getTxtRepr3 (self, nBytesArc, nBytesNodeAddress, nBytesOffset, lVal): - nArc = len(self.arcs) - nFinalNodeMask = 1 << ((nBytesArc*8)-1) - nFinalArcMask = 1 << ((nBytesArc*8)-2) - nNextNodeMask = 1 << ((nBytesArc*8)-3) - nMaxOffset = (2 ** (nBytesOffset * 8)) - 1 - s = "i{:_>10} -- #{:_>10} ({})\n".format(self.i, self.addr, self.size) - if nArc == 0: - s += " {:<20} {:0>16} i{:_>10} #{:_>10}\n".format("", bin(nFinalNodeMask | nFinalArcMask)[2:], "0", "0") - return s - for i, arc in enumerate(self.arcs, 1): - val = arc - if i == 1 and self.final: - val = val | nFinalNodeMask - if i == nArc: - val = val | nFinalArcMask - if 1 < (self.arcs[arc].addr - self.addr) < nMaxOffset and self.i != 0: - val = val | nNextNodeMask - s += " {:<20} {:0>16} i{:_>10} +{:_>10}\n".format(lVal[arc], bin(val)[2:], self.arcs[arc].i, self.arcs[arc].addr - self.addr) - else: - s += " {:<20} {:0>16} i{:_>10} #{:_>10}\n".format(lVal[arc], bin(val)[2:], self.arcs[arc].i, self.arcs[arc].addr) - return s - - - -# Another attempt to sort node arcs - -_dCharOrder = { - # key: previous char, value: dictionary of chars {c: nValue} - "": {} -} - - -def addWordToCharDict (sWord): - cPrevious = "" - for cChar in sWord: - if cPrevious not in _dCharOrder: - _dCharOrder[cPrevious] = {} - _dCharOrder[cPrevious][cChar] = _dCharOrder[cPrevious].get(cChar, 0) + 1 - cPrevious = cChar - - -def getCharOrderAfterChar (cChar): - return _dCharOrder.get(cChar, None) - - -def displayCharOrder (): - for key, value in _dCharOrder.items(): - print("[" + key + "]: ", ", ".join([ c+":"+str(n) for c, n in sorted(value.items(), key=lambda t: t[1], reverse=True) ])) DELETED gc_core/py/echo.py Index: gc_core/py/echo.py ================================================================== --- gc_core/py/echo.py +++ /dev/null @@ -1,29 +0,0 @@ -#!python3 - -# The most boring yet indispensable function: print! 
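
All three node schemes documented above reserve the high bits of each arc for flags and pack the rest big-endian. A version-1-style round trip as a sketch, with hypothetical 2-byte arcs and 3-byte addresses (version 1 uses only the two flag bits shown here; versions 2 and 3 spend a third bit on the offset/next-node shortcut):

    N_BYTES_ARC = 2
    N_BYTES_ADDR = 3
    nFinalNodeMask = 1 << (N_BYTES_ARC * 8 - 1)     # this node is final
    nFinalArcMask = 1 << (N_BYTES_ARC * 8 - 2)      # last arc of this node
    nArcMask = nFinalArcMask - 1                    # remaining low bits carry the arc value

    def packArc (nVal, nAddr, bNodeFinal, bLastArc):
        n = nVal
        if bNodeFinal:
            n |= nFinalNodeMask
        if bLastArc:
            n |= nFinalArcMask
        return n.to_bytes(N_BYTES_ARC, "big") + nAddr.to_bytes(N_BYTES_ADDR, "big")

    def unpackArc (by, i):
        nRaw = int.from_bytes(by[i:i+N_BYTES_ARC], "big")
        nAddr = int.from_bytes(by[i+N_BYTES_ARC:i+N_BYTES_ARC+N_BYTES_ADDR], "big")
        return (nRaw & nArcMask, nAddr, bool(nRaw & nFinalNodeMask), bool(nRaw & nFinalArcMask))

    by = packArc(42, 0x01F3A0, True, True)
    assert unpackArc(by, 0) == (42, 0x01F3A0, True, True)
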
- - -import sys - - -_CHARMAP = str.maketrans({ 'œ': 'ö', 'Œ': 'Ö', 'ʳ': "r", 'ᵉ': "e", '…': "_", \ - '“': '"', '”': '"', '„': '"', '‘': "'", '’': "'", \ - 'ā': 'â', 'Ā': 'Â', 'ē': 'ê', 'Ē': 'Ê', 'ī': 'î', 'Ī': 'Î', \ - 'ō': 'ô', 'Ō': 'Ô', 'ū': 'û', 'Ū': 'Û', 'Ÿ': 'Y', \ - 'ś': 's', 'ŝ': 's', \ - '—': '-', '–': '-' - }) - - -def echo (obj, sep=' ', end='\n', file=sys.stdout, flush=False): - """ Print for Windows to avoid Python crashes. - Encoding depends on Windows locale. No useful standard. - Always returns True (useful for debugging).""" - if sys.platform != "win32": - print(obj, sep=sep, end=end, file=file, flush=flush) - return True - try: - print(str(obj).translate(_CHARMAP), sep=sep, end=end, file=file, flush=flush) - except: - print(str(obj).encode('ascii', 'replace').decode('ascii', 'replace'), sep=sep, end=end, file=file, flush=flush) - return True DELETED gc_core/py/ibdawg.py Index: gc_core/py/ibdawg.py ================================================================== --- gc_core/py/ibdawg.py +++ /dev/null @@ -1,720 +0,0 @@ -#!python3 - -import os -import traceback -import pkgutil -import re -from functools import wraps -import time - -#import logging -#logging.basicConfig(filename="suggestions.log", level=logging.DEBUG) - -from . import str_transform as st -from . import char_player as cp -from .echo import echo - - -def timethis (func): - "decorator for the execution time" - @wraps(func) - def wrapper (*args, **kwargs): - fStart = time.time() - result = func(*args, **kwargs) - fEnd = time.time() - print(func.__name__, fEnd - fStart) - return result - return wrapper - - -class SuggResult: - """Structure for storing, classifying and filtering suggestions""" - - def __init__ (self, sWord, nDistLimit=-1): - self.sWord = sWord - self.sSimplifiedWord = cp.simplifyWord(sWord) - self.nDistLimit = nDistLimit if nDistLimit >= 0 else (len(sWord) // 3) + 1 - self.nMinDist = 1000 - self.aSugg = set() - self.dSugg = { 0: [], 1: [], 2: [] } - - def addSugg (self, sSugg, nDeep=0): - "add a suggestion" - #logging.info((nDeep * " ") + "__" + sSugg + "__") - if sSugg not in self.aSugg: - nDist = st.distanceDamerauLevenshtein(self.sSimplifiedWord, cp.simplifyWord(sSugg)) - if nDist <= self.nDistLimit: - if nDist not in self.dSugg: - self.dSugg[nDist] = [] - self.dSugg[nDist].append(sSugg) - self.aSugg.add(sSugg) - if nDist < self.nMinDist: - self.nMinDist = nDist - self.nDistLimit = min(self.nDistLimit, self.nMinDist+2) - - def getSuggestions (self, nSuggLimit=10, nDistLimit=-1): - "return a list of suggestions" - lRes = [] - if self.dSugg[0]: - # we sort the better results with the original word - self.dSugg[0].sort(key=lambda sSugg: st.distanceDamerauLevenshtein(self.sWord, sSugg)) - for lSugg in self.dSugg.values(): - lRes.extend(lSugg) - if len(lRes) > nSuggLimit: - break - lRes = list(cp.filterSugg(lRes)) - if self.sWord.istitle(): - lRes = list(map(lambda sSugg: sSugg.title(), lRes)) - elif self.sWord.isupper(): - lRes = list(map(lambda sSugg: sSugg.upper(), lRes)) - return lRes[:nSuggLimit] - - def reset (self): - self.aSugg.clear() - self.dSugg.clear() - - -class IBDAWG: - """INDEXABLE BINARY DIRECT ACYCLIC WORD GRAPH""" - - def __init__ (self, sDicName): - self.by = pkgutil.get_data(__package__, "_dictionaries/" + sDicName) - if not self.by: - raise OSError("# Error. File not found or not loadable: "+sDicName) - - if self.by[0:7] != b"/pyfsa/": - raise TypeError("# Error. Not a pyfsa binary dictionary. 
Header: {}".format(self.by[0:9])) - if not(self.by[7:8] == b"1" or self.by[7:8] == b"2" or self.by[7:8] == b"3"): - raise ValueError("# Error. Unknown dictionary version: {}".format(self.by[7:8])) - try: - header, info, values, bdic = self.by.split(b"\0\0\0\0", 3) - except Exception: - raise Exception - - self.sName = sDicName - self.nVersion = int(self.by[7:8].decode("utf-8")) - self.sHeader = header.decode("utf-8") - self.lArcVal = values.decode("utf-8").split("\t") - self.nArcVal = len(self.lArcVal) - self.byDic = bdic - - l = info.decode("utf-8").split("/") - self.sLang = l[0] - self.nChar = int(l[1]) - self.nBytesArc = int(l[2]) - self.nBytesNodeAddress = int(l[3]) - self.nEntries = int(l[4]) - self.nNode = int(l[5]) - self.nArc = int(l[6]) - self.nAff = int(l[7]) - self.cStemming = l[8] - if self.cStemming == "S": - self.funcStemming = st.changeWordWithSuffixCode - elif self.cStemming == "A": - self.funcStemming = st.changeWordWithAffixCode - else: - self.funcStemming = st.noStemming - self.nTag = self.nArcVal - self.nChar - self.nAff - # to get the value of an arc, to get the char of an arc with its value - self.dChar = {} - for i in range(1, self.nChar): - self.dChar[self.lArcVal[i]] = i - self.dCharVal = { v: k for k, v in self.dChar.items() } - - self._arcMask = (2 ** ((self.nBytesArc * 8) - 3)) - 1 - self._finalNodeMask = 1 << ((self.nBytesArc * 8) - 1) - self._lastArcMask = 1 << ((self.nBytesArc * 8) - 2) - self._addrBitMask = 1 << ((self.nBytesArc * 8) - 3) # version 2 - - self.nBytesOffset = 1 # version 3 - - # Configuring DAWG functions according to nVersion - if self.nVersion == 1: - self.morph = self._morph1 - self.stem = self._stem1 - self._lookupArcNode = self._lookupArcNode1 - self._getArcs = self._getArcs1 - self._writeNodes = self._writeNodes1 - elif self.nVersion == 2: - self.morph = self._morph2 - self.stem = self._stem2 - self._lookupArcNode = self._lookupArcNode2 - self._getArcs = self._getArcs2 - self._writeNodes = self._writeNodes2 - elif self.nVersion == 3: - self.morph = self._morph3 - self.stem = self._stem3 - self._lookupArcNode = self._lookupArcNode3 - self._getArcs = self._getArcs3 - self._writeNodes = self._writeNodes3 - else: - raise ValueError(" # Error: unknown code: {}".format(self.nVersion)) - - self.bOptNumSigle = False - self.bOptNumAtLast = False - - def getInfo (self): - return " Language: {0.sLang:>10} Version: {0.nVersion:>2} Stemming: {0.cStemming}FX\n" \ - " Arcs values: {0.nArcVal:>10,} = {0.nChar:>5,} characters, {0.nAff:>6,} affixes, {0.nTag:>6,} tags\n" \ - " Dictionary: {0.nEntries:>12,} entries, {0.nNode:>11,} nodes, {0.nArc:>11,} arcs\n" \ - " Address size: {0.nBytesNodeAddress:>1} bytes, Arc size: {0.nBytesArc:>1} bytes\n".format(self) - - def writeAsJSObject (self, spfDest, bInJSModule=False, bBinaryDictAsHexString=False): - "write IBDAWG as a JavaScript object in a JavaScript module" - import json - with open(spfDest, "w", encoding="utf-8", newline="\n") as hDst: - if bInJSModule: - hDst.write('// JavaScript\n// Generated data (do not edit)\n\n"use strict";\n\nconst dictionary = ') - hDst.write(json.dumps({ - "sName": self.sName, - "nVersion": self.nVersion, - "sHeader": self.sHeader, - "lArcVal": self.lArcVal, - "nArcVal": self.nArcVal, - # JavaScript is a pile of shit, so Mozilla’s JS parser don’t like file bigger than 4 Mb! - # So, if necessary, we use an hexadecimal string, that we will convert later in Firefox’s extension. 
- # https://github.com/mozilla/addons-linter/issues/1361 - "byDic": self.byDic.hex() if bBinaryDictAsHexString else [ e for e in self.byDic ], - "sLang": self.sLang, - "nChar": self.nChar, - "nBytesArc": self.nBytesArc, - "nBytesNodeAddress": self.nBytesNodeAddress, - "nEntries": self.nEntries, - "nNode": self.nNode, - "nArc": self.nArc, - "nAff": self.nAff, - "cStemming": self.cStemming, - "nTag": self.nTag, - "dChar": self.dChar, - "_arcMask": self._arcMask, - "_finalNodeMask": self._finalNodeMask, - "_lastArcMask": self._lastArcMask, - "_addrBitMask": self._addrBitMask, - "nBytesOffset": self.nBytesOffset - }, ensure_ascii=False)) - if bInJSModule: - hDst.write(";\n\nexports.dictionary = dictionary;\n") - - def isValidToken (self, sToken): - "checks if is valid (if there is hyphens in , is split, each part is checked)" - if self.isValid(sToken): - return True - if "-" in sToken: - if sToken.count("-") > 4: - return True - return all(self.isValid(sWord) for sWord in sToken.split("-")) - return False - - def isValid (self, sWord): - "checks if is valid (different casing tested if the first letter is a capital)" - if not sWord: - return None - if "’" in sWord: # ugly hack - sWord = sWord.replace("’", "'") - if self.lookup(sWord): - return True - if sWord[0:1].isupper(): - if len(sWord) > 1: - if sWord.istitle(): - return self.lookup(sWord.lower()) - if sWord.isupper(): - if self.bOptNumSigle: - return True - return self.lookup(sWord.lower()) or self.lookup(sWord.capitalize()) - return self.lookup(sWord[:1].lower() + sWord[1:]) - else: - return self.lookup(sWord.lower()) - return False - - def lookup (self, sWord): - "returns True if in dictionary (strict verification)" - iAddr = 0 - for c in sWord: - if c not in self.dChar: - return False - iAddr = self._lookupArcNode(self.dChar[c], iAddr) - if iAddr == None: - return False - return bool(int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask) - - def getMorph (self, sWord): - "retrieves morphologies list, different casing allowed" - l = self.morph(sWord) - if sWord[0:1].isupper(): - l.extend(self.morph(sWord.lower())) - if sWord.isupper() and len(sWord) > 1: - l.extend(self.morph(sWord.capitalize())) - return l - - #@timethis - def suggest (self, sWord, nSuggLimit=10): - "returns a set of suggestions for " - sPfx, sWord, sSfx = cp.cut(sWord) - nMaxSwitch = max(len(sWord) // 3, 1) - nMaxDel = len(sWord) // 5 - nMaxHardRepl = max((len(sWord) - 5) // 4, 1) - oSuggResult = SuggResult(sWord) - self._suggest(oSuggResult, sWord, nMaxSwitch=nMaxSwitch, nMaxDel=nMaxDel, nMaxHardRepl=nMaxHardRepl) - if sWord.istitle(): - self._suggest(oSuggResult, sWord.lower(), nMaxSwitch=nMaxSwitch, nMaxDel=nMaxDel, nMaxHardRepl=nMaxHardRepl) - elif sWord.islower(): - self._suggest(oSuggResult, sWord.title(), nMaxSwitch=nMaxSwitch, nMaxDel=nMaxDel, nMaxHardRepl=nMaxHardRepl) - aSugg = oSuggResult.getSuggestions(nSuggLimit) - if sSfx or sPfx: - # we add what we removed - return list(map(lambda sSug: sPfx + sSug + sSfx, aSugg)) - return aSugg - - def _suggest (self, oSuggResult, sRemain, nMaxSwitch=0, nMaxDel=0, nMaxHardRepl=0, nDeep=0, iAddr=0, sNewWord="", bAvoidLoop=False): - # recursive function - #logging.info((nDeep * " ") + sNewWord + ":" + sRemain) - if not sRemain: - if int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask: - oSuggResult.addSugg(sNewWord, nDeep) - for sTail in self._getTails(iAddr): - oSuggResult.addSugg(sNewWord+sTail, nDeep) - return - cCurrent = sRemain[0:1] - 
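
The budgets driving this recursion are the ones computed in suggest() above; they scale with word length so that short words allow few edits. A worked example of the same formulas:

    for sWord in ("pomme", "anticonstitutionnellement"):
        nMaxSwitch = max(len(sWord) // 3, 1)            # adjacent-char switches
        nMaxDel = len(sWord) // 5                       # deletions
        nMaxHardRepl = max((len(sWord) - 5) // 4, 1)    # non-phonetic replacements
        print(sWord, nMaxSwitch, nMaxDel, nMaxHardRepl)
    # pomme 1 1 1
    # anticonstitutionnellement 8 5 5
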
for cChar, jAddr in self._getCharArcs(iAddr): - if cChar in cp.d1to1.get(cCurrent, cCurrent): - self._suggest(oSuggResult, sRemain[1:], nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, jAddr, sNewWord+cChar) - elif not bAvoidLoop and nMaxHardRepl: - self._suggest(oSuggResult, sRemain[1:], nMaxSwitch, nMaxDel, nMaxHardRepl-1, nDeep+1, jAddr, sNewWord+cChar, True) - if not bAvoidLoop: # avoid infinite loop - if len(sRemain) > 1: - if cCurrent == sRemain[1:2]: - # same char, we remove 1 char without adding 1 to - self._suggest(oSuggResult, sRemain[1:], nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord) - else: - # switching chars - if nMaxSwitch: - self._suggest(oSuggResult, sRemain[1:2]+sRemain[0:1]+sRemain[2:], nMaxSwitch-1, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, True) - # delete char - if nMaxDel: - self._suggest(oSuggResult, sRemain[1:], nMaxSwitch, nMaxDel-1, nMaxHardRepl, nDeep+1, iAddr, sNewWord, True) - # Phonetic replacements - for sRepl in cp.get1toXReplacement(sNewWord[-1:], cCurrent, sRemain[1:2]): - self._suggest(oSuggResult, sRepl + sRemain[1:], nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, True) - for sRepl in cp.d2toX.get(sRemain[0:2], ()): - self._suggest(oSuggResult, sRepl + sRemain[2:], nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, True) - # end of word - if len(sRemain) == 2: - for sRepl in cp.dFinal2.get(sRemain, ()): - self._suggest(oSuggResult, sRepl, nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, True) - elif len(sRemain) == 1: - self._suggest(oSuggResult, "", nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, True) # remove last char and go on - for sRepl in cp.dFinal1.get(sRemain, ()): - self._suggest(oSuggResult, sRepl, nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, True) - - #@timethis - def suggest2 (self, sWord, nMaxSugg=10): - "returns a set of suggestions for " - sPfx, sWord, sSfx = cp.cut(sWord) - oSuggResult = SuggResult(sWord) - self._suggest2(oSuggResult) - aSugg = oSuggResult.getSuggestions() - if sSfx or sPfx: - # we add what we removed - return list(map(lambda sSug: sPfx + sSug + sSfx, aSugg)) - return aSugg - - def _suggest2 (self, oSuggResult, nDeep=0, iAddr=0, sNewWord=""): - # recursive function - #logging.info((nDeep * " ") + sNewWord) - if nDeep >= oSuggResult.nDistLimit: - sCleanNewWord = cp.simplifyWord(sNewWord) - if st.distanceSift4(oSuggResult.sCleanWord[:len(sCleanNewWord)], sCleanNewWord) > oSuggResult.nDistLimit: - return - if int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask: - oSuggResult.addSugg(sNewWord, nDeep) - for cChar, jAddr in self._getCharArcsWithPriority(iAddr, oSuggResult.sWord[nDeep:nDeep+1]): - self._suggest2(oSuggResult, nDeep+1, jAddr, sNewWord+cChar) - return - - def _getCharArcs (self, iAddr): - "generator: yield all chars and addresses from node at address " - for nVal, jAddr in self._getArcs(iAddr): - if nVal < self.nChar: - yield (self.dCharVal[nVal], jAddr) - - def _getSimilarCharArcs (self, cChar, iAddr): - "generator: yield similar char of and address of the following node" - for c in cp.d1to1.get(cChar, [cChar]): - if c in self.dChar: - jAddr = self._lookupArcNode(self.dChar[c], iAddr) - if jAddr: - yield (c, jAddr) - - def _getCharArcsWithPriority (self, iAddr, cChar): - if not cChar: - yield from self._getCharArcs(iAddr) - lTuple = list(self._getCharArcs(iAddr)) - lTuple.sort(key=lambda t: 0 if t[0] in cp.d1to1.get(cChar, cChar) else 1) - yield from lTuple - - def _getTails (self, iAddr, 
sTail="", n=2): - "return a list of suffixes ending at a distance of from " - aTails = set() - for nVal, jAddr in self._getArcs(iAddr): - if nVal < self.nChar: - if int.from_bytes(self.byDic[jAddr:jAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask: - aTails.add(sTail + self.dCharVal[nVal]) - if n and not aTails: - aTails.update(self._getTails(jAddr, sTail+self.dCharVal[nVal], n-1)) - return aTails - - def drawPath (self, sWord, iAddr=0): - "show the path taken by in the graph" - c1 = sWord[0:1] if sWord else " " - iPos = -1 - n = 0 - print(c1 + ": ", end="") - for c2, jAddr in self._getCharArcs(iAddr): - print(c2, end="") - if c2 == sWord[0:1]: - iNextNodeAddr = jAddr - iPos = n - n += 1 - if not sWord: - return - if iPos >= 0: - print("\n "+ " " * iPos + "|") - self.drawPath(sWord[1:], iNextNodeAddr) - - def select (self, sPattern=""): - "generator: returns all entries which morphology fits " - zPattern = None - try: - zPattern = re.compile(sPattern) - except: - print("# Error in regex pattern") - traceback.print_exc() - yield from self._select1(zPattern, 0, "") - - # def morph (self, sWord): - # is defined in __init__ - - # VERSION 1 - def _select1 (self, zPattern, iAddr, sWord): - # recursive generator - for nVal, jAddr in self._getArcs1(iAddr): - if nVal < self.nChar: - # simple character - yield from self._select1(zPattern, jAddr, sWord + self.lArcVal[nVal]) - else: - sEntry = sWord + "\t" + self.funcStemming(sWord, self.lArcVal[nVal]) - for nMorphVal, _ in self._getArcs1(jAddr): - if not zPattern or zPattern.search(self.lArcVal[nMorphVal]): - yield sEntry + "\t" + self.lArcVal[nMorphVal] - - def _morph1 (self, sWord): - "returns morphologies of " - iAddr = 0 - for c in sWord: - if c not in self.dChar: - return [] - iAddr = self._lookupArcNode(self.dChar[c], iAddr) - if iAddr == None: - return [] - if (int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask): - l = [] - nRawArc = 0 - while not (nRawArc & self._lastArcMask): - iEndArcAddr = iAddr + self.nBytesArc - nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') - nArc = nRawArc & self._arcMask - if nArc >= self.nChar: - # This value is not a char, this is a stemming code - sStem = ">" + self.funcStemming(sWord, self.lArcVal[nArc]) - # Now , we go to the next node and retrieve all following arcs values, all of them are tags - iAddr2 = int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big') - nRawArc2 = 0 - while not (nRawArc2 & self._lastArcMask): - iEndArcAddr2 = iAddr2 + self.nBytesArc - nRawArc2 = int.from_bytes(self.byDic[iAddr2:iEndArcAddr2], byteorder='big') - l.append(sStem + " " + self.lArcVal[nRawArc2 & self._arcMask]) - iAddr2 = iEndArcAddr2+self.nBytesNodeAddress - iAddr = iEndArcAddr+self.nBytesNodeAddress - return l - return [] - - def _stem1 (self, sWord): - "returns stems list of " - iAddr = 0 - for c in sWord: - if c not in self.dChar: - return [] - iAddr = self._lookupArcNode(self.dChar[c], iAddr) - if iAddr == None: - return [] - if (int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask): - l = [] - nRawArc = 0 - while not (nRawArc & self._lastArcMask): - iEndArcAddr = iAddr + self.nBytesArc - nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') - nArc = nRawArc & self._arcMask - if nArc >= self.nChar: - # This value is not a char, this is a stemming code - l.append(self.funcStemming(sWord, self.lArcVal[nArc])) - iAddr = 
iEndArcAddr+self.nBytesNodeAddress - return l - return [] - - def _lookupArcNode1 (self, nVal, iAddr): - "looks if is an arc at the node at , if yes, returns address of next node else None" - while True: - iEndArcAddr = iAddr+self.nBytesArc - nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') - if nVal == (nRawArc & self._arcMask): - # the value we are looking for - # we return the address of the next node - return int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big') - else: - # value not found - if (nRawArc & self._lastArcMask): - return None - iAddr = iEndArcAddr+self.nBytesNodeAddress - - def _getArcs1 (self, iAddr): - "generator: return all arcs at as tuples of (nVal, iAddr)" - while True: - iEndArcAddr = iAddr+self.nBytesArc - nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') - yield (nRawArc & self._arcMask, int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big')) - if (nRawArc & self._lastArcMask): - break - iAddr = iEndArcAddr+self.nBytesNodeAddress - - def _writeNodes1 (self, spfDest): - "for debugging only" - print(" > Write binary nodes") - with codecs.open(spfDest, 'w', 'utf-8', newline="\n") as hDst: - iAddr = 0 - hDst.write("i{:_>10} -- #{:_>10}\n".format("0", iAddr)) - while iAddr < len(self.byDic): - iEndArcAddr = iAddr+self.nBytesArc - nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') - nArc = nRawArc & self._arcMask - hDst.write(" {:<20} {:0>16} i{:>10} #{:_>10}\n".format(self.lArcVal[nArc], bin(nRawArc)[2:], "?", \ - int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], \ - byteorder='big'))) - iAddr = iEndArcAddr+self.nBytesNodeAddress - if (nRawArc & self._lastArcMask) and iAddr < len(self.byDic): - hDst.write("\ni{:_>10} -- #{:_>10}\n".format("?", iAddr)) - hDst.close() - - # VERSION 2 - def _morph2 (self, sWord): - "returns morphologies of " - iAddr = 0 - for c in sWord: - if c not in self.dChar: - return [] - iAddr = self._lookupArcNode(self.dChar[c], iAddr) - if iAddr == None: - return [] - if (int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask): - l = [] - nRawArc = 0 - while not (nRawArc & self._lastArcMask): - iEndArcAddr = iAddr + self.nBytesArc - nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') - nArc = nRawArc & self._arcMask - if nArc >= self.nChar: - # This value is not a char, this is a stemming code - sStem = ">" + self.funcStemming(sWord, self.lArcVal[nArc]) - # Now , we go to the next node and retrieve all following arcs values, all of them are tags - if not (nRawArc & self._addrBitMask): - iAddr2 = int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big') - else: - # we go to the end of the node - iAddr2 = iEndArcAddr - while not (nRawArc & self._lastArcMask): - nRawArc = int.from_bytes(self.byDic[iAddr2:iAddr2+self.nBytesArc], byteorder='big') - iAddr2 += self.nBytesArc + self.nBytesNodeAddress - nRawArc2 = 0 - while not (nRawArc2 & self._lastArcMask): - iEndArcAddr2 = iAddr2 + self.nBytesArc - nRawArc2 = int.from_bytes(self.byDic[iAddr2:iEndArcAddr2], byteorder='big') - l.append(sStem + " " + self.lArcVal[nRawArc2 & self._arcMask]) - iAddr2 = iEndArcAddr2+self.nBytesNodeAddress if not (nRawArc2 & self._addrBitMask) else iEndArcAddr2 - iAddr = iEndArcAddr+self.nBytesNodeAddress if not (nRawArc & self._addrBitMask) else iEndArcAddr - return l - return [] - - def _stem2 (self, sWord): - 
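
The version-1 readers above (_lookupArcNode1, _getArcs1) simply scan a node's arcs linearly until the last-arc bit is set. A self-contained sketch of that byte walk against a handcrafted two-node graph, with 2-byte arcs and 1-byte addresses chosen only to keep the example small:

    N_ARC, N_ADDR = 2, 1
    nFinalNodeMask = 1 << 15
    nLastArcMask = 1 << 14
    nArcMask = nLastArcMask - 1

    def getArcs (by, iAddr):
        # yield (arc value, next-node address) until the last-arc bit is set
        while True:
            nRaw = int.from_bytes(by[iAddr:iAddr+N_ARC], "big")
            yield (nRaw & nArcMask, by[iAddr+N_ARC])
            if nRaw & nLastArcMask:
                return
            iAddr += N_ARC + N_ADDR

    def lookupArcNode (by, iAddr, nVal):
        for n, iNext in getArcs(by, iAddr):
            if n == nVal:
                return iNext
        return None

    # node at offset 0: arcs 1 and 2 (2 is the last), both pointing at a final leaf at offset 6
    by = (1).to_bytes(2, "big") + bytes([6]) \
       + (nLastArcMask | 2).to_bytes(2, "big") + bytes([6]) \
       + (nFinalNodeMask | nLastArcMask).to_bytes(2, "big") + bytes([0])
    assert lookupArcNode(by, 0, 2) == 6
    assert int.from_bytes(by[6:8], "big") & nFinalNodeMask
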
"returns stems list of " - iAddr = 0 - for c in sWord: - if c not in self.dChar: - return [] - iAddr = self._lookupArcNode(self.dChar[c], iAddr) - if iAddr == None: - return [] - if (int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask): - l = [] - nRawArc = 0 - while not (nRawArc & self._lastArcMask): - iEndArcAddr = iAddr + self.nBytesArc - nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') - nArc = nRawArc & self._arcMask - if nArc >= self.nChar: - # This value is not a char, this is a stemming code - l.append(self.funcStemming(sWord, self.lArcVal[nArc])) - # Now , we go to the next node - if not (nRawArc & self._addrBitMask): - iAddr2 = int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big') - else: - # we go to the end of the node - iAddr2 = iEndArcAddr - while not (nRawArc & self._lastArcMask): - nRawArc = int.from_bytes(self.byDic[iAddr2:iAddr2+self.nBytesArc], byteorder='big') - iAddr2 += self.nBytesArc + self.nBytesNodeAddress - iAddr = iEndArcAddr+self.nBytesNodeAddress if not (nRawArc & self._addrBitMask) else iEndArcAddr - return l - return [] - - def _lookupArcNode2 (self, nVal, iAddr): - "looks if is an arc at the node at , if yes, returns address of next node else None" - while True: - iEndArcAddr = iAddr+self.nBytesArc - nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') - if nVal == (nRawArc & self._arcMask): - # the value we are looking for - if not (nRawArc & self._addrBitMask): - # we return the address of the next node - return int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big') - else: - # we go to the end of the node - iAddr = iEndArcAddr - while not (nRawArc & self._lastArcMask): - nRawArc = int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') - iAddr += self.nBytesArc + self.nBytesNodeAddress if not (nRawArc & self._addrBitMask) else self.nBytesArc - return iAddr - else: - # value not found - if (nRawArc & self._lastArcMask): - return None - iAddr = iEndArcAddr+self.nBytesNodeAddress if not (nRawArc & self._addrBitMask) else iEndArcAddr - - def _writeNodes2 (self, spfDest): - "for debugging only" - print(" > Write binary nodes") - with codecs.open(spfDest, 'w', 'utf-8', newline="\n") as hDst: - iAddr = 0 - hDst.write("i{:_>10} -- #{:_>10}\n".format("0", iAddr)) - while iAddr < len(self.byDic): - iEndArcAddr = iAddr+self.nBytesArc - nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') - nArc = nRawArc & self._arcMask - if not (nRawArc & self._addrBitMask): - iNextNodeAddr = int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big') - hDst.write(" {:<20} {:0>16} i{:>10} #{:_>10}\n".format(self.lArcVal[nArc], bin(nRawArc)[2:], "?", iNextNodeAddr)) - iAddr = iEndArcAddr+self.nBytesNodeAddress - else: - hDst.write(" {:<20} {:0>16}\n".format(self.lArcVal[nArc], bin(nRawArc)[2:])) - iAddr = iEndArcAddr - if (nRawArc & self._lastArcMask): - hDst.write("\ni{:_>10} -- #{:_>10}\n".format("?", iAddr)) - hDst.close() - - # VERSION 3 - def _morph3 (self, sWord): - "returns morphologies of " - iAddr = 0 - for c in sWord: - if c not in self.dChar: - return [] - iAddr = self._lookupArcNode(self.dChar[c], iAddr) - if iAddr == None: - return [] - if (int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask): - l = [] - nRawArc = 0 - iAddrNode = iAddr - while not (nRawArc & self._lastArcMask): - iEndArcAddr = iAddr + 
self.nBytesArc - nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') - nArc = nRawArc & self._arcMask - if nArc >= self.nChar: - # This value is not a char, this is a stemming code - sStem = ">" + self.funcStemming(sWord, self.lArcVal[nArc]) - # Now , we go to the next node and retrieve all following arcs values, all of them are tags - if not (nRawArc & self._addrBitMask): - iAddr2 = int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big') - else: - iAddr2 = iAddrNode + int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesOffset], byteorder='big') - nRawArc2 = 0 - while not (nRawArc2 & self._lastArcMask): - iEndArcAddr2 = iAddr2 + self.nBytesArc - nRawArc2 = int.from_bytes(self.byDic[iAddr2:iEndArcAddr2], byteorder='big') - l.append(sStem + " " + self.lArcVal[nRawArc2 & self._arcMask]) - iAddr2 = iEndArcAddr2+self.nBytesNodeAddress if not (nRawArc2 & self._addrBitMask) else iEndArcAddr2+self.nBytesOffset - iAddr = iEndArcAddr+self.nBytesNodeAddress if not (nRawArc & self._addrBitMask) else iEndArcAddr+self.nBytesOffset - return l - return [] - - def _stem3 (self, sWord): - "returns stems list of " - iAddr = 0 - for c in sWord: - if c not in self.dChar: - return [] - iAddr = self._lookupArcNode(self.dChar[c], iAddr) - if iAddr == None: - return [] - if (int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask): - l = [] - nRawArc = 0 - iAddrNode = iAddr - while not (nRawArc & self._lastArcMask): - iEndArcAddr = iAddr + self.nBytesArc - nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') - nArc = nRawArc & self._arcMask - if nArc >= self.nChar: - # This value is not a char, this is a stemming code - l.append(self.funcStemming(sWord, self.lArcVal[nArc])) - iAddr = iEndArcAddr+self.nBytesNodeAddress if not (nRawArc & self._addrBitMask) else iEndArcAddr+self.nBytesOffset - return l - return [] - - def _lookupArcNode3 (self, nVal, iAddr): - "looks if is an arc at the node at , if yes, returns address of next node else None" - iAddrNode = iAddr - while True: - iEndArcAddr = iAddr+self.nBytesArc - nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') - if nVal == (nRawArc & self._arcMask): - # the value we are looking for - if not (nRawArc & self._addrBitMask): - return int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big') - else: - return iAddrNode + int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesOffset], byteorder='big') - else: - # value not found - if (nRawArc & self._lastArcMask): - return None - iAddr = iEndArcAddr+self.nBytesNodeAddress if not (nRawArc & self._addrBitMask) else iEndArcAddr+self.nBytesOffset - - def _writeNodes3 (self, spfDest): - "for debugging only" - print(" > Write binary nodes") - with codecs.open(spfDest, 'w', 'utf-8', newline="\n") as hDst: - iAddr = 0 - hDst.write("i{:_>10} -- #{:_>10}\n".format("0", iAddr)) - while iAddr < len(self.byDic): - iEndArcAddr = iAddr+self.nBytesArc - nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') - nArc = nRawArc & self._arcMask - if not (nRawArc & self._addrBitMask): - iNextNodeAddr = int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big') - hDst.write(" {:<20} {:0>16} i{:>10} #{:_>10}\n".format(self.lArcVal[nArc], bin(nRawArc)[2:], "?", iNextNodeAddr)) - iAddr = iEndArcAddr+self.nBytesNodeAddress - else: - iNextNodeAddr = 
int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesOffset], byteorder='big') - hDst.write(" {:<20} {:0>16} i{:>10} +{:_>10}\n".format(self.lArcVal[nArc], bin(nRawArc)[2:], "?", iNextNodeAddr)) - iAddr = iEndArcAddr+self.nBytesOffset - if (nRawArc & self._lastArcMask): - hDst.write("\ni{:_>10} -- #{:_>10}\n".format("?", iAddr)) - hDst.close() DELETED gc_core/py/keyboard_chars_proximity.py Index: gc_core/py/keyboard_chars_proximity.py ================================================================== --- gc_core/py/keyboard_chars_proximity.py +++ /dev/null @@ -1,220 +0,0 @@ -# Keyboard chars proximity - - -def getKeyboardMap (sKeyboard): - return _dKeyboardMap.get(sKeyboard.lower(), {}) - - -def getKeyboardList (): - return _dKeyboardMap.keys() - - -_dKeyboardMap = { - # keyboards by alphabetical order - # bépo, colemak and dvorak users are assumed to do less typing errors. - "azerty": { - # fr - # line 1 - "é": "az", - "è": "yu", - "ç": "àio", - "à": "op", - # line 2 - "a": "zéq", - "z": "aesq", - "e": "zrds", - "r": "etfd", - "t": "rygf", - "y": "tuhg", - "u": "yijh", - "i": "uokj", - "o": "iplk", - "p": "oml", - # line 3 - "q": "sawz", - "s": "qdzwxe", - "d": "sfexcr", - "f": "dgrcvt", - "g": "fhtvby", - "h": "gjybnu", - "j": "hkuni", - "k": "jlio", - "l": "kmop", - "m": "lùp", - "ù": "m", - # line 4 - "w": "xqs", - "x": "wcsd", - "c": "xvdf", - "v": "cbfg", - "b": "vngh", - "n": "bhj", - }, - "bépo": { - # fr - # line 2 - "b": "éa", - "é": "bpu", - "p": "éoi", - "o": "pèe", - "è": "o", - "v": "dt", - "d": "vls", - "l": "djr", - "j": "lzn", - "z": "jmw", - # line 3 - "a": "ubà", - "u": "aiéy", - "i": "uepx", - "e": "io", - "c": "t", - "t": "csvq", - "s": "trdg", - "r": "snlh", - "n": "rmjf", - "m": "nzç", - # line 4 - "à": "yêa", - "y": "àxu", - "x": "ywi", - "w": "z", - "k": "c", - "q": "gt", - "g": "qhs", - "h": "gfr", - "f": "hçn", - "ç": "fm", - }, - "colemak": { - # en, us, intl - # line 2 - "q": "wa", - "w": "qfr", - "f": "wps", - "p": "fgt", - "g": "pjd", - "j": "glh", - "l": "jun", - "u": "lye", - "y": "ui", - # line 3 - "a": "rqz", - "r": "aswx", - "s": "rtfc", - "t": "sdpv", - "d": "thgb", - "h": "dnjk", - "n": "helm", - "e": "niu", - "i": "eoy", - "o": "i", - # line 4 - "z": "xa", - "x": "zcr", - "c": "xvs", - "v": "cbt", - "b": "vkd", - "k": "bmh", - "m": "kn", - }, - "dvorak": { - # en, us, intl - # line 2 - "p": "yu", - "y": "pfi", - "f": "ygd", - "g": "fch", - "c": "grt", - "r": "cln", - "l": "rs", - # line 3 - "a": "o", - "o": "aeq", - "e": "ouj", - "u": "eipk", - "i": "udyx", - "d": "ihfb", - "h": "dtgm", - "t": "hncw", - "n": "tsrv", - "s": "nlz", - # line 4 - "q": "jo", - "j": "qke", - "k": "jxu", - "x": "kbi", - "b": "xmd", - "m": "bwh", - "w": "mvt", - "v": "wzn", - "z": "vs", - }, - "qwerty": { - # en, us, intl - # line 2 - "q": "wa", - "w": "qeas", - "e": "wrds", - "r": "etfd", - "t": "rygf", - "y": "tuhg", - "u": "yijh", - "i": "uokj", - "o": "iplk", - "p": "ol", - # line 3 - "a": "sqzw", - "s": "adwzxe", - "d": "sfexcr", - "f": "dgrcvt", - "g": "fhtvby", - "h": "gjybnu", - "j": "hkunmi", - "k": "jlimo", - "l": "kop", - # line 4 - "z": "xas", - "x": "zcsd", - "c": "xvdf", - "v": "cbfg", - "b": "vngh", - "n": "bmhj", - "m": "njk", - }, - "qwertz": { - # ge, au - # line 2 - "q": "wa", - "w": "qeas", - "e": "wrds", - "r": "etfd", - "t": "rzgf", - "z": "tuhg", - "u": "zijh", - "i": "uokj", - "o": "iplk", - "p": "oüöl", - "ü": "päö", - # line 3 - "a": "sqyw", - "s": "adwyxe", - "d": "sfexcr", - "f": "dgrcvt", - "g": "fhtvbz", - "h": "gjzbnu", - "j": "hkunmi", 
- "k": "jlimo", - "l": "köop", - "ö": "läpü", - "ä": "öü", - # line 4 - "y": "xas", - "x": "ycsd", - "c": "xvdf", - "v": "cbfg", - "b": "vngh", - "n": "bmhj", - "m": "njk", - } -} Index: gc_core/py/lang_core/gc_engine.py ================================================================== --- gc_core/py/lang_core/gc_engine.py +++ gc_core/py/lang_core/gc_engine.py @@ -6,12 +6,12 @@ import os import traceback #import unicodedata from itertools import chain -from ..ibdawg import IBDAWG -from ..echo import echo +from ..graphspell.ibdawg import IBDAWG +from ..graphspell.echo import echo from . import gc_options __all__ = [ "lang", "locales", "pkg", "name", "version", "author", \ "load", "parse", "getDictionary", \ DELETED gc_core/py/progressbar.py Index: gc_core/py/progressbar.py ================================================================== --- gc_core/py/progressbar.py +++ /dev/null @@ -1,35 +0,0 @@ -# Textual progressbar -# by Olivier R. -# License: MPL 2 - -import time - -class ProgressBar: - "Textual progressbar" - - def __init__ (self, nMin=0, nMax=100, nWidth=78): - "initiate with minimum nMin to maximum nMax" - self.nMin = nMin - self.nMax = nMax - self.nSpan = nMax - nMin - self.nWidth = nWidth-9 - self.nAdvance = -1 - self.nCurVal = nMin - self.startTime = time.time() - self._update() - - def _update (self): - fDone = ((self.nCurVal - self.nMin) / self.nSpan) - nAdvance = int(fDone * self.nWidth) - if (nAdvance > self.nAdvance): - self.nAdvance = nAdvance - print("\r[ {}{} {}% ] ".format('>'*nAdvance, ' '*(self.nWidth-nAdvance), round(fDone*100)), end="") - - def increment (self, n=1): - "increment value by n (1 by default)" - self.nCurVal += n - self._update() - - def done (self): - "to call when it’s finished" - print("\r[ task done in {:.1f} s ] ".format(time.time() - self.startTime)) DELETED gc_core/py/spellchecker.py Index: gc_core/py/spellchecker.py ================================================================== --- gc_core/py/spellchecker.py +++ /dev/null @@ -1,134 +0,0 @@ -# Spellchecker -# Wrapper for the IBDAWG class. -# Useful to check several dictionaries at once. - -from . 
import ibdawg - - -dDictionaries = { - "fr": "French.bdic", - "en": "English.bdic" -} - - -class Spellchecker (): - - def __init__ (self, sLangCode): - self.sLangCode = sLangCode - self.oMainDic = None - if sLangCode in dDictionaries: - self.oMainDic = ibdawg.IBDAWG(dDictionaries[sLangCode]) - self.lOtherDic = [] - return bool(self.oMainDic) - - - def setMainDictionary (self, sDicName): - try: - self.oMainDic = ibdawg.IBDAWG(sDicName) - return True - except: - print("Error: <" + sDicName + "> not set as main dictionary.") - return False - - def addDictionary (self, sDicName): - try: - self.lOtherDic.append(ibdawg.IBDAWG(sDicName)) - return True - except: - print("Error: <" + sDicName + "> not added to the list.") - return False - - # Return codes: - # 0: invalid - # 1: correct in main dictionary - # 2+: correct in foreign dictionaries - - - # check in the main dictionary only - - def isValidToken (self, sToken): - "(in main dictionary) checks if sToken is valid (if there is hyphens in sToken, sToken is split, each part is checked)" - if self.oMainDic.isValidToken(sToken): - return 1 - return 0 - - def isValid (self, sWord): - "(in main dictionary) checks if sWord is valid (different casing tested if the first letter is a capital)" - if self.oMainDic.isValid(sWord): - return 1 - return 0 - - def lookup (self, sWord): - "(in main dictionary) checks if sWord is in dictionary as is (strict verification)" - if self.oMainDic.lookup(sWord): - return 1 - return 0 - - - # check in all dictionaries - - def isValidTokenAll (self, sToken): - "(in all dictionaries) checks if sToken is valid (if there is hyphens in sToken, sToken is split, each part is checked)" - if self.oMainDic.isValidToken(sToken): - return 1 - for i, oDic in enumerate(self.lOtherDic, 2): - if oDic.isValidToken(sToken): - return i - return 0 - - def isValidAll (self, sWord): - "(in all dictionaries) checks if sWord is valid (different casing tested if the first letter is a capital)" - if self.oMainDic.isValid(sToken): - return 1 - for i, oDic in enumerate(self.lOtherDic, 2): - if oDic.isValid(sToken): - return i - return 0 - - def lookupAll (self, sWord): - "(in all dictionaries) checks if sWord is in dictionary as is (strict verification)" - if self.oMainDic.lookup(sToken): - return 1 - for i, oDic in enumerate(self.lOtherDic, 2): - if oDic.lookup(sToken): - return i - return 0 - - - # check in dictionaries up to level n - - def isValidTokenLevel (self, sToken, nLevel): - "(in dictionaries up to level n) checks if sToken is valid (if there is hyphens in sToken, sToken is split, each part is checked)" - if self.oMainDic.isValidToken(sToken): - return 1 - if nLevel >= 2: - for i, oDic in enumerate(self.lOtherDic, 2): - if oDic.isValidToken(sToken): - return i - if i == nLevel: - break - return 0 - - def isValidLevel (self, sWord, nLevel): - "(in dictionaries up to level n) checks if sWord is valid (different casing tested if the first letter is a capital)" - if self.oMainDic.isValid(sToken): - return 1 - if nLevel >= 2: - for i, oDic in enumerate(self.lOtherDic, 2): - if oDic.isValid(sToken): - return i - if i == nLevel: - break - return 0 - - def lookupLevel (self, sWord, nLevel): - "(in dictionaries up to level n) checks if sWord is in dictionary as is (strict verification)" - if self.oMainDic.lookup(sToken): - return 1 - if nLevel >= 2: - for i, oDic in enumerate(self.lOtherDic, 2): - if oDic.lookup(sToken): - return i - if i == nLevel: - break - return 0 DELETED gc_core/py/str_transform.py Index: gc_core/py/str_transform.py 
================================================================== --- gc_core/py/str_transform.py +++ /dev/null @@ -1,203 +0,0 @@ -#!python3 - - -#### DISTANCE CALCULATIONS - -def longestCommonSubstring (s1, s2): - # http://en.wikipedia.org/wiki/Longest_common_substring_problem - # http://en.wikibooks.org/wiki/Algorithm_implementation/Strings/Longest_common_substring - M = [ [0]*(1+len(s2)) for i in range(1+len(s1)) ] - longest, x_longest = 0, 0 - for x in range(1, 1+len(s1)): - for y in range(1, 1+len(s2)): - if s1[x-1] == s2[y-1]: - M[x][y] = M[x-1][y-1] + 1 - if M[x][y] > longest: - longest = M[x][y] - x_longest = x - else: - M[x][y] = 0 - return s1[x_longest-longest : x_longest] - - -def distanceDamerauLevenshtein (s1, s2): - "distance of Damerau-Levenshtein between and " - # https://fr.wikipedia.org/wiki/Distance_de_Damerau-Levenshtein - d = {} - nLen1 = len(s1) - nLen2 = len(s2) - for i in range(-1, nLen1+1): - d[i, -1] = i + 1 - for j in range(-1, nLen2+1): - d[-1, j] = j + 1 - for i in range(nLen1): - for j in range(nLen2): - nCost = 0 if s1[i] == s2[j] else 1 - d[i, j] = min( - d[i-1, j] + 1, # Deletion - d[i, j-1] + 1, # Insertion - d[i-1, j-1] + nCost, # Substitution - ) - if i and j and s1[i] == s2[j-1] and s1[i-1] == s2[j]: - d[i, j] = min(d[i, j], d[i-2, j-2] + nCost) # Transposition - return d[nLen1-1, nLen2-1] - - -def distanceSift4 (s1, s2, nMaxOffset=5): - "implementation of general Sift4." - # https://siderite.blogspot.com/2014/11/super-fast-and-accurate-string-distance.html - if not s1: - return len(s2) - if not s2: - return len(s1) - nLen1, nLen2 = len(s1), len(s2) - i1, i2 = 0, 0 # Cursors for each string - nLargestCS = 0 # Largest common substring - nLocalCS = 0 # Local common substring - nTrans = 0 # Number of transpositions ('ab' vs 'ba') - lOffset = [] # Offset pair array, for computing the transpositions - - while i1 < nLen1 and i2 < nLen2: - if s1[i1] == s2[i2]: - nLocalCS += 1 - # Check if current match is a transposition - bTrans = False - i = 0 - while i < len(lOffset): - t = lOffset[i] - if i1 <= t[0] or i2 <= t[1]: - bTrans = abs(i2-i1) >= abs(t[1] - t[0]) - if bTrans: - nTrans += 1 - elif not t[2]: - t[2] = True - nTrans += 1 - break - elif i1 > t[1] and i2 > t[0]: - del lOffset[i] - else: - i += 1 - lOffset.append([i1, i2, bTrans]) - else: - nLargestCS += nLocalCS - nLocalCS = 0 - if i1 != i2: - i1 = i2 = min(i1, i2) - for i in range(nMaxOffset): - if i1 + i >= nLen1 and i2 + i >= nLen2: - break - elif i1 + i < nLen1 and s1[i1+i] == s2[i2]: - i1 += i - 1 - i2 -= 1 - break - elif i2 + i < nLen2 and s1[i1] == s2[i2+i]: - i2 += i - 1 - i1 -= 1 - break - i1 += 1 - i2 += 1 - if i1 >= nLen1 or i2 >= nLen2: - nLargestCS += nLocalCS - nLocalCS = 0 - i1 = i2 = min(i1, i2) - nLargestCS += nLocalCS - return round(max(nLen1, nLen2) - nLargestCS + nTrans) - - -def showDistance (s1, s2): - print("Damerau-Levenshtein: " + s1 + "/" + s2 + " = " + distanceDamerauLevenshtein(s1, s2)) - print("Sift4:" + s1 + "/" + s2 + " = " + distanceSift4(s1, s2)) - - - - -#### STEMMING OPERATIONS - -## No stemming - -def noStemming (sFlex, sStem): - return sStem - -def rebuildWord (sFlex, cmd1, cmd2): - if cmd1 == "_": - return sFlex - n, c = cmd1.split(":") - s = s[:n] + c + s[n:] - if cmd2 == "_": - return s - n, c = cmd2.split(":") - return s[:n] + c + s[n:] - - -## Define affixes for stemming - -# Note: 48 is the ASCII code for "0" - - -# Suffix only -def defineSuffixCode (sFlex, sStem): - """ Returns a string defining how to get stem from flexion - "n(sfx)" - with n: a char with 
numeric meaning, "0" = 0, "1" = 1, ... ":" = 10, etc. (See ASCII table.) Says how many letters to strip from flexion. - sfx [optional]: string to add on flexion - Examples: - "0": strips nothing, adds nothing - "1er": strips 1 letter, adds "er" - "2": strips 2 letters, adds nothing - """ - if sFlex == sStem: - return "0" - jSfx = 0 - for i in range(min(len(sFlex), len(sStem))): - if sFlex[i] != sStem[i]: - break - jSfx += 1 - return chr(len(sFlex)-jSfx+48) + sStem[jSfx:] - - -def changeWordWithSuffixCode (sWord, sSfxCode): - if sSfxCode == "0": - return sWord - return sWord[:-(ord(sSfxCode[0])-48)] + sSfxCode[1:] if sSfxCode[0] != '0' else sWord + sSfxCode[1:] - - -# Prefix and suffix - -def defineAffixCode (sFlex, sStem): - """ Returns a string defining how to get stem from flexion. Examples: - "0" if stem = flexion - "stem" if no common substring - "n(pfx)/m(sfx)" - with n and m: chars with numeric meaning, "0" = 0, "1" = 1, ... ":" = 10, etc. (See ASCII table.) Says how many letters to strip from flexion. - pfx [optional]: string to add before the flexion - sfx [optional]: string to add after the flexion - """ - if sFlex == sStem: - return "0" - # is stem a substring of flexion? - n = sFlex.find(sStem) - if n >= 0: - return "{}/{}".format(chr(n+48), chr(len(sFlex)-(len(sStem)+n)+48)) - # no, so we are looking for common substring - sSubs = longestCommonSubstring(sFlex, sStem) - if len(sSubs) > 1: - iPos = sStem.find(sSubs) - sPfx = sStem[:iPos] - sSfx = sStem[iPos+len(sSubs):] - n = sFlex.find(sSubs) - m = len(sFlex) - (len(sSubs)+n) - sAff = "{}/".format(chr(n+48)) if not sPfx else "{}{}/".format(chr(n+48), sPfx) - sAff += chr(m+48) if not sSfx else "{}{}".format(chr(m+48), sSfx) - return sAff - return sStem - - -def changeWordWithAffixCode (sWord, sAffCode): - if sAffCode == "0": - return sWord - if '/' not in sAffCode: - return "# error #" - sPfxCode, sSfxCode = sAffCode.split('/') - sWord = sPfxCode[1:] + sWord[(ord(sPfxCode[0])-48):] - return sWord[:-(ord(sSfxCode[0])-48)] + sSfxCode[1:] if sSfxCode[0] != '0' else sWord + sSfxCode[1:] - DELETED gc_core/py/tokenizer.py Index: gc_core/py/tokenizer.py ================================================================== --- gc_core/py/tokenizer.py +++ /dev/null @@ -1,49 +0,0 @@ -# Very simple tokenizer - -import re - -_PATTERNS = { - "default": - ( - r'(?P/(?:bin|boot|dev|etc|home|lib|mnt|opt|root|sbin|tmp|usr|var|Bureau|Documents|Images|Musique|Public|Téléchargements|Vidéos)(?:/[\w.()-]+)*)', - r'(?P[a-zA-Z]:\\(?:Program Files(?: [(]x86[)]|)|[\w.()]+)(?:\\[\w.()-]+)*)', - r'(?P[.,?!:;…«»“”"()/·]+)', - r'(?P[A-Z][.][A-Z][.](?:[A-Z][.])*)', - r'(?P(?:https?://|www[.]|\w+[@.]\w\w+[@.])\w[\w./?&!%=+*"\'@$#-]+)', - r'(?P[#@][\w-]+)', - r'(?P<\w+.*?>|)', - r'(?P\[/?\w+\])', - r'(?P\d\d?h\d\d\b)', - r'(?P-?\d+(?:[.,]\d+))', - r"(?P\w+(?:[’'`-]\w+)*)" - ), - "fr": - ( - r'(?P/(?:bin|boot|dev|etc|home|lib|mnt|opt|root|sbin|tmp|usr|var|Bureau|Documents|Images|Musique|Public|Téléchargements|Vidéos)(?:/[\w.()-]+)*)', - r'(?P[a-zA-Z]:\\(?:Program Files(?: [(]x86[)]|)|[\w.()]+)(?:\\[\w.()-]+)*)', - r'(?P[.,?!:;…«»“”"()/·]+)', - r'(?P[A-Z][.][A-Z][.](?:[A-Z][.])*)', - r'(?P(?:https?://|www[.]|\w+[@.]\w\w+[@.])\w[\w./?&!%=+*"\'@$#-]+)', - r'(?P[#@][\w-]+)', - r'(?P<\w+.*?>|)', - r'(?P\[/?\w+\])', - r"(?P(?:l|d|n|m|t|s|j|c|ç|lorsqu|puisqu|jusqu|quoiqu|qu)['’`])", - r'(?P\d+(?:er|nd|e|de|ième|ème|eme)\b)', - r'(?P\d\d?h\d\d\b)', - r'(?P-?\d+(?:[.,]\d+|))', - r"(?P\w+(?:[’'`-]\w+)*)" - ) -} - - -class Tokenizer: - - def __init__ (self, sLang): - 
self.sLang = sLang - if sLang not in _PATTERNS: - self.sLang = "default" - self.zToken = re.compile( "(?i)" + '|'.join(sRegex for sRegex in _PATTERNS[sLang]) ) - - def genTokens (self, sText): - for m in self.zToken.finditer(sText): - yield { "sType": m.lastgroup, "sValue": m.group(), "nStart": m.start(), "nEnd": m.end() } Index: gc_lang/fr/modules/tests.py ================================================================== --- gc_lang/fr/modules/tests.py +++ gc_lang/fr/modules/tests.py @@ -5,16 +5,16 @@ import os import re import time -from .. import ibdawg +from ..graphspell.ibdawg import IBDAWG +from ..graphspell.echo import echo from . import gc_engine as gce from . import conj from . import phonet from . import mfsp -from ..echo import echo def _fuckBackslashUTF8 (s): "fuck that shit" return s.replace("\u2019", "'").replace("\u2013", "–").replace("\u2014", "—") @@ -22,11 +22,11 @@ class TestDictionary (unittest.TestCase): @classmethod def setUpClass (cls): - cls.oDic = ibdawg.IBDAWG("French.bdic") + cls.oDic = IBDAWG("French.bdic") def test_lookup (self): for sWord in ["branche", "Émilie"]: self.assertTrue(self.oDic.lookup(sWord), sWord) Index: gc_lang/fr/webext/gce_worker.js ================================================================== --- gc_lang/fr/webext/gce_worker.js +++ gc_lang/fr/webext/gce_worker.js @@ -31,15 +31,15 @@ //console.log("[Worker] GC Engine Worker [start]"); //console.log(self); importScripts("grammalecte/helpers.js"); -importScripts("grammalecte/str_transform.js"); -importScripts("grammalecte/char_player.js"); -importScripts("grammalecte/ibdawg.js"); +importScripts("grammalecte/graphspell/str_transform.js"); +importScripts("grammalecte/graphspell/char_player.js"); +importScripts("grammalecte/graphspell/ibdawg.js"); importScripts("grammalecte/text.js"); -importScripts("grammalecte/tokenizer.js"); +importScripts("grammalecte/graphspell/tokenizer.js"); importScripts("grammalecte/fr/conj.js"); importScripts("grammalecte/fr/mfsp.js"); importScripts("grammalecte/fr/phonet.js"); importScripts("grammalecte/fr/cregex.js"); importScripts("grammalecte/fr/gc_options.js"); Index: grammalecte-cli.py ================================================================== --- grammalecte-cli.py +++ grammalecte-cli.py @@ -7,12 +7,12 @@ import grammalecte.fr as gce import grammalecte.fr.lexicographe as lxg import grammalecte.fr.textformatter as tf import grammalecte.text as txt -import grammalecte.tokenizer as tkz -from grammalecte.echo import echo +import grammalecte.graphspell.tokenizer as tkz +from grammalecte.graphspell.echo import echo _EXAMPLE = "Quoi ? Racontes ! Racontes-moi ! Bon sangg, parles ! Oui. Il y a des menteur partout. " \ "Je suit sidéré par la brutales arrogance de cette homme-là. Quelle salopard ! Un escrocs de la pire espece. " \ "Quant sera t’il châtiés pour ses mensonge ? Merde ! J’en aie marre." 
Index: grammalecte-server.py ================================================================== --- grammalecte-server.py +++ grammalecte-server.py @@ -12,12 +12,12 @@ import grammalecte.fr as gce import grammalecte.fr.lexicographe as lxg import grammalecte.fr.textformatter as tf import grammalecte.text as txt -import grammalecte.tokenizer as tkz -from grammalecte.echo import echo +import grammalecte.graphspell.tokenizer as tkz +from grammalecte.graphspell.echo import echo HOMEPAGE = """ ADDED graphspell-js/char_player.js Index: graphspell-js/char_player.js ================================================================== --- /dev/null +++ graphspell-js/char_player.js @@ -0,0 +1,330 @@ +// list of similar chars +// useful for suggestion mechanism + +${map} + + +var char_player = { + + _dTransChars: new Map([ + ['à', 'a'], ['é', 'e'], ['î', 'i'], ['ô', 'o'], ['û', 'u'], ['ÿ', 'i'], ['y', 'i'], + ['â', 'a'], ['è', 'e'], ['ï', 'i'], ['ö', 'o'], ['ù', 'u'], ['ŷ', 'i'], + ['ä', 'a'], ['ê', 'e'], ['í', 'i'], ['ó', 'o'], ['ü', 'u'], ['ý', 'i'], + ['á', 'a'], ['ë', 'e'], ['ì', 'i'], ['ò', 'o'], ['ú', 'u'], ['ỳ', 'i'], + ['ā', 'a'], ['ē', 'e'], ['ī', 'i'], ['ō', 'o'], ['ū', 'u'], ['ȳ', 'i'], + ['ñ', 'n'], ['k', 'q'], ['w', 'v'], + ['œ', 'oe'], ['æ', 'ae'], + ]), + + simplifyWord: function (sWord) { + // word simplication before calculating distance between words + sWord = sWord.toLowerCase(); + let sNewWord = ""; + let i = 1; + for (let c of sWord) { + let cNew = this._dTransChars.gl_get(c, c); + let cNext = sWord.slice(i, i+1) + if (cNew != this._dTransChars.gl_get(cNext, cNext)) { + sNewWord += cNew; + } + i++; + } + return sNewWord.replace(/eau/g, "o").replace(/au/g, "o").replace(/ai/g, "e").replace(/ei/g, "e").replace(/ph/g, "f"); + }, + + aVowel: new Set("aáàâäāeéèêëēiíìîïīoóòôöōuúùûüūyýỳŷÿȳœæAÁÀÂÄĀEÉÈÊËĒIÍÌÎÏĪOÓÒÔÖŌUÚÙÛÜŪYÝỲŶŸȲŒÆ"), + aConsonant: new Set("bcçdfghjklmnñpqrstvwxzBCÇDFGHJKLMNÑPQRSTVWXZ"), + aDouble: new Set("bcdfjklmnprstzBCDFJKLMNPRSTZ"), // letters that may be used twice successively + + + // Similar chars + + d1to1: new Map([ + ["1", "liîLIÎ"], + ["2", "zZ"], + ["3", "eéèêEÉÈÊ"], + ["4", "aàâAÀÂ"], + ["5", "sgSG"], + ["6", "bdgBDG"], + ["7", "ltLT"], + ["8", "bB"], + ["9", "gbdGBD"], + ["0", "oôOÔ"], + + ["a", "aàâáäæ"], + ["A", "AÀÂÁÄÆ"], + ["à", "aàâáäæ"], + ["À", "AÀÂÁÄÆ"], + ["â", "aàâáäæ"], + ["Â", "AÀÂÁÄÆ"], + ["á", "aàâáäæ"], + ["Á", "AÀÂÁÄÆ"], + ["ä", "aàâáäæ"], + ["Ä", "AÀÂÁÄÆ"], + + ["æ", "æéa"], + ["Æ", "ÆÉA"], + + ["c", "cçskqśŝ"], + ["C", "CÇSKQŚŜ"], + ["ç", "cçskqśŝ"], + ["Ç", "CÇSKQŚŜ"], + + ["e", "eéèêëœ"], + ["E", "EÉÈÊËŒ"], + ["é", "eéèêëœ"], + ["É", "EÉÈÊËŒ"], + ["ê", "eéèêëœ"], + ["Ê", "EÉÈÊËŒ"], + ["è", "eéèêëœ"], + ["È", "EÉÈÊËŒ"], + ["ë", "eéèêëœ"], + ["Ë", "EÉÈÊËŒ"], + + ["g", "gj"], + ["G", "GJ"], + + ["i", "iîïyíìÿ"], + ["I", "IÎÏYÍÌŸ"], + ["î", "iîïyíìÿ"], + ["Î", "IÎÏYÍÌŸ"], + ["ï", "iîïyíìÿ"], + ["Ï", "IÎÏYÍÌŸ"], + ["í", "iîïyíìÿ"], + ["Í", "IÎÏYÍÌŸ"], + ["ì", "iîïyíìÿ"], + ["Ì", "IÎÏYÍÌŸ"], + + ["j", "jg"], + ["J", "JG"], + + ["k", "kcq"], + ["K", "KCQ"], + + ["n", "nñ"], + ["N", "NÑ"], + + ["o", "oôóòöœ"], + ["O", "OÔÓÒÖŒ"], + ["ô", "oôóòöœ"], + ["Ô", "OÔÓÒÖŒ"], + ["ó", "oôóòöœ"], + ["Ó", "OÔÓÒÖŒ"], + ["ò", "oôóòöœ"], + ["Ò", "OÔÓÒÖŒ"], + ["ö", "oôóòöœ"], + ["Ö", "OÔÓÒÖŒ"], + + ["œ", "œoôeéèêë"], + ["Œ", "ŒOÔEÉÈÊË"], + + ["q", "qck"], + ["Q", "QCK"], + + ["s", "sśŝcç"], + ["S", "SŚŜCÇ"], + ["ś", "sśŝcç"], + ["Ś", "SŚŜCÇ"], + ["ŝ", "sśŝcç"], + ["Ŝ", "SŚŜCÇ"], + + ["u", "uûùüú"], + ["U", "UÛÙÜÚ"], + ["û", "uûùüú"], + ["Û", "UÛÙÜÚ"], + ["ù", 
"uûùüú"], + ["Ù", "UÛÙÜÚ"], + ["ü", "uûùüú"], + ["Ü", "UÛÙÜÚ"], + ["ú", "uûùüú"], + ["Ú", "UÛÙÜÚ"], + + ["v", "vw"], + ["V", "VW"], + + ["w", "wv"], + ["W", "WV"], + + ["x", "xck"], + ["X", "XCK"], + + ["y", "yÿiîŷýỳ"], + ["Y", "YŸIÎŶÝỲ"], + ["ÿ", "yÿiîŷýỳ"], + ["Ÿ", "YŸIÎŶÝỲ"], + ["ŷ", "yÿiîŷýỳ"], + ["Ŷ", "YŸIÎŶÝỲ"], + ["ý", "yÿiîŷýỳ"], + ["Ý", "YŸIÎŶÝỲ"], + ["ỳ", "yÿiîŷýỳ"], + ["Ỳ", "YŸIÎŶÝỲ"], + + ["z", "zs"], + ["Z", "ZS"], + ]), + + d1toX: new Map([ + ["æ", ["ae",]], + ["Æ", ["AE",]], + ["b", ["bb",]], + ["B", ["BB",]], + ["c", ["cc", "ss", "qu", "ch"]], + ["C", ["CC", "SS", "QU", "CH"]], + ["d", ["dd",]], + ["D", ["DD",]], + ["é", ["ai", "ei"]], + ["É", ["AI", "EI"]], + ["f", ["ff", "ph"]], + ["F", ["FF", "PH"]], + ["g", ["gu", "ge", "gg", "gh"]], + ["G", ["GU", "GE", "GG", "GH"]], + ["j", ["jj", "dj"]], + ["J", ["JJ", "DJ"]], + ["k", ["qu", "ck", "ch", "cu", "kk", "kh"]], + ["K", ["QU", "CK", "CH", "CU", "KK", "KH"]], + ["l", ["ll",]], + ["L", ["LL",]], + ["m", ["mm", "mn"]], + ["M", ["MM", "MN"]], + ["n", ["nn", "nm", "mn"]], + ["N", ["NN", "NM", "MN"]], + ["o", ["au", "eau"]], + ["O", ["AU", "EAU"]], + ["œ", ["oe", "eu"]], + ["Œ", ["OE", "EU"]], + ["p", ["pp", "ph"]], + ["P", ["PP", "PH"]], + ["q", ["qu", "ch", "cq", "ck", "kk"]], + ["Q", ["QU", "CH", "CQ", "CK", "KK"]], + ["r", ["rr",]], + ["R", ["RR",]], + ["s", ["ss", "sh"]], + ["S", ["SS", "SH"]], + ["t", ["tt", "th"]], + ["T", ["TT", "TH"]], + ["x", ["cc", "ct", "xx"]], + ["X", ["CC", "CT", "XX"]], + ["z", ["ss", "zh"]], + ["Z", ["SS", "ZH"]], + ]), + + get1toXReplacement: function (cPrev, cCur, cNext) { + if (this.aConsonant.has(cCur) && (this.aConsonant.has(cPrev) || this.aConsonant.has(cNext))) { + return []; + } + return this.d1toX.gl_get(cCur, []); + }, + + d2toX: new Map([ + ["am", ["an", "en", "em"]], + ["AM", ["AN", "EN", "EM"]], + ["an", ["am", "en", "em"]], + ["AN", ["AM", "EN", "EM"]], + ["au", ["eau", "o", "ô"]], + ["AU", ["EAU", "O", "Ô"]], + ["em", ["an", "am", "en"]], + ["EM", ["AN", "AM", "EN"]], + ["en", ["an", "am", "em"]], + ["EN", ["AN", "AM", "EM"]], + ["ai", ["ei", "é", "è", "ê", "ë"]], + ["AI", ["EI", "É", "È", "Ê", "Ë"]], + ["ei", ["ai", "é", "è", "ê", "ë"]], + ["EI", ["AI", "É", "È", "Ê", "Ë"]], + ["ch", ["sh", "c", "ss"]], + ["CH", ["SH", "C", "SS"]], + ["ct", ["x", "cc"]], + ["CT", ["X", "CC"]], + ["oa", ["oi",]], + ["OA", ["OI",]], + ["oi", ["oa", "oie"]], + ["OI", ["OA", "OIE"]], + ["ph", ["f",]], + ["PH", ["F",]], + ["qu", ["q", "cq", "ck", "c", "k"]], + ["QU", ["Q", "CQ", "CK", "C", "K"]], + ["ss", ["c", "ç"]], + ["SS", ["C", "Ç"]], + ["un", ["ein",]], + ["UN", ["EIN",]], + ]), + + // End of word + dFinal1: new Map([ + ["a", ["as", "at", "ant", "ah"]], + ["A", ["AS", "AT", "ANT", "AH"]], + ["c", ["ch",]], + ["C", ["CH",]], + ["e", ["et", "er", "ets", "ée", "ez", "ai", "ais", "ait", "ent", "eh"]], + ["E", ["ET", "ER", "ETS", "ÉE", "EZ", "AI", "AIS", "AIT", "ENT", "EH"]], + ["é", ["et", "er", "ets", "ée", "ez", "ai", "ais", "ait"]], + ["É", ["ET", "ER", "ETS", "ÉE", "EZ", "AI", "AIS", "AIT"]], + ["è", ["et", "er", "ets", "ée", "ez", "ai", "ais", "ait"]], + ["È", ["ET", "ER", "ETS", "ÉE", "EZ", "AI", "AIS", "AIT"]], + ["ê", ["et", "er", "ets", "ée", "ez", "ai", "ais", "ait"]], + ["Ê", ["ET", "ER", "ETS", "ÉE", "EZ", "AI", "AIS", "AIT"]], + ["ë", ["et", "er", "ets", "ée", "ez", "ai", "ais", "ait"]], + ["Ë", ["ET", "ER", "ETS", "ÉE", "EZ", "AI", "AIS", "AIT"]], + ["g", ["gh",]], + ["G", ["GH",]], + ["i", ["is", "it", "ie", "in"]], + ["I", ["IS", "IT", "IE", "IN"]], + ["n", ["nt", "nd", "ns", "nh"]], 
+ ["N", ["NT", "ND", "NS", "NH"]], + ["o", ["aut", "ot", "os"]], + ["O", ["AUT", "OT", "OS"]], + ["ô", ["aut", "ot", "os"]], + ["Ô", ["AUT", "OT", "OS"]], + ["ö", ["aut", "ot", "os"]], + ["Ö", ["AUT", "OT", "OS"]], + ["p", ["ph",]], + ["P", ["PH",]], + ["s", ["sh",]], + ["S", ["SH",]], + ["t", ["th",]], + ["T", ["TH",]], + ["u", ["ut", "us", "uh"]], + ["U", ["UT", "US", "UH"]], + ]), + + dFinal2: new Map([ + ["ai", ["aient", "ais", "et"]], + ["AI", ["AIENT", "AIS", "ET"]], + ["an", ["ant", "ent"]], + ["AN", ["ANT", "ENT"]], + ["en", ["ent", "ant"]], + ["EN", ["ENT", "ANT"]], + ["ei", ["ait", "ais"]], + ["EI", ["AIT", "AIS"]], + ["on", ["ons", "ont"]], + ["ON", ["ONS", "ONT"]], + ["oi", ["ois", "oit", "oix"]], + ["OI", ["OIS", "OIT", "OIX"]], + ]), + + + // Préfixes et suffixes + aPfx1: new Set([ + "anti", "archi", "contre", "hyper", "mé", "méta", "im", "in", "ir", "par", "proto", + "pseudo", "pré", "re", "ré", "sans", "sous", "supra", "sur", "ultra" + ]), + + aPfx2: new Set([ + "belgo", "franco", "génito", "gynéco", "médico", "russo" + ]), + + + cut: function (sWord) { + // returns an arry of strings (prefix, trimed_word, suffix) + let m = /^([a-zA-Zà-öÀ-Ö0-9_ø-ÿØ-ßĀ-ʯfi-st]+)(-(?:t-|)(?:ils?|elles|on|je|tu|nous|vous)$)/.exec(sWord); + if (m) { + return ["", m[1], m[2]]; + } + return ["", sWord, ""]; + }, + + // Other functions + filterSugg: function (aSugg) { + return aSugg.filter((sSugg) => { return !sSugg.endsWith("è") && !sSugg.endsWith("È"); }); + } + +} ADDED graphspell-js/helpers.js Index: graphspell-js/helpers.js ================================================================== --- /dev/null +++ graphspell-js/helpers.js @@ -0,0 +1,100 @@ + +// HELPERS +/*jslint esversion: 6*/ +/*global console,require,exports,XMLHttpRequest*/ + +"use strict"; + +// In Firefox, there is no console.log in PromiseWorker, but there is worker.log. +// In Thunderbird, you can’t access to console directly. So it’s required to pass a log function. 
+let funcOutput = null;
+
+var helpers = {
+
+    setLogOutput: function (func) {
+        funcOutput = func;
+    },
+
+    echo: function (obj) {
+        if (funcOutput !== null) {
+            funcOutput(obj);
+        } else {
+            console.log(obj);
+        }
+        return true;
+    },
+
+    logerror: function (e, bStack=false) {
+        let sMsg = "\n" + e.fileName + "\n" + e.name + "\nline: " + e.lineNumber + "\n" + e.message;
+        if (bStack) {
+            sMsg += "\n--- Stack ---\n" + e.stack;
+        }
+        if (funcOutput !== null) {
+            funcOutput(sMsg);
+        } else {
+            console.error(sMsg);
+        }
+    },
+
+    inspect: function (o) {
+        let sMsg = "__inspect__: " + typeof o;
+        for (let sParam in o) {
+            sMsg += "\n" + sParam + ": " + o[sParam];
+        }
+        sMsg += "\n" + JSON.stringify(o) + "\n__end__";
+        this.echo(sMsg);
+    },
+
+    loadFile: function (spf) {
+        // load resources in workers (suggested by Mozilla extensions reviewers)
+        // for more options have a look here: https://gist.github.com/Noitidart/ec1e6b9a593ec7e3efed
+        // if not in workers, use sdk/data.load() instead
+        try {
+            let xRequest;
+            if (typeof XMLHttpRequest !== "undefined") {
+                xRequest = new XMLHttpRequest();
+            } else {
+                // JS sucks again… necessary for Thunderbird
+                let { Cc, Ci } = require("chrome");
+                xRequest = Cc["@mozilla.org/xmlextras/xmlhttprequest;1"].createInstance();
+                xRequest.QueryInterface(Ci.nsIXMLHttpRequest);
+            }
+            xRequest.open('GET', spf, false); // 3rd arg is false for synchronous, sync is acceptable in workers
+            xRequest.overrideMimeType('text/json');
+            xRequest.send();
+            return xRequest.responseText;
+        }
+        catch (e) {
+            this.logerror(e);
+            return null;
+        }
+    },
+
+    // conversions
+    objectToMap: function (obj) {
+        let m = new Map();
+        for (let param in obj) {
+            m.set(param, obj[param]);
+        }
+        return m;
+    },
+
+    mapToObject: function (m) {
+        let obj = {};
+        for (let [k, v] of m) {
+            obj[k] = v;
+        }
+        return obj;
+    }
+};
+
+
+if (typeof(exports) !== 'undefined') {
+    exports.setLogOutput = helpers.setLogOutput;
+    exports.echo = helpers.echo;
+    exports.logerror = helpers.logerror;
+    exports.inspect = helpers.inspect;
+    exports.loadFile = helpers.loadFile;
+    exports.objectToMap = helpers.objectToMap;
+    exports.mapToObject = helpers.mapToObject;
+}

ADDED graphspell-js/ibdawg.js
Index: graphspell-js/ibdawg.js
==================================================================
--- /dev/null
+++ graphspell-js/ibdawg.js
@@ -0,0 +1,513 @@
+//// IBDAWG
+/*jslint esversion: 6*/
+/*global console,require,exports*/
+
+"use strict";
+
+
+if (typeof(require) !== 'undefined') {
+    var str_transform = require("resource://grammalecte/str_transform.js");
+    var helpers = require("resource://grammalecte/helpers.js");
+    var char_player = require("resource://grammalecte/char_player.js");
+}
+
+
+// Don’t remove <string>. Necessary in TB.
+${string}
+${map}
+${set}
+
+
+class SuggResult {
+    // Structure for storing, classifying and filtering suggestions
+
+    constructor (sWord, nDistLimit=-1) {
+        this.sWord = sWord;
+        this.sSimplifiedWord = char_player.simplifyWord(sWord);
+        this.nDistLimit = (nDistLimit >= 0) ? nDistLimit : Math.floor(sWord.length / 3) + 1;
+        this.nMinDist = 1000;
+        this.aSugg = new Set();
+        this.dSugg = new Map([ [0, []], [1, []], [2, []] ]);
+    }
+
+    addSugg (sSugg, nDeep=0) {
+        // add a suggestion
+        if (!this.aSugg.has(sSugg)) {
+            let nDist = str_transform.distanceDamerauLevenshtein(this.sSimplifiedWord, char_player.simplifyWord(sSugg));
+            if (nDist <= this.nDistLimit) {
+                if (!this.dSugg.has(nDist)) {
+                    this.dSugg.set(nDist, []);
+                }
+                this.dSugg.get(nDist).push(sSugg);
+                this.aSugg.add(sSugg);
+                if (nDist < this.nMinDist) {
+                    this.nMinDist = nDist;
+                }
+                this.nDistLimit = Math.min(this.nDistLimit, this.nMinDist+2);
+            }
+        }
+    }
+
+    getSuggestions (nSuggLimit=10, nDistLimit=-1) {
+        // return a list of suggestions
+        let lRes = [];
+        if (this.dSugg.get(0).length) {
+            // sort the best results (distance 0) by their distance to the original word
+            let dDistTemp = new Map();
+            this.dSugg.get(0).forEach((sSugg) => { dDistTemp.set(sSugg, str_transform.distanceDamerauLevenshtein(this.sWord, sSugg)); });
+            this.dSugg.set(0, this.dSugg.get(0).sort((sA, sB) => { return dDistTemp.get(sA) - dDistTemp.get(sB); }));
+            dDistTemp.clear();
+        }
+        for (let lSugg of this.dSugg.values()) {
+            for (let sSugg of lSugg) { lRes.push(sSugg); }
+            if (lRes.length > nSuggLimit) {
+                break;
+            }
+        }
+        lRes = char_player.filterSugg(lRes);
+        if (this.sWord.gl_isTitle()) {
+            lRes = lRes.map((sSugg) => { return sSugg.gl_toCapitalize(); });
+        }
+        else if (this.sWord.gl_isUpperCase()) {
+            lRes = lRes.map((sSugg) => { return sSugg.toUpperCase(); });
+        }
+        return lRes.slice(0, nSuggLimit);
+    }
+
+    reset () {
+        this.aSugg.clear();
+        this.dSugg.clear();
+    }
+}
+
+
+class IBDAWG {
+    // INDEXABLE BINARY DIRECT ACYCLIC WORD GRAPH
+
+    constructor (sDicName, sPath="") {
+        try {
+            let sURL = (sPath !== "") ? sPath + "/" + sDicName : "resource://grammalecte/_dictionaries/"+sDicName;
+            const dict = JSON.parse(helpers.loadFile(sURL));
+            Object.assign(this, dict);
+        }
+        catch (e) {
+            throw Error("# Error. File not found or not loadable.\n" + e.message + "\n");
+        }
+        /*
+            Properties:
+            sName, nVersion, sHeader, lArcVal, nArcVal, byDic, sLang, nChar, nBytesArc, nBytesNodeAddress,
+            nEntries, nNode, nArc, nAff, cStemming, nTag, dChar, _arcMask, _finalNodeMask, _lastArcMask, _addrBitMask, nBytesOffset,
+        */
+
+        /*
+            Bug workaround.
+            Mozilla’s JS parser sucks. Can’t read file bigger than 4 Mb!
+            So we convert huge hexadecimal string to list of numbers…
+            https://github.com/mozilla/addons-linter/issues/1361
+        */
+        let lTemp = [];
+        for (let i = 0;  i < this.byDic.length;  i+=2) {
+            lTemp.push(parseInt(this.byDic.slice(i, i+2), 16));
+        }
+        this.byDic = lTemp;
+        /* end of bug workaround */
+
+        if (!this.sHeader.startsWith("/pyfsa/")) {
+            throw TypeError("# Error. Not a pyfsa binary dictionary. Header: " + this.sHeader);
+        }
+        if (!(this.nVersion == "1" || this.nVersion == "2" || this.nVersion == "3")) {
+            throw RangeError("# Error.
Unknown dictionary version: " + this.nVersion); + } + // to get the value of an arc, to get the char of an arc with its value + this.dChar = helpers.objectToMap(this.dChar); + this.dCharVal = this.dChar.gl_reverse(); + //this.byDic = new Uint8Array(this.byDic); // not quicker, even slower + + if (this.cStemming == "S") { + this.funcStemming = str_transform.getStemFromSuffixCode; + } else if (this.cStemming == "A") { + this.funcStemming = str_transform.getStemFromAffixCode; + } else { + this.funcStemming = str_transform.noStemming; + } + + // Configuring DAWG functions according to nVersion + switch (this.nVersion) { + case 1: + this.morph = this._morph1; + this.stem = this._stem1; + this._lookupArcNode = this._lookupArcNode1; + this._getArcs = this._getArcs1; + this._writeNodes = this._writeNodes1; + break; + case 2: + this.morph = this._morph2; + this.stem = this._stem2; + this._lookupArcNode = this._lookupArcNode2; + this._getArcs = this._getArcs2; + this._writeNodes = this._writeNodes2; + break; + case 3: + this.morph = this._morph3; + this.stem = this._stem3; + this._lookupArcNode = this._lookupArcNode3; + this._getArcs = this._getArcs3; + this._writeNodes = this._writeNodes3; + break; + default: + throw ValueError("# Error: unknown code: " + this.nVersion); + } + //console.log(this.getInfo()); + this.bOptNumSigle = true; + this.bOptNumAtLast = false; + } + + getInfo () { + return ` Language: ${this.sLang} Version: ${this.nVersion} Stemming: ${this.cStemming}FX\n` + + ` Arcs values: ${this.nArcVal} = ${this.nChar} characters, ${this.nAff} affixes, ${this.nTag} tags\n` + + ` Dictionary: ${this.nEntries} entries, ${this.nNode} nodes, ${this.nArc} arcs\n` + + ` Address size: ${this.nBytesNodeAddress} bytes, Arc size: ${this.nBytesArc} bytes\n`; + } + + isValidToken (sToken) { + // checks if sToken is valid (if there is hyphens in sToken, sToken is split, each part is checked) + if (this.isValid(sToken)) { + return true; + } + if (sToken.includes("-")) { + if (sToken.gl_count("-") > 4) { + return true; + } + return sToken.split("-").every(sWord => this.isValid(sWord)); + } + return false; + } + + isValid (sWord) { + // checks if sWord is valid (different casing tested if the first letter is a capital) + if (!sWord) { + return null; + } + if (sWord.includes("’")) { // ugly hack + sWord = sWord.replace("’", "'"); + } + if (this.lookup(sWord)) { + return true; + } + if (sWord.charAt(0).gl_isUpperCase()) { + if (sWord.length > 1) { + if (sWord.gl_isTitle()) { + return !!this.lookup(sWord.toLowerCase()); + } + if (sWord.gl_isUpperCase()) { + if (this.bOptNumSigle) { + return true; + } + return !!(this.lookup(sWord.toLowerCase()) || this.lookup(sWord.gl_toCapitalize())); + } + return !!this.lookup(sWord.slice(0, 1).toLowerCase() + sWord.slice(1)); + } else { + return !!this.lookup(sWord.toLowerCase()); + } + } + return false; + } + + _convBytesToInteger (aBytes) { + // Byte order = Big Endian (bigger first) + let nVal = 0; + let nWeight = (aBytes.length - 1) * 8; + for (let n of aBytes) { + nVal += n << nWeight; + nWeight = nWeight - 8; + } + return nVal; + } + + lookup (sWord) { + // returns true if sWord in dictionary (strict verification) + let iAddr = 0; + for (let c of sWord) { + if (!this.dChar.has(c)) { + return false; + } + iAddr = this._lookupArcNode(this.dChar.get(c), iAddr); + if (iAddr === null) { + return false; + } + } + return Boolean(this._convBytesToInteger(this.byDic.slice(iAddr, iAddr+this.nBytesArc)) & this._finalNodeMask); + } + + getMorph (sWord) { + // retrieves 
morphologies list, different casing allowed + let l = this.morph(sWord); + if (sWord[0].gl_isUpperCase()) { + l = l.concat(this.morph(sWord.toLowerCase())); + if (sWord.gl_isUpperCase() && sWord.length > 1) { + l = l.concat(this.morph(sWord.gl_toCapitalize())); + } + } + return l; + } + + suggest (sWord, nSuggLimit=10) { + // returns a array of suggestions for + let sPfx = ""; + let sSfx = ""; + [sPfx, sWord, sSfx] = char_player.cut(sWord); + let nMaxSwitch = Math.max(Math.floor(sWord.length / 3), 1); + let nMaxDel = Math.floor(sWord.length / 5); + let nMaxHardRepl = Math.max(Math.floor((sWord.length - 5) / 4), 1); + let oSuggResult = new SuggResult(sWord); + this._suggest(oSuggResult, sWord, nMaxSwitch, nMaxDel, nMaxHardRepl); + if (sWord.gl_isTitle()) { + this._suggest(oSuggResult, sWord.toLowerCase(), nMaxSwitch, nMaxDel, nMaxHardRepl); + } + else if (sWord.gl_isLowerCase()) { + this._suggest(oSuggResult, sWord.gl_toCapitalize(), nMaxSwitch, nMaxDel, nMaxHardRepl); + } + let aSugg = oSuggResult.getSuggestions(nSuggLimit); + if (sSfx || sPfx) { + // we add what we removed + return aSugg.map( (sSugg) => { return sPfx + sSugg + sSfx } ); + } + return aSugg; + } + + _suggest (oSuggResult, sRemain, nMaxSwitch=0, nMaxDel=0, nMaxHardRepl=0, nDeep=0, iAddr=0, sNewWord="", bAvoidLoop=false) { + // returns a set of suggestions + // recursive function + if (sRemain == "") { + if (this._convBytesToInteger(this.byDic.slice(iAddr, iAddr+this.nBytesArc)) & this._finalNodeMask) { + oSuggResult.addSugg(sNewWord); + } + for (let sTail of this._getTails(iAddr)) { + oSuggResult.addSugg(sNewWord+sTail); + } + return; + } + let cCurrent = sRemain.slice(0, 1); + for (let [cChar, jAddr] of this._getCharArcs(iAddr)) { + if (char_player.d1to1.gl_get(cCurrent, cCurrent).indexOf(cChar) != -1) { + this._suggest(oSuggResult, sRemain.slice(1), nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, jAddr, sNewWord+cChar); + } + else if (!bAvoidLoop && nMaxHardRepl) { + this._suggest(oSuggResult, sRemain.slice(1), nMaxSwitch, nMaxDel, nMaxHardRepl-1, nDeep+1, jAddr, sNewWord+cChar, true); + } + } + if (!bAvoidLoop) { // avoid infinite loop + if (sRemain.length > 1) { + if (cCurrent == sRemain.slice(1, 2)) { + // same char, we remove 1 char without adding 1 to + this._suggest(oSuggResult, sRemain.slice(1), nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord); + } + else { + // switching chars + if (nMaxSwitch > 0) { + this._suggest(oSuggResult, sRemain.slice(1, 2)+sRemain.slice(0, 1)+sRemain.slice(2), nMaxSwitch-1, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, true); + } + // delete char + if (nMaxDel > 0) { + this._suggest(oSuggResult, sRemain.slice(1), nMaxSwitch, nMaxDel-1, nMaxHardRepl, nDeep+1, iAddr, sNewWord, true); + } + } + // Phonetic replacements + for (let sRepl of char_player.get1toXReplacement(sNewWord.slice(-1), cCurrent, sRemain.slice(1,2))) { + this._suggest(oSuggResult, sRepl + sRemain.slice(1), nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, true); + } + for (let sRepl of char_player.d2toX.gl_get(sRemain.slice(0, 2), [])) { + this._suggest(oSuggResult, sRepl + sRemain.slice(2), nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, true); + } + } + // end of word + if (sRemain.length == 2) { + for (let sRepl of char_player.dFinal2.gl_get(sRemain, [])) { + this._suggest(oSuggResult, sRepl, nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, true); + } + } + else if (sRemain.length == 1) { + this._suggest(oSuggResult, "", nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, 
sNewWord, true); // remove last char and go on + for (let sRepl of char_player.dFinal1.gl_get(sRemain, [])) { + this._suggest(oSuggResult, sRepl, nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, true); + } + } + } + } + + * _getCharArcs (iAddr) { + // generator: yield all chars and addresses from node at address + for (let [nVal, jAddr] of this._getArcs(iAddr)) { + if (nVal < this.nChar) { + yield [this.dCharVal.get(nVal), jAddr]; + } + } + } + + * _getSimilarCharArcs (cChar, iAddr) { + // generator: yield similar char of and address of the following node + for (let c of char_player.d1to1.gl_get(cChar, [cChar])) { + if (this.dChar.has(c)) { + let jAddr = this._lookupArcNode(this.dChar.get(c), iAddr); + if (jAddr) { + yield [c, jAddr]; + } + } + } + } + + _getTails (iAddr, sTail="", n=2) { + // return a list of suffixes ending at a distance of from + let aTails = new Set(); + for (let [nVal, jAddr] of this._getArcs(iAddr)) { + if (nVal < this.nChar) { + if (this._convBytesToInteger(this.byDic.slice(jAddr, jAddr+this.nBytesArc)) & this._finalNodeMask) { + aTails.add(sTail + this.dCharVal.get(nVal)); + } + if (n && aTails.size == 0) { + aTails.gl_update(this._getTails(jAddr, sTail+this.dCharVal.get(nVal), n-1)); + } + } + } + return aTails; + } + + // morph (sWord) { + // is defined in constructor + // } + + // VERSION 1 + _morph1 (sWord) { + // returns morphologies of sWord + let iAddr = 0; + for (let c of sWord) { + if (!this.dChar.has(c)) { + return []; + } + iAddr = this._lookupArcNode(this.dChar.get(c), iAddr); + if (iAddr === null) { + return []; + } + } + if (this._convBytesToInteger(this.byDic.slice(iAddr, iAddr+this.nBytesArc)) & this._finalNodeMask) { + let l = []; + let nRawArc = 0; + while (!(nRawArc & this._lastArcMask)) { + let iEndArcAddr = iAddr + this.nBytesArc; + nRawArc = this._convBytesToInteger(this.byDic.slice(iAddr, iEndArcAddr)); + let nArc = nRawArc & this._arcMask; + if (nArc >= this.nChar) { + // This value is not a char, this is a stemming code + let sStem = ">" + this.funcStemming(sWord, this.lArcVal[nArc]); + // Now , we go to the next node and retrieve all following arcs values, all of them are tags + let iAddr2 = this._convBytesToInteger(this.byDic.slice(iEndArcAddr, iEndArcAddr+this.nBytesNodeAddress)); + let nRawArc2 = 0; + while (!(nRawArc2 & this._lastArcMask)) { + let iEndArcAddr2 = iAddr2 + this.nBytesArc; + nRawArc2 = this._convBytesToInteger(this.byDic.slice(iAddr2, iEndArcAddr2)); + l.push(sStem + " " + this.lArcVal[nRawArc2 & this._arcMask]); + iAddr2 = iEndArcAddr2+this.nBytesNodeAddress; + } + } + iAddr = iEndArcAddr + this.nBytesNodeAddress; + } + return l; + } + return []; + } + + _stem1 (sWord) { + // returns stems list of sWord + let iAddr = 0; + for (let c of sWord) { + if (!this.dChar.has(c)) { + return []; + } + iAddr = this._lookupArcNode(this.dChar.get(c), iAddr); + if (iAddr === null) { + return []; + } + } + if (this._convBytesToInteger(this.byDic.slice(iAddr, iAddr+this.nBytesArc)) & this._finalNodeMask) { + let l = []; + let nRawArc = 0; + while (!(nRawArc & this._lastArcMask)) { + let iEndArcAddr = iAddr + this.nBytesArc; + nRawArc = this._convBytesToInteger(this.byDic.slice(iAddr, iEndArcAddr)); + let nArc = nRawArc & this._arcMask; + if (nArc >= this.nChar) { + // This value is not a char, this is a stemming code + l.push(this.funcStemming(sWord, this.lArcVal[nArc])); + } + iAddr = iEndArcAddr + this.nBytesNodeAddress; + } + return l; + } + return []; + } + + _lookupArcNode1 (nVal, iAddr) { + // looks if nVal is an arc at 
the node at iAddr; if yes, returns the address of the next node, else null
+        while (true) {
+            let iEndArcAddr = iAddr+this.nBytesArc;
+            let nRawArc = this._convBytesToInteger(this.byDic.slice(iAddr, iEndArcAddr));
+            if (nVal == (nRawArc & this._arcMask)) {
+                // the value we are looking for
+                // we return the address of the next node
+                return this._convBytesToInteger(this.byDic.slice(iEndArcAddr, iEndArcAddr+this.nBytesNodeAddress));
+            }
+            else {
+                // value not found
+                if (nRawArc & this._lastArcMask) {
+                    return null;
+                }
+                iAddr = iEndArcAddr + this.nBytesNodeAddress;
+            }
+        }
+    }
+
+    * _getArcs1 (iAddr) {
+        // generator: yields all arcs at <iAddr> as tuples of [nVal, iAddr]
+        while (true) {
+            let iEndArcAddr = iAddr+this.nBytesArc;
+            let nRawArc = this._convBytesToInteger(this.byDic.slice(iAddr, iEndArcAddr));
+            yield [nRawArc & this._arcMask, this._convBytesToInteger(this.byDic.slice(iEndArcAddr, iEndArcAddr+this.nBytesNodeAddress))];
+            if (nRawArc & this._lastArcMask) {
+                break;
+            }
+            iAddr = iEndArcAddr+this.nBytesNodeAddress;
+        }
+    }
+
+    // VERSION 2
+    _morph2 (sWord) {
+        // to do
+    }
+
+    _stem2 (sWord) {
+        // to do
+    }
+
+    _lookupArcNode2 (nVal, iAddr) {
+        // to do
+    }
+
+
+    // VERSION 3
+    _morph3 (sWord) {
+        // to do
+    }
+
+    _stem3 (sWord) {
+        // to do
+    }
+
+    _lookupArcNode3 (nVal, iAddr) {
+        // to do
+    }
+}
+
+
+if (typeof(exports) !== 'undefined') {
+    exports.IBDAWG = IBDAWG;
+}

ADDED graphspell-js/str_transform.js
Index: graphspell-js/str_transform.js
==================================================================
--- /dev/null
+++ graphspell-js/str_transform.js
@@ -0,0 +1,126 @@
+//// STRING TRANSFORMATION
+/*jslint esversion: 6*/
+
+// Note: 48 is the ASCII code for "0"
+
+if (typeof(require) !== 'undefined') {
+    var helpers = require("resource://grammalecte/helpers.js");  // logerror() is used in the catch blocks below
+}
+
+var str_transform = {
+
+    distanceDamerauLevenshtein2: function (s1, s2) {
+        // distance of Damerau-Levenshtein between <s1> and <s2>
+        // https://fr.wikipedia.org/wiki/Distance_de_Damerau-Levenshtein
+        try {
+            let nLen1 = s1.length;
+            let nLen2 = s2.length;
+            let matrix = [];
+            for (let i = 0; i <= nLen1; i++) {
+                matrix[i] = new Array(nLen2 + 1);
+            }
+            for (let i = 0; i <= nLen1; i++) {
+                matrix[i][0] = i;
+            }
+            for (let j = 0; j <= nLen2; j++) {
+                matrix[0][j] = j;
+            }
+            for (let i = 1; i <= nLen1; i++) {
+                for (let j = 1; j <= nLen2; j++) {
+                    let nCost = (s1[i-1] === s2[j-1]) ? 0 : 1;
+                    matrix[i][j] = Math.min(
+                        matrix[i-1][j] + 1,         // Deletion
+                        matrix[i][j-1] + 1,         // Insertion
+                        matrix[i-1][j-1] + nCost    // Substitution
+                    );
+                    if (i > 1 && j > 1 && s1[i-1] === s2[j-2] && s1[i-2] === s2[j-1]) {
+                        matrix[i][j] = Math.min(matrix[i][j], matrix[i-2][j-2] + nCost);  // Transposition
+                    }
+                }
+            }
+            return matrix[nLen1][nLen2];
+        }
+        catch (e) {
+            helpers.logerror(e);
+        }
+    },
+
+    distanceDamerauLevenshtein: function (s1, s2) {
+        // distance of Damerau-Levenshtein between <s1> and <s2>
+        // https://fr.wikipedia.org/wiki/Distance_de_Damerau-Levenshtein
+        try {
+            let nLen1 = s1.length;
+            let nLen2 = s2.length;
+            let INF = nLen1 + nLen2;
+            let matrix = [];
+            let sd = {};
+            for (let i = 0; i < nLen1+2; i++) {
+                matrix[i] = new Array(nLen2+2);
+            }
+            matrix[0][0] = INF;
+            for (let i = 0; i <= nLen1; i++) {
+                matrix[i+1][1] = i;
+                matrix[i+1][0] = INF;
+                sd[s1[i]] = 0;
+            }
+            for (let j = 0; j <= nLen2; j++) {
+                matrix[1][j+1] = j;
+                matrix[0][j+1] = INF;
+                sd[s2[j]] = 0;
+            }
+
+            for (let i = 1; i <= nLen1; i++) {
+                let DB = 0;
+                for (let j = 1; j <= nLen2; j++) {
+                    let i1 = sd[s2[j-1]];
+                    let j1 = DB;
+                    if (s1[i-1] === s2[j-1]) {
+                        matrix[i+1][j+1] = matrix[i][j];
+                        DB = j;
+                    }
+                    else {
+                        matrix[i+1][j+1] = Math.min(matrix[i][j], Math.min(matrix[i+1][j], matrix[i][j+1])) + 1;
+                    }
+                    matrix[i+1][j+1] = Math.min(matrix[i+1][j+1], matrix[i1] ? matrix[i1][j1] + (i-i1-1) + 1 + (j-j1-1) : Infinity);
+                }
+                sd[s1[i-1]] = i;
+            }
+            return matrix[nLen1+1][nLen2+1];
+        }
+        catch (e) {
+            helpers.logerror(e);
+        }
+    },
+
+    showDistance (s1, s2) {
+        console.log(`Distance: ${s1} / ${s2} = ${this.distanceDamerauLevenshtein(s1, s2)}`);
+    },
+
+    getStemFromSuffixCode: function (sFlex, sSfxCode) {
+        // Suffix only. "3er" means: strip 3 chars, then add "er" ("mangées" -> "manger").
+        if (sSfxCode == "0") {
+            return sFlex;
+        }
+        return sSfxCode[0] == '0' ? sFlex + sSfxCode.slice(1) : sFlex.slice(0, -(sSfxCode.charCodeAt(0)-48)) + sSfxCode.slice(1);
+    },
+
+    getStemFromAffixCode: function (sFlex, sAffCode) {
+        // Prefix and suffix
+        if (sAffCode == "0") {
+            return sFlex;
+        }
+        if (!sAffCode.includes("/")) {
+            return "# error #";
+        }
+        let [sPfxCode, sSfxCode] = sAffCode.split('/');
+        sFlex = sPfxCode.slice(1) + sFlex.slice(sPfxCode.charCodeAt(0)-48);
+        return sSfxCode[0] == '0' ? sFlex + sSfxCode.slice(1) : sFlex.slice(0, -(sSfxCode.charCodeAt(0)-48)) + sSfxCode.slice(1);
+    }
+};
+
+
+if (typeof(exports) !== 'undefined') {
+    exports.getStemFromSuffixCode = str_transform.getStemFromSuffixCode;
+    exports.getStemFromAffixCode = str_transform.getStemFromAffixCode;
+    exports.distanceDamerauLevenshtein = str_transform.distanceDamerauLevenshtein;  // used by ibdawg.js
+}

ADDED graphspell-js/tokenizer.js
Index: graphspell-js/tokenizer.js
==================================================================
--- /dev/null
+++ graphspell-js/tokenizer.js
@@ -0,0 +1,105 @@
+// JavaScript
+// Very simple tokenizer
+/*jslint esversion: 6*/
+/*global require,exports*/
+
+"use strict";
+
+
+if (typeof(require) !== 'undefined') {
+    var helpers = require("resource://grammalecte/helpers.js");
+}
+
+
+const aTkzPatterns = {
+    // All regexps must start with ^.
+ "default": + [ + [/^[   \t]+/, 'SPACE'], + [/^\/(?:~|bin|boot|dev|etc|home|lib|mnt|opt|root|sbin|tmp|usr|var|Bureau|Documents|Images|Musique|Public|Téléchargements|Vidéos)(?:\/[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_.()-]+)*/, 'FOLDERUNIX'], + [/^[a-zA-Z]:\\(?:Program Files(?: \(x86\)|)|[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st.()]+)(?:\\[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_.()-]+)*/, 'FOLDERWIN'], + [/^[,.;:!?…«»“”‘’"(){}\[\]/·–—]+/, 'SEPARATOR'], + [/^[A-Z][.][A-Z][.](?:[A-Z][.])*/, 'ACRONYM'], + [/^(?:https?:\/\/|www[.]|[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_-]+[@.][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_-]{2,}[@.])[a-zA-Z0-9][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_.\/?&!%=+*"'@$#-]+/, 'LINK'], + [/^[#@][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_-]+/, 'TAG'], + [/^<[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+.*?>|<\/[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+ *>/, 'HTML'], + [/^\[\/?[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+\]/, 'PSEUDOHTML'], + [/^&\w+;(?:\w+;|)/, 'HTMLENTITY'], + [/^\d\d?h\d\d\b/, 'HOUR'], + [/^-?\d+(?:[.,]\d+|)/, 'NUM'], + [/^[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+(?:[’'`-][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+)*/, 'WORD'] + ], + "fr": + [ + [/^[   \t]+/, 'SPACE'], + [/^\/(?:~|bin|boot|dev|etc|home|lib|mnt|opt|root|sbin|tmp|usr|var|Bureau|Documents|Images|Musique|Public|Téléchargements|Vidéos)(?:\/[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_.()-]+)*/, 'FOLDERUNIX'], + [/^[a-zA-Z]:\\(?:Program Files(?: \(x86\)|)|[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st.()]+)(?:\\[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_.()-]+)*/, 'FOLDERWIN'], + [/^[,.;:!?…«»“”‘’"(){}\[\]/·–—]+/, 'SEPARATOR'], + [/^[A-Z][.][A-Z][.](?:[A-Z][.])*/, 'ACRONYM'], + [/^(?:https?:\/\/|www[.]|[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_-]+[@.][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_-]{2,}[@.])[a-zA-Z0-9][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_.\/?&!%=+*"'@$#-]+/, 'LINK'], + [/^[#@][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_-]+/, 'TAG'], + [/^<[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+.*?>|<\/[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+ *>/, 'HTML'], + [/^\[\/?[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+\]/, 'PSEUDOHTML'], + [/^&\w+;(?:\w+;|)/, 'HTMLENTITY'], + [/^(?:l|d|n|m|t|s|j|c|ç|lorsqu|puisqu|jusqu|quoiqu|qu)['’`]/i, 'ELPFX'], + [/^\d\d?[hm]\d\d\b/, 'HOUR'], + [/^\d+(?:er|nd|e|de|ième|ème|eme)s?\b/, 'ORDINAL'], + [/^-?\d+(?:[.,]\d+|)/, 'NUM'], + [/^[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+(?:[’'`-][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+)*/, 'WORD'] + ] +}; + + +class Tokenizer { + + constructor (sLang) { + this.sLang = sLang; + if (!aTkzPatterns.hasOwnProperty(sLang)) { + this.sLang = "default"; + } + this.aRules = aTkzPatterns[this.sLang]; + } + + * genTokens (sText) { + let m; + let i = 0; + while (sText) { + let nCut = 1; + for (let [zRegex, sType] of this.aRules) { + try { + if ((m = zRegex.exec(sText)) !== null) { + if (sType == 'SEPARATOR') { + for (let c of m[0]) { + yield { "sType": sType, "sValue": c, "nStart": i, "nEnd": i + m[0].length } + } + } else { + yield { "sType": sType, "sValue": m[0], "nStart": i, "nEnd": i + m[0].length } + } + nCut = m[0].length; + break; + } + } + catch (e) { + helpers.logerror(e); + } + } + i += nCut; + sText = sText.slice(nCut); + } + } + + getSpellingErrors (sText, oDict) { + let aSpellErr = []; + for (let oToken of this.genTokens(sText)) { + if (oToken.sType === 'WORD' && !oDict.isValidToken(oToken.sValue)) { + aSpellErr.push(oToken); + } + } + return aSpellErr; + } +} + + +if (typeof(exports) !== 'undefined') { + exports.Tokenizer = Tokenizer; +} ADDED graphspell/char_player.py Index: graphspell/char_player.py ================================================================== --- /dev/null +++ graphspell/char_player.py @@ -0,0 +1,324 @@ +# list of similar chars +# useful for suggestion 
mechanism + +import re + + +_xTransChars = str.maketrans({ + 'à': 'a', 'é': 'e', 'î': 'i', 'ô': 'o', 'û': 'u', 'ÿ': 'i', "y": "i", + 'â': 'a', 'è': 'e', 'ï': 'i', 'ö': 'o', 'ù': 'u', 'ŷ': 'i', + 'ä': 'a', 'ê': 'e', 'í': 'i', 'ó': 'o', 'ü': 'u', 'ý': 'i', + 'á': 'a', 'ë': 'e', 'ì': 'i', 'ò': 'o', 'ú': 'u', 'ỳ': 'i', + 'ā': 'a', 'ē': 'e', 'ī': 'i', 'ō': 'o', 'ū': 'u', 'ȳ': 'i', + 'ñ': 'n', 'k': 'q', 'w': 'v', + 'œ': 'oe', 'æ': 'ae', +}) + +def simplifyWord (sWord): + "word simplication before calculating distance between words" + sWord = sWord.lower().translate(_xTransChars) + sNewWord = "" + for i, c in enumerate(sWord, 1): + if c != sWord[i:i+1]: + sNewWord += c + return sNewWord.replace("eau", "o").replace("au", "o").replace("ai", "e").replace("ei", "e").replace("ph", "f") + + +aVowel = set("aáàâäāeéèêëēiíìîïīoóòôöōuúùûüūyýỳŷÿȳœæAÁÀÂÄĀEÉÈÊËĒIÍÌÎÏĪOÓÒÔÖŌUÚÙÛÜŪYÝỲŶŸȲŒÆ") +aConsonant = set("bcçdfghjklmnñpqrstvwxzBCÇDFGHJKLMNÑPQRSTVWXZ") +aDouble = set("bcdfjklmnprstzBCDFJKLMNPRSTZ") # letters that may be used twice successively + + +# Similar chars + +d1to1 = { + "1": "liîLIÎ", + "2": "zZ", + "3": "eéèêEÉÈÊ", + "4": "aàâAÀÂ", + "5": "sgSG", + "6": "bdgBDG", + "7": "ltLT", + "8": "bB", + "9": "gbdGBD", + "0": "oôOÔ", + + "a": "aàâáäæ", + "A": "AÀÂÁÄÆ", + "à": "aàâáäæ", + "À": "AÀÂÁÄÆ", + "â": "aàâáäæ", + "Â": "AÀÂÁÄÆ", + "á": "aàâáäæ", + "Á": "AÀÂÁÄÆ", + "ä": "aàâáäæ", + "Ä": "AÀÂÁÄÆ", + + "æ": "æéa", + "Æ": "ÆÉA", + + "c": "cçskqśŝ", + "C": "CÇSKQŚŜ", + "ç": "cçskqśŝ", + "Ç": "CÇSKQŚŜ", + + "e": "eéèêëœ", + "E": "EÉÈÊËŒ", + "é": "eéèêëœ", + "É": "EÉÈÊËŒ", + "ê": "eéèêëœ", + "Ê": "EÉÈÊËŒ", + "è": "eéèêëœ", + "È": "EÉÈÊËŒ", + "ë": "eéèêëœ", + "Ë": "EÉÈÊËŒ", + + "g": "gj", + "G": "GJ", + + "i": "iîïyíìÿ", + "I": "IÎÏYÍÌŸ", + "î": "iîïyíìÿ", + "Î": "IÎÏYÍÌŸ", + "ï": "iîïyíìÿ", + "Ï": "IÎÏYÍÌŸ", + "í": "iîïyíìÿ", + "Í": "IÎÏYÍÌŸ", + "ì": "iîïyíìÿ", + "Ì": "IÎÏYÍÌŸ", + + "j": "jg", + "J": "JG", + + "k": "kcq", + "K": "KCQ", + + "n": "nñ", + "N": "NÑ", + + "o": "oôóòöœ", + "O": "OÔÓÒÖŒ", + "ô": "oôóòöœ", + "Ô": "OÔÓÒÖŒ", + "ó": "oôóòöœ", + "Ó": "OÔÓÒÖŒ", + "ò": "oôóòöœ", + "Ò": "OÔÓÒÖŒ", + "ö": "oôóòöœ", + "Ö": "OÔÓÒÖŒ", + + "œ": "œoôeéèêë", + "Œ": "ŒOÔEÉÈÊË", + + "q": "qck", + "Q": "QCK", + + "s": "sśŝcç", + "S": "SŚŜCÇ", + "ś": "sśŝcç", + "Ś": "SŚŜCÇ", + "ŝ": "sśŝcç", + "Ŝ": "SŚŜCÇ", + + "u": "uûùüú", + "U": "UÛÙÜÚ", + "û": "uûùüú", + "Û": "UÛÙÜÚ", + "ù": "uûùüú", + "Ù": "UÛÙÜÚ", + "ü": "uûùüú", + "Ü": "UÛÙÜÚ", + "ú": "uûùüú", + "Ú": "UÛÙÜÚ", + + "v": "vw", + "V": "VW", + + "w": "wv", + "W": "WV", + + "x": "xck", + "X": "XCK", + + "y": "yÿiîŷýỳ", + "Y": "YŸIÎŶÝỲ", + "ÿ": "yÿiîŷýỳ", + "Ÿ": "YŸIÎŶÝỲ", + "ŷ": "yÿiîŷýỳ", + "Ŷ": "YŸIÎŶÝỲ", + "ý": "yÿiîŷýỳ", + "Ý": "YŸIÎŶÝỲ", + "ỳ": "yÿiîŷýỳ", + "Ỳ": "YŸIÎŶÝỲ", + + "z": "zs", + "Z": "ZS", +} + +d1toX = { + "æ": ("ae",), + "Æ": ("AE",), + "b": ("bb",), + "B": ("BB",), + "c": ("cc", "ss", "qu", "ch"), + "C": ("CC", "SS", "QU", "CH"), + "d": ("dd",), + "D": ("DD",), + "é": ("ai", "ei"), + "É": ("AI", "EI"), + "f": ("ff", "ph"), + "F": ("FF", "PH"), + "g": ("gu", "ge", "gg", "gh"), + "G": ("GU", "GE", "GG", "GH"), + "j": ("jj", "dj"), + "J": ("JJ", "DJ"), + "k": ("qu", "ck", "ch", "cu", "kk", "kh"), + "K": ("QU", "CK", "CH", "CU", "KK", "KH"), + "l": ("ll",), + "L": ("LL",), + "m": ("mm", "mn"), + "M": ("MM", "MN"), + "n": ("nn", "nm", "mn"), + "N": ("NN", "NM", "MN"), + "o": ("au", "eau"), + "O": ("AU", "EAU"), + "œ": ("oe", "eu"), + "Œ": ("OE", "EU"), + "p": ("pp", "ph"), + "P": ("PP", "PH"), + "q": ("qu", "ch", "cq", "ck", "kk"), + "Q": ("QU", "CH", 
"CQ", "CK", "KK"), + "r": ("rr",), + "R": ("RR",), + "s": ("ss", "sh"), + "S": ("SS", "SH"), + "t": ("tt", "th"), + "T": ("TT", "TH"), + "x": ("cc", "ct", "xx"), + "X": ("CC", "CT", "XX"), + "z": ("ss", "zh"), + "Z": ("SS", "ZH"), +} + + +def get1toXReplacement (cPrev, cCur, cNext): + if cCur in aConsonant and (cPrev in aConsonant or cNext in aConsonant): + return () + return d1toX.get(cCur, ()) + + +d2toX = { + "am": ("an", "en", "em"), + "AM": ("AN", "EN", "EM"), + "an": ("am", "en", "em"), + "AN": ("AM", "EN", "EM"), + "au": ("eau", "o", "ô"), + "AU": ("EAU", "O", "Ô"), + "em": ("an", "am", "en"), + "EM": ("AN", "AM", "EN"), + "en": ("an", "am", "em"), + "EN": ("AN", "AM", "EM"), + "ai": ("ei", "é", "è", "ê", "ë"), + "AI": ("EI", "É", "È", "Ê", "Ë"), + "ei": ("ai", "é", "è", "ê", "ë"), + "EI": ("AI", "É", "È", "Ê", "Ë"), + "ch": ("sh", "c", "ss"), + "CH": ("SH", "C", "SS"), + "ct": ("x", "cc"), + "CT": ("X", "CC"), + "oa": ("oi",), + "OA": ("OI",), + "oi": ("oa", "oie"), + "OI": ("OA", "OIE"), + "ph": ("f",), + "PH": ("F",), + "qu": ("q", "cq", "ck", "c", "k"), + "QU": ("Q", "CQ", "CK", "C", "K"), + "ss": ("c", "ç"), + "SS": ("C", "Ç"), + "un": ("ein",), + "UN": ("EIN",), +} + + +# End of word + +dFinal1 = { + "a": ("as", "at", "ant", "ah"), + "A": ("AS", "AT", "ANT", "AH"), + "c": ("ch",), + "C": ("CH",), + "e": ("et", "er", "ets", "ée", "ez", "ai", "ais", "ait", "ent", "eh"), + "E": ("ET", "ER", "ETS", "ÉE", "EZ", "AI", "AIS", "AIT", "ENT", "EH"), + "é": ("et", "er", "ets", "ée", "ez", "ai", "ais", "ait"), + "É": ("ET", "ER", "ETS", "ÉE", "EZ", "AI", "AIS", "AIT"), + "è": ("et", "er", "ets", "ée", "ez", "ai", "ais", "ait"), + "È": ("ET", "ER", "ETS", "ÉE", "EZ", "AI", "AIS", "AIT"), + "ê": ("et", "er", "ets", "ée", "ez", "ai", "ais", "ait"), + "Ê": ("ET", "ER", "ETS", "ÉE", "EZ", "AI", "AIS", "AIT"), + "ë": ("et", "er", "ets", "ée", "ez", "ai", "ais", "ait"), + "Ë": ("ET", "ER", "ETS", "ÉE", "EZ", "AI", "AIS", "AIT"), + "g": ("gh",), + "G": ("GH",), + "i": ("is", "it", "ie", "in"), + "I": ("IS", "IT", "IE", "IN"), + "n": ("nt", "nd", "ns", "nh"), + "N": ("NT", "ND", "NS", "NH"), + "o": ("aut", "ot", "os"), + "O": ("AUT", "OT", "OS"), + "ô": ("aut", "ot", "os"), + "Ô": ("AUT", "OT", "OS"), + "ö": ("aut", "ot", "os"), + "Ö": ("AUT", "OT", "OS"), + "p": ("ph",), + "P": ("PH",), + "s": ("sh",), + "S": ("SH",), + "t": ("th",), + "T": ("TH",), + "u": ("ut", "us", "uh"), + "U": ("UT", "US", "UH"), +} + +dFinal2 = { + "ai": ("aient", "ais", "et"), + "AI": ("AIENT", "AIS", "ET"), + "an": ("ant", "ent"), + "AN": ("ANT", "ENT"), + "en": ("ent", "ant"), + "EN": ("ENT", "ANT"), + "ei": ("ait", "ais"), + "EI": ("AIT", "AIS"), + "on": ("ons", "ont"), + "ON": ("ONS", "ONT"), + "oi": ("ois", "oit", "oix"), + "OI": ("OIS", "OIT", "OIX"), +} + + +# Préfixes et suffixes + +aPfx1 = frozenset([ + "anti", "archi", "contre", "hyper", "mé", "méta", "im", "in", "ir", "par", "proto", + "pseudo", "pré", "re", "ré", "sans", "sous", "supra", "sur", "ultra" +]) +aPfx2 = frozenset([ + "belgo", "franco", "génito", "gynéco", "médico", "russo" +]) + + +_zMotAvecPronom = re.compile("^(?i)(\\w+)(-(?:t-|)(?:ils?|elles?|on|je|tu|nous|vous))$") + +def cut (sWord): + "returns a tuple of strings (prefix, trimed_word, suffix)" + m = _zMotAvecPronom.search(sWord) + if m: + return ("", m.group(1), m.group(2)) + return ("", sWord, "") + + +# Other functions + +def filterSugg (aSugg): + "exclude suggestions" + return filter(lambda sSugg: not sSugg.endswith(("è", "È")), aSugg) ADDED graphspell/dawg.py Index: graphspell/dawg.py 
================================================================== --- /dev/null +++ graphspell/dawg.py @@ -0,0 +1,775 @@ +#!python3 + +# FSA DICTIONARY BUILDER +# +# by Olivier R. +# License: MPL 2 +# +# This tool encodes lexicon into an indexable binary dictionary +# Input files MUST be encoded in UTF-8. + + +import sys +import os +import collections + +from . import str_transform as st +from .progressbar import ProgressBar + + + +def readFile (spf): + print(" < Read lexicon: " + spf) + if os.path.isfile(spf): + with open(spf, "r", encoding="utf-8") as hSrc: + for sLine in hSrc: + sLine = sLine.strip() + if sLine and not sLine.startswith("#"): + yield sLine + else: + raise OSError("# Error. File not found or not loadable: " + spf) + + +def getElemsFromFile (spf): + "returns tuple of (flexion, stem, tags) from lexicon file" + nErr = 0 + if not spf.endswith(".clex"): + for sLine in readFile(spf): + try: + sFlex, sStem, sTag = sLine.split("\t") + yield (sFlex, sStem, sTag) + except: + nErr += 1 + else: + sTag = "_" # neutral tag + sTag2 = "" + for sLine in readFile(spf): + if sLine.startswith("[") and sLine.endswith("]"): + # tag line + if "-->" in sLine: + try: + sTag, sSfxCode, sTag2 = sLine[1:-1].split(" --> ") + except: + nErr += 1 + continue + sTag = sTag.strip() + sSfxCode = sSfxCode.strip() + sTag2 = sTag2.strip() + else: + sTag = sLine[1:-1] + sTag2 = "" + else: + # entry line + if "\t" in sLine: + if sLine.count("\t") > 1: + nErr += 1 + continue + sFlex, sStem = sLine.split("\t") + else: + sFlex = sStem = sLine + #print(sFlex, sStem, sTag) + yield (sFlex, sStem, sTag) + if sTag2: + sFlex2 = st.changeWordWithSuffixCode(sFlex, sSfxCode) + #print(sFlex2, sStem, sTag2) + yield (sFlex2, sStem, sTag2) + if nErr: + print(" # Lines ignored: {:>10}".format(nErr)) + + + +class DAWG: + """DIRECT ACYCLIC WORD GRAPH""" + # This code is inspired from Steve Hanov’s DAWG, 2011. (http://stevehanov.ca/blog/index.php?id=115) + # We store suffix/affix codes and tags within the graph after the “real” word. + # A word is a list of numbers [ c1, c2, c3 . . . cN, iAffix, iTags] + # Each arc is an index in self.lArcVal, where are stored characters, suffix/affix codes for stemming and tags. + # Important: As usual, the last node (after ‘iTags’) is tagged final, AND the node after ‘cN’ is ALSO tagged final. + + def __init__ (self, spfSrc, sLangName, cStemming): + print("===== Direct Acyclic Word Graph - Minimal Acyclic Finite State Automaton =====") + cStemming = cStemming.upper() + if cStemming == "A": + funcStemmingGen = st.defineAffixCode + elif cStemming == "S": + funcStemmingGen = st.defineSuffixCode + elif cStemming == "N": + funcStemmingGen = st.noStemming + else: + raise ValueError("# Error. 
Unknown stemming code: {}".format(cStemming)) + + lEntry = [] + lChar = ['']; dChar = {}; nChar = 1; dCharOccur = {} + lAff = []; dAff = {}; nAff = 0; dAffOccur = {} + lTag = []; dTag = {}; nTag = 0; dTagOccur = {} + nErr = 0 + + # read lexicon + for sFlex, sStem, sTag in getElemsFromFile(spfSrc): + addWordToCharDict(sFlex) + # chars + for c in sFlex: + if c not in dChar: + dChar[c] = nChar + lChar.append(c) + nChar += 1 + dCharOccur[c] = dCharOccur.get(c, 0) + 1 + # affixes to find stem from flexion + aff = funcStemmingGen(sFlex, sStem) + if aff not in dAff: + dAff[aff] = nAff + lAff.append(aff) + nAff += 1 + dAffOccur[aff] = dCharOccur.get(aff, 0) + 1 + # tags + if sTag not in dTag: + dTag[sTag] = nTag + lTag.append(sTag) + nTag += 1 + dTagOccur[sTag] = dTagOccur.get(sTag, 0) + 1 + lEntry.append((sFlex, dAff[aff], dTag[sTag])) + if not lEntry: + raise ValueError("# Error. Empty lexicon") + + # Preparing DAWG + print(" > Preparing list of words") + lVal = lChar + lAff + lTag + lWord = [ [dChar[c] for c in sFlex] + [iAff+nChar] + [iTag+nChar+nAff] for sFlex, iAff, iTag in lEntry ] + lEntry = None + + # Dictionary of arc values occurrency, to sort arcs of each node + dValOccur = dict( [ (dChar[c], dCharOccur[c]) for c in dChar ] \ + + [ (dAff[aff]+nChar, dAffOccur[aff]) for aff in dAff ] \ + + [ (dTag[tag]+nChar+nAff, dTagOccur[tag]) for tag in dTag ] ) + #with open(spfSrc[:-8]+".valuesfreq.txt", 'w', encoding='utf-8') as hFreqDst: # DEBUG + # for iKey, nOcc in sorted(dValOccur.items(), key=lambda t: t[1], reverse=True): + # hFreqDst.write("{}: {}\n".format(lVal[iKey], nOcc)) + # hFreqDst.close() + + self.sFile = spfSrc + self.sLang = sLangName + self.nEntry = len(lWord) + self.aPreviousEntry = [] + DawgNode.resetNextId() + self.oRoot = DawgNode() + self.lUncheckedNodes = [] # list of nodes that have not been checked for duplication. + self.lMinimizedNodes = {} # list of unique nodes that have been checked for duplication. + self.lSortedNodes = [] # version 2 and 3 + self.nNode = 0 + self.nArc = 0 + self.dChar = dChar + self.nChar = len(dChar) + self.nAff = nAff + self.lArcVal = lVal + self.nArcVal = len(lVal) + self.nTag = self.nArcVal - self.nChar - nAff + self.cStemming = cStemming + if cStemming == "A": + self.funcStemming = st.changeWordWithAffixCode + elif cStemming == "S": + self.funcStemming = st.changeWordWithSuffixCode + else: + self.funcStemming = st.noStemming + + # build + lWord.sort() + oProgBar = ProgressBar(0, len(lWord)) + for aEntry in lWord: + self.insert(aEntry) + oProgBar.increment(1) + oProgBar.done() + self.finish() + self.countNodes() + self.countArcs() + self.sortNodes() + self.sortNodeArcs(dValOccur) + #self.sortNodeArcs2 (self.oRoot, "") + self.displayInfo() + + # BUILD DAWG + def insert (self, aEntry): + if aEntry < self.aPreviousEntry: + sys.exit("# Error: Words must be inserted in alphabetical order.") + + # find common prefix between word and previous word + nCommonPrefix = 0 + for i in range(min(len(aEntry), len(self.aPreviousEntry))): + if aEntry[i] != self.aPreviousEntry[i]: + break + nCommonPrefix += 1 + + # Check the lUncheckedNodes for redundant nodes, proceeding from last + # one down to the common prefix size. Then truncate the list at that point. 
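# A worked trace of this common-prefix bookkeeping (hypothetical words; real
# entries are lists of arc values ending with the affix and tag indices):
#   insert "tape"  after ""      -> nCommonPrefix = 0: add arcs t-a-p-e
#   insert "tapes" after "tape"  -> nCommonPrefix = 4: reuse t-a-p-e, add arc s
#   insert "top"   after "tapes" -> nCommonPrefix = 1: _minimize(1) folds the
#       finished a-p-e(-s) branch into lMinimizedNodes, then adds o-p under t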
+ self._minimize(nCommonPrefix) + + # add the suffix, starting from the correct node mid-way through the graph + if len(self.lUncheckedNodes) == 0: + oNode = self.oRoot + else: + oNode = self.lUncheckedNodes[-1][2] + + iChar = nCommonPrefix + for c in aEntry[nCommonPrefix:]: + oNextNode = DawgNode() + oNode.arcs[c] = oNextNode + self.lUncheckedNodes.append((oNode, c, oNextNode)) + if iChar == (len(aEntry) - 2): + oNode.final = True + iChar += 1 + oNode = oNextNode + oNode.final = True + self.aPreviousEntry = aEntry + + def finish (self): + "minimize unchecked nodes" + self._minimize(0) + + def _minimize (self, downTo): + # proceed from the leaf up to a certain point + for i in range( len(self.lUncheckedNodes)-1, downTo-1, -1 ): + oNode, char, oChildNode = self.lUncheckedNodes[i] + if oChildNode in self.lMinimizedNodes: + # replace the child with the previously encountered one + oNode.arcs[char] = self.lMinimizedNodes[oChildNode] + else: + # add the state to the minimized nodes. + self.lMinimizedNodes[oChildNode] = oChildNode + self.lUncheckedNodes.pop() + + def countNodes (self): + self.nNode = len(self.lMinimizedNodes) + + def countArcs (self): + self.nArc = 0 + for oNode in self.lMinimizedNodes: + self.nArc += len(oNode.arcs) + + def sortNodeArcs (self, dValOccur): + print(" > Sort node arcs") + self.oRoot.sortArcs(dValOccur) + for oNode in self.lMinimizedNodes: + oNode.sortArcs(dValOccur) + + def sortNodeArcs2 (self, oNode, cPrevious=""): + # recursive function + dCharOccur = getCharOrderAfterChar(cPrevious) + if dCharOccur: + oNode.sortArcs2(dCharOccur, self.lArcVal) + for nArcVal, oNextNode in oNode.arcs.items(): + self.sortNodeArcs2(oNextNode, self.lArcVal[nArcVal]) + + def sortNodes (self): + print(" > Sort nodes") + for oNode in self.oRoot.arcs.values(): + self._parseNodes(oNode) + + def _parseNodes (self, oNode): + # Warning: recursive method + if oNode.pos > 0: + return + oNode.setPos() + self.lSortedNodes.append(oNode) + for oNextNode in oNode.arcs.values(): + self._parseNodes(oNextNode) + + def lookup (self, sWord): + oNode = self.oRoot + for c in sWord: + if self.dChar.get(c, '') not in oNode.arcs: + return False + oNode = oNode.arcs[self.dChar[c]] + return oNode.final + + def morph (self, sWord): + oNode = self.oRoot + for c in sWord: + if self.dChar.get(c, '') not in oNode.arcs: + return '' + oNode = oNode.arcs[self.dChar[c]] + if oNode.final: + s = "* " + for arc in oNode.arcs: + if arc >= self.nChar: + s += " [" + self.funcStemming(sWord, self.lArcVal[arc]) + oNode2 = oNode.arcs[arc] + for arc2 in oNode2.arcs: + s += " / " + self.lArcVal[arc2] + s += "]" + return s + return '' + + def displayInfo (self): + print(" * {:<12} {:>16,}".format("Entries:", self.nEntry)) + print(" * {:<12} {:>16,}".format("Characters:", self.nChar)) + print(" * {:<12} {:>16,}".format("Affixes:", self.nAff)) + print(" * {:<12} {:>16,}".format("Tags:", self.nTag)) + print(" * {:<12} {:>16,}".format("Arc values:", self.nArcVal)) + print(" * {:<12} {:>16,}".format("Nodes:", self.nNode)) + print(" * {:<12} {:>16,}".format("Arcs:", self.nArc)) + print(" * {:<12} {:>16}".format("Stemming:", self.cStemming + "FX")) + + def getArcStats (self): + d = {} + for oNode in self.lMinimizedNodes: + n = len(oNode.arcs) + d[n] = d.get(n, 0) + 1 + s = " * Nodes:\n" + for n in d: + s = s + " {:>9} nodes have {:>3} arcs\n".format(d[n], n) + return s + + def writeInfo (self, sPathFile): + print(" > Write informations") + with open(sPathFile, 'w', encoding='utf-8', newline="\n") as hDst: + hDst.write(self.getArcStats()) 
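# Usage sketch of this class, hedged (file names are hypothetical; the build
# itself happens in __init__, which reads the lexicon, inserts and minimizes):
#   oDAWG = DAWG("French.lex", "French", "S")
#   oDAWG.lookup("manger")                  # True if "manger" was an entry
#   oDAWG.writeInfo("French.info.txt")      # arc stats and arc values
#   oDAWG.createBinary("French.bdic", 1)    # indexable binary, method 1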
+ hDst.write("\n * Values:\n") + for i, s in enumerate(self.lArcVal): + hDst.write(" {:>6}. {}\n".format(i, s)) + hDst.close() + + # BINARY CONVERSION + def createBinary (self, sPathFile, nMethod, bDebug=False): + print(" > Write DAWG as an indexable binary dictionary [method: %d]" % nMethod) + if nMethod == 1: + self.nBytesArc = ( (self.nArcVal.bit_length() + 2) // 8 ) + 1 # We add 2 bits. See DawgNode.convToBytes1() + self._calcNumBytesNodeAddress() + self._calcNodesAddress1() + elif nMethod == 2: + self.nBytesArc = ( (self.nArcVal.bit_length() + 3) // 8 ) + 1 # We add 3 bits. See DawgNode.convToBytes2() + self._calcNumBytesNodeAddress() + self._calcNodesAddress2() + elif nMethod == 3: + self.nBytesArc = ( (self.nArcVal.bit_length() + 3) // 8 ) + 1 # We add 3 bits. See DawgNode.convToBytes3() + self.nBytesOffset = 1 + self.nMaxOffset = (2 ** (self.nBytesOffset * 8)) - 1 + self._calcNumBytesNodeAddress() + self._calcNodesAddress3() + else: + print(" # Error: unknown compression method") + print(" Arc values (chars, affixes and tags): {} -> {} bytes".format( self.nArcVal, len("\t".join(self.lArcVal).encode("utf-8")) )) + print(" Arc size: {} bytes, Address size: {} bytes -> {} * {} = {} bytes".format( self.nBytesArc, self.nBytesNodeAddress, \ + self.nBytesArc+self.nBytesNodeAddress, self.nArc, \ + (self.nBytesArc+self.nBytesNodeAddress)*self.nArc )) + self._writeBinary(sPathFile, nMethod) + if bDebug: + self._writeNodes(sPathFile, nMethod) + + def _calcNumBytesNodeAddress (self): + "how many bytes needed to store all nodes/arcs in the binary dictionary" + self.nBytesNodeAddress = 1 + while ((self.nBytesArc + self.nBytesNodeAddress) * self.nArc) > (2 ** (self.nBytesNodeAddress * 8)): + self.nBytesNodeAddress += 1 + + def _calcNodesAddress1 (self): + nBytesNode = self.nBytesArc + self.nBytesNodeAddress + iAddr = len(self.oRoot.arcs) * nBytesNode + for oNode in self.lMinimizedNodes: + oNode.addr = iAddr + iAddr += max(len(oNode.arcs), 1) * nBytesNode + + def _calcNodesAddress2 (self): + nBytesNode = self.nBytesArc + self.nBytesNodeAddress + iAddr = len(self.oRoot.arcs) * nBytesNode + for oNode in self.lSortedNodes: + oNode.addr = iAddr + iAddr += max(len(oNode.arcs), 1) * nBytesNode + for oNextNode in oNode.arcs.values(): + if (oNode.pos + 1) == oNextNode.pos: + iAddr -= self.nBytesNodeAddress + #break + + def _calcNodesAddress3 (self): + nBytesNode = self.nBytesArc + self.nBytesNodeAddress + # theorical nodes size if only addresses and no offset + self.oRoot.size = len(self.oRoot.arcs) * nBytesNode + for oNode in self.lSortedNodes: + oNode.size = max(len(oNode.arcs), 1) * nBytesNode + # rewind and calculate dropdown from the end, several times + nDiff = self.nBytesNodeAddress - self.nBytesOffset + bEnd = False + while not bEnd: + bEnd = True + # recalculate addresses + iAddr = self.oRoot.size + for oNode in self.lSortedNodes: + oNode.addr = iAddr + iAddr += oNode.size + # rewind and calculate dropdown from the end, several times + for i in range(self.nNode-1, -1, -1): + nSize = max(len(self.lSortedNodes[i].arcs), 1) * nBytesNode + for oNextNode in self.lSortedNodes[i].arcs.values(): + if 1 < (oNextNode.addr - self.lSortedNodes[i].addr) < self.nMaxOffset: + nSize -= nDiff + if self.lSortedNodes[i].size != nSize: + self.lSortedNodes[i].size = nSize + bEnd = False + + def _writeBinary (self, sPathFile, nMethod): + """ + Format of the binary indexable dictionary: + Each section is separated with 4 bytes of \0 + + - Section Header: + /pyfsa/[version] + * version is an ASCII string + + - Section 
Informations: + /[tag_lang] + /[number of chars] + /[number of bytes for each arc] + /[number of bytes for each address node] + /[number of entries] + /[number of nodes] + /[number of arcs] + /[number of affixes] + * each field is a ASCII string + /[stemming code] + * "S" means stems are generated by /suffix_code/, "A" means they are generated by /affix_code/ + See defineSuffixCode() and defineAffixCode() for details. + "N" means no stemming + + - Section Values: + * a list of strings encoded in binary from utf-8, each value separated with a tabulation + + - Section Word Graph (nodes / arcs) + * A list of nodes which are a list of arcs with an address of the next node. + See DawgNode.convToBytes() for details. + """ + if not sPathFile.endswith(".bdic"): + sPathFile += "."+str(nMethod)+".bdic" + with open(sPathFile, 'wb') as hDst: + # header + hDst.write("/pyfsa/{}/".format(nMethod).encode("utf-8")) + hDst.write(b"\0\0\0\0") + # infos + hDst.write("{}/{}/{}/{}/{}/{}/{}/{}/{}".format(self.sLang, self.nChar, self.nBytesArc, self.nBytesNodeAddress, \ + self.nEntry, self.nNode, self.nArc, self.nAff, self.cStemming).encode("utf-8")) + hDst.write(b"\0\0\0\0") + # lArcVal + hDst.write("\t".join(self.lArcVal).encode("utf-8")) + hDst.write(b"\0\0\0\0") + # DAWG: nodes / arcs + if nMethod == 1: + hDst.write(self.oRoot.convToBytes1(self.nBytesArc, self.nBytesNodeAddress)) + for oNode in self.lMinimizedNodes: + hDst.write(oNode.convToBytes1(self.nBytesArc, self.nBytesNodeAddress)) + elif nMethod == 2: + hDst.write(self.oRoot.convToBytes2(self.nBytesArc, self.nBytesNodeAddress)) + for oNode in self.lSortedNodes: + hDst.write(oNode.convToBytes2(self.nBytesArc, self.nBytesNodeAddress)) + elif nMethod == 3: + hDst.write(self.oRoot.convToBytes3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset)) + for oNode in self.lSortedNodes: + hDst.write(oNode.convToBytes3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset)) + hDst.close() + + def _writeNodes (self, sPathFile, nMethod): + "for debugging only" + print(" > Write nodes") + with open(sPathFile+".nodes."+str(nMethod)+".txt", 'w', encoding='utf-8', newline="\n") as hDst: + if nMethod == 1: + hDst.write(self.oRoot.getTxtRepr1(self.nBytesArc, self.nBytesNodeAddress, self.lArcVal)+"\n") + #hDst.write( ''.join( [ "%02X " % z for z in self.oRoot.convToBytes1(self.nBytesArc, self.nBytesNodeAddress) ] ).strip() ) + for oNode in self.lMinimizedNodes: + hDst.write(oNode.getTxtRepr1(self.nBytesArc, self.nBytesNodeAddress, self.lArcVal)+"\n") + if nMethod == 2: + hDst.write(self.oRoot.getTxtRepr2(self.nBytesArc, self.nBytesNodeAddress, self.lArcVal)+"\n") + for oNode in self.lSortedNodes: + hDst.write(oNode.getTxtRepr2(self.nBytesArc, self.nBytesNodeAddress, self.lArcVal)+"\n") + if nMethod == 3: + hDst.write(self.oRoot.getTxtRepr3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset, self.lArcVal)+"\n") + #hDst.write( ''.join( [ "%02X " % z for z in self.oRoot.convToBytes3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset) ] ).strip() ) + for oNode in self.lSortedNodes: + hDst.write(oNode.getTxtRepr3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset, self.lArcVal)+"\n") + hDst.close() + + def writeResults (self, sPathFile): + bFileExits = os.path.isfile("_lexicons.res.txt") + with open("_lexicons.res.txt", "a", encoding='utf-8', newline="\n") as hDst: + sFormat1 = "{:<12} {:>12} {:>5} {:>8} {:>8} {:>6} {:>8} {:>9} {:>9} {:>15} {:>12} {:>12}\n" + sFormat2 = "{:<12} {:>12,} {:>5,} {:>8,} {:>8} {:>6,} {:>8,} {:>9,} {:>9,} {:>15,} 
{:>12,} {:>12,}\n" + if not bFileExits: + hDst.write(sFormat1.format("Lexicon", "Entries", "Chars", "Affixes", "Stemming", "Tags", "Values", "Nodes", "Arcs", "Lexicon (Kb)", "Dict (Kb)", "LT Dict (Kb)")) + hDst.write(sFormat2.format(self.sLang, self.nEntry, self.nChar, self.nAff, self.cStemming + "FX", self.nTag, self.nArcVal, \ + self.nNode, self.nArc, os.path.getsize(self.sFile), os.path.getsize(sPathFile), \ + os.path.getsize("cfsa/dict/{}.dict".format(self.sLang)) if os.path.isfile("cfsa/dict/{}.dict".format(self.sLang)) else 0)) + hDst.close() + + + +class DawgNode: + NextId = 0 + NextPos = 1 # (version 2) + + def __init__ (self): + self.i = DawgNode.NextId + DawgNode.NextId += 1 + self.final = False + self.arcs = {} # key: arc value; value: a node + self.addr = 0 # address in the binary dictionary + self.pos = 0 # position in the binary dictionary (version 2) + self.size = 0 # size of node in bytes (version 3) + + @classmethod + def resetNextId (cls): + cls.NextId = 0 + + def setPos (self): # version 2 + self.pos = DawgNode.NextPos + DawgNode.NextPos += 1 + + def __str__ (self): + # Caution! this function is used for hashing and comparison! + l = [] + if self.final: + l.append("1") + else: + l.append("0") + for (key, node) in self.arcs.items(): + l.append(str(key)) + l.append(str(node.i)) + return "_".join(l) + + def __hash__ (self): + # Used as a key in a python dictionary. + return self.__str__().__hash__() + + def __eq__ (self, other): + # Used as a key in a python dictionary. + # Nodes are equivalent if they have identical arcs, and each identical arc leads to identical states. + return self.__str__() == other.__str__() + + def sortArcs (self, dValOccur): + self.arcs = collections.OrderedDict(sorted(self.arcs.items(), key=lambda t: dValOccur.get(t[0], 0), reverse=True)) + + def sortArcs2 (self, dValOccur, lArcVal): + self.arcs = collections.OrderedDict(sorted(self.arcs.items(), key=lambda t: dValOccur.get(lArcVal[t[0]], 0), reverse=True)) + + # VERSION 1 ===================================================================================================== + def convToBytes1 (self, nBytesArc, nBytesNodeAddress): + """ + Node scheme: + - Arc length is defined by nBytesArc + - Address length is defined by nBytesNodeAddress + + | Arc | Address of next node | + | | | + /---------------\ /---------------\ /---------------\ /---------------\ /---------------\ /---------------\ + | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + \---------------/ \---------------/ \---------------/ \---------------/ \---------------/ \---------------/ + [...] 
+ /---------------\ /---------------\ /---------------\ /---------------\ /---------------\ /---------------\ + | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + \---------------/ \---------------/ \---------------/ \---------------/ \---------------/ \---------------/ + ^ ^ + | | + | | + | \___ if 1, last arc of this node + \_____ if 1, this node is final (only on the first arc) + """ + nArc = len(self.arcs) + nFinalNodeMask = 1 << ((nBytesArc*8)-1) + nFinalArcMask = 1 << ((nBytesArc*8)-2) + if len(self.arcs) == 0: + val = nFinalNodeMask | nFinalArcMask + by = val.to_bytes(nBytesArc, byteorder='big') + by += (0).to_bytes(nBytesNodeAddress, byteorder='big') + return by + by = b"" + for i, arc in enumerate(self.arcs, 1): + val = arc + if i == 1 and self.final: + val = val | nFinalNodeMask + if i == nArc: + val = val | nFinalArcMask + by += val.to_bytes(nBytesArc, byteorder='big') + by += self.arcs[arc].addr.to_bytes(nBytesNodeAddress, byteorder='big') + return by + + def getTxtRepr1 (self, nBytesArc, nBytesNodeAddress, lVal): + nArc = len(self.arcs) + nFinalNodeMask = 1 << ((nBytesArc*8)-1) + nFinalArcMask = 1 << ((nBytesArc*8)-2) + s = "i{:_>10} -- #{:_>10}\n".format(self.i, self.addr) + if len(self.arcs) == 0: + s += " {:<20} {:0>16} i{:_>10} #{:_>10}\n".format("", bin(nFinalNodeMask | nFinalArcMask)[2:], "0", "0") + return s + for i, arc in enumerate(self.arcs, 1): + val = arc + if i == 1 and self.final: + val = val | nFinalNodeMask + if i == nArc: + val = val | nFinalArcMask + s += " {:<20} {:0>16} i{:_>10} #{:_>10}\n".format(lVal[arc], bin(val)[2:], self.arcs[arc].i, self.arcs[arc].addr) + return s + + # VERSION 2 ===================================================================================================== + def convToBytes2 (self, nBytesArc, nBytesNodeAddress): + """ + Node scheme: + - Arc length is defined by nBytesArc + - Address length is defined by nBytesNodeAddress + + | Arc | Address of next node | + | | | + /---------------\ /---------------\ /---------------\ /---------------\ /---------------\ /---------------\ + | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + \---------------/ \---------------/ \---------------/ \---------------/ \---------------/ \---------------/ + [...] 
+ /---------------\ /---------------\ /---------------\ /---------------\ /---------------\ /---------------\ + | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + \---------------/ \---------------/ \---------------/ \---------------/ \---------------/ \---------------/ + ^ ^ ^ + | | | + | | \_ if 1, caution, no address: next node is the following node + | \___ if 1, last arc of this node + \_____ if 1, this node is final (only on the first arc) + """ + nArc = len(self.arcs) + nFinalNodeMask = 1 << ((nBytesArc*8)-1) + nFinalArcMask = 1 << ((nBytesArc*8)-2) + nNextNodeMask = 1 << ((nBytesArc*8)-3) + if len(self.arcs) == 0: + val = nFinalNodeMask | nFinalArcMask + by = val.to_bytes(nBytesArc, byteorder='big') + by += (0).to_bytes(nBytesNodeAddress, byteorder='big') + return by + by = b"" + for i, arc in enumerate(self.arcs, 1): + val = arc + if i == 1 and self.final: + val = val | nFinalNodeMask + if i == nArc: + val = val | nFinalArcMask + if (self.pos + 1) == self.arcs[arc].pos and self.i != 0: + val = val | nNextNodeMask + by += val.to_bytes(nBytesArc, byteorder='big') + else: + by += val.to_bytes(nBytesArc, byteorder='big') + by += self.arcs[arc].addr.to_bytes(nBytesNodeAddress, byteorder='big') + return by + + def getTxtRepr2 (self, nBytesArc, nBytesNodeAddress, lVal): + nArc = len(self.arcs) + nFinalNodeMask = 1 << ((nBytesArc*8)-1) + nFinalArcMask = 1 << ((nBytesArc*8)-2) + nNextNodeMask = 1 << ((nBytesArc*8)-3) + s = "i{:_>10} -- #{:_>10}\n".format(self.i, self.addr) + if nArc == 0: + s += " {:<20} {:0>16} i{:_>10} #{:_>10}\n".format("", bin(nFinalNodeMask | nFinalArcMask)[2:], "0", "0") + return s + for i, arc in enumerate(self.arcs, 1): + val = arc + if i == 1 and self.final: + val = val | nFinalNodeMask + if i == nArc: + val = val | nFinalArcMask + if (self.pos + 1) == self.arcs[arc].pos and self.i != 0: + val = val | nNextNodeMask + s += " {:<20} {:0>16}\n".format(lVal[arc], bin(val)[2:], "") + else: + s += " {:<20} {:0>16} i{:_>10} #{:_>10}\n".format(lVal[arc], bin(val)[2:], self.arcs[arc].i, self.arcs[arc].addr) + return s + + # VERSION 3 ===================================================================================================== + def convToBytes3 (self, nBytesArc, nBytesNodeAddress, nBytesOffset): + """ + Node scheme: + - Arc length is defined by nBytesArc + - Address length is defined by nBytesNodeAddress + - Offset length is defined by nBytesOffset + + | Arc | Address of next node or offset to next node | + | | | + /---------------\ /---------------\ /---------------\ /---------------\ /---------------\ /---------------\ + |1|0|0| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + \---------------/ \---------------/ \---------------/ \---------------/ \---------------/ \---------------/ + [...] 
+ /---------------\ /---------------\ /---------------\ + |0|0|1| | | | | | | | | | | | | | | | | | | | | | | | Offsets are shorter than addresses + \---------------/ \---------------/ \---------------/ + /---------------\ /---------------\ /---------------\ /---------------\ /---------------\ /---------------\ + |0|1|0| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + \---------------/ \---------------/ \---------------/ \---------------/ \---------------/ \---------------/ + + ^ ^ ^ + | | | + | | \_ if 1, offset instead of address of next node + | \___ if 1, last arc of this node + \_____ if 1, this node is final (only on the first arc) + """ + nArc = len(self.arcs) + nFinalNodeMask = 1 << ((nBytesArc*8)-1) + nFinalArcMask = 1 << ((nBytesArc*8)-2) + nNextNodeMask = 1 << ((nBytesArc*8)-3) + nMaxOffset = (2 ** (nBytesOffset * 8)) - 1 + if nArc == 0: + val = nFinalNodeMask | nFinalArcMask + by = val.to_bytes(nBytesArc, byteorder='big') + by += (0).to_bytes(nBytesNodeAddress, byteorder='big') + return by + by = b"" + for i, arc in enumerate(self.arcs, 1): + val = arc + if i == 1 and self.final: + val = val | nFinalNodeMask + if i == nArc: + val = val | nFinalArcMask + if 1 < (self.arcs[arc].addr - self.addr) < nMaxOffset and self.i != 0: + val = val | nNextNodeMask + by += val.to_bytes(nBytesArc, byteorder='big') + by += (self.arcs[arc].addr-self.addr).to_bytes(nBytesOffset, byteorder='big') + else: + by += val.to_bytes(nBytesArc, byteorder='big') + by += self.arcs[arc].addr.to_bytes(nBytesNodeAddress, byteorder='big') + return by + + def getTxtRepr3 (self, nBytesArc, nBytesNodeAddress, nBytesOffset, lVal): + nArc = len(self.arcs) + nFinalNodeMask = 1 << ((nBytesArc*8)-1) + nFinalArcMask = 1 << ((nBytesArc*8)-2) + nNextNodeMask = 1 << ((nBytesArc*8)-3) + nMaxOffset = (2 ** (nBytesOffset * 8)) - 1 + s = "i{:_>10} -- #{:_>10} ({})\n".format(self.i, self.addr, self.size) + if nArc == 0: + s += " {:<20} {:0>16} i{:_>10} #{:_>10}\n".format("", bin(nFinalNodeMask | nFinalArcMask)[2:], "0", "0") + return s + for i, arc in enumerate(self.arcs, 1): + val = arc + if i == 1 and self.final: + val = val | nFinalNodeMask + if i == nArc: + val = val | nFinalArcMask + if 1 < (self.arcs[arc].addr - self.addr) < nMaxOffset and self.i != 0: + val = val | nNextNodeMask + s += " {:<20} {:0>16} i{:_>10} +{:_>10}\n".format(lVal[arc], bin(val)[2:], self.arcs[arc].i, self.arcs[arc].addr - self.addr) + else: + s += " {:<20} {:0>16} i{:_>10} #{:_>10}\n".format(lVal[arc], bin(val)[2:], self.arcs[arc].i, self.arcs[arc].addr) + return s + + + +# Another attempt to sort node arcs + +_dCharOrder = { + # key: previous char, value: dictionary of chars {c: nValue} + "": {} +} + + +def addWordToCharDict (sWord): + cPrevious = "" + for cChar in sWord: + if cPrevious not in _dCharOrder: + _dCharOrder[cPrevious] = {} + _dCharOrder[cPrevious][cChar] = _dCharOrder[cPrevious].get(cChar, 0) + 1 + cPrevious = cChar + + +def getCharOrderAfterChar (cChar): + return _dCharOrder.get(cChar, None) + + +def displayCharOrder (): + for key, value in _dCharOrder.items(): + print("[" + key + "]: ", ", ".join([ c+":"+str(n) for c, n in sorted(value.items(), key=lambda t: t[1], reverse=True) ])) ADDED graphspell/echo.py Index: graphspell/echo.py ================================================================== --- /dev/null +++ graphspell/echo.py @@ -0,0 +1,29 @@ +#!python3 + +# The most boring yet indispensable function: print! 
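+# Illustrative example (not from the original API docs): on a Windows console,
+# echo("cœur, “déjà”") prints 'cöur, "déjà"'; the chars in _CHARMAP below are
+# transliterated so that legacy codepages don't make print() crash.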
+
+
+import sys
+
+
+_CHARMAP = str.maketrans({ 'œ': 'ö',  'Œ': 'Ö',  'ʳ': "r",  'ᵉ': "e",  '…': "_", \
+                           '“': '"',  '”': '"',  '„': '"',  '‘': "'",  '’': "'", \
+                           'ā': 'â',  'Ā': 'Â',  'ē': 'ê',  'Ē': 'Ê',  'ī': 'î',  'Ī': 'Î', \
+                           'ō': 'ô',  'Ō': 'Ô',  'ū': 'û',  'Ū': 'Û',  'Ÿ': 'Y', \
+                           'ś': 's',  'ŝ': 's', \
+                           '—': '-',  '–': '-'
+                         })
+
+
+def echo (obj, sep=' ', end='\n', file=sys.stdout, flush=False):
+    """ Print for Windows to avoid Python crashes.
+        Encoding depends on the Windows locale. No useful standard.
+        Always returns True (useful for debugging)."""
+    if sys.platform != "win32":
+        print(obj, sep=sep, end=end, file=file, flush=flush)
+        return True
+    try:
+        print(str(obj).translate(_CHARMAP), sep=sep, end=end, file=file, flush=flush)
+    except Exception:
+        print(str(obj).encode('ascii', 'replace').decode('ascii', 'replace'), sep=sep, end=end, file=file, flush=flush)
+    return True

ADDED   graphspell/ibdawg.py
Index: graphspell/ibdawg.py
==================================================================
--- /dev/null
+++ graphspell/ibdawg.py
@@ -0,0 +1,720 @@
+#!python3
+
+import os
+import traceback
+import pkgutil
+import re
+import codecs
+from functools import wraps
+import time
+
+#import logging
+#logging.basicConfig(filename="suggestions.log", level=logging.DEBUG)
+
+from . import str_transform as st
+from . import char_player as cp
+from .echo import echo
+
+
+def timethis (func):
+    "decorator to measure the execution time of a function"
+    @wraps(func)
+    def wrapper (*args, **kwargs):
+        fStart = time.time()
+        result = func(*args, **kwargs)
+        fEnd = time.time()
+        print(func.__name__, fEnd - fStart)
+        return result
+    return wrapper
+
+
+class SuggResult:
+    """Structure for storing, classifying and filtering suggestions"""
+
+    def __init__ (self, sWord, nDistLimit=-1):
+        self.sWord = sWord
+        self.sSimplifiedWord = cp.simplifyWord(sWord)
+        self.nDistLimit = nDistLimit  if nDistLimit >= 0  else (len(sWord) // 3) + 1
+        self.nMinDist = 1000
+        self.aSugg = set()
+        self.dSugg = { 0: [], 1: [], 2: [] }
+
+    def addSugg (self, sSugg, nDeep=0):
+        "add a suggestion"
+        #logging.info((nDeep * "  ") + "__" + sSugg + "__")
+        if sSugg not in self.aSugg:
+            nDist = st.distanceDamerauLevenshtein(self.sSimplifiedWord, cp.simplifyWord(sSugg))
+            if nDist <= self.nDistLimit:
+                if nDist not in self.dSugg:
+                    self.dSugg[nDist] = []
+                self.dSugg[nDist].append(sSugg)
+                self.aSugg.add(sSugg)
+                if nDist < self.nMinDist:
+                    self.nMinDist = nDist
+                self.nDistLimit = min(self.nDistLimit, self.nMinDist+2)
+
+    def getSuggestions (self, nSuggLimit=10, nDistLimit=-1):
+        "return a list of suggestions"
+        lRes = []
+        if self.dSugg[0]:
+            # sort the zero-distance results by proximity to the original word
+            self.dSugg[0].sort(key=lambda sSugg: st.distanceDamerauLevenshtein(self.sWord, sSugg))
+        for lSugg in self.dSugg.values():
+            lRes.extend(lSugg)
+            if len(lRes) > nSuggLimit:
+                break
+        lRes = list(cp.filterSugg(lRes))
+        if self.sWord.istitle():
+            lRes = list(map(lambda sSugg: sSugg.title(), lRes))
+        elif self.sWord.isupper():
+            lRes = list(map(lambda sSugg: sSugg.upper(), lRes))
+        return lRes[:nSuggLimit]
+
+    def reset (self):
+        self.aSugg.clear()
+        self.dSugg.clear()
+
+
+class IBDAWG:
+    """INDEXABLE BINARY DIRECTED ACYCLIC WORD GRAPH"""
+
+    def __init__ (self, sDicName):
+        self.by = pkgutil.get_data(__package__, "_dictionaries/" + sDicName)
+        if not self.by:
+            raise OSError("# Error. File not found or not loadable: "+sDicName)
+
+        if self.by[0:7] != b"/pyfsa/":
+            raise TypeError("# Error. Not a pyfsa binary dictionary. Header: {}".format(self.by[0:9]))
Header: {}".format(self.by[0:9])) + if not(self.by[7:8] == b"1" or self.by[7:8] == b"2" or self.by[7:8] == b"3"): + raise ValueError("# Error. Unknown dictionary version: {}".format(self.by[7:8])) + try: + header, info, values, bdic = self.by.split(b"\0\0\0\0", 3) + except Exception: + raise Exception + + self.sName = sDicName + self.nVersion = int(self.by[7:8].decode("utf-8")) + self.sHeader = header.decode("utf-8") + self.lArcVal = values.decode("utf-8").split("\t") + self.nArcVal = len(self.lArcVal) + self.byDic = bdic + + l = info.decode("utf-8").split("/") + self.sLang = l[0] + self.nChar = int(l[1]) + self.nBytesArc = int(l[2]) + self.nBytesNodeAddress = int(l[3]) + self.nEntries = int(l[4]) + self.nNode = int(l[5]) + self.nArc = int(l[6]) + self.nAff = int(l[7]) + self.cStemming = l[8] + if self.cStemming == "S": + self.funcStemming = st.changeWordWithSuffixCode + elif self.cStemming == "A": + self.funcStemming = st.changeWordWithAffixCode + else: + self.funcStemming = st.noStemming + self.nTag = self.nArcVal - self.nChar - self.nAff + # to get the value of an arc, to get the char of an arc with its value + self.dChar = {} + for i in range(1, self.nChar): + self.dChar[self.lArcVal[i]] = i + self.dCharVal = { v: k for k, v in self.dChar.items() } + + self._arcMask = (2 ** ((self.nBytesArc * 8) - 3)) - 1 + self._finalNodeMask = 1 << ((self.nBytesArc * 8) - 1) + self._lastArcMask = 1 << ((self.nBytesArc * 8) - 2) + self._addrBitMask = 1 << ((self.nBytesArc * 8) - 3) # version 2 + + self.nBytesOffset = 1 # version 3 + + # Configuring DAWG functions according to nVersion + if self.nVersion == 1: + self.morph = self._morph1 + self.stem = self._stem1 + self._lookupArcNode = self._lookupArcNode1 + self._getArcs = self._getArcs1 + self._writeNodes = self._writeNodes1 + elif self.nVersion == 2: + self.morph = self._morph2 + self.stem = self._stem2 + self._lookupArcNode = self._lookupArcNode2 + self._getArcs = self._getArcs2 + self._writeNodes = self._writeNodes2 + elif self.nVersion == 3: + self.morph = self._morph3 + self.stem = self._stem3 + self._lookupArcNode = self._lookupArcNode3 + self._getArcs = self._getArcs3 + self._writeNodes = self._writeNodes3 + else: + raise ValueError(" # Error: unknown code: {}".format(self.nVersion)) + + self.bOptNumSigle = False + self.bOptNumAtLast = False + + def getInfo (self): + return " Language: {0.sLang:>10} Version: {0.nVersion:>2} Stemming: {0.cStemming}FX\n" \ + " Arcs values: {0.nArcVal:>10,} = {0.nChar:>5,} characters, {0.nAff:>6,} affixes, {0.nTag:>6,} tags\n" \ + " Dictionary: {0.nEntries:>12,} entries, {0.nNode:>11,} nodes, {0.nArc:>11,} arcs\n" \ + " Address size: {0.nBytesNodeAddress:>1} bytes, Arc size: {0.nBytesArc:>1} bytes\n".format(self) + + def writeAsJSObject (self, spfDest, bInJSModule=False, bBinaryDictAsHexString=False): + "write IBDAWG as a JavaScript object in a JavaScript module" + import json + with open(spfDest, "w", encoding="utf-8", newline="\n") as hDst: + if bInJSModule: + hDst.write('// JavaScript\n// Generated data (do not edit)\n\n"use strict";\n\nconst dictionary = ') + hDst.write(json.dumps({ + "sName": self.sName, + "nVersion": self.nVersion, + "sHeader": self.sHeader, + "lArcVal": self.lArcVal, + "nArcVal": self.nArcVal, + # JavaScript is a pile of shit, so Mozilla’s JS parser don’t like file bigger than 4 Mb! + # So, if necessary, we use an hexadecimal string, that we will convert later in Firefox’s extension. 
+                # https://github.com/mozilla/addons-linter/issues/1361
+                "byDic": self.byDic.hex()  if bBinaryDictAsHexString  else [ e  for e in self.byDic ],
+                "sLang": self.sLang,
+                "nChar": self.nChar,
+                "nBytesArc": self.nBytesArc,
+                "nBytesNodeAddress": self.nBytesNodeAddress,
+                "nEntries": self.nEntries,
+                "nNode": self.nNode,
+                "nArc": self.nArc,
+                "nAff": self.nAff,
+                "cStemming": self.cStemming,
+                "nTag": self.nTag,
+                "dChar": self.dChar,
+                "_arcMask": self._arcMask,
+                "_finalNodeMask": self._finalNodeMask,
+                "_lastArcMask": self._lastArcMask,
+                "_addrBitMask": self._addrBitMask,
+                "nBytesOffset": self.nBytesOffset
+            }, ensure_ascii=False))
+            if bInJSModule:
+                hDst.write(";\n\nexports.dictionary = dictionary;\n")
+
+    def isValidToken (self, sToken):
+        "checks if <sToken> is valid (if there are hyphens in <sToken>, it is split and each part is checked)"
+        if self.isValid(sToken):
+            return True
+        if "-" in sToken:
+            if sToken.count("-") > 4:
+                return True
+            return all(self.isValid(sWord)  for sWord in sToken.split("-"))
+        return False
+
+    def isValid (self, sWord):
+        "checks if <sWord> is valid (different casing tested if the first letter is a capital)"
+        if not sWord:
+            return None
+        if "’" in sWord: # ugly hack
+            sWord = sWord.replace("’", "'")
+        if self.lookup(sWord):
+            return True
+        if sWord[0:1].isupper():
+            if len(sWord) > 1:
+                if sWord.istitle():
+                    return self.lookup(sWord.lower())
+                if sWord.isupper():
+                    if self.bOptNumSigle:
+                        return True
+                    return self.lookup(sWord.lower()) or self.lookup(sWord.capitalize())
+                return self.lookup(sWord[:1].lower() + sWord[1:])
+            else:
+                return self.lookup(sWord.lower())
+        return False
+
+    def lookup (self, sWord):
+        "returns True if <sWord> is in the dictionary (strict verification)"
+        iAddr = 0
+        for c in sWord:
+            if c not in self.dChar:
+                return False
+            iAddr = self._lookupArcNode(self.dChar[c], iAddr)
+            if iAddr is None:
+                return False
+        return bool(int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask)
+
+    def getMorph (self, sWord):
+        "retrieves the list of morphologies, different casing allowed"
+        l = self.morph(sWord)
+        if sWord[0:1].isupper():
+            l.extend(self.morph(sWord.lower()))
+            if sWord.isupper() and len(sWord) > 1:
+                l.extend(self.morph(sWord.capitalize()))
+        return l
+
+    #@timethis
+    def suggest (self, sWord, nSuggLimit=10):
+        "returns a set of suggestions for <sWord>"
+        sPfx, sWord, sSfx = cp.cut(sWord)
+        nMaxSwitch = max(len(sWord) // 3, 1)
+        nMaxDel = len(sWord) // 5
+        nMaxHardRepl = max((len(sWord) - 5) // 4, 1)
+        oSuggResult = SuggResult(sWord)
+        self._suggest(oSuggResult, sWord, nMaxSwitch=nMaxSwitch, nMaxDel=nMaxDel, nMaxHardRepl=nMaxHardRepl)
+        if sWord.istitle():
+            self._suggest(oSuggResult, sWord.lower(), nMaxSwitch=nMaxSwitch, nMaxDel=nMaxDel, nMaxHardRepl=nMaxHardRepl)
+        elif sWord.islower():
+            self._suggest(oSuggResult, sWord.title(), nMaxSwitch=nMaxSwitch, nMaxDel=nMaxDel, nMaxHardRepl=nMaxHardRepl)
+        aSugg = oSuggResult.getSuggestions(nSuggLimit)
+        if sSfx or sPfx:
+            # we add back what we removed
+            return list(map(lambda sSug: sPfx + sSug + sSfx, aSugg))
+        return aSugg
+
+    def _suggest (self, oSuggResult, sRemain, nMaxSwitch=0, nMaxDel=0, nMaxHardRepl=0, nDeep=0, iAddr=0, sNewWord="", bAvoidLoop=False):
+        # recursive function
+        #logging.info((nDeep * "  ") + sNewWord + ":" + sRemain)
+        if not sRemain:
+            if int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask:
+                oSuggResult.addSugg(sNewWord, nDeep)
+            for sTail in self._getTails(iAddr):
+                oSuggResult.addSugg(sNewWord+sTail, nDeep)
+            return
+        cCurrent = sRemain[0:1]
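+        # try every arc of the current node: a char identical or similar to <cCurrent>
+        # consumes one char of <sRemain>; any other char is tried as a "hard"
+        # replacement while <nMaxHardRepl> allows it
+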
for cChar, jAddr in self._getCharArcs(iAddr): + if cChar in cp.d1to1.get(cCurrent, cCurrent): + self._suggest(oSuggResult, sRemain[1:], nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, jAddr, sNewWord+cChar) + elif not bAvoidLoop and nMaxHardRepl: + self._suggest(oSuggResult, sRemain[1:], nMaxSwitch, nMaxDel, nMaxHardRepl-1, nDeep+1, jAddr, sNewWord+cChar, True) + if not bAvoidLoop: # avoid infinite loop + if len(sRemain) > 1: + if cCurrent == sRemain[1:2]: + # same char, we remove 1 char without adding 1 to + self._suggest(oSuggResult, sRemain[1:], nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord) + else: + # switching chars + if nMaxSwitch: + self._suggest(oSuggResult, sRemain[1:2]+sRemain[0:1]+sRemain[2:], nMaxSwitch-1, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, True) + # delete char + if nMaxDel: + self._suggest(oSuggResult, sRemain[1:], nMaxSwitch, nMaxDel-1, nMaxHardRepl, nDeep+1, iAddr, sNewWord, True) + # Phonetic replacements + for sRepl in cp.get1toXReplacement(sNewWord[-1:], cCurrent, sRemain[1:2]): + self._suggest(oSuggResult, sRepl + sRemain[1:], nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, True) + for sRepl in cp.d2toX.get(sRemain[0:2], ()): + self._suggest(oSuggResult, sRepl + sRemain[2:], nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, True) + # end of word + if len(sRemain) == 2: + for sRepl in cp.dFinal2.get(sRemain, ()): + self._suggest(oSuggResult, sRepl, nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, True) + elif len(sRemain) == 1: + self._suggest(oSuggResult, "", nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, True) # remove last char and go on + for sRepl in cp.dFinal1.get(sRemain, ()): + self._suggest(oSuggResult, sRepl, nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, True) + + #@timethis + def suggest2 (self, sWord, nMaxSugg=10): + "returns a set of suggestions for " + sPfx, sWord, sSfx = cp.cut(sWord) + oSuggResult = SuggResult(sWord) + self._suggest2(oSuggResult) + aSugg = oSuggResult.getSuggestions() + if sSfx or sPfx: + # we add what we removed + return list(map(lambda sSug: sPfx + sSug + sSfx, aSugg)) + return aSugg + + def _suggest2 (self, oSuggResult, nDeep=0, iAddr=0, sNewWord=""): + # recursive function + #logging.info((nDeep * " ") + sNewWord) + if nDeep >= oSuggResult.nDistLimit: + sCleanNewWord = cp.simplifyWord(sNewWord) + if st.distanceSift4(oSuggResult.sCleanWord[:len(sCleanNewWord)], sCleanNewWord) > oSuggResult.nDistLimit: + return + if int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask: + oSuggResult.addSugg(sNewWord, nDeep) + for cChar, jAddr in self._getCharArcsWithPriority(iAddr, oSuggResult.sWord[nDeep:nDeep+1]): + self._suggest2(oSuggResult, nDeep+1, jAddr, sNewWord+cChar) + return + + def _getCharArcs (self, iAddr): + "generator: yield all chars and addresses from node at address " + for nVal, jAddr in self._getArcs(iAddr): + if nVal < self.nChar: + yield (self.dCharVal[nVal], jAddr) + + def _getSimilarCharArcs (self, cChar, iAddr): + "generator: yield similar char of and address of the following node" + for c in cp.d1to1.get(cChar, [cChar]): + if c in self.dChar: + jAddr = self._lookupArcNode(self.dChar[c], iAddr) + if jAddr: + yield (c, jAddr) + + def _getCharArcsWithPriority (self, iAddr, cChar): + if not cChar: + yield from self._getCharArcs(iAddr) + lTuple = list(self._getCharArcs(iAddr)) + lTuple.sort(key=lambda t: 0 if t[0] in cp.d1to1.get(cChar, cChar) else 1) + yield from lTuple + + def _getTails (self, iAddr, 
sTail="", n=2): + "return a list of suffixes ending at a distance of from " + aTails = set() + for nVal, jAddr in self._getArcs(iAddr): + if nVal < self.nChar: + if int.from_bytes(self.byDic[jAddr:jAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask: + aTails.add(sTail + self.dCharVal[nVal]) + if n and not aTails: + aTails.update(self._getTails(jAddr, sTail+self.dCharVal[nVal], n-1)) + return aTails + + def drawPath (self, sWord, iAddr=0): + "show the path taken by in the graph" + c1 = sWord[0:1] if sWord else " " + iPos = -1 + n = 0 + print(c1 + ": ", end="") + for c2, jAddr in self._getCharArcs(iAddr): + print(c2, end="") + if c2 == sWord[0:1]: + iNextNodeAddr = jAddr + iPos = n + n += 1 + if not sWord: + return + if iPos >= 0: + print("\n "+ " " * iPos + "|") + self.drawPath(sWord[1:], iNextNodeAddr) + + def select (self, sPattern=""): + "generator: returns all entries which morphology fits " + zPattern = None + try: + zPattern = re.compile(sPattern) + except: + print("# Error in regex pattern") + traceback.print_exc() + yield from self._select1(zPattern, 0, "") + + # def morph (self, sWord): + # is defined in __init__ + + # VERSION 1 + def _select1 (self, zPattern, iAddr, sWord): + # recursive generator + for nVal, jAddr in self._getArcs1(iAddr): + if nVal < self.nChar: + # simple character + yield from self._select1(zPattern, jAddr, sWord + self.lArcVal[nVal]) + else: + sEntry = sWord + "\t" + self.funcStemming(sWord, self.lArcVal[nVal]) + for nMorphVal, _ in self._getArcs1(jAddr): + if not zPattern or zPattern.search(self.lArcVal[nMorphVal]): + yield sEntry + "\t" + self.lArcVal[nMorphVal] + + def _morph1 (self, sWord): + "returns morphologies of " + iAddr = 0 + for c in sWord: + if c not in self.dChar: + return [] + iAddr = self._lookupArcNode(self.dChar[c], iAddr) + if iAddr == None: + return [] + if (int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask): + l = [] + nRawArc = 0 + while not (nRawArc & self._lastArcMask): + iEndArcAddr = iAddr + self.nBytesArc + nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') + nArc = nRawArc & self._arcMask + if nArc >= self.nChar: + # This value is not a char, this is a stemming code + sStem = ">" + self.funcStemming(sWord, self.lArcVal[nArc]) + # Now , we go to the next node and retrieve all following arcs values, all of them are tags + iAddr2 = int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big') + nRawArc2 = 0 + while not (nRawArc2 & self._lastArcMask): + iEndArcAddr2 = iAddr2 + self.nBytesArc + nRawArc2 = int.from_bytes(self.byDic[iAddr2:iEndArcAddr2], byteorder='big') + l.append(sStem + " " + self.lArcVal[nRawArc2 & self._arcMask]) + iAddr2 = iEndArcAddr2+self.nBytesNodeAddress + iAddr = iEndArcAddr+self.nBytesNodeAddress + return l + return [] + + def _stem1 (self, sWord): + "returns stems list of " + iAddr = 0 + for c in sWord: + if c not in self.dChar: + return [] + iAddr = self._lookupArcNode(self.dChar[c], iAddr) + if iAddr == None: + return [] + if (int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask): + l = [] + nRawArc = 0 + while not (nRawArc & self._lastArcMask): + iEndArcAddr = iAddr + self.nBytesArc + nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') + nArc = nRawArc & self._arcMask + if nArc >= self.nChar: + # This value is not a char, this is a stemming code + l.append(self.funcStemming(sWord, self.lArcVal[nArc])) + iAddr = 
iEndArcAddr+self.nBytesNodeAddress + return l + return [] + + def _lookupArcNode1 (self, nVal, iAddr): + "looks if is an arc at the node at , if yes, returns address of next node else None" + while True: + iEndArcAddr = iAddr+self.nBytesArc + nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') + if nVal == (nRawArc & self._arcMask): + # the value we are looking for + # we return the address of the next node + return int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big') + else: + # value not found + if (nRawArc & self._lastArcMask): + return None + iAddr = iEndArcAddr+self.nBytesNodeAddress + + def _getArcs1 (self, iAddr): + "generator: return all arcs at as tuples of (nVal, iAddr)" + while True: + iEndArcAddr = iAddr+self.nBytesArc + nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') + yield (nRawArc & self._arcMask, int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big')) + if (nRawArc & self._lastArcMask): + break + iAddr = iEndArcAddr+self.nBytesNodeAddress + + def _writeNodes1 (self, spfDest): + "for debugging only" + print(" > Write binary nodes") + with codecs.open(spfDest, 'w', 'utf-8', newline="\n") as hDst: + iAddr = 0 + hDst.write("i{:_>10} -- #{:_>10}\n".format("0", iAddr)) + while iAddr < len(self.byDic): + iEndArcAddr = iAddr+self.nBytesArc + nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') + nArc = nRawArc & self._arcMask + hDst.write(" {:<20} {:0>16} i{:>10} #{:_>10}\n".format(self.lArcVal[nArc], bin(nRawArc)[2:], "?", \ + int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], \ + byteorder='big'))) + iAddr = iEndArcAddr+self.nBytesNodeAddress + if (nRawArc & self._lastArcMask) and iAddr < len(self.byDic): + hDst.write("\ni{:_>10} -- #{:_>10}\n".format("?", iAddr)) + hDst.close() + + # VERSION 2 + def _morph2 (self, sWord): + "returns morphologies of " + iAddr = 0 + for c in sWord: + if c not in self.dChar: + return [] + iAddr = self._lookupArcNode(self.dChar[c], iAddr) + if iAddr == None: + return [] + if (int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask): + l = [] + nRawArc = 0 + while not (nRawArc & self._lastArcMask): + iEndArcAddr = iAddr + self.nBytesArc + nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') + nArc = nRawArc & self._arcMask + if nArc >= self.nChar: + # This value is not a char, this is a stemming code + sStem = ">" + self.funcStemming(sWord, self.lArcVal[nArc]) + # Now , we go to the next node and retrieve all following arcs values, all of them are tags + if not (nRawArc & self._addrBitMask): + iAddr2 = int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big') + else: + # we go to the end of the node + iAddr2 = iEndArcAddr + while not (nRawArc & self._lastArcMask): + nRawArc = int.from_bytes(self.byDic[iAddr2:iAddr2+self.nBytesArc], byteorder='big') + iAddr2 += self.nBytesArc + self.nBytesNodeAddress + nRawArc2 = 0 + while not (nRawArc2 & self._lastArcMask): + iEndArcAddr2 = iAddr2 + self.nBytesArc + nRawArc2 = int.from_bytes(self.byDic[iAddr2:iEndArcAddr2], byteorder='big') + l.append(sStem + " " + self.lArcVal[nRawArc2 & self._arcMask]) + iAddr2 = iEndArcAddr2+self.nBytesNodeAddress if not (nRawArc2 & self._addrBitMask) else iEndArcAddr2 + iAddr = iEndArcAddr+self.nBytesNodeAddress if not (nRawArc & self._addrBitMask) else iEndArcAddr + return l + return [] + + def _stem2 (self, sWord): + 
"returns stems list of " + iAddr = 0 + for c in sWord: + if c not in self.dChar: + return [] + iAddr = self._lookupArcNode(self.dChar[c], iAddr) + if iAddr == None: + return [] + if (int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask): + l = [] + nRawArc = 0 + while not (nRawArc & self._lastArcMask): + iEndArcAddr = iAddr + self.nBytesArc + nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') + nArc = nRawArc & self._arcMask + if nArc >= self.nChar: + # This value is not a char, this is a stemming code + l.append(self.funcStemming(sWord, self.lArcVal[nArc])) + # Now , we go to the next node + if not (nRawArc & self._addrBitMask): + iAddr2 = int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big') + else: + # we go to the end of the node + iAddr2 = iEndArcAddr + while not (nRawArc & self._lastArcMask): + nRawArc = int.from_bytes(self.byDic[iAddr2:iAddr2+self.nBytesArc], byteorder='big') + iAddr2 += self.nBytesArc + self.nBytesNodeAddress + iAddr = iEndArcAddr+self.nBytesNodeAddress if not (nRawArc & self._addrBitMask) else iEndArcAddr + return l + return [] + + def _lookupArcNode2 (self, nVal, iAddr): + "looks if is an arc at the node at , if yes, returns address of next node else None" + while True: + iEndArcAddr = iAddr+self.nBytesArc + nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') + if nVal == (nRawArc & self._arcMask): + # the value we are looking for + if not (nRawArc & self._addrBitMask): + # we return the address of the next node + return int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big') + else: + # we go to the end of the node + iAddr = iEndArcAddr + while not (nRawArc & self._lastArcMask): + nRawArc = int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') + iAddr += self.nBytesArc + self.nBytesNodeAddress if not (nRawArc & self._addrBitMask) else self.nBytesArc + return iAddr + else: + # value not found + if (nRawArc & self._lastArcMask): + return None + iAddr = iEndArcAddr+self.nBytesNodeAddress if not (nRawArc & self._addrBitMask) else iEndArcAddr + + def _writeNodes2 (self, spfDest): + "for debugging only" + print(" > Write binary nodes") + with codecs.open(spfDest, 'w', 'utf-8', newline="\n") as hDst: + iAddr = 0 + hDst.write("i{:_>10} -- #{:_>10}\n".format("0", iAddr)) + while iAddr < len(self.byDic): + iEndArcAddr = iAddr+self.nBytesArc + nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') + nArc = nRawArc & self._arcMask + if not (nRawArc & self._addrBitMask): + iNextNodeAddr = int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big') + hDst.write(" {:<20} {:0>16} i{:>10} #{:_>10}\n".format(self.lArcVal[nArc], bin(nRawArc)[2:], "?", iNextNodeAddr)) + iAddr = iEndArcAddr+self.nBytesNodeAddress + else: + hDst.write(" {:<20} {:0>16}\n".format(self.lArcVal[nArc], bin(nRawArc)[2:])) + iAddr = iEndArcAddr + if (nRawArc & self._lastArcMask): + hDst.write("\ni{:_>10} -- #{:_>10}\n".format("?", iAddr)) + hDst.close() + + # VERSION 3 + def _morph3 (self, sWord): + "returns morphologies of " + iAddr = 0 + for c in sWord: + if c not in self.dChar: + return [] + iAddr = self._lookupArcNode(self.dChar[c], iAddr) + if iAddr == None: + return [] + if (int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask): + l = [] + nRawArc = 0 + iAddrNode = iAddr + while not (nRawArc & self._lastArcMask): + iEndArcAddr = iAddr + 
self.nBytesArc + nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') + nArc = nRawArc & self._arcMask + if nArc >= self.nChar: + # This value is not a char, this is a stemming code + sStem = ">" + self.funcStemming(sWord, self.lArcVal[nArc]) + # Now , we go to the next node and retrieve all following arcs values, all of them are tags + if not (nRawArc & self._addrBitMask): + iAddr2 = int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big') + else: + iAddr2 = iAddrNode + int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesOffset], byteorder='big') + nRawArc2 = 0 + while not (nRawArc2 & self._lastArcMask): + iEndArcAddr2 = iAddr2 + self.nBytesArc + nRawArc2 = int.from_bytes(self.byDic[iAddr2:iEndArcAddr2], byteorder='big') + l.append(sStem + " " + self.lArcVal[nRawArc2 & self._arcMask]) + iAddr2 = iEndArcAddr2+self.nBytesNodeAddress if not (nRawArc2 & self._addrBitMask) else iEndArcAddr2+self.nBytesOffset + iAddr = iEndArcAddr+self.nBytesNodeAddress if not (nRawArc & self._addrBitMask) else iEndArcAddr+self.nBytesOffset + return l + return [] + + def _stem3 (self, sWord): + "returns stems list of " + iAddr = 0 + for c in sWord: + if c not in self.dChar: + return [] + iAddr = self._lookupArcNode(self.dChar[c], iAddr) + if iAddr == None: + return [] + if (int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask): + l = [] + nRawArc = 0 + iAddrNode = iAddr + while not (nRawArc & self._lastArcMask): + iEndArcAddr = iAddr + self.nBytesArc + nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') + nArc = nRawArc & self._arcMask + if nArc >= self.nChar: + # This value is not a char, this is a stemming code + l.append(self.funcStemming(sWord, self.lArcVal[nArc])) + iAddr = iEndArcAddr+self.nBytesNodeAddress if not (nRawArc & self._addrBitMask) else iEndArcAddr+self.nBytesOffset + return l + return [] + + def _lookupArcNode3 (self, nVal, iAddr): + "looks if is an arc at the node at , if yes, returns address of next node else None" + iAddrNode = iAddr + while True: + iEndArcAddr = iAddr+self.nBytesArc + nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') + if nVal == (nRawArc & self._arcMask): + # the value we are looking for + if not (nRawArc & self._addrBitMask): + return int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big') + else: + return iAddrNode + int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesOffset], byteorder='big') + else: + # value not found + if (nRawArc & self._lastArcMask): + return None + iAddr = iEndArcAddr+self.nBytesNodeAddress if not (nRawArc & self._addrBitMask) else iEndArcAddr+self.nBytesOffset + + def _writeNodes3 (self, spfDest): + "for debugging only" + print(" > Write binary nodes") + with codecs.open(spfDest, 'w', 'utf-8', newline="\n") as hDst: + iAddr = 0 + hDst.write("i{:_>10} -- #{:_>10}\n".format("0", iAddr)) + while iAddr < len(self.byDic): + iEndArcAddr = iAddr+self.nBytesArc + nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') + nArc = nRawArc & self._arcMask + if not (nRawArc & self._addrBitMask): + iNextNodeAddr = int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big') + hDst.write(" {:<20} {:0>16} i{:>10} #{:_>10}\n".format(self.lArcVal[nArc], bin(nRawArc)[2:], "?", iNextNodeAddr)) + iAddr = iEndArcAddr+self.nBytesNodeAddress + else: + iNextNodeAddr = 
int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesOffset], byteorder='big') + hDst.write(" {:<20} {:0>16} i{:>10} +{:_>10}\n".format(self.lArcVal[nArc], bin(nRawArc)[2:], "?", iNextNodeAddr)) + iAddr = iEndArcAddr+self.nBytesOffset + if (nRawArc & self._lastArcMask): + hDst.write("\ni{:_>10} -- #{:_>10}\n".format("?", iAddr)) + hDst.close() ADDED graphspell/keyboard_chars_proximity.py Index: graphspell/keyboard_chars_proximity.py ================================================================== --- /dev/null +++ graphspell/keyboard_chars_proximity.py @@ -0,0 +1,220 @@ +# Keyboard chars proximity + + +def getKeyboardMap (sKeyboard): + return _dKeyboardMap.get(sKeyboard.lower(), {}) + + +def getKeyboardList (): + return _dKeyboardMap.keys() + + +_dKeyboardMap = { + # keyboards by alphabetical order + # bépo, colemak and dvorak users are assumed to do less typing errors. + "azerty": { + # fr + # line 1 + "é": "az", + "è": "yu", + "ç": "àio", + "à": "op", + # line 2 + "a": "zéq", + "z": "aesq", + "e": "zrds", + "r": "etfd", + "t": "rygf", + "y": "tuhg", + "u": "yijh", + "i": "uokj", + "o": "iplk", + "p": "oml", + # line 3 + "q": "sawz", + "s": "qdzwxe", + "d": "sfexcr", + "f": "dgrcvt", + "g": "fhtvby", + "h": "gjybnu", + "j": "hkuni", + "k": "jlio", + "l": "kmop", + "m": "lùp", + "ù": "m", + # line 4 + "w": "xqs", + "x": "wcsd", + "c": "xvdf", + "v": "cbfg", + "b": "vngh", + "n": "bhj", + }, + "bépo": { + # fr + # line 2 + "b": "éa", + "é": "bpu", + "p": "éoi", + "o": "pèe", + "è": "o", + "v": "dt", + "d": "vls", + "l": "djr", + "j": "lzn", + "z": "jmw", + # line 3 + "a": "ubà", + "u": "aiéy", + "i": "uepx", + "e": "io", + "c": "t", + "t": "csvq", + "s": "trdg", + "r": "snlh", + "n": "rmjf", + "m": "nzç", + # line 4 + "à": "yêa", + "y": "àxu", + "x": "ywi", + "w": "z", + "k": "c", + "q": "gt", + "g": "qhs", + "h": "gfr", + "f": "hçn", + "ç": "fm", + }, + "colemak": { + # en, us, intl + # line 2 + "q": "wa", + "w": "qfr", + "f": "wps", + "p": "fgt", + "g": "pjd", + "j": "glh", + "l": "jun", + "u": "lye", + "y": "ui", + # line 3 + "a": "rqz", + "r": "aswx", + "s": "rtfc", + "t": "sdpv", + "d": "thgb", + "h": "dnjk", + "n": "helm", + "e": "niu", + "i": "eoy", + "o": "i", + # line 4 + "z": "xa", + "x": "zcr", + "c": "xvs", + "v": "cbt", + "b": "vkd", + "k": "bmh", + "m": "kn", + }, + "dvorak": { + # en, us, intl + # line 2 + "p": "yu", + "y": "pfi", + "f": "ygd", + "g": "fch", + "c": "grt", + "r": "cln", + "l": "rs", + # line 3 + "a": "o", + "o": "aeq", + "e": "ouj", + "u": "eipk", + "i": "udyx", + "d": "ihfb", + "h": "dtgm", + "t": "hncw", + "n": "tsrv", + "s": "nlz", + # line 4 + "q": "jo", + "j": "qke", + "k": "jxu", + "x": "kbi", + "b": "xmd", + "m": "bwh", + "w": "mvt", + "v": "wzn", + "z": "vs", + }, + "qwerty": { + # en, us, intl + # line 2 + "q": "wa", + "w": "qeas", + "e": "wrds", + "r": "etfd", + "t": "rygf", + "y": "tuhg", + "u": "yijh", + "i": "uokj", + "o": "iplk", + "p": "ol", + # line 3 + "a": "sqzw", + "s": "adwzxe", + "d": "sfexcr", + "f": "dgrcvt", + "g": "fhtvby", + "h": "gjybnu", + "j": "hkunmi", + "k": "jlimo", + "l": "kop", + # line 4 + "z": "xas", + "x": "zcsd", + "c": "xvdf", + "v": "cbfg", + "b": "vngh", + "n": "bmhj", + "m": "njk", + }, + "qwertz": { + # ge, au + # line 2 + "q": "wa", + "w": "qeas", + "e": "wrds", + "r": "etfd", + "t": "rzgf", + "z": "tuhg", + "u": "zijh", + "i": "uokj", + "o": "iplk", + "p": "oüöl", + "ü": "päö", + # line 3 + "a": "sqyw", + "s": "adwyxe", + "d": "sfexcr", + "f": "dgrcvt", + "g": "fhtvbz", + "h": "gjzbnu", + "j": "hkunmi", + 
"k": "jlimo", + "l": "köop", + "ö": "läpü", + "ä": "öü", + # line 4 + "y": "xas", + "x": "ycsd", + "c": "xvdf", + "v": "cbfg", + "b": "vngh", + "n": "bmhj", + "m": "njk", + } +} ADDED graphspell/progressbar.py Index: graphspell/progressbar.py ================================================================== --- /dev/null +++ graphspell/progressbar.py @@ -0,0 +1,35 @@ +# Textual progressbar +# by Olivier R. +# License: MPL 2 + +import time + +class ProgressBar: + "Textual progressbar" + + def __init__ (self, nMin=0, nMax=100, nWidth=78): + "initiate with minimum nMin to maximum nMax" + self.nMin = nMin + self.nMax = nMax + self.nSpan = nMax - nMin + self.nWidth = nWidth-9 + self.nAdvance = -1 + self.nCurVal = nMin + self.startTime = time.time() + self._update() + + def _update (self): + fDone = ((self.nCurVal - self.nMin) / self.nSpan) + nAdvance = int(fDone * self.nWidth) + if (nAdvance > self.nAdvance): + self.nAdvance = nAdvance + print("\r[ {}{} {}% ] ".format('>'*nAdvance, ' '*(self.nWidth-nAdvance), round(fDone*100)), end="") + + def increment (self, n=1): + "increment value by n (1 by default)" + self.nCurVal += n + self._update() + + def done (self): + "to call when it’s finished" + print("\r[ task done in {:.1f} s ] ".format(time.time() - self.startTime)) ADDED graphspell/spellchecker.py Index: graphspell/spellchecker.py ================================================================== --- /dev/null +++ graphspell/spellchecker.py @@ -0,0 +1,134 @@ +# Spellchecker +# Wrapper for the IBDAWG class. +# Useful to check several dictionaries at once. + +from . import ibdawg + + +dDictionaries = { + "fr": "French.bdic", + "en": "English.bdic" +} + + +class Spellchecker (): + + def __init__ (self, sLangCode): + self.sLangCode = sLangCode + self.oMainDic = None + if sLangCode in dDictionaries: + self.oMainDic = ibdawg.IBDAWG(dDictionaries[sLangCode]) + self.lOtherDic = [] + return bool(self.oMainDic) + + + def setMainDictionary (self, sDicName): + try: + self.oMainDic = ibdawg.IBDAWG(sDicName) + return True + except: + print("Error: <" + sDicName + "> not set as main dictionary.") + return False + + def addDictionary (self, sDicName): + try: + self.lOtherDic.append(ibdawg.IBDAWG(sDicName)) + return True + except: + print("Error: <" + sDicName + "> not added to the list.") + return False + + # Return codes: + # 0: invalid + # 1: correct in main dictionary + # 2+: correct in foreign dictionaries + + + # check in the main dictionary only + + def isValidToken (self, sToken): + "(in main dictionary) checks if sToken is valid (if there is hyphens in sToken, sToken is split, each part is checked)" + if self.oMainDic.isValidToken(sToken): + return 1 + return 0 + + def isValid (self, sWord): + "(in main dictionary) checks if sWord is valid (different casing tested if the first letter is a capital)" + if self.oMainDic.isValid(sWord): + return 1 + return 0 + + def lookup (self, sWord): + "(in main dictionary) checks if sWord is in dictionary as is (strict verification)" + if self.oMainDic.lookup(sWord): + return 1 + return 0 + + + # check in all dictionaries + + def isValidTokenAll (self, sToken): + "(in all dictionaries) checks if sToken is valid (if there is hyphens in sToken, sToken is split, each part is checked)" + if self.oMainDic.isValidToken(sToken): + return 1 + for i, oDic in enumerate(self.lOtherDic, 2): + if oDic.isValidToken(sToken): + return i + return 0 + + def isValidAll (self, sWord): + "(in all dictionaries) checks if sWord is valid (different casing tested if the first letter is a 
capital)" + if self.oMainDic.isValid(sToken): + return 1 + for i, oDic in enumerate(self.lOtherDic, 2): + if oDic.isValid(sToken): + return i + return 0 + + def lookupAll (self, sWord): + "(in all dictionaries) checks if sWord is in dictionary as is (strict verification)" + if self.oMainDic.lookup(sToken): + return 1 + for i, oDic in enumerate(self.lOtherDic, 2): + if oDic.lookup(sToken): + return i + return 0 + + + # check in dictionaries up to level n + + def isValidTokenLevel (self, sToken, nLevel): + "(in dictionaries up to level n) checks if sToken is valid (if there is hyphens in sToken, sToken is split, each part is checked)" + if self.oMainDic.isValidToken(sToken): + return 1 + if nLevel >= 2: + for i, oDic in enumerate(self.lOtherDic, 2): + if oDic.isValidToken(sToken): + return i + if i == nLevel: + break + return 0 + + def isValidLevel (self, sWord, nLevel): + "(in dictionaries up to level n) checks if sWord is valid (different casing tested if the first letter is a capital)" + if self.oMainDic.isValid(sToken): + return 1 + if nLevel >= 2: + for i, oDic in enumerate(self.lOtherDic, 2): + if oDic.isValid(sToken): + return i + if i == nLevel: + break + return 0 + + def lookupLevel (self, sWord, nLevel): + "(in dictionaries up to level n) checks if sWord is in dictionary as is (strict verification)" + if self.oMainDic.lookup(sToken): + return 1 + if nLevel >= 2: + for i, oDic in enumerate(self.lOtherDic, 2): + if oDic.lookup(sToken): + return i + if i == nLevel: + break + return 0 ADDED graphspell/str_transform.py Index: graphspell/str_transform.py ================================================================== --- /dev/null +++ graphspell/str_transform.py @@ -0,0 +1,203 @@ +#!python3 + + +#### DISTANCE CALCULATIONS + +def longestCommonSubstring (s1, s2): + # http://en.wikipedia.org/wiki/Longest_common_substring_problem + # http://en.wikibooks.org/wiki/Algorithm_implementation/Strings/Longest_common_substring + M = [ [0]*(1+len(s2)) for i in range(1+len(s1)) ] + longest, x_longest = 0, 0 + for x in range(1, 1+len(s1)): + for y in range(1, 1+len(s2)): + if s1[x-1] == s2[y-1]: + M[x][y] = M[x-1][y-1] + 1 + if M[x][y] > longest: + longest = M[x][y] + x_longest = x + else: + M[x][y] = 0 + return s1[x_longest-longest : x_longest] + + +def distanceDamerauLevenshtein (s1, s2): + "distance of Damerau-Levenshtein between and " + # https://fr.wikipedia.org/wiki/Distance_de_Damerau-Levenshtein + d = {} + nLen1 = len(s1) + nLen2 = len(s2) + for i in range(-1, nLen1+1): + d[i, -1] = i + 1 + for j in range(-1, nLen2+1): + d[-1, j] = j + 1 + for i in range(nLen1): + for j in range(nLen2): + nCost = 0 if s1[i] == s2[j] else 1 + d[i, j] = min( + d[i-1, j] + 1, # Deletion + d[i, j-1] + 1, # Insertion + d[i-1, j-1] + nCost, # Substitution + ) + if i and j and s1[i] == s2[j-1] and s1[i-1] == s2[j]: + d[i, j] = min(d[i, j], d[i-2, j-2] + nCost) # Transposition + return d[nLen1-1, nLen2-1] + + +def distanceSift4 (s1, s2, nMaxOffset=5): + "implementation of general Sift4." 
+    # https://siderite.blogspot.com/2014/11/super-fast-and-accurate-string-distance.html
+    if not s1:
+        return len(s2)
+    if not s2:
+        return len(s1)
+    nLen1, nLen2 = len(s1), len(s2)
+    i1, i2 = 0, 0   # Cursors for each string
+    nLargestCS = 0  # Largest common substring
+    nLocalCS = 0    # Local common substring
+    nTrans = 0      # Number of transpositions ('ab' vs 'ba')
+    lOffset = []    # Offset pair array, for computing the transpositions
+
+    while i1 < nLen1 and i2 < nLen2:
+        if s1[i1] == s2[i2]:
+            nLocalCS += 1
+            # Check if current match is a transposition
+            bTrans = False
+            i = 0
+            while i < len(lOffset):
+                t = lOffset[i]
+                if i1 <= t[0] or i2 <= t[1]:
+                    bTrans = abs(i2-i1) >= abs(t[1] - t[0])
+                    if bTrans:
+                        nTrans += 1
+                    elif not t[2]:
+                        t[2] = True
+                        nTrans += 1
+                    break
+                elif i1 > t[1] and i2 > t[0]:
+                    del lOffset[i]
+                else:
+                    i += 1
+            lOffset.append([i1, i2, bTrans])
+        else:
+            nLargestCS += nLocalCS
+            nLocalCS = 0
+            if i1 != i2:
+                i1 = i2 = min(i1, i2)
+            for i in range(nMaxOffset):
+                if i1 + i >= nLen1 and i2 + i >= nLen2:
+                    break
+                elif i1 + i < nLen1 and s1[i1+i] == s2[i2]:
+                    i1 += i - 1
+                    i2 -= 1
+                    break
+                elif i2 + i < nLen2 and s1[i1] == s2[i2+i]:
+                    i2 += i - 1
+                    i1 -= 1
+                    break
+        i1 += 1
+        i2 += 1
+        if i1 >= nLen1 or i2 >= nLen2:
+            nLargestCS += nLocalCS
+            nLocalCS = 0
+            i1 = i2 = min(i1, i2)
+    nLargestCS += nLocalCS
+    return round(max(nLen1, nLen2) - nLargestCS + nTrans)
+
+
+def showDistance (s1, s2):
+    print("Damerau-Levenshtein: {} / {} = {}".format(s1, s2, distanceDamerauLevenshtein(s1, s2)))
+    print("Sift4: {} / {} = {}".format(s1, s2, distanceSift4(s1, s2)))
+
+
+
+
+#### STEMMING OPERATIONS
+
+## No stemming
+
+def noStemming (sFlex, sStem):
+    return sStem
+
+def rebuildWord (sFlex, cmd1, cmd2):
+    if cmd1 == "_":
+        return sFlex
+    n, c = cmd1.split(":")
+    s = sFlex[:int(n)] + c + sFlex[int(n):]
+    if cmd2 == "_":
+        return s
+    n, c = cmd2.split(":")
+    return s[:int(n)] + c + s[int(n):]
+
+
+## Define affixes for stemming
+
+# Note: 48 is the ASCII code for "0"
+
+
+# Suffix only
+def defineSuffixCode (sFlex, sStem):
+    """ Returns a string defining how to get stem from flexion
+            "n(sfx)"
+        with n: a char with numeric meaning, "0" = 0, "1" = 1, ... ":" = 10, etc. (See ASCII table.) Says how many letters to strip from flexion.
+             sfx [optional]: string to add on flexion
+        Examples:
+            "0": strips nothing, adds nothing
+            "1er": strips 1 letter, adds "er"
+            "2": strips 2 letters, adds nothing
+    """
+    if sFlex == sStem:
+        return "0"
+    jSfx = 0
+    for i in range(min(len(sFlex), len(sStem))):
+        if sFlex[i] != sStem[i]:
+            break
+        jSfx += 1
+    return chr(len(sFlex)-jSfx+48) + sStem[jSfx:]
+
+
+def changeWordWithSuffixCode (sWord, sSfxCode):
+    if sSfxCode == "0":
+        return sWord
+    return sWord[:-(ord(sSfxCode[0])-48)] + sSfxCode[1:]  if sSfxCode[0] != '0'  else sWord + sSfxCode[1:]
+
+
+# Prefix and suffix
+
+def defineAffixCode (sFlex, sStem):
+    """ Returns a string defining how to get stem from flexion. Examples:
+            "0" if stem = flexion
+            "stem" if no common substring
+            "n(pfx)/m(sfx)"
+        with n and m: chars with numeric meaning, "0" = 0, "1" = 1, ... ":" = 10, etc. (See ASCII table.) Says how many letters to strip from flexion.
+             pfx [optional]: string to add before the flexion
+             sfx [optional]: string to add after the flexion
+    """
+    if sFlex == sStem:
+        return "0"
+    # is stem a substring of flexion?
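+    # (illustrative examples: defineAffixCode("retrouver", "trouver") -> "2/0",
+    #  strip 2 chars in front and 0 at the end; defineAffixCode("trouvait",
+    #  "trouver") -> "0/3er", strip nothing in front, strip "ait", add "er")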
+    n = sFlex.find(sStem)
+    if n >= 0:
+        return "{}/{}".format(chr(n+48), chr(len(sFlex)-(len(sStem)+n)+48))
+    # no, so we are looking for a common substring
+    sSubs = longestCommonSubstring(sFlex, sStem)
+    if len(sSubs) > 1:
+        iPos = sStem.find(sSubs)
+        sPfx = sStem[:iPos]
+        sSfx = sStem[iPos+len(sSubs):]
+        n = sFlex.find(sSubs)
+        m = len(sFlex) - (len(sSubs)+n)
+        sAff = "{}/".format(chr(n+48))  if not sPfx  else "{}{}/".format(chr(n+48), sPfx)
+        sAff += chr(m+48)  if not sSfx  else "{}{}".format(chr(m+48), sSfx)
+        return sAff
+    return sStem
+
+
+def changeWordWithAffixCode (sWord, sAffCode):
+    if sAffCode == "0":
+        return sWord
+    if '/' not in sAffCode:
+        return "# error #"
+    sPfxCode, sSfxCode = sAffCode.split('/')
+    sWord = sPfxCode[1:] + sWord[(ord(sPfxCode[0])-48):]
+    return sWord[:-(ord(sSfxCode[0])-48)] + sSfxCode[1:]  if sSfxCode[0] != '0'  else sWord + sSfxCode[1:]

ADDED graphspell/tokenizer.py
Index: graphspell/tokenizer.py
==================================================================
--- /dev/null
+++ graphspell/tokenizer.py
@@ -0,0 +1,49 @@
+# Very simple tokenizer
+
+import re
+
+_PATTERNS = {
+    "default":
+        (
+            r'(?P<FOLDERUNIX>/(?:bin|boot|dev|etc|home|lib|mnt|opt|root|sbin|tmp|usr|var|Bureau|Documents|Images|Musique|Public|Téléchargements|Vidéos)(?:/[\w.()-]+)*)',
+            r'(?P<FOLDERWIN>[a-zA-Z]:\\(?:Program Files(?: [(]x86[)]|)|[\w.()]+)(?:\\[\w.()-]+)*)',
+            r'(?P<PUNC>[.,?!:;…«»“”"()/·]+)',
+            r'(?P<ACRONYM>[A-Z][.][A-Z][.](?:[A-Z][.])*)',
+            r'(?P<LINK>(?:https?://|www[.]|\w+[@.]\w\w+[@.])\w[\w./?&!%=+*"\'@$#-]+)',
+            r'(?P<HASHTAG>[#@][\w-]+)',
+            r'(?P<HTML><\w+.*?>|</\w+ *>)',
+            r'(?P<PSEUDOHTML>\[/?\w+\])',
+            r'(?P<HOUR>\d\d?h\d\d\b)',
+            r'(?P<NUM>-?\d+(?:[.,]\d+|))',
+            r"(?P<WORD>\w+(?:[’'`-]\w+)*)"
+        ),
+    "fr":
+        (
+            r'(?P<FOLDERUNIX>/(?:bin|boot|dev|etc|home|lib|mnt|opt|root|sbin|tmp|usr|var|Bureau|Documents|Images|Musique|Public|Téléchargements|Vidéos)(?:/[\w.()-]+)*)',
+            r'(?P<FOLDERWIN>[a-zA-Z]:\\(?:Program Files(?: [(]x86[)]|)|[\w.()]+)(?:\\[\w.()-]+)*)',
+            r'(?P<PUNC>[.,?!:;…«»“”"()/·]+)',
+            r'(?P<ACRONYM>[A-Z][.][A-Z][.](?:[A-Z][.])*)',
+            r'(?P<LINK>(?:https?://|www[.]|\w+[@.]\w\w+[@.])\w[\w./?&!%=+*"\'@$#-]+)',
+            r'(?P<HASHTAG>[#@][\w-]+)',
+            r'(?P<HTML><\w+.*?>|</\w+ *>)',
+            r'(?P<PSEUDOHTML>\[/?\w+\])',
+            r"(?P<ELPFX>(?:l|d|n|m|t|s|j|c|ç|lorsqu|puisqu|jusqu|quoiqu|qu)['’`])",
+            r'(?P<ORDINAL>\d+(?:er|nd|e|de|ième|ème|eme)\b)',
+            r'(?P<HOUR>\d\d?h\d\d\b)',
+            r'(?P<NUM>-?\d+(?:[.,]\d+|))',
+            r"(?P<WORD>\w+(?:[’'`-]\w+)*)"
+        )
+}
+
+
+class Tokenizer:
+
+    def __init__ (self, sLang):
+        self.sLang = sLang
+        if sLang not in _PATTERNS:
+            self.sLang = "default"
+        # use self.sLang here: <sLang> may not be a key of _PATTERNS
+        self.zToken = re.compile( "(?i)" + '|'.join(sRegex for sRegex in _PATTERNS[self.sLang]) )
+
+    def genTokens (self, sText):
+        for m in self.zToken.finditer(sText):
+            yield { "sType": m.lastgroup, "sValue": m.group(), "nStart": m.start(), "nEnd": m.end() }

Index: helpers.py
==================================================================
--- helpers.py
+++ helpers.py
@@ -1,9 +1,10 @@
 # Useful tools
 
 import os
 import shutil
+import errno
 import zipfile
 
 from string import Template
 
@@ -55,10 +56,20 @@
     if not os.path.exists(sp):
         os.makedirs(sp, exist_ok=True)
     else:
         eraseFolder(sp)
 
+
+def copyFolderContent (spSrc, spDst):
+    try:
+        shutil.copytree(spSrc, spDst)
+    except OSError as e:
+        if e.errno == errno.ENOTDIR:
+            shutil.copy(spSrc, spDst)
+        else:
+            raise
+
 
 def fileFile (spf, dVars):
     "return file <spf> as a text field with variables from <dVars>"
     return Template(open(spf, "r", encoding="utf-8").read()).safe_substitute(dVars)

ADDED js_extension/map.js
Index: js_extension/map.js
==================================================================
--- /dev/null
+++ js_extension/map.js
@@ -0,0 +1,56 @@
+
+// Map
+/*jslint esversion: 6*/
+
+if (Map.prototype.grammalecte === undefined) {
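+    // These "gl_" helpers emulate the Python dict API (get with a default value,
+    // update, etc.) for code transpiled from the Python version; the "grammalecte"
+    // marker property ensures each prototype is extended only once.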
+    Map.prototype.gl_shallowCopy = function () {
+        let oNewMap = new Map();
+        for (let [key, val] of this.entries()) {
+            oNewMap.set(key, val);
+        }
+        return oNewMap;
+    };
+
+    Map.prototype.gl_get = function (key, defaultValue) {
+        let res = this.get(key);
+        if (res !== undefined) {
+            return res;
+        }
+        return defaultValue;
+    };
+
+    Map.prototype.gl_toString = function () {
+        // Default .toString() gives nothing useful
+        let sRes = "{ ";
+        for (let [k, v] of this.entries()) {
+            sRes += (typeof k === "string") ? '"' + k + '": ' : k.toString() + ": ";
+            sRes += (typeof v === "string") ? '"' + v + '", ' : v.toString() + ", ";
+        }
+        sRes = sRes.slice(0, -2) + " }";
+        return sRes;
+    };
+
+    Map.prototype.gl_update = function (dDict) {
+        for (let [k, v] of dDict.entries()) {
+            this.set(k, v);
+        }
+    };
+
+    Map.prototype.gl_updateOnlyExistingKeys = function (dDict) {
+        for (let [k, v] of dDict.entries()) {
+            if (this.has(k)) {
+                this.set(k, v);
+            }
+        }
+    };
+
+    Map.prototype.gl_reverse = function () {
+        let dNewMap = new Map();
+        this.forEach((val, key) => {
+            dNewMap.set(val, key);
+        });
+        return dNewMap;
+    };
+
+    Map.prototype.grammalecte = true;
+}

ADDED js_extension/regex.js
Index: js_extension/regex.js
==================================================================
--- /dev/null
+++ js_extension/regex.js
@@ -0,0 +1,90 @@
+
+// regex
+/*jslint esversion: 6*/
+
+if (RegExp.prototype.grammalecte === undefined) {
+    RegExp.prototype.gl_exec2 = function (sText, aGroupsPos, aNegLookBefore=null) {
+        let m;
+        while ((m = this.exec(sText)) !== null) {
+            // we have to keep iterating over sText here,
+            // because the first match may be invalid according to the negative look-before assertions,
+            // and an invalid first match doesn’t mean the following matches would be invalid too.
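+            // aNegLookBefore is an array of regex sources tested against RegExp.leftContext:
+            // the current match is kept only if none of them matches the text before it.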
+            if (aNegLookBefore !== null) {
+                // check negative look-before assertions
+                if ( !aNegLookBefore.some(sRegEx => (RegExp.leftContext.search(sRegEx) >= 0)) ) {
+                    break;
+                }
+            } else {
+                break;
+            }
+        }
+        if (m === null) {
+            return null;
+        }
+
+        let codePos;
+        let iPos = 0;
+        m.start = [m.index];
+        m.end = [this.lastIndex];
+        try {
+            if (m.length > 1) {
+                // there is subgroup(s)
+                if (aGroupsPos !== null) {
+                    // aGroupsPos is defined
+                    for (let i = 1; i <= m.length-1; i++) {
+                        codePos = aGroupsPos[i-1];
+                        if (typeof codePos === "number") {
+                            // position as a number
+                            m.start.push(m.index + codePos);
+                            m.end.push(m.index + codePos + m[i].length);
+                        } else if (codePos === "$") {
+                            // at the end of the pattern
+                            m.start.push(this.lastIndex - m[i].length);
+                            m.end.push(this.lastIndex);
+                        } else if (codePos === "w") {
+                            // word in the middle of the pattern
+                            iPos = m[0].search("[ ’,()«»“”]"+m[i]+"[ ,’()«»“”]") + 1 + m.index;
+                            m.start.push(iPos);
+                            m.end.push(iPos + m[i].length);
+                        } else if (codePos === "*") {
+                            // anywhere
+                            iPos = m[0].indexOf(m[i]) + m.index;
+                            m.start.push(iPos);
+                            m.end.push(iPos + m[i].length);
+                        } else if (codePos === "**") {
+                            // anywhere after previous group
+                            iPos = m[0].indexOf(m[i], m.end[i-1]-m.index) + m.index;
+                            m.start.push(iPos);
+                            m.end.push(iPos + m[i].length);
+                        } else if (codePos.startsWith(">")) {
+                            // >x:_
+                            // todo: look in substring x
+                            iPos = m[0].indexOf(m[i]) + m.index;
+                            m.start.push(iPos);
+                            m.end.push(iPos + m[i].length);
+                        } else {
+                            console.error("# Error: unknown positioning code in regex [" + this.source + "], for group[" + i.toString() +"], code: [" + codePos + "]");
+                        }
+                    }
+                } else {
+                    // no aGroupsPos
+                    for (let subm of m.slice(1)) {
+                        iPos = m[0].indexOf(subm) + m.index;
+                        m.start.push(iPos);
+                        m.end.push(iPos + subm.length);
+                    }
+                }
+            }
+        }
+        catch (e) {
+            if (typeof(helpers) !== "undefined") {
+                helpers.logerror(e);
+            } else {
+                console.error(e);
+            }
+        }
+        return m;
+    };
+
+    RegExp.prototype.grammalecte = true;
+}

ADDED js_extension/set.js
Index: js_extension/set.js
==================================================================
--- /dev/null
+++ js_extension/set.js
@@ -0,0 +1,13 @@
+
+// Set
+/*jslint esversion: 6*/
+
+if (Set.prototype.grammalecte === undefined) {
+    Set.prototype.gl_update = function (aSet) {
+        for (let elem of aSet) {
+            this.add(elem);
+        }
+    };
+
+    Set.prototype.grammalecte = true;
+}

ADDED js_extension/string.js
Index: js_extension/string.js
==================================================================
--- /dev/null
+++ js_extension/string.js
@@ -0,0 +1,58 @@
+
+// String
+/*jslint esversion: 6*/
+
+if (String.prototype.grammalecte === undefined) {
+    String.prototype.gl_count = function (sSearch, bOverlapping) {
+        // http://jsperf.com/string-ocurrence-split-vs-match/8
+        if (sSearch.length <= 0) {
+            return this.length + 1;
+        }
+        let nOccur = 0;
+        let iPos = 0;
+        let nStep = (bOverlapping) ? 1 : sSearch.length;
+        while ((iPos = this.indexOf(sSearch, iPos)) >= 0) {
+            nOccur++;
+            iPos += nStep;
+        }
+        return nOccur;
+    };
+    String.prototype.gl_isDigit = function () {
+        return (this.search(/^[0-9⁰¹²³⁴⁵⁶⁷⁸⁹]+$/) !== -1);
+    };
+    String.prototype.gl_isLowerCase = function () {
+        return (this.search(/^[a-zà-öø-ÿ0-9-]+$/) !== -1);
+    };
+    String.prototype.gl_isUpperCase = function () {
+        return (this.search(/^[A-ZÀ-ÖØ-ߌ0-9-]+$/) !== -1);
+    };
+    String.prototype.gl_isTitle = function () {
+        return (this.search(/^[A-ZÀ-ÖØ-ߌ][a-zà-öø-ÿ'’-]+$/) !== -1);
+    };
+    String.prototype.gl_toCapitalize = function () {
+        return this.slice(0,1).toUpperCase() + this.slice(1).toLowerCase();
+    };
+    String.prototype.gl_expand = function (oMatch) {
+        let sNew = this;
+        for (let i = 0; i < oMatch.length ; i++) {
+            let z = new RegExp("\\\\"+parseInt(i), "g");
+            sNew = sNew.replace(z, oMatch[i]);
+        }
+        return sNew;
+    };
+    String.prototype.gl_trimRight = function (sChars) {
+        let z = new RegExp("["+sChars+"]+$");
+        return this.replace(z, "");
+    };
+    String.prototype.gl_trimLeft = function (sChars) {
+        let z = new RegExp("^["+sChars+"]+");
+        return this.replace(z, "");
+    };
+    String.prototype.gl_trim = function (sChars) {
+        let z1 = new RegExp("^["+sChars+"]+");
+        let z2 = new RegExp("["+sChars+"]+$");
+        return this.replace(z1, "").replace(z2, "");
+    };
+
+    String.prototype.grammalecte = true;
+}

Index: lex_build.py
==================================================================
--- lex_build.py
+++ lex_build.py
@@ -3,24 +3,24 @@
 # Lexicon builder
 
 import argparse
 from distutils import dir_util
 
-import grammalecte.dawg as fsa
-from grammalecte.ibdawg import IBDAWG
+import graphspell.dawg as fsa
+from graphspell.ibdawg import IBDAWG
 
 
 def build (spfSrc, sLangName, sDicName, bJSON=False, cStemmingMethod="S", nCompressMethod=1):
     "transform a text lexicon into a binary indexable dictionary"
     oDAWG = fsa.DAWG(spfSrc, sLangName, cStemmingMethod)
-    dir_util.mkpath("grammalecte/_dictionaries")
-    oDAWG.writeInfo("grammalecte/_dictionaries/" + sDicName + ".info.txt")
-    oDAWG.createBinary("grammalecte/_dictionaries/" + sDicName + ".bdic", int(nCompressMethod))
+    dir_util.mkpath("graphspell/_dictionaries")
+    oDAWG.writeInfo("graphspell/_dictionaries/" + sDicName + ".info.txt")
+    oDAWG.createBinary("graphspell/_dictionaries/" + sDicName + ".bdic", int(nCompressMethod))
     if bJSON:
-        dir_util.mkpath("grammalecte-js/_dictionaries")
+        dir_util.mkpath("graphspell-js/_dictionaries")
         oDic = IBDAWG(sDicName + ".bdic")
-        oDic.writeAsJSObject("grammalecte-js/_dictionaries/" + sDicName + ".json", bBinaryDictAsHexString=True)
+        oDic.writeAsJSObject("graphspell-js/_dictionaries/" + sDicName + ".json", bBinaryDictAsHexString=True)
 
 
 def main ():
     xParser = argparse.ArgumentParser()
     xParser.add_argument("src_lexicon", type=str, help="path and file name of the source lexicon")

Index: make.py
==================================================================
--- make.py
+++ make.py
@@ -165,14 +165,17 @@
 
 def copyGrammalectePyPackageInZipFile (hZip, spLangPack, sDicName, sAddPath=""):
     for sf in os.listdir("grammalecte"):
         if not os.path.isdir("grammalecte/"+sf):
             hZip.write("grammalecte/"+sf, sAddPath+"grammalecte/"+sf)
+    for sf in os.listdir("grammalecte/graphspell"):
+        if not os.path.isdir("grammalecte/graphspell/"+sf):
+            hZip.write("grammalecte/graphspell/"+sf, sAddPath+"grammalecte/graphspell/"+sf)
+    hZip.write("grammalecte/graphspell/_dictionaries/"+sDicName, sAddPath+"grammalecte/graphspell/_dictionaries/"+sDicName)
    for sf in os.listdir(spLangPack):
         if not os.path.isdir(spLangPack+"/"+sf):
             hZip.write(spLangPack+"/"+sf, sAddPath+spLangPack+"/"+sf)
-    hZip.write("grammalecte/_dictionaries/"+sDicName, sAddPath+"grammalecte/_dictionaries/"+sDicName)
 
 
 def create (sLang, xConfig, bInstallOXT, bJavaScript):
     oNow = datetime.datetime.now()
     print("============== MAKE GRAMMALECTE [{0}] at {1.hour:>2} h {1.minute:>2} min {1.second:>2} s ==============".format(sLang, oNow))
 
@@ -274,10 +277,29 @@
     else:
         build_module.build(sLang, dVars, spLangPack)
 
     return dVars['version']
 
+
+def copyGraphspellCore ():
+    helpers.createCleanFolder("grammalecte/graphspell")
+    helpers.createCleanFolder("grammalecte-js/graphspell")
+    dir_util.mkpath("grammalecte/graphspell/_dictionaries")
+    dir_util.mkpath("grammalecte-js/graphspell/_dictionaries")
+    for sf in os.listdir("graphspell"):
+        if not os.path.isdir("graphspell/"+sf):
+            file_util.copy_file("graphspell/"+sf, "grammalecte/graphspell")
+    for sf in os.listdir("graphspell-js"):
+        if not os.path.isdir("graphspell-js/"+sf):
+            file_util.copy_file("graphspell-js/"+sf, "grammalecte-js/graphspell")
+
+
+def copyGraphspellDictionary (sDicName):
+    file_util.copy_file("graphspell/_dictionaries/"+sDicName.strip()+".bdic", "grammalecte/graphspell/_dictionaries")
+    file_util.copy_file("graphspell/_dictionaries/"+sDicName.strip()+".info.txt", "grammalecte/graphspell/_dictionaries")
+    file_util.copy_file("graphspell-js/_dictionaries/"+sDicName.strip()+".json", "grammalecte-js/graphspell/_dictionaries")
+
 
 def main ():
     print("Python: " + sys.version)
     xParser = argparse.ArgumentParser()
     xParser.add_argument("lang", type=str, nargs='+', help="lang project to generate (name of folder in /lang)")
@@ -300,10 +322,12 @@
         xArgs.build_data_after = True
 
     dir_util.mkpath("_build")
     dir_util.mkpath("grammalecte")
     dir_util.mkpath("grammalecte-js")
+
+    copyGraphspellCore()
 
     for sLang in xArgs.lang:
         if os.path.exists("gc_lang/"+sLang) and os.path.isdir("gc_lang/"+sLang):
             xConfig = getConfig(sLang)
             dVars = xConfig._sections['args']
@@ -322,16 +346,20 @@
                 build_data_module = importlib.import_module("gc_lang."+sLang+".build_data")
             except ImportError:
                 print("# Error. Couldn’t import file build_data.py in folder gc_lang/"+sLang)
             if build_data_module and xArgs.build_data_before:
                 build_data_module.before('gc_lang/'+sLang, dVars, xArgs.javascript)
-            if xArgs.dict or not os.path.exists("grammalecte/_dictionaries"):
+            if xArgs.dict:
                 import lex_build
                 lex_build.build(dVars['lexicon_src'], dVars['lang_name'], dVars['dic_name'], xArgs.javascript, dVars['stemming_method'], int(dVars['fsa_method']))
             if build_data_module and xArgs.build_data_after:
                 build_data_module.after('gc_lang/'+sLang, dVars, xArgs.javascript)
 
+            # copy dictionaries from Graphspell
+            for sDicName in dVars['dic_name'].split(","):
+                copyGraphspellDictionary(sDicName)
+
             # make
             sVersion = create(sLang, xConfig, xArgs.install, xArgs.javascript, )
 
             # tests
             if xArgs.tests or xArgs.perf or xArgs.perf_memo:

Index: reader.py
==================================================================
--- reader.py
+++ reader.py
@@ -3,11 +3,11 @@
 
 import os
 import sys
 import re
 
-import grammalecte.ibdawg as ibdawg
+import graphspell.ibdawg as ibdawg
 
 oDict = ibdawg.IBDAWG("French.bdic")
 
 
 def readFile (spf):
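Usage sketch for the pieces added in this check-in (illustrative only; it assumes the graphspell folder is importable from the current directory — the tokenizer itself needs no compiled dictionary, unlike reader.py above):

    from graphspell.tokenizer import Tokenizer
    from graphspell.str_transform import showDistance

    # sType is the name of the pattern group that matched (see _PATTERNS)
    oTokenizer = Tokenizer("fr")
    for dToken in oTokenizer.genTokens("Lorsqu’il a 32 ans…"):
        print(dToken["sType"], dToken["sValue"], dToken["nStart"], dToken["nEnd"])

    # compare both distance metrics on the same pair of words
    showDistance("chevaux", "cheval")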