DELETED gc_core/js/char_player.js Index: gc_core/js/char_player.js ================================================================== --- gc_core/js/char_player.js +++ /dev/null @@ -1,330 +0,0 @@ -// list of similar chars -// useful for suggestion mechanism - -${map} - - -var char_player = { - - _dTransChars: new Map([ - ['à', 'a'], ['é', 'e'], ['î', 'i'], ['ô', 'o'], ['û', 'u'], ['ÿ', 'i'], ['y', 'i'], - ['â', 'a'], ['è', 'e'], ['ï', 'i'], ['ö', 'o'], ['ù', 'u'], ['ŷ', 'i'], - ['ä', 'a'], ['ê', 'e'], ['í', 'i'], ['ó', 'o'], ['ü', 'u'], ['ý', 'i'], - ['á', 'a'], ['ë', 'e'], ['ì', 'i'], ['ò', 'o'], ['ú', 'u'], ['ỳ', 'i'], - ['ā', 'a'], ['ē', 'e'], ['ī', 'i'], ['ō', 'o'], ['ū', 'u'], ['ȳ', 'i'], - ['ñ', 'n'], ['k', 'q'], ['w', 'v'], - ['œ', 'oe'], ['æ', 'ae'], - ]), - - simplifyWord: function (sWord) { - // word simplication before calculating distance between words - sWord = sWord.toLowerCase(); - let sNewWord = ""; - let i = 1; - for (let c of sWord) { - let cNew = this._dTransChars.gl_get(c, c); - let cNext = sWord.slice(i, i+1) - if (cNew != this._dTransChars.gl_get(cNext, cNext)) { - sNewWord += cNew; - } - i++; - } - return sNewWord.replace(/eau/g, "o").replace(/au/g, "o").replace(/ai/g, "e").replace(/ei/g, "e").replace(/ph/g, "f"); - }, - - aVowel: new Set("aáàâäāeéèêëēiíìîïīoóòôöōuúùûüūyýỳŷÿȳœæAÁÀÂÄĀEÉÈÊËĒIÍÌÎÏĪOÓÒÔÖŌUÚÙÛÜŪYÝỲŶŸȲŒÆ"), - aConsonant: new Set("bcçdfghjklmnñpqrstvwxzBCÇDFGHJKLMNÑPQRSTVWXZ"), - aDouble: new Set("bcdfjklmnprstzBCDFJKLMNPRSTZ"), // letters that may be used twice successively - - - // Similar chars - - d1to1: new Map([ - ["1", "liîLIÎ"], - ["2", "zZ"], - ["3", "eéèêEÉÈÊ"], - ["4", "aàâAÀÂ"], - ["5", "sgSG"], - ["6", "bdgBDG"], - ["7", "ltLT"], - ["8", "bB"], - ["9", "gbdGBD"], - ["0", "oôOÔ"], - - ["a", "aàâáäæ"], - ["A", "AÀÂÁÄÆ"], - ["à", "aàâáäæ"], - ["À", "AÀÂÁÄÆ"], - ["â", "aàâáäæ"], - ["Â", "AÀÂÁÄÆ"], - ["á", "aàâáäæ"], - ["Á", "AÀÂÁÄÆ"], - ["ä", "aàâáäæ"], - ["Ä", "AÀÂÁÄÆ"], - - ["æ", "æéa"], - ["Æ", "ÆÉA"], - - ["c", "cçskqśŝ"], - ["C", "CÇSKQŚŜ"], - ["ç", "cçskqśŝ"], - ["Ç", "CÇSKQŚŜ"], - - ["e", "eéèêëœ"], - ["E", "EÉÈÊËŒ"], - ["é", "eéèêëœ"], - ["É", "EÉÈÊËŒ"], - ["ê", "eéèêëœ"], - ["Ê", "EÉÈÊËŒ"], - ["è", "eéèêëœ"], - ["È", "EÉÈÊËŒ"], - ["ë", "eéèêëœ"], - ["Ë", "EÉÈÊËŒ"], - - ["g", "gj"], - ["G", "GJ"], - - ["i", "iîïyíìÿ"], - ["I", "IÎÏYÍÌŸ"], - ["î", "iîïyíìÿ"], - ["Î", "IÎÏYÍÌŸ"], - ["ï", "iîïyíìÿ"], - ["Ï", "IÎÏYÍÌŸ"], - ["í", "iîïyíìÿ"], - ["Í", "IÎÏYÍÌŸ"], - ["ì", "iîïyíìÿ"], - ["Ì", "IÎÏYÍÌŸ"], - - ["j", "jg"], - ["J", "JG"], - - ["k", "kcq"], - ["K", "KCQ"], - - ["n", "nñ"], - ["N", "NÑ"], - - ["o", "oôóòöœ"], - ["O", "OÔÓÒÖŒ"], - ["ô", "oôóòöœ"], - ["Ô", "OÔÓÒÖŒ"], - ["ó", "oôóòöœ"], - ["Ó", "OÔÓÒÖŒ"], - ["ò", "oôóòöœ"], - ["Ò", "OÔÓÒÖŒ"], - ["ö", "oôóòöœ"], - ["Ö", "OÔÓÒÖŒ"], - - ["œ", "œoôeéèêë"], - ["Œ", "ŒOÔEÉÈÊË"], - - ["q", "qck"], - ["Q", "QCK"], - - ["s", "sśŝcç"], - ["S", "SŚŜCÇ"], - ["ś", "sśŝcç"], - ["Ś", "SŚŜCÇ"], - ["ŝ", "sśŝcç"], - ["Ŝ", "SŚŜCÇ"], - - ["u", "uûùüú"], - ["U", "UÛÙÜÚ"], - ["û", "uûùüú"], - ["Û", "UÛÙÜÚ"], - ["ù", "uûùüú"], - ["Ù", "UÛÙÜÚ"], - ["ü", "uûùüú"], - ["Ü", "UÛÙÜÚ"], - ["ú", "uûùüú"], - ["Ú", "UÛÙÜÚ"], - - ["v", "vw"], - ["V", "VW"], - - ["w", "wv"], - ["W", "WV"], - - ["x", "xck"], - ["X", "XCK"], - - ["y", "yÿiîŷýỳ"], - ["Y", "YŸIÎŶÝỲ"], - ["ÿ", "yÿiîŷýỳ"], - ["Ÿ", "YŸIÎŶÝỲ"], - ["ŷ", "yÿiîŷýỳ"], - ["Ŷ", "YŸIÎŶÝỲ"], - ["ý", "yÿiîŷýỳ"], - ["Ý", "YŸIÎŶÝỲ"], - ["ỳ", "yÿiîŷýỳ"], - ["Ỳ", "YŸIÎŶÝỲ"], - - ["z", "zs"], - ["Z", "ZS"], - ]), - - d1toX: new Map([ - ["æ", ["ae",]], - ["Æ", ["AE",]], - ["b", ["bb",]], - ["B", ["BB",]], - ["c", ["cc", "ss", "qu", "ch"]], - ["C", ["CC", "SS", "QU", "CH"]], - ["d", ["dd",]], - ["D", ["DD",]], - ["é", ["ai", "ei"]], - ["É", ["AI", "EI"]], - ["f", ["ff", "ph"]], - ["F", ["FF", "PH"]], - ["g", ["gu", "ge", "gg", "gh"]], - ["G", ["GU", "GE", "GG", "GH"]], - ["j", ["jj", "dj"]], - ["J", ["JJ", "DJ"]], - ["k", ["qu", "ck", "ch", "cu", "kk", "kh"]], - ["K", ["QU", "CK", "CH", "CU", "KK", "KH"]], - ["l", ["ll",]], - ["L", ["LL",]], - ["m", ["mm", "mn"]], - ["M", ["MM", "MN"]], - ["n", ["nn", "nm", "mn"]], - ["N", ["NN", "NM", "MN"]], - ["o", ["au", "eau"]], - ["O", ["AU", "EAU"]], - ["œ", ["oe", "eu"]], - ["Œ", ["OE", "EU"]], - ["p", ["pp", "ph"]], - ["P", ["PP", "PH"]], - ["q", ["qu", "ch", "cq", "ck", "kk"]], - ["Q", ["QU", "CH", "CQ", "CK", "KK"]], - ["r", ["rr",]], - ["R", ["RR",]], - ["s", ["ss", "sh"]], - ["S", ["SS", "SH"]], - ["t", ["tt", "th"]], - ["T", ["TT", "TH"]], - ["x", ["cc", "ct", "xx"]], - ["X", ["CC", "CT", "XX"]], - ["z", ["ss", "zh"]], - ["Z", ["SS", "ZH"]], - ]), - - get1toXReplacement: function (cPrev, cCur, cNext) { - if (this.aConsonant.has(cCur) && (this.aConsonant.has(cPrev) || this.aConsonant.has(cNext))) { - return []; - } - return this.d1toX.gl_get(cCur, []); - }, - - d2toX: new Map([ - ["am", ["an", "en", "em"]], - ["AM", ["AN", "EN", "EM"]], - ["an", ["am", "en", "em"]], - ["AN", ["AM", "EN", "EM"]], - ["au", ["eau", "o", "ô"]], - ["AU", ["EAU", "O", "Ô"]], - ["em", ["an", "am", "en"]], - ["EM", ["AN", "AM", "EN"]], - ["en", ["an", "am", "em"]], - ["EN", ["AN", "AM", "EM"]], - ["ai", ["ei", "é", "è", "ê", "ë"]], - ["AI", ["EI", "É", "È", "Ê", "Ë"]], - ["ei", ["ai", "é", "è", "ê", "ë"]], - ["EI", ["AI", "É", "È", "Ê", "Ë"]], - ["ch", ["sh", "c", "ss"]], - ["CH", ["SH", "C", "SS"]], - ["ct", ["x", "cc"]], - ["CT", ["X", "CC"]], - ["oa", ["oi",]], - ["OA", ["OI",]], - ["oi", ["oa", "oie"]], - ["OI", ["OA", "OIE"]], - ["ph", ["f",]], - ["PH", ["F",]], - ["qu", ["q", "cq", "ck", "c", "k"]], - ["QU", ["Q", "CQ", "CK", "C", "K"]], - ["ss", ["c", "ç"]], - ["SS", ["C", "Ç"]], - ["un", ["ein",]], - ["UN", ["EIN",]], - ]), - - // End of word - dFinal1: new Map([ - ["a", ["as", "at", "ant", "ah"]], - ["A", ["AS", "AT", "ANT", "AH"]], - ["c", ["ch",]], - ["C", ["CH",]], - ["e", ["et", "er", "ets", "ée", "ez", "ai", "ais", "ait", "ent", "eh"]], - ["E", ["ET", "ER", "ETS", "ÉE", "EZ", "AI", "AIS", "AIT", "ENT", "EH"]], - ["é", ["et", "er", "ets", "ée", "ez", "ai", "ais", "ait"]], - ["É", ["ET", "ER", "ETS", "ÉE", "EZ", "AI", "AIS", "AIT"]], - ["è", ["et", "er", "ets", "ée", "ez", "ai", "ais", "ait"]], - ["È", ["ET", "ER", "ETS", "ÉE", "EZ", "AI", "AIS", "AIT"]], - ["ê", ["et", "er", "ets", "ée", "ez", "ai", "ais", "ait"]], - ["Ê", ["ET", "ER", "ETS", "ÉE", "EZ", "AI", "AIS", "AIT"]], - ["ë", ["et", "er", "ets", "ée", "ez", "ai", "ais", "ait"]], - ["Ë", ["ET", "ER", "ETS", "ÉE", "EZ", "AI", "AIS", "AIT"]], - ["g", ["gh",]], - ["G", ["GH",]], - ["i", ["is", "it", "ie", "in"]], - ["I", ["IS", "IT", "IE", "IN"]], - ["n", ["nt", "nd", "ns", "nh"]], - ["N", ["NT", "ND", "NS", "NH"]], - ["o", ["aut", "ot", "os"]], - ["O", ["AUT", "OT", "OS"]], - ["ô", ["aut", "ot", "os"]], - ["Ô", ["AUT", "OT", "OS"]], - ["ö", ["aut", "ot", "os"]], - ["Ö", ["AUT", "OT", "OS"]], - ["p", ["ph",]], - ["P", ["PH",]], - ["s", ["sh",]], - ["S", ["SH",]], - ["t", ["th",]], - ["T", ["TH",]], - ["u", ["ut", "us", "uh"]], - ["U", ["UT", "US", "UH"]], - ]), - - dFinal2: new Map([ - ["ai", ["aient", "ais", "et"]], - ["AI", ["AIENT", "AIS", "ET"]], - ["an", ["ant", "ent"]], - ["AN", ["ANT", "ENT"]], - ["en", ["ent", "ant"]], - ["EN", ["ENT", "ANT"]], - ["ei", ["ait", "ais"]], - ["EI", ["AIT", "AIS"]], - ["on", ["ons", "ont"]], - ["ON", ["ONS", "ONT"]], - ["oi", ["ois", "oit", "oix"]], - ["OI", ["OIS", "OIT", "OIX"]], - ]), - - - // Préfixes et suffixes - aPfx1: new Set([ - "anti", "archi", "contre", "hyper", "mé", "méta", "im", "in", "ir", "par", "proto", - "pseudo", "pré", "re", "ré", "sans", "sous", "supra", "sur", "ultra" - ]), - - aPfx2: new Set([ - "belgo", "franco", "génito", "gynéco", "médico", "russo" - ]), - - - cut: function (sWord) { - // returns an arry of strings (prefix, trimed_word, suffix) - let m = /^([a-zA-Zà-öÀ-Ö0-9_ø-ÿØ-ßĀ-ʯfi-st]+)(-(?:t-|)(?:ils?|elles|on|je|tu|nous|vous)$)/.exec(sWord); - if (m) { - return ["", m[1], m[2]]; - } - return ["", sWord, ""]; - }, - - // Other functions - filterSugg: function (aSugg) { - return aSugg.filter((sSugg) => { return !sSugg.endsWith("è") && !sSugg.endsWith("È"); }); - } - -} DELETED gc_core/js/helpers.js Index: gc_core/js/helpers.js ================================================================== --- gc_core/js/helpers.js +++ /dev/null @@ -1,100 +0,0 @@ - -// HELPERS -/*jslint esversion: 6*/ -/*global console,require,exports,XMLHttpRequest*/ - -"use strict"; - -// In Firefox, there is no console.log in PromiseWorker, but there is worker.log. -// In Thunderbird, you can’t access to console directly. So it’s required to pass a log function. -let funcOutput = null; - -var helpers = { - - setLogOutput: function (func) { - funcOutput = func; - }, - - echo: function (obj) { - if (funcOutput !== null) { - funcOutput(obj); - } else { - console.log(obj); - } - return true; - }, - - logerror: function (e, bStack=false) { - let sMsg = "\n" + e.fileName + "\n" + e.name + "\nline: " + e.lineNumber + "\n" + e.message; - if (bStack) { - sMsg += "\n--- Stack ---\n" + e.stack; - } - if (funcOutput !== null) { - funcOutput(sMsg); - } else { - console.error(sMsg); - } - }, - - inspect: function (o) { - let sMsg = "__inspect__: " + typeof o; - for (let sParam in o) { - sMsg += "\n" + sParam + ": " + o.sParam; - } - sMsg += "\n" + JSON.stringify(o) + "\n__end__"; - this.echo(sMsg); - }, - - loadFile: function (spf) { - // load ressources in workers (suggested by Mozilla extensions reviewers) - // for more options have a look here: https://gist.github.com/Noitidart/ec1e6b9a593ec7e3efed - // if not in workers, use sdk/data.load() instead - try { - let xRequest; - if (typeof XMLHttpRequest !== "undefined") { - xRequest = new XMLHttpRequest(); - } else { - // JS sucks again… necessary for Thunderbird - let { Cc, Ci } = require("chrome"); - xRequest = Cc["@mozilla.org/xmlextras/xmlhttprequest;1"].createInstance(); - xRequest.QueryInterface(Ci.nsIXMLHttpRequest); - } - xRequest.open('GET', spf, false); // 3rd arg is false for synchronous, sync is acceptable in workers - xRequest.overrideMimeType('text/json'); - xRequest.send(); - return xRequest.responseText; - } - catch (e) { - this.logerror(e); - return null; - } - }, - - // conversions - objectToMap: function (obj) { - let m = new Map(); - for (let param in obj) { - m.set(param, obj[param]); - } - return m; - }, - - mapToObject: function (m) { - let obj = {}; - for (let [k, v] of m) { - obj[k] = v; - } - return obj; - } -}; - - -if (typeof(exports) !== 'undefined') { - exports.setLogOutput = helpers.setLogOutput; - exports.echo = helpers.echo; - exports.logerror = helpers.logerror; - exports.inspect = helpers.inspect; - exports.loadFile = helpers.loadFile; - exports.objectToMap = helpers.objectToMap; - exports.mapToObject = helpers.mapToObject; -} DELETED gc_core/js/ibdawg.js Index: gc_core/js/ibdawg.js ================================================================== --- gc_core/js/ibdawg.js +++ /dev/null @@ -1,513 +0,0 @@ -//// IBDAWG -/*jslint esversion: 6*/ -/*global console,require,exports*/ - -"use strict"; - - -if (typeof(require) !== 'undefined') { - var str_transform = require("resource://grammalecte/str_transform.js"); - var helpers = require("resource://grammalecte/helpers.js"); - var char_player = require("resource://grammalecte/char_player.js"); -} - - -// Don’t remove . Necessary in TB. -${string} -${map} -${set} - - -class SuggResult { - // Structure for storing, classifying and filtering suggestions - - constructor (sWord, nDistLimit=-1) { - this.sWord = sWord; - this.sSimplifiedWord = char_player.simplifyWord(sWord); - this.nDistLimit = (nDistLimit >= 0) ? nDistLimit : Math.floor(sWord.length / 3) + 1; - this.nMinDist = 1000; - this.aSugg = new Set(); - this.dSugg = new Map([ [0, []], [1, []], [2, []] ]); - } - - addSugg (sSugg, nDeep=0) { - // add a suggestion - if (!this.aSugg.has(sSugg)) { - let nDist = str_transform.distanceDamerauLevenshtein(this.sSimplifiedWord, char_player.simplifyWord(sSugg)); - if (nDist <= this.nDistLimit) { - if (!this.dSugg.has(nDist)) { - this.dSugg.set(nDist, []); - } - this.dSugg.get(nDist).push(sSugg); - this.aSugg.add(sSugg); - if (nDist < this.nMinDist) { - this.nMinDist = nDist; - } - this.nDistLimit = Math.min(this.nDistLimit, this.nMinDist+2); - } - } - } - - getSuggestions (nSuggLimit=10, nDistLimit=-1) { - // return a list of suggestions - let lRes = []; - if (this.dSugg.get(0).length) { - // we sort the better results with the original word - let dDistTemp = new Map(); - lRes.forEach((sSugg) => { dDistTemp.set(sSugg, str_transform.distanceDamerauLevenshtein(this.sWord, sSugg)); }); - lRes = lRes.sort((sA, sB) => { return dDistTemp.get(sA) - dDistTemp.get(sB); }); - dDistTemp.clear(); - } - for (let lSugg of this.dSugg.values()) { - for (let sSugg of lSugg) { lRes.push(sSugg); } - if (lRes.length > nSuggLimit) { - break; - } - } - lRes = char_player.filterSugg(lRes); - if (this.sWord.gl_isTitle()) { - lRes = lRes.map((sSugg) => { return sSugg.gl_toCapitalize(); }); - } - else if (this.sWord.gl_isUpperCase()) { - lRes = lRes.map((sSugg) => { return sSugg.toUpperCase(); }); - } - return lRes.slice(0, nSuggLimit); - } - - reset () { - this.aSugg.clear(); - this.dSugg.clear(); - } -} - - -class IBDAWG { - // INDEXABLE BINARY DIRECT ACYCLIC WORD GRAPH - - constructor (sDicName, sPath="") { - try { - let sURL = (sPath !== "") ? sPath + "/" + sDicName : "resource://grammalecte/_dictionaries/"+sDicName; - const dict = JSON.parse(helpers.loadFile(sURL)); - Object.assign(this, dict); - } - catch (e) { - throw Error("# Error. File not found or not loadable.\n" + e.message + "\n"); - } - /* - Properties: - sName, nVersion, sHeader, lArcVal, nArcVal, byDic, sLang, nChar, nBytesArc, nBytesNodeAddress, - nEntries, nNode, nArc, nAff, cStemming, nTag, dChar, _arcMask, _finalNodeMask, _lastArcMask, _addrBitMask, nBytesOffset, - */ - - /* - Bug workaround. - Mozilla’s JS parser sucks. Can’t read file bigger than 4 Mb! - So we convert huge hexadecimal string to list of numbers… - https://github.com/mozilla/addons-linter/issues/1361 - */ - let lTemp = []; - for (let i = 0; i < this.byDic.length; i+=2) { - lTemp.push(parseInt(this.byDic.slice(i, i+2), 16)); - } - this.byDic = lTemp; - /* end of bug workaround */ - - if (!this.sHeader.startsWith("/pyfsa/")) { - throw TypeError("# Error. Not a pyfsa binary dictionary. Header: " + this.sHeader); - } - if (!(this.nVersion == "1" || this.nVersion == "2" || this.nVersion == "3")) { - throw RangeError("# Error. Unknown dictionary version: " + this.nVersion); - } - // to get the value of an arc, to get the char of an arc with its value - this.dChar = helpers.objectToMap(this.dChar); - this.dCharVal = this.dChar.gl_reverse(); - //this.byDic = new Uint8Array(this.byDic); // not quicker, even slower - - if (this.cStemming == "S") { - this.funcStemming = str_transform.getStemFromSuffixCode; - } else if (this.cStemming == "A") { - this.funcStemming = str_transform.getStemFromAffixCode; - } else { - this.funcStemming = str_transform.noStemming; - } - - // Configuring DAWG functions according to nVersion - switch (this.nVersion) { - case 1: - this.morph = this._morph1; - this.stem = this._stem1; - this._lookupArcNode = this._lookupArcNode1; - this._getArcs = this._getArcs1; - this._writeNodes = this._writeNodes1; - break; - case 2: - this.morph = this._morph2; - this.stem = this._stem2; - this._lookupArcNode = this._lookupArcNode2; - this._getArcs = this._getArcs2; - this._writeNodes = this._writeNodes2; - break; - case 3: - this.morph = this._morph3; - this.stem = this._stem3; - this._lookupArcNode = this._lookupArcNode3; - this._getArcs = this._getArcs3; - this._writeNodes = this._writeNodes3; - break; - default: - throw ValueError("# Error: unknown code: " + this.nVersion); - } - //console.log(this.getInfo()); - this.bOptNumSigle = true; - this.bOptNumAtLast = false; - } - - getInfo () { - return ` Language: ${this.sLang} Version: ${this.nVersion} Stemming: ${this.cStemming}FX\n` + - ` Arcs values: ${this.nArcVal} = ${this.nChar} characters, ${this.nAff} affixes, ${this.nTag} tags\n` + - ` Dictionary: ${this.nEntries} entries, ${this.nNode} nodes, ${this.nArc} arcs\n` + - ` Address size: ${this.nBytesNodeAddress} bytes, Arc size: ${this.nBytesArc} bytes\n`; - } - - isValidToken (sToken) { - // checks if sToken is valid (if there is hyphens in sToken, sToken is split, each part is checked) - if (this.isValid(sToken)) { - return true; - } - if (sToken.includes("-")) { - if (sToken.gl_count("-") > 4) { - return true; - } - return sToken.split("-").every(sWord => this.isValid(sWord)); - } - return false; - } - - isValid (sWord) { - // checks if sWord is valid (different casing tested if the first letter is a capital) - if (!sWord) { - return null; - } - if (sWord.includes("’")) { // ugly hack - sWord = sWord.replace("’", "'"); - } - if (this.lookup(sWord)) { - return true; - } - if (sWord.charAt(0).gl_isUpperCase()) { - if (sWord.length > 1) { - if (sWord.gl_isTitle()) { - return !!this.lookup(sWord.toLowerCase()); - } - if (sWord.gl_isUpperCase()) { - if (this.bOptNumSigle) { - return true; - } - return !!(this.lookup(sWord.toLowerCase()) || this.lookup(sWord.gl_toCapitalize())); - } - return !!this.lookup(sWord.slice(0, 1).toLowerCase() + sWord.slice(1)); - } else { - return !!this.lookup(sWord.toLowerCase()); - } - } - return false; - } - - _convBytesToInteger (aBytes) { - // Byte order = Big Endian (bigger first) - let nVal = 0; - let nWeight = (aBytes.length - 1) * 8; - for (let n of aBytes) { - nVal += n << nWeight; - nWeight = nWeight - 8; - } - return nVal; - } - - lookup (sWord) { - // returns true if sWord in dictionary (strict verification) - let iAddr = 0; - for (let c of sWord) { - if (!this.dChar.has(c)) { - return false; - } - iAddr = this._lookupArcNode(this.dChar.get(c), iAddr); - if (iAddr === null) { - return false; - } - } - return Boolean(this._convBytesToInteger(this.byDic.slice(iAddr, iAddr+this.nBytesArc)) & this._finalNodeMask); - } - - getMorph (sWord) { - // retrieves morphologies list, different casing allowed - let l = this.morph(sWord); - if (sWord[0].gl_isUpperCase()) { - l = l.concat(this.morph(sWord.toLowerCase())); - if (sWord.gl_isUpperCase() && sWord.length > 1) { - l = l.concat(this.morph(sWord.gl_toCapitalize())); - } - } - return l; - } - - suggest (sWord, nSuggLimit=10) { - // returns a array of suggestions for - let sPfx = ""; - let sSfx = ""; - [sPfx, sWord, sSfx] = char_player.cut(sWord); - let nMaxSwitch = Math.max(Math.floor(sWord.length / 3), 1); - let nMaxDel = Math.floor(sWord.length / 5); - let nMaxHardRepl = Math.max(Math.floor((sWord.length - 5) / 4), 1); - let oSuggResult = new SuggResult(sWord); - this._suggest(oSuggResult, sWord, nMaxSwitch, nMaxDel, nMaxHardRepl); - if (sWord.gl_isTitle()) { - this._suggest(oSuggResult, sWord.toLowerCase(), nMaxSwitch, nMaxDel, nMaxHardRepl); - } - else if (sWord.gl_isLowerCase()) { - this._suggest(oSuggResult, sWord.gl_toCapitalize(), nMaxSwitch, nMaxDel, nMaxHardRepl); - } - let aSugg = oSuggResult.getSuggestions(nSuggLimit); - if (sSfx || sPfx) { - // we add what we removed - return aSugg.map( (sSugg) => { return sPfx + sSugg + sSfx } ); - } - return aSugg; - } - - _suggest (oSuggResult, sRemain, nMaxSwitch=0, nMaxDel=0, nMaxHardRepl=0, nDeep=0, iAddr=0, sNewWord="", bAvoidLoop=false) { - // returns a set of suggestions - // recursive function - if (sRemain == "") { - if (this._convBytesToInteger(this.byDic.slice(iAddr, iAddr+this.nBytesArc)) & this._finalNodeMask) { - oSuggResult.addSugg(sNewWord); - } - for (let sTail of this._getTails(iAddr)) { - oSuggResult.addSugg(sNewWord+sTail); - } - return; - } - let cCurrent = sRemain.slice(0, 1); - for (let [cChar, jAddr] of this._getCharArcs(iAddr)) { - if (char_player.d1to1.gl_get(cCurrent, cCurrent).indexOf(cChar) != -1) { - this._suggest(oSuggResult, sRemain.slice(1), nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, jAddr, sNewWord+cChar); - } - else if (!bAvoidLoop && nMaxHardRepl) { - this._suggest(oSuggResult, sRemain.slice(1), nMaxSwitch, nMaxDel, nMaxHardRepl-1, nDeep+1, jAddr, sNewWord+cChar, true); - } - } - if (!bAvoidLoop) { // avoid infinite loop - if (sRemain.length > 1) { - if (cCurrent == sRemain.slice(1, 2)) { - // same char, we remove 1 char without adding 1 to - this._suggest(oSuggResult, sRemain.slice(1), nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord); - } - else { - // switching chars - if (nMaxSwitch > 0) { - this._suggest(oSuggResult, sRemain.slice(1, 2)+sRemain.slice(0, 1)+sRemain.slice(2), nMaxSwitch-1, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, true); - } - // delete char - if (nMaxDel > 0) { - this._suggest(oSuggResult, sRemain.slice(1), nMaxSwitch, nMaxDel-1, nMaxHardRepl, nDeep+1, iAddr, sNewWord, true); - } - } - // Phonetic replacements - for (let sRepl of char_player.get1toXReplacement(sNewWord.slice(-1), cCurrent, sRemain.slice(1,2))) { - this._suggest(oSuggResult, sRepl + sRemain.slice(1), nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, true); - } - for (let sRepl of char_player.d2toX.gl_get(sRemain.slice(0, 2), [])) { - this._suggest(oSuggResult, sRepl + sRemain.slice(2), nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, true); - } - } - // end of word - if (sRemain.length == 2) { - for (let sRepl of char_player.dFinal2.gl_get(sRemain, [])) { - this._suggest(oSuggResult, sRepl, nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, true); - } - } - else if (sRemain.length == 1) { - this._suggest(oSuggResult, "", nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, true); // remove last char and go on - for (let sRepl of char_player.dFinal1.gl_get(sRemain, [])) { - this._suggest(oSuggResult, sRepl, nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, true); - } - } - } - } - - * _getCharArcs (iAddr) { - // generator: yield all chars and addresses from node at address - for (let [nVal, jAddr] of this._getArcs(iAddr)) { - if (nVal < this.nChar) { - yield [this.dCharVal.get(nVal), jAddr]; - } - } - } - - * _getSimilarCharArcs (cChar, iAddr) { - // generator: yield similar char of and address of the following node - for (let c of char_player.d1to1.gl_get(cChar, [cChar])) { - if (this.dChar.has(c)) { - let jAddr = this._lookupArcNode(this.dChar.get(c), iAddr); - if (jAddr) { - yield [c, jAddr]; - } - } - } - } - - _getTails (iAddr, sTail="", n=2) { - // return a list of suffixes ending at a distance of from - let aTails = new Set(); - for (let [nVal, jAddr] of this._getArcs(iAddr)) { - if (nVal < this.nChar) { - if (this._convBytesToInteger(this.byDic.slice(jAddr, jAddr+this.nBytesArc)) & this._finalNodeMask) { - aTails.add(sTail + this.dCharVal.get(nVal)); - } - if (n && aTails.size == 0) { - aTails.gl_update(this._getTails(jAddr, sTail+this.dCharVal.get(nVal), n-1)); - } - } - } - return aTails; - } - - // morph (sWord) { - // is defined in constructor - // } - - // VERSION 1 - _morph1 (sWord) { - // returns morphologies of sWord - let iAddr = 0; - for (let c of sWord) { - if (!this.dChar.has(c)) { - return []; - } - iAddr = this._lookupArcNode(this.dChar.get(c), iAddr); - if (iAddr === null) { - return []; - } - } - if (this._convBytesToInteger(this.byDic.slice(iAddr, iAddr+this.nBytesArc)) & this._finalNodeMask) { - let l = []; - let nRawArc = 0; - while (!(nRawArc & this._lastArcMask)) { - let iEndArcAddr = iAddr + this.nBytesArc; - nRawArc = this._convBytesToInteger(this.byDic.slice(iAddr, iEndArcAddr)); - let nArc = nRawArc & this._arcMask; - if (nArc >= this.nChar) { - // This value is not a char, this is a stemming code - let sStem = ">" + this.funcStemming(sWord, this.lArcVal[nArc]); - // Now , we go to the next node and retrieve all following arcs values, all of them are tags - let iAddr2 = this._convBytesToInteger(this.byDic.slice(iEndArcAddr, iEndArcAddr+this.nBytesNodeAddress)); - let nRawArc2 = 0; - while (!(nRawArc2 & this._lastArcMask)) { - let iEndArcAddr2 = iAddr2 + this.nBytesArc; - nRawArc2 = this._convBytesToInteger(this.byDic.slice(iAddr2, iEndArcAddr2)); - l.push(sStem + " " + this.lArcVal[nRawArc2 & this._arcMask]); - iAddr2 = iEndArcAddr2+this.nBytesNodeAddress; - } - } - iAddr = iEndArcAddr + this.nBytesNodeAddress; - } - return l; - } - return []; - } - - _stem1 (sWord) { - // returns stems list of sWord - let iAddr = 0; - for (let c of sWord) { - if (!this.dChar.has(c)) { - return []; - } - iAddr = this._lookupArcNode(this.dChar.get(c), iAddr); - if (iAddr === null) { - return []; - } - } - if (this._convBytesToInteger(this.byDic.slice(iAddr, iAddr+this.nBytesArc)) & this._finalNodeMask) { - let l = []; - let nRawArc = 0; - while (!(nRawArc & this._lastArcMask)) { - let iEndArcAddr = iAddr + this.nBytesArc; - nRawArc = this._convBytesToInteger(this.byDic.slice(iAddr, iEndArcAddr)); - let nArc = nRawArc & this._arcMask; - if (nArc >= this.nChar) { - // This value is not a char, this is a stemming code - l.push(this.funcStemming(sWord, this.lArcVal[nArc])); - } - iAddr = iEndArcAddr + this.nBytesNodeAddress; - } - return l; - } - return []; - } - - _lookupArcNode1 (nVal, iAddr) { - // looks if nVal is an arc at the node at iAddr, if yes, returns address of next node else None - while (true) { - let iEndArcAddr = iAddr+this.nBytesArc; - let nRawArc = this._convBytesToInteger(this.byDic.slice(iAddr, iEndArcAddr)); - if (nVal == (nRawArc & this._arcMask)) { - // the value we are looking for - // we return the address of the next node - return this._convBytesToInteger(this.byDic.slice(iEndArcAddr, iEndArcAddr+this.nBytesNodeAddress)); - } - else { - // value not found - if (nRawArc & this._lastArcMask) { - return null; - } - iAddr = iEndArcAddr + this.nBytesNodeAddress; - } - } - } - - * _getArcs1 (iAddr) { - "generator: return all arcs at as tuples of (nVal, iAddr)" - while (true) { - let iEndArcAddr = iAddr+this.nBytesArc; - let nRawArc = this._convBytesToInteger(this.byDic.slice(iAddr, iEndArcAddr)); - yield [nRawArc & this._arcMask, this._convBytesToInteger(this.byDic.slice(iEndArcAddr, iEndArcAddr+this.nBytesNodeAddress))]; - if (nRawArc & this._lastArcMask) { - break; - } - iAddr = iEndArcAddr+this.nBytesNodeAddress; - } - } - - // VERSION 2 - _morph2 (sWord) { - // to do - } - - _stem2 (sWord) { - // to do - } - - _lookupArcNode2 (nVal, iAddr) { - // to do - } - - - // VERSION 3 - _morph3 (sWord) { - // to do - } - - _stem3 (sWord) { - // to do - } - - _lookupArcNode3 (nVal, iAddr) { - // to do - } -} - - -if (typeof(exports) !== 'undefined') { - exports.IBDAWG = IBDAWG; -} DELETED gc_core/js/jsex_map.js Index: gc_core/js/jsex_map.js ================================================================== --- gc_core/js/jsex_map.js +++ /dev/null @@ -1,56 +0,0 @@ - -// Map -/*jslint esversion: 6*/ - -if (Map.prototype.grammalecte === undefined) { - Map.prototype.gl_shallowCopy = function () { - let oNewMap = new Map(); - for (let [key, val] of this.entries()) { - oNewMap.set(key, val); - } - return oNewMap; - }; - - Map.prototype.gl_get = function (key, defaultValue) { - let res = this.get(key); - if (res !== undefined) { - return res; - } - return defaultValue; - }; - - Map.prototype.gl_toString = function () { - // Default .toString() gives nothing useful - let sRes = "{ "; - for (let [k, v] of this.entries()) { - sRes += (typeof k === "string") ? '"' + k + '": ' : k.toString() + ": "; - sRes += (typeof v === "string") ? '"' + v + '", ' : v.toString() + ", "; - } - sRes = sRes.slice(0, -2) + " }"; - return sRes; - }; - - Map.prototype.gl_update = function (dDict) { - for (let [k, v] of dDict.entries()) { - this.set(k, v); - } - }; - - Map.prototype.gl_updateOnlyExistingKeys = function (dDict) { - for (let [k, v] of dDict.entries()) { - if (this.has(k)){ - this.set(k, v); - } - } - }; - - Map.prototype.gl_reverse = function () { - let dNewMap = new Map(); - this.forEach((val, key) => { - dNewMap.set(val, key); - }); - return dNewMap; - }; - - Map.prototype.grammalecte = true; -} DELETED gc_core/js/jsex_regex.js Index: gc_core/js/jsex_regex.js ================================================================== --- gc_core/js/jsex_regex.js +++ /dev/null @@ -1,90 +0,0 @@ - -// regex -/*jslint esversion: 6*/ - -if (RegExp.prototype.grammalecte === undefined) { - RegExp.prototype.gl_exec2 = function (sText, aGroupsPos, aNegLookBefore=null) { - let m; - while ((m = this.exec(sText)) !== null) { - // we have to iterate over sText here too - // because first match doesn’t imply it’s a valid match according to negative lookbefore assertions, - // and even if first match is finally invalid, it doesn’t mean the following eligible matchs would be invalid too. - if (aNegLookBefore !== null) { - // check negative look before assertions - if ( !aNegLookBefore.some(sRegEx => (RegExp.leftContext.search(sRegEx) >= 0)) ) { - break; - } - } else { - break; - } - } - if (m === null) { - return null; - } - - let codePos; - let iPos = 0; - m.start = [m.index]; - m.end = [this.lastIndex]; - try { - if (m.length > 1) { - // there is subgroup(s) - if (aGroupsPos !== null) { - // aGroupsPos is defined - for (let i = 1; i <= m.length-1; i++) { - codePos = aGroupsPos[i-1]; - if (typeof codePos === "number") { - // position as a number - m.start.push(m.index + codePos); - m.end.push(m.index + codePos + m[i].length); - } else if (codePos === "$") { - // at the end of the pattern - m.start.push(this.lastIndex - m[i].length); - m.end.push(this.lastIndex); - } else if (codePos === "w") { - // word in the middle of the pattern - iPos = m[0].search("[ ’,()«»“”]"+m[i]+"[ ,’()«»“”]") + 1 + m.index; - m.start.push(iPos); - m.end.push(iPos + m[i].length); - } else if (codePos === "*") { - // anywhere - iPos = m[0].indexOf(m[i]) + m.index; - m.start.push(iPos); - m.end.push(iPos + m[i].length); - } else if (codePos === "**") { - // anywhere after previous group - iPos = m[0].indexOf(m[i], m.end[i-1]-m.index) + m.index; - m.start.push(iPos); - m.end.push(iPos + m[i].length); - } else if (codePos.startsWith(">")) { - // >x:_ - // todo: look in substring x - iPos = m[0].indexOf(m[i]) + m.index; - m.start.push(iPos); - m.end.push(iPos + m[i].length); - } else { - console.error("# Error: unknown positioning code in regex [" + this.source + "], for group[" + i.toString() +"], code: [" + codePos + "]"); - } - } - } else { - // no aGroupsPos - for (let subm of m.slice(1)) { - iPos = m[0].indexOf(subm) + m.index; - m.start.push(iPos); - m.end.push(iPos + subm.length); - } - } - } - } - catch (e) { - if (typeof(helpers) !== "undefined") { - helpers.logerror(e); - } else { - console.error(e); - } - } - return m; - }; - - RegExp.prototype.grammalecte = true; -} DELETED gc_core/js/jsex_set.js Index: gc_core/js/jsex_set.js ================================================================== --- gc_core/js/jsex_set.js +++ /dev/null @@ -1,13 +0,0 @@ - -// Set -/*jslint esversion: 6*/ - -if (Set.prototype.grammalecte === undefined) { - Set.prototype.gl_update = function (aSet) { - for (let elem of aSet) { - this.add(elem); - } - }; - - Set.prototype.grammalecte = true; -} DELETED gc_core/js/jsex_string.js Index: gc_core/js/jsex_string.js ================================================================== --- gc_core/js/jsex_string.js +++ /dev/null @@ -1,58 +0,0 @@ - -// String -/*jslint esversion: 6*/ - -if (String.prototype.grammalecte === undefined) { - String.prototype.gl_count = function (sSearch, bOverlapping) { - // http://jsperf.com/string-ocurrence-split-vs-match/8 - if (sSearch.length <= 0) { - return this.length + 1; - } - let nOccur = 0; - let iPos = 0; - let nStep = (bOverlapping) ? 1 : sSearch.length; - while ((iPos = this.indexOf(sSearch, iPos)) >= 0) { - nOccur++; - iPos += nStep; - } - return nOccur; - }; - String.prototype.gl_isDigit = function () { - return (this.search(/^[0-9⁰¹²³⁴⁵⁶⁷⁸⁹]+$/) !== -1); - }; - String.prototype.gl_isLowerCase = function () { - return (this.search(/^[a-zà-öø-ÿ0-9-]+$/) !== -1); - }; - String.prototype.gl_isUpperCase = function () { - return (this.search(/^[A-ZÀ-ÖØ-ߌ0-9-]+$/) !== -1); - }; - String.prototype.gl_isTitle = function () { - return (this.search(/^[A-ZÀ-ÖØ-ߌ][a-zà-öø-ÿ'’-]+$/) !== -1); - }; - String.prototype.gl_toCapitalize = function () { - return this.slice(0,1).toUpperCase() + this.slice(1).toLowerCase(); - }; - String.prototype.gl_expand = function (oMatch) { - let sNew = this; - for (let i = 0; i < oMatch.length ; i++) { - let z = new RegExp("\\\\"+parseInt(i), "g"); - sNew = sNew.replace(z, oMatch[i]); - } - return sNew; - }; - String.prototype.gl_trimRight = function (sChars) { - let z = new RegExp("["+sChars+"]+$"); - return this.replace(z, ""); - }; - String.prototype.gl_trimLeft = function (sChars) { - let z = new RegExp("^["+sChars+"]+"); - return this.replace(z, ""); - }; - String.prototype.gl_trim = function (sChars) { - let z1 = new RegExp("^["+sChars+"]+"); - let z2 = new RegExp("["+sChars+"]+$"); - return this.replace(z1, "").replace(z2, ""); - }; - - String.prototype.grammalecte = true; -} Index: gc_core/js/lang_core/gc_engine.js ================================================================== --- gc_core/js/lang_core/gc_engine.js +++ gc_core/js/lang_core/gc_engine.js @@ -8,11 +8,11 @@ ${regex} ${map} if (typeof(require) !== 'undefined') { - var helpers = require("resource://grammalecte/helpers.js"); + var helpers = require("resource://grammalecte/graphspell/helpers.js"); var gc_options = require("resource://grammalecte/${lang}/gc_options.js"); var gc_rules = require("resource://grammalecte/${lang}/gc_rules.js"); var cregex = require("resource://grammalecte/${lang}/cregex.js"); var text = require("resource://grammalecte/text.js"); var echo = helpers.echo; @@ -320,11 +320,11 @@ //// Initialization load: function (sContext="JavaScript", sPath="") { try { if (typeof(require) !== 'undefined') { - var ibdawg = require("resource://grammalecte/ibdawg.js"); + var ibdawg = require("resource://grammalecte/graphspell/ibdawg.js"); _oDict = new ibdawg.IBDAWG("${dic_name}.json"); } else { _oDict = new IBDAWG("${dic_name}.json", sPath); } _sAppContext = sContext; DELETED gc_core/js/str_transform.js Index: gc_core/js/str_transform.js ================================================================== --- gc_core/js/str_transform.js +++ /dev/null @@ -1,121 +0,0 @@ -//// STRING TRANSFORMATION -/*jslint esversion: 6*/ - -// Note: 48 is the ASCII code for "0" - -var str_transform = { - - distanceDamerauLevenshtein2: function (s1, s2) { - // distance of Damerau-Levenshtein between and - // https://fr.wikipedia.org/wiki/Distance_de_Damerau-Levenshtein - try { - let nLen1 = s1.length; - let nLen2 = s2.length; - let matrix = []; - for (let i = 0; i <= nLen1; i++) { - matrix[i] = new Array(nLen2 + 1); - } - for (let i = 0; i <= nLen1; i++) { - matrix[i][0] = i; - } - for (let j = 0; j <= nLen2; j++) { - matrix[0][j] = j; - } - for (let i = 1; i <= nLen1; i++) { - for (let j = 1; j <= nLen2; j++) { - let nCost = (s1[i] === s2[j]) ? 0 : 1; - matrix[i][j] = Math.min( - matrix[i-1][j] + 1, // Deletion - matrix[i][j-1] + 1, // Insertion - matrix[i-1][j-1] + nCost // Substitution - ); - if (i > 1 && j > 1 && s1[i] == s2[j-1] && s1[i-1] == s2[j]) { - matrix[i][j] = Math.min(matrix[i][j], matrix[i-2][j-2] + nCost); // Transposition - } - } - } - return matrix[nLen1][nLen2]; - } - catch (e) { - helpers.logerror(e); - } - }, - - distanceDamerauLevenshtein: function (s1, s2) { - // distance of Damerau-Levenshtein between and - // https://fr.wikipedia.org/wiki/Distance_de_Damerau-Levenshtein - try { - let nLen1 = s1.length; - let nLen2 = s2.length; - let INF = nLen1 + nLen2; - let matrix = []; - let sd = {}; - for (let i = 0; i < nLen1+2; i++) { - matrix[i] = new Array(nLen2+2); - } - matrix[0][0] = INF; - for (let i = 0; i <= nLen1; i++) { - matrix[i+1][1] = i; - matrix[i+1][0] = INF; - sd[s1[i]] = 0; - } - for (let j = 0; j <= nLen2; j++) { - matrix[1][j+1] = j; - matrix[0][j+1] = INF; - sd[s2[j]] = 0; - } - - for (let i = 1; i <= nLen1; i++) { - let DB = 0; - for (let j = 1; j <= nLen2; j++) { - let i1 = sd[s2[j-1]]; - let j1 = DB; - if (s1[i-1] === s2[j-1]) { - matrix[i+1][j+1] = matrix[i][j]; - DB = j; - } - else { - matrix[i+1][j+1] = Math.min(matrix[i][j], Math.min(matrix[i+1][j], matrix[i][j+1])) + 1; - } - matrix[i+1][j+1] = Math.min(matrix[i+1][j+1], matrix[i1] ? matrix[i1][j1] + (i-i1-1) + 1 + (j-j1-1) : Infinity); - } - sd[s1[i-1]] = i; - } - return matrix[nLen1+1][nLen2+1]; - } - catch (e) { - helpers.logerror(e); - } - }, - - showDistance (s1, s2) { - console.log(`Distance: ${s1} / ${s2} = ${this.distanceDamerauLevenshtein(s1, s2)})`); - }, - - getStemFromSuffixCode: function (sFlex, sSfxCode) { - // Suffix only - if (sSfxCode == "0") { - return sFlex; - } - return sSfxCode[0] == '0' ? sFlex + sSfxCode.slice(1) : sFlex.slice(0, -(sSfxCode.charCodeAt(0)-48)) + sSfxCode.slice(1); - }, - - getStemFromAffixCode: function (sFlex, sAffCode) { - // Prefix and suffix - if (sAffCode == "0") { - return sFlex; - } - if (!sAffCode.includes("/")) { - return "# error #"; - } - let [sPfxCode, sSfxCode] = sAffCode.split('/'); - sFlex = sPfxCode.slice(1) + sFlex.slice(sPfxCode.charCodeAt(0)-48); - return sSfxCode[0] == '0' ? sFlex + sSfxCode.slice(1) : sFlex.slice(0, -(sSfxCode.charCodeAt(0)-48)) + sSfxCode.slice(1); - } -}; - - -if (typeof(exports) !== 'undefined') { - exports.getStemFromSuffixCode = str_transform.getStemFromSuffixCode; - exports.getStemFromAffixCode = str_transform.getStemFromAffixCode; -} Index: gc_core/js/tests.js ================================================================== --- gc_core/js/tests.js +++ gc_core/js/tests.js @@ -4,11 +4,11 @@ "use strict"; if (typeof(require) !== 'undefined') { - var helpers = require("resource://grammalecte/helpers.js"); + var helpers = require("resource://grammalecte/graphspell/helpers.js"); } class TestGrammarChecking { Index: gc_core/js/text.js ================================================================== --- gc_core/js/text.js +++ gc_core/js/text.js @@ -4,11 +4,11 @@ "use strict"; if (typeof(require) !== 'undefined') { - var helpers = require("resource://grammalecte/helpers.js"); + var helpers = require("resource://grammalecte/graphspell/helpers.js"); } var text = { getParagraph: function* (sText, sSepParagraph = "\n") { DELETED gc_core/js/tokenizer.js Index: gc_core/js/tokenizer.js ================================================================== --- gc_core/js/tokenizer.js +++ /dev/null @@ -1,105 +0,0 @@ -// JavaScript -// Very simple tokenizer -/*jslint esversion: 6*/ -/*global require,exports*/ - -"use strict"; - - -if (typeof(require) !== 'undefined') { - var helpers = require("resource://grammalecte/helpers.js"); -} - - -const aTkzPatterns = { - // All regexps must start with ^. - "default": - [ - [/^[   \t]+/, 'SPACE'], - [/^\/(?:~|bin|boot|dev|etc|home|lib|mnt|opt|root|sbin|tmp|usr|var|Bureau|Documents|Images|Musique|Public|Téléchargements|Vidéos)(?:\/[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_.()-]+)*/, 'FOLDERUNIX'], - [/^[a-zA-Z]:\\(?:Program Files(?: \(x86\)|)|[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st.()]+)(?:\\[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_.()-]+)*/, 'FOLDERWIN'], - [/^[,.;:!?…«»“”‘’"(){}\[\]/·–—]+/, 'SEPARATOR'], - [/^[A-Z][.][A-Z][.](?:[A-Z][.])*/, 'ACRONYM'], - [/^(?:https?:\/\/|www[.]|[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_-]+[@.][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_-]{2,}[@.])[a-zA-Z0-9][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_.\/?&!%=+*"'@$#-]+/, 'LINK'], - [/^[#@][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_-]+/, 'TAG'], - [/^<[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+.*?>|<\/[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+ *>/, 'HTML'], - [/^\[\/?[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+\]/, 'PSEUDOHTML'], - [/^&\w+;(?:\w+;|)/, 'HTMLENTITY'], - [/^\d\d?h\d\d\b/, 'HOUR'], - [/^-?\d+(?:[.,]\d+|)/, 'NUM'], - [/^[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+(?:[’'`-][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+)*/, 'WORD'] - ], - "fr": - [ - [/^[   \t]+/, 'SPACE'], - [/^\/(?:~|bin|boot|dev|etc|home|lib|mnt|opt|root|sbin|tmp|usr|var|Bureau|Documents|Images|Musique|Public|Téléchargements|Vidéos)(?:\/[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_.()-]+)*/, 'FOLDERUNIX'], - [/^[a-zA-Z]:\\(?:Program Files(?: \(x86\)|)|[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st.()]+)(?:\\[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_.()-]+)*/, 'FOLDERWIN'], - [/^[,.;:!?…«»“”‘’"(){}\[\]/·–—]+/, 'SEPARATOR'], - [/^[A-Z][.][A-Z][.](?:[A-Z][.])*/, 'ACRONYM'], - [/^(?:https?:\/\/|www[.]|[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_-]+[@.][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_-]{2,}[@.])[a-zA-Z0-9][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_.\/?&!%=+*"'@$#-]+/, 'LINK'], - [/^[#@][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_-]+/, 'TAG'], - [/^<[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+.*?>|<\/[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+ *>/, 'HTML'], - [/^\[\/?[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+\]/, 'PSEUDOHTML'], - [/^&\w+;(?:\w+;|)/, 'HTMLENTITY'], - [/^(?:l|d|n|m|t|s|j|c|ç|lorsqu|puisqu|jusqu|quoiqu|qu)['’`]/i, 'ELPFX'], - [/^\d\d?[hm]\d\d\b/, 'HOUR'], - [/^\d+(?:er|nd|e|de|ième|ème|eme)s?\b/, 'ORDINAL'], - [/^-?\d+(?:[.,]\d+|)/, 'NUM'], - [/^[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+(?:[’'`-][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+)*/, 'WORD'] - ] -}; - - -class Tokenizer { - - constructor (sLang) { - this.sLang = sLang; - if (!aTkzPatterns.hasOwnProperty(sLang)) { - this.sLang = "default"; - } - this.aRules = aTkzPatterns[this.sLang]; - } - - * genTokens (sText) { - let m; - let i = 0; - while (sText) { - let nCut = 1; - for (let [zRegex, sType] of this.aRules) { - try { - if ((m = zRegex.exec(sText)) !== null) { - if (sType == 'SEPARATOR') { - for (let c of m[0]) { - yield { "sType": sType, "sValue": c, "nStart": i, "nEnd": i + m[0].length } - } - } else { - yield { "sType": sType, "sValue": m[0], "nStart": i, "nEnd": i + m[0].length } - } - nCut = m[0].length; - break; - } - } - catch (e) { - helpers.logerror(e); - } - } - i += nCut; - sText = sText.slice(nCut); - } - } - - getSpellingErrors (sText, oDict) { - let aSpellErr = []; - for (let oToken of this.genTokens(sText)) { - if (oToken.sType === 'WORD' && !oDict.isValidToken(oToken.sValue)) { - aSpellErr.push(oToken); - } - } - return aSpellErr; - } -} - - -if (typeof(exports) !== 'undefined') { - exports.Tokenizer = Tokenizer; -} DELETED gc_core/py/char_player.py Index: gc_core/py/char_player.py ================================================================== --- gc_core/py/char_player.py +++ /dev/null @@ -1,324 +0,0 @@ -# list of similar chars -# useful for suggestion mechanism - -import re - - -_xTransChars = str.maketrans({ - 'à': 'a', 'é': 'e', 'î': 'i', 'ô': 'o', 'û': 'u', 'ÿ': 'i', "y": "i", - 'â': 'a', 'è': 'e', 'ï': 'i', 'ö': 'o', 'ù': 'u', 'ŷ': 'i', - 'ä': 'a', 'ê': 'e', 'í': 'i', 'ó': 'o', 'ü': 'u', 'ý': 'i', - 'á': 'a', 'ë': 'e', 'ì': 'i', 'ò': 'o', 'ú': 'u', 'ỳ': 'i', - 'ā': 'a', 'ē': 'e', 'ī': 'i', 'ō': 'o', 'ū': 'u', 'ȳ': 'i', - 'ñ': 'n', 'k': 'q', 'w': 'v', - 'œ': 'oe', 'æ': 'ae', -}) - -def simplifyWord (sWord): - "word simplication before calculating distance between words" - sWord = sWord.lower().translate(_xTransChars) - sNewWord = "" - for i, c in enumerate(sWord, 1): - if c != sWord[i:i+1]: - sNewWord += c - return sNewWord.replace("eau", "o").replace("au", "o").replace("ai", "e").replace("ei", "e").replace("ph", "f") - - -aVowel = set("aáàâäāeéèêëēiíìîïīoóòôöōuúùûüūyýỳŷÿȳœæAÁÀÂÄĀEÉÈÊËĒIÍÌÎÏĪOÓÒÔÖŌUÚÙÛÜŪYÝỲŶŸȲŒÆ") -aConsonant = set("bcçdfghjklmnñpqrstvwxzBCÇDFGHJKLMNÑPQRSTVWXZ") -aDouble = set("bcdfjklmnprstzBCDFJKLMNPRSTZ") # letters that may be used twice successively - - -# Similar chars - -d1to1 = { - "1": "liîLIÎ", - "2": "zZ", - "3": "eéèêEÉÈÊ", - "4": "aàâAÀÂ", - "5": "sgSG", - "6": "bdgBDG", - "7": "ltLT", - "8": "bB", - "9": "gbdGBD", - "0": "oôOÔ", - - "a": "aàâáäæ", - "A": "AÀÂÁÄÆ", - "à": "aàâáäæ", - "À": "AÀÂÁÄÆ", - "â": "aàâáäæ", - "Â": "AÀÂÁÄÆ", - "á": "aàâáäæ", - "Á": "AÀÂÁÄÆ", - "ä": "aàâáäæ", - "Ä": "AÀÂÁÄÆ", - - "æ": "æéa", - "Æ": "ÆÉA", - - "c": "cçskqśŝ", - "C": "CÇSKQŚŜ", - "ç": "cçskqśŝ", - "Ç": "CÇSKQŚŜ", - - "e": "eéèêëœ", - "E": "EÉÈÊËŒ", - "é": "eéèêëœ", - "É": "EÉÈÊËŒ", - "ê": "eéèêëœ", - "Ê": "EÉÈÊËŒ", - "è": "eéèêëœ", - "È": "EÉÈÊËŒ", - "ë": "eéèêëœ", - "Ë": "EÉÈÊËŒ", - - "g": "gj", - "G": "GJ", - - "i": "iîïyíìÿ", - "I": "IÎÏYÍÌŸ", - "î": "iîïyíìÿ", - "Î": "IÎÏYÍÌŸ", - "ï": "iîïyíìÿ", - "Ï": "IÎÏYÍÌŸ", - "í": "iîïyíìÿ", - "Í": "IÎÏYÍÌŸ", - "ì": "iîïyíìÿ", - "Ì": "IÎÏYÍÌŸ", - - "j": "jg", - "J": "JG", - - "k": "kcq", - "K": "KCQ", - - "n": "nñ", - "N": "NÑ", - - "o": "oôóòöœ", - "O": "OÔÓÒÖŒ", - "ô": "oôóòöœ", - "Ô": "OÔÓÒÖŒ", - "ó": "oôóòöœ", - "Ó": "OÔÓÒÖŒ", - "ò": "oôóòöœ", - "Ò": "OÔÓÒÖŒ", - "ö": "oôóòöœ", - "Ö": "OÔÓÒÖŒ", - - "œ": "œoôeéèêë", - "Œ": "ŒOÔEÉÈÊË", - - "q": "qck", - "Q": "QCK", - - "s": "sśŝcç", - "S": "SŚŜCÇ", - "ś": "sśŝcç", - "Ś": "SŚŜCÇ", - "ŝ": "sśŝcç", - "Ŝ": "SŚŜCÇ", - - "u": "uûùüú", - "U": "UÛÙÜÚ", - "û": "uûùüú", - "Û": "UÛÙÜÚ", - "ù": "uûùüú", - "Ù": "UÛÙÜÚ", - "ü": "uûùüú", - "Ü": "UÛÙÜÚ", - "ú": "uûùüú", - "Ú": "UÛÙÜÚ", - - "v": "vw", - "V": "VW", - - "w": "wv", - "W": "WV", - - "x": "xck", - "X": "XCK", - - "y": "yÿiîŷýỳ", - "Y": "YŸIÎŶÝỲ", - "ÿ": "yÿiîŷýỳ", - "Ÿ": "YŸIÎŶÝỲ", - "ŷ": "yÿiîŷýỳ", - "Ŷ": "YŸIÎŶÝỲ", - "ý": "yÿiîŷýỳ", - "Ý": "YŸIÎŶÝỲ", - "ỳ": "yÿiîŷýỳ", - "Ỳ": "YŸIÎŶÝỲ", - - "z": "zs", - "Z": "ZS", -} - -d1toX = { - "æ": ("ae",), - "Æ": ("AE",), - "b": ("bb",), - "B": ("BB",), - "c": ("cc", "ss", "qu", "ch"), - "C": ("CC", "SS", "QU", "CH"), - "d": ("dd",), - "D": ("DD",), - "é": ("ai", "ei"), - "É": ("AI", "EI"), - "f": ("ff", "ph"), - "F": ("FF", "PH"), - "g": ("gu", "ge", "gg", "gh"), - "G": ("GU", "GE", "GG", "GH"), - "j": ("jj", "dj"), - "J": ("JJ", "DJ"), - "k": ("qu", "ck", "ch", "cu", "kk", "kh"), - "K": ("QU", "CK", "CH", "CU", "KK", "KH"), - "l": ("ll",), - "L": ("LL",), - "m": ("mm", "mn"), - "M": ("MM", "MN"), - "n": ("nn", "nm", "mn"), - "N": ("NN", "NM", "MN"), - "o": ("au", "eau"), - "O": ("AU", "EAU"), - "œ": ("oe", "eu"), - "Œ": ("OE", "EU"), - "p": ("pp", "ph"), - "P": ("PP", "PH"), - "q": ("qu", "ch", "cq", "ck", "kk"), - "Q": ("QU", "CH", "CQ", "CK", "KK"), - "r": ("rr",), - "R": ("RR",), - "s": ("ss", "sh"), - "S": ("SS", "SH"), - "t": ("tt", "th"), - "T": ("TT", "TH"), - "x": ("cc", "ct", "xx"), - "X": ("CC", "CT", "XX"), - "z": ("ss", "zh"), - "Z": ("SS", "ZH"), -} - - -def get1toXReplacement (cPrev, cCur, cNext): - if cCur in aConsonant and (cPrev in aConsonant or cNext in aConsonant): - return () - return d1toX.get(cCur, ()) - - -d2toX = { - "am": ("an", "en", "em"), - "AM": ("AN", "EN", "EM"), - "an": ("am", "en", "em"), - "AN": ("AM", "EN", "EM"), - "au": ("eau", "o", "ô"), - "AU": ("EAU", "O", "Ô"), - "em": ("an", "am", "en"), - "EM": ("AN", "AM", "EN"), - "en": ("an", "am", "em"), - "EN": ("AN", "AM", "EM"), - "ai": ("ei", "é", "è", "ê", "ë"), - "AI": ("EI", "É", "È", "Ê", "Ë"), - "ei": ("ai", "é", "è", "ê", "ë"), - "EI": ("AI", "É", "È", "Ê", "Ë"), - "ch": ("sh", "c", "ss"), - "CH": ("SH", "C", "SS"), - "ct": ("x", "cc"), - "CT": ("X", "CC"), - "oa": ("oi",), - "OA": ("OI",), - "oi": ("oa", "oie"), - "OI": ("OA", "OIE"), - "ph": ("f",), - "PH": ("F",), - "qu": ("q", "cq", "ck", "c", "k"), - "QU": ("Q", "CQ", "CK", "C", "K"), - "ss": ("c", "ç"), - "SS": ("C", "Ç"), - "un": ("ein",), - "UN": ("EIN",), -} - - -# End of word - -dFinal1 = { - "a": ("as", "at", "ant", "ah"), - "A": ("AS", "AT", "ANT", "AH"), - "c": ("ch",), - "C": ("CH",), - "e": ("et", "er", "ets", "ée", "ez", "ai", "ais", "ait", "ent", "eh"), - "E": ("ET", "ER", "ETS", "ÉE", "EZ", "AI", "AIS", "AIT", "ENT", "EH"), - "é": ("et", "er", "ets", "ée", "ez", "ai", "ais", "ait"), - "É": ("ET", "ER", "ETS", "ÉE", "EZ", "AI", "AIS", "AIT"), - "è": ("et", "er", "ets", "ée", "ez", "ai", "ais", "ait"), - "È": ("ET", "ER", "ETS", "ÉE", "EZ", "AI", "AIS", "AIT"), - "ê": ("et", "er", "ets", "ée", "ez", "ai", "ais", "ait"), - "Ê": ("ET", "ER", "ETS", "ÉE", "EZ", "AI", "AIS", "AIT"), - "ë": ("et", "er", "ets", "ée", "ez", "ai", "ais", "ait"), - "Ë": ("ET", "ER", "ETS", "ÉE", "EZ", "AI", "AIS", "AIT"), - "g": ("gh",), - "G": ("GH",), - "i": ("is", "it", "ie", "in"), - "I": ("IS", "IT", "IE", "IN"), - "n": ("nt", "nd", "ns", "nh"), - "N": ("NT", "ND", "NS", "NH"), - "o": ("aut", "ot", "os"), - "O": ("AUT", "OT", "OS"), - "ô": ("aut", "ot", "os"), - "Ô": ("AUT", "OT", "OS"), - "ö": ("aut", "ot", "os"), - "Ö": ("AUT", "OT", "OS"), - "p": ("ph",), - "P": ("PH",), - "s": ("sh",), - "S": ("SH",), - "t": ("th",), - "T": ("TH",), - "u": ("ut", "us", "uh"), - "U": ("UT", "US", "UH"), -} - -dFinal2 = { - "ai": ("aient", "ais", "et"), - "AI": ("AIENT", "AIS", "ET"), - "an": ("ant", "ent"), - "AN": ("ANT", "ENT"), - "en": ("ent", "ant"), - "EN": ("ENT", "ANT"), - "ei": ("ait", "ais"), - "EI": ("AIT", "AIS"), - "on": ("ons", "ont"), - "ON": ("ONS", "ONT"), - "oi": ("ois", "oit", "oix"), - "OI": ("OIS", "OIT", "OIX"), -} - - -# Préfixes et suffixes - -aPfx1 = frozenset([ - "anti", "archi", "contre", "hyper", "mé", "méta", "im", "in", "ir", "par", "proto", - "pseudo", "pré", "re", "ré", "sans", "sous", "supra", "sur", "ultra" -]) -aPfx2 = frozenset([ - "belgo", "franco", "génito", "gynéco", "médico", "russo" -]) - - -_zMotAvecPronom = re.compile("^(?i)(\\w+)(-(?:t-|)(?:ils?|elles?|on|je|tu|nous|vous))$") - -def cut (sWord): - "returns a tuple of strings (prefix, trimed_word, suffix)" - m = _zMotAvecPronom.search(sWord) - if m: - return ("", m.group(1), m.group(2)) - return ("", sWord, "") - - -# Other functions - -def filterSugg (aSugg): - "exclude suggestions" - return filter(lambda sSugg: not sSugg.endswith(("è", "È")), aSugg) DELETED gc_core/py/dawg.py Index: gc_core/py/dawg.py ================================================================== --- gc_core/py/dawg.py +++ /dev/null @@ -1,775 +0,0 @@ -#!python3 - -# FSA DICTIONARY BUILDER -# -# by Olivier R. -# License: MPL 2 -# -# This tool encodes lexicon into an indexable binary dictionary -# Input files MUST be encoded in UTF-8. - - -import sys -import os -import collections - -from . import str_transform as st -from .progressbar import ProgressBar - - - -def readFile (spf): - print(" < Read lexicon: " + spf) - if os.path.isfile(spf): - with open(spf, "r", encoding="utf-8") as hSrc: - for sLine in hSrc: - sLine = sLine.strip() - if sLine and not sLine.startswith("#"): - yield sLine - else: - raise OSError("# Error. File not found or not loadable: " + spf) - - -def getElemsFromFile (spf): - "returns tuple of (flexion, stem, tags) from lexicon file" - nErr = 0 - if not spf.endswith(".clex"): - for sLine in readFile(spf): - try: - sFlex, sStem, sTag = sLine.split("\t") - yield (sFlex, sStem, sTag) - except: - nErr += 1 - else: - sTag = "_" # neutral tag - sTag2 = "" - for sLine in readFile(spf): - if sLine.startswith("[") and sLine.endswith("]"): - # tag line - if "-->" in sLine: - try: - sTag, sSfxCode, sTag2 = sLine[1:-1].split(" --> ") - except: - nErr += 1 - continue - sTag = sTag.strip() - sSfxCode = sSfxCode.strip() - sTag2 = sTag2.strip() - else: - sTag = sLine[1:-1] - sTag2 = "" - else: - # entry line - if "\t" in sLine: - if sLine.count("\t") > 1: - nErr += 1 - continue - sFlex, sStem = sLine.split("\t") - else: - sFlex = sStem = sLine - #print(sFlex, sStem, sTag) - yield (sFlex, sStem, sTag) - if sTag2: - sFlex2 = st.changeWordWithSuffixCode(sFlex, sSfxCode) - #print(sFlex2, sStem, sTag2) - yield (sFlex2, sStem, sTag2) - if nErr: - print(" # Lines ignored: {:>10}".format(nErr)) - - - -class DAWG: - """DIRECT ACYCLIC WORD GRAPH""" - # This code is inspired from Steve Hanov’s DAWG, 2011. (http://stevehanov.ca/blog/index.php?id=115) - # We store suffix/affix codes and tags within the graph after the “real” word. - # A word is a list of numbers [ c1, c2, c3 . . . cN, iAffix, iTags] - # Each arc is an index in self.lArcVal, where are stored characters, suffix/affix codes for stemming and tags. - # Important: As usual, the last node (after ‘iTags’) is tagged final, AND the node after ‘cN’ is ALSO tagged final. - - def __init__ (self, spfSrc, sLangName, cStemming): - print("===== Direct Acyclic Word Graph - Minimal Acyclic Finite State Automaton =====") - cStemming = cStemming.upper() - if cStemming == "A": - funcStemmingGen = st.defineAffixCode - elif cStemming == "S": - funcStemmingGen = st.defineSuffixCode - elif cStemming == "N": - funcStemmingGen = st.noStemming - else: - raise ValueError("# Error. Unknown stemming code: {}".format(cStemming)) - - lEntry = [] - lChar = ['']; dChar = {}; nChar = 1; dCharOccur = {} - lAff = []; dAff = {}; nAff = 0; dAffOccur = {} - lTag = []; dTag = {}; nTag = 0; dTagOccur = {} - nErr = 0 - - # read lexicon - for sFlex, sStem, sTag in getElemsFromFile(spfSrc): - addWordToCharDict(sFlex) - # chars - for c in sFlex: - if c not in dChar: - dChar[c] = nChar - lChar.append(c) - nChar += 1 - dCharOccur[c] = dCharOccur.get(c, 0) + 1 - # affixes to find stem from flexion - aff = funcStemmingGen(sFlex, sStem) - if aff not in dAff: - dAff[aff] = nAff - lAff.append(aff) - nAff += 1 - dAffOccur[aff] = dCharOccur.get(aff, 0) + 1 - # tags - if sTag not in dTag: - dTag[sTag] = nTag - lTag.append(sTag) - nTag += 1 - dTagOccur[sTag] = dTagOccur.get(sTag, 0) + 1 - lEntry.append((sFlex, dAff[aff], dTag[sTag])) - if not lEntry: - raise ValueError("# Error. Empty lexicon") - - # Preparing DAWG - print(" > Preparing list of words") - lVal = lChar + lAff + lTag - lWord = [ [dChar[c] for c in sFlex] + [iAff+nChar] + [iTag+nChar+nAff] for sFlex, iAff, iTag in lEntry ] - lEntry = None - - # Dictionary of arc values occurrency, to sort arcs of each node - dValOccur = dict( [ (dChar[c], dCharOccur[c]) for c in dChar ] \ - + [ (dAff[aff]+nChar, dAffOccur[aff]) for aff in dAff ] \ - + [ (dTag[tag]+nChar+nAff, dTagOccur[tag]) for tag in dTag ] ) - #with open(spfSrc[:-8]+".valuesfreq.txt", 'w', encoding='utf-8') as hFreqDst: # DEBUG - # for iKey, nOcc in sorted(dValOccur.items(), key=lambda t: t[1], reverse=True): - # hFreqDst.write("{}: {}\n".format(lVal[iKey], nOcc)) - # hFreqDst.close() - - self.sFile = spfSrc - self.sLang = sLangName - self.nEntry = len(lWord) - self.aPreviousEntry = [] - DawgNode.resetNextId() - self.oRoot = DawgNode() - self.lUncheckedNodes = [] # list of nodes that have not been checked for duplication. - self.lMinimizedNodes = {} # list of unique nodes that have been checked for duplication. - self.lSortedNodes = [] # version 2 and 3 - self.nNode = 0 - self.nArc = 0 - self.dChar = dChar - self.nChar = len(dChar) - self.nAff = nAff - self.lArcVal = lVal - self.nArcVal = len(lVal) - self.nTag = self.nArcVal - self.nChar - nAff - self.cStemming = cStemming - if cStemming == "A": - self.funcStemming = st.changeWordWithAffixCode - elif cStemming == "S": - self.funcStemming = st.changeWordWithSuffixCode - else: - self.funcStemming = st.noStemming - - # build - lWord.sort() - oProgBar = ProgressBar(0, len(lWord)) - for aEntry in lWord: - self.insert(aEntry) - oProgBar.increment(1) - oProgBar.done() - self.finish() - self.countNodes() - self.countArcs() - self.sortNodes() - self.sortNodeArcs(dValOccur) - #self.sortNodeArcs2 (self.oRoot, "") - self.displayInfo() - - # BUILD DAWG - def insert (self, aEntry): - if aEntry < self.aPreviousEntry: - sys.exit("# Error: Words must be inserted in alphabetical order.") - - # find common prefix between word and previous word - nCommonPrefix = 0 - for i in range(min(len(aEntry), len(self.aPreviousEntry))): - if aEntry[i] != self.aPreviousEntry[i]: - break - nCommonPrefix += 1 - - # Check the lUncheckedNodes for redundant nodes, proceeding from last - # one down to the common prefix size. Then truncate the list at that point. - self._minimize(nCommonPrefix) - - # add the suffix, starting from the correct node mid-way through the graph - if len(self.lUncheckedNodes) == 0: - oNode = self.oRoot - else: - oNode = self.lUncheckedNodes[-1][2] - - iChar = nCommonPrefix - for c in aEntry[nCommonPrefix:]: - oNextNode = DawgNode() - oNode.arcs[c] = oNextNode - self.lUncheckedNodes.append((oNode, c, oNextNode)) - if iChar == (len(aEntry) - 2): - oNode.final = True - iChar += 1 - oNode = oNextNode - oNode.final = True - self.aPreviousEntry = aEntry - - def finish (self): - "minimize unchecked nodes" - self._minimize(0) - - def _minimize (self, downTo): - # proceed from the leaf up to a certain point - for i in range( len(self.lUncheckedNodes)-1, downTo-1, -1 ): - oNode, char, oChildNode = self.lUncheckedNodes[i] - if oChildNode in self.lMinimizedNodes: - # replace the child with the previously encountered one - oNode.arcs[char] = self.lMinimizedNodes[oChildNode] - else: - # add the state to the minimized nodes. - self.lMinimizedNodes[oChildNode] = oChildNode - self.lUncheckedNodes.pop() - - def countNodes (self): - self.nNode = len(self.lMinimizedNodes) - - def countArcs (self): - self.nArc = 0 - for oNode in self.lMinimizedNodes: - self.nArc += len(oNode.arcs) - - def sortNodeArcs (self, dValOccur): - print(" > Sort node arcs") - self.oRoot.sortArcs(dValOccur) - for oNode in self.lMinimizedNodes: - oNode.sortArcs(dValOccur) - - def sortNodeArcs2 (self, oNode, cPrevious=""): - # recursive function - dCharOccur = getCharOrderAfterChar(cPrevious) - if dCharOccur: - oNode.sortArcs2(dCharOccur, self.lArcVal) - for nArcVal, oNextNode in oNode.arcs.items(): - self.sortNodeArcs2(oNextNode, self.lArcVal[nArcVal]) - - def sortNodes (self): - print(" > Sort nodes") - for oNode in self.oRoot.arcs.values(): - self._parseNodes(oNode) - - def _parseNodes (self, oNode): - # Warning: recursive method - if oNode.pos > 0: - return - oNode.setPos() - self.lSortedNodes.append(oNode) - for oNextNode in oNode.arcs.values(): - self._parseNodes(oNextNode) - - def lookup (self, sWord): - oNode = self.oRoot - for c in sWord: - if self.dChar.get(c, '') not in oNode.arcs: - return False - oNode = oNode.arcs[self.dChar[c]] - return oNode.final - - def morph (self, sWord): - oNode = self.oRoot - for c in sWord: - if self.dChar.get(c, '') not in oNode.arcs: - return '' - oNode = oNode.arcs[self.dChar[c]] - if oNode.final: - s = "* " - for arc in oNode.arcs: - if arc >= self.nChar: - s += " [" + self.funcStemming(sWord, self.lArcVal[arc]) - oNode2 = oNode.arcs[arc] - for arc2 in oNode2.arcs: - s += " / " + self.lArcVal[arc2] - s += "]" - return s - return '' - - def displayInfo (self): - print(" * {:<12} {:>16,}".format("Entries:", self.nEntry)) - print(" * {:<12} {:>16,}".format("Characters:", self.nChar)) - print(" * {:<12} {:>16,}".format("Affixes:", self.nAff)) - print(" * {:<12} {:>16,}".format("Tags:", self.nTag)) - print(" * {:<12} {:>16,}".format("Arc values:", self.nArcVal)) - print(" * {:<12} {:>16,}".format("Nodes:", self.nNode)) - print(" * {:<12} {:>16,}".format("Arcs:", self.nArc)) - print(" * {:<12} {:>16}".format("Stemming:", self.cStemming + "FX")) - - def getArcStats (self): - d = {} - for oNode in self.lMinimizedNodes: - n = len(oNode.arcs) - d[n] = d.get(n, 0) + 1 - s = " * Nodes:\n" - for n in d: - s = s + " {:>9} nodes have {:>3} arcs\n".format(d[n], n) - return s - - def writeInfo (self, sPathFile): - print(" > Write informations") - with open(sPathFile, 'w', encoding='utf-8', newline="\n") as hDst: - hDst.write(self.getArcStats()) - hDst.write("\n * Values:\n") - for i, s in enumerate(self.lArcVal): - hDst.write(" {:>6}. {}\n".format(i, s)) - hDst.close() - - # BINARY CONVERSION - def createBinary (self, sPathFile, nMethod, bDebug=False): - print(" > Write DAWG as an indexable binary dictionary [method: %d]" % nMethod) - if nMethod == 1: - self.nBytesArc = ( (self.nArcVal.bit_length() + 2) // 8 ) + 1 # We add 2 bits. See DawgNode.convToBytes1() - self._calcNumBytesNodeAddress() - self._calcNodesAddress1() - elif nMethod == 2: - self.nBytesArc = ( (self.nArcVal.bit_length() + 3) // 8 ) + 1 # We add 3 bits. See DawgNode.convToBytes2() - self._calcNumBytesNodeAddress() - self._calcNodesAddress2() - elif nMethod == 3: - self.nBytesArc = ( (self.nArcVal.bit_length() + 3) // 8 ) + 1 # We add 3 bits. See DawgNode.convToBytes3() - self.nBytesOffset = 1 - self.nMaxOffset = (2 ** (self.nBytesOffset * 8)) - 1 - self._calcNumBytesNodeAddress() - self._calcNodesAddress3() - else: - print(" # Error: unknown compression method") - print(" Arc values (chars, affixes and tags): {} -> {} bytes".format( self.nArcVal, len("\t".join(self.lArcVal).encode("utf-8")) )) - print(" Arc size: {} bytes, Address size: {} bytes -> {} * {} = {} bytes".format( self.nBytesArc, self.nBytesNodeAddress, \ - self.nBytesArc+self.nBytesNodeAddress, self.nArc, \ - (self.nBytesArc+self.nBytesNodeAddress)*self.nArc )) - self._writeBinary(sPathFile, nMethod) - if bDebug: - self._writeNodes(sPathFile, nMethod) - - def _calcNumBytesNodeAddress (self): - "how many bytes needed to store all nodes/arcs in the binary dictionary" - self.nBytesNodeAddress = 1 - while ((self.nBytesArc + self.nBytesNodeAddress) * self.nArc) > (2 ** (self.nBytesNodeAddress * 8)): - self.nBytesNodeAddress += 1 - - def _calcNodesAddress1 (self): - nBytesNode = self.nBytesArc + self.nBytesNodeAddress - iAddr = len(self.oRoot.arcs) * nBytesNode - for oNode in self.lMinimizedNodes: - oNode.addr = iAddr - iAddr += max(len(oNode.arcs), 1) * nBytesNode - - def _calcNodesAddress2 (self): - nBytesNode = self.nBytesArc + self.nBytesNodeAddress - iAddr = len(self.oRoot.arcs) * nBytesNode - for oNode in self.lSortedNodes: - oNode.addr = iAddr - iAddr += max(len(oNode.arcs), 1) * nBytesNode - for oNextNode in oNode.arcs.values(): - if (oNode.pos + 1) == oNextNode.pos: - iAddr -= self.nBytesNodeAddress - #break - - def _calcNodesAddress3 (self): - nBytesNode = self.nBytesArc + self.nBytesNodeAddress - # theorical nodes size if only addresses and no offset - self.oRoot.size = len(self.oRoot.arcs) * nBytesNode - for oNode in self.lSortedNodes: - oNode.size = max(len(oNode.arcs), 1) * nBytesNode - # rewind and calculate dropdown from the end, several times - nDiff = self.nBytesNodeAddress - self.nBytesOffset - bEnd = False - while not bEnd: - bEnd = True - # recalculate addresses - iAddr = self.oRoot.size - for oNode in self.lSortedNodes: - oNode.addr = iAddr - iAddr += oNode.size - # rewind and calculate dropdown from the end, several times - for i in range(self.nNode-1, -1, -1): - nSize = max(len(self.lSortedNodes[i].arcs), 1) * nBytesNode - for oNextNode in self.lSortedNodes[i].arcs.values(): - if 1 < (oNextNode.addr - self.lSortedNodes[i].addr) < self.nMaxOffset: - nSize -= nDiff - if self.lSortedNodes[i].size != nSize: - self.lSortedNodes[i].size = nSize - bEnd = False - - def _writeBinary (self, sPathFile, nMethod): - """ - Format of the binary indexable dictionary: - Each section is separated with 4 bytes of \0 - - - Section Header: - /pyfsa/[version] - * version is an ASCII string - - - Section Informations: - /[tag_lang] - /[number of chars] - /[number of bytes for each arc] - /[number of bytes for each address node] - /[number of entries] - /[number of nodes] - /[number of arcs] - /[number of affixes] - * each field is a ASCII string - /[stemming code] - * "S" means stems are generated by /suffix_code/, "A" means they are generated by /affix_code/ - See defineSuffixCode() and defineAffixCode() for details. - "N" means no stemming - - - Section Values: - * a list of strings encoded in binary from utf-8, each value separated with a tabulation - - - Section Word Graph (nodes / arcs) - * A list of nodes which are a list of arcs with an address of the next node. - See DawgNode.convToBytes() for details. - """ - if not sPathFile.endswith(".bdic"): - sPathFile += "."+str(nMethod)+".bdic" - with open(sPathFile, 'wb') as hDst: - # header - hDst.write("/pyfsa/{}/".format(nMethod).encode("utf-8")) - hDst.write(b"\0\0\0\0") - # infos - hDst.write("{}/{}/{}/{}/{}/{}/{}/{}/{}".format(self.sLang, self.nChar, self.nBytesArc, self.nBytesNodeAddress, \ - self.nEntry, self.nNode, self.nArc, self.nAff, self.cStemming).encode("utf-8")) - hDst.write(b"\0\0\0\0") - # lArcVal - hDst.write("\t".join(self.lArcVal).encode("utf-8")) - hDst.write(b"\0\0\0\0") - # DAWG: nodes / arcs - if nMethod == 1: - hDst.write(self.oRoot.convToBytes1(self.nBytesArc, self.nBytesNodeAddress)) - for oNode in self.lMinimizedNodes: - hDst.write(oNode.convToBytes1(self.nBytesArc, self.nBytesNodeAddress)) - elif nMethod == 2: - hDst.write(self.oRoot.convToBytes2(self.nBytesArc, self.nBytesNodeAddress)) - for oNode in self.lSortedNodes: - hDst.write(oNode.convToBytes2(self.nBytesArc, self.nBytesNodeAddress)) - elif nMethod == 3: - hDst.write(self.oRoot.convToBytes3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset)) - for oNode in self.lSortedNodes: - hDst.write(oNode.convToBytes3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset)) - hDst.close() - - def _writeNodes (self, sPathFile, nMethod): - "for debugging only" - print(" > Write nodes") - with open(sPathFile+".nodes."+str(nMethod)+".txt", 'w', encoding='utf-8', newline="\n") as hDst: - if nMethod == 1: - hDst.write(self.oRoot.getTxtRepr1(self.nBytesArc, self.nBytesNodeAddress, self.lArcVal)+"\n") - #hDst.write( ''.join( [ "%02X " % z for z in self.oRoot.convToBytes1(self.nBytesArc, self.nBytesNodeAddress) ] ).strip() ) - for oNode in self.lMinimizedNodes: - hDst.write(oNode.getTxtRepr1(self.nBytesArc, self.nBytesNodeAddress, self.lArcVal)+"\n") - if nMethod == 2: - hDst.write(self.oRoot.getTxtRepr2(self.nBytesArc, self.nBytesNodeAddress, self.lArcVal)+"\n") - for oNode in self.lSortedNodes: - hDst.write(oNode.getTxtRepr2(self.nBytesArc, self.nBytesNodeAddress, self.lArcVal)+"\n") - if nMethod == 3: - hDst.write(self.oRoot.getTxtRepr3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset, self.lArcVal)+"\n") - #hDst.write( ''.join( [ "%02X " % z for z in self.oRoot.convToBytes3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset) ] ).strip() ) - for oNode in self.lSortedNodes: - hDst.write(oNode.getTxtRepr3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset, self.lArcVal)+"\n") - hDst.close() - - def writeResults (self, sPathFile): - bFileExits = os.path.isfile("_lexicons.res.txt") - with open("_lexicons.res.txt", "a", encoding='utf-8', newline="\n") as hDst: - sFormat1 = "{:<12} {:>12} {:>5} {:>8} {:>8} {:>6} {:>8} {:>9} {:>9} {:>15} {:>12} {:>12}\n" - sFormat2 = "{:<12} {:>12,} {:>5,} {:>8,} {:>8} {:>6,} {:>8,} {:>9,} {:>9,} {:>15,} {:>12,} {:>12,}\n" - if not bFileExits: - hDst.write(sFormat1.format("Lexicon", "Entries", "Chars", "Affixes", "Stemming", "Tags", "Values", "Nodes", "Arcs", "Lexicon (Kb)", "Dict (Kb)", "LT Dict (Kb)")) - hDst.write(sFormat2.format(self.sLang, self.nEntry, self.nChar, self.nAff, self.cStemming + "FX", self.nTag, self.nArcVal, \ - self.nNode, self.nArc, os.path.getsize(self.sFile), os.path.getsize(sPathFile), \ - os.path.getsize("cfsa/dict/{}.dict".format(self.sLang)) if os.path.isfile("cfsa/dict/{}.dict".format(self.sLang)) else 0)) - hDst.close() - - - -class DawgNode: - NextId = 0 - NextPos = 1 # (version 2) - - def __init__ (self): - self.i = DawgNode.NextId - DawgNode.NextId += 1 - self.final = False - self.arcs = {} # key: arc value; value: a node - self.addr = 0 # address in the binary dictionary - self.pos = 0 # position in the binary dictionary (version 2) - self.size = 0 # size of node in bytes (version 3) - - @classmethod - def resetNextId (cls): - cls.NextId = 0 - - def setPos (self): # version 2 - self.pos = DawgNode.NextPos - DawgNode.NextPos += 1 - - def __str__ (self): - # Caution! this function is used for hashing and comparison! - l = [] - if self.final: - l.append("1") - else: - l.append("0") - for (key, node) in self.arcs.items(): - l.append(str(key)) - l.append(str(node.i)) - return "_".join(l) - - def __hash__ (self): - # Used as a key in a python dictionary. - return self.__str__().__hash__() - - def __eq__ (self, other): - # Used as a key in a python dictionary. - # Nodes are equivalent if they have identical arcs, and each identical arc leads to identical states. - return self.__str__() == other.__str__() - - def sortArcs (self, dValOccur): - self.arcs = collections.OrderedDict(sorted(self.arcs.items(), key=lambda t: dValOccur.get(t[0], 0), reverse=True)) - - def sortArcs2 (self, dValOccur, lArcVal): - self.arcs = collections.OrderedDict(sorted(self.arcs.items(), key=lambda t: dValOccur.get(lArcVal[t[0]], 0), reverse=True)) - - # VERSION 1 ===================================================================================================== - def convToBytes1 (self, nBytesArc, nBytesNodeAddress): - """ - Node scheme: - - Arc length is defined by nBytesArc - - Address length is defined by nBytesNodeAddress - - | Arc | Address of next node | - | | | - /---------------\ /---------------\ /---------------\ /---------------\ /---------------\ /---------------\ - | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - \---------------/ \---------------/ \---------------/ \---------------/ \---------------/ \---------------/ - [...] - /---------------\ /---------------\ /---------------\ /---------------\ /---------------\ /---------------\ - | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - \---------------/ \---------------/ \---------------/ \---------------/ \---------------/ \---------------/ - ^ ^ - | | - | | - | \___ if 1, last arc of this node - \_____ if 1, this node is final (only on the first arc) - """ - nArc = len(self.arcs) - nFinalNodeMask = 1 << ((nBytesArc*8)-1) - nFinalArcMask = 1 << ((nBytesArc*8)-2) - if len(self.arcs) == 0: - val = nFinalNodeMask | nFinalArcMask - by = val.to_bytes(nBytesArc, byteorder='big') - by += (0).to_bytes(nBytesNodeAddress, byteorder='big') - return by - by = b"" - for i, arc in enumerate(self.arcs, 1): - val = arc - if i == 1 and self.final: - val = val | nFinalNodeMask - if i == nArc: - val = val | nFinalArcMask - by += val.to_bytes(nBytesArc, byteorder='big') - by += self.arcs[arc].addr.to_bytes(nBytesNodeAddress, byteorder='big') - return by - - def getTxtRepr1 (self, nBytesArc, nBytesNodeAddress, lVal): - nArc = len(self.arcs) - nFinalNodeMask = 1 << ((nBytesArc*8)-1) - nFinalArcMask = 1 << ((nBytesArc*8)-2) - s = "i{:_>10} -- #{:_>10}\n".format(self.i, self.addr) - if len(self.arcs) == 0: - s += " {:<20} {:0>16} i{:_>10} #{:_>10}\n".format("", bin(nFinalNodeMask | nFinalArcMask)[2:], "0", "0") - return s - for i, arc in enumerate(self.arcs, 1): - val = arc - if i == 1 and self.final: - val = val | nFinalNodeMask - if i == nArc: - val = val | nFinalArcMask - s += " {:<20} {:0>16} i{:_>10} #{:_>10}\n".format(lVal[arc], bin(val)[2:], self.arcs[arc].i, self.arcs[arc].addr) - return s - - # VERSION 2 ===================================================================================================== - def convToBytes2 (self, nBytesArc, nBytesNodeAddress): - """ - Node scheme: - - Arc length is defined by nBytesArc - - Address length is defined by nBytesNodeAddress - - | Arc | Address of next node | - | | | - /---------------\ /---------------\ /---------------\ /---------------\ /---------------\ /---------------\ - | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - \---------------/ \---------------/ \---------------/ \---------------/ \---------------/ \---------------/ - [...] - /---------------\ /---------------\ /---------------\ /---------------\ /---------------\ /---------------\ - | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - \---------------/ \---------------/ \---------------/ \---------------/ \---------------/ \---------------/ - ^ ^ ^ - | | | - | | \_ if 1, caution, no address: next node is the following node - | \___ if 1, last arc of this node - \_____ if 1, this node is final (only on the first arc) - """ - nArc = len(self.arcs) - nFinalNodeMask = 1 << ((nBytesArc*8)-1) - nFinalArcMask = 1 << ((nBytesArc*8)-2) - nNextNodeMask = 1 << ((nBytesArc*8)-3) - if len(self.arcs) == 0: - val = nFinalNodeMask | nFinalArcMask - by = val.to_bytes(nBytesArc, byteorder='big') - by += (0).to_bytes(nBytesNodeAddress, byteorder='big') - return by - by = b"" - for i, arc in enumerate(self.arcs, 1): - val = arc - if i == 1 and self.final: - val = val | nFinalNodeMask - if i == nArc: - val = val | nFinalArcMask - if (self.pos + 1) == self.arcs[arc].pos and self.i != 0: - val = val | nNextNodeMask - by += val.to_bytes(nBytesArc, byteorder='big') - else: - by += val.to_bytes(nBytesArc, byteorder='big') - by += self.arcs[arc].addr.to_bytes(nBytesNodeAddress, byteorder='big') - return by - - def getTxtRepr2 (self, nBytesArc, nBytesNodeAddress, lVal): - nArc = len(self.arcs) - nFinalNodeMask = 1 << ((nBytesArc*8)-1) - nFinalArcMask = 1 << ((nBytesArc*8)-2) - nNextNodeMask = 1 << ((nBytesArc*8)-3) - s = "i{:_>10} -- #{:_>10}\n".format(self.i, self.addr) - if nArc == 0: - s += " {:<20} {:0>16} i{:_>10} #{:_>10}\n".format("", bin(nFinalNodeMask | nFinalArcMask)[2:], "0", "0") - return s - for i, arc in enumerate(self.arcs, 1): - val = arc - if i == 1 and self.final: - val = val | nFinalNodeMask - if i == nArc: - val = val | nFinalArcMask - if (self.pos + 1) == self.arcs[arc].pos and self.i != 0: - val = val | nNextNodeMask - s += " {:<20} {:0>16}\n".format(lVal[arc], bin(val)[2:], "") - else: - s += " {:<20} {:0>16} i{:_>10} #{:_>10}\n".format(lVal[arc], bin(val)[2:], self.arcs[arc].i, self.arcs[arc].addr) - return s - - # VERSION 3 ===================================================================================================== - def convToBytes3 (self, nBytesArc, nBytesNodeAddress, nBytesOffset): - """ - Node scheme: - - Arc length is defined by nBytesArc - - Address length is defined by nBytesNodeAddress - - Offset length is defined by nBytesOffset - - | Arc | Address of next node or offset to next node | - | | | - /---------------\ /---------------\ /---------------\ /---------------\ /---------------\ /---------------\ - |1|0|0| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - \---------------/ \---------------/ \---------------/ \---------------/ \---------------/ \---------------/ - [...] - /---------------\ /---------------\ /---------------\ - |0|0|1| | | | | | | | | | | | | | | | | | | | | | | | Offsets are shorter than addresses - \---------------/ \---------------/ \---------------/ - /---------------\ /---------------\ /---------------\ /---------------\ /---------------\ /---------------\ - |0|1|0| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - \---------------/ \---------------/ \---------------/ \---------------/ \---------------/ \---------------/ - - ^ ^ ^ - | | | - | | \_ if 1, offset instead of address of next node - | \___ if 1, last arc of this node - \_____ if 1, this node is final (only on the first arc) - """ - nArc = len(self.arcs) - nFinalNodeMask = 1 << ((nBytesArc*8)-1) - nFinalArcMask = 1 << ((nBytesArc*8)-2) - nNextNodeMask = 1 << ((nBytesArc*8)-3) - nMaxOffset = (2 ** (nBytesOffset * 8)) - 1 - if nArc == 0: - val = nFinalNodeMask | nFinalArcMask - by = val.to_bytes(nBytesArc, byteorder='big') - by += (0).to_bytes(nBytesNodeAddress, byteorder='big') - return by - by = b"" - for i, arc in enumerate(self.arcs, 1): - val = arc - if i == 1 and self.final: - val = val | nFinalNodeMask - if i == nArc: - val = val | nFinalArcMask - if 1 < (self.arcs[arc].addr - self.addr) < nMaxOffset and self.i != 0: - val = val | nNextNodeMask - by += val.to_bytes(nBytesArc, byteorder='big') - by += (self.arcs[arc].addr-self.addr).to_bytes(nBytesOffset, byteorder='big') - else: - by += val.to_bytes(nBytesArc, byteorder='big') - by += self.arcs[arc].addr.to_bytes(nBytesNodeAddress, byteorder='big') - return by - - def getTxtRepr3 (self, nBytesArc, nBytesNodeAddress, nBytesOffset, lVal): - nArc = len(self.arcs) - nFinalNodeMask = 1 << ((nBytesArc*8)-1) - nFinalArcMask = 1 << ((nBytesArc*8)-2) - nNextNodeMask = 1 << ((nBytesArc*8)-3) - nMaxOffset = (2 ** (nBytesOffset * 8)) - 1 - s = "i{:_>10} -- #{:_>10} ({})\n".format(self.i, self.addr, self.size) - if nArc == 0: - s += " {:<20} {:0>16} i{:_>10} #{:_>10}\n".format("", bin(nFinalNodeMask | nFinalArcMask)[2:], "0", "0") - return s - for i, arc in enumerate(self.arcs, 1): - val = arc - if i == 1 and self.final: - val = val | nFinalNodeMask - if i == nArc: - val = val | nFinalArcMask - if 1 < (self.arcs[arc].addr - self.addr) < nMaxOffset and self.i != 0: - val = val | nNextNodeMask - s += " {:<20} {:0>16} i{:_>10} +{:_>10}\n".format(lVal[arc], bin(val)[2:], self.arcs[arc].i, self.arcs[arc].addr - self.addr) - else: - s += " {:<20} {:0>16} i{:_>10} #{:_>10}\n".format(lVal[arc], bin(val)[2:], self.arcs[arc].i, self.arcs[arc].addr) - return s - - - -# Another attempt to sort node arcs - -_dCharOrder = { - # key: previous char, value: dictionary of chars {c: nValue} - "": {} -} - - -def addWordToCharDict (sWord): - cPrevious = "" - for cChar in sWord: - if cPrevious not in _dCharOrder: - _dCharOrder[cPrevious] = {} - _dCharOrder[cPrevious][cChar] = _dCharOrder[cPrevious].get(cChar, 0) + 1 - cPrevious = cChar - - -def getCharOrderAfterChar (cChar): - return _dCharOrder.get(cChar, None) - - -def displayCharOrder (): - for key, value in _dCharOrder.items(): - print("[" + key + "]: ", ", ".join([ c+":"+str(n) for c, n in sorted(value.items(), key=lambda t: t[1], reverse=True) ])) DELETED gc_core/py/echo.py Index: gc_core/py/echo.py ================================================================== --- gc_core/py/echo.py +++ /dev/null @@ -1,29 +0,0 @@ -#!python3 - -# The most boring yet indispensable function: print! - - -import sys - - -_CHARMAP = str.maketrans({ 'œ': 'ö', 'Œ': 'Ö', 'ʳ': "r", 'ᵉ': "e", '…': "_", \ - '“': '"', '”': '"', '„': '"', '‘': "'", '’': "'", \ - 'ā': 'â', 'Ā': 'Â', 'ē': 'ê', 'Ē': 'Ê', 'ī': 'î', 'Ī': 'Î', \ - 'ō': 'ô', 'Ō': 'Ô', 'ū': 'û', 'Ū': 'Û', 'Ÿ': 'Y', \ - 'ś': 's', 'ŝ': 's', \ - '—': '-', '–': '-' - }) - - -def echo (obj, sep=' ', end='\n', file=sys.stdout, flush=False): - """ Print for Windows to avoid Python crashes. - Encoding depends on Windows locale. No useful standard. - Always returns True (useful for debugging).""" - if sys.platform != "win32": - print(obj, sep=sep, end=end, file=file, flush=flush) - return True - try: - print(str(obj).translate(_CHARMAP), sep=sep, end=end, file=file, flush=flush) - except: - print(str(obj).encode('ascii', 'replace').decode('ascii', 'replace'), sep=sep, end=end, file=file, flush=flush) - return True DELETED gc_core/py/ibdawg.py Index: gc_core/py/ibdawg.py ================================================================== --- gc_core/py/ibdawg.py +++ /dev/null @@ -1,720 +0,0 @@ -#!python3 - -import os -import traceback -import pkgutil -import re -from functools import wraps -import time - -#import logging -#logging.basicConfig(filename="suggestions.log", level=logging.DEBUG) - -from . import str_transform as st -from . import char_player as cp -from .echo import echo - - -def timethis (func): - "decorator for the execution time" - @wraps(func) - def wrapper (*args, **kwargs): - fStart = time.time() - result = func(*args, **kwargs) - fEnd = time.time() - print(func.__name__, fEnd - fStart) - return result - return wrapper - - -class SuggResult: - """Structure for storing, classifying and filtering suggestions""" - - def __init__ (self, sWord, nDistLimit=-1): - self.sWord = sWord - self.sSimplifiedWord = cp.simplifyWord(sWord) - self.nDistLimit = nDistLimit if nDistLimit >= 0 else (len(sWord) // 3) + 1 - self.nMinDist = 1000 - self.aSugg = set() - self.dSugg = { 0: [], 1: [], 2: [] } - - def addSugg (self, sSugg, nDeep=0): - "add a suggestion" - #logging.info((nDeep * " ") + "__" + sSugg + "__") - if sSugg not in self.aSugg: - nDist = st.distanceDamerauLevenshtein(self.sSimplifiedWord, cp.simplifyWord(sSugg)) - if nDist <= self.nDistLimit: - if nDist not in self.dSugg: - self.dSugg[nDist] = [] - self.dSugg[nDist].append(sSugg) - self.aSugg.add(sSugg) - if nDist < self.nMinDist: - self.nMinDist = nDist - self.nDistLimit = min(self.nDistLimit, self.nMinDist+2) - - def getSuggestions (self, nSuggLimit=10, nDistLimit=-1): - "return a list of suggestions" - lRes = [] - if self.dSugg[0]: - # we sort the better results with the original word - self.dSugg[0].sort(key=lambda sSugg: st.distanceDamerauLevenshtein(self.sWord, sSugg)) - for lSugg in self.dSugg.values(): - lRes.extend(lSugg) - if len(lRes) > nSuggLimit: - break - lRes = list(cp.filterSugg(lRes)) - if self.sWord.istitle(): - lRes = list(map(lambda sSugg: sSugg.title(), lRes)) - elif self.sWord.isupper(): - lRes = list(map(lambda sSugg: sSugg.upper(), lRes)) - return lRes[:nSuggLimit] - - def reset (self): - self.aSugg.clear() - self.dSugg.clear() - - -class IBDAWG: - """INDEXABLE BINARY DIRECT ACYCLIC WORD GRAPH""" - - def __init__ (self, sDicName): - self.by = pkgutil.get_data(__package__, "_dictionaries/" + sDicName) - if not self.by: - raise OSError("# Error. File not found or not loadable: "+sDicName) - - if self.by[0:7] != b"/pyfsa/": - raise TypeError("# Error. Not a pyfsa binary dictionary. Header: {}".format(self.by[0:9])) - if not(self.by[7:8] == b"1" or self.by[7:8] == b"2" or self.by[7:8] == b"3"): - raise ValueError("# Error. Unknown dictionary version: {}".format(self.by[7:8])) - try: - header, info, values, bdic = self.by.split(b"\0\0\0\0", 3) - except Exception: - raise Exception - - self.sName = sDicName - self.nVersion = int(self.by[7:8].decode("utf-8")) - self.sHeader = header.decode("utf-8") - self.lArcVal = values.decode("utf-8").split("\t") - self.nArcVal = len(self.lArcVal) - self.byDic = bdic - - l = info.decode("utf-8").split("/") - self.sLang = l[0] - self.nChar = int(l[1]) - self.nBytesArc = int(l[2]) - self.nBytesNodeAddress = int(l[3]) - self.nEntries = int(l[4]) - self.nNode = int(l[5]) - self.nArc = int(l[6]) - self.nAff = int(l[7]) - self.cStemming = l[8] - if self.cStemming == "S": - self.funcStemming = st.changeWordWithSuffixCode - elif self.cStemming == "A": - self.funcStemming = st.changeWordWithAffixCode - else: - self.funcStemming = st.noStemming - self.nTag = self.nArcVal - self.nChar - self.nAff - # to get the value of an arc, to get the char of an arc with its value - self.dChar = {} - for i in range(1, self.nChar): - self.dChar[self.lArcVal[i]] = i - self.dCharVal = { v: k for k, v in self.dChar.items() } - - self._arcMask = (2 ** ((self.nBytesArc * 8) - 3)) - 1 - self._finalNodeMask = 1 << ((self.nBytesArc * 8) - 1) - self._lastArcMask = 1 << ((self.nBytesArc * 8) - 2) - self._addrBitMask = 1 << ((self.nBytesArc * 8) - 3) # version 2 - - self.nBytesOffset = 1 # version 3 - - # Configuring DAWG functions according to nVersion - if self.nVersion == 1: - self.morph = self._morph1 - self.stem = self._stem1 - self._lookupArcNode = self._lookupArcNode1 - self._getArcs = self._getArcs1 - self._writeNodes = self._writeNodes1 - elif self.nVersion == 2: - self.morph = self._morph2 - self.stem = self._stem2 - self._lookupArcNode = self._lookupArcNode2 - self._getArcs = self._getArcs2 - self._writeNodes = self._writeNodes2 - elif self.nVersion == 3: - self.morph = self._morph3 - self.stem = self._stem3 - self._lookupArcNode = self._lookupArcNode3 - self._getArcs = self._getArcs3 - self._writeNodes = self._writeNodes3 - else: - raise ValueError(" # Error: unknown code: {}".format(self.nVersion)) - - self.bOptNumSigle = False - self.bOptNumAtLast = False - - def getInfo (self): - return " Language: {0.sLang:>10} Version: {0.nVersion:>2} Stemming: {0.cStemming}FX\n" \ - " Arcs values: {0.nArcVal:>10,} = {0.nChar:>5,} characters, {0.nAff:>6,} affixes, {0.nTag:>6,} tags\n" \ - " Dictionary: {0.nEntries:>12,} entries, {0.nNode:>11,} nodes, {0.nArc:>11,} arcs\n" \ - " Address size: {0.nBytesNodeAddress:>1} bytes, Arc size: {0.nBytesArc:>1} bytes\n".format(self) - - def writeAsJSObject (self, spfDest, bInJSModule=False, bBinaryDictAsHexString=False): - "write IBDAWG as a JavaScript object in a JavaScript module" - import json - with open(spfDest, "w", encoding="utf-8", newline="\n") as hDst: - if bInJSModule: - hDst.write('// JavaScript\n// Generated data (do not edit)\n\n"use strict";\n\nconst dictionary = ') - hDst.write(json.dumps({ - "sName": self.sName, - "nVersion": self.nVersion, - "sHeader": self.sHeader, - "lArcVal": self.lArcVal, - "nArcVal": self.nArcVal, - # JavaScript is a pile of shit, so Mozilla’s JS parser don’t like file bigger than 4 Mb! - # So, if necessary, we use an hexadecimal string, that we will convert later in Firefox’s extension. - # https://github.com/mozilla/addons-linter/issues/1361 - "byDic": self.byDic.hex() if bBinaryDictAsHexString else [ e for e in self.byDic ], - "sLang": self.sLang, - "nChar": self.nChar, - "nBytesArc": self.nBytesArc, - "nBytesNodeAddress": self.nBytesNodeAddress, - "nEntries": self.nEntries, - "nNode": self.nNode, - "nArc": self.nArc, - "nAff": self.nAff, - "cStemming": self.cStemming, - "nTag": self.nTag, - "dChar": self.dChar, - "_arcMask": self._arcMask, - "_finalNodeMask": self._finalNodeMask, - "_lastArcMask": self._lastArcMask, - "_addrBitMask": self._addrBitMask, - "nBytesOffset": self.nBytesOffset - }, ensure_ascii=False)) - if bInJSModule: - hDst.write(";\n\nexports.dictionary = dictionary;\n") - - def isValidToken (self, sToken): - "checks if is valid (if there is hyphens in , is split, each part is checked)" - if self.isValid(sToken): - return True - if "-" in sToken: - if sToken.count("-") > 4: - return True - return all(self.isValid(sWord) for sWord in sToken.split("-")) - return False - - def isValid (self, sWord): - "checks if is valid (different casing tested if the first letter is a capital)" - if not sWord: - return None - if "’" in sWord: # ugly hack - sWord = sWord.replace("’", "'") - if self.lookup(sWord): - return True - if sWord[0:1].isupper(): - if len(sWord) > 1: - if sWord.istitle(): - return self.lookup(sWord.lower()) - if sWord.isupper(): - if self.bOptNumSigle: - return True - return self.lookup(sWord.lower()) or self.lookup(sWord.capitalize()) - return self.lookup(sWord[:1].lower() + sWord[1:]) - else: - return self.lookup(sWord.lower()) - return False - - def lookup (self, sWord): - "returns True if in dictionary (strict verification)" - iAddr = 0 - for c in sWord: - if c not in self.dChar: - return False - iAddr = self._lookupArcNode(self.dChar[c], iAddr) - if iAddr == None: - return False - return bool(int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask) - - def getMorph (self, sWord): - "retrieves morphologies list, different casing allowed" - l = self.morph(sWord) - if sWord[0:1].isupper(): - l.extend(self.morph(sWord.lower())) - if sWord.isupper() and len(sWord) > 1: - l.extend(self.morph(sWord.capitalize())) - return l - - #@timethis - def suggest (self, sWord, nSuggLimit=10): - "returns a set of suggestions for " - sPfx, sWord, sSfx = cp.cut(sWord) - nMaxSwitch = max(len(sWord) // 3, 1) - nMaxDel = len(sWord) // 5 - nMaxHardRepl = max((len(sWord) - 5) // 4, 1) - oSuggResult = SuggResult(sWord) - self._suggest(oSuggResult, sWord, nMaxSwitch=nMaxSwitch, nMaxDel=nMaxDel, nMaxHardRepl=nMaxHardRepl) - if sWord.istitle(): - self._suggest(oSuggResult, sWord.lower(), nMaxSwitch=nMaxSwitch, nMaxDel=nMaxDel, nMaxHardRepl=nMaxHardRepl) - elif sWord.islower(): - self._suggest(oSuggResult, sWord.title(), nMaxSwitch=nMaxSwitch, nMaxDel=nMaxDel, nMaxHardRepl=nMaxHardRepl) - aSugg = oSuggResult.getSuggestions(nSuggLimit) - if sSfx or sPfx: - # we add what we removed - return list(map(lambda sSug: sPfx + sSug + sSfx, aSugg)) - return aSugg - - def _suggest (self, oSuggResult, sRemain, nMaxSwitch=0, nMaxDel=0, nMaxHardRepl=0, nDeep=0, iAddr=0, sNewWord="", bAvoidLoop=False): - # recursive function - #logging.info((nDeep * " ") + sNewWord + ":" + sRemain) - if not sRemain: - if int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask: - oSuggResult.addSugg(sNewWord, nDeep) - for sTail in self._getTails(iAddr): - oSuggResult.addSugg(sNewWord+sTail, nDeep) - return - cCurrent = sRemain[0:1] - for cChar, jAddr in self._getCharArcs(iAddr): - if cChar in cp.d1to1.get(cCurrent, cCurrent): - self._suggest(oSuggResult, sRemain[1:], nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, jAddr, sNewWord+cChar) - elif not bAvoidLoop and nMaxHardRepl: - self._suggest(oSuggResult, sRemain[1:], nMaxSwitch, nMaxDel, nMaxHardRepl-1, nDeep+1, jAddr, sNewWord+cChar, True) - if not bAvoidLoop: # avoid infinite loop - if len(sRemain) > 1: - if cCurrent == sRemain[1:2]: - # same char, we remove 1 char without adding 1 to - self._suggest(oSuggResult, sRemain[1:], nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord) - else: - # switching chars - if nMaxSwitch: - self._suggest(oSuggResult, sRemain[1:2]+sRemain[0:1]+sRemain[2:], nMaxSwitch-1, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, True) - # delete char - if nMaxDel: - self._suggest(oSuggResult, sRemain[1:], nMaxSwitch, nMaxDel-1, nMaxHardRepl, nDeep+1, iAddr, sNewWord, True) - # Phonetic replacements - for sRepl in cp.get1toXReplacement(sNewWord[-1:], cCurrent, sRemain[1:2]): - self._suggest(oSuggResult, sRepl + sRemain[1:], nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, True) - for sRepl in cp.d2toX.get(sRemain[0:2], ()): - self._suggest(oSuggResult, sRepl + sRemain[2:], nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, True) - # end of word - if len(sRemain) == 2: - for sRepl in cp.dFinal2.get(sRemain, ()): - self._suggest(oSuggResult, sRepl, nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, True) - elif len(sRemain) == 1: - self._suggest(oSuggResult, "", nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, True) # remove last char and go on - for sRepl in cp.dFinal1.get(sRemain, ()): - self._suggest(oSuggResult, sRepl, nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, True) - - #@timethis - def suggest2 (self, sWord, nMaxSugg=10): - "returns a set of suggestions for " - sPfx, sWord, sSfx = cp.cut(sWord) - oSuggResult = SuggResult(sWord) - self._suggest2(oSuggResult) - aSugg = oSuggResult.getSuggestions() - if sSfx or sPfx: - # we add what we removed - return list(map(lambda sSug: sPfx + sSug + sSfx, aSugg)) - return aSugg - - def _suggest2 (self, oSuggResult, nDeep=0, iAddr=0, sNewWord=""): - # recursive function - #logging.info((nDeep * " ") + sNewWord) - if nDeep >= oSuggResult.nDistLimit: - sCleanNewWord = cp.simplifyWord(sNewWord) - if st.distanceSift4(oSuggResult.sCleanWord[:len(sCleanNewWord)], sCleanNewWord) > oSuggResult.nDistLimit: - return - if int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask: - oSuggResult.addSugg(sNewWord, nDeep) - for cChar, jAddr in self._getCharArcsWithPriority(iAddr, oSuggResult.sWord[nDeep:nDeep+1]): - self._suggest2(oSuggResult, nDeep+1, jAddr, sNewWord+cChar) - return - - def _getCharArcs (self, iAddr): - "generator: yield all chars and addresses from node at address " - for nVal, jAddr in self._getArcs(iAddr): - if nVal < self.nChar: - yield (self.dCharVal[nVal], jAddr) - - def _getSimilarCharArcs (self, cChar, iAddr): - "generator: yield similar char of and address of the following node" - for c in cp.d1to1.get(cChar, [cChar]): - if c in self.dChar: - jAddr = self._lookupArcNode(self.dChar[c], iAddr) - if jAddr: - yield (c, jAddr) - - def _getCharArcsWithPriority (self, iAddr, cChar): - if not cChar: - yield from self._getCharArcs(iAddr) - lTuple = list(self._getCharArcs(iAddr)) - lTuple.sort(key=lambda t: 0 if t[0] in cp.d1to1.get(cChar, cChar) else 1) - yield from lTuple - - def _getTails (self, iAddr, sTail="", n=2): - "return a list of suffixes ending at a distance of from " - aTails = set() - for nVal, jAddr in self._getArcs(iAddr): - if nVal < self.nChar: - if int.from_bytes(self.byDic[jAddr:jAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask: - aTails.add(sTail + self.dCharVal[nVal]) - if n and not aTails: - aTails.update(self._getTails(jAddr, sTail+self.dCharVal[nVal], n-1)) - return aTails - - def drawPath (self, sWord, iAddr=0): - "show the path taken by in the graph" - c1 = sWord[0:1] if sWord else " " - iPos = -1 - n = 0 - print(c1 + ": ", end="") - for c2, jAddr in self._getCharArcs(iAddr): - print(c2, end="") - if c2 == sWord[0:1]: - iNextNodeAddr = jAddr - iPos = n - n += 1 - if not sWord: - return - if iPos >= 0: - print("\n "+ " " * iPos + "|") - self.drawPath(sWord[1:], iNextNodeAddr) - - def select (self, sPattern=""): - "generator: returns all entries which morphology fits " - zPattern = None - try: - zPattern = re.compile(sPattern) - except: - print("# Error in regex pattern") - traceback.print_exc() - yield from self._select1(zPattern, 0, "") - - # def morph (self, sWord): - # is defined in __init__ - - # VERSION 1 - def _select1 (self, zPattern, iAddr, sWord): - # recursive generator - for nVal, jAddr in self._getArcs1(iAddr): - if nVal < self.nChar: - # simple character - yield from self._select1(zPattern, jAddr, sWord + self.lArcVal[nVal]) - else: - sEntry = sWord + "\t" + self.funcStemming(sWord, self.lArcVal[nVal]) - for nMorphVal, _ in self._getArcs1(jAddr): - if not zPattern or zPattern.search(self.lArcVal[nMorphVal]): - yield sEntry + "\t" + self.lArcVal[nMorphVal] - - def _morph1 (self, sWord): - "returns morphologies of " - iAddr = 0 - for c in sWord: - if c not in self.dChar: - return [] - iAddr = self._lookupArcNode(self.dChar[c], iAddr) - if iAddr == None: - return [] - if (int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask): - l = [] - nRawArc = 0 - while not (nRawArc & self._lastArcMask): - iEndArcAddr = iAddr + self.nBytesArc - nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') - nArc = nRawArc & self._arcMask - if nArc >= self.nChar: - # This value is not a char, this is a stemming code - sStem = ">" + self.funcStemming(sWord, self.lArcVal[nArc]) - # Now , we go to the next node and retrieve all following arcs values, all of them are tags - iAddr2 = int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big') - nRawArc2 = 0 - while not (nRawArc2 & self._lastArcMask): - iEndArcAddr2 = iAddr2 + self.nBytesArc - nRawArc2 = int.from_bytes(self.byDic[iAddr2:iEndArcAddr2], byteorder='big') - l.append(sStem + " " + self.lArcVal[nRawArc2 & self._arcMask]) - iAddr2 = iEndArcAddr2+self.nBytesNodeAddress - iAddr = iEndArcAddr+self.nBytesNodeAddress - return l - return [] - - def _stem1 (self, sWord): - "returns stems list of " - iAddr = 0 - for c in sWord: - if c not in self.dChar: - return [] - iAddr = self._lookupArcNode(self.dChar[c], iAddr) - if iAddr == None: - return [] - if (int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask): - l = [] - nRawArc = 0 - while not (nRawArc & self._lastArcMask): - iEndArcAddr = iAddr + self.nBytesArc - nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') - nArc = nRawArc & self._arcMask - if nArc >= self.nChar: - # This value is not a char, this is a stemming code - l.append(self.funcStemming(sWord, self.lArcVal[nArc])) - iAddr = iEndArcAddr+self.nBytesNodeAddress - return l - return [] - - def _lookupArcNode1 (self, nVal, iAddr): - "looks if is an arc at the node at , if yes, returns address of next node else None" - while True: - iEndArcAddr = iAddr+self.nBytesArc - nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') - if nVal == (nRawArc & self._arcMask): - # the value we are looking for - # we return the address of the next node - return int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big') - else: - # value not found - if (nRawArc & self._lastArcMask): - return None - iAddr = iEndArcAddr+self.nBytesNodeAddress - - def _getArcs1 (self, iAddr): - "generator: return all arcs at as tuples of (nVal, iAddr)" - while True: - iEndArcAddr = iAddr+self.nBytesArc - nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') - yield (nRawArc & self._arcMask, int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big')) - if (nRawArc & self._lastArcMask): - break - iAddr = iEndArcAddr+self.nBytesNodeAddress - - def _writeNodes1 (self, spfDest): - "for debugging only" - print(" > Write binary nodes") - with codecs.open(spfDest, 'w', 'utf-8', newline="\n") as hDst: - iAddr = 0 - hDst.write("i{:_>10} -- #{:_>10}\n".format("0", iAddr)) - while iAddr < len(self.byDic): - iEndArcAddr = iAddr+self.nBytesArc - nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') - nArc = nRawArc & self._arcMask - hDst.write(" {:<20} {:0>16} i{:>10} #{:_>10}\n".format(self.lArcVal[nArc], bin(nRawArc)[2:], "?", \ - int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], \ - byteorder='big'))) - iAddr = iEndArcAddr+self.nBytesNodeAddress - if (nRawArc & self._lastArcMask) and iAddr < len(self.byDic): - hDst.write("\ni{:_>10} -- #{:_>10}\n".format("?", iAddr)) - hDst.close() - - # VERSION 2 - def _morph2 (self, sWord): - "returns morphologies of " - iAddr = 0 - for c in sWord: - if c not in self.dChar: - return [] - iAddr = self._lookupArcNode(self.dChar[c], iAddr) - if iAddr == None: - return [] - if (int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask): - l = [] - nRawArc = 0 - while not (nRawArc & self._lastArcMask): - iEndArcAddr = iAddr + self.nBytesArc - nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') - nArc = nRawArc & self._arcMask - if nArc >= self.nChar: - # This value is not a char, this is a stemming code - sStem = ">" + self.funcStemming(sWord, self.lArcVal[nArc]) - # Now , we go to the next node and retrieve all following arcs values, all of them are tags - if not (nRawArc & self._addrBitMask): - iAddr2 = int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big') - else: - # we go to the end of the node - iAddr2 = iEndArcAddr - while not (nRawArc & self._lastArcMask): - nRawArc = int.from_bytes(self.byDic[iAddr2:iAddr2+self.nBytesArc], byteorder='big') - iAddr2 += self.nBytesArc + self.nBytesNodeAddress - nRawArc2 = 0 - while not (nRawArc2 & self._lastArcMask): - iEndArcAddr2 = iAddr2 + self.nBytesArc - nRawArc2 = int.from_bytes(self.byDic[iAddr2:iEndArcAddr2], byteorder='big') - l.append(sStem + " " + self.lArcVal[nRawArc2 & self._arcMask]) - iAddr2 = iEndArcAddr2+self.nBytesNodeAddress if not (nRawArc2 & self._addrBitMask) else iEndArcAddr2 - iAddr = iEndArcAddr+self.nBytesNodeAddress if not (nRawArc & self._addrBitMask) else iEndArcAddr - return l - return [] - - def _stem2 (self, sWord): - "returns stems list of " - iAddr = 0 - for c in sWord: - if c not in self.dChar: - return [] - iAddr = self._lookupArcNode(self.dChar[c], iAddr) - if iAddr == None: - return [] - if (int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask): - l = [] - nRawArc = 0 - while not (nRawArc & self._lastArcMask): - iEndArcAddr = iAddr + self.nBytesArc - nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') - nArc = nRawArc & self._arcMask - if nArc >= self.nChar: - # This value is not a char, this is a stemming code - l.append(self.funcStemming(sWord, self.lArcVal[nArc])) - # Now , we go to the next node - if not (nRawArc & self._addrBitMask): - iAddr2 = int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big') - else: - # we go to the end of the node - iAddr2 = iEndArcAddr - while not (nRawArc & self._lastArcMask): - nRawArc = int.from_bytes(self.byDic[iAddr2:iAddr2+self.nBytesArc], byteorder='big') - iAddr2 += self.nBytesArc + self.nBytesNodeAddress - iAddr = iEndArcAddr+self.nBytesNodeAddress if not (nRawArc & self._addrBitMask) else iEndArcAddr - return l - return [] - - def _lookupArcNode2 (self, nVal, iAddr): - "looks if is an arc at the node at , if yes, returns address of next node else None" - while True: - iEndArcAddr = iAddr+self.nBytesArc - nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') - if nVal == (nRawArc & self._arcMask): - # the value we are looking for - if not (nRawArc & self._addrBitMask): - # we return the address of the next node - return int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big') - else: - # we go to the end of the node - iAddr = iEndArcAddr - while not (nRawArc & self._lastArcMask): - nRawArc = int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') - iAddr += self.nBytesArc + self.nBytesNodeAddress if not (nRawArc & self._addrBitMask) else self.nBytesArc - return iAddr - else: - # value not found - if (nRawArc & self._lastArcMask): - return None - iAddr = iEndArcAddr+self.nBytesNodeAddress if not (nRawArc & self._addrBitMask) else iEndArcAddr - - def _writeNodes2 (self, spfDest): - "for debugging only" - print(" > Write binary nodes") - with codecs.open(spfDest, 'w', 'utf-8', newline="\n") as hDst: - iAddr = 0 - hDst.write("i{:_>10} -- #{:_>10}\n".format("0", iAddr)) - while iAddr < len(self.byDic): - iEndArcAddr = iAddr+self.nBytesArc - nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') - nArc = nRawArc & self._arcMask - if not (nRawArc & self._addrBitMask): - iNextNodeAddr = int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big') - hDst.write(" {:<20} {:0>16} i{:>10} #{:_>10}\n".format(self.lArcVal[nArc], bin(nRawArc)[2:], "?", iNextNodeAddr)) - iAddr = iEndArcAddr+self.nBytesNodeAddress - else: - hDst.write(" {:<20} {:0>16}\n".format(self.lArcVal[nArc], bin(nRawArc)[2:])) - iAddr = iEndArcAddr - if (nRawArc & self._lastArcMask): - hDst.write("\ni{:_>10} -- #{:_>10}\n".format("?", iAddr)) - hDst.close() - - # VERSION 3 - def _morph3 (self, sWord): - "returns morphologies of " - iAddr = 0 - for c in sWord: - if c not in self.dChar: - return [] - iAddr = self._lookupArcNode(self.dChar[c], iAddr) - if iAddr == None: - return [] - if (int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask): - l = [] - nRawArc = 0 - iAddrNode = iAddr - while not (nRawArc & self._lastArcMask): - iEndArcAddr = iAddr + self.nBytesArc - nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') - nArc = nRawArc & self._arcMask - if nArc >= self.nChar: - # This value is not a char, this is a stemming code - sStem = ">" + self.funcStemming(sWord, self.lArcVal[nArc]) - # Now , we go to the next node and retrieve all following arcs values, all of them are tags - if not (nRawArc & self._addrBitMask): - iAddr2 = int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big') - else: - iAddr2 = iAddrNode + int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesOffset], byteorder='big') - nRawArc2 = 0 - while not (nRawArc2 & self._lastArcMask): - iEndArcAddr2 = iAddr2 + self.nBytesArc - nRawArc2 = int.from_bytes(self.byDic[iAddr2:iEndArcAddr2], byteorder='big') - l.append(sStem + " " + self.lArcVal[nRawArc2 & self._arcMask]) - iAddr2 = iEndArcAddr2+self.nBytesNodeAddress if not (nRawArc2 & self._addrBitMask) else iEndArcAddr2+self.nBytesOffset - iAddr = iEndArcAddr+self.nBytesNodeAddress if not (nRawArc & self._addrBitMask) else iEndArcAddr+self.nBytesOffset - return l - return [] - - def _stem3 (self, sWord): - "returns stems list of " - iAddr = 0 - for c in sWord: - if c not in self.dChar: - return [] - iAddr = self._lookupArcNode(self.dChar[c], iAddr) - if iAddr == None: - return [] - if (int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask): - l = [] - nRawArc = 0 - iAddrNode = iAddr - while not (nRawArc & self._lastArcMask): - iEndArcAddr = iAddr + self.nBytesArc - nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') - nArc = nRawArc & self._arcMask - if nArc >= self.nChar: - # This value is not a char, this is a stemming code - l.append(self.funcStemming(sWord, self.lArcVal[nArc])) - iAddr = iEndArcAddr+self.nBytesNodeAddress if not (nRawArc & self._addrBitMask) else iEndArcAddr+self.nBytesOffset - return l - return [] - - def _lookupArcNode3 (self, nVal, iAddr): - "looks if is an arc at the node at , if yes, returns address of next node else None" - iAddrNode = iAddr - while True: - iEndArcAddr = iAddr+self.nBytesArc - nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') - if nVal == (nRawArc & self._arcMask): - # the value we are looking for - if not (nRawArc & self._addrBitMask): - return int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big') - else: - return iAddrNode + int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesOffset], byteorder='big') - else: - # value not found - if (nRawArc & self._lastArcMask): - return None - iAddr = iEndArcAddr+self.nBytesNodeAddress if not (nRawArc & self._addrBitMask) else iEndArcAddr+self.nBytesOffset - - def _writeNodes3 (self, spfDest): - "for debugging only" - print(" > Write binary nodes") - with codecs.open(spfDest, 'w', 'utf-8', newline="\n") as hDst: - iAddr = 0 - hDst.write("i{:_>10} -- #{:_>10}\n".format("0", iAddr)) - while iAddr < len(self.byDic): - iEndArcAddr = iAddr+self.nBytesArc - nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') - nArc = nRawArc & self._arcMask - if not (nRawArc & self._addrBitMask): - iNextNodeAddr = int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big') - hDst.write(" {:<20} {:0>16} i{:>10} #{:_>10}\n".format(self.lArcVal[nArc], bin(nRawArc)[2:], "?", iNextNodeAddr)) - iAddr = iEndArcAddr+self.nBytesNodeAddress - else: - iNextNodeAddr = int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesOffset], byteorder='big') - hDst.write(" {:<20} {:0>16} i{:>10} +{:_>10}\n".format(self.lArcVal[nArc], bin(nRawArc)[2:], "?", iNextNodeAddr)) - iAddr = iEndArcAddr+self.nBytesOffset - if (nRawArc & self._lastArcMask): - hDst.write("\ni{:_>10} -- #{:_>10}\n".format("?", iAddr)) - hDst.close() DELETED gc_core/py/keyboard_chars_proximity.py Index: gc_core/py/keyboard_chars_proximity.py ================================================================== --- gc_core/py/keyboard_chars_proximity.py +++ /dev/null @@ -1,220 +0,0 @@ -# Keyboard chars proximity - - -def getKeyboardMap (sKeyboard): - return _dKeyboardMap.get(sKeyboard.lower(), {}) - - -def getKeyboardList (): - return _dKeyboardMap.keys() - - -_dKeyboardMap = { - # keyboards by alphabetical order - # bépo, colemak and dvorak users are assumed to do less typing errors. - "azerty": { - # fr - # line 1 - "é": "az", - "è": "yu", - "ç": "àio", - "à": "op", - # line 2 - "a": "zéq", - "z": "aesq", - "e": "zrds", - "r": "etfd", - "t": "rygf", - "y": "tuhg", - "u": "yijh", - "i": "uokj", - "o": "iplk", - "p": "oml", - # line 3 - "q": "sawz", - "s": "qdzwxe", - "d": "sfexcr", - "f": "dgrcvt", - "g": "fhtvby", - "h": "gjybnu", - "j": "hkuni", - "k": "jlio", - "l": "kmop", - "m": "lùp", - "ù": "m", - # line 4 - "w": "xqs", - "x": "wcsd", - "c": "xvdf", - "v": "cbfg", - "b": "vngh", - "n": "bhj", - }, - "bépo": { - # fr - # line 2 - "b": "éa", - "é": "bpu", - "p": "éoi", - "o": "pèe", - "è": "o", - "v": "dt", - "d": "vls", - "l": "djr", - "j": "lzn", - "z": "jmw", - # line 3 - "a": "ubà", - "u": "aiéy", - "i": "uepx", - "e": "io", - "c": "t", - "t": "csvq", - "s": "trdg", - "r": "snlh", - "n": "rmjf", - "m": "nzç", - # line 4 - "à": "yêa", - "y": "àxu", - "x": "ywi", - "w": "z", - "k": "c", - "q": "gt", - "g": "qhs", - "h": "gfr", - "f": "hçn", - "ç": "fm", - }, - "colemak": { - # en, us, intl - # line 2 - "q": "wa", - "w": "qfr", - "f": "wps", - "p": "fgt", - "g": "pjd", - "j": "glh", - "l": "jun", - "u": "lye", - "y": "ui", - # line 3 - "a": "rqz", - "r": "aswx", - "s": "rtfc", - "t": "sdpv", - "d": "thgb", - "h": "dnjk", - "n": "helm", - "e": "niu", - "i": "eoy", - "o": "i", - # line 4 - "z": "xa", - "x": "zcr", - "c": "xvs", - "v": "cbt", - "b": "vkd", - "k": "bmh", - "m": "kn", - }, - "dvorak": { - # en, us, intl - # line 2 - "p": "yu", - "y": "pfi", - "f": "ygd", - "g": "fch", - "c": "grt", - "r": "cln", - "l": "rs", - # line 3 - "a": "o", - "o": "aeq", - "e": "ouj", - "u": "eipk", - "i": "udyx", - "d": "ihfb", - "h": "dtgm", - "t": "hncw", - "n": "tsrv", - "s": "nlz", - # line 4 - "q": "jo", - "j": "qke", - "k": "jxu", - "x": "kbi", - "b": "xmd", - "m": "bwh", - "w": "mvt", - "v": "wzn", - "z": "vs", - }, - "qwerty": { - # en, us, intl - # line 2 - "q": "wa", - "w": "qeas", - "e": "wrds", - "r": "etfd", - "t": "rygf", - "y": "tuhg", - "u": "yijh", - "i": "uokj", - "o": "iplk", - "p": "ol", - # line 3 - "a": "sqzw", - "s": "adwzxe", - "d": "sfexcr", - "f": "dgrcvt", - "g": "fhtvby", - "h": "gjybnu", - "j": "hkunmi", - "k": "jlimo", - "l": "kop", - # line 4 - "z": "xas", - "x": "zcsd", - "c": "xvdf", - "v": "cbfg", - "b": "vngh", - "n": "bmhj", - "m": "njk", - }, - "qwertz": { - # ge, au - # line 2 - "q": "wa", - "w": "qeas", - "e": "wrds", - "r": "etfd", - "t": "rzgf", - "z": "tuhg", - "u": "zijh", - "i": "uokj", - "o": "iplk", - "p": "oüöl", - "ü": "päö", - # line 3 - "a": "sqyw", - "s": "adwyxe", - "d": "sfexcr", - "f": "dgrcvt", - "g": "fhtvbz", - "h": "gjzbnu", - "j": "hkunmi", - "k": "jlimo", - "l": "köop", - "ö": "läpü", - "ä": "öü", - # line 4 - "y": "xas", - "x": "ycsd", - "c": "xvdf", - "v": "cbfg", - "b": "vngh", - "n": "bmhj", - "m": "njk", - } -} Index: gc_core/py/lang_core/gc_engine.py ================================================================== --- gc_core/py/lang_core/gc_engine.py +++ gc_core/py/lang_core/gc_engine.py @@ -6,12 +6,12 @@ import os import traceback #import unicodedata from itertools import chain -from ..ibdawg import IBDAWG -from ..echo import echo +from ..graphspell.ibdawg import IBDAWG +from ..graphspell.echo import echo from . import gc_options __all__ = [ "lang", "locales", "pkg", "name", "version", "author", \ "load", "parse", "getDictionary", \ DELETED gc_core/py/progressbar.py Index: gc_core/py/progressbar.py ================================================================== --- gc_core/py/progressbar.py +++ /dev/null @@ -1,35 +0,0 @@ -# Textual progressbar -# by Olivier R. -# License: MPL 2 - -import time - -class ProgressBar: - "Textual progressbar" - - def __init__ (self, nMin=0, nMax=100, nWidth=78): - "initiate with minimum nMin to maximum nMax" - self.nMin = nMin - self.nMax = nMax - self.nSpan = nMax - nMin - self.nWidth = nWidth-9 - self.nAdvance = -1 - self.nCurVal = nMin - self.startTime = time.time() - self._update() - - def _update (self): - fDone = ((self.nCurVal - self.nMin) / self.nSpan) - nAdvance = int(fDone * self.nWidth) - if (nAdvance > self.nAdvance): - self.nAdvance = nAdvance - print("\r[ {}{} {}% ] ".format('>'*nAdvance, ' '*(self.nWidth-nAdvance), round(fDone*100)), end="") - - def increment (self, n=1): - "increment value by n (1 by default)" - self.nCurVal += n - self._update() - - def done (self): - "to call when it’s finished" - print("\r[ task done in {:.1f} s ] ".format(time.time() - self.startTime)) DELETED gc_core/py/spellchecker.py Index: gc_core/py/spellchecker.py ================================================================== --- gc_core/py/spellchecker.py +++ /dev/null @@ -1,134 +0,0 @@ -# Spellchecker -# Wrapper for the IBDAWG class. -# Useful to check several dictionaries at once. - -from . import ibdawg - - -dDictionaries = { - "fr": "French.bdic", - "en": "English.bdic" -} - - -class Spellchecker (): - - def __init__ (self, sLangCode): - self.sLangCode = sLangCode - self.oMainDic = None - if sLangCode in dDictionaries: - self.oMainDic = ibdawg.IBDAWG(dDictionaries[sLangCode]) - self.lOtherDic = [] - return bool(self.oMainDic) - - - def setMainDictionary (self, sDicName): - try: - self.oMainDic = ibdawg.IBDAWG(sDicName) - return True - except: - print("Error: <" + sDicName + "> not set as main dictionary.") - return False - - def addDictionary (self, sDicName): - try: - self.lOtherDic.append(ibdawg.IBDAWG(sDicName)) - return True - except: - print("Error: <" + sDicName + "> not added to the list.") - return False - - # Return codes: - # 0: invalid - # 1: correct in main dictionary - # 2+: correct in foreign dictionaries - - - # check in the main dictionary only - - def isValidToken (self, sToken): - "(in main dictionary) checks if sToken is valid (if there is hyphens in sToken, sToken is split, each part is checked)" - if self.oMainDic.isValidToken(sToken): - return 1 - return 0 - - def isValid (self, sWord): - "(in main dictionary) checks if sWord is valid (different casing tested if the first letter is a capital)" - if self.oMainDic.isValid(sWord): - return 1 - return 0 - - def lookup (self, sWord): - "(in main dictionary) checks if sWord is in dictionary as is (strict verification)" - if self.oMainDic.lookup(sWord): - return 1 - return 0 - - - # check in all dictionaries - - def isValidTokenAll (self, sToken): - "(in all dictionaries) checks if sToken is valid (if there is hyphens in sToken, sToken is split, each part is checked)" - if self.oMainDic.isValidToken(sToken): - return 1 - for i, oDic in enumerate(self.lOtherDic, 2): - if oDic.isValidToken(sToken): - return i - return 0 - - def isValidAll (self, sWord): - "(in all dictionaries) checks if sWord is valid (different casing tested if the first letter is a capital)" - if self.oMainDic.isValid(sToken): - return 1 - for i, oDic in enumerate(self.lOtherDic, 2): - if oDic.isValid(sToken): - return i - return 0 - - def lookupAll (self, sWord): - "(in all dictionaries) checks if sWord is in dictionary as is (strict verification)" - if self.oMainDic.lookup(sToken): - return 1 - for i, oDic in enumerate(self.lOtherDic, 2): - if oDic.lookup(sToken): - return i - return 0 - - - # check in dictionaries up to level n - - def isValidTokenLevel (self, sToken, nLevel): - "(in dictionaries up to level n) checks if sToken is valid (if there is hyphens in sToken, sToken is split, each part is checked)" - if self.oMainDic.isValidToken(sToken): - return 1 - if nLevel >= 2: - for i, oDic in enumerate(self.lOtherDic, 2): - if oDic.isValidToken(sToken): - return i - if i == nLevel: - break - return 0 - - def isValidLevel (self, sWord, nLevel): - "(in dictionaries up to level n) checks if sWord is valid (different casing tested if the first letter is a capital)" - if self.oMainDic.isValid(sToken): - return 1 - if nLevel >= 2: - for i, oDic in enumerate(self.lOtherDic, 2): - if oDic.isValid(sToken): - return i - if i == nLevel: - break - return 0 - - def lookupLevel (self, sWord, nLevel): - "(in dictionaries up to level n) checks if sWord is in dictionary as is (strict verification)" - if self.oMainDic.lookup(sToken): - return 1 - if nLevel >= 2: - for i, oDic in enumerate(self.lOtherDic, 2): - if oDic.lookup(sToken): - return i - if i == nLevel: - break - return 0 DELETED gc_core/py/str_transform.py Index: gc_core/py/str_transform.py ================================================================== --- gc_core/py/str_transform.py +++ /dev/null @@ -1,203 +0,0 @@ -#!python3 - - -#### DISTANCE CALCULATIONS - -def longestCommonSubstring (s1, s2): - # http://en.wikipedia.org/wiki/Longest_common_substring_problem - # http://en.wikibooks.org/wiki/Algorithm_implementation/Strings/Longest_common_substring - M = [ [0]*(1+len(s2)) for i in range(1+len(s1)) ] - longest, x_longest = 0, 0 - for x in range(1, 1+len(s1)): - for y in range(1, 1+len(s2)): - if s1[x-1] == s2[y-1]: - M[x][y] = M[x-1][y-1] + 1 - if M[x][y] > longest: - longest = M[x][y] - x_longest = x - else: - M[x][y] = 0 - return s1[x_longest-longest : x_longest] - - -def distanceDamerauLevenshtein (s1, s2): - "distance of Damerau-Levenshtein between and " - # https://fr.wikipedia.org/wiki/Distance_de_Damerau-Levenshtein - d = {} - nLen1 = len(s1) - nLen2 = len(s2) - for i in range(-1, nLen1+1): - d[i, -1] = i + 1 - for j in range(-1, nLen2+1): - d[-1, j] = j + 1 - for i in range(nLen1): - for j in range(nLen2): - nCost = 0 if s1[i] == s2[j] else 1 - d[i, j] = min( - d[i-1, j] + 1, # Deletion - d[i, j-1] + 1, # Insertion - d[i-1, j-1] + nCost, # Substitution - ) - if i and j and s1[i] == s2[j-1] and s1[i-1] == s2[j]: - d[i, j] = min(d[i, j], d[i-2, j-2] + nCost) # Transposition - return d[nLen1-1, nLen2-1] - - -def distanceSift4 (s1, s2, nMaxOffset=5): - "implementation of general Sift4." - # https://siderite.blogspot.com/2014/11/super-fast-and-accurate-string-distance.html - if not s1: - return len(s2) - if not s2: - return len(s1) - nLen1, nLen2 = len(s1), len(s2) - i1, i2 = 0, 0 # Cursors for each string - nLargestCS = 0 # Largest common substring - nLocalCS = 0 # Local common substring - nTrans = 0 # Number of transpositions ('ab' vs 'ba') - lOffset = [] # Offset pair array, for computing the transpositions - - while i1 < nLen1 and i2 < nLen2: - if s1[i1] == s2[i2]: - nLocalCS += 1 - # Check if current match is a transposition - bTrans = False - i = 0 - while i < len(lOffset): - t = lOffset[i] - if i1 <= t[0] or i2 <= t[1]: - bTrans = abs(i2-i1) >= abs(t[1] - t[0]) - if bTrans: - nTrans += 1 - elif not t[2]: - t[2] = True - nTrans += 1 - break - elif i1 > t[1] and i2 > t[0]: - del lOffset[i] - else: - i += 1 - lOffset.append([i1, i2, bTrans]) - else: - nLargestCS += nLocalCS - nLocalCS = 0 - if i1 != i2: - i1 = i2 = min(i1, i2) - for i in range(nMaxOffset): - if i1 + i >= nLen1 and i2 + i >= nLen2: - break - elif i1 + i < nLen1 and s1[i1+i] == s2[i2]: - i1 += i - 1 - i2 -= 1 - break - elif i2 + i < nLen2 and s1[i1] == s2[i2+i]: - i2 += i - 1 - i1 -= 1 - break - i1 += 1 - i2 += 1 - if i1 >= nLen1 or i2 >= nLen2: - nLargestCS += nLocalCS - nLocalCS = 0 - i1 = i2 = min(i1, i2) - nLargestCS += nLocalCS - return round(max(nLen1, nLen2) - nLargestCS + nTrans) - - -def showDistance (s1, s2): - print("Damerau-Levenshtein: " + s1 + "/" + s2 + " = " + distanceDamerauLevenshtein(s1, s2)) - print("Sift4:" + s1 + "/" + s2 + " = " + distanceSift4(s1, s2)) - - - - -#### STEMMING OPERATIONS - -## No stemming - -def noStemming (sFlex, sStem): - return sStem - -def rebuildWord (sFlex, cmd1, cmd2): - if cmd1 == "_": - return sFlex - n, c = cmd1.split(":") - s = s[:n] + c + s[n:] - if cmd2 == "_": - return s - n, c = cmd2.split(":") - return s[:n] + c + s[n:] - - -## Define affixes for stemming - -# Note: 48 is the ASCII code for "0" - - -# Suffix only -def defineSuffixCode (sFlex, sStem): - """ Returns a string defining how to get stem from flexion - "n(sfx)" - with n: a char with numeric meaning, "0" = 0, "1" = 1, ... ":" = 10, etc. (See ASCII table.) Says how many letters to strip from flexion. - sfx [optional]: string to add on flexion - Examples: - "0": strips nothing, adds nothing - "1er": strips 1 letter, adds "er" - "2": strips 2 letters, adds nothing - """ - if sFlex == sStem: - return "0" - jSfx = 0 - for i in range(min(len(sFlex), len(sStem))): - if sFlex[i] != sStem[i]: - break - jSfx += 1 - return chr(len(sFlex)-jSfx+48) + sStem[jSfx:] - - -def changeWordWithSuffixCode (sWord, sSfxCode): - if sSfxCode == "0": - return sWord - return sWord[:-(ord(sSfxCode[0])-48)] + sSfxCode[1:] if sSfxCode[0] != '0' else sWord + sSfxCode[1:] - - -# Prefix and suffix - -def defineAffixCode (sFlex, sStem): - """ Returns a string defining how to get stem from flexion. Examples: - "0" if stem = flexion - "stem" if no common substring - "n(pfx)/m(sfx)" - with n and m: chars with numeric meaning, "0" = 0, "1" = 1, ... ":" = 10, etc. (See ASCII table.) Says how many letters to strip from flexion. - pfx [optional]: string to add before the flexion - sfx [optional]: string to add after the flexion - """ - if sFlex == sStem: - return "0" - # is stem a substring of flexion? - n = sFlex.find(sStem) - if n >= 0: - return "{}/{}".format(chr(n+48), chr(len(sFlex)-(len(sStem)+n)+48)) - # no, so we are looking for common substring - sSubs = longestCommonSubstring(sFlex, sStem) - if len(sSubs) > 1: - iPos = sStem.find(sSubs) - sPfx = sStem[:iPos] - sSfx = sStem[iPos+len(sSubs):] - n = sFlex.find(sSubs) - m = len(sFlex) - (len(sSubs)+n) - sAff = "{}/".format(chr(n+48)) if not sPfx else "{}{}/".format(chr(n+48), sPfx) - sAff += chr(m+48) if not sSfx else "{}{}".format(chr(m+48), sSfx) - return sAff - return sStem - - -def changeWordWithAffixCode (sWord, sAffCode): - if sAffCode == "0": - return sWord - if '/' not in sAffCode: - return "# error #" - sPfxCode, sSfxCode = sAffCode.split('/') - sWord = sPfxCode[1:] + sWord[(ord(sPfxCode[0])-48):] - return sWord[:-(ord(sSfxCode[0])-48)] + sSfxCode[1:] if sSfxCode[0] != '0' else sWord + sSfxCode[1:] - DELETED gc_core/py/tokenizer.py Index: gc_core/py/tokenizer.py ================================================================== --- gc_core/py/tokenizer.py +++ /dev/null @@ -1,49 +0,0 @@ -# Very simple tokenizer - -import re - -_PATTERNS = { - "default": - ( - r'(?P/(?:bin|boot|dev|etc|home|lib|mnt|opt|root|sbin|tmp|usr|var|Bureau|Documents|Images|Musique|Public|Téléchargements|Vidéos)(?:/[\w.()-]+)*)', - r'(?P[a-zA-Z]:\\(?:Program Files(?: [(]x86[)]|)|[\w.()]+)(?:\\[\w.()-]+)*)', - r'(?P[.,?!:;…«»“”"()/·]+)', - r'(?P[A-Z][.][A-Z][.](?:[A-Z][.])*)', - r'(?P(?:https?://|www[.]|\w+[@.]\w\w+[@.])\w[\w./?&!%=+*"\'@$#-]+)', - r'(?P[#@][\w-]+)', - r'(?P<\w+.*?>|)', - r'(?P\[/?\w+\])', - r'(?P\d\d?h\d\d\b)', - r'(?P-?\d+(?:[.,]\d+))', - r"(?P\w+(?:[’'`-]\w+)*)" - ), - "fr": - ( - r'(?P/(?:bin|boot|dev|etc|home|lib|mnt|opt|root|sbin|tmp|usr|var|Bureau|Documents|Images|Musique|Public|Téléchargements|Vidéos)(?:/[\w.()-]+)*)', - r'(?P[a-zA-Z]:\\(?:Program Files(?: [(]x86[)]|)|[\w.()]+)(?:\\[\w.()-]+)*)', - r'(?P[.,?!:;…«»“”"()/·]+)', - r'(?P[A-Z][.][A-Z][.](?:[A-Z][.])*)', - r'(?P(?:https?://|www[.]|\w+[@.]\w\w+[@.])\w[\w./?&!%=+*"\'@$#-]+)', - r'(?P[#@][\w-]+)', - r'(?P<\w+.*?>|)', - r'(?P\[/?\w+\])', - r"(?P(?:l|d|n|m|t|s|j|c|ç|lorsqu|puisqu|jusqu|quoiqu|qu)['’`])", - r'(?P\d+(?:er|nd|e|de|ième|ème|eme)\b)', - r'(?P\d\d?h\d\d\b)', - r'(?P-?\d+(?:[.,]\d+|))', - r"(?P\w+(?:[’'`-]\w+)*)" - ) -} - - -class Tokenizer: - - def __init__ (self, sLang): - self.sLang = sLang - if sLang not in _PATTERNS: - self.sLang = "default" - self.zToken = re.compile( "(?i)" + '|'.join(sRegex for sRegex in _PATTERNS[sLang]) ) - - def genTokens (self, sText): - for m in self.zToken.finditer(sText): - yield { "sType": m.lastgroup, "sValue": m.group(), "nStart": m.start(), "nEnd": m.end() } Index: gc_lang/fr/build.py ================================================================== --- gc_lang/fr/build.py +++ gc_lang/fr/build.py @@ -37,11 +37,11 @@ sHTML += '\n' return sHTML def createFirefoxExtension (sLang, dVars): - "create extension for Firefox" + "create extension for Firefox (obsolete)" print("Building extension for Firefox") helpers.createCleanFolder("_build/xpi/"+sLang) dir_util.copy_tree("gc_lang/"+sLang+"/xpi/", "_build/xpi/"+sLang) dir_util.copy_tree("grammalecte-js", "_build/xpi/"+sLang+"/grammalecte") sHTML, dProperties = _createOptionsForFirefox(dVars) @@ -56,10 +56,11 @@ with helpers.cd("_build/xpi/"+sLang): os.system("jpm xpi") def _createOptionsForFirefox (dVars): + # obsolete sHTML = "" for sSection, lOpt in dVars['lStructOpt']: sHTML += '\n
\n

\n' for lLineOpt in lOpt: for sOpt in lLineOpt: @@ -81,11 +82,10 @@ _copyGrammalecteJSPackageInZipFile(hZip, spLangPack, dVars['dic_name']+".json") for spf in ["LICENSE.txt", "LICENSE.fr.txt"]: hZip.write(spf) dVars = _createOptionsForThunderbird(dVars) helpers.addFolderToZipAndFileFile(hZip, "gc_lang/"+sLang+"/tb", "", dVars, True) - hZip.write("gc_lang/"+sLang+"/xpi/gce_worker.js", "worker/gce_worker.js") spDict = "gc_lang/"+sLang+"/xpi/data/dictionaries" for sp in os.listdir(spDict): if os.path.isdir(spDict+"/"+sp): hZip.write(spDict+"/"+sp+"/"+sp+".dic", "content/dictionaries/"+sp+"/"+sp+".dic") hZip.write(spDict+"/"+sp+"/"+sp+".aff", "content/dictionaries/"+sp+"/"+sp+".aff") @@ -112,9 +112,12 @@ def _copyGrammalecteJSPackageInZipFile (hZip, spLangPack, sDicName, sAddPath=""): for sf in os.listdir("grammalecte-js"): if not os.path.isdir("grammalecte-js/"+sf): hZip.write("grammalecte-js/"+sf, sAddPath+"grammalecte-js/"+sf) + for sf in os.listdir("grammalecte-js/graphspell"): + if not os.path.isdir("grammalecte-js/graphspell/"+sf): + hZip.write("grammalecte-js/graphspell/"+sf, sAddPath+"grammalecte-js/graphspell/"+sf) + hZip.write("grammalecte-js/graphspell/_dictionaries/"+sDicName, sAddPath+"grammalecte-js/graphspell/_dictionaries/"+sDicName) for sf in os.listdir(spLangPack): if not os.path.isdir(spLangPack+"/"+sf): hZip.write(spLangPack+"/"+sf, sAddPath+spLangPack+"/"+sf) - hZip.write("grammalecte-js/_dictionaries/"+sDicName, sAddPath+"grammalecte-js/_dictionaries/"+sDicName) Index: gc_lang/fr/build_data.py ================================================================== --- gc_lang/fr/build_data.py +++ gc_lang/fr/build_data.py @@ -7,15 +7,15 @@ import json import os import itertools -import grammalecte.ibdawg as ibdawg -from grammalecte.echo import echo -from grammalecte.str_transform import defineSuffixCode +import graphspell.ibdawg as ibdawg +from graphspell.echo import echo +from graphspell.str_transform import defineSuffixCode +import graphspell.tokenizer as tkz import grammalecte.fr.conj as conj -import grammalecte.tokenizer as tkz class cd: """Context manager for changing the current working directory""" def __init__ (self, newPath): Index: gc_lang/fr/modules-js/conj.js ================================================================== --- gc_lang/fr/modules-js/conj.js +++ gc_lang/fr/modules-js/conj.js @@ -7,11 +7,11 @@ ${map} if (typeof(require) !== 'undefined') { - var helpers = require("resource://grammalecte/helpers.js"); + var helpers = require("resource://grammalecte/graphspell/helpers.js"); } var conj = { _lVtyp: [], _lTags: [], Index: gc_lang/fr/modules-js/lexicographe.js ================================================================== --- gc_lang/fr/modules-js/lexicographe.js +++ gc_lang/fr/modules-js/lexicographe.js @@ -8,11 +8,11 @@ ${string} ${map} if (typeof (require) !== 'undefined') { - var helpers = require("resource://grammalecte/helpers.js"); + var helpers = require("resource://grammalecte/graphspell/helpers.js"); } const _dTag = new Map([ [':G', "[mot grammatical]"], [':N', " nom,"], Index: gc_lang/fr/modules-js/mfsp.js ================================================================== --- gc_lang/fr/modules-js/mfsp.js +++ gc_lang/fr/modules-js/mfsp.js @@ -4,11 +4,11 @@ "use strict"; if (typeof(require) !== 'undefined') { - var helpers = require("resource://grammalecte/helpers.js"); + var helpers = require("resource://grammalecte/graphspell/helpers.js"); } var mfsp = { // list of affix codes Index: gc_lang/fr/modules-js/phonet.js ================================================================== --- gc_lang/fr/modules-js/phonet.js +++ gc_lang/fr/modules-js/phonet.js @@ -1,10 +1,10 @@ // Grammalecte - Suggestion phonétique /*jslint esversion: 6*/ if (typeof(require) !== 'undefined') { - var helpers = require("resource://grammalecte/helpers.js"); + var helpers = require("resource://grammalecte/graphspell/helpers.js"); } var phonet = { _dWord: new Map(), Index: gc_lang/fr/modules/tests.py ================================================================== --- gc_lang/fr/modules/tests.py +++ gc_lang/fr/modules/tests.py @@ -5,16 +5,16 @@ import os import re import time -from .. import ibdawg +from ..graphspell.ibdawg import IBDAWG +from ..graphspell.echo import echo from . import gc_engine as gce from . import conj from . import phonet from . import mfsp -from ..echo import echo def _fuckBackslashUTF8 (s): "fuck that shit" return s.replace("\u2019", "'").replace("\u2013", "–").replace("\u2014", "—") @@ -22,11 +22,11 @@ class TestDictionary (unittest.TestCase): @classmethod def setUpClass (cls): - cls.oDic = ibdawg.IBDAWG("French.bdic") + cls.oDic = IBDAWG("French.bdic") def test_lookup (self): for sWord in ["branche", "Émilie"]: self.assertTrue(self.oDic.lookup(sWord), sWord) Index: gc_lang/fr/rules.grx ================================================================== --- gc_lang/fr/rules.grx +++ gc_lang/fr/rules.grx @@ -5624,16 +5624,16 @@ (suiv\w+) +((?:ça +|ce(?:ci|la) +|)de (pr[èé]s?|prêts?)) @@0,$,$ <<- morph(\1, ">suivre ", False) >>> <<- \3 != "près" -3>> près # Confusion : écrivez “près” pour dire “proche de quelque chose”.|https://fr.wiktionary.org/wiki/pr%C3%A8s <<- ~2>> * __[i](loc_tenir_à_distance)__ - (t[eiî]\w+) +(([àa]) distance +(?:respectable +|))d(?:es?|u) @@0,$,w + (t[eiî]\w+) +(([àa]) distance +(?:respectable +|))d(?:es?|u) @@0,*,w <<- morph(\1, ">tenir ", False) >>> <<- \3 == "a" -3>> à # Confusion : “a” est une conjugaison du verbe “avoir”. Pour la préposition, écrivez “à”. <<- ~2>> * __[i](loc_tenir_compte)__ - (t[eiî]\w+) +(co(?:mp?|n)tes?|au courant) @@0,w + (t[eiî]\w+) +(co(?:mp?|n)tes?|au courant) @@0,$ <<- morph(\1, ">tenir ", False) >>> <<- morph(\2, ">co[mn]te(?:sse|) ", False) -2>> compte # Confusion. Dans la locution “tenir compte”, écrivez “compte” au singulier.|https://fr.wiktionary.org/wiki/tenir_compte <<- ~2>> * __[i](p_tirer_profit)__ (tir\w+) +(avantage|profit) d(?:es?|u) @@0,w ADDED gc_lang/fr/tb/worker/gce_worker.js Index: gc_lang/fr/tb/worker/gce_worker.js ================================================================== --- /dev/null +++ gc_lang/fr/tb/worker/gce_worker.js @@ -0,0 +1,132 @@ +// JavaScript + +// Grammar checker engine +// PromiseWorker +// This code is executed in a separate thread (×20 faster too!!!) + +// Firefox WTF: it’s impossible to use require as in the main thread here, +// so it is required to declare a resource in the file “chrome.manifest”. + + +"use strict"; + +// copy/paste +// https://developer.mozilla.org/en-US/docs/Mozilla/JavaScript_code_modules/PromiseWorker.jsm + +importScripts("resource://gre/modules/workers/require.js"); +let PromiseWorker = require("resource://gre/modules/workers/PromiseWorker.js"); + +// Instantiate AbstractWorker (see below). +let worker = new PromiseWorker.AbstractWorker(); + +worker.dispatch = function(method, args = []) { + // Dispatch a call to method `method` with args `args` + return self[method](...args); +}; +worker.postMessage = function(...args) { + // Post a message to the main thread + self.postMessage(...args); +}; +worker.close = function() { + // Close the worker + self.close(); +}; +worker.log = function(...args) { + // Log (or discard) messages (optional) + dump("Worker: " + args.join(" ") + "\n"); +}; + +// Connect it to message port. +self.addEventListener("message", msg => worker.handleMessage(msg)); + +// end of copy/paste + + +// no console here, use “dump” + +let gce = null; // module: grammar checker engine +let text = null; +let tkz = null; // module: tokenizer +let lxg = null; // module: lexicographer +let helpers = null; + +let oTokenizer = null; +let oDict = null; +let oLxg = null; + +function loadGrammarChecker (sGCOptions="", sContext="JavaScript") { + if (gce === null) { + try { + gce = require("resource://grammalecte/fr/gc_engine.js"); + helpers = require("resource://grammalecte/graphspell/helpers.js"); + text = require("resource://grammalecte/text.js"); + tkz = require("resource://grammalecte/graphspell/tokenizer.js"); + //lxg = require("resource://grammalecte/fr/lexicographe.js"); + oTokenizer = new tkz.Tokenizer("fr"); + //helpers.setLogOutput(worker.log); + gce.load(sContext); + oDict = gce.getDictionary(); + if (sGCOptions !== "") { + gce.setOptions(helpers.objectToMap(JSON.parse(sGCOptions))); + } + // we always retrieve options from the gce, for setOptions filters obsolete options + return gce.getOptions().gl_toString(); + } + catch (e) { + console.log("# Error: " + e.fileName + "\n" + e.name + "\nline: " + e.lineNumber + "\n" + e.message); + } + } +} + +function parse (sText, sCountry, bDebug, bContext) { + let aGrammErr = gce.parse(sText, sCountry, bDebug, bContext); + return JSON.stringify(aGrammErr); +} + +function parseAndSpellcheck (sText, sCountry, bDebug, bContext) { + let aGrammErr = gce.parse(sText, sCountry, bDebug, bContext); + let aSpellErr = oTokenizer.getSpellingErrors(sText, oDict); + return JSON.stringify({ aGrammErr: aGrammErr, aSpellErr: aSpellErr }); +} + +function getOptions () { + return gce.getOptions().gl_toString(); +} + +function getDefaultOptions () { + return gce.getDefaultOptions().gl_toString(); +} + +function setOptions (sGCOptions) { + gce.setOptions(helpers.objectToMap(JSON.parse(sGCOptions))); + return gce.getOptions().gl_toString(); +} + +function setOption (sOptName, bValue) { + gce.setOptions(new Map([ [sOptName, bValue] ])); + return gce.getOptions().gl_toString(); +} + +function resetOptions () { + gce.resetOptions(); + return gce.getOptions().gl_toString(); +} + +function fullTests (sGCOptions="") { + if (!gce || !oDict) { + return "# Error: grammar checker or dictionary not loaded." + } + let dMemoOptions = gce.getOptions(); + if (sGCOptions) { + gce.setOptions(helpers.objectToMap(JSON.parse(sGCOptions))); + } + let tests = require("resource://grammalecte/tests.js"); + let oTest = new tests.TestGrammarChecking(gce); + let sAllRes = ""; + for (let sRes of oTest.testParse()) { + console.log(sRes+"\n"); + sAllRes += sRes+"\n"; + } + gce.setOptions(dMemoOptions); + return sAllRes; +} Index: gc_lang/fr/webext/gce_worker.js ================================================================== --- gc_lang/fr/webext/gce_worker.js +++ gc_lang/fr/webext/gce_worker.js @@ -30,16 +30,16 @@ //console.log("[Worker] GC Engine Worker [start]"); //console.log(self); -importScripts("grammalecte/helpers.js"); -importScripts("grammalecte/str_transform.js"); -importScripts("grammalecte/char_player.js"); -importScripts("grammalecte/ibdawg.js"); +importScripts("grammalecte/graphspell/helpers.js"); +importScripts("grammalecte/graphspell/str_transform.js"); +importScripts("grammalecte/graphspell/char_player.js"); +importScripts("grammalecte/graphspell/ibdawg.js"); importScripts("grammalecte/text.js"); -importScripts("grammalecte/tokenizer.js"); +importScripts("grammalecte/graphspell/tokenizer.js"); importScripts("grammalecte/fr/conj.js"); importScripts("grammalecte/fr/mfsp.js"); importScripts("grammalecte/fr/phonet.js"); importScripts("grammalecte/fr/cregex.js"); importScripts("grammalecte/fr/gc_options.js"); @@ -51,11 +51,11 @@ Warning. Initialization can’t be completed at startup of the worker, for we need the path of the extension to load data stored in JSON files. This path is retrieved in background.js and passed with the event “init”. */ - +console.log("[Worker] imports odne"); function createResponse (sActionDone, result, dInfo, bEnd, bError=false) { return { "sActionDone": sActionDone, "result": result, // can be of any type @@ -158,11 +158,11 @@ //console.log("[Worker] Loading… Extension path: " + sExtensionPath); conj.init(helpers.loadFile(sExtensionPath + "/grammalecte/fr/conj_data.json")); phonet.init(helpers.loadFile(sExtensionPath + "/grammalecte/fr/phonet_data.json")); mfsp.init(helpers.loadFile(sExtensionPath + "/grammalecte/fr/mfsp_data.json")); //console.log("[Worker] Modules have been initialized…"); - gc_engine.load(sContext, sExtensionPath+"grammalecte/_dictionaries"); + gc_engine.load(sContext, sExtensionPath+"grammalecte/graphspell/_dictionaries"); oDict = gc_engine.getDictionary(); oTest = new TestGrammarChecking(gc_engine, sExtensionPath+"/grammalecte/fr/tests_data.json"); oTokenizer = new Tokenizer("fr"); oLocution = helpers.loadFile(sExtensionPath + "/grammalecte/fr/locutions_data.json"); Index: grammalecte-cli.py ================================================================== --- grammalecte-cli.py +++ grammalecte-cli.py @@ -7,12 +7,12 @@ import grammalecte.fr as gce import grammalecte.fr.lexicographe as lxg import grammalecte.fr.textformatter as tf import grammalecte.text as txt -import grammalecte.tokenizer as tkz -from grammalecte.echo import echo +import grammalecte.graphspell.tokenizer as tkz +from grammalecte.graphspell.echo import echo _EXAMPLE = "Quoi ? Racontes ! Racontes-moi ! Bon sangg, parles ! Oui. Il y a des menteur partout. " \ "Je suit sidéré par la brutales arrogance de cette homme-là. Quelle salopard ! Un escrocs de la pire espece. " \ "Quant sera t’il châtiés pour ses mensonge ? Merde ! J’en aie marre." Index: grammalecte-server.py ================================================================== --- grammalecte-server.py +++ grammalecte-server.py @@ -12,12 +12,12 @@ import grammalecte.fr as gce import grammalecte.fr.lexicographe as lxg import grammalecte.fr.textformatter as tf import grammalecte.text as txt -import grammalecte.tokenizer as tkz -from grammalecte.echo import echo +import grammalecte.graphspell.tokenizer as tkz +from grammalecte.graphspell.echo import echo HOMEPAGE = """ ADDED graphspell-js/char_player.js Index: graphspell-js/char_player.js ================================================================== --- /dev/null +++ graphspell-js/char_player.js @@ -0,0 +1,330 @@ +// list of similar chars +// useful for suggestion mechanism + +${map} + + +var char_player = { + + _dTransChars: new Map([ + ['à', 'a'], ['é', 'e'], ['î', 'i'], ['ô', 'o'], ['û', 'u'], ['ÿ', 'i'], ['y', 'i'], + ['â', 'a'], ['è', 'e'], ['ï', 'i'], ['ö', 'o'], ['ù', 'u'], ['ŷ', 'i'], + ['ä', 'a'], ['ê', 'e'], ['í', 'i'], ['ó', 'o'], ['ü', 'u'], ['ý', 'i'], + ['á', 'a'], ['ë', 'e'], ['ì', 'i'], ['ò', 'o'], ['ú', 'u'], ['ỳ', 'i'], + ['ā', 'a'], ['ē', 'e'], ['ī', 'i'], ['ō', 'o'], ['ū', 'u'], ['ȳ', 'i'], + ['ñ', 'n'], ['k', 'q'], ['w', 'v'], + ['œ', 'oe'], ['æ', 'ae'], + ]), + + simplifyWord: function (sWord) { + // word simplication before calculating distance between words + sWord = sWord.toLowerCase(); + let sNewWord = ""; + let i = 1; + for (let c of sWord) { + let cNew = this._dTransChars.gl_get(c, c); + let cNext = sWord.slice(i, i+1) + if (cNew != this._dTransChars.gl_get(cNext, cNext)) { + sNewWord += cNew; + } + i++; + } + return sNewWord.replace(/eau/g, "o").replace(/au/g, "o").replace(/ai/g, "e").replace(/ei/g, "e").replace(/ph/g, "f"); + }, + + aVowel: new Set("aáàâäāeéèêëēiíìîïīoóòôöōuúùûüūyýỳŷÿȳœæAÁÀÂÄĀEÉÈÊËĒIÍÌÎÏĪOÓÒÔÖŌUÚÙÛÜŪYÝỲŶŸȲŒÆ"), + aConsonant: new Set("bcçdfghjklmnñpqrstvwxzBCÇDFGHJKLMNÑPQRSTVWXZ"), + aDouble: new Set("bcdfjklmnprstzBCDFJKLMNPRSTZ"), // letters that may be used twice successively + + + // Similar chars + + d1to1: new Map([ + ["1", "liîLIÎ"], + ["2", "zZ"], + ["3", "eéèêEÉÈÊ"], + ["4", "aàâAÀÂ"], + ["5", "sgSG"], + ["6", "bdgBDG"], + ["7", "ltLT"], + ["8", "bB"], + ["9", "gbdGBD"], + ["0", "oôOÔ"], + + ["a", "aàâáäæ"], + ["A", "AÀÂÁÄÆ"], + ["à", "aàâáäæ"], + ["À", "AÀÂÁÄÆ"], + ["â", "aàâáäæ"], + ["Â", "AÀÂÁÄÆ"], + ["á", "aàâáäæ"], + ["Á", "AÀÂÁÄÆ"], + ["ä", "aàâáäæ"], + ["Ä", "AÀÂÁÄÆ"], + + ["æ", "æéa"], + ["Æ", "ÆÉA"], + + ["c", "cçskqśŝ"], + ["C", "CÇSKQŚŜ"], + ["ç", "cçskqśŝ"], + ["Ç", "CÇSKQŚŜ"], + + ["e", "eéèêëœ"], + ["E", "EÉÈÊËŒ"], + ["é", "eéèêëœ"], + ["É", "EÉÈÊËŒ"], + ["ê", "eéèêëœ"], + ["Ê", "EÉÈÊËŒ"], + ["è", "eéèêëœ"], + ["È", "EÉÈÊËŒ"], + ["ë", "eéèêëœ"], + ["Ë", "EÉÈÊËŒ"], + + ["g", "gj"], + ["G", "GJ"], + + ["i", "iîïyíìÿ"], + ["I", "IÎÏYÍÌŸ"], + ["î", "iîïyíìÿ"], + ["Î", "IÎÏYÍÌŸ"], + ["ï", "iîïyíìÿ"], + ["Ï", "IÎÏYÍÌŸ"], + ["í", "iîïyíìÿ"], + ["Í", "IÎÏYÍÌŸ"], + ["ì", "iîïyíìÿ"], + ["Ì", "IÎÏYÍÌŸ"], + + ["j", "jg"], + ["J", "JG"], + + ["k", "kcq"], + ["K", "KCQ"], + + ["n", "nñ"], + ["N", "NÑ"], + + ["o", "oôóòöœ"], + ["O", "OÔÓÒÖŒ"], + ["ô", "oôóòöœ"], + ["Ô", "OÔÓÒÖŒ"], + ["ó", "oôóòöœ"], + ["Ó", "OÔÓÒÖŒ"], + ["ò", "oôóòöœ"], + ["Ò", "OÔÓÒÖŒ"], + ["ö", "oôóòöœ"], + ["Ö", "OÔÓÒÖŒ"], + + ["œ", "œoôeéèêë"], + ["Œ", "ŒOÔEÉÈÊË"], + + ["q", "qck"], + ["Q", "QCK"], + + ["s", "sśŝcç"], + ["S", "SŚŜCÇ"], + ["ś", "sśŝcç"], + ["Ś", "SŚŜCÇ"], + ["ŝ", "sśŝcç"], + ["Ŝ", "SŚŜCÇ"], + + ["u", "uûùüú"], + ["U", "UÛÙÜÚ"], + ["û", "uûùüú"], + ["Û", "UÛÙÜÚ"], + ["ù", "uûùüú"], + ["Ù", "UÛÙÜÚ"], + ["ü", "uûùüú"], + ["Ü", "UÛÙÜÚ"], + ["ú", "uûùüú"], + ["Ú", "UÛÙÜÚ"], + + ["v", "vw"], + ["V", "VW"], + + ["w", "wv"], + ["W", "WV"], + + ["x", "xck"], + ["X", "XCK"], + + ["y", "yÿiîŷýỳ"], + ["Y", "YŸIÎŶÝỲ"], + ["ÿ", "yÿiîŷýỳ"], + ["Ÿ", "YŸIÎŶÝỲ"], + ["ŷ", "yÿiîŷýỳ"], + ["Ŷ", "YŸIÎŶÝỲ"], + ["ý", "yÿiîŷýỳ"], + ["Ý", "YŸIÎŶÝỲ"], + ["ỳ", "yÿiîŷýỳ"], + ["Ỳ", "YŸIÎŶÝỲ"], + + ["z", "zs"], + ["Z", "ZS"], + ]), + + d1toX: new Map([ + ["æ", ["ae",]], + ["Æ", ["AE",]], + ["b", ["bb",]], + ["B", ["BB",]], + ["c", ["cc", "ss", "qu", "ch"]], + ["C", ["CC", "SS", "QU", "CH"]], + ["d", ["dd",]], + ["D", ["DD",]], + ["é", ["ai", "ei"]], + ["É", ["AI", "EI"]], + ["f", ["ff", "ph"]], + ["F", ["FF", "PH"]], + ["g", ["gu", "ge", "gg", "gh"]], + ["G", ["GU", "GE", "GG", "GH"]], + ["j", ["jj", "dj"]], + ["J", ["JJ", "DJ"]], + ["k", ["qu", "ck", "ch", "cu", "kk", "kh"]], + ["K", ["QU", "CK", "CH", "CU", "KK", "KH"]], + ["l", ["ll",]], + ["L", ["LL",]], + ["m", ["mm", "mn"]], + ["M", ["MM", "MN"]], + ["n", ["nn", "nm", "mn"]], + ["N", ["NN", "NM", "MN"]], + ["o", ["au", "eau"]], + ["O", ["AU", "EAU"]], + ["œ", ["oe", "eu"]], + ["Œ", ["OE", "EU"]], + ["p", ["pp", "ph"]], + ["P", ["PP", "PH"]], + ["q", ["qu", "ch", "cq", "ck", "kk"]], + ["Q", ["QU", "CH", "CQ", "CK", "KK"]], + ["r", ["rr",]], + ["R", ["RR",]], + ["s", ["ss", "sh"]], + ["S", ["SS", "SH"]], + ["t", ["tt", "th"]], + ["T", ["TT", "TH"]], + ["x", ["cc", "ct", "xx"]], + ["X", ["CC", "CT", "XX"]], + ["z", ["ss", "zh"]], + ["Z", ["SS", "ZH"]], + ]), + + get1toXReplacement: function (cPrev, cCur, cNext) { + if (this.aConsonant.has(cCur) && (this.aConsonant.has(cPrev) || this.aConsonant.has(cNext))) { + return []; + } + return this.d1toX.gl_get(cCur, []); + }, + + d2toX: new Map([ + ["am", ["an", "en", "em"]], + ["AM", ["AN", "EN", "EM"]], + ["an", ["am", "en", "em"]], + ["AN", ["AM", "EN", "EM"]], + ["au", ["eau", "o", "ô"]], + ["AU", ["EAU", "O", "Ô"]], + ["em", ["an", "am", "en"]], + ["EM", ["AN", "AM", "EN"]], + ["en", ["an", "am", "em"]], + ["EN", ["AN", "AM", "EM"]], + ["ai", ["ei", "é", "è", "ê", "ë"]], + ["AI", ["EI", "É", "È", "Ê", "Ë"]], + ["ei", ["ai", "é", "è", "ê", "ë"]], + ["EI", ["AI", "É", "È", "Ê", "Ë"]], + ["ch", ["sh", "c", "ss"]], + ["CH", ["SH", "C", "SS"]], + ["ct", ["x", "cc"]], + ["CT", ["X", "CC"]], + ["oa", ["oi",]], + ["OA", ["OI",]], + ["oi", ["oa", "oie"]], + ["OI", ["OA", "OIE"]], + ["ph", ["f",]], + ["PH", ["F",]], + ["qu", ["q", "cq", "ck", "c", "k"]], + ["QU", ["Q", "CQ", "CK", "C", "K"]], + ["ss", ["c", "ç"]], + ["SS", ["C", "Ç"]], + ["un", ["ein",]], + ["UN", ["EIN",]], + ]), + + // End of word + dFinal1: new Map([ + ["a", ["as", "at", "ant", "ah"]], + ["A", ["AS", "AT", "ANT", "AH"]], + ["c", ["ch",]], + ["C", ["CH",]], + ["e", ["et", "er", "ets", "ée", "ez", "ai", "ais", "ait", "ent", "eh"]], + ["E", ["ET", "ER", "ETS", "ÉE", "EZ", "AI", "AIS", "AIT", "ENT", "EH"]], + ["é", ["et", "er", "ets", "ée", "ez", "ai", "ais", "ait"]], + ["É", ["ET", "ER", "ETS", "ÉE", "EZ", "AI", "AIS", "AIT"]], + ["è", ["et", "er", "ets", "ée", "ez", "ai", "ais", "ait"]], + ["È", ["ET", "ER", "ETS", "ÉE", "EZ", "AI", "AIS", "AIT"]], + ["ê", ["et", "er", "ets", "ée", "ez", "ai", "ais", "ait"]], + ["Ê", ["ET", "ER", "ETS", "ÉE", "EZ", "AI", "AIS", "AIT"]], + ["ë", ["et", "er", "ets", "ée", "ez", "ai", "ais", "ait"]], + ["Ë", ["ET", "ER", "ETS", "ÉE", "EZ", "AI", "AIS", "AIT"]], + ["g", ["gh",]], + ["G", ["GH",]], + ["i", ["is", "it", "ie", "in"]], + ["I", ["IS", "IT", "IE", "IN"]], + ["n", ["nt", "nd", "ns", "nh"]], + ["N", ["NT", "ND", "NS", "NH"]], + ["o", ["aut", "ot", "os"]], + ["O", ["AUT", "OT", "OS"]], + ["ô", ["aut", "ot", "os"]], + ["Ô", ["AUT", "OT", "OS"]], + ["ö", ["aut", "ot", "os"]], + ["Ö", ["AUT", "OT", "OS"]], + ["p", ["ph",]], + ["P", ["PH",]], + ["s", ["sh",]], + ["S", ["SH",]], + ["t", ["th",]], + ["T", ["TH",]], + ["u", ["ut", "us", "uh"]], + ["U", ["UT", "US", "UH"]], + ]), + + dFinal2: new Map([ + ["ai", ["aient", "ais", "et"]], + ["AI", ["AIENT", "AIS", "ET"]], + ["an", ["ant", "ent"]], + ["AN", ["ANT", "ENT"]], + ["en", ["ent", "ant"]], + ["EN", ["ENT", "ANT"]], + ["ei", ["ait", "ais"]], + ["EI", ["AIT", "AIS"]], + ["on", ["ons", "ont"]], + ["ON", ["ONS", "ONT"]], + ["oi", ["ois", "oit", "oix"]], + ["OI", ["OIS", "OIT", "OIX"]], + ]), + + + // Préfixes et suffixes + aPfx1: new Set([ + "anti", "archi", "contre", "hyper", "mé", "méta", "im", "in", "ir", "par", "proto", + "pseudo", "pré", "re", "ré", "sans", "sous", "supra", "sur", "ultra" + ]), + + aPfx2: new Set([ + "belgo", "franco", "génito", "gynéco", "médico", "russo" + ]), + + + cut: function (sWord) { + // returns an arry of strings (prefix, trimed_word, suffix) + let m = /^([a-zA-Zà-öÀ-Ö0-9_ø-ÿØ-ßĀ-ʯfi-st]+)(-(?:t-|)(?:ils?|elles|on|je|tu|nous|vous)$)/.exec(sWord); + if (m) { + return ["", m[1], m[2]]; + } + return ["", sWord, ""]; + }, + + // Other functions + filterSugg: function (aSugg) { + return aSugg.filter((sSugg) => { return !sSugg.endsWith("è") && !sSugg.endsWith("È"); }); + } + +} ADDED graphspell-js/helpers.js Index: graphspell-js/helpers.js ================================================================== --- /dev/null +++ graphspell-js/helpers.js @@ -0,0 +1,102 @@ + +// HELPERS +/*jslint esversion: 6*/ +/*global console,require,exports,XMLHttpRequest*/ + +"use strict"; + +// In Firefox, there is no console.log in PromiseWorker, but there is worker.log. +// In Thunderbird, you can’t access to console directly. So it’s required to pass a log function. +let funcOutput = null; + +var helpers = { + + setLogOutput: function (func) { + // probably obsolete now, as console.log seems to work everywhere (at last!) + funcOutput = func; + }, + + echo: function (obj) { + // probably obsolete now, as console.log seems to work everywhere (at last!) + if (funcOutput !== null) { + funcOutput(obj); + } else { + console.log(obj); + } + return true; + }, + + logerror: function (e, bStack=false) { + let sMsg = "\n" + e.fileName + "\n" + e.name + "\nline: " + e.lineNumber + "\n" + e.message; + if (bStack) { + sMsg += "\n--- Stack ---\n" + e.stack; + } + if (funcOutput !== null) { + funcOutput(sMsg); + } else { + console.error(sMsg); + } + }, + + inspect: function (o) { + let sMsg = "__inspect__: " + typeof o; + for (let sParam in o) { + sMsg += "\n" + sParam + ": " + o.sParam; + } + sMsg += "\n" + JSON.stringify(o) + "\n__end__"; + this.echo(sMsg); + }, + + loadFile: function (spf) { + // load ressources in workers (suggested by Mozilla extensions reviewers) + // for more options have a look here: https://gist.github.com/Noitidart/ec1e6b9a593ec7e3efed + // if not in workers, use sdk/data.load() instead + try { + let xRequest; + if (typeof XMLHttpRequest !== "undefined") { + xRequest = new XMLHttpRequest(); + } else { + // JS sucks again… necessary for Thunderbird + let { Cc, Ci } = require("chrome"); + xRequest = Cc["@mozilla.org/xmlextras/xmlhttprequest;1"].createInstance(); + xRequest.QueryInterface(Ci.nsIXMLHttpRequest); + } + xRequest.open('GET', spf, false); // 3rd arg is false for synchronous, sync is acceptable in workers + xRequest.overrideMimeType('text/json'); + xRequest.send(); + return xRequest.responseText; + } + catch (e) { + this.logerror(e); + return null; + } + }, + + // conversions + objectToMap: function (obj) { + let m = new Map(); + for (let param in obj) { + m.set(param, obj[param]); + } + return m; + }, + + mapToObject: function (m) { + let obj = {}; + for (let [k, v] of m) { + obj[k] = v; + } + return obj; + } +}; + + +if (typeof(exports) !== 'undefined') { + exports.setLogOutput = helpers.setLogOutput; + exports.echo = helpers.echo; + exports.logerror = helpers.logerror; + exports.inspect = helpers.inspect; + exports.loadFile = helpers.loadFile; + exports.objectToMap = helpers.objectToMap; + exports.mapToObject = helpers.mapToObject; +} ADDED graphspell-js/ibdawg.js Index: graphspell-js/ibdawg.js ================================================================== --- /dev/null +++ graphspell-js/ibdawg.js @@ -0,0 +1,513 @@ +//// IBDAWG +/*jslint esversion: 6*/ +/*global console,require,exports*/ + +"use strict"; + + +if (typeof(require) !== 'undefined') { + var str_transform = require("resource://grammalecte/graphspell/str_transform.js"); + var helpers = require("resource://grammalecte/graphspell/helpers.js"); + var char_player = require("resource://grammalecte/graphspell/char_player.js"); +} + + +// Don’t remove . Necessary in TB. +${string} +${map} +${set} + + +class SuggResult { + // Structure for storing, classifying and filtering suggestions + + constructor (sWord, nDistLimit=-1) { + this.sWord = sWord; + this.sSimplifiedWord = char_player.simplifyWord(sWord); + this.nDistLimit = (nDistLimit >= 0) ? nDistLimit : Math.floor(sWord.length / 3) + 1; + this.nMinDist = 1000; + this.aSugg = new Set(); + this.dSugg = new Map([ [0, []], [1, []], [2, []] ]); + } + + addSugg (sSugg, nDeep=0) { + // add a suggestion + if (!this.aSugg.has(sSugg)) { + let nDist = str_transform.distanceDamerauLevenshtein(this.sSimplifiedWord, char_player.simplifyWord(sSugg)); + if (nDist <= this.nDistLimit) { + if (!this.dSugg.has(nDist)) { + this.dSugg.set(nDist, []); + } + this.dSugg.get(nDist).push(sSugg); + this.aSugg.add(sSugg); + if (nDist < this.nMinDist) { + this.nMinDist = nDist; + } + this.nDistLimit = Math.min(this.nDistLimit, this.nMinDist+2); + } + } + } + + getSuggestions (nSuggLimit=10, nDistLimit=-1) { + // return a list of suggestions + let lRes = []; + if (this.dSugg.get(0).length) { + // we sort the better results with the original word + let dDistTemp = new Map(); + lRes.forEach((sSugg) => { dDistTemp.set(sSugg, str_transform.distanceDamerauLevenshtein(this.sWord, sSugg)); }); + lRes = lRes.sort((sA, sB) => { return dDistTemp.get(sA) - dDistTemp.get(sB); }); + dDistTemp.clear(); + } + for (let lSugg of this.dSugg.values()) { + for (let sSugg of lSugg) { lRes.push(sSugg); } + if (lRes.length > nSuggLimit) { + break; + } + } + lRes = char_player.filterSugg(lRes); + if (this.sWord.gl_isTitle()) { + lRes = lRes.map((sSugg) => { return sSugg.gl_toCapitalize(); }); + } + else if (this.sWord.gl_isUpperCase()) { + lRes = lRes.map((sSugg) => { return sSugg.toUpperCase(); }); + } + return lRes.slice(0, nSuggLimit); + } + + reset () { + this.aSugg.clear(); + this.dSugg.clear(); + } +} + + +class IBDAWG { + // INDEXABLE BINARY DIRECT ACYCLIC WORD GRAPH + + constructor (sDicName, sPath="") { + try { + let sURL = (sPath !== "") ? sPath + "/" + sDicName : "resource://grammalecte/graphspell/_dictionaries/"+sDicName; + const dict = JSON.parse(helpers.loadFile(sURL)); + Object.assign(this, dict); + } + catch (e) { + throw Error("# Error. File not found or not loadable.\n" + e.message + "\n"); + } + /* + Properties: + sName, nVersion, sHeader, lArcVal, nArcVal, byDic, sLang, nChar, nBytesArc, nBytesNodeAddress, + nEntries, nNode, nArc, nAff, cStemming, nTag, dChar, _arcMask, _finalNodeMask, _lastArcMask, _addrBitMask, nBytesOffset, + */ + + /* + Bug workaround. + Mozilla’s JS parser sucks. Can’t read file bigger than 4 Mb! + So we convert huge hexadecimal string to list of numbers… + https://github.com/mozilla/addons-linter/issues/1361 + */ + let lTemp = []; + for (let i = 0; i < this.byDic.length; i+=2) { + lTemp.push(parseInt(this.byDic.slice(i, i+2), 16)); + } + this.byDic = lTemp; + /* end of bug workaround */ + + if (!this.sHeader.startsWith("/pyfsa/")) { + throw TypeError("# Error. Not a pyfsa binary dictionary. Header: " + this.sHeader); + } + if (!(this.nVersion == "1" || this.nVersion == "2" || this.nVersion == "3")) { + throw RangeError("# Error. Unknown dictionary version: " + this.nVersion); + } + // to get the value of an arc, to get the char of an arc with its value + this.dChar = helpers.objectToMap(this.dChar); + this.dCharVal = this.dChar.gl_reverse(); + //this.byDic = new Uint8Array(this.byDic); // not quicker, even slower + + if (this.cStemming == "S") { + this.funcStemming = str_transform.getStemFromSuffixCode; + } else if (this.cStemming == "A") { + this.funcStemming = str_transform.getStemFromAffixCode; + } else { + this.funcStemming = str_transform.noStemming; + } + + // Configuring DAWG functions according to nVersion + switch (this.nVersion) { + case 1: + this.morph = this._morph1; + this.stem = this._stem1; + this._lookupArcNode = this._lookupArcNode1; + this._getArcs = this._getArcs1; + this._writeNodes = this._writeNodes1; + break; + case 2: + this.morph = this._morph2; + this.stem = this._stem2; + this._lookupArcNode = this._lookupArcNode2; + this._getArcs = this._getArcs2; + this._writeNodes = this._writeNodes2; + break; + case 3: + this.morph = this._morph3; + this.stem = this._stem3; + this._lookupArcNode = this._lookupArcNode3; + this._getArcs = this._getArcs3; + this._writeNodes = this._writeNodes3; + break; + default: + throw ValueError("# Error: unknown code: " + this.nVersion); + } + //console.log(this.getInfo()); + this.bOptNumSigle = true; + this.bOptNumAtLast = false; + } + + getInfo () { + return ` Language: ${this.sLang} Version: ${this.nVersion} Stemming: ${this.cStemming}FX\n` + + ` Arcs values: ${this.nArcVal} = ${this.nChar} characters, ${this.nAff} affixes, ${this.nTag} tags\n` + + ` Dictionary: ${this.nEntries} entries, ${this.nNode} nodes, ${this.nArc} arcs\n` + + ` Address size: ${this.nBytesNodeAddress} bytes, Arc size: ${this.nBytesArc} bytes\n`; + } + + isValidToken (sToken) { + // checks if sToken is valid (if there is hyphens in sToken, sToken is split, each part is checked) + if (this.isValid(sToken)) { + return true; + } + if (sToken.includes("-")) { + if (sToken.gl_count("-") > 4) { + return true; + } + return sToken.split("-").every(sWord => this.isValid(sWord)); + } + return false; + } + + isValid (sWord) { + // checks if sWord is valid (different casing tested if the first letter is a capital) + if (!sWord) { + return null; + } + if (sWord.includes("’")) { // ugly hack + sWord = sWord.replace("’", "'"); + } + if (this.lookup(sWord)) { + return true; + } + if (sWord.charAt(0).gl_isUpperCase()) { + if (sWord.length > 1) { + if (sWord.gl_isTitle()) { + return !!this.lookup(sWord.toLowerCase()); + } + if (sWord.gl_isUpperCase()) { + if (this.bOptNumSigle) { + return true; + } + return !!(this.lookup(sWord.toLowerCase()) || this.lookup(sWord.gl_toCapitalize())); + } + return !!this.lookup(sWord.slice(0, 1).toLowerCase() + sWord.slice(1)); + } else { + return !!this.lookup(sWord.toLowerCase()); + } + } + return false; + } + + _convBytesToInteger (aBytes) { + // Byte order = Big Endian (bigger first) + let nVal = 0; + let nWeight = (aBytes.length - 1) * 8; + for (let n of aBytes) { + nVal += n << nWeight; + nWeight = nWeight - 8; + } + return nVal; + } + + lookup (sWord) { + // returns true if sWord in dictionary (strict verification) + let iAddr = 0; + for (let c of sWord) { + if (!this.dChar.has(c)) { + return false; + } + iAddr = this._lookupArcNode(this.dChar.get(c), iAddr); + if (iAddr === null) { + return false; + } + } + return Boolean(this._convBytesToInteger(this.byDic.slice(iAddr, iAddr+this.nBytesArc)) & this._finalNodeMask); + } + + getMorph (sWord) { + // retrieves morphologies list, different casing allowed + let l = this.morph(sWord); + if (sWord[0].gl_isUpperCase()) { + l = l.concat(this.morph(sWord.toLowerCase())); + if (sWord.gl_isUpperCase() && sWord.length > 1) { + l = l.concat(this.morph(sWord.gl_toCapitalize())); + } + } + return l; + } + + suggest (sWord, nSuggLimit=10) { + // returns a array of suggestions for + let sPfx = ""; + let sSfx = ""; + [sPfx, sWord, sSfx] = char_player.cut(sWord); + let nMaxSwitch = Math.max(Math.floor(sWord.length / 3), 1); + let nMaxDel = Math.floor(sWord.length / 5); + let nMaxHardRepl = Math.max(Math.floor((sWord.length - 5) / 4), 1); + let oSuggResult = new SuggResult(sWord); + this._suggest(oSuggResult, sWord, nMaxSwitch, nMaxDel, nMaxHardRepl); + if (sWord.gl_isTitle()) { + this._suggest(oSuggResult, sWord.toLowerCase(), nMaxSwitch, nMaxDel, nMaxHardRepl); + } + else if (sWord.gl_isLowerCase()) { + this._suggest(oSuggResult, sWord.gl_toCapitalize(), nMaxSwitch, nMaxDel, nMaxHardRepl); + } + let aSugg = oSuggResult.getSuggestions(nSuggLimit); + if (sSfx || sPfx) { + // we add what we removed + return aSugg.map( (sSugg) => { return sPfx + sSugg + sSfx } ); + } + return aSugg; + } + + _suggest (oSuggResult, sRemain, nMaxSwitch=0, nMaxDel=0, nMaxHardRepl=0, nDeep=0, iAddr=0, sNewWord="", bAvoidLoop=false) { + // returns a set of suggestions + // recursive function + if (sRemain == "") { + if (this._convBytesToInteger(this.byDic.slice(iAddr, iAddr+this.nBytesArc)) & this._finalNodeMask) { + oSuggResult.addSugg(sNewWord); + } + for (let sTail of this._getTails(iAddr)) { + oSuggResult.addSugg(sNewWord+sTail); + } + return; + } + let cCurrent = sRemain.slice(0, 1); + for (let [cChar, jAddr] of this._getCharArcs(iAddr)) { + if (char_player.d1to1.gl_get(cCurrent, cCurrent).indexOf(cChar) != -1) { + this._suggest(oSuggResult, sRemain.slice(1), nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, jAddr, sNewWord+cChar); + } + else if (!bAvoidLoop && nMaxHardRepl) { + this._suggest(oSuggResult, sRemain.slice(1), nMaxSwitch, nMaxDel, nMaxHardRepl-1, nDeep+1, jAddr, sNewWord+cChar, true); + } + } + if (!bAvoidLoop) { // avoid infinite loop + if (sRemain.length > 1) { + if (cCurrent == sRemain.slice(1, 2)) { + // same char, we remove 1 char without adding 1 to + this._suggest(oSuggResult, sRemain.slice(1), nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord); + } + else { + // switching chars + if (nMaxSwitch > 0) { + this._suggest(oSuggResult, sRemain.slice(1, 2)+sRemain.slice(0, 1)+sRemain.slice(2), nMaxSwitch-1, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, true); + } + // delete char + if (nMaxDel > 0) { + this._suggest(oSuggResult, sRemain.slice(1), nMaxSwitch, nMaxDel-1, nMaxHardRepl, nDeep+1, iAddr, sNewWord, true); + } + } + // Phonetic replacements + for (let sRepl of char_player.get1toXReplacement(sNewWord.slice(-1), cCurrent, sRemain.slice(1,2))) { + this._suggest(oSuggResult, sRepl + sRemain.slice(1), nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, true); + } + for (let sRepl of char_player.d2toX.gl_get(sRemain.slice(0, 2), [])) { + this._suggest(oSuggResult, sRepl + sRemain.slice(2), nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, true); + } + } + // end of word + if (sRemain.length == 2) { + for (let sRepl of char_player.dFinal2.gl_get(sRemain, [])) { + this._suggest(oSuggResult, sRepl, nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, true); + } + } + else if (sRemain.length == 1) { + this._suggest(oSuggResult, "", nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, true); // remove last char and go on + for (let sRepl of char_player.dFinal1.gl_get(sRemain, [])) { + this._suggest(oSuggResult, sRepl, nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, true); + } + } + } + } + + * _getCharArcs (iAddr) { + // generator: yield all chars and addresses from node at address + for (let [nVal, jAddr] of this._getArcs(iAddr)) { + if (nVal < this.nChar) { + yield [this.dCharVal.get(nVal), jAddr]; + } + } + } + + * _getSimilarCharArcs (cChar, iAddr) { + // generator: yield similar char of and address of the following node + for (let c of char_player.d1to1.gl_get(cChar, [cChar])) { + if (this.dChar.has(c)) { + let jAddr = this._lookupArcNode(this.dChar.get(c), iAddr); + if (jAddr) { + yield [c, jAddr]; + } + } + } + } + + _getTails (iAddr, sTail="", n=2) { + // return a list of suffixes ending at a distance of from + let aTails = new Set(); + for (let [nVal, jAddr] of this._getArcs(iAddr)) { + if (nVal < this.nChar) { + if (this._convBytesToInteger(this.byDic.slice(jAddr, jAddr+this.nBytesArc)) & this._finalNodeMask) { + aTails.add(sTail + this.dCharVal.get(nVal)); + } + if (n && aTails.size == 0) { + aTails.gl_update(this._getTails(jAddr, sTail+this.dCharVal.get(nVal), n-1)); + } + } + } + return aTails; + } + + // morph (sWord) { + // is defined in constructor + // } + + // VERSION 1 + _morph1 (sWord) { + // returns morphologies of sWord + let iAddr = 0; + for (let c of sWord) { + if (!this.dChar.has(c)) { + return []; + } + iAddr = this._lookupArcNode(this.dChar.get(c), iAddr); + if (iAddr === null) { + return []; + } + } + if (this._convBytesToInteger(this.byDic.slice(iAddr, iAddr+this.nBytesArc)) & this._finalNodeMask) { + let l = []; + let nRawArc = 0; + while (!(nRawArc & this._lastArcMask)) { + let iEndArcAddr = iAddr + this.nBytesArc; + nRawArc = this._convBytesToInteger(this.byDic.slice(iAddr, iEndArcAddr)); + let nArc = nRawArc & this._arcMask; + if (nArc >= this.nChar) { + // This value is not a char, this is a stemming code + let sStem = ">" + this.funcStemming(sWord, this.lArcVal[nArc]); + // Now , we go to the next node and retrieve all following arcs values, all of them are tags + let iAddr2 = this._convBytesToInteger(this.byDic.slice(iEndArcAddr, iEndArcAddr+this.nBytesNodeAddress)); + let nRawArc2 = 0; + while (!(nRawArc2 & this._lastArcMask)) { + let iEndArcAddr2 = iAddr2 + this.nBytesArc; + nRawArc2 = this._convBytesToInteger(this.byDic.slice(iAddr2, iEndArcAddr2)); + l.push(sStem + " " + this.lArcVal[nRawArc2 & this._arcMask]); + iAddr2 = iEndArcAddr2+this.nBytesNodeAddress; + } + } + iAddr = iEndArcAddr + this.nBytesNodeAddress; + } + return l; + } + return []; + } + + _stem1 (sWord) { + // returns stems list of sWord + let iAddr = 0; + for (let c of sWord) { + if (!this.dChar.has(c)) { + return []; + } + iAddr = this._lookupArcNode(this.dChar.get(c), iAddr); + if (iAddr === null) { + return []; + } + } + if (this._convBytesToInteger(this.byDic.slice(iAddr, iAddr+this.nBytesArc)) & this._finalNodeMask) { + let l = []; + let nRawArc = 0; + while (!(nRawArc & this._lastArcMask)) { + let iEndArcAddr = iAddr + this.nBytesArc; + nRawArc = this._convBytesToInteger(this.byDic.slice(iAddr, iEndArcAddr)); + let nArc = nRawArc & this._arcMask; + if (nArc >= this.nChar) { + // This value is not a char, this is a stemming code + l.push(this.funcStemming(sWord, this.lArcVal[nArc])); + } + iAddr = iEndArcAddr + this.nBytesNodeAddress; + } + return l; + } + return []; + } + + _lookupArcNode1 (nVal, iAddr) { + // looks if nVal is an arc at the node at iAddr, if yes, returns address of next node else None + while (true) { + let iEndArcAddr = iAddr+this.nBytesArc; + let nRawArc = this._convBytesToInteger(this.byDic.slice(iAddr, iEndArcAddr)); + if (nVal == (nRawArc & this._arcMask)) { + // the value we are looking for + // we return the address of the next node + return this._convBytesToInteger(this.byDic.slice(iEndArcAddr, iEndArcAddr+this.nBytesNodeAddress)); + } + else { + // value not found + if (nRawArc & this._lastArcMask) { + return null; + } + iAddr = iEndArcAddr + this.nBytesNodeAddress; + } + } + } + + * _getArcs1 (iAddr) { + "generator: return all arcs at as tuples of (nVal, iAddr)" + while (true) { + let iEndArcAddr = iAddr+this.nBytesArc; + let nRawArc = this._convBytesToInteger(this.byDic.slice(iAddr, iEndArcAddr)); + yield [nRawArc & this._arcMask, this._convBytesToInteger(this.byDic.slice(iEndArcAddr, iEndArcAddr+this.nBytesNodeAddress))]; + if (nRawArc & this._lastArcMask) { + break; + } + iAddr = iEndArcAddr+this.nBytesNodeAddress; + } + } + + // VERSION 2 + _morph2 (sWord) { + // to do + } + + _stem2 (sWord) { + // to do + } + + _lookupArcNode2 (nVal, iAddr) { + // to do + } + + + // VERSION 3 + _morph3 (sWord) { + // to do + } + + _stem3 (sWord) { + // to do + } + + _lookupArcNode3 (nVal, iAddr) { + // to do + } +} + + +if (typeof(exports) !== 'undefined') { + exports.IBDAWG = IBDAWG; +} ADDED graphspell-js/str_transform.js Index: graphspell-js/str_transform.js ================================================================== --- /dev/null +++ graphspell-js/str_transform.js @@ -0,0 +1,121 @@ +//// STRING TRANSFORMATION +/*jslint esversion: 6*/ + +// Note: 48 is the ASCII code for "0" + +var str_transform = { + + distanceDamerauLevenshtein2: function (s1, s2) { + // distance of Damerau-Levenshtein between and + // https://fr.wikipedia.org/wiki/Distance_de_Damerau-Levenshtein + try { + let nLen1 = s1.length; + let nLen2 = s2.length; + let matrix = []; + for (let i = 0; i <= nLen1; i++) { + matrix[i] = new Array(nLen2 + 1); + } + for (let i = 0; i <= nLen1; i++) { + matrix[i][0] = i; + } + for (let j = 0; j <= nLen2; j++) { + matrix[0][j] = j; + } + for (let i = 1; i <= nLen1; i++) { + for (let j = 1; j <= nLen2; j++) { + let nCost = (s1[i] === s2[j]) ? 0 : 1; + matrix[i][j] = Math.min( + matrix[i-1][j] + 1, // Deletion + matrix[i][j-1] + 1, // Insertion + matrix[i-1][j-1] + nCost // Substitution + ); + if (i > 1 && j > 1 && s1[i] == s2[j-1] && s1[i-1] == s2[j]) { + matrix[i][j] = Math.min(matrix[i][j], matrix[i-2][j-2] + nCost); // Transposition + } + } + } + return matrix[nLen1][nLen2]; + } + catch (e) { + helpers.logerror(e); + } + }, + + distanceDamerauLevenshtein: function (s1, s2) { + // distance of Damerau-Levenshtein between and + // https://fr.wikipedia.org/wiki/Distance_de_Damerau-Levenshtein + try { + let nLen1 = s1.length; + let nLen2 = s2.length; + let INF = nLen1 + nLen2; + let matrix = []; + let sd = {}; + for (let i = 0; i < nLen1+2; i++) { + matrix[i] = new Array(nLen2+2); + } + matrix[0][0] = INF; + for (let i = 0; i <= nLen1; i++) { + matrix[i+1][1] = i; + matrix[i+1][0] = INF; + sd[s1[i]] = 0; + } + for (let j = 0; j <= nLen2; j++) { + matrix[1][j+1] = j; + matrix[0][j+1] = INF; + sd[s2[j]] = 0; + } + + for (let i = 1; i <= nLen1; i++) { + let DB = 0; + for (let j = 1; j <= nLen2; j++) { + let i1 = sd[s2[j-1]]; + let j1 = DB; + if (s1[i-1] === s2[j-1]) { + matrix[i+1][j+1] = matrix[i][j]; + DB = j; + } + else { + matrix[i+1][j+1] = Math.min(matrix[i][j], Math.min(matrix[i+1][j], matrix[i][j+1])) + 1; + } + matrix[i+1][j+1] = Math.min(matrix[i+1][j+1], matrix[i1] ? matrix[i1][j1] + (i-i1-1) + 1 + (j-j1-1) : Infinity); + } + sd[s1[i-1]] = i; + } + return matrix[nLen1+1][nLen2+1]; + } + catch (e) { + helpers.logerror(e); + } + }, + + showDistance (s1, s2) { + console.log(`Distance: ${s1} / ${s2} = ${this.distanceDamerauLevenshtein(s1, s2)})`); + }, + + getStemFromSuffixCode: function (sFlex, sSfxCode) { + // Suffix only + if (sSfxCode == "0") { + return sFlex; + } + return sSfxCode[0] == '0' ? sFlex + sSfxCode.slice(1) : sFlex.slice(0, -(sSfxCode.charCodeAt(0)-48)) + sSfxCode.slice(1); + }, + + getStemFromAffixCode: function (sFlex, sAffCode) { + // Prefix and suffix + if (sAffCode == "0") { + return sFlex; + } + if (!sAffCode.includes("/")) { + return "# error #"; + } + let [sPfxCode, sSfxCode] = sAffCode.split('/'); + sFlex = sPfxCode.slice(1) + sFlex.slice(sPfxCode.charCodeAt(0)-48); + return sSfxCode[0] == '0' ? sFlex + sSfxCode.slice(1) : sFlex.slice(0, -(sSfxCode.charCodeAt(0)-48)) + sSfxCode.slice(1); + } +}; + + +if (typeof(exports) !== 'undefined') { + exports.getStemFromSuffixCode = str_transform.getStemFromSuffixCode; + exports.getStemFromAffixCode = str_transform.getStemFromAffixCode; +} ADDED graphspell-js/tokenizer.js Index: graphspell-js/tokenizer.js ================================================================== --- /dev/null +++ graphspell-js/tokenizer.js @@ -0,0 +1,105 @@ +// JavaScript +// Very simple tokenizer +/*jslint esversion: 6*/ +/*global require,exports*/ + +"use strict"; + + +if (typeof(require) !== 'undefined') { + var helpers = require("resource://grammalecte/graphspell/helpers.js"); +} + + +const aTkzPatterns = { + // All regexps must start with ^. + "default": + [ + [/^[   \t]+/, 'SPACE'], + [/^\/(?:~|bin|boot|dev|etc|home|lib|mnt|opt|root|sbin|tmp|usr|var|Bureau|Documents|Images|Musique|Public|Téléchargements|Vidéos)(?:\/[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_.()-]+)*/, 'FOLDERUNIX'], + [/^[a-zA-Z]:\\(?:Program Files(?: \(x86\)|)|[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st.()]+)(?:\\[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_.()-]+)*/, 'FOLDERWIN'], + [/^[,.;:!?…«»“”‘’"(){}\[\]/·–—]+/, 'SEPARATOR'], + [/^[A-Z][.][A-Z][.](?:[A-Z][.])*/, 'ACRONYM'], + [/^(?:https?:\/\/|www[.]|[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_-]+[@.][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_-]{2,}[@.])[a-zA-Z0-9][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_.\/?&!%=+*"'@$#-]+/, 'LINK'], + [/^[#@][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_-]+/, 'TAG'], + [/^<[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+.*?>|<\/[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+ *>/, 'HTML'], + [/^\[\/?[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+\]/, 'PSEUDOHTML'], + [/^&\w+;(?:\w+;|)/, 'HTMLENTITY'], + [/^\d\d?h\d\d\b/, 'HOUR'], + [/^-?\d+(?:[.,]\d+|)/, 'NUM'], + [/^[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+(?:[’'`-][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+)*/, 'WORD'] + ], + "fr": + [ + [/^[   \t]+/, 'SPACE'], + [/^\/(?:~|bin|boot|dev|etc|home|lib|mnt|opt|root|sbin|tmp|usr|var|Bureau|Documents|Images|Musique|Public|Téléchargements|Vidéos)(?:\/[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_.()-]+)*/, 'FOLDERUNIX'], + [/^[a-zA-Z]:\\(?:Program Files(?: \(x86\)|)|[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st.()]+)(?:\\[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_.()-]+)*/, 'FOLDERWIN'], + [/^[,.;:!?…«»“”‘’"(){}\[\]/·–—]+/, 'SEPARATOR'], + [/^[A-Z][.][A-Z][.](?:[A-Z][.])*/, 'ACRONYM'], + [/^(?:https?:\/\/|www[.]|[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_-]+[@.][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_-]{2,}[@.])[a-zA-Z0-9][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_.\/?&!%=+*"'@$#-]+/, 'LINK'], + [/^[#@][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_-]+/, 'TAG'], + [/^<[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+.*?>|<\/[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+ *>/, 'HTML'], + [/^\[\/?[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+\]/, 'PSEUDOHTML'], + [/^&\w+;(?:\w+;|)/, 'HTMLENTITY'], + [/^(?:l|d|n|m|t|s|j|c|ç|lorsqu|puisqu|jusqu|quoiqu|qu)['’`]/i, 'ELPFX'], + [/^\d\d?[hm]\d\d\b/, 'HOUR'], + [/^\d+(?:er|nd|e|de|ième|ème|eme)s?\b/, 'ORDINAL'], + [/^-?\d+(?:[.,]\d+|)/, 'NUM'], + [/^[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+(?:[’'`-][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+)*/, 'WORD'] + ] +}; + + +class Tokenizer { + + constructor (sLang) { + this.sLang = sLang; + if (!aTkzPatterns.hasOwnProperty(sLang)) { + this.sLang = "default"; + } + this.aRules = aTkzPatterns[this.sLang]; + } + + * genTokens (sText) { + let m; + let i = 0; + while (sText) { + let nCut = 1; + for (let [zRegex, sType] of this.aRules) { + try { + if ((m = zRegex.exec(sText)) !== null) { + if (sType == 'SEPARATOR') { + for (let c of m[0]) { + yield { "sType": sType, "sValue": c, "nStart": i, "nEnd": i + m[0].length } + } + } else { + yield { "sType": sType, "sValue": m[0], "nStart": i, "nEnd": i + m[0].length } + } + nCut = m[0].length; + break; + } + } + catch (e) { + helpers.logerror(e); + } + } + i += nCut; + sText = sText.slice(nCut); + } + } + + getSpellingErrors (sText, oDict) { + let aSpellErr = []; + for (let oToken of this.genTokens(sText)) { + if (oToken.sType === 'WORD' && !oDict.isValidToken(oToken.sValue)) { + aSpellErr.push(oToken); + } + } + return aSpellErr; + } +} + + +if (typeof(exports) !== 'undefined') { + exports.Tokenizer = Tokenizer; +} ADDED graphspell/char_player.py Index: graphspell/char_player.py ================================================================== --- /dev/null +++ graphspell/char_player.py @@ -0,0 +1,324 @@ +# list of similar chars +# useful for suggestion mechanism + +import re + + +_xTransChars = str.maketrans({ + 'à': 'a', 'é': 'e', 'î': 'i', 'ô': 'o', 'û': 'u', 'ÿ': 'i', "y": "i", + 'â': 'a', 'è': 'e', 'ï': 'i', 'ö': 'o', 'ù': 'u', 'ŷ': 'i', + 'ä': 'a', 'ê': 'e', 'í': 'i', 'ó': 'o', 'ü': 'u', 'ý': 'i', + 'á': 'a', 'ë': 'e', 'ì': 'i', 'ò': 'o', 'ú': 'u', 'ỳ': 'i', + 'ā': 'a', 'ē': 'e', 'ī': 'i', 'ō': 'o', 'ū': 'u', 'ȳ': 'i', + 'ñ': 'n', 'k': 'q', 'w': 'v', + 'œ': 'oe', 'æ': 'ae', +}) + +def simplifyWord (sWord): + "word simplication before calculating distance between words" + sWord = sWord.lower().translate(_xTransChars) + sNewWord = "" + for i, c in enumerate(sWord, 1): + if c != sWord[i:i+1]: + sNewWord += c + return sNewWord.replace("eau", "o").replace("au", "o").replace("ai", "e").replace("ei", "e").replace("ph", "f") + + +aVowel = set("aáàâäāeéèêëēiíìîïīoóòôöōuúùûüūyýỳŷÿȳœæAÁÀÂÄĀEÉÈÊËĒIÍÌÎÏĪOÓÒÔÖŌUÚÙÛÜŪYÝỲŶŸȲŒÆ") +aConsonant = set("bcçdfghjklmnñpqrstvwxzBCÇDFGHJKLMNÑPQRSTVWXZ") +aDouble = set("bcdfjklmnprstzBCDFJKLMNPRSTZ") # letters that may be used twice successively + + +# Similar chars + +d1to1 = { + "1": "liîLIÎ", + "2": "zZ", + "3": "eéèêEÉÈÊ", + "4": "aàâAÀÂ", + "5": "sgSG", + "6": "bdgBDG", + "7": "ltLT", + "8": "bB", + "9": "gbdGBD", + "0": "oôOÔ", + + "a": "aàâáäæ", + "A": "AÀÂÁÄÆ", + "à": "aàâáäæ", + "À": "AÀÂÁÄÆ", + "â": "aàâáäæ", + "Â": "AÀÂÁÄÆ", + "á": "aàâáäæ", + "Á": "AÀÂÁÄÆ", + "ä": "aàâáäæ", + "Ä": "AÀÂÁÄÆ", + + "æ": "æéa", + "Æ": "ÆÉA", + + "c": "cçskqśŝ", + "C": "CÇSKQŚŜ", + "ç": "cçskqśŝ", + "Ç": "CÇSKQŚŜ", + + "e": "eéèêëœ", + "E": "EÉÈÊËŒ", + "é": "eéèêëœ", + "É": "EÉÈÊËŒ", + "ê": "eéèêëœ", + "Ê": "EÉÈÊËŒ", + "è": "eéèêëœ", + "È": "EÉÈÊËŒ", + "ë": "eéèêëœ", + "Ë": "EÉÈÊËŒ", + + "g": "gj", + "G": "GJ", + + "i": "iîïyíìÿ", + "I": "IÎÏYÍÌŸ", + "î": "iîïyíìÿ", + "Î": "IÎÏYÍÌŸ", + "ï": "iîïyíìÿ", + "Ï": "IÎÏYÍÌŸ", + "í": "iîïyíìÿ", + "Í": "IÎÏYÍÌŸ", + "ì": "iîïyíìÿ", + "Ì": "IÎÏYÍÌŸ", + + "j": "jg", + "J": "JG", + + "k": "kcq", + "K": "KCQ", + + "n": "nñ", + "N": "NÑ", + + "o": "oôóòöœ", + "O": "OÔÓÒÖŒ", + "ô": "oôóòöœ", + "Ô": "OÔÓÒÖŒ", + "ó": "oôóòöœ", + "Ó": "OÔÓÒÖŒ", + "ò": "oôóòöœ", + "Ò": "OÔÓÒÖŒ", + "ö": "oôóòöœ", + "Ö": "OÔÓÒÖŒ", + + "œ": "œoôeéèêë", + "Œ": "ŒOÔEÉÈÊË", + + "q": "qck", + "Q": "QCK", + + "s": "sśŝcç", + "S": "SŚŜCÇ", + "ś": "sśŝcç", + "Ś": "SŚŜCÇ", + "ŝ": "sśŝcç", + "Ŝ": "SŚŜCÇ", + + "u": "uûùüú", + "U": "UÛÙÜÚ", + "û": "uûùüú", + "Û": "UÛÙÜÚ", + "ù": "uûùüú", + "Ù": "UÛÙÜÚ", + "ü": "uûùüú", + "Ü": "UÛÙÜÚ", + "ú": "uûùüú", + "Ú": "UÛÙÜÚ", + + "v": "vw", + "V": "VW", + + "w": "wv", + "W": "WV", + + "x": "xck", + "X": "XCK", + + "y": "yÿiîŷýỳ", + "Y": "YŸIÎŶÝỲ", + "ÿ": "yÿiîŷýỳ", + "Ÿ": "YŸIÎŶÝỲ", + "ŷ": "yÿiîŷýỳ", + "Ŷ": "YŸIÎŶÝỲ", + "ý": "yÿiîŷýỳ", + "Ý": "YŸIÎŶÝỲ", + "ỳ": "yÿiîŷýỳ", + "Ỳ": "YŸIÎŶÝỲ", + + "z": "zs", + "Z": "ZS", +} + +d1toX = { + "æ": ("ae",), + "Æ": ("AE",), + "b": ("bb",), + "B": ("BB",), + "c": ("cc", "ss", "qu", "ch"), + "C": ("CC", "SS", "QU", "CH"), + "d": ("dd",), + "D": ("DD",), + "é": ("ai", "ei"), + "É": ("AI", "EI"), + "f": ("ff", "ph"), + "F": ("FF", "PH"), + "g": ("gu", "ge", "gg", "gh"), + "G": ("GU", "GE", "GG", "GH"), + "j": ("jj", "dj"), + "J": ("JJ", "DJ"), + "k": ("qu", "ck", "ch", "cu", "kk", "kh"), + "K": ("QU", "CK", "CH", "CU", "KK", "KH"), + "l": ("ll",), + "L": ("LL",), + "m": ("mm", "mn"), + "M": ("MM", "MN"), + "n": ("nn", "nm", "mn"), + "N": ("NN", "NM", "MN"), + "o": ("au", "eau"), + "O": ("AU", "EAU"), + "œ": ("oe", "eu"), + "Œ": ("OE", "EU"), + "p": ("pp", "ph"), + "P": ("PP", "PH"), + "q": ("qu", "ch", "cq", "ck", "kk"), + "Q": ("QU", "CH", "CQ", "CK", "KK"), + "r": ("rr",), + "R": ("RR",), + "s": ("ss", "sh"), + "S": ("SS", "SH"), + "t": ("tt", "th"), + "T": ("TT", "TH"), + "x": ("cc", "ct", "xx"), + "X": ("CC", "CT", "XX"), + "z": ("ss", "zh"), + "Z": ("SS", "ZH"), +} + + +def get1toXReplacement (cPrev, cCur, cNext): + if cCur in aConsonant and (cPrev in aConsonant or cNext in aConsonant): + return () + return d1toX.get(cCur, ()) + + +d2toX = { + "am": ("an", "en", "em"), + "AM": ("AN", "EN", "EM"), + "an": ("am", "en", "em"), + "AN": ("AM", "EN", "EM"), + "au": ("eau", "o", "ô"), + "AU": ("EAU", "O", "Ô"), + "em": ("an", "am", "en"), + "EM": ("AN", "AM", "EN"), + "en": ("an", "am", "em"), + "EN": ("AN", "AM", "EM"), + "ai": ("ei", "é", "è", "ê", "ë"), + "AI": ("EI", "É", "È", "Ê", "Ë"), + "ei": ("ai", "é", "è", "ê", "ë"), + "EI": ("AI", "É", "È", "Ê", "Ë"), + "ch": ("sh", "c", "ss"), + "CH": ("SH", "C", "SS"), + "ct": ("x", "cc"), + "CT": ("X", "CC"), + "oa": ("oi",), + "OA": ("OI",), + "oi": ("oa", "oie"), + "OI": ("OA", "OIE"), + "ph": ("f",), + "PH": ("F",), + "qu": ("q", "cq", "ck", "c", "k"), + "QU": ("Q", "CQ", "CK", "C", "K"), + "ss": ("c", "ç"), + "SS": ("C", "Ç"), + "un": ("ein",), + "UN": ("EIN",), +} + + +# End of word + +dFinal1 = { + "a": ("as", "at", "ant", "ah"), + "A": ("AS", "AT", "ANT", "AH"), + "c": ("ch",), + "C": ("CH",), + "e": ("et", "er", "ets", "ée", "ez", "ai", "ais", "ait", "ent", "eh"), + "E": ("ET", "ER", "ETS", "ÉE", "EZ", "AI", "AIS", "AIT", "ENT", "EH"), + "é": ("et", "er", "ets", "ée", "ez", "ai", "ais", "ait"), + "É": ("ET", "ER", "ETS", "ÉE", "EZ", "AI", "AIS", "AIT"), + "è": ("et", "er", "ets", "ée", "ez", "ai", "ais", "ait"), + "È": ("ET", "ER", "ETS", "ÉE", "EZ", "AI", "AIS", "AIT"), + "ê": ("et", "er", "ets", "ée", "ez", "ai", "ais", "ait"), + "Ê": ("ET", "ER", "ETS", "ÉE", "EZ", "AI", "AIS", "AIT"), + "ë": ("et", "er", "ets", "ée", "ez", "ai", "ais", "ait"), + "Ë": ("ET", "ER", "ETS", "ÉE", "EZ", "AI", "AIS", "AIT"), + "g": ("gh",), + "G": ("GH",), + "i": ("is", "it", "ie", "in"), + "I": ("IS", "IT", "IE", "IN"), + "n": ("nt", "nd", "ns", "nh"), + "N": ("NT", "ND", "NS", "NH"), + "o": ("aut", "ot", "os"), + "O": ("AUT", "OT", "OS"), + "ô": ("aut", "ot", "os"), + "Ô": ("AUT", "OT", "OS"), + "ö": ("aut", "ot", "os"), + "Ö": ("AUT", "OT", "OS"), + "p": ("ph",), + "P": ("PH",), + "s": ("sh",), + "S": ("SH",), + "t": ("th",), + "T": ("TH",), + "u": ("ut", "us", "uh"), + "U": ("UT", "US", "UH"), +} + +dFinal2 = { + "ai": ("aient", "ais", "et"), + "AI": ("AIENT", "AIS", "ET"), + "an": ("ant", "ent"), + "AN": ("ANT", "ENT"), + "en": ("ent", "ant"), + "EN": ("ENT", "ANT"), + "ei": ("ait", "ais"), + "EI": ("AIT", "AIS"), + "on": ("ons", "ont"), + "ON": ("ONS", "ONT"), + "oi": ("ois", "oit", "oix"), + "OI": ("OIS", "OIT", "OIX"), +} + + +# Préfixes et suffixes + +aPfx1 = frozenset([ + "anti", "archi", "contre", "hyper", "mé", "méta", "im", "in", "ir", "par", "proto", + "pseudo", "pré", "re", "ré", "sans", "sous", "supra", "sur", "ultra" +]) +aPfx2 = frozenset([ + "belgo", "franco", "génito", "gynéco", "médico", "russo" +]) + + +_zMotAvecPronom = re.compile("^(?i)(\\w+)(-(?:t-|)(?:ils?|elles?|on|je|tu|nous|vous))$") + +def cut (sWord): + "returns a tuple of strings (prefix, trimed_word, suffix)" + m = _zMotAvecPronom.search(sWord) + if m: + return ("", m.group(1), m.group(2)) + return ("", sWord, "") + + +# Other functions + +def filterSugg (aSugg): + "exclude suggestions" + return filter(lambda sSugg: not sSugg.endswith(("è", "È")), aSugg) ADDED graphspell/dawg.py Index: graphspell/dawg.py ================================================================== --- /dev/null +++ graphspell/dawg.py @@ -0,0 +1,775 @@ +#!python3 + +# FSA DICTIONARY BUILDER +# +# by Olivier R. +# License: MPL 2 +# +# This tool encodes lexicon into an indexable binary dictionary +# Input files MUST be encoded in UTF-8. + + +import sys +import os +import collections + +from . import str_transform as st +from .progressbar import ProgressBar + + + +def readFile (spf): + print(" < Read lexicon: " + spf) + if os.path.isfile(spf): + with open(spf, "r", encoding="utf-8") as hSrc: + for sLine in hSrc: + sLine = sLine.strip() + if sLine and not sLine.startswith("#"): + yield sLine + else: + raise OSError("# Error. File not found or not loadable: " + spf) + + +def getElemsFromFile (spf): + "returns tuple of (flexion, stem, tags) from lexicon file" + nErr = 0 + if not spf.endswith(".clex"): + for sLine in readFile(spf): + try: + sFlex, sStem, sTag = sLine.split("\t") + yield (sFlex, sStem, sTag) + except: + nErr += 1 + else: + sTag = "_" # neutral tag + sTag2 = "" + for sLine in readFile(spf): + if sLine.startswith("[") and sLine.endswith("]"): + # tag line + if "-->" in sLine: + try: + sTag, sSfxCode, sTag2 = sLine[1:-1].split(" --> ") + except: + nErr += 1 + continue + sTag = sTag.strip() + sSfxCode = sSfxCode.strip() + sTag2 = sTag2.strip() + else: + sTag = sLine[1:-1] + sTag2 = "" + else: + # entry line + if "\t" in sLine: + if sLine.count("\t") > 1: + nErr += 1 + continue + sFlex, sStem = sLine.split("\t") + else: + sFlex = sStem = sLine + #print(sFlex, sStem, sTag) + yield (sFlex, sStem, sTag) + if sTag2: + sFlex2 = st.changeWordWithSuffixCode(sFlex, sSfxCode) + #print(sFlex2, sStem, sTag2) + yield (sFlex2, sStem, sTag2) + if nErr: + print(" # Lines ignored: {:>10}".format(nErr)) + + + +class DAWG: + """DIRECT ACYCLIC WORD GRAPH""" + # This code is inspired from Steve Hanov’s DAWG, 2011. (http://stevehanov.ca/blog/index.php?id=115) + # We store suffix/affix codes and tags within the graph after the “real” word. + # A word is a list of numbers [ c1, c2, c3 . . . cN, iAffix, iTags] + # Each arc is an index in self.lArcVal, where are stored characters, suffix/affix codes for stemming and tags. + # Important: As usual, the last node (after ‘iTags’) is tagged final, AND the node after ‘cN’ is ALSO tagged final. + + def __init__ (self, spfSrc, sLangName, cStemming): + print("===== Direct Acyclic Word Graph - Minimal Acyclic Finite State Automaton =====") + cStemming = cStemming.upper() + if cStemming == "A": + funcStemmingGen = st.defineAffixCode + elif cStemming == "S": + funcStemmingGen = st.defineSuffixCode + elif cStemming == "N": + funcStemmingGen = st.noStemming + else: + raise ValueError("# Error. Unknown stemming code: {}".format(cStemming)) + + lEntry = [] + lChar = ['']; dChar = {}; nChar = 1; dCharOccur = {} + lAff = []; dAff = {}; nAff = 0; dAffOccur = {} + lTag = []; dTag = {}; nTag = 0; dTagOccur = {} + nErr = 0 + + # read lexicon + for sFlex, sStem, sTag in getElemsFromFile(spfSrc): + addWordToCharDict(sFlex) + # chars + for c in sFlex: + if c not in dChar: + dChar[c] = nChar + lChar.append(c) + nChar += 1 + dCharOccur[c] = dCharOccur.get(c, 0) + 1 + # affixes to find stem from flexion + aff = funcStemmingGen(sFlex, sStem) + if aff not in dAff: + dAff[aff] = nAff + lAff.append(aff) + nAff += 1 + dAffOccur[aff] = dCharOccur.get(aff, 0) + 1 + # tags + if sTag not in dTag: + dTag[sTag] = nTag + lTag.append(sTag) + nTag += 1 + dTagOccur[sTag] = dTagOccur.get(sTag, 0) + 1 + lEntry.append((sFlex, dAff[aff], dTag[sTag])) + if not lEntry: + raise ValueError("# Error. Empty lexicon") + + # Preparing DAWG + print(" > Preparing list of words") + lVal = lChar + lAff + lTag + lWord = [ [dChar[c] for c in sFlex] + [iAff+nChar] + [iTag+nChar+nAff] for sFlex, iAff, iTag in lEntry ] + lEntry = None + + # Dictionary of arc values occurrency, to sort arcs of each node + dValOccur = dict( [ (dChar[c], dCharOccur[c]) for c in dChar ] \ + + [ (dAff[aff]+nChar, dAffOccur[aff]) for aff in dAff ] \ + + [ (dTag[tag]+nChar+nAff, dTagOccur[tag]) for tag in dTag ] ) + #with open(spfSrc[:-8]+".valuesfreq.txt", 'w', encoding='utf-8') as hFreqDst: # DEBUG + # for iKey, nOcc in sorted(dValOccur.items(), key=lambda t: t[1], reverse=True): + # hFreqDst.write("{}: {}\n".format(lVal[iKey], nOcc)) + # hFreqDst.close() + + self.sFile = spfSrc + self.sLang = sLangName + self.nEntry = len(lWord) + self.aPreviousEntry = [] + DawgNode.resetNextId() + self.oRoot = DawgNode() + self.lUncheckedNodes = [] # list of nodes that have not been checked for duplication. + self.lMinimizedNodes = {} # list of unique nodes that have been checked for duplication. + self.lSortedNodes = [] # version 2 and 3 + self.nNode = 0 + self.nArc = 0 + self.dChar = dChar + self.nChar = len(dChar) + self.nAff = nAff + self.lArcVal = lVal + self.nArcVal = len(lVal) + self.nTag = self.nArcVal - self.nChar - nAff + self.cStemming = cStemming + if cStemming == "A": + self.funcStemming = st.changeWordWithAffixCode + elif cStemming == "S": + self.funcStemming = st.changeWordWithSuffixCode + else: + self.funcStemming = st.noStemming + + # build + lWord.sort() + oProgBar = ProgressBar(0, len(lWord)) + for aEntry in lWord: + self.insert(aEntry) + oProgBar.increment(1) + oProgBar.done() + self.finish() + self.countNodes() + self.countArcs() + self.sortNodes() + self.sortNodeArcs(dValOccur) + #self.sortNodeArcs2 (self.oRoot, "") + self.displayInfo() + + # BUILD DAWG + def insert (self, aEntry): + if aEntry < self.aPreviousEntry: + sys.exit("# Error: Words must be inserted in alphabetical order.") + + # find common prefix between word and previous word + nCommonPrefix = 0 + for i in range(min(len(aEntry), len(self.aPreviousEntry))): + if aEntry[i] != self.aPreviousEntry[i]: + break + nCommonPrefix += 1 + + # Check the lUncheckedNodes for redundant nodes, proceeding from last + # one down to the common prefix size. Then truncate the list at that point. + self._minimize(nCommonPrefix) + + # add the suffix, starting from the correct node mid-way through the graph + if len(self.lUncheckedNodes) == 0: + oNode = self.oRoot + else: + oNode = self.lUncheckedNodes[-1][2] + + iChar = nCommonPrefix + for c in aEntry[nCommonPrefix:]: + oNextNode = DawgNode() + oNode.arcs[c] = oNextNode + self.lUncheckedNodes.append((oNode, c, oNextNode)) + if iChar == (len(aEntry) - 2): + oNode.final = True + iChar += 1 + oNode = oNextNode + oNode.final = True + self.aPreviousEntry = aEntry + + def finish (self): + "minimize unchecked nodes" + self._minimize(0) + + def _minimize (self, downTo): + # proceed from the leaf up to a certain point + for i in range( len(self.lUncheckedNodes)-1, downTo-1, -1 ): + oNode, char, oChildNode = self.lUncheckedNodes[i] + if oChildNode in self.lMinimizedNodes: + # replace the child with the previously encountered one + oNode.arcs[char] = self.lMinimizedNodes[oChildNode] + else: + # add the state to the minimized nodes. + self.lMinimizedNodes[oChildNode] = oChildNode + self.lUncheckedNodes.pop() + + def countNodes (self): + self.nNode = len(self.lMinimizedNodes) + + def countArcs (self): + self.nArc = 0 + for oNode in self.lMinimizedNodes: + self.nArc += len(oNode.arcs) + + def sortNodeArcs (self, dValOccur): + print(" > Sort node arcs") + self.oRoot.sortArcs(dValOccur) + for oNode in self.lMinimizedNodes: + oNode.sortArcs(dValOccur) + + def sortNodeArcs2 (self, oNode, cPrevious=""): + # recursive function + dCharOccur = getCharOrderAfterChar(cPrevious) + if dCharOccur: + oNode.sortArcs2(dCharOccur, self.lArcVal) + for nArcVal, oNextNode in oNode.arcs.items(): + self.sortNodeArcs2(oNextNode, self.lArcVal[nArcVal]) + + def sortNodes (self): + print(" > Sort nodes") + for oNode in self.oRoot.arcs.values(): + self._parseNodes(oNode) + + def _parseNodes (self, oNode): + # Warning: recursive method + if oNode.pos > 0: + return + oNode.setPos() + self.lSortedNodes.append(oNode) + for oNextNode in oNode.arcs.values(): + self._parseNodes(oNextNode) + + def lookup (self, sWord): + oNode = self.oRoot + for c in sWord: + if self.dChar.get(c, '') not in oNode.arcs: + return False + oNode = oNode.arcs[self.dChar[c]] + return oNode.final + + def morph (self, sWord): + oNode = self.oRoot + for c in sWord: + if self.dChar.get(c, '') not in oNode.arcs: + return '' + oNode = oNode.arcs[self.dChar[c]] + if oNode.final: + s = "* " + for arc in oNode.arcs: + if arc >= self.nChar: + s += " [" + self.funcStemming(sWord, self.lArcVal[arc]) + oNode2 = oNode.arcs[arc] + for arc2 in oNode2.arcs: + s += " / " + self.lArcVal[arc2] + s += "]" + return s + return '' + + def displayInfo (self): + print(" * {:<12} {:>16,}".format("Entries:", self.nEntry)) + print(" * {:<12} {:>16,}".format("Characters:", self.nChar)) + print(" * {:<12} {:>16,}".format("Affixes:", self.nAff)) + print(" * {:<12} {:>16,}".format("Tags:", self.nTag)) + print(" * {:<12} {:>16,}".format("Arc values:", self.nArcVal)) + print(" * {:<12} {:>16,}".format("Nodes:", self.nNode)) + print(" * {:<12} {:>16,}".format("Arcs:", self.nArc)) + print(" * {:<12} {:>16}".format("Stemming:", self.cStemming + "FX")) + + def getArcStats (self): + d = {} + for oNode in self.lMinimizedNodes: + n = len(oNode.arcs) + d[n] = d.get(n, 0) + 1 + s = " * Nodes:\n" + for n in d: + s = s + " {:>9} nodes have {:>3} arcs\n".format(d[n], n) + return s + + def writeInfo (self, sPathFile): + print(" > Write informations") + with open(sPathFile, 'w', encoding='utf-8', newline="\n") as hDst: + hDst.write(self.getArcStats()) + hDst.write("\n * Values:\n") + for i, s in enumerate(self.lArcVal): + hDst.write(" {:>6}. {}\n".format(i, s)) + hDst.close() + + # BINARY CONVERSION + def createBinary (self, sPathFile, nMethod, bDebug=False): + print(" > Write DAWG as an indexable binary dictionary [method: %d]" % nMethod) + if nMethod == 1: + self.nBytesArc = ( (self.nArcVal.bit_length() + 2) // 8 ) + 1 # We add 2 bits. See DawgNode.convToBytes1() + self._calcNumBytesNodeAddress() + self._calcNodesAddress1() + elif nMethod == 2: + self.nBytesArc = ( (self.nArcVal.bit_length() + 3) // 8 ) + 1 # We add 3 bits. See DawgNode.convToBytes2() + self._calcNumBytesNodeAddress() + self._calcNodesAddress2() + elif nMethod == 3: + self.nBytesArc = ( (self.nArcVal.bit_length() + 3) // 8 ) + 1 # We add 3 bits. See DawgNode.convToBytes3() + self.nBytesOffset = 1 + self.nMaxOffset = (2 ** (self.nBytesOffset * 8)) - 1 + self._calcNumBytesNodeAddress() + self._calcNodesAddress3() + else: + print(" # Error: unknown compression method") + print(" Arc values (chars, affixes and tags): {} -> {} bytes".format( self.nArcVal, len("\t".join(self.lArcVal).encode("utf-8")) )) + print(" Arc size: {} bytes, Address size: {} bytes -> {} * {} = {} bytes".format( self.nBytesArc, self.nBytesNodeAddress, \ + self.nBytesArc+self.nBytesNodeAddress, self.nArc, \ + (self.nBytesArc+self.nBytesNodeAddress)*self.nArc )) + self._writeBinary(sPathFile, nMethod) + if bDebug: + self._writeNodes(sPathFile, nMethod) + + def _calcNumBytesNodeAddress (self): + "how many bytes needed to store all nodes/arcs in the binary dictionary" + self.nBytesNodeAddress = 1 + while ((self.nBytesArc + self.nBytesNodeAddress) * self.nArc) > (2 ** (self.nBytesNodeAddress * 8)): + self.nBytesNodeAddress += 1 + + def _calcNodesAddress1 (self): + nBytesNode = self.nBytesArc + self.nBytesNodeAddress + iAddr = len(self.oRoot.arcs) * nBytesNode + for oNode in self.lMinimizedNodes: + oNode.addr = iAddr + iAddr += max(len(oNode.arcs), 1) * nBytesNode + + def _calcNodesAddress2 (self): + nBytesNode = self.nBytesArc + self.nBytesNodeAddress + iAddr = len(self.oRoot.arcs) * nBytesNode + for oNode in self.lSortedNodes: + oNode.addr = iAddr + iAddr += max(len(oNode.arcs), 1) * nBytesNode + for oNextNode in oNode.arcs.values(): + if (oNode.pos + 1) == oNextNode.pos: + iAddr -= self.nBytesNodeAddress + #break + + def _calcNodesAddress3 (self): + nBytesNode = self.nBytesArc + self.nBytesNodeAddress + # theorical nodes size if only addresses and no offset + self.oRoot.size = len(self.oRoot.arcs) * nBytesNode + for oNode in self.lSortedNodes: + oNode.size = max(len(oNode.arcs), 1) * nBytesNode + # rewind and calculate dropdown from the end, several times + nDiff = self.nBytesNodeAddress - self.nBytesOffset + bEnd = False + while not bEnd: + bEnd = True + # recalculate addresses + iAddr = self.oRoot.size + for oNode in self.lSortedNodes: + oNode.addr = iAddr + iAddr += oNode.size + # rewind and calculate dropdown from the end, several times + for i in range(self.nNode-1, -1, -1): + nSize = max(len(self.lSortedNodes[i].arcs), 1) * nBytesNode + for oNextNode in self.lSortedNodes[i].arcs.values(): + if 1 < (oNextNode.addr - self.lSortedNodes[i].addr) < self.nMaxOffset: + nSize -= nDiff + if self.lSortedNodes[i].size != nSize: + self.lSortedNodes[i].size = nSize + bEnd = False + + def _writeBinary (self, sPathFile, nMethod): + """ + Format of the binary indexable dictionary: + Each section is separated with 4 bytes of \0 + + - Section Header: + /pyfsa/[version] + * version is an ASCII string + + - Section Informations: + /[tag_lang] + /[number of chars] + /[number of bytes for each arc] + /[number of bytes for each address node] + /[number of entries] + /[number of nodes] + /[number of arcs] + /[number of affixes] + * each field is a ASCII string + /[stemming code] + * "S" means stems are generated by /suffix_code/, "A" means they are generated by /affix_code/ + See defineSuffixCode() and defineAffixCode() for details. + "N" means no stemming + + - Section Values: + * a list of strings encoded in binary from utf-8, each value separated with a tabulation + + - Section Word Graph (nodes / arcs) + * A list of nodes which are a list of arcs with an address of the next node. + See DawgNode.convToBytes() for details. + """ + if not sPathFile.endswith(".bdic"): + sPathFile += "."+str(nMethod)+".bdic" + with open(sPathFile, 'wb') as hDst: + # header + hDst.write("/pyfsa/{}/".format(nMethod).encode("utf-8")) + hDst.write(b"\0\0\0\0") + # infos + hDst.write("{}/{}/{}/{}/{}/{}/{}/{}/{}".format(self.sLang, self.nChar, self.nBytesArc, self.nBytesNodeAddress, \ + self.nEntry, self.nNode, self.nArc, self.nAff, self.cStemming).encode("utf-8")) + hDst.write(b"\0\0\0\0") + # lArcVal + hDst.write("\t".join(self.lArcVal).encode("utf-8")) + hDst.write(b"\0\0\0\0") + # DAWG: nodes / arcs + if nMethod == 1: + hDst.write(self.oRoot.convToBytes1(self.nBytesArc, self.nBytesNodeAddress)) + for oNode in self.lMinimizedNodes: + hDst.write(oNode.convToBytes1(self.nBytesArc, self.nBytesNodeAddress)) + elif nMethod == 2: + hDst.write(self.oRoot.convToBytes2(self.nBytesArc, self.nBytesNodeAddress)) + for oNode in self.lSortedNodes: + hDst.write(oNode.convToBytes2(self.nBytesArc, self.nBytesNodeAddress)) + elif nMethod == 3: + hDst.write(self.oRoot.convToBytes3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset)) + for oNode in self.lSortedNodes: + hDst.write(oNode.convToBytes3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset)) + hDst.close() + + def _writeNodes (self, sPathFile, nMethod): + "for debugging only" + print(" > Write nodes") + with open(sPathFile+".nodes."+str(nMethod)+".txt", 'w', encoding='utf-8', newline="\n") as hDst: + if nMethod == 1: + hDst.write(self.oRoot.getTxtRepr1(self.nBytesArc, self.nBytesNodeAddress, self.lArcVal)+"\n") + #hDst.write( ''.join( [ "%02X " % z for z in self.oRoot.convToBytes1(self.nBytesArc, self.nBytesNodeAddress) ] ).strip() ) + for oNode in self.lMinimizedNodes: + hDst.write(oNode.getTxtRepr1(self.nBytesArc, self.nBytesNodeAddress, self.lArcVal)+"\n") + if nMethod == 2: + hDst.write(self.oRoot.getTxtRepr2(self.nBytesArc, self.nBytesNodeAddress, self.lArcVal)+"\n") + for oNode in self.lSortedNodes: + hDst.write(oNode.getTxtRepr2(self.nBytesArc, self.nBytesNodeAddress, self.lArcVal)+"\n") + if nMethod == 3: + hDst.write(self.oRoot.getTxtRepr3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset, self.lArcVal)+"\n") + #hDst.write( ''.join( [ "%02X " % z for z in self.oRoot.convToBytes3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset) ] ).strip() ) + for oNode in self.lSortedNodes: + hDst.write(oNode.getTxtRepr3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset, self.lArcVal)+"\n") + hDst.close() + + def writeResults (self, sPathFile): + bFileExits = os.path.isfile("_lexicons.res.txt") + with open("_lexicons.res.txt", "a", encoding='utf-8', newline="\n") as hDst: + sFormat1 = "{:<12} {:>12} {:>5} {:>8} {:>8} {:>6} {:>8} {:>9} {:>9} {:>15} {:>12} {:>12}\n" + sFormat2 = "{:<12} {:>12,} {:>5,} {:>8,} {:>8} {:>6,} {:>8,} {:>9,} {:>9,} {:>15,} {:>12,} {:>12,}\n" + if not bFileExits: + hDst.write(sFormat1.format("Lexicon", "Entries", "Chars", "Affixes", "Stemming", "Tags", "Values", "Nodes", "Arcs", "Lexicon (Kb)", "Dict (Kb)", "LT Dict (Kb)")) + hDst.write(sFormat2.format(self.sLang, self.nEntry, self.nChar, self.nAff, self.cStemming + "FX", self.nTag, self.nArcVal, \ + self.nNode, self.nArc, os.path.getsize(self.sFile), os.path.getsize(sPathFile), \ + os.path.getsize("cfsa/dict/{}.dict".format(self.sLang)) if os.path.isfile("cfsa/dict/{}.dict".format(self.sLang)) else 0)) + hDst.close() + + + +class DawgNode: + NextId = 0 + NextPos = 1 # (version 2) + + def __init__ (self): + self.i = DawgNode.NextId + DawgNode.NextId += 1 + self.final = False + self.arcs = {} # key: arc value; value: a node + self.addr = 0 # address in the binary dictionary + self.pos = 0 # position in the binary dictionary (version 2) + self.size = 0 # size of node in bytes (version 3) + + @classmethod + def resetNextId (cls): + cls.NextId = 0 + + def setPos (self): # version 2 + self.pos = DawgNode.NextPos + DawgNode.NextPos += 1 + + def __str__ (self): + # Caution! this function is used for hashing and comparison! + l = [] + if self.final: + l.append("1") + else: + l.append("0") + for (key, node) in self.arcs.items(): + l.append(str(key)) + l.append(str(node.i)) + return "_".join(l) + + def __hash__ (self): + # Used as a key in a python dictionary. + return self.__str__().__hash__() + + def __eq__ (self, other): + # Used as a key in a python dictionary. + # Nodes are equivalent if they have identical arcs, and each identical arc leads to identical states. + return self.__str__() == other.__str__() + + def sortArcs (self, dValOccur): + self.arcs = collections.OrderedDict(sorted(self.arcs.items(), key=lambda t: dValOccur.get(t[0], 0), reverse=True)) + + def sortArcs2 (self, dValOccur, lArcVal): + self.arcs = collections.OrderedDict(sorted(self.arcs.items(), key=lambda t: dValOccur.get(lArcVal[t[0]], 0), reverse=True)) + + # VERSION 1 ===================================================================================================== + def convToBytes1 (self, nBytesArc, nBytesNodeAddress): + """ + Node scheme: + - Arc length is defined by nBytesArc + - Address length is defined by nBytesNodeAddress + + | Arc | Address of next node | + | | | + /---------------\ /---------------\ /---------------\ /---------------\ /---------------\ /---------------\ + | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + \---------------/ \---------------/ \---------------/ \---------------/ \---------------/ \---------------/ + [...] + /---------------\ /---------------\ /---------------\ /---------------\ /---------------\ /---------------\ + | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + \---------------/ \---------------/ \---------------/ \---------------/ \---------------/ \---------------/ + ^ ^ + | | + | | + | \___ if 1, last arc of this node + \_____ if 1, this node is final (only on the first arc) + """ + nArc = len(self.arcs) + nFinalNodeMask = 1 << ((nBytesArc*8)-1) + nFinalArcMask = 1 << ((nBytesArc*8)-2) + if len(self.arcs) == 0: + val = nFinalNodeMask | nFinalArcMask + by = val.to_bytes(nBytesArc, byteorder='big') + by += (0).to_bytes(nBytesNodeAddress, byteorder='big') + return by + by = b"" + for i, arc in enumerate(self.arcs, 1): + val = arc + if i == 1 and self.final: + val = val | nFinalNodeMask + if i == nArc: + val = val | nFinalArcMask + by += val.to_bytes(nBytesArc, byteorder='big') + by += self.arcs[arc].addr.to_bytes(nBytesNodeAddress, byteorder='big') + return by + + def getTxtRepr1 (self, nBytesArc, nBytesNodeAddress, lVal): + nArc = len(self.arcs) + nFinalNodeMask = 1 << ((nBytesArc*8)-1) + nFinalArcMask = 1 << ((nBytesArc*8)-2) + s = "i{:_>10} -- #{:_>10}\n".format(self.i, self.addr) + if len(self.arcs) == 0: + s += " {:<20} {:0>16} i{:_>10} #{:_>10}\n".format("", bin(nFinalNodeMask | nFinalArcMask)[2:], "0", "0") + return s + for i, arc in enumerate(self.arcs, 1): + val = arc + if i == 1 and self.final: + val = val | nFinalNodeMask + if i == nArc: + val = val | nFinalArcMask + s += " {:<20} {:0>16} i{:_>10} #{:_>10}\n".format(lVal[arc], bin(val)[2:], self.arcs[arc].i, self.arcs[arc].addr) + return s + + # VERSION 2 ===================================================================================================== + def convToBytes2 (self, nBytesArc, nBytesNodeAddress): + """ + Node scheme: + - Arc length is defined by nBytesArc + - Address length is defined by nBytesNodeAddress + + | Arc | Address of next node | + | | | + /---------------\ /---------------\ /---------------\ /---------------\ /---------------\ /---------------\ + | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + \---------------/ \---------------/ \---------------/ \---------------/ \---------------/ \---------------/ + [...] + /---------------\ /---------------\ /---------------\ /---------------\ /---------------\ /---------------\ + | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + \---------------/ \---------------/ \---------------/ \---------------/ \---------------/ \---------------/ + ^ ^ ^ + | | | + | | \_ if 1, caution, no address: next node is the following node + | \___ if 1, last arc of this node + \_____ if 1, this node is final (only on the first arc) + """ + nArc = len(self.arcs) + nFinalNodeMask = 1 << ((nBytesArc*8)-1) + nFinalArcMask = 1 << ((nBytesArc*8)-2) + nNextNodeMask = 1 << ((nBytesArc*8)-3) + if len(self.arcs) == 0: + val = nFinalNodeMask | nFinalArcMask + by = val.to_bytes(nBytesArc, byteorder='big') + by += (0).to_bytes(nBytesNodeAddress, byteorder='big') + return by + by = b"" + for i, arc in enumerate(self.arcs, 1): + val = arc + if i == 1 and self.final: + val = val | nFinalNodeMask + if i == nArc: + val = val | nFinalArcMask + if (self.pos + 1) == self.arcs[arc].pos and self.i != 0: + val = val | nNextNodeMask + by += val.to_bytes(nBytesArc, byteorder='big') + else: + by += val.to_bytes(nBytesArc, byteorder='big') + by += self.arcs[arc].addr.to_bytes(nBytesNodeAddress, byteorder='big') + return by + + def getTxtRepr2 (self, nBytesArc, nBytesNodeAddress, lVal): + nArc = len(self.arcs) + nFinalNodeMask = 1 << ((nBytesArc*8)-1) + nFinalArcMask = 1 << ((nBytesArc*8)-2) + nNextNodeMask = 1 << ((nBytesArc*8)-3) + s = "i{:_>10} -- #{:_>10}\n".format(self.i, self.addr) + if nArc == 0: + s += " {:<20} {:0>16} i{:_>10} #{:_>10}\n".format("", bin(nFinalNodeMask | nFinalArcMask)[2:], "0", "0") + return s + for i, arc in enumerate(self.arcs, 1): + val = arc + if i == 1 and self.final: + val = val | nFinalNodeMask + if i == nArc: + val = val | nFinalArcMask + if (self.pos + 1) == self.arcs[arc].pos and self.i != 0: + val = val | nNextNodeMask + s += " {:<20} {:0>16}\n".format(lVal[arc], bin(val)[2:], "") + else: + s += " {:<20} {:0>16} i{:_>10} #{:_>10}\n".format(lVal[arc], bin(val)[2:], self.arcs[arc].i, self.arcs[arc].addr) + return s + + # VERSION 3 ===================================================================================================== + def convToBytes3 (self, nBytesArc, nBytesNodeAddress, nBytesOffset): + """ + Node scheme: + - Arc length is defined by nBytesArc + - Address length is defined by nBytesNodeAddress + - Offset length is defined by nBytesOffset + + | Arc | Address of next node or offset to next node | + | | | + /---------------\ /---------------\ /---------------\ /---------------\ /---------------\ /---------------\ + |1|0|0| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + \---------------/ \---------------/ \---------------/ \---------------/ \---------------/ \---------------/ + [...] + /---------------\ /---------------\ /---------------\ + |0|0|1| | | | | | | | | | | | | | | | | | | | | | | | Offsets are shorter than addresses + \---------------/ \---------------/ \---------------/ + /---------------\ /---------------\ /---------------\ /---------------\ /---------------\ /---------------\ + |0|1|0| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + \---------------/ \---------------/ \---------------/ \---------------/ \---------------/ \---------------/ + + ^ ^ ^ + | | | + | | \_ if 1, offset instead of address of next node + | \___ if 1, last arc of this node + \_____ if 1, this node is final (only on the first arc) + """ + nArc = len(self.arcs) + nFinalNodeMask = 1 << ((nBytesArc*8)-1) + nFinalArcMask = 1 << ((nBytesArc*8)-2) + nNextNodeMask = 1 << ((nBytesArc*8)-3) + nMaxOffset = (2 ** (nBytesOffset * 8)) - 1 + if nArc == 0: + val = nFinalNodeMask | nFinalArcMask + by = val.to_bytes(nBytesArc, byteorder='big') + by += (0).to_bytes(nBytesNodeAddress, byteorder='big') + return by + by = b"" + for i, arc in enumerate(self.arcs, 1): + val = arc + if i == 1 and self.final: + val = val | nFinalNodeMask + if i == nArc: + val = val | nFinalArcMask + if 1 < (self.arcs[arc].addr - self.addr) < nMaxOffset and self.i != 0: + val = val | nNextNodeMask + by += val.to_bytes(nBytesArc, byteorder='big') + by += (self.arcs[arc].addr-self.addr).to_bytes(nBytesOffset, byteorder='big') + else: + by += val.to_bytes(nBytesArc, byteorder='big') + by += self.arcs[arc].addr.to_bytes(nBytesNodeAddress, byteorder='big') + return by + + def getTxtRepr3 (self, nBytesArc, nBytesNodeAddress, nBytesOffset, lVal): + nArc = len(self.arcs) + nFinalNodeMask = 1 << ((nBytesArc*8)-1) + nFinalArcMask = 1 << ((nBytesArc*8)-2) + nNextNodeMask = 1 << ((nBytesArc*8)-3) + nMaxOffset = (2 ** (nBytesOffset * 8)) - 1 + s = "i{:_>10} -- #{:_>10} ({})\n".format(self.i, self.addr, self.size) + if nArc == 0: + s += " {:<20} {:0>16} i{:_>10} #{:_>10}\n".format("", bin(nFinalNodeMask | nFinalArcMask)[2:], "0", "0") + return s + for i, arc in enumerate(self.arcs, 1): + val = arc + if i == 1 and self.final: + val = val | nFinalNodeMask + if i == nArc: + val = val | nFinalArcMask + if 1 < (self.arcs[arc].addr - self.addr) < nMaxOffset and self.i != 0: + val = val | nNextNodeMask + s += " {:<20} {:0>16} i{:_>10} +{:_>10}\n".format(lVal[arc], bin(val)[2:], self.arcs[arc].i, self.arcs[arc].addr - self.addr) + else: + s += " {:<20} {:0>16} i{:_>10} #{:_>10}\n".format(lVal[arc], bin(val)[2:], self.arcs[arc].i, self.arcs[arc].addr) + return s + + + +# Another attempt to sort node arcs + +_dCharOrder = { + # key: previous char, value: dictionary of chars {c: nValue} + "": {} +} + + +def addWordToCharDict (sWord): + cPrevious = "" + for cChar in sWord: + if cPrevious not in _dCharOrder: + _dCharOrder[cPrevious] = {} + _dCharOrder[cPrevious][cChar] = _dCharOrder[cPrevious].get(cChar, 0) + 1 + cPrevious = cChar + + +def getCharOrderAfterChar (cChar): + return _dCharOrder.get(cChar, None) + + +def displayCharOrder (): + for key, value in _dCharOrder.items(): + print("[" + key + "]: ", ", ".join([ c+":"+str(n) for c, n in sorted(value.items(), key=lambda t: t[1], reverse=True) ])) ADDED graphspell/echo.py Index: graphspell/echo.py ================================================================== --- /dev/null +++ graphspell/echo.py @@ -0,0 +1,29 @@ +#!python3 + +# The most boring yet indispensable function: print! + + +import sys + + +_CHARMAP = str.maketrans({ 'œ': 'ö', 'Œ': 'Ö', 'ʳ': "r", 'ᵉ': "e", '…': "_", \ + '“': '"', '”': '"', '„': '"', '‘': "'", '’': "'", \ + 'ā': 'â', 'Ā': 'Â', 'ē': 'ê', 'Ē': 'Ê', 'ī': 'î', 'Ī': 'Î', \ + 'ō': 'ô', 'Ō': 'Ô', 'ū': 'û', 'Ū': 'Û', 'Ÿ': 'Y', \ + 'ś': 's', 'ŝ': 's', \ + '—': '-', '–': '-' + }) + + +def echo (obj, sep=' ', end='\n', file=sys.stdout, flush=False): + """ Print for Windows to avoid Python crashes. + Encoding depends on Windows locale. No useful standard. + Always returns True (useful for debugging).""" + if sys.platform != "win32": + print(obj, sep=sep, end=end, file=file, flush=flush) + return True + try: + print(str(obj).translate(_CHARMAP), sep=sep, end=end, file=file, flush=flush) + except: + print(str(obj).encode('ascii', 'replace').decode('ascii', 'replace'), sep=sep, end=end, file=file, flush=flush) + return True ADDED graphspell/ibdawg.py Index: graphspell/ibdawg.py ================================================================== --- /dev/null +++ graphspell/ibdawg.py @@ -0,0 +1,720 @@ +#!python3 + +import os +import traceback +import pkgutil +import re +from functools import wraps +import time + +#import logging +#logging.basicConfig(filename="suggestions.log", level=logging.DEBUG) + +from . import str_transform as st +from . import char_player as cp +from .echo import echo + + +def timethis (func): + "decorator for the execution time" + @wraps(func) + def wrapper (*args, **kwargs): + fStart = time.time() + result = func(*args, **kwargs) + fEnd = time.time() + print(func.__name__, fEnd - fStart) + return result + return wrapper + + +class SuggResult: + """Structure for storing, classifying and filtering suggestions""" + + def __init__ (self, sWord, nDistLimit=-1): + self.sWord = sWord + self.sSimplifiedWord = cp.simplifyWord(sWord) + self.nDistLimit = nDistLimit if nDistLimit >= 0 else (len(sWord) // 3) + 1 + self.nMinDist = 1000 + self.aSugg = set() + self.dSugg = { 0: [], 1: [], 2: [] } + + def addSugg (self, sSugg, nDeep=0): + "add a suggestion" + #logging.info((nDeep * " ") + "__" + sSugg + "__") + if sSugg not in self.aSugg: + nDist = st.distanceDamerauLevenshtein(self.sSimplifiedWord, cp.simplifyWord(sSugg)) + if nDist <= self.nDistLimit: + if nDist not in self.dSugg: + self.dSugg[nDist] = [] + self.dSugg[nDist].append(sSugg) + self.aSugg.add(sSugg) + if nDist < self.nMinDist: + self.nMinDist = nDist + self.nDistLimit = min(self.nDistLimit, self.nMinDist+2) + + def getSuggestions (self, nSuggLimit=10, nDistLimit=-1): + "return a list of suggestions" + lRes = [] + if self.dSugg[0]: + # we sort the better results with the original word + self.dSugg[0].sort(key=lambda sSugg: st.distanceDamerauLevenshtein(self.sWord, sSugg)) + for lSugg in self.dSugg.values(): + lRes.extend(lSugg) + if len(lRes) > nSuggLimit: + break + lRes = list(cp.filterSugg(lRes)) + if self.sWord.istitle(): + lRes = list(map(lambda sSugg: sSugg.title(), lRes)) + elif self.sWord.isupper(): + lRes = list(map(lambda sSugg: sSugg.upper(), lRes)) + return lRes[:nSuggLimit] + + def reset (self): + self.aSugg.clear() + self.dSugg.clear() + + +class IBDAWG: + """INDEXABLE BINARY DIRECT ACYCLIC WORD GRAPH""" + + def __init__ (self, sDicName): + self.by = pkgutil.get_data(__package__, "_dictionaries/" + sDicName) + if not self.by: + raise OSError("# Error. File not found or not loadable: "+sDicName) + + if self.by[0:7] != b"/pyfsa/": + raise TypeError("# Error. Not a pyfsa binary dictionary. Header: {}".format(self.by[0:9])) + if not(self.by[7:8] == b"1" or self.by[7:8] == b"2" or self.by[7:8] == b"3"): + raise ValueError("# Error. Unknown dictionary version: {}".format(self.by[7:8])) + try: + header, info, values, bdic = self.by.split(b"\0\0\0\0", 3) + except Exception: + raise Exception + + self.sName = sDicName + self.nVersion = int(self.by[7:8].decode("utf-8")) + self.sHeader = header.decode("utf-8") + self.lArcVal = values.decode("utf-8").split("\t") + self.nArcVal = len(self.lArcVal) + self.byDic = bdic + + l = info.decode("utf-8").split("/") + self.sLang = l[0] + self.nChar = int(l[1]) + self.nBytesArc = int(l[2]) + self.nBytesNodeAddress = int(l[3]) + self.nEntries = int(l[4]) + self.nNode = int(l[5]) + self.nArc = int(l[6]) + self.nAff = int(l[7]) + self.cStemming = l[8] + if self.cStemming == "S": + self.funcStemming = st.changeWordWithSuffixCode + elif self.cStemming == "A": + self.funcStemming = st.changeWordWithAffixCode + else: + self.funcStemming = st.noStemming + self.nTag = self.nArcVal - self.nChar - self.nAff + # to get the value of an arc, to get the char of an arc with its value + self.dChar = {} + for i in range(1, self.nChar): + self.dChar[self.lArcVal[i]] = i + self.dCharVal = { v: k for k, v in self.dChar.items() } + + self._arcMask = (2 ** ((self.nBytesArc * 8) - 3)) - 1 + self._finalNodeMask = 1 << ((self.nBytesArc * 8) - 1) + self._lastArcMask = 1 << ((self.nBytesArc * 8) - 2) + self._addrBitMask = 1 << ((self.nBytesArc * 8) - 3) # version 2 + + self.nBytesOffset = 1 # version 3 + + # Configuring DAWG functions according to nVersion + if self.nVersion == 1: + self.morph = self._morph1 + self.stem = self._stem1 + self._lookupArcNode = self._lookupArcNode1 + self._getArcs = self._getArcs1 + self._writeNodes = self._writeNodes1 + elif self.nVersion == 2: + self.morph = self._morph2 + self.stem = self._stem2 + self._lookupArcNode = self._lookupArcNode2 + self._getArcs = self._getArcs2 + self._writeNodes = self._writeNodes2 + elif self.nVersion == 3: + self.morph = self._morph3 + self.stem = self._stem3 + self._lookupArcNode = self._lookupArcNode3 + self._getArcs = self._getArcs3 + self._writeNodes = self._writeNodes3 + else: + raise ValueError(" # Error: unknown code: {}".format(self.nVersion)) + + self.bOptNumSigle = False + self.bOptNumAtLast = False + + def getInfo (self): + return " Language: {0.sLang:>10} Version: {0.nVersion:>2} Stemming: {0.cStemming}FX\n" \ + " Arcs values: {0.nArcVal:>10,} = {0.nChar:>5,} characters, {0.nAff:>6,} affixes, {0.nTag:>6,} tags\n" \ + " Dictionary: {0.nEntries:>12,} entries, {0.nNode:>11,} nodes, {0.nArc:>11,} arcs\n" \ + " Address size: {0.nBytesNodeAddress:>1} bytes, Arc size: {0.nBytesArc:>1} bytes\n".format(self) + + def writeAsJSObject (self, spfDest, bInJSModule=False, bBinaryDictAsHexString=False): + "write IBDAWG as a JavaScript object in a JavaScript module" + import json + with open(spfDest, "w", encoding="utf-8", newline="\n") as hDst: + if bInJSModule: + hDst.write('// JavaScript\n// Generated data (do not edit)\n\n"use strict";\n\nconst dictionary = ') + hDst.write(json.dumps({ + "sName": self.sName, + "nVersion": self.nVersion, + "sHeader": self.sHeader, + "lArcVal": self.lArcVal, + "nArcVal": self.nArcVal, + # JavaScript is a pile of shit, so Mozilla’s JS parser don’t like file bigger than 4 Mb! + # So, if necessary, we use an hexadecimal string, that we will convert later in Firefox’s extension. + # https://github.com/mozilla/addons-linter/issues/1361 + "byDic": self.byDic.hex() if bBinaryDictAsHexString else [ e for e in self.byDic ], + "sLang": self.sLang, + "nChar": self.nChar, + "nBytesArc": self.nBytesArc, + "nBytesNodeAddress": self.nBytesNodeAddress, + "nEntries": self.nEntries, + "nNode": self.nNode, + "nArc": self.nArc, + "nAff": self.nAff, + "cStemming": self.cStemming, + "nTag": self.nTag, + "dChar": self.dChar, + "_arcMask": self._arcMask, + "_finalNodeMask": self._finalNodeMask, + "_lastArcMask": self._lastArcMask, + "_addrBitMask": self._addrBitMask, + "nBytesOffset": self.nBytesOffset + }, ensure_ascii=False)) + if bInJSModule: + hDst.write(";\n\nexports.dictionary = dictionary;\n") + + def isValidToken (self, sToken): + "checks if is valid (if there is hyphens in , is split, each part is checked)" + if self.isValid(sToken): + return True + if "-" in sToken: + if sToken.count("-") > 4: + return True + return all(self.isValid(sWord) for sWord in sToken.split("-")) + return False + + def isValid (self, sWord): + "checks if is valid (different casing tested if the first letter is a capital)" + if not sWord: + return None + if "’" in sWord: # ugly hack + sWord = sWord.replace("’", "'") + if self.lookup(sWord): + return True + if sWord[0:1].isupper(): + if len(sWord) > 1: + if sWord.istitle(): + return self.lookup(sWord.lower()) + if sWord.isupper(): + if self.bOptNumSigle: + return True + return self.lookup(sWord.lower()) or self.lookup(sWord.capitalize()) + return self.lookup(sWord[:1].lower() + sWord[1:]) + else: + return self.lookup(sWord.lower()) + return False + + def lookup (self, sWord): + "returns True if in dictionary (strict verification)" + iAddr = 0 + for c in sWord: + if c not in self.dChar: + return False + iAddr = self._lookupArcNode(self.dChar[c], iAddr) + if iAddr == None: + return False + return bool(int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask) + + def getMorph (self, sWord): + "retrieves morphologies list, different casing allowed" + l = self.morph(sWord) + if sWord[0:1].isupper(): + l.extend(self.morph(sWord.lower())) + if sWord.isupper() and len(sWord) > 1: + l.extend(self.morph(sWord.capitalize())) + return l + + #@timethis + def suggest (self, sWord, nSuggLimit=10): + "returns a set of suggestions for " + sPfx, sWord, sSfx = cp.cut(sWord) + nMaxSwitch = max(len(sWord) // 3, 1) + nMaxDel = len(sWord) // 5 + nMaxHardRepl = max((len(sWord) - 5) // 4, 1) + oSuggResult = SuggResult(sWord) + self._suggest(oSuggResult, sWord, nMaxSwitch=nMaxSwitch, nMaxDel=nMaxDel, nMaxHardRepl=nMaxHardRepl) + if sWord.istitle(): + self._suggest(oSuggResult, sWord.lower(), nMaxSwitch=nMaxSwitch, nMaxDel=nMaxDel, nMaxHardRepl=nMaxHardRepl) + elif sWord.islower(): + self._suggest(oSuggResult, sWord.title(), nMaxSwitch=nMaxSwitch, nMaxDel=nMaxDel, nMaxHardRepl=nMaxHardRepl) + aSugg = oSuggResult.getSuggestions(nSuggLimit) + if sSfx or sPfx: + # we add what we removed + return list(map(lambda sSug: sPfx + sSug + sSfx, aSugg)) + return aSugg + + def _suggest (self, oSuggResult, sRemain, nMaxSwitch=0, nMaxDel=0, nMaxHardRepl=0, nDeep=0, iAddr=0, sNewWord="", bAvoidLoop=False): + # recursive function + #logging.info((nDeep * " ") + sNewWord + ":" + sRemain) + if not sRemain: + if int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask: + oSuggResult.addSugg(sNewWord, nDeep) + for sTail in self._getTails(iAddr): + oSuggResult.addSugg(sNewWord+sTail, nDeep) + return + cCurrent = sRemain[0:1] + for cChar, jAddr in self._getCharArcs(iAddr): + if cChar in cp.d1to1.get(cCurrent, cCurrent): + self._suggest(oSuggResult, sRemain[1:], nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, jAddr, sNewWord+cChar) + elif not bAvoidLoop and nMaxHardRepl: + self._suggest(oSuggResult, sRemain[1:], nMaxSwitch, nMaxDel, nMaxHardRepl-1, nDeep+1, jAddr, sNewWord+cChar, True) + if not bAvoidLoop: # avoid infinite loop + if len(sRemain) > 1: + if cCurrent == sRemain[1:2]: + # same char, we remove 1 char without adding 1 to + self._suggest(oSuggResult, sRemain[1:], nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord) + else: + # switching chars + if nMaxSwitch: + self._suggest(oSuggResult, sRemain[1:2]+sRemain[0:1]+sRemain[2:], nMaxSwitch-1, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, True) + # delete char + if nMaxDel: + self._suggest(oSuggResult, sRemain[1:], nMaxSwitch, nMaxDel-1, nMaxHardRepl, nDeep+1, iAddr, sNewWord, True) + # Phonetic replacements + for sRepl in cp.get1toXReplacement(sNewWord[-1:], cCurrent, sRemain[1:2]): + self._suggest(oSuggResult, sRepl + sRemain[1:], nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, True) + for sRepl in cp.d2toX.get(sRemain[0:2], ()): + self._suggest(oSuggResult, sRepl + sRemain[2:], nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, True) + # end of word + if len(sRemain) == 2: + for sRepl in cp.dFinal2.get(sRemain, ()): + self._suggest(oSuggResult, sRepl, nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, True) + elif len(sRemain) == 1: + self._suggest(oSuggResult, "", nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, True) # remove last char and go on + for sRepl in cp.dFinal1.get(sRemain, ()): + self._suggest(oSuggResult, sRepl, nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, True) + + #@timethis + def suggest2 (self, sWord, nMaxSugg=10): + "returns a set of suggestions for " + sPfx, sWord, sSfx = cp.cut(sWord) + oSuggResult = SuggResult(sWord) + self._suggest2(oSuggResult) + aSugg = oSuggResult.getSuggestions() + if sSfx or sPfx: + # we add what we removed + return list(map(lambda sSug: sPfx + sSug + sSfx, aSugg)) + return aSugg + + def _suggest2 (self, oSuggResult, nDeep=0, iAddr=0, sNewWord=""): + # recursive function + #logging.info((nDeep * " ") + sNewWord) + if nDeep >= oSuggResult.nDistLimit: + sCleanNewWord = cp.simplifyWord(sNewWord) + if st.distanceSift4(oSuggResult.sCleanWord[:len(sCleanNewWord)], sCleanNewWord) > oSuggResult.nDistLimit: + return + if int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask: + oSuggResult.addSugg(sNewWord, nDeep) + for cChar, jAddr in self._getCharArcsWithPriority(iAddr, oSuggResult.sWord[nDeep:nDeep+1]): + self._suggest2(oSuggResult, nDeep+1, jAddr, sNewWord+cChar) + return + + def _getCharArcs (self, iAddr): + "generator: yield all chars and addresses from node at address " + for nVal, jAddr in self._getArcs(iAddr): + if nVal < self.nChar: + yield (self.dCharVal[nVal], jAddr) + + def _getSimilarCharArcs (self, cChar, iAddr): + "generator: yield similar char of and address of the following node" + for c in cp.d1to1.get(cChar, [cChar]): + if c in self.dChar: + jAddr = self._lookupArcNode(self.dChar[c], iAddr) + if jAddr: + yield (c, jAddr) + + def _getCharArcsWithPriority (self, iAddr, cChar): + if not cChar: + yield from self._getCharArcs(iAddr) + lTuple = list(self._getCharArcs(iAddr)) + lTuple.sort(key=lambda t: 0 if t[0] in cp.d1to1.get(cChar, cChar) else 1) + yield from lTuple + + def _getTails (self, iAddr, sTail="", n=2): + "return a list of suffixes ending at a distance of from " + aTails = set() + for nVal, jAddr in self._getArcs(iAddr): + if nVal < self.nChar: + if int.from_bytes(self.byDic[jAddr:jAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask: + aTails.add(sTail + self.dCharVal[nVal]) + if n and not aTails: + aTails.update(self._getTails(jAddr, sTail+self.dCharVal[nVal], n-1)) + return aTails + + def drawPath (self, sWord, iAddr=0): + "show the path taken by in the graph" + c1 = sWord[0:1] if sWord else " " + iPos = -1 + n = 0 + print(c1 + ": ", end="") + for c2, jAddr in self._getCharArcs(iAddr): + print(c2, end="") + if c2 == sWord[0:1]: + iNextNodeAddr = jAddr + iPos = n + n += 1 + if not sWord: + return + if iPos >= 0: + print("\n "+ " " * iPos + "|") + self.drawPath(sWord[1:], iNextNodeAddr) + + def select (self, sPattern=""): + "generator: returns all entries which morphology fits " + zPattern = None + try: + zPattern = re.compile(sPattern) + except: + print("# Error in regex pattern") + traceback.print_exc() + yield from self._select1(zPattern, 0, "") + + # def morph (self, sWord): + # is defined in __init__ + + # VERSION 1 + def _select1 (self, zPattern, iAddr, sWord): + # recursive generator + for nVal, jAddr in self._getArcs1(iAddr): + if nVal < self.nChar: + # simple character + yield from self._select1(zPattern, jAddr, sWord + self.lArcVal[nVal]) + else: + sEntry = sWord + "\t" + self.funcStemming(sWord, self.lArcVal[nVal]) + for nMorphVal, _ in self._getArcs1(jAddr): + if not zPattern or zPattern.search(self.lArcVal[nMorphVal]): + yield sEntry + "\t" + self.lArcVal[nMorphVal] + + def _morph1 (self, sWord): + "returns morphologies of " + iAddr = 0 + for c in sWord: + if c not in self.dChar: + return [] + iAddr = self._lookupArcNode(self.dChar[c], iAddr) + if iAddr == None: + return [] + if (int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask): + l = [] + nRawArc = 0 + while not (nRawArc & self._lastArcMask): + iEndArcAddr = iAddr + self.nBytesArc + nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') + nArc = nRawArc & self._arcMask + if nArc >= self.nChar: + # This value is not a char, this is a stemming code + sStem = ">" + self.funcStemming(sWord, self.lArcVal[nArc]) + # Now , we go to the next node and retrieve all following arcs values, all of them are tags + iAddr2 = int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big') + nRawArc2 = 0 + while not (nRawArc2 & self._lastArcMask): + iEndArcAddr2 = iAddr2 + self.nBytesArc + nRawArc2 = int.from_bytes(self.byDic[iAddr2:iEndArcAddr2], byteorder='big') + l.append(sStem + " " + self.lArcVal[nRawArc2 & self._arcMask]) + iAddr2 = iEndArcAddr2+self.nBytesNodeAddress + iAddr = iEndArcAddr+self.nBytesNodeAddress + return l + return [] + + def _stem1 (self, sWord): + "returns stems list of " + iAddr = 0 + for c in sWord: + if c not in self.dChar: + return [] + iAddr = self._lookupArcNode(self.dChar[c], iAddr) + if iAddr == None: + return [] + if (int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask): + l = [] + nRawArc = 0 + while not (nRawArc & self._lastArcMask): + iEndArcAddr = iAddr + self.nBytesArc + nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') + nArc = nRawArc & self._arcMask + if nArc >= self.nChar: + # This value is not a char, this is a stemming code + l.append(self.funcStemming(sWord, self.lArcVal[nArc])) + iAddr = iEndArcAddr+self.nBytesNodeAddress + return l + return [] + + def _lookupArcNode1 (self, nVal, iAddr): + "looks if is an arc at the node at , if yes, returns address of next node else None" + while True: + iEndArcAddr = iAddr+self.nBytesArc + nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') + if nVal == (nRawArc & self._arcMask): + # the value we are looking for + # we return the address of the next node + return int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big') + else: + # value not found + if (nRawArc & self._lastArcMask): + return None + iAddr = iEndArcAddr+self.nBytesNodeAddress + + def _getArcs1 (self, iAddr): + "generator: return all arcs at as tuples of (nVal, iAddr)" + while True: + iEndArcAddr = iAddr+self.nBytesArc + nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') + yield (nRawArc & self._arcMask, int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big')) + if (nRawArc & self._lastArcMask): + break + iAddr = iEndArcAddr+self.nBytesNodeAddress + + def _writeNodes1 (self, spfDest): + "for debugging only" + print(" > Write binary nodes") + with codecs.open(spfDest, 'w', 'utf-8', newline="\n") as hDst: + iAddr = 0 + hDst.write("i{:_>10} -- #{:_>10}\n".format("0", iAddr)) + while iAddr < len(self.byDic): + iEndArcAddr = iAddr+self.nBytesArc + nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') + nArc = nRawArc & self._arcMask + hDst.write(" {:<20} {:0>16} i{:>10} #{:_>10}\n".format(self.lArcVal[nArc], bin(nRawArc)[2:], "?", \ + int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], \ + byteorder='big'))) + iAddr = iEndArcAddr+self.nBytesNodeAddress + if (nRawArc & self._lastArcMask) and iAddr < len(self.byDic): + hDst.write("\ni{:_>10} -- #{:_>10}\n".format("?", iAddr)) + hDst.close() + + # VERSION 2 + def _morph2 (self, sWord): + "returns morphologies of " + iAddr = 0 + for c in sWord: + if c not in self.dChar: + return [] + iAddr = self._lookupArcNode(self.dChar[c], iAddr) + if iAddr == None: + return [] + if (int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask): + l = [] + nRawArc = 0 + while not (nRawArc & self._lastArcMask): + iEndArcAddr = iAddr + self.nBytesArc + nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') + nArc = nRawArc & self._arcMask + if nArc >= self.nChar: + # This value is not a char, this is a stemming code + sStem = ">" + self.funcStemming(sWord, self.lArcVal[nArc]) + # Now , we go to the next node and retrieve all following arcs values, all of them are tags + if not (nRawArc & self._addrBitMask): + iAddr2 = int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big') + else: + # we go to the end of the node + iAddr2 = iEndArcAddr + while not (nRawArc & self._lastArcMask): + nRawArc = int.from_bytes(self.byDic[iAddr2:iAddr2+self.nBytesArc], byteorder='big') + iAddr2 += self.nBytesArc + self.nBytesNodeAddress + nRawArc2 = 0 + while not (nRawArc2 & self._lastArcMask): + iEndArcAddr2 = iAddr2 + self.nBytesArc + nRawArc2 = int.from_bytes(self.byDic[iAddr2:iEndArcAddr2], byteorder='big') + l.append(sStem + " " + self.lArcVal[nRawArc2 & self._arcMask]) + iAddr2 = iEndArcAddr2+self.nBytesNodeAddress if not (nRawArc2 & self._addrBitMask) else iEndArcAddr2 + iAddr = iEndArcAddr+self.nBytesNodeAddress if not (nRawArc & self._addrBitMask) else iEndArcAddr + return l + return [] + + def _stem2 (self, sWord): + "returns stems list of " + iAddr = 0 + for c in sWord: + if c not in self.dChar: + return [] + iAddr = self._lookupArcNode(self.dChar[c], iAddr) + if iAddr == None: + return [] + if (int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask): + l = [] + nRawArc = 0 + while not (nRawArc & self._lastArcMask): + iEndArcAddr = iAddr + self.nBytesArc + nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') + nArc = nRawArc & self._arcMask + if nArc >= self.nChar: + # This value is not a char, this is a stemming code + l.append(self.funcStemming(sWord, self.lArcVal[nArc])) + # Now , we go to the next node + if not (nRawArc & self._addrBitMask): + iAddr2 = int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big') + else: + # we go to the end of the node + iAddr2 = iEndArcAddr + while not (nRawArc & self._lastArcMask): + nRawArc = int.from_bytes(self.byDic[iAddr2:iAddr2+self.nBytesArc], byteorder='big') + iAddr2 += self.nBytesArc + self.nBytesNodeAddress + iAddr = iEndArcAddr+self.nBytesNodeAddress if not (nRawArc & self._addrBitMask) else iEndArcAddr + return l + return [] + + def _lookupArcNode2 (self, nVal, iAddr): + "looks if is an arc at the node at , if yes, returns address of next node else None" + while True: + iEndArcAddr = iAddr+self.nBytesArc + nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') + if nVal == (nRawArc & self._arcMask): + # the value we are looking for + if not (nRawArc & self._addrBitMask): + # we return the address of the next node + return int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big') + else: + # we go to the end of the node + iAddr = iEndArcAddr + while not (nRawArc & self._lastArcMask): + nRawArc = int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') + iAddr += self.nBytesArc + self.nBytesNodeAddress if not (nRawArc & self._addrBitMask) else self.nBytesArc + return iAddr + else: + # value not found + if (nRawArc & self._lastArcMask): + return None + iAddr = iEndArcAddr+self.nBytesNodeAddress if not (nRawArc & self._addrBitMask) else iEndArcAddr + + def _writeNodes2 (self, spfDest): + "for debugging only" + print(" > Write binary nodes") + with codecs.open(spfDest, 'w', 'utf-8', newline="\n") as hDst: + iAddr = 0 + hDst.write("i{:_>10} -- #{:_>10}\n".format("0", iAddr)) + while iAddr < len(self.byDic): + iEndArcAddr = iAddr+self.nBytesArc + nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') + nArc = nRawArc & self._arcMask + if not (nRawArc & self._addrBitMask): + iNextNodeAddr = int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big') + hDst.write(" {:<20} {:0>16} i{:>10} #{:_>10}\n".format(self.lArcVal[nArc], bin(nRawArc)[2:], "?", iNextNodeAddr)) + iAddr = iEndArcAddr+self.nBytesNodeAddress + else: + hDst.write(" {:<20} {:0>16}\n".format(self.lArcVal[nArc], bin(nRawArc)[2:])) + iAddr = iEndArcAddr + if (nRawArc & self._lastArcMask): + hDst.write("\ni{:_>10} -- #{:_>10}\n".format("?", iAddr)) + hDst.close() + + # VERSION 3 + def _morph3 (self, sWord): + "returns morphologies of " + iAddr = 0 + for c in sWord: + if c not in self.dChar: + return [] + iAddr = self._lookupArcNode(self.dChar[c], iAddr) + if iAddr == None: + return [] + if (int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask): + l = [] + nRawArc = 0 + iAddrNode = iAddr + while not (nRawArc & self._lastArcMask): + iEndArcAddr = iAddr + self.nBytesArc + nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') + nArc = nRawArc & self._arcMask + if nArc >= self.nChar: + # This value is not a char, this is a stemming code + sStem = ">" + self.funcStemming(sWord, self.lArcVal[nArc]) + # Now , we go to the next node and retrieve all following arcs values, all of them are tags + if not (nRawArc & self._addrBitMask): + iAddr2 = int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big') + else: + iAddr2 = iAddrNode + int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesOffset], byteorder='big') + nRawArc2 = 0 + while not (nRawArc2 & self._lastArcMask): + iEndArcAddr2 = iAddr2 + self.nBytesArc + nRawArc2 = int.from_bytes(self.byDic[iAddr2:iEndArcAddr2], byteorder='big') + l.append(sStem + " " + self.lArcVal[nRawArc2 & self._arcMask]) + iAddr2 = iEndArcAddr2+self.nBytesNodeAddress if not (nRawArc2 & self._addrBitMask) else iEndArcAddr2+self.nBytesOffset + iAddr = iEndArcAddr+self.nBytesNodeAddress if not (nRawArc & self._addrBitMask) else iEndArcAddr+self.nBytesOffset + return l + return [] + + def _stem3 (self, sWord): + "returns stems list of " + iAddr = 0 + for c in sWord: + if c not in self.dChar: + return [] + iAddr = self._lookupArcNode(self.dChar[c], iAddr) + if iAddr == None: + return [] + if (int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask): + l = [] + nRawArc = 0 + iAddrNode = iAddr + while not (nRawArc & self._lastArcMask): + iEndArcAddr = iAddr + self.nBytesArc + nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') + nArc = nRawArc & self._arcMask + if nArc >= self.nChar: + # This value is not a char, this is a stemming code + l.append(self.funcStemming(sWord, self.lArcVal[nArc])) + iAddr = iEndArcAddr+self.nBytesNodeAddress if not (nRawArc & self._addrBitMask) else iEndArcAddr+self.nBytesOffset + return l + return [] + + def _lookupArcNode3 (self, nVal, iAddr): + "looks if is an arc at the node at , if yes, returns address of next node else None" + iAddrNode = iAddr + while True: + iEndArcAddr = iAddr+self.nBytesArc + nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') + if nVal == (nRawArc & self._arcMask): + # the value we are looking for + if not (nRawArc & self._addrBitMask): + return int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big') + else: + return iAddrNode + int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesOffset], byteorder='big') + else: + # value not found + if (nRawArc & self._lastArcMask): + return None + iAddr = iEndArcAddr+self.nBytesNodeAddress if not (nRawArc & self._addrBitMask) else iEndArcAddr+self.nBytesOffset + + def _writeNodes3 (self, spfDest): + "for debugging only" + print(" > Write binary nodes") + with codecs.open(spfDest, 'w', 'utf-8', newline="\n") as hDst: + iAddr = 0 + hDst.write("i{:_>10} -- #{:_>10}\n".format("0", iAddr)) + while iAddr < len(self.byDic): + iEndArcAddr = iAddr+self.nBytesArc + nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') + nArc = nRawArc & self._arcMask + if not (nRawArc & self._addrBitMask): + iNextNodeAddr = int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big') + hDst.write(" {:<20} {:0>16} i{:>10} #{:_>10}\n".format(self.lArcVal[nArc], bin(nRawArc)[2:], "?", iNextNodeAddr)) + iAddr = iEndArcAddr+self.nBytesNodeAddress + else: + iNextNodeAddr = int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesOffset], byteorder='big') + hDst.write(" {:<20} {:0>16} i{:>10} +{:_>10}\n".format(self.lArcVal[nArc], bin(nRawArc)[2:], "?", iNextNodeAddr)) + iAddr = iEndArcAddr+self.nBytesOffset + if (nRawArc & self._lastArcMask): + hDst.write("\ni{:_>10} -- #{:_>10}\n".format("?", iAddr)) + hDst.close() ADDED graphspell/keyboard_chars_proximity.py Index: graphspell/keyboard_chars_proximity.py ================================================================== --- /dev/null +++ graphspell/keyboard_chars_proximity.py @@ -0,0 +1,220 @@ +# Keyboard chars proximity + + +def getKeyboardMap (sKeyboard): + return _dKeyboardMap.get(sKeyboard.lower(), {}) + + +def getKeyboardList (): + return _dKeyboardMap.keys() + + +_dKeyboardMap = { + # keyboards by alphabetical order + # bépo, colemak and dvorak users are assumed to do less typing errors. + "azerty": { + # fr + # line 1 + "é": "az", + "è": "yu", + "ç": "àio", + "à": "op", + # line 2 + "a": "zéq", + "z": "aesq", + "e": "zrds", + "r": "etfd", + "t": "rygf", + "y": "tuhg", + "u": "yijh", + "i": "uokj", + "o": "iplk", + "p": "oml", + # line 3 + "q": "sawz", + "s": "qdzwxe", + "d": "sfexcr", + "f": "dgrcvt", + "g": "fhtvby", + "h": "gjybnu", + "j": "hkuni", + "k": "jlio", + "l": "kmop", + "m": "lùp", + "ù": "m", + # line 4 + "w": "xqs", + "x": "wcsd", + "c": "xvdf", + "v": "cbfg", + "b": "vngh", + "n": "bhj", + }, + "bépo": { + # fr + # line 2 + "b": "éa", + "é": "bpu", + "p": "éoi", + "o": "pèe", + "è": "o", + "v": "dt", + "d": "vls", + "l": "djr", + "j": "lzn", + "z": "jmw", + # line 3 + "a": "ubà", + "u": "aiéy", + "i": "uepx", + "e": "io", + "c": "t", + "t": "csvq", + "s": "trdg", + "r": "snlh", + "n": "rmjf", + "m": "nzç", + # line 4 + "à": "yêa", + "y": "àxu", + "x": "ywi", + "w": "z", + "k": "c", + "q": "gt", + "g": "qhs", + "h": "gfr", + "f": "hçn", + "ç": "fm", + }, + "colemak": { + # en, us, intl + # line 2 + "q": "wa", + "w": "qfr", + "f": "wps", + "p": "fgt", + "g": "pjd", + "j": "glh", + "l": "jun", + "u": "lye", + "y": "ui", + # line 3 + "a": "rqz", + "r": "aswx", + "s": "rtfc", + "t": "sdpv", + "d": "thgb", + "h": "dnjk", + "n": "helm", + "e": "niu", + "i": "eoy", + "o": "i", + # line 4 + "z": "xa", + "x": "zcr", + "c": "xvs", + "v": "cbt", + "b": "vkd", + "k": "bmh", + "m": "kn", + }, + "dvorak": { + # en, us, intl + # line 2 + "p": "yu", + "y": "pfi", + "f": "ygd", + "g": "fch", + "c": "grt", + "r": "cln", + "l": "rs", + # line 3 + "a": "o", + "o": "aeq", + "e": "ouj", + "u": "eipk", + "i": "udyx", + "d": "ihfb", + "h": "dtgm", + "t": "hncw", + "n": "tsrv", + "s": "nlz", + # line 4 + "q": "jo", + "j": "qke", + "k": "jxu", + "x": "kbi", + "b": "xmd", + "m": "bwh", + "w": "mvt", + "v": "wzn", + "z": "vs", + }, + "qwerty": { + # en, us, intl + # line 2 + "q": "wa", + "w": "qeas", + "e": "wrds", + "r": "etfd", + "t": "rygf", + "y": "tuhg", + "u": "yijh", + "i": "uokj", + "o": "iplk", + "p": "ol", + # line 3 + "a": "sqzw", + "s": "adwzxe", + "d": "sfexcr", + "f": "dgrcvt", + "g": "fhtvby", + "h": "gjybnu", + "j": "hkunmi", + "k": "jlimo", + "l": "kop", + # line 4 + "z": "xas", + "x": "zcsd", + "c": "xvdf", + "v": "cbfg", + "b": "vngh", + "n": "bmhj", + "m": "njk", + }, + "qwertz": { + # ge, au + # line 2 + "q": "wa", + "w": "qeas", + "e": "wrds", + "r": "etfd", + "t": "rzgf", + "z": "tuhg", + "u": "zijh", + "i": "uokj", + "o": "iplk", + "p": "oüöl", + "ü": "päö", + # line 3 + "a": "sqyw", + "s": "adwyxe", + "d": "sfexcr", + "f": "dgrcvt", + "g": "fhtvbz", + "h": "gjzbnu", + "j": "hkunmi", + "k": "jlimo", + "l": "köop", + "ö": "läpü", + "ä": "öü", + # line 4 + "y": "xas", + "x": "ycsd", + "c": "xvdf", + "v": "cbfg", + "b": "vngh", + "n": "bmhj", + "m": "njk", + } +} ADDED graphspell/progressbar.py Index: graphspell/progressbar.py ================================================================== --- /dev/null +++ graphspell/progressbar.py @@ -0,0 +1,35 @@ +# Textual progressbar +# by Olivier R. +# License: MPL 2 + +import time + +class ProgressBar: + "Textual progressbar" + + def __init__ (self, nMin=0, nMax=100, nWidth=78): + "initiate with minimum nMin to maximum nMax" + self.nMin = nMin + self.nMax = nMax + self.nSpan = nMax - nMin + self.nWidth = nWidth-9 + self.nAdvance = -1 + self.nCurVal = nMin + self.startTime = time.time() + self._update() + + def _update (self): + fDone = ((self.nCurVal - self.nMin) / self.nSpan) + nAdvance = int(fDone * self.nWidth) + if (nAdvance > self.nAdvance): + self.nAdvance = nAdvance + print("\r[ {}{} {}% ] ".format('>'*nAdvance, ' '*(self.nWidth-nAdvance), round(fDone*100)), end="") + + def increment (self, n=1): + "increment value by n (1 by default)" + self.nCurVal += n + self._update() + + def done (self): + "to call when it’s finished" + print("\r[ task done in {:.1f} s ] ".format(time.time() - self.startTime)) ADDED graphspell/spellchecker.py Index: graphspell/spellchecker.py ================================================================== --- /dev/null +++ graphspell/spellchecker.py @@ -0,0 +1,134 @@ +# Spellchecker +# Wrapper for the IBDAWG class. +# Useful to check several dictionaries at once. + +from . import ibdawg + + +dDictionaries = { + "fr": "French.bdic", + "en": "English.bdic" +} + + +class Spellchecker (): + + def __init__ (self, sLangCode): + self.sLangCode = sLangCode + self.oMainDic = None + if sLangCode in dDictionaries: + self.oMainDic = ibdawg.IBDAWG(dDictionaries[sLangCode]) + self.lOtherDic = [] + return bool(self.oMainDic) + + + def setMainDictionary (self, sDicName): + try: + self.oMainDic = ibdawg.IBDAWG(sDicName) + return True + except: + print("Error: <" + sDicName + "> not set as main dictionary.") + return False + + def addDictionary (self, sDicName): + try: + self.lOtherDic.append(ibdawg.IBDAWG(sDicName)) + return True + except: + print("Error: <" + sDicName + "> not added to the list.") + return False + + # Return codes: + # 0: invalid + # 1: correct in main dictionary + # 2+: correct in foreign dictionaries + + + # check in the main dictionary only + + def isValidToken (self, sToken): + "(in main dictionary) checks if sToken is valid (if there is hyphens in sToken, sToken is split, each part is checked)" + if self.oMainDic.isValidToken(sToken): + return 1 + return 0 + + def isValid (self, sWord): + "(in main dictionary) checks if sWord is valid (different casing tested if the first letter is a capital)" + if self.oMainDic.isValid(sWord): + return 1 + return 0 + + def lookup (self, sWord): + "(in main dictionary) checks if sWord is in dictionary as is (strict verification)" + if self.oMainDic.lookup(sWord): + return 1 + return 0 + + + # check in all dictionaries + + def isValidTokenAll (self, sToken): + "(in all dictionaries) checks if sToken is valid (if there is hyphens in sToken, sToken is split, each part is checked)" + if self.oMainDic.isValidToken(sToken): + return 1 + for i, oDic in enumerate(self.lOtherDic, 2): + if oDic.isValidToken(sToken): + return i + return 0 + + def isValidAll (self, sWord): + "(in all dictionaries) checks if sWord is valid (different casing tested if the first letter is a capital)" + if self.oMainDic.isValid(sToken): + return 1 + for i, oDic in enumerate(self.lOtherDic, 2): + if oDic.isValid(sToken): + return i + return 0 + + def lookupAll (self, sWord): + "(in all dictionaries) checks if sWord is in dictionary as is (strict verification)" + if self.oMainDic.lookup(sToken): + return 1 + for i, oDic in enumerate(self.lOtherDic, 2): + if oDic.lookup(sToken): + return i + return 0 + + + # check in dictionaries up to level n + + def isValidTokenLevel (self, sToken, nLevel): + "(in dictionaries up to level n) checks if sToken is valid (if there is hyphens in sToken, sToken is split, each part is checked)" + if self.oMainDic.isValidToken(sToken): + return 1 + if nLevel >= 2: + for i, oDic in enumerate(self.lOtherDic, 2): + if oDic.isValidToken(sToken): + return i + if i == nLevel: + break + return 0 + + def isValidLevel (self, sWord, nLevel): + "(in dictionaries up to level n) checks if sWord is valid (different casing tested if the first letter is a capital)" + if self.oMainDic.isValid(sToken): + return 1 + if nLevel >= 2: + for i, oDic in enumerate(self.lOtherDic, 2): + if oDic.isValid(sToken): + return i + if i == nLevel: + break + return 0 + + def lookupLevel (self, sWord, nLevel): + "(in dictionaries up to level n) checks if sWord is in dictionary as is (strict verification)" + if self.oMainDic.lookup(sToken): + return 1 + if nLevel >= 2: + for i, oDic in enumerate(self.lOtherDic, 2): + if oDic.lookup(sToken): + return i + if i == nLevel: + break + return 0 ADDED graphspell/str_transform.py Index: graphspell/str_transform.py ================================================================== --- /dev/null +++ graphspell/str_transform.py @@ -0,0 +1,203 @@ +#!python3 + + +#### DISTANCE CALCULATIONS + +def longestCommonSubstring (s1, s2): + # http://en.wikipedia.org/wiki/Longest_common_substring_problem + # http://en.wikibooks.org/wiki/Algorithm_implementation/Strings/Longest_common_substring + M = [ [0]*(1+len(s2)) for i in range(1+len(s1)) ] + longest, x_longest = 0, 0 + for x in range(1, 1+len(s1)): + for y in range(1, 1+len(s2)): + if s1[x-1] == s2[y-1]: + M[x][y] = M[x-1][y-1] + 1 + if M[x][y] > longest: + longest = M[x][y] + x_longest = x + else: + M[x][y] = 0 + return s1[x_longest-longest : x_longest] + + +def distanceDamerauLevenshtein (s1, s2): + "distance of Damerau-Levenshtein between and " + # https://fr.wikipedia.org/wiki/Distance_de_Damerau-Levenshtein + d = {} + nLen1 = len(s1) + nLen2 = len(s2) + for i in range(-1, nLen1+1): + d[i, -1] = i + 1 + for j in range(-1, nLen2+1): + d[-1, j] = j + 1 + for i in range(nLen1): + for j in range(nLen2): + nCost = 0 if s1[i] == s2[j] else 1 + d[i, j] = min( + d[i-1, j] + 1, # Deletion + d[i, j-1] + 1, # Insertion + d[i-1, j-1] + nCost, # Substitution + ) + if i and j and s1[i] == s2[j-1] and s1[i-1] == s2[j]: + d[i, j] = min(d[i, j], d[i-2, j-2] + nCost) # Transposition + return d[nLen1-1, nLen2-1] + + +def distanceSift4 (s1, s2, nMaxOffset=5): + "implementation of general Sift4." + # https://siderite.blogspot.com/2014/11/super-fast-and-accurate-string-distance.html + if not s1: + return len(s2) + if not s2: + return len(s1) + nLen1, nLen2 = len(s1), len(s2) + i1, i2 = 0, 0 # Cursors for each string + nLargestCS = 0 # Largest common substring + nLocalCS = 0 # Local common substring + nTrans = 0 # Number of transpositions ('ab' vs 'ba') + lOffset = [] # Offset pair array, for computing the transpositions + + while i1 < nLen1 and i2 < nLen2: + if s1[i1] == s2[i2]: + nLocalCS += 1 + # Check if current match is a transposition + bTrans = False + i = 0 + while i < len(lOffset): + t = lOffset[i] + if i1 <= t[0] or i2 <= t[1]: + bTrans = abs(i2-i1) >= abs(t[1] - t[0]) + if bTrans: + nTrans += 1 + elif not t[2]: + t[2] = True + nTrans += 1 + break + elif i1 > t[1] and i2 > t[0]: + del lOffset[i] + else: + i += 1 + lOffset.append([i1, i2, bTrans]) + else: + nLargestCS += nLocalCS + nLocalCS = 0 + if i1 != i2: + i1 = i2 = min(i1, i2) + for i in range(nMaxOffset): + if i1 + i >= nLen1 and i2 + i >= nLen2: + break + elif i1 + i < nLen1 and s1[i1+i] == s2[i2]: + i1 += i - 1 + i2 -= 1 + break + elif i2 + i < nLen2 and s1[i1] == s2[i2+i]: + i2 += i - 1 + i1 -= 1 + break + i1 += 1 + i2 += 1 + if i1 >= nLen1 or i2 >= nLen2: + nLargestCS += nLocalCS + nLocalCS = 0 + i1 = i2 = min(i1, i2) + nLargestCS += nLocalCS + return round(max(nLen1, nLen2) - nLargestCS + nTrans) + + +def showDistance (s1, s2): + print("Damerau-Levenshtein: " + s1 + "/" + s2 + " = " + distanceDamerauLevenshtein(s1, s2)) + print("Sift4:" + s1 + "/" + s2 + " = " + distanceSift4(s1, s2)) + + + + +#### STEMMING OPERATIONS + +## No stemming + +def noStemming (sFlex, sStem): + return sStem + +def rebuildWord (sFlex, cmd1, cmd2): + if cmd1 == "_": + return sFlex + n, c = cmd1.split(":") + s = s[:n] + c + s[n:] + if cmd2 == "_": + return s + n, c = cmd2.split(":") + return s[:n] + c + s[n:] + + +## Define affixes for stemming + +# Note: 48 is the ASCII code for "0" + + +# Suffix only +def defineSuffixCode (sFlex, sStem): + """ Returns a string defining how to get stem from flexion + "n(sfx)" + with n: a char with numeric meaning, "0" = 0, "1" = 1, ... ":" = 10, etc. (See ASCII table.) Says how many letters to strip from flexion. + sfx [optional]: string to add on flexion + Examples: + "0": strips nothing, adds nothing + "1er": strips 1 letter, adds "er" + "2": strips 2 letters, adds nothing + """ + if sFlex == sStem: + return "0" + jSfx = 0 + for i in range(min(len(sFlex), len(sStem))): + if sFlex[i] != sStem[i]: + break + jSfx += 1 + return chr(len(sFlex)-jSfx+48) + sStem[jSfx:] + + +def changeWordWithSuffixCode (sWord, sSfxCode): + if sSfxCode == "0": + return sWord + return sWord[:-(ord(sSfxCode[0])-48)] + sSfxCode[1:] if sSfxCode[0] != '0' else sWord + sSfxCode[1:] + + +# Prefix and suffix + +def defineAffixCode (sFlex, sStem): + """ Returns a string defining how to get stem from flexion. Examples: + "0" if stem = flexion + "stem" if no common substring + "n(pfx)/m(sfx)" + with n and m: chars with numeric meaning, "0" = 0, "1" = 1, ... ":" = 10, etc. (See ASCII table.) Says how many letters to strip from flexion. + pfx [optional]: string to add before the flexion + sfx [optional]: string to add after the flexion + """ + if sFlex == sStem: + return "0" + # is stem a substring of flexion? + n = sFlex.find(sStem) + if n >= 0: + return "{}/{}".format(chr(n+48), chr(len(sFlex)-(len(sStem)+n)+48)) + # no, so we are looking for common substring + sSubs = longestCommonSubstring(sFlex, sStem) + if len(sSubs) > 1: + iPos = sStem.find(sSubs) + sPfx = sStem[:iPos] + sSfx = sStem[iPos+len(sSubs):] + n = sFlex.find(sSubs) + m = len(sFlex) - (len(sSubs)+n) + sAff = "{}/".format(chr(n+48)) if not sPfx else "{}{}/".format(chr(n+48), sPfx) + sAff += chr(m+48) if not sSfx else "{}{}".format(chr(m+48), sSfx) + return sAff + return sStem + + +def changeWordWithAffixCode (sWord, sAffCode): + if sAffCode == "0": + return sWord + if '/' not in sAffCode: + return "# error #" + sPfxCode, sSfxCode = sAffCode.split('/') + sWord = sPfxCode[1:] + sWord[(ord(sPfxCode[0])-48):] + return sWord[:-(ord(sSfxCode[0])-48)] + sSfxCode[1:] if sSfxCode[0] != '0' else sWord + sSfxCode[1:] + ADDED graphspell/tokenizer.py Index: graphspell/tokenizer.py ================================================================== --- /dev/null +++ graphspell/tokenizer.py @@ -0,0 +1,49 @@ +# Very simple tokenizer + +import re + +_PATTERNS = { + "default": + ( + r'(?P/(?:bin|boot|dev|etc|home|lib|mnt|opt|root|sbin|tmp|usr|var|Bureau|Documents|Images|Musique|Public|Téléchargements|Vidéos)(?:/[\w.()-]+)*)', + r'(?P[a-zA-Z]:\\(?:Program Files(?: [(]x86[)]|)|[\w.()]+)(?:\\[\w.()-]+)*)', + r'(?P[.,?!:;…«»“”"()/·]+)', + r'(?P[A-Z][.][A-Z][.](?:[A-Z][.])*)', + r'(?P(?:https?://|www[.]|\w+[@.]\w\w+[@.])\w[\w./?&!%=+*"\'@$#-]+)', + r'(?P[#@][\w-]+)', + r'(?P<\w+.*?>|)', + r'(?P\[/?\w+\])', + r'(?P\d\d?h\d\d\b)', + r'(?P-?\d+(?:[.,]\d+))', + r"(?P\w+(?:[’'`-]\w+)*)" + ), + "fr": + ( + r'(?P/(?:bin|boot|dev|etc|home|lib|mnt|opt|root|sbin|tmp|usr|var|Bureau|Documents|Images|Musique|Public|Téléchargements|Vidéos)(?:/[\w.()-]+)*)', + r'(?P[a-zA-Z]:\\(?:Program Files(?: [(]x86[)]|)|[\w.()]+)(?:\\[\w.()-]+)*)', + r'(?P[.,?!:;…«»“”"()/·]+)', + r'(?P[A-Z][.][A-Z][.](?:[A-Z][.])*)', + r'(?P(?:https?://|www[.]|\w+[@.]\w\w+[@.])\w[\w./?&!%=+*"\'@$#-]+)', + r'(?P[#@][\w-]+)', + r'(?P<\w+.*?>|)', + r'(?P\[/?\w+\])', + r"(?P(?:l|d|n|m|t|s|j|c|ç|lorsqu|puisqu|jusqu|quoiqu|qu)['’`])", + r'(?P\d+(?:er|nd|e|de|ième|ème|eme)\b)', + r'(?P\d\d?h\d\d\b)', + r'(?P-?\d+(?:[.,]\d+|))', + r"(?P\w+(?:[’'`-]\w+)*)" + ) +} + + +class Tokenizer: + + def __init__ (self, sLang): + self.sLang = sLang + if sLang not in _PATTERNS: + self.sLang = "default" + self.zToken = re.compile( "(?i)" + '|'.join(sRegex for sRegex in _PATTERNS[sLang]) ) + + def genTokens (self, sText): + for m in self.zToken.finditer(sText): + yield { "sType": m.lastgroup, "sValue": m.group(), "nStart": m.start(), "nEnd": m.end() } Index: helpers.py ================================================================== --- helpers.py +++ helpers.py @@ -1,9 +1,10 @@ # Useful tools import os import shutil +import errno import zipfile from string import Template @@ -55,10 +56,20 @@ if not os.path.exists(sp): os.makedirs(sp, exist_ok=True) else: eraseFolder(sp) + +def copyFolderContent (spSrc, spDst): + try: + shutil.copytree(spSrc, spDst) + except OSError as e: + if e.errno == errno.ENOTDIR: + shutil.copy(spSrc, spDst) + else: + raise + def fileFile (spf, dVars): "return file as a text filed with variables from " return Template(open(spf, "r", encoding="utf-8").read()).safe_substitute(dVars) ADDED js_extension/map.js Index: js_extension/map.js ================================================================== --- /dev/null +++ js_extension/map.js @@ -0,0 +1,56 @@ + +// Map +/*jslint esversion: 6*/ + +if (Map.prototype.grammalecte === undefined) { + Map.prototype.gl_shallowCopy = function () { + let oNewMap = new Map(); + for (let [key, val] of this.entries()) { + oNewMap.set(key, val); + } + return oNewMap; + }; + + Map.prototype.gl_get = function (key, defaultValue) { + let res = this.get(key); + if (res !== undefined) { + return res; + } + return defaultValue; + }; + + Map.prototype.gl_toString = function () { + // Default .toString() gives nothing useful + let sRes = "{ "; + for (let [k, v] of this.entries()) { + sRes += (typeof k === "string") ? '"' + k + '": ' : k.toString() + ": "; + sRes += (typeof v === "string") ? '"' + v + '", ' : v.toString() + ", "; + } + sRes = sRes.slice(0, -2) + " }"; + return sRes; + }; + + Map.prototype.gl_update = function (dDict) { + for (let [k, v] of dDict.entries()) { + this.set(k, v); + } + }; + + Map.prototype.gl_updateOnlyExistingKeys = function (dDict) { + for (let [k, v] of dDict.entries()) { + if (this.has(k)){ + this.set(k, v); + } + } + }; + + Map.prototype.gl_reverse = function () { + let dNewMap = new Map(); + this.forEach((val, key) => { + dNewMap.set(val, key); + }); + return dNewMap; + }; + + Map.prototype.grammalecte = true; +} ADDED js_extension/regex.js Index: js_extension/regex.js ================================================================== --- /dev/null +++ js_extension/regex.js @@ -0,0 +1,90 @@ + +// regex +/*jslint esversion: 6*/ + +if (RegExp.prototype.grammalecte === undefined) { + RegExp.prototype.gl_exec2 = function (sText, aGroupsPos, aNegLookBefore=null) { + let m; + while ((m = this.exec(sText)) !== null) { + // we have to iterate over sText here too + // because first match doesn’t imply it’s a valid match according to negative lookbefore assertions, + // and even if first match is finally invalid, it doesn’t mean the following eligible matchs would be invalid too. + if (aNegLookBefore !== null) { + // check negative look before assertions + if ( !aNegLookBefore.some(sRegEx => (RegExp.leftContext.search(sRegEx) >= 0)) ) { + break; + } + } else { + break; + } + } + if (m === null) { + return null; + } + + let codePos; + let iPos = 0; + m.start = [m.index]; + m.end = [this.lastIndex]; + try { + if (m.length > 1) { + // there is subgroup(s) + if (aGroupsPos !== null) { + // aGroupsPos is defined + for (let i = 1; i <= m.length-1; i++) { + codePos = aGroupsPos[i-1]; + if (typeof codePos === "number") { + // position as a number + m.start.push(m.index + codePos); + m.end.push(m.index + codePos + m[i].length); + } else if (codePos === "$") { + // at the end of the pattern + m.start.push(this.lastIndex - m[i].length); + m.end.push(this.lastIndex); + } else if (codePos === "w") { + // word in the middle of the pattern + iPos = m[0].search("[ ’,()«»“”]"+m[i]+"[ ,’()«»“”]") + 1 + m.index; + m.start.push(iPos); + m.end.push(iPos + m[i].length); + } else if (codePos === "*") { + // anywhere + iPos = m[0].indexOf(m[i]) + m.index; + m.start.push(iPos); + m.end.push(iPos + m[i].length); + } else if (codePos === "**") { + // anywhere after previous group + iPos = m[0].indexOf(m[i], m.end[i-1]-m.index) + m.index; + m.start.push(iPos); + m.end.push(iPos + m[i].length); + } else if (codePos.startsWith(">")) { + // >x:_ + // todo: look in substring x + iPos = m[0].indexOf(m[i]) + m.index; + m.start.push(iPos); + m.end.push(iPos + m[i].length); + } else { + console.error("# Error: unknown positioning code in regex [" + this.source + "], for group[" + i.toString() +"], code: [" + codePos + "]"); + } + } + } else { + // no aGroupsPos + for (let subm of m.slice(1)) { + iPos = m[0].indexOf(subm) + m.index; + m.start.push(iPos); + m.end.push(iPos + subm.length); + } + } + } + } + catch (e) { + if (typeof(helpers) !== "undefined") { + helpers.logerror(e); + } else { + console.error(e); + } + } + return m; + }; + + RegExp.prototype.grammalecte = true; +} ADDED js_extension/set.js Index: js_extension/set.js ================================================================== --- /dev/null +++ js_extension/set.js @@ -0,0 +1,13 @@ + +// Set +/*jslint esversion: 6*/ + +if (Set.prototype.grammalecte === undefined) { + Set.prototype.gl_update = function (aSet) { + for (let elem of aSet) { + this.add(elem); + } + }; + + Set.prototype.grammalecte = true; +} ADDED js_extension/string.js Index: js_extension/string.js ================================================================== --- /dev/null +++ js_extension/string.js @@ -0,0 +1,58 @@ + +// String +/*jslint esversion: 6*/ + +if (String.prototype.grammalecte === undefined) { + String.prototype.gl_count = function (sSearch, bOverlapping) { + // http://jsperf.com/string-ocurrence-split-vs-match/8 + if (sSearch.length <= 0) { + return this.length + 1; + } + let nOccur = 0; + let iPos = 0; + let nStep = (bOverlapping) ? 1 : sSearch.length; + while ((iPos = this.indexOf(sSearch, iPos)) >= 0) { + nOccur++; + iPos += nStep; + } + return nOccur; + }; + String.prototype.gl_isDigit = function () { + return (this.search(/^[0-9⁰¹²³⁴⁵⁶⁷⁸⁹]+$/) !== -1); + }; + String.prototype.gl_isLowerCase = function () { + return (this.search(/^[a-zà-öø-ÿ0-9-]+$/) !== -1); + }; + String.prototype.gl_isUpperCase = function () { + return (this.search(/^[A-ZÀ-ÖØ-ߌ0-9-]+$/) !== -1); + }; + String.prototype.gl_isTitle = function () { + return (this.search(/^[A-ZÀ-ÖØ-ߌ][a-zà-öø-ÿ'’-]+$/) !== -1); + }; + String.prototype.gl_toCapitalize = function () { + return this.slice(0,1).toUpperCase() + this.slice(1).toLowerCase(); + }; + String.prototype.gl_expand = function (oMatch) { + let sNew = this; + for (let i = 0; i < oMatch.length ; i++) { + let z = new RegExp("\\\\"+parseInt(i), "g"); + sNew = sNew.replace(z, oMatch[i]); + } + return sNew; + }; + String.prototype.gl_trimRight = function (sChars) { + let z = new RegExp("["+sChars+"]+$"); + return this.replace(z, ""); + }; + String.prototype.gl_trimLeft = function (sChars) { + let z = new RegExp("^["+sChars+"]+"); + return this.replace(z, ""); + }; + String.prototype.gl_trim = function (sChars) { + let z1 = new RegExp("^["+sChars+"]+"); + let z2 = new RegExp("["+sChars+"]+$"); + return this.replace(z1, "").replace(z2, ""); + }; + + String.prototype.grammalecte = true; +} Index: lex_build.py ================================================================== --- lex_build.py +++ lex_build.py @@ -3,24 +3,24 @@ # Lexicon builder import argparse from distutils import dir_util -import grammalecte.dawg as fsa -from grammalecte.ibdawg import IBDAWG +import graphspell.dawg as fsa +from graphspell.ibdawg import IBDAWG def build (spfSrc, sLangName, sDicName, bJSON=False, cStemmingMethod="S", nCompressMethod=1): "transform a text lexicon as a binary indexable dictionary" oDAWG = fsa.DAWG(spfSrc, sLangName, cStemmingMethod) - dir_util.mkpath("grammalecte/_dictionaries") - oDAWG.writeInfo("grammalecte/_dictionaries/" + sDicName + ".info.txt") - oDAWG.createBinary("grammalecte/_dictionaries/" + sDicName + ".bdic", int(nCompressMethod)) + dir_util.mkpath("graphspell/_dictionaries") + oDAWG.writeInfo("graphspell/_dictionaries/" + sDicName + ".info.txt") + oDAWG.createBinary("graphspell/_dictionaries/" + sDicName + ".bdic", int(nCompressMethod)) if bJSON: - dir_util.mkpath("grammalecte-js/_dictionaries") + dir_util.mkpath("graphspell-js/_dictionaries") oDic = IBDAWG(sDicName + ".bdic") - oDic.writeAsJSObject("grammalecte-js/_dictionaries/" + sDicName + ".json", bBinaryDictAsHexString=True) + oDic.writeAsJSObject("graphspell-js/_dictionaries/" + sDicName + ".json", bBinaryDictAsHexString=True) def main (): xParser = argparse.ArgumentParser() xParser.add_argument("src_lexicon", type=str, help="path and file name of the source lexicon") Index: make.py ================================================================== --- make.py +++ make.py @@ -165,14 +165,17 @@ def copyGrammalectePyPackageInZipFile (hZip, spLangPack, sDicName, sAddPath=""): for sf in os.listdir("grammalecte"): if not os.path.isdir("grammalecte/"+sf): hZip.write("grammalecte/"+sf, sAddPath+"grammalecte/"+sf) + for sf in os.listdir("grammalecte/graphspell"): + if not os.path.isdir("grammalecte/graphspell/"+sf): + hZip.write("grammalecte/graphspell/"+sf, sAddPath+"grammalecte/graphspell/"+sf) + hZip.write("grammalecte/graphspell/_dictionaries/"+sDicName, sAddPath+"grammalecte/graphspell/_dictionaries/"+sDicName) for sf in os.listdir(spLangPack): if not os.path.isdir(spLangPack+"/"+sf): hZip.write(spLangPack+"/"+sf, sAddPath+spLangPack+"/"+sf) - hZip.write("grammalecte/_dictionaries/"+sDicName, sAddPath+"grammalecte/_dictionaries/"+sDicName) def create (sLang, xConfig, bInstallOXT, bJavaScript): oNow = datetime.datetime.now() print("============== MAKE GRAMMALECTE [{0}] at {1.hour:>2} h {1.minute:>2} min {1.second:>2} s ==============".format(sLang, oNow)) @@ -203,10 +206,16 @@ sCodePlugins += "\n\n" + open(spLang+'/modules/'+sf, "r", encoding="utf-8").read() print(sf, end=", ") print() dVars["plugins"] = sCodePlugins + ## COPY GC_CORE COMMON FILES + for sf in os.listdir("gc_core/py"): + if not os.path.isdir("gc_core/py/"+sf): + helpers.copyAndFileTemplate("gc_core/py/"+sf, "grammalecte/"+sf, dVars) + open("grammalecte/WARNING.txt", "w", encoding="utf-8", newline="\n").write(sWarningMessage) + ## CREATE GRAMMAR CHECKER PACKAGE spLangPack = "grammalecte/"+sLang helpers.createCleanFolder(spLangPack) for sf in os.listdir("gc_core/py/lang_core"): if not os.path.isdir("gc_core/py/lang_core/"+sf): @@ -248,13 +257,12 @@ # create folder spLangPack = "grammalecte-js/"+sLang helpers.createCleanFolder(spLangPack) # create files - for sf in os.listdir("gc_core/js"): - if not os.path.isdir("gc_core/js/"+sf) and sf.startswith("jsex_"): - dVars[sf[5:-3]] = open("gc_core/js/"+sf, "r", encoding="utf-8").read() + for sf in os.listdir("js_extension"): + dVars[sf[:-3]] = open("js_extension/"+sf, "r", encoding="utf-8").read() for sf in os.listdir("gc_core/js"): if not os.path.isdir("gc_core/js/"+sf) and not sf.startswith("jsex_"): helpers.copyAndFileTemplate("gc_core/js/"+sf, "grammalecte-js/"+sf, dVars) open("grammalecte-js/WARNING.txt", "w", encoding="utf-8", newline="\n").write(sWarningMessage) for sf in os.listdir("gc_core/js/lang_core"): @@ -274,10 +282,35 @@ else: build_module.build(sLang, dVars, spLangPack) return dVars['version'] + +def copyGraphspellCore (bJavaScript=False): + helpers.createCleanFolder("grammalecte/graphspell") + dir_util.mkpath("grammalecte/graphspell/_dictionaries") + for sf in os.listdir("graphspell"): + if not os.path.isdir("graphspell/"+sf): + file_util.copy_file("graphspell/"+sf, "grammalecte/graphspell") + if bJavaScript: + helpers.createCleanFolder("grammalecte-js/graphspell") + dir_util.mkpath("grammalecte-js/graphspell/_dictionaries") + dVars = {} + for sf in os.listdir("js_extension"): + dVars[sf[:-3]] = open("js_extension/"+sf, "r", encoding="utf-8").read() + for sf in os.listdir("graphspell-js"): + if not os.path.isdir("graphspell-js/"+sf): + file_util.copy_file("graphspell-js/"+sf, "grammalecte-js/graphspell") + helpers.copyAndFileTemplate("graphspell-js/"+sf, "grammalecte-js/graphspell/"+sf, dVars) + + +def copyGraphspellDictionary (sDicName, bJavaScript=False): + file_util.copy_file("graphspell/_dictionaries/"+sDicName.strip()+".bdic", "grammalecte/graphspell/_dictionaries") + file_util.copy_file("graphspell/_dictionaries/"+sDicName.strip()+".info.txt", "grammalecte/graphspell/_dictionaries") + if bJavaScript: + file_util.copy_file("graphspell-js/_dictionaries/"+sDicName.strip()+".json", "grammalecte-js/graphspell/_dictionaries") + def main (): print("Python: " + sys.version) xParser = argparse.ArgumentParser() xParser.add_argument("lang", type=str, nargs='+', help="lang project to generate (name of folder in /lang)") @@ -299,23 +332,20 @@ xArgs.build_data_before = True xArgs.build_data_after = True dir_util.mkpath("_build") dir_util.mkpath("grammalecte") - dir_util.mkpath("grammalecte-js") + if xArgs.javascript: + dir_util.mkpath("grammalecte-js") + + copyGraphspellCore(xArgs.javascript) for sLang in xArgs.lang: if os.path.exists("gc_lang/"+sLang) and os.path.isdir("gc_lang/"+sLang): xConfig = getConfig(sLang) dVars = xConfig._sections['args'] - # copy gc_core common file in Python now to be able to compile dictionary if required - for sf in os.listdir("gc_core/py"): - if not os.path.isdir("gc_core/py/"+sf): - helpers.copyAndFileTemplate("gc_core/py/"+sf, "grammalecte/"+sf, dVars) - open("grammalecte/WARNING.txt", "w", encoding="utf-8", newline="\n").write(sWarningMessage) - # build data build_data_module = None if xArgs.build_data_before or xArgs.build_data_after: # lang data try: @@ -322,16 +352,20 @@ build_data_module = importlib.import_module("gc_lang."+sLang+".build_data") except ImportError: print("# Error. Couldn’t import file build_data.py in folder gc_lang/"+sLang) if build_data_module and xArgs.build_data_before: build_data_module.before('gc_lang/'+sLang, dVars, xArgs.javascript) - if xArgs.dict or not os.path.exists("grammalecte/_dictionaries"): + if xArgs.dict: import lex_build lex_build.build(dVars['lexicon_src'], dVars['lang_name'], dVars['dic_name'], xArgs.javascript, dVars['stemming_method'], int(dVars['fsa_method'])) if build_data_module and xArgs.build_data_after: build_data_module.after('gc_lang/'+sLang, dVars, xArgs.javascript) + # copy dictionaries from Graphspell + for sDicName in dVars['dic_name'].split(","): + copyGraphspellDictionary(sDicName, xArgs.javascript) + # make sVersion = create(sLang, xConfig, xArgs.install, xArgs.javascript, ) # tests if xArgs.tests or xArgs.perf or xArgs.perf_memo: Index: reader.py ================================================================== --- reader.py +++ reader.py @@ -3,11 +3,11 @@ import os import sys import re -import grammalecte.ibdawg as ibdawg +import graphspell.ibdawg as ibdawg oDict = ibdawg.IBDAWG("French.bdic") def readFile (spf):