Index: gc_core/js/lang_core/gc_engine.js ================================================================== --- gc_core/js/lang_core/gc_engine.js +++ gc_core/js/lang_core/gc_engine.js @@ -321,13 +321,13 @@ load: function (sContext="JavaScript", sPath="") { try { if (typeof(require) !== 'undefined') { var ibdawg = require("resource://grammalecte/graphspell/ibdawg.js"); - _oDict = new ibdawg.IBDAWG("${dic_name}.json"); + _oDict = new ibdawg.IBDAWG("${dic_filename}.json"); } else { - _oDict = new IBDAWG("${dic_name}.json", sPath); + _oDict = new IBDAWG("${dic_filename}.json", sPath); } _sAppContext = sContext; _dOptions = gc_options.getOptions(sContext).gl_shallowCopy(); // duplication necessary, to be able to reset to default } catch (e) { Index: gc_core/py/lang_core/gc_engine.py ================================================================== --- gc_core/py/lang_core/gc_engine.py +++ gc_core/py/lang_core/gc_engine.py @@ -290,11 +290,11 @@ def load (sContext="Python"): global _oDict global _sAppContext global _dOptions try: - _oDict = IBDAWG("${dic_name}.bdic") + _oDict = IBDAWG("${dic_filename}.bdic") _sAppContext = sContext _dOptions = dict(gc_options.getOptions(sContext)) # duplication necessary, to be able to reset to default except: traceback.print_exc() Index: gc_lang/fr/build.py ================================================================== --- gc_lang/fr/build.py +++ gc_lang/fr/build.py @@ -77,11 +77,11 @@ "create extension for Thunderbird" print("Building extension for Thunderbird") sExtensionName = dVars['tb_identifier'] + "-v" + dVars['version'] + '.xpi' spfZip = "_build/" + sExtensionName hZip = zipfile.ZipFile(spfZip, mode='w', compression=zipfile.ZIP_DEFLATED) - _copyGrammalecteJSPackageInZipFile(hZip, spLangPack, dVars['dic_name']+".json") + _copyGrammalecteJSPackageInZipFile(hZip, spLangPack, dVars['dic_filename']+".json") for spf in ["LICENSE.txt", "LICENSE.fr.txt"]: hZip.write(spf) dVars = _createOptionsForThunderbird(dVars) helpers.addFolderToZipAndFileFile(hZip, "gc_lang/"+sLang+"/tb", "", dVars, True) spDict = "gc_lang/"+sLang+"/xpi/data/dictionaries" Index: gc_lang/fr/config.ini ================================================================== --- gc_lang/fr/config.ini +++ gc_lang/fr/config.ini @@ -14,12 +14,14 @@ extras = README_fr.txt logo = logo.png # lexicon source lexicon_src = lexicons/French.lex +# binary dictionary file name +dic_filename = fr # binary dictionary name -dic_name = fr +dic_name = French # Finite state automaton compression: 1, 2 (experimental) or 3 (experimental) fsa_method = 1 # stemming method: S for suffixes only, A for prefixes and suffixes stemming_method = S Index: gc_lang/fr/modules/tests.py ================================================================== --- gc_lang/fr/modules/tests.py +++ gc_lang/fr/modules/tests.py @@ -22,11 +22,11 @@ class TestDictionary (unittest.TestCase): @classmethod def setUpClass (cls): - cls.oDic = IBDAWG("${dic_name}.bdic") + cls.oDic = IBDAWG("${dic_filename}.bdic") def test_lookup (self): for sWord in ["branche", "Émilie"]: self.assertTrue(self.oDic.lookup(sWord), sWord) Index: gc_lang/fr/webext/panel/lex_editor.js ================================================================== --- gc_lang/fr/webext/panel/lex_editor.js +++ gc_lang/fr/webext/panel/lex_editor.js @@ -598,11 +598,11 @@ }, build: function (lEntry) { oWidgets.showElement("build_progress"); let xProgressNode = document.getElementById("build_progress"); - let oDAWG = new DAWG(lEntry, "Français - dictionnaire personnel", "S", xProgressNode); + let oDAWG = new DAWG(lEntry, "S", "fr", "Français", "Dictionnaire personnel", xProgressNode); this.oJSON = oDAWG.createBinary(1); this.save(); oWidgets.hideElement("build_progress"); oWidgets.showElement("export_button"); }, Index: graphspell-js/dawg.js ================================================================== --- graphspell-js/dawg.js +++ graphspell-js/dawg.js @@ -26,11 +26,11 @@ A word is a list of numbers [ c1, c2, c3 . . . cN, iAffix, iTags] Each arc is an index in this.lArcVal, where are stored characters, suffix/affix codes for stemming and tags. Important: As usual, the last node (after ‘iTags’) is tagged final, AND the node after ‘cN’ is ALSO tagged final. */ - constructor (lEntrySrc, sLangCode, sLangName, sDicName, cStemming, xProgressBarNode=null) { + constructor (lEntrySrc, cStemming, sLangCode, sLangName="", sDicName="", xProgressBarNode=null) { console.log("===== Direct Acyclic Word Graph - Minimal Acyclic Finite State Automaton ====="); let funcStemmingGen = null; switch (cStemming.toUpperCase()) { case "A": funcStemmingGen = str_transform.defineAffixCode; break; @@ -376,11 +376,11 @@ "sLangCode": this.sLangCode, "sLangName": this.sLangName, "sDicName": this.sDicName, "sFileName": "[none]", "sDate": this._getDate(), - "nEntries": this.nEntry, + "nEntry": this.nEntry, "nChar": this.nChar, "nAff": this.nAff, "nTag": this.nTag, "cStemming": this.cStemming, "dChar": helpers.mapToObject(this.dChar), Index: graphspell-js/ibdawg.js ================================================================== --- graphspell-js/ibdawg.js +++ graphspell-js/ibdawg.js @@ -101,11 +101,11 @@ throw Error("# Error. File not found or not loadable.\n" + e.message + "\n"); } /* Properties: sName, nCompressionMethod, sHeader, lArcVal, nArcVal, sByDic, sLang, nChar, nBytesArc, nBytesNodeAddress, - nEntries, nNode, nArc, nAff, cStemming, nTag, dChar, nBytesOffset, + nEntry, nNode, nArc, nAff, cStemming, nTag, dChar, nBytesOffset, */ /* Bug workaround. Mozilla’s JS parser sucks. Can’t read file bigger than 4 Mb! @@ -177,11 +177,11 @@ getInfo () { return ` Language: ${this.sLangName} Lang code: ${this.sLangCode} Dictionary name: ${this.sDicName}\n` + ` Compression method: ${this.nCompressionMethod} Date: ${this.sDate} Stemming: ${this.cStemming}FX\n` + ` Arcs values: ${this.nArcVal} = ${this.nChar} characters, ${this.nAff} affixes, ${this.nTag} tags\n` + - ` Dictionary: ${this.nEntries} entries, ${this.nNode} nodes, ${this.nArc} arcs\n` + + ` Dictionary: ${this.nEntry} entries, ${this.nNode} nodes, ${this.nArc} arcs\n` + ` Address size: ${this.nBytesNodeAddress} bytes, Arc size: ${this.nBytesArc} bytes\n`; } isValidToken (sToken) { // checks if sToken is valid (if there is hyphens in sToken, sToken is split, each part is checked) Index: graphspell/dawg.py ================================================================== --- graphspell/dawg.py +++ graphspell/dawg.py @@ -39,11 +39,11 @@ # We store suffix/affix codes and tags within the graph after the “real” word. # A word is a list of numbers [ c1, c2, c3 . . . cN, iAffix, iTags] # Each arc is an index in self.lArcVal, where are stored characters, suffix/affix codes for stemming and tags. # Important: As usual, the last node (after ‘iTags’) is tagged final, AND the node after ‘cN’ is ALSO tagged final. - def __init__ (self, spfSrc, sLangCode, sLangName, sDicName, cStemming): + def __init__ (self, spfSrc, cStemming, sLangCode, sLangName="", sDicName=""): print("===== Direct Acyclic Word Graph - Minimal Acyclic Finite State Automaton =====") cStemming = cStemming.upper() if cStemming == "A": funcStemmingGen = st.defineAffixCode elif cStemming == "S": @@ -414,11 +414,11 @@ "sLangCode": self.sLangCode, "sLangName": self.sLangName, "sDicName": self.sDicName, "sFileName": self.sFileName, "sDate": str(datetime.datetime.now())[:-7], - "nEntries": self.nEntry, + "nEntry": self.nEntry, "nChar": self.nChar, "nAff": self.nAff, "nTag": self.nTag, "cStemming": self.cStemming, "dChar": self.dChar, @@ -427,15 +427,15 @@ "nArcVal": self.nArcVal, "lArcVal": self.lArcVal, "nCompressionMethod": nCompressionMethod, "nBytesArc": self.nBytesArc, "nBytesNodeAddress": self.nBytesNodeAddress, - "nBytesOffset": self.nBytesOffset + "nBytesOffset": self.nBytesOffset, # JavaScript is a pile of shit, so Mozilla’s JS parser don’t like file bigger than 4 Mb! # So, if necessary, we use an hexadecimal string, that we will convert later in Firefox’s extension. # https://github.com/mozilla/addons-linter/issues/1361 - "sByDic": byDic.hex() if bBinaryDictAsHexString else [ e for e in byDic ], + "sByDic": byDic.hex() if bBinaryDictAsHexString else [ e for e in byDic ] }, ensure_ascii=False)) if bInJSModule: hDst.write(";\n\nexports.dictionary = dictionary;\n") def _writeBinary (self, sPathFile, nCompressionMethod): Index: graphspell/ibdawg.py ================================================================== --- graphspell/ibdawg.py +++ graphspell/ibdawg.py @@ -76,14 +76,14 @@ class IBDAWG: """INDEXABLE BINARY DIRECT ACYCLIC WORD GRAPH""" - def __init__ (self, sDicName): - self.by = pkgutil.get_data(__package__, "_dictionaries/" + sDicName) + def __init__ (self, sfDict): + self.by = pkgutil.get_data(__package__, "_dictionaries/" + sfDict) if not self.by: - raise OSError("# Error. File not found or not loadable: "+sDicName) + raise OSError("# Error. File not found or not loadable: "+sfDict) if self.by[0:7] != b"/pyfsa/": raise TypeError("# Error. Not a pyfsa binary dictionary. Header: {}".format(self.by[0:9])) if not(self.by[7:8] == b"1" or self.by[7:8] == b"2" or self.by[7:8] == b"3"): raise ValueError("# Error. Unknown dictionary version: {}".format(self.by[7:8])) @@ -90,23 +90,25 @@ try: header, info, values, bdic = self.by.split(b"\0\0\0\0", 3) except Exception: raise Exception - self.sName = sDicName + self.sFileName = sfDict self.nCompressionMethod = int(self.by[7:8].decode("utf-8")) self.sHeader = header.decode("utf-8") self.lArcVal = values.decode("utf-8").split("\t") self.nArcVal = len(self.lArcVal) self.byDic = bdic l = info.decode("utf-8").split("/") - self.sLang = l[0] + self.sLangCode = "xx" + self.sLangName = l[0] + self.sDicName = "" self.nChar = int(l[1]) self.nBytesArc = int(l[2]) self.nBytesNodeAddress = int(l[3]) - self.nEntries = int(l[4]) + self.nEntry = int(l[4]) self.nNode = int(l[5]) self.nArc = int(l[6]) self.nAff = int(l[7]) self.cStemming = l[8] if self.cStemming == "S": @@ -156,45 +158,51 @@ def getInfo (self): return " Language: {0.sLangName} Lang code: {0.sLangCode} Dictionary name: {0.sDicName}" \ " Compression method: {0.nCompressionMethod:>2} Date: {0.sDate} Stemming: {0.cStemming}FX\n" \ " Arcs values: {0.nArcVal:>10,} = {0.nChar:>5,} characters, {0.nAff:>6,} affixes, {0.nTag:>6,} tags\n" \ - " Dictionary: {0.nEntries:>12,} entries, {0.nNode:>11,} nodes, {0.nArc:>11,} arcs\n" \ + " Dictionary: {0.nEntry:>12,} entries, {0.nNode:>11,} nodes, {0.nArc:>11,} arcs\n" \ " Address size: {0.nBytesNodeAddress:>1} bytes, Arc size: {0.nBytesArc:>1} bytes\n".format(self) def writeAsJSObject (self, spfDest, bInJSModule=False, bBinaryDictAsHexString=False): "write IBDAWG as a JavaScript object in a JavaScript module" import json with open(spfDest, "w", encoding="utf-8", newline="\n") as hDst: if bInJSModule: hDst.write('// JavaScript\n// Generated data (do not edit)\n\n"use strict";\n\nconst dictionary = ') hDst.write(json.dumps({ - "sName": self.sName, - "nCompressionMethod": self.nCompressionMethod, + "sHeader": "/pyfsa/", + "sLangCode": self.sLangCode, + "sLangName": self.sLangName, + "sDicName": self.sDicName, + "sFileName": self.sFileName, "sDate": str(datetime.datetime.now())[:-7], - "sHeader": self.sHeader, - "lArcVal": self.lArcVal, + "nEntry": self.nEntry, + "nChar": self.nChar, + "nAff": self.nAff, + "nTag": self.nTag, + "cStemming": self.cStemming, + "dChar": self.dChar, + "nNode": self.nNode, + "nArc": self.nArc, "nArcVal": self.nArcVal, + "lArcVal": self.lArcVal, + "nCompressionMethod": self.nCompressionMethod, + "nBytesArc": self.nBytesArc, + "nBytesNodeAddress": self.nBytesNodeAddress, + "nBytesOffset": self.nBytesOffset, # JavaScript is a pile of shit, so Mozilla’s JS parser don’t like file bigger than 4 Mb! # So, if necessary, we use an hexadecimal string, that we will convert later in Firefox’s extension. # https://github.com/mozilla/addons-linter/issues/1361 - "byDic": self.byDic.hex() if bBinaryDictAsHexString else [ e for e in self.byDic ], - "sLang": self.sLang, - "nChar": self.nChar, - "nBytesArc": self.nBytesArc, - "nBytesNodeAddress": self.nBytesNodeAddress, - "nEntries": self.nEntries, - "nNode": self.nNode, - "nArc": self.nArc, - "nAff": self.nAff, - "cStemming": self.cStemming, - "nTag": self.nTag, - "dChar": self.dChar, - "nBytesOffset": self.nBytesOffset + "sByDic": self.byDic.hex() if bBinaryDictAsHexString else [ e for e in self.byDic ] }, ensure_ascii=False)) if bInJSModule: hDst.write(";\n\nexports.dictionary = dictionary;\n") + + + + def isValidToken (self, sToken): "checks if is valid (if there is hyphens in , is split, each part is checked)" if self.isValid(sToken): return True Index: lex_build.py ================================================================== --- lex_build.py +++ lex_build.py @@ -7,31 +7,32 @@ import graphspell.dawg as fsa from graphspell.ibdawg import IBDAWG -def build (spfSrc, sLangName, sDicName, bJSON=False, cStemmingMethod="S", nCompressMethod=1): +def build (spfSrc, sLangCode, sLangName, sfDict, bJSON=False, sDicName="", cStemmingMethod="S", nCompressMethod=1): "transform a text lexicon as a binary indexable dictionary" - oDAWG = fsa.DAWG(spfSrc, sLangName, cStemmingMethod) + oDAWG = fsa.DAWG(spfSrc, cStemmingMethod, sLangCode, sLangName, sDicName) dir_util.mkpath("graphspell/_dictionaries") - oDAWG.writeInfo("graphspell/_dictionaries/" + sDicName + ".info.txt") - oDAWG.createBinary("graphspell/_dictionaries/" + sDicName + ".bdic", int(nCompressMethod)) + oDAWG.writeInfo("graphspell/_dictionaries/" + sfDict + ".info.txt") + oDAWG.createBinary("graphspell/_dictionaries/" + sfDict + ".bdic", int(nCompressMethod)) if bJSON: dir_util.mkpath("graphspell-js/_dictionaries") - oDic = IBDAWG(sDicName + ".bdic") - oDic.writeAsJSObject("graphspell-js/_dictionaries/" + sDicName + ".json", bBinaryDictAsHexString=True) + oDic = IBDAWG(sfDict + ".bdic") + oDic.writeAsJSObject("graphspell-js/_dictionaries/" + sfDict + ".json", bBinaryDictAsHexString=True) def main (): xParser = argparse.ArgumentParser() xParser.add_argument("src_lexicon", type=str, help="path and file name of the source lexicon") + xParser.add_argument("lang_code", type=str, help="language code") xParser.add_argument("lang_name", type=str, help="language name") - xParser.add_argument("dic_name", type=str, help="dictionary file name (without extension)") + xParser.add_argument("dic_filename", type=str, help="dictionary file name (without extension)") xParser.add_argument("-js", "--json", help="Build dictionary in JSON", action="store_true") xParser.add_argument("-s", "--stemming", help="stemming method: S=suffixes, A=affixes, N=no stemming", type=str, choices=["S", "A", "N"], default="S") xParser.add_argument("-c", "--compress", help="compression method: 1, 2 (beta), 3, (beta)", type=int, choices=[1, 2, 3], default=1) xArgs = xParser.parse_args() - build(xArgs.src_lexicon, xArgs.lang_name, xArgs.dic_name, xArgs.json) + build(xArgs.src_lexicon, xArgs.lang_code, xArgs.lang_name, xArgs.dic_filename, xArgs.json) if __name__ == '__main__': main() Index: make.py ================================================================== --- make.py +++ make.py @@ -76,11 +76,11 @@ print("Building extension for Writer") spfZip = "_build/" + dVars['name'] + "-"+ dVars['lang'] +"-v" + dVars['version'] + '.oxt' hZip = zipfile.ZipFile(spfZip, mode='w', compression=zipfile.ZIP_DEFLATED) # Package and parser - copyGrammalectePyPackageInZipFile(hZip, spLangPack, dVars['dic_name']+".bdic", "pythonpath/") + copyGrammalectePyPackageInZipFile(hZip, spLangPack, dVars['dic_filename']+".bdic", "pythonpath/") hZip.write("grammalecte-cli.py", "pythonpath/grammalecte-cli.py") # Extension files hZip.writestr("META-INF/manifest.xml", helpers.fileFile("gc_core/py/oxt/manifest.xml", dVars)) hZip.writestr("description.xml", helpers.fileFile("gc_core/py/oxt/description.xml", dVars)) @@ -154,26 +154,26 @@ def createPackageZip (sLang, dVars, spLangPack): "create server zip" spfZip = "_build/" + dVars['name'] + "-"+ dVars['lang'] +"-v" + dVars['version'] + '.zip' hZip = zipfile.ZipFile(spfZip, mode='w', compression=zipfile.ZIP_DEFLATED) - copyGrammalectePyPackageInZipFile(hZip, spLangPack, dVars['dic_name']+".bdic") + copyGrammalectePyPackageInZipFile(hZip, spLangPack, dVars['dic_filename']+".bdic") for spf in ["grammalecte-cli.py", "grammalecte-server.py", "bottle.py", \ "grammalecte-server-options._global.ini", "grammalecte-server-options."+sLang+".ini", \ "README.txt", "LICENSE.txt", "LICENSE.fr.txt"]: hZip.write(spf) hZip.writestr("setup.py", helpers.fileFile("gc_lang/fr/setup.py", dVars)) -def copyGrammalectePyPackageInZipFile (hZip, spLangPack, sDicName, sAddPath=""): +def copyGrammalectePyPackageInZipFile (hZip, spLangPack, sfDict, sAddPath=""): for sf in os.listdir("grammalecte"): if not os.path.isdir("grammalecte/"+sf): hZip.write("grammalecte/"+sf, sAddPath+"grammalecte/"+sf) for sf in os.listdir("grammalecte/graphspell"): if not os.path.isdir("grammalecte/graphspell/"+sf): hZip.write("grammalecte/graphspell/"+sf, sAddPath+"grammalecte/graphspell/"+sf) - hZip.write("grammalecte/graphspell/_dictionaries/"+sDicName, sAddPath+"grammalecte/graphspell/_dictionaries/"+sDicName) + hZip.write("grammalecte/graphspell/_dictionaries/"+sfDict, sAddPath+"grammalecte/graphspell/_dictionaries/"+sfDict) for sf in os.listdir(spLangPack): if not os.path.isdir(spLangPack+"/"+sf): hZip.write(spLangPack+"/"+sf, sAddPath+spLangPack+"/"+sf) @@ -303,22 +303,23 @@ file_util.copy_file("graphspell-js/"+sf, "grammalecte-js/graphspell") helpers.copyAndFileTemplate("graphspell-js/"+sf, "grammalecte-js/graphspell/"+sf, dVars) def copyGraphspellDictionary (dVars, bJavaScript=False): - spfPyDic = "graphspell/_dictionaries/"+dVars['dic_name']+".bdic" - spfJSDic = "graphspell-js/_dictionaries/"+dVars['dic_name']+".json" + spfPyDic = "graphspell/_dictionaries/"+dVars['dic_filename']+".bdic" + spfJSDic = "graphspell-js/_dictionaries/"+dVars['dic_filename']+".json" if not os.path.isfile(spfPyDic) or (bJavaScript and not os.path.isfile(spfJSDic)): buildDictionary(dVars, bJavaScript) file_util.copy_file(spfPyDic, "grammalecte/graphspell/_dictionaries") file_util.copy_file(spfPyDic[:-5]+".info.txt", "grammalecte/graphspell/_dictionaries") if bJavaScript: file_util.copy_file(spfJSDic, "grammalecte-js/graphspell/_dictionaries") def buildDictionary (dVars, bJavaScript): - lex_build.build(dVars['lexicon_src'], dVars['lang_name'], dVars['dic_name'], bJavaScript, dVars['stemming_method'], int(dVars['fsa_method'])) + lex_build.build(dVars['lexicon_src'], dVars['lang'], dVars['lang_name'], dVars['dic_filename'], \ + bJavaScript, dVars['dic_name'], dVars['stemming_method'], int(dVars['fsa_method'])) def main (): print("Python: " + sys.version) xParser = argparse.ArgumentParser()