Overview
Comment: | [graphspell][fx] dawg: remove useless parameters |
---|---|
Downloads: | Tarball | ZIP archive | SQL archive |
Timelines: | family | ancestors | descendants | both | fx | graphspell | dict2 |
Files: | files | file ages | folders |
SHA3-256: | 5538934848f82156ff524ccf9654299b |
User & Date: | olr on 2020-11-05 16:25:58 |
Other Links: | branch diff | manifest | tags |
Context
2020-11-07
| ||
11:40 | [graphspell][build][lo][fx] merge dict2: use binary list instead of binary string, drop support for binary file -> use JSON, code cleaning check-in: 40ebc5eada user: olr tags: trunk, build, major_change, fx, lo, graphspell | |
2020-11-05
| ||
16:25 | [graphspell][fx] dawg: remove useless parameters Closed-Leaf check-in: 5538934848 user: olr tags: fx, graphspell, dict2 | |
13:27 | [graphspell][js] fix syntax error check-in: 6d54aadbb1 user: olr tags: graphspell, dict2 | |
Changes
Modified gc_lang/fr/webext/panel/lex_editor.js from [a95656b530] to [ffa654539b].
︙ | ︙ | |||
757 758 759 760 761 762 763 | }, build: function () { let xProgressNode = document.getElementById("wait_progress"); let lEntry = oLexiconTable.getEntries(); if (lEntry.length > 0) { let oDAWG = new DAWG(lEntry, "S", "fr", "Français", this.sName, this.sDescription, xProgressNode); | | | 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 | }, build: function () { let xProgressNode = document.getElementById("wait_progress"); let lEntry = oLexiconTable.getEntries(); if (lEntry.length > 0) { let oDAWG = new DAWG(lEntry, "S", "fr", "Français", this.sName, this.sDescription, xProgressNode); let oJSON = oDAWG.createBinaryJSON(); oDictHandler.saveDictionary(this.sName, oJSON); this.oIBDAWG = new IBDAWG(oJSON); this.setDictData(this.oIBDAWG.nEntry, this.oIBDAWG.sDate); } else { oDictHandler.saveDictionary(this.sName, null); this.setDictData(0, "[néant]"); } |
︙ | ︙ |
Modified graphspell-js/dawg.js from [4cbdb7b217] to [82fa7fdc68].
︙ | ︙ | |||
340 341 342 343 344 345 346 | } } } } } // BINARY CONVERSION | | | 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 | } } } } } // BINARY CONVERSION _calculateBinary () { console.log("Write DAWG as an indexable binary dictionary"); this.nBytesArc = Math.floor( (this.nArcVal.toString(2).length + 2) / 8 ) + 1; // We add 2 bits. See DawgNode.convToBytes() this.nBytesOffset = 0; this._calcNumBytesNodeAddress(); this._calcNodesAddress(); this.sByDic = this.oRoot.convToBytes(this.nBytesArc, this.nBytesNodeAddress); for (let oNode of this.dMinimizedNodes.values()) { |
︙ | ︙ | |||
403 404 405 406 407 408 409 | for (let n of aBytes) { nVal += n << nWeight; nWeight = nWeight - 8; } return nVal; } | | | < | 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 | for (let n of aBytes) { nVal += n << nWeight; nWeight = nWeight - 8; } return nVal; } createBinaryJSON () { this._calculateBinary(); this._binaryToList(); let oJSON = { "sHeader": "/grammalecte-fsa/", "sLangCode": this.sLangCode, "sLangName": this.sLangName, "sDicName": this.sDicName, "sDescription": this.sDescription, "sFileName": "[none]", "sDate": this._getDate(), "nEntry": this.nEntry, "nChar": this.nChar, "nAff": this.nAff, "nTag": this.nTag, "cStemming": this.cStemming, "dChar": helpers.mapToObject(this.dChar), "nNode": this.nNode, "nArc": this.nArc, "lArcVal": this.lArcVal, "nArcVal": this.nArcVal, "nBytesArc": this.nBytesArc, "nBytesNodeAddress": this.nBytesNodeAddress, "nBytesOffset": this.nBytesOffset, //"sByDic": this.sByDic, // binary word graph "lByDic": this.lByDic, "l2grams": Array.from(this.a2grams) }; |
︙ | ︙ |
Modified graphspell-js/dic_merger.js from [dea1fd0b02] to [10f02569ea].
︙ | ︙ | |||
34 35 36 37 38 39 40 | } } if (xProgressBar) { xProgressBar.value = xProgressBar.max; } try { let oDAWG = new DAWG(lEntry, cStemming, sLangCode, sLangName, sDicName, sDescription, xProgressBar); | | | 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 | } } if (xProgressBar) { xProgressBar.value = xProgressBar.max; } try { let oDAWG = new DAWG(lEntry, cStemming, sLangCode, sLangName, sDicName, sDescription, xProgressBar); let oDict = oDAWG.createBinaryJSON(); return oDict; } catch (e) { console.log("Dictionaries merger: unable to generate merged dictionary"); console.error(e); return null; } |
︙ | ︙ |
Modified graphspell-js/ibdawg.js from [8c83fe3dde] to [73dd04c644].
︙ | ︙ | |||
143 144 145 146 147 148 149 | console.error(e); console.log("path: " + sPath); console.log("dic:" + source.slice(0, 1000)); throw Error("# Error. File not found or not loadable.\n" + e.message + "\n"); } /* Properties: | | | 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 | console.error(e); console.log("path: " + sPath); console.log("dic:" + source.slice(0, 1000)); throw Error("# Error. File not found or not loadable.\n" + e.message + "\n"); } /* Properties: sName, sHeader, lArcVal, nArcVal, sByDic, sLang, nChar, nBytesArc, nBytesNodeAddress, nEntry, nNode, nArc, nAff, cStemming, nTag, dChar, nBytesOffset, */ if (!(this.sHeader.startsWith("/grammalecte-fsa/") || this.sHeader.startsWith("/pyfsa/"))) { throw TypeError("# Error. Not a grammalecte-fsa binary dictionary. Header: " + this.sHeader); } // <dChar> to get the value of an arc, <dCharVal> to get the char of an arc with its value |
︙ | ︙ | |||
207 208 209 210 211 212 213 | if (self && self.hasOwnProperty("lexgraph_"+this.sLangCode)) { // self is the Worker this.lexicographer = self["lexgraph_"+this.sLangCode]; } } getInfo () { return ` Language: ${this.sLangName} Lang code: ${this.sLangCode} Dictionary name: ${this.sDicName}\n` + | | | 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 | if (self && self.hasOwnProperty("lexgraph_"+this.sLangCode)) { // self is the Worker this.lexicographer = self["lexgraph_"+this.sLangCode]; } } getInfo () { return ` Language: ${this.sLangName} Lang code: ${this.sLangCode} Dictionary name: ${this.sDicName}\n` + ` Date: ${this.sDate} Stemming: ${this.cStemming}FX\n` + ` Arcs values: ${this.nArcVal} = ${this.nChar} characters, ${this.nAff} affixes, ${this.nTag} tags\n` + ` Dictionary: ${this.nEntry} entries, ${this.nNode} nodes, ${this.nArc} arcs\n` + ` Address size: ${this.nBytesNodeAddress} bytes, Arc size: ${this.nBytesArc} bytes\n`; } getJSON () { let oJSON = { |
︙ | ︙ | |||
232 233 234 235 236 237 238 | "nTag": this.nTag, "cStemming": this.cStemming, "dChar": helpers.mapToObject(this.dChar), "nNode": this.nNode, "nArc": this.nArc, "lArcVal": this.lArcVal, "nArcVal": this.nArcVal, | < | 232 233 234 235 236 237 238 239 240 241 242 243 244 245 | "nTag": this.nTag, "cStemming": this.cStemming, "dChar": helpers.mapToObject(this.dChar), "nNode": this.nNode, "nArc": this.nArc, "lArcVal": this.lArcVal, "nArcVal": this.nArcVal, "nBytesArc": this.nBytesArc, "nBytesNodeAddress": this.nBytesNodeAddress, "nBytesOffset": this.nBytesOffset, "sByDic": this.sByDic, // binary word graph "l2grams": this.l2grams }; return oJSON; |
︙ | ︙ |
Modified graphspell/dawg.py from [781b24100a] to [729715ac89].
︙ | ︙ | |||
354 355 356 357 358 359 360 | sEntry = sWord + "\t" + self.funcStemming(sWord, self.lArcVal[nVal]) for nMorphVal, _ in oNextNode.arcs.items(): if not zPattern or zPattern.search(self.lArcVal[nMorphVal]): yield sEntry + "\t" + self.lArcVal[nMorphVal] # BINARY CONVERSION | | | 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 | sEntry = sWord + "\t" + self.funcStemming(sWord, self.lArcVal[nVal]) for nMorphVal, _ in oNextNode.arcs.items(): if not zPattern or zPattern.search(self.lArcVal[nMorphVal]): yield sEntry + "\t" + self.lArcVal[nMorphVal] # BINARY CONVERSION def _calculateBinary (self): print(" > Write DAWG as an indexable binary dictionary") self.nBytesArc = ( (self.nArcVal.bit_length() + 2) // 8 ) + 1 # We add 2 bits. See DawgNode.convToBytes() self.nBytesOffset = 0 self._calcNumBytesNodeAddress() self._calcNodesAddress() self.byDic = b"" self.byDic = self.oRoot.convToBytes(self.nBytesArc, self.nBytesNodeAddress) |
︙ | ︙ | |||
383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 | nBytesNode = self.nBytesArc + self.nBytesNodeAddress iAddr = len(self.oRoot.arcs) * nBytesNode for oNode in self.lMinimizedNodes: oNode.addr = iAddr iAddr += max(len(oNode.arcs), 1) * nBytesNode def _binaryToList (self): self.lByDic = [] nAcc = 0 byBuffer = b"" nDivisor = (self.nBytesArc + self.nBytesNodeAddress) / 2 for i in range(0, len(self.byDic)): byBuffer += self.byDic[i:i+1] if nAcc == (self.nBytesArc - 1): self.lByDic.append(int.from_bytes(byBuffer, byteorder="big")) byBuffer = b"" elif nAcc == (self.nBytesArc + self.nBytesNodeAddress - 1): self.lByDic.append(round(int.from_bytes(byBuffer, byteorder="big") / nDivisor)) byBuffer = b"" nAcc = -1 nAcc = nAcc + 1 | > > > > > > > | | < | | < < | < < | | | 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 | nBytesNode = self.nBytesArc + self.nBytesNodeAddress iAddr = len(self.oRoot.arcs) * nBytesNode for oNode in self.lMinimizedNodes: oNode.addr = iAddr iAddr += max(len(oNode.arcs), 1) * nBytesNode def _binaryToList (self): """ Convert binary string to binary list BEFORE: Arc Address Arc Address ||||||||| ||||||||| ||||||||| ||||||||| ||||||||| ||||||||| ||||||||| ||||||||| ||||||||| ||||||||| ||||||||| ||||||||| ... AFTER: list of integers: [ arc, address, arc, address, arc, address, ... 
arc, address ] """ self.lByDic = [] nAcc = 0 byBuffer = b"" nDivisor = (self.nBytesArc + self.nBytesNodeAddress) / 2 for i in range(0, len(self.byDic)): byBuffer += self.byDic[i:i+1] if nAcc == (self.nBytesArc - 1): self.lByDic.append(int.from_bytes(byBuffer, byteorder="big")) byBuffer = b"" elif nAcc == (self.nBytesArc + self.nBytesNodeAddress - 1): self.lByDic.append(round(int.from_bytes(byBuffer, byteorder="big") / nDivisor)) byBuffer = b"" nAcc = -1 nAcc = nAcc + 1 def getBinaryAsJSON (self): "return a JSON string containing all necessary data of the dictionary (compressed as a binary string)" self._calculateBinary() self._binaryToList() return { "sHeader": "/grammalecte-fsa/", "sLangCode": self.sLangCode, "sLangName": self.sLangName, "sDicName": self.sDicName, "sDescription": self.sDescription, "sFileName": self.sFileName, "sDate": self._getDate(), "nEntry": self.nEntry, "nChar": self.nChar, "nAff": self.nAff, "nTag": self.nTag, "cStemming": self.cStemming, "dChar": self.dChar, "nNode": self.nNode, "nArc": self.nArc, "nArcVal": self.nArcVal, "lArcVal": self.lArcVal, "nBytesArc": self.nBytesArc, "nBytesNodeAddress": self.nBytesNodeAddress, "nBytesOffset": self.nBytesOffset, # Mozilla’s JS parser don’t like file bigger than 4 Mb! # So, if necessary, we use an hexadecimal string, that we will convert later in Firefox’s extension. 
# https://github.com/mozilla/addons-linter/issues/1361 #"sByDic": self.byDic.hex(), "lByDic": self.lByDic, "l2grams": list(self.a2grams) } def writeAsJSObject (self, spfDst): "write a file (JSON or JS module) with all the necessary data" if not spfDst.endswith(".json"): spfDst += ".json" with open(spfDst, "w", encoding="utf-8", newline="\n") as hDst: hDst.write( json.dumps(self.getBinaryAsJSON(), ensure_ascii=False) ) def _getDate (self): return time.strftime("%Y-%m-%d %H:%M:%S") def _writeNodes (self, sPathFile): "for debugging only" print(" > Write nodes") with open(sPathFile+".nodes.txt", 'w', encoding='utf-8', newline="\n") as hDst: hDst.write(self.oRoot.getTxtRepr(self.nBytesArc, self.lArcVal)+"\n") #hDst.write( ''.join( [ "%02X " % z for z in self.oRoot.convToBytes(self.nBytesArc, self.nBytesNodeAddress) ] ).strip() ) for oNode in self.lMinimizedNodes: hDst.write(oNode.getTxtRepr(self.nBytesArc, self.lArcVal)+"\n") |
︙ | ︙ |
Modified graphspell/ibdawg.py from [953707753a] to [15700e29f3].
︙ | ︙ | |||
170 171 172 173 174 175 176 | self.lexicographer = importlib.import_module(".lexgraph_"+self.sLangCode, "grammalecte.graphspell") except ImportError: print("# No module <graphspell.lexgraph_"+self.sLangCode+".py>") def getInfo (self): "return string about the IBDAWG" return " Language: {0.sLangName} Lang code: {0.sLangCode} Dictionary name: {0.sDicName}" \ | | | 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 | self.lexicographer = importlib.import_module(".lexgraph_"+self.sLangCode, "grammalecte.graphspell") except ImportError: print("# No module <graphspell.lexgraph_"+self.sLangCode+".py>") def getInfo (self): "return string about the IBDAWG" return " Language: {0.sLangName} Lang code: {0.sLangCode} Dictionary name: {0.sDicName}" \ " Date: {0.sDate} Stemming: {0.cStemming}FX\n" \ " Arcs values: {0.nArcVal:>10,} = {0.nChar:>5,} characters, {0.nAff:>6,} affixes, {0.nTag:>6,} tags\n" \ " Dictionary: {0.nEntry:>12,} entries, {0.nNode:>11,} nodes, {0.nArc:>11,} arcs\n" \ " Address size: {0.nBytesNodeAddress:>1} bytes, Arc size: {0.nBytesArc:>1} bytes\n".format(self) def isValidToken (self, sToken): "checks if <sToken> is valid (if there is hyphens in <sToken>, <sToken> is split, each part is checked)" sToken = st.spellingNormalization(sToken) |
︙ | ︙ |