Overview
Comment:      [graphspell][py] dawg: API modifications + add function to get dictionary as JSON
Downloads:    Tarball | ZIP archive | SQL archive
Timelines:    family | ancestors | descendants | both | graphspell | multid
Files:        files | file ages | folders
SHA3-256:     8a0391b163615cf781a45347e0b0a156
User & Date:  olr on 2018-02-27 20:50:57
Other Links:  branch diff | manifest | tags
Context
2018-02-28
  06:03  [lo] lexicon editor ui: widen space for labels  (check-in: db28345207, user: olr, tags: lo, multid)

2018-02-27
  20:50  [graphspell][py] dawg: API modifications + add function to get dictionary as JSON  (check-in: 8a0391b163, user: olr, tags: graphspell, multid)
  18:07  [graphspell][py] dawg: ability to build lexicon directly from a list of tuples  (check-in: c65e578338, user: olr, tags: graphspell, multid)
Changes
Modified graphspell/dawg.py from [059d031769] to [f1e3328ca8].
︙
                sEntry = sWord + "\t" + self.funcStemming(sWord, self.lArcVal[nVal])
                for nMorphVal, _ in oNextNode.arcs.items():
                    if not zPattern or zPattern.search(self.lArcVal[nMorphVal]):
                        yield sEntry + "\t" + self.lArcVal[nMorphVal]

    # BINARY CONVERSION
    def _calculateBinary (self, nCompressionMethod):
        print(" > Write DAWG as an indexable binary dictionary [method: %d]" % nCompressionMethod)
        if nCompressionMethod == 1:
            self.nBytesArc = ( (self.nArcVal.bit_length() + 2) // 8 ) + 1   # We add 2 bits. See DawgNode.convToBytes1()
            self.nBytesOffset = 0
            self._calcNumBytesNodeAddress()
            self._calcNodesAddress1()
        elif nCompressionMethod == 2:
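For method 1, the arc width computed above depends only on the number of distinct arc values: the index needs nArcVal.bit_length() bits, plus the 2 flag bits mentioned in the comment. A minimal standalone sketch of that sizing (the function name and sample value are illustrative, not part of dawg.py):

def arc_width_bytes (nArcVal):
    "bytes needed for an arc-value index plus the 2 flag bits (see DawgNode.convToBytes1())"
    return ((nArcVal.bit_length() + 2) // 8) + 1

print(arc_width_bytes(40000))   # 16 index bits + 2 flag bits -> 3 bytes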
︙
            self._calcNodesAddress3()
        else:
            print(" # Error: unknown compression method")
        print("   Arc values (chars, affixes and tags): {}  ->  {} bytes".format( self.nArcVal, len("\t".join(self.lArcVal).encode("utf-8")) ))
        print("   Arc size: {} bytes, Address size: {} bytes   ->   {} * {} = {} bytes".format( self.nBytesArc, self.nBytesNodeAddress, \
                                                                                                self.nBytesArc+self.nBytesNodeAddress, self.nArc, \
                                                                                                (self.nBytesArc+self.nBytesNodeAddress)*self.nArc ))

    def _calcNumBytesNodeAddress (self):
        "how many bytes needed to store all nodes/arcs in the binary dictionary"
        self.nBytesNodeAddress = 1
        while ((self.nBytesArc + self.nBytesNodeAddress) * self.nArc) > (2 ** (self.nBytesNodeAddress * 8)):
            self.nBytesNodeAddress += 1
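The loop in _calcNumBytesNodeAddress searches for the smallest address width such that the whole arc table, (nBytesArc + nBytesNodeAddress) * nArc bytes, remains addressable with that many bytes. A standalone sketch with made-up numbers:

def num_bytes_node_address (nBytesArc, nArc):
    "smallest address width (in bytes) that can address the whole arc table"
    nBytesNodeAddress = 1
    while ((nBytesArc + nBytesNodeAddress) * nArc) > (2 ** (nBytesNodeAddress * 8)):
        nBytesNodeAddress += 1
    return nBytesNodeAddress

# 3-byte arcs, 200,000 arcs: (3+3) * 200000 = 1,200,000 <= 2**24, so 3 bytes suffice
print(num_bytes_node_address(3, 200000))   # 3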
︙
            for oNextNode in self.lSortedNodes[i].arcs.values():
                if 1 < (oNextNode.addr - self.lSortedNodes[i].addr) < self.nMaxOffset:
                    nSize -= nDiff
            if self.lSortedNodes[i].size != nSize:
                self.lSortedNodes[i].size = nSize
                bEnd = False

    def getBinaryAsJSON (self, nCompressionMethod=1, bBinaryDictAsHexString=True):
        self._calculateBinary(nCompressionMethod)
        byDic = b""
        if nCompressionMethod == 1:
            byDic = self.oRoot.convToBytes1(self.nBytesArc, self.nBytesNodeAddress)
            for oNode in self.lMinimizedNodes:
                byDic += oNode.convToBytes1(self.nBytesArc, self.nBytesNodeAddress)
        elif nCompressionMethod == 2:
            byDic = self.oRoot.convToBytes2(self.nBytesArc, self.nBytesNodeAddress)
            for oNode in self.lSortedNodes:
                byDic += oNode.convToBytes2(self.nBytesArc, self.nBytesNodeAddress)
        elif nCompressionMethod == 3:
            byDic = self.oRoot.convToBytes3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset)
            for oNode in self.lSortedNodes:
                byDic += oNode.convToBytes3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset)
        return json.dumps({
            "sHeader": "/pyfsa/",
            "sLangCode": self.sLangCode,
            "sLangName": self.sLangName,
            "sDicName": self.sDicName,
            "sFileName": self.sFileName,
            "sDate": self._getDate(),
            "nEntry": self.nEntry,
            "nChar": self.nChar,
            "nAff": self.nAff,
            "nTag": self.nTag,
            "cStemming": self.cStemming,
            "dChar": self.dChar,
            "nNode": self.nNode,
            "nArc": self.nArc,
            "nArcVal": self.nArcVal,
            "lArcVal": self.lArcVal,
            "nCompressionMethod": nCompressionMethod,
            "nBytesArc": self.nBytesArc,
            "nBytesNodeAddress": self.nBytesNodeAddress,
            "nBytesOffset": self.nBytesOffset,
            # Mozilla’s JS parser doesn’t like files bigger than 4 MB,
            # so, if necessary, we use a hexadecimal string that the Firefox extension converts back later.
            # https://github.com/mozilla/addons-linter/issues/1361
            "sByDic": byDic.hex() if bBinaryDictAsHexString else [ e for e in byDic ]
        }, ensure_ascii=False)

    def writeAsJSObject (self, spfDst, nCompressionMethod, bInJSModule=False, bBinaryDictAsHexString=True):
        if not spfDst.endswith(".json"):
            spfDst += "."+str(nCompressionMethod)+".json"
        with open(spfDst, "w", encoding="utf-8", newline="\n") as hDst:
            if bInJSModule:
                hDst.write('// JavaScript\n// Generated data (do not edit)\n\n"use strict";\n\nconst dictionary = ')
            hDst.write( self.getBinaryAsJSON(nCompressionMethod, bBinaryDictAsHexString) )
            if bInJSModule:
                hDst.write(";\n\nexports.dictionary = dictionary;\n")

    def writeBinary (self, sPathFile, nCompressionMethod, bDebug=False):
        """
        Format of the binary indexable dictionary:
        Each section is separated with 4 bytes of \0

        - Section Header:
            /pyfsa/[compression method]
            * compression method is an ASCII string
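A consumer of getBinaryAsJSON() output has to undo the sByDic encoding: by default it is a hex string (to stay under the 4 MB limit mentioned in the comment), otherwise a plain list of byte values. A minimal sketch, assuming the JSON was saved to a made-up file name:

import json

with open("fr.1.json", "r", encoding="utf-8") as hSrc:   # illustrative path
    dDict = json.load(hSrc)

if isinstance(dDict["sByDic"], str):          # bBinaryDictAsHexString=True
    byDic = bytes.fromhex(dDict["sByDic"])
else:                                         # a list of integers 0..255
    byDic = bytes(dDict["sByDic"])

print(dDict["nArc"], "arcs,", len(byDic), "bytes of packed nodes")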
︙
        - Section Values:
                * a list of strings encoded in binary from utf-8, each value separated with a tabulation

        - Section Word Graph (nodes / arcs)
                * A list of nodes which are a list of arcs with an address of the next node.
                  See DawgNode.convToBytes() for details.
        """
        self._calculateBinary(nCompressionMethod)
        if not sPathFile.endswith(".bdic"):
            sPathFile += "."+str(nCompressionMethod)+".bdic"
        with open(sPathFile, 'wb') as hDst:
            # header
            hDst.write("/pyfsa/{}/".format(nCompressionMethod).encode("utf-8"))
            hDst.write(b"\0\0\0\0")
            # infos
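Following the layout documented in the docstring, a .bdic file can be split back into its sections. A naive sketch (a hypothetical helper, not part of graphspell; it assumes the four-NUL separator never occurs inside a section, which arbitrary binary data does not guarantee):

def read_bdic_sections (sPathFile):
    "split a .bdic file on the 4-byte NUL separator (illustration only)"
    with open(sPathFile, "rb") as hSrc:
        byData = hSrc.read()
    lSections = byData.split(b"\0\0\0\0")
    sHeader = lSections[0].decode("utf-8")    # e.g. "/pyfsa/1/"
    return sHeader, lSections[1:]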
︙
                hDst.write(self.oRoot.convToBytes2(self.nBytesArc, self.nBytesNodeAddress))
                for oNode in self.lSortedNodes:
                    hDst.write(oNode.convToBytes2(self.nBytesArc, self.nBytesNodeAddress))
            elif nCompressionMethod == 3:
                hDst.write(self.oRoot.convToBytes3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset))
                for oNode in self.lSortedNodes:
                    hDst.write(oNode.convToBytes3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset))
        if bDebug:
            self._writeNodes(sPathFile, nCompressionMethod)

    def _getDate (self):
        return time.strftime("%Y.%m.%d, %H:%M")

    def _writeNodes (self, sPathFile, nCompressionMethod):
        "for debugging only"
        print(" > Write nodes")
︙
                for oNode in self.lSortedNodes:
                    hDst.write(oNode.getTxtRepr2(self.nBytesArc, self.nBytesNodeAddress, self.lArcVal)+"\n")
            if nCompressionMethod == 3:
                hDst.write(self.oRoot.getTxtRepr3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset, self.lArcVal)+"\n")
                #hDst.write( ''.join( [ "%02X " % z for z in self.oRoot.convToBytes3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset) ] ).strip() )
                for oNode in self.lSortedNodes:
                    hDst.write(oNode.getTxtRepr3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset, self.lArcVal)+"\n")


class DawgNode:
    NextId = 0
    NextPos = 1 # (version 2)
︙
Modified lex_build.py from [41cc230f34] to [2d1c4b9aa4].
︙
def build (spfSrc, sLangCode, sLangName, sfDict, bJSON=False, sDicName="", cStemmingMethod="S", nCompressMethod=1):
    "transform a text lexicon as a binary indexable dictionary"
    oDAWG = fsa.DAWG(spfSrc, cStemmingMethod, sLangCode, sLangName, sDicName)
    dir_util.mkpath("graphspell/_dictionaries")
    oDAWG.writeInfo("graphspell/_dictionaries/" + sfDict + ".info.txt")
    oDAWG.writeBinary("graphspell/_dictionaries/" + sfDict + ".bdic", int(nCompressMethod))
    if bJSON:
        dir_util.mkpath("graphspell-js/_dictionaries")
        oDic = IBDAWG(sfDict + ".bdic")
        oDic.writeAsJSObject("graphspell-js/_dictionaries/" + sfDict + ".json", bBinaryDictAsHexString=True)


def main ():
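An illustrative invocation of the build() function shown above (all argument values are made up; only the signature comes from the diff):

build("lexicons/French.lex", "fr", "French", "fr",
      bJSON=True, sDicName="classic", cStemmingMethod="S", nCompressMethod=1)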
︙