Overview
Comment: | [graphspell][py] dawg: API modifications + add function to get dictionary as JSON |
---|---|
Downloads: | Tarball | ZIP archive | SQL archive |
Timelines: | family | ancestors | descendants | both | graphspell | multid |
Files: | files | file ages | folders |
SHA3-256: | 8a0391b163615cf781a45347e0b0a156 |
User & Date: | olr on 2018-02-27 20:50:57 |
Other Links: | branch diff | manifest | tags |
Context
2018-02-28
06:03 | [lo] lexicon editor ui: widen space for labels (check-in: db28345207, user: olr, tags: lo, multid) |
2018-02-27
20:50 | [graphspell][py] dawg: API modifications + add function to get dictionary as JSON (check-in: 8a0391b163, user: olr, tags: graphspell, multid) |
18:07 | [graphspell][py] dawg: ability to build lexicon directly from a list of tuples (check-in: c65e578338, user: olr, tags: graphspell, multid) |
Changes
Modified graphspell/dawg.py from [059d031769] to [f1e3328ca8].
︙ | |||
305 306 307 308 309 310 311 | 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 | - + | sEntry = sWord + "\t" + self.funcStemming(sWord, self.lArcVal[nVal]) for nMorphVal, _ in oNextNode.arcs.items(): if not zPattern or zPattern.search(self.lArcVal[nMorphVal]): yield sEntry + "\t" + self.lArcVal[nMorphVal] # BINARY CONVERSION |
︙ | |||
329 330 331 332 333 334 335 | 329 330 331 332 333 334 335 336 337 338 339 340 341 342 | - - - - | self._calcNodesAddress3() else: print(" # Error: unknown compression method") print(" Arc values (chars, affixes and tags): {} -> {} bytes".format( self.nArcVal, len("\t".join(self.lArcVal).encode("utf-8")) )) print(" Arc size: {} bytes, Address size: {} bytes -> {} * {} = {} bytes".format( self.nBytesArc, self.nBytesNodeAddress, \ self.nBytesArc+self.nBytesNodeAddress, self.nArc, \ (self.nBytesArc+self.nBytesNodeAddress)*self.nArc )) |
︙ | |||
384 385 386 387 388 389 390 | 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 | - + - - + - - - - - - + - + - + + + + + + + + + + - + | for oNextNode in self.lSortedNodes[i].arcs.values(): if 1 < (oNextNode.addr - self.lSortedNodes[i].addr) < self.nMaxOffset: nSize -= nDiff if self.lSortedNodes[i].size != nSize: self.lSortedNodes[i].size = nSize bEnd = False |
︙ | |||
468 469 470 471 472 473 474 475 476 477 478 479 480 481 | 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 | + | - Section Values: * a list of strings encoded in binary from utf-8, each value separated with a tabulation - Section Word Graph (nodes / arcs) * A list of nodes which are a list of arcs with an address of the next node. See DawgNode.convToBytes() for details. """ self._calculateBinary(nCompressionMethod) if not sPathFile.endswith(".bdic"): sPathFile += "."+str(nCompressionMethod)+".bdic" with open(sPathFile, 'wb') as hDst: # header hDst.write("/pyfsa/{}/".format(nCompressionMethod).encode("utf-8")) hDst.write(b"\0\0\0\0") # infos |
︙ | |||
496 497 498 499 500 501 502 | 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 | - + + | hDst.write(self.oRoot.convToBytes2(self.nBytesArc, self.nBytesNodeAddress)) for oNode in self.lSortedNodes: hDst.write(oNode.convToBytes2(self.nBytesArc, self.nBytesNodeAddress)) elif nCompressionMethod == 3: hDst.write(self.oRoot.convToBytes3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset)) for oNode in self.lSortedNodes: hDst.write(oNode.convToBytes3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset)) |
︙ | |||
519 520 521 522 523 524 525 | 520 521 522 523 524 525 526 527 528 529 530 531 532 533 | - - - - - - - - - - - - - | for oNode in self.lSortedNodes: hDst.write(oNode.getTxtRepr2(self.nBytesArc, self.nBytesNodeAddress, self.lArcVal)+"\n") if nCompressionMethod == 3: hDst.write(self.oRoot.getTxtRepr3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset, self.lArcVal)+"\n") #hDst.write( ''.join( [ "%02X " % z for z in self.oRoot.convToBytes3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset) ] ).strip() ) for oNode in self.lSortedNodes: hDst.write(oNode.getTxtRepr3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset, self.lArcVal)+"\n") |
︙ |
Modified lex_build.py from [41cc230f34] to [2d1c4b9aa4].
︙ | |||
10 11 12 13 14 15 16 | 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 | - + | def build (spfSrc, sLangCode, sLangName, sfDict, bJSON=False, sDicName="", cStemmingMethod="S", nCompressMethod=1): "transform a text lexicon as a binary indexable dictionary" oDAWG = fsa.DAWG(spfSrc, cStemmingMethod, sLangCode, sLangName, sDicName) dir_util.mkpath("graphspell/_dictionaries") oDAWG.writeInfo("graphspell/_dictionaries/" + sfDict + ".info.txt") |
︙ |