Overview
| Comment: | [graphspell] ibdawg: code cleaning, remove old code, useless compression versions |
|---|---|
| Downloads: | Tarball | ZIP archive | SQL archive |
| Timelines: | family | ancestors | descendants | both | graphspell | dict2 |
| Files: | files | file ages | folders |
| SHA3-256: | 86250e8e6cc98d64ac880a7d26c35ce7 |
| User & Date: | olr on 2020-11-04 12:02:12 |
| Other Links: | branch diff | manifest | tags |
Context
|
2020-11-04
| ||
| 12:21 | [graphspell][py] ibdawg: remove binary dict support check-in: 866ec22f7d user: olr tags: graphspell, dict2 | |
| 12:02 | [graphspell] ibdawg: code cleaning, remove old code, useless compression versions check-in: 86250e8e6c user: olr tags: graphspell, dict2 | |
| 11:37 | [build][graphspell][lo] dictionary: drop support for binary file -> use JSON check-in: 05fb167483 user: olr tags: build, lo, graphspell, dict2 | |
Changes
Modified graphspell-js/dawg.js from [525275df92] to [bb21108d9e].
| ︙ | ︙ | |||
340 341 342 343 344 345 346 |
}
}
}
}
}
// BINARY CONVERSION
| | | < | | | | < < < | | < < | | | < | 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 |
}
}
}
}
}
// BINARY CONVERSION
createBinaryJSON (nCompressionMethod=1) {
console.log("Write DAWG as an indexable binary dictionary");
this.nBytesArc = Math.floor( (this.nArcVal.toString(2).length + 2) / 8 ) + 1; // We add 2 bits. See DawgNode.convToBytes()
this.nBytesOffset = 0;
this._calcNumBytesNodeAddress();
this._calcNodesAddress();
console.log("Arc values (chars, affixes and tags): " + this.nArcVal);
console.log("Arc size: "+this.nBytesArc+" bytes, Address size: "+this.nBytesNodeAddress+" bytes");
console.log("-> " + this.nBytesArc+this.nBytesNodeAddress + " * " + this.nArc + " = " + (this.nBytesArc+this.nBytesNodeAddress)*this.nArc + " bytes");
return this._createJSON(nCompressionMethod);
}
_calcNumBytesNodeAddress () {
// how many bytes needed to store all nodes/arcs in the binary dictionary
this.nBytesNodeAddress = 1;
while (((this.nBytesArc + this.nBytesNodeAddress) * this.nArc) > (2 ** (this.nBytesNodeAddress * 8))) {
this.nBytesNodeAddress += 1;
}
}
_calcNodesAddress () {
let nBytesNode = this.nBytesArc + this.nBytesNodeAddress;
let iAddr = this.oRoot.arcs.size * nBytesNode;
for (let oNode of this.dMinimizedNodes.values()) {
oNode.addr = iAddr;
iAddr += Math.max(oNode.arcs.size, 1) * nBytesNode;
}
}
_createJSON (nCompressionMethod=1) {
let sByDic = this.oRoot.convToBytes(this.nBytesArc, this.nBytesNodeAddress);
for (let oNode of this.dMinimizedNodes.values()) {
sByDic += oNode.convToBytes(this.nBytesArc, this.nBytesNodeAddress);
}
let oJSON = {
"sHeader": "/grammalecte-fsa/",
"sLangCode": this.sLangCode,
"sLangName": this.sLangName,
"sDicName": this.sDicName,
"sDescription": this.sDescription,
|
| ︙ | ︙ | |||
492 493 494 495 496 497 498 |
for (let oNode of this.arcs.values()) {
oNode.display(nTab+1, lArcVal, bRecur);
}
}
}
// VERSION 1 =====================================================================================================
| | | 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 |
for (let oNode of this.arcs.values()) {
oNode.display(nTab+1, lArcVal, bRecur);
}
}
}
// VERSION 1 =====================================================================================================
convToBytes (nBytesArc, nBytesNodeAddress) {
/*
Node scheme:
- Arc length is defined by nBytesArc
- Address length is defined by nBytesNodeAddress
| Arc | Address of next node |
| | |
|
| ︙ | ︙ |
Modified graphspell/dawg.py from [c083d6a347] to [8c6420c0dc].
| ︙ | ︙ | |||
354 355 356 357 358 359 360 |
sEntry = sWord + "\t" + self.funcStemming(sWord, self.lArcVal[nVal])
for nMorphVal, _ in oNextNode.arcs.items():
if not zPattern or zPattern.search(self.lArcVal[nMorphVal]):
yield sEntry + "\t" + self.lArcVal[nMorphVal]
# BINARY CONVERSION
| | | < | | | | < < < < < < < < < < < < < | < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < | | | < < < < < < < < | 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 |
sEntry = sWord + "\t" + self.funcStemming(sWord, self.lArcVal[nVal])
for nMorphVal, _ in oNextNode.arcs.items():
if not zPattern or zPattern.search(self.lArcVal[nMorphVal]):
yield sEntry + "\t" + self.lArcVal[nMorphVal]
# BINARY CONVERSION
def _calculateBinary (self, nCompressionMethod=1):
    """
    Compute the sizes needed to write the DAWG as a binary dictionary.

    Sets self.nBytesArc and self.nBytesOffset, then calls
    _calcNumBytesNodeAddress() and _calcNodesAddress(), and prints a summary
    of the resulting sizes. <nCompressionMethod> is accepted but not used here.
    """
    print(" > Write DAWG as an indexable binary dictionary")
    # We add 2 bits. See DawgNode.convToBytes()
    nBitsArc = self.nArcVal.bit_length() + 2
    self.nBytesArc = (nBitsArc // 8) + 1
    self.nBytesOffset = 0
    self._calcNumBytesNodeAddress()
    self._calcNodesAddress()
    # size of all arc values once joined with tabs and encoded as UTF-8
    nBytesArcValues = len("\t".join(self.lArcVal).encode("utf-8"))
    print(" Arc values (chars, affixes and tags): {} -> {} bytes".format(self.nArcVal, nBytesArcValues))
    nBytesPerArc = self.nBytesArc + self.nBytesNodeAddress
    print(" Arc size: {} bytes, Address size: {} bytes -> {} * {} = {} bytes".format(
        self.nBytesArc, self.nBytesNodeAddress, nBytesPerArc, self.nArc, nBytesPerArc * self.nArc
    ))
def _calcNumBytesNodeAddress (self):
    """
    Find how many bytes are needed to address all nodes/arcs in the binary dictionary.

    The result is the smallest width (at least 1) for which
    (nBytesArc + width) * nArc does not exceed 256 ** width;
    it is stored in self.nBytesNodeAddress.
    """
    nBytesAddress = 1
    # 256 ** n == 2 ** (n * 8): number of values an n-byte address can hold
    while (self.nBytesArc + nBytesAddress) * self.nArc > 256 ** nBytesAddress:
        nBytesAddress += 1
    self.nBytesNodeAddress = nBytesAddress
def _calcNodesAddress (self):
    """
    Assign a byte address (oNode.addr) to every node of self.lMinimizedNodes.

    One arc takes nBytesArc + nBytesNodeAddress bytes. The arcs of the root
    come first, so the first minimized node starts right after them.
    """
    nBytesPerArc = self.nBytesArc + self.nBytesNodeAddress
    iNextAddr = nBytesPerArc * len(self.oRoot.arcs)
    for oNode in self.lMinimizedNodes:
        oNode.addr = iNextAddr
        # a node without arcs still takes the room of one arc
        nSlots = len(oNode.arcs) or 1
        iNextAddr += nSlots * nBytesPerArc
def getBinaryAsJSON (self, nCompressionMethod=1, bBinaryDictAsHexString=True):
"return a JSON string containing all necessary data of the dictionary (compressed as a binary string)"
self._calculateBinary(nCompressionMethod)
byDic = b""
byDic = self.oRoot.convToBytes(self.nBytesArc, self.nBytesNodeAddress)
for oNode in self.lMinimizedNodes:
byDic += oNode.convToBytes(self.nBytesArc, self.nBytesNodeAddress)
return {
"sHeader": "/grammalecte-fsa/",
"sLangCode": self.sLangCode,
"sLangName": self.sLangName,
"sDicName": self.sDicName,
"sDescription": self.sDescription,
"sFileName": self.sFileName,
|
| ︙ | ︙ | |||
485 486 487 488 489 490 491 |
with open(spfDst, "w", encoding="utf-8", newline="\n") as hDst:
if bInJSModule:
hDst.write('// JavaScript\n// Generated data (do not edit)\n\n"use strict";\n\nconst dictionary = ')
hDst.write( json.dumps(self.getBinaryAsJSON(nCompressionMethod, bBinaryDictAsHexString), ensure_ascii=False) )
if bInJSModule:
hDst.write(";\n\nexports.dictionary = dictionary;\n")
| < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < | < | | | | < < < < < < < < < | 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 |
with open(spfDst, "w", encoding="utf-8", newline="\n") as hDst:
if bInJSModule:
hDst.write('// JavaScript\n// Generated data (do not edit)\n\n"use strict";\n\nconst dictionary = ')
hDst.write( json.dumps(self.getBinaryAsJSON(nCompressionMethod, bBinaryDictAsHexString), ensure_ascii=False) )
if bInJSModule:
hDst.write(";\n\nexports.dictionary = dictionary;\n")
def _getDate (self):
    "return the current date and time formatted as YYYY-MM-DD HH:MM:SS"
    sDateTimeFormat = "%Y-%m-%d %H:%M:%S"
    return time.strftime(sDateTimeFormat)
def _writeNodes (self, sPathFile, nCompressionMethod=1):
    """
    For debugging only.

    Write the text representation of the root node, then of every node of
    self.lMinimizedNodes, one after the other, in the file
    <sPathFile>.nodes.<nCompressionMethod>.txt
    """
    print(" > Write nodes")
    spfNodes = sPathFile + ".nodes." + str(nCompressionMethod) + ".txt"
    with open(spfNodes, 'w', encoding='utf-8', newline="\n") as hDst:
        hDst.write(self.oRoot.getTxtRepr(self.nBytesArc, self.lArcVal) + "\n")
        hDst.writelines(
            oNode.getTxtRepr(self.nBytesArc, self.lArcVal) + "\n"
            for oNode in self.lMinimizedNodes
        )
class DawgNode:
"""Node of the word graph"""
NextId = 0
|
| ︙ | ︙ | |||
644 645 646 647 648 649 650 |
self.arcs = collections.OrderedDict(sorted(self.arcs.items(), key=lambda t: dValOccur.get(t[0], 0), reverse=True))
def sortArcs2 (self, dValOccur, lArcVal):
"sort arcs of each node depending on the previous char"
self.arcs = collections.OrderedDict(sorted(self.arcs.items(), key=lambda t: dValOccur.get(lArcVal[t[0]], 0), reverse=True))
# VERSION 1 =====================================================================================================
| | | 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 |
self.arcs = collections.OrderedDict(sorted(self.arcs.items(), key=lambda t: dValOccur.get(t[0], 0), reverse=True))
def sortArcs2 (self, dValOccur, lArcVal):
    """
    Sort the arcs of this node by decreasing number of occurrences.

    The count of an arc is dValOccur[lArcVal[arc]] (0 when missing).
    Arcs with the same count keep their current relative order.
    """
    def _getOccur (tArc):
        # tArc is a (arc value, next node) pair
        return dValOccur.get(lArcVal[tArc[0]], 0)

    lSortedArcs = sorted(self.arcs.items(), key=_getOccur, reverse=True)
    self.arcs = collections.OrderedDict(lSortedArcs)
# VERSION 1 =====================================================================================================
def convToBytes (self, nBytesArc, nBytesNodeAddress):
"""
Convert to bytes (method 1).
Node scheme:
- Arc length is defined by nBytesArc
- Address length is defined by nBytesNodeAddress
|
| ︙ | ︙ | |||
686 687 688 689 690 691 692 |
val = val | nFinalNodeMask
if i == nArc:
val = val | nFinalArcMask
by += val.to_bytes(nBytesArc, byteorder='big')
by += self.arcs[arc].addr.to_bytes(nBytesNodeAddress, byteorder='big')
return by
| | < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < | 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 |
val = val | nFinalNodeMask
if i == nArc:
val = val | nFinalArcMask
by += val.to_bytes(nBytesArc, byteorder='big')
by += self.arcs[arc].addr.to_bytes(nBytesNodeAddress, byteorder='big')
return by
def getTxtRepr (self, nBytesArc, lVal):
    """
    Return the representation of the node as a string (method 1).

    The first line gives the node index and address. It is followed by one
    line per arc: label (lVal[arc]), arc value in binary, index and address
    of the next node. The highest bit of an arc value (for a width of
    <nBytesArc> bytes) is set on the first arc when the node is final, the
    second highest bit is set on the last arc. A node without arcs gets a
    single line with both bits set.
    """
    sArcFormat = " {:<20} {:0>16} i{:_>10} #{:_>10}\n"
    nBits = nBytesArc * 8
    nFinalNodeMask = 1 << (nBits - 1)
    nFinalArcMask = 1 << (nBits - 2)
    lLines = [ "i{:_>10} -- #{:_>10}\n".format(self.i, self.addr) ]
    if not self.arcs:
        lLines.append(sArcFormat.format("", bin(nFinalNodeMask | nFinalArcMask)[2:], "0", "0"))
        return "".join(lLines)
    iLastArc = len(self.arcs) - 1
    for iArc, (nArcVal, oNextNode) in enumerate(self.arcs.items()):
        nFlaggedVal = nArcVal
        if iArc == 0 and self.final:
            nFlaggedVal |= nFinalNodeMask
        if iArc == iLastArc:
            nFlaggedVal |= nFinalArcMask
        lLines.append(sArcFormat.format(lVal[nArcVal], bin(nFlaggedVal)[2:], oNextNode.i, oNextNode.addr))
    return "".join(lLines)
# Another attempt to sort node arcs
_dCharOrder = {
# key: previous char, value: dictionary of chars {c: nValue}
"": {}
|
| ︙ | ︙ |