411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
|
hDst.write('// JavaScript\n// Generated data (do not edit)\n\n"use strict";\n\nconst dictionary = ')
hDst.write(json.dumps({
"sHeader": "/pyfsa/",
"sLangCode": self.sLangCode,
"sLangName": self.sLangName,
"sDicName": self.sDicName,
"sFileName": self.sFileName,
"sDate": str(datetime.datetime.now())[:-7],
"nEntry": self.nEntry,
"nChar": self.nChar,
"nAff": self.nAff,
"nTag": self.nTag,
"cStemming": self.cStemming,
"dChar": self.dChar,
"nNode": self.nNode,
|
|
|
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
|
hDst.write('// JavaScript\n// Generated data (do not edit)\n\n"use strict";\n\nconst dictionary = ')
hDst.write(json.dumps({
"sHeader": "/pyfsa/",
"sLangCode": self.sLangCode,
"sLangName": self.sLangName,
"sDicName": self.sDicName,
"sFileName": self.sFileName,
"sDate": self._getDate(),
"nEntry": self.nEntry,
"nChar": self.nChar,
"nAff": self.nAff,
"nTag": self.nTag,
"cStemming": self.cStemming,
"dChar": self.dChar,
"nNode": self.nNode,
|
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
|
Each section is separated with 4 bytes of \0
- Section Header:
/pyfsa/[compression method]
* compression method is an ASCII string
- Section Informations:
/[tag_lang]
/[number of chars]
/[number of bytes for each arc]
/[number of bytes for each address node]
/[number of entries]
/[number of nodes]
/[number of arcs]
/[number of affixes]
* each field is an ASCII string
/[stemming code]
* "S" means stems are generated by /suffix_code/, "A" means they are generated by /affix_code/
See defineSuffixCode() and defineAffixCode() for details.
"N" means no stemming
- Section Values:
* a list of strings encoded in binary from utf-8, each value separated with a tabulation
- Section Word Graph (nodes / arcs)
* A list of nodes which are a list of arcs with an address of the next node.
See DawgNode.convToBytes() for details.
"""
if not sPathFile.endswith(".bdic"):
sPathFile += "."+str(nCompressionMethod)+".bdic"
with open(sPathFile, 'wb') as hDst:
# header
hDst.write("/pyfsa/{}/".format(nCompressionMethod).encode("utf-8"))
hDst.write(b"\0\0\0\0")
# infos
hDst.write("{}/{}/{}/{}/{}/{}/{}/{}/{}".format(self.sLangName, self.nChar, self.nBytesArc, self.nBytesNodeAddress, \
self.nEntry, self.nNode, self.nArc, self.nAff, self.cStemming).encode("utf-8"))
hDst.write(b"\0\0\0\0")
# lArcVal
hDst.write("\t".join(self.lArcVal).encode("utf-8"))
hDst.write(b"\0\0\0\0")
# DAWG: nodes / arcs
if nCompressionMethod == 1:
hDst.write(self.oRoot.convToBytes1(self.nBytesArc, self.nBytesNodeAddress))
for oNode in self.lMinimizedNodes:
hDst.write(oNode.convToBytes1(self.nBytesArc, self.nBytesNodeAddress))
elif nCompressionMethod == 2:
hDst.write(self.oRoot.convToBytes2(self.nBytesArc, self.nBytesNodeAddress))
for oNode in self.lSortedNodes:
hDst.write(oNode.convToBytes2(self.nBytesArc, self.nBytesNodeAddress))
elif nCompressionMethod == 3:
hDst.write(self.oRoot.convToBytes3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset))
for oNode in self.lSortedNodes:
hDst.write(oNode.convToBytes3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset))
hDst.close()
def _writeNodes (self, sPathFile, nCompressionMethod):
"for debugging only"
print(" > Write nodes")
with open(sPathFile+".nodes."+str(nCompressionMethod)+".txt", 'w', encoding='utf-8', newline="\n") as hDst:
if nCompressionMethod == 1:
hDst.write(self.oRoot.getTxtRepr1(self.nBytesArc, self.nBytesNodeAddress, self.lArcVal)+"\n")
#hDst.write( ''.join( [ "%02X " % z for z in self.oRoot.convToBytes1(self.nBytesArc, self.nBytesNodeAddress) ] ).strip() )
|
|
>
>
>
|
>
>
|
|
>
>
>
>
|
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
|
Each section is separated with 4 bytes of \0
- Section Header:
/pyfsa/[compression method]
* compression method is an ASCII string
- Section Informations:
/[lang code]
/[lang name]
/[dictionary name]
/[date creation]
/[number of chars]
/[number of bytes for each arc]
/[number of bytes for each address node]
/[number of entries]
/[number of nodes]
/[number of arcs]
/[number of affixes]
* each field is an ASCII string
/[stemming code]
* "S" means stems are generated by /suffix_code/,
"A" means they are generated by /affix_code/
See defineSuffixCode() and defineAffixCode() for details.
"N" means no stemming
- Section Values:
* a list of strings encoded in binary from utf-8, each value separated with a tabulation
- Section Word Graph (nodes / arcs)
* A list of nodes which are a list of arcs with an address of the next node.
See DawgNode.convToBytes() for details.
"""
if not sPathFile.endswith(".bdic"):
sPathFile += "."+str(nCompressionMethod)+".bdic"
with open(sPathFile, 'wb') as hDst:
# header
hDst.write("/pyfsa/{}/".format(nCompressionMethod).encode("utf-8"))
hDst.write(b"\0\0\0\0")
# infos
sInfo = "{}//{}//{}//{}//{}//{}//{}//{}//{}//{}//{}//{}//".format(self.sLangCode, self.sLangName, self.sDicName, self._getDate(), \
self.nChar, self.nBytesArc, self.nBytesNodeAddress, \
self.nEntry, self.nNode, self.nArc, self.nAff, self.cStemming)
hDst.write(sInfo.encode("utf-8"))
hDst.write(b"\0\0\0\0")
# lArcVal
hDst.write("\t".join(self.lArcVal).encode("utf-8"))
hDst.write(b"\0\0\0\0")
# DAWG: nodes / arcs
if nCompressionMethod == 1:
hDst.write(self.oRoot.convToBytes1(self.nBytesArc, self.nBytesNodeAddress))
for oNode in self.lMinimizedNodes:
hDst.write(oNode.convToBytes1(self.nBytesArc, self.nBytesNodeAddress))
elif nCompressionMethod == 2:
hDst.write(self.oRoot.convToBytes2(self.nBytesArc, self.nBytesNodeAddress))
for oNode in self.lSortedNodes:
hDst.write(oNode.convToBytes2(self.nBytesArc, self.nBytesNodeAddress))
elif nCompressionMethod == 3:
hDst.write(self.oRoot.convToBytes3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset))
for oNode in self.lSortedNodes:
hDst.write(oNode.convToBytes3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset))
hDst.close()
def _getDate (self):
    "return the current local date and time as a string formatted <YYYY.MM.DD HH:MM>"
    # strftime() without a time tuple uses localtime(); it is passed explicitly here.
    sDateFormat = "%Y.%m.%d %H:%M"
    tNow = time.localtime()
    return time.strftime(sDateFormat, tNow)
def _writeNodes (self, sPathFile, nCompressionMethod):
"for debugging only"
print(" > Write nodes")
with open(sPathFile+".nodes."+str(nCompressionMethod)+".txt", 'w', encoding='utf-8', newline="\n") as hDst:
if nCompressionMethod == 1:
hDst.write(self.oRoot.getTxtRepr1(self.nBytesArc, self.nBytesNodeAddress, self.lArcVal)+"\n")
#hDst.write( ''.join( [ "%02X " % z for z in self.oRoot.convToBytes1(self.nBytesArc, self.nBytesNodeAddress) ] ).strip() )
|