Index: gc_lang/fr/perf_memo.txt ================================================================== --- gc_lang/fr/perf_memo.txt +++ gc_lang/fr/perf_memo.txt @@ -20,5 +20,6 @@ 0.5.15 2017.01.22 11:44 4.85204 1.16134 0.770762 0.227874 0.244574 0.253305 0.58831 0.319987 0.0603996 0.00694786 0.5.15 2017.01.22 11:47 4.85593 1.15248 0.762924 0.22744 0.243461 0.254609 0.586741 0.317503 0.0588827 0.00701016 (unicode normalisation NFC) 0.5.15 2017.01.31 12:06 4.88227 1.18008 0.782217 0.232617 0.247672 0.257628 0.596903 0.32169 0.0603505 0.00695196 0.5.15 2017.02.05 10:10 4.90222 1.18444 0.786696 0.233413 0.25071 0.260214 0.602112 0.325235 0.0609932 0.00706897 0.5.16 2017.05.12 07:41 4.92201 1.19269 0.80639 0.239147 0.257518 0.266523 0.62111 0.33359 0.0634668 0.00757178 +0.6.1 2018.02.12 09:58 5.25924 1.2649 0.878442 0.257465 0.280558 0.293903 0.686887 0.391275 0.0672474 0.00824723 Index: graphspell/dawg.py ================================================================== --- graphspell/dawg.py +++ graphspell/dawg.py @@ -11,11 +11,11 @@ import sys import os import collections import json -import datetime +import time from . import str_transform as st from .progressbar import ProgressBar @@ -413,11 +413,11 @@ "sHeader": "/pyfsa/", "sLangCode": self.sLangCode, "sLangName": self.sLangName, "sDicName": self.sDicName, "sFileName": self.sFileName, - "sDate": str(datetime.datetime.now())[:-7], + "sDate": self._getDate(), "nEntry": self.nEntry, "nChar": self.nChar, "nAff": self.nAff, "nTag": self.nTag, "cStemming": self.cStemming, @@ -446,21 +446,25 @@ - Section Header: /pyfsa/[compression method] * compression method is an ASCII string - Section Informations: - /[tag_lang] + /[lang code] + /[lang name] + /[dictionary name] + /[date creation] /[number of chars] /[number of bytes for each arc] /[number of bytes for each address node] /[number of entries] /[number of nodes] /[number of arcs] /[number of affixes] * each field is a ASCII string /[stemming code] - * "S" means stems are generated by /suffix_code/, "A" means they are generated by /affix_code/ + * "S" means stems are generated by /suffix_code/, + "A" means they are generated by /affix_code/ See defineSuffixCode() and defineAffixCode() for details. "N" means no stemming - Section Values: * a list of strings encoded in binary from utf-8, each value separated with a tabulation @@ -474,12 +478,14 @@ with open(sPathFile, 'wb') as hDst: # header hDst.write("/pyfsa/{}/".format(nCompressionMethod).encode("utf-8")) hDst.write(b"\0\0\0\0") # infos - hDst.write("{}/{}/{}/{}/{}/{}/{}/{}/{}".format(self.sLangName, self.nChar, self.nBytesArc, self.nBytesNodeAddress, \ - self.nEntry, self.nNode, self.nArc, self.nAff, self.cStemming).encode("utf-8")) + sInfo = "{}//{}//{}//{}//{}//{}//{}//{}//{}//{}//{}//{}//".format(self.sLangCode, self.sLangName, self.sDicName, self._getDate(), \ + self.nChar, self.nBytesArc, self.nBytesNodeAddress, \ + self.nEntry, self.nNode, self.nArc, self.nAff, self.cStemming) + hDst.write(sInfo.encode("utf-8")) hDst.write(b"\0\0\0\0") # lArcVal hDst.write("\t".join(self.lArcVal).encode("utf-8")) hDst.write(b"\0\0\0\0") # DAWG: nodes / arcs @@ -495,10 +501,13 @@ hDst.write(self.oRoot.convToBytes3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset)) for oNode in self.lSortedNodes: hDst.write(oNode.convToBytes3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset)) hDst.close() + def _getDate (self): + return time.strftime("%Y.%m.%d %H:%M") + def _writeNodes (self, sPathFile, nCompressionMethod): "for debugging only" print(" > Write nodes") with open(sPathFile+".nodes."+str(nCompressionMethod)+".txt", 'w', encoding='utf-8', newline="\n") as hDst: if nCompressionMethod == 1: Index: graphspell/ibdawg.py ================================================================== --- graphspell/ibdawg.py +++ graphspell/ibdawg.py @@ -3,13 +3,13 @@ import os import traceback import pkgutil import re from functools import wraps -import datetime import time import json +import binascii #import logging #logging.basicConfig(filename="suggestions.log", level=logging.DEBUG) from . import str_transform as st @@ -146,22 +146,23 @@ self.sHeader = header.decode("utf-8") self.lArcVal = values.decode("utf-8").split("\t") self.nArcVal = len(self.lArcVal) self.byDic = bdic - l = info.decode("utf-8").split("/") - self.sLangCode = "xx" - self.sLangName = l[0] - self.sDicName = "" - self.nChar = int(l[1]) - self.nBytesArc = int(l[2]) - self.nBytesNodeAddress = int(l[3]) - self.nEntry = int(l[4]) - self.nNode = int(l[5]) - self.nArc = int(l[6]) - self.nAff = int(l[7]) - self.cStemming = l[8] + l = info.decode("utf-8").split("//") + self.sLangCode = l.pop(0) + self.sLangName = l.pop(0) + self.sDicName = l.pop(0) + self.sDate = l.pop(0) + self.nChar = int(l.pop(0)) + self.nBytesArc = int(l.pop(0)) + self.nBytesNodeAddress = int(l.pop(0)) + self.nEntry = int(l.pop(0)) + self.nNode = int(l.pop(0)) + self.nArc = int(l.pop(0)) + self.nAff = int(l.pop(0)) + self.cStemming = l.pop(0) self.nTag = self.nArcVal - self.nChar - self.nAff # to get the value of an arc, to get the char of an arc with its value self.dChar = {} for i in range(1, self.nChar): self.dChar[self.lArcVal[i]] = i @@ -189,11 +190,11 @@ "sHeader": "/pyfsa/", "sLangCode": self.sLangCode, "sLangName": self.sLangName, "sDicName": self.sDicName, "sFileName": self.sFileName, - "sDate": str(datetime.datetime.now())[:-7], + "sDate": time.strftime("%Y.%m.%d %H:%M"), "nEntry": self.nEntry, "nChar": self.nChar, "nAff": self.nAff, "nTag": self.nTag, "cStemming": self.cStemming,