@@ -59,25 +59,27 @@
         aEntry = set()
         lChar = ['']; dChar = {}; nChar = 1; dCharOccur = {}
         lAff = []; dAff = {}; nAff = 0; dAffOccur = {}
         lTag = []; dTag = {}; nTag = 0; dTagOccur = {}
         nErr = 0
+        self.a2grams = set()
         try:
             zFilter = re.compile(sSelectFilterRegex) if sSelectFilterRegex else None
         except re.error:
             print(" # Error. Wrong filter regex. Filter ignored.")
             traceback.print_exc()
             zFilter = None
         # read lexicon
         if type(src) is str:
             iterable = readFile(src)
         else:
             iterable = src
         for sFlex, sStem, sTag in iterable:
             if not zFilter or zFilter.search(sTag):
+                self.a2grams.update(st.getNgrams(sFlex))
                 addWordToCharDict(sFlex)
                 # chars
                 for c in sFlex:
                     if c not in dChar:
                         dChar[c] = nChar
                         lChar.append(c)
                         nChar += 1
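The new a2grams set collects every distinct character 2-gram found in the retained flexions, so the compiled dictionary can later tell which character pairs ever occur in a valid word. As a rough sketch of what a helper like st.getNgrams can look like (an assumption for illustration, not necessarily the module's actual implementation):

    def getNgrams (sWord, n=2):
        "return the list of n-grams (substrings of n chars) found in <sWord>"
        # a word of length L yields L-n+1 overlapping slices
        return [ sWord[i:i+n]  for i in range(len(sWord)-n+1) ]

For "salut" this yields ["sa", "al", "lu", "ut"]; updating a set with these lists over the whole lexicon keeps each pair only once.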
@@ -281,14 +284,15 @@
         print(" * {:<12} {:>16,}".format("Entries:", self.nEntry))
         print(" * {:<12} {:>16,}".format("Characters:", self.nChar))
         print(" * {:<12} {:>16,}".format("Affixes:", self.nAff))
         print(" * {:<12} {:>16,}".format("Tags:", self.nTag))
         print(" * {:<12} {:>16,}".format("Arc values:", self.nArcVal))
         print(" * {:<12} {:>16,}".format("Nodes:", self.nNode))
         print(" * {:<12} {:>16,}".format("Arcs:", self.nArc))
+        print(" * {:<12} {:>16,}".format("2grams:", len(self.a2grams)))
         print(" * {:<12} {:>16}".format("Stemming:", self.cStemming + "FX"))

     def getArcStats (self):
         "return a string with statistics about nodes and arcs"
         d = {}
         for oNode in self.lMinimizedNodes:
             n = len(oNode.arcs)
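Because a2grams is a set, the new "2grams:" line reports distinct pairs, not total occurrences. A quick check with the getNgrams sketch above (illustrative words):

    a2grams = set()
    for sWord in ("salut", "alto"):
        a2grams.update(getNgrams(sWord))    # "sa", "al", "lu", "ut" + "al", "lt", "to"
    print(len(a2grams))                     # 6, since "al" is counted only once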
@@ -444,15 +448,16 @@
             "nCompressionMethod": nCompressionMethod,
             "nBytesArc": self.nBytesArc,
             "nBytesNodeAddress": self.nBytesNodeAddress,
             "nBytesOffset": self.nBytesOffset,
             # Mozilla’s JS parser doesn’t like files bigger than 4 MB!
             # So, if necessary, we use a hexadecimal string that we will convert later in Firefox’s extension.
             # https://github.com/mozilla/addons-linter/issues/1361
-            "sByDic": byDic.hex() if bBinaryDictAsHexString else [ e for e in byDic ]
+            "sByDic": byDic.hex() if bBinaryDictAsHexString else [ e for e in byDic ],
+            "l2grams": list(self.a2grams)
         }

     def writeAsJSObject (self, spfDst, nCompressionMethod, bInJSModule=False, bBinaryDictAsHexString=True):
         "write a file (JSON or JS module) with all the necessary data"
         if not spfDst.endswith(".json"):
             spfDst += "."+str(nCompressionMethod)+".json"
         with open(spfDst, "w", encoding="utf-8", newline="\n") as hDst:
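The set goes through list() because Python sets are not JSON-serializable; a consumer rebuilds the set after loading. A minimal round-trip sketch (values are illustrative):

    import json

    sJSON = json.dumps({ "l2grams": list({"ab", "bc"}) }, ensure_ascii=False)
    a2grams = set(json.loads(sJSON)["l2grams"])    # back to a set for fast membership tests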
@@ -494,29 +499,34 @@
         - Section Values:
             * a list of strings encoded in binary from utf-8, each value separated with a tabulation
         - Section Word Graph (nodes / arcs)
             * A list of nodes which are a list of arcs with an address of the next node.
               See DawgNode.convToBytes() for details.
+        - Section 2grams:
+            * A list of 2grams (as strings: 2 chars) encoded in binary from utf-8, each value separated with a tabulation
         """
         self._calculateBinary(nCompressionMethod)
         if not sPathFile.endswith(".bdic"):
             sPathFile += "."+str(nCompressionMethod)+".bdic"
         with open(sPathFile, 'wb') as hDst:
             # header
             hDst.write("/grammalecte-fsa/{}/".format(nCompressionMethod).encode("utf-8"))
             hDst.write(b"\0\0\0\0")
             # infos
             sInfo = "{}//{}//{}//{}//{}//{}//{}//{}//{}//{}//{}//{}//".format(self.sLangCode, self.sLangName, self.sDicName, self._getDate(), \
                                                                               self.nChar, self.nBytesArc, self.nBytesNodeAddress, \
                                                                               self.nEntry, self.nNode, self.nArc, self.nAff, self.cStemming)
             hDst.write(sInfo.encode("utf-8"))
             hDst.write(b"\0\0\0\0")
             # lArcVal
             hDst.write("\t".join(self.lArcVal).encode("utf-8"))
             hDst.write(b"\0\0\0\0")
+            # 2grams
+            hDst.write("\t".join(self.a2grams).encode("utf-8"))
+            hDst.write(b"\0\0\0\0")
             # DAWG: nodes / arcs
             if nCompressionMethod == 1:
                 hDst.write(self.oRoot.convToBytes1(self.nBytesArc, self.nBytesNodeAddress))
                 for oNode in self.lMinimizedNodes:
                     hDst.write(oNode.convToBytes1(self.nBytesArc, self.nBytesNodeAddress))
             elif nCompressionMethod == 2:
                 hDst.write(self.oRoot.convToBytes2(self.nBytesArc, self.nBytesNodeAddress))
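On the reading side, the 2grams section can be recovered by splitting the file on the 4-byte null separator: none of the utf-8 text sections before it can contain that sequence (the node data after it might, but that only splits the tail further and leaves the first four sections intact). A hedged sketch assuming the layout written above, with a hypothetical file name:

    with open("fr.2.bdic", "rb") as hSrc:
        lSections = hSrc.read().split(b"\0\0\0\0")
    # sections: 0 = header, 1 = infos, 2 = arc values, 3 = 2grams, 4+ = nodes
    a2grams = set(lSections[3].decode("utf-8").split("\t"))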