︙
"""DIRECT ACYCLIC WORD GRAPH"""
# This code is inspired from Steve Hanov’s DAWG, 2011. (http://stevehanov.ca/blog/index.php?id=115)
# We store suffix/affix codes and tags within the graph after the “real” word.
# A word is a list of numbers [ c1, c2, c3 . . . cN, iAffix, iTags]
# Each arc is an index in self.lArcVal, where are stored characters, suffix/affix codes for stemming and tags.
# Important: As usual, the last node (after ‘iTags’) is tagged final, AND the node after ‘cN’ is ALSO tagged final.
def __init__ (self, src, cStemming, sLangCode, sLangName="", sDicName="", sSelectFilterRegex=""):
print("===== Direct Acyclic Word Graph - Minimal Acyclic Finite State Automaton =====")
cStemming = cStemming.upper()
if cStemming == "A":
funcStemmingGen = st.defineAffixCode
elif cStemming == "S":
funcStemmingGen = st.defineSuffixCode
elif cStemming == "N":
|
|
|
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
|
"""DIRECT ACYCLIC WORD GRAPH"""
# This code is inspired from Steve Hanov’s DAWG, 2011. (http://stevehanov.ca/blog/index.php?id=115)
# We store suffix/affix codes and tags within the graph after the “real” word.
# A word is a list of numbers [ c1, c2, c3 . . . cN, iAffix, iTags]
# Each arc is an index in self.lArcVal, where are stored characters, suffix/affix codes for stemming and tags.
# Important: As usual, the last node (after ‘iTags’) is tagged final, AND the node after ‘cN’ is ALSO tagged final.
def __init__ (self, src, cStemming, sLangCode, sLangName="", sDicName="", sDescription="", sSelectFilterRegex=""):
print("===== Direct Acyclic Word Graph - Minimal Acyclic Finite State Automaton =====")
cStemming = cStemming.upper()
if cStemming == "A":
funcStemmingGen = st.defineAffixCode
elif cStemming == "S":
funcStemmingGen = st.defineSuffixCode
elif cStemming == "N":
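To make the layout described in the header comments concrete, here is a tiny hand-made sketch; the characters, the suffix code and the tag below are invented, and their indices do not reproduce the real numbering of lArcVal.

    # Invented illustration of the word layout described above (not real data).
    # lArcVal holds the characters first, then the stemming codes, then the tags;
    # every element of a stored word is an index into that list.
    lArcVal = ["c", "a", "t", "s", "1", "po:nom is:plur"]
    #           0    1    2    3    4    5
    # "cats": its characters, then iAffix ("1" would mean "strip one character
    # to get the lemma cat"), then iTags. The node reached after "s" is final,
    # and so is the node reached after the tags, as stated above.
    lWordAsIndices = [0, 1, 2, 3, 4, 5]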
︙
                  + [ (dAff[aff]+nChar, dAffOccur[aff]) for aff in dAff ] \
                  + [ (dTag[tag]+nChar+nAff, dTagOccur[tag]) for tag in dTag ] )
         self.sFileName = src if type(src) is str else "[None]"
         self.sLangCode = sLangCode
         self.sLangName = sLangName
         self.sDicName = sDicName
+        self.sDescription = sDescription
         self.nEntry = len(lWord)
         self.aPreviousEntry = []
         DawgNode.resetNextId()
         self.oRoot = DawgNode()
         self.lUncheckedNodes = []          # list of nodes that have not been checked for duplication.
         self.lMinimizedNodes = {}          # dictionary of unique nodes that have been checked for duplication.
         self.lSortedNodes = []             # version 2 and 3
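As a usage sketch only, a hypothetical call showing where the new sDescription argument fits; the lexicon path and the descriptive strings are invented, and only the keyword names come from the signature above.

    # Hypothetical call; the path and the strings are invented.
    oDAWG = DAWG("lexicons/French.lex", "S", "fr",
                 sLangName="French", sDicName="classic",
                 sDescription="Classic French dictionary")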
︙
         for oNode in self.lSortedNodes:
             byDic += oNode.convToBytes3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset)
         return {
             "sHeader": "/grammalecte-fsa/",
             "sLangCode": self.sLangCode,
             "sLangName": self.sLangName,
             "sDicName": self.sDicName,
+            "sDescription": self.sDescription,
             "sFileName": self.sFileName,
             "sDate": self._getDate(),
             "nEntry": self.nEntry,
             "nChar": self.nChar,
             "nAff": self.nAff,
             "nTag": self.nTag,
             "cStemming": self.cStemming,
︙
         if not sPathFile.endswith(".bdic"):
             sPathFile += "."+str(nCompressionMethod)+".bdic"
         with open(sPathFile, 'wb') as hDst:
             # header
             hDst.write("/grammalecte-fsa/{}/".format(nCompressionMethod).encode("utf-8"))
             hDst.write(b"\0\0\0\0")
             # infos
-            sInfo = "{}//{}//{}//{}//{}//{}//{}//{}//{}//{}//{}//{}//".format(self.sLangCode, self.sLangName, self.sDicName, self._getDate(), \
+            sInfo = "{}//{}//{}//{}//{}//{}//{}//{}//{}//{}//{}//{}//{}".format(self.sLangCode, self.sLangName, self.sDicName, self.sDescription, self._getDate(), \
                     self.nChar, self.nBytesArc, self.nBytesNodeAddress, \
                     self.nEntry, self.nNode, self.nArc, self.nAff, self.cStemming)
             hDst.write(sInfo.encode("utf-8"))
             hDst.write(b"\0\0\0\0")
             # lArcVal
             hDst.write("\t".join(self.lArcVal).encode("utf-8"))
             hDst.write(b"\0\0\0\0")
             # 2grams
             hDst.write("\t".join(self.a2grams).encode("utf-8"))
︙