74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
|
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
|
-
-
+
+
-
+
-
+
-
+
+
+
-
+
|
self.aSugg.clear()
self.dSugg.clear()
class IBDAWG:
"""INDEXABLE BINARY DIRECT ACYCLIC WORD GRAPH"""
def __init__ (self, sDicName):
self.by = pkgutil.get_data(__package__, "_dictionaries/" + sDicName)
def __init__ (self, sfDict):
self.by = pkgutil.get_data(__package__, "_dictionaries/" + sfDict)
if not self.by:
raise OSError("# Error. File not found or not loadable: "+sDicName)
raise OSError("# Error. File not found or not loadable: "+sfDict)
if self.by[0:7] != b"/pyfsa/":
raise TypeError("# Error. Not a pyfsa binary dictionary. Header: {}".format(self.by[0:9]))
if not(self.by[7:8] == b"1" or self.by[7:8] == b"2" or self.by[7:8] == b"3"):
raise ValueError("# Error. Unknown dictionary version: {}".format(self.by[7:8]))
try:
header, info, values, bdic = self.by.split(b"\0\0\0\0", 3)
except Exception:
raise Exception
self.sName = sDicName
self.sFileName = sfDict
self.nCompressionMethod = int(self.by[7:8].decode("utf-8"))
self.sHeader = header.decode("utf-8")
self.lArcVal = values.decode("utf-8").split("\t")
self.nArcVal = len(self.lArcVal)
self.byDic = bdic
l = info.decode("utf-8").split("/")
self.sLang = l[0]
self.sLangCode = "xx"
self.sLangName = l[0]
self.sDicName = ""
self.nChar = int(l[1])
self.nBytesArc = int(l[2])
self.nBytesNodeAddress = int(l[3])
self.nEntries = int(l[4])
self.nEntry = int(l[4])
self.nNode = int(l[5])
self.nArc = int(l[6])
self.nAff = int(l[7])
self.cStemming = l[8]
if self.cStemming == "S":
self.funcStemming = st.changeWordWithSuffixCode
elif self.cStemming == "A":
|
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
|
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
|
-
+
+
+
+
-
-
+
+
+
-
-
+
+
+
+
+
+
+
+
+
+
+
+
-
+
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
|
self.bOptNumSigle = False
self.bOptNumAtLast = False
def getInfo (self):
    """Return a multi-line, human-readable summary of the dictionary.

    The summary is built from attributes of the instance: language name and
    code, dictionary name, compression method, date, stemming code, the
    arc-value counters, the entry/node/arc counters and the byte sizes used
    for arcs and node addresses.

    The entry count is read from ``nEntry`` only, so the "Dictionary:" line
    is emitted exactly once.

    Returns:
        str: the formatted summary, terminated by a newline.
    """
    # The first two fragments deliberately form a single output line:
    # only the second one ends with a newline.
    sTemplate = (
        " Language: {0.sLangName} Lang code: {0.sLangCode} Dictionary name: {0.sDicName}"
        " Compression method: {0.nCompressionMethod:>2} Date: {0.sDate} Stemming: {0.cStemming}FX\n"
        " Arcs values: {0.nArcVal:>10,} = {0.nChar:>5,} characters, {0.nAff:>6,} affixes, {0.nTag:>6,} tags\n"
        " Dictionary: {0.nEntry:>12,} entries, {0.nNode:>11,} nodes, {0.nArc:>11,} arcs\n"
        " Address size: {0.nBytesNodeAddress:>1} bytes, Arc size: {0.nBytesArc:>1} bytes\n"
    )
    return sTemplate.format(self)
def writeAsJSObject (self, spfDest, bInJSModule=False, bBinaryDictAsHexString=False):
    """Write the dictionary as a JSON object, optionally wrapped in a JavaScript module.

    Args:
        spfDest: path of the file to write (UTF-8, "\\n" line endings).
        bInJSModule: when true, the JSON object is assigned to
            ``const dictionary`` and exported via ``exports.dictionary``.
        bBinaryDictAsHexString: when true, the binary graph is written as one
            hexadecimal string instead of a list of byte values.

    Each key is emitted once and the binary graph is serialised once, under
    "sByDic".
    """
    import json

    # Mozilla's JS parser does not accept files bigger than 4 MB, so the
    # binary graph can be emitted as a hexadecimal string that is converted
    # back later in the Firefox extension.
    # https://github.com/mozilla/addons-linter/issues/1361
    if bBinaryDictAsHexString:
        xByDic = self.byDic.hex()
    else:
        xByDic = list(self.byDic)

    dData = {
        "sHeader": "/pyfsa/",
        "sLangCode": self.sLangCode,
        "sLangName": self.sLangName,
        "sDicName": self.sDicName,
        "sFileName": self.sFileName,
        # strftime gives "YYYY-MM-DD HH:MM:SS" for every timestamp; slicing
        # str(now) by a fixed width cuts into the seconds when the
        # microsecond field happens to be 0.
        "sDate": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        "nEntry": self.nEntry,
        "nChar": self.nChar,
        "nAff": self.nAff,
        "nTag": self.nTag,
        "cStemming": self.cStemming,
        "dChar": self.dChar,
        "nNode": self.nNode,
        "nArc": self.nArc,
        "nArcVal": self.nArcVal,
        "lArcVal": self.lArcVal,
        "nCompressionMethod": self.nCompressionMethod,
        "nBytesArc": self.nBytesArc,
        "nBytesNodeAddress": self.nBytesNodeAddress,
        "nBytesOffset": self.nBytesOffset,
        "sByDic": xByDic,
    }

    with open(spfDest, "w", encoding="utf-8", newline="\n") as hDst:
        if bInJSModule:
            hDst.write('// JavaScript\n// Generated data (do not edit)\n\n"use strict";\n\nconst dictionary = ')
        hDst.write(json.dumps(dData, ensure_ascii=False))
        if bInJSModule:
            hDst.write(";\n\nexports.dictionary = dictionary;\n")
def isValidToken (self, sToken):
"checks if <sToken> is valid (if there is hyphens in <sToken>, <sToken> is split, each part is checked)"
if self.isValid(sToken):
return True
if "-" in sToken:
if sToken.count("-") > 4:
|