Index: graphspell/dawg.py ================================================================== --- graphspell/dawg.py +++ graphspell/dawg.py @@ -25,11 +25,11 @@ if os.path.isfile(spf): with open(spf, "r", encoding="utf-8") as hSrc: for sLine in hSrc: sLine = sLine.strip() if sLine and not sLine.startswith("#"): - yield sLine + yield sLine.split("\t") else: raise OSError("# Error. File not found or not loadable: " + spf) @@ -39,11 +39,11 @@ # We store suffix/affix codes and tags within the graph after the “real” word. # A word is a list of numbers [ c1, c2, c3 . . . cN, iAffix, iTags] # Each arc is an index in self.lArcVal, where are stored characters, suffix/affix codes for stemming and tags. # Important: As usual, the last node (after ‘iTags’) is tagged final, AND the node after ‘cN’ is ALSO tagged final. - def __init__ (self, spfSrc, cStemming, sLangCode, sLangName="", sDicName=""): + def __init__ (self, src, cStemming, sLangCode, sLangName="", sDicName=""): print("===== Direct Acyclic Word Graph - Minimal Acyclic Finite State Automaton =====") cStemming = cStemming.upper() if cStemming == "A": funcStemmingGen = st.defineAffixCode elif cStemming == "S": @@ -56,14 +56,17 @@ lEntry = [] lChar = ['']; dChar = {}; nChar = 1; dCharOccur = {} lAff = []; dAff = {}; nAff = 0; dAffOccur = {} lTag = []; dTag = {}; nTag = 0; dTagOccur = {} nErr = 0 - + # read lexicon - for sLine in readFile(spfSrc): - sFlex, sStem, sTag = sLine.split("\t") + if type(src) is str: + iterable = readFile(src) + else: + iterable = src + for sFlex, sStem, sTag in iterable: addWordToCharDict(sFlex) # chars for c in sFlex: if c not in dChar: dChar[c] = nChar @@ -95,16 +98,12 @@ # Dictionary of arc values occurrency, to sort arcs of each node dValOccur = dict( [ (dChar[c], dCharOccur[c]) for c in dChar ] \ + [ (dAff[aff]+nChar, dAffOccur[aff]) for aff in dAff ] \ + [ (dTag[tag]+nChar+nAff, dTagOccur[tag]) for tag in dTag ] ) - #with open(spfSrc[:-8]+".valuesfreq.txt", 'w', encoding='utf-8') as hFreqDst: # DEBUG - # for iKey, nOcc in sorted(dValOccur.items(), key=lambda t: t[1], reverse=True): - # hFreqDst.write("{}: {}\n".format(lVal[iKey], nOcc)) - # hFreqDst.close() - self.sFileName = spfSrc + self.sFileName = src if type(src) is str else "[None]" self.sLangCode = sLangCode self.sLangName = sLangName self.sDicName = sDicName self.nEntry = len(lWord) self.aPreviousEntry = []