Index: graphspell/dawg.py ================================================================== --- graphspell/dawg.py +++ graphspell/dawg.py @@ -39,11 +39,11 @@ # We store suffix/affix codes and tags within the graph after the “real” word. # A word is a list of numbers [ c1, c2, c3 . . . cN, iAffix, iTags] # Each arc is an index in self.lArcVal, where are stored characters, suffix/affix codes for stemming and tags. # Important: As usual, the last node (after ‘iTags’) is tagged final, AND the node after ‘cN’ is ALSO tagged final. - def __init__ (self, src, cStemming, sLangCode, sLangName="", sDicName=""): + def __init__ (self, src, cStemming, sLangCode, sLangName="", sDicName="", sSelectFilterRegex=""): print("===== Direct Acyclic Word Graph - Minimal Acyclic Finite State Automaton =====") cStemming = cStemming.upper() if cStemming == "A": funcStemmingGen = st.defineAffixCode elif cStemming == "S": @@ -57,38 +57,45 @@ lChar = ['']; dChar = {}; nChar = 1; dCharOccur = {} lAff = []; dAff = {}; nAff = 0; dAffOccur = {} lTag = []; dTag = {}; nTag = 0; dTagOccur = {} nErr = 0 + try: + zFilter = re.compile(sSelectFilterRegex) if sSelectFilterRegex else None + except: + print(" # Error. Wrong filter regex. Filter ignored.") + zFilter = None + # read lexicon if type(src) is str: iterable = readFile(src) else: iterable = src for sFlex, sStem, sTag in iterable: - addWordToCharDict(sFlex) - # chars - for c in sFlex: - if c not in dChar: - dChar[c] = nChar - lChar.append(c) - nChar += 1 - dCharOccur[c] = dCharOccur.get(c, 0) + 1 - # affixes to find stem from flexion - sAff = funcStemmingGen(sFlex, sStem) - if sAff not in dAff: - dAff[sAff] = nAff - lAff.append(sAff) - nAff += 1 - dAffOccur[sAff] = dCharOccur.get(sAff, 0) + 1 - # tags - if sTag not in dTag: - dTag[sTag] = nTag - lTag.append(sTag) - nTag += 1 - dTagOccur[sTag] = dTagOccur.get(sTag, 0) + 1 - aEntry.add((sFlex, dAff[sAff], dTag[sTag])) + if not zFilter or zFilter.search(sTag): + addWordToCharDict(sFlex) + # chars + for c in sFlex: + if c not in dChar: + dChar[c] = nChar + lChar.append(c) + nChar += 1 + dCharOccur[c] = dCharOccur.get(c, 0) + 1 + # affixes to find stem from flexion + sAff = funcStemmingGen(sFlex, sStem) + if sAff not in dAff: + dAff[sAff] = nAff + lAff.append(sAff) + nAff += 1 + dAffOccur[sAff] = dCharOccur.get(sAff, 0) + 1 + # tags + if sTag not in dTag: + dTag[sTag] = nTag + lTag.append(sTag) + nTag += 1 + dTagOccur[sTag] = dTagOccur.get(sTag, 0) + 1 + aEntry.add((sFlex, dAff[sAff], dTag[sTag])) if not aEntry: raise ValueError("# Error. Empty lexicon") # Preparing DAWG print(" > Preparing list of words")