Overview
| Comment: | [graphspell][py] dawg builder: filter entries with regex |
|---|---|
| Downloads: | Tarball | ZIP archive | SQL archive |
| Timelines: | family | ancestors | descendants | both | trunk | graphspell |
| Files: | files | file ages | folders |
| SHA3-256: |
96692bb88326c20e8587e6e98a30cdcd |
| User & Date: | olr on 2018-05-04 08:16:18 |
| Other Links: | manifest | tags |
Context
|
2018-05-04
| ||
| 10:29 | [build][graphspell] multiple main dictionaries check-in: ca4c833876 user: olr tags: trunk, build, graphspell | |
| 08:16 | [graphspell][py] dawg builder: filter entries with regex check-in: 96692bb883 user: olr tags: trunk, graphspell | |
|
2018-05-03
| ||
| 11:33 | [graphspell][py][bug] fix words selection check-in: 0eaff45865 user: olr tags: trunk, graphspell | |
Changes
Modified graphspell/dawg.py from [63684196d2] to [64364f5bf4].
| ︙ | ︙ | |||
37 38 39 40 41 42 43 |
"""DIRECT ACYCLIC WORD GRAPH"""
# This code is inspired from Steve Hanov’s DAWG, 2011. (http://stevehanov.ca/blog/index.php?id=115)
# We store suffix/affix codes and tags within the graph after the “real” word.
# A word is a list of numbers [ c1, c2, c3 . . . cN, iAffix, iTags]
# Each arc is an index in self.lArcVal, where are stored characters, suffix/affix codes for stemming and tags.
# Important: As usual, the last node (after ‘iTags’) is tagged final, AND the node after ‘cN’ is ALSO tagged final.
| | > > > > > > > | | | | | | | | | | | | | | | | | | | | | | | 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 |
"""DIRECT ACYCLIC WORD GRAPH"""
# This code is inspired from Steve Hanov’s DAWG, 2011. (http://stevehanov.ca/blog/index.php?id=115)
# We store suffix/affix codes and tags within the graph after the “real” word.
# A word is a list of numbers [ c1, c2, c3 . . . cN, iAffix, iTags]
# Each arc is an index in self.lArcVal, where are stored characters, suffix/affix codes for stemming and tags.
# Important: As usual, the last node (after ‘iTags’) is tagged final, AND the node after ‘cN’ is ALSO tagged final.
def __init__ (self, src, cStemming, sLangCode, sLangName="", sDicName="", sSelectFilterRegex=""):
print("===== Direct Acyclic Word Graph - Minimal Acyclic Finite State Automaton =====")
cStemming = cStemming.upper()
if cStemming == "A":
funcStemmingGen = st.defineAffixCode
elif cStemming == "S":
funcStemmingGen = st.defineSuffixCode
elif cStemming == "N":
funcStemmingGen = st.noStemming
else:
raise ValueError("# Error. Unknown stemming code: {}".format(cStemming))
aEntry = set()
lChar = ['']; dChar = {}; nChar = 1; dCharOccur = {}
lAff = []; dAff = {}; nAff = 0; dAffOccur = {}
lTag = []; dTag = {}; nTag = 0; dTagOccur = {}
nErr = 0
try:
zFilter = re.compile(sSelectFilterRegex) if sSelectFilterRegex else None
except:
print(" # Error. Wrong filter regex. Filter ignored.")
zFilter = None
# read lexicon
if type(src) is str:
iterable = readFile(src)
else:
iterable = src
for sFlex, sStem, sTag in iterable:
if not zFilter or zFilter.search(sTag):
addWordToCharDict(sFlex)
# chars
for c in sFlex:
if c not in dChar:
dChar[c] = nChar
lChar.append(c)
nChar += 1
dCharOccur[c] = dCharOccur.get(c, 0) + 1
# affixes to find stem from flexion
sAff = funcStemmingGen(sFlex, sStem)
if sAff not in dAff:
dAff[sAff] = nAff
lAff.append(sAff)
nAff += 1
dAffOccur[sAff] = dCharOccur.get(sAff, 0) + 1
# tags
if sTag not in dTag:
dTag[sTag] = nTag
lTag.append(sTag)
nTag += 1
dTagOccur[sTag] = dTagOccur.get(sTag, 0) + 1
aEntry.add((sFlex, dAff[sAff], dTag[sTag]))
if not aEntry:
raise ValueError("# Error. Empty lexicon")
# Preparing DAWG
print(" > Preparing list of words")
lVal = lChar + lAff + lTag
lWord = [ [dChar[c] for c in sFlex] + [iAff+nChar] + [iTag+nChar+nAff] for sFlex, iAff, iTag in aEntry ]
|
| ︙ | ︙ |