Overview
| Comment: | [core] dawg: accept personal lexicon |
|---|---|
| Downloads: | Tarball | ZIP archive | SQL archive |
| Timelines: | family | ancestors | descendants | both | trunk | core | new_feature |
| Files: | files | file ages | folders |
| SHA3-256: |
3916c538b58d03f2b51613bc833cdae9 |
| User & Date: | olr on 2017-06-23 14:43:25 |
| Other Links: | manifest | tags |
Context
|
2017-06-23
| ||
| 17:11 | [core] dawg: compressed lexicon check-in: e5f3698eb4 user: olr tags: trunk, build, new_feature | |
| 14:43 | [core] dawg: accept personal lexicon check-in: 3916c538b5 user: olr tags: trunk, core, new_feature | |
| 13:19 | [build] lex_build.py: main() + options check-in: e091821b50 user: olr tags: trunk, build | |
Changes
Modified gc_core/py/dawg.py from [a9c487538f] to [a30caaeab0].
| ︙ | ︙ | |||
12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 |
import sys
import os
import collections
from . import str_transform as st
from .progressbar import ProgressBar
class DAWG:
"""DIRECT ACYCLIC WORD GRAPH"""
# This code is inspired from Steve Hanov’s DAWG, 2011. (http://stevehanov.ca/blog/index.php?id=115)
# We store suffix/affix codes and tags within the graph after the “real” word.
# A word is a list of numbers [ c1, c2, c3 . . . cN, iAffix, iTags]
# Each arc is an index in self.lArcVal, where are stored characters, suffix/affix codes for stemming and tags.
# Important: As usual, the last node (after ‘iTags’) is tagged final, AND the node after ‘cN’ is ALSO tagged final.
| > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > | | < | < < < < < < < < < | | | | | | | | | | | | | | | | | | | | | < < < | | < | 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 |
import sys
import os
import collections
from . import str_transform as st
from .progressbar import ProgressBar
def readFile (spf):
    """Generate the meaningful lines of the UTF-8 text file <spf>.

    Each line is stripped of surrounding whitespace; lines that are empty
    after stripping and lines beginning with "#" are not yielded.

    Raises OSError if <spf> is not an existing regular file. As this is a
    generator, the check only happens when iteration starts.
    """
    # Guard clause first: nothing to read if the path is not a regular file.
    if not os.path.isfile(spf):
        raise OSError("# Error. File not found or not loadable: " + spf)
    with open(spf, "r", encoding="utf-8") as hSrc:
        for sRaw in hSrc:
            sEntry = sRaw.strip()
            if not sEntry or sEntry.startswith("#"):
                # blank line or comment line
                continue
            yield sEntry
def getElemsFromFile (spf, bCompressedDic=False):
    """Generate (flexion, stem, tag) tuples from the lexicon file <spf>.

    Lines are supplied by readFile(spf); any exception it raises propagates
    to the caller.

    Two layouts are understood:
    - default: every line must contain exactly three tab-separated fields
      (flexion, stem, tag);
    - compressed (<bCompressedDic> is true): a line of the form "[tag]" sets
      the tag applied to the entries that follow (the tag is ":_" until the
      first such line); any other line is either "flexion<TAB>stem" or a
      single word used as both flexion and stem.

    Lines that do not fit the expected layout are skipped and counted; the
    count is printed once the whole file has been read.
    """
    nErr = 0
    if not bCompressedDic:
        for sLine in readFile(spf):
            lField = sLine.split("\t")
            if len(lField) != 3:
                nErr += 1
                continue
            # The yield must not sit inside a catch-all try block: when the
            # consumer stops early, GeneratorExit is raised here and has to
            # propagate so the generator can terminate cleanly.
            yield tuple(lField)
    else:
        sTag = ":_"     # neutral tag
        for sLine in readFile(spf):
            if sLine.startswith("[") and sLine.endswith("]"):
                # tag header: applies to all following entries
                sTag = sLine[1:-1]
                continue
            nTab = sLine.count("\t")
            if nTab > 1:
                nErr += 1
                continue
            if nTab == 1:
                sFlex, sStem = sLine.split("\t")
            else:
                sFlex = sStem = sLine
            yield (sFlex, sStem, sTag)
    if nErr:
        print(" # Lines ignored: {:>10}".format(nErr))
class DAWG:
"""DIRECT ACYCLIC WORD GRAPH"""
# This code is inspired from Steve Hanov’s DAWG, 2011. (http://stevehanov.ca/blog/index.php?id=115)
# We store suffix/affix codes and tags within the graph after the “real” word.
# A word is a list of numbers [ c1, c2, c3 . . . cN, iAffix, iTags]
# Each arc is an index in self.lArcVal, where are stored characters, suffix/affix codes for stemming and tags.
# Important: As usual, the last node (after ‘iTags’) is tagged final, AND the node after ‘cN’ is ALSO tagged final.
def __init__ (self, spfSrc, sLangName, cStemming, bCompressedDic=False):
print("===== Direct Acyclic Word Graph - Minimal Acyclic Finite State Automaton =====")
cStemming = cStemming.upper()
if cStemming == "A":
funcStemmingGen = st.defineAffixCode
elif cStemming == "S":
funcStemmingGen = st.defineSuffixCode
elif cStemming == "N":
funcStemmingGen = st.noStemming
else:
raise ValueError("# Error. Unknown stemming code: {}".format(cStemming))
lEntry = []
lChar = ['']; dChar = {}; nChar = 1; dCharOccur = {}
lAff = []; dAff = {}; nAff = 0; dAffOccur = {}
lTag = []; dTag = {}; nTag = 0; dTagOccur = {}
nErr = 0
# read lexicon
for sFlex, sStem, sTag in getElemsFromFile(spfSrc, bCompressedDic):
# chars
for c in sFlex:
if c not in dChar:
dChar[c] = nChar
lChar.append(c)
nChar += 1
dCharOccur[c] = dCharOccur.get(c, 0) + 1
# affixes to find stem from flexion
aff = funcStemmingGen(sFlex, sStem)
if aff not in dAff:
dAff[aff] = nAff
lAff.append(aff)
nAff += 1
dAffOccur[aff] = dCharOccur.get(aff, 0) + 1
# tags
if sTag not in dTag:
dTag[sTag] = nTag
lTag.append(sTag)
nTag += 1
dTagOccur[sTag] = dTagOccur.get(sTag, 0) + 1
lEntry.append((sFlex, dAff[aff], dTag[sTag]))
if not lEntry:
raise ValueError("# Error. Empty lexicon")
# Preparing DAWG
print(" > Preparing list of words")
lVal = lChar + lAff + lTag
lWord = [ [dChar[c] for c in sFlex] + [iAff+nChar] + [iTag+nChar+nAff] for sFlex, iAff, iTag in lEntry ]
lEntry = None
|
| ︙ | ︙ | |||
111 112 113 114 115 116 117 |
self.nArc = 0
self.dChar = dChar
self.nChar = len(dChar)
self.nAff = nAff
self.lArcVal = lVal
self.nArcVal = len(lVal)
self.nTag = self.nArcVal - self.nChar - nAff
| | | 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 |
self.nArc = 0
self.dChar = dChar
self.nChar = len(dChar)
self.nAff = nAff
self.lArcVal = lVal
self.nArcVal = len(lVal)
self.nTag = self.nArcVal - self.nChar - nAff
self.cStemming = cStemming
if cStemming == "A":
self.funcStemming = st.getStemFromAffixCode
elif cStemming == "S":
self.funcStemming = st.getStemFromSuffixCode
else:
self.funcStemming = st.noStemming
|
| ︙ | ︙ |