Overview
Comment: | [core] dawg: compressed lexicon |
---|---|
Downloads: | Tarball | ZIP archive | SQL archive |
Timelines: | family | ancestors | descendants | both | trunk | build | new_feature |
Files: | files | file ages | folders |
SHA3-256: | e5f3698eb421d38fb5422cc0f98de270 |
User & Date: | olr on 2017-06-23 17:11:10 |
Other Links: | manifest | tags |
Context
2017-06-23
| ||
17:25 | [core] str_transform: change functions names check-in: 766f20e23c user: olr tags: trunk, core | |
17:11 | [core] dawg: compressed lexicon check-in: e5f3698eb4 user: olr tags: trunk, build, new_feature | |
14:43 | [core] dawg: accept personal lexicon check-in: 3916c538b5 user: olr tags: trunk, core, new_feature | |
Changes
Modified gc_core/py/dawg.py from [a30caaeab0] to [7e6ed7295c].
︙ | ︙ | |||
14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 | import collections from . import str_transform as st from .progressbar import ProgressBar def readFile (spf): if os.path.isfile(spf): with open(spf, "r", encoding="utf-8") as hSrc: for sLine in hSrc: sLine = sLine.strip() if sLine and not sLine.startswith("#"): yield sLine else: raise OSError("# Error. File not found or not loadable: " + spf) | > | > | | > > > > | > > | > > > | > > > > > > > > > | | | 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 | import collections from . import str_transform as st from .progressbar import ProgressBar def readFile (spf): print("Read lexicon: " + spf) if os.path.isfile(spf): with open(spf, "r", encoding="utf-8") as hSrc: for sLine in hSrc: sLine = sLine.strip() if sLine and not sLine.startswith("#"): yield sLine else: raise OSError("# Error. 
File not found or not loadable: " + spf) def getElemsFromFile (spf): "returns tuple of (flexion, stem, tags) from lexicon file" nErr = 0 if not spf.endswith(".clex"): for sLine in readFile(spf): try: sFlex, sStem, sTag = sLine.split("\t") yield (sFlex, sStem, sTag) except: nErr += 1 else: sTag = "_" # neutral tag sTag2 = "" for sLine in readFile(spf): if sLine.startswith("[") and sLine.endswith("]"): # tag line if "-->" in sLine: try: sTag, sSfxCode, sTag2 = sLine[1:-1].split(" --> ") except: nErr += 1 continue sTag = sTag.strip() sSfxCode = sSfxCode.strip() sTag2 = sTag2.strip() else: sTag = sLine[1:-1] sTag2 = "" else: # entry line if "\t" in sLine: if sLine.count("\t") > 1: nErr += 1 continue sFlex, sStem = sLine.split("\t") else: sFlex = sStem = sLine #print(sFlex, sStem, sTag) yield (sFlex, sStem, sTag) if sTag2: sFlex2 = st.getStemFromSuffixCode(sFlex, sSfxCode) #print(sFlex2, sStem, sTag2) yield (sFlex2, sStem, sTag2) if nErr: print(" # Lines ignored: {:>10}".format(nErr)) class DAWG: """DIRECT ACYCLIC WORD GRAPH""" # This code is inspired from Steve Hanov’s DAWG, 2011. (http://stevehanov.ca/blog/index.php?id=115) # We store suffix/affix codes and tags within the graph after the “real” word. # A word is a list of numbers [ c1, c2, c3 . . . cN, iAffix, iTags] # Each arc is an index in self.lArcVal, where are stored characters, suffix/affix codes for stemming and tags. # Important: As usual, the last node (after ‘iTags’) is tagged final, AND the node after ‘cN’ is ALSO tagged final. def __init__ (self, spfSrc, sLangName, cStemming): print("===== Direct Acyclic Word Graph - Minimal Acyclic Finite State Automaton =====") cStemming = cStemming.upper() if cStemming == "A": funcStemmingGen = st.defineAffixCode elif cStemming == "S": funcStemmingGen = st.defineSuffixCode elif cStemming == "N": funcStemmingGen = st.noStemming else: raise ValueError("# Error. 
Unknown stemming code: {}".format(cStemming)) lEntry = [] lChar = ['']; dChar = {}; nChar = 1; dCharOccur = {} lAff = []; dAff = {}; nAff = 0; dAffOccur = {} lTag = []; dTag = {}; nTag = 0; dTagOccur = {} nErr = 0 # read lexicon for sFlex, sStem, sTag in getElemsFromFile(spfSrc): # chars for c in sFlex: if c not in dChar: dChar[c] = nChar lChar.append(c) nChar += 1 dCharOccur[c] = dCharOccur.get(c, 0) + 1 |
︙ | ︙ |