Grammalecte  Check-in [d205a5a601]

Overview
Comment:[graphspell][py] new functions: getLemma() and countWordsOccurrences()
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | trunk | graphspell
Files: files | file ages | folders
SHA3-256: d205a5a60172d3c9150901a064507045d27f698085cffce78089999ef11119f8
User & Date: olr on 2018-02-21 19:13:21
Original Comment: [graphspell] new functions: getLemma() and countWordsOccurrences()
Other Links: manifest | tags
Context
2018-02-21
19:14
[graphspell][py] defaut module import check-in: 31837970bd user: olr tags: trunk, graphspell
19:13
[graphspell][py] new functions: getLemma() and countWordsOccurrences() check-in: d205a5a601 user: olr tags: trunk, graphspell
11:53
[build] new command for future graph rules check-in: c4eb507f6d user: olr tags: trunk, build
Changes

Modified graphspell/spellchecker.py from [b9fb2c7b70] to [dbd02131cc].

73
74
75
76
77
78
79
















80
81
82
83
84
85
86
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102







+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+







            if dToken['sType'] == "WORD" and not self.isValidToken(dToken['sValue']):
                if bSpellSugg:
                    dToken['aSuggestions'] = []
                    for lSugg in self.suggest(dToken['sValue']):
                        dToken['aSuggestions'].extend(lSugg)
                aSpellErrs.append(dToken)
        return aSpellErrs

    def countWordsOccurrences (self, sText, bByLemma=False, bOnlyUnknownWords=False, dWord={}):
        if not self.oTokenizer:
            self.loadTokenizer()
        for dToken in self.oTokenizer.genTokens(sText):
            if dToken['sType'] == "WORD":
                if bOnlyUnknownWords:
                    if not self.isValidToken(dToken['sValue']):
                        dWord[dToken['sValue']] = dWord.get(dToken['sValue'], 0) + 1
                else:
                    if not bByLemma:
                        dWord[dToken['sValue']] = dWord.get(dToken['sValue'], 0) + 1
                    else:
                        for sLemma in self.getLemma(dToken['sValue']):
                            dWord[sLemma] = dWord.get(sLemma, 0) + 1
        return dWord

    # IBDAWG functions

    def isValidToken (self, sToken):
        "checks if sToken is valid (if there is hyphens in sToken, sToken is split, each part is checked)"
        if self.oMainDic.isValidToken(sToken):
            return True
115
116
117
118
119
120
121



122
123
124
125
126
127
128
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147







+
+
+







        lResult = self.oMainDic.getMorph(sWord)
        if self.oExtendedDic:
            lResult.extend(self.oExtendedDic.getMorph(sWord))
        if self.oPersonalDic:
            lResult.extend(self.oPersonalDic.getMorph(sWord))
        return lResult

    def getLemma (self, sWord):
        return set([ s[1:s.find(" ")]  for s in self.getMorph(sWord) ])

    def suggest (self, sWord, nSuggLimit=10):
        "generator: returns 1, 2 or 3 lists of suggestions"
        yield self.oMainDic.suggest(sWord, nSuggLimit)
        if self.oExtendedDic:
            yield self.oExtendedDic.suggest(sWord, nSuggLimit)
        if self.oPersonalDic:
            yield self.oPersonalDic.suggest(sWord, nSuggLimit)