Index: graphspell/spellchecker.py
==================================================================
--- graphspell/spellchecker.py
+++ graphspell/spellchecker.py
@@ -75,10 +75,37 @@
                     dToken['aSuggestions'] = []
                     for lSugg in self.suggest(dToken['sValue']):
                         dToken['aSuggestions'].extend(lSugg)
                 aSpellErrs.append(dToken)
         return aSpellErrs
+
+    def countWordsOccurrences (self, sText, bByLemma=False, bOnlyUnknownWords=False, dWord=None):
+        """count occurrences of words in <sText> and return the dictionary of counts
+
+        sText:             text to tokenize
+        bByLemma:          count lemmas (from getLemma) instead of surface forms
+        bOnlyUnknownWords: count only tokens rejected by isValidToken (bByLemma is then ignored)
+        dWord:             dictionary to update and return; a new one is created if None
+        """
+        if dWord is None:
+            # a mutable default ({}) would be shared and would accumulate counts across calls
+            dWord = {}
+        if not self.oTokenizer:
+            self.loadTokenizer()
+        for dToken in self.oTokenizer.genTokens(sText):
+            if dToken['sType'] != "WORD":
+                continue
+            sWord = dToken['sValue']
+            if bOnlyUnknownWords:
+                if not self.isValidToken(sWord):
+                    dWord[sWord] = dWord.get(sWord, 0) + 1
+            elif not bByLemma:
+                dWord[sWord] = dWord.get(sWord, 0) + 1
+            else:
+                for sLemma in self.getLemma(sWord):
+                    dWord[sLemma] = dWord.get(sLemma, 0) + 1
+        return dWord
 
     # IBDAWG functions
 
     def isValidToken (self, sToken):
         "checks if sToken is valid (if there is hyphens in sToken, sToken is split, each part is checked)"
@@ -117,10 +144,16 @@
             lResult.extend(self.oExtendedDic.getMorph(sWord))
         if self.oPersonalDic:
             lResult.extend(self.oPersonalDic.getMorph(sWord))
         return lResult
 
+    def getLemma (self, sWord):
+        "returns the set of lemmas found for <sWord> (empty set if getMorph returns nothing)"
+        # assumes each morphology reads <1 marker char><lemma><space><tags> -- TODO confirm;
+        # partition keeps the whole lemma when there is no space (s[1:s.find(" ")] would drop its last char)
+        return { s.partition(" ")[0][1:]  for s in self.getMorph(sWord) }
+
     def suggest (self, sWord, nSuggLimit=10):
         "generator: returns 1, 2 or 3 lists of suggestions"
         yield self.oMainDic.suggest(sWord, nSuggLimit)
         if self.oExtendedDic:
             yield self.oExtendedDic.suggest(sWord, nSuggLimit)