Grammalecte  Diff

Differences From Artifact [78af5c9dd3]:

To Artifact [4f7f19d7d1]:


9
10
11
12
13
14
15

16
17
18
19
20
21
22
import traceback
import pkgutil
import re
from functools import wraps
import time
import json
import binascii

from collections import OrderedDict

#import logging
#logging.basicConfig(filename="suggestions.log", level=logging.DEBUG)

from . import str_transform as st
from . import char_player as cp







>







9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
import traceback
import pkgutil
import re
from functools import wraps
import time
import json
import binascii
import importlib
from collections import OrderedDict

#import logging
#logging.basicConfig(filename="suggestions.log", level=logging.DEBUG)

from . import str_transform as st
from . import char_player as cp
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67


class SuggResult:
    """Structure for storing, classifying and filtering suggestions"""

    def __init__ (self, sWord, nDistLimit=-1):
        self.sWord = sWord
        self.sSimplifiedWord = cp.simplifyWord(sWord)
        self.nDistLimit = nDistLimit  if nDistLimit >= 0  else  (len(sWord) // 3) + 1
        self.nMinDist = 1000
        self.aSugg = set()
        self.dSugg = { 0: [],  1: [],  2: [] }
        self.aAllSugg = set()       # all found words even those refused

    def addSugg (self, sSugg, nDeep=0):
        "add a suggestion"
        #logging.info((nDeep * "  ") + "__" + sSugg + "__")
        if sSugg in self.aAllSugg:
            return
        self.aAllSugg.add(sSugg)
        if sSugg not in self.aSugg:
            #nDist = min(st.distanceDamerauLevenshtein(self.sWord, sSugg), st.distanceDamerauLevenshtein(self.sSimplifiedWord, cp.simplifyWord(sSugg)))
            nDist = st.distanceDamerauLevenshtein(self.sSimplifiedWord, cp.simplifyWord(sSugg))
            #logging.info((nDeep * "  ") + "__" + sSugg + "__ :" + self.sSimplifiedWord +"|"+ cp.simplifyWord(sSugg) +" -> "+ str(nDist))
            if nDist <= self.nDistLimit:
                if " " in sSugg:
                    nDist += 1
                if nDist not in self.dSugg:
                    self.dSugg[nDist] = []
                self.dSugg[nDist].append(sSugg)
                self.aSugg.add(sSugg)







|













|
|
|







38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68


class SuggResult:
    """Structure for storing, classifying and filtering suggestions"""

    def __init__ (self, sWord, nDistLimit=-1):
        self.sWord = sWord
        self.sSimplifiedWord = st.simplifyWord(sWord)
        self.nDistLimit = nDistLimit  if nDistLimit >= 0  else  (len(sWord) // 3) + 1
        self.nMinDist = 1000
        self.aSugg = set()
        self.dSugg = { 0: [],  1: [],  2: [] }
        self.aAllSugg = set()       # all found words even those refused

    def addSugg (self, sSugg, nDeep=0):
        "add a suggestion"
        #logging.info((nDeep * "  ") + "__" + sSugg + "__")
        if sSugg in self.aAllSugg:
            return
        self.aAllSugg.add(sSugg)
        if sSugg not in self.aSugg:
            #nDist = min(st.distanceDamerauLevenshtein(self.sWord, sSugg), st.distanceDamerauLevenshtein(self.sSimplifiedWord, st.simplifyWord(sSugg)))
            nDist = st.distanceDamerauLevenshtein(self.sSimplifiedWord, st.simplifyWord(sSugg))
            #logging.info((nDeep * "  ") + "__" + sSugg + "__ :" + self.sSimplifiedWord +"|"+ st.simplifyWord(sSugg) +" -> "+ str(nDist))
            if nDist <= self.nDistLimit:
                if " " in sSugg:
                    nDist += 1
                if nDist not in self.dSugg:
                    self.dSugg[nDist] = []
                self.dSugg[nDist].append(sSugg)
                self.aSugg.add(sSugg)
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
            self.dSugg[1].sort(key=lambda sSugg: st.distanceDamerauLevenshtein(self.sWord, sSugg))
        lRes = self.dSugg.pop(0)
        for nDist, lSugg in self.dSugg.items():
            if nDist <= self.nDistLimit:
                lRes.extend(lSugg)
                if len(lRes) > nSuggLimit:
                    break
        lRes = list(cp.filterSugg(lRes))
        if self.sWord.isupper():
            lRes = list(OrderedDict.fromkeys(map(lambda sSugg: sSugg.upper(), lRes))) # use dict, when Python 3.6+
        elif self.sWord[0:1].isupper():
            # dont’ use <.istitle>
            lRes = list(OrderedDict.fromkeys(map(lambda sSugg: sSugg[0:1].upper()+sSugg[1:], lRes))) # use dict, when Python 3.6+
        return lRes[:nSuggLimit]








<







79
80
81
82
83
84
85

86
87
88
89
90
91
92
            self.dSugg[1].sort(key=lambda sSugg: st.distanceDamerauLevenshtein(self.sWord, sSugg))
        lRes = self.dSugg.pop(0)
        for nDist, lSugg in self.dSugg.items():
            if nDist <= self.nDistLimit:
                lRes.extend(lSugg)
                if len(lRes) > nSuggLimit:
                    break

        if self.sWord.isupper():
            lRes = list(OrderedDict.fromkeys(map(lambda sSugg: sSugg.upper(), lRes))) # use dict, when Python 3.6+
        elif self.sWord[0:1].isupper():
            # dont’ use <.istitle>
            lRes = list(OrderedDict.fromkeys(map(lambda sSugg: sSugg[0:1].upper()+sSugg[1:], lRes))) # use dict, when Python 3.6+
        return lRes[:nSuggLimit]

149
150
151
152
153
154
155








156
157
158
159
160
161
162
            self._getArcs = self._getArcs3
            self._writeNodes = self._writeNodes3
        else:
            raise ValueError("  # Error: unknown code: {}".format(self.nCompressionMethod))

        self.bAcronymValid = False
        self.bNumAtLastValid = False









    def _initBinary (self):
        "initialize with binary structure file"
        if self.by[0:17] != b"/grammalecte-fsa/":
            raise TypeError("# Error. Not a grammalecte-fsa binary dictionary. Header: {}".format(self.by[0:9]))
        if not(self.by[17:18] == b"1" or self.by[17:18] == b"2" or self.by[17:18] == b"3"):
            raise ValueError("# Error. Unknown dictionary version: {}".format(self.by[17:18]))







>
>
>
>
>
>
>
>







149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
            self._getArcs = self._getArcs3
            self._writeNodes = self._writeNodes3
        else:
            raise ValueError("  # Error: unknown code: {}".format(self.nCompressionMethod))

        self.bAcronymValid = False
        self.bNumAtLastValid = False

        # lexicographer module ?
        self.lexicographer = None
        try:
            self.lexicographer = importlib.import_module("graphspell.lexgraph_"+self.sLangCode)
        except ImportError:
            print("# No module <graphspell.lexgraph_"+self.sLangCode+".py>")


    def _initBinary (self):
        "initialize with binary structure file"
        if self.by[0:17] != b"/grammalecte-fsa/":
            raise TypeError("# Error. Not a grammalecte-fsa binary dictionary. Header: {}".format(self.by[0:9]))
        if not(self.by[17:18] == b"1" or self.by[17:18] == b"2" or self.by[17:18] == b"3"):
            raise ValueError("# Error. Unknown dictionary version: {}".format(self.by[17:18]))
301
302
303
304
305
306
307



308
309
310
311
312
313
314
315
316
317
318


319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
        return l

    #@timethis
    def suggest (self, sWord, nSuggLimit=10, bSplitTrailingNumbers=False):
        "returns a set of suggestions for <sWord>"
        sWord = sWord.rstrip(".")   # useful for LibreOffice
        sWord = st.spellingNormalization(sWord)



        sPfx, sWord, sSfx = cp.cut(sWord)
        nMaxSwitch = max(len(sWord) // 3, 1)
        nMaxDel = len(sWord) // 5
        nMaxHardRepl = max((len(sWord) - 5) // 4, 1)
        nMaxJump = max(len(sWord) // 4, 1)
        oSuggResult = SuggResult(sWord)
        if bSplitTrailingNumbers:
            self._splitTrailingNumbers(oSuggResult, sWord)
        self._splitSuggest(oSuggResult, sWord)
        self._suggest(oSuggResult, sWord, nMaxSwitch, nMaxDel, nMaxHardRepl, nMaxJump)
        aSugg = oSuggResult.getSuggestions(nSuggLimit)


        if sSfx or sPfx:
            # we add what we removed
            return list(map(lambda sSug: sPfx + sSug + sSfx, aSugg))
        return aSugg

    def _splitTrailingNumbers (self, oSuggResult, sWord):
        m = re.match(r"(\D+)([0-9]+)$", sWord)
        if m and m.group(1)[-1:].isalpha():
            oSuggResult.addSugg(m.group(1) + " " + cp.numbersToExponent(m.group(2)))

    def _splitSuggest (self, oSuggResult, sWord):
        # split at apostrophes
        for cSplitter in "'’":
            if cSplitter in sWord:
                sWord1, sWord2 = sWord.split(cSplitter, 1)
                if self.isValid(sWord1) and self.isValid(sWord2):







>
>
>
|










>
>








|







309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
        return l

    #@timethis
    def suggest (self, sWord, nSuggLimit=10, bSplitTrailingNumbers=False):
        "returns a set of suggestions for <sWord>"
        sWord = sWord.rstrip(".")   # useful for LibreOffice
        sWord = st.spellingNormalization(sWord)
        sPfx = ""
        sSfx = ""
        if self.lexicographer:
            sPfx, sWord, sSfx = self.lexicographer.split(sWord)
        nMaxSwitch = max(len(sWord) // 3, 1)
        nMaxDel = len(sWord) // 5
        nMaxHardRepl = max((len(sWord) - 5) // 4, 1)
        nMaxJump = max(len(sWord) // 4, 1)
        oSuggResult = SuggResult(sWord)
        if bSplitTrailingNumbers:
            self._splitTrailingNumbers(oSuggResult, sWord)
        self._splitSuggest(oSuggResult, sWord)
        self._suggest(oSuggResult, sWord, nMaxSwitch, nMaxDel, nMaxHardRepl, nMaxJump)
        aSugg = oSuggResult.getSuggestions(nSuggLimit)
        if self.lexicographer:
            aSugg = self.lexicographer.filterSugg(aSugg)
        if sSfx or sPfx:
            # we add what we removed
            return list(map(lambda sSug: sPfx + sSug + sSfx, aSugg))
        return aSugg

    def _splitTrailingNumbers (self, oSuggResult, sWord):
        m = re.match(r"(\D+)([0-9]+)$", sWord)
        if m and m.group(1)[-1:].isalpha():
            oSuggResult.addSugg(m.group(1) + " " + st.numbersToExponent(m.group(2)))

    def _splitSuggest (self, oSuggResult, sWord):
        # split at apostrophes
        for cSplitter in "'’":
            if cSplitter in sWord:
                sWord1, sWord2 = sWord.split(cSplitter, 1)
                if self.isValid(sWord1) and self.isValid(sWord2):