Grammalecte Check-in [05fb167483]

Overview
Comment: [build][graphspell][lo] dictionary: drop support for binary file -> use JSON
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | build | lo | graphspell | dict2
Files: files | file ages | folders
SHA3-256: 05fb167483021d19c671911fede3595b5b049d849f4082abcdbe0ad4776ad1e6
User & Date: olr on 2020-11-04 11:37:30
Other Links: branch diff | manifest | tags
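
In practice, the change means callers now open the Graphspell dictionary by its JSON name instead of the dropped .bdic binary. A minimal Python sketch of the two loading paths touched by the diffs below, using only calls that appear in them (import paths follow the in-repo layout used by reader.py and lex_build.py; the bare except mirrors the source):

import traceback

import graphspell.ibdawg as ibdawg
from graphspell.spellchecker import SpellChecker

# Direct load of the indexable dictionary, as in gc_lang/fr/build_data.py after this check-in.
try:
    oDict = ibdawg.IBDAWG("fr-allvars.json")
except:
    traceback.print_exc()

# Wrapper load, as in the OXT components: (language code, main dic, community dic, personal dic).
oSpellChecker = SpellChecker("fr", "fr-allvars.json", "", "")
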
Context
2020-11-04
12:02
[graphspell] ibdawg: code cleaning, remove old code, useless compression versions check-in: 86250e8e6c user: olr tags: graphspell, dict2
11:37
[build][graphspell][lo] dictionary: drop support for binary file -> use JSON check-in: 05fb167483 user: olr tags: build, lo, graphspell, dict2
2020-11-03
12:35
[fr] adjustments check-in: 3cffdae3b0 user: olr tags: trunk, fr
Changes

Modified gc_lang/fr/build_data.py from [6e865955c0] to [3d9c0f4ca9].

@@ -48,15 +48,15 @@
        raise OSError("# Error. File not found or not loadable: " + spf)


def loadDictionary ():
    global oDict
    if not oDict:
        try:
            oDict = ibdawg.IBDAWG("fr-allvars.bdic")
            oDict = ibdawg.IBDAWG("fr-allvars.json")
        except:
            traceback.print_exc()


def makeDictionaries (sp, sVersion):
    with cd(sp+"/dictionnaire"):
        if platform.system() == "Windows":

Modified gc_lang/fr/oxt/ContextMenu/ContextMenu.py from [c17c3b29b1] to [f3255b533a].

@@ -131,15 +131,15 @@
            if not oSpellChecker:
                xCurCtx = uno.getComponentContext()
                oGC = self.ctx.ServiceManager.createInstanceWithContext("org.openoffice.comp.pyuno.Lightproof.grammalecte", self.ctx)
                if hasattr(oGC, "getSpellChecker"):
                    # https://bugs.documentfoundation.org/show_bug.cgi?id=97790
                    oSpellChecker = oGC.getSpellChecker()
                else:
                    oSpellChecker = SpellChecker("${lang}", "fr-allvars.bdic")
                    oSpellChecker = SpellChecker("${lang}", "fr-allvars.json")
        except:
            traceback.print_exc()

    def execute (self, args):
        if not args:
            return
        try:

Modified gc_lang/fr/oxt/DictOptions/LexiconEditor.py from [828f4f365e] to [5ef5214006].

@@ -404,15 +404,15 @@
    @_waitPointer
    def importDictionary (self):
        spfImported = ""
        try:
            xFilePicker = self.xSvMgr.createInstanceWithContext('com.sun.star.ui.dialogs.FilePicker', self.ctx)  # other possibility: com.sun.star.ui.dialogs.SystemFilePicker
            xFilePicker.initialize([uno.getConstantByName("com.sun.star.ui.dialogs.TemplateDescription.FILEOPEN_SIMPLE")]) # seems useless
            xFilePicker.appendFilter("Supported files", "*.json; *.bdic")
            xFilePicker.appendFilter("Supported files", "*.json")
            xFilePicker.setDefaultName("fr.__personal__.json") # useless, doesn’t work
            xFilePicker.setDisplayDirectory("")
            xFilePicker.setMultiSelectionMode(False)
            nResult = xFilePicker.execute()
            if nResult == 1:
                # lFile = xFilePicker.getSelectedFiles()
                lFile = xFilePicker.getFiles()
@@ -459,15 +459,15 @@
            self.xDateDic.Label = self.dUI.get("void", "#err")
        MessageBox(self.xDocument, self.dUI.get('save_message', "#err"), self.dUI.get('save_title', "#err"))

    def exportDictionary (self):
        try:
            xFilePicker = self.xSvMgr.createInstanceWithContext('com.sun.star.ui.dialogs.FilePicker', self.ctx)  # other possibility: com.sun.star.ui.dialogs.SystemFilePicker
            xFilePicker.initialize([uno.getConstantByName("com.sun.star.ui.dialogs.TemplateDescription.FILESAVE_SIMPLE")]) # seems useless
            xFilePicker.appendFilter("Supported files", "*.json; *.bdic")
            xFilePicker.appendFilter("Supported files", "*.json")
            xFilePicker.setDefaultName("fr.__personal__.json") # useless, doesn’t work
            xFilePicker.setDisplayDirectory("")
            xFilePicker.setMultiSelectionMode(False)
            nResult = xFilePicker.execute()
            if nResult == 1:
                # lFile = xFilePicker.getSelectedFiles()
                lFile = xFilePicker.getFiles()

Modified gc_lang/fr/oxt/DictOptions/SearchWords.py from [764a885065] to [2c4ada79ef].

@@ -182,15 +182,15 @@
            elif xActionEvent.ActionCommand == "Close":
                self.xContainer.endExecute()
        except:
            traceback.print_exc()

    def initSpellChecker (self):
        if not self.oSpellChecker:
            self.oSpellChecker = sc.SpellChecker("fr", "fr-allvars.bdic", "", self.oPersonalDicJSON)
            self.oSpellChecker = sc.SpellChecker("fr", "fr-allvars.json", "", self.oPersonalDicJSON)

    @_waitPointer
    def searchSimilar (self):
        self.initSpellChecker()
        sWord = self.xWord.Text.strip()
        if sWord:
            xGridDataModel = self.xGridModel.GridDataModel

Modified gc_lang/fr/oxt/Graphspell.py from [810dc52bd8] to [76770ac233].

@@ -65,15 +65,15 @@
                sPersonalDicJSON = self.xOptionNode.getPropertyValue("personal_dic")
                if sPersonalDicJSON:
                    try:
                        personal_dic = json.loads(sPersonalDicJSON)
                    except:
                        print("Graphspell: wrong personal_dic")
                        traceback.print_exc()
            self.oGraphspell = SpellChecker("fr", "fr-"+sMainDicName+".bdic", "", personal_dic)
            self.oGraphspell = SpellChecker("fr", "fr-"+sMainDicName+".json", "", personal_dic)
            self.loadHunspell()
            # print("Graphspell: init done")
        except:
            print("Graphspell: init failed")
            traceback.print_exc()

    def loadHunspell (self):

Modified gc_lang/fr/setup.py from [8a0db0631b] to [955f5741fe].

@@ -89,15 +89,15 @@
    #     'test': ['coverage'],
    # },

    # If there are data files included in your packages that need to be
    # installed, specify them here.  If using Python 2.6 or less, then these
    # have to be included in MANIFEST.in as well.
    package_data={
-        'grammalecte': ['graphspell/_dictionaries/*.bdic', '*.txt']
+        'grammalecte': ['graphspell/_dictionaries/*.json', '*.txt']
    },

    # Although 'package_data' is the preferred approach, in some case you may
    # need to place data files outside of your packages. See:
    # http://docs.python.org/3.4/distutils/setupscript.html#installing-additional-files # noqa
    # In this case, 'data_file' will be installed into '<sys.prefix>/my_data'
    # data_files=[('my_data', ['data/data_file'])],

Modified graphspell/dawg.py from [b60434a390] to [c083d6a347].

@@ -474,15 +474,15 @@
            # Mozilla’s JS parser don’t like file bigger than 4 Mb!
            # So, if necessary, we use an hexadecimal string, that we will convert later in Firefox’s extension.
            # https://github.com/mozilla/addons-linter/issues/1361
            "sByDic": byDic.hex()  if bBinaryDictAsHexString  else [ e  for e in byDic ],
            "l2grams": list(self.a2grams)
        }

-    def writeAsJSObject (self, spfDst, nCompressionMethod, bInJSModule=False, bBinaryDictAsHexString=True):
+    def writeAsJSObject (self, spfDst, nCompressionMethod=1, bInJSModule=False, bBinaryDictAsHexString=True):
        "write a file (JSON or JS module) with all the necessary data"
        if not spfDst.endswith(".json"):
            spfDst += "."+str(nCompressionMethod)+".json"
        with open(spfDst, "w", encoding="utf-8", newline="\n") as hDst:
            if bInJSModule:
                hDst.write('// JavaScript\n// Generated data (do not edit)\n\n"use strict";\n\nconst dictionary = ')
            hDst.write( json.dumps(self.getBinaryAsJSON(nCompressionMethod, bBinaryDictAsHexString), ensure_ascii=False) )
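
The bBinaryDictAsHexString flag retained above exists because of the 4 MB limit described in the comment: the packed dictionary bytes are emitted either as a hex string or as a plain list of integers. A small round-trip sketch of that encoding in plain Python (byDic here is a stand-in value, not real dictionary data):

import json

byDic = bytes([0, 1, 2, 255])       # stand-in for the packed DAWG bytes

sHex = byDic.hex()                  # hex-string form (bBinaryDictAsHexString=True)
lInts = [e for e in byDic]          # list-of-integers form (the False branch)

sJSON = json.dumps({"sByDic": sHex}, ensure_ascii=False)
byBack = bytes.fromhex(json.loads(sJSON)["sByDic"])
assert byBack == byDic              # the consumer can rebuild the original bytes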

Modified graphspell/spellchecker.py from [2bdbe76996] to [9b47d651ea].

@@ -12,16 +12,16 @@
import traceback

from . import ibdawg
from . import tokenizer


dDefaultDictionaries = {
    "fr": "fr-allvars.bdic",
    "en": "en.bdic"
    "fr": "fr-allvars.json",
    "en": "en.json"
}


class SpellChecker ():
    "SpellChecker: wrapper for the IBDAWG class"

    def __init__ (self, sLangCode, sfMainDic="", sfCommunityDic="", sfPersonalDic=""):
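
With dDefaultDictionaries now pointing at the JSON files, constructing a SpellChecker with only a language code falls back to fr-allvars.json (or en.json). A hedged sketch, assuming word lookup is forwarded to IBDAWG.isValid as in the reader.py script further down:

from graphspell.spellchecker import SpellChecker

oSpellChecker = SpellChecker("fr")      # no main dic given: defaults to "fr-allvars.json"

# isValid() is assumed to be delegated to the underlying IBDAWG (see reader.py below).
print(oSpellChecker.isValid("maison"))
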

Modified lex_build.py from [0d00b07703] to [1b2b5d0ea9].

@@ -1,30 +1,32 @@
#!python3

"""
Lexicon builder
"""

import argparse
from distutils import dir_util

import graphspell.dawg as fsa
from graphspell.ibdawg import IBDAWG


-def build (spfSrc, sLangCode, sLangName, sfDict, bJSON=False, sDicName="", sDescription="", sFilter="", cStemmingMethod="S", nCompressMethod=1):
+def build (spfSrc, sLangCode, sLangName, sfDict, bJavaScript=False, sDicName="", sDescription="", sFilter="", cStemmingMethod="S", nCompressMethod=1):
    "transform a text lexicon as a binary indexable dictionary"
    oDAWG = fsa.DAWG(spfSrc, cStemmingMethod, sLangCode, sLangName, sDicName, sDescription, sFilter)
    dir_util.mkpath("graphspell/_dictionaries")
    oDAWG.writeInfo("graphspell/_dictionaries/" + sfDict + ".info.txt")
    oDAWG.writeBinary("graphspell/_dictionaries/" + sfDict + ".bdic", int(nCompressMethod))
    if bJSON:
    #oDAWG.writeInfo("graphspell/_dictionaries/" + sfDict + ".info.txt")
    #oDAWG.writeBinary("graphspell/_dictionaries/" + sfDict + ".bdic", int(nCompressMethod))
    oDAWG.writeAsJSObject("graphspell/_dictionaries/" + sfDict + ".json")
    if bJavaScript:
        dir_util.mkpath("graphspell-js/_dictionaries")
        oDAWG.writeAsJSObject("graphspell-js/_dictionaries/" + sfDict + ".json")
        oDic = IBDAWG(sfDict + ".bdic")
        oDic.writeAsJSObject("graphspell-js/_dictionaries/" + sfDict + ".json", bBinaryDictAsHexString=True)
        #oDic = IBDAWG(sfDict + ".bdic")
        #oDic.writeAsJSObject("graphspell-js/_dictionaries/" + sfDict + ".json", bBinaryDictAsHexString=True)


def main ():
    "parse args from CLI"
    xParser = argparse.ArgumentParser()
    xParser.add_argument("src_lexicon", type=str, help="path and file name of the source lexicon")
    xParser.add_argument("lang_code", type=str, help="language code")

Modified make.py from [f59af684eb] to [a76be310e9].

@@ -313,26 +313,26 @@
    dVars["dic_personal_filename_js"] = ""
    lDict = [ ("main", s)  for s in dVars['dic_filenames'].split(",") ]
    if bCommunityDict:
        lDict.append(("community", dVars['dic_community_filename']))
    if bPersonalDict:
        lDict.append(("personal", dVars['dic_personal_filename']))
    for sType, sFileName in lDict:
        spfPyDic = f"graphspell/_dictionaries/{sFileName}.bdic"
        spfPyDic = f"graphspell/_dictionaries/{sFileName}.json"
        spfJSDic = f"graphspell-js/_dictionaries/{sFileName}.json"
        if not os.path.isfile(spfPyDic) or (bJavaScript and not os.path.isfile(spfJSDic)):
            buildDictionary(dVars, sType, bJavaScript)
        print("  +", spfPyDic)
        file_util.copy_file(spfPyDic, "grammalecte/graphspell/_dictionaries")
-        dVars['dic_'+sType+'_filename_py'] = sFileName + '.bdic'
+        dVars['dic_'+sType+'_filename_py'] = sFileName + '.json'
        if bJavaScript:
            print("  +", spfJSDic)
            file_util.copy_file(spfJSDic, "grammalecte-js/graphspell/_dictionaries")
            dVars['dic_'+sType+'_filename_js'] = sFileName + '.json'
    dVars['dic_main_filename_py'] = dVars['dic_default_filename_py'] + ".bdic"
    dVars['dic_main_filename_py'] = dVars['dic_default_filename_py'] + ".json"
    dVars['dic_main_filename_js'] = dVars['dic_default_filename_js'] + ".json"


def buildDictionary (dVars, sType, bJavaScript=False):
    "build binary dictionary for Graphspell from lexicons"
    if sType == "main":
        spfLexSrc = dVars['lexicon_src']

Modified reader.py from [66f5eb17ae] to [e2706fc6a2].

@@ -1,36 +1,36 @@
#!python3
# Just a file for one-shot scripts

import os
import sys
import re

import graphspell.ibdawg as ibdawg

oDict = ibdawg.IBDAWG("French.bdic")
oDict = ibdawg.IBDAWG("fr-allvars.json")


def readFile (spf):
    if os.path.isfile(spf):
        with open(spf, "r", encoding="utf-8") as hSrc:
            for sLine in hSrc:
                yield sLine
    else:
        print("# Error: file not found.")

# --------------------------------------------------------------------------------------------------

def listUnknownWords (spf):
    with open(spf+".res.txt", "w", encoding="utf-8") as hDst:
        for sLine in readFile(spfSrc):
            sLine = sLine.strip()
            if sLine:
                for sWord in sLine.split():
-                    if not oDict.isValid(sWord): 
+                    if not oDict.isValid(sWord):
                        hDst.write(sWord+"\n")

# --------------------------------------------------------------------------------------------------

def createLexStatFile (spf, dStat):
    dWord = {}
    for i, sLine in enumerate(readFile(spf)):