Overview
Comment: | [build][graphspell] multiple main dictionaries |
---|---|
Downloads: | Tarball | ZIP archive | SQL archive |
Timelines: | family | ancestors | descendants | both | trunk | build | graphspell |
Files: | files | file ages | folders |
SHA3-256: |
ca4c8338762a1621b88820f9c82d363d |
User & Date: | olr on 2018-05-04 10:29:05 |
Other Links: | manifest | tags |
Context
2018-05-04
| ||
11:37 | [lo] load selected dictionary from saved options check-in: 7c4223d402 user: olr tags: trunk, lo | |
10:29 | [build][graphspell] multiple main dictionaries check-in: ca4c833876 user: olr tags: trunk, build, graphspell | |
08:16 | [graphspell][py] dawg builder: filter entries with regex check-in: 96692bb883 user: olr tags: trunk, graphspell | |
Changes
Modified gc_lang/fr/config.ini from [c14dc709ed] to [dbf2bf89ee].
︙ | ︙ | |||
12 13 14 15 16 17 18 | link = http://grammalecte.net description = Correcteur grammatical pour le français. extras = README_fr.txt logo = logo.png # main dictionary lexicon_src = lexicons/French.lex | | | > > > | 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 | link = http://grammalecte.net description = Correcteur grammatical pour le français. extras = README_fr.txt logo = logo.png # main dictionary lexicon_src = lexicons/French.lex dic_filenames = fr-allvars,fr-classic,fr-reform dic_name = Français,Français (Classique/Moderne),Français (Réforme 1990) dic_filter = ,[*CMPX]$,[*RPX]$ dic_default_filename_py = fr-allvars dic_default_filename_js = fr-allvars # extended dictionary lexicon_extended_src = lexicons/French.extended.lex dic_extended_filename = fr.extended dic_extended_name = Français - dictionnaire étendu # community dictionary lexicon_community_src = lexicons/French.community.lex dic_community_filename = fr.community |
︙ | ︙ |
Modified gc_lang/fr/dictionnaire/genfrdic.py from [5f240a0703] to [59732a18e1].
︙ | ︙ | |||
59 60 61 62 63 64 65 | 'shortname': '“Classique”', 'asciiName': 'fr-classique', 'mozAsciiName': 'fr-FR-classic', 'subDicts': '*MCX', 'mozId': 'fr-dicollecte-classique', 'description': "Dictionnaire français “Classique”" } | < < < < < < < < | 59 60 61 62 63 64 65 66 67 68 69 70 71 72 | 'shortname': '“Classique”', 'asciiName': 'fr-classique', 'mozAsciiName': 'fr-FR-classic', 'subDicts': '*MCX', 'mozId': 'fr-dicollecte-classique', 'description': "Dictionnaire français “Classique”" } dREFORME1990 = { 'name': 'DICTIONNAIRE ORTHOGRAPHIQUE FRANÇAIS “RÉFORME 1990”', 'shortname': '“Réforme 1990”', 'asciiName': 'fr-reforme1990', 'mozAsciiName': 'fr-FR-reform', 'subDicts': '*RX', 'mozId': 'fr-dicollecte-reforme1990', 'description': "Dictionnaire français “Réforme 1990”" } |
︙ | ︙ |
Modified gc_lang/fr/modules/tests.py from [43d45242b9] to [2e6f413e05].
︙ | ︙ | |||
20 21 22 23 24 25 26 | return s.replace("\u2019", "'").replace("\u2013", "–").replace("\u2014", "—") class TestDictionary (unittest.TestCase): @classmethod def setUpClass (cls): | | | 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 | return s.replace("\u2019", "'").replace("\u2013", "–").replace("\u2014", "—") class TestDictionary (unittest.TestCase): @classmethod def setUpClass (cls): cls.oDic = IBDAWG("${dic_main_filename_py}") def test_lookup (self): for sWord in ["branche", "Émilie"]: self.assertTrue(self.oDic.lookup(sWord), sWord) def test_lookup_failed (self): for sWord in ["Branche", "BRANCHE", "BranchE", "BRanche", "BRAnCHE", "émilie"]: |
︙ | ︙ |
Modified gc_lang/fr/oxt/ContextMenu/ContextMenu.py from [512c45de75] to [03a78a32c7].
︙ | ︙ | |||
127 128 129 130 131 132 133 | if not oSpellChecker: xCurCtx = uno.getComponentContext() oGC = self.ctx.ServiceManager.createInstanceWithContext("org.openoffice.comp.pyuno.Lightproof.grammalecte", self.ctx) if hasattr(oGC, "getSpellChecker"): # https://bugs.documentfoundation.org/show_bug.cgi?id=97790 oSpellChecker = oGC.getSpellChecker() else: | | | 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 | if not oSpellChecker: xCurCtx = uno.getComponentContext() oGC = self.ctx.ServiceManager.createInstanceWithContext("org.openoffice.comp.pyuno.Lightproof.grammalecte", self.ctx) if hasattr(oGC, "getSpellChecker"): # https://bugs.documentfoundation.org/show_bug.cgi?id=97790 oSpellChecker = oGC.getSpellChecker() else: oSpellChecker = SpellChecker("${lang}", "fr-allvars.bdic") if not oLexicographe: oLexicographe = lxg.Lexicographe(oSpellChecker) except: traceback.print_exc() def execute (self, args): if not args: |
︙ | ︙ |
Modified gc_lang/fr/oxt/Graphspell.py from [46a0993dea] to [0c5cbde982].
︙ | ︙ | |||
61 62 63 64 65 66 67 | sPersonalDicJSON = self.xOptionNode.getPropertyValue("personal_dic") if sPersonalDicJSON: try: personal_dic = json.loads(sPersonalDicJSON) except: print("Graphspell: wrong personal_dic") traceback.print_exc() | | | 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 | sPersonalDicJSON = self.xOptionNode.getPropertyValue("personal_dic") if sPersonalDicJSON: try: personal_dic = json.loads(sPersonalDicJSON) except: print("Graphspell: wrong personal_dic") traceback.print_exc() self.oGraphspell = SpellChecker("fr", "fr-allvars.bdic", "", "", personal_dic) self.loadHunspell() # print("Graphspell: init done") except: print("Graphspell: init failed") traceback.print_exc() def loadHunspell (self): |
︙ | ︙ |
Modified gc_lang/fr/rules.grx from [6e3d910360] to [a037a65419].
︙ | ︙ | |||
11494 11495 11496 11497 11498 11499 11500 | <<- morphex(\2, ":V", ":(?:G|2p|3p!|[ISK].*:2s)") -2>> =suggVerb(@, ":2s") # Conjugaison erronée. Accord avec « \1 ». Le verbe devrait être à la 2ᵉ personne du singulier. TEST: Tu ne {{ment}} jamais. TEST: Tu {{a}} mal ? TEST: Tu ne le lui {{prend}} pas. TEST: Tu ne m’{{attendra}} pas. | | | 11494 11495 11496 11497 11498 11499 11500 11501 11502 11503 11504 11505 11506 11507 11508 | <<- morphex(\2, ":V", ":(?:G|2p|3p!|[ISK].*:2s)") -2>> =suggVerb(@, ":2s") # Conjugaison erronée. Accord avec « \1 ». Le verbe devrait être à la 2ᵉ personne du singulier. TEST: Tu ne {{ment}} jamais. TEST: Tu {{a}} mal ? TEST: Tu ne le lui {{prend}} pas. TEST: Tu ne m’{{attendra}} pas. TEST: toi qui n’y {{connaît}} rien, ne nous ennuie pas avec tes théories. ## 3sg __[i]/conj(conj_il)__ (?<!t’)(il) +({w_1}) @@0,$ <<- morphex(\2, ":V", ":(?:3s|P|G)") and not (morph(\2, ":[PQ]", False) and morph(word(-1), ":V0.*:3s", False, False)) -2>> =suggVerb(@, ":3s") # Conjugaison erronée. Accord avec « \1 ». Le verbe devrait être à la 3ᵉ personne du singulier. |
︙ | ︙ |
Modified graphspell/dawg.py from [64364f5bf4] to [eb988983d4].
︙ | ︙ | |||
10 11 12 13 14 15 16 17 18 19 20 21 22 23 | import sys import os import collections import json import time from . import str_transform as st from .progressbar import ProgressBar def readFile (spf): | > > | 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 | import sys import os import collections import json import time import re import traceback from . import str_transform as st from .progressbar import ProgressBar def readFile (spf): |
︙ | ︙ | |||
59 60 61 62 63 64 65 66 67 68 69 70 71 72 | lTag = []; dTag = {}; nTag = 0; dTagOccur = {} nErr = 0 try: zFilter = re.compile(sSelectFilterRegex) if sSelectFilterRegex else None except: print(" # Error. Wrong filter regex. Filter ignored.") zFilter = None # read lexicon if type(src) is str: iterable = readFile(src) else: iterable = src | > | 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 | lTag = []; dTag = {}; nTag = 0; dTagOccur = {} nErr = 0 try: zFilter = re.compile(sSelectFilterRegex) if sSelectFilterRegex else None except: print(" # Error. Wrong filter regex. Filter ignored.") traceback.print_exc() zFilter = None # read lexicon if type(src) is str: iterable = readFile(src) else: iterable = src |
︙ | ︙ | |||
95 96 97 98 99 100 101 102 103 104 105 106 107 108 | dTagOccur[sTag] = dTagOccur.get(sTag, 0) + 1 aEntry.add((sFlex, dAff[sAff], dTag[sTag])) if not aEntry: raise ValueError("# Error. Empty lexicon") # Preparing DAWG print(" > Preparing list of words") lVal = lChar + lAff + lTag lWord = [ [dChar[c] for c in sFlex] + [iAff+nChar] + [iTag+nChar+nAff] for sFlex, iAff, iTag in aEntry ] aEntry = None # Dictionary of arc values occurrency, to sort arcs of each node dValOccur = dict( [ (dChar[c], dCharOccur[c]) for c in dChar ] \ + [ (dAff[aff]+nChar, dAffOccur[aff]) for aff in dAff ] \ | > | 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 | dTagOccur[sTag] = dTagOccur.get(sTag, 0) + 1 aEntry.add((sFlex, dAff[sAff], dTag[sTag])) if not aEntry: raise ValueError("# Error. Empty lexicon") # Preparing DAWG print(" > Preparing list of words") print(" Filter: " + (sSelectFilterRegex or "[None]")) lVal = lChar + lAff + lTag lWord = [ [dChar[c] for c in sFlex] + [iAff+nChar] + [iTag+nChar+nAff] for sFlex, iAff, iTag in aEntry ] aEntry = None # Dictionary of arc values occurrency, to sort arcs of each node dValOccur = dict( [ (dChar[c], dCharOccur[c]) for c in dChar ] \ + [ (dAff[aff]+nChar, dAffOccur[aff]) for aff in dAff ] \ |
︙ | ︙ |
Modified lex_build.py from [2d1c4b9aa4] to [346704203c].
1 2 3 4 5 6 7 8 9 10 11 | #!python3 # Lexicon builder import argparse from distutils import dir_util import graphspell.dawg as fsa from graphspell.ibdawg import IBDAWG | | | | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 | #!python3 # Lexicon builder import argparse from distutils import dir_util import graphspell.dawg as fsa from graphspell.ibdawg import IBDAWG def build (spfSrc, sLangCode, sLangName, sfDict, bJSON=False, sDicName="", sFilter="", cStemmingMethod="S", nCompressMethod=1): "transform a text lexicon as a binary indexable dictionary" oDAWG = fsa.DAWG(spfSrc, cStemmingMethod, sLangCode, sLangName, sDicName, sFilter) dir_util.mkpath("graphspell/_dictionaries") oDAWG.writeInfo("graphspell/_dictionaries/" + sfDict + ".info.txt") oDAWG.writeBinary("graphspell/_dictionaries/" + sfDict + ".bdic", int(nCompressMethod)) if bJSON: dir_util.mkpath("graphspell-js/_dictionaries") oDic = IBDAWG(sfDict + ".bdic") oDic.writeAsJSObject("graphspell-js/_dictionaries/" + sfDict + ".json", bBinaryDictAsHexString=True) |
︙ | ︙ |
Modified make.py from [a8ca755148] to [eb03bf4198].
︙ | ︙ | |||
312 313 314 315 316 317 318 | dVars["dic_main_filename_js"] = "" dVars["dic_extended_filename_py"] = "" dVars["dic_extended_filename_js"] = "" dVars["dic_community_filename_py"] = "" dVars["dic_community_filename_js"] = "" dVars["dic_personal_filename_py"] = "" dVars["dic_personal_filename_js"] = "" | | > > > | | > > > > | | | | | | | | | | | | | | 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 | dVars["dic_main_filename_js"] = "" dVars["dic_extended_filename_py"] = "" dVars["dic_extended_filename_js"] = "" dVars["dic_community_filename_py"] = "" dVars["dic_community_filename_js"] = "" dVars["dic_personal_filename_py"] = "" dVars["dic_personal_filename_js"] = "" lDict = [ ("main", s) for s in dVars['dic_filenames'].split(",") ] if bExtendedDict: lDict.append(("extended", dVars['dic_extended_filename'])) if bCommunityDict: lDict.append(("community", dVars['dic_community_filename'])) if bPersonalDict: lDict.append(("personal", dVars['dic_personal_filename'])) for sType, sFileName in lDict: spfPyDic = "graphspell/_dictionaries/" + sFileName + ".bdic" spfJSDic = "graphspell-js/_dictionaries/" + sFileName + ".json" if not os.path.isfile(spfPyDic) or (bJavaScript and not os.path.isfile(spfJSDic)): buildDictionary(dVars, sType, bJavaScript) print(spfPyDic) file_util.copy_file(spfPyDic, "grammalecte/graphspell/_dictionaries") dVars['dic_'+sType+'_filename_py'] = sFileName + '.bdic' if bJavaScript: print(spfJSDic) file_util.copy_file(spfJSDic, "grammalecte-js/graphspell/_dictionaries") dVars['dic_'+sType+'_filename_js'] = sFileName + '.json' dVars['dic_main_filename_py'] = dVars['dic_default_filename_py'] + ".bdic" dVars['dic_main_filename_js'] = dVars['dic_default_filename_js'] + ".json" def buildDictionary (dVars, sType, bJavaScript=False): if sType == "main": spfLexSrc = dVars['lexicon_src'] l_sfDictDst = dVars['dic_filenames'].split(",") l_sDicName = dVars['dic_name'].split(",") l_sFilter = dVars['dic_filter'].split(",") for sfDictDst, sDicName, sFilter in zip(l_sfDictDst, l_sDicName, l_sFilter): lex_build.build(spfLexSrc, dVars['lang'], dVars['lang_name'], sfDictDst, bJavaScript, sDicName, sFilter, dVars['stemming_method'], int(dVars['fsa_method'])) else: if sType == "extended": spfLexSrc = dVars['lexicon_extended_src'] sfDictDst = dVars['dic_extended_filename'] sDicName = dVars['dic_extended_name'] elif sType == "community": spfLexSrc = dVars['lexicon_community_src'] sfDictDst = dVars['dic_community_filename'] sDicName = dVars['dic_community_name'] elif sType == "personal": spfLexSrc = dVars['lexicon_personal_src'] sfDictDst = dVars['dic_personal_filename'] sDicName = dVars['dic_personal_name'] lex_build.build(spfLexSrc, dVars['lang'], dVars['lang_name'], sfDictDst, bJavaScript, sDicName, "", dVars['stemming_method'], int(dVars['fsa_method'])) def main (): print("Python: " + sys.version) xParser = argparse.ArgumentParser() xParser.add_argument("lang", type=str, nargs='+', help="lang project to generate (name of folder in /lang)") |
︙ | ︙ |