︙ | | | ︙ | |
8
9
10
11
12
13
14
15
16
17
18
19
20
21
|
#import unicodedata
from itertools import chain
from ..graphspell.spellchecker import SpellChecker
from ..graphspell.echo import echo
from . import gc_options
__all__ = [ "lang", "locales", "pkg", "name", "version", "author", \
"load", "parse", "getSpellChecker", \
"setOption", "setOptions", "getOptions", "getDefaultOptions", "getOptionsLabels", "resetOptions", "displayOptions", \
"ignoreRule", "resetIgnoreRules", "reactivateRule", "listRules", "displayRules" ]
__version__ = "${version}"
|
>
>
>
|
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
|
#import unicodedata
from itertools import chain
from ..graphspell.spellchecker import SpellChecker
from ..graphspell.echo import echo
from . import gc_options
from ..graphspell.tokenizer import Tokenizer
from .gc_rules_graph import dGraph
__all__ = [ "lang", "locales", "pkg", "name", "version", "author", \
"load", "parse", "getSpellChecker", \
"setOption", "setOptions", "getOptions", "getDefaultOptions", "getOptionsLabels", "resetOptions", "displayOptions", \
"ignoreRule", "resetIgnoreRules", "reactivateRule", "listRules", "displayRules" ]
__version__ = "${version}"
|
︙ | | | ︙ | |
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
|
# Module-level state shared by the functions of this engine.
_rules = None # module gc_rules
# data
# Name of the host application; assigned by load() from its sContext argument.
_sAppContext = "" # what software is running
# Active options; load() stores here a private copy of gc_options.getOptions(sContext).
_dOptions = None
# presumably the identifiers of the rules switched off by the user -- TODO confirm
_aIgnoredRules = set()
# SpellChecker instance created by load(); None until load() has run.
_oSpellChecker = None
# Maps a word to the list returned by _oSpellChecker.getMorph(word); filled by _storeMorphFromFSA().
_dAnalyses = {} # cache for data from dictionary
#### Parsing
def parse (sText, sCountry="${country_default}", bDebug=False, dOptions=None, bContext=False):
    """Analyse the paragraph sText and return the errors found.

    The paragraph is first proofread as a whole, then sentence by sentence.

    Parameters:
        sText: text of the paragraph.
        sCountry: country code forwarded to _proofread().
        bDebug: debug flag forwarded to _proofread().
        dOptions: options to use instead of the module options (_dOptions);
            ignored when empty or None.
        bContext: flag forwarded to _proofread().

    Returns:
        A view (iterable) on the values of the errors mapping built by
        _proofread().

    Any exception raised by _proofread() propagates to the caller.
    """
    #sText = unicodedata.normalize("NFC", sText)
    sAlt = sText        # untouched copy of the text, passed along with the working text
    dDA = {}            # Disambiguisator. Key = position; value = list of morphologies
    dPriority = {}      # Key = position; value = priority
    dOpt = _dOptions if not dOptions else dOptions

    # parse paragraph
    sNew, aErrors = _proofread(sText, sAlt, 0, True, dDA, dPriority, sCountry, dOpt, bDebug, bContext)
    if sNew:
        sText = sNew

    # cleanup: the invisible characters are written as escapes so that they
    # cannot be silently turned into ordinary spaces/hyphens by an editor
    for sChar, sRepl in (
        ("\u00a0", " "),        # nbsp
        ("\u202f", " "),        # nnbsp
        ("'", "\u2019"),        # straight apostrophe -> typographic apostrophe
        ("\u2011", "-"),        # nobreakdash
    ):
        if sChar in sText:
            sText = sText.replace(sChar, sRepl)

    # parse sentences
    for iStart, iEnd in _getSentenceBoundaries(sText):
        # only sentences longer than 4 and shorter than 2000 characters are analysed
        if 4 < (iEnd - iStart) < 2000:
            dDA.clear()     # disambiguation data is per sentence
            _, errs = _proofread(sText[iStart:iEnd], sAlt[iStart:iEnd], iStart, False, dDA, dPriority, sCountry, dOpt, bDebug, bContext)
            aErrors.update(errs)
    return aErrors.values() # this is a view (iterable)
def _getSentenceBoundaries (sText):
|
<
|
|
|
>
|
|
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
|
# Module-level state shared by the functions of this engine.
_rules = None # module gc_rules
# data
# Name of the host application; assigned by load() from its sContext argument.
_sAppContext = "" # what software is running
# Active options; load() stores here a private copy of gc_options.getOptions(sContext).
_dOptions = None
# presumably the identifiers of the rules switched off by the user -- TODO confirm
_aIgnoredRules = set()
# SpellChecker instance created by load(); None until load() has run.
_oSpellChecker = None
# Tokenizer obtained in load() from _oSpellChecker.getTokenizer(); None until load() has run.
_oTokenizer = None
#### Parsing
def parse (sText, sCountry="${country_default}", bDebug=False, dOptions=None, bContext=False):
    """Analyse the paragraph sText and return the errors found.

    The paragraph is first proofread as a whole, then sentence by sentence.

    Parameters:
        sText: text of the paragraph.
        sCountry: country code forwarded to _proofread().
        bDebug: debug flag forwarded to _proofread().
        dOptions: options to use instead of the module options (_dOptions);
            ignored when empty or None.
        bContext: flag forwarded to _proofread().

    Returns:
        A view (iterable) on the values of the errors mapping built by
        _proofread().

    Any exception raised by _proofread() propagates to the caller.
    """
    #sText = unicodedata.normalize("NFC", sText)
    sRealText = sText   # untouched copy of the text, passed along with the working text
    dDA = {}            # Disambiguisator. Key = position; value = list of morphologies
    dPriority = {}      # Key = position; value = priority
    dOpt = _dOptions if not dOptions else dOptions

    # parse paragraph
    sNew, aErrors = _proofread(sText, sRealText, 0, True, dDA, dPriority, sCountry, dOpt, bDebug, bContext)
    if sNew:
        sText = sNew

    # cleanup: the invisible characters are written as escapes so that they
    # cannot be silently turned into ordinary spaces/hyphens by an editor
    for sChar, sRepl in (
        ("\u00a0", " "),        # nbsp
        ("\u202f", " "),        # nnbsp
        ("'", "\u2019"),        # straight apostrophe -> typographic apostrophe
        ("\u2011", "-"),        # nobreakdash
    ):
        if sChar in sText:
            sText = sText.replace(sChar, sRepl)

    # parse sentences
    for iStart, iEnd in _getSentenceBoundaries(sText):
        # only sentences longer than 4 and shorter than 2000 characters are analysed
        if 4 < (iEnd - iStart) < 2000:
            dDA.clear()     # disambiguation data is per sentence
            # regex parser
            _, errs = _proofread(sText[iStart:iEnd], sRealText[iStart:iEnd], iStart, False, dDA, dPriority, sCountry, dOpt, bDebug, bContext)
            aErrors.update(errs)
    return aErrors.values() # this is a view (iterable)
def _getSentenceBoundaries (sText):
|
︙ | | | ︙ | |
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
|
_createError = _createDictError
def load (sContext="Python"):
    """Initialise the engine for the host application sContext.

    Creates the spell checker, records the application context and stores a
    private copy of the options returned by gc_options.getOptions(sContext).

    Initialisation is best effort: an ordinary error is printed with its
    traceback and not re-raised, so the module globals may be left partially
    initialised. KeyboardInterrupt and SystemExit are not intercepted.
    """
    global _oSpellChecker
    global _sAppContext
    global _dOptions
    try:
        _oSpellChecker = SpellChecker("${lang}", "${dic_main_filename_py}", "${dic_extended_filename_py}", "${dic_community_filename_py}", "${dic_personal_filename_py}")
        _sAppContext = sContext
        _dOptions = dict(gc_options.getOptions(sContext))   # duplication necessary, to be able to reset to default
    except Exception:
        # "except Exception" rather than a bare "except": interpreter-exit
        # signals must not be swallowed by the best-effort handler
        traceback.print_exc()
def setOption (sOpt, bVal):
    """Set the option sOpt to bVal; names absent from the current options are ignored."""
    if sOpt not in _dOptions:
        return
    _dOptions[sOpt] = bVal
|
>
>
>
|
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
|
_createError = _createDictError
def load (sContext="Python"):
    """Initialise the engine for the host application sContext.

    Creates the spell checker, records the application context, stores a
    private copy of the options returned by gc_options.getOptions(sContext),
    fetches the tokenizer from the spell checker and calls its
    activateStorage() method.

    Initialisation is best effort: an ordinary error is printed with its
    traceback and not re-raised, so the module globals may be left partially
    initialised. KeyboardInterrupt and SystemExit are not intercepted.
    """
    global _oSpellChecker
    global _sAppContext
    global _dOptions
    global _oTokenizer
    try:
        _oSpellChecker = SpellChecker("${lang}", "${dic_main_filename_py}", "${dic_extended_filename_py}", "${dic_community_filename_py}", "${dic_personal_filename_py}")
        _sAppContext = sContext
        _dOptions = dict(gc_options.getOptions(sContext))   # duplication necessary, to be able to reset to default
        _oTokenizer = _oSpellChecker.getTokenizer()
        _oSpellChecker.activateStorage()
    except Exception:
        # "except Exception" rather than a bare "except": interpreter-exit
        # signals must not be swallowed by the best-effort handler
        traceback.print_exc()
def setOption (sOpt, bVal):
    """Set the option sOpt to bVal; names absent from the current options are ignored."""
    if sOpt not in _dOptions:
        return
    _dOptions[sOpt] = bVal
|
︙ | | | ︙ | |
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
|
return os.path.join(os.path.dirname(sys.modules[__name__].__file__), __name__ + ".py")
#### common functions
# common regexes
_zEndOfSentence = re.compile('([.?!:;…][ .?!… »”")]*|.$)')
_zBeginOfParagraph = re.compile("^\W*")
_zEndOfParagraph = re.compile("\W*$")
_zNextWord = re.compile(" +(\w[\w-]*)")
_zPrevWord = re.compile("(\w[\w-]*) +$")
def option (sOpt):
    """Return the value stored for option sOpt, or False when the option is unknown."""
    xValue = _dOptions.get(sOpt, False)
    return xValue
def displayInfo (dDA, tWord):
    """Debugging helper: echo what is known about tWord, a (position, word) tuple.

    Prints the disambiguated morphologies (when dDA has an entry for the
    position) and the cached dictionary morphologies. Always returns True.
    """
    if not tWord:
        echo("> nothing to find")
    elif tWord[1] not in _dAnalyses and not _storeMorphFromFSA(tWord[1]):
        echo("> not in FSA")
    else:
        nPos = tWord[0]
        if nPos in dDA:
            echo("DA: " + str(dDA[nPos]))
        echo("FSA: " + str(_dAnalyses[tWord[1]]))
    return True
def _storeMorphFromFSA (sWord):
    """Cache in _dAnalyses the morphologies given by _oSpellChecker for sWord.

    Returns True when the cached value is non-empty, False otherwise.
    """
    lMorph = _oSpellChecker.getMorph(sWord)
    _dAnalyses[sWord] = lMorph
    return bool(lMorph)
def morph (dDA, tWord, sPattern, bStrict=True, bNoWord=False):
    """Test sPattern against the morphologies of tWord, a (position, word) tuple.

    The morphologies come from dDA when it has an entry for the position
    (disambiguation on), otherwise from the dictionary cache.
    With bStrict every morphology must match; otherwise one match suffices.
    Returns bNoWord when tWord is empty, False when no morphology is known.
    """
    if not tWord:
        return bNoWord
    sWord = tWord[1]
    if sWord not in _dAnalyses and not _storeMorphFromFSA(sWord):
        return False
    nPos = tWord[0]
    lMorph = dDA[nPos] if nPos in dDA else _dAnalyses[sWord]
    if not lMorph:
        return False
    zPattern = re.compile(sPattern)
    fQuantifier = all if bStrict else any
    return fQuantifier(zPattern.search(sMorph) for sMorph in lMorph)
def morphex (dDA, tWord, sPattern, sNegPattern, bNoWord=False):
    """Test the morphologies of tWord, a (position, word) tuple, with an exclusion pattern.

    Returns True when no morphology matches sNegPattern and at least one
    matches sPattern. The morphologies come from dDA when it has an entry for
    the position (disambiguation on), otherwise from the dictionary cache.
    Returns bNoWord when tWord is empty.
    """
    if not tWord:
        return bNoWord
    sWord = tWord[1]
    if sWord not in _dAnalyses and not _storeMorphFromFSA(sWord):
        return False
    nPos = tWord[0]
    lMorph = dDA[nPos] if nPos in dDA else _dAnalyses[sWord]
    # negative condition first: a single hit rejects the word
    zNegPattern = re.compile(sNegPattern)
    for sMorph in lMorph:
        if zNegPattern.search(sMorph):
            return False
    # then look for sPattern
    zPattern = re.compile(sPattern)
    for sMorph in lMorph:
        if zPattern.search(sMorph):
            return True
    return False
def analyse (sWord, sPattern, bStrict=True):
    """Test sPattern against the dictionary morphologies of sWord (disambiguation off).

    With bStrict every morphology must match; otherwise one match suffices.
    Returns False when no morphology is known for the word.
    """
    if sWord not in _dAnalyses and not _storeMorphFromFSA(sWord):
        return False
    lMorph = _dAnalyses[sWord]
    if not lMorph:
        return False
    zPattern = re.compile(sPattern)
    fQuantifier = all if bStrict else any
    return fQuantifier(zPattern.search(sMorph) for sMorph in lMorph)
def analysex (sWord, sPattern, sNegPattern):
    """Test the dictionary morphologies of sWord with an exclusion pattern (disambiguation off).

    Returns True when no morphology matches sNegPattern and at least one
    matches sPattern.
    """
    if sWord not in _dAnalyses and not _storeMorphFromFSA(sWord):
        return False
    lMorph = _dAnalyses[sWord]
    # negative condition first: a single hit rejects the word
    zNegPattern = re.compile(sNegPattern)
    for sMorph in lMorph:
        if zNegPattern.search(sMorph):
            return False
    # then look for sPattern
    zPattern = re.compile(sPattern)
    for sMorph in lMorph:
        if zPattern.search(sMorph):
            return True
    return False
def stem (sWord):
    """Return the list of stems of sWord, one per cached morphology.

    Each stem is the part of the morphology string located between its first
    character and its first space. An empty or unknown word gives [].
    """
    if not sWord:
        return []
    if sWord not in _dAnalyses and not _storeMorphFromFSA(sWord):
        return []
    lStems = []
    for sMorph in _dAnalyses[sWord]:
        iSpace = sMorph.find(" ")
        lStems.append(sMorph[1:iSpace])
    return lStems
## functions to get text outside pattern scope
# warning: check compile_rules.py to understand how it works
def nextword (s, iStart, n):
|
|
|
|
|
|
|
>
|
|
<
<
<
<
<
<
<
<
<
|
|
>
<
|
<
|
|
|
|
>
|
|
<
<
<
<
<
<
<
<
|
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
|
return os.path.join(os.path.dirname(sys.modules[__name__].__file__), __name__ + ".py")
#### common functions
# common regexes
_zEndOfSentence = re.compile(r'([.?!:;…][ .?!… »”")]*|.$)')
# run of non-word characters at the very start of the text
_zBeginOfParagraph = re.compile(r"^\W*")
# run of non-word characters at the very end of the text
_zEndOfParagraph = re.compile(r"\W*$")
# one or more spaces followed by a word (hyphens allowed after its first character); group 1 = the word
_zNextWord = re.compile(r" +(\w[\w-]*)")
# a word (hyphens allowed after its first character) followed by spaces up to the end; group 1 = the word
_zPrevWord = re.compile(r"(\w[\w-]*) +$")
def option (sOpt):
    """Return the value stored for option sOpt, or False when the option is unknown."""
    xValue = _dOptions.get(sOpt, False)
    return xValue
def displayInfo (dDA, tWord):
    """Debugging helper: echo what is known about tWord, a (position, word) tuple.

    Prints the disambiguated morphologies (when dDA has an entry for the
    position) and the morphologies given by the spell checker.
    Always returns True.
    """
    if not tWord:
        echo("> nothing to find")
    else:
        lMorph = _oSpellChecker.getMorph(tWord[1])
        if lMorph:
            nPos = tWord[0]
            if nPos in dDA:
                echo("DA: " + str(dDA[nPos]))
            echo("FSA: " + str(lMorph))
        else:
            echo("> not in dictionary")
    return True
def morph (dDA, tWord, sPattern, bStrict=True, bNoWord=False):
    """Test sPattern against the morphologies of tWord, a (position, word) tuple.

    The morphologies come from dDA when it has an entry for the position
    (disambiguation on), otherwise from the spell checker.
    With bStrict every morphology must match; otherwise one match suffices.
    Returns bNoWord when tWord is empty, False when no morphology is known.
    """
    if not tWord:
        return bNoWord
    nPos = tWord[0]
    if nPos in dDA:
        lMorph = dDA[nPos]
    else:
        lMorph = _oSpellChecker.getMorph(tWord[1])
    if not lMorph:
        return False
    zPattern = re.compile(sPattern)
    fQuantifier = all if bStrict else any
    return fQuantifier(zPattern.search(sMorph) for sMorph in lMorph)
def morphex (dDA, tWord, sPattern, sNegPattern, bNoWord=False):
    """Test the morphologies of tWord, a (position, word) tuple, with an exclusion pattern.

    Returns True when no morphology matches sNegPattern and at least one
    matches sPattern. The morphologies come from dDA when it has an entry for
    the position (disambiguation on), otherwise from the spell checker.
    Returns bNoWord when tWord is empty, False when no morphology is known.
    """
    if not tWord:
        return bNoWord
    nPos = tWord[0]
    if nPos in dDA:
        lMorph = dDA[nPos]
    else:
        lMorph = _oSpellChecker.getMorph(tWord[1])
    if not lMorph:
        return False
    # negative condition first: a single hit rejects the word
    zNegPattern = re.compile(sNegPattern)
    for sMorph in lMorph:
        if zNegPattern.search(sMorph):
            return False
    # then look for sPattern
    zPattern = re.compile(sPattern)
    for sMorph in lMorph:
        if zPattern.search(sMorph):
            return True
    return False
def analyse (sWord, sPattern, bStrict=True):
    """Test sPattern against the spell checker morphologies of sWord (disambiguation off).

    With bStrict every morphology must match; otherwise one match suffices.
    Returns False when no morphology is known for the word.
    """
    lMorph = _oSpellChecker.getMorph(sWord)
    if not lMorph:
        return False
    zPattern = re.compile(sPattern)
    fQuantifier = all if bStrict else any
    return fQuantifier(zPattern.search(sMorph) for sMorph in lMorph)
def analysex (sWord, sPattern, sNegPattern):
    """Test the spell checker morphologies of sWord with an exclusion pattern (disambiguation off).

    Returns True when no morphology matches sNegPattern and at least one
    matches sPattern; False when no morphology is known for the word.
    """
    lMorph = _oSpellChecker.getMorph(sWord)
    if not lMorph:
        return False
    # negative condition first: a single hit rejects the word
    zNegPattern = re.compile(sNegPattern)
    for sMorph in lMorph:
        if zNegPattern.search(sMorph):
            return False
    # then look for sPattern
    zPattern = re.compile(sPattern)
    for sMorph in lMorph:
        if zPattern.search(sMorph):
            return True
    return False
## functions to get text outside pattern scope
# warning: check compile_rules.py to understand how it works
def nextword (s, iStart, n):
|
︙ | | | ︙ | |
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
|
#### Disambiguator
def select (dDA, nPos, sWord, sPattern, lDefault=None):
    """Disambiguation: keep at position nPos only the morphologies of sWord matching sPattern.

    Nothing is done when sWord is empty, when dDA already has an entry for
    nPos, or when the word is unknown or has a single morphology.
    When no morphology matches, lDefault (if given) is stored instead.
    Always returns True.
    """
    if not sWord or nPos in dDA:
        return True
    if sWord not in _dAnalyses and not _storeMorphFromFSA(sWord):
        return True
    lMorph = _dAnalyses[sWord]
    if len(lMorph) == 1:
        return True
    lSelect = [ sMorph  for sMorph in lMorph  if re.search(sPattern, sMorph) ]
    if not lSelect:
        if lDefault:
            dDA[nPos] = lDefault
    elif len(lSelect) != len(lMorph):
        # store only when the selection really narrows the possibilities
        dDA[nPos] = lSelect
    return True
def exclude (dDA, nPos, sWord, sPattern, lDefault=None):
    """Disambiguation: drop at position nPos the morphologies of sWord matching sPattern.

    Nothing is done when sWord is empty, when dDA already has an entry for
    nPos, or when the word is unknown or has a single morphology.
    When every morphology matches, lDefault (if given) is stored instead.
    Always returns True.
    """
    if not sWord or nPos in dDA:
        return True
    if sWord not in _dAnalyses and not _storeMorphFromFSA(sWord):
        return True
    lMorph = _dAnalyses[sWord]
    if len(lMorph) == 1:
        return True
    lSelect = [ sMorph  for sMorph in lMorph  if not re.search(sPattern, sMorph) ]
    if not lSelect:
        if lDefault:
            dDA[nPos] = lDefault
    elif len(lSelect) != len(lMorph):
        # store only when the exclusion really narrows the possibilities
        dDA[nPos] = lSelect
    return True
def define (dDA, nPos, lMorph):
    """Disambiguation: set lMorph as the morphologies at position nPos in dDA.

    Any existing entry for nPos is overwritten. Always returns True.
    """
    dDA[nPos] = lMorph
    return True
#### GRAMMAR CHECKER PLUGINS
${plugins}
${callables}
|
|
<
|
|
|
<
<
|
<
|
|
|
<
<
<
>
>
|
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
|
#### Disambiguator
def select (dDA, nPos, sWord, sPattern, lDefault=None):
    """Disambiguation: keep at position nPos only the morphologies of sWord matching sPattern.

    Nothing is done when sWord is empty, when dDA already has an entry for
    nPos, or when the word has no morphology or a single one.
    When no morphology matches, lDefault (if given) is stored instead.
    Always returns True.
    """
    if not sWord or nPos in dDA:
        return True
    lMorph = _oSpellChecker.getMorph(sWord)
    if not lMorph:
        return True
    if len(lMorph) == 1:
        return True
    lSelect = [ sMorph  for sMorph in lMorph  if re.search(sPattern, sMorph) ]
    if not lSelect:
        if lDefault:
            dDA[nPos] = lDefault
    elif len(lSelect) != len(lMorph):
        # store only when the selection really narrows the possibilities
        dDA[nPos] = lSelect
    return True
def exclude (dDA, nPos, sWord, sPattern, lDefault=None):
    """Disambiguation: drop at position nPos the morphologies of sWord matching sPattern.

    Nothing is done when sWord is empty, when dDA already has an entry for
    nPos, or when the word has no morphology or a single one.
    When every morphology matches, lDefault (if given) is stored instead.
    Always returns True.
    """
    if not sWord or nPos in dDA:
        return True
    lMorph = _oSpellChecker.getMorph(sWord)
    if not lMorph:
        return True
    if len(lMorph) == 1:
        return True
    lSelect = [ sMorph  for sMorph in lMorph  if not re.search(sPattern, sMorph) ]
    if not lSelect:
        if lDefault:
            dDA[nPos] = lDefault
    elif len(lSelect) != len(lMorph):
        # store only when the exclusion really narrows the possibilities
        dDA[nPos] = lSelect
    return True
def define (dDA, nPos, lMorph):
    """Disambiguation: set lMorph as the morphologies at position nPos in dDA.

    Any existing entry for nPos is overwritten. Always returns True.
    """
    dDA.update({ nPos: lMorph })
    return True
#### GRAMMAR CHECKER PLUGINS
${plugins}
#### CALLABLES (generated code)
${callables}
|