1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
|
"""
Lexicographer for the French language
"""
# Note:
# This module must contain at least:
# <dSugg> : a dictionary for default suggestions.
# <bLexicographer> : a boolean False
# if the boolean is True, 4 functions are required:
# split(sWord) -> returns a list of string (that will be analyzed)
# analyze(sWord) -> returns a string with the meaning of word
# readableMorph(sMorph) -> returns a string with the meaning of tags
# filterSugg(aWord) -> returns a filtered list of suggestions
import re
#### Suggestions
|
>
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
|
"""
Lexicographer for the French language
"""
# Note:
# This module must contain at least:
# <dSugg> : a dictionary for default suggestions.
# <bLexicographer> : a boolean False
# if the boolean is True, 5 functions are required:
# split(sWord) -> returns a list of string (that will be analyzed)
# analyze(sWord) -> returns a string with the meaning of word
# readableMorph(sMorph) -> returns a string with the meaning of tags
# setLabelsOnToken(dToken) -> adds readable information on token
# filterSugg(aWord) -> returns a filtered list of suggestions
import re
#### Suggestions
|
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
|
if not sRes:
return " [" + sMorph + "]: étiquettes inconnues"
return sRes.rstrip(",")
_zPartDemForm = re.compile("([\\w]+)-(là|ci)$")
_zInterroVerb = re.compile("([\\w]+)(-(?:t-(?:ie?l|elle|on)|je|tu|ie?ls?|elles?|on|[nv]ous))$")
_zImperatifVerb = re.compile("([\\w]+)(-(?:l(?:es?|a)-(?:moi|toi|lui|[nv]ous|leur)|y|en|[mts][’'](?:y|en)|les?|la|[mt]oi|leur|lui))$")
def setLabelsOnToken (dToken):
# Token: .sType, .sValue, .nStart, .nEnd, .lMorph
try:
if dToken["sType"] == "PUNC" or dToken["sType"] == "SIGN":
dToken["aLabels"] = [_dValues.get(dToken["sValue"], "signe de ponctuation divers")]
elif dToken["sType"] == 'NUM':
|
|
|
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
|
if not sRes:
return " [" + sMorph + "]: étiquettes inconnues"
return sRes.rstrip(",")
_zPartDemForm = re.compile("([\\w]+)-(là|ci)$")
_zInterroVerb = re.compile("([\\w]+)(-(?:t-(?:ie?l|elle|on)|je|tu|ie?ls?|elles?|on|[nv]ous))$")
_zImperatifVerb = re.compile("([\\w]+)(-(?:l(?:es?|a)-(?:moi|toi|lui|[nv]ous|leur)|y|en|[mts]['’ʼ‘‛´`′‵՚ꞌꞋ](?:y|en)|les?|la|[mt]oi|leur|lui))$")
def setLabelsOnToken (dToken):
# Token: .sType, .sValue, .nStart, .nEnd, .lMorph
try:
if dToken["sType"] == "PUNC" or dToken["sType"] == "SIGN":
dToken["aLabels"] = [_dValues.get(dToken["sValue"], "signe de ponctuation divers")]
elif dToken["sType"] == 'NUM':
|
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
|
elif dToken["sType"] == 'PSEUDOHTML':
dToken["aLabels"] = ["balise pseudo-HTML"]
elif dToken["sType"] == 'HTMLENTITY':
dToken["aLabels"] = ["entité caractère XML/HTML"]
elif dToken["sType"] == 'HOUR':
dToken["aLabels"] = ["heure"]
elif dToken["sType"] == 'WORD_ELIDED':
dToken["aLabels"] = [_dValues.get(dToken["sValue"], "préfixe élidé inconnu")]
elif dToken["sType"] == 'WORD_ORDINAL':
dToken["aLabels"] = ["nombre ordinal"]
elif dToken["sType"] == 'FOLDERUNIX':
dToken["aLabels"] = ["dossier UNIX (et dérivés)"]
elif dToken["sType"] == 'FOLDERWIN':
dToken["aLabels"] = ["dossier Windows"]
elif dToken["sType"] == 'WORD_ACRONYM':
|
|
|
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
|
elif dToken["sType"] == 'PSEUDOHTML':
dToken["aLabels"] = ["balise pseudo-HTML"]
elif dToken["sType"] == 'HTMLENTITY':
dToken["aLabels"] = ["entité caractère XML/HTML"]
elif dToken["sType"] == 'HOUR':
dToken["aLabels"] = ["heure"]
elif dToken["sType"] == 'WORD_ELIDED':
dToken["aLabels"] = [_dValues.get(dToken["sValue"].lower(), "préfixe élidé inconnu")]
elif dToken["sType"] == 'WORD_ORDINAL':
dToken["aLabels"] = ["nombre ordinal"]
elif dToken["sType"] == 'FOLDERUNIX':
dToken["aLabels"] = ["dossier UNIX (et dérivés)"]
elif dToken["sType"] == 'FOLDERWIN':
dToken["aLabels"] = ["dossier Windows"]
elif dToken["sType"] == 'WORD_ACRONYM':
|