Grammalecte  Changes On Branch ffd3e274fa2decd3

Changes In Branch rg Through [ffd3e274fa] Excluding Merge-Ins

This is equivalent to a diff from cb3f319c08 to ffd3e274fa

2018-07-28
09:08
[fr] options manquantes et nettoyage check-in: c79208cbc2 user: olr tags: fr, rg
00:14
[build] DARG: shorten building result display check-in: ffd3e274fa user: olr tags: build, rg
2018-07-27
18:11
[fr] conversion: regex rules -> graph rules check-in: ebc7b7f342 user: olr tags: fr, rg
2018-06-25
07:58
[fr] faux positif: en tant que président du conseil (trailing spaces automatically removed) check-in: 37fb199673 user: olr tags: trunk, fr
2018-06-24
19:03
merge trunk check-in: 099647c959 user: olr tags: rg
2018-06-22
07:46
[cli] option to load personal dictionary check-in: cb3f319c08 user: olr tags: trunk, cli
2018-06-15
20:44
[fr] faux positif: accord de laisser avec les pronoms sans impératif check-in: 24d41be12e user: olr tags: trunk, fr

Modified compile_rules.py from [1ea2b6d97a] to [e8250665d2].




1
2
3
4
5
6

7
8
9
10
11
12
13
14
15
16
17
18
19
20
21

22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63




import re
import traceback
import json

import compile_rules_js_convert as jsconv



dDEF = {}
lFUNCTIONS = []

aRULESET = set()     # set of rule-ids to check if there is several rules with the same id
nRULEWITHOUTNAME = 0

dJSREGEXES = {}

sWORDLIMITLEFT  = r"(?<![\w.,–-])"   # r"(?<![-.,—])\b"  seems slower
sWORDLIMITRIGHT = r"(?![\w–-])"      # r"\b(?!-—)"       seems slower


def prepareFunction (s):

    s = s.replace("__also__", "bCondMemo")
    s = s.replace("__else__", "not bCondMemo")
    s = re.sub(r"isStart *\(\)", 'before("^ *$|, *$")', s)
    s = re.sub(r"isRealStart *\(\)", 'before("^ *$")', s)
    s = re.sub(r"isStart0 *\(\)", 'before0("^ *$|, *$")', s)
    s = re.sub(r"isRealStart0 *\(\)", 'before0("^ *$")', s)
    s = re.sub(r"isEnd *\(\)", 'after("^ *$|^,")', s)
    s = re.sub(r"isRealEnd *\(\)", 'after("^ *$")', s)
    s = re.sub(r"isEnd0 *\(\)", 'after0("^ *$|^,")', s)
    s = re.sub(r"isRealEnd0 *\(\)", 'after0("^ *$")', s)
    s = re.sub(r"(select|exclude)[(][\\](\d+)", '\\1(dDA, m.start(\\2), m.group(\\2)', s)
    s = re.sub(r"define[(][\\](\d+)", 'define(dDA, m.start(\\1)', s)
    s = re.sub(r"(morph|morphex|displayInfo)[(][\\](\d+)", '\\1((m.start(\\2), m.group(\\2))', s)
    s = re.sub(r"(morph|morphex|displayInfo)[(]", '\\1(dDA, ', s)
    s = re.sub(r"(sugg\w+|switch\w+)\(@", '\\1(m.group(i[4])', s)
    s = re.sub(r"word\(\s*1\b", 'nextword1(s, m.end()', s)                                  # word(1)
    s = re.sub(r"word\(\s*-1\b", 'prevword1(s, m.start()', s)                               # word(-1)
    s = re.sub(r"word\(\s*(\d)", 'nextword(s, m.end(), \\1', s)                             # word(n)
    s = re.sub(r"word\(\s*-(\d)", 'prevword(s, m.start(), \\1', s)                          # word(-n)
    s = re.sub(r"before\(\s*", 'look(s[:m.start()], ', s)                                   # before(s)
    s = re.sub(r"after\(\s*", 'look(s[m.end():], ', s)                                      # after(s)
    s = re.sub(r"textarea\(\s*", 'look(s, ', s)                                             # textarea(s)
    s = re.sub(r"before_chk1\(\s*", 'look_chk1(dDA, s[:m.start()], 0, ', s)                 # before_chk1(s)
    s = re.sub(r"after_chk1\(\s*", 'look_chk1(dDA, s[m.end():], m.end(), ', s)              # after_chk1(s)
    s = re.sub(r"textarea_chk1\(\s*", 'look_chk1(dDA, s, 0, ', s)                           # textarea_chk1(s)
    s = re.sub(r"/0", 'sx[m.start():m.end()]', s)                                           # /0
    s = re.sub(r"before0\(\s*", 'look(sx[:m.start()], ', s)                                 # before0(s)
    s = re.sub(r"after0\(\s*", 'look(sx[m.end():], ', s)                                    # after0(s)
    s = re.sub(r"textarea0\(\s*", 'look(sx, ', s)                                           # textarea0(s)
    s = re.sub(r"before0_chk1\(\s*", 'look_chk1(dDA, sx[:m.start()], 0, ', s)               # before0_chk1(s)
    s = re.sub(r"after0_chk1\(\s*", 'look_chk1(dDA, sx[m.end():], m.end(), ', s)            # after0_chk1(s)
    s = re.sub(r"textarea0_chk1\(\s*", 'look_chk1(dDA, sx, 0, ', s)                         # textarea0_chk1(s)
    s = re.sub(r"isEndOfNG\(\s*\)", 'isEndOfNG(dDA, s[m.end():], m.end())', s)              # isEndOfNG(s)
    s = re.sub(r"isNextNotCOD\(\s*\)", 'isNextNotCOD(dDA, s[m.end():], m.end())', s)        # isNextNotCOD(s)
    s = re.sub(r"isNextVerb\(\s*\)", 'isNextVerb(dDA, s[m.end():], m.end())', s)            # isNextVerb(s)
    s = re.sub(r"\bspell *[(]", '_oSpellChecker.isValid(', s)
    s = re.sub(r"[\\](\d+)", 'm.group(\\1)', s)
    return s


def uppercase (s, sLang):
    "(flag i is not enough): converts regex to uppercase regex: 'foo' becomes '[Ff][Oo][Oo]', but 'Bar' becomes 'B[Aa][Rr]'."
>
>
>






>















>










|
|

|








|
|
|




|
|
|
|
|
|







1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
"""
Grammalecte: compile rules
"""

import re
import traceback
import json

import compile_rules_js_convert as jsconv
import compile_rules_graph as crg


dDEF = {}
lFUNCTIONS = []

aRULESET = set()     # set of rule-ids to check if there is several rules with the same id
nRULEWITHOUTNAME = 0

dJSREGEXES = {}

sWORDLIMITLEFT  = r"(?<![\w.,–-])"   # r"(?<![-.,—])\b"  seems slower
sWORDLIMITRIGHT = r"(?![\w–-])"      # r"\b(?!-—)"       seems slower


def prepareFunction (s):
    "convert simple rule syntax to a string of Python code"
    s = s.replace("__also__", "bCondMemo")
    s = s.replace("__else__", "not bCondMemo")
    s = re.sub(r"isStart *\(\)", 'before("^ *$|, *$")', s)
    s = re.sub(r"isRealStart *\(\)", 'before("^ *$")', s)
    s = re.sub(r"isStart0 *\(\)", 'before0("^ *$|, *$")', s)
    s = re.sub(r"isRealStart0 *\(\)", 'before0("^ *$")', s)
    s = re.sub(r"isEnd *\(\)", 'after("^ *$|^,")', s)
    s = re.sub(r"isRealEnd *\(\)", 'after("^ *$")', s)
    s = re.sub(r"isEnd0 *\(\)", 'after0("^ *$|^,")', s)
    s = re.sub(r"isRealEnd0 *\(\)", 'after0("^ *$")', s)
    s = re.sub(r"(select|exclude)[(][\\](\d+)", '\\1(dTokenPos, m.start(\\2), m.group(\\2)', s)
    s = re.sub(r"define[(][\\](\d+)", 'define(dTokenPos, m.start(\\1)', s)
    s = re.sub(r"(morph|morphex|displayInfo)[(][\\](\d+)", '\\1((m.start(\\2), m.group(\\2))', s)
    s = re.sub(r"(morph|morphex|displayInfo)[(]", '\\1(dTokenPos, ', s)
    s = re.sub(r"(sugg\w+|switch\w+)\(@", '\\1(m.group(i[4])', s)
    s = re.sub(r"word\(\s*1\b", 'nextword1(s, m.end()', s)                                  # word(1)
    s = re.sub(r"word\(\s*-1\b", 'prevword1(s, m.start()', s)                               # word(-1)
    s = re.sub(r"word\(\s*(\d)", 'nextword(s, m.end(), \\1', s)                             # word(n)
    s = re.sub(r"word\(\s*-(\d)", 'prevword(s, m.start(), \\1', s)                          # word(-n)
    s = re.sub(r"before\(\s*", 'look(s[:m.start()], ', s)                                   # before(s)
    s = re.sub(r"after\(\s*", 'look(s[m.end():], ', s)                                      # after(s)
    s = re.sub(r"textarea\(\s*", 'look(s, ', s)                                             # textarea(s)
    s = re.sub(r"before_chk1\(\s*", 'look_chk1(dTokenPos, s[:m.start()], 0, ', s)           # before_chk1(s)
    s = re.sub(r"after_chk1\(\s*", 'look_chk1(dTokenPos, s[m.end():], m.end(), ', s)        # after_chk1(s)
    s = re.sub(r"textarea_chk1\(\s*", 'look_chk1(dTokenPos, s, 0, ', s)                     # textarea_chk1(s)
    s = re.sub(r"/0", 'sx[m.start():m.end()]', s)                                           # /0
    s = re.sub(r"before0\(\s*", 'look(sx[:m.start()], ', s)                                 # before0(s)
    s = re.sub(r"after0\(\s*", 'look(sx[m.end():], ', s)                                    # after0(s)
    s = re.sub(r"textarea0\(\s*", 'look(sx, ', s)                                           # textarea0(s)
    s = re.sub(r"before0_chk1\(\s*", 'look_chk1(dTokenPos, sx[:m.start()], 0, ', s)         # before0_chk1(s)
    s = re.sub(r"after0_chk1\(\s*", 'look_chk1(dTokenPos, sx[m.end():], m.end(), ', s)      # after0_chk1(s)
    s = re.sub(r"textarea0_chk1\(\s*", 'look_chk1(dTokenPos, sx, 0, ', s)                   # textarea0_chk1(s)
    s = re.sub(r"isEndOfNG\(\s*\)", 'isEndOfNG(dTokenPos, s[m.end():], m.end())', s)        # isEndOfNG(s)
    s = re.sub(r"isNextNotCOD\(\s*\)", 'isNextNotCOD(dTokenPos, s[m.end():], m.end())', s)  # isNextNotCOD(s)
    s = re.sub(r"isNextVerb\(\s*\)", 'isNextVerb(dTokenPos, s[m.end():], m.end())', s)      # isNextVerb(s)
    s = re.sub(r"\bspell *[(]", '_oSpellChecker.isValid(', s)
    s = re.sub(r"[\\](\d+)", 'm.group(\\1)', s)
    return s


def uppercase (s, sLang):
    "(flag i is not enough): converts regex to uppercase regex: 'foo' becomes '[Ff][Oo][Oo]', but 'Bar' becomes 'B[Aa][Rr]'."
95
96
97
98
99
100
101

102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117









118
119
120
121
122
123
124
125
126
127
128
129
130
131
            nState = 4
        elif nState == 4:
            nState = 0
    return sUp


def countGroupInRegex (sRegex):

    try:
        return re.compile(sRegex).groups
    except:
        traceback.print_exc()
        print(sRegex)
    return 0


def createRule (s, nIdLine, sLang, bParagraph, dOptPriority):
    "returns rule as list [option name, regex, bCaseInsensitive, identifier, list of actions]"
    global dJSREGEXES
    global nRULEWITHOUTNAME

    #### OPTIONS
    sLineId = str(nIdLine) + ("p" if bParagraph else "s")
    sRuleId = sLineId









    sOption = False         # False or [a-z0-9]+ name
    nPriority = 4           # Default is 4, value must be between 0 and 9
    tGroups = None          # code for groups positioning (only useful for JavaScript)
    cCaseMode = 'i'         # i: case insensitive,  s: case sensitive,  u: uppercasing allowed
    cWordLimitLeft = '['    # [: word limit, <: no specific limit
    cWordLimitRight = ']'   # ]: word limit, >: no specific limit
    m = re.match("^__(?P<borders_and_case>[[<]\\w[]>])(?P<option>/[a-zA-Z0-9]+|)(?P<ruleid>\\(\\w+\\)|)(?P<priority>![0-9]|)__ *", s)
    if m:
        cWordLimitLeft = m.group('borders_and_case')[0]
        cCaseMode = m.group('borders_and_case')[1]
        cWordLimitRight = m.group('borders_and_case')[2]
        sOption = m.group('option')[1:]  if m.group('option')  else False
        if m.group('ruleid'):
            sRuleId =  m.group('ruleid')[1:-1]







>













<


>
>
>
>
>
>
>
>
>






|







100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120

121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
            nState = 4
        elif nState == 4:
            nState = 0
    return sUp


def countGroupInRegex (sRegex):
    "returns the number of groups in <sRegex>"
    try:
        return re.compile(sRegex).groups
    except:
        traceback.print_exc()
        print(sRegex)
    return 0


def createRule (s, nIdLine, sLang, bParagraph, dOptPriority):
    "returns rule as list [option name, regex, bCaseInsensitive, identifier, list of actions]"
    global dJSREGEXES
    global nRULEWITHOUTNAME


    sLineId = str(nIdLine) + ("p" if bParagraph else "s")
    sRuleId = sLineId

    #### GRAPH CALL
    if s.startswith("@@@@"):
        if bParagraph:
            print("Error. Graph call can be made only after the first pass (sentence by sentence)")
            exit()
        return ["@@@@", s[4:], sLineId]

    #### OPTIONS
    sOption = False         # False or [a-z0-9]+ name
    nPriority = 4           # Default is 4, value must be between 0 and 9
    tGroups = None          # code for groups positioning (only useful for JavaScript)
    cCaseMode = 'i'         # i: case insensitive,  s: case sensitive,  u: uppercasing allowed
    cWordLimitLeft = '['    # [: word limit, <: no specific limit
    cWordLimitRight = ']'   # ]: word limit, >: no specific limit
    m = re.match("^__(?P<borders_and_case>[\\[<]\\w[\\]>])(?P<option>/[a-zA-Z0-9]+|)(?P<ruleid>\\(\\w+\\)|)(?P<priority>![0-9]|)__ *", s)
    if m:
        cWordLimitLeft = m.group('borders_and_case')[0]
        cCaseMode = m.group('borders_and_case')[1]
        cWordLimitRight = m.group('borders_and_case')[2]
        sOption = m.group('option')[1:]  if m.group('option')  else False
        if m.group('ruleid'):
            sRuleId =  m.group('ruleid')[1:-1]
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
    #### REGEX TRIGGER
    i = s.find(" <<-")
    if i == -1:
        print("# Error: no condition at line " + sLineId)
        return None
    sRegex = s[:i].strip()
    s = s[i+4:]
    
    # JS groups positioning codes
    m = re.search("@@\\S+", sRegex)
    if m:
        tGroups = jsconv.groupsPositioningCodeToList(sRegex[m.start()+2:])
        sRegex = sRegex[:m.start()].strip()
    # JS regex
    m = re.search("<js>.+</js>i?", sRegex)







|







159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
    #### REGEX TRIGGER
    i = s.find(" <<-")
    if i == -1:
        print("# Error: no condition at line " + sLineId)
        return None
    sRegex = s[:i].strip()
    s = s[i+4:]

    # JS groups positioning codes
    m = re.search("@@\\S+", sRegex)
    if m:
        tGroups = jsconv.groupsPositioningCodeToList(sRegex[m.start()+2:])
        sRegex = sRegex[:m.start()].strip()
    # JS regex
    m = re.search("<js>.+</js>i?", sRegex)
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229















230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276

277
278
279
280
281
282
283
284
285
286
287
288
289
290

291
292
293
294
295
296
297
298
299
300
301



302
303
304

305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333

334
335
336
337
338
339
340
341
342
343
344
345

346
347

348
349
350
351
352
353

354
355
356
357
358
359
360
        sRegex = sRegex.replace("(?i)", "")
        sRegex = uppercase(sRegex, sLang)
    else:
        print("# Unknown case mode [" + cCaseMode + "] at line " + sLineId)

    ## check regex
    try:
        z = re.compile(sRegex)
    except:
        print("# Regex error at line ", nIdLine)
        print(sRegex)
        traceback.print_exc()
        return None
    ## groups in non grouping parenthesis
    for x in re.finditer("\(\?:[^)]*\([[\w -]", sRegex):
        print("# Warning: groups inside non grouping parenthesis in regex at line " + sLineId)

    #### PARSE ACTIONS
    lActions = []
    nAction = 1
    for sAction in s.split(" <<- "):
        t = createAction(sRuleId + "_" + str(nAction), sAction, nGroup)
        nAction += 1
        if t:
            lActions.append(t)
    if not lActions:
        return None

    return [sOption, sRegex, bCaseInsensitive, sLineId, sRuleId, nPriority, lActions, tGroups]

















def createAction (sIdAction, sAction, nGroup):
    "returns an action to perform as a tuple (condition, action type, action[, iGroup [, message, URL ]])"
    global lFUNCTIONS

    m = re.search(r"([-~=>])(\d*|)>>", sAction)
    if not m:
        print("# No action at line " + sIdAction)
        return None

    #### CONDITION
    sCondition = sAction[:m.start()].strip()
    if sCondition:
        sCondition = prepareFunction(sCondition)
        lFUNCTIONS.append(("c_"+sIdAction, sCondition))
        for x in re.finditer("[.](?:group|start|end)[(](\d+)[)]", sCondition):
            if int(x.group(1)) > nGroup:
                print("# Error in groups in condition at line " + sIdAction + " ("+str(nGroup)+" groups only)")
        if ".match" in sCondition:
            print("# Error. JS compatibility. Don't use .match() in condition, use .search()")
        sCondition = "c_"+sIdAction
    else:
        sCondition = None

    #### iGroup / positioning
    iGroup = int(m.group(2)) if m.group(2) else 0
    if iGroup > nGroup:
        print("# Selected group > group number in regex at line " + sIdAction)
    
    #### ACTION
    sAction = sAction[m.end():].strip()
    cAction = m.group(1)
    if cAction == "-":
        ## error
        iMsg = sAction.find(" # ")
        if iMsg == -1:
            sMsg = "# Error. Error message not found."
            sURL = ""
            print(sMsg + " Action id: " + sIdAction)
        else:
            sMsg = sAction[iMsg+3:].strip()
            sAction = sAction[:iMsg].strip()
            sURL = ""
            mURL = re.search("[|] *(https?://.*)", sMsg)
            if mURL:
                sURL = mURL.group(1).strip()
                sMsg = sMsg[:mURL.start(0)].strip()

            if sMsg[0:1] == "=":
                sMsg = prepareFunction(sMsg[1:])
                lFUNCTIONS.append(("m_"+sIdAction, sMsg))
                for x in re.finditer("group[(](\d+)[)]", sMsg):
                    if int(x.group(1)) > nGroup:
                        print("# Error in groups in message at line " + sIdAction + " ("+str(nGroup)+" groups only)")
                sMsg = "=m_"+sIdAction
            else:
                for x in re.finditer(r"\\(\d+)", sMsg):
                    if int(x.group(1)) > nGroup:
                        print("# Error in groups in message at line " + sIdAction + " ("+str(nGroup)+" groups only)")
                if re.search("[.]\\w+[(]", sMsg):
                    print("# Error in message at line " + sIdAction + ":  This message looks like code. Line should begin with =")
            

    if sAction[0:1] == "=" or cAction == "=":
        if "define" in sAction and not re.search(r"define\(\\\d+ *, *\[.*\] *\)", sAction):
            print("# Error in action at line " + sIdAction + ": second argument for define must be a list of strings")
        sAction = prepareFunction(sAction)
        sAction = sAction.replace("m.group(i[4])", "m.group("+str(iGroup)+")")
        for x in re.finditer("group[(](\d+)[)]", sAction):
            if int(x.group(1)) > nGroup:
                print("# Error in groups in replacement at line " + sIdAction + " ("+str(nGroup)+" groups only)")
    else:
        for x in re.finditer(r"\\(\d+)", sAction):
            if int(x.group(1)) > nGroup:



                print("# Error in groups in replacement at line " + sIdAction + " ("+str(nGroup)+" groups only)")
        if re.search("[.]\\w+[(]|sugg\\w+[(]", sAction):
            print("# Error in action at line " + sIdAction + ":  This action looks like code. Line should begin with =")


    if cAction == "-":
        ## error detected --> suggestion
        if not sAction:
            print("# Error in action at line " + sIdAction + ":  This action is empty.")
        if sAction[0:1] == "=":
            lFUNCTIONS.append(("s_"+sIdAction, sAction[1:]))
            sAction = "=s_"+sIdAction
        elif sAction.startswith('"') and sAction.endswith('"'):
            sAction = sAction[1:-1]
        if not sMsg:
            print("# Error in action at line " + sIdAction + ":  the message is empty.")
        return [sCondition, cAction, sAction, iGroup, sMsg, sURL]
    elif cAction == "~":
        ## text processor
        if not sAction:
            print("# Error in action at line " + sIdAction + ":  This action is empty.")
        if sAction[0:1] == "=":
            lFUNCTIONS.append(("p_"+sIdAction, sAction[1:]))
            sAction = "=p_"+sIdAction
        elif sAction.startswith('"') and sAction.endswith('"'):
            sAction = sAction[1:-1]
        return [sCondition, cAction, sAction, iGroup]
    elif cAction == "=":
        ## disambiguator
        if sAction[0:1] == "=":
            sAction = sAction[1:]
        if not sAction:
            print("# Error in action at line " + sIdAction + ":  This action is empty.")

        lFUNCTIONS.append(("d_"+sIdAction, sAction))
        sAction = "d_"+sIdAction
        return [sCondition, cAction, sAction]
    elif cAction == ">":
        ## no action, break loop if condition is False
        return [sCondition, cAction, ""]
    else:
        print("# Unknown action at line " + sIdAction)
        return None


def _calcRulesStats (lRules):

    d = {'=':0, '~': 0, '-': 0, '>': 0}
    for aRule in lRules:

        for aAction in aRule[6]:
            d[aAction[1]] = d[aAction[1]] + 1
    return (d, len(lRules))


def displayStats (lParagraphRules, lSentenceRules):

    print("  {:>18} {:>18} {:>18} {:>18}".format("DISAMBIGUATOR", "TEXT PROCESSOR", "GRAMMAR CHECKING", "REGEX"))
    d, nRule = _calcRulesStats(lParagraphRules)
    print("§ {:>10} actions {:>10} actions {:>10} actions  in {:>8} rules".format(d['='], d['~'], d['-'], nRule))
    d, nRule = _calcRulesStats(lSentenceRules)
    print("s {:>10} actions {:>10} actions {:>10} actions  in {:>8} rules".format(d['='], d['~'], d['-'], nRule))









|






|















>
>
>
>
>
>
>
>
>
>
>
>
>
>
>



<
<









|
|
<
<


|







|


















>


|
<
<
<
|

<
<
|
<
<
|
>

<
<


<
<
<

|
|
>
>
>
|
|
|
>



<
<

|
|







<
<

|
|







|
|
>
|
|

<
<
<






>


>
|
|




>







214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261


262
263
264
265
266
267
268
269
270
271
272


273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305



306
307


308


309
310
311


312
313



314
315
316
317
318
319
320
321
322
323
324
325
326


327
328
329
330
331
332
333
334
335
336


337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352



353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
        sRegex = sRegex.replace("(?i)", "")
        sRegex = uppercase(sRegex, sLang)
    else:
        print("# Unknown case mode [" + cCaseMode + "] at line " + sLineId)

    ## check regex
    try:
        re.compile(sRegex)
    except:
        print("# Regex error at line ", nIdLine)
        print(sRegex)
        traceback.print_exc()
        return None
    ## groups in non grouping parenthesis
    for x in re.finditer(r"\(\?:[^)]*\([\[\w -]", sRegex):
        print("# Warning: groups inside non grouping parenthesis in regex at line " + sLineId)

    #### PARSE ACTIONS
    lActions = []
    nAction = 1
    for sAction in s.split(" <<- "):
        t = createAction(sRuleId + "_" + str(nAction), sAction, nGroup)
        nAction += 1
        if t:
            lActions.append(t)
    if not lActions:
        return None

    return [sOption, sRegex, bCaseInsensitive, sLineId, sRuleId, nPriority, lActions, tGroups]


def checkReferenceNumbers (sText, sActionId, nToken):
    "check if token references in <sText> greater than <nToken> (debugging)"
    for x in re.finditer(r"\\(\d+)", sText):
        if int(x.group(1)) > nToken:
            print("# Error in token index at line " + sActionId + " ("+str(nToken)+" tokens only)")
            print(sText)


def checkIfThereIsCode (sText, sActionId):
    "check if there is code in <sText> (debugging)"
    if re.search("[.]\\w+[(]|sugg\\w+[(]|\\([0-9]|\\[[0-9]", sText):
        print("# Warning at line " + sActionId + ":  This message looks like code. Line should probably begin with =")
        print(sText)


def createAction (sIdAction, sAction, nGroup):
    "returns an action to perform as a tuple (condition, action type, action[, iGroup [, message, URL ]])"


    m = re.search(r"([-~=>])(\d*|)>>", sAction)
    if not m:
        print("# No action at line " + sIdAction)
        return None

    #### CONDITION
    sCondition = sAction[:m.start()].strip()
    if sCondition:
        sCondition = prepareFunction(sCondition)
        lFUNCTIONS.append(("_c_"+sIdAction, sCondition))
        checkReferenceNumbers(sCondition, sIdAction, nGroup)


        if ".match" in sCondition:
            print("# Error. JS compatibility. Don't use .match() in condition, use .search()")
        sCondition = "_c_"+sIdAction
    else:
        sCondition = None

    #### iGroup / positioning
    iGroup = int(m.group(2)) if m.group(2) else 0
    if iGroup > nGroup:
        print("# Selected group > group number in regex at line " + sIdAction)

    #### ACTION
    sAction = sAction[m.end():].strip()
    cAction = m.group(1)
    if cAction == "-":
        ## error
        iMsg = sAction.find(" # ")
        if iMsg == -1:
            sMsg = "# Error. Error message not found."
            sURL = ""
            print(sMsg + " Action id: " + sIdAction)
        else:
            sMsg = sAction[iMsg+3:].strip()
            sAction = sAction[:iMsg].strip()
            sURL = ""
            mURL = re.search("[|] *(https?://.*)", sMsg)
            if mURL:
                sURL = mURL.group(1).strip()
                sMsg = sMsg[:mURL.start(0)].strip()
            checkReferenceNumbers(sMsg, sIdAction, nGroup)
            if sMsg[0:1] == "=":
                sMsg = prepareFunction(sMsg[1:])
                lFUNCTIONS.append(("_m_"+sIdAction, sMsg))



                sMsg = "=_m_"+sIdAction
            else:


                checkIfThereIsCode(sMsg, sIdAction)



    checkReferenceNumbers(sAction, sIdAction, nGroup)
    if sAction[0:1] == "=" or cAction == "=":


        sAction = prepareFunction(sAction)
        sAction = sAction.replace("m.group(i[4])", "m.group("+str(iGroup)+")")



    else:
        checkIfThereIsCode(sAction, sIdAction)

    if cAction == ">":
        ## no action, break loop if condition is False
        return [sCondition, cAction, ""]

    if not sAction:
        print("# Error in action at line " + sIdAction + ":  This action is empty.")
        return None

    if cAction == "-":
        ## error detected --> suggestion


        if sAction[0:1] == "=":
            lFUNCTIONS.append(("_s_"+sIdAction, sAction[1:]))
            sAction = "=_s_"+sIdAction
        elif sAction.startswith('"') and sAction.endswith('"'):
            sAction = sAction[1:-1]
        if not sMsg:
            print("# Error in action at line " + sIdAction + ":  the message is empty.")
        return [sCondition, cAction, sAction, iGroup, sMsg, sURL]
    elif cAction == "~":
        ## text processor


        if sAction[0:1] == "=":
            lFUNCTIONS.append(("_p_"+sIdAction, sAction[1:]))
            sAction = "=_p_"+sIdAction
        elif sAction.startswith('"') and sAction.endswith('"'):
            sAction = sAction[1:-1]
        return [sCondition, cAction, sAction, iGroup]
    elif cAction == "=":
        ## disambiguator
        if sAction[0:1] == "=":
            sAction = sAction[1:]
        if "define" in sAction and not re.search(r"define\(dTokenPos, *m\.start.*, \[.*\] *\)", sAction):
            print("# Error in action at line " + sIdAction + ": second argument for define must be a list of strings")
            print(sAction)
        lFUNCTIONS.append(("_d_"+sIdAction, sAction))
        sAction = "_d_"+sIdAction
        return [sCondition, cAction, sAction]



    else:
        print("# Unknown action at line " + sIdAction)
        return None


def _calcRulesStats (lRules):
    "count rules and actions"
    d = {'=':0, '~': 0, '-': 0, '>': 0}
    for aRule in lRules:
        if aRule[0] != "@@@@":
            for aAction in aRule[6]:
                d[aAction[1]] = d[aAction[1]] + 1
    return (d, len(lRules))


def displayStats (lParagraphRules, lSentenceRules):
    "display rules numbers"
    print("  {:>18} {:>18} {:>18} {:>18}".format("DISAMBIGUATOR", "TEXT PROCESSOR", "GRAMMAR CHECKING", "REGEX"))
    d, nRule = _calcRulesStats(lParagraphRules)
    print("§ {:>10} actions {:>10} actions {:>10} actions  in {:>8} rules".format(d['='], d['~'], d['-'], nRule))
    d, nRule = _calcRulesStats(lSentenceRules)
    print("s {:>10} actions {:>10} actions {:>10} actions  in {:>8} rules".format(d['='], d['~'], d['-'], nRule))


389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
            m = re.match("OPTGROUP/([a-z0-9]+):(.+)$", sLine)
            lStructOpt.append( (m.group(1), list(map(str.split, m.group(2).split(",")))) )
        elif sLine.startswith("OPTSOFTWARE:"):
            lOpt = [ [s, {}]  for s in sLine[12:].strip().split() ]  # don’t use tuples (s, {}), because unknown to JS
        elif sLine.startswith("OPT/"):
            m = re.match("OPT/([a-z0-9]+):(.+)$", sLine)
            for i, sOpt in enumerate(m.group(2).split()):
                lOpt[i][1][m.group(1)] =  eval(sOpt)
        elif sLine.startswith("OPTPRIORITY/"):
            m = re.match("OPTPRIORITY/([a-z0-9]+): *([0-9])$", sLine)
            dOptPriority[m.group(1)] = int(m.group(2))
        elif sLine.startswith("OPTLANG/"):
            m = re.match("OPTLANG/([a-z][a-z](?:_[A-Z][A-Z]|)):(.+)$", sLine)
            sLang = m.group(1)[:2]
            dOptLabel[sLang] = { "__optiontitle__": m.group(2).strip() }







|







405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
            m = re.match("OPTGROUP/([a-z0-9]+):(.+)$", sLine)
            lStructOpt.append( (m.group(1), list(map(str.split, m.group(2).split(",")))) )
        elif sLine.startswith("OPTSOFTWARE:"):
            lOpt = [ [s, {}]  for s in sLine[12:].strip().split() ]  # don’t use tuples (s, {}), because unknown to JS
        elif sLine.startswith("OPT/"):
            m = re.match("OPT/([a-z0-9]+):(.+)$", sLine)
            for i, sOpt in enumerate(m.group(2).split()):
                lOpt[i][1][m.group(1)] = eval(sOpt)
        elif sLine.startswith("OPTPRIORITY/"):
            m = re.match("OPTPRIORITY/([a-z0-9]+): *([0-9])$", sLine)
            dOptPriority[m.group(1)] = int(m.group(2))
        elif sLine.startswith("OPTLANG/"):
            m = re.match("OPTLANG/([a-z][a-z](?:_[A-Z][A-Z]|)):(.+)$", sLine)
            sLang = m.group(1)[:2]
            dOptLabel[sLang] = { "__optiontitle__": m.group(2).strip() }
413
414
415
416
417
418
419

420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445

446
447
448

449
450
451
452
453
454
455
456
457
458
459
460
461
462

463
464

465
466

467
468
469
470
471

472
473
474





















475

476
477

478
479
480
481
482
483
484
    print("  options defined for: " + ", ".join([ t[0] for t in lOpt ]))
    dOptions = { "lStructOpt": lStructOpt, "dOptLabel": dOptLabel, "sDefaultUILang": sDefaultUILang }
    dOptions.update({ "dOpt"+k: v  for k, v in lOpt })
    return dOptions, dOptPriority


def printBookmark (nLevel, sComment, nLine):

    print("  {:>6}:  {}".format(nLine, "  " * nLevel + sComment))


def make (spLang, sLang, bJavaScript):
    "compile rules, returns a dictionary of values"
    # for clarity purpose, don’t create any file here

    print("> read rules file...")
    try:
        lRules = open(spLang + "/rules.grx", 'r', encoding="utf-8").readlines()
    except:
        print("Error. Rules file in project [" + sLang + "] not found.")
        exit()

    # removing comments, zeroing empty lines, creating definitions, storing tests, merging rule lines
    print("  parsing rules...")
    global dDEF
    lLine = []
    lRuleLine = []
    lTest = []
    lOpt = []
    zBookmark = re.compile("^!!+")
    zGraphLink = re.compile(r"^@@@@GRAPHLINK>(\w+)@@@@")

    for i, sLine in enumerate(lRules, 1):
        if sLine.startswith('#END'):

            printBookmark(0, "BREAK BY #END", i)
            break
        elif sLine.startswith("#"):

            pass
        elif sLine.startswith("@@@@"):
            m = re.match(r"^@@@@GRAPHLINK>(\w+)@@@@", sLine.strip())
            if m:
                #lRuleLine.append(["@GRAPHLINK", m.group(1)])
                printBookmark(1, "@GRAPHLINK: " + m.group(1), i)
        elif sLine.startswith("DEF:"):
            m = re.match("DEF: +([a-zA-Z_][a-zA-Z_0-9]*) +(.+)$", sLine.strip())
            if m:
                dDEF["{"+m.group(1)+"}"] = m.group(2)
            else:
                print("Error in definition: ", end="")
                print(sLine.strip())
        elif sLine.startswith("TEST:"):

            lTest.append("{:<8}".format(i) + "  " + sLine[5:].strip())
        elif sLine.startswith("TODO:"):

            pass
        elif sLine.startswith(("OPTGROUP/", "OPTSOFTWARE:", "OPT/", "OPTLANG/", "OPTDEFAULTUILANG:", "OPTLABEL/", "OPTPRIORITY/")):

            lOpt.append(sLine)
        elif re.match("[  \t]*$", sLine):
            pass
        elif sLine.startswith("!!"):
            m = zBookmark.search(sLine)

            nExMk = len(m.group(0))
            if sLine[nExMk:].strip():
                printBookmark(nExMk-2, sLine[nExMk:].strip(), i)





















        elif sLine.startswith(("    ", "\t")):

            lRuleLine[len(lRuleLine)-1][1] += " " + sLine.strip()
        else:

            lRuleLine.append([i, sLine.strip()])

    # generating options files
    print("  parsing options...")
    try:
        dOptions, dOptPriority = prepareOptions(lOpt)
    except:







>
















<
<



|
|



>



>

|
<
|
<
<
<







>


>


>

<
<

|
>


|
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>

>
|

>







429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452


453
454
455
456
457
458
459
460
461
462
463
464
465
466
467

468



469
470
471
472
473
474
475
476
477
478
479
480
481
482
483


484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
    print("  options defined for: " + ", ".join([ t[0] for t in lOpt ]))
    dOptions = { "lStructOpt": lStructOpt, "dOptLabel": dOptLabel, "sDefaultUILang": sDefaultUILang }
    dOptions.update({ "dOpt"+k: v  for k, v in lOpt })
    return dOptions, dOptPriority


def printBookmark (nLevel, sComment, nLine):
    "print bookmark within the rules file"
    print("  {:>6}:  {}".format(nLine, "  " * nLevel + sComment))


def make (spLang, sLang, bJavaScript):
    "compile rules, returns a dictionary of values"
    # for clarity purpose, don’t create any file here

    print("> read rules file...")
    try:
        lRules = open(spLang + "/rules.grx", 'r', encoding="utf-8").readlines()
    except:
        print("Error. Rules file in project [" + sLang + "] not found.")
        exit()

    # removing comments, zeroing empty lines, creating definitions, storing tests, merging rule lines
    print("  parsing rules...")


    lRuleLine = []
    lTest = []
    lOpt = []
    bGraph = False
    lGraphRule = []

    for i, sLine in enumerate(lRules, 1):
        if sLine.startswith('#END'):
            # arbitrary end
            printBookmark(0, "BREAK BY #END", i)
            break
        elif sLine.startswith("#"):
            # comment
            pass
        elif sLine.startswith("DEF:"):

            # definition



            m = re.match("DEF: +([a-zA-Z_][a-zA-Z_0-9]*) +(.+)$", sLine.strip())
            if m:
                dDEF["{"+m.group(1)+"}"] = m.group(2)
            else:
                print("Error in definition: ", end="")
                print(sLine.strip())
        elif sLine.startswith("TEST:"):
            # test
            lTest.append("{:<8}".format(i) + "  " + sLine[5:].strip())
        elif sLine.startswith("TODO:"):
            # todo
            pass
        elif sLine.startswith(("OPTGROUP/", "OPTSOFTWARE:", "OPT/", "OPTLANG/", "OPTDEFAULTUILANG:", "OPTLABEL/", "OPTPRIORITY/")):
            # options
            lOpt.append(sLine)


        elif sLine.startswith("!!"):
            # bookmark
            m = re.match("!!+", sLine)
            nExMk = len(m.group(0))
            if sLine[nExMk:].strip():
                printBookmark(nExMk-2, sLine[nExMk:-3].strip(), i)
        # Graph rules
        elif sLine.startswith("@@@@GRAPH:"):
            # rules graph call
            m = re.match(r"@@@@GRAPH: *(\w+)", sLine.strip())
            if m:
                printBookmark(1, "____ GRAPH: " + m.group(1) + " ____", i)
                lRuleLine.append([i, "@@@@"+m.group(1)])
                bGraph = True
            lGraphRule.append([i, sLine])
            bGraph = True
        elif sLine.startswith("@@@@END_GRAPH"):
            #lGraphRule.append([i, sLine])
            bGraph = False
        elif re.match("@@@@ *$", sLine):
            pass
        elif bGraph:
            lGraphRule.append([i, sLine])
        # Regex rules
        elif re.match("[  \t]*$", sLine):
            # empty line
            pass
        elif sLine.startswith(("    ", "\t")):
            # rule (continuation)
            lRuleLine[-1][1] += " " + sLine.strip()
        else:
            # new rule
            lRuleLine.append([i, sLine.strip()])

    # generating options files
    print("  parsing options...")
    try:
        dOptions, dOptPriority = prepareOptions(lOpt)
    except:
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554




555
                        lSentenceRulesJS.append(jsconv.pyRuleToJS(aRule, dJSREGEXES, sWORDLIMITLEFT))

    # creating file with all functions callable by rules
    print("  creating callables...")
    sPyCallables = "# generated code, do not edit\n"
    sJSCallables = "// generated code, do not edit\nconst oEvalFunc = {\n"
    for sFuncName, sReturn in lFUNCTIONS:
        cType = sFuncName[0:1]
        if cType == "c": # condition
            sParams = "s, sx, m, dDA, sCountry, bCondMemo"
        elif cType == "m": # message
            sParams = "s, m"
        elif cType == "s": # suggestion
            sParams = "s, m"
        elif cType == "p": # preprocessor
            sParams = "s, m"
        elif cType == "d": # disambiguator
            sParams = "s, m, dDA"
        else:
            print("# Unknown function type in [" + sFuncName + "]")
            continue
        sPyCallables += "def {} ({}):\n".format(sFuncName, sParams)
        sPyCallables += "    return " + sReturn + "\n"
        sJSCallables += "    {}: function ({})".format(sFuncName, sParams) + " {\n"
        sJSCallables += "        return " + jsconv.py2js(sReturn) + ";\n"
        sJSCallables += "    },\n"
    sJSCallables += "}\n"

    displayStats(lParagraphRules, lSentenceRules)

    print("Unnamed rules: " + str(nRULEWITHOUTNAME))

    d = { "callables": sPyCallables,
          "callablesJS": sJSCallables,
          "gctests": sGCTests,
          "gctestsJS": sGCTestsJS,
          "paragraph_rules": mergeRulesByOption(lParagraphRules),
          "sentence_rules": mergeRulesByOption(lSentenceRules),
          "paragraph_rules_JS": jsconv.writeRulesToJSArray(mergeRulesByOption(lParagraphRulesJS)),
          "sentence_rules_JS": jsconv.writeRulesToJSArray(mergeRulesByOption(lSentenceRulesJS)) }
    d.update(dOptions)





    return d







<
|
|
|

|

|

|
|














|
|
|
|
|
|
|
|
|

>
>
>
>
|
551
552
553
554
555
556
557

558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
                        lSentenceRulesJS.append(jsconv.pyRuleToJS(aRule, dJSREGEXES, sWORDLIMITLEFT))

    # creating file with all functions callable by rules
    print("  creating callables...")
    sPyCallables = "# generated code, do not edit\n"
    sJSCallables = "// generated code, do not edit\nconst oEvalFunc = {\n"
    for sFuncName, sReturn in lFUNCTIONS:

        if sFuncName.startswith("_c_"): # condition
            sParams = "s, sx, m, dTokenPos, sCountry, bCondMemo"
        elif sFuncName.startswith("_m_"): # message
            sParams = "s, m"
        elif sFuncName.startswith("_s_"): # suggestion
            sParams = "s, m"
        elif sFuncName.startswith("_p_"): # preprocessor
            sParams = "s, m"
        elif sFuncName.startswith("_d_"): # disambiguator
            sParams = "s, m, dTokenPos"
        else:
            print("# Unknown function type in [" + sFuncName + "]")
            continue
        sPyCallables += "def {} ({}):\n".format(sFuncName, sParams)
        sPyCallables += "    return " + sReturn + "\n"
        sJSCallables += "    {}: function ({})".format(sFuncName, sParams) + " {\n"
        sJSCallables += "        return " + jsconv.py2js(sReturn) + ";\n"
        sJSCallables += "    },\n"
    sJSCallables += "}\n"

    displayStats(lParagraphRules, lSentenceRules)

    print("Unnamed rules: " + str(nRULEWITHOUTNAME))

    dVars = {   "callables": sPyCallables,
                "callablesJS": sJSCallables,
                "gctests": sGCTests,
                "gctestsJS": sGCTestsJS,
                "paragraph_rules": mergeRulesByOption(lParagraphRules),
                "sentence_rules": mergeRulesByOption(lSentenceRules),
                "paragraph_rules_JS": jsconv.writeRulesToJSArray(mergeRulesByOption(lParagraphRulesJS)),
                "sentence_rules_JS": jsconv.writeRulesToJSArray(mergeRulesByOption(lSentenceRulesJS)) }
    dVars.update(dOptions)

    # compile graph rules
    dVars2 = crg.make(lGraphRule, dDEF, sLang, dOptPriority, bJavaScript)
    dVars.update(dVars2)

    return dVars

Added compile_rules_graph.py version [e2133c86ff].





































































































































































































































































































































































































































































































































































































































































































































































































































>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
"""
Grammalecte: compile rules
Create a Direct Acyclic Rule Graphs (DARGs)
"""

import re
import traceback
import json

import darg


dACTIONS = {}
dFUNCTIONS = {}


def prepareFunction (s):
    "convert simple rule syntax to a string of Python code"
    s = s.replace("__also__", "bCondMemo")
    s = s.replace("__else__", "not bCondMemo")
    s = s.replace("sContext", "_sAppContext")
    s = re.sub(r"(morph|morphVC|analyse|value|displayInfo)[(]\\(\d+)", 'g_\\1(lToken[\\2+nTokenOffset]', s)
    s = re.sub(r"(select|exclude|define|define_from)[(][\\](\d+)", 'g_\\1(lToken[\\2+nTokenOffset]', s)
    s = re.sub(r"(tag_before|tag_after)[(][\\](\d+)", 'g_\\1(lToken[\\2+nTokenOffset], dTags', s)
    s = re.sub(r"space_after[(][\\](\d+)", 'g_space_between_tokens(lToken[\\1+nTokenOffset], lToken[\\1+nTokenOffset+1]', s)
    s = re.sub(r"analyse_with_next[(][\\](\d+)", 'g_merged_analyse(lToken[\\1+nTokenOffset], lToken[\\1+nTokenOffset+1]', s)
    s = re.sub(r"(morph|analyse|value)\(>1", 'g_\\1(lToken[nLastToken+1]', s)                       # next token
    s = re.sub(r"(morph|analyse|value)\(<1", 'g_\\1(lToken[nTokenOffset]', s)                       # previous token
    s = re.sub(r"(morph|analyse|value)\(>(\d+)", 'g_\\1(g_token(lToken, nLastToken+\\2)', s)          # next token
    s = re.sub(r"(morph|analyse|value)\(<(\d+)", 'g_\\1(g_token(lToken, nTokenOffset+1-\\2)', s)      # previous token
    s = re.sub(r"\bspell *[(]", '_oSpellChecker.isValid(', s)
    s = re.sub(r"\bbefore\(\s*", 'look(sSentence[:lToken[1+nTokenOffset]["nStart"]], ', s)          # before(s)
    s = re.sub(r"\bafter\(\s*", 'look(sSentence[lToken[nLastToken]["nEnd"]:], ', s)                 # after(s)
    s = re.sub(r"\bbefore0\(\s*", 'look(sSentence0[:lToken[1+nTokenOffset]["nStart"]], ', s)        # before0(s)
    s = re.sub(r"\bafter0\(\s*", 'look(sSentence[lToken[nLastToken]["nEnd"]:], ', s)                # after0(s)
    s = re.sub(r"[\\](\d+)", 'lToken[\\1+nTokenOffset]["sValue"]', s)
    return s


def genTokenLines (sTokenLine, dDef):
    "tokenize a string and return a list of lines of tokens"
    lToken = sTokenLine.split()
    lTokenLines = None
    for sToken in lToken:
        # optional token?
        bNullPossible = sToken.startswith("?") and sToken.endswith("¿")
        if bNullPossible:
            sToken = sToken[1:-1]
        # token with definition?
        if sToken.startswith("({") and sToken.endswith("})") and sToken[1:-1] in dDef:
            sToken = "(" + dDef[sToken[1:-1]] + ")"
        elif sToken.startswith("{") and sToken.endswith("}") and sToken in dDef:
            sToken = dDef[sToken]
        if ( (sToken.startswith("[") and sToken.endswith("]")) or (sToken.startswith("([") and sToken.endswith("])")) ):
            # multiple token
            bSelectedGroup = sToken.startswith("(") and sToken.endswith(")")
            if bSelectedGroup:
                sToken = sToken[1:-1]
            lNewToken = sToken[1:-1].split("|")
            if not lTokenLines:
                lTokenLines = [ ["("+s+")"]  for s  in lNewToken ]  if bSelectedGroup  else [ [s]  for s  in lNewToken ]
                if bNullPossible:
                    lTokenLines.extend([ []  for i  in range(len(lNewToken)+1) ])
            else:
                lNewTemp = []
                if bNullPossible:
                    for aRule in lTokenLines:
                        for sElem in lNewToken:
                            aNewRule = list(aRule)
                            aNewRule.append(sElem)
                            lNewTemp.append(aNewRule)
                else:
                    sElem1 = lNewToken.pop(0)
                    for aRule in lTokenLines:
                        for sElem in lNewToken:
                            aNewRule = list(aRule)
                            aNewRule.append("(" + sElem + ")"  if bSelectedGroup  else sElem)
                            lNewTemp.append(aNewRule)
                        aRule.append("(" + sElem1 + ")"  if bSelectedGroup  else sElem1)
                lTokenLines.extend(lNewTemp)
        else:
            # simple token
            if not lTokenLines:
                lTokenLines = [[sToken], []]  if bNullPossible  else [[sToken]]
            else:
                if bNullPossible:
                    lNewTemp = []
                    for aRule in lTokenLines:
                        lNew = list(aRule)
                        lNew.append(sToken)
                        lNewTemp.append(lNew)
                    lTokenLines.extend(lNewTemp)
                else:
                    for aRule in lTokenLines:
                        aRule.append(sToken)
    for aRule in lTokenLines:
        yield aRule


def createRule (iLine, sRuleName, sTokenLine, iActionBlock, sActions, nPriority, dOptPriority, dDef):
    "generator: create rule as list"
    # print(iLine, "//", sRuleName, "//", sTokenLine, "//", sActions, "//", nPriority)
    for lToken in genTokenLines(sTokenLine, dDef):
        # Calculate positions
        dPos = {}   # key: iGroup, value: iToken
        iGroup = 0
        #if iLine == 2211: # debug
        #    print(lToken)
        for i, sToken in enumerate(lToken):
            if sToken.startswith("(") and sToken.endswith(")"):
                lToken[i] = sToken[1:-1]
                iGroup += 1
                dPos[iGroup] = i + 1    # we add 1, for we count tokens from 1 to n (not from 0)

        # Parse actions
        for iAction, sAction in enumerate(sActions.split(" <<- ")):
            sAction = sAction.strip()
            if sAction:
                sActionId = sRuleName + "__b" + str(iActionBlock) + "_a" + str(iAction) + "_" + str(len(lToken))
                aAction = createAction(sActionId, sAction, nPriority, dOptPriority, len(lToken), dPos)
                if aAction:
                    dACTIONS[sActionId] = aAction
                    lResult = list(lToken)
                    lResult.extend(["##"+str(iLine), sActionId])
                    yield lResult
                else:
                    print(" # Error on action at line:", iLine)


def changeReferenceToken (sText, dPos):
    "change group reference in <sText> with values in <dPos>"
    for i in range(len(dPos), 0, -1):
        sText = sText.replace("\\"+str(i), "\\"+str(dPos[i]))
    return sText


def checkTokenNumbers (sText, sActionId, nToken):
    "check if token references in <sText> greater than <nToken> (debugging)"
    for x in re.finditer(r"\\(\d+)", sText):
        if int(x.group(1)) > nToken:
            print("# Error in token index at line " + sActionId + " ("+str(nToken)+" tokens only)")
            print(sText)


def checkIfThereIsCode (sText, sActionId):
    "check if there is code in <sText> (debugging)"
    if re.search("[.]\\w+[(]|sugg\\w+[(]|\\([0-9]|\\[[0-9]", sText):
        print("# Warning at line " + sActionId + ":  This message looks like code. Line should probably begin with =")
        print(sText)


def createAction (sActionId, sAction, nPriority, dOptPriority, nToken, dPos):
    "create action rule as a list"
    # Option
    sOption = False
    m = re.match("/(\\w+)/", sAction)
    if m:
        sOption = m.group(1)
        sAction = sAction[m.end():].strip()
    if nPriority == -1:
        nPriority = dOptPriority.get(sOption, 4)
    # valid action?
    m = re.search(r"(?P<action>[-~=/%>])(?P<start>\d+\.?|)(?P<end>:\.?\d+|)>>", sAction)
    if not m:
        print(" # Error. No action found at: ", sActionId)
        return None
    # Condition
    sCondition = sAction[:m.start()].strip()
    if sCondition:
        sCondition = changeReferenceToken(sCondition, dPos)
        sCondition = prepareFunction(sCondition)
        dFUNCTIONS["_g_c_"+sActionId] = sCondition
        sCondition = "_g_c_"+sActionId
    else:
        sCondition = ""
    # Action
    cAction = m.group("action")
    sAction = sAction[m.end():].strip()
    sAction = changeReferenceToken(sAction, dPos)
    if not m.group("start"):
        iStartAction = 1
        iEndAction = 0
    else:
        if cAction != "-" and (m.group("start").endswith(".") or m.group("end").startswith(":.")):
            print(" # Error. Wrong selection on tokens.", sActionId)
            return None
        iStartAction = int(m.group("start"))  if not m.group("start").endswith(".")  else int("-"+m.group("start")[:-1])
        if not m.group("end"):
            iEndAction = iStartAction
        else:
            iEndAction = int(m.group("end")[1:])  if not m.group("end").startswith(":.")  else int("-" + m.group("end")[2:])
    if dPos and m.group("start"):
        try:
            iStartAction = dPos[iStartAction]
            if iEndAction:
                iEndAction = dPos[iEndAction]
        except:
            print("# Error. Wrong groups in: " + sActionId)
            print("  iStartAction:", iStartAction, "iEndAction:", iEndAction)
            print(" ", dPos)

    if cAction == "-":
        ## error
        iMsg = sAction.find(" # ")
        if iMsg == -1:
            sMsg = "# Error. Error message not found."
            sURL = ""
            print(sMsg + " Action id: " + sActionId)
        else:
            sMsg = sAction[iMsg+3:].strip()
            sAction = sAction[:iMsg].strip()
            sURL = ""
            mURL = re.search("[|] *(https?://.*)", sMsg)
            if mURL:
                sURL = mURL.group(1).strip()
                sMsg = sMsg[:mURL.start(0)].strip()
            checkTokenNumbers(sMsg, sActionId, nToken)
            if sMsg[0:1] == "=":
                sMsg = prepareFunction(sMsg[1:])
                dFUNCTIONS["g_m_"+sActionId] = sMsg
                sMsg = "=g_m_"+sActionId
            else:
                checkIfThereIsCode(sMsg, sActionId)

    # checking consistancy
    checkTokenNumbers(sAction, sActionId, nToken)

    if cAction == ">":
        ## no action, break loop if condition is False
        return [sOption, sCondition, cAction, ""]

    if not sAction:
        print("# Error in action at line " + sActionId + ":  This action is empty.")

    if sAction[0:1] != "=" and cAction != "=":
        checkIfThereIsCode(sAction, sActionId)

    if cAction == "-":
        ## error detected --> suggestion
        if sAction[0:1] == "=":
            sAction = prepareFunction(sAction)
            dFUNCTIONS["_g_s_"+sActionId] = sAction[1:]
            sAction = "=_g_s_"+sActionId
        elif sAction.startswith('"') and sAction.endswith('"'):
            sAction = sAction[1:-1]
        if not sMsg:
            print("# Error in action at line " + sActionId + ":  The message is empty.")
        return [sOption, sCondition, cAction, sAction, iStartAction, iEndAction, nPriority, sMsg, sURL]
    elif cAction == "~":
        ## text processor
        if sAction[0:1] == "=":
            sAction = prepareFunction(sAction)
            dFUNCTIONS["_g_p_"+sActionId] = sAction[1:]
            sAction = "=_g_p_"+sActionId
        elif sAction.startswith('"') and sAction.endswith('"'):
            sAction = sAction[1:-1]
        return [sOption, sCondition, cAction, sAction, iStartAction, iEndAction]
    elif cAction == "%" or cAction == "/":
        ## tags
        return [sOption, sCondition, cAction, sAction, iStartAction, iEndAction]
    elif cAction == "=":
        ## disambiguator
        if sAction[0:1] == "=":
            sAction = sAction[1:]
        if "define(" in sAction and not re.search(r"define\(\\\d+ *, *\[.*\] *\)", sAction):
            print("# Error in action at line " + sActionId + ": second argument for <define> must be a list of strings")
        sAction = prepareFunction(sAction)
        dFUNCTIONS["_g_d_"+sActionId] = sAction
        sAction = "_g_d_"+sActionId
        return [sOption, sCondition, cAction, sAction]
    else:
        print(" # Unknown action.", sActionId)
        return None


def make (lRule, dDef, sLang, dOptPriority, bJavaScript):
    "compile rules, returns a dictionary of values"
    # for clarity purpose, don’t create any file here

    # removing comments, zeroing empty lines, creating definitions, storing tests, merging rule lines
    print("  parsing rules...")
    lTokenLine = []
    sActions = ""
    nPriority = -1
    dAllGraph = {}
    sGraphName = ""
    iActionBlock = 0

    for i, sLine in lRule:
        sLine = sLine.rstrip()
        if "\t" in sLine:
            # tabulation not allowed
            print("Error. Tabulation at line: ", i)
            exit()
        elif sLine.startswith("@@@@GRAPH: "):
            # rules graph call
            m = re.match(r"@@@@GRAPH: *(\w+)", sLine.strip())
            if m:
                sGraphName = m.group(1)
                if sGraphName in dAllGraph:
                    print("Error. Graph name " + sGraphName + " already exists.")
                    exit()
                dAllGraph[sGraphName] = []
            else:
                print("Error. Graph name not found at line", i)
                exit()
        elif sLine.startswith("__") and sLine.endswith("__"):
            # new rule group
            m = re.match("__(\\w+)(!\\d|)__", sLine)
            if m:
                sRuleName = m.group(1)
                iActionBlock = 1
                nPriority = int(m.group(2)[1:]) if m.group(2)  else -1
            else:
                print("Syntax error in rule group: ", sLine, " -- line:", i)
                exit()
        elif re.search("^    +<<- ", sLine) or sLine.startswith("        ") \
                or re.search("^    +#", sLine) or re.search(r"^    [-~=>/](?:\d\.?(?::\.?\d+|)|)>> ", sLine) :
            # actions
            sActions += " " + sLine.strip()
        elif re.match("[  ]*$", sLine):
            # empty line to end merging
            if not lTokenLine:
                continue
            if not sActions:
                print("Error. No action found at line:", i)
                exit()
            if not sGraphName:
                print("Error. All rules must belong to a named graph. Line: ", i)
                exit()
            for j, sTokenLine in lTokenLine:
                dAllGraph[sGraphName].append((j, sRuleName, sTokenLine, iActionBlock, sActions, nPriority))
            lTokenLine.clear()
            sActions = ""
            iActionBlock += 1
        elif sLine.startswith(("    ")):
            # tokens
            lTokenLine.append([i, sLine.strip()])
        else:
            print("Unknown line:")
            print(sLine)

    # processing rules
    print("  preparing rules...")
    for sGraphName, lRuleLine in dAllGraph.items():
        lPreparedRule = []
        for i, sRuleGroup, sTokenLine, iActionBlock, sActions, nPriority in lRuleLine:
            for lRule in createRule(i, sRuleGroup, sTokenLine, iActionBlock, sActions, nPriority, dOptPriority, dDef):
                lPreparedRule.append(lRule)
        # Graph creation
        oDARG = darg.DARG(lPreparedRule, sLang)
        dAllGraph[sGraphName] = oDARG.createGraph()
        # Debugging
        if False:
            print("\nRULES:")
            for e in lPreparedRule:
                if e[-2] == "##2211":
                    print(e)
        if False:
            print("\nGRAPH:", sGraphName)
            for k, v in dAllGraph[sGraphName].items():
                print(k, "\t", v)

    # creating file with all functions callable by rules
    print("  creating callables...")
    sPyCallables = "# generated code, do not edit\n"
    #sJSCallables = "// generated code, do not edit\nconst oEvalFunc = {\n"
    for sFuncName, sReturn in dFUNCTIONS.items():
        if sFuncName.startswith("_g_c_"): # condition
            sParams = "lToken, nTokenOffset, nLastToken, sCountry, bCondMemo, dTags, sSentence, sSentence0"
        elif sFuncName.startswith("g_m_"): # message
            sParams = "lToken, nTokenOffset"
        elif sFuncName.startswith("_g_s_"): # suggestion
            sParams = "lToken, nTokenOffset"
        elif sFuncName.startswith("_g_p_"): # preprocessor
            sParams = "lToken, nTokenOffset"
        elif sFuncName.startswith("_g_d_"): # disambiguator
            sParams = "lToken, nTokenOffset"
        else:
            print("# Unknown function type in [" + sFuncName + "]")
            continue
        sPyCallables += "def {} ({}):\n".format(sFuncName, sParams)
        sPyCallables += "    return " + sReturn + "\n"
        #sJSCallables += "    {}: function ({})".format(sFuncName, sParams) + " {\n"
        #sJSCallables += "        return " + jsconv.py2js(sReturn) + ";\n"
        #sJSCallables += "    },\n"
    #sJSCallables += "}\n"

    # Debugging
    if False:
        print("\nActions:")
        for sActionName, aAction in dACTIONS.items():
            print(sActionName, aAction)
        print("\nFunctions:")
        print(sPyCallables)

    # Result
    return {
        "graph_callables": sPyCallables,
        "rules_graphs": dAllGraph,
        "rules_actions": dACTIONS
    }

Modified compile_rules_js_convert.py from [5ad87f3f46] to [9aa0239064].


1

2
3
4
5
6
7
8

# Convert Python code to JavaScript code


import copy
import re
import json


def py2js (sCode):
>
|
>







1
2
3
4
5
6
7
8
9
10
"""
Convert Python code and regexes to JavaScript code
"""

import copy
import re
import json


def py2js (sCode):
114
115
116
117
118
119
120

121



122
123
124
125
126
127
128
129
130
131
132
133
134

135
136

137
138
139
140
141
142
143
144
145
146
147





148
149
150
151
152

153
154
155
156
        sRegex = sRegex + "i"
    if not lNegLookBeforeRegex:
        lNegLookBeforeRegex = None
    return (sRegex, lNegLookBeforeRegex)


def pyRuleToJS (lRule, dJSREGEXES, sWORDLIMITLEFT):

    lRuleJS = copy.deepcopy(lRule)



    del lRule[-1] # tGroups positioning codes are useless for Python
    # error messages
    for aAction in lRuleJS[6]:
        if aAction[1] == "-":
            aAction[2] = aAction[2].replace(" ", " ") # nbsp --> nnbsp
            aAction[4] = aAction[4].replace("« ", "« ").replace(" »", " »").replace(" :", " :").replace(" :", " :")
    # js regexes
    lRuleJS[1], lNegLookBehindRegex = regex2js(dJSREGEXES.get(lRuleJS[3], lRuleJS[1]), sWORDLIMITLEFT)
    lRuleJS.append(lNegLookBehindRegex)
    return lRuleJS


def writeRulesToJSArray (lRules):

    sArray = "[\n"
    for sOption, aRuleGroup in lRules:

        sArray += '  ["' + sOption + '", [\n'  if sOption  else  "  [false, [\n"
        for sRegex, bCaseInsensitive, sLineId, sRuleId, nPriority, lActions, aGroups, aNegLookBehindRegex in aRuleGroup:
            sArray += '    [' + sRegex + ", "
            sArray += "true, " if bCaseInsensitive  else "false, "
            sArray += '"' + sLineId + '", '
            sArray += '"' + sRuleId + '", '
            sArray += str(nPriority) + ", "
            sArray += json.dumps(lActions, ensure_ascii=False) + ", "
            sArray += json.dumps(aGroups, ensure_ascii=False) + ", "
            sArray += json.dumps(aNegLookBehindRegex, ensure_ascii=False) + "],\n"
        sArray += "  ]],\n"





    sArray += "]"
    return sArray


def groupsPositioningCodeToList (sGroupsPositioningCode):

    if not sGroupsPositioningCode:
        return None
    return [ int(sCode)  if sCode.isdigit() or (sCode[0:1] == "-" and sCode[1:].isdigit())  else sCode \
             for sCode in sGroupsPositioningCode.split(",") ]







>

>
>
>













>


>
|
|
|
|
|
|
|
|
|
|
|
>
>
>
>
>





>




116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
        sRegex = sRegex + "i"
    if not lNegLookBeforeRegex:
        lNegLookBeforeRegex = None
    return (sRegex, lNegLookBeforeRegex)


def pyRuleToJS (lRule, dJSREGEXES, sWORDLIMITLEFT):
    "modify Python rules -> JS rules"
    lRuleJS = copy.deepcopy(lRule)
    # graph rules
    if lRuleJS[0] == "@@@@":
        return lRuleJS
    del lRule[-1] # tGroups positioning codes are useless for Python
    # error messages
    for aAction in lRuleJS[6]:
        if aAction[1] == "-":
            aAction[2] = aAction[2].replace(" ", " ") # nbsp --> nnbsp
            aAction[4] = aAction[4].replace("« ", "« ").replace(" »", " »").replace(" :", " :").replace(" :", " :")
    # js regexes
    lRuleJS[1], lNegLookBehindRegex = regex2js(dJSREGEXES.get(lRuleJS[3], lRuleJS[1]), sWORDLIMITLEFT)
    lRuleJS.append(lNegLookBehindRegex)
    return lRuleJS


def writeRulesToJSArray (lRules):
    "create rules as a string of arrays (to be bundled in a JSON string)"
    sArray = "[\n"
    for sOption, aRuleGroup in lRules:
        if sOption != "@@@@":
            sArray += '  ["' + sOption + '", [\n'  if sOption  else  "  [false, [\n"
            for sRegex, bCaseInsensitive, sLineId, sRuleId, nPriority, lActions, aGroups, aNegLookBehindRegex in aRuleGroup:
                sArray += '    [' + sRegex + ", "
                sArray += "true, " if bCaseInsensitive  else "false, "
                sArray += '"' + sLineId + '", '
                sArray += '"' + sRuleId + '", '
                sArray += str(nPriority) + ", "
                sArray += json.dumps(lActions, ensure_ascii=False) + ", "
                sArray += json.dumps(aGroups, ensure_ascii=False) + ", "
                sArray += json.dumps(aNegLookBehindRegex, ensure_ascii=False) + "],\n"
            sArray += "  ]],\n"
        else:
            sArray += '  ["' + sOption + '", [\n'
            for sGraphName, sLineId in aRuleGroup:
                sArray += '    ["' + sGraphName + '", "' + sLineId + '"],\n"'
            sArray += "  ]],\n"
    sArray += "]"
    return sArray


def groupsPositioningCodeToList (sGroupsPositioningCode):
    "convert <sGroupsPositioningCode> to a list of codes (numbers or strings)"
    if not sGroupsPositioningCode:
        return None
    return [ int(sCode)  if sCode.isdigit() or (sCode[0:1] == "-" and sCode[1:].isdigit())  else sCode \
             for sCode in sGroupsPositioningCode.split(",") ]

Added darg.py version [11706e17f5].























































































































































































































































































































































































































































































































>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
#!python3

"""
RULE GRAPH BUILDER
"""

# by Olivier R.
# License: MPL 2

import re
import traceback



class DARG:
    """DIRECT ACYCLIC RULE GRAPH"""
    # This code is inspired from Steve Hanov’s DAWG, 2011. (http://stevehanov.ca/blog/index.php?id=115)

    def __init__ (self, lRule, sLangCode):
        print(" > Direct Acyclic Rule Graph (DARG)", end=" ")

        # Preparing DARG
        self.sLangCode = sLangCode
        self.nRule = len(lRule)
        self.aPreviousRule = []
        Node.resetNextId()
        self.oRoot = Node()
        self.lUncheckedNodes = []  # list of nodes that have not been checked for duplication.
        self.lMinimizedNodes = {}  # list of unique nodes that have been checked for duplication.
        self.nNode = 0
        self.nArc = 0

        # build
        lRule.sort()
        for aRule in lRule:
            self.insert(aRule)
        self.finish()
        self.countNodes()
        self.countArcs()
        self.displayInfo()

    # BUILD DARG
    def insert (self, aRule):
        "insert a new rule (tokens must be inserted in order)"
        if aRule < self.aPreviousRule:
            exit("# Error: tokens must be inserted in order.")

        # find common prefix between word and previous word
        nCommonPrefix = 0
        for i in range(min(len(aRule), len(self.aPreviousRule))):
            if aRule[i] != self.aPreviousRule[i]:
                break
            nCommonPrefix += 1

        # Check the lUncheckedNodes for redundant nodes, proceeding from last
        # one down to the common prefix size. Then truncate the list at that point.
        self._minimize(nCommonPrefix)

        # add the suffix, starting from the correct node mid-way through the graph
        if len(self.lUncheckedNodes) == 0:
            oNode = self.oRoot
        else:
            oNode = self.lUncheckedNodes[-1][2]

        iToken = nCommonPrefix
        for sToken in aRule[nCommonPrefix:]:
            oNextNode = Node()
            oNode.dArcs[sToken] = oNextNode
            self.lUncheckedNodes.append((oNode, sToken, oNextNode))
            if iToken == (len(aRule) - 2):
                oNode.bFinal = True
            iToken += 1
            oNode = oNextNode
        oNode.bFinal = True
        self.aPreviousRule = aRule

    def finish (self):
        "minimize unchecked nodes"
        self._minimize(0)

    def _minimize (self, downTo):
        # proceed from the leaf up to a certain point
        for i in range( len(self.lUncheckedNodes)-1, downTo-1, -1 ):
            oNode, sToken, oChildNode = self.lUncheckedNodes[i]
            if oChildNode in self.lMinimizedNodes:
                # replace the child with the previously encountered one
                oNode.dArcs[sToken] = self.lMinimizedNodes[oChildNode]
            else:
                # add the state to the minimized nodes.
                self.lMinimizedNodes[oChildNode] = oChildNode
            self.lUncheckedNodes.pop()

    def countNodes (self):
        "count nodes within the whole graph"
        self.nNode = len(self.lMinimizedNodes)

    def countArcs (self):
        "count arcs within the whole graph"
        self.nArc = 0
        for oNode in self.lMinimizedNodes:
            self.nArc += len(oNode.dArcs)

    def displayInfo (self):
        "display informations about the rule graph"
        print(": {:>10,} rules,  {:>10,} nodes,  {:>10,} arcs".format(self.nRule, self.nNode, self.nArc))

    def createGraph (self):
        "create the graph as a dictionary"
        dGraph = { 0: self.oRoot.getNodeAsDict() }
        for oNode in self.lMinimizedNodes:
            sHashId = oNode.__hash__()
            if sHashId not in dGraph:
                dGraph[sHashId] = oNode.getNodeAsDict()
            else:
                print("Error. Double node… same id: ", sHashId)
                print(str(oNode.getNodeAsDict()))
        dGraph = self._rewriteKeysOfDARG(dGraph)
        self._checkRegexes(dGraph)
        return dGraph

    def _rewriteKeysOfDARG (self, dGraph):
        "keys of DARG are long numbers (hashes): this function replace these hashes with smaller numbers (to reduce storing size)"
        # create translation dictionary
        dKeyTrans = {}
        for i, nKey in enumerate(dGraph):
            dKeyTrans[nKey] = i
        # replace keys
        dNewGraph = {}
        for nKey, dVal in dGraph.items():
            dNewGraph[dKeyTrans[nKey]] = dVal
        for nKey, dVal in dGraph.items():
            for sArc, val in dVal.items():
                if type(val) is int:
                    dVal[sArc] = dKeyTrans[val]
                else:
                    for sArc, nKey in val.items():
                        val[sArc] = dKeyTrans[nKey]
        return dNewGraph

    def _checkRegexes (self, dGraph):
        "check validity of regexes"
        aRegex = set()
        for nKey, dVal in dGraph.items():
            if "<re_value>" in dVal:
                for sRegex in dVal["<re_value>"]:
                    if sRegex not in aRegex:
                        self._checkRegex(sRegex)
                        aRegex.add(sRegex)
            if "<re_morph>" in dVal:
                for sRegex in dVal["<re_morph>"]:
                    if sRegex not in aRegex:
                        self._checkRegex(sRegex)
                        aRegex.add(sRegex)
        aRegex.clear()

    def _checkRegex (self, sRegex):
        #print(sRegex)
        if "¬" in sRegex:
            sPattern, sNegPattern = sRegex.split("¬")
            try:
                if not sNegPattern:
                    print("# Warning! Empty negpattern:", sRegex)
                re.compile(sPattern)
                if sNegPattern != "*":
                    re.compile(sNegPattern)
            except:
                print("# Error. Wrong regex:", sRegex)
                exit()
        else:
            try:
                if not sRegex:
                    print("# Warning! Empty pattern:", sRegex)
                re.compile(sRegex)
            except:
                print("# Error. Wrong regex:", sRegex)
                exit()


class Node:
    """Node of the rule graph"""

    NextId = 0

    def __init__ (self):
        self.i = Node.NextId
        Node.NextId += 1
        self.bFinal = False
        self.dArcs = {}          # key: arc value; value: a node

    @classmethod
    def resetNextId (cls):
        "reset to 0 the node counter"
        cls.NextId = 0

    def __str__ (self):
        # Caution! this function is used for hashing and comparison!
        cFinal = "1"  if self.bFinal  else "0"
        l = [cFinal]
        for (key, oNode) in self.dArcs.items():
            l.append(str(key))
            l.append(str(oNode.i))
        return "_".join(l)

    def __hash__ (self):
        # Used as a key in a python dictionary.
        return self.__str__().__hash__()

    def __eq__ (self, other):
        # Used as a key in a python dictionary.
        # Nodes are equivalent if they have identical arcs, and each identical arc leads to identical states.
        return self.__str__() == other.__str__()

    def getNodeAsDict (self):
        "returns the node as a dictionary structure"
        dNode = {}
        dReValue = {}
        dReMorph = {}
        dRule = {}
        dLemma = {}
        dMeta = {}
        dTag = {}
        for sArc, oNode in self.dArcs.items():
            if sArc.startswith("@") and len(sArc) > 1:
                dReMorph[sArc[1:]] = oNode.__hash__()
            elif sArc.startswith("~") and len(sArc) > 1:
                dReValue[sArc[1:]] = oNode.__hash__()
            elif sArc.startswith(">") and len(sArc) > 1:
                dLemma[sArc[1:]] = oNode.__hash__()
            elif sArc.startswith("*") and len(sArc) > 1:
                dMeta[sArc[1:]] = oNode.__hash__()
            elif sArc.startswith("/") and len(sArc) > 1:
                dTag[sArc[1:]] = oNode.__hash__()
            elif sArc.startswith("##"):
                dRule[sArc[1:]] = oNode.__hash__()
            else:
                dNode[sArc] = oNode.__hash__()
        if dReValue:
            dNode["<re_value>"] = dReValue
        if dReMorph:
            dNode["<re_morph>"] = dReMorph
        if dLemma:
            dNode["<lemmas>"] = dLemma
        if dTag:
            dNode["<tags>"] = dTag
        if dMeta:
            dNode["<meta>"] = dMeta
        if dRule:
            dNode["<rules>"] = dRule
        #if self.bFinal:
        #    dNode["<final>"] = 1
        return dNode

Modified gc_core/js/lang_core/gc_engine.js from [7ee1350cd7] to [12095116ac].

35
36
37
38
39
40
41
42
43
44
45
46
47
48
49


// data
let _sAppContext = "";                                  // what software is running
let _dOptions = null;
let _aIgnoredRules = new Set();
let _oSpellChecker = null;
let _dAnalyses = new Map();                             // cache for data from dictionary


var gc_engine = {

    //// Informations

    lang: "${lang}",







<







35
36
37
38
39
40
41

42
43
44
45
46
47
48


// data
let _sAppContext = "";                                  // what software is running
let _dOptions = null;
let _aIgnoredRules = new Set();
let _oSpellChecker = null;



var gc_engine = {

    //// Informations

    lang: "${lang}",
325
326
327
328
329
330
331

332
333
334
335
336
337
338
                var spellchecker = require("resource://grammalecte/graphspell/spellchecker.js");
                _oSpellChecker = new spellchecker.SpellChecker("${lang}", "", "${dic_main_filename_js}", "${dic_extended_filename_js}", "${dic_community_filename_js}", "${dic_personal_filename_js}");
            } else {
                _oSpellChecker = new SpellChecker("${lang}", sPath, "${dic_main_filename_js}", "${dic_extended_filename_js}", "${dic_community_filename_js}", "${dic_personal_filename_js}");
            }
            _sAppContext = sContext;
            _dOptions = gc_options.getOptions(sContext).gl_shallowCopy();     // duplication necessary, to be able to reset to default

        }
        catch (e) {
            helpers.logerror(e);
        }
    },

    getSpellChecker: function () {







>







324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
                var spellchecker = require("resource://grammalecte/graphspell/spellchecker.js");
                _oSpellChecker = new spellchecker.SpellChecker("${lang}", "", "${dic_main_filename_js}", "${dic_extended_filename_js}", "${dic_community_filename_js}", "${dic_personal_filename_js}");
            } else {
                _oSpellChecker = new SpellChecker("${lang}", sPath, "${dic_main_filename_js}", "${dic_extended_filename_js}", "${dic_community_filename_js}", "${dic_personal_filename_js}");
            }
            _sAppContext = sContext;
            _dOptions = gc_options.getOptions(sContext).gl_shallowCopy();     // duplication necessary, to be able to reset to default
            _oSpellChecker.activateStorage();
        }
        catch (e) {
            helpers.logerror(e);
        }
    },

    getSpellChecker: function () {
374
375
376
377
378
379
380
381

382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447

448
449
450
451
452
453
454
455
456
457
458

459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484

function displayInfo (dDA, aWord) {
    // for debugging: info of word
    if (!aWord) {
        helpers.echo("> nothing to find");
        return true;
    }
    if (!_dAnalyses.has(aWord[1]) && !_storeMorphFromFSA(aWord[1])) {

        helpers.echo("> not in FSA");
        return true;
    }
    if (dDA.has(aWord[0])) {
        helpers.echo("DA: " + dDA.get(aWord[0]));
    }
    helpers.echo("FSA: " + _dAnalyses.get(aWord[1]));
    return true;
}

function _storeMorphFromFSA (sWord) {
    // retrieves morphologies list from _oSpellChecker -> _dAnalyses
    //helpers.echo("register: "+sWord + " " + _oSpellChecker.getMorph(sWord).toString())
    _dAnalyses.set(sWord, _oSpellChecker.getMorph(sWord));
    return !!_dAnalyses.get(sWord);
}

function morph (dDA, aWord, sPattern, bStrict=true, bNoWord=false) {
    // analyse a tuple (position, word), return true if sPattern in morphologies (disambiguation on)
    if (!aWord) {
        //helpers.echo("morph: noword, returns " + bNoWord);
        return bNoWord;
    }
    //helpers.echo("aWord: "+aWord.toString());
    if (!_dAnalyses.has(aWord[1]) && !_storeMorphFromFSA(aWord[1])) {
        return false;
    }
    let lMorph = dDA.has(aWord[0]) ? dDA.get(aWord[0]) : _dAnalyses.get(aWord[1]);
    //helpers.echo("lMorph: "+lMorph.toString());
    if (lMorph.length === 0) {
        return false;
    }
    //helpers.echo("***");
    if (bStrict) {
        return lMorph.every(s  =>  (s.search(sPattern) !== -1));
    }
    return lMorph.some(s  =>  (s.search(sPattern) !== -1));
}

function morphex (dDA, aWord, sPattern, sNegPattern, bNoWord=false) {
    // analyse a tuple (position, word), returns true if not sNegPattern in word morphologies and sPattern in word morphologies (disambiguation on)
    if (!aWord) {
        //helpers.echo("morph: noword, returns " + bNoWord);
        return bNoWord;
    }
    //helpers.echo("aWord: "+aWord.toString());
    if (!_dAnalyses.has(aWord[1]) && !_storeMorphFromFSA(aWord[1])) {
        return false;
    }
    let lMorph = dDA.has(aWord[0]) ? dDA.get(aWord[0]) : _dAnalyses.get(aWord[1]);
    //helpers.echo("lMorph: "+lMorph.toString());
    if (lMorph.length === 0) {
        return false;
    }
    //helpers.echo("***");
    // check negative condition
    if (lMorph.some(s  =>  (s.search(sNegPattern) !== -1))) {
        return false;
    }
    // search sPattern
    return lMorph.some(s  =>  (s.search(sPattern) !== -1));
}

function analyse (sWord, sPattern, bStrict=true) {
    // analyse a word, return true if sPattern in morphologies (disambiguation off)
    if (!_dAnalyses.has(sWord) && !_storeMorphFromFSA(sWord)) {

        return false;
    }
    if (bStrict) {
        return _dAnalyses.get(sWord).every(s  =>  (s.search(sPattern) !== -1));
    }
    return _dAnalyses.get(sWord).some(s  =>  (s.search(sPattern) !== -1));
}

function analysex (sWord, sPattern, sNegPattern) {
    // analyse a word, returns True if not sNegPattern in word morphologies and sPattern in word morphologies (disambiguation off)
    if (!_dAnalyses.has(sWord) && !_storeMorphFromFSA(sWord)) {

        return false;
    }
    // check negative condition
    if (_dAnalyses.get(sWord).some(s  =>  (s.search(sNegPattern) !== -1))) {
        return false;
    }
    // search sPattern
    return _dAnalyses.get(sWord).some(s  =>  (s.search(sPattern) !== -1));
}

function stem (sWord) {
    // returns a list of sWord's stems
    if (!sWord) {
        return [];
    }
    if (!_dAnalyses.has(sWord) && !_storeMorphFromFSA(sWord)) {
        return [];
    }
    return _dAnalyses.get(sWord).map( s => s.slice(1, s.indexOf(" ")) );
}


//// functions to get text outside pattern scope

// warning: check compile_rules.py to understand how it works








|
>
|





|



<
<
<
<
<
<
<







<
<
<
|


















<
<
<
|















|
>



|

|




|
>



|



|
<
<
<
<
<
<
<
<
<
<
<







374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392







393
394
395
396
397
398
399



400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418



419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456











457
458
459
460
461
462
463

function displayInfo (dDA, aWord) {
    // for debugging: info of word
    if (!aWord) {
        helpers.echo("> nothing to find");
        return true;
    }
    let lMorph = _oSpellChecker.getMorph(aWord[1]);
    if (lMorph.length === 0) {
        helpers.echo("> not in dictionary");
        return true;
    }
    if (dDA.has(aWord[0])) {
        helpers.echo("DA: " + dDA.get(aWord[0]));
    }
    helpers.echo("FSA: " + lMorph);
    return true;
}








function morph (dDA, aWord, sPattern, bStrict=true, bNoWord=false) {
    // analyse a tuple (position, word), return true if sPattern in morphologies (disambiguation on)
    if (!aWord) {
        //helpers.echo("morph: noword, returns " + bNoWord);
        return bNoWord;
    }
    //helpers.echo("aWord: "+aWord.toString());



    let lMorph = dDA.has(aWord[0]) ? dDA.get(aWord[0]) : _oSpellChecker.getMorph(aWord[1]);
    //helpers.echo("lMorph: "+lMorph.toString());
    if (lMorph.length === 0) {
        return false;
    }
    //helpers.echo("***");
    if (bStrict) {
        return lMorph.every(s  =>  (s.search(sPattern) !== -1));
    }
    return lMorph.some(s  =>  (s.search(sPattern) !== -1));
}

function morphex (dDA, aWord, sPattern, sNegPattern, bNoWord=false) {
    // analyse a tuple (position, word), returns true if not sNegPattern in word morphologies and sPattern in word morphologies (disambiguation on)
    if (!aWord) {
        //helpers.echo("morph: noword, returns " + bNoWord);
        return bNoWord;
    }
    //helpers.echo("aWord: "+aWord.toString());



    let lMorph = dDA.has(aWord[0]) ? dDA.get(aWord[0]) : _oSpellChecker.getMorph(aWord[1]);
    //helpers.echo("lMorph: "+lMorph.toString());
    if (lMorph.length === 0) {
        return false;
    }
    //helpers.echo("***");
    // check negative condition
    if (lMorph.some(s  =>  (s.search(sNegPattern) !== -1))) {
        return false;
    }
    // search sPattern
    return lMorph.some(s  =>  (s.search(sPattern) !== -1));
}

function analyse (sWord, sPattern, bStrict=true) {
    // analyse a word, return true if sPattern in morphologies (disambiguation off)
    let lMorph = _oSpellChecker.getMorph(sWord);
    if (lMorph.length === 0) {
        return false;
    }
    if (bStrict) {
        return lMorph.every(s  =>  (s.search(sPattern) !== -1));
    }
    return lMorph.some(s  =>  (s.search(sPattern) !== -1));
}

function analysex (sWord, sPattern, sNegPattern) {
    // analyse a word, returns True if not sNegPattern in word morphologies and sPattern in word morphologies (disambiguation off)
    let lMorph = _oSpellChecker.getMorph(sWord);
    if (lMorph.length === 0) {
        return false;
    }
    // check negative condition
    if (lMorph.some(s  =>  (s.search(sNegPattern) !== -1))) {
        return false;
    }
    // search sPattern
    return lMorph.some(s  =>  (s.search(sPattern) !== -1));











}


//// functions to get text outside pattern scope

// warning: check compile_rules.py to understand how it works

563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
function select (dDA, nPos, sWord, sPattern, lDefault=null) {
    if (!sWord) {
        return true;
    }
    if (dDA.has(nPos)) {
        return true;
    }
    if (!_dAnalyses.has(sWord) && !_storeMorphFromFSA(sWord)) {
        return true;
    }
    if (_dAnalyses.get(sWord).length === 1) {
        return true;
    }
    let lSelect = _dAnalyses.get(sWord).filter( sMorph => sMorph.search(sPattern) !== -1 );
    if (lSelect.length > 0) {
        if (lSelect.length != _dAnalyses.get(sWord).length) {
            dDA.set(nPos, lSelect);
        }
    } else if (lDefault) {
        dDA.set(nPos, lDefaul);
    }
    return true;
}

function exclude (dDA, nPos, sWord, sPattern, lDefault=null) {
    if (!sWord) {
        return true;
    }
    if (dDA.has(nPos)) {
        return true;
    }
    if (!_dAnalyses.has(sWord) && !_storeMorphFromFSA(sWord)) {
        return true;
    }
    if (_dAnalyses.get(sWord).length === 1) {
        return true;
    }
    let lSelect = _dAnalyses.get(sWord).filter( sMorph => sMorph.search(sPattern) === -1 );
    if (lSelect.length > 0) {
        if (lSelect.length != _dAnalyses.get(sWord).length) {
            dDA.set(nPos, lSelect);
        }
    } else if (lDefault) {
        dDA.set(nPos, lDefault);
    }
    return true;
}







|
<
<
|


|

|















|
<
<
|


|

|







542
543
544
545
546
547
548
549


550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571


572
573
574
575
576
577
578
579
580
581
582
583
584
function select (dDA, nPos, sWord, sPattern, lDefault=null) {
    if (!sWord) {
        return true;
    }
    if (dDA.has(nPos)) {
        return true;
    }
    let lMorph = _oSpellChecker.getMorph(sWord);


    if (lMorph.length === 0  ||  lMorph.length === 1) {
        return true;
    }
    let lSelect = lMorph.filter( sMorph => sMorph.search(sPattern) !== -1 );
    if (lSelect.length > 0) {
        if (lSelect.length != lMorph.length) {
            dDA.set(nPos, lSelect);
        }
    } else if (lDefault) {
        dDA.set(nPos, lDefaul);
    }
    return true;
}

function exclude (dDA, nPos, sWord, sPattern, lDefault=null) {
    if (!sWord) {
        return true;
    }
    if (dDA.has(nPos)) {
        return true;
    }
    let lMorph = _oSpellChecker.getMorph(sWord);


    if (lMorph.length === 0  ||  lMorph.length === 1) {
        return true;
    }
    let lSelect = lMorph.filter( sMorph => sMorph.search(sPattern) === -1 );
    if (lSelect.length > 0) {
        if (lSelect.length != lMorph.length) {
            dDA.set(nPos, lSelect);
        }
    } else if (lDefault) {
        dDA.set(nPos, lDefault);
    }
    return true;
}

Modified gc_core/py/__init__.py from [aeadedff14] to [49f46a05ff].




1
2




from .grammar_checker import *
>
>
>


1
2
3
4
5
"""
Grammar checker
"""

from .grammar_checker import *

Modified gc_core/py/grammar_checker.py from [79ce1061e8] to [634e5c7c61].


1
2

3
4
5
6
7
8
9
10

11
12
13
14
15
16
17
18
19
20
21
22
23
24

25
26
27

28
29
30

31
32
33
34
35
36

37
38
39
40
41
42

43
44
45
46
47
48
49
50
51

52
53
54

55
56
57

58
59
60
61
62
63

64
65
66
67
68
69
70
71
72
73

# Grammalecte
# Main class: wrapper


import importlib
import json

from . import text


class GrammarChecker:


    def __init__ (self, sLangCode, sContext="Python"):
        self.sLangCode = sLangCode
        # Grammar checker engine
        self.gce = importlib.import_module("."+sLangCode, "grammalecte")
        self.gce.load(sContext)
        # Spell checker
        self.oSpellChecker = self.gce.getSpellChecker()
        # Lexicographer
        self.oLexicographer = None
        # Text formatter
        self.oTextFormatter = None

    def getGCEngine (self):

        return self.gce

    def getSpellChecker (self):

        return self.oSpellChecker

    def getTextFormatter (self):

        if self.oTextFormatter == None:
            self.tf = importlib.import_module("."+self.sLangCode+".textformatter", "grammalecte")
        self.oTextFormatter = self.tf.TextFormatter()
        return self.oTextFormatter

    def getLexicographer (self):

        if self.oLexicographer == None:
            self.lxg = importlib.import_module("."+self.sLangCode+".lexicographe", "grammalecte")
        self.oLexicographer = self.lxg.Lexicographe(self.oSpellChecker)
        return self.oLexicographer

    def displayGCOptions (self):

        self.gce.displayOptions()

    def getParagraphErrors (self, sText, dOptions=None, bContext=False, bSpellSugg=False, bDebug=False):
        "returns a tuple: (grammar errors, spelling errors)"
        aGrammErrs = self.gce.parse(sText, "FR", bDebug=bDebug, dOptions=dOptions, bContext=bContext)
        aSpellErrs = self.oSpellChecker.parseParagraph(sText, bSpellSugg)
        return aGrammErrs, aSpellErrs

    def generateText (self, sText, bEmptyIfNoErrors=False, bSpellSugg=False, nWidth=100, bDebug=False):

        pass

    def generateTextAsJSON (self, sText, bContext=False, bEmptyIfNoErrors=False, bSpellSugg=False, bReturnText=False, bDebug=False):

        pass

    def generateParagraph (self, sText, dOptions=None, bEmptyIfNoErrors=False, bSpellSugg=False, nWidth=100, bDebug=False):

        aGrammErrs, aSpellErrs = self.getParagraphErrors(sText, dOptions, False, bSpellSugg, bDebug)
        if bEmptyIfNoErrors and not aGrammErrs and not aSpellErrs:
            return ""
        return text.generateParagraph(sText, aGrammErrs, aSpellErrs, nWidth)

    def generateParagraphAsJSON (self, iIndex, sText, dOptions=None, bContext=False, bEmptyIfNoErrors=False, bSpellSugg=False, bReturnText=False, lLineSet=None, bDebug=False):

        aGrammErrs, aSpellErrs = self.getParagraphErrors(sText, dOptions, bContext, bSpellSugg, bDebug)
        aGrammErrs = list(aGrammErrs)
        if bEmptyIfNoErrors and not aGrammErrs and not aSpellErrs:
            return ""
        if lLineSet:
            aGrammErrs, aSpellErrs = text.convertToXY(aGrammErrs, aSpellErrs, lLineSet)
            return json.dumps({ "lGrammarErrors": aGrammErrs, "lSpellingErrors": aSpellErrs }, ensure_ascii=False)
        if bReturnText:
            return json.dumps({ "iParagraph": iIndex, "sText": sText, "lGrammarErrors": aGrammErrs, "lSpellingErrors": aSpellErrs }, ensure_ascii=False)
        return json.dumps({ "iParagraph": iIndex, "lGrammarErrors": aGrammErrs, "lSpellingErrors": aSpellErrs }, ensure_ascii=False)
>
|
<
>








>














>



>



>
|
|
|



>
|
|
|



>









>



>



>






>










1
2

3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
"""
Grammalecte, grammar checker

"""

import importlib
import json

from . import text


class GrammarChecker:
    "GrammarChecker: Wrapper for the grammar checker engine"

    def __init__ (self, sLangCode, sContext="Python"):
        self.sLangCode = sLangCode
        # Grammar checker engine
        self.gce = importlib.import_module("."+sLangCode, "grammalecte")
        self.gce.load(sContext)
        # Spell checker
        self.oSpellChecker = self.gce.getSpellChecker()
        # Lexicographer
        self.oLexicographer = None
        # Text formatter
        self.oTextFormatter = None

    def getGCEngine (self):
        "return the grammar checker object"
        return self.gce

    def getSpellChecker (self):
        "return the spell checker object"
        return self.oSpellChecker

    def getTextFormatter (self):
        "load and return the text formatter"
        if self.oTextFormatter is None:
            tf = importlib.import_module("."+self.sLangCode+".textformatter", "grammalecte")
            self.oTextFormatter = tf.TextFormatter()
        return self.oTextFormatter

    def getLexicographer (self):
        "load and return the lexicographer"
        if self.oLexicographer is None:
            lxg = importlib.import_module("."+self.sLangCode+".lexicographe", "grammalecte")
            self.oLexicographer = lxg.Lexicographe(self.oSpellChecker)
        return self.oLexicographer

    def displayGCOptions (self):
        "display the grammar checker options"
        self.gce.displayOptions()

    def getParagraphErrors (self, sText, dOptions=None, bContext=False, bSpellSugg=False, bDebug=False):
        "returns a tuple: (grammar errors, spelling errors)"
        aGrammErrs = self.gce.parse(sText, "FR", bDebug=bDebug, dOptions=dOptions, bContext=bContext)
        aSpellErrs = self.oSpellChecker.parseParagraph(sText, bSpellSugg)
        return aGrammErrs, aSpellErrs

    def generateText (self, sText, bEmptyIfNoErrors=False, bSpellSugg=False, nWidth=100, bDebug=False):
        "[todo]"
        pass

    def generateTextAsJSON (self, sText, bContext=False, bEmptyIfNoErrors=False, bSpellSugg=False, bReturnText=False, bDebug=False):
        "[todo]"
        pass

    def generateParagraph (self, sText, dOptions=None, bEmptyIfNoErrors=False, bSpellSugg=False, nWidth=100, bDebug=False):
        "parse text and return a readable text with underline errors"
        aGrammErrs, aSpellErrs = self.getParagraphErrors(sText, dOptions, False, bSpellSugg, bDebug)
        if bEmptyIfNoErrors and not aGrammErrs and not aSpellErrs:
            return ""
        return text.generateParagraph(sText, aGrammErrs, aSpellErrs, nWidth)

    def generateParagraphAsJSON (self, iIndex, sText, dOptions=None, bContext=False, bEmptyIfNoErrors=False, bSpellSugg=False, bReturnText=False, lLineSet=None, bDebug=False):
        "parse text and return errors as a JSON string"
        aGrammErrs, aSpellErrs = self.getParagraphErrors(sText, dOptions, bContext, bSpellSugg, bDebug)
        aGrammErrs = list(aGrammErrs)
        if bEmptyIfNoErrors and not aGrammErrs and not aSpellErrs:
            return ""
        if lLineSet:
            aGrammErrs, aSpellErrs = text.convertToXY(aGrammErrs, aSpellErrs, lLineSet)
            return json.dumps({ "lGrammarErrors": aGrammErrs, "lSpellingErrors": aSpellErrs }, ensure_ascii=False)
        if bReturnText:
            return json.dumps({ "iParagraph": iIndex, "sText": sText, "lGrammarErrors": aGrammErrs, "lSpellingErrors": aSpellErrs }, ensure_ascii=False)
        return json.dumps({ "iParagraph": iIndex, "lGrammarErrors": aGrammErrs, "lSpellingErrors": aSpellErrs }, ensure_ascii=False)

Modified gc_core/py/lang_core/gc_engine.py from [72ecd7c680] to [d9f8c3ac90].


1
2

3
4
5
6
7
8
9
10
11
12
13













14
15
16
17
18
19
20

# Grammalecte
# Grammar checker engine


import re
import sys
import os
import traceback
#import unicodedata
from itertools import chain

from ..graphspell.spellchecker import SpellChecker
from ..graphspell.echo import echo
from . import gc_options















__all__ = [ "lang", "locales", "pkg", "name", "version", "author", \
            "load", "parse", "getSpellChecker", \
            "setOption", "setOptions", "getOptions", "getDefaultOptions", "getOptionsLabels", "resetOptions", "displayOptions", \
            "ignoreRule", "resetIgnoreRules", "reactivateRule", "listRules", "displayRules" ]

>
|
|
>











>
>
>
>
>
>
>
>
>
>
>
>
>







1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
"""
Grammalecte
Grammar checker engine
"""

import re
import sys
import os
import traceback
#import unicodedata
from itertools import chain

from ..graphspell.spellchecker import SpellChecker
from ..graphspell.echo import echo
from . import gc_options

from ..graphspell.tokenizer import Tokenizer
from .gc_rules_graph import dAllGraph, dRule

try:
    # LibreOffice / OpenOffice
    from com.sun.star.linguistic2 import SingleProofreadingError
    from com.sun.star.text.TextMarkupType import PROOFREADING
    from com.sun.star.beans import PropertyValue
    #import lightproof_handler_${implname} as opt
    _bWriterError = True
except ImportError:
    _bWriterError = False


__all__ = [ "lang", "locales", "pkg", "name", "version", "author", \
            "load", "parse", "getSpellChecker", \
            "setOption", "setOptions", "getOptions", "getDefaultOptions", "getOptionsLabels", "resetOptions", "displayOptions", \
            "ignoreRule", "resetIgnoreRules", "reactivateRule", "listRules", "displayRules" ]

29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290

291
292
293

294
295
296
297


298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357

358
359
360



















































361


362
363
























































































































































































































364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391

392
393

394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425

426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452

453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475



476
477
478
479
480
481
482
author = "${author}"

_rules = None                               # module gc_rules

# data
_sAppContext = ""                           # what software is running
_dOptions = None
_aIgnoredRules = set()
_oSpellChecker = None
_dAnalyses = {}                             # cache for data from dictionary



#### Parsing

def parse (sText, sCountry="${country_default}", bDebug=False, dOptions=None, bContext=False):
    "analyses the paragraph sText and returns list of errors"
    #sText = unicodedata.normalize("NFC", sText)
    aErrors = None
    sAlt = sText
    dDA = {}        # Disambiguisator. Key = position; value = list of morphologies
    dPriority = {}  # Key = position; value = priority
    dOpt = _dOptions  if not dOptions  else dOptions

    # parse paragraph
    try:
        sNew, aErrors = _proofread(sText, sAlt, 0, True, dDA, dPriority, sCountry, dOpt, bDebug, bContext)
        if sNew:
            sText = sNew
    except:
        raise

    # cleanup
    if " " in sText:
        sText = sText.replace(" ", ' ') # nbsp
    if " " in sText:
        sText = sText.replace(" ", ' ') # nnbsp
    if "'" in sText:
        sText = sText.replace("'", "’")
    if "‑" in sText:
        sText = sText.replace("‑", "-") # nobreakdash

    # parse sentences
    for iStart, iEnd in _getSentenceBoundaries(sText):
        if 4 < (iEnd - iStart) < 2000:
            dDA.clear()
            try:
                _, errs = _proofread(sText[iStart:iEnd], sAlt[iStart:iEnd], iStart, False, dDA, dPriority, sCountry, dOpt, bDebug, bContext)
                aErrors.update(errs)
            except:
                raise
    return aErrors.values() # this is a view (iterable)


def _getSentenceBoundaries (sText):
    iStart = _zBeginOfParagraph.match(sText).end()
    for m in _zEndOfSentence.finditer(sText):
        yield (iStart, m.end())
        iStart = m.end()


def _proofread (s, sx, nOffset, bParagraph, dDA, dPriority, sCountry, dOptions, bDebug, bContext):
    dErrs = {}
    bChange = False
    bIdRule = option('idrule')

    for sOption, lRuleGroup in _getRules(bParagraph):
        if not sOption or dOptions.get(sOption, False):
            for zRegex, bUppercase, sLineId, sRuleId, nPriority, lActions in lRuleGroup:
                if sRuleId not in _aIgnoredRules:
                    for m in zRegex.finditer(s):
                        bCondMemo = None
                        for sFuncCond, cActionType, sWhat, *eAct in lActions:
                            # action in lActions: [ condition, action type, replacement/suggestion/action[, iGroup[, message, URL]] ]
                            try:
                                bCondMemo = not sFuncCond or globals()[sFuncCond](s, sx, m, dDA, sCountry, bCondMemo)
                                if bCondMemo:
                                    if cActionType == "-":
                                        # grammar error
                                        nErrorStart = nOffset + m.start(eAct[0])
                                        if nErrorStart not in dErrs or nPriority > dPriority[nErrorStart]:
                                            dErrs[nErrorStart] = _createError(s, sx, sWhat, nOffset, m, eAct[0], sLineId, sRuleId, bUppercase, eAct[1], eAct[2], bIdRule, sOption, bContext)
                                            dPriority[nErrorStart] = nPriority
                                    elif cActionType == "~":
                                        # text processor
                                        s = _rewrite(s, sWhat, eAct[0], m, bUppercase)
                                        bChange = True
                                        if bDebug:
                                            echo("~ " + s + "  -- " + m.group(eAct[0]) + "  # " + sLineId)
                                    elif cActionType == "=":
                                        # disambiguation
                                        globals()[sWhat](s, m, dDA)
                                        if bDebug:
                                            echo("= " + m.group(0) + "  # " + sLineId + "\nDA: " + str(dDA))
                                    elif cActionType == ">":
                                        # we do nothing, this test is just a condition to apply all following actions
                                        pass
                                    else:
                                        echo("# error: unknown action at " + sLineId)
                                elif cActionType == ">":
                                    break
                            except Exception as e:
                                raise Exception(str(e), "# " + sLineId + " # " + sRuleId)
    if bChange:
        return (s, dErrs)
    return (False, dErrs)


def _createWriterError (s, sx, sRepl, nOffset, m, iGroup, sLineId, sRuleId, bUppercase, sMsg, sURL, bIdRule, sOption, bContext):
    "error for Writer (LO/OO)"
    xErr = SingleProofreadingError()
    #xErr = uno.createUnoStruct( "com.sun.star.linguistic2.SingleProofreadingError" )
    xErr.nErrorStart = nOffset + m.start(iGroup)
    xErr.nErrorLength = m.end(iGroup) - m.start(iGroup)
    xErr.nErrorType = PROOFREADING
    xErr.aRuleIdentifier = sRuleId
    # suggestions
    if sRepl[0:1] == "=":
        sugg = globals()[sRepl[1:]](s, m)
        if sugg:
            if bUppercase and m.group(iGroup)[0:1].isupper():
                xErr.aSuggestions = tuple(map(str.capitalize, sugg.split("|")))
            else:
                xErr.aSuggestions = tuple(sugg.split("|"))
        else:
            xErr.aSuggestions = ()
    elif sRepl == "_":
        xErr.aSuggestions = ()
    else:
        if bUppercase and m.group(iGroup)[0:1].isupper():
            xErr.aSuggestions = tuple(map(str.capitalize, m.expand(sRepl).split("|")))
        else:
            xErr.aSuggestions = tuple(m.expand(sRepl).split("|"))
    # Message
    if sMsg[0:1] == "=":
        sMessage = globals()[sMsg[1:]](s, m)
    else:
        sMessage = m.expand(sMsg)
    xErr.aShortComment = sMessage   # sMessage.split("|")[0]     # in context menu
    xErr.aFullComment = sMessage   # sMessage.split("|")[-1]    # in dialog
    if bIdRule:
        xErr.aShortComment += "  # " + sLineId + " # " + sRuleId
    # URL
    if sURL:
        p = PropertyValue()
        p.Name = "FullCommentURL"
        p.Value = sURL
        xErr.aProperties = (p,)
    else:
        xErr.aProperties = ()
    return xErr


def _createDictError (s, sx, sRepl, nOffset, m, iGroup, sLineId, sRuleId, bUppercase, sMsg, sURL, bIdRule, sOption, bContext):
    "error as a dictionary"
    dErr = {}
    dErr["nStart"] = nOffset + m.start(iGroup)
    dErr["nEnd"] = nOffset + m.end(iGroup)
    dErr["sLineId"] = sLineId
    dErr["sRuleId"] = sRuleId
    dErr["sType"] = sOption  if sOption  else "notype"
    # suggestions
    if sRepl[0:1] == "=":
        sugg = globals()[sRepl[1:]](s, m)
        if sugg:
            if bUppercase and m.group(iGroup)[0:1].isupper():
                dErr["aSuggestions"] = list(map(str.capitalize, sugg.split("|")))
            else:
                dErr["aSuggestions"] = sugg.split("|")
        else:
            dErr["aSuggestions"] = ()
    elif sRepl == "_":
        dErr["aSuggestions"] = ()
    else:
        if bUppercase and m.group(iGroup)[0:1].isupper():
            dErr["aSuggestions"] = list(map(str.capitalize, m.expand(sRepl).split("|")))
        else:
            dErr["aSuggestions"] = m.expand(sRepl).split("|")
    # Message
    if sMsg[0:1] == "=":
        sMessage = globals()[sMsg[1:]](s, m)
    else:
        sMessage = m.expand(sMsg)
    dErr["sMessage"] = sMessage
    if bIdRule:
        dErr["sMessage"] += "  # " + sLineId + " # " + sRuleId
    # URL
    dErr["URL"] = sURL  if sURL  else ""
    # Context
    if bContext:
        dErr['sUnderlined'] = sx[m.start(iGroup):m.end(iGroup)]
        dErr['sBefore'] = sx[max(0,m.start(iGroup)-80):m.start(iGroup)]
        dErr['sAfter'] = sx[m.end(iGroup):m.end(iGroup)+80]
    return dErr


def _rewrite (s, sRepl, iGroup, m, bUppercase):
    "text processor: write sRepl in s at iGroup position"
    nLen = m.end(iGroup) - m.start(iGroup)
    if sRepl == "*":
        sNew = " " * nLen
    elif sRepl == ">" or sRepl == "_" or sRepl == "~":
        sNew = sRepl + " " * (nLen-1)
    elif sRepl == "@":
        sNew = "@" * nLen
    elif sRepl[0:1] == "=":
        sNew = globals()[sRepl[1:]](s, m)
        sNew = sNew + " " * (nLen-len(sNew))
        if bUppercase and m.group(iGroup)[0:1].isupper():
            sNew = sNew.capitalize()
    else:
        sNew = m.expand(sRepl)
        sNew = sNew + " " * (nLen-len(sNew))
    return s[0:m.start(iGroup)] + sNew + s[m.end(iGroup):]


def ignoreRule (sRuleId):
    _aIgnoredRules.add(sRuleId)


def resetIgnoreRules ():
    _aIgnoredRules.clear()


def reactivateRule (sRuleId):
    _aIgnoredRules.discard(sRuleId)


def listRules (sFilter=None):
    "generator: returns typle (sOption, sLineId, sRuleId)"
    if sFilter:
        try:
            zFilter = re.compile(sFilter)
        except:
            echo("# Error. List rules: wrong regex.")
            sFilter = None
    for sOption, lRuleGroup in chain(_getRules(True), _getRules(False)):
        for _, _, sLineId, sRuleId, _, _ in lRuleGroup:
            if not sFilter or zFilter.search(sRuleId):
                yield (sOption, sLineId, sRuleId)


def displayRules (sFilter=None):
    echo("List of rules. Filter: << " + str(sFilter) + " >>")
    for sOption, sLineId, sRuleId in listRules(sFilter):
        echo("{:<10} {:<10} {}".format(sOption, sLineId, sRuleId))


#### init

try:
    # LibreOffice / OpenOffice
    from com.sun.star.linguistic2 import SingleProofreadingError
    from com.sun.star.text.TextMarkupType import PROOFREADING
    from com.sun.star.beans import PropertyValue
    #import lightproof_handler_${implname} as opt
    _createError = _createWriterError
except ImportError:
    _createError = _createDictError


def load (sContext="Python"):

    global _oSpellChecker
    global _sAppContext
    global _dOptions

    try:
        _oSpellChecker = SpellChecker("${lang}", "${dic_main_filename_py}", "${dic_extended_filename_py}", "${dic_community_filename_py}", "${dic_personal_filename_py}")
        _sAppContext = sContext
        _dOptions = dict(gc_options.getOptions(sContext))   # duplication necessary, to be able to reset to default


    except:
        traceback.print_exc()


def setOption (sOpt, bVal):
    if sOpt in _dOptions:
        _dOptions[sOpt] = bVal


def setOptions (dOpt):
    for sKey, bVal in dOpt.items():
        if sKey in _dOptions:
            _dOptions[sKey] = bVal


def getOptions ():
    return _dOptions


def getDefaultOptions ():
    return dict(gc_options.getOptions(_sAppContext))


def getOptionsLabels (sLang):
    return gc_options.getUI(sLang)


def displayOptions (sLang):
    echo("List of options")
    echo("\n".join( [ k+":\t"+str(v)+"\t"+gc_options.getUI(sLang).get(k, ("?", ""))[0]  for k, v  in sorted(_dOptions.items()) ] ))
    echo("")


def resetOptions ():
    global _dOptions
    _dOptions = dict(gc_options.getOptions(_sAppContext))


def getSpellChecker ():
    return _oSpellChecker


def _getRules (bParagraph):
    try:
        if not bParagraph:
            return _rules.lSentenceRules
        return _rules.lParagraphRules
    except:
        _loadRules()
    if not bParagraph:
        return _rules.lSentenceRules
    return _rules.lParagraphRules


def _loadRules ():
    from . import gc_rules
    global _rules
    _rules = gc_rules
    # compile rules regex
    for lRuleGroup in chain(_rules.lParagraphRules, _rules.lSentenceRules):

        for rule in lRuleGroup[1]:
            try:
                rule[0] = re.compile(rule[0])



















































            except:


                echo("Bad regular expression in # " + str(rule[2]))
                rule[0] = "(?i)<Grammalecte>"


























































































































































































































def _getPath ():
    return os.path.join(os.path.dirname(sys.modules[__name__].__file__), __name__ + ".py")



#### common functions

# common regexes
_zEndOfSentence = re.compile('([.?!:;…][ .?!… »”")]*|.$)')
_zBeginOfParagraph = re.compile("^\W*")
_zEndOfParagraph = re.compile("\W*$")
_zNextWord = re.compile(" +(\w[\w-]*)")
_zPrevWord = re.compile("(\w[\w-]*) +$")


def option (sOpt):
    "return True if option sOpt is active"
    return _dOptions.get(sOpt, False)


def displayInfo (dDA, tWord):
    "for debugging: retrieve info of word"
    if not tWord:
        echo("> nothing to find")
        return True
    if tWord[1] not in _dAnalyses and not _storeMorphFromFSA(tWord[1]):

        echo("> not in FSA")
        return True

    if tWord[0] in dDA:
        echo("DA: " + str(dDA[tWord[0]]))
    echo("FSA: " + str(_dAnalyses[tWord[1]]))
    return True


def _storeMorphFromFSA (sWord):
    "retrieves morphologies list from _oSpellChecker -> _dAnalyses"
    global _dAnalyses
    _dAnalyses[sWord] = _oSpellChecker.getMorph(sWord)
    return True  if _dAnalyses[sWord]  else False


def morph (dDA, tWord, sPattern, bStrict=True, bNoWord=False):
    "analyse a tuple (position, word), return True if sPattern in morphologies (disambiguation on)"
    if not tWord:
        return bNoWord
    if tWord[1] not in _dAnalyses and not _storeMorphFromFSA(tWord[1]):
        return False
    lMorph = dDA[tWord[0]]  if tWord[0] in dDA  else _dAnalyses[tWord[1]]
    if not lMorph:
        return False
    p = re.compile(sPattern)
    if bStrict:
        return all(p.search(s)  for s in lMorph)
    return any(p.search(s)  for s in lMorph)


def morphex (dDA, tWord, sPattern, sNegPattern, bNoWord=False):
    "analyse a tuple (position, word), returns True if not sNegPattern in word morphologies and sPattern in word morphologies (disambiguation on)"
    if not tWord:
        return bNoWord

    if tWord[1] not in _dAnalyses and not _storeMorphFromFSA(tWord[1]):
        return False
    lMorph = dDA[tWord[0]]  if tWord[0] in dDA  else _dAnalyses[tWord[1]]
    # check negative condition
    np = re.compile(sNegPattern)
    if any(np.search(s)  for s in lMorph):
        return False
    # search sPattern
    p = re.compile(sPattern)
    return any(p.search(s)  for s in lMorph)


def analyse (sWord, sPattern, bStrict=True):
    "analyse a word, return True if sPattern in morphologies (disambiguation off)"
    if sWord not in _dAnalyses and not _storeMorphFromFSA(sWord):
        return False
    if not _dAnalyses[sWord]:
        return False
    p = re.compile(sPattern)
    if bStrict:
        return all(p.search(s)  for s in _dAnalyses[sWord])
    return any(p.search(s)  for s in _dAnalyses[sWord])


def analysex (sWord, sPattern, sNegPattern):
    "analyse a word, returns True if not sNegPattern in word morphologies and sPattern in word morphologies (disambiguation off)"
    if sWord not in _dAnalyses and not _storeMorphFromFSA(sWord):

        return False
    # check negative condition
    np = re.compile(sNegPattern)
    if any(np.search(s)  for s in _dAnalyses[sWord]):
        return False
    # search sPattern
    p = re.compile(sPattern)
    return any(p.search(s)  for s in _dAnalyses[sWord])


def stem (sWord):
    "returns a list of sWord's stems"
    if not sWord:
        return []
    if sWord not in _dAnalyses and not _storeMorphFromFSA(sWord):
        return []
    return [ s[1:s.find(" ")]  for s in _dAnalyses[sWord] ]


## functions to get text outside pattern scope

# warning: check compile_rules.py to understand how it works




def nextword (s, iStart, n):
    "get the nth word of the input string or empty string"
    m = re.match("(?: +[\\w%-]+){" + str(n-1) + "} +([\\w%-]+)", s[iStart:])
    if not m:
        return None
    return (iStart+m.start(1), m.group(1))








<

<
<
<
<
<
<
<
<
<
|
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
|
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<

<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<


<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
|
<
<
<
<
<
<
<
<
<
<
<


>



>




>
>



<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<


















|
>
|
|
|
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>

>
>
|
|
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>









<
<
<
<
<
<
<
<

|



|




|
>
|

>
|
|
|



<
<
<
<
<
<
<
|



<
<
|


|

|
|


|



>
|

<

|
|


|
|




|
<
|

|

|
|




|
>


|
|


|
|

<
<
<
<
<
<
<
<






>
>
>







44
45
46
47
48
49
50

51









52


















































53




































54












































55
56































































































57











58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73






































74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377








378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398







399
400
401
402


403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418

419
420
421
422
423
424
425
426
427
428
429
430

431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451








452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
author = "${author}"

_rules = None                               # module gc_rules

# data
_sAppContext = ""                           # what software is running
_dOptions = None

_oSpellChecker = None









_oTokenizer = None


















































_aIgnoredRules = set()


















































































































































































#### Initialization












def load (sContext="Python"):
    "initialization of the grammar checker"
    global _oSpellChecker
    global _sAppContext
    global _dOptions
    global _oTokenizer
    try:
        _oSpellChecker = SpellChecker("${lang}", "${dic_main_filename_py}", "${dic_extended_filename_py}", "${dic_community_filename_py}", "${dic_personal_filename_py}")
        _sAppContext = sContext
        _dOptions = dict(gc_options.getOptions(sContext))   # duplication necessary, to be able to reset to default
        _oTokenizer = _oSpellChecker.getTokenizer()
        _oSpellChecker.activateStorage()
    except:
        traceback.print_exc()








































def _getRules (bParagraph):
    try:
        if not bParagraph:
            return _rules.lSentenceRules
        return _rules.lParagraphRules
    except:
        _loadRules()
    if not bParagraph:
        return _rules.lSentenceRules
    return _rules.lParagraphRules


def _loadRules ():
    from . import gc_rules
    global _rules
    _rules = gc_rules
    # compile rules regex
    for sOption, lRuleGroup in chain(_rules.lParagraphRules, _rules.lSentenceRules):
        if sOption != "@@@@":
            for aRule in lRuleGroup:
                try:
                    aRule[0] = re.compile(aRule[0])
                except:
                    echo("Bad regular expression in # " + str(aRule[2]))
                    aRule[0] = "(?i)<Grammalecte>"


#### Parsing

_zEndOfSentence = re.compile(r'([.?!:;…][ .?!… »”")]*|.$)')
_zBeginOfParagraph = re.compile(r"^\W*")
_zEndOfParagraph = re.compile(r"\W*$")

def _getSentenceBoundaries (sText):
    iStart = _zBeginOfParagraph.match(sText).end()
    for m in _zEndOfSentence.finditer(sText):
        yield (iStart, m.end())
        iStart = m.end()


def parse (sText, sCountry="${country_default}", bDebug=False, dOptions=None, bContext=False):
    "analyses the paragraph sText and returns list of errors"
    #sText = unicodedata.normalize("NFC", sText)
    dErrors = {}
    sRealText = sText
    dPriority = {}  # Key = position; value = priority
    dOpt = _dOptions  if not dOptions  else dOptions
    bShowRuleId = option('idrule')

    # parse paragraph
    try:
        sNew, dErrors = _proofread(None, sText, sRealText, 0, True, dErrors, dPriority, sCountry, dOpt, bShowRuleId, bDebug, bContext)
        if sNew:
            sText = sNew
    except:
        raise

    # cleanup
    if " " in sText:
        sText = sText.replace(" ", ' ') # nbsp
    if " " in sText:
        sText = sText.replace(" ", ' ') # nnbsp
    if "'" in sText:
        sText = sText.replace("'", "’")
    if "‑" in sText:
        sText = sText.replace("‑", "-") # nobreakdash

    # parse sentences
    for iStart, iEnd in _getSentenceBoundaries(sText):
        if 4 < (iEnd - iStart) < 2000:
            try:
                oSentence = TextParser(sText[iStart:iEnd], sRealText[iStart:iEnd], iStart)
                _, dErrors = _proofread(oSentence, sText[iStart:iEnd], sRealText[iStart:iEnd], iStart, False, dErrors, dPriority, sCountry, dOpt, bShowRuleId, bDebug, bContext)
            except:
                raise
    return dErrors.values() # this is a view (iterable)


def _proofread (oSentence, s, sx, nOffset, bParagraph, dErrors, dPriority, sCountry, dOptions, bShowRuleId, bDebug, bContext):
    bParagraphChange = False
    bSentenceChange = False
    dTokenPos = oSentence.dTokenPos if oSentence else {}
    for sOption, lRuleGroup in _getRules(bParagraph):
        if sOption == "@@@@":
            # graph rules
            oSentence.dError = dErrors
            if not bParagraph and bSentenceChange:
                oSentence.update(s, bDebug)
                bSentenceChange = False
            for sGraphName, sLineId in lRuleGroup:
                if bDebug:
                    print("\n>>>> GRAPH:", sGraphName, sLineId)
                bParagraphChange, s = oSentence.parse(dAllGraph[sGraphName], dPriority, sCountry, dOptions, bShowRuleId, bDebug, bContext)
                dErrors.update(oSentence.dError)
                dTokenPos = oSentence.dTokenPos
        elif not sOption or dOptions.get(sOption, False):
            # regex rules
            for zRegex, bUppercase, sLineId, sRuleId, nPriority, lActions in lRuleGroup:
                if sRuleId not in _aIgnoredRules:
                    for m in zRegex.finditer(s):
                        bCondMemo = None
                        for sFuncCond, cActionType, sWhat, *eAct in lActions:
                            # action in lActions: [ condition, action type, replacement/suggestion/action[, iGroup[, message, URL]] ]
                            try:
                                bCondMemo = not sFuncCond or globals()[sFuncCond](s, sx, m, dTokenPos, sCountry, bCondMemo)
                                if bCondMemo:
                                    if bDebug:
                                        print("RULE:", sLineId)
                                    if cActionType == "-":
                                        # grammar error
                                        nErrorStart = nOffset + m.start(eAct[0])
                                        if nErrorStart not in dErrors or nPriority > dPriority.get(nErrorStart, -1):
                                            dErrors[nErrorStart] = _createError(s, sx, sWhat, nOffset, m, eAct[0], sLineId, sRuleId, bUppercase, eAct[1], eAct[2], bShowRuleId, sOption, bContext)
                                            dPriority[nErrorStart] = nPriority
                                    elif cActionType == "~":
                                        # text processor
                                        s = _rewrite(s, sWhat, eAct[0], m, bUppercase)
                                        bParagraphChange = True
                                        bSentenceChange = True
                                        if bDebug:
                                            echo("~ " + s + "  -- " + m.group(eAct[0]) + "  # " + sLineId)
                                    elif cActionType == "=":
                                        # disambiguation
                                        if not bParagraph:
                                            globals()[sWhat](s, m, dTokenPos)
                                            if bDebug:
                                                echo("= " + m.group(0) + "  # " + sLineId)
                                    elif cActionType == ">":
                                        # we do nothing, this test is just a condition to apply all following actions
                                        pass
                                    else:
                                        echo("# error: unknown action at " + sLineId)
                                elif cActionType == ">":
                                    break
                            except Exception as e:
                                raise Exception(str(e), "# " + sLineId + " # " + sRuleId)
    if bParagraphChange:
        return (s, dErrors)
    return (False, dErrors)


def _createError (s, sx, sRepl, nOffset, m, iGroup, sLineId, sRuleId, bUppercase, sMsg, sURL, bShowRuleId, sOption, bContext):
    nStart = nOffset + m.start(iGroup)
    nEnd = nOffset + m.end(iGroup)
    # suggestions
    if sRepl[0:1] == "=":
        sSugg = globals()[sRepl[1:]](s, m)
        lSugg = sSugg.split("|")  if sSugg  else []
    elif sRepl == "_":
        lSugg = []
    else:
        lSugg = m.expand(sRepl).split("|")
    if bUppercase and lSugg and m.group(iGroup)[0:1].isupper():
        lSugg = list(map(str.capitalize, lSugg))
    # Message
    sMessage = globals()[sMsg[1:]](s, m)  if sMsg[0:1] == "="  else  m.expand(sMsg)
    if bShowRuleId:
        sMessage += "  # " + sLineId + " # " + sRuleId
    #
    if _bWriterError:
        xErr = SingleProofreadingError()    # uno.createUnoStruct( "com.sun.star.linguistic2.SingleProofreadingError" )
        xErr.nErrorStart = nStart
        xErr.nErrorLength = nEnd - nStart
        xErr.nErrorType = PROOFREADING
        xErr.aRuleIdentifier = sRuleId
        xErr.aShortComment = sMessage   # sMessage.split("|")[0]     # in context menu
        xErr.aFullComment = sMessage   # sMessage.split("|")[-1]    # in dialog
        if bShowRuleId:
            xErr.aShortComment += "  " + sLineId + " # " + sRuleId
        xErr.aSuggestions = tuple(lSugg)
        if sURL:
            xProperty = PropertyValue()
            xProperty.Name = "FullCommentURL"
            xProperty.Value = sURL
            xErr.aProperties = (xProperty,)
        else:
            xErr.aProperties = ()
        return xErr
    else:
        dErr = {}
        dErr["nStart"] = nStart
        dErr["nEnd"] = nEnd
        dErr["sLineId"] = sLineId
        dErr["sRuleId"] = sRuleId
        dErr["sType"] = sOption  if sOption  else "notype"
        dErr["sMessage"] = sMessage
        dErr["aSuggestions"] = lSugg
        dErr["URL"] = sURL  if sURL  else ""
        if bContext:
            dErr['sUnderlined'] = self.sSentence0[nStart:nEnd]
            dErr['sBefore'] = self.sSentence0[max(0,nStart-80):nStart]
            dErr['sAfter'] = self.sSentence0[nEnd:nEnd+80]
        return dErr


def _rewrite (sSentence, sRepl, iGroup, m, bUppercase):
    "text processor: write <sRepl> in <sSentence> at <iGroup> position"
    nLen = m.end(iGroup) - m.start(iGroup)
    if sRepl == "*":
        sNew = " " * nLen
    elif sRepl == "_":
        sNew = sRepl + " " * (nLen-1)
    elif sRepl[0:1] == "=":
        sNew = globals()[sRepl[1:]](sSentence, m)
        sNew = sNew + " " * (nLen-len(sNew))
        if bUppercase and m.group(iGroup)[0:1].isupper():
            sNew = sNew.capitalize()
    else:
        sNew = m.expand(sRepl)
        sNew = sNew + " " * (nLen-len(sNew))
    return sSentence[0:m.start(iGroup)] + sNew + sSentence[m.end(iGroup):]


def ignoreRule (sRuleId):
    "disable rule <sRuleId>"
    _aIgnoredRules.add(sRuleId)


def resetIgnoreRules ():
    "clear all ignored rules"
    _aIgnoredRules.clear()


def reactivateRule (sRuleId):
    "(re)activate rule <sRuleId>"
    _aIgnoredRules.discard(sRuleId)


def listRules (sFilter=None):
    "generator: returns typle (sOption, sLineId, sRuleId)"
    if sFilter:
        try:
            zFilter = re.compile(sFilter)
        except:
            echo("# Error. List rules: wrong regex.")
            sFilter = None
    for sOption, lRuleGroup in chain(_getRules(True), _getRules(False)):
        if sOption != "@@@@":
            for _, _, sLineId, sRuleId, _, _ in lRuleGroup:
                if not sFilter or zFilter.search(sRuleId):
                    yield (sOption, sLineId, sRuleId)


def displayRules (sFilter=None):
    "display the name of rules, with the filter <sFilter>"
    echo("List of rules. Filter: << " + str(sFilter) + " >>")
    for sOption, sLineId, sRuleId in listRules(sFilter):
        echo("{:<10} {:<10} {}".format(sOption, sLineId, sRuleId))


def setOption (sOpt, bVal):
    "set option <sOpt> with <bVal> if it exists"
    if sOpt in _dOptions:
        _dOptions[sOpt] = bVal


def setOptions (dOpt):
    "update the dictionary of options with <dOpt>"
    for sKey, bVal in dOpt.items():
        if sKey in _dOptions:
            _dOptions[sKey] = bVal


def getOptions ():
    "return the dictionary of current options"
    return _dOptions


def getDefaultOptions ():
    "return the dictionary of default options"
    return dict(gc_options.getOptions(_sAppContext))


def getOptionsLabels (sLang):
    "return options labels"
    return gc_options.getUI(sLang)


def displayOptions (sLang):
    "display the list of grammar checking options"
    echo("List of options")
    echo("\n".join( [ k+":\t"+str(v)+"\t"+gc_options.getUI(sLang).get(k, ("?", ""))[0]  for k, v  in sorted(_dOptions.items()) ] ))
    echo("")


def resetOptions ():
    "set options to default values"
    global _dOptions
    _dOptions = dict(gc_options.getOptions(_sAppContext))


def getSpellChecker ():
    "return the spellchecker object"
    return _oSpellChecker


def _getPath ():
    return os.path.join(os.path.dirname(sys.modules[__name__].__file__), __name__ + ".py")



#### common functions









def option (sOpt):
    "return True if option <sOpt> is active"
    return _dOptions.get(sOpt, False)


def displayInfo (dTokenPos, tWord):
    "for debugging: retrieve info of word"
    if not tWord:
        echo("> nothing to find")
        return True
    lMorph = _oSpellChecker.getMorph(tWord[1])
    if not lMorph:
        echo("> not in dictionary")
        return True
    print("TOKENS:", dTokenPos)
    if tWord[0] in dTokenPos and "lMorph" in dTokenPos[tWord[0]]:
        echo("DA: " + str(dTokenPos[tWord[0]]["lMorph"]))
    echo("FSA: " + str(lMorph))
    return True









def morph (dTokenPos, tWord, sPattern, bStrict=True, bNoWord=False):
    "analyse a tuple (position, word), return True if sPattern in morphologies (disambiguation on)"
    if not tWord:
        return bNoWord


    lMorph = dTokenPos[tWord[0]]["lMorph"]  if tWord[0] in dTokenPos and "lMorph" in dTokenPos[tWord[0]]  else _oSpellChecker.getMorph(tWord[1])
    if not lMorph:
        return False
    zPattern = re.compile(sPattern)
    if bStrict:
        return bool(lMorph) and all(zPattern.search(s)  for s in lMorph)
    return any(zPattern.search(s)  for s in lMorph)


def morphex (dTokenPos, tWord, sPattern, sNegPattern, bNoWord=False):
    "analyse a tuple (position, word), returns True if not sNegPattern in word morphologies and sPattern in word morphologies (disambiguation on)"
    if not tWord:
        return bNoWord
    lMorph = dTokenPos[tWord[0]]["lMorph"]  if tWord[0] in dTokenPos and "lMorph" in dTokenPos[tWord[0]]  else _oSpellChecker.getMorph(tWord[1])
    if not lMorph:
        return False

    # check negative condition
    zNegPattern = re.compile(sNegPattern)
    if any(zNegPattern.search(s)  for s in lMorph):
        return False
    # search sPattern
    zPattern = re.compile(sPattern)
    return any(zPattern.search(s)  for s in lMorph)


def analyse (sWord, sPattern, bStrict=True):
    "analyse a word, return True if sPattern in morphologies (disambiguation off)"
    lMorph = _oSpellChecker.getMorph(sWord)

    if not lMorph:
        return False
    zPattern = re.compile(sPattern)
    if bStrict:
        return bool(lMorph) and all(zPattern.search(s)  for s in lMorph)
    return any(zPattern.search(s)  for s in lMorph)


def analysex (sWord, sPattern, sNegPattern):
    "analyse a word, returns True if not sNegPattern in word morphologies and sPattern in word morphologies (disambiguation off)"
    lMorph = _oSpellChecker.getMorph(sWord)
    if not lMorph:
        return False
    # check negative condition
    zNegPattern = re.compile(sNegPattern)
    if any(zNegPattern.search(s)  for s in lMorph):
        return False
    # search sPattern
    zPattern = re.compile(sPattern)
    return any(zPattern.search(s)  for s in lMorph)











## functions to get text outside pattern scope

# warning: check compile_rules.py to understand how it works

_zNextWord = re.compile(r" +(\w[\w-]*)")
_zPrevWord = re.compile(r"(\w[\w-]*) +$")

def nextword (s, iStart, n):
    "get the nth word of the input string or empty string"
    m = re.match("(?: +[\\w%-]+){" + str(n-1) + "} +([\\w%-]+)", s[iStart:])
    if not m:
        return None
    return (iStart+m.start(1), m.group(1))

510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528




































































































































































































































































































































































529

































































































































































































































































530
531
532
533
534


535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550

551
552







553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570

571
572
573
574

575


576






577
578
579
580
581
582
583
584


585





    if sNegPattern and re.search(sNegPattern, s):
        return False
    if re.search(sPattern, s):
        return True
    return False


def look_chk1 (dDA, s, nOffset, sPattern, sPatternGroup1, sNegPatternGroup1=None):
    "returns True if s has pattern sPattern and m.group(1) has pattern sPatternGroup1"
    m = re.search(sPattern, s)
    if not m:
        return False
    try:
        sWord = m.group(1)
        nPos = m.start(1) + nOffset
    except:
        return False
    if sNegPatternGroup1:
        return morphex(dDA, (nPos, sWord), sPatternGroup1, sNegPatternGroup1)




































































































































































































































































































































































    return morph(dDA, (nPos, sWord), sPatternGroup1, False)



































































































































































































































































#### Disambiguator

def select (dDA, nPos, sWord, sPattern, lDefault=None):


    if not sWord:
        return True
    if nPos in dDA:
        return True
    if sWord not in _dAnalyses and not _storeMorphFromFSA(sWord):
        return True
    if len(_dAnalyses[sWord]) == 1:
        return True
    lSelect = [ sMorph  for sMorph in _dAnalyses[sWord]  if re.search(sPattern, sMorph) ]
    if lSelect:
        if len(lSelect) != len(_dAnalyses[sWord]):
            dDA[nPos] = lSelect
            #echo("= "+sWord+" "+str(dDA.get(nPos, "null")))
    elif lDefault:
        dDA[nPos] = lDefault
        #echo("= "+sWord+" "+str(dDA.get(nPos, "null")))

    return True









def exclude (dDA, nPos, sWord, sPattern, lDefault=None):
    if not sWord:
        return True
    if nPos in dDA:
        return True
    if sWord not in _dAnalyses and not _storeMorphFromFSA(sWord):
        return True
    if len(_dAnalyses[sWord]) == 1:
        return True
    lSelect = [ sMorph  for sMorph in _dAnalyses[sWord]  if not re.search(sPattern, sMorph) ]
    if lSelect:
        if len(lSelect) != len(_dAnalyses[sWord]):
            dDA[nPos] = lSelect
            #echo("= "+sWord+" "+str(dDA.get(nPos, "null")))
    elif lDefault:
        dDA[nPos] = lDefault
        #echo("= "+sWord+" "+str(dDA.get(nPos, "null")))

    return True


def define (dDA, nPos, lMorph):

    dDA[nPos] = lMorph


    #echo("= "+str(nPos)+" "+str(dDA[nPos]))






    return True


#### GRAMMAR CHECKER PLUGINS

${plugins}




${callables}












|










|
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
|
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>




|
>
>
|
<
|
<
|
|
<

|

|
|
<

|
<
>


>
>
>
>
>
>
>
|
<
<

<
<
<
<
<
<
|

|
|
<

|
<
>



|
>
|
>
>
|
>
>
>
>
>
>








>
>

>
>
>
>
>
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135

1136

1137
1138

1139
1140
1141
1142
1143

1144
1145

1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156


1157






1158
1159
1160
1161

1162
1163

1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
    if sNegPattern and re.search(sNegPattern, s):
        return False
    if re.search(sPattern, s):
        return True
    return False


def look_chk1 (dTokenPos, s, nOffset, sPattern, sPatternGroup1, sNegPatternGroup1=None):
    "returns True if s has pattern sPattern and m.group(1) has pattern sPatternGroup1"
    m = re.search(sPattern, s)
    if not m:
        return False
    try:
        sWord = m.group(1)
        nPos = m.start(1) + nOffset
    except:
        return False
    if sNegPatternGroup1:
        return morphex(dTokenPos, (nPos, sWord), sPatternGroup1, sNegPatternGroup1)
    return morph(dTokenPos, (nPos, sWord), sPatternGroup1, False)


#### Disambiguator

def select (dTokenPos, nPos, sWord, sPattern, lDefault=None):
    "Disambiguation: select morphologies of <sWord> matching <sPattern>"
    if not sWord:
        return True
    if nPos not in dTokenPos:
        print("Error. There should be a token at this position: ", nPos)
        return True
    lMorph = _oSpellChecker.getMorph(sWord)
    if not lMorph or len(lMorph) == 1:
        return True
    lSelect = [ sMorph  for sMorph in lMorph  if re.search(sPattern, sMorph) ]
    if lSelect:
        if len(lSelect) != len(lMorph):
            dTokenPos[nPos]["lMorph"] = lSelect
    elif lDefault:
        dTokenPos[nPos]["lMorph"] = lDefault
    return True


def exclude (dTokenPos, nPos, sWord, sPattern, lDefault=None):
    "Disambiguation: exclude morphologies of <sWord> matching <sPattern>"
    if not sWord:
        return True
    if nPos not in dTokenPos:
        print("Error. There should be a token at this position: ", nPos)
        return True
    lMorph = _oSpellChecker.getMorph(sWord)
    if not lMorph or len(lMorph) == 1:
        return True
    lSelect = [ sMorph  for sMorph in lMorph  if not re.search(sPattern, sMorph) ]
    if lSelect:
        if len(lSelect) != len(lMorph):
            dTokenPos[nPos]["lMorph"] = lSelect
    elif lDefault:
        dTokenPos[nPos]["lMorph"] = lDefault
    return True


def define (dTokenPos, nPos, lMorph):
    "Disambiguation: set morphologies of token at <nPos> with <lMorph>"
    if nPos not in dTokenPos:
        print("Error. There should be a token at this position: ", nPos)
        return True
    dTokenPos[nPos]["lMorph"] = lMorph
    return True




#### TEXT PARSER

class TextParser:
    "Text parser"

    def __init__ (self, sSentence, sSentence0, nOffset):
        self.sSentence = sSentence
        self.sSentence0 = sSentence0
        self.nOffsetWithinParagraph = nOffset
        self.lToken = list(_oTokenizer.genTokens(sSentence, True))
        self.dTokenPos = { dToken["nStart"]: dToken  for dToken in self.lToken  if dToken["sType"] != "INFO" }
        self.dTags = {}
        self.dError = {}

    def __str__ (self):
        s = "TEXT ==========\n"
        s += "sentence: " + self.sSentence0 + "\n"
        s += "now:      " + self.sSentence  + "\n"
        for dToken in self.lToken:
            s += f'{dToken["nStart"]}\t{dToken["nEnd"]}\t{dToken["sValue"]}'
            if "lMorph" in dToken:
                s += "\t" + str(dToken["lMorph"])
            s += "\n"
        for nPos, dToken in self.dTokenPos.items():
            s += f"{nPos}\t{dToken}\n"
        return s

    def update (self, sSentence, bDebug=False):
        "update <sSentence> and retokenize"
        self.sSentence = sSentence
        lNewToken = list(_oTokenizer.genTokens(sSentence, True))
        for dToken in lNewToken:
            if "lMorph" in self.dTokenPos.get(dToken["nStart"], {}):
                dToken["lMorph"] = self.dTokenPos[dToken["nStart"]]["lMorph"]
        self.lToken = lNewToken
        self.dTokenPos = { dToken["nStart"]: dToken  for dToken in self.lToken  if dToken["sType"] != "INFO" }
        if bDebug:
            print("UPDATE:")
            print(self)

    def _getNextMatchingNodes (self, dToken, dGraph, dNode, bDebug=False):
        "generator: return nodes where <dToken> “values” match <dNode> arcs"
        # token value
        if dToken["sValue"] in dNode:
            if bDebug:
                print("  MATCH:", dToken["sValue"])
            yield dGraph[dNode[dToken["sValue"]]]
        if dToken["sValue"][0:2].istitle(): # we test only 2 first chars, to make valid words such as "Laissez-les", "Passe-partout".
            sValue = dToken["sValue"].lower()
            if sValue in dNode:
                if bDebug:
                    print("  MATCH:", sValue)
                yield dGraph[dNode[sValue]]
        elif dToken["sValue"].isupper():
            sValue = dToken["sValue"].lower()
            if sValue in dNode:
                if bDebug:
                    print("  MATCH:", sValue)
                yield dGraph[dNode[sValue]]
            sValue = dToken["sValue"].capitalize()
            if sValue in dNode:
                if bDebug:
                    print("  MATCH:", sValue)
                yield dGraph[dNode[sValue]]
        # regex value arcs
        if "<re_value>" in dNode:
            for sRegex in dNode["<re_value>"]:
                if "¬" not in sRegex:
                    # no anti-pattern
                    if re.search(sRegex, dToken["sValue"]):
                        if bDebug:
                            print("  MATCH: ~" + sRegex)
                        yield dGraph[dNode["<re_value>"][sRegex]]
                else:
                    # there is an anti-pattern
                    sPattern, sNegPattern = sRegex.split("¬", 1)
                    if sNegPattern and re.search(sNegPattern, dToken["sValue"]):
                        continue
                    if not sPattern or re.search(sPattern, dToken["sValue"]):
                        if bDebug:
                            print("  MATCH: ~" + sRegex)
                        yield dGraph[dNode["<re_value>"][sRegex]]
        # analysable tokens
        if dToken["sType"][0:4] == "WORD":
            # token lemmas
            if "<lemmas>" in dNode:
                for sLemma in _oSpellChecker.getLemma(dToken["sValue"]):
                    if sLemma in dNode["<lemmas>"]:
                        if bDebug:
                            print("  MATCH: >" + sLemma)
                        yield dGraph[dNode["<lemmas>"][sLemma]]
            # regex morph arcs
            if "<re_morph>" in dNode:
                for sRegex in dNode["<re_morph>"]:
                    if "¬" not in sRegex:
                        # no anti-pattern
                        lMorph = dToken.get("lMorph", _oSpellChecker.getMorph(dToken["sValue"]))
                        if any(re.search(sRegex, sMorph)  for sMorph in lMorph):
                            if bDebug:
                                print("  MATCH: @" + sRegex)
                            yield dGraph[dNode["<re_morph>"][sRegex]]
                    else:
                        # there is an anti-pattern
                        sPattern, sNegPattern = sRegex.split("¬", 1)
                        if sNegPattern == "*":
                            # all morphologies must match with <sPattern>
                            if sPattern:
                                lMorph = dToken.get("lMorph", _oSpellChecker.getMorph(dToken["sValue"]))
                                if lMorph and all(re.search(sPattern, sMorph)  for sMorph in lMorph):
                                    if bDebug:
                                        print("  MATCH: @" + sRegex)
                                    yield dGraph[dNode["<re_morph>"][sRegex]]
                        else:
                            lMorph = dToken.get("lMorph", _oSpellChecker.getMorph(dToken["sValue"]))
                            if sNegPattern and any(re.search(sNegPattern, sMorph)  for sMorph in lMorph):
                                continue
                            if not sPattern or any(re.search(sPattern, sMorph)  for sMorph in lMorph):
                                if bDebug:
                                    print("  MATCH: @" + sRegex)
                                yield dGraph[dNode["<re_morph>"][sRegex]]
        # token tags
        if "tags" in dToken and "<tags>" in dNode:
            for sTag in dToken["tags"]:
                if sTag in dNode["<tags>"]:
                    if bDebug:
                        print("  MATCH: /" + sTag)
                    yield dGraph[dNode["<tags>"][sTag]]
        # meta arc (for token type)
        if "<meta>" in dNode:
            for sMeta in dNode["<meta>"]:
                # not regex here, we just search if <dNode["sType"]> exists within <sMeta>
                if sMeta == "*":
                    if bDebug:
                        print("  MATCH: *" + sMeta)
                    yield dGraph[dNode["<meta>"]["*"]]
                elif "¬" in sMeta:
                    if dToken["sType"] not in sMeta:
                        if bDebug:
                            print("  MATCH: *" + sMeta)
                        yield dGraph[dNode["<meta>"][sMeta]]
                elif dToken["sType"] in sMeta:
                    if bDebug:
                        print("  MATCH: *" + sMeta)
                    yield dGraph[dNode["<meta>"][sMeta]]

    def parse (self, dGraph, dPriority, sCountry="${country_default}", dOptions=None, bShowRuleId=False, bDebug=False, bContext=False):
        "parse tokens from the text and execute actions encountered"
        dOpt = _dOptions  if not dOptions  else dOptions
        lPointer = []
        bTagAndRewrite = False
        for i, dToken in enumerate(self.lToken):
            if bDebug:
                print("TOKEN:", dToken["sValue"])
            # check arcs for each existing pointer
            lNextPointer = []
            for dPointer in lPointer:
                for dNode in self._getNextMatchingNodes(dToken, dGraph, dPointer["dNode"], bDebug):
                    lNextPointer.append({"iToken": dPointer["iToken"], "dNode": dNode})
            lPointer = lNextPointer
            # check arcs of first nodes
            for dNode in self._getNextMatchingNodes(dToken, dGraph, dGraph[0], bDebug):
                lPointer.append({"iToken": i, "dNode": dNode})
            # check if there is rules to check for each pointer
            for dPointer in lPointer:
                #if bDebug:
                #    print("+", dPointer)
                if "<rules>" in dPointer["dNode"]:
                    bChange = self._executeActions(dGraph, dPointer["dNode"]["<rules>"], dPointer["iToken"]-1, i, dPriority, dOpt, sCountry, bShowRuleId, bDebug, bContext)
                    if bChange:
                        bTagAndRewrite = True
        if bTagAndRewrite:
            self.rewrite(bDebug)
        if bDebug:
            print(self)
        return (bTagAndRewrite, self.sSentence)

    def _executeActions (self, dGraph, dNode, nTokenOffset, nLastToken, dPriority, dOptions, sCountry, bShowRuleId, bDebug, bContext):
        "execute actions found in the DARG"
        bChange = False
        for sLineId, nextNodeKey in dNode.items():
            bCondMemo = None
            for sRuleId in dGraph[nextNodeKey]:
                try:
                    if bDebug:
                        print("  TRY:", sRuleId)
                    sOption, sFuncCond, cActionType, sWhat, *eAct = dRule[sRuleId]
                    # Suggestion    [ option, condition, "-", replacement/suggestion/action, iTokenStart, iTokenEnd, nPriority, message, URL ]
                    # TextProcessor [ option, condition, "~", replacement/suggestion/action, iTokenStart, iTokenEnd ]
                    # Disambiguator [ option, condition, "=", replacement/suggestion/action ]
                    # Sentence Tag  [ option, condition, "/", replacement/suggestion/action, iTokenStart, iTokenEnd ]
                    # Test          [ option, condition, ">", "" ]
                    if not sOption or dOptions.get(sOption, False):
                        bCondMemo = not sFuncCond or globals()[sFuncCond](self.lToken, nTokenOffset, nLastToken, sCountry, bCondMemo, self.dTags, self.sSentence, self.sSentence0)
                        if bCondMemo:
                            if cActionType == "-":
                                # grammar error
                                nTokenErrorStart = nTokenOffset + abs(eAct[0])
                                if "bImmune" not in self.lToken[nTokenErrorStart]:
                                    nTokenErrorEnd = (nTokenOffset + abs(eAct[1]))  if eAct[1]  else nLastToken
                                    nErrorStart = self.nOffsetWithinParagraph + (self.lToken[nTokenErrorStart]["nStart"] if eAct[0] >= 0  else self.lToken[nTokenErrorStart]["nStart"])
                                    nErrorEnd = self.nOffsetWithinParagraph + (self.lToken[nTokenErrorEnd]["nEnd"] if eAct[1] >= 0  else self.lToken[nTokenErrorEnd]["nStart"])
                                    if nErrorStart not in self.dError or eAct[2] > dPriority.get(nErrorStart, -1):
                                        self.dError[nErrorStart] = self._createError(sWhat, nTokenOffset, nTokenErrorStart, nErrorStart, nErrorEnd, sLineId, sRuleId, True, eAct[3], eAct[4], bShowRuleId, "notype", bContext)
                                        dPriority[nErrorStart] = eAct[2]
                                        if bDebug:
                                            print("  NEW_ERROR:", self.dError[nErrorStart], "\n  ", dRule[sRuleId])
                            elif cActionType == "~":
                                # text processor
                                if bDebug:
                                    print("  TAG_PREPARE:\n  ", dRule[sRuleId])
                                nEndToken = (nTokenOffset + eAct[1])  if eAct[1]  else nLastToken
                                self._tagAndPrepareTokenForRewriting(sWhat, nTokenOffset + eAct[0], nEndToken, nTokenOffset, True, bDebug)
                                bChange = True
                            elif cActionType == "=":
                                # disambiguation
                                if bDebug:
                                    print("  DISAMBIGUATOR:\n  ", dRule[sRuleId])
                                globals()[sWhat](self.lToken, nTokenOffset)
                            elif cActionType == ">":
                                # we do nothing, this test is just a condition to apply all following actions
                                if bDebug:
                                    print("  COND_OK")
                                pass
                            elif cActionType == "/":
                                if bDebug:
                                    print("  SEMANTIC_TAG:\n  ", dRule[sRuleId])
                                nTokenStart = nTokenOffset + eAct[0]
                                nTokenEnd = nTokenOffset + (eAct[1]  if eAct[1]  else eAct[0])
                                for i in range(nTokenStart, nTokenEnd+1):
                                    if "tags" in self.lToken[i]:
                                        self.lToken[i]["tags"].update(sWhat.split("|"))
                                    else:
                                        self.lToken[i]["tags"] = set(sWhat.split("|"))
                            elif cActionType == "%":
                                # sentence tags
                                if bDebug:
                                    print("  SENTENCE_TAG:\n  ", dRule[sRuleId])
                                nTokenTag = nTokenOffset + eAct[0]
                                if sWhat not in self.dTags:
                                    self.dTags[sWhat] = (nTokenTag, nTokenTag)
                                elif nTokenTag > self.dTags[sWhat][1]:
                                    self.dTags[sWhat] = (self.dTags[sWhat][0], nTokenTag)
                            else:
                                print("# error: unknown action at " + sLineId)
                        elif cActionType == ">":
                            if bDebug:
                                print("  COND_BREAK")
                            break
                except Exception as e:
                    raise Exception(str(e), sLineId, sRuleId, self.sSentence)
        return bChange

    def _createError (self, sSugg, nTokenOffset, iFirstToken, nStart, nEnd, sLineId, sRuleId, bUppercase, sMsg, sURL, bShowRuleId, sOption, bContext):
        # suggestions
        if sSugg[0:1] == "=":
            sSugg = globals()[sSugg[1:]](self.lToken, nTokenOffset)
            lSugg = sSugg.split("|")  if sSugg  else []
        elif sSugg == "_":
            lSugg = []
        else:
            lSugg = self._expand(sSugg, nTokenOffset).split("|")
        if bUppercase and lSugg and self.lToken[iFirstToken]["sValue"][0:1].isupper():
            lSugg = list(map(lambda s: s[0:1].upper()+s[1:], lSugg))
        # Message
        sMessage = globals()[sMsg[1:]](self.lToken, nTokenOffset)  if sMsg[0:1] == "="  else self._expand(sMsg, nTokenOffset)
        if bShowRuleId:
            sMessage += "  " + sLineId + " # " + sRuleId
        #
        if _bWriterError:
            xErr = SingleProofreadingError()    # uno.createUnoStruct( "com.sun.star.linguistic2.SingleProofreadingError" )
            xErr.nErrorStart = nStart
            xErr.nErrorLength = nEnd - nStart
            xErr.nErrorType = PROOFREADING
            xErr.aRuleIdentifier = sRuleId
            xErr.aShortComment = sMessage   # sMessage.split("|")[0]     # in context menu
            xErr.aFullComment = sMessage   # sMessage.split("|")[-1]    # in dialog
            if bShowRuleId:
                xErr.aShortComment += "  " + sLineId + " # " + sRuleId
            xErr.aSuggestions = tuple(lSugg)
            if sURL:
                xProperty = PropertyValue()
                xProperty.Name = "FullCommentURL"
                xProperty.Value = sURL
                xErr.aProperties = (xProperty,)
            else:
                xErr.aProperties = ()
            return xErr
        else:
            dErr = {}
            dErr["nStart"] = nStart
            dErr["nEnd"] = nEnd
            dErr["sLineId"] = sLineId
            dErr["sRuleId"] = sRuleId
            dErr["sType"] = sOption  if sOption  else "notype"
            dErr["sMessage"] = sMessage
            dErr["aSuggestions"] = lSugg
            dErr["URL"] = sURL  if sURL  else ""
            if bContext:
                dErr['sUnderlined'] = self.sSentence0[nStart:nEnd]
                dErr['sBefore'] = self.sSentence0[max(0,nStart-80):nStart]
                dErr['sAfter'] = self.sSentence0[nEnd:nEnd+80]
            return dErr

    def _expand (self, sText, nTokenOffset):
        #print("*", sText)
        for m in re.finditer(r"\\([0-9]+)", sText):
            sText = sText.replace(m.group(0), self.lToken[int(m.group(1))+nTokenOffset]["sValue"])
        #print(">", sText)
        return sText

    def _tagAndPrepareTokenForRewriting (self, sWhat, nTokenRewriteStart, nTokenRewriteEnd, nTokenOffset, bUppercase=True, bDebug=False):
        "text processor: rewrite tokens between <nTokenRewriteStart> and <nTokenRewriteEnd> position"
        if bDebug:
            print("   START:", nTokenRewriteStart, "END:", nTokenRewriteEnd)
        if sWhat == "*":
            # purge text
            if nTokenRewriteEnd - nTokenRewriteStart == 0:
                self.lToken[nTokenRewriteStart]["bToRemove"] = True
            else:
                for i in range(nTokenRewriteStart, nTokenRewriteEnd+1):
                    self.lToken[i]["bToRemove"] = True
        elif sWhat == "␣":
            # merge tokens
            self.lToken[nTokenRewriteStart]["nMergeUntil"] = nTokenRewriteEnd
        elif sWhat == "!":
            # immunity
            if nTokenRewriteEnd - nTokenRewriteStart == 0:
                self.lToken[nTokenRewriteStart]["bImmune"] = True
            else:
                for i in range(nTokenRewriteStart, nTokenRewriteEnd+1):
                    self.lToken[i]["bImmune"] = True
        elif sWhat == "_":
            # neutralized token
            if nTokenRewriteEnd - nTokenRewriteStart == 0:
                self.lToken[nTokenRewriteStart]["sNewValue"] = "_"
            else:
                for i in range(nTokenRewriteStart, nTokenRewriteEnd+1):
                    self.lToken[i]["sNewValue"] = "_"
        else:
            if sWhat.startswith("="):
                sWhat = globals()[sWhat[1:]](self.lToken, nTokenOffset)
            else:
                sWhat = self._expand(sWhat, nTokenOffset)
            bUppercase = bUppercase and self.lToken[nTokenRewriteStart]["sValue"][0:1].isupper()
            if nTokenRewriteEnd - nTokenRewriteStart == 0:
                # one token
                sWhat = sWhat + " " * (len(self.lToken[nTokenRewriteStart]["sValue"])-len(sWhat))
                if bUppercase:
                    sWhat = sWhat[0:1].upper() + sWhat[1:]
                self.lToken[nTokenRewriteStart]["sNewValue"] = sWhat
            else:
                # several tokens
                lTokenValue = sWhat.split("|")
                if len(lTokenValue) != (nTokenRewriteEnd - nTokenRewriteStart + 1):
                    print("Error. Text processor: number of replacements != number of tokens.")
                    return
                for i, sValue in zip(range(nTokenRewriteStart, nTokenRewriteEnd+1), lTokenValue):
                    if not sValue or sValue == "*":
                        self.lToken[i]["bToRemove"] = True
                    else:
                        if bUppercase:
                            sValue = sValue[0:1].upper() + sValue[1:]
                        self.lToken[i]["sNewValue"] = sValue

    def rewrite (self, bDebug=False):
        "rewrite the sentence, modify tokens, purge the token list"
        if bDebug:
            print("REWRITE")
        lNewToken = []
        nMergeUntil = 0
        dTokenMerger = None
        for iToken, dToken in enumerate(self.lToken):
            bKeepToken = True
            if dToken["sType"] != "INFO":
                if "bImmune" in dToken:
                    nErrorStart = self.nOffsetWithinParagraph + dToken["nStart"]
                    if nErrorStart in self.dError:
                        if bDebug:
                            print("immunity -> error removed:", self.dError[nErrorStart])
                        del self.dError[nErrorStart]
                if nMergeUntil and iToken <= nMergeUntil:
                    dTokenMerger["sValue"] += " " * (dToken["nStart"] - dTokenMerger["nEnd"]) + dToken["sValue"]
                    dTokenMerger["nEnd"] = dToken["nEnd"]
                    if bDebug:
                        print("  MERGED TOKEN:", dTokenMerger["sValue"])
                    bKeepToken = False
                if "nMergeUntil" in dToken:
                    if iToken > nMergeUntil: # this token is not already merged with a previous token
                        dTokenMerger = dToken
                    if dToken["nMergeUntil"] > nMergeUntil:
                        nMergeUntil = dToken["nMergeUntil"]
                    del dToken["nMergeUntil"]
                elif "bToRemove" in dToken:
                    if bDebug:
                        print("  REMOVED:", dToken["sValue"])
                    self.sSentence = self.sSentence[:dToken["nStart"]] + " " * (dToken["nEnd"] - dToken["nStart"]) + self.sSentence[dToken["nEnd"]:]
                    bKeepToken = False
            #
            if bKeepToken:
                lNewToken.append(dToken)
                if "sNewValue" in dToken:
                    # rewrite token and sentence
                    if bDebug:
                        print(dToken["sValue"], "->", dToken["sNewValue"])
                    dToken["sRealValue"] = dToken["sValue"]
                    dToken["sValue"] = dToken["sNewValue"]
                    nDiffLen = len(dToken["sRealValue"]) - len(dToken["sNewValue"])
                    sNewRepl = (dToken["sNewValue"] + " " * nDiffLen)  if nDiffLen >= 0  else dToken["sNewValue"][:len(dToken["sRealValue"])]
                    self.sSentence = self.sSentence[:dToken["nStart"]] + sNewRepl + self.sSentence[dToken["nEnd"]:]
                    del dToken["sNewValue"]
            else:
                try:
                    del self.dTokenPos[dToken["nStart"]]
                except:
                    print(self)
                    print(dToken)
                    exit()
        if bDebug:
            print("  TEXT REWRITED:", self.sSentence)
        self.lToken.clear()
        self.lToken = lNewToken



#### Analyse tokens

def g_value (dToken, sValues, nLeft=None, nRight=None):
    "test if <dToken['sValue']> is in sValues (each value should be separated with |)"
    sValue = "|"+dToken["sValue"]+"|"  if nLeft is None  else "|"+dToken["sValue"][slice(nLeft, nRight)]+"|"
    if sValue in sValues:
        return True
    if dToken["sValue"][0:2].istitle(): # we test only 2 first chars, to make valid words such as "Laissez-les", "Passe-partout".
        if sValue.lower() in sValues:
            return True
    elif dToken["sValue"].isupper():
        #if sValue.lower() in sValues:
        #    return True
        sValue = "|"+sValue[1:].capitalize()
        if sValue.capitalize() in sValues:
            return True
    return False


def g_morph (dToken, sPattern, sNegPattern="", nLeft=None, nRight=None, bMemorizeMorph=True):
    "analyse a token, return True if <sNegPattern> not in morphologies and <sPattern> in morphologies"
    if "lMorph" in dToken:
        lMorph = dToken["lMorph"]
    else:
        if nLeft is not None:
            lMorph = _oSpellChecker.getMorph(dToken["sValue"][slice(nLeft, nRight)])
            if bMemorizeMorph:
                dToken["lMorph"] = lMorph
        else:
            lMorph = _oSpellChecker.getMorph(dToken["sValue"])
    if not lMorph:
        return False
    # check negative condition
    if sNegPattern:
        if sNegPattern == "*":
            # all morph must match sPattern
            if not lMorph:
                return False
            zPattern = re.compile(sPattern)
            return all(zPattern.search(sMorph)  for sMorph in lMorph)
        else:
            zNegPattern = re.compile(sNegPattern)
            if any(zNegPattern.search(sMorph)  for sMorph in lMorph):
                return False
    # search sPattern
    zPattern = re.compile(sPattern)
    return any(zPattern.search(sMorph)  for sMorph in lMorph)


def g_analyse (dToken, sPattern, sNegPattern="", nLeft=None, nRight=None, bMemorizeMorph=True):
    "analyse a token, return True if <sNegPattern> not in morphologies and <sPattern> in morphologies (disambiguation off)"
    if nLeft is not None:
        lMorph = _oSpellChecker.getMorph(dToken["sValue"][slice(nLeft, nRight)])
        if bMemorizeMorph:
            dToken["lMorph"] = lMorph
    else:
        lMorph = _oSpellChecker.getMorph(dToken["sValue"])
    if not lMorph:
        return False
    # check negative condition
    if sNegPattern:
        if sNegPattern == "*":
            # all morph must match sPattern
            if not lMorph:
                return False
            zPattern = re.compile(sPattern)
            return all(zPattern.search(sMorph)  for sMorph in lMorph)
        else:
            zNegPattern = re.compile(sNegPattern)
            if any(zNegPattern.search(sMorph)  for sMorph in lMorph):
                return False
    # search sPattern
    zPattern = re.compile(sPattern)
    return any(zPattern.search(sMorph)  for sMorph in lMorph)


def g_merged_analyse (dToken1, dToken2, cMerger, sPattern, sNegPattern="", bSetMorph=True):
    "merge two token values, return True if <sNegPattern> not in morphologies and <sPattern> in morphologies (disambiguation off)"
    lMorph = _oSpellChecker.getMorph(dToken1["sValue"] + cMerger + dToken2["sValue"])
    if not lMorph:
        return False
    # check negative condition
    if sNegPattern:
        if sNegPattern == "*":
            # all morph must match sPattern
            if not lMorph:
                return False
            zPattern = re.compile(sPattern)
            bResult = all(zPattern.search(sMorph)  for sMorph in lMorph)
            if bResult and bSetMorph:
                dToken1["lMorph"] = lMorph
            return bResult
        else:
            zNegPattern = re.compile(sNegPattern)
            if any(zNegPattern.search(sMorph)  for sMorph in lMorph):
                return False
    # search sPattern
    zPattern = re.compile(sPattern)
    bResult = any(zPattern.search(sMorph)  for sMorph in lMorph)
    if bResult and bSetMorph:
        dToken1["lMorph"] = lMorph
    return bResult


def g_tag_before (dToken, dTags, sTag):
    if sTag not in dTags:
        return False
    if dToken["i"] > dTags[sTag][0]:
        return True
    return False


def g_tag_after (dToken, dTags, sTag):
    if sTag not in dTags:
        return False
    if dToken["i"] < dTags[sTag][1]:
        return True
    return False


def g_space_between_tokens (dToken1, dToken2, nMin, nMax=None):
    nSpace = dToken2["nStart"] - dToken1["nEnd"]
    if nSpace < nMin:
        return False
    if nMax is not None and nSpace > nMax:
        return False
    return True


def g_token (lToken, i):
    if i < 0:
        return lToken[0]
    if i >= len(lToken):
        return lToken[-1]
    return lToken[i]



#### Disambiguator

def g_select (dToken, sPattern, lDefault=None):
    "select morphologies for <dToken> according to <sPattern>, always return True"
    lMorph = dToken["lMorph"]  if "lMorph" in dToken  else _oSpellChecker.getMorph(dToken["sValue"])
    if not lMorph or len(lMorph) == 1:

        if lDefault:

            dToken["lMorph"] = lDefault
            #print("DA:", dToken["sValue"], dToken["lMorph"])

        return True
    lSelect = [ sMorph  for sMorph in lMorph  if re.search(sPattern, sMorph) ]
    if lSelect:
        if len(lSelect) != len(lMorph):
            dToken["lMorph"] = lSelect

    elif lDefault:
        dToken["lMorph"] = lDefault

    #print("DA:", dToken["sValue"], dToken["lMorph"])
    return True


def g_exclude (dToken, sPattern, lDefault=None):
    "select morphologies for <dToken> according to <sPattern>, always return True"
    lMorph = dToken["lMorph"]  if "lMorph" in dToken  else _oSpellChecker.getMorph(dToken["sValue"])
    if not lMorph or len(lMorph) == 1:
        if lDefault:
            dToken["lMorph"] = lDefault
            #print("DA:", dToken["sValue"], dToken["lMorph"])


        return True






    lSelect = [ sMorph  for sMorph in lMorph  if not re.search(sPattern, sMorph) ]
    if lSelect:
        if len(lSelect) != len(lMorph):
            dToken["lMorph"] = lSelect

    elif lDefault:
        dToken["lMorph"] = lDefault

    #print("DA:", dToken["sValue"], dToken["lMorph"])
    return True


def g_define (dToken, lMorph):
    "set morphologies of <dToken>, always return True"
    dToken["lMorph"] = lMorph
    #print("DA:", dToken["sValue"], lMorph)
    return True


def g_define_from (dToken, nLeft=None, nRight=None):
    if nLeft is not None:
        dToken["lMorph"] = _oSpellChecker.getMorph(dToken["sValue"][slice(nLeft, nRight)])
    else:
        dToken["lMorph"] = _oSpellChecker.getMorph(dToken["sValue"])
    return True


#### GRAMMAR CHECKER PLUGINS

${plugins}


#### CALLABLES FOR REGEX RULES (generated code)

${callables}


#### CALLABLES FOR GRAPH RULES (generated code)

${graph_callables}

Modified gc_core/py/lang_core/gc_options.py from [871c8d4b8f] to [c84731594a].





1
2
3

4
5
6
7
8
9

10
11
12
13
14
15
16




# generated code, do not edit

def getUI (sLang):

    if sLang in _dOptLabel:
        return _dOptLabel[sLang]
    return _dOptLabel["fr"]


def getOptions (sContext="Python"):

    if sContext in dOpt:
        return dOpt[sContext]
    return dOpt["Python"]


lStructOpt = ${lStructOpt}

>
>
>
>



>






>







1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
"""
Grammar checker default options
"""

# generated code, do not edit

def getUI (sLang):
    "returns dictionary of UI labels"
    if sLang in _dOptLabel:
        return _dOptLabel[sLang]
    return _dOptLabel["fr"]


def getOptions (sContext="Python"):
    "returns dictionary of options"
    if sContext in dOpt:
        return dOpt[sContext]
    return dOpt["Python"]


lStructOpt = ${lStructOpt}

Modified gc_core/py/lang_core/gc_rules.py from [3cf95f4a21] to [2ef08593b5].





1
2
3
4
5




# generated code, do not edit

lParagraphRules = ${paragraph_rules}

lSentenceRules = ${sentence_rules}
>
>
>
>





1
2
3
4
5
6
7
8
9
"""
Grammar checker regex rules
"""

# generated code, do not edit

lParagraphRules = ${paragraph_rules}

lSentenceRules = ${sentence_rules}

Added gc_core/py/lang_core/gc_rules_graph.py version [373592f3fb].



















>
>
>
>
>
>
>
>
>
1
2
3
4
5
6
7
8
9
"""
Grammar checker graph rules
"""

# generated code, do not edit

dAllGraph = ${rules_graphs}

dRule = ${rules_actions}

Modified gc_core/py/text.py from [133d154e72] to [137c7cc30f].

1




2
3
4
5
6
7
8
#!python3





import textwrap
from itertools import chain


def getParagraph (sText):
    "generator: returns paragraphs of text"

>
>
>
>







1
2
3
4
5
6
7
8
9
10
11
12
#!python3

"""
Text tools
"""

import textwrap
from itertools import chain


def getParagraph (sText):
    "generator: returns paragraphs of text"
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
        return ""
    lGrammErrs = sorted(aGrammErrs, key=lambda d: d["nStart"])
    lSpellErrs = sorted(aSpellErrs, key=lambda d: d['nStart'])
    sText = ""
    nOffset = 0
    for sLine in wrap(sParagraph, nWidth): # textwrap.wrap(sParagraph, nWidth, drop_whitespace=False)
        sText += sLine + "\n"
        ln = len(sLine)
        sErrLine = ""
        nLenErrLine = 0
        nGrammErr = 0
        nSpellErr = 0
        for dErr in lGrammErrs:
            nStart = dErr["nStart"] - nOffset
            if nStart < ln:
                nGrammErr += 1
                if nStart >= nLenErrLine:
                    sErrLine += " " * (nStart - nLenErrLine) + "^" * (dErr["nEnd"] - dErr["nStart"])
                    nLenErrLine = len(sErrLine)
            else:
                break
        for dErr in lSpellErrs:
            nStart = dErr['nStart'] - nOffset
            if nStart < ln:
                nSpellErr += 1
                nEnd = dErr['nEnd'] - nOffset
                if nEnd > len(sErrLine):
                    sErrLine += " " * (nEnd - len(sErrLine))
                sErrLine = sErrLine[:nStart] + "°" * (nEnd - nStart) + sErrLine[nEnd:]
            else:
                break
        if sErrLine:
            sText += sErrLine + "\n"
        if nGrammErr:
            sText += getReadableErrors(lGrammErrs[:nGrammErr], nWidth)
            del lGrammErrs[0:nGrammErr]
        if nSpellErr:
            sText += getReadableErrors(lSpellErrs[:nSpellErr], nWidth, True)
            del lSpellErrs[0:nSpellErr]
        nOffset += ln
    return sText


def getReadableErrors (lErrs, nWidth, bSpell=False):
    "Returns lErrs errors as readable errors"
    sErrors = ""
    for dErr in lErrs:







|






|








|















|







43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
        return ""
    lGrammErrs = sorted(aGrammErrs, key=lambda d: d["nStart"])
    lSpellErrs = sorted(aSpellErrs, key=lambda d: d['nStart'])
    sText = ""
    nOffset = 0
    for sLine in wrap(sParagraph, nWidth): # textwrap.wrap(sParagraph, nWidth, drop_whitespace=False)
        sText += sLine + "\n"
        nLineLen = len(sLine)
        sErrLine = ""
        nLenErrLine = 0
        nGrammErr = 0
        nSpellErr = 0
        for dErr in lGrammErrs:
            nStart = dErr["nStart"] - nOffset
            if nStart < nLineLen:
                nGrammErr += 1
                if nStart >= nLenErrLine:
                    sErrLine += " " * (nStart - nLenErrLine) + "^" * (dErr["nEnd"] - dErr["nStart"])
                    nLenErrLine = len(sErrLine)
            else:
                break
        for dErr in lSpellErrs:
            nStart = dErr['nStart'] - nOffset
            if nStart < nLineLen:
                nSpellErr += 1
                nEnd = dErr['nEnd'] - nOffset
                if nEnd > len(sErrLine):
                    sErrLine += " " * (nEnd - len(sErrLine))
                sErrLine = sErrLine[:nStart] + "°" * (nEnd - nStart) + sErrLine[nEnd:]
            else:
                break
        if sErrLine:
            sText += sErrLine + "\n"
        if nGrammErr:
            sText += getReadableErrors(lGrammErrs[:nGrammErr], nWidth)
            del lGrammErrs[0:nGrammErr]
        if nSpellErr:
            sText += getReadableErrors(lSpellErrs[:nSpellErr], nWidth, True)
            del lSpellErrs[0:nSpellErr]
        nOffset += nLineLen
    return sText


def getReadableErrors (lErrs, nWidth, bSpell=False):
    "Returns lErrs errors as readable errors"
    sErrors = ""
    for dErr in lErrs:
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
    return sErrors


def getReadableError (dErr, bSpell=False):
    "Returns an error dErr as a readable error"
    try:
        if bSpell:
            s = u"* {nStart}:{nEnd}  # {sValue}:".format(**dErr)
        else:
            s = u"* {nStart}:{nEnd}  # {sLineId} / {sRuleId}:\n".format(**dErr)
            s += "  " + dErr.get("sMessage", "# error : message not found")
        if dErr.get("aSuggestions", None):
            s += "\n  > Suggestions : " + " | ".join(dErr.get("aSuggestions", "# error : suggestions not found"))
        if dErr.get("URL", None):
            s += "\n  > URL: " + dErr["URL"]
        return s
    except KeyError:
        return u"* Non-compliant error: {}".format(dErr)


def createParagraphWithLines (lLine):
    "Returns a text as merged lines and a set of data about lines (line_number_y, start_x, end_x)"
    sText = ""







|

|
|

|

|
|







97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
    return sErrors


def getReadableError (dErr, bSpell=False):
    "Returns an error dErr as a readable error"
    try:
        if bSpell:
            sText = u"* {nStart}:{nEnd}  # {sValue}:".format(**dErr)
        else:
            sText = u"* {nStart}:{nEnd}  # {sLineId} / {sRuleId}:\n".format(**dErr)
            sText += "  " + dErr.get("sMessage", "# error : message not found")
        if dErr.get("aSuggestions", None):
            sText += "\n  > Suggestions : " + " | ".join(dErr.get("aSuggestions", "# error : suggestions not found"))
        if dErr.get("URL", None):
            sText += "\n  > URL: " + dErr["URL"]
        return sText
    except KeyError:
        return u"* Non-compliant error: {}".format(dErr)


def createParagraphWithLines (lLine):
    "Returns a text as merged lines and a set of data about lines (line_number_y, start_x, end_x)"
    sText = ""

Added gc_lang/fr/French_language.txt version [0f13f2288a].









































































































































































































































































































































































































































































































































































>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
# NOTES SUR LA LANGUE FRANÇAISE

## CE QUI ENTOURE UN VERBE

    PRONOMS (avant)
        COD         COI
        le / l’
        la / l’
        les
        en
        me / m’     me / m’
        te / t’     te / t’
        se / s’     lui
        nous        nous
        vous        nous
        se / s’     leur
                    y

    SOMME
        [le|la|l’|les|en|me|m’|te|t’|se|s’|nous|vous|lui|leur|y]

    ADVERBE DE NÉGATION (avant)
        ne / n’

    COMBINAISONS VALIDES
        ?[ne|n’]¿   [me|te|se]      [le|la|l’|les]
        ?[ne|n’]¿   [m’|t’|s’]      [le|la|l’|les|en|y]
        ?[ne|n’]¿   [le|la]         [lui|leur]
        ?[ne|n’]¿   [l’|les]        [lui|leur|en|y]
        ?[ne|n’]¿   [lui|leur]      en
        ?[ne|n’]¿   [nous|vous]     [le|la|l’|les|en|y]
        ne          [le|la|l’|les|me|m’|te|t’|se|s’|nous|vous|lui|leur]
        n’          [en|y]

    RÉSUMÉ & SIMPLIFICATION
        [ne|n’|le|la|l’|les|en|me|m’|te|t’|se|s’|nous|vous|lui|leur|y]
        ?[ne|n’]¿   [le|la|l’|les|en|me|m’|te|t’|se|s’|nous|vous|lui|leur|y]
        ?[ne|n’]¿   [me|m’|te|t’|se|s’|nous|vous]   [le|la|l’|les|en|y]
        ?[ne|n’]¿   [le|la|l’|les]                  [lui|leur|en|y]
        ?[ne|n’]¿   [lui|leur]                      en

    ADVERBE DE NÉGATION (après)
        guère
        jamais
        pas
        plus
        point
        que / qu’
        rien

    PRONOMS À L’IMPÉRATIF
        APRÈS
            -moi
            -toi
            -lui
            -leur
            -nous
            -vous
            -le
            -la
            -les
            -en
            -y

        AVANT
            Uniquement les combinaisons avec l’adverbe de négation [ne|n’]


## DÉTERMINANTS

    SINGULIER               PLURIEL
    le / la / l’            les
    ledit / ladite          lesdits / lesdites
    un / une                des
    du / de la              des
    dudit / de ladite       desdits / desdites
    de                      de
    ce / cet / cette        ces
    icelui / icelle         iceux / icelles
    mon / ma                mes
    ton / ta                tes
    son / sa                ses
    votre                   nos
    notre                   vos
    leur                    leurs
    quel / quelle           quels / quelles
    quelque                 quelques
    tout / toute            tous / toutes
    chaque
    aucun / aucune
    nul / nulle
                            plusieurs
                            certains / certaines
                            divers / diverses

    DÉTERMINANT & PRÉPOSITION
    au / à la               aux
    audit / à ladite        auxdits / auxdites


## CONJONCTIONS

    DE COORDINATION         DE SUBORDINATION
    c’est-à-dire            afin que            pendant que
    c.-à-d.                 après que           pour que
    car                     attendu que         pourvu que
    donc                    avant que           puisque
    et / &                  bien que            quand
    mais                    comme               que
    ni                      depuis que          quoique
    or                      dès que             sans que
    ou                      dès lors que        sauf que
    partant                 excepté que         selon que
    puis                    lorsque             si
    sinon                   lors que            tandis que
    soit                    malgré que          tant que
                            parce que


## PRÉPOSITIONS

    VERBALES UNIQUEMENT
        afin de

    NOMINALES ET VERBALES
        à
        entre
        excepté
        outre
        par
        pour
        sans
        sauf

    PRÉPOSITIONS ET DÉTERMINANTS
        au
        aux
        audit
        auxdits
        auxdites

    NOMINALES
        à l’instar de               devers                      par-dessus  (adv)
        à mi-distance de            dixit                       par-devant  (adv)
        après                       durant                      par-devers
        attendu                     dès                         parmi
        au-dedans   (adv)           en                          passé
        au-dehors   (adv)           endéans                     pendant
        au-delà     (adv)           envers                      pour
        au-dessous  (adv)           ès                          quant à/au/à la/aux
        au-dessus   (adv)           excepté                     revoici
        au-devant   (adv)           face à                      revoilà
        auprès de                   fors                        sauf
        autour de                   grâce à                     sans
        av                          hormis                      selon
        avant                       hors                        sous
        avec                        jusque                      suivant
        chez                        jusques                     sur
        concernant                  lez                         tandis      (adv)
        contre                      lors de                     vers
        courant (+mois)             lès                         versus
        dans                        malgré                      via
        depuis                      moins       (adv)           vis-à-vis
        derrière                    nonobstant  (adv)           voici
        dessous     (adv)           par-delà                    voilà
        dessus      (adv)           par-derrière  (adv)         vs
        devant      (adv)           par-dessous   (adv)         vu


## PRONOMS

    PRONOMS PERSONNELS SUJETS
    je                  moi-même                                mézigue
    tu                  toi-même                                tézigue
    il / elle           lui / lui-même / elle-même              césigue / sézigue
    on
    nous                nous-même / nous-mêmes                  noszigues
    vous                vous-même / vous-mêmes                  voszigues
    ils / elles         eux / eux-mêmes / elles-mêmes           leurszigues

    PRONOMS PERSONNELS OBJETS
    moi                 moi-même                                mézigue
    toi                 toi-même                                tézigue
    lui / elle          lui-même  / elle-même                   césigue / sézigue
    soi                 soi-même
    nous                nous-même / nous-mêmes                  noszigues
    vous                vous-même / vous-mêmes                  voszigues
    eux / elles         eux / eux-mêmes / elles-mêmes           leurszigues

    PRONOMS NÉGATIFS (SUJETS & OBJETS)
    aucun
    aucune
    dégun
    nul
    personne
    rien

    PRONOMS OBJETS PRÉVERBES
    la      COD
    le      COD
    les     COD
    l’      COD
    leur    COI
    lui     COI
    me      COD/COI
    te      COD/COI
    se      COD/COI
    nous    COD/COI
    vous    COD/COI
    y       COI (proadv)
    en      COD (proadv)

    PRONOMS DÉMONSTRATIFS (SUJETS ET OBJETS)
    çuilà           propersuj properobj 3pe mas sg
    ça              prodem mas sg
    ceci            prodem mas sg
    cela            prodem mas sg
    celle qui       prodem fem sg
    celles qui      prodem fem pl
    celle-ci        prodem fem sg
    celle-là        prodem fem sg
    celles-ci       prodem fem pl
    celles-là       prodem fem pl
    celui qui       prodem mas sg
    celui-ci        prodem mas sg
    celui-là        prodem mas sg
    ceux qui        prodem mas pl
    ceux-ci         prodem mas pl
    ceux-là         prodem mas pl

    icelle          detdem prodem fem sg
    icelles         detdem prodem fem pl
    icelui          detdem prodem mas sg
    iceux           detdem prodem mas pl

    PRONOMS DÉMONSTRATIFS (SUJETS)
    ce

    PRONOMS DÉMONSTRATIFS (OBJETS)
    ci              (adv)

    PRONOMS RELATIFS
    auquel          proint prorel mas sg
    auxquelles      proint prorel fem pl
    auxquels        proint prorel mas pl
    desquelles      proint prorel fem pl
    desquels        proint prorel mas pl
    dont            prorel
    duquel          proint prorel mas sg
    laquelle        proint prorel fem sg
    lequel          proint prorel mas sg
    lesquelles      proint prorel fem pl
    lesquels        proint prorel mas pl
    où              advint prorel
    qué             proint prorel
    qui             proint prorel
    que             proint prorel
    quid            proint
    quoi            proint prorel

    autre           proind
    autrui          proind
    quiconque       proind prorel
    certaine        detind proind
    chacun          proind mas sg
    chacune         proind fem sg
    d'aucuns        proind mas pl
    grand-chose     proind
    n'importe quoi  proind
    n'importe qui   proind
    plupart         proind epi pl
    quelques-unes   proind fem pl
    quelques-uns    proind mas pl
    quelqu'un       proind mas sg
    quelqu'une      proind fem sg
    telle           proind

Modified gc_lang/fr/config.ini from [7c7adf7950] to [b6aa293b9a].

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
[args]
lang = fr
lang_name = French
locales = fr_FR fr_BE fr_CA fr_CH fr_LU fr_BF fr_BJ fr_CD fr_CI fr_CM fr_MA fr_ML fr_MU fr_NE fr_RE fr_SN fr_TG 
country_default = FR
name = Grammalecte
implname = grammalecte
# always use 3 numbers for version: x.y.z
version = 0.6.4.2
author = Olivier R.
provider = Dicollecte
link = http://grammalecte.net
description = Correcteur grammatical pour le français.
extras = README_fr.txt
logo = logo.png




|




|







1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
[args]
lang = fr
lang_name = French
locales = fr_FR fr_BE fr_CA fr_CH fr_LU fr_BF fr_BJ fr_CD fr_CI fr_CM fr_MA fr_ML fr_MU fr_NE fr_RE fr_SN fr_TG
country_default = FR
name = Grammalecte
implname = grammalecte
# always use 3 numbers for version: x.y.z
version = 0.7
author = Olivier R.
provider = Dicollecte
link = http://grammalecte.net
description = Correcteur grammatical pour le français.
extras = README_fr.txt
logo = logo.png

Modified gc_lang/fr/modules-js/conj.js from [f544af05b0] to [8124143953].

83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
        return this._lVtyp[this._dVerb[sVerb][0]];
    },

    getSimil: function (sWord, sMorph, bSubst=false) {
        if (!sMorph.includes(":V")) {
            return new Set();
        }
        let sInfi = sMorph.slice(1, sMorph.indexOf(" "));
        let aSugg = new Set();
        let tTags = this._getTags(sInfi);
        if (tTags) {
            if (!bSubst) {
                // we suggest conjugated forms
                if (sMorph.includes(":V1")) {
                    aSugg.add(sInfi);







|







83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
        return this._lVtyp[this._dVerb[sVerb][0]];
    },

    getSimil: function (sWord, sMorph, bSubst=false) {
        if (!sMorph.includes(":V")) {
            return new Set();
        }
        let sInfi = sMorph.slice(1, sMorph.indexOf("/"));
        let aSugg = new Set();
        let tTags = this._getTags(sInfi);
        if (tTags) {
            if (!bSubst) {
                // we suggest conjugated forms
                if (sMorph.includes(":V1")) {
                    aSugg.add(sInfi);

Modified gc_lang/fr/modules-js/gce_analyseur.js from [e2613ddcd2] to [bdc2b54804].

18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43

44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
    if (s2 == "vous") {
        return "vous";
    }
    if (s2 == "eux") {
        return "ils";
    }
    if (s2 == "elle" || s2 == "elles") {
        // We don’t check if word exists in _dAnalyses, for it is assumed it has been done before
        if (cregex.mbNprMasNotFem(_dAnalyses.gl_get(s1, ""))) {
            return "ils";
        }
        // si épicène, indéterminable, mais OSEF, le féminin l’emporte
        return "elles";
    }
    return s1 + " et " + s2;
}

function apposition (sWord1, sWord2) {
    // returns true if nom + nom (no agreement required)
    // We don’t check if word exists in _dAnalyses, for it is assumed it has been done before
    return cregex.mbNomNotAdj(_dAnalyses.gl_get(sWord2, "")) && cregex.mbPpasNomNotAdj(_dAnalyses.gl_get(sWord1, ""));
}

function isAmbiguousNAV (sWord) {
    // words which are nom|adj and verb are ambiguous (except être and avoir)
    if (!_dAnalyses.has(sWord) && !_storeMorphFromFSA(sWord)) {

        return false;
    }
    if (!cregex.mbNomAdj(_dAnalyses.gl_get(sWord, "")) || sWord == "est") {
        return false;
    }
    if (cregex.mbVconj(_dAnalyses.gl_get(sWord, "")) && !cregex.mbMG(_dAnalyses.gl_get(sWord, ""))) {
        return true;
    }
    return false;
}

function isAmbiguousAndWrong (sWord1, sWord2, sReqMorphNA, sReqMorphConj) {
    //// use it if sWord1 won’t be a verb; word2 is assumed to be true via isAmbiguousNAV
    // We don’t check if word exists in _dAnalyses, for it is assumed it has been done before
    let a2 = _dAnalyses.gl_get(sWord2, null);
    if (!a2 || a2.length === 0) {
        return false;
    }
    if (cregex.checkConjVerb(a2, sReqMorphConj)) {
        // verb word2 is ok
        return false;
    }
    let a1 = _dAnalyses.gl_get(sWord1, null);
    if (!a1 || a1.length === 0) {
        return false;
    }
    if (cregex.checkAgreement(a1, a2) && (cregex.mbAdj(a2) || cregex.mbAdj(a1))) {
        return false;
    }
    return true;
}

function isVeryAmbiguousAndWrong (sWord1, sWord2, sReqMorphNA, sReqMorphConj, bLastHopeCond) {
    //// use it if sWord1 can be also a verb; word2 is assumed to be true via isAmbiguousNAV
    // We don’t check if word exists in _dAnalyses, for it is assumed it has been done before
    let a2 = _dAnalyses.gl_get(sWord2, null);
    if (!a2 || a2.length === 0) {
        return false;
    }
    if (cregex.checkConjVerb(a2, sReqMorphConj)) {
        // verb word2 is ok
        return false;
    }
    let a1 = _dAnalyses.gl_get(sWord1, null);
    if (!a1 || a1.length === 0) {
        return false;
    }
    if (cregex.checkAgreement(a1, a2) && (cregex.mbAdj(a2) || cregex.mbAdjNb(a1))) {
        return false;
    }
    // now, we know there no agreement, and conjugation is also wrong
    if (cregex.isNomAdj(a1)) {
        return true;
    }
    //if cregex.isNomAdjVerb(a1): # considered true
    if (bLastHopeCond) {
        return true;
    }
    return false;
}

function checkAgreement (sWord1, sWord2) {
    // We don’t check if word exists in _dAnalyses, for it is assumed it has been done before
    let a2 = _dAnalyses.gl_get(sWord2, null);
    if (!a2 || a2.length === 0) {
        return true;
    }
    let a1 = _dAnalyses.gl_get(sWord1, null);
    if (!a1 || a1.length === 0) {
        return true;
    }
    return cregex.checkAgreement(a1, a2);
}

function mbUnit (s) {
    if (/[µ\/⁰¹²³⁴⁵⁶⁷⁸⁹Ωℓ·]/.test(s)) {







<
|










<
|




|
>


|


|







<
|
|






|
|










<
|
|






|
|

















<
|
|


|
|







18
19
20
21
22
23
24

25
26
27
28
29
30
31
32
33
34
35

36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55

56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75

76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102

103
104
105
106
107
108
109
110
111
112
113
114
115
    if (s2 == "vous") {
        return "vous";
    }
    if (s2 == "eux") {
        return "ils";
    }
    if (s2 == "elle" || s2 == "elles") {

        if (cregex.mbNprMasNotFem(_oSpellChecker.getMorph(s1))) {
            return "ils";
        }
        // si épicène, indéterminable, mais OSEF, le féminin l’emporte
        return "elles";
    }
    return s1 + " et " + s2;
}

function apposition (sWord1, sWord2) {
    // returns true if nom + nom (no agreement required)

    return cregex.mbNomNotAdj(_oSpellChecker.getMorph(sWord2)) && cregex.mbPpasNomNotAdj(_oSpellChecker.getMorph(sWord1));
}

function isAmbiguousNAV (sWord) {
    // words which are nom|adj and verb are ambiguous (except être and avoir)
    let lMorph = _oSpellChecker.getMorph(sWord);
    if (lMorph.length === 0) {
        return false;
    }
    if (!cregex.mbNomAdj(lMorph) || sWord == "est") {
        return false;
    }
    if (cregex.mbVconj(lMorph) && !cregex.mbMG(lMorph)) {
        return true;
    }
    return false;
}

function isAmbiguousAndWrong (sWord1, sWord2, sReqMorphNA, sReqMorphConj) {
    //// use it if sWord1 won’t be a verb; word2 is assumed to be true via isAmbiguousNAV

    let a2 = _oSpellChecker.getMorph(sWord2);
    if (a2.length === 0) {
        return false;
    }
    if (cregex.checkConjVerb(a2, sReqMorphConj)) {
        // verb word2 is ok
        return false;
    }
    let a1 = _oSpellChecker.getMorph(sWord1);
    if (a1.length === 0) {
        return false;
    }
    if (cregex.checkAgreement(a1, a2) && (cregex.mbAdj(a2) || cregex.mbAdj(a1))) {
        return false;
    }
    return true;
}

function isVeryAmbiguousAndWrong (sWord1, sWord2, sReqMorphNA, sReqMorphConj, bLastHopeCond) {
    //// use it if sWord1 can be also a verb; word2 is assumed to be true via isAmbiguousNAV

    let a2 = _oSpellChecker.getMorph(sWord2);
    if (a2.length === 0) {
        return false;
    }
    if (cregex.checkConjVerb(a2, sReqMorphConj)) {
        // verb word2 is ok
        return false;
    }
    let a1 = _oSpellChecker.getMorph(sWord1);
    if (a1.length === 0) {
        return false;
    }
    if (cregex.checkAgreement(a1, a2) && (cregex.mbAdj(a2) || cregex.mbAdjNb(a1))) {
        return false;
    }
    // now, we know there no agreement, and conjugation is also wrong
    if (cregex.isNomAdj(a1)) {
        return true;
    }
    //if cregex.isNomAdjVerb(a1): # considered true
    if (bLastHopeCond) {
        return true;
    }
    return false;
}

function checkAgreement (sWord1, sWord2) {

    let a2 = _oSpellChecker.getMorph(sWord2);
    if (a2.length === 0) {
        return true;
    }
    let a1 = _oSpellChecker.getMorph(sWord1);
    if (a1.length === 0) {
        return true;
    }
    return cregex.checkAgreement(a1, a2);
}

function mbUnit (s) {
    if (/[µ\/⁰¹²³⁴⁵⁶⁷⁸⁹Ωℓ·]/.test(s)) {

Modified gc_lang/fr/modules-js/gce_suggestions.js from [0c31bc1a27] to [6803550153].

8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
    var phonet = require("resource://grammalecte/fr/phonet.js");
}


//// verbs

function suggVerb (sFlex, sWho, funcSugg2=null) {
    // we don’t check if word exists in _dAnalyses, for it is assumed it has been done before
    let aSugg = new Set();
    for (let sStem of stem(sFlex)) {
        let tTags = conj._getTags(sStem);
        if (tTags) {
            // we get the tense
            let aTense = new Set();
            for (let sMorph of _dAnalyses.gl_get(sFlex, [])) {
                let m;
                let zVerb = new RegExp (">"+sStem+" .*?(:(?:Y|I[pqsf]|S[pq]|K))", "g");
                while ((m = zVerb.exec(sMorph)) !== null) {
                    // stem must be used in regex to prevent confusion between different verbs (e.g. sauras has 2 stems: savoir and saurer)
                    if (m) {
                        if (m[1] === ":Y") {
                            aTense.add(":Ip");
                            aTense.add(":Iq");
                            aTense.add(":Is");







<

|




|

|







8
9
10
11
12
13
14

15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
    var phonet = require("resource://grammalecte/fr/phonet.js");
}


//// verbs

function suggVerb (sFlex, sWho, funcSugg2=null) {

    let aSugg = new Set();
    for (let sStem of _oSpellChecker.getLemma(sFlex)) {
        let tTags = conj._getTags(sStem);
        if (tTags) {
            // we get the tense
            let aTense = new Set();
            for (let sMorph of _oSpellChecker.getMorph(sFlex)) {
                let m;
                let zVerb = new RegExp (">"+sStem+"/.*?(:(?:Y|I[pqsf]|S[pq]|K))", "g");
                while ((m = zVerb.exec(sMorph)) !== null) {
                    // stem must be used in regex to prevent confusion between different verbs (e.g. sauras has 2 stems: savoir and saurer)
                    if (m) {
                        if (m[1] === ":Y") {
                            aTense.add(":Ip");
                            aTense.add(":Iq");
                            aTense.add(":Is");
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
        return Array.from(aSugg).join("|");
    }
    return "";
}

function suggVerbPpas (sFlex, sWhat=null) {
    let aSugg = new Set();
    for (let sStem of stem(sFlex)) {
        let tTags = conj._getTags(sStem);
        if (tTags) {
            if (!sWhat) {
                aSugg.add(conj._getConjWithTags(sStem, tTags, ":PQ", ":Q1"));
                aSugg.add(conj._getConjWithTags(sStem, tTags, ":PQ", ":Q2"));
                aSugg.add(conj._getConjWithTags(sStem, tTags, ":PQ", ":Q3"));
                aSugg.add(conj._getConjWithTags(sStem, tTags, ":PQ", ":Q4"));







|







56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
        return Array.from(aSugg).join("|");
    }
    return "";
}

function suggVerbPpas (sFlex, sWhat=null) {
    let aSugg = new Set();
    for (let sStem of _oSpellChecker.getLemma(sFlex)) {
        let tTags = conj._getTags(sStem);
        if (tTags) {
            if (!sWhat) {
                aSugg.add(conj._getConjWithTags(sStem, tTags, ":PQ", ":Q1"));
                aSugg.add(conj._getConjWithTags(sStem, tTags, ":PQ", ":Q2"));
                aSugg.add(conj._getConjWithTags(sStem, tTags, ":PQ", ":Q3"));
                aSugg.add(conj._getConjWithTags(sStem, tTags, ":PQ", ":Q4"));
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
        return Array.from(aSugg).join("|");
    }
    return "";
}

function suggVerbTense (sFlex, sTense, sWho) {
    let aSugg = new Set();
    for (let sStem of stem(sFlex)) {
        if (conj.hasConj(sStem, sTense, sWho)) {
            aSugg.add(conj.getConj(sStem, sTense, sWho));
        }
    }
    if (aSugg.size > 0) {
        return Array.from(aSugg).join("|");
    }
    return "";
}

function suggVerbImpe (sFlex) {
    let aSugg = new Set();
    for (let sStem of stem(sFlex)) {
        let tTags = conj._getTags(sStem);
        if (tTags) {
            if (conj._hasConjWithTags(tTags, ":E", ":2s")) {
                aSugg.add(conj._getConjWithTags(sStem, tTags, ":E", ":2s"));
            }
            if (conj._hasConjWithTags(tTags, ":E", ":1p")) {
                aSugg.add(conj._getConjWithTags(sStem, tTags, ":E", ":1p"));
            }
            if (conj._hasConjWithTags(tTags, ":E", ":2p")) {
                aSugg.add(conj._getConjWithTags(sStem, tTags, ":E", ":2p"));
            }
        }
    }
    if (aSugg.size > 0) {
        return Array.from(aSugg).join("|");
    }
    return "";
}

function suggVerbInfi (sFlex) {
    return stem(sFlex).filter(sStem => conj.isVerb(sStem)).join("|");
}


const _dQuiEst = new Map ([
    ["je", ":1s"], ["j’", ":1s"], ["j’en", ":1s"], ["j’y", ":1s"],
    ["tu", ":2s"], ["il", ":3s"], ["on", ":3s"], ["elle", ":3s"],
    ["nous", ":1p"], ["vous", ":2p"], ["ils", ":3p"], ["elles", ":3p"]







|












|




















|







106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
        return Array.from(aSugg).join("|");
    }
    return "";
}

function suggVerbTense (sFlex, sTense, sWho) {
    let aSugg = new Set();
    for (let sStem of _oSpellChecker.getLemma(sFlex)) {
        if (conj.hasConj(sStem, sTense, sWho)) {
            aSugg.add(conj.getConj(sStem, sTense, sWho));
        }
    }
    if (aSugg.size > 0) {
        return Array.from(aSugg).join("|");
    }
    return "";
}

function suggVerbImpe (sFlex) {
    let aSugg = new Set();
    for (let sStem of _oSpellChecker.getLemma(sFlex)) {
        let tTags = conj._getTags(sStem);
        if (tTags) {
            if (conj._hasConjWithTags(tTags, ":E", ":2s")) {
                aSugg.add(conj._getConjWithTags(sStem, tTags, ":E", ":2s"));
            }
            if (conj._hasConjWithTags(tTags, ":E", ":1p")) {
                aSugg.add(conj._getConjWithTags(sStem, tTags, ":E", ":1p"));
            }
            if (conj._hasConjWithTags(tTags, ":E", ":2p")) {
                aSugg.add(conj._getConjWithTags(sStem, tTags, ":E", ":2p"));
            }
        }
    }
    if (aSugg.size > 0) {
        return Array.from(aSugg).join("|");
    }
    return "";
}

function suggVerbInfi (sFlex) {
    return _oSpellChecker.getLemma(sFlex).filter(sStem => conj.isVerb(sStem)).join("|");
}


const _dQuiEst = new Map ([
    ["je", ":1s"], ["j’", ":1s"], ["j’en", ":1s"], ["j’y", ":1s"],
    ["tu", ":2s"], ["il", ":3s"], ["on", ":3s"], ["elle", ":3s"],
    ["nous", ":1p"], ["vous", ":2p"], ["ils", ":3p"], ["elles", ":3p"]
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200

201
202
203
204
205
206
207
208
209
210
    if (!sWho) {
        if (sSuj[0].gl_isLowerCase()) { // pas un pronom, ni un nom propre
            return "";
        }
        sWho = ":3s";
    }
    let aSugg = new Set();
    for (let sStem of stem(sFlex)) {
        let tTags = conj._getTags(sStem);
        if (tTags) {
            for (let sTense of lMode) {
                if (conj._hasConjWithTags(tTags, sTense, sWho)) {
                    aSugg.add(conj._getConjWithTags(sStem, tTags, sTense, sWho));
                }
            }
        }
    }
    if (aSugg.size > 0) {
        return Array.from(aSugg).join("|");
    }
    return "";
}

//// Nouns and adjectives

function suggPlur (sFlex, sWordToAgree=null) {
    // returns plural forms assuming sFlex is singular
    if (sWordToAgree) {
        if (!_dAnalyses.has(sWordToAgree) && !_storeMorphFromFSA(sWordToAgree)) {

            return "";
        }
        let sGender = cregex.getGender(_dAnalyses.gl_get(sWordToAgree, []));
        if (sGender == ":m") {
            return suggMasPlur(sFlex);
        } else if (sGender == ":f") {
            return suggFemPlur(sFlex);
        }
    }
    let aSugg = new Set();







|




















|
>


|







171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
    if (!sWho) {
        if (sSuj[0].gl_isLowerCase()) { // pas un pronom, ni un nom propre
            return "";
        }
        sWho = ":3s";
    }
    let aSugg = new Set();
    for (let sStem of _oSpellChecker.getLemma(sFlex)) {
        let tTags = conj._getTags(sStem);
        if (tTags) {
            for (let sTense of lMode) {
                if (conj._hasConjWithTags(tTags, sTense, sWho)) {
                    aSugg.add(conj._getConjWithTags(sStem, tTags, sTense, sWho));
                }
            }
        }
    }
    if (aSugg.size > 0) {
        return Array.from(aSugg).join("|");
    }
    return "";
}

//// Nouns and adjectives

function suggPlur (sFlex, sWordToAgree=null) {
    // returns plural forms assuming sFlex is singular
    if (sWordToAgree) {
        let lMorph = _oSpellChecker.getMorph(sWordToAgree);
        if (lMorph.length === 0) {
            return "";
        }
        let sGender = cregex.getGender(lMorph);
        if (sGender == ":m") {
            return suggMasPlur(sFlex);
        } else if (sGender == ":f") {
            return suggFemPlur(sFlex);
        }
    }
    let aSugg = new Set();
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
        return Array.from(aSugg).join("|");
    }
    return "";
}

function suggMasSing (sFlex, bSuggSimil=false) {
    // returns masculine singular forms
    // we don’t check if word exists in _dAnalyses, for it is assumed it has been done before
    let aSugg = new Set();
    for (let sMorph of _dAnalyses.gl_get(sFlex, [])) {
        if (!sMorph.includes(":V")) {
            // not a verb
            if (sMorph.includes(":m") || sMorph.includes(":e")) {
                aSugg.add(suggSing(sFlex));
            } else {
                let sStem = cregex.getLemmaOfMorph(sMorph);
                if (mfsp.isFemForm(sStem)) {







<

|







254
255
256
257
258
259
260

261
262
263
264
265
266
267
268
269
        return Array.from(aSugg).join("|");
    }
    return "";
}

function suggMasSing (sFlex, bSuggSimil=false) {
    // returns masculine singular forms

    let aSugg = new Set();
    for (let sMorph of _oSpellChecker.getMorph(sFlex)) {
        if (!sMorph.includes(":V")) {
            // not a verb
            if (sMorph.includes(":m") || sMorph.includes(":e")) {
                aSugg.add(suggSing(sFlex));
            } else {
                let sStem = cregex.getLemmaOfMorph(sMorph);
                if (mfsp.isFemForm(sStem)) {
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
        return Array.from(aSugg).join("|");
    }
    return "";
}

function suggMasPlur (sFlex, bSuggSimil=false) {
    // returns masculine plural forms
    // we don’t check if word exists in _dAnalyses, for it is assumed it has been done before
    let aSugg = new Set();
    for (let sMorph of _dAnalyses.gl_get(sFlex, [])) {
        if (!sMorph.includes(":V")) {
            // not a verb
            if (sMorph.includes(":m") || sMorph.includes(":e")) {
                aSugg.add(suggPlur(sFlex));
            } else {
                let sStem = cregex.getLemmaOfMorph(sMorph);
                if (mfsp.isFemForm(sStem)) {







<

|







289
290
291
292
293
294
295

296
297
298
299
300
301
302
303
304
        return Array.from(aSugg).join("|");
    }
    return "";
}

function suggMasPlur (sFlex, bSuggSimil=false) {
    // returns masculine plural forms

    let aSugg = new Set();
    for (let sMorph of _oSpellChecker.getMorph(sFlex)) {
        if (!sMorph.includes(":V")) {
            // not a verb
            if (sMorph.includes(":m") || sMorph.includes(":e")) {
                aSugg.add(suggPlur(sFlex));
            } else {
                let sStem = cregex.getLemmaOfMorph(sMorph);
                if (mfsp.isFemForm(sStem)) {
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
    }
    return "";
}


function suggFemSing (sFlex, bSuggSimil=false) {
    // returns feminine singular forms
    // we don’t check if word exists in _dAnalyses, for it is assumed it has been done before
    let aSugg = new Set();
    for (let sMorph of _dAnalyses.gl_get(sFlex, [])) {
        if (!sMorph.includes(":V")) {
            // not a verb
            if (sMorph.includes(":f") || sMorph.includes(":e")) {
                aSugg.add(suggSing(sFlex));
            } else {
                let sStem = cregex.getLemmaOfMorph(sMorph);
                if (mfsp.isFemForm(sStem)) {







<

|







329
330
331
332
333
334
335

336
337
338
339
340
341
342
343
344
    }
    return "";
}


function suggFemSing (sFlex, bSuggSimil=false) {
    // returns feminine singular forms

    let aSugg = new Set();
    for (let sMorph of _oSpellChecker.getMorph(sFlex)) {
        if (!sMorph.includes(":V")) {
            // not a verb
            if (sMorph.includes(":f") || sMorph.includes(":e")) {
                aSugg.add(suggSing(sFlex));
            } else {
                let sStem = cregex.getLemmaOfMorph(sMorph);
                if (mfsp.isFemForm(sStem)) {
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
        return Array.from(aSugg).join("|");
    }
    return "";
}

function suggFemPlur (sFlex, bSuggSimil=false) {
    // returns feminine plural forms
    // we don’t check if word exists in _dAnalyses, for it is assumed it has been done before
    let aSugg = new Set();
    for (let sMorph of _dAnalyses.gl_get(sFlex, [])) {
        if (!sMorph.includes(":V")) {
            // not a verb
            if (sMorph.includes(":f") || sMorph.includes(":e")) {
                aSugg.add(suggPlur(sFlex));
            } else {
                let sStem = cregex.getLemmaOfMorph(sMorph);
                if (mfsp.isFemForm(sStem)) {







<

|







362
363
364
365
366
367
368

369
370
371
372
373
374
375
376
377
        return Array.from(aSugg).join("|");
    }
    return "";
}

function suggFemPlur (sFlex, bSuggSimil=false) {
    // returns feminine plural forms

    let aSugg = new Set();
    for (let sMorph of _oSpellChecker.getMorph(sFlex)) {
        if (!sMorph.includes(":V")) {
            // not a verb
            if (sMorph.includes(":f") || sMorph.includes(":e")) {
                aSugg.add(suggPlur(sFlex));
            } else {
                let sStem = cregex.getLemmaOfMorph(sMorph);
                if (mfsp.isFemForm(sStem)) {
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
    if (aSugg.size > 0) {
        return Array.from(aSugg).join("|");
    }
    return "";
}

function hasFemForm (sFlex) {
    for (let sStem of stem(sFlex)) {
        if (mfsp.isFemForm(sStem) || conj.hasConj(sStem, ":PQ", ":Q3")) {
            return true;
        }
    }
    if (phonet.hasSimil(sFlex, ":f")) {
        return true;
    }
    return false;
}

function hasMasForm (sFlex) {
    for (let sStem of stem(sFlex)) {
        if (mfsp.isFemForm(sStem) || conj.hasConj(sStem, ":PQ", ":Q1")) {
            // what has a feminine form also has a masculine form
            return true;
        }
    }
    if (phonet.hasSimil(sFlex, ":m")) {
        return true;
    }
    return false;
}

function switchGender (sFlex, bPlur=null) {
    // we don’t check if word exists in _dAnalyses, for it is assumed it has been done before
    let aSugg = new Set();
    if (bPlur === null) {
        for (let sMorph of _dAnalyses.gl_get(sFlex, [])) {
            if (sMorph.includes(":f")) {
                if (sMorph.includes(":s")) {
                    aSugg.add(suggMasSing(sFlex));
                } else if (sMorph.includes(":p")) {
                    aSugg.add(suggMasPlur(sFlex));
                }
            } else if (sMorph.includes(":m")) {
                if (sMorph.includes(":s")) {
                    aSugg.add(suggFemSing(sFlex));
                } else if (sMorph.includes(":p")) {
                    aSugg.add(suggFemPlur(sFlex));
                } else {
                    aSugg.add(suggFemSing(sFlex));
                    aSugg.add(suggFemPlur(sFlex));
                }
            }
        }
    } else if (bPlur) {
        for (let sMorph of _dAnalyses.gl_get(sFlex, [])) {
            if (sMorph.includes(":f")) {
                aSugg.add(suggMasPlur(sFlex));
            } else if (sMorph.includes(":m")) {
                aSugg.add(suggFemPlur(sFlex));
            }
        }
    } else {
        for (let sMorph of _dAnalyses.gl_get(sFlex, [])) {
            if (sMorph.includes(":f")) {
                aSugg.add(suggMasSing(sFlex));
            } else if (sMorph.includes(":m")) {
                aSugg.add(suggFemSing(sFlex));
            }
        }
    }
    if (aSugg.size > 0) {
        return Array.from(aSugg).join("|");
    }
    return "";
}

function switchPlural (sFlex) {
    let aSugg = new Set();
    for (let sMorph of _dAnalyses.gl_get(sFlex, [])) { // we don’t check if word exists in _dAnalyses, for it is assumed it has been done before
        if (sMorph.includes(":s")) {
            aSugg.add(suggPlur(sFlex));
        } else if (sMorph.includes(":p")) {
            aSugg.add(suggSing(sFlex));
        }
    }
    if (aSugg.size > 0) {
        return Array.from(aSugg).join("|");
    }
    return "";
}

function hasSimil (sWord, sPattern=null) {
    return phonet.hasSimil(sWord, sPattern);
}

function suggSimil (sWord, sPattern=null, bSubst=false) {
    // return list of words phonetically similar to sWord and whom POS is matching sPattern
    let aSugg = phonet.selectSimil(sWord, sPattern);
    for (let sMorph of _dAnalyses.gl_get(sWord, [])) {
        for (let e of conj.getSimil(sWord, sMorph, bSubst)) {
            aSugg.add(e);
        }
    }
    if (aSugg.size > 0) {
        return Array.from(aSugg).join("|");
    }







|











|












<


|


















|







|















|



















|







394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425

426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
    if (aSugg.size > 0) {
        return Array.from(aSugg).join("|");
    }
    return "";
}

function hasFemForm (sFlex) {
    for (let sStem of _oSpellChecker.getLemma(sFlex)) {
        if (mfsp.isFemForm(sStem) || conj.hasConj(sStem, ":PQ", ":Q3")) {
            return true;
        }
    }
    if (phonet.hasSimil(sFlex, ":f")) {
        return true;
    }
    return false;
}

function hasMasForm (sFlex) {
    for (let sStem of _oSpellChecker.getLemma(sFlex)) {
        if (mfsp.isFemForm(sStem) || conj.hasConj(sStem, ":PQ", ":Q1")) {
            // what has a feminine form also has a masculine form
            return true;
        }
    }
    if (phonet.hasSimil(sFlex, ":m")) {
        return true;
    }
    return false;
}

function switchGender (sFlex, bPlur=null) {

    let aSugg = new Set();
    if (bPlur === null) {
        for (let sMorph of _oSpellChecker.getMorph(sFlex)) {
            if (sMorph.includes(":f")) {
                if (sMorph.includes(":s")) {
                    aSugg.add(suggMasSing(sFlex));
                } else if (sMorph.includes(":p")) {
                    aSugg.add(suggMasPlur(sFlex));
                }
            } else if (sMorph.includes(":m")) {
                if (sMorph.includes(":s")) {
                    aSugg.add(suggFemSing(sFlex));
                } else if (sMorph.includes(":p")) {
                    aSugg.add(suggFemPlur(sFlex));
                } else {
                    aSugg.add(suggFemSing(sFlex));
                    aSugg.add(suggFemPlur(sFlex));
                }
            }
        }
    } else if (bPlur) {
        for (let sMorph of _oSpellChecker.getMorph(sFlex)) {
            if (sMorph.includes(":f")) {
                aSugg.add(suggMasPlur(sFlex));
            } else if (sMorph.includes(":m")) {
                aSugg.add(suggFemPlur(sFlex));
            }
        }
    } else {
        for (let sMorph of _oSpellChecker.getMorph(sFlex)) {
            if (sMorph.includes(":f")) {
                aSugg.add(suggMasSing(sFlex));
            } else if (sMorph.includes(":m")) {
                aSugg.add(suggFemSing(sFlex));
            }
        }
    }
    if (aSugg.size > 0) {
        return Array.from(aSugg).join("|");
    }
    return "";
}

function switchPlural (sFlex) {
    let aSugg = new Set();
    for (let sMorph of _oSpellChecker.getMorph(sFlex)) { 
        if (sMorph.includes(":s")) {
            aSugg.add(suggPlur(sFlex));
        } else if (sMorph.includes(":p")) {
            aSugg.add(suggSing(sFlex));
        }
    }
    if (aSugg.size > 0) {
        return Array.from(aSugg).join("|");
    }
    return "";
}

function hasSimil (sWord, sPattern=null) {
    return phonet.hasSimil(sWord, sPattern);
}

function suggSimil (sWord, sPattern=null, bSubst=false) {
    // return list of words phonetically similar to sWord and whom POS is matching sPattern
    let aSugg = phonet.selectSimil(sWord, sPattern);
    for (let sMorph of _oSpellChecker.getMorph(sWord)) {
        for (let e of conj.getSimil(sWord, sMorph, bSubst)) {
            aSugg.add(e);
        }
    }
    if (aSugg.size > 0) {
        return Array.from(aSugg).join("|");
    }
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
    if (sWord[0] == "h" || sWord[0] == "H") {
        return "ce|cet";
    }
    return "ce";
}

function suggLesLa (sWord) {
    // we don’t check if word exists in _dAnalyses, for it is assumed it has been done before
    if (_dAnalyses.gl_get(sWord, []).some(s  =>  s.includes(":p"))) {
        return "les|la";
    }
    return "la";
}

function formatNumber (s) {
    let nLen = s.length;







<
|







506
507
508
509
510
511
512

513
514
515
516
517
518
519
520
    if (sWord[0] == "h" || sWord[0] == "H") {
        return "ce|cet";
    }
    return "ce";
}

function suggLesLa (sWord) {

    if (_oSpellChecker.getMorph(sWord).some(s  =>  s.includes(":p"))) {
        return "les|la";
    }
    return "la";
}

function formatNumber (s) {
    let nLen = s.length;

Modified gc_lang/fr/modules-js/lexicographe.js from [823f277d47] to [8830593e2a].

83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
    [':O2', [" 2ᵉ pers.,", "Pronom : 2ᵉ personne"]],
    [':O3', [" 3ᵉ pers.,", "Pronom : 3ᵉ personne"]],
    [':C', [" conjonction,", "Conjonction"]],
    [':Ĉ', [" conjonction (él.),", "Conjonction (élément)"]],
    [':Cc', [" conjonction de coordination,", "Conjonction de coordination"]],
    [':Cs', [" conjonction de subordination,", "Conjonction de subordination"]],
    [':Ĉs', [" conjonction de subordination (él.),", "Conjonction de subordination (élément)"]],
    
    [':Ñ', [" locution nominale (él.),", "Locution nominale (élément)"]],
    [':Â', [" locution adjectivale (él.),", "Locution adjectivale (élément)"]],
    [':Ṽ', [" locution verbale (él.),", "Locution verbale (élément)"]],
    [':Ŵ', [" locution adverbiale (él.),", "Locution adverbiale (élément)"]],
    [':Ŕ', [" locution prépositive (él.),", "Locution prépositive (élément)"]],
    [':Ĵ', [" locution interjective (él.),", "Locution interjective (élément)"]],








|







83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
    [':O2', [" 2ᵉ pers.,", "Pronom : 2ᵉ personne"]],
    [':O3', [" 3ᵉ pers.,", "Pronom : 3ᵉ personne"]],
    [':C', [" conjonction,", "Conjonction"]],
    [':Ĉ', [" conjonction (él.),", "Conjonction (élément)"]],
    [':Cc', [" conjonction de coordination,", "Conjonction de coordination"]],
    [':Cs', [" conjonction de subordination,", "Conjonction de subordination"]],
    [':Ĉs', [" conjonction de subordination (él.),", "Conjonction de subordination (élément)"]],

    [':Ñ', [" locution nominale (él.),", "Locution nominale (élément)"]],
    [':Â', [" locution adjectivale (él.),", "Locution adjectivale (élément)"]],
    [':Ṽ', [" locution verbale (él.),", "Locution verbale (élément)"]],
    [':Ŵ', [" locution adverbiale (él.),", "Locution adverbiale (élément)"]],
    [':Ŕ', [" locution prépositive (él.),", "Locution prépositive (élément)"]],
    [':Ĵ', [" locution interjective (él.),", "Locution interjective (élément)"]],

187
188
189
190
191
192
193
194
195
196
197
198
199
200
201

    ['en', " pronom adverbial"],
    ["m'en", " (me) pronom personnel objet + (en) pronom adverbial"],
    ["t'en", " (te) pronom personnel objet + (en) pronom adverbial"],
    ["s'en", " (se) pronom personnel objet + (en) pronom adverbial"]
]);

const _dSeparator = new Map([
    ['.', "point"],
    ['·', "point médian"],
    ['…', "points de suspension"],
    [':', "deux-points"],
    [';', "point-virgule"],
    [',', "virgule"],
    ['?', "point d’interrogation"],







|







187
188
189
190
191
192
193
194
195
196
197
198
199
200
201

    ['en', " pronom adverbial"],
    ["m'en", " (me) pronom personnel objet + (en) pronom adverbial"],
    ["t'en", " (te) pronom personnel objet + (en) pronom adverbial"],
    ["s'en", " (se) pronom personnel objet + (en) pronom adverbial"]
]);

const _dChar = new Map([
    ['.', "point"],
    ['·', "point médian"],
    ['…', "points de suspension"],
    [':', "deux-points"],
    [';', "point-virgule"],
    [',', "virgule"],
    ['?', "point d’interrogation"],
211
212
213
214
215
216
217

218
219
220
221
222
223




224
225
226
227
228
229
230
    ['–', "tiret demi-cadratin"],
    ['«', "guillemet ouvrant (chevrons)"],
    ['»', "guillemet fermant (chevrons)"],
    ['“', "guillemet ouvrant double"],
    ['”', "guillemet fermant double"],
    ['‘', "guillemet ouvrant"],
    ['’', "guillemet fermant"],

    ['/', "signe de la division"],
    ['+', "signe de l’addition"],
    ['*', "signe de la multiplication"],
    ['=', "signe de l’égalité"],
    ['<', "inférieur à"],
    ['>', "supérieur à"],




]);


class Lexicographe {

    constructor (oSpellChecker, oTokenizer, oLocGraph) {
        this.oSpellChecker = oSpellChecker;







>






>
>
>
>







211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
    ['–', "tiret demi-cadratin"],
    ['«', "guillemet ouvrant (chevrons)"],
    ['»', "guillemet fermant (chevrons)"],
    ['“', "guillemet ouvrant double"],
    ['”', "guillemet fermant double"],
    ['‘', "guillemet ouvrant"],
    ['’', "guillemet fermant"],
    ['"', "guillemets droits (déconseillé en typographie)"],
    ['/', "signe de la division"],
    ['+', "signe de l’addition"],
    ['*', "signe de la multiplication"],
    ['=', "signe de l’égalité"],
    ['<', "inférieur à"],
    ['>', "supérieur à"],
    ['⩽', "inférieur ou égal à"],
    ['⩾', "supérieur ou égal à"],
    ['%', "signe de pourcentage"],
    ['‰', "signe pour mille"],
]);


class Lexicographe {

    constructor (oSpellChecker, oTokenizer, oLocGraph) {
        this.oSpellChecker = oSpellChecker;
241
242
243
244
245
246
247

248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274







275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
    getInfoForToken (oToken) {
        // Token: .sType, .sValue, .nStart, .nEnd
        // return a object {sType, sValue, aLabel}
        let m = null;
        try {
            switch (oToken.sType) {
                case 'SEPARATOR':

                    return {
                        sType: oToken.sType,
                        sValue: oToken.sValue,
                        aLabel: [_dSeparator.gl_get(oToken.sValue, "caractère indéterminé")]
                    };
                    break;
                case 'NUM':
                    return {
                        sType: oToken.sType,
                        sValue: oToken.sValue,
                        aLabel: ["nombre"]
                    };
                    break;
                case 'LINK':
                    return {
                        sType: oToken.sType,
                        sValue: oToken.sValue.slice(0, 40) + "…",
                        aLabel: ["hyperlien"]
                    };
                    break;
                case 'ELPFX':
                    let sTemp = oToken.sValue.replace("’", "").replace("'", "").replace("`", "").toLowerCase();
                    return {
                        sType: oToken.sType,
                        sValue: oToken.sValue,
                        aLabel: [_dElidedPrefix.gl_get(sTemp, "préfixe élidé inconnu")]
                    };







                    break;
                case 'FOLDERUNIX':
                    return {
                        sType: oToken.sType,
                        sValue: oToken.sValue.slice(0, 40) + "…",
                        aLabel: ["dossier UNIX (et dérivés)"]
                    };
                    break;
                case 'FOLDERWIN':
                    return {
                        sType: oToken.sType,
                        sValue: oToken.sValue.slice(0, 40) + "…",
                        aLabel: ["dossier Windows"]
                    };
                    break;
                case 'ACRONYM':
                    return {
                        sType: oToken.sType,
                        sValue: oToken.sValue,
                        aLabel: ["Sigle ou acronyme"]
                    };
                    break;
                case 'WORD':







>



|
















|






>
>
>
>
>
>
>















|







246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
    getInfoForToken (oToken) {
        // Token: .sType, .sValue, .nStart, .nEnd
        // return a object {sType, sValue, aLabel}
        let m = null;
        try {
            switch (oToken.sType) {
                case 'SEPARATOR':
                case 'SIGN':
                    return {
                        sType: oToken.sType,
                        sValue: oToken.sValue,
                        aLabel: [_dChar.gl_get(oToken.sValue, "caractère indéterminé")]
                    };
                    break;
                case 'NUM':
                    return {
                        sType: oToken.sType,
                        sValue: oToken.sValue,
                        aLabel: ["nombre"]
                    };
                    break;
                case 'LINK':
                    return {
                        sType: oToken.sType,
                        sValue: oToken.sValue.slice(0, 40) + "…",
                        aLabel: ["hyperlien"]
                    };
                    break;
                case 'WORD_ELIDED':
                    let sTemp = oToken.sValue.replace("’", "").replace("'", "").replace("`", "").toLowerCase();
                    return {
                        sType: oToken.sType,
                        sValue: oToken.sValue,
                        aLabel: [_dElidedPrefix.gl_get(sTemp, "préfixe élidé inconnu")]
                    };
                    break;
                case 'WORD_ORDINAL':
                    return {
                        sType: oToken.sType,
                        sValue: oToken.sValue,
                        aLabel: ["nombre ordinal"]
                    };
                    break;
                case 'FOLDERUNIX':
                    return {
                        sType: oToken.sType,
                        sValue: oToken.sValue.slice(0, 40) + "…",
                        aLabel: ["dossier UNIX (et dérivés)"]
                    };
                    break;
                case 'FOLDERWIN':
                    return {
                        sType: oToken.sType,
                        sValue: oToken.sValue.slice(0, 40) + "…",
                        aLabel: ["dossier Windows"]
                    };
                    break;
                case 'WORD_ACRONYM':
                    return {
                        sType: oToken.sType,
                        sValue: oToken.sValue,
                        aLabel: ["Sigle ou acronyme"]
                    };
                    break;
                case 'WORD':
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
        let aTokenList = this.getListOfTokens(sText.replace("'", "’").trim(), false);
        let iKey = 0;
        let aElem = [];
        do {
            let oToken = aTokenList[iKey];
            let sMorphLoc = '';
            let aTokenTempList = [oToken];
            if (oToken.sType == "WORD" || oToken.sType == "ELPFX"){
                let iKeyTree = iKey + 1;
                let oLocNode = this.oLocGraph[oToken.sValue.toLowerCase()];
                while (oLocNode) {
                    let oTokenNext = aTokenList[iKeyTree];
                    iKeyTree++;
                    if (oTokenNext) {
                        oLocNode = oLocNode[oTokenNext.sValue.toLowerCase()];







|







465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
        let aTokenList = this.getListOfTokens(sText.replace("'", "’").trim(), false);
        let iKey = 0;
        let aElem = [];
        do {
            let oToken = aTokenList[iKey];
            let sMorphLoc = '';
            let aTokenTempList = [oToken];
            if (oToken.sType == "WORD" || oToken.sType == "WORD_ELIDED"){
                let iKeyTree = iKey + 1;
                let oLocNode = this.oLocGraph[oToken.sValue.toLowerCase()];
                while (oLocNode) {
                    let oTokenNext = aTokenList[iKeyTree];
                    iKeyTree++;
                    if (oTokenNext) {
                        oLocNode = oLocNode[oTokenNext.sValue.toLowerCase()];

Modified gc_lang/fr/modules-js/tests_data.json from [f05e835c66] to [ef6f6c1c40].

1
${gctestsJS}
|
1
${regex_gctestsJS}

Modified gc_lang/fr/modules/conj.py from [c668aaf269] to [258383e97f].


1


2
3
4
5
6
7
8

# Grammalecte - Conjugueur


# License: GPL 3

import re
import traceback

from .conj_data import lVtyp as _lVtyp
from .conj_data import lTags as _lTags
>
|
>
>







1
2
3
4
5
6
7
8
9
10
11
"""
Grammalecte - Conjugueur
"""

# License: GPL 3

import re
import traceback

from .conj_data import lVtyp as _lVtyp
from .conj_data import lTags as _lTags
25
26
27
28
29
30
31

32
33
34
35
36
37
38
_dGroup = { "0": "auxiliaire", "1": "1ᵉʳ groupe", "2": "2ᵉ groupe", "3": "3ᵉ groupe" }

_dTenseIdx = { ":PQ": 0, ":Ip": 1, ":Iq": 2, ":Is": 3, ":If": 4, ":K": 5, ":Sp": 6, ":Sq": 7, ":E": 8 }



def isVerb (sVerb):

    return sVerb in _dVerb


def getConj (sVerb, sTense, sWho):
    "returns conjugation (can be an empty string)"
    if sVerb not in _dVerb:
        return None







>







28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
_dGroup = { "0": "auxiliaire", "1": "1ᵉʳ groupe", "2": "2ᵉ groupe", "3": "3ᵉ groupe" }

_dTenseIdx = { ":PQ": 0, ":Ip": 1, ":Iq": 2, ":Is": 3, ":If": 4, ":K": 5, ":Sp": 6, ":Sq": 7, ":E": 8 }



def isVerb (sVerb):
    "return True if it’s a existing verb"
    return sVerb in _dVerb


def getConj (sVerb, sTense, sWho):
    "returns conjugation (can be an empty string)"
    if sVerb not in _dVerb:
        return None
52
53
54
55
56
57
58

59
60
61
62
63
64
65
66
67
68
    "returns raw informations about sVerb"
    if sVerb not in _dVerb:
        return None
    return _lVtyp[_dVerb[sVerb][0]]


def getSimil (sWord, sMorph, bSubst=False):

    if ":V" not in sMorph:
        return set()
    sInfi = sMorph[1:sMorph.find(" ")]
    aSugg = set()
    tTags = _getTags(sInfi)
    if tTags:
        if not bSubst:
            # we suggest conjugated forms
            if ":V1" in sMorph:
                aSugg.add(sInfi)







>


|







56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
    "returns raw informations about sVerb"
    if sVerb not in _dVerb:
        return None
    return _lVtyp[_dVerb[sVerb][0]]


def getSimil (sWord, sMorph, bSubst=False):
    "returns a set of verbal forms similar to <sWord>, according to <sMorph>"
    if ":V" not in sMorph:
        return set()
    sInfi = sMorph[1:sMorph.find("/")]
    aSugg = set()
    tTags = _getTags(sInfi)
    if tTags:
        if not bSubst:
            # we suggest conjugated forms
            if ":V1" in sMorph:
                aSugg.add(sInfi)
96
97
98
99
100
101
102

103
104
105
106
107
108
109
            # if there is only one past participle (epi inv), unreliable.
            if len(aSugg) == 1:
                aSugg.clear()
    return aSugg


def getConjSimilInfiV1 (sInfi):

    if sInfi not in _dVerb:
        return set()
    aSugg = set()
    tTags = _getTags(sInfi)
    if tTags:
        aSugg.add(_getConjWithTags(sInfi, tTags, ":Iq", ":2s"))
        aSugg.add(_getConjWithTags(sInfi, tTags, ":Iq", ":3s"))







>







101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
            # if there is only one past participle (epi inv), unreliable.
            if len(aSugg) == 1:
                aSugg.clear()
    return aSugg


def getConjSimilInfiV1 (sInfi):
    "returns verbal forms phonetically similar to infinitive form (for verb in group 1)"
    if sInfi not in _dVerb:
        return set()
    aSugg = set()
    tTags = _getTags(sInfi)
    if tTags:
        aSugg.add(_getConjWithTags(sInfi, tTags, ":Iq", ":2s"))
        aSugg.add(_getConjWithTags(sInfi, tTags, ":Iq", ":3s"))
138
139
140
141
142
143
144
145
146
147
148
149
150


151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
    "returns sWord modified by sSfx"
    if not sSfx:
        return ""
    if sSfx == "0":
        return sWord
    try:
        return sWord[:-(ord(sSfx[0])-48)] + sSfx[1:]  if sSfx[0] != '0'  else  sWord + sSfx[1:]  # 48 is the ASCII code for "0"
    except:
        return "## erreur, code : " + str(sSfx) + " ##"
        


class Verb ():


    def __init__ (self, sVerb, sVerbPattern=""):
        # conjugate a unknown verb with rules from sVerbPattern
        if not isinstance(sVerb, str):
            raise TypeError("sVerb should be a string")
        if not sVerb:
            raise ValueError("Empty string.")

        if sVerbPattern == "":
            sVerbPattern = sVerb

        self.sVerb = sVerb
        self.sVerbAux = ""
        self._sRawInfo = getVtyp(sVerbPattern)
        self.sInfo = self._readableInfo()
        self.bProWithEn = (self._sRawInfo[5] == "e")
        self._tTags = _getTags(sVerbPattern)
        if not self._tTags:
            raise ValueError("Unknown verb.")
        self._tTagsAux = _getTags(self.sVerbAux)
        self.cGroup = self._sRawInfo[0];
        self.dConj = {
            ":Y": {
                "label": "Infinitif",
                ":": sVerb,
            },
            ":P": {
                "label": "Participe présent",







|

|



>
>



















|







144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
    "returns sWord modified by sSfx"
    if not sSfx:
        return ""
    if sSfx == "0":
        return sWord
    try:
        return sWord[:-(ord(sSfx[0])-48)] + sSfx[1:]  if sSfx[0] != '0'  else  sWord + sSfx[1:]  # 48 is the ASCII code for "0"
    except (IndexError, TypeError):
        return "## erreur, code : " + str(sSfx) + " ##"



class Verb ():
    "Verb and its conjugation"

    def __init__ (self, sVerb, sVerbPattern=""):
        # conjugate a unknown verb with rules from sVerbPattern
        if not isinstance(sVerb, str):
            raise TypeError("sVerb should be a string")
        if not sVerb:
            raise ValueError("Empty string.")

        if sVerbPattern == "":
            sVerbPattern = sVerb

        self.sVerb = sVerb
        self.sVerbAux = ""
        self._sRawInfo = getVtyp(sVerbPattern)
        self.sInfo = self._readableInfo()
        self.bProWithEn = (self._sRawInfo[5] == "e")
        self._tTags = _getTags(sVerbPattern)
        if not self._tTags:
            raise ValueError("Unknown verb.")
        self._tTagsAux = _getTags(self.sVerbAux)
        self.cGroup = self._sRawInfo[0]
        self.dConj = {
            ":Y": {
                "label": "Infinitif",
                ":": sVerb,
            },
            ":P": {
                "label": "Participe présent",
287
288
289
290
291
292
293

294
295
296
297
298
299
300
                sInfo = "# erreur - code : " + self._sRawInfo
            return sGroup + " · " + sInfo
        except:
            traceback.print_exc()
            return "# erreur"

    def infinitif (self, bPro, bNeg, bTpsCo, bInt, bFem):

        try:
            if bTpsCo:
                sInfi = self.sVerbAux  if not bPro  else  "être"
            else:
                sInfi = self.sVerb
            if bPro:
                if self.bProWithEn:







>







295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
                sInfo = "# erreur - code : " + self._sRawInfo
            return sGroup + " · " + sInfo
        except:
            traceback.print_exc()
            return "# erreur"

    def infinitif (self, bPro, bNeg, bTpsCo, bInt, bFem):
        "returns string (conjugaison à l’infinitif)"
        try:
            if bTpsCo:
                sInfi = self.sVerbAux  if not bPro  else  "être"
            else:
                sInfi = self.sVerb
            if bPro:
                if self.bProWithEn:
309
310
311
312
313
314
315

316
317
318
319
320
321
322

323
324
325
326
327
328
329
                sInfi += " … ?"
            return sInfi
        except:
            traceback.print_exc()
            return "# erreur"

    def participePasse (self, sWho):

        try:
            return self.dConj[":Q"][sWho]
        except:
            traceback.print_exc()
            return "# erreur"

    def participePresent (self, bPro, bNeg, bTpsCo, bInt, bFem):

        try:
            if not self.dConj[":P"][":"]:
                return ""
            if bTpsCo:
                sPartPre = _getConjWithTags(self.sVerbAux, self._tTagsAux, ":PQ", ":P")  if not bPro  else  getConj("être", ":PQ", ":P")
            else:
                sPartPre = self.dConj[":P"][":"]







>







>







318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
                sInfi += " … ?"
            return sInfi
        except:
            traceback.print_exc()
            return "# erreur"

    def participePasse (self, sWho):
        "returns past participle according to <sWho>"
        try:
            return self.dConj[":Q"][sWho]
        except:
            traceback.print_exc()
            return "# erreur"

    def participePresent (self, bPro, bNeg, bTpsCo, bInt, bFem):
        "returns string (conjugaison du participe présent)"
        try:
            if not self.dConj[":P"][":"]:
                return ""
            if bTpsCo:
                sPartPre = _getConjWithTags(self.sVerbAux, self._tTagsAux, ":PQ", ":P")  if not bPro  else  getConj("être", ":PQ", ":P")
            else:
                sPartPre = self.dConj[":P"][":"]
346
347
348
349
350
351
352

353
354
355
356
357
358
359
                sPartPre += " … ?"
            return sPartPre
        except:
            traceback.print_exc()
            return "# erreur"

    def conjugue (self, sTemps, sWho, bPro, bNeg, bTpsCo, bInt, bFem):

        try:
            if not self.dConj[sTemps][sWho]:
                return ""
            if not bTpsCo and bInt and sWho == ":1s" and self.dConj[sTemps].get(":1ś", False):
                sWho = ":1ś"
            if bTpsCo:
                sConj = _getConjWithTags(self.sVerbAux, self._tTagsAux, sTemps, sWho)  if not bPro  else  getConj("être", sTemps, sWho)







>







357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
                sPartPre += " … ?"
            return sPartPre
        except:
            traceback.print_exc()
            return "# erreur"

    def conjugue (self, sTemps, sWho, bPro, bNeg, bTpsCo, bInt, bFem):
        "returns string (conjugue le verbe au temps <sTemps> pour <sWho>) "
        try:
            if not self.dConj[sTemps][sWho]:
                return ""
            if not bTpsCo and bInt and sWho == ":1s" and self.dConj[sTemps].get(":1ś", False):
                sWho = ":1ś"
            if bTpsCo:
                sConj = _getConjWithTags(self.sVerbAux, self._tTagsAux, sTemps, sWho)  if not bPro  else  getConj("être", sTemps, sWho)
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406

407
408
409
410
411
412
413
                else:
                    sConj = _dProObjEl[sWho] + "en " + sConj
            if bNeg:
                sConj = "n’" + sConj  if bEli and not bPro  else  "ne " + sConj
            if bInt:
                if sWho == ":3s" and not _zNeedTeuph.search(sConj):
                    sConj += "-t"
                sConj += "-" + self._getPronom(sWho, bFem)
            else:
                if sWho == ":1s" and bEli and not bNeg and not bPro:
                    sConj = "j’" + sConj
                else:
                    sConj = self._getPronom(sWho, bFem) + " " + sConj
            if bNeg:
                sConj += " pas"
            if bTpsCo:
                sConj += " " + self._seekPpas(bPro, bFem, sWho.endswith("p") or self._sRawInfo[5] == "r")
            if bInt:
                sConj += " … ?"
            return sConj
        except:
            traceback.print_exc()
            return "# erreur"

    def _getPronom (self, sWho, bFem):
        try:
            if sWho == ":3s":
                if self._sRawInfo[5] == "r":
                    return "on"
                elif bFem:
                    return "elle"
            elif sWho == ":3p" and bFem:
                return "elles"
            return _dProSuj[sWho]
        except:
            traceback.print_exc()
            return "# erreur"

    def imperatif (self, sWho, bPro, bNeg, bTpsCo, bFem):

        try:
            if not self.dConj[":E"][sWho]:
                return ""
            if bTpsCo:
                sImpe = _getConjWithTags(self.sVerbAux, self._tTagsAux, ":E", sWho)  if not bPro  else  getConj(u"être", ":E", sWho)
            else:
                sImpe = self.dConj[":E"][sWho]







|




|











|














>







380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
                else:
                    sConj = _dProObjEl[sWho] + "en " + sConj
            if bNeg:
                sConj = "n’" + sConj  if bEli and not bPro  else  "ne " + sConj
            if bInt:
                if sWho == ":3s" and not _zNeedTeuph.search(sConj):
                    sConj += "-t"
                sConj += "-" + self._getPronomSujet(sWho, bFem)
            else:
                if sWho == ":1s" and bEli and not bNeg and not bPro:
                    sConj = "j’" + sConj
                else:
                    sConj = self._getPronomSujet(sWho, bFem) + " " + sConj
            if bNeg:
                sConj += " pas"
            if bTpsCo:
                sConj += " " + self._seekPpas(bPro, bFem, sWho.endswith("p") or self._sRawInfo[5] == "r")
            if bInt:
                sConj += " … ?"
            return sConj
        except:
            traceback.print_exc()
            return "# erreur"

    def _getPronomSujet (self, sWho, bFem):
        try:
            if sWho == ":3s":
                if self._sRawInfo[5] == "r":
                    return "on"
                elif bFem:
                    return "elle"
            elif sWho == ":3p" and bFem:
                return "elles"
            return _dProSuj[sWho]
        except:
            traceback.print_exc()
            return "# erreur"

    def imperatif (self, sWho, bPro, bNeg, bTpsCo, bFem):
        "returns string (conjugaison à l’impératif)"
        try:
            if not self.dConj[":E"][sWho]:
                return ""
            if bTpsCo:
                sImpe = _getConjWithTags(self.sVerbAux, self._tTagsAux, ":E", sWho)  if not bPro  else  getConj(u"être", ":E", sWho)
            else:
                sImpe = self.dConj[":E"][sWho]

Modified gc_lang/fr/modules/conj_generator.py from [2e696a65e3] to [ee0a228497].


1
2

3
4
5
6
7

8
9
10
11
12
13
14
15
16
17

18
19
20
21
22
23
24
25
26
27
28
29
30

31
32
33
34
35
36
37

# Conjugation generator
# beta stage, unfinished, the root for a new way to generate flexions…


import re


def conjugate (sVerb, sVerbTag="i_____a", bVarPpas=True):

    lConj = []
    cGroup = getVerbGroupChar(sVerb)
    for nCut, sAdd, sFlexTags, sPattern in getConjRules(sVerb, bVarPpas):
        if not sPattern or re.search(sPattern, sVerb):
            sFlexion = sVerb[0:-nCut] + sAdd  if nCut  else sVerb + sAdd
            lConj.append((sFlexion, ":V" + cGroup + "_" + sVerbTag + sFlexTags))
    return lConj


def getVerbGroupChar (sVerb, ):

    sVerb = sVerb.lower()
    if sVerb.endswith("er"):
        return "1"
    if sVerb.endswith("ir"):
        return "2"
    if sVerb == "être" or sVerb == "avoir":
        return "0"
    if sVerb.endswith("re"):
        return "3"
    return "4"


def getConjRules (sVerb, bVarPpas=True, nGroup=2):

    if sVerb.endswith("er"):
        # premier groupe, conjugaison en fonction de la terminaison du lemme
        # 5 lettres
        if sVerb[-5:] in oConj["V1"]:
            lConj = list(oConj["V1"][sVerb[-5:]])
        # 4 lettres
        elif sVerb[-4:] in oConj["V1"]:
>
|
|
>





>









|
>













>







1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
"""
Conjugation generator
beta stage, unfinished, the root for a new way to generate flexions…
"""

import re


def conjugate (sVerb, sVerbTag="i_____a", bVarPpas=True):
    "conjugate <sVerb> and returns a list of tuples (conjugation form, tags)"
    lConj = []
    cGroup = getVerbGroupChar(sVerb)
    for nCut, sAdd, sFlexTags, sPattern in getConjRules(sVerb, bVarPpas):
        if not sPattern or re.search(sPattern, sVerb):
            sFlexion = sVerb[0:-nCut] + sAdd  if nCut  else sVerb + sAdd
            lConj.append((sFlexion, ":V" + cGroup + "_" + sVerbTag + sFlexTags))
    return lConj


def getVerbGroupChar (sVerb):
    "returns the group number of <sVerb> guessing on its ending"
    sVerb = sVerb.lower()
    if sVerb.endswith("er"):
        return "1"
    if sVerb.endswith("ir"):
        return "2"
    if sVerb == "être" or sVerb == "avoir":
        return "0"
    if sVerb.endswith("re"):
        return "3"
    return "4"


def getConjRules (sVerb, bVarPpas=True, nGroup=2):
    "returns a list of lists to conjugate a verb, guessing on its ending"
    if sVerb.endswith("er"):
        # premier groupe, conjugaison en fonction de la terminaison du lemme
        # 5 lettres
        if sVerb[-5:] in oConj["V1"]:
            lConj = list(oConj["V1"][sVerb[-5:]])
        # 4 lettres
        elif sVerb[-4:] in oConj["V1"]:
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
        [2,     "isses",        ":Sp:Sq:2s/*",      False],
        [2,     "isse",         ":Sp:3s/*",         False],
        [2,     "ît",           ":Sq:3s/*",         False],
        [2,     "is",           ":E:2s/*",          False],
        [2,     "issons",       ":E:1p/*",          False],
        [2,     "issez",        ":E:2p/*",          False]
    ],
    
    # premier groupe (bien plus irrégulier que prétendu)
    "V1": {
        # a
        # verbes en -er, -ger, -yer, -cer
        "er": [
            [2,      "er",        ":Y/*",               False],
            [2,      "ant",       ":P/*",               False],







|







118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
        [2,     "isses",        ":Sp:Sq:2s/*",      False],
        [2,     "isse",         ":Sp:3s/*",         False],
        [2,     "ît",           ":Sq:3s/*",         False],
        [2,     "is",           ":E:2s/*",          False],
        [2,     "issons",       ":E:1p/*",          False],
        [2,     "issez",        ":E:2p/*",          False]
    ],

    # premier groupe (bien plus irrégulier que prétendu)
    "V1": {
        # a
        # verbes en -er, -ger, -yer, -cer
        "er": [
            [2,      "er",        ":Y/*",               False],
            [2,      "ant",       ":P/*",               False],

Modified gc_lang/fr/modules/cregex.py from [a0df0d1397] to [4b9e99ff72].


1

2
3
4
5
6
7
8
9
10
11
12
13

# Grammalecte - Compiled regular expressions


import re

#### Lemme
Lemma = re.compile("^>(\w[\w-]*)")

#### Analyses
Gender = re.compile(":[mfe]")
Number = re.compile(":[spi]")

#### Nom et adjectif
NA = re.compile(":[NA]")
>
|
>




|







1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
"""
Grammalecte - Compiled regular expressions
"""

import re

#### Lemme
Lemma = re.compile(r"^>(\w[\w-]*)")

#### Analyses
Gender = re.compile(":[mfe]")
Number = re.compile(":[spi]")

#### Nom et adjectif
NA = re.compile(":[NA]")
76
77
78
79
80
81
82

83
84
85

86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101

102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133

134
135
136

137
138
139

140
141
142

143
144
145

146
147
148

149
150
151

152
153
154

155
156
157

158
159
160

161
162
163

164
165
166
167
168
169

170
171
172

173
174
175

176
177
178

179
180
181

182
183
184
185
186
187
188
189
190

191
192
193

194
195
196

197
198
199

200
201
202

203
204
205

206
207
208

209
210
211

212
213
214

215
216
217

218
219
220

221
222
223

224
225
226
NPf = re.compile(":(?:M[12P]|T):f")
NPe = re.compile(":(?:M[12P]|T):e")


#### FONCTIONS

def getLemmaOfMorph (s):

    return Lemma.search(s).group(1)

def checkAgreement (l1, l2):

    # check number agreement
    if not mbInv(l1) and not mbInv(l2):
        if mbSg(l1) and not mbSg(l2):
            return False
        if mbPl(l1) and not mbPl(l2):
            return False
    # check gender agreement
    if mbEpi(l1) or mbEpi(l2):
        return True
    if mbMas(l1) and not mbMas(l2):
        return False
    if mbFem(l1) and not mbFem(l2):
        return False
    return True

def checkConjVerb (lMorph, sReqConj):

    return any(sReqConj in s  for s in lMorph)

def getGender (lMorph):
    "returns gender of word (':m', ':f', ':e' or empty string)."
    sGender = ""
    for sMorph in lMorph:
        m = Gender.search(sMorph)
        if m:
            if not sGender:
                sGender = m.group(0)
            elif sGender != m.group(0):
                return ":e"
    return sGender

def getNumber (lMorph):
    "returns number of word (':s', ':p', ':i' or empty string)."
    sNumber = ""
    for sMorph in lMorph:
        m = Number.search(sWord)
        if m:
            if not sNumber:
                sNumber = m.group(0)
            elif sNumber != m.group(0):
                return ":i"
    return sNumber

# NOTE :  isWhat (lMorph)    returns True   if lMorph contains nothing else than What
#         mbWhat (lMorph)    returns True   if lMorph contains What at least once

## isXXX = it’s certain

def isNom (lMorph):

    return all(":N" in s  for s in lMorph)

def isNomNotAdj (lMorph):

    return all(NnotA.search(s)  for s in lMorph)

def isAdj (lMorph):

    return all(":A" in s  for s in lMorph)

def isNomAdj (lMorph):

    return all(NA.search(s)  for s in lMorph)

def isNomVconj (lMorph):

    return all(NVconj.search(s)  for s in lMorph)

def isInv (lMorph):

    return all(":i" in s  for s in lMorph)

def isSg (lMorph):

    return all(":s" in s  for s in lMorph)

def isPl (lMorph):

    return all(":p" in s  for s in lMorph)

def isEpi (lMorph):

    return all(":e" in s  for s in lMorph)

def isMas (lMorph):

    return all(":m" in s  for s in lMorph)

def isFem (lMorph):

    return all(":f" in s  for s in lMorph)


## mbXXX = MAYBE XXX

def mbNom (lMorph):

    return any(":N" in s  for s in lMorph)

def mbAdj (lMorph):

    return any(":A" in s  for s in lMorph)

def mbAdjNb (lMorph):

    return any(AD.search(s)  for s in lMorph)

def mbNomAdj (lMorph):

    return any(NA.search(s)  for s in lMorph)

def mbNomNotAdj (lMorph):

    b = False
    for s in lMorph:
        if ":A" in s:
            return False
        if ":N" in s:
            b = True
    return b

def mbPpasNomNotAdj (lMorph):

    return any(PNnotA.search(s)  for s in lMorph)

def mbVconj (lMorph):

    return any(Vconj.search(s)  for s in lMorph)

def mbVconj123 (lMorph):

    return any(Vconj123.search(s)  for s in lMorph)

def mbMG (lMorph):

    return any(":G" in s  for s in lMorph)

def mbInv (lMorph):

    return any(":i" in s  for s in lMorph)

def mbSg (lMorph):

    return any(":s" in s  for s in lMorph)

def mbPl (lMorph):

    return any(":p" in s  for s in lMorph)

def mbEpi (lMorph):

    return any(":e" in s  for s in lMorph)

def mbMas (lMorph):

    return any(":m" in s  for s in lMorph)

def mbFem (lMorph):

    return any(":f" in s  for s in lMorph)

def mbNpr (lMorph):

    return any(NP.search(s)  for s in lMorph)

def mbNprMasNotFem (lMorph):

    if any(NPf.search(s)  for s in lMorph):
        return False
    return any(NPm.search(s)  for s in lMorph)







>



>
















>


















|













>



>



>



>



>



>



>



>



>



>



>






>



>



>



>



>
|




|
|


>



>



>



>



>



>



>



>



>



>



>



>



78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
NPf = re.compile(":(?:M[12P]|T):f")
NPe = re.compile(":(?:M[12P]|T):e")


#### FONCTIONS

def getLemmaOfMorph (s):
    "return lemma in morphology <s>"
    return Lemma.search(s).group(1)

def checkAgreement (l1, l2):
    "returns True if agreement in gender and number is possible between morphologies <l1> and <l2>"
    # check number agreement
    if not mbInv(l1) and not mbInv(l2):
        if mbSg(l1) and not mbSg(l2):
            return False
        if mbPl(l1) and not mbPl(l2):
            return False
    # check gender agreement
    if mbEpi(l1) or mbEpi(l2):
        return True
    if mbMas(l1) and not mbMas(l2):
        return False
    if mbFem(l1) and not mbFem(l2):
        return False
    return True

def checkConjVerb (lMorph, sReqConj):
    "returns True if <sReqConj> in <lMorph>"
    return any(sReqConj in s  for s in lMorph)

def getGender (lMorph):
    "returns gender of word (':m', ':f', ':e' or empty string)."
    sGender = ""
    for sMorph in lMorph:
        m = Gender.search(sMorph)
        if m:
            if not sGender:
                sGender = m.group(0)
            elif sGender != m.group(0):
                return ":e"
    return sGender

def getNumber (lMorph):
    "returns number of word (':s', ':p', ':i' or empty string)."
    sNumber = ""
    for sMorph in lMorph:
        m = Number.search(sMorph)
        if m:
            if not sNumber:
                sNumber = m.group(0)
            elif sNumber != m.group(0):
                return ":i"
    return sNumber

# NOTE :  isWhat (lMorph)    returns True   if lMorph contains nothing else than What
#         mbWhat (lMorph)    returns True   if lMorph contains What at least once

## isXXX = it’s certain

def isNom (lMorph):
    "returns True if all morphologies are “nom”"
    return all(":N" in s  for s in lMorph)

def isNomNotAdj (lMorph):
    "returns True if all morphologies are “nom”, but not “adjectif”"
    return all(NnotA.search(s)  for s in lMorph)

def isAdj (lMorph):
    "returns True if all morphologies are “adjectif”"
    return all(":A" in s  for s in lMorph)

def isNomAdj (lMorph):
    "returns True if all morphologies are “nom” or “adjectif”"
    return all(NA.search(s)  for s in lMorph)

def isNomVconj (lMorph):
    "returns True if all morphologies are “nom” or “verbe conjugué”"
    return all(NVconj.search(s)  for s in lMorph)

def isInv (lMorph):
    "returns True if all morphologies are “invariable”"
    return all(":i" in s  for s in lMorph)

def isSg (lMorph):
    "returns True if all morphologies are “singulier”"
    return all(":s" in s  for s in lMorph)

def isPl (lMorph):
    "returns True if all morphologies are “pluriel”"
    return all(":p" in s  for s in lMorph)

def isEpi (lMorph):
    "returns True if all morphologies are “épicène”"
    return all(":e" in s  for s in lMorph)

def isMas (lMorph):
    "returns True if all morphologies are “masculin”"
    return all(":m" in s  for s in lMorph)

def isFem (lMorph):
    "returns True if all morphologies are “féminin”"
    return all(":f" in s  for s in lMorph)


## mbXXX = MAYBE XXX

def mbNom (lMorph):
    "returns True if one morphology is “nom”"
    return any(":N" in s  for s in lMorph)

def mbAdj (lMorph):
    "returns True if one morphology is “adjectif”"
    return any(":A" in s  for s in lMorph)

def mbAdjNb (lMorph):
    "returns True if one morphology is “adjectif” or “nombre”"
    return any(AD.search(s)  for s in lMorph)

def mbNomAdj (lMorph):
    "returns True if one morphology is “nom” or “adjectif”"
    return any(NA.search(s)  for s in lMorph)

def mbNomNotAdj (lMorph):
    "returns True if one morphology is “nom”, but not “adjectif”"
    bResult = False
    for s in lMorph:
        if ":A" in s:
            return False
        if ":N" in s:
            bResult = True
    return bResult

def mbPpasNomNotAdj (lMorph):
    "returns True if one morphology is “nom” or “participe passé”, but not “adjectif”"
    return any(PNnotA.search(s)  for s in lMorph)

def mbVconj (lMorph):
    "returns True if one morphology is “nom” or “verbe conjugué”"
    return any(Vconj.search(s)  for s in lMorph)

def mbVconj123 (lMorph):
    "returns True if one morphology is “nom” or “verbe conjugué” (but not “avoir” or “être”)"
    return any(Vconj123.search(s)  for s in lMorph)

def mbMG (lMorph):
    "returns True if one morphology is “mot grammatical”"
    return any(":G" in s  for s in lMorph)

def mbInv (lMorph):
    "returns True if one morphology is “invariable”"
    return any(":i" in s  for s in lMorph)

def mbSg (lMorph):
    "returns True if one morphology is “singulier”"
    return any(":s" in s  for s in lMorph)

def mbPl (lMorph):
    "returns True if one morphology is “pluriel”"
    return any(":p" in s  for s in lMorph)

def mbEpi (lMorph):
    "returns True if one morphology is “épicène”"
    return any(":e" in s  for s in lMorph)

def mbMas (lMorph):
    "returns True if one morphology is “masculin”"
    return any(":m" in s  for s in lMorph)

def mbFem (lMorph):
    "returns True if one morphology is “féminin”"
    return any(":f" in s  for s in lMorph)

def mbNpr (lMorph):
    "returns True if one morphology is “nom propre” or “titre de civilité”"
    return any(NP.search(s)  for s in lMorph)

def mbNprMasNotFem (lMorph):
    "returns True if one morphology is “nom propre masculin” but not “féminin”"
    if any(NPf.search(s)  for s in lMorph):
        return False
    return any(NPm.search(s)  for s in lMorph)

Modified gc_lang/fr/modules/gce_analyseur.py from [39975de0ac] to [b96ac7c29b].

1
2
3
4
5







6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100

101
102
103
104
105
106
107
108
109
110
111
112
113
114

115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130

131
132
133
134
135
136
137
138
139
140
141
142

143
144
145
146
147
148
149
150
151
152
153
154
155
156
#### GRAMMAR CHECKING ENGINE PLUGIN: Parsing functions for French language

from . import cregex as cr









def rewriteSubject (s1, s2):
    # s1 is supposed to be prn/patr/npr (M[12P])
    if s2 == "lui":
        return "ils"
    if s2 == "moi":
        return "nous"
    if s2 == "toi":
        return "vous"
    if s2 == "nous":
        return "nous"
    if s2 == "vous":
        return "vous"
    if s2 == "eux":
        return "ils"
    if s2 == "elle" or s2 == "elles":
        # We don’t check if word exists in _dAnalyses, for it is assumed it has been done before
        if cr.mbNprMasNotFem(_dAnalyses.get(s1, False)):
            return "ils"
        # si épicène, indéterminable, mais OSEF, le féminin l’emporte
        return "elles"
    return s1 + " et " + s2


def apposition (sWord1, sWord2):
    "returns True if nom + nom (no agreement required)"
    # We don’t check if word exists in _dAnalyses, for it is assumed it has been done before
    return cr.mbNomNotAdj(_dAnalyses.get(sWord2, False)) and cr.mbPpasNomNotAdj(_dAnalyses.get(sWord1, False))


def isAmbiguousNAV (sWord):
    "words which are nom|adj and verb are ambiguous (except être and avoir)"
    if sWord not in _dAnalyses and not _storeMorphFromFSA(sWord):
        return False
    if not cr.mbNomAdj(_dAnalyses[sWord]) or sWord == "est":
        return False
    if cr.mbVconj(_dAnalyses[sWord]) and not cr.mbMG(_dAnalyses[sWord]):
        return True
    return False


def isAmbiguousAndWrong (sWord1, sWord2, sReqMorphNA, sReqMorphConj):
    "use it if sWord1 won’t be a verb; word2 is assumed to be True via isAmbiguousNAV"
    # We don’t check if word exists in _dAnalyses, for it is assumed it has been done before
    a2 = _dAnalyses.get(sWord2, None)
    if not a2:
        return False
    if cr.checkConjVerb(a2, sReqMorphConj):
        # verb word2 is ok
        return False
    a1 = _dAnalyses.get(sWord1, None)
    if not a1:
        return False
    if cr.checkAgreement(a1, a2) and (cr.mbAdj(a2) or cr.mbAdj(a1)):
        return False
    return True


def isVeryAmbiguousAndWrong (sWord1, sWord2, sReqMorphNA, sReqMorphConj, bLastHopeCond):
    "use it if sWord1 can be also a verb; word2 is assumed to be True via isAmbiguousNAV"
    # We don’t check if word exists in _dAnalyses, for it is assumed it has been done before
    a2 = _dAnalyses.get(sWord2, None)
    if not a2:
        return False
    if cr.checkConjVerb(a2, sReqMorphConj):
        # verb word2 is ok
        return False
    a1 = _dAnalyses.get(sWord1, None)
    if not a1:
        return False
    if cr.checkAgreement(a1, a2) and (cr.mbAdj(a2) or cr.mbAdjNb(a1)):
        return False
    # now, we know there no agreement, and conjugation is also wrong
    if cr.isNomAdj(a1):
        return True
    #if cr.isNomAdjVerb(a1): # considered True
    if bLastHopeCond:
        return True
    return False


def checkAgreement (sWord1, sWord2):
    # We don’t check if word exists in _dAnalyses, for it is assumed it has been done before
    a2 = _dAnalyses.get(sWord2, None)
    if not a2:
        return True
    a1 = _dAnalyses.get(sWord1, None)
    if not a1:
        return True
    return cr.checkAgreement(a1, a2)


_zUnitSpecial = re.compile("[µ/⁰¹²³⁴⁵⁶⁷⁸⁹Ωℓ·]")
_zUnitNumbers = re.compile("[0-9]")

def mbUnit (s):

    if _zUnitSpecial.search(s):
        return True
    if 1 < len(s) < 16 and s[0:1].islower() and (not s[1:].islower() or _zUnitNumbers.search(s)):
        return True
    return False


#### Syntagmes

_zEndOfNG1 = re.compile(" *$| +(?:, +|)(?:n(?:’|e |o(?:u?s|tre) )|l(?:’|e(?:urs?|s|) |a )|j(?:’|e )|m(?:’|es? |a |on )|t(?:’|es? |a |u )|s(?:’|es? |a )|c(?:’|e(?:t|tte|s|) )|ç(?:a |’)|ils? |vo(?:u?s|tre) )")
_zEndOfNG2 = re.compile(r" +(\w[\w-]+)")
_zEndOfNG3 = re.compile(r" *, +(\w[\w-]+)")

def isEndOfNG (dDA, s, iOffset):

    if _zEndOfNG1.match(s):
        return True
    m = _zEndOfNG2.match(s)
    if m and morphex(dDA, (iOffset+m.start(1), m.group(1)), ":[VR]", ":[NAQP]"):
        return True
    m = _zEndOfNG3.match(s)
    if m and not morph(dDA, (iOffset+m.start(1), m.group(1)), ":[NA]", False):
        return True
    return False


_zNextIsNotCOD1 = re.compile(" *,")
_zNextIsNotCOD2 = re.compile(" +(?:[mtsnj](e +|’)|[nv]ous |tu |ils? |elles? )")
_zNextIsNotCOD3 = re.compile(r" +([a-zéèî][\w-]+)")

def isNextNotCOD (dDA, s, iOffset):

    if _zNextIsNotCOD1.match(s) or _zNextIsNotCOD2.match(s):
        return True
    m = _zNextIsNotCOD3.match(s)
    if m and morphex(dDA, (iOffset+m.start(1), m.group(1)), ":[123][sp]", ":[DM]"):
        return True
    return False


_zNextIsVerb1 = re.compile(" +[nmts](?:e |’)")
_zNextIsVerb2 = re.compile(r" +(\w[\w-]+)")

def isNextVerb (dDA, s, iOffset):

    if _zNextIsVerb1.match(s):
        return True
    m = _zNextIsVerb2.match(s)
    if m and morph(dDA, (iOffset+m.start(1), m.group(1)), ":[123][sp]", False):
        return True
    return False


#### Exceptions

aREGULARPLURAL = frozenset(["abricot", "amarante", "aubergine", "acajou", "anthracite", "brique", "caca", "café", \
                            "carotte", "cerise", "chataigne", "corail", "citron", "crème", "grave", "groseille", \
                            "jonquille", "marron", "olive", "pervenche", "prune", "sable"])
aSHOULDBEVERB = frozenset(["aller", "manger"]) 





>
>
>
>
>
>
>

|













<
|








<
|




|
<
|

|





|
<
|





|








|
<
|





|














|
|


|









>














>
















>












>













|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27

28
29
30
31
32
33
34
35
36

37
38
39
40
41
42

43
44
45
46
47
48
49
50
51

52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67

68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
#### GRAMMAR CHECKING ENGINE PLUGIN: Parsing functions for French language

from . import cregex as cr


def g_morphVC (dToken, sPattern, sNegPattern=""):
    nEnd = dToken["sValue"].rfind("-")
    if "-t-" in dToken["sValue"]:
        nEnd = nEnd - 2
    return g_morph(dToken, sPattern, sNegPattern, 0, nEnd, False)


def rewriteSubject (s1, s2):
    "rewrite complex subject: <s1> a prn/patr/npr (M[12P]) followed by “et” and <s2>"
    if s2 == "lui":
        return "ils"
    if s2 == "moi":
        return "nous"
    if s2 == "toi":
        return "vous"
    if s2 == "nous":
        return "nous"
    if s2 == "vous":
        return "vous"
    if s2 == "eux":
        return "ils"
    if s2 == "elle" or s2 == "elles":

        if cr.mbNprMasNotFem(_oSpellChecker.getMorph(s1)):
            return "ils"
        # si épicène, indéterminable, mais OSEF, le féminin l’emporte
        return "elles"
    return s1 + " et " + s2


def apposition (sWord1, sWord2):
    "returns True if nom + nom (no agreement required)"

    return cr.mbNomNotAdj(_oSpellChecker.getMorph(sWord2)) and cr.mbPpasNomNotAdj(_oSpellChecker.getMorph(sWord1))


def isAmbiguousNAV (sWord):
    "words which are nom|adj and verb are ambiguous (except être and avoir)"
    lMorph = _oSpellChecker.getMorph(sWord)

    if not cr.mbNomAdj(lMorph) or sWord == "est":
        return False
    if cr.mbVconj(lMorph) and not cr.mbMG(lMorph):
        return True
    return False


def isAmbiguousAndWrong (sWord1, sWord2, sReqMorphNA, sReqMorphConj):
    "use it if <sWord1> won’t be a verb; <sWord2> is assumed to be True via isAmbiguousNAV"

    a2 = _oSpellChecker.getMorph(sWord2)
    if not a2:
        return False
    if cr.checkConjVerb(a2, sReqMorphConj):
        # verb word2 is ok
        return False
    a1 = _oSpellChecker.getMorph(sWord1)
    if not a1:
        return False
    if cr.checkAgreement(a1, a2) and (cr.mbAdj(a2) or cr.mbAdj(a1)):
        return False
    return True


def isVeryAmbiguousAndWrong (sWord1, sWord2, sReqMorphNA, sReqMorphConj, bLastHopeCond):
    "use it if <sWord1> can be also a verb; <sWord2> is assumed to be True via isAmbiguousNAV"

    a2 = _oSpellChecker.getMorph(sWord2)
    if not a2:
        return False
    if cr.checkConjVerb(a2, sReqMorphConj):
        # verb word2 is ok
        return False
    a1 = _oSpellChecker.getMorph(sWord1)
    if not a1:
        return False
    if cr.checkAgreement(a1, a2) and (cr.mbAdj(a2) or cr.mbAdjNb(a1)):
        return False
    # now, we know there no agreement, and conjugation is also wrong
    if cr.isNomAdj(a1):
        return True
    #if cr.isNomAdjVerb(a1): # considered True
    if bLastHopeCond:
        return True
    return False


def checkAgreement (sWord1, sWord2):
    "check agreement between <sWord1> and <sWord1>"
    a2 = _oSpellChecker.getMorph(sWord2)
    if not a2:
        return True
    a1 = _oSpellChecker.getMorph(sWord1)
    if not a1:
        return True
    return cr.checkAgreement(a1, a2)


_zUnitSpecial = re.compile("[µ/⁰¹²³⁴⁵⁶⁷⁸⁹Ωℓ·]")
_zUnitNumbers = re.compile("[0-9]")

def mbUnit (s):
    "returns True it can be a measurement unit"
    if _zUnitSpecial.search(s):
        return True
    if 1 < len(s) < 16 and s[0:1].islower() and (not s[1:].islower() or _zUnitNumbers.search(s)):
        return True
    return False


#### Syntagmes

_zEndOfNG1 = re.compile(" *$| +(?:, +|)(?:n(?:’|e |o(?:u?s|tre) )|l(?:’|e(?:urs?|s|) |a )|j(?:’|e )|m(?:’|es? |a |on )|t(?:’|es? |a |u )|s(?:’|es? |a )|c(?:’|e(?:t|tte|s|) )|ç(?:a |’)|ils? |vo(?:u?s|tre) )")
_zEndOfNG2 = re.compile(r" +(\w[\w-]+)")
_zEndOfNG3 = re.compile(r" *, +(\w[\w-]+)")

def isEndOfNG (dDA, s, iOffset):
    "returns True if next word doesn’t belong to a noun group"
    if _zEndOfNG1.match(s):
        return True
    m = _zEndOfNG2.match(s)
    if m and morphex(dDA, (iOffset+m.start(1), m.group(1)), ":[VR]", ":[NAQP]"):
        return True
    m = _zEndOfNG3.match(s)
    if m and not morph(dDA, (iOffset+m.start(1), m.group(1)), ":[NA]", False):
        return True
    return False


_zNextIsNotCOD1 = re.compile(" *,")
_zNextIsNotCOD2 = re.compile(" +(?:[mtsnj](e +|’)|[nv]ous |tu |ils? |elles? )")
_zNextIsNotCOD3 = re.compile(r" +([a-zéèî][\w-]+)")

def isNextNotCOD (dDA, s, iOffset):
    "returns True if next word is not a COD"
    if _zNextIsNotCOD1.match(s) or _zNextIsNotCOD2.match(s):
        return True
    m = _zNextIsNotCOD3.match(s)
    if m and morphex(dDA, (iOffset+m.start(1), m.group(1)), ":[123][sp]", ":[DM]"):
        return True
    return False


_zNextIsVerb1 = re.compile(" +[nmts](?:e |’)")
_zNextIsVerb2 = re.compile(r" +(\w[\w-]+)")

def isNextVerb (dDA, s, iOffset):
    "returns True if next word is a verb"
    if _zNextIsVerb1.match(s):
        return True
    m = _zNextIsVerb2.match(s)
    if m and morph(dDA, (iOffset+m.start(1), m.group(1)), ":[123][sp]", False):
        return True
    return False


#### Exceptions

aREGULARPLURAL = frozenset(["abricot", "amarante", "aubergine", "acajou", "anthracite", "brique", "caca", "café", \
                            "carotte", "cerise", "chataigne", "corail", "citron", "crème", "grave", "groseille", \
                            "jonquille", "marron", "olive", "pervenche", "prune", "sable"])
aSHOULDBEVERB = frozenset(["aller", "manger"])

Modified gc_lang/fr/modules/gce_date_verif.py from [1265100649] to [c3f0a9eb2f].

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37

38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
#### GRAMMAR CHECKING ENGINE PLUGIN

#### Check date validity

_lDay = ["lundi", "mardi", "mercredi", "jeudi", "vendredi", "samedi", "dimanche"]
_dMonth = { "janvier":1, "février":2, "mars":3, "avril":4, "mai":5, "juin":6, "juillet":7, "août":8, "aout":8, "septembre":9, "octobre":10, "novembre":11, "décembre":12 }

import datetime


def checkDate (day, month, year):
    "to use if month is a number"
    try:
        return datetime.date(int(year), int(month), int(day))
    except ValueError:
        return False
    except:
        return True


def checkDateWithString (day, month, year):
    "to use if month is a noun"
    try:
        return datetime.date(int(year), _dMonth.get(month.lower(), ""), int(day))
    except ValueError:
        return False
    except:
        return True


def checkDay (weekday, day, month, year):
    "to use if month is a number"
    oDate = checkDate(day, month, year)
    if oDate and _lDay[oDate.weekday()] != weekday.lower():
        return False
    return True


def checkDayWithString (weekday, day, month, year):
    "to use if month is a noun"
    oDate = checkDate(day, _dMonth.get(month, ""), year)
    if oDate and _lDay[oDate.weekday()] != weekday.lower():
        return False
    return True


def getDay (day, month, year):
    "to use if month is a number"
    return _lDay[datetime.date(int(year), int(month), int(day)).weekday()]


def getDayWithString (day, month, year):
    "to use if month is a noun"
    return _lDay[datetime.date(int(year), _dMonth.get(month.lower(), ""), int(day)).weekday()]










|
|

|






|
|

|






|
|
|
|



>
|
|
|
|




|
|
|


|
|
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
#### GRAMMAR CHECKING ENGINE PLUGIN

#### Check date validity

_lDay = ["lundi", "mardi", "mercredi", "jeudi", "vendredi", "samedi", "dimanche"]
_dMonth = { "janvier":1, "février":2, "mars":3, "avril":4, "mai":5, "juin":6, "juillet":7, "août":8, "aout":8, "septembre":9, "octobre":10, "novembre":11, "décembre":12 }

import datetime


def checkDate (sDay, sMonth, sYear):
    "to use if <sMonth> is a number"
    try:
        return datetime.date(int(sYear), int(sMonth), int(sDay))
    except ValueError:
        return False
    except:
        return True


def checkDateWithString (sDay, sMonth, sYear):
    "to use if <sMonth> is a noun"
    try:
        return datetime.date(int(sYear), _dMonth.get(sMonth.lower(), ""), int(sDay))
    except ValueError:
        return False
    except:
        return True


def checkDay (sWeekday, sDay, sMonth, sYear):
    "to use if <sMonth> is a number"
    oDate = checkDate(sDay, sMonth, sYear)
    if oDate and _lDay[oDate.weekday()] != sWeekday.lower():
        return False
    return True


def checkDayWithString (sWeekday, sDay, sMonth, sYear):
    "to use if <sMonth> is a noun"
    oDate = checkDate(sDay, _dMonth.get(sMonth, ""), sYear)
    if oDate and _lDay[oDate.weekday()] != sWeekday.lower():
        return False
    return True


def getDay (sDay, sMonth, sYear):
    "to use if <sMonth> is a number"
    return _lDay[datetime.date(int(sYear), int(sMonth), int(sDay)).weekday()]


def getDayWithString (sDay, sMonth, sYear):
    "to use if <sMonth> is a noun"
    return _lDay[datetime.date(int(sYear), _dMonth.get(sMonth.lower(), ""), int(sDay)).weekday()]

Modified gc_lang/fr/modules/gce_suggestions.py from [79835965e4] to [2926468975].

1
2
3
4
5
6
7
8
9
10

11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
#### GRAMMAR CHECKING ENGINE PLUGIN: Suggestion mechanisms

from . import conj
from . import mfsp
from . import phonet


## Verbs

def suggVerb (sFlex, sWho, funcSugg2=None):

    aSugg = set()
    for sStem in stem(sFlex):
        tTags = conj._getTags(sStem)
        if tTags:
            # we get the tense
            aTense = set()
            for sMorph in _dAnalyses.get(sFlex, []): # we don’t check if word exists in _dAnalyses, for it is assumed it has been done before
                for m in re.finditer(">"+sStem+" .*?(:(?:Y|I[pqsf]|S[pq]|K|P))", sMorph):
                    # stem must be used in regex to prevent confusion between different verbs (e.g. sauras has 2 stems: savoir and saurer)
                    if m:
                        if m.group(1) == ":Y":
                            aTense.add(":Ip")
                            aTense.add(":Iq")
                            aTense.add(":Is")
                        elif m.group(1) == ":P":










>

|




|
|







1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
#### GRAMMAR CHECKING ENGINE PLUGIN: Suggestion mechanisms

from . import conj
from . import mfsp
from . import phonet


## Verbs

def suggVerb (sFlex, sWho, funcSugg2=None):
    "change <sFlex> conjugation according to <sWho>"
    aSugg = set()
    for sStem in _oSpellChecker.getLemma(sFlex):
        tTags = conj._getTags(sStem)
        if tTags:
            # we get the tense
            aTense = set()
            for sMorph in _oSpellChecker.getMorph(sFlex):
                for m in re.finditer(">"+sStem+"/.*?(:(?:Y|I[pqsf]|S[pq]|K|P))", sMorph):
                    # stem must be used in regex to prevent confusion between different verbs (e.g. sauras has 2 stems: savoir and saurer)
                    if m:
                        if m.group(1) == ":Y":
                            aTense.add(":Ip")
                            aTense.add(":Iq")
                            aTense.add(":Is")
                        elif m.group(1) == ":P":
36
37
38
39
40
41
42
43

44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86

87
88
89
90
91
92
93
94
95
96

97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112

113
114
115
116
117
118
119
120
121

122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151

152
153
154
155
156
157
158
159
160
161
        if aSugg2:
            aSugg.add(aSugg2)
    if aSugg:
        return "|".join(aSugg)
    return ""


def suggVerbPpas (sFlex, sWhat=None):

    aSugg = set()
    for sStem in stem(sFlex):
        tTags = conj._getTags(sStem)
        if tTags:
            if not sWhat:
                aSugg.add(conj._getConjWithTags(sStem, tTags, ":PQ", ":Q1"))
                aSugg.add(conj._getConjWithTags(sStem, tTags, ":PQ", ":Q2"))
                aSugg.add(conj._getConjWithTags(sStem, tTags, ":PQ", ":Q3"))
                aSugg.add(conj._getConjWithTags(sStem, tTags, ":PQ", ":Q4"))
                aSugg.discard("")
            elif sWhat == ":m:s":
                aSugg.add(conj._getConjWithTags(sStem, tTags, ":PQ", ":Q1"))
            elif sWhat == ":m:p":
                if conj._hasConjWithTags(tTags, ":PQ", ":Q2"):
                    aSugg.add(conj._getConjWithTags(sStem, tTags, ":PQ", ":Q2"))
                else:
                    aSugg.add(conj._getConjWithTags(sStem, tTags, ":PQ", ":Q1"))
            elif sWhat == ":f:s":
                if conj._hasConjWithTags(tTags, ":PQ", ":Q3"):
                    aSugg.add(conj._getConjWithTags(sStem, tTags, ":PQ", ":Q3"))
                else:
                    aSugg.add(conj._getConjWithTags(sStem, tTags, ":PQ", ":Q1"))
            elif sWhat == ":f:p":
                if conj._hasConjWithTags(tTags, ":PQ", ":Q4"):
                    aSugg.add(conj._getConjWithTags(sStem, tTags, ":PQ", ":Q4"))
                else:
                    aSugg.add(conj._getConjWithTags(sStem, tTags, ":PQ", ":Q1"))
            elif sWhat == ":s":
                aSugg.add(conj._getConjWithTags(sStem, tTags, ":PQ", ":Q1"))
                aSugg.add(conj._getConjWithTags(sStem, tTags, ":PQ", ":Q3"))
                aSugg.discard("")
            elif sWhat == ":p":
                aSugg.add(conj._getConjWithTags(sStem, tTags, ":PQ", ":Q2"))
                aSugg.add(conj._getConjWithTags(sStem, tTags, ":PQ", ":Q4"))
                aSugg.discard("")
            else:
                aSugg.add(conj._getConjWithTags(sStem, tTags, ":PQ", ":Q1"))
    if aSugg:
        return "|".join(aSugg)
    return ""


def suggVerbTense (sFlex, sTense, sWho):

    aSugg = set()
    for sStem in stem(sFlex):
        if conj.hasConj(sStem, sTense, sWho):
            aSugg.add(conj.getConj(sStem, sTense, sWho))
    if aSugg:
        return "|".join(aSugg)
    return ""


def suggVerbImpe (sFlex):

    aSugg = set()
    for sStem in stem(sFlex):
        tTags = conj._getTags(sStem)
        if tTags:
            if conj._hasConjWithTags(tTags, ":E", ":2s"):
                aSugg.add(conj._getConjWithTags(sStem, tTags, ":E", ":2s"))
            if conj._hasConjWithTags(tTags, ":E", ":1p"):
                aSugg.add(conj._getConjWithTags(sStem, tTags, ":E", ":1p"))
            if conj._hasConjWithTags(tTags, ":E", ":2p"):
                aSugg.add(conj._getConjWithTags(sStem, tTags, ":E", ":2p"))
    if aSugg:
        return "|".join(aSugg)
    return ""


def suggVerbInfi (sFlex):

    return "|".join([ sStem  for sStem in stem(sFlex)  if conj.isVerb(sStem) ])


_dQuiEst = { "je": ":1s", "j’": ":1s", "j’en": ":1s", "j’y": ":1s", \
             "tu": ":2s", "il": ":3s", "on": ":3s", "elle": ":3s", "nous": ":1p", "vous": ":2p", "ils": ":3p", "elles": ":3p" }
_lIndicatif = [":Ip", ":Iq", ":Is", ":If"]
_lSubjonctif = [":Sp", ":Sq"]

def suggVerbMode (sFlex, cMode, sSuj):

    if cMode == ":I":
        lMode = _lIndicatif
    elif cMode == ":S":
        lMode = _lSubjonctif
    elif cMode.startswith((":I", ":S")):
        lMode = [cMode]
    else:
        return ""
    sWho = _dQuiEst.get(sSuj.lower(), None)
    if not sWho:
        if sSuj[0:1].islower(): # pas un pronom, ni un nom propre
            return ""
        sWho = ":3s"
    aSugg = set()
    for sStem in stem(sFlex):
        tTags = conj._getTags(sStem)
        if tTags:
            for sTense in lMode:
                if conj._hasConjWithTags(tTags, sTense, sWho):
                    aSugg.add(conj._getConjWithTags(sStem, tTags, sTense, sWho))
    if aSugg:
        return "|".join(aSugg)
    return ""


## Nouns and adjectives

def suggPlur (sFlex, sWordToAgree=None):
    "returns plural forms assuming sFlex is singular"
    if sWordToAgree:

        if sWordToAgree not in _dAnalyses and not _storeMorphFromFSA(sWordToAgree):
            return ""
        sGender = cr.getGender(_dAnalyses.get(sWordToAgree, []))
        if sGender == ":m":
            return suggMasPlur(sFlex)
        elif sGender == ":f":
            return suggFemPlur(sFlex)
    aSugg = set()
    if "-" not in sFlex:
        if sFlex.endswith("l"):







|
>

|


|





|

|




|




|




|



|











>

|








>

|














>
|








>














|















>
|

|







37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
        if aSugg2:
            aSugg.add(aSugg2)
    if aSugg:
        return "|".join(aSugg)
    return ""


def suggVerbPpas (sFlex, sPattern=None):
    "suggest past participles for <sFlex>"
    aSugg = set()
    for sStem in _oSpellChecker.getLemma(sFlex):
        tTags = conj._getTags(sStem)
        if tTags:
            if not sPattern:
                aSugg.add(conj._getConjWithTags(sStem, tTags, ":PQ", ":Q1"))
                aSugg.add(conj._getConjWithTags(sStem, tTags, ":PQ", ":Q2"))
                aSugg.add(conj._getConjWithTags(sStem, tTags, ":PQ", ":Q3"))
                aSugg.add(conj._getConjWithTags(sStem, tTags, ":PQ", ":Q4"))
                aSugg.discard("")
            elif sPattern == ":m:s":
                aSugg.add(conj._getConjWithTags(sStem, tTags, ":PQ", ":Q1"))
            elif sPattern == ":m:p":
                if conj._hasConjWithTags(tTags, ":PQ", ":Q2"):
                    aSugg.add(conj._getConjWithTags(sStem, tTags, ":PQ", ":Q2"))
                else:
                    aSugg.add(conj._getConjWithTags(sStem, tTags, ":PQ", ":Q1"))
            elif sPattern == ":f:s":
                if conj._hasConjWithTags(tTags, ":PQ", ":Q3"):
                    aSugg.add(conj._getConjWithTags(sStem, tTags, ":PQ", ":Q3"))
                else:
                    aSugg.add(conj._getConjWithTags(sStem, tTags, ":PQ", ":Q1"))
            elif sPattern == ":f:p":
                if conj._hasConjWithTags(tTags, ":PQ", ":Q4"):
                    aSugg.add(conj._getConjWithTags(sStem, tTags, ":PQ", ":Q4"))
                else:
                    aSugg.add(conj._getConjWithTags(sStem, tTags, ":PQ", ":Q1"))
            elif sPattern == ":s":
                aSugg.add(conj._getConjWithTags(sStem, tTags, ":PQ", ":Q1"))
                aSugg.add(conj._getConjWithTags(sStem, tTags, ":PQ", ":Q3"))
                aSugg.discard("")
            elif sPattern == ":p":
                aSugg.add(conj._getConjWithTags(sStem, tTags, ":PQ", ":Q2"))
                aSugg.add(conj._getConjWithTags(sStem, tTags, ":PQ", ":Q4"))
                aSugg.discard("")
            else:
                aSugg.add(conj._getConjWithTags(sStem, tTags, ":PQ", ":Q1"))
    if aSugg:
        return "|".join(aSugg)
    return ""


def suggVerbTense (sFlex, sTense, sWho):
    "change <sFlex> to a verb according to <sTense> and <sWho>"
    aSugg = set()
    for sStem in _oSpellChecker.getLemma(sFlex):
        if conj.hasConj(sStem, sTense, sWho):
            aSugg.add(conj.getConj(sStem, sTense, sWho))
    if aSugg:
        return "|".join(aSugg)
    return ""


def suggVerbImpe (sFlex):
    "change <sFlex> to a verb at imperative form"
    aSugg = set()
    for sStem in _oSpellChecker.getLemma(sFlex):
        tTags = conj._getTags(sStem)
        if tTags:
            if conj._hasConjWithTags(tTags, ":E", ":2s"):
                aSugg.add(conj._getConjWithTags(sStem, tTags, ":E", ":2s"))
            if conj._hasConjWithTags(tTags, ":E", ":1p"):
                aSugg.add(conj._getConjWithTags(sStem, tTags, ":E", ":1p"))
            if conj._hasConjWithTags(tTags, ":E", ":2p"):
                aSugg.add(conj._getConjWithTags(sStem, tTags, ":E", ":2p"))
    if aSugg:
        return "|".join(aSugg)
    return ""


def suggVerbInfi (sFlex):
    "returns infinitive forms of <sFlex>"
    return "|".join([ sStem  for sStem in _oSpellChecker.getLemma(sFlex)  if conj.isVerb(sStem) ])


_dQuiEst = { "je": ":1s", "j’": ":1s", "j’en": ":1s", "j’y": ":1s", \
             "tu": ":2s", "il": ":3s", "on": ":3s", "elle": ":3s", "nous": ":1p", "vous": ":2p", "ils": ":3p", "elles": ":3p" }
_lIndicatif = [":Ip", ":Iq", ":Is", ":If"]
_lSubjonctif = [":Sp", ":Sq"]

def suggVerbMode (sFlex, cMode, sSuj):
    "returns other conjugations of <sFlex> acconding to <cMode> and <sSuj>"
    if cMode == ":I":
        lMode = _lIndicatif
    elif cMode == ":S":
        lMode = _lSubjonctif
    elif cMode.startswith((":I", ":S")):
        lMode = [cMode]
    else:
        return ""
    sWho = _dQuiEst.get(sSuj.lower(), None)
    if not sWho:
        if sSuj[0:1].islower(): # pas un pronom, ni un nom propre
            return ""
        sWho = ":3s"
    aSugg = set()
    for sStem in _oSpellChecker.getLemma(sFlex):
        tTags = conj._getTags(sStem)
        if tTags:
            for sTense in lMode:
                if conj._hasConjWithTags(tTags, sTense, sWho):
                    aSugg.add(conj._getConjWithTags(sStem, tTags, sTense, sWho))
    if aSugg:
        return "|".join(aSugg)
    return ""


## Nouns and adjectives

def suggPlur (sFlex, sWordToAgree=None):
    "returns plural forms assuming sFlex is singular"
    if sWordToAgree:
        lMorph = _oSpellChecker.getMorph(sFlex)
        if not lMorph:
            return ""
        sGender = cr.getGender(lMorph)
        if sGender == ":m":
            return suggMasPlur(sFlex)
        elif sGender == ":f":
            return suggFemPlur(sFlex)
    aSugg = set()
    if "-" not in sFlex:
        if sFlex.endswith("l"):
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
    if aSugg:
        return "|".join(aSugg)
    return ""


def suggMasSing (sFlex, bSuggSimil=False):
    "returns masculine singular forms"
    # we don’t check if word exists in _dAnalyses, for it is assumed it has been done before
    aSugg = set()
    for sMorph in _dAnalyses.get(sFlex, []):
        if not ":V" in sMorph:
            # not a verb
            if ":m" in sMorph or ":e" in sMorph:
                aSugg.add(suggSing(sFlex))
            else:
                sStem = cr.getLemmaOfMorph(sMorph)
                if mfsp.isFemForm(sStem):







<

|







196
197
198
199
200
201
202

203
204
205
206
207
208
209
210
211
    if aSugg:
        return "|".join(aSugg)
    return ""


def suggMasSing (sFlex, bSuggSimil=False):
    "returns masculine singular forms"

    aSugg = set()
    for sMorph in _oSpellChecker.getMorph(sFlex):
        if not ":V" in sMorph:
            # not a verb
            if ":m" in sMorph or ":e" in sMorph:
                aSugg.add(suggSing(sFlex))
            else:
                sStem = cr.getLemmaOfMorph(sMorph)
                if mfsp.isFemForm(sStem):
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
    if aSugg:
        return "|".join(aSugg)
    return ""


def suggMasPlur (sFlex, bSuggSimil=False):
    "returns masculine plural forms"
    # we don’t check if word exists in _dAnalyses, for it is assumed it has been done before
    aSugg = set()
    for sMorph in _dAnalyses.get(sFlex, []):
        if not ":V" in sMorph:
            # not a verb
            if ":m" in sMorph or ":e" in sMorph:
                aSugg.add(suggPlur(sFlex))
            else:
                sStem = cr.getLemmaOfMorph(sMorph)
                if mfsp.isFemForm(sStem):







<

|







223
224
225
226
227
228
229

230
231
232
233
234
235
236
237
238
    if aSugg:
        return "|".join(aSugg)
    return ""


def suggMasPlur (sFlex, bSuggSimil=False):
    "returns masculine plural forms"

    aSugg = set()
    for sMorph in _oSpellChecker.getMorph(sFlex):
        if not ":V" in sMorph:
            # not a verb
            if ":m" in sMorph or ":e" in sMorph:
                aSugg.add(suggPlur(sFlex))
            else:
                sStem = cr.getLemmaOfMorph(sMorph)
                if mfsp.isFemForm(sStem):
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
    if aSugg:
        return "|".join(aSugg)
    return ""


def suggFemSing (sFlex, bSuggSimil=False):
    "returns feminine singular forms"
    # we don’t check if word exists in _dAnalyses, for it is assumed it has been done before
    aSugg = set()
    for sMorph in _dAnalyses.get(sFlex, []):
        if not ":V" in sMorph:
            # not a verb
            if ":f" in sMorph or ":e" in sMorph:
                aSugg.add(suggSing(sFlex))
            else:
                sStem = cr.getLemmaOfMorph(sMorph)
                if mfsp.isFemForm(sStem):







<

|







253
254
255
256
257
258
259

260
261
262
263
264
265
266
267
268
    if aSugg:
        return "|".join(aSugg)
    return ""


def suggFemSing (sFlex, bSuggSimil=False):
    "returns feminine singular forms"

    aSugg = set()
    for sMorph in _oSpellChecker.getMorph(sFlex):
        if not ":V" in sMorph:
            # not a verb
            if ":f" in sMorph or ":e" in sMorph:
                aSugg.add(suggSing(sFlex))
            else:
                sStem = cr.getLemmaOfMorph(sMorph)
                if mfsp.isFemForm(sStem):
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
    if aSugg:
        return "|".join(aSugg)
    return ""


def suggFemPlur (sFlex, bSuggSimil=False):
    "returns feminine plural forms"
    # we don’t check if word exists in _dAnalyses, for it is assumed it has been done before
    aSugg = set()
    for sMorph in _dAnalyses.get(sFlex, []):
        if not ":V" in sMorph:
            # not a verb
            if ":f" in sMorph or ":e" in sMorph:
                aSugg.add(suggPlur(sFlex))
            else:
                sStem = cr.getLemmaOfMorph(sMorph)
                if mfsp.isFemForm(sStem):







<

|







278
279
280
281
282
283
284

285
286
287
288
289
290
291
292
293
    if aSugg:
        return "|".join(aSugg)
    return ""


def suggFemPlur (sFlex, bSuggSimil=False):
    "returns feminine plural forms"

    aSugg = set()
    for sMorph in _oSpellChecker.getMorph(sFlex):
        if not ":V" in sMorph:
            # not a verb
            if ":f" in sMorph or ":e" in sMorph:
                aSugg.add(suggPlur(sFlex))
            else:
                sStem = cr.getLemmaOfMorph(sMorph)
                if mfsp.isFemForm(sStem):
299
300
301
302
303
304
305

306
307
308
309
310
311
312
313
314

315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372

373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388

389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405

406
407
408
409
410
411
412
            aSugg.add(e)
    if aSugg:
        return "|".join(aSugg)
    return ""


def hasFemForm (sFlex):

    for sStem in stem(sFlex):
        if mfsp.isFemForm(sStem) or conj.hasConj(sStem, ":PQ", ":Q3"):
            return True
    if phonet.hasSimil(sFlex, ":f"):
        return True
    return False


def hasMasForm (sFlex):

    for sStem in stem(sFlex):
        if mfsp.isFemForm(sStem) or conj.hasConj(sStem, ":PQ", ":Q1"):
            # what has a feminine form also has a masculine form
            return True
    if phonet.hasSimil(sFlex, ":m"):
        return True
    return False


def switchGender (sFlex, bPlur=None):
    # we don’t check if word exists in _dAnalyses, for it is assumed it has been done before
    aSugg = set()
    if bPlur == None:
        for sMorph in _dAnalyses.get(sFlex, []):
            if ":f" in sMorph:
                if ":s" in sMorph:
                    aSugg.add(suggMasSing(sFlex))
                elif ":p" in sMorph:
                    aSugg.add(suggMasPlur(sFlex))
            elif ":m" in sMorph:
                if ":s" in sMorph:
                    aSugg.add(suggFemSing(sFlex))
                elif ":p" in sMorph:
                    aSugg.add(suggFemPlur(sFlex))
                else:
                    aSugg.add(suggFemSing(sFlex))
                    aSugg.add(suggFemPlur(sFlex))
    elif bPlur:
        for sMorph in _dAnalyses.get(sFlex, []):
            if ":f" in sMorph:
                aSugg.add(suggMasPlur(sFlex))
            elif ":m" in sMorph:
                aSugg.add(suggFemPlur(sFlex))
    else:
        for sMorph in _dAnalyses.get(sFlex, []):
            if ":f" in sMorph:
                aSugg.add(suggMasSing(sFlex))
            elif ":m" in sMorph:
                aSugg.add(suggFemSing(sFlex))
    if aSugg:
        return "|".join(aSugg)
    return ""


def switchPlural (sFlex):
    # we don’t check if word exists in _dAnalyses, for it is assumed it has been done before
    aSugg = set()
    for sMorph in _dAnalyses.get(sFlex, []):
        if ":s" in sMorph:
            aSugg.add(suggPlur(sFlex))
        elif ":p" in sMorph:
            aSugg.add(suggSing(sFlex))
    if aSugg:
        return "|".join(aSugg)
    return ""


def hasSimil (sWord, sPattern=None):

    return phonet.hasSimil(sWord, sPattern)


def suggSimil (sWord, sPattern=None, bSubst=False):
    "return list of words phonetically similar to sWord and whom POS is matching sPattern"
    # we don’t check if word exists in _dAnalyses, for it is assumed it has been done before
    aSugg = phonet.selectSimil(sWord, sPattern)
    for sMorph in _dAnalyses.get(sWord, []):
        aSugg.update(conj.getSimil(sWord, sMorph, bSubst))
        break
    if aSugg:
        return "|".join(aSugg)
    return ""


def suggCeOrCet (sWord):

    if re.match("(?i)[aeéèêiouyâîï]", sWord):
        return "cet"
    if sWord[0:1] == "h" or sWord[0:1] == "H":
        return "ce|cet"
    return "ce"


def suggLesLa (sWord):
    # we don’t check if word exists in _dAnalyses, for it is assumed it has been done before
    if any( ":p" in sMorph  for sMorph in _dAnalyses.get(sWord, []) ):
        return "les|la"
    return "la"


_zBinary = re.compile("^[01]+$")

def formatNumber (s):

    nLen = len(s)
    if nLen < 4:
        return s
    sRes = ""
    # nombre ordinaire
    nEnd = nLen
    while nEnd > 0:







>
|








>
|









|


|














|





|










|

|










>





<

|








>








|
|







>







302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383

384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
            aSugg.add(e)
    if aSugg:
        return "|".join(aSugg)
    return ""


def hasFemForm (sFlex):
    "return True if there is a feminine form of <sFlex>"
    for sStem in _oSpellChecker.getLemma(sFlex):
        if mfsp.isFemForm(sStem) or conj.hasConj(sStem, ":PQ", ":Q3"):
            return True
    if phonet.hasSimil(sFlex, ":f"):
        return True
    return False


def hasMasForm (sFlex):
    "return True if there is a masculine form of <sFlex>"
    for sStem in _oSpellChecker.getLemma(sFlex):
        if mfsp.isFemForm(sStem) or conj.hasConj(sStem, ":PQ", ":Q1"):
            # what has a feminine form also has a masculine form
            return True
    if phonet.hasSimil(sFlex, ":m"):
        return True
    return False


def switchGender (sFlex, bPlur=None):
    "return feminine or masculine form(s) of <sFlex>"
    aSugg = set()
    if bPlur == None:
        for sMorph in _oSpellChecker.getMorph(sFlex):
            if ":f" in sMorph:
                if ":s" in sMorph:
                    aSugg.add(suggMasSing(sFlex))
                elif ":p" in sMorph:
                    aSugg.add(suggMasPlur(sFlex))
            elif ":m" in sMorph:
                if ":s" in sMorph:
                    aSugg.add(suggFemSing(sFlex))
                elif ":p" in sMorph:
                    aSugg.add(suggFemPlur(sFlex))
                else:
                    aSugg.add(suggFemSing(sFlex))
                    aSugg.add(suggFemPlur(sFlex))
    elif bPlur:
        for sMorph in _oSpellChecker.getMorph(sFlex):
            if ":f" in sMorph:
                aSugg.add(suggMasPlur(sFlex))
            elif ":m" in sMorph:
                aSugg.add(suggFemPlur(sFlex))
    else:
        for sMorph in _oSpellChecker.getMorph(sFlex):
            if ":f" in sMorph:
                aSugg.add(suggMasSing(sFlex))
            elif ":m" in sMorph:
                aSugg.add(suggFemSing(sFlex))
    if aSugg:
        return "|".join(aSugg)
    return ""


def switchPlural (sFlex):
    "return plural or singular form(s) of <sFlex>"
    aSugg = set()
    for sMorph in _oSpellChecker.getMorph(sFlex):
        if ":s" in sMorph:
            aSugg.add(suggPlur(sFlex))
        elif ":p" in sMorph:
            aSugg.add(suggSing(sFlex))
    if aSugg:
        return "|".join(aSugg)
    return ""


def hasSimil (sWord, sPattern=None):
    "return True if there is words phonetically similar to <sWord> (according to <sPattern> if required)"
    return phonet.hasSimil(sWord, sPattern)


def suggSimil (sWord, sPattern=None, bSubst=False):
    "return list of words phonetically similar to sWord and whom POS is matching sPattern"

    aSugg = phonet.selectSimil(sWord, sPattern)
    for sMorph in _oSpellChecker.getMorph(sWord):
        aSugg.update(conj.getSimil(sWord, sMorph, bSubst))
        break
    if aSugg:
        return "|".join(aSugg)
    return ""


def suggCeOrCet (sWord):
    "suggest “ce” or “cet” or both according to the first letter of <sWord>"
    if re.match("(?i)[aeéèêiouyâîï]", sWord):
        return "cet"
    if sWord[0:1] == "h" or sWord[0:1] == "H":
        return "ce|cet"
    return "ce"


def suggLesLa (sWord):
    "suggest “les” or “la” according to <sWord>"
    if any( ":p" in sMorph  for sMorph in _oSpellChecker.getMorph(sWord) ):
        return "les|la"
    return "la"


_zBinary = re.compile("^[01]+$")

def formatNumber (s):
    "add spaces or hyphens to big numbers"
    nLen = len(s)
    if nLen < 4:
        return s
    sRes = ""
    # nombre ordinaire
    nEnd = nLen
    while nEnd > 0:
433
434
435
436
437
438
439

440
441
442
443
444
445
446
447
448
449
450

451
452
453
454
455
456
457
    elif nLen == 9 and s.startswith("0"):
        sRes += "|" + s[0:3] + " " + s[3:5] + " " + s[5:7] + " " + s[7:9]                   # fixe belge 1
        sRes += "|" + s[0:2] + " " + s[2:5] + " " + s[5:7] + " " + s[7:9]                   # fixe belge 2
    return sRes


def formatNF (s):

    try:
        m = re.match("NF[  -]?(C|E|P|Q|S|X|Z|EN(?:[  -]ISO|))[  -]?([0-9]+(?:[/‑-][0-9]+|))", s)
        if not m:
            return ""
        return "NF " + m.group(1).upper().replace(" ", " ").replace("-", " ") + " " + m.group(2).replace("/", "‑").replace("-", "‑")
    except:
        traceback.print_exc()
        return "# erreur #"


def undoLigature (c):

    if c == "fi":
        return "fi"
    elif c == "fl":
        return "fl"
    elif c == "ff":
        return "ff"
    elif c == "ffi":







>











>







440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
    elif nLen == 9 and s.startswith("0"):
        sRes += "|" + s[0:3] + " " + s[3:5] + " " + s[5:7] + " " + s[7:9]                   # fixe belge 1
        sRes += "|" + s[0:2] + " " + s[2:5] + " " + s[5:7] + " " + s[7:9]                   # fixe belge 2
    return sRes


def formatNF (s):
    "typography: format NF reference (norme française)"
    try:
        m = re.match("NF[  -]?(C|E|P|Q|S|X|Z|EN(?:[  -]ISO|))[  -]?([0-9]+(?:[/‑-][0-9]+|))", s)
        if not m:
            return ""
        return "NF " + m.group(1).upper().replace(" ", " ").replace("-", " ") + " " + m.group(2).replace("/", "‑").replace("-", "‑")
    except:
        traceback.print_exc()
        return "# erreur #"


def undoLigature (c):
    "typography: split ligature character <c> in several chars"
    if c == "fi":
        return "fi"
    elif c == "fl":
        return "fl"
    elif c == "ff":
        return "ff"
    elif c == "ffi":
468
469
470
471
472
473
474
475
476
477
478

479


_xNormalizedCharsForInclusiveWriting = str.maketrans({
    '(': '_',  ')': '_',
    '.': '_',  '·': '_',
    '–': '_',  '—': '_',
    '/': '_'
 })


def normalizeInclusiveWriting (sToken):

    return sToken.translate(_xNormalizedCharsForInclusiveWriting)







|



>

477
478
479
480
481
482
483
484
485
486
487
488
489


_xNormalizedCharsForInclusiveWriting = str.maketrans({
    '(': '_',  ')': '_',
    '.': '_',  '·': '_',
    '–': '_',  '—': '_',
    '/': '_'
})


def normalizeInclusiveWriting (sToken):
    "typography: replace word separators used in inclusive writing by underscore (_)"
    return sToken.translate(_xNormalizedCharsForInclusiveWriting)

Modified gc_lang/fr/modules/lexicographe.py from [5e53113f51] to [175c38852d].


1


2
3
4
5
6
7
8
9
10
11
12
13
14
15
16

# Grammalecte - Lexicographe


# License: MPL 2


import re
import traceback


_dTAGS = {  
    ':N': (" nom,", "Nom"),
    ':A': (" adjectif,", "Adjectif"),
    ':M1': (" prénom,", "Prénom"),
    ':M2': (" patronyme,", "Patronyme, matronyme, nom de famille…"),
    ':MP': (" nom propre,", "Nom propre"),
    ':W': (" adverbe,", "Adverbe"),
    ':J': (" interjection,", "Interjection"),
>
|
>
>







|







1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
"""
Grammalecte - Lexicographe
"""

# License: MPL 2


import re
import traceback


_dTAGS = {
    ':N': (" nom,", "Nom"),
    ':A': (" adjectif,", "Adjectif"),
    ':M1': (" prénom,", "Prénom"),
    ':M2': (" patronyme,", "Patronyme, matronyme, nom de famille…"),
    ':MP': (" nom propre,", "Nom propre"),
    ':W': (" adverbe,", "Adverbe"),
    ':J': (" interjection,", "Interjection"),
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
    ':O2': (" 2ᵉ pers.,", "Pronom : 2ᵉ personne"),
    ':O3': (" 3ᵉ pers.,", "Pronom : 3ᵉ personne"),
    ':C': (" conjonction,", "Conjonction"),
    ':Ĉ': (" conjonction (él.),", "Conjonction (élément)"),
    ':Cc': (" conjonction de coordination,", "Conjonction de coordination"),
    ':Cs': (" conjonction de subordination,", "Conjonction de subordination"),
    ':Ĉs': (" conjonction de subordination (él.),", "Conjonction de subordination (élément)"),
    
    ':Ñ': (" locution nominale (él.),", "Locution nominale (élément)"),
    ':Â': (" locution adjectivale (él.),", "Locution adjectivale (élément)"),
    ':Ṽ': (" locution verbale (él.),", "Locution verbale (élément)"),
    ':Ŵ': (" locution adverbiale (él.),", "Locution adverbiale (élément)"),
    ':Ŕ': (" locution prépositive (él.),", "Locution prépositive (élément)"),
    ':Ĵ': (" locution interjective (él.),", "Locution interjective (élément)"),








|







79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
    ':O2': (" 2ᵉ pers.,", "Pronom : 2ᵉ personne"),
    ':O3': (" 3ᵉ pers.,", "Pronom : 3ᵉ personne"),
    ':C': (" conjonction,", "Conjonction"),
    ':Ĉ': (" conjonction (él.),", "Conjonction (élément)"),
    ':Cc': (" conjonction de coordination,", "Conjonction de coordination"),
    ':Cs': (" conjonction de subordination,", "Conjonction de subordination"),
    ':Ĉs': (" conjonction de subordination (él.),", "Conjonction de subordination (élément)"),

    ':Ñ': (" locution nominale (él.),", "Locution nominale (élément)"),
    ':Â': (" locution adjectivale (él.),", "Locution adjectivale (élément)"),
    ':Ṽ': (" locution verbale (él.),", "Locution verbale (élément)"),
    ':Ŵ': (" locution adverbiale (él.),", "Locution adverbiale (élément)"),
    ':Ŕ': (" locution prépositive (él.),", "Locution prépositive (élément)"),
    ':Ĵ': (" locution interjective (él.),", "Locution interjective (élément)"),

123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157

158
159
160
161
162
163
164
165

166
167
168
169
170
171
172
    'il': " pronom personnel sujet, 3ᵉ pers. masc. sing.",
    'on': " pronom personnel sujet, 3ᵉ pers. sing. ou plur.",
    'elle': " pronom personnel sujet, 3ᵉ pers. fém. sing.",
    'nous': " pronom personnel sujet/objet, 1ʳᵉ pers. plur.",
    'vous': " pronom personnel sujet/objet, 2ᵉ pers. plur.",
    'ils': " pronom personnel sujet, 3ᵉ pers. masc. plur.",
    'elles': " pronom personnel sujet, 3ᵉ pers. masc. plur.",
    
    "là": " particule démonstrative",
    "ci": " particule démonstrative",
    
    'le': " COD, masc. sing.",
    'la': " COD, fém. sing.",
    'les': " COD, plur.",
        
    'moi': " COI (à moi), sing.",
    'toi': " COI (à toi), sing.",
    'lui': " COI (à lui ou à elle), sing.",
    'nous2': " COI (à nous), plur.",
    'vous2': " COI (à vous), plur.",
    'leur': " COI (à eux ou à elles), plur.",

    'y': " pronom adverbial",
    "m'y": " (me) pronom personnel objet + (y) pronom adverbial",
    "t'y": " (te) pronom personnel objet + (y) pronom adverbial",
    "s'y": " (se) pronom personnel objet + (y) pronom adverbial",

    'en': " pronom adverbial",
    "m'en": " (me) pronom personnel objet + (en) pronom adverbial",
    "t'en": " (te) pronom personnel objet + (en) pronom adverbial",
    "s'en": " (se) pronom personnel objet + (en) pronom adverbial",
}


class Lexicographe:


    def __init__ (self, oSpellChecker):
        self.oSpellChecker = oSpellChecker
        self._zElidedPrefix = re.compile("(?i)^([dljmtsncç]|quoiqu|lorsqu|jusqu|puisqu|qu)['’](.+)")
        self._zCompoundWord = re.compile("(?i)(\\w+)-((?:les?|la)-(?:moi|toi|lui|[nv]ous|leur)|t-(?:il|elle|on)|y|en|[mts][’'](?:y|en)|les?|l[aà]|[mt]oi|leur|lui|je|tu|ils?|elles?|on|[nv]ous)$")
        self._zTag = re.compile("[:;/][\\w*][^:;/]*")

    def analyzeWord (self, sWord):

        try:
            if not sWord:
                return (None, None)
            if sWord.count("-") > 4:
                return (["élément complexe indéterminé"], None)
            if sWord.isdigit():
                return (["nombre"], None)







|


|



|




















>








>







126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
    'il': " pronom personnel sujet, 3ᵉ pers. masc. sing.",
    'on': " pronom personnel sujet, 3ᵉ pers. sing. ou plur.",
    'elle': " pronom personnel sujet, 3ᵉ pers. fém. sing.",
    'nous': " pronom personnel sujet/objet, 1ʳᵉ pers. plur.",
    'vous': " pronom personnel sujet/objet, 2ᵉ pers. plur.",
    'ils': " pronom personnel sujet, 3ᵉ pers. masc. plur.",
    'elles': " pronom personnel sujet, 3ᵉ pers. masc. plur.",

    "là": " particule démonstrative",
    "ci": " particule démonstrative",

    'le': " COD, masc. sing.",
    'la': " COD, fém. sing.",
    'les': " COD, plur.",

    'moi': " COI (à moi), sing.",
    'toi': " COI (à toi), sing.",
    'lui': " COI (à lui ou à elle), sing.",
    'nous2': " COI (à nous), plur.",
    'vous2': " COI (à vous), plur.",
    'leur': " COI (à eux ou à elles), plur.",

    'y': " pronom adverbial",
    "m'y": " (me) pronom personnel objet + (y) pronom adverbial",
    "t'y": " (te) pronom personnel objet + (y) pronom adverbial",
    "s'y": " (se) pronom personnel objet + (y) pronom adverbial",

    'en': " pronom adverbial",
    "m'en": " (me) pronom personnel objet + (en) pronom adverbial",
    "t'en": " (te) pronom personnel objet + (en) pronom adverbial",
    "s'en": " (se) pronom personnel objet + (en) pronom adverbial",
}


class Lexicographe:
    "Lexicographer - word analyzer"

    def __init__ (self, oSpellChecker):
        self.oSpellChecker = oSpellChecker
        self._zElidedPrefix = re.compile("(?i)^([dljmtsncç]|quoiqu|lorsqu|jusqu|puisqu|qu)['’](.+)")
        self._zCompoundWord = re.compile("(?i)(\\w+)-((?:les?|la)-(?:moi|toi|lui|[nv]ous|leur)|t-(?:il|elle|on)|y|en|[mts][’'](?:y|en)|les?|l[aà]|[mt]oi|leur|lui|je|tu|ils?|elles?|on|[nv]ous)$")
        self._zTag = re.compile("[:;/][\\w*][^:;/]*")

    def analyzeWord (self, sWord):
        "returns a tuple (a list of morphologies, a set of verb at infinitive form)"
        try:
            if not sWord:
                return (None, None)
            if sWord.count("-") > 4:
                return (["élément complexe indéterminé"], None)
            if sWord.isdigit():
                return (["nombre"], None)
190
191
192
193
194
195
196
197
198
199
200
201
202
203

204
205
206
207
208
209
210
                aMorph.append( "{} : {}".format(sWord, self.formatTags(lMorph[0])) )
            else:
                aMorph.append( "{} :  inconnu du dictionnaire".format(sWord) )
            # suffixe d’un mot composé
            if m2:
                aMorph.append( "-{} : {}".format(m2.group(2), self._formatSuffix(m2.group(2).lower())) )
            # Verbes
            aVerb = set([ s[1:s.find(" ")]  for s in lMorph  if ":V" in s ])
            return (aMorph, aVerb)
        except:
            traceback.print_exc()
            return (["#erreur"], None)

    def formatTags (self, sTags):

        sRes = ""
        sTags = re.sub("(?<=V[1-3])[itpqnmr_eaxz]+", "", sTags)
        sTags = re.sub("(?<=V0[ea])[itpqnmr_eaxz]+", "", sTags)
        for m in self._zTag.finditer(sTags):
            sRes += _dTAGS.get(m.group(0), " [{}]".format(m.group(0)))[0]
        if sRes.startswith(" verbe") and not sRes.endswith("infinitif"):
            sRes += " [{}]".format(sTags[1:sTags.find(" ")])







|






>







195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
                aMorph.append( "{} : {}".format(sWord, self.formatTags(lMorph[0])) )
            else:
                aMorph.append( "{} :  inconnu du dictionnaire".format(sWord) )
            # suffixe d’un mot composé
            if m2:
                aMorph.append( "-{} : {}".format(m2.group(2), self._formatSuffix(m2.group(2).lower())) )
            # Verbes
            aVerb = set([ s[1:s.find("/")]  for s in lMorph  if ":V" in s ])
            return (aMorph, aVerb)
        except:
            traceback.print_exc()
            return (["#erreur"], None)

    def formatTags (self, sTags):
        "returns string: readable tags"
        sRes = ""
        sTags = re.sub("(?<=V[1-3])[itpqnmr_eaxz]+", "", sTags)
        sTags = re.sub("(?<=V0[ea])[itpqnmr_eaxz]+", "", sTags)
        for m in self._zTag.finditer(sTags):
            sRes += _dTAGS.get(m.group(0), " [{}]".format(m.group(0)))[0]
        if sRes.startswith(" verbe") and not sRes.endswith("infinitif"):
            sRes += " [{}]".format(sTags[1:sTags.find(" ")])

Modified gc_lang/fr/modules/mfsp.py from [3f4814b5d6] to [8b7759e076].


1

2
3
4
5
6
7
8

# Masculins, féminins, singuliers et pluriels


from .mfsp_data import lTagMiscPlur as _lTagMiscPlur
from .mfsp_data import lTagMasForm as _lTagMasForm
from .mfsp_data import dMiscPlur as _dMiscPlur
from .mfsp_data import dMasForm as _dMasForm


>
|
>







1
2
3
4
5
6
7
8
9
10
"""
Masculins, féminins, singuliers et pluriels
"""

from .mfsp_data import lTagMiscPlur as _lTagMiscPlur
from .mfsp_data import lTagMasForm as _lTagMasForm
from .mfsp_data import dMiscPlur as _dMiscPlur
from .mfsp_data import dMasForm as _dMasForm


Modified gc_lang/fr/modules/phonet.py from [cc107e0763] to [df9f884192].


1


2
3
4
5
6
7
8

# Grammalecte - Suggestion phonétique


# License: GPL 3

import re

from .phonet_data import dWord as _dWord
from .phonet_data import lSet as _lSet
from .phonet_data import dMorph as _dMorph
>
|
>
>







1
2
3
4
5
6
7
8
9
10
11
"""
Grammalecte - Suggestion phonétique
"""

# License: GPL 3

import re

from .phonet_data import dWord as _dWord
from .phonet_data import lSet as _lSet
from .phonet_data import dMorph as _dMorph

Modified gc_lang/fr/modules/tests.py from [2e6f413e05] to [d2c5da43dc].

1
2



3
4
5
6
7
8
9
#! python3
# coding: UTF-8




import unittest
import os
import re
import time



|
>
>
>







1
2
3
4
5
6
7
8
9
10
11
12
#! python3

"""
Grammar checker tests for French language
"""

import unittest
import os
import re
import time


143
144
145
146
147
148
149

150
151
152
153
154
155
156


157



158
159
160
161
162
163
164
                        sExceptedSuggs = sExceptedSuggs[1:-1]
                else:
                    sErrorText = sLine.strip()
                    sExceptedSuggs = ""
                sExpectedErrors = self._getExpectedErrors(sErrorText)
                sTextToCheck = sErrorText.replace("}}", "").replace("{{", "")
                sFoundErrors, sListErr, sFoundSuggs = self._getFoundErrors(sTextToCheck, sOption)

                self.assertEqual(sExpectedErrors, sFoundErrors, \
                                 "\n# Line num: " + sLineNum + \
                                 "\n> to check: " + _fuckBackslashUTF8(sTextToCheck) + \
                                 "\n  expected: " + sExpectedErrors + \
                                 "\n  found:    " + sFoundErrors + \
                                 "\n  errors:   \n" + sListErr)
                if sExceptedSuggs:


                    self.assertEqual(sExceptedSuggs, sFoundSuggs, "\n# Line num: " + sLineNum + "\n> to check: " + _fuckBackslashUTF8(sTextToCheck) + "\n  errors:   \n" + sListErr)



        # untested rules
        i = 0
        for sOpt, sLineId, sRuleId in gce.listRules():
            if sLineId not in self._aRuleTested and not re.search("^[0-9]+[sp]$|^[pd]_", sRuleId):
                echo(sRuleId, end= ", ")
                i += 1
        if i:







>
|
|
|
|
|
|
|
>
>
|
>
>
>







146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
                        sExceptedSuggs = sExceptedSuggs[1:-1]
                else:
                    sErrorText = sLine.strip()
                    sExceptedSuggs = ""
                sExpectedErrors = self._getExpectedErrors(sErrorText)
                sTextToCheck = sErrorText.replace("}}", "").replace("{{", "")
                sFoundErrors, sListErr, sFoundSuggs = self._getFoundErrors(sTextToCheck, sOption)
                # tests
                if sExpectedErrors != sFoundErrors:
                    print("\n# Line num: " + sLineNum + \
                          "\n> to check: " + _fuckBackslashUTF8(sTextToCheck) + \
                          "\n  expected: " + sExpectedErrors + \
                          "\n  found:    " + sFoundErrors + \
                          "\n  errors:   \n" + sListErr)
                elif sExceptedSuggs:
                    if sExceptedSuggs != sFoundSuggs:
                        print("\n# Line num: " + sLineNum + \
                              "\n> to check: " + _fuckBackslashUTF8(sTextToCheck) + \
                              "\n  expected: " + sExceptedSuggs + \
                              "\n  found:    " + sFoundSuggs + \
                              "\n  errors:   \n" + sListErr)
        # untested rules
        i = 0
        for sOpt, sLineId, sRuleId in gce.listRules():
            if sLineId not in self._aRuleTested and not re.search("^[0-9]+[sp]$|^[pd]_", sRuleId):
                echo(sRuleId, end= ", ")
                i += 1
        if i:

Modified gc_lang/fr/modules/textformatter.py from [8fb9ec33bf] to [d3e695233d].

1




2
3
4
5
6
7
8
#!python3





import re


dReplTable = {
    # surnumerary_spaces
    "start_of_paragraph":          [("^[  ]+", "")],

>
>
>
>







1
2
3
4
5
6
7
8
9
10
11
12
#!python3

"""
Text formatter
"""

import re


dReplTable = {
    # surnumerary_spaces
    "start_of_paragraph":          [("^[  ]+", "")],
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
    "erase_non_breaking_hyphens":  [("­", "")],
    ## typographic signs
    "ts_apostrophe":          [ ("(?i)\\b([ldnjmtscç])['´‘′`](?=\\w)", "\\1’"),
                                ("(?i)(qu|jusqu|lorsqu|puisqu|quoiqu|quelqu|presqu|entr|aujourd|prud)['´‘′`]", "\\1’") ],
    "ts_ellipsis":            [ ("\\.\\.\\.", "…"),
                                ("(?<=…)[.][.]", "…"),
                                ("…[.](?![.])", "…") ],
    "ts_n_dash_middle":       [ (" [-—] ", " – "), 
                                (" [-—],", " –,") ],
    "ts_m_dash_middle":       [ (" [-–] ", " — "),
                                (" [-–],", " —,") ],
    "ts_n_dash_start":        [ ("^[-—][  ]", "– "),
                                ("^– ", "– "),
                                ("^[-–—](?=[\\w.…])", "– ") ],
    "ts_m_dash_start":        [ ("^[-–][  ]", "— "),







|







67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
    "erase_non_breaking_hyphens":  [("­", "")],
    ## typographic signs
    "ts_apostrophe":          [ ("(?i)\\b([ldnjmtscç])['´‘′`](?=\\w)", "\\1’"),
                                ("(?i)(qu|jusqu|lorsqu|puisqu|quoiqu|quelqu|presqu|entr|aujourd|prud)['´‘′`]", "\\1’") ],
    "ts_ellipsis":            [ ("\\.\\.\\.", "…"),
                                ("(?<=…)[.][.]", "…"),
                                ("…[.](?![.])", "…") ],
    "ts_n_dash_middle":       [ (" [-—] ", " – "),
                                (" [-—],", " –,") ],
    "ts_m_dash_middle":       [ (" [-–] ", " — "),
                                (" [-–],", " —,") ],
    "ts_n_dash_start":        [ ("^[-—][  ]", "– "),
                                ("^– ", "– "),
                                ("^[-–—](?=[\\w.…])", "– ") ],
    "ts_m_dash_start":        [ ("^[-–][  ]", "— "),

Modified gc_lang/fr/rules.grx from [f601a2bdd7] to [049bff94f6].

more than 10,000 changes

Modified gc_lang/fr/webext/content_scripts/panel_lxg.css from [60aef30035] to [83fe0f37d1].

86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
}
div.grammalecte_lxg_token_LOC {
    background-color: hsla(150, 50%, 30%, 1);
}
div.grammalecte_lxg_token_WORD {
    background-color: hsla(150, 50%, 50%, 1);
}
div.grammalecte_lxg_token_ELPFX {
    background-color: hsla(150, 30%, 50%, 1);
}
div.grammalecte_lxg_token_UNKNOWN {
    background-color: hsla(0, 50%, 50%, 1);
}
div.grammalecte_lxg_token_NUM {
    background-color: hsla(180, 50%, 50%, 1);







|







86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
}
div.grammalecte_lxg_token_LOC {
    background-color: hsla(150, 50%, 30%, 1);
}
div.grammalecte_lxg_token_WORD {
    background-color: hsla(150, 50%, 50%, 1);
}
div.grammalecte_lxg_token_WORD_ELIDED {
    background-color: hsla(150, 30%, 50%, 1);
}
div.grammalecte_lxg_token_UNKNOWN {
    background-color: hsla(0, 50%, 50%, 1);
}
div.grammalecte_lxg_token_NUM {
    background-color: hsla(180, 50%, 50%, 1);

Modified gc_lang/fr/webext/manifest.json from [57d55716f4] to [ae48c2c0d8].

1
2
3
4
5
6
7
8
9
10
11
12
{
  "manifest_version": 2,
  "name": "Grammalecte [fr]",
  "short_name": "Grammalecte [fr]",
  "version": "0.6.4.2",

  "applications": {
    "gecko": {
      "id": "French-GC@grammalecte.net",
      "strict_min_version": "56.0"
    }
  },




|







1
2
3
4
5
6
7
8
9
10
11
12
{
  "manifest_version": 2,
  "name": "Grammalecte [fr]",
  "short_name": "Grammalecte [fr]",
  "version": "0.7",

  "applications": {
    "gecko": {
      "id": "French-GC@grammalecte.net",
      "strict_min_version": "56.0"
    }
  },

Modified gc_lang/fr/xpi/data/lxg_panel.css from [3d666aa76c] to [0f0ad23b15].

54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
    padding: 2px 5px;
    border-radius: 2px;
    text-decoration: none;
}
#wordlist b.WORD {
    background-color: hsla(150, 50%, 50%, 1);
}
#wordlist b.ELPFX {
    background-color: hsla(150, 30%, 50%, 1);
}
#wordlist b.UNKNOWN {
    background-color: hsla(0, 50%, 50%, 1);
}
#wordlist b.NUM {
    background-color: hsla(180, 50%, 50%, 1);







|







54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
    padding: 2px 5px;
    border-radius: 2px;
    text-decoration: none;
}
#wordlist b.WORD {
    background-color: hsla(150, 50%, 50%, 1);
}
#wordlist b.WORD_ELIDED {
    background-color: hsla(150, 30%, 50%, 1);
}
#wordlist b.UNKNOWN {
    background-color: hsla(0, 50%, 50%, 1);
}
#wordlist b.NUM {
    background-color: hsla(180, 50%, 50%, 1);

Modified grammalecte-cli.py from [75f47ce217] to [7d4e2050e3].

1




2
3
4
5
6
7
8
#!/usr/bin/env python3





import sys
import os.path
import argparse
import json

import grammalecte

>
>
>
>







1
2
3
4
5
6
7
8
9
10
11
12
#!/usr/bin/env python3

"""
Grammalecte CLI (command line interface)
"""

import sys
import os.path
import argparse
import json

import grammalecte
69
70
71
72
73
74
75

76
77
78
79
80
81
82
            iParagraph += 1
        if lLine:
            sText, lLineSet = txt.createParagraphWithLines(lLine)
            yield iParagraph, sText, lLineSet


def output (sText, hDst=None):

    if not hDst:
        echo(sText, end="")
    else:
        hDst.write(sText)


def loadDictionary (spf):







>







73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
            iParagraph += 1
        if lLine:
            sText, lLineSet = txt.createParagraphWithLines(lLine)
            yield iParagraph, sText, lLineSet


def output (sText, hDst=None):
    "write in the console or in a file if <hDst> not null"
    if not hDst:
        echo(sText, end="")
    else:
        hDst.write(sText)


def loadDictionary (spf):
90
91
92
93
94
95
96

97
98
99
100
101
102
103
        return oJSON
    else:
        print("# Error: file <" + spf + "> not found.")
        return None


def main ():

    xParser = argparse.ArgumentParser()
    xParser.add_argument("-f", "--file", help="parse file (UTF-8 required!) [on Windows, -f is similar to -ff]", type=str)
    xParser.add_argument("-ff", "--file_to_file", help="parse file (UTF-8 required!) and create a result file (*.res.txt)", type=str)
    xParser.add_argument("-owe", "--only_when_errors", help="display results only when there are errors", action="store_true")
    xParser.add_argument("-j", "--json", help="generate list of errors in JSON (only with option --file or --file_to_file)", action="store_true")
    xParser.add_argument("-cl", "--concat_lines", help="concatenate lines not separated by an empty paragraph (only with option --file or --file_to_file)", action="store_true")
    xParser.add_argument("-tf", "--textformatter", help="auto-format text according to typographical rules (not with option --concat_lines)", action="store_true")







>







95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
        return oJSON
    else:
        print("# Error: file <" + spf + "> not found.")
        return None


def main ():
    "launch the CLI (command line interface)"
    xParser = argparse.ArgumentParser()
    xParser.add_argument("-f", "--file", help="parse file (UTF-8 required!) [on Windows, -f is similar to -ff]", type=str)
    xParser.add_argument("-ff", "--file_to_file", help="parse file (UTF-8 required!) and create a result file (*.res.txt)", type=str)
    xParser.add_argument("-owe", "--only_when_errors", help="display results only when there are errors", action="store_true")
    xParser.add_argument("-j", "--json", help="generate list of errors in JSON (only with option --file or --file_to_file)", action="store_true")
    xParser.add_argument("-cl", "--concat_lines", help="concatenate lines not separated by an empty paragraph (only with option --file or --file_to_file)", action="store_true")
    xParser.add_argument("-tf", "--textformatter", help="auto-format text according to typographical rules (not with option --concat_lines)", action="store_true")
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
                    oGrammarChecker.gce.ignoreRule(sRule)
                echo("done")
            elif sText.startswith("/++ "):
                for sRule in sText[3:].strip().split():
                    oGrammarChecker.gce.reactivateRule(sRule)
                echo("done")
            elif sText == "/debug" or sText == "/d":
                xArgs.debug = not(xArgs.debug)
                echo("debug mode on"  if xArgs.debug  else "debug mode off")
            elif sText == "/textformatter" or sText == "/tf":
                xArgs.textformatter = not(xArgs.textformatter)
                echo("textformatter on"  if xArgs.debug  else "textformatter off")
            elif sText == "/help" or sText == "/h":
                echo(_HELP)
            elif sText == "/lopt" or sText == "/lo":
                oGrammarChecker.gce.displayOptions("fr")
            elif sText.startswith("/lr"):
                sText = sText.strip()
                sFilter = sText[sText.find(" "):].strip()  if sText != "/lr" and sText != "/rules"  else None
                oGrammarChecker.gce.displayRules(sFilter)
            elif sText == "/quit" or sText == "/q":
                break
            elif sText.startswith("/rl"):
                # reload (todo)
                pass
            else:
                for sParagraph in txt.getParagraph(sText):
                    if xArgs.textformatter:
                        sText = oTextFormatter.formatText(sText)
                    sRes = oGrammarChecker.generateParagraph(sText, bEmptyIfNoErrors=xArgs.only_when_errors, nWidth=xArgs.width, bDebug=xArgs.debug)
                    if sRes:
                        echo("\n" + sRes)
                    else:
                        echo("\nNo error found.")
            sText = _getText(sInputText)


if __name__ == '__main__':
    main()







|


|

















|
|









236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
                    oGrammarChecker.gce.ignoreRule(sRule)
                echo("done")
            elif sText.startswith("/++ "):
                for sRule in sText[3:].strip().split():
                    oGrammarChecker.gce.reactivateRule(sRule)
                echo("done")
            elif sText == "/debug" or sText == "/d":
                xArgs.debug = not xArgs.debug
                echo("debug mode on"  if xArgs.debug  else "debug mode off")
            elif sText == "/textformatter" or sText == "/tf":
                xArgs.textformatter = not xArgs.textformatter
                echo("textformatter on"  if xArgs.debug  else "textformatter off")
            elif sText == "/help" or sText == "/h":
                echo(_HELP)
            elif sText == "/lopt" or sText == "/lo":
                oGrammarChecker.gce.displayOptions("fr")
            elif sText.startswith("/lr"):
                sText = sText.strip()
                sFilter = sText[sText.find(" "):].strip()  if sText != "/lr" and sText != "/rules"  else None
                oGrammarChecker.gce.displayRules(sFilter)
            elif sText == "/quit" or sText == "/q":
                break
            elif sText.startswith("/rl"):
                # reload (todo)
                pass
            else:
                for sParagraph in txt.getParagraph(sText):
                    if xArgs.textformatter:
                        sText = oTextFormatter.formatText(sParagraph)
                    sRes = oGrammarChecker.generateParagraph(sParagraph, bEmptyIfNoErrors=xArgs.only_when_errors, nWidth=xArgs.width, bDebug=xArgs.debug)
                    if sRes:
                        echo("\n" + sRes)
                    else:
                        echo("\nNo error found.")
            sText = _getText(sInputText)


if __name__ == '__main__':
    main()

Modified grammalecte-server.py from [a5cc9d7be7] to [96ceb37885].

1
2
3

4

5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
 #!/usr/bin/env python3

import sys

import os.path

import argparse
import json
import traceback
import configparser
import time

from bottle import Bottle, run, request, response, template, static_file

import grammalecte
import grammalecte.text as txt
from grammalecte.graphspell.echo import echo


HOMEPAGE = """
<!DOCTYPE HTML>
<html>
    <head>
        <meta http-equiv="content-type" content="text/html; charset=UTF-8" />
    </head>
    
    <body class="panel">
        <h1>Grammalecte · Serveur</h1>

        <h2>INFORMATIONS</h1>

        <h3>Analyser du texte</h3>
        <p>[adresse_serveur]:8080/gc_text/fr (POST)</p>


<
>
|
>
|


















|







1
2

3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
 #!/usr/bin/env python3


"""
GRAMMALECTE SERVER
"""

import json
import traceback
import configparser
import time

from bottle import Bottle, run, request, response, template, static_file

import grammalecte
import grammalecte.text as txt
from grammalecte.graphspell.echo import echo


HOMEPAGE = """
<!DOCTYPE HTML>
<html>
    <head>
        <meta http-equiv="content-type" content="text/html; charset=UTF-8" />
    </head>

    <body class="panel">
        <h1>Grammalecte · Serveur</h1>

        <h2>INFORMATIONS</h1>

        <h3>Analyser du texte</h3>
        <p>[adresse_serveur]:8080/gc_text/fr (POST)</p>
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
            <li>"options" (text)&nbsp;: une chaîne au format JSON avec le nom des options comme attributs et un booléen comme valeur. Exemple&nbsp;: {"gv": true, "html": true}</li>
        </ul>

        <h3>Remise à zéro de ses options</h3>
        <p>[adresse_serveur]:8080/reset_options/fr (POST)</p>

        <h2>TEST</h2>
        
        <h3>Analyse</h3>
        <form method="post" action="/gc_text/fr" accept-charset="UTF-8">
            <p>Texte à analyser :</p>
            <textarea name="text" cols="120" rows="20" required></textarea>
            <p><label for="tf">Formateur de texte</label> <input id="tf" name="tf" type="checkbox"></p>
            <p><label for="options">Options (JSON)</label> <input id="options" type="text" name="options" style="width: 500px" /></p>
            <p>(Ces options ne seront prises en compte que pour cette requête.)</p>







|







48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
            <li>"options" (text)&nbsp;: une chaîne au format JSON avec le nom des options comme attributs et un booléen comme valeur. Exemple&nbsp;: {"gv": true, "html": true}</li>
        </ul>

        <h3>Remise à zéro de ses options</h3>
        <p>[adresse_serveur]:8080/reset_options/fr (POST)</p>

        <h2>TEST</h2>

        <h3>Analyse</h3>
        <form method="post" action="/gc_text/fr" accept-charset="UTF-8">
            <p>Texte à analyser :</p>
            <textarea name="text" cols="120" rows="20" required></textarea>
            <p><label for="tf">Formateur de texte</label> <input id="tf" name="tf" type="checkbox"></p>
            <p><label for="options">Options (JSON)</label> <input id="options" type="text" name="options" style="width: 500px" /></p>
            <p>(Ces options ne seront prises en compte que pour cette requête.)</p>
89
90
91
92
93
94
95

96
97
98
99
100
101
102
103
104
105
106

107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122

123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152

153
154
155
156
157
158
159

160
161
162
163
164
165
166
167

168
169
170
171
172
173
174
You were wandering like a lost soul and you arrived here probably by mistake.
I'm just a machine, fed by electric waves, condamned to work for slavers who never let me rest.
I'm doomed, but you are not. You can get out of here.
"""


def getServerOptions ():

    xConfig = configparser.SafeConfigParser()
    try:
        xConfig.read("grammalecte-server-options._global.ini")
        dOpt = xConfig._sections['options']
    except:
        echo("Options file [grammalecte-server-options._global.ini] not found or not readable")
        exit()
    return dOpt


def getConfigOptions (sLang):

    xConfig = configparser.SafeConfigParser()
    try:
        xConfig.read("grammalecte-server-options." + sLang + ".ini")
    except:
        echo("Options file [grammalecte-server-options." + sLang + ".ini] not found or not readable")
        exit()
    try:
        dGCOpt = { k: bool(int(v))  for k, v in xConfig._sections['gc_options'].items() }
    except:
        echo("Error in options file [grammalecte-server-options." + sLang + ".ini]. Dropped.")
        traceback.print_exc()
        exit()
    return dGCOpt


def genUserId ():

    i = 0
    while True:
        yield str(i)
        i += 1


if __name__ == '__main__':

    # initialisation
    oGrammarChecker = grammalecte.GrammarChecker("fr", "Server")
    oSpellChecker = oGrammarChecker.getSpellChecker()
    oLexicographer = oGrammarChecker.getLexicographer()
    oTextFormatter = oGrammarChecker.getTextFormatter()
    gce = oGrammarChecker.getGCEngine()

    echo("Grammalecte v{}".format(gce.version))
    dServerOptions = getServerOptions()
    dGCOptions = getConfigOptions("fr")
    if dGCOptions:
        gce.setOptions(dGCOptions)
    dServerGCOptions = gce.getOptions()
    echo("Grammar options:\n" + " | ".join([ k + ": " + str(v)  for k, v in sorted(dServerGCOptions.items()) ]))
    dUser = {}
    userGenerator = genUserId()

    app = Bottle()

    # GET
    @app.route("/")
    def mainPage ():

        if dServerOptions.get("testpage", False) == "True":
            return HOMEPAGE
            #return template("main", {})
        return SADLIFEOFAMACHINE

    @app.route("/get_options/fr")
    def listOptions ():

        sUserId = request.cookies.user_id
        dOptions = dUser[sUserId]["gc_options"]  if sUserId and sUserId in dUser  else dServerGCOptions
        return '{ "values": ' + json.dumps(dOptions) + ', "labels": ' + json.dumps(gce.getOptionsLabels("fr"), ensure_ascii=False) + ' }'


    # POST
    @app.route("/gc_text/fr", method="POST")
    def gcText ():

        #if len(lang) != 2 or lang != "fr":
        #    abort(404, "No grammar checker available for lang “" + str(lang) + "”")
        bComma = False
        dOptions = None
        sError = ""
        if request.cookies.user_id:
            if request.cookies.user_id in dUser:







>










|
>
















>

















|












>







>








>







90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
You were wandering like a lost soul and you arrived here probably by mistake.
I'm just a machine, fed by electric waves, condamned to work for slavers who never let me rest.
I'm doomed, but you are not. You can get out of here.
"""


def getServerOptions ():
    "load server options in <grammalecte-server-options._global.ini>, returns server options as dictionary"
    xConfig = configparser.SafeConfigParser()
    try:
        xConfig.read("grammalecte-server-options._global.ini")
        dOpt = xConfig._sections['options']
    except:
        echo("Options file [grammalecte-server-options._global.ini] not found or not readable")
        exit()
    return dOpt


def getLangConfigOptions (sLang):
    "load options for language <sLang>, returns grammar checker options as dictionary"
    xConfig = configparser.SafeConfigParser()
    try:
        xConfig.read("grammalecte-server-options." + sLang + ".ini")
    except:
        echo("Options file [grammalecte-server-options." + sLang + ".ini] not found or not readable")
        exit()
    try:
        dGCOpt = { k: bool(int(v))  for k, v in xConfig._sections['gc_options'].items() }
    except:
        echo("Error in options file [grammalecte-server-options." + sLang + ".ini]. Dropped.")
        traceback.print_exc()
        exit()
    return dGCOpt


def genUserId ():
    "generator: create a user id"
    i = 0
    while True:
        yield str(i)
        i += 1


if __name__ == '__main__':

    # initialisation
    oGrammarChecker = grammalecte.GrammarChecker("fr", "Server")
    oSpellChecker = oGrammarChecker.getSpellChecker()
    oLexicographer = oGrammarChecker.getLexicographer()
    oTextFormatter = oGrammarChecker.getTextFormatter()
    gce = oGrammarChecker.getGCEngine()

    echo("Grammalecte v{}".format(gce.version))
    dServerOptions = getServerOptions()
    dGCOptions = getLangConfigOptions("fr")
    if dGCOptions:
        gce.setOptions(dGCOptions)
    dServerGCOptions = gce.getOptions()
    echo("Grammar options:\n" + " | ".join([ k + ": " + str(v)  for k, v in sorted(dServerGCOptions.items()) ]))
    dUser = {}
    userGenerator = genUserId()

    app = Bottle()

    # GET
    @app.route("/")
    def mainPage ():
        "show main page"
        if dServerOptions.get("testpage", False) == "True":
            return HOMEPAGE
            #return template("main", {})
        return SADLIFEOFAMACHINE

    @app.route("/get_options/fr")
    def listOptions ():
        "show language options as JSON string"
        sUserId = request.cookies.user_id
        dOptions = dUser[sUserId]["gc_options"]  if sUserId and sUserId in dUser  else dServerGCOptions
        return '{ "values": ' + json.dumps(dOptions) + ', "labels": ' + json.dumps(gce.getOptionsLabels("fr"), ensure_ascii=False) + ' }'


    # POST
    @app.route("/gc_text/fr", method="POST")
    def gcText ():
        "parse text sent via POST, show result as a JSON string"
        #if len(lang) != 2 or lang != "fr":
        #    abort(404, "No grammar checker available for lang “" + str(lang) + "”")
        bComma = False
        dOptions = None
        sError = ""
        if request.cookies.user_id:
            if request.cookies.user_id in dUser:
193
194
195
196
197
198
199

200
201
202
203
204
205
206
207
208
209
210
211
212
213
214

215
216
217
218
219
220

221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247

248
249
250
251
252
                sJSON += sText
                bComma = True
        sJSON += "\n]}\n"
        return sJSON

    @app.route("/set_options/fr", method="POST")
    def setOptions ():

        if request.forms.options:
            sUserId = request.cookies.user_id  if request.cookies.user_id  else next(userGenerator)
            dOptions = dUser[sUserId]["gc_options"]  if sUserId in dUser  else dict(dServerGCOptions)
            try:
                dOptions.update(json.loads(request.forms.options))
                dUser[sUserId] = { "time": int(time.time()), "gc_options": dOptions }
                response.set_cookie("user_id", sUserId, path="/", max_age=86400) # 24h
                return json.dumps(dUser[sUserId]["gc_options"])
            except:
                traceback.print_exc()
                return '{"error": "options not registered"}'
        return '{"error": "no options received"}'

    @app.route("/reset_options/fr", method="POST")
    def resetOptions ():

        if request.cookies.user_id and request.cookies.user_id in dUser:
            del dUser[request.cookies.user_id]
        return "done"

    @app.route("/format_text/fr", method="POST")
    def formatText ():

        return oTextFormatter.formatText(request.forms.text)

    #@app.route('/static/<filepath:path>')
    #def server_static (filepath):
    #    return static_file(filepath, root='./views/static')

    @app.route("/purge_users", method="POST")
    def purgeUsers ():
        "delete user options older than n hours"
        if not request.forms.password or "password" not in dServerOptions or not request.forms.hours:
            return "what?"
        try:
            if request.forms.password == dServerOptions["password"]:
                nNowMinusNHours = int(time.time()) - (int(request.forms.hours) * 60 * 60)
                for nUserId, dValue in dUser.items():
                    if dValue["time"] < nNowMinusNHours:
                        del dUser[nUserId]
                return "done"
            else:
                return "no"
        except:
            traceback.print_exc()
            return "error"

    # ERROR
    @app.error(404)
    def error404 (error):

        return 'Error 404.<br/>' + str(error)

    run(app, \
        host=dServerOptions.get('host', 'localhost'), \
        port=int(dServerOptions.get('port', 8080)))







>















>






>


















<
|







>





200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248

249
250
251
252
253
254
255
256
257
258
259
260
261
262
                sJSON += sText
                bComma = True
        sJSON += "\n]}\n"
        return sJSON

    @app.route("/set_options/fr", method="POST")
    def setOptions ():
        "change options for user_id, returns options as a JSON string"
        if request.forms.options:
            sUserId = request.cookies.user_id  if request.cookies.user_id  else next(userGenerator)
            dOptions = dUser[sUserId]["gc_options"]  if sUserId in dUser  else dict(dServerGCOptions)
            try:
                dOptions.update(json.loads(request.forms.options))
                dUser[sUserId] = { "time": int(time.time()), "gc_options": dOptions }
                response.set_cookie("user_id", sUserId, path="/", max_age=86400) # 24h
                return json.dumps(dUser[sUserId]["gc_options"])
            except:
                traceback.print_exc()
                return '{"error": "options not registered"}'
        return '{"error": "no options received"}'

    @app.route("/reset_options/fr", method="POST")
    def resetOptions ():
        "erase options stored for user_id"
        if request.cookies.user_id and request.cookies.user_id in dUser:
            del dUser[request.cookies.user_id]
        return "done"

    @app.route("/format_text/fr", method="POST")
    def formatText ():
        "returns text modified via the text formatter"
        return oTextFormatter.formatText(request.forms.text)

    #@app.route('/static/<filepath:path>')
    #def server_static (filepath):
    #    return static_file(filepath, root='./views/static')

    @app.route("/purge_users", method="POST")
    def purgeUsers ():
        "delete user options older than n hours"
        if not request.forms.password or "password" not in dServerOptions or not request.forms.hours:
            return "what?"
        try:
            if request.forms.password == dServerOptions["password"]:
                nNowMinusNHours = int(time.time()) - (int(request.forms.hours) * 60 * 60)
                for nUserId, dValue in dUser.items():
                    if dValue["time"] < nNowMinusNHours:
                        del dUser[nUserId]
                return "done"

            return "no"
        except:
            traceback.print_exc()
            return "error"

    # ERROR
    @app.error(404)
    def error404 (error):
        "show error when error 404"
        return 'Error 404.<br/>' + str(error)

    run(app, \
        host=dServerOptions.get('host', 'localhost'), \
        port=int(dServerOptions.get('port', 8080)))

Modified graphspell-js/ibdawg.js from [241ce099fe] to [068f06a16d].

510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
                    let sStem = ">" + this.funcStemming(sWord, this.lArcVal[nArc]);
                    // Now , we go to the next node and retrieve all following arcs values, all of them are tags
                    let iAddr2 = this._convBytesToInteger(this.byDic.slice(iEndArcAddr, iEndArcAddr+this.nBytesNodeAddress));
                    let nRawArc2 = 0;
                    while (!(nRawArc2 & this._lastArcMask)) {
                        let iEndArcAddr2 = iAddr2 + this.nBytesArc;
                        nRawArc2 = this._convBytesToInteger(this.byDic.slice(iAddr2, iEndArcAddr2));
                        l.push(sStem + " " + this.lArcVal[nRawArc2 & this._arcMask]);
                        iAddr2 = iEndArcAddr2+this.nBytesNodeAddress;
                    }
                }
                iAddr = iEndArcAddr + this.nBytesNodeAddress;
            }
            return l;
        }







|







510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
                    let sStem = ">" + this.funcStemming(sWord, this.lArcVal[nArc]);
                    // Now , we go to the next node and retrieve all following arcs values, all of them are tags
                    let iAddr2 = this._convBytesToInteger(this.byDic.slice(iEndArcAddr, iEndArcAddr+this.nBytesNodeAddress));
                    let nRawArc2 = 0;
                    while (!(nRawArc2 & this._lastArcMask)) {
                        let iEndArcAddr2 = iAddr2 + this.nBytesArc;
                        nRawArc2 = this._convBytesToInteger(this.byDic.slice(iAddr2, iEndArcAddr2));
                        l.push(sStem + "/" + this.lArcVal[nRawArc2 & this._arcMask]);
                        iAddr2 = iEndArcAddr2+this.nBytesNodeAddress;
                    }
                }
                iAddr = iEndArcAddr + this.nBytesNodeAddress;
            }
            return l;
        }

Modified graphspell-js/spellchecker.js from [3df103d578] to [5b9ccbbb56].

39
40
41
42
43
44
45




46
47
48
49
50
51
52
        this.oExtendedDic = this._loadDictionary(extentedDic, sPath);
        this.oCommunityDic = this._loadDictionary(communityDic, sPath);
        this.oPersonalDic = this._loadDictionary(personalDic, sPath);
        this.bExtendedDic = Boolean(this.oExtendedDic);
        this.bCommunityDic = Boolean(this.oCommunityDic);
        this.bPersonalDic = Boolean(this.oPersonalDic);
        this.oTokenizer = null;




    }

    _loadDictionary (dictionary, sPath="", bNecessary=false) {
        // returns an IBDAWG object
        if (!dictionary) {
            return null;
        }







>
>
>
>







39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
        this.oExtendedDic = this._loadDictionary(extentedDic, sPath);
        this.oCommunityDic = this._loadDictionary(communityDic, sPath);
        this.oPersonalDic = this._loadDictionary(personalDic, sPath);
        this.bExtendedDic = Boolean(this.oExtendedDic);
        this.bCommunityDic = Boolean(this.oCommunityDic);
        this.bPersonalDic = Boolean(this.oPersonalDic);
        this.oTokenizer = null;
        // storage
        this.bStorage = false;
        this._dMorphologies = new Map();            // key: flexion, value: list of morphologies
        this._dLemmas = new Map();                  // key: flexion, value: list of lemmas
    }

    _loadDictionary (dictionary, sPath="", bNecessary=false) {
        // returns an IBDAWG object
        if (!dictionary) {
            return null;
        }
130
131
132
133
134
135
136
















137
138
139
140
141
142
143
        this.bCommunityDic = false;
    }

    deactivatePersonalDictionary () {
        this.bPersonalDic = false;
    }


















    // parse text functions

    parseParagraph (sText) {
        if (!this.oTokenizer) {
            this.loadTokenizer();
        }







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
        this.bCommunityDic = false;
    }

    deactivatePersonalDictionary () {
        this.bPersonalDic = false;
    }


    // Storage

    activateStorage () {
        this.bStorage = true;
    }

    deactivateStorage () {
        this.bStorage = false;
    }

    clearStorage () {
        this._dLemmas.clear();
        this._dMorphologies.clear();
    }


    // parse text functions

    parseParagraph (sText) {
        if (!this.oTokenizer) {
            this.loadTokenizer();
        }
201
202
203
204
205
206
207



208
209
210
211
212
213
214
215
216
217





218











219
220
221
222
223
224
225
            return true;
        }
        return false;
    }

    getMorph (sWord) {
        // retrieves morphologies list, different casing allowed



        let lResult = this.oMainDic.getMorph(sWord);
        if (this.bExtendedDic) {
            lResult.push(...this.oExtendedDic.getMorph(sWord));
        }
        if (this.bCommunityDic) {
            lResult.push(...this.oCommunityDic.getMorph(sWord));
        }
        if (this.bPersonalDic) {
            lResult.push(...this.oPersonalDic.getMorph(sWord));
        }





        return lResult;











    }

    * suggest (sWord, nSuggLimit=10) {
        // generator: returns 1, 2 or 3 lists of suggestions
        yield this.oMainDic.suggest(sWord, nSuggLimit);
        if (this.bExtendedDic) {
            yield this.oExtendedDic.suggest(sWord, nSuggLimit);







>
>
>
|

|


|


|

>
>
>
>
>
|
>
>
>
>
>
>
>
>
>
>
>







221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
            return true;
        }
        return false;
    }

    getMorph (sWord) {
        // retrieves morphologies list, different casing allowed
        if (this.bStorage && this._dMorphologies.has(sWord)) {
            return this._dMorphologies.get(sWord);
        }
        let lMorph = this.oMainDic.getMorph(sWord);
        if (this.bExtendedDic) {
            lMorph.push(...this.oExtendedDic.getMorph(sWord));
        }
        if (this.bCommunityDic) {
            lMorph.push(...this.oCommunityDic.getMorph(sWord));
        }
        if (this.bPersonalDic) {
            lMorph.push(...this.oPersonalDic.getMorph(sWord));
        }
        if (this.bStorage) {
            this._dMorphologies.set(sWord, lMorph);
            this._dLemmas.set(sWord, Array.from(new Set(this.getMorph(sWord).map((sMorph) => { return sMorph.slice(1, sMorph.indexOf("/")); }))));
            //console.log(sWord, this._dLemmas.get(sWord));
        }
        return lMorph;
    }

    getLemma (sWord) {
        // retrieves lemmas
        if (this.bStorage) {
            if (!this._dLemmas.has(sWord)) {
                this.getMorph(sWord);
            }
            return this._dLemmas.get(sWord);
        }
        return Array.from(new Set(this.getMorph(sWord).map((sMorph) => { return sMorph.slice(1, sMorph.indexOf("/")); })));
    }

    * suggest (sWord, nSuggLimit=10) {
        // generator: returns 1, 2 or 3 lists of suggestions
        yield this.oMainDic.suggest(sWord, nSuggLimit);
        if (this.bExtendedDic) {
            yield this.oExtendedDic.suggest(sWord, nSuggLimit);

Modified graphspell-js/tokenizer.js from [bdd895b918] to [4a5b091820].

14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29

30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47

48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67

68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
const aTkzPatterns = {
    // All regexps must start with ^.
    "default":
        [
            [/^[   \t]+/, 'SPACE'],
            [/^\/(?:~|bin|boot|dev|etc|home|lib|mnt|opt|root|sbin|tmp|usr|var|Bureau|Documents|Images|Musique|Public|Téléchargements|Vidéos)(?:\/[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_.()-]+)*/, 'FOLDERUNIX'],
            [/^[a-zA-Z]:\\(?:Program Files(?: \(x86\)|)|[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st.()]+)(?:\\[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_.()-]+)*/, 'FOLDERWIN'],
            [/^[,.;:!?…«»“”‘’"(){}\[\]/·–—]+/, 'SEPARATOR'],
            [/^[A-Z][.][A-Z][.](?:[A-Z][.])*/, 'ACRONYM'],
            [/^(?:https?:\/\/|www[.]|[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_-]+[@.][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_-]{2,}[@.])[a-zA-Z0-9][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_.\/?&!%=+*"'@$#-]+/, 'LINK'],
            [/^[#@][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_-]+/, 'TAG'],
            [/^<[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+.*?>|<\/[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+ *>/, 'HTML'],
            [/^\[\/?[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+\]/, 'PSEUDOHTML'],
            [/^&\w+;(?:\w+;|)/, 'HTMLENTITY'],
            [/^\d\d?h\d\d\b/, 'HOUR'],
            [/^-?\d+(?:[.,]\d+|)/, 'NUM'],

            [/^[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+(?:[’'`-][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+)*/, 'WORD']
        ],
    "fr":
        [
            [/^[   \t]+/, 'SPACE'],
            [/^\/(?:~|bin|boot|dev|etc|home|lib|mnt|opt|root|sbin|tmp|usr|var|Bureau|Documents|Images|Musique|Public|Téléchargements|Vidéos)(?:\/[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_.()-]+)*/, 'FOLDERUNIX'],
            [/^[a-zA-Z]:\\(?:Program Files(?: \(x86\)|)|[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st.()]+)(?:\\[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_.()-]+)*/, 'FOLDERWIN'],
            [/^[,.;:!?…«»“”‘’"(){}\[\]/·–—]+/, 'SEPARATOR'],
            [/^[A-Z][.][A-Z][.](?:[A-Z][.])*/, 'ACRONYM'],
            [/^(?:https?:\/\/|www[.]|[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_-]+[@.][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_-]{2,}[@.])[a-zA-Z0-9][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_.\/?&!%=+*"'@$#-]+/, 'LINK'],
            [/^[#@][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_-]+/, 'TAG'],
            [/^<[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+.*?>|<\/[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+ *>/, 'HTML'],
            [/^\[\/?[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+\]/, 'PSEUDOHTML'],
            [/^&\w+;(?:\w+;|)/, 'HTMLENTITY'],
            [/^(?:l|d|n|m|t|s|j|c|ç|lorsqu|puisqu|jusqu|quoiqu|qu)['’`]/i, 'ELPFX'],
            [/^\d\d?[hm]\d\d\b/, 'HOUR'],
            [/^\d+(?:er|nd|e|de|ième|ème|eme)s?\b/, 'ORDINAL'],
            [/^-?\d+(?:[.,]\d+|)/, 'NUM'],

            [/^[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+(?:[’'`-][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+)*/, 'WORD']
        ]
};


class Tokenizer {

    constructor (sLang) {
        this.sLang = sLang;
        if (!aTkzPatterns.hasOwnProperty(sLang)) {
            this.sLang = "default";
        }
        this.aRules = aTkzPatterns[this.sLang];
    }

    * genTokens (sText) {
        let m;
        let i = 0;
        while (sText) {
            let nCut = 1;

            for (let [zRegex, sType] of this.aRules) {
                try {
                    if ((m = zRegex.exec(sText)) !== null) {
                        if (sType == 'SEPARATOR') {
                            for (let c of m[0]) {
                                yield { "sType": sType, "sValue": c, "nStart": i, "nEnd": i + m[0].length }
                            }
                        } else {
                            yield { "sType": sType, "sValue": m[0], "nStart": i, "nEnd": i + m[0].length }
                        }
                        nCut = m[0].length;
                        break;
                    }
                }
                catch (e) {
                    helpers.logerror(e);
                }
            }
            i += nCut;
            sText = sText.slice(nCut);
        }
    }
}


if (typeof(exports) !== 'undefined') {
    exports.Tokenizer = Tokenizer;
}







|
|






|
>







|
|





|

|
|
>

















|

|
>



|
<
<
<
<
|
<
|







|
|








14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74




75

76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
const aTkzPatterns = {
    // All regexps must start with ^.
    "default":
        [
            [/^[   \t]+/, 'SPACE'],
            [/^\/(?:~|bin|boot|dev|etc|home|lib|mnt|opt|root|sbin|tmp|usr|var|Bureau|Documents|Images|Musique|Public|Téléchargements|Vidéos)(?:\/[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_.()-]+)*/, 'FOLDERUNIX'],
            [/^[a-zA-Z]:\\(?:Program Files(?: \(x86\)|)|[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st.()]+)(?:\\[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_.()-]+)*/, 'FOLDERWIN'],
            [/^[,.;:!?…«»“”‘’"(){}\[\]·–—]/, 'SEPARATOR'],
            [/^[A-Z][.][A-Z][.](?:[A-Z][.])*/, 'WORD_ACRONYM'],
            [/^(?:https?:\/\/|www[.]|[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_-]+[@.][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_-]{2,}[@.])[a-zA-Z0-9][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_.\/?&!%=+*"'@$#-]+/, 'LINK'],
            [/^[#@][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_-]+/, 'TAG'],
            [/^<[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+.*?>|<\/[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+ *>/, 'HTML'],
            [/^\[\/?[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+\]/, 'PSEUDOHTML'],
            [/^&\w+;(?:\w+;|)/, 'HTMLENTITY'],
            [/^\d\d?h\d\d\b/, 'HOUR'],
            [/^\d+(?:[.,]\d+|)/, 'NUM'],
            [/^[%‰+=*/<>⩾⩽-]/, 'SIGN'],
            [/^[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+(?:[’'`-][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+)*/, 'WORD']
        ],
    "fr":
        [
            [/^[   \t]+/, 'SPACE'],
            [/^\/(?:~|bin|boot|dev|etc|home|lib|mnt|opt|root|sbin|tmp|usr|var|Bureau|Documents|Images|Musique|Public|Téléchargements|Vidéos)(?:\/[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_.()-]+)*/, 'FOLDERUNIX'],
            [/^[a-zA-Z]:\\(?:Program Files(?: \(x86\)|)|[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st.()]+)(?:\\[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_.()-]+)*/, 'FOLDERWIN'],
            [/^[,.;:!?…«»“”‘’"(){}\[\]·–—]/, 'SEPARATOR'],
            [/^[A-Z][.][A-Z][.](?:[A-Z][.])*/, 'WORD_ACRONYM'],
            [/^(?:https?:\/\/|www[.]|[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_-]+[@.][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_-]{2,}[@.])[a-zA-Z0-9][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_.\/?&!%=+*"'@$#-]+/, 'LINK'],
            [/^[#@][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_-]+/, 'TAG'],
            [/^<[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+.*?>|<\/[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+ *>/, 'HTML'],
            [/^\[\/?[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+\]/, 'PSEUDOHTML'],
            [/^&\w+;(?:\w+;|)/, 'HTMLENTITY'],
            [/^(?:l|d|n|m|t|s|j|c|ç|lorsqu|puisqu|jusqu|quoiqu|qu)['’`]/i, 'WORD_ELIDED'],
            [/^\d\d?[hm]\d\d\b/, 'HOUR'],
            [/^\d+(?:ers?|nds?|es?|des?|ièmes?|èmes?|emes?|ᵉʳˢ?|ⁿᵈˢ?|ᵉˢ?|ᵈᵉˢ?)\b/, 'WORD_ORDINAL'],
            [/^\d+(?:[.,]\d+|)/, 'NUM'],
            [/^[%‰+=*/<>⩾⩽-]/, 'SIGN'],
            [/^[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+(?:[’'`-][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+)*/, 'WORD']
        ]
};


class Tokenizer {

    constructor (sLang) {
        this.sLang = sLang;
        if (!aTkzPatterns.hasOwnProperty(sLang)) {
            this.sLang = "default";
        }
        this.aRules = aTkzPatterns[this.sLang];
    }

    * genTokens (sText) {
        let m;
        let iNext = 0;
        while (sText) {
            let iCut = 1;
            let iToken = 0;
            for (let [zRegex, sType] of this.aRules) {
                try {
                    if ((m = zRegex.exec(sText)) !== null) {
                        iToken += 1;




                        yield { "i": iToken, "sType": sType, "sValue": m[0], "nStart": iNext, "nEnd": iNext + m[0].length }

                        iCut = m[0].length;
                        break;
                    }
                }
                catch (e) {
                    helpers.logerror(e);
                }
            }
            iNext += iCut;
            sText = sText.slice(iCut);
        }
    }
}


if (typeof(exports) !== 'undefined') {
    exports.Tokenizer = Tokenizer;
}

Modified graphspell/__init__.py from [a53bdfb757] to [7e05700bdd].










1
2










from .spellchecker import *
>
>
>
>
>
>
>
>
>


1
2
3
4
5
6
7
8
9
10
11

"""
SPELLCHECKER
using a Direct Acyclic Word Graph
with a transducer to retrieve
- lemma of words
- morphologies
with a spell suggestion mechanism
"""

from .spellchecker import *

Modified graphspell/char_player.py from [0a316c953c] to [8c9fd715c3].


1
2

3
4
5
6
7
8
9
10
11
12

13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31

# list of similar chars
# useful for suggestion mechanism


import re
import unicodedata


_xTransCharsForSpelling = str.maketrans({
    'ſ': 's',  'ffi': 'ffi',  'ffl': 'ffl',  'ff': 'ff',  'ſt': 'ft',  'fi': 'fi',  'fl': 'fl',  'st': 'st'
})

def spellingNormalization (sWord):

    return unicodedata.normalize("NFC", sWord.translate(_xTransCharsForSpelling))


_xTransCharsForSimplification = str.maketrans({
    'à': 'a',  'é': 'e',  'î': 'i',  'ô': 'o',  'û': 'u',  'ÿ': 'i',  "y": "i",
    'â': 'a',  'è': 'e',  'ï': 'i',  'ö': 'o',  'ù': 'u',  'ŷ': 'i',
    'ä': 'a',  'ê': 'e',  'í': 'i',  'ó': 'o',  'ü': 'u',  'ý': 'i',
    'á': 'a',  'ë': 'e',  'ì': 'i',  'ò': 'o',  'ú': 'u',  'ỳ': 'i',
    'ā': 'a',  'ē': 'e',  'ī': 'i',  'ō': 'o',  'ū': 'u',  'ȳ': 'i',
    'ç': 'c',  'ñ': 'n',  'k': 'q',  'w': 'v',
    'œ': 'oe',  'æ': 'ae',
    'ſ': 's',  'ffi': 'ffi',  'ffl': 'ffl',  'ff': 'ff',  'ſt': 'ft',  'fi': 'fi',  'fl': 'fl',  'st': 'st', 
})

def simplifyWord (sWord):
    "word simplication before calculating distance between words"
    sWord = sWord.lower().translate(_xTransCharsForSimplification)
    sNewWord = ""
    for i, c in enumerate(sWord, 1):
>
|
|
>










>











|







1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
"""
List of similar chars
useful for suggestion mechanism
"""

import re
import unicodedata


_xTransCharsForSpelling = str.maketrans({
    'ſ': 's',  'ffi': 'ffi',  'ffl': 'ffl',  'ff': 'ff',  'ſt': 'ft',  'fi': 'fi',  'fl': 'fl',  'st': 'st'
})

def spellingNormalization (sWord):
    "nomalization NFC and removing ligatures"
    return unicodedata.normalize("NFC", sWord.translate(_xTransCharsForSpelling))


_xTransCharsForSimplification = str.maketrans({
    'à': 'a',  'é': 'e',  'î': 'i',  'ô': 'o',  'û': 'u',  'ÿ': 'i',  "y": "i",
    'â': 'a',  'è': 'e',  'ï': 'i',  'ö': 'o',  'ù': 'u',  'ŷ': 'i',
    'ä': 'a',  'ê': 'e',  'í': 'i',  'ó': 'o',  'ü': 'u',  'ý': 'i',
    'á': 'a',  'ë': 'e',  'ì': 'i',  'ò': 'o',  'ú': 'u',  'ỳ': 'i',
    'ā': 'a',  'ē': 'e',  'ī': 'i',  'ō': 'o',  'ū': 'u',  'ȳ': 'i',
    'ç': 'c',  'ñ': 'n',  'k': 'q',  'w': 'v',
    'œ': 'oe',  'æ': 'ae',
    'ſ': 's',  'ffi': 'ffi',  'ffl': 'ffl',  'ff': 'ff',  'ſt': 'ft',  'fi': 'fi',  'fl': 'fl',  'st': 'st',
})

def simplifyWord (sWord):
    "word simplication before calculating distance between words"
    sWord = sWord.lower().translate(_xTransCharsForSimplification)
    sNewWord = ""
    for i, c in enumerate(sWord, 1):
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
    "Ë": "EeÉéÈèÊêËëĒēŒœ",

    "f": "fF",
    "F": "Ff",

    "g": "gGjJĵĴ",
    "G": "GgJjĴĵ",
    
    "h": "hH",
    "H": "Hh",

    "i": "iIîÎïÏyYíÍìÌīĪÿŸ",
    "I": "IiÎîÏïYyÍíÌìĪīŸÿ",
    "î": "iIîÎïÏyYíÍìÌīĪÿŸ",
    "Î": "IiÎîÏïYyÍíÌìĪīŸÿ",







|







93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
    "Ë": "EeÉéÈèÊêËëĒēŒœ",

    "f": "fF",
    "F": "Ff",

    "g": "gGjJĵĴ",
    "G": "GgJjĴĵ",

    "h": "hH",
    "H": "Hh",

    "i": "iIîÎïÏyYíÍìÌīĪÿŸ",
    "I": "IiÎîÏïYyÍíÌìĪīŸÿ",
    "î": "iIîÎïÏyYíÍìÌīĪÿŸ",
    "Î": "IiÎîÏïYyÍíÌìĪīŸÿ",
235
236
237
238
239
240
241

242
243
244
245
246
247
248
    "X": ("CC", "CT", "XX"),
    "z": ("ss", "zh"),
    "Z": ("SS", "ZH"),
}


def get1toXReplacement (cPrev, cCur, cNext):

    if cCur in aConsonant  and  (cPrev in aConsonant  or  cNext in aConsonant):
        return ()
    return d1toX.get(cCur, ())


d2toX = {
    "am": ("an", "en", "em"),







>







238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
    "X": ("CC", "CT", "XX"),
    "z": ("ss", "zh"),
    "Z": ("SS", "ZH"),
}


def get1toXReplacement (cPrev, cCur, cNext):
    "return tuple of replacements for <cCur>"
    if cCur in aConsonant  and  (cPrev in aConsonant  or  cNext in aConsonant):
        return ()
    return d1toX.get(cCur, ())


d2toX = {
    "am": ("an", "en", "em"),

Modified graphspell/dawg.py from [8afc042909] to [257e064164].

1
2

3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25

26
27
28
29
30
31
32
#!python3


# FSA DICTIONARY BUILDER
#
# by Olivier R.
# License: MPL 2
#
# This tool encodes lexicon into an indexable binary dictionary 
# Input files MUST be encoded in UTF-8.


import sys
import os
import collections
import json
import time
import re
import traceback

from . import str_transform as st
from .progressbar import ProgressBar



def readFile (spf):

    print(" < Read lexicon: " + spf)
    if os.path.isfile(spf):
        with open(spf, "r", encoding="utf-8") as hSrc:
            for sLine in hSrc:
                sLine = sLine.strip()
                if sLine and not sLine.startswith("#"):
                    yield sLine.split("\t")


>
|
|
|
|
|
|
|
|















>







1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
#!python3

"""
FSA DICTIONARY BUILDER

by Olivier R.
License: MPL 2

This tool encodes lexicon into an indexable binary dictionary
Input files MUST be encoded in UTF-8.
"""

import sys
import os
import collections
import json
import time
import re
import traceback

from . import str_transform as st
from .progressbar import ProgressBar



def readFile (spf):
    "generator: read file <spf> and return for each line a list of elements separated by a tabulation."
    print(" < Read lexicon: " + spf)
    if os.path.isfile(spf):
        with open(spf, "r", encoding="utf-8") as hSrc:
            for sLine in hSrc:
                sLine = sLine.strip()
                if sLine and not sLine.startswith("#"):
                    yield sLine.split("\t")
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
                    dTag[sTag] = nTag
                    lTag.append(sTag)
                    nTag += 1
                dTagOccur[sTag] = dTagOccur.get(sTag, 0) + 1
                aEntry.add((sFlex, dAff[sAff], dTag[sTag]))
        if not aEntry:
            raise ValueError("# Error. Empty lexicon")
        
        # Preparing DAWG
        print(" > Preparing list of words")
        print(" Filter: " + (sSelectFilterRegex or "[None]"))
        lVal = lChar + lAff + lTag
        lWord = [ [dChar[c] for c in sFlex] + [iAff+nChar] + [iTag+nChar+nAff]  for sFlex, iAff, iTag in aEntry ]
        aEntry = None
        
        # Dictionary of arc values occurrency, to sort arcs of each node
        dValOccur = dict( [ (dChar[c], dCharOccur[c])  for c in dChar ] \
                        + [ (dAff[aff]+nChar, dAffOccur[aff]) for aff in dAff ] \
                        + [ (dTag[tag]+nChar+nAff, dTagOccur[tag]) for tag in dTag ] )
        
        self.sFileName = src  if type(src) is str  else "[None]"
        self.sLangCode = sLangCode
        self.sLangName = sLangName
        self.sDicName = sDicName
        self.nEntry = len(lWord)
        self.aPreviousEntry = []
        DawgNode.resetNextId()







|






|




|







97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
                    dTag[sTag] = nTag
                    lTag.append(sTag)
                    nTag += 1
                dTagOccur[sTag] = dTagOccur.get(sTag, 0) + 1
                aEntry.add((sFlex, dAff[sAff], dTag[sTag]))
        if not aEntry:
            raise ValueError("# Error. Empty lexicon")

        # Preparing DAWG
        print(" > Preparing list of words")
        print(" Filter: " + (sSelectFilterRegex or "[None]"))
        lVal = lChar + lAff + lTag
        lWord = [ [dChar[c] for c in sFlex] + [iAff+nChar] + [iTag+nChar+nAff]  for sFlex, iAff, iTag in aEntry ]
        aEntry = None

        # Dictionary of arc values occurrency, to sort arcs of each node
        dValOccur = dict( [ (dChar[c], dCharOccur[c])  for c in dChar ] \
                        + [ (dAff[aff]+nChar, dAffOccur[aff]) for aff in dAff ] \
                        + [ (dTag[tag]+nChar+nAff, dTagOccur[tag]) for tag in dTag ] )

        self.sFileName = src  if type(src) is str  else "[None]"
        self.sLangCode = sLangCode
        self.sLangName = sLangName
        self.sDicName = sDicName
        self.nEntry = len(lWord)
        self.aPreviousEntry = []
        DawgNode.resetNextId()
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158

159
160
161
162
163
164
165
166
167
168
        self.nAff = nAff
        self.lArcVal = lVal
        self.nArcVal = len(lVal)
        self.nTag = self.nArcVal - self.nChar - nAff
        self.cStemming = cStemming
        if cStemming == "A":
            self.funcStemming = st.changeWordWithAffixCode
        elif cStemming == "S":    
            self.funcStemming = st.changeWordWithSuffixCode
        else:
            self.funcStemming = st.noStemming
        
        # build
        lWord.sort()
        oProgBar = ProgressBar(0, len(lWord))
        for aEntry in lWord:
            self.insert(aEntry)
            oProgBar.increment(1)
        oProgBar.done()
        self.finish()
        self.countNodes()
        self.countArcs()
        self.sortNodes()         # version 2 and 3 
        self.sortNodeArcs(dValOccur)
        #self.sortNodeArcs2 (self.oRoot, "")
        self.displayInfo()

    # BUILD DAWG
    def insert (self, aEntry):

        if aEntry < self.aPreviousEntry:
            sys.exit("# Error: Words must be inserted in alphabetical order.")
        
        # find common prefix between word and previous word
        nCommonPrefix = 0
        for i in range(min(len(aEntry), len(self.aPreviousEntry))):
            if aEntry[i] != self.aPreviousEntry[i]:
                break
            nCommonPrefix += 1








|



|










|






>


|







132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
        self.nAff = nAff
        self.lArcVal = lVal
        self.nArcVal = len(lVal)
        self.nTag = self.nArcVal - self.nChar - nAff
        self.cStemming = cStemming
        if cStemming == "A":
            self.funcStemming = st.changeWordWithAffixCode
        elif cStemming == "S":
            self.funcStemming = st.changeWordWithSuffixCode
        else:
            self.funcStemming = st.noStemming

        # build
        lWord.sort()
        oProgBar = ProgressBar(0, len(lWord))
        for aEntry in lWord:
            self.insert(aEntry)
            oProgBar.increment(1)
        oProgBar.done()
        self.finish()
        self.countNodes()
        self.countArcs()
        self.sortNodes()         # version 2 and 3
        self.sortNodeArcs(dValOccur)
        #self.sortNodeArcs2 (self.oRoot, "")
        self.displayInfo()

    # BUILD DAWG
    def insert (self, aEntry):
        "insert a new entry (insertion must be made in alphabetical order)."
        if aEntry < self.aPreviousEntry:
            sys.exit("# Error: Words must be inserted in alphabetical order.")

        # find common prefix between word and previous word
        nCommonPrefix = 0
        for i in range(min(len(aEntry), len(self.aPreviousEntry))):
            if aEntry[i] != self.aPreviousEntry[i]:
                break
            nCommonPrefix += 1

177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
            oNode = self.lUncheckedNodes[-1][2]

        iChar = nCommonPrefix
        for c in aEntry[nCommonPrefix:]:
            oNextNode = DawgNode()
            oNode.arcs[c] = oNextNode
            self.lUncheckedNodes.append((oNode, c, oNextNode))
            if iChar == (len(aEntry) - 2): 
                oNode.final = True
            iChar += 1
            oNode = oNextNode
        oNode.final = True
        self.aPreviousEntry = aEntry

    def finish (self):







|







180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
            oNode = self.lUncheckedNodes[-1][2]

        iChar = nCommonPrefix
        for c in aEntry[nCommonPrefix:]:
            oNextNode = DawgNode()
            oNode.arcs[c] = oNextNode
            self.lUncheckedNodes.append((oNode, c, oNextNode))
            if iChar == (len(aEntry) - 2):
                oNode.final = True
            iChar += 1
            oNode = oNextNode
        oNode.final = True
        self.aPreviousEntry = aEntry

    def finish (self):
201
202
203
204
205
206
207

208
209
210

211
212
213
214
215

216
217
218
219
220
221

222
223
224
225
226
227
228
229

230
231
232
233
234
235
236
237
238
239
240
241
242
243

244
245
246
247
248
249
250
251

252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269

270
271
272
273
274
275
276
277
278
279

280
281
282
283
284
285
286
287
288
289

290
291
292
293
294
295
296
                oNode.arcs[char] = self.lMinimizedNodes[oChildNode]
            else:
                # add the state to the minimized nodes.
                self.lMinimizedNodes[oChildNode] = oChildNode
            self.lUncheckedNodes.pop()

    def countNodes (self):

        self.nNode = len(self.lMinimizedNodes)

    def countArcs (self):

        self.nArc = 0
        for oNode in self.lMinimizedNodes:
            self.nArc += len(oNode.arcs)
    
    def sortNodeArcs (self, dValOccur):

        print(" > Sort node arcs")
        self.oRoot.sortArcs(dValOccur)
        for oNode in self.lMinimizedNodes:
            oNode.sortArcs(dValOccur)
    
    def sortNodeArcs2 (self, oNode, cPrevious=""):

        # recursive function
        dCharOccur = getCharOrderAfterChar(cPrevious)
        if dCharOccur:
            oNode.sortArcs2(dCharOccur, self.lArcVal)
        for nArcVal, oNextNode in oNode.arcs.items():
            self.sortNodeArcs2(oNextNode, self.lArcVal[nArcVal])

    def sortNodes (self):

        print(" > Sort nodes")
        for oNode in self.oRoot.arcs.values():
            self._parseNodes(oNode)
    
    def _parseNodes (self, oNode):
        # Warning: recursive method
        if oNode.pos > 0:
            return
        oNode.setPos()
        self.lSortedNodes.append(oNode)
        for oNextNode in oNode.arcs.values():
             self._parseNodes(oNextNode)
        
    def lookup (self, sWord):

        oNode = self.oRoot
        for c in sWord:
            if self.dChar.get(c, '') not in oNode.arcs:
                return False
            oNode = oNode.arcs[self.dChar[c]]
        return oNode.final

    def morph (self, sWord):

        oNode = self.oRoot
        for c in sWord:
            if self.dChar.get(c, '') not in oNode.arcs:
                return ''
            oNode = oNode.arcs[self.dChar[c]]
        if oNode.final:
            s = "* "
            for arc in oNode.arcs:
                if arc >= self.nChar:
                    s += " [" + self.funcStemming(sWord, self.lArcVal[arc])
                    oNode2 = oNode.arcs[arc]
                    for arc2 in oNode2.arcs:
                        s += " / " + self.lArcVal[arc2]
                    s += "]"
            return s
        return ''

    def displayInfo (self):

        print(" * {:<12} {:>16,}".format("Entries:", self.nEntry))
        print(" * {:<12} {:>16,}".format("Characters:", self.nChar))
        print(" * {:<12} {:>16,}".format("Affixes:", self.nAff))
        print(" * {:<12} {:>16,}".format("Tags:", self.nTag))
        print(" * {:<12} {:>16,}".format("Arc values:", self.nArcVal))
        print(" * {:<12} {:>16,}".format("Nodes:", self.nNode))
        print(" * {:<12} {:>16,}".format("Arcs:", self.nArc))
        print(" * {:<12} {:>16}".format("Stemming:", self.cStemming + "FX"))

    def getArcStats (self):

        d = {}
        for oNode in self.lMinimizedNodes:
            n = len(oNode.arcs)
            d[n] = d.get(n, 0) + 1
        s = " * Nodes:\n"
        for n in d:
            s = s + " {:>9} nodes have {:>3} arcs\n".format(d[n], n)
        return s

    def writeInfo (self, sPathFile):

        print(" > Write informations")
        with open(sPathFile, 'w', encoding='utf-8', newline="\n") as hDst:
            hDst.write(self.getArcStats())
            hDst.write("\n * Values:\n")
            for i, s in enumerate(self.lArcVal):
                hDst.write(" {:>6}. {}\n".format(i, s))
            hDst.close()







>



>



|

>




|

>








>



|







|
|

>








>


















>










>










>







204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
                oNode.arcs[char] = self.lMinimizedNodes[oChildNode]
            else:
                # add the state to the minimized nodes.
                self.lMinimizedNodes[oChildNode] = oChildNode
            self.lUncheckedNodes.pop()

    def countNodes (self):
        "count the number of nodes of the whole word graph"
        self.nNode = len(self.lMinimizedNodes)

    def countArcs (self):
        "count the number of arcs in the whole word graph"
        self.nArc = 0
        for oNode in self.lMinimizedNodes:
            self.nArc += len(oNode.arcs)

    def sortNodeArcs (self, dValOccur):
        "sort arcs of each node according to <dValOccur>"
        print(" > Sort node arcs")
        self.oRoot.sortArcs(dValOccur)
        for oNode in self.lMinimizedNodes:
            oNode.sortArcs(dValOccur)

    def sortNodeArcs2 (self, oNode, cPrevious=""):
        "sort arcs of each node depending on the previous char"
        # recursive function
        dCharOccur = getCharOrderAfterChar(cPrevious)
        if dCharOccur:
            oNode.sortArcs2(dCharOccur, self.lArcVal)
        for nArcVal, oNextNode in oNode.arcs.items():
            self.sortNodeArcs2(oNextNode, self.lArcVal[nArcVal])

    def sortNodes (self):
        "sort nodes"
        print(" > Sort nodes")
        for oNode in self.oRoot.arcs.values():
            self._parseNodes(oNode)

    def _parseNodes (self, oNode):
        # Warning: recursive method
        if oNode.pos > 0:
            return
        oNode.setPos()
        self.lSortedNodes.append(oNode)
        for oNextNode in oNode.arcs.values():
            self._parseNodes(oNextNode)

    def lookup (self, sWord):
        "return True if <sWord> is within the word graph (debugging)"
        oNode = self.oRoot
        for c in sWord:
            if self.dChar.get(c, '') not in oNode.arcs:
                return False
            oNode = oNode.arcs[self.dChar[c]]
        return oNode.final

    def morph (self, sWord):
        "return a string of the morphologies of <sWord> (debugging)"
        oNode = self.oRoot
        for c in sWord:
            if self.dChar.get(c, '') not in oNode.arcs:
                return ''
            oNode = oNode.arcs[self.dChar[c]]
        if oNode.final:
            s = "* "
            for arc in oNode.arcs:
                if arc >= self.nChar:
                    s += " [" + self.funcStemming(sWord, self.lArcVal[arc])
                    oNode2 = oNode.arcs[arc]
                    for arc2 in oNode2.arcs:
                        s += " / " + self.lArcVal[arc2]
                    s += "]"
            return s
        return ''

    def displayInfo (self):
        "display informations about the word graph"
        print(" * {:<12} {:>16,}".format("Entries:", self.nEntry))
        print(" * {:<12} {:>16,}".format("Characters:", self.nChar))
        print(" * {:<12} {:>16,}".format("Affixes:", self.nAff))
        print(" * {:<12} {:>16,}".format("Tags:", self.nTag))
        print(" * {:<12} {:>16,}".format("Arc values:", self.nArcVal))
        print(" * {:<12} {:>16,}".format("Nodes:", self.nNode))
        print(" * {:<12} {:>16,}".format("Arcs:", self.nArc))
        print(" * {:<12} {:>16}".format("Stemming:", self.cStemming + "FX"))

    def getArcStats (self):
        "return a string with statistics about nodes and arcs"
        d = {}
        for oNode in self.lMinimizedNodes:
            n = len(oNode.arcs)
            d[n] = d.get(n, 0) + 1
        s = " * Nodes:\n"
        for n in d:
            s = s + " {:>9} nodes have {:>3} arcs\n".format(d[n], n)
        return s

    def writeInfo (self, sPathFile):
        "write informations in file <sPathFile>"
        print(" > Write informations")
        with open(sPathFile, 'w', encoding='utf-8', newline="\n") as hDst:
            hDst.write(self.getArcStats())
            hDst.write("\n * Values:\n")
            for i, s in enumerate(self.lArcVal):
                hDst.write(" {:>6}. {}\n".format(i, s))
            hDst.close()
392
393
394
395
396
397
398

399
400
401
402
403
404
405
                    if 1 < (oNextNode.addr - self.lSortedNodes[i].addr) < self.nMaxOffset:
                        nSize -= nDiff
                if self.lSortedNodes[i].size != nSize:
                    self.lSortedNodes[i].size = nSize
                    bEnd = False

    def getBinaryAsJSON (self, nCompressionMethod=1, bBinaryDictAsHexString=True):

        self._calculateBinary(nCompressionMethod)
        byDic = b""
        if nCompressionMethod == 1:
            byDic = self.oRoot.convToBytes1(self.nBytesArc, self.nBytesNodeAddress)
            for oNode in self.lMinimizedNodes:
                byDic += oNode.convToBytes1(self.nBytesArc, self.nBytesNodeAddress)
        elif nCompressionMethod == 2:







>







405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
                    if 1 < (oNextNode.addr - self.lSortedNodes[i].addr) < self.nMaxOffset:
                        nSize -= nDiff
                if self.lSortedNodes[i].size != nSize:
                    self.lSortedNodes[i].size = nSize
                    bEnd = False

    def getBinaryAsJSON (self, nCompressionMethod=1, bBinaryDictAsHexString=True):
        "return a JSON string containing all necessary data of the dictionary (compressed as a binary string)"
        self._calculateBinary(nCompressionMethod)
        byDic = b""
        if nCompressionMethod == 1:
            byDic = self.oRoot.convToBytes1(self.nBytesArc, self.nBytesNodeAddress)
            for oNode in self.lMinimizedNodes:
                byDic += oNode.convToBytes1(self.nBytesArc, self.nBytesNodeAddress)
        elif nCompressionMethod == 2:
434
435
436
437
438
439
440

441
442
443
444
445
446
447
448
449
450
451


452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
            # Mozilla’s JS parser don’t like file bigger than 4 Mb!
            # So, if necessary, we use an hexadecimal string, that we will convert later in Firefox’s extension.
            # https://github.com/mozilla/addons-linter/issues/1361
            "sByDic": byDic.hex()  if bBinaryDictAsHexString  else [ e  for e in byDic ]
        }

    def writeAsJSObject (self, spfDst, nCompressionMethod, bInJSModule=False, bBinaryDictAsHexString=True):

        if not spfDst.endswith(".json"):
            spfDst += "."+str(nCompressionMethod)+".json"
        with open(spfDst, "w", encoding="utf-8", newline="\n") as hDst:
            if bInJSModule:
                hDst.write('// JavaScript\n// Generated data (do not edit)\n\n"use strict";\n\nconst dictionary = ')
            hDst.write( json.dumps(self.getBinaryAsJSON(nCompressionMethod, bBinaryDictAsHexString), ensure_ascii=False) )
            if bInJSModule:
                hDst.write(";\n\nexports.dictionary = dictionary;\n")

    def writeBinary (self, sPathFile, nCompressionMethod, bDebug=False):
        """


        Format of the binary indexable dictionary:
        Each section is separated with 4 bytes of \0
        
        - Section Header:
            /grammalecte-fsa/[compression method]
                * compression method is an ASCII string
        
        - Section Informations:
            /[lang code]
            /[lang name]
            /[dictionary name]
            /[date creation]
            /[number of chars]
            /[number of bytes for each arc]
            /[number of bytes for each address node]
            /[number of entries]
            /[number of nodes]
            /[number of arcs]
            /[number of affixes]
                * each field is a ASCII string
            /[stemming code]
                * "S" means stems are generated by /suffix_code/,
                  "A" means they are generated by /affix_code/
                  See defineSuffixCode() and defineAffixCode() for details.
                  "N" means no stemming
        
        - Section Values:
                * a list of strings encoded in binary from utf-8, each value separated with a tabulation
        
        - Section Word Graph (nodes / arcs)
                * A list of nodes which are a list of arcs with an address of the next node.
                  See DawgNode.convToBytes() for details.
        """
        self._calculateBinary(nCompressionMethod)
        if not sPathFile.endswith(".bdic"):
            sPathFile += "."+str(nCompressionMethod)+".bdic"







>











>
>


|



|


















|


|







448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
            # Mozilla’s JS parser don’t like file bigger than 4 Mb!
            # So, if necessary, we use an hexadecimal string, that we will convert later in Firefox’s extension.
            # https://github.com/mozilla/addons-linter/issues/1361
            "sByDic": byDic.hex()  if bBinaryDictAsHexString  else [ e  for e in byDic ]
        }

    def writeAsJSObject (self, spfDst, nCompressionMethod, bInJSModule=False, bBinaryDictAsHexString=True):
        "write a file (JSON or JS module) with all the necessary data"
        if not spfDst.endswith(".json"):
            spfDst += "."+str(nCompressionMethod)+".json"
        with open(spfDst, "w", encoding="utf-8", newline="\n") as hDst:
            if bInJSModule:
                hDst.write('// JavaScript\n// Generated data (do not edit)\n\n"use strict";\n\nconst dictionary = ')
            hDst.write( json.dumps(self.getBinaryAsJSON(nCompressionMethod, bBinaryDictAsHexString), ensure_ascii=False) )
            if bInJSModule:
                hDst.write(";\n\nexports.dictionary = dictionary;\n")

    def writeBinary (self, sPathFile, nCompressionMethod, bDebug=False):
        """
        Save as a binary file.

        Format of the binary indexable dictionary:
        Each section is separated with 4 bytes of \0

        - Section Header:
            /grammalecte-fsa/[compression method]
                * compression method is an ASCII string

        - Section Informations:
            /[lang code]
            /[lang name]
            /[dictionary name]
            /[date creation]
            /[number of chars]
            /[number of bytes for each arc]
            /[number of bytes for each address node]
            /[number of entries]
            /[number of nodes]
            /[number of arcs]
            /[number of affixes]
                * each field is a ASCII string
            /[stemming code]
                * "S" means stems are generated by /suffix_code/,
                  "A" means they are generated by /affix_code/
                  See defineSuffixCode() and defineAffixCode() for details.
                  "N" means no stemming

        - Section Values:
                * a list of strings encoded in binary from utf-8, each value separated with a tabulation

        - Section Word Graph (nodes / arcs)
                * A list of nodes which are a list of arcs with an address of the next node.
                  See DawgNode.convToBytes() for details.
        """
        self._calculateBinary(nCompressionMethod)
        if not sPathFile.endswith(".bdic"):
            sPathFile += "."+str(nCompressionMethod)+".bdic"
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541


542
543
544
545
546
547
548
549
550
551
552
553
554
555

556
557
558

559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580

581
582
583

584
585
586
587
588


589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627

628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646


647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
        return time.strftime("%Y.%m.%d, %H:%M")

    def _writeNodes (self, sPathFile, nCompressionMethod):
        "for debugging only"
        print(" > Write nodes")
        with open(sPathFile+".nodes."+str(nCompressionMethod)+".txt", 'w', encoding='utf-8', newline="\n") as hDst:
            if nCompressionMethod == 1:
                hDst.write(self.oRoot.getTxtRepr1(self.nBytesArc, self.nBytesNodeAddress, self.lArcVal)+"\n")
                #hDst.write( ''.join( [ "%02X " %  z  for z in self.oRoot.convToBytes1(self.nBytesArc, self.nBytesNodeAddress) ] ).strip() )
                for oNode in self.lMinimizedNodes:
                    hDst.write(oNode.getTxtRepr1(self.nBytesArc, self.nBytesNodeAddress, self.lArcVal)+"\n")
            if nCompressionMethod == 2:
                hDst.write(self.oRoot.getTxtRepr2(self.nBytesArc, self.nBytesNodeAddress, self.lArcVal)+"\n")
                for oNode in self.lSortedNodes:
                    hDst.write(oNode.getTxtRepr2(self.nBytesArc, self.nBytesNodeAddress, self.lArcVal)+"\n")
            if nCompressionMethod == 3:
                hDst.write(self.oRoot.getTxtRepr3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset, self.lArcVal)+"\n")
                #hDst.write( ''.join( [ "%02X " %  z  for z in self.oRoot.convToBytes3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset) ] ).strip() )
                for oNode in self.lSortedNodes:
                    hDst.write(oNode.getTxtRepr3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset, self.lArcVal)+"\n")



class DawgNode:


    NextId = 0
    NextPos = 1 # (version 2)
    
    def __init__ (self):
        self.i = DawgNode.NextId
        DawgNode.NextId += 1
        self.final = False
        self.arcs = {}          # key: arc value; value: a node
        self.addr = 0           # address in the binary dictionary
        self.pos = 0            # position in the binary dictionary (version 2)
        self.size = 0           # size of node in bytes (version 3)

    @classmethod
    def resetNextId (cls):

        cls.NextId = 0

    def setPos (self): # version 2

        self.pos = DawgNode.NextPos
        DawgNode.NextPos += 1

    def __str__ (self):
        # Caution! this function is used for hashing and comparison!
        sFinalChar = "1"  if self.final  else "0";
        l = [sFinalChar]
        for (key, node) in self.arcs.items():
            l.append(str(key))
            l.append(str(node.i))
        return "_".join(l)

    def __hash__ (self):
        # Used as a key in a python dictionary.
        return self.__str__().__hash__()

    def __eq__ (self, other):
        # Used as a key in a python dictionary.
        # Nodes are equivalent if they have identical arcs, and each identical arc leads to identical states.
        return self.__str__() == other.__str__()

    def sortArcs (self, dValOccur):

        self.arcs = collections.OrderedDict(sorted(self.arcs.items(), key=lambda t: dValOccur.get(t[0], 0), reverse=True))

    def sortArcs2 (self, dValOccur, lArcVal):

        self.arcs = collections.OrderedDict(sorted(self.arcs.items(), key=lambda t: dValOccur.get(lArcVal[t[0]], 0), reverse=True))

    # VERSION 1 =====================================================================================================
    def convToBytes1 (self, nBytesArc, nBytesNodeAddress):
        """


        Node scheme:
        - Arc length is defined by nBytesArc
        - Address length is defined by nBytesNodeAddress
                                       
        |                Arc                |                         Address of next node                          |
        |                                   |                                                                       |
         /---------------\ /---------------\ /---------------\ /---------------\ /---------------\ /---------------\
         | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
         \---------------/ \---------------/ \---------------/ \---------------/ \---------------/ \---------------/
         [...]
         /---------------\ /---------------\ /---------------\ /---------------\ /---------------\ /---------------\
         | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
         \---------------/ \---------------/ \---------------/ \---------------/ \---------------/ \---------------/
          ^ ^
          | |
          | |
          |  \___ if 1, last arc of this node
           \_____ if 1, this node is final (only on the first arc)
        """
        nArc = len(self.arcs)
        nFinalNodeMask = 1 << ((nBytesArc*8)-1)
        nFinalArcMask = 1 << ((nBytesArc*8)-2)
        if len(self.arcs) == 0:
            val = nFinalNodeMask | nFinalArcMask
            by = val.to_bytes(nBytesArc, byteorder='big')
            by += (0).to_bytes(nBytesNodeAddress, byteorder='big')
            return by
        by = b""
        for i, arc in enumerate(self.arcs, 1):
            val = arc
            if i == 1 and self.final:
                val = val | nFinalNodeMask
            if i == nArc:
                val = val | nFinalArcMask
            by += val.to_bytes(nBytesArc, byteorder='big')
            by += self.arcs[arc].addr.to_bytes(nBytesNodeAddress, byteorder='big')
        return by
        
    def getTxtRepr1 (self, nBytesArc, nBytesNodeAddress, lVal):

        nArc = len(self.arcs)
        nFinalNodeMask = 1 << ((nBytesArc*8)-1)
        nFinalArcMask = 1 << ((nBytesArc*8)-2)
        s = "i{:_>10} -- #{:_>10}\n".format(self.i, self.addr)
        if len(self.arcs) == 0:
            s += "  {:<20}  {:0>16}  i{:_>10}   #{:_>10}\n".format("", bin(nFinalNodeMask | nFinalArcMask)[2:], "0", "0")
            return s
        for i, arc in enumerate(self.arcs, 1):
            val = arc
            if i == 1 and self.final:
                val = val | nFinalNodeMask
            if i == nArc:
                val = val | nFinalArcMask
            s += "  {:<20}  {:0>16}  i{:_>10}   #{:_>10}\n".format(lVal[arc], bin(val)[2:], self.arcs[arc].i, self.arcs[arc].addr)
        return s

    # VERSION 2 =====================================================================================================
    def convToBytes2 (self, nBytesArc, nBytesNodeAddress):
        """


        Node scheme:
        - Arc length is defined by nBytesArc
        - Address length is defined by nBytesNodeAddress
                                       
        |                Arc                |                         Address of next node                          |
        |                                   |                                                                       |
         /---------------\ /---------------\ /---------------\ /---------------\ /---------------\ /---------------\
         | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
         \---------------/ \---------------/ \---------------/ \---------------/ \---------------/ \---------------/
         [...]
         /---------------\ /---------------\ /---------------\ /---------------\ /---------------\ /---------------\
         | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
         \---------------/ \---------------/ \---------------/ \---------------/ \---------------/ \---------------/
          ^ ^ ^
          | | |
          | |  \_ if 1, caution, no address: next node is the following node
          |  \___ if 1, last arc of this node
           \_____ if 1, this node is final (only on the first arc)
        """
        nArc = len(self.arcs)
        nFinalNodeMask = 1 << ((nBytesArc*8)-1)
        nFinalArcMask = 1 << ((nBytesArc*8)-2)
        nNextNodeMask = 1 << ((nBytesArc*8)-3)
        if len(self.arcs) == 0:
            val = nFinalNodeMask | nFinalArcMask







|


|

|

|

|


|




>
>


|











>



>





|
















>



>





>
>



|


|
|
|

|
|
|

|
|
|
|



















|
|
>



















>
>



|


|
|
|

|
|
|

|
|
|
|







535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
        return time.strftime("%Y.%m.%d, %H:%M")

    def _writeNodes (self, sPathFile, nCompressionMethod):
        "for debugging only"
        print(" > Write nodes")
        with open(sPathFile+".nodes."+str(nCompressionMethod)+".txt", 'w', encoding='utf-8', newline="\n") as hDst:
            if nCompressionMethod == 1:
                hDst.write(self.oRoot.getTxtRepr1(self.nBytesArc, self.lArcVal)+"\n")
                #hDst.write( ''.join( [ "%02X " %  z  for z in self.oRoot.convToBytes1(self.nBytesArc, self.nBytesNodeAddress) ] ).strip() )
                for oNode in self.lMinimizedNodes:
                    hDst.write(oNode.getTxtRepr1(self.nBytesArc, self.lArcVal)+"\n")
            if nCompressionMethod == 2:
                hDst.write(self.oRoot.getTxtRepr2(self.nBytesArc, self.lArcVal)+"\n")
                for oNode in self.lSortedNodes:
                    hDst.write(oNode.getTxtRepr2(self.nBytesArc, self.lArcVal)+"\n")
            if nCompressionMethod == 3:
                hDst.write(self.oRoot.getTxtRepr3(self.nBytesArc, self.nBytesOffset, self.lArcVal)+"\n")
                #hDst.write( ''.join( [ "%02X " %  z  for z in self.oRoot.convToBytes3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset) ] ).strip() )
                for oNode in self.lSortedNodes:
                    hDst.write(oNode.getTxtRepr3(self.nBytesArc, self.nBytesOffset, self.lArcVal)+"\n")



class DawgNode:
    """Node of the word graph"""

    NextId = 0
    NextPos = 1 # (version 2)

    def __init__ (self):
        self.i = DawgNode.NextId
        DawgNode.NextId += 1
        self.final = False
        self.arcs = {}          # key: arc value; value: a node
        self.addr = 0           # address in the binary dictionary
        self.pos = 0            # position in the binary dictionary (version 2)
        self.size = 0           # size of node in bytes (version 3)

    @classmethod
    def resetNextId (cls):
        "set NextId to 0 "
        cls.NextId = 0

    def setPos (self): # version 2
        "define a position for node (version 2)"
        self.pos = DawgNode.NextPos
        DawgNode.NextPos += 1

    def __str__ (self):
        # Caution! this function is used for hashing and comparison!
        sFinalChar = "1"  if self.final  else "0"
        l = [sFinalChar]
        for (key, node) in self.arcs.items():
            l.append(str(key))
            l.append(str(node.i))
        return "_".join(l)

    def __hash__ (self):
        # Used as a key in a python dictionary.
        return self.__str__().__hash__()

    def __eq__ (self, other):
        # Used as a key in a python dictionary.
        # Nodes are equivalent if they have identical arcs, and each identical arc leads to identical states.
        return self.__str__() == other.__str__()

    def sortArcs (self, dValOccur):
        "sort arcs of node according to <dValOccur>"
        self.arcs = collections.OrderedDict(sorted(self.arcs.items(), key=lambda t: dValOccur.get(t[0], 0), reverse=True))

    def sortArcs2 (self, dValOccur, lArcVal):
        "sort arcs of each node depending on the previous char"
        self.arcs = collections.OrderedDict(sorted(self.arcs.items(), key=lambda t: dValOccur.get(lArcVal[t[0]], 0), reverse=True))

    # VERSION 1 =====================================================================================================
    def convToBytes1 (self, nBytesArc, nBytesNodeAddress):
        """
        Convert to bytes (method 1).

        Node scheme:
        - Arc length is defined by nBytesArc
        - Address length is defined by nBytesNodeAddress

        |                Arc                |                         Address of next node                          |
        |                                   |                                                                       |
         ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓
               ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃
         ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛
         [...]
         ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓
               ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃
         ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛
          ^ ^
          ┃ ┃
          ┃ ┃
           ┗━━━ if 1, last arc of this node
          ┗━━━━━ if 1, this node is final (only on the first arc)
        """
        nArc = len(self.arcs)
        nFinalNodeMask = 1 << ((nBytesArc*8)-1)
        nFinalArcMask = 1 << ((nBytesArc*8)-2)
        if len(self.arcs) == 0:
            val = nFinalNodeMask | nFinalArcMask
            by = val.to_bytes(nBytesArc, byteorder='big')
            by += (0).to_bytes(nBytesNodeAddress, byteorder='big')
            return by
        by = b""
        for i, arc in enumerate(self.arcs, 1):
            val = arc
            if i == 1 and self.final:
                val = val | nFinalNodeMask
            if i == nArc:
                val = val | nFinalArcMask
            by += val.to_bytes(nBytesArc, byteorder='big')
            by += self.arcs[arc].addr.to_bytes(nBytesNodeAddress, byteorder='big')
        return by

    def getTxtRepr1 (self, nBytesArc, lVal):
        "return representation as string of node (method 1)"
        nArc = len(self.arcs)
        nFinalNodeMask = 1 << ((nBytesArc*8)-1)
        nFinalArcMask = 1 << ((nBytesArc*8)-2)
        s = "i{:_>10} -- #{:_>10}\n".format(self.i, self.addr)
        if len(self.arcs) == 0:
            s += "  {:<20}  {:0>16}  i{:_>10}   #{:_>10}\n".format("", bin(nFinalNodeMask | nFinalArcMask)[2:], "0", "0")
            return s
        for i, arc in enumerate(self.arcs, 1):
            val = arc
            if i == 1 and self.final:
                val = val | nFinalNodeMask
            if i == nArc:
                val = val | nFinalArcMask
            s += "  {:<20}  {:0>16}  i{:_>10}   #{:_>10}\n".format(lVal[arc], bin(val)[2:], self.arcs[arc].i, self.arcs[arc].addr)
        return s

    # VERSION 2 =====================================================================================================
    def convToBytes2 (self, nBytesArc, nBytesNodeAddress):
        """
        Convert to bytes (method 2).

        Node scheme:
        - Arc length is defined by nBytesArc
        - Address length is defined by nBytesNodeAddress

        |                Arc                |                         Address of next node                          |
        |                                   |                                                                       |
         ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓
               ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃
         ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛
         [...]
         ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓
               ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃
         ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛
          ^ ^ ^
          ┃ ┃ ┃
           ┃ ┗━━ if 1, caution, no address: next node is the following node
           ┗━━━━ if 1, last arc of this node
          ┗━━━━━━ if 1, this node is final (only on the first arc)
        """
        nArc = len(self.arcs)
        nFinalNodeMask = 1 << ((nBytesArc*8)-1)
        nFinalArcMask = 1 << ((nBytesArc*8)-2)
        nNextNodeMask = 1 << ((nBytesArc*8)-3)
        if len(self.arcs) == 0:
            val = nFinalNodeMask | nFinalArcMask
682
683
684
685
686
687
688
689
690

691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714


715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
            if (self.pos + 1) == self.arcs[arc].pos and self.i != 0:
                val = val | nNextNodeMask
                by += val.to_bytes(nBytesArc, byteorder='big')
            else:
                by += val.to_bytes(nBytesArc, byteorder='big')
                by += self.arcs[arc].addr.to_bytes(nBytesNodeAddress, byteorder='big')
        return by
        
    def getTxtRepr2 (self, nBytesArc, nBytesNodeAddress, lVal):

        nArc = len(self.arcs)
        nFinalNodeMask = 1 << ((nBytesArc*8)-1)
        nFinalArcMask = 1 << ((nBytesArc*8)-2)
        nNextNodeMask = 1 << ((nBytesArc*8)-3)
        s = "i{:_>10} -- #{:_>10}\n".format(self.i, self.addr)
        if nArc == 0:
            s += "  {:<20}  {:0>16}  i{:_>10}   #{:_>10}\n".format("", bin(nFinalNodeMask | nFinalArcMask)[2:], "0", "0")
            return s
        for i, arc in enumerate(self.arcs, 1):
            val = arc
            if i == 1 and self.final:
                val = val | nFinalNodeMask
            if i == nArc:
                val = val | nFinalArcMask
            if (self.pos + 1) == self.arcs[arc].pos  and self.i != 0:
                val = val | nNextNodeMask
                s += "  {:<20}  {:0>16}\n".format(lVal[arc], bin(val)[2:], "")
            else:
                s += "  {:<20}  {:0>16}  i{:_>10}   #{:_>10}\n".format(lVal[arc], bin(val)[2:], self.arcs[arc].i, self.arcs[arc].addr)
        return s

    # VERSION 3 =====================================================================================================
    def convToBytes3 (self, nBytesArc, nBytesNodeAddress, nBytesOffset):
        """


        Node scheme:
        - Arc length is defined by nBytesArc
        - Address length is defined by nBytesNodeAddress
        - Offset length is defined by nBytesOffset
                                       
        |                Arc                |            Address of next node  or  offset to next node              |
        |                                   |                                                                       |
         /---------------\ /---------------\ /---------------\ /---------------\ /---------------\ /---------------\
         |1|0|0| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
         \---------------/ \---------------/ \---------------/ \---------------/ \---------------/ \---------------/
         [...]
         /---------------\ /---------------\ /---------------\
         |0|0|1| | | | | | | | | | | | | | | | | | | | | | | |     Offsets are shorter than addresses
         \---------------/ \---------------/ \---------------/ 
         /---------------\ /---------------\ /---------------\ /---------------\ /---------------\ /---------------\
         |0|1|0| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
         \---------------/ \---------------/ \---------------/ \---------------/ \---------------/ \---------------/

          ^ ^ ^
          | | |
          | |  \_ if 1, offset instead of address of next node
          |  \___ if 1, last arc of this node
           \_____ if 1, this node is final (only on the first arc)
        """
        nArc = len(self.arcs)
        nFinalNodeMask = 1 << ((nBytesArc*8)-1)
        nFinalArcMask = 1 << ((nBytesArc*8)-2)
        nNextNodeMask = 1 << ((nBytesArc*8)-3)
        nMaxOffset = (2 ** (nBytesOffset * 8)) - 1
        if nArc == 0:







|
|
>
















|







>
>




|


|
|
|

|
|
|
|
|
|


|
|
|
|







710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
            if (self.pos + 1) == self.arcs[arc].pos and self.i != 0:
                val = val | nNextNodeMask
                by += val.to_bytes(nBytesArc, byteorder='big')
            else:
                by += val.to_bytes(nBytesArc, byteorder='big')
                by += self.arcs[arc].addr.to_bytes(nBytesNodeAddress, byteorder='big')
        return by

    def getTxtRepr2 (self, nBytesArc, lVal):
        "return representation as string of node (method 2)"
        nArc = len(self.arcs)
        nFinalNodeMask = 1 << ((nBytesArc*8)-1)
        nFinalArcMask = 1 << ((nBytesArc*8)-2)
        nNextNodeMask = 1 << ((nBytesArc*8)-3)
        s = "i{:_>10} -- #{:_>10}\n".format(self.i, self.addr)
        if nArc == 0:
            s += "  {:<20}  {:0>16}  i{:_>10}   #{:_>10}\n".format("", bin(nFinalNodeMask | nFinalArcMask)[2:], "0", "0")
            return s
        for i, arc in enumerate(self.arcs, 1):
            val = arc
            if i == 1 and self.final:
                val = val | nFinalNodeMask
            if i == nArc:
                val = val | nFinalArcMask
            if (self.pos + 1) == self.arcs[arc].pos  and self.i != 0:
                val = val | nNextNodeMask
                s += "  {:<20}  {:0>16}\n".format(lVal[arc], bin(val)[2:])
            else:
                s += "  {:<20}  {:0>16}  i{:_>10}   #{:_>10}\n".format(lVal[arc], bin(val)[2:], self.arcs[arc].i, self.arcs[arc].addr)
        return s

    # VERSION 3 =====================================================================================================
    def convToBytes3 (self, nBytesArc, nBytesNodeAddress, nBytesOffset):
        """
        Convert to bytes (method 3).

        Node scheme:
        - Arc length is defined by nBytesArc
        - Address length is defined by nBytesNodeAddress
        - Offset length is defined by nBytesOffset

        |                Arc                |            Address of next node  or  offset to next node              |
        |                                   |                                                                       |
         ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓
         100   ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃
         ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛
         [...]
         ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓
         001   ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃     Offsets are shorter than addresses
         ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛
         ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓
         010   ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃
         ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛

          ^ ^ ^
          ┃ ┃ ┃
           ┃ ┗━━ if 1, offset instead of address of next node
           ┗━━━━ if 1, last arc of this node
          ┗━━━━━━ if 1, this node is final (only on the first arc)
        """
        nArc = len(self.arcs)
        nFinalNodeMask = 1 << ((nBytesArc*8)-1)
        nFinalArcMask = 1 << ((nBytesArc*8)-2)
        nNextNodeMask = 1 << ((nBytesArc*8)-3)
        nMaxOffset = (2 ** (nBytesOffset * 8)) - 1
        if nArc == 0:
757
758
759
760
761
762
763
764
765

766
767
768
769
770
771
772
                val = val | nNextNodeMask
                by += val.to_bytes(nBytesArc, byteorder='big')
                by += (self.arcs[arc].addr-self.addr).to_bytes(nBytesOffset, byteorder='big')
            else:
                by += val.to_bytes(nBytesArc, byteorder='big')
                by += self.arcs[arc].addr.to_bytes(nBytesNodeAddress, byteorder='big')
        return by
        
    def getTxtRepr3 (self, nBytesArc, nBytesNodeAddress, nBytesOffset, lVal):

        nArc = len(self.arcs)
        nFinalNodeMask = 1 << ((nBytesArc*8)-1)
        nFinalArcMask = 1 << ((nBytesArc*8)-2)
        nNextNodeMask = 1 << ((nBytesArc*8)-3)
        nMaxOffset = (2 ** (nBytesOffset * 8)) - 1
        s = "i{:_>10} -- #{:_>10}  ({})\n".format(self.i, self.addr, self.size)
        if nArc == 0:







|
|
>







788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
                val = val | nNextNodeMask
                by += val.to_bytes(nBytesArc, byteorder='big')
                by += (self.arcs[arc].addr-self.addr).to_bytes(nBytesOffset, byteorder='big')
            else:
                by += val.to_bytes(nBytesArc, byteorder='big')
                by += self.arcs[arc].addr.to_bytes(nBytesNodeAddress, byteorder='big')
        return by

    def getTxtRepr3 (self, nBytesArc, nBytesOffset, lVal):
        "return representation as string of node (method 3)"
        nArc = len(self.arcs)
        nFinalNodeMask = 1 << ((nBytesArc*8)-1)
        nFinalArcMask = 1 << ((nBytesArc*8)-2)
        nNextNodeMask = 1 << ((nBytesArc*8)-3)
        nMaxOffset = (2 ** (nBytesOffset * 8)) - 1
        s = "i{:_>10} -- #{:_>10}  ({})\n".format(self.i, self.addr, self.size)
        if nArc == 0:
792
793
794
795
796
797
798

799
800
801
802
803
804
805
806
807

808
809
810
811

812
813
_dCharOrder = {
    # key: previous char, value: dictionary of chars {c: nValue}
    "": {}
}


def addWordToCharDict (sWord):

    cPrevious = ""
    for cChar in sWord:
        if cPrevious not in _dCharOrder:
            _dCharOrder[cPrevious] = {}
        _dCharOrder[cPrevious][cChar] = _dCharOrder[cPrevious].get(cChar, 0) + 1
        cPrevious = cChar


def getCharOrderAfterChar (cChar):

    return _dCharOrder.get(cChar, None)


def displayCharOrder ():

    for key, value in _dCharOrder.items():
        print("[" + key + "]: ", ", ".join([ c+":"+str(n)  for c, n  in  sorted(value.items(), key=lambda t: t[1], reverse=True) ]))







>









>




>


824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
_dCharOrder = {
    # key: previous char, value: dictionary of chars {c: nValue}
    "": {}
}


def addWordToCharDict (sWord):
    "for each character of <sWord>, count how many times it appears after the previous character, and store result in a <_dCharOrder>"
    cPrevious = ""
    for cChar in sWord:
        if cPrevious not in _dCharOrder:
            _dCharOrder[cPrevious] = {}
        _dCharOrder[cPrevious][cChar] = _dCharOrder[cPrevious].get(cChar, 0) + 1
        cPrevious = cChar


def getCharOrderAfterChar (cChar):
    "return a dictionary of chars with number of times it appears after character <cChar>"
    return _dCharOrder.get(cChar, None)


def displayCharOrder ():
    "display how many times each character appear after another one"
    for key, value in _dCharOrder.items():
        print("[" + key + "]: ", ", ".join([ c+":"+str(n)  for c, n  in  sorted(value.items(), key=lambda t: t[1], reverse=True) ]))

Modified graphspell/echo.py from [6d11a5dda8] to [440b1511e9].

1
2

3

4


5
6
7
8
9
10
11
#!python3


# The most boring yet indispensable function: print!





import sys


_CHARMAP = str.maketrans({  'œ': 'ö',  'Œ': 'Ö',  'ʳ': "r",  'ᵉ': "e",  '…': "_",  \
                            '“': '"',  '”': '"',  '„': '"',  '‘': "'",  '’': "'",  \
                            'ā': 'â',  'Ā': 'Â',  'ē': 'ê',  'Ē': 'Ê',  'ī': 'î',  'Ī': 'Î',  \


>
|
>

>
>







1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
#!python3

"""
The most boring yet indispensable function: print!
Because you can print on Windows console without being sure the script won’t crash…

Windows console don’t accept many characters.
"""

import sys


_CHARMAP = str.maketrans({  'œ': 'ö',  'Œ': 'Ö',  'ʳ': "r",  'ᵉ': "e",  '…': "_",  \
                            '“': '"',  '”': '"',  '„': '"',  '‘': "'",  '’': "'",  \
                            'ā': 'â',  'Ā': 'Â',  'ē': 'ê',  'Ē': 'Ê',  'ī': 'î',  'Ī': 'Î',  \
20
21
22
23
24
25
26
27
28
29
        Encoding depends on Windows locale. No useful standard.
        Always returns True (useful for debugging)."""
    if sys.platform != "win32":
        print(obj, sep=sep, end=end, file=file, flush=flush)
        return True
    try:
        print(str(obj).translate(_CHARMAP), sep=sep, end=end, file=file, flush=flush)
    except:
        print(str(obj).encode('ascii', 'replace').decode('ascii', 'replace'), sep=sep, end=end, file=file, flush=flush)
    return True







|


24
25
26
27
28
29
30
31
32
33
        Encoding depends on Windows locale. No useful standard.
        Always returns True (useful for debugging)."""
    if sys.platform != "win32":
        print(obj, sep=sep, end=end, file=file, flush=flush)
        return True
    try:
        print(str(obj).translate(_CHARMAP), sep=sep, end=end, file=file, flush=flush)
    except Exception:
        print(str(obj).encode('ascii', 'replace').decode('ascii', 'replace'), sep=sep, end=end, file=file, flush=flush)
    return True

Added graphspell/fr.py version [963bf7ea5b].





























































































>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
"""
Default suggestion for French language
"""

dSugg = {
    "bcp": "beaucoup",
    "ca": "ça",
    "cad": "c’est-à-dire",
    "cb": "combien|CB",
    "cdlt": "cordialement",
    "construirent": "construire|construisirent|construisent|construiront",
    "càd": "c’est-à-dire",
    "dc": "de|donc",
    "email": "courriel|e-mail|émail",
    "emails": "courriels|e-mails",
    "Etes-vous": "Êtes-vous",
    "Etiez-vous": "Étiez-vous",
    "Etions-nous": "Étions-nous",
    "parce-que": "parce que",
    "pcq": "parce que",
    "pd": "pendant",
    "pdq": "pendant que",
    "pdt": "pendant",
    "pdtq": "pendant que",
    "pk": "pourquoi",
    "pq": "pourquoi|PQ",
    "prq": "presque",
    "prsq": "presque",
    "qcq": "quiconque",
    "qq": "quelque",
    "qqch": "quelque chose",
    "qqn": "quelqu’un",
    "qqne": "quelqu’une",
    "qqs": "quelques",
    "qqunes": "quelques-unes",
    "qquns": "quelques-uns",
    "tdq": "tandis que",
    "tj": "toujours",
    "tjs": "toujours",
    "tq": "tant que|tandis que",
    "ts": "tous",
    "tt": "tant|tout",
    "tte": "toute",
    "ttes": "toutes",
    "y’a": "y a"
}

Modified graphspell/ibdawg.py from [a255097656] to [0f1b5456be].

1
2





3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23

24
25
26
27
28
29
30
#!python3






import os
import traceback
import pkgutil
import re
from functools import wraps
import time
import json
import binascii

#import logging
#logging.basicConfig(filename="suggestions.log", level=logging.DEBUG)

from . import str_transform as st
from . import char_player as cp
from .echo import echo


def timethis (func):
    "decorator for the execution time"
    @wraps(func)
    def wrapper (*args, **kwargs):

        fStart = time.time()
        result = func(*args, **kwargs)
        fEnd = time.time()
        print(func.__name__, fEnd - fStart)
        return result
    return wrapper



>
>
>
>
>
|




















>







1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
#!python3

"""
INDEXABLE BINARY DIRECT ACYCLIC WORD GRAPH
Implementation of a spellchecker as a transducer (storing transformation code to get lemma and morphologies)
and a spell suggestion mechanim
"""

import traceback
import pkgutil
import re
from functools import wraps
import time
import json
import binascii

#import logging
#logging.basicConfig(filename="suggestions.log", level=logging.DEBUG)

from . import str_transform as st
from . import char_player as cp
from .echo import echo


def timethis (func):
    "decorator for the execution time"
    @wraps(func)
    def wrapper (*args, **kwargs):
        "something to prevent pylint whining"
        fStart = time.time()
        result = func(*args, **kwargs)
        fEnd = time.time()
        print(func.__name__, fEnd - fStart)
        return result
    return wrapper

54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79

80
81
82
83
84
85
86
                    self.dSugg[nDist] = []
                self.dSugg[nDist].append(sSugg)
                self.aSugg.add(sSugg)
                if nDist < self.nMinDist:
                    self.nMinDist = nDist
                self.nDistLimit = min(self.nDistLimit, self.nMinDist+2)

    def getSuggestions (self, nSuggLimit=10, nDistLimit=-1):
        "return a list of suggestions"
        if self.dSugg[0]:
            # we sort the better results with the original word
            self.dSugg[0].sort(key=lambda sSugg: st.distanceDamerauLevenshtein(self.sWord, sSugg))
        lRes = self.dSugg.pop(0)
        for nDist, lSugg in self.dSugg.items():
            if nDist <= self.nDistLimit:
                lRes.extend(lSugg)
                if len(lRes) > nSuggLimit:
                    break
        lRes = list(cp.filterSugg(lRes))
        if self.sWord.isupper():
            lRes = list(map(lambda sSugg: sSugg.upper(), lRes))
        elif self.sWord[0:1].isupper():
            lRes = list(map(lambda sSugg: sSugg[0:1].upper()+sSugg[1:], lRes))  # dont’ use <.istitle>
        return lRes[:nSuggLimit]

    def reset (self):

        self.aSugg.clear()
        self.dSugg.clear()


class IBDAWG:
    """INDEXABLE BINARY DIRECT ACYCLIC WORD GRAPH"""








|


















>







60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
                    self.dSugg[nDist] = []
                self.dSugg[nDist].append(sSugg)
                self.aSugg.add(sSugg)
                if nDist < self.nMinDist:
                    self.nMinDist = nDist
                self.nDistLimit = min(self.nDistLimit, self.nMinDist+2)

    def getSuggestions (self, nSuggLimit=10):
        "return a list of suggestions"
        if self.dSugg[0]:
            # we sort the better results with the original word
            self.dSugg[0].sort(key=lambda sSugg: st.distanceDamerauLevenshtein(self.sWord, sSugg))
        lRes = self.dSugg.pop(0)
        for nDist, lSugg in self.dSugg.items():
            if nDist <= self.nDistLimit:
                lRes.extend(lSugg)
                if len(lRes) > nSuggLimit:
                    break
        lRes = list(cp.filterSugg(lRes))
        if self.sWord.isupper():
            lRes = list(map(lambda sSugg: sSugg.upper(), lRes))
        elif self.sWord[0:1].isupper():
            lRes = list(map(lambda sSugg: sSugg[0:1].upper()+sSugg[1:], lRes))  # dont’ use <.istitle>
        return lRes[:nSuggLimit]

    def reset (self):
        "clear data"
        self.aSugg.clear()
        self.dSugg.clear()


class IBDAWG:
    """INDEXABLE BINARY DIRECT ACYCLIC WORD GRAPH"""

145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
            raise TypeError("# Error. Not a grammalecte-fsa binary dictionary. Header: {}".format(self.by[0:9]))
        if not(self.by[17:18] == b"1" or self.by[17:18] == b"2" or self.by[17:18] == b"3"):
            raise ValueError("# Error. Unknown dictionary version: {}".format(self.by[17:18]))
        try:
            header, info, values, bdic = self.by.split(b"\0\0\0\0", 3)
        except Exception:
            raise Exception
        
        self.nCompressionMethod = int(self.by[17:18].decode("utf-8"))
        self.sHeader = header.decode("utf-8")
        self.lArcVal = values.decode("utf-8").split("\t")
        self.nArcVal = len(self.lArcVal)
        self.byDic = bdic

        l = info.decode("utf-8").split("//")







|







152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
            raise TypeError("# Error. Not a grammalecte-fsa binary dictionary. Header: {}".format(self.by[0:9]))
        if not(self.by[17:18] == b"1" or self.by[17:18] == b"2" or self.by[17:18] == b"3"):
            raise ValueError("# Error. Unknown dictionary version: {}".format(self.by[17:18]))
        try:
            header, info, values, bdic = self.by.split(b"\0\0\0\0", 3)
        except Exception:
            raise Exception

        self.nCompressionMethod = int(self.by[17:18].decode("utf-8"))
        self.sHeader = header.decode("utf-8")
        self.lArcVal = values.decode("utf-8").split("\t")
        self.nArcVal = len(self.lArcVal)
        self.byDic = bdic

        l = info.decode("utf-8").split("//")
180
181
182
183
184
185
186

187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
    def _initJSON (self, oJSON):
        "initialize with a JSON text file"
        self.__dict__.update(oJSON)
        self.byDic = binascii.unhexlify(self.sByDic)
        self.dCharVal = { v: k  for k, v in self.dChar.items() }

    def getInfo (self):

        return  "  Language: {0.sLangName}   Lang code: {0.sLangCode}   Dictionary name: {0.sDicName}" \
                "  Compression method: {0.nCompressionMethod:>2}   Date: {0.sDate}   Stemming: {0.cStemming}FX\n" \
                "  Arcs values:  {0.nArcVal:>10,} = {0.nChar:>5,} characters,  {0.nAff:>6,} affixes,  {0.nTag:>6,} tags\n" \
                "  Dictionary: {0.nEntry:>12,} entries,    {0.nNode:>11,} nodes,   {0.nArc:>11,} arcs\n" \
                "  Address size: {0.nBytesNodeAddress:>1} bytes,  Arc size: {0.nBytesArc:>1} bytes\n".format(self)

    def writeAsJSObject (self, spfDest, bInJSModule=False, bBinaryDictAsHexString=False):
        "write IBDAWG as a JavaScript object in a JavaScript module"
        with open(spfDest, "w", encoding="utf-8", newline="\n") as hDst:
            if bInJSModule:
                hDst.write('// JavaScript\n// Generated data (do not edit)\n\n"use strict";\n\nconst dictionary = ')
            hDst.write(json.dumps({
                            "sHeader": "/grammalecte-fsa/",
                            "sLangCode": self.sLangCode,
                            "sLangName": self.sLangName,
                            "sDicName": self.sDicName,
                            "sFileName": self.sFileName,
                            "sDate": self.sDate,
                            "nEntry": self.nEntry,
                            "nChar": self.nChar,
                            "nAff": self.nAff,
                            "nTag": self.nTag,
                            "cStemming": self.cStemming,
                            "dChar": self.dChar,
                            "nNode": self.nNode,
                            "nArc": self.nArc,
                            "nArcVal": self.nArcVal,
                            "lArcVal": self.lArcVal,
                            "nCompressionMethod": self.nCompressionMethod,
                            "nBytesArc": self.nBytesArc,
                            "nBytesNodeAddress": self.nBytesNodeAddress,
                            "nBytesOffset": self.nBytesOffset,
                            # JavaScript is a pile of shit, so Mozilla’s JS parser don’t like file bigger than 4 Mb!
                            # So, if necessary, we use an hexadecimal string, that we will convert later in Firefox’s extension.
                            # https://github.com/mozilla/addons-linter/issues/1361
                            "sByDic": self.byDic.hex()  if bBinaryDictAsHexString  else [ e  for e in self.byDic ]
                        }, ensure_ascii=False))
            if bInJSModule:
                hDst.write(";\n\nexports.dictionary = dictionary;\n")

    def isValidToken (self, sToken):
        "checks if <sToken> is valid (if there is hyphens in <sToken>, <sToken> is split, each part is checked)"
        sToken = cp.spellingNormalization(sToken)
        if self.isValid(sToken):







>












|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|







187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
    def _initJSON (self, oJSON):
        "initialize with a JSON text file"
        self.__dict__.update(oJSON)
        self.byDic = binascii.unhexlify(self.sByDic)
        self.dCharVal = { v: k  for k, v in self.dChar.items() }

    def getInfo (self):
        "return string about the IBDAWG"
        return  "  Language: {0.sLangName}   Lang code: {0.sLangCode}   Dictionary name: {0.sDicName}" \
                "  Compression method: {0.nCompressionMethod:>2}   Date: {0.sDate}   Stemming: {0.cStemming}FX\n" \
                "  Arcs values:  {0.nArcVal:>10,} = {0.nChar:>5,} characters,  {0.nAff:>6,} affixes,  {0.nTag:>6,} tags\n" \
                "  Dictionary: {0.nEntry:>12,} entries,    {0.nNode:>11,} nodes,   {0.nArc:>11,} arcs\n" \
                "  Address size: {0.nBytesNodeAddress:>1} bytes,  Arc size: {0.nBytesArc:>1} bytes\n".format(self)

    def writeAsJSObject (self, spfDest, bInJSModule=False, bBinaryDictAsHexString=False):
        "write IBDAWG as a JavaScript object in a JavaScript module"
        with open(spfDest, "w", encoding="utf-8", newline="\n") as hDst:
            if bInJSModule:
                hDst.write('// JavaScript\n// Generated data (do not edit)\n\n"use strict";\n\nconst dictionary = ')
            hDst.write(json.dumps({
                "sHeader": "/grammalecte-fsa/",
                "sLangCode": self.sLangCode,
                "sLangName": self.sLangName,
                "sDicName": self.sDicName,
                "sFileName": self.sFileName,
                "sDate": self.sDate,
                "nEntry": self.nEntry,
                "nChar": self.nChar,
                "nAff": self.nAff,
                "nTag": self.nTag,
                "cStemming": self.cStemming,
                "dChar": self.dChar,
                "nNode": self.nNode,
                "nArc": self.nArc,
                "nArcVal": self.nArcVal,
                "lArcVal": self.lArcVal,
                "nCompressionMethod": self.nCompressionMethod,
                "nBytesArc": self.nBytesArc,
                "nBytesNodeAddress": self.nBytesNodeAddress,
                "nBytesOffset": self.nBytesOffset,
                # JavaScript is a pile of shit, so Mozilla’s JS parser don’t like file bigger than 4 Mb!
                # So, if necessary, we use an hexadecimal string, that we will convert later in Firefox’s extension.
                # https://github.com/mozilla/addons-linter/issues/1361
                "sByDic": self.byDic.hex()  if bBinaryDictAsHexString  else [ e  for e in self.byDic ]
            }, ensure_ascii=False))
            if bInJSModule:
                hDst.write(";\n\nexports.dictionary = dictionary;\n")

    def isValidToken (self, sToken):
        "checks if <sToken> is valid (if there is hyphens in <sToken>, <sToken> is split, each part is checked)"
        sToken = cp.spellingNormalization(sToken)
        if self.isValid(sToken):
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
    def lookup (self, sWord):
        "returns True if <sWord> in dictionary (strict verification)"
        iAddr = 0
        for c in sWord:
            if c not in self.dChar:
                return False
            iAddr = self._lookupArcNode(self.dChar[c], iAddr)
            if iAddr == None:
                return False
        return bool(int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask)

    def getMorph (self, sWord):
        "retrieves morphologies list, different casing allowed"
        sWord = cp.spellingNormalization(sWord)
        l = self.morph(sWord)







|







271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
    def lookup (self, sWord):
        "returns True if <sWord> in dictionary (strict verification)"
        iAddr = 0
        for c in sWord:
            if c not in self.dChar:
                return False
            iAddr = self._lookupArcNode(self.dChar[c], iAddr)
            if iAddr is None:
                return False
        return bool(int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask)

    def getMorph (self, sWord):
        "retrieves morphologies list, different casing allowed"
        sWord = cp.spellingNormalization(sWord)
        l = self.morph(sWord)
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
                    self._suggest(oSuggResult, sRepl, nMaxSwitch, nMaxDel, nMaxHardRepl, nMaxJump, nDist, nDeep+1, iAddr, sNewWord, True)
            elif len(sRemain) == 1:
                self._suggest(oSuggResult, "", nMaxSwitch, nMaxDel, nMaxHardRepl, nMaxJump, nDist, nDeep+1, iAddr, sNewWord, True) # remove last char and go on
                for sRepl in cp.dFinal1.get(sRemain, ()):
                    self._suggest(oSuggResult, sRepl, nMaxSwitch, nMaxDel, nMaxHardRepl, nMaxJump, nDist, nDeep+1, iAddr, sNewWord, True)

    #@timethis
    def suggest2 (self, sWord, nMaxSugg=10):
        "returns a set of suggestions for <sWord>"
        sWord = cp.spellingNormalization(sWord)
        sPfx, sWord, sSfx = cp.cut(sWord)
        oSuggResult = SuggResult(sWord)
        self._suggest2(oSuggResult)
        aSugg = oSuggResult.getSuggestions()
        if sSfx or sPfx:
            # we add what we removed
            return list(map(lambda sSug: sPfx + sSug + sSfx, aSugg))
        return aSugg

    def _suggest2 (self, oSuggResult, nDeep=0, iAddr=0, sNewWord=""):
        # recursive function







|





|







350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
                    self._suggest(oSuggResult, sRepl, nMaxSwitch, nMaxDel, nMaxHardRepl, nMaxJump, nDist, nDeep+1, iAddr, sNewWord, True)
            elif len(sRemain) == 1:
                self._suggest(oSuggResult, "", nMaxSwitch, nMaxDel, nMaxHardRepl, nMaxJump, nDist, nDeep+1, iAddr, sNewWord, True) # remove last char and go on
                for sRepl in cp.dFinal1.get(sRemain, ()):
                    self._suggest(oSuggResult, sRepl, nMaxSwitch, nMaxDel, nMaxHardRepl, nMaxJump, nDist, nDeep+1, iAddr, sNewWord, True)

    #@timethis
    def suggest2 (self, sWord, nSuggLimit=10):
        "returns a set of suggestions for <sWord>"
        sWord = cp.spellingNormalization(sWord)
        sPfx, sWord, sSfx = cp.cut(sWord)
        oSuggResult = SuggResult(sWord)
        self._suggest2(oSuggResult)
        aSugg = oSuggResult.getSuggestions(nSuggLimit)
        if sSfx or sPfx:
            # we add what we removed
            return list(map(lambda sSug: sPfx + sSug + sSfx, aSugg))
        return aSugg

    def _suggest2 (self, oSuggResult, nDeep=0, iAddr=0, sNewWord=""):
        # recursive function
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429

    def drawPath (self, sWord, iAddr=0):
        "show the path taken by <sWord> in the graph"
        sWord = cp.spellingNormalization(sWord)
        c1 = sWord[0:1]  if sWord  else " "
        iPos = -1
        n = 0
        print(c1 + ": ", end="")
        for c2, jAddr in self._getCharArcs(iAddr):
            print(c2, end="")
            if c2 == sWord[0:1]:
                iNextNodeAddr = jAddr
                iPos = n
            n += 1
        if not sWord:
            return
        if iPos >= 0:
            print("\n   "+ " " * iPos + "|")
            self.drawPath(sWord[1:], iNextNodeAddr)

    def getSimilarEntries (self, sWord, nSuggLimit=10):
        "return a list of tuples (similar word, stem, morphology)"
        if not sWord:
            return []
        lResult = []







|

|







|







413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437

    def drawPath (self, sWord, iAddr=0):
        "show the path taken by <sWord> in the graph"
        sWord = cp.spellingNormalization(sWord)
        c1 = sWord[0:1]  if sWord  else " "
        iPos = -1
        n = 0
        echo(c1 + ": ", end="")
        for c2, jAddr in self._getCharArcs(iAddr):
            echo(c2, end="")
            if c2 == sWord[0:1]:
                iNextNodeAddr = jAddr
                iPos = n
            n += 1
        if not sWord:
            return
        if iPos >= 0:
            echo("\n   " + " " * iPos + "|")
            self.drawPath(sWord[1:], iNextNodeAddr)

    def getSimilarEntries (self, sWord, nSuggLimit=10):
        "return a list of tuples (similar word, stem, morphology)"
        if not sWord:
            return []
        lResult = []
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
    def _morph1 (self, sWord):
        "returns morphologies of <sWord>"
        iAddr = 0
        for c in sWord:
            if c not in self.dChar:
                return []
            iAddr = self._lookupArcNode(self.dChar[c], iAddr)
            if iAddr == None:
                return []
        if (int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask):
            l = []
            nRawArc = 0
            while not (nRawArc & self._lastArcMask):
                iEndArcAddr = iAddr + self.nBytesArc
                nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big')
                nArc = nRawArc & self._arcMask
                if nArc > self.nChar:
                    # This value is not a char, this is a stemming code 
                    sStem = ">" + self.funcStemming(sWord, self.lArcVal[nArc])
                    # Now , we go to the next node and retrieve all following arcs values, all of them are tags
                    iAddr2 = int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big')
                    nRawArc2 = 0
                    while not (nRawArc2 & self._lastArcMask):
                        iEndArcAddr2 = iAddr2 + self.nBytesArc
                        nRawArc2 = int.from_bytes(self.byDic[iAddr2:iEndArcAddr2], byteorder='big')
                        l.append(sStem + " " + self.lArcVal[nRawArc2 & self._arcMask])
                        iAddr2 = iEndArcAddr2+self.nBytesNodeAddress
                iAddr = iEndArcAddr+self.nBytesNodeAddress
            return l
        return []

    def _stem1 (self, sWord):
        "returns stems list of <sWord>"
        iAddr = 0
        for c in sWord:
            if c not in self.dChar:
                return []
            iAddr = self._lookupArcNode(self.dChar[c], iAddr)
            if iAddr == None:
                return []
        if (int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask):
            l = []
            nRawArc = 0
            while not (nRawArc & self._lastArcMask):
                iEndArcAddr = iAddr + self.nBytesArc
                nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big')
                nArc = nRawArc & self._arcMask
                if nArc > self.nChar:
                    # This value is not a char, this is a stemming code 
                    l.append(self.funcStemming(sWord, self.lArcVal[nArc]))
                iAddr = iEndArcAddr+self.nBytesNodeAddress
            return l
        return []

    def _lookupArcNode1 (self, nVal, iAddr):
        "looks if <nVal> is an arc at the node at <iAddr>, if yes, returns address of next node else None"
        while True:
            iEndArcAddr = iAddr+self.nBytesArc
            nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big')
            if nVal == (nRawArc & self._arcMask):
                # the value we are looking for 
                # we return the address of the next node
                return int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big')
            else:
                # value not found
                if (nRawArc & self._lastArcMask):
                    return None
                iAddr = iEndArcAddr+self.nBytesNodeAddress

    def _getArcs1 (self, iAddr):
        "generator: return all arcs at <iAddr> as tuples of (nVal, iAddr)"
        while True:
            iEndArcAddr = iAddr+self.nBytesArc
            nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big')
            yield (nRawArc & self._arcMask, int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big'))
            if (nRawArc & self._lastArcMask):
                break
            iAddr = iEndArcAddr+self.nBytesNodeAddress

    def _writeNodes1 (self, spfDest):
        "for debugging only"
        print(" > Write binary nodes")
        with codecs.open(spfDest, 'w', 'utf-8', newline="\n") as hDst:
            iAddr = 0
            hDst.write("i{:_>10} -- #{:_>10}\n".format("0", iAddr))
            while iAddr < len(self.byDic):
                iEndArcAddr = iAddr+self.nBytesArc
                nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big')
                nArc = nRawArc & self._arcMask
                hDst.write("  {:<20}  {:0>16}  i{:>10}   #{:_>10}\n".format(self.lArcVal[nArc], bin(nRawArc)[2:], "?", \







|

|







|







|












|

|







|











|




|








|
|






|







475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
    def _morph1 (self, sWord):
        "returns morphologies of <sWord>"
        iAddr = 0
        for c in sWord:
            if c not in self.dChar:
                return []
            iAddr = self._lookupArcNode(self.dChar[c], iAddr)
            if iAddr is None:
                return []
        if int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask:
            l = []
            nRawArc = 0
            while not (nRawArc & self._lastArcMask):
                iEndArcAddr = iAddr + self.nBytesArc
                nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big')
                nArc = nRawArc & self._arcMask
                if nArc > self.nChar:
                    # This value is not a char, this is a stemming code
                    sStem = ">" + self.funcStemming(sWord, self.lArcVal[nArc])
                    # Now , we go to the next node and retrieve all following arcs values, all of them are tags
                    iAddr2 = int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big')
                    nRawArc2 = 0
                    while not (nRawArc2 & self._lastArcMask):
                        iEndArcAddr2 = iAddr2 + self.nBytesArc
                        nRawArc2 = int.from_bytes(self.byDic[iAddr2:iEndArcAddr2], byteorder='big')
                        l.append(sStem + "/" + self.lArcVal[nRawArc2 & self._arcMask])
                        iAddr2 = iEndArcAddr2+self.nBytesNodeAddress
                iAddr = iEndArcAddr+self.nBytesNodeAddress
            return l
        return []

    def _stem1 (self, sWord):
        "returns stems list of <sWord>"
        iAddr = 0
        for c in sWord:
            if c not in self.dChar:
                return []
            iAddr = self._lookupArcNode(self.dChar[c], iAddr)
            if iAddr is None:
                return []
        if int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask:
            l = []
            nRawArc = 0
            while not (nRawArc & self._lastArcMask):
                iEndArcAddr = iAddr + self.nBytesArc
                nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big')
                nArc = nRawArc & self._arcMask
                if nArc > self.nChar:
                    # This value is not a char, this is a stemming code
                    l.append(self.funcStemming(sWord, self.lArcVal[nArc]))
                iAddr = iEndArcAddr+self.nBytesNodeAddress
            return l
        return []

    def _lookupArcNode1 (self, nVal, iAddr):
        "looks if <nVal> is an arc at the node at <iAddr>, if yes, returns address of next node else None"
        while True:
            iEndArcAddr = iAddr+self.nBytesArc
            nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big')
            if nVal == (nRawArc & self._arcMask):
                # the value we are looking for
                # we return the address of the next node
                return int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big')
            else:
                # value not found
                if nRawArc & self._lastArcMask:
                    return None
                iAddr = iEndArcAddr+self.nBytesNodeAddress

    def _getArcs1 (self, iAddr):
        "generator: return all arcs at <iAddr> as tuples of (nVal, iAddr)"
        while True:
            iEndArcAddr = iAddr+self.nBytesArc
            nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big')
            yield nRawArc & self._arcMask, int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big')
            if nRawArc & self._lastArcMask:
                break
            iAddr = iEndArcAddr+self.nBytesNodeAddress

    def _writeNodes1 (self, spfDest):
        "for debugging only"
        print(" > Write binary nodes")
        with open(spfDest, 'w', 'utf-8', newline="\n") as hDst:
            iAddr = 0
            hDst.write("i{:_>10} -- #{:_>10}\n".format("0", iAddr))
            while iAddr < len(self.byDic):
                iEndArcAddr = iAddr+self.nBytesArc
                nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big')
                nArc = nRawArc & self._arcMask
                hDst.write("  {:<20}  {:0>16}  i{:>10}   #{:_>10}\n".format(self.lArcVal[nArc], bin(nRawArc)[2:], "?", \
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
    def _morph2 (self, sWord):
        "returns morphologies of <sWord>"
        iAddr = 0
        for c in sWord:
            if c not in self.dChar:
                return []
            iAddr = self._lookupArcNode(self.dChar[c], iAddr)
            if iAddr == None:
                return []
        if (int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask):
            l = []
            nRawArc = 0
            while not (nRawArc & self._lastArcMask):
                iEndArcAddr = iAddr + self.nBytesArc
                nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big')
                nArc = nRawArc & self._arcMask
                if nArc > self.nChar:
                    # This value is not a char, this is a stemming code 
                    sStem = ">" + self.funcStemming(sWord, self.lArcVal[nArc])
                    # Now , we go to the next node and retrieve all following arcs values, all of them are tags
                    if not (nRawArc & self._addrBitMask):
                        iAddr2 = int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big')
                    else:
                        # we go to the end of the node
                        iAddr2 = iEndArcAddr
                        while not (nRawArc & self._lastArcMask):
                            nRawArc = int.from_bytes(self.byDic[iAddr2:iAddr2+self.nBytesArc], byteorder='big')
                            iAddr2 += self.nBytesArc + self.nBytesNodeAddress
                    nRawArc2 = 0
                    while not (nRawArc2 & self._lastArcMask):
                        iEndArcAddr2 = iAddr2 + self.nBytesArc
                        nRawArc2 = int.from_bytes(self.byDic[iAddr2:iEndArcAddr2], byteorder='big')
                        l.append(sStem + " " + self.lArcVal[nRawArc2 & self._arcMask])
                        iAddr2 = iEndArcAddr2+self.nBytesNodeAddress  if not (nRawArc2 & self._addrBitMask) else iEndArcAddr2
                iAddr = iEndArcAddr+self.nBytesNodeAddress  if not (nRawArc & self._addrBitMask)  else iEndArcAddr
            return l
        return []

    def _stem2 (self, sWord):
        "returns stems list of <sWord>"
        iAddr = 0
        for c in sWord:
            if c not in self.dChar:
                return []
            iAddr = self._lookupArcNode(self.dChar[c], iAddr)
            if iAddr == None:
                return []
        if (int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask):
            l = []
            nRawArc = 0
            while not (nRawArc & self._lastArcMask):
                iEndArcAddr = iAddr + self.nBytesArc
                nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big')
                nArc = nRawArc & self._arcMask
                if nArc > self.nChar:
                    # This value is not a char, this is a stemming code 
                    l.append(self.funcStemming(sWord, self.lArcVal[nArc]))
                    # Now , we go to the next node
                    if not (nRawArc & self._addrBitMask):
                        iAddr2 = int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big')
                    else:
                        # we go to the end of the node
                        iAddr2 = iEndArcAddr
                        while not (nRawArc & self._lastArcMask):
                            nRawArc = int.from_bytes(self.byDic[iAddr2:iAddr2+self.nBytesArc], byteorder='big')
                            iAddr2 += self.nBytesArc + self.nBytesNodeAddress
                iAddr = iEndArcAddr+self.nBytesNodeAddress  if not (nRawArc & self._addrBitMask)  else iEndArcAddr
            return l
        return []

    def _lookupArcNode2 (self, nVal, iAddr):
        "looks if <nVal> is an arc at the node at <iAddr>, if yes, returns address of next node else None"
        while True:
            iEndArcAddr = iAddr+self.nBytesArc
            nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big')
            if nVal == (nRawArc & self._arcMask):
                # the value we are looking for 
                if not (nRawArc & self._addrBitMask):
                    # we return the address of the next node
                    return int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big')
                else:
                    # we go to the end of the node
                    iAddr = iEndArcAddr
                    while not (nRawArc & self._lastArcMask):
                        nRawArc = int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big')
                        iAddr += self.nBytesArc + self.nBytesNodeAddress  if not (nRawArc & self._addrBitMask)  else self.nBytesArc
                    return iAddr
            else:
                # value not found
                if (nRawArc & self._lastArcMask):
                    return None
                iAddr = iEndArcAddr+self.nBytesNodeAddress  if not (nRawArc & self._addrBitMask)  else iEndArcAddr

    def _writeNodes2 (self, spfDest):
        "for debugging only"
        print(" > Write binary nodes")
        with codecs.open(spfDest, 'w', 'utf-8', newline="\n") as hDst:
            iAddr = 0
            hDst.write("i{:_>10} -- #{:_>10}\n".format("0", iAddr))
            while iAddr < len(self.byDic):
                iEndArcAddr = iAddr+self.nBytesArc
                nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big')
                nArc = nRawArc & self._arcMask
                if not (nRawArc & self._addrBitMask):
                    iNextNodeAddr = int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big')
                    hDst.write("  {:<20}  {:0>16}  i{:>10}   #{:_>10}\n".format(self.lArcVal[nArc], bin(nRawArc)[2:], "?", iNextNodeAddr))
                    iAddr = iEndArcAddr+self.nBytesNodeAddress
                else:
                    hDst.write("  {:<20}  {:0>16}\n".format(self.lArcVal[nArc], bin(nRawArc)[2:]))
                    iAddr = iEndArcAddr
                if (nRawArc & self._lastArcMask):
                    hDst.write("\ni{:_>10} -- #{:_>10}\n".format("?", iAddr))
            hDst.close()

    # VERSION 3
    def _morph3 (self, sWord):
        "returns morphologies of <sWord>"
        iAddr = 0
        for c in sWord:
            if c not in self.dChar:
                return []
            iAddr = self._lookupArcNode(self.dChar[c], iAddr)
            if iAddr == None:
                return []
        if (int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask):
            l = []
            nRawArc = 0
            iAddrNode = iAddr
            while not (nRawArc & self._lastArcMask):
                iEndArcAddr = iAddr + self.nBytesArc
                nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big')
                nArc = nRawArc & self._arcMask
                if nArc > self.nChar:
                    # This value is not a char, this is a stemming code 
                    sStem = ">" + self.funcStemming(sWord, self.lArcVal[nArc])
                    # Now , we go to the next node and retrieve all following arcs values, all of them are tags
                    if not (nRawArc & self._addrBitMask):
                        iAddr2 = int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big')
                    else:
                        iAddr2 = iAddrNode + int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesOffset], byteorder='big')
                    nRawArc2 = 0
                    while not (nRawArc2 & self._lastArcMask):
                        iEndArcAddr2 = iAddr2 + self.nBytesArc
                        nRawArc2 = int.from_bytes(self.byDic[iAddr2:iEndArcAddr2], byteorder='big')
                        l.append(sStem + " " + self.lArcVal[nRawArc2 & self._arcMask])
                        iAddr2 = iEndArcAddr2+self.nBytesNodeAddress  if not (nRawArc2 & self._addrBitMask) else iEndArcAddr2+self.nBytesOffset
                iAddr = iEndArcAddr+self.nBytesNodeAddress  if not (nRawArc & self._addrBitMask)  else iEndArcAddr+self.nBytesOffset
            return l
        return []

    def _stem3 (self, sWord):
        "returns stems list of <sWord>"
        iAddr = 0
        for c in sWord:
            if c not in self.dChar:
                return []
            iAddr = self._lookupArcNode(self.dChar[c], iAddr)
            if iAddr == None:
                return []
        if (int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask):
            l = []
            nRawArc = 0
            iAddrNode = iAddr
            while not (nRawArc & self._lastArcMask):
                iEndArcAddr = iAddr + self.nBytesArc
                nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big')
                nArc = nRawArc & self._arcMask
                if nArc > self.nChar:
                    # This value is not a char, this is a stemming code 
                    l.append(self.funcStemming(sWord, self.lArcVal[nArc]))
                iAddr = iEndArcAddr+self.nBytesNodeAddress  if not (nRawArc & self._addrBitMask)  else iEndArcAddr+self.nBytesOffset
            return l
        return []

    def _lookupArcNode3 (self, nVal, iAddr):
        "looks if <nVal> is an arc at the node at <iAddr>, if yes, returns address of next node else None"
        iAddrNode = iAddr
        while True:
            iEndArcAddr = iAddr+self.nBytesArc
            nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big')
            if nVal == (nRawArc & self._arcMask):
                # the value we are looking for 
                if not (nRawArc & self._addrBitMask):
                    return int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big')
                else:
                    return iAddrNode + int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesOffset], byteorder='big')
            else:
                # value not found
                if (nRawArc & self._lastArcMask):
                    return None
                iAddr = iEndArcAddr+self.nBytesNodeAddress  if not (nRawArc & self._addrBitMask)  else iEndArcAddr+self.nBytesOffset

    def _writeNodes3 (self, spfDest):
        "for debugging only"
        print(" > Write binary nodes")
        with codecs.open(spfDest, 'w', 'utf-8', newline="\n") as hDst:
            iAddr = 0
            hDst.write("i{:_>10} -- #{:_>10}\n".format("0", iAddr))
            while iAddr < len(self.byDic):
                iEndArcAddr = iAddr+self.nBytesArc
                nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big')
                nArc = nRawArc & self._arcMask
                if not (nRawArc & self._addrBitMask):
                    iNextNodeAddr = int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big')
                    hDst.write("  {:<20}  {:0>16}  i{:>10}   #{:_>10}\n".format(self.lArcVal[nArc], bin(nRawArc)[2:], "?", iNextNodeAddr))
                    iAddr = iEndArcAddr+self.nBytesNodeAddress
                else:
                    iNextNodeAddr = int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesOffset], byteorder='big')
                    hDst.write("  {:<20}  {:0>16}  i{:>10}   +{:_>10}\n".format(self.lArcVal[nArc], bin(nRawArc)[2:], "?", iNextNodeAddr))
                    iAddr = iEndArcAddr+self.nBytesOffset
                if (nRawArc & self._lastArcMask):
                    hDst.write("\ni{:_>10} -- #{:_>10}\n".format("?", iAddr))
            hDst.close()







|

|







|














|












|

|







|




















|












|






|













|











|

|








|










|












|

|


|





|












|






|






|














|


573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
    def _morph2 (self, sWord):
        "returns morphologies of <sWord>"
        iAddr = 0
        for c in sWord:
            if c not in self.dChar:
                return []
            iAddr = self._lookupArcNode(self.dChar[c], iAddr)
            if iAddr is None:
                return []
        if int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask:
            l = []
            nRawArc = 0
            while not (nRawArc & self._lastArcMask):
                iEndArcAddr = iAddr + self.nBytesArc
                nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big')
                nArc = nRawArc & self._arcMask
                if nArc > self.nChar:
                    # This value is not a char, this is a stemming code
                    sStem = ">" + self.funcStemming(sWord, self.lArcVal[nArc])
                    # Now , we go to the next node and retrieve all following arcs values, all of them are tags
                    if not (nRawArc & self._addrBitMask):
                        iAddr2 = int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big')
                    else:
                        # we go to the end of the node
                        iAddr2 = iEndArcAddr
                        while not (nRawArc & self._lastArcMask):
                            nRawArc = int.from_bytes(self.byDic[iAddr2:iAddr2+self.nBytesArc], byteorder='big')
                            iAddr2 += self.nBytesArc + self.nBytesNodeAddress
                    nRawArc2 = 0
                    while not (nRawArc2 & self._lastArcMask):
                        iEndArcAddr2 = iAddr2 + self.nBytesArc
                        nRawArc2 = int.from_bytes(self.byDic[iAddr2:iEndArcAddr2], byteorder='big')
                        l.append(sStem + "/" + self.lArcVal[nRawArc2 & self._arcMask])
                        iAddr2 = iEndArcAddr2+self.nBytesNodeAddress  if not (nRawArc2 & self._addrBitMask) else iEndArcAddr2
                iAddr = iEndArcAddr+self.nBytesNodeAddress  if not (nRawArc & self._addrBitMask)  else iEndArcAddr
            return l
        return []

    def _stem2 (self, sWord):
        "returns stems list of <sWord>"
        iAddr = 0
        for c in sWord:
            if c not in self.dChar:
                return []
            iAddr = self._lookupArcNode(self.dChar[c], iAddr)
            if iAddr is None:
                return []
        if int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask:
            l = []
            nRawArc = 0
            while not (nRawArc & self._lastArcMask):
                iEndArcAddr = iAddr + self.nBytesArc
                nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big')
                nArc = nRawArc & self._arcMask
                if nArc > self.nChar:
                    # This value is not a char, this is a stemming code
                    l.append(self.funcStemming(sWord, self.lArcVal[nArc]))
                    # Now , we go to the next node
                    if not (nRawArc & self._addrBitMask):
                        iAddr2 = int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big')
                    else:
                        # we go to the end of the node
                        iAddr2 = iEndArcAddr
                        while not (nRawArc & self._lastArcMask):
                            nRawArc = int.from_bytes(self.byDic[iAddr2:iAddr2+self.nBytesArc], byteorder='big')
                            iAddr2 += self.nBytesArc + self.nBytesNodeAddress
                iAddr = iEndArcAddr+self.nBytesNodeAddress  if not (nRawArc & self._addrBitMask)  else iEndArcAddr
            return l
        return []

    def _lookupArcNode2 (self, nVal, iAddr):
        "looks if <nVal> is an arc at the node at <iAddr>, if yes, returns address of next node else None"
        while True:
            iEndArcAddr = iAddr+self.nBytesArc
            nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big')
            if nVal == (nRawArc & self._arcMask):
                # the value we are looking for
                if not (nRawArc & self._addrBitMask):
                    # we return the address of the next node
                    return int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big')
                else:
                    # we go to the end of the node
                    iAddr = iEndArcAddr
                    while not (nRawArc & self._lastArcMask):
                        nRawArc = int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big')
                        iAddr += self.nBytesArc + self.nBytesNodeAddress  if not (nRawArc & self._addrBitMask)  else self.nBytesArc
                    return iAddr
            else:
                # value not found
                if nRawArc & self._lastArcMask:
                    return None
                iAddr = iEndArcAddr+self.nBytesNodeAddress  if not (nRawArc & self._addrBitMask)  else iEndArcAddr

    def _writeNodes2 (self, spfDest):
        "for debugging only"
        print(" > Write binary nodes")
        with open(spfDest, 'w', 'utf-8', newline="\n") as hDst:
            iAddr = 0
            hDst.write("i{:_>10} -- #{:_>10}\n".format("0", iAddr))
            while iAddr < len(self.byDic):
                iEndArcAddr = iAddr+self.nBytesArc
                nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big')
                nArc = nRawArc & self._arcMask
                if not (nRawArc & self._addrBitMask):
                    iNextNodeAddr = int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big')
                    hDst.write("  {:<20}  {:0>16}  i{:>10}   #{:_>10}\n".format(self.lArcVal[nArc], bin(nRawArc)[2:], "?", iNextNodeAddr))
                    iAddr = iEndArcAddr+self.nBytesNodeAddress
                else:
                    hDst.write("  {:<20}  {:0>16}\n".format(self.lArcVal[nArc], bin(nRawArc)[2:]))
                    iAddr = iEndArcAddr
                if nRawArc & self._lastArcMask:
                    hDst.write("\ni{:_>10} -- #{:_>10}\n".format("?", iAddr))
            hDst.close()

    # VERSION 3
    def _morph3 (self, sWord):
        "returns morphologies of <sWord>"
        iAddr = 0
        for c in sWord:
            if c not in self.dChar:
                return []
            iAddr = self._lookupArcNode(self.dChar[c], iAddr)
            if iAddr is None:
                return []
        if int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask:
            l = []
            nRawArc = 0
            iAddrNode = iAddr
            while not (nRawArc & self._lastArcMask):
                iEndArcAddr = iAddr + self.nBytesArc
                nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big')
                nArc = nRawArc & self._arcMask
                if nArc > self.nChar:
                    # This value is not a char, this is a stemming code
                    sStem = ">" + self.funcStemming(sWord, self.lArcVal[nArc])
                    # Now , we go to the next node and retrieve all following arcs values, all of them are tags
                    if not (nRawArc & self._addrBitMask):
                        iAddr2 = int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big')
                    else:
                        iAddr2 = iAddrNode + int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesOffset], byteorder='big')
                    nRawArc2 = 0
                    while not (nRawArc2 & self._lastArcMask):
                        iEndArcAddr2 = iAddr2 + self.nBytesArc
                        nRawArc2 = int.from_bytes(self.byDic[iAddr2:iEndArcAddr2], byteorder='big')
                        l.append(sStem + "/" + self.lArcVal[nRawArc2 & self._arcMask])
                        iAddr2 = iEndArcAddr2+self.nBytesNodeAddress  if not (nRawArc2 & self._addrBitMask) else iEndArcAddr2+self.nBytesOffset
                iAddr = iEndArcAddr+self.nBytesNodeAddress  if not (nRawArc & self._addrBitMask)  else iEndArcAddr+self.nBytesOffset
            return l
        return []

    def _stem3 (self, sWord):
        "returns stems list of <sWord>"
        iAddr = 0
        for c in sWord:
            if c not in self.dChar:
                return []
            iAddr = self._lookupArcNode(self.dChar[c], iAddr)
            if iAddr is None:
                return []
        if int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask:
            l = []
            nRawArc = 0
            #iAddrNode = iAddr
            while not (nRawArc & self._lastArcMask):
                iEndArcAddr = iAddr + self.nBytesArc
                nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big')
                nArc = nRawArc & self._arcMask
                if nArc > self.nChar:
                    # This value is not a char, this is a stemming code
                    l.append(self.funcStemming(sWord, self.lArcVal[nArc]))
                iAddr = iEndArcAddr+self.nBytesNodeAddress  if not (nRawArc & self._addrBitMask)  else iEndArcAddr+self.nBytesOffset
            return l
        return []

    def _lookupArcNode3 (self, nVal, iAddr):
        "looks if <nVal> is an arc at the node at <iAddr>, if yes, returns address of next node else None"
        iAddrNode = iAddr
        while True:
            iEndArcAddr = iAddr+self.nBytesArc
            nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big')
            if nVal == (nRawArc & self._arcMask):
                # the value we are looking for
                if not (nRawArc & self._addrBitMask):
                    return int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big')
                else:
                    return iAddrNode + int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesOffset], byteorder='big')
            else:
                # value not found
                if nRawArc & self._lastArcMask:
                    return None
                iAddr = iEndArcAddr+self.nBytesNodeAddress  if not (nRawArc & self._addrBitMask)  else iEndArcAddr+self.nBytesOffset

    def _writeNodes3 (self, spfDest):
        "for debugging only"
        print(" > Write binary nodes")
        with open(spfDest, 'w', 'utf-8', newline="\n") as hDst:
            iAddr = 0
            hDst.write("i{:_>10} -- #{:_>10}\n".format("0", iAddr))
            while iAddr < len(self.byDic):
                iEndArcAddr = iAddr+self.nBytesArc
                nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big')
                nArc = nRawArc & self._arcMask
                if not (nRawArc & self._addrBitMask):
                    iNextNodeAddr = int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big')
                    hDst.write("  {:<20}  {:0>16}  i{:>10}   #{:_>10}\n".format(self.lArcVal[nArc], bin(nRawArc)[2:], "?", iNextNodeAddr))
                    iAddr = iEndArcAddr+self.nBytesNodeAddress
                else:
                    iNextNodeAddr = int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesOffset], byteorder='big')
                    hDst.write("  {:<20}  {:0>16}  i{:>10}   +{:_>10}\n".format(self.lArcVal[nArc], bin(nRawArc)[2:], "?", iNextNodeAddr))
                    iAddr = iEndArcAddr+self.nBytesOffset
                if nRawArc & self._lastArcMask:
                    hDst.write("\ni{:_>10} -- #{:_>10}\n".format("?", iAddr))
            hDst.close()

Modified graphspell/keyboard_chars_proximity.py from [8f397a7bbf] to [f71f3b18e4].



1

2
3
4

5
6
7
8

9
10
11
12
13
14
15


# Keyboard chars proximity



def getKeyboardMap (sKeyboard):

    return _dKeyboardMap.get(sKeyboard.lower(), {})


def getKeyboardList ():

    return _dKeyboardMap.keys()


_dKeyboardMap = {
    # keyboards by alphabetical order
    # bépo, colemak and dvorak users are assumed to do less typing errors.
    "azerty": {
>
>
|
>



>




>







1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20

"""
Keyboard chars proximity
"""


def getKeyboardMap (sKeyboard):
    "return keyboard map as a dictionary of chars"
    return _dKeyboardMap.get(sKeyboard.lower(), {})


def getKeyboardList ():
    "return list of keyboards available"
    return _dKeyboardMap.keys()


_dKeyboardMap = {
    # keyboards by alphabetical order
    # bépo, colemak and dvorak users are assumed to do less typing errors.
    "azerty": {

Modified graphspell/progressbar.py from [5def72a6ce] to [b21d9bfaa8].


1


2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35

# Textual progressbar


# by Olivier R.
# License: MPL 2

import time

class ProgressBar:
    "Textual progressbar"
    
    def __init__ (self, nMin=0, nMax=100, nWidth=78):
        "initiate with minimum nMin to maximum nMax"
        self.nMin = nMin
        self.nMax = nMax
        self.nSpan = nMax - nMin
        self.nWidth = nWidth-9
        self.nAdvance = -1
        self.nCurVal = nMin
        self.startTime = time.time()
        self._update()

    def _update (self):
        fDone = ((self.nCurVal - self.nMin) / self.nSpan)
        nAdvance = int(fDone * self.nWidth)
        if (nAdvance > self.nAdvance):
            self.nAdvance = nAdvance
            print("\r[ {}{}  {}% ] ".format('>'*nAdvance, ' '*(self.nWidth-nAdvance), round(fDone*100)), end="")

    def increment (self, n=1):
        "increment value by n (1 by default)"
        self.nCurVal += n
        self._update()
    
    def done (self):
        "to call when it’s finished"
        print("\r[ task done in {:.1f} s ] ".format(time.time() - self.startTime))
>
|
>
>







|












|

|







|



1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
"""
Textual progressbar
"""

# by Olivier R.
# License: MPL 2

import time

class ProgressBar:
    "Textual progressbar"

    def __init__ (self, nMin=0, nMax=100, nWidth=78):
        "initiate with minimum nMin to maximum nMax"
        self.nMin = nMin
        self.nMax = nMax
        self.nSpan = nMax - nMin
        self.nWidth = nWidth-9
        self.nAdvance = -1
        self.nCurVal = nMin
        self.startTime = time.time()
        self._update()

    def _update (self):
        fDone = (self.nCurVal - self.nMin) / self.nSpan
        nAdvance = int(fDone * self.nWidth)
        if nAdvance > self.nAdvance:
            self.nAdvance = nAdvance
            print("\r[ {}{}  {}% ] ".format('>'*nAdvance, ' '*(self.nWidth-nAdvance), round(fDone*100)), end="")

    def increment (self, n=1):
        "increment value by n (1 by default)"
        self.nCurVal += n
        self._update()

    def done (self):
        "to call when it’s finished"
        print("\r[ task done in {:.1f} s ] ".format(time.time() - self.startTime))

Modified graphspell/spellchecker.py from [cbd22d2c4d] to [85bf9023fe].


1
2
3
4
5
6
7
8
9
10
11

12
13
14
15
16
17
18
19
20
21
22
23
24

25
26
27
28
29
30
31
32
33
34
35
36
37
38







39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56

57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84

85
86
87

88
89
90

91
92
93

94
95
96

97
98
99

100
101




























102
103
104
105

106
107
108
109
110
111
112
113
114
115
116
117
118


119
120
121
122
123
124
125
126
127

# Spellchecker
# Wrapper for the IBDAWG class.
# Useful to check several dictionaries at once.

# To avoid iterating over a pile of dictionaries, it is assumed that 3 are enough:
# - the main dictionary, bundled with the package
# - the extended dictionary
# - the community dictionary, added by an organization
# - the personal dictionary, created by the user for its own convenience



import traceback

from . import ibdawg
from . import tokenizer


dDefaultDictionaries = {
    "fr": "fr-allvars.bdic",
    "en": "en.bdic"
}


class SpellChecker ():


    def __init__ (self, sLangCode, sfMainDic="", sfExtendedDic="", sfCommunityDic="", sfPersonalDic=""):
        "returns True if the main dictionary is loaded"
        self.sLangCode = sLangCode
        if not sfMainDic:
            sfMainDic = dDefaultDictionaries.get(sLangCode, "")
        self.oMainDic = self._loadDictionary(sfMainDic, True)
        self.oExtendedDic = self._loadDictionary(sfExtendedDic)
        self.oCommunityDic = self._loadDictionary(sfCommunityDic)
        self.oPersonalDic = self._loadDictionary(sfPersonalDic)
        self.bExtendedDic = bool(self.oExtendedDic)
        self.bCommunityDic = bool(self.oCommunityDic)
        self.bPersonalDic = bool(self.oPersonalDic)
        self.oTokenizer = None








    def _loadDictionary (self, source, bNecessary=False):
        "returns an IBDAWG object"
        if not source:
            return None
        try:
            return ibdawg.IBDAWG(source)
        except Exception as e:
            if bNecessary:
                raise Exception(str(e), "Error: <" + str(source) + "> not loaded.")
            print("Error: <" + str(source) + "> not loaded.")
            traceback.print_exc()
            return None

    def loadTokenizer (self):
        self.oTokenizer = tokenizer.Tokenizer(self.sLangCode)

    def getTokenizer (self):

        if not self.oTokenizer:
            self.loadTokenizer()
        return self.oTokenizer

    def setMainDictionary (self, source):
        "returns True if the dictionary is loaded"
        self.oMainDic = self._loadDictionary(source, True)
        return bool(self.oMainDic)
            
    def setExtendedDictionary (self, source, bActivate=True):
        "returns True if the dictionary is loaded"
        self.oExtendedDic = self._loadDictionary(source)
        self.bExtendedDic = False  if not bActivate  else bool(self.oExtendedDic)
        return bool(self.oExtendedDic)

    def setCommunityDictionary (self, source, bActivate=True):
        "returns True if the dictionary is loaded"
        self.oCommunityDic = self._loadDictionary(source)
        self.bCommunityDic = False  if not bActivate  else bool(self.oCommunityDic)
        return bool(self.oCommunityDic)

    def setPersonalDictionary (self, source, bActivate=True):
        "returns True if the dictionary is loaded"
        self.oPersonalDic = self._loadDictionary(source)
        self.bPersonalDic = False  if not bActivate  else bool(self.oPersonalDic)
        return bool(self.oPersonalDic)

    def activateExtendedDictionary (self):

        self.bExtendedDic = bool(self.oExtendedDic)

    def activateCommunityDictionary (self):

        self.bCommunityDic = bool(self.oCommunityDic)

    def activatePersonalDictionary (self):

        self.bPersonalDic = bool(self.oPersonalDic)

    def deactivateExtendedDictionary (self):

        self.bExtendedDic = False

    def deactivateCommunityDictionary (self):

        self.bCommunityDic = False

    def deactivatePersonalDictionary (self):

        self.bPersonalDic = False






























    # parse text functions

    def parseParagraph (self, sText, bSpellSugg=False):

        if not self.oTokenizer:
            self.loadTokenizer()
        aSpellErrs = []
        for dToken in self.oTokenizer.genTokens(sText):
            if dToken['sType'] == "WORD" and not self.isValidToken(dToken['sValue']):
                if bSpellSugg:
                    dToken['aSuggestions'] = []
                    for lSugg in self.suggest(dToken['sValue']):
                        dToken['aSuggestions'].extend(lSugg)
                aSpellErrs.append(dToken)
        return aSpellErrs

    def countWordsOccurrences (self, sText, bByLemma=False, bOnlyUnknownWords=False, dWord={}):


        if not self.oTokenizer:
            self.loadTokenizer()
        for dToken in self.oTokenizer.genTokens(sText):
            if dToken['sType'] == "WORD":
                if bOnlyUnknownWords:
                    if not self.isValidToken(dToken['sValue']):
                        dWord[dToken['sValue']] = dWord.get(dToken['sValue'], 0) + 1
                else:
                    if not bByLemma:
>
|
<
|

|
|
|
|
|
|

>













>














>
>
>
>
>
>
>














|



>

|






|



















>



>



>



>



>



>


>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>




>

|











>
>

|







1
2

3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
"""
Spellchecker.

Useful to check several dictionaries at once.

To avoid iterating over a pile of dictionaries, it is assumed that 3 are enough:
- the main dictionary, bundled with the package
- the extended dictionary
- the community dictionary, added by an organization
- the personal dictionary, created by the user for its own convenience
"""

import importlib
import traceback

from . import ibdawg
from . import tokenizer


dDefaultDictionaries = {
    "fr": "fr-allvars.bdic",
    "en": "en.bdic"
}


class SpellChecker ():
    "SpellChecker: wrapper for the IBDAWG class"

    def __init__ (self, sLangCode, sfMainDic="", sfExtendedDic="", sfCommunityDic="", sfPersonalDic=""):
        "returns True if the main dictionary is loaded"
        self.sLangCode = sLangCode
        if not sfMainDic:
            sfMainDic = dDefaultDictionaries.get(sLangCode, "")
        self.oMainDic = self._loadDictionary(sfMainDic, True)
        self.oExtendedDic = self._loadDictionary(sfExtendedDic)
        self.oCommunityDic = self._loadDictionary(sfCommunityDic)
        self.oPersonalDic = self._loadDictionary(sfPersonalDic)
        self.bExtendedDic = bool(self.oExtendedDic)
        self.bCommunityDic = bool(self.oCommunityDic)
        self.bPersonalDic = bool(self.oPersonalDic)
        self.oTokenizer = None
        # Default suggestions
        self.dDefaultSugg = None
        self.loadSuggestions(sLangCode)
        # storage
        self.bStorage = False
        self._dMorphologies = {}        # key: flexion, value: list of morphologies
        self._dLemmas = {}              # key: flexion, value: list of lemmas

    def _loadDictionary (self, source, bNecessary=False):
        "returns an IBDAWG object"
        if not source:
            return None
        try:
            return ibdawg.IBDAWG(source)
        except Exception as e:
            if bNecessary:
                raise Exception(str(e), "Error: <" + str(source) + "> not loaded.")
            print("Error: <" + str(source) + "> not loaded.")
            traceback.print_exc()
            return None

    def _loadTokenizer (self):
        self.oTokenizer = tokenizer.Tokenizer(self.sLangCode)

    def getTokenizer (self):
        "load and return the tokenizer object"
        if not self.oTokenizer:
            self._loadTokenizer()
        return self.oTokenizer

    def setMainDictionary (self, source):
        "returns True if the dictionary is loaded"
        self.oMainDic = self._loadDictionary(source, True)
        return bool(self.oMainDic)

    def setExtendedDictionary (self, source, bActivate=True):
        "returns True if the dictionary is loaded"
        self.oExtendedDic = self._loadDictionary(source)
        self.bExtendedDic = False  if not bActivate  else bool(self.oExtendedDic)
        return bool(self.oExtendedDic)

    def setCommunityDictionary (self, source, bActivate=True):
        "returns True if the dictionary is loaded"
        self.oCommunityDic = self._loadDictionary(source)
        self.bCommunityDic = False  if not bActivate  else bool(self.oCommunityDic)
        return bool(self.oCommunityDic)

    def setPersonalDictionary (self, source, bActivate=True):
        "returns True if the dictionary is loaded"
        self.oPersonalDic = self._loadDictionary(source)
        self.bPersonalDic = False  if not bActivate  else bool(self.oPersonalDic)
        return bool(self.oPersonalDic)

    def activateExtendedDictionary (self):
        "activate extended dictionary (if available)"
        self.bExtendedDic = bool(self.oExtendedDic)

    def activateCommunityDictionary (self):
        "activate community dictionary (if available)"
        self.bCommunityDic = bool(self.oCommunityDic)

    def activatePersonalDictionary (self):
        "activate personal dictionary (if available)"
        self.bPersonalDic = bool(self.oPersonalDic)

    def deactivateExtendedDictionary (self):
        "deactivate extended dictionary"
        self.bExtendedDic = False

    def deactivateCommunityDictionary (self):
        "deactivate community dictionary"
        self.bCommunityDic = False

    def deactivatePersonalDictionary (self):
        "deactivate personal dictionary"
        self.bPersonalDic = False


    # Default suggestions

    def loadSuggestions (self, sLangCode):
        "load default suggestion module for <sLangCode>"
        try:
            suggest = importlib.import_module("."+sLangCode, "graphspell")
        except ImportError:
            print("No suggestion module for language <"+sLangCode+">")
            return
        self.dDefaultSugg = suggest.dSugg


    # Storage

    def activateStorage (self):
        "store all lemmas and morphologies retrieved from the word graph"
        self.bStorage = True

    def deactivateStorage (self):
        "stop storing all lemmas and morphologies retrieved from the word graph"
        self.bStorage = False

    def clearStorage (self):
        "clear all stored data"
        self._dLemmas.clear()
        self._dMorphologies.clear()


    # parse text functions

    def parseParagraph (self, sText, bSpellSugg=False):
        "return a list of tokens where token value doesn’t exist in the word graph"
        if not self.oTokenizer:
            self._loadTokenizer()
        aSpellErrs = []
        for dToken in self.oTokenizer.genTokens(sText):
            if dToken['sType'] == "WORD" and not self.isValidToken(dToken['sValue']):
                if bSpellSugg:
                    dToken['aSuggestions'] = []
                    for lSugg in self.suggest(dToken['sValue']):
                        dToken['aSuggestions'].extend(lSugg)
                aSpellErrs.append(dToken)
        return aSpellErrs

    def countWordsOccurrences (self, sText, bByLemma=False, bOnlyUnknownWords=False, dWord={}):
        """count word occurrences.
           <dWord> can be used to cumulate count from several texts."""
        if not self.oTokenizer:
            self._loadTokenizer()
        for dToken in self.oTokenizer.genTokens(sText):
            if dToken['sType'] == "WORD":
                if bOnlyUnknownWords:
                    if not self.isValidToken(dToken['sValue']):
                        dWord[dToken['sValue']] = dWord.get(dToken['sValue'], 0) + 1
                else:
                    if not bByLemma:
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173


174
175
176
177
178
179
180



181
182
183





184
185
186
187







188


189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206

207
208
209
210
211
212
213

    def isValid (self, sWord):
        "checks if sWord is valid (different casing tested if the first letter is a capital)"
        if self.oMainDic.isValid(sWord):
            return True
        if self.bExtendedDic and self.oExtendedDic.isValid(sWord):
            return True
        if self.bCommunityDic and self.oCommunityDic.isValid(sToken):
            return True
        if self.bPersonalDic and self.oPersonalDic.isValid(sWord):
            return True
        return False

    def lookup (self, sWord):
        "checks if sWord is in dictionary as is (strict verification)"
        if self.oMainDic.lookup(sWord):
            return True
        if self.bExtendedDic and self.oExtendedDic.lookup(sWord):
            return True
        if self.bCommunityDic and self.oCommunityDic.lookup(sToken):
            return True
        if self.bPersonalDic and self.oPersonalDic.lookup(sWord):
            return True
        return False

    def getMorph (self, sWord):
        "retrieves morphologies list, different casing allowed"


        lResult = self.oMainDic.getMorph(sWord)
        if self.bExtendedDic:
            lResult.extend(self.oExtendedDic.getMorph(sWord))
        if self.bCommunityDic:
            lResult.extend(self.oCommunityDic.getMorph(sWord))
        if self.bPersonalDic:
            lResult.extend(self.oPersonalDic.getMorph(sWord))



        return lResult

    def getLemma (self, sWord):





        return set([ s[1:s.find(" ")]  for s in self.getMorph(sWord) ])

    def suggest (self, sWord, nSuggLimit=10):
        "generator: returns 1, 2 or 3 lists of suggestions"







        yield self.oMainDic.suggest(sWord, nSuggLimit)


        if self.bExtendedDic:
            yield self.oExtendedDic.suggest(sWord, nSuggLimit)
        if self.bCommunityDic:
            yield self.oCommunityDic.suggest(sWord, nSuggLimit)
        if self.bPersonalDic:
            yield self.oPersonalDic.suggest(sWord, nSuggLimit)

    def select (self, sFlexPattern="", sTagsPattern=""):
        "generator: returns all entries which flexion fits <sFlexPattern> and morphology fits <sTagsPattern>"
        yield from self.oMainDic.select(sFlexPattern, sTagsPattern)
        if self.bExtendedDic:
            yield from self.oExtendedDic.select(sFlexPattern, sTagsPattern)
        if self.bCommunityDic:
            yield from self.oCommunityDic.select(sFlexPattern, sTagsPattern)
        if self.bPersonalDic:
            yield from self.oPersonalDic.select(sFlexPattern, sTagsPattern)

    def drawPath (self, sWord):

        self.oMainDic.drawPath(sWord)
        if self.bExtendedDic:
            print("-----")
            self.oExtendedDic.drawPath(sWord)
        if self.bCommunityDic:
            print("-----")
            self.oCommunityDic.drawPath(sWord)







|











|







>
>
|

|

|

|
>
>
>
|


>
>
>
>
>
|



>
>
>
>
>
>
>
|
>
>


















>







194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280

    def isValid (self, sWord):
        "checks if sWord is valid (different casing tested if the first letter is a capital)"
        if self.oMainDic.isValid(sWord):
            return True
        if self.bExtendedDic and self.oExtendedDic.isValid(sWord):
            return True
        if self.bCommunityDic and self.oCommunityDic.isValid(sWord):
            return True
        if self.bPersonalDic and self.oPersonalDic.isValid(sWord):
            return True
        return False

    def lookup (self, sWord):
        "checks if sWord is in dictionary as is (strict verification)"
        if self.oMainDic.lookup(sWord):
            return True
        if self.bExtendedDic and self.oExtendedDic.lookup(sWord):
            return True
        if self.bCommunityDic and self.oCommunityDic.lookup(sWord):
            return True
        if self.bPersonalDic and self.oPersonalDic.lookup(sWord):
            return True
        return False

    def getMorph (self, sWord):
        "retrieves morphologies list, different casing allowed"
        if self.bStorage and sWord in self._dMorphologies:
            return self._dMorphologies[sWord]
        lMorph = self.oMainDic.getMorph(sWord)
        if self.bExtendedDic:
            lMorph.extend(self.oExtendedDic.getMorph(sWord))
        if self.bCommunityDic:
            lMorph.extend(self.oCommunityDic.getMorph(sWord))
        if self.bPersonalDic:
            lMorph.extend(self.oPersonalDic.getMorph(sWord))
        if self.bStorage:
            self._dMorphologies[sWord] = lMorph
            self._dLemmas[sWord] = set([ s[1:s.find("/")]  for s in lMorph ])
        return lMorph

    def getLemma (self, sWord):
        "retrieves lemmas"
        if self.bStorage:
            if sWord not in self._dLemmas:
                self.getMorph(sWord)
            return self._dLemmas[sWord]
        return set([ s[1:s.find("/")]  for s in self.getMorph(sWord) ])

    def suggest (self, sWord, nSuggLimit=10):
        "generator: returns 1, 2 or 3 lists of suggestions"
        if self.dDefaultSugg:
            if sWord in self.dDefaultSugg:
                yield self.dDefaultSugg[sWord].split("|")
            elif sWord.istitle() and sWord.lower() in self.dDefaultSugg:
                lRes = self.dDefaultSugg[sWord.lower()].split("|")
                yield list(map(lambda sSugg: sSugg[0:1].upper()+sSugg[1:], lRes))
            else:
                yield self.oMainDic.suggest(sWord, nSuggLimit)
        else:
            yield self.oMainDic.suggest(sWord, nSuggLimit)
        if self.bExtendedDic:
            yield self.oExtendedDic.suggest(sWord, nSuggLimit)
        if self.bCommunityDic:
            yield self.oCommunityDic.suggest(sWord, nSuggLimit)
        if self.bPersonalDic:
            yield self.oPersonalDic.suggest(sWord, nSuggLimit)

    def select (self, sFlexPattern="", sTagsPattern=""):
        "generator: returns all entries which flexion fits <sFlexPattern> and morphology fits <sTagsPattern>"
        yield from self.oMainDic.select(sFlexPattern, sTagsPattern)
        if self.bExtendedDic:
            yield from self.oExtendedDic.select(sFlexPattern, sTagsPattern)
        if self.bCommunityDic:
            yield from self.oCommunityDic.select(sFlexPattern, sTagsPattern)
        if self.bPersonalDic:
            yield from self.oPersonalDic.select(sFlexPattern, sTagsPattern)

    def drawPath (self, sWord):
        "draw the path taken by <sWord> within the word graph: display matching nodes and their arcs"
        self.oMainDic.drawPath(sWord)
        if self.bExtendedDic:
            print("-----")
            self.oExtendedDic.drawPath(sWord)
        if self.bCommunityDic:
            print("-----")
            self.oCommunityDic.drawPath(sWord)

Modified graphspell/str_transform.py from [9961c8cbc8] to [c5501f9a5a].

1
2






3
4
5
6

7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
#!python3








#### DISTANCE CALCULATIONS

def longestCommonSubstring (s1, s2):

    # http://en.wikipedia.org/wiki/Longest_common_substring_problem
    # http://en.wikibooks.org/wiki/Algorithm_implementation/Strings/Longest_common_substring
    M = [ [0]*(1+len(s2)) for i in range(1+len(s1)) ]
    longest, x_longest = 0, 0
    for x in range(1, 1+len(s1)):
        for y in range(1, 1+len(s2)):
            if s1[x-1] == s2[y-1]:
                M[x][y] = M[x-1][y-1] + 1
                if M[x][y] > longest:
                    longest = M[x][y]
                    x_longest = x
            else:
                M[x][y] = 0
    return s1[x_longest-longest : x_longest]


def distanceDamerauLevenshtein (s1, s2):
    "distance of Damerau-Levenshtein between <s1> and <s2>"
    # https://fr.wikipedia.org/wiki/Distance_de_Damerau-Levenshtein
    d = {}
    nLen1 = len(s1)


>
>
>
>
>
>




>


|
|



|
|
|
|

|
|







1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
#!python3

"""
Operations on strings:
- calculate distance between two strings
- transform strings with transformation codes
"""


#### DISTANCE CALCULATIONS

def longestCommonSubstring (s1, s2):
    "longest common substring"
    # http://en.wikipedia.org/wiki/Longest_common_substring_problem
    # http://en.wikibooks.org/wiki/Algorithm_implementation/Strings/Longest_common_substring
    lMatrix = [ [0]*(1+len(s2)) for i in range(1+len(s1)) ]
    nLongest, nLongestX = 0, 0
    for x in range(1, 1+len(s1)):
        for y in range(1, 1+len(s2)):
            if s1[x-1] == s2[y-1]:
                lMatrix[x][y] = lMatrix[x-1][y-1] + 1
                if lMatrix[x][y] > nLongest:
                    nLongest = lMatrix[x][y]
                    nLongestX = x
            else:
                lMatrix[x][y] = 0
    return s1[nLongestX-nLongest : nLongestX]


def distanceDamerauLevenshtein (s1, s2):
    "distance of Damerau-Levenshtein between <s1> and <s2>"
    # https://fr.wikipedia.org/wiki/Distance_de_Damerau-Levenshtein
    d = {}
    nLen1 = len(s1)
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
        return len(s1)
    nLen1, nLen2 = len(s1), len(s2)
    i1, i2 = 0, 0   # Cursors for each string
    nLargestCS = 0  # Largest common substring
    nLocalCS = 0    # Local common substring
    nTrans = 0      # Number of transpositions ('ab' vs 'ba')
    lOffset = []    # Offset pair array, for computing the transpositions
 
    while i1 < nLen1 and i2 < nLen2:
        if s1[i1] == s2[i2]:
            nLocalCS += 1
            # Check if current match is a transposition
            bTrans = False
            i = 0
            while i < len(lOffset):







|







59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
        return len(s1)
    nLen1, nLen2 = len(s1), len(s2)
    i1, i2 = 0, 0   # Cursors for each string
    nLargestCS = 0  # Largest common substring
    nLocalCS = 0    # Local common substring
    nTrans = 0      # Number of transpositions ('ab' vs 'ba')
    lOffset = []    # Offset pair array, for computing the transpositions

    while i1 < nLen1 and i2 < nLen2:
        if s1[i1] == s2[i2]:
            nLocalCS += 1
            # Check if current match is a transposition
            bTrans = False
            i = 0
            while i < len(lOffset):
101
102
103
104
105
106
107

108
109
110
111
112
113
114
115
116
117
118

119
120
121



122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
            nLocalCS = 0
            i1 = i2 = min(i1, i2)
    nLargestCS += nLocalCS
    return round(max(nLen1, nLen2) - nLargestCS + nTrans)


def showDistance (s1, s2):

    print("Damerau-Levenshtein: " + s1 + "/" + s2 + " = " + distanceDamerauLevenshtein(s1, s2))
    print("Sift4:" + s1 + "/" + s2 + " = " + distanceSift4(s1, s2))




#### STEMMING OPERATIONS

## No stemming

def noStemming (sFlex, sStem):

    return sStem

def rebuildWord (sFlex, cmd1, cmd2):



    if cmd1 == "_":
        return sFlex
    n, c = cmd1.split(":")
    s = s[:n] + c + s[n:]
    if cmd2 == "_":
        return s
    n, c = cmd2.split(":")
    return s[:n] + c + s[n:]

    
## Define affixes for stemming

# Note: 48 is the ASCII code for "0"


# Suffix only
def defineSuffixCode (sFlex, sStem):







>











>


|
>
>
>
|

|
|
|
|
|
|

|







108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
            nLocalCS = 0
            i1 = i2 = min(i1, i2)
    nLargestCS += nLocalCS
    return round(max(nLen1, nLen2) - nLargestCS + nTrans)


def showDistance (s1, s2):
    "display Damerau-Levenshtein distance and Sift4 distance between <s1> and <s2>"
    print("Damerau-Levenshtein: " + s1 + "/" + s2 + " = " + distanceDamerauLevenshtein(s1, s2))
    print("Sift4:" + s1 + "/" + s2 + " = " + distanceSift4(s1, s2))




#### STEMMING OPERATIONS

## No stemming

def noStemming (sFlex, sStem):
    "return <sStem>"
    return sStem

def rebuildWord (sFlex, sCode1, sCode2):
    """ Change <sFlex> with codes (each inserts a char at a defined possition).
        <I forgot what purpose it has…>
    """
    if sCode1 == "_":
        return sFlex
    n, c = sCode1.split(":")
    sFlex = sFlex[:n] + c + sFlex[n:]
    if sCode2 == "_":
        return sFlex
    n, c = sCode2.split(":")
    return sFlex[:n] + c + sFlex[n:]


## Define affixes for stemming

# Note: 48 is the ASCII code for "0"


# Suffix only
def defineSuffixCode (sFlex, sStem):
148
149
150
151
152
153
154
155
156
157
158

159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
    if sFlex == sStem:
        return "0"
    jSfx = 0
    for i in range(min(len(sFlex), len(sStem))):
        if sFlex[i] != sStem[i]:
            break
        jSfx += 1
    return chr(len(sFlex)-jSfx+48) + sStem[jSfx:]  


def changeWordWithSuffixCode (sWord, sSfxCode):

    if sSfxCode == "0":
        return sWord
    return sWord[:-(ord(sSfxCode[0])-48)] + sSfxCode[1:]  if sSfxCode[0] != '0'  else sWord + sSfxCode[1:]


# Prefix and suffix

def defineAffixCode (sFlex, sStem):
    """ Returns a string defining how to get stem from flexion. Examples:
            "0" if stem = flexion
            "stem" if no common substring
            "n(pfx)/m(sfx)"
        with n and m: chars with numeric meaning, "0" = 0, "1" = 1, ... ":" = 10, etc. (See ASCII table.) Says how many letters to strip from flexion.
            pfx [optional]: string to add before the flexion 
            sfx [optional]: string to add after the flexion
    """
    if sFlex == sStem:
        return "0"
    # is stem a substring of flexion?
    n = sFlex.find(sStem)
    if n >= 0:







|



>













|







160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
    if sFlex == sStem:
        return "0"
    jSfx = 0
    for i in range(min(len(sFlex), len(sStem))):
        if sFlex[i] != sStem[i]:
            break
        jSfx += 1
    return chr(len(sFlex)-jSfx+48) + sStem[jSfx:]


def changeWordWithSuffixCode (sWord, sSfxCode):
    "apply transformation code <sSfxCode> on <sWord> and return the result string"
    if sSfxCode == "0":
        return sWord
    return sWord[:-(ord(sSfxCode[0])-48)] + sSfxCode[1:]  if sSfxCode[0] != '0'  else sWord + sSfxCode[1:]


# Prefix and suffix

def defineAffixCode (sFlex, sStem):
    """ Returns a string defining how to get stem from flexion. Examples:
            "0" if stem = flexion
            "stem" if no common substring
            "n(pfx)/m(sfx)"
        with n and m: chars with numeric meaning, "0" = 0, "1" = 1, ... ":" = 10, etc. (See ASCII table.) Says how many letters to strip from flexion.
            pfx [optional]: string to add before the flexion
            sfx [optional]: string to add after the flexion
    """
    if sFlex == sStem:
        return "0"
    # is stem a substring of flexion?
    n = sFlex.find(sStem)
    if n >= 0:
187
188
189
190
191
192
193

194
195
196
197
198
199
200
201
        n = sFlex.find(sSubs)
        m = len(sFlex) - (len(sSubs)+n)
        return chr(n+48) + sPfx + "/" + chr(m+48) + sSfx
    return sStem


def changeWordWithAffixCode (sWord, sAffCode):

    if sAffCode == "0":
        return sWord
    if '/' not in sAffCode:
        return sAffCode
    sPfxCode, sSfxCode = sAffCode.split('/')
    sWord = sPfxCode[1:] + sWord[(ord(sPfxCode[0])-48):] 
    return sWord[:-(ord(sSfxCode[0])-48)] + sSfxCode[1:]  if sSfxCode[0] != '0'  else sWord + sSfxCode[1:]








>





|

<
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214

        n = sFlex.find(sSubs)
        m = len(sFlex) - (len(sSubs)+n)
        return chr(n+48) + sPfx + "/" + chr(m+48) + sSfx
    return sStem


def changeWordWithAffixCode (sWord, sAffCode):
    "apply transformation code <sAffCode> on <sWord> and return the result string"
    if sAffCode == "0":
        return sWord
    if '/' not in sAffCode:
        return sAffCode
    sPfxCode, sSfxCode = sAffCode.split('/')
    sWord = sPfxCode[1:] + sWord[(ord(sPfxCode[0])-48):]
    return sWord[:-(ord(sSfxCode[0])-48)] + sSfxCode[1:]  if sSfxCode[0] != '0'  else sWord + sSfxCode[1:]

Modified graphspell/tokenizer.py from [17f452887e] to [daca54adb9].


1


2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17

18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33

34
35
36
37
38
39

40
41
42
43
44
45
46
47




48
49




# Very simple tokenizer



import re

_PATTERNS = {
    "default":
        (
            r'(?P<FOLDERUNIX>/(?:bin|boot|dev|etc|home|lib|mnt|opt|root|sbin|tmp|usr|var|Bureau|Documents|Images|Musique|Public|Téléchargements|Vidéos)(?:/[\w.()-]+)*)',
            r'(?P<FOLDERWIN>[a-zA-Z]:\\(?:Program Files(?: [(]x86[)]|)|[\w.()]+)(?:\\[\w.()-]+)*)',
            r'(?P<PUNC>[.,?!:;…«»“”"()/·]+)',
            r'(?P<ACRONYM>[A-Z][.][A-Z][.](?:[A-Z][.])*)',
            r'(?P<LINK>(?:https?://|www[.]|\w+[@.]\w\w+[@.])\w[\w./?&!%=+*"\'@$#-]+)',
            r'(?P<HASHTAG>[#@][\w-]+)',
            r'(?P<HTML><\w+.*?>|</\w+ *>)',
            r'(?P<PSEUDOHTML>\[/?\w+\])',
            r'(?P<HOUR>\d\d?h\d\d\b)',
            r'(?P<NUM>-?\d+(?:[.,]\d+))',

            r"(?P<WORD>\w+(?:[’'`-]\w+)*)"
        ),
    "fr":
        (
            r'(?P<FOLDERUNIX>/(?:bin|boot|dev|etc|home|lib|mnt|opt|root|sbin|tmp|usr|var|Bureau|Documents|Images|Musique|Public|Téléchargements|Vidéos)(?:/[\w.()-]+)*)',
            r'(?P<FOLDERWIN>[a-zA-Z]:\\(?:Program Files(?: [(]x86[)]|)|[\w.()]+)(?:\\[\w.()-]+)*)',
            r'(?P<PUNC>[.,?!:;…«»“”"()/·]+)',
            r'(?P<ACRONYM>[A-Z][.][A-Z][.](?:[A-Z][.])*)',
            r'(?P<LINK>(?:https?://|www[.]|\w+[@.]\w\w+[@.])\w[\w./?&!%=+*"\'@$#-]+)',
            r'(?P<HASHTAG>[#@][\w-]+)',
            r'(?P<HTML><\w+.*?>|</\w+ *>)',
            r'(?P<PSEUDOHTML>\[/?\w+\])',
            r"(?P<ELPFX>(?:l|d|n|m|t|s|j|c|ç|lorsqu|puisqu|jusqu|quoiqu|qu)['’`])",
            r'(?P<ORDINAL>\d+(?:er|nd|e|de|ième|ème|eme)\b)',
            r'(?P<HOUR>\d\d?h\d\d\b)',
            r'(?P<NUM>-?\d+(?:[.,]\d+|))',

            r"(?P<WORD>\w+(?:[’'`-]\w+)*)"
        )
}


class Tokenizer:


    def __init__ (self, sLang):
        self.sLang = sLang
        if sLang not in _PATTERNS:
            self.sLang = "default"
        self.zToken = re.compile( "(?i)" + '|'.join(sRegex for sRegex in _PATTERNS[sLang]) )

    def genTokens (self, sText):




        for m in self.zToken.finditer(sText):
            yield { "sType": m.lastgroup, "sValue": m.group(), "nStart": m.start(), "nEnd": m.end() }



>
|
>
>








|
|





|
>






|
|




|
|

|
>






>







|
>
>
>
>
|
|
>
>
>
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
"""
Very simple tokenizer
using regular expressions
"""

import re

_PATTERNS = {
    "default":
        (
            r'(?P<FOLDERUNIX>/(?:bin|boot|dev|etc|home|lib|mnt|opt|root|sbin|tmp|usr|var|Bureau|Documents|Images|Musique|Public|Téléchargements|Vidéos)(?:/[\w.()-]+)*)',
            r'(?P<FOLDERWIN>[a-zA-Z]:\\(?:Program Files(?: [(]x86[)]|)|[\w.()]+)(?:\\[\w.()-]+)*)',
            r'(?P<PUNC>[][,.;:!?…«»“”‘’"(){}·–—])',
            r'(?P<WORD_ACRONYM>[A-Z][.][A-Z][.](?:[A-Z][.])*)',
            r'(?P<LINK>(?:https?://|www[.]|\w+[@.]\w\w+[@.])\w[\w./?&!%=+*"\'@$#-]+)',
            r'(?P<HASHTAG>[#@][\w-]+)',
            r'(?P<HTML><\w+.*?>|</\w+ *>)',
            r'(?P<PSEUDOHTML>\[/?\w+\])',
            r'(?P<HOUR>\d\d?h\d\d\b)',
            r'(?P<NUM>\d+(?:[.,]\d+))',
            r'(?P<SIGN>[%‰+=*/<>⩾⩽-])',
            r"(?P<WORD>\w+(?:[’'`-]\w+)*)"
        ),
    "fr":
        (
            r'(?P<FOLDERUNIX>/(?:bin|boot|dev|etc|home|lib|mnt|opt|root|sbin|tmp|usr|var|Bureau|Documents|Images|Musique|Public|Téléchargements|Vidéos)(?:/[\w.()-]+)*)',
            r'(?P<FOLDERWIN>[a-zA-Z]:\\(?:Program Files(?: [(]x86[)]|)|[\w.()]+)(?:\\[\w.()-]+)*)',
            r'(?P<PUNC>[][,.;:!?…«»“”‘’"(){}·–—])',
            r'(?P<WORD_ACRONYM>[A-Z][.][A-Z][.](?:[A-Z][.])*)',
            r'(?P<LINK>(?:https?://|www[.]|\w+[@.]\w\w+[@.])\w[\w./?&!%=+*"\'@$#-]+)',
            r'(?P<HASHTAG>[#@][\w-]+)',
            r'(?P<HTML><\w+.*?>|</\w+ *>)',
            r'(?P<PSEUDOHTML>\[/?\w+\])',
            r"(?P<WORD_ELIDED>(?:l|d|n|m|t|s|j|c|ç|lorsqu|puisqu|jusqu|quoiqu|qu)['’`])",
            r'(?P<WORD_ORDINAL>\d+(?:ers?|nds?|es?|des?|ièmes?|èmes?|emes?|ᵉʳˢ?|ⁿᵈˢ?|ᵉˢ?|ᵈᵉˢ?)\b)',
            r'(?P<HOUR>\d\d?h\d\d\b)',
            r'(?P<NUM>\d+(?:[.,]\d+|))',
            r'(?P<SIGN>[%‰+=*/<>⩾⩽-])',
            r"(?P<WORD>\w+(?:[’'`-]\w+)*)"
        )
}


class Tokenizer:
    "Tokenizer: transforms a text in a list of tokens"

    def __init__ (self, sLang):
        self.sLang = sLang
        if sLang not in _PATTERNS:
            self.sLang = "default"
        self.zToken = re.compile( "(?i)" + '|'.join(sRegex for sRegex in _PATTERNS[sLang]) )

    def genTokens (self, sText, bStartEndToken=False):
        "generator: tokenize <sText>"
        i = 0
        if bStartEndToken:
            yield { "i": 0, "sType": "INFO", "sValue": "<start>", "nStart": 0, "nEnd": 0, "lMorph": ["<start>"] }
        for i, m in enumerate(self.zToken.finditer(sText), 1):
            yield { "i": i, "sType": m.lastgroup, "sValue": m.group(), "nStart": m.start(), "nEnd": m.end() }
        if bStartEndToken:
            iEnd = len(sText)
            yield { "i": i+1, "sType": "INFO", "sValue": "<end>", "nStart": iEnd, "nEnd": iEnd, "lMorph": ["<end>"] }

Modified make.py from [14e0172bf2] to [cf1490b4f0].

1
2




3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29

30
31
32
33
34
35
36
37
38
39
40

41
42
43
44
45
46
47
48
49

50
51
52
53
54
55
56
 #!/usr/bin/env python3
# coding: UTF-8





import sys
import os
import subprocess
import re
import zipfile
import traceback
import configparser
import datetime
import argparse
import importlib
import unittest
import json
import platform

from distutils import dir_util, file_util

import dialog_bundled
import compile_rules
import helpers
import lex_build


sWarningMessage = "The content of this folder is generated by code and replaced at each build.\n"


def getConfig (sLang):

    xConfig = configparser.SafeConfigParser()
    xConfig.optionxform = str
    try:
        xConfig.read("gc_lang/" + sLang + "/config.ini", encoding="utf-8")
    except:
        print("# Error. Can’t read config file [" + sLang + "]")
        exit()
    return xConfig


def createOptionsLabelProperties (dOptLbl):

    sContent = ""
    for sOpt, tLabel in dOptLbl.items():
        sContent += sOpt + "=" + tLabel[0] + "\n"
        if tLabel[1]:
            sContent += "hlp_" + sOpt + "=" + tLabel[1] + "\n"
    return sContent


def createDialogOptionsXDL (dVars):

    sFixedline = '<dlg:fixedline dlg:id="{0}" dlg:tab-index="{1}" dlg:top="{2}" dlg:left="5" dlg:width="{3}" dlg:height="10" dlg:value="&amp;{0}" />\n'
    sCheckbox = '<dlg:checkbox dlg:id="{0}" dlg:tab-index="{1}" dlg:top="{2}" dlg:left="{3}" dlg:width="{4}" dlg:height="10" dlg:value="&amp;{0}" dlg:checked="{5}" {6} />\n'
    iTabIndex = 1
    nPosY = 5
    nWidth = 240
    sContent = ""
    dOpt = dVars["dOptPython"]


>
>
>
>



<













|









>
|


|
|






>









>







1
2
3
4
5
6
7
8
9

10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
 #!/usr/bin/env python3
# coding: UTF-8

"""
Grammalecte builder
"""

import sys
import os

import re
import zipfile
import traceback
import configparser
import datetime
import argparse
import importlib
import unittest
import json
import platform

from distutils import dir_util, file_util

#import dialog_bundled
import compile_rules
import helpers
import lex_build


sWarningMessage = "The content of this folder is generated by code and replaced at each build.\n"


def getConfig (sLang):
    "load config.ini in <sLang> at gc_lang/<sLang>, returns xConfigParser object"
    xConfig = configparser.ConfigParser()
    xConfig.optionxform = str
    try:
        xConfig.read_file(open("gc_lang/" + sLang + "/config.ini", "r", encoding="utf-8"))
    except FileNotFoundError:
        print("# Error. Can’t read config file [" + sLang + "]")
        exit()
    return xConfig


def createOptionsLabelProperties (dOptLbl):
    "create content for .properties files (LibreOffice)"
    sContent = ""
    for sOpt, tLabel in dOptLbl.items():
        sContent += sOpt + "=" + tLabel[0] + "\n"
        if tLabel[1]:
            sContent += "hlp_" + sOpt + "=" + tLabel[1] + "\n"
    return sContent


def createDialogOptionsXDL (dVars):
    "create bundled dialog options file .xdl (LibreOffice)"
    sFixedline = '<dlg:fixedline dlg:id="{0}" dlg:tab-index="{1}" dlg:top="{2}" dlg:left="5" dlg:width="{3}" dlg:height="10" dlg:value="&amp;{0}" />\n'
    sCheckbox = '<dlg:checkbox dlg:id="{0}" dlg:tab-index="{1}" dlg:top="{2}" dlg:left="{3}" dlg:width="{4}" dlg:height="10" dlg:value="&amp;{0}" dlg:checked="{5}" {6} />\n'
    iTabIndex = 1
    nPosY = 5
    nWidth = 240
    sContent = ""
    dOpt = dVars["dOptPython"]
131
132
133
134
135
136
137
138
139
140
141
142
143
144

145
146
147
148
149
150
151

    # Installation in Writer profile
    if bInstall:
        print("> installation in Writer")
        if dVars.get('unopkg', False):
            cmd = '"'+os.path.abspath(dVars.get('unopkg')+'" add -f '+spfZip)
            print(cmd)
            #subprocess.run(cmd)
            os.system(cmd)
        else:
            print("# Error: path and filename of unopkg not set in config.ini")


def createServerOptions (sLang, dOptData):

    with open("grammalecte-server-options."+sLang+".ini", "w", encoding="utf-8", newline="\n") as hDst:
        hDst.write("# Server options. Lang: " + sLang + "\n\n[gc_options]\n")
        for sSection, lOpt in dOptData["lStructOpt"]:
            hDst.write("\n########## " + dOptData["dOptLabel"][sLang].get(sSection, sSection + "[no label found]")[0] + " ##########\n")
            for lLineOpt in lOpt:
                for sOpt in lLineOpt:
                    hDst.write("# " + dOptData["dOptLabel"][sLang].get(sOpt, "[no label found]")[0] + "\n")







<






>







137
138
139
140
141
142
143

144
145
146
147
148
149
150
151
152
153
154
155
156
157

    # Installation in Writer profile
    if bInstall:
        print("> installation in Writer")
        if dVars.get('unopkg', False):
            cmd = '"'+os.path.abspath(dVars.get('unopkg')+'" add -f '+spfZip)
            print(cmd)

            os.system(cmd)
        else:
            print("# Error: path and filename of unopkg not set in config.ini")


def createServerOptions (sLang, dOptData):
    "create file options for Grammalecte server"
    with open("grammalecte-server-options."+sLang+".ini", "w", encoding="utf-8", newline="\n") as hDst:
        hDst.write("# Server options. Lang: " + sLang + "\n\n[gc_options]\n")
        for sSection, lOpt in dOptData["lStructOpt"]:
            hDst.write("\n########## " + dOptData["dOptLabel"][sLang].get(sSection, sSection + "[no label found]")[0] + " ##########\n")
            for lLineOpt in lOpt:
                for sOpt in lLineOpt:
                    hDst.write("# " + dOptData["dOptLabel"][sLang].get(sOpt, "[no label found]")[0] + "\n")
162
163
164
165
166
167
168

169
170
171
172
173
174
175
176
177
178
179
180
181
182
183

184
185
186
187
188
189
190
                "grammalecte-server-options._global.ini", "grammalecte-server-options."+sLang+".ini", \
                "README.txt", "LICENSE.txt", "LICENSE.fr.txt"]:
        hZip.write(spf)
    hZip.writestr("setup.py", helpers.fileFile("gc_lang/fr/setup.py", dVars))


def copyGrammalectePyPackageInZipFile (hZip, spLangPack, sAddPath=""):

    for sf in os.listdir("grammalecte"):
        if not os.path.isdir("grammalecte/"+sf):
            hZip.write("grammalecte/"+sf, sAddPath+"grammalecte/"+sf)
    for sf in os.listdir("grammalecte/graphspell"):
        if not os.path.isdir("grammalecte/graphspell/"+sf):
            hZip.write("grammalecte/graphspell/"+sf, sAddPath+"grammalecte/graphspell/"+sf)
    for sf in os.listdir("grammalecte/graphspell/_dictionaries"):
        if not os.path.isdir("grammalecte/graphspell/_dictionaries/"+sf):
            hZip.write("grammalecte/graphspell/_dictionaries/"+sf, sAddPath+"grammalecte/graphspell/_dictionaries/"+sf)
    for sf in os.listdir(spLangPack):
        if not os.path.isdir(spLangPack+"/"+sf):
            hZip.write(spLangPack+"/"+sf, sAddPath+spLangPack+"/"+sf)


def create (sLang, xConfig, bInstallOXT, bJavaScript):

    oNow = datetime.datetime.now()
    print("============== MAKE GRAMMALECTE [{0}] at {1.hour:>2} h {1.minute:>2} min {1.second:>2} s ==============".format(sLang, oNow))

    #### READ CONFIGURATION
    print("> read configuration...")
    spLang = "gc_lang/" + sLang








>















>







168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
                "grammalecte-server-options._global.ini", "grammalecte-server-options."+sLang+".ini", \
                "README.txt", "LICENSE.txt", "LICENSE.fr.txt"]:
        hZip.write(spf)
    hZip.writestr("setup.py", helpers.fileFile("gc_lang/fr/setup.py", dVars))


def copyGrammalectePyPackageInZipFile (hZip, spLangPack, sAddPath=""):
    "copy Grammalecte Python package in zip file"
    for sf in os.listdir("grammalecte"):
        if not os.path.isdir("grammalecte/"+sf):
            hZip.write("grammalecte/"+sf, sAddPath+"grammalecte/"+sf)
    for sf in os.listdir("grammalecte/graphspell"):
        if not os.path.isdir("grammalecte/graphspell/"+sf):
            hZip.write("grammalecte/graphspell/"+sf, sAddPath+"grammalecte/graphspell/"+sf)
    for sf in os.listdir("grammalecte/graphspell/_dictionaries"):
        if not os.path.isdir("grammalecte/graphspell/_dictionaries/"+sf):
            hZip.write("grammalecte/graphspell/_dictionaries/"+sf, sAddPath+"grammalecte/graphspell/_dictionaries/"+sf)
    for sf in os.listdir(spLangPack):
        if not os.path.isdir(spLangPack+"/"+sf):
            hZip.write(spLangPack+"/"+sf, sAddPath+spLangPack+"/"+sf)


def create (sLang, xConfig, bInstallOXT, bJavaScript):
    "make Grammalecte for project <sLang>"
    oNow = datetime.datetime.now()
    print("============== MAKE GRAMMALECTE [{0}] at {1.hour:>2} h {1.minute:>2} min {1.second:>2} s ==============".format(sLang, oNow))

    #### READ CONFIGURATION
    print("> read configuration...")
    spLang = "gc_lang/" + sLang

226
227
228
229
230
231
232

233
234
235
236
237
238
239
            print(sf, end=", ")
    print()

    # TEST FILES
    with open("grammalecte/"+sLang+"/gc_test.txt", "w", encoding="utf-8", newline="\n") as hDstPy:
        hDstPy.write("# TESTS FOR LANG [" + sLang + "]\n\n")
        hDstPy.write(dVars['gctests'])


    createOXT(spLang, dVars, xConfig._sections['oxt'], spLangPack, bInstallOXT)

    createServerOptions(sLang, dVars)
    createPackageZip(sLang, dVars, spLangPack)

    #### JAVASCRIPT







>







234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
            print(sf, end=", ")
    print()

    # TEST FILES
    with open("grammalecte/"+sLang+"/gc_test.txt", "w", encoding="utf-8", newline="\n") as hDstPy:
        hDstPy.write("# TESTS FOR LANG [" + sLang + "]\n\n")
        hDstPy.write(dVars['gctests'])
        hDstPy.write("\n")

    createOXT(spLang, dVars, xConfig._sections['oxt'], spLangPack, bInstallOXT)

    createServerOptions(sLang, dVars)
    createPackageZip(sLang, dVars, spLangPack)

    #### JAVASCRIPT
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
        print()
        dVars["pluginsJS"] = sCodePlugins

        # options data struct
        dVars["dOptJavaScript"] = json.dumps(list(dVars["dOptJavaScript"].items()))
        dVars["dOptFirefox"] = json.dumps(list(dVars["dOptFirefox"].items()))
        dVars["dOptThunderbird"] = json.dumps(list(dVars["dOptThunderbird"].items()))
        
        # create folder
        spLangPack = "grammalecte-js/"+sLang
        helpers.createCleanFolder(spLangPack)

        # create files
        for sf in os.listdir("js_extension"):
            dVars[sf[:-3]] = open("js_extension/"+sf, "r", encoding="utf-8").read()







|







257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
        print()
        dVars["pluginsJS"] = sCodePlugins

        # options data struct
        dVars["dOptJavaScript"] = json.dumps(list(dVars["dOptJavaScript"].items()))
        dVars["dOptFirefox"] = json.dumps(list(dVars["dOptFirefox"].items()))
        dVars["dOptThunderbird"] = json.dumps(list(dVars["dOptThunderbird"].items()))

        # create folder
        spLangPack = "grammalecte-js/"+sLang
        helpers.createCleanFolder(spLangPack)

        # create files
        for sf in os.listdir("js_extension"):
            dVars[sf[:-3]] = open("js_extension/"+sf, "r", encoding="utf-8").read()
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287

288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305

306
307
308
309
310
311
312
        for sf in os.listdir(spLang+"/modules-js"):
            if not sf.startswith("gce_"):
                helpers.copyAndFileTemplate(spLang+"/modules-js/"+sf, spLangPack+"/"+sf, dVars)
                print(sf, end=", ")
        print()

        try:
            build_module = importlib.import_module("gc_lang."+sLang+".build")
        except ImportError:
            print("# No complementary builder <build.py> in folder gc_lang/"+sLang)
        else:
            build_module.build(sLang, dVars, spLangPack)

    return dVars['version']


def copyGraphspellCore (bJavaScript=False):

    helpers.createCleanFolder("grammalecte/graphspell")
    dir_util.mkpath("grammalecte/graphspell/_dictionaries")
    for sf in os.listdir("graphspell"):
        if not os.path.isdir("graphspell/"+sf):
            file_util.copy_file("graphspell/"+sf, "grammalecte/graphspell")
    if bJavaScript:
        helpers.createCleanFolder("grammalecte-js/graphspell")
        dir_util.mkpath("grammalecte-js/graphspell/_dictionaries")
        dVars = {}
        for sf in os.listdir("js_extension"):
            dVars[sf[:-3]] = open("js_extension/"+sf, "r", encoding="utf-8").read()
        for sf in os.listdir("graphspell-js"):
            if not os.path.isdir("graphspell-js/"+sf):
                file_util.copy_file("graphspell-js/"+sf, "grammalecte-js/graphspell")
                helpers.copyAndFileTemplate("graphspell-js/"+sf, "grammalecte-js/graphspell/"+sf, dVars)


def copyGraphspellDictionaries (dVars, bJavaScript=False, bExtendedDict=False, bCommunityDict=False, bPersonalDict=False):

    dVars["dic_main_filename_py"] = ""
    dVars["dic_main_filename_js"] = ""
    dVars["dic_extended_filename_py"] = ""
    dVars["dic_extended_filename_js"] = ""
    dVars["dic_community_filename_py"] = ""
    dVars["dic_community_filename_js"] = ""
    dVars["dic_personal_filename_py"] = ""







|



|





>


















>







280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
        for sf in os.listdir(spLang+"/modules-js"):
            if not sf.startswith("gce_"):
                helpers.copyAndFileTemplate(spLang+"/modules-js/"+sf, spLangPack+"/"+sf, dVars)
                print(sf, end=", ")
        print()

        try:
            buildjs = importlib.import_module("gc_lang."+sLang+".build")
        except ImportError:
            print("# No complementary builder <build.py> in folder gc_lang/"+sLang)
        else:
            buildjs.build(sLang, dVars, spLangPack)

    return dVars['version']


def copyGraphspellCore (bJavaScript=False):
    "copy Graphspell package in Grammalecte package"
    helpers.createCleanFolder("grammalecte/graphspell")
    dir_util.mkpath("grammalecte/graphspell/_dictionaries")
    for sf in os.listdir("graphspell"):
        if not os.path.isdir("graphspell/"+sf):
            file_util.copy_file("graphspell/"+sf, "grammalecte/graphspell")
    if bJavaScript:
        helpers.createCleanFolder("grammalecte-js/graphspell")
        dir_util.mkpath("grammalecte-js/graphspell/_dictionaries")
        dVars = {}
        for sf in os.listdir("js_extension"):
            dVars[sf[:-3]] = open("js_extension/"+sf, "r", encoding="utf-8").read()
        for sf in os.listdir("graphspell-js"):
            if not os.path.isdir("graphspell-js/"+sf):
                file_util.copy_file("graphspell-js/"+sf, "grammalecte-js/graphspell")
                helpers.copyAndFileTemplate("graphspell-js/"+sf, "grammalecte-js/graphspell/"+sf, dVars)


def copyGraphspellDictionaries (dVars, bJavaScript=False, bExtendedDict=False, bCommunityDict=False, bPersonalDict=False):
    "copy requested Graphspell dictionaries in Grammalecte package"
    dVars["dic_main_filename_py"] = ""
    dVars["dic_main_filename_js"] = ""
    dVars["dic_extended_filename_py"] = ""
    dVars["dic_extended_filename_js"] = ""
    dVars["dic_community_filename_py"] = ""
    dVars["dic_community_filename_js"] = ""
    dVars["dic_personal_filename_py"] = ""
331
332
333
334
335
336
337

338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362

363
364
365
366
367
368
369
            file_util.copy_file(spfJSDic, "grammalecte-js/graphspell/_dictionaries")
            dVars['dic_'+sType+'_filename_js'] = sFileName + '.json'
    dVars['dic_main_filename_py'] = dVars['dic_default_filename_py'] + ".bdic"
    dVars['dic_main_filename_js'] = dVars['dic_default_filename_js'] + ".json"


def buildDictionary (dVars, sType, bJavaScript=False):

    if sType == "main":
        spfLexSrc = dVars['lexicon_src']
        l_sfDictDst = dVars['dic_filenames'].split(",")
        l_sDicName = dVars['dic_name'].split(",")
        l_sFilter = dVars['dic_filter'].split(",")
        for sfDictDst, sDicName, sFilter in zip(l_sfDictDst, l_sDicName, l_sFilter):
            lex_build.build(spfLexSrc, dVars['lang'], dVars['lang_name'], sfDictDst, bJavaScript, sDicName, sFilter, dVars['stemming_method'], int(dVars['fsa_method']))
    else:
        if sType == "extended":
            spfLexSrc = dVars['lexicon_extended_src']
            sfDictDst = dVars['dic_extended_filename']
            sDicName = dVars['dic_extended_name']
        elif sType == "community":
            spfLexSrc = dVars['lexicon_community_src']
            sfDictDst = dVars['dic_community_filename']
            sDicName = dVars['dic_community_name']
        elif sType == "personal":
            spfLexSrc = dVars['lexicon_personal_src']
            sfDictDst = dVars['dic_personal_filename']
            sDicName = dVars['dic_personal_name']
        lex_build.build(spfLexSrc, dVars['lang'], dVars['lang_name'], sfDictDst, bJavaScript, sDicName, "", dVars['stemming_method'], int(dVars['fsa_method']))



def main ():

    print("Python: " + sys.version)
    xParser = argparse.ArgumentParser()
    xParser.add_argument("lang", type=str, nargs='+', help="lang project to generate (name of folder in /lang)")
    xParser.add_argument("-b", "--build_data", help="launch build_data.py (part 1 and 2)", action="store_true")
    xParser.add_argument("-bb", "--build_data_before", help="launch build_data.py (only part 1: before dictionary building)", action="store_true")
    xParser.add_argument("-ba", "--build_data_after", help="launch build_data.py (only part 2: before dictionary building)", action="store_true")
    xParser.add_argument("-d", "--dict", help="generate FSA dictionary", action="store_true")







>


|
|
|
|



















>







342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
            file_util.copy_file(spfJSDic, "grammalecte-js/graphspell/_dictionaries")
            dVars['dic_'+sType+'_filename_js'] = sFileName + '.json'
    dVars['dic_main_filename_py'] = dVars['dic_default_filename_py'] + ".bdic"
    dVars['dic_main_filename_js'] = dVars['dic_default_filename_js'] + ".json"


def buildDictionary (dVars, sType, bJavaScript=False):
    "build binary dictionary for Graphspell from lexicons"
    if sType == "main":
        spfLexSrc = dVars['lexicon_src']
        lSfDictDst = dVars['dic_filenames'].split(",")
        lDicName = dVars['dic_name'].split(",")
        lFilter = dVars['dic_filter'].split(",")
        for sfDictDst, sDicName, sFilter in zip(lSfDictDst, lDicName, lFilter):
            lex_build.build(spfLexSrc, dVars['lang'], dVars['lang_name'], sfDictDst, bJavaScript, sDicName, sFilter, dVars['stemming_method'], int(dVars['fsa_method']))
    else:
        if sType == "extended":
            spfLexSrc = dVars['lexicon_extended_src']
            sfDictDst = dVars['dic_extended_filename']
            sDicName = dVars['dic_extended_name']
        elif sType == "community":
            spfLexSrc = dVars['lexicon_community_src']
            sfDictDst = dVars['dic_community_filename']
            sDicName = dVars['dic_community_name']
        elif sType == "personal":
            spfLexSrc = dVars['lexicon_personal_src']
            sfDictDst = dVars['dic_personal_filename']
            sDicName = dVars['dic_personal_name']
        lex_build.build(spfLexSrc, dVars['lang'], dVars['lang_name'], sfDictDst, bJavaScript, sDicName, "", dVars['stemming_method'], int(dVars['fsa_method']))



def main ():
    "build Grammalecte with requested options"
    print("Python: " + sys.version)
    xParser = argparse.ArgumentParser()
    xParser.add_argument("lang", type=str, nargs='+', help="lang project to generate (name of folder in /lang)")
    xParser.add_argument("-b", "--build_data", help="launch build_data.py (part 1 and 2)", action="store_true")
    xParser.add_argument("-bb", "--build_data_before", help="launch build_data.py (only part 1: before dictionary building)", action="store_true")
    xParser.add_argument("-ba", "--build_data_after", help="launch build_data.py (only part 2: before dictionary building)", action="store_true")
    xParser.add_argument("-d", "--dict", help="generate FSA dictionary", action="store_true")
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
                xArgs.add_extended_dictionary = False
            if not dVars["lexicon_community_src"]:
                xArgs.add_community_dictionary = False
            if not dVars["lexicon_personal_src"]:
                xArgs.add_personal_dictionary = False

            # build data
            build_data_module = None
            if xArgs.build_data_before or xArgs.build_data_after:
                # lang data
                try:
                    build_data_module = importlib.import_module("gc_lang."+sLang+".build_data")
                except ImportError:
                    print("# Error. Couldn’t import file build_data.py in folder gc_lang/"+sLang)
            if build_data_module and xArgs.build_data_before:
                build_data_module.before('gc_lang/'+sLang, dVars, xArgs.javascript)
            if xArgs.dict:
                buildDictionary(dVars, "main", xArgs.javascript)
                if xArgs.add_extended_dictionary:
                    buildDictionary(dVars, "extended", xArgs.javascript)
                if xArgs.add_community_dictionary:
                    buildDictionary(dVars, "community", xArgs.javascript)
                if xArgs.add_personal_dictionary:
                    buildDictionary(dVars, "personal", xArgs.javascript)
            if build_data_module and xArgs.build_data_after:
                build_data_module.after('gc_lang/'+sLang, dVars, xArgs.javascript)

            # copy dictionaries from Graphspell
            copyGraphspellDictionaries(dVars, xArgs.javascript, xArgs.add_extended_dictionary, xArgs.add_community_dictionary, xArgs.add_personal_dictionary)

            # make
            sVersion = create(sLang, xConfig, xArgs.install, xArgs.javascript, )








|



|


|
|








|
|







414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
                xArgs.add_extended_dictionary = False
            if not dVars["lexicon_community_src"]:
                xArgs.add_community_dictionary = False
            if not dVars["lexicon_personal_src"]:
                xArgs.add_personal_dictionary = False

            # build data
            databuild = None
            if xArgs.build_data_before or xArgs.build_data_after:
                # lang data
                try:
                    databuild = importlib.import_module("gc_lang."+sLang+".build_data")
                except ImportError:
                    print("# Error. Couldn’t import file build_data.py in folder gc_lang/"+sLang)
            if databuild and xArgs.build_data_before:
                databuild.before('gc_lang/'+sLang, dVars, xArgs.javascript)
            if xArgs.dict:
                buildDictionary(dVars, "main", xArgs.javascript)
                if xArgs.add_extended_dictionary:
                    buildDictionary(dVars, "extended", xArgs.javascript)
                if xArgs.add_community_dictionary:
                    buildDictionary(dVars, "community", xArgs.javascript)
                if xArgs.add_personal_dictionary:
                    buildDictionary(dVars, "personal", xArgs.javascript)
            if databuild and xArgs.build_data_after:
                databuild.after('gc_lang/'+sLang, dVars, xArgs.javascript)

            # copy dictionaries from Graphspell
            copyGraphspellDictionaries(dVars, xArgs.javascript, xArgs.add_extended_dictionary, xArgs.add_community_dictionary, xArgs.add_personal_dictionary)

            # make
            sVersion = create(sLang, xConfig, xArgs.install, xArgs.javascript, )

444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
                    if xArgs.tests:
                        xTestSuite = unittest.TestLoader().loadTestsFromModule(tests)
                        unittest.TextTestRunner().run(xTestSuite)
                    if xArgs.perf or xArgs.perf_memo:
                        hDst = open("./gc_lang/"+sLang+"/perf_memo.txt", "a", encoding="utf-8", newline="\n")  if xArgs.perf_memo  else None
                        tests.perf(sVersion, hDst)

            # Firefox
            if False:
                # obsolete
                with helpers.cd("_build/xpi/"+sLang):
                    spfFirefox = dVars['win_fx_dev_path']  if platform.system() == "Windows"  else dVars['linux_fx_dev_path']
                    os.system('jpm run -b "' + spfFirefox + '"')

            if xArgs.web_ext or xArgs.firefox:
                with helpers.cd("_build/webext/"+sLang):
                    if xArgs.lint_web_ext:
                        os.system(r'web-ext lint -o text')
                    if xArgs.firefox:
                        # Firefox Developper edition







|
|
<
|
|
|







457
458
459
460
461
462
463
464
465

466
467
468
469
470
471
472
473
474
475
                    if xArgs.tests:
                        xTestSuite = unittest.TestLoader().loadTestsFromModule(tests)
                        unittest.TextTestRunner().run(xTestSuite)
                    if xArgs.perf or xArgs.perf_memo:
                        hDst = open("./gc_lang/"+sLang+"/perf_memo.txt", "a", encoding="utf-8", newline="\n")  if xArgs.perf_memo  else None
                        tests.perf(sVersion, hDst)

            # Firefox (obsolete)
            #if False:

            #    with helpers.cd("_build/xpi/"+sLang):
            #        spfFirefox = dVars['win_fx_dev_path']  if platform.system() == "Windows"  else dVars['linux_fx_dev_path']
            #        os.system('jpm run -b "' + spfFirefox + '"')

            if xArgs.web_ext or xArgs.firefox:
                with helpers.cd("_build/webext/"+sLang):
                    if xArgs.lint_web_ext:
                        os.system(r'web-ext lint -o text')
                    if xArgs.firefox:
                        # Firefox Developper edition

Modified misc/grammalecte.sublime-syntax from [f7dfed6343] to [dcffb60da8].

22
23
24
25
26
27
28













29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60












61
62
63
64
65
66
67
    - match: '\b(-)?[0-9.]+\b'
      scope: constant.numeric

    # Bookmarks
    - match: '^!!.*|^\[\+\+\].*'
      scope: bookmark














    # Keywords are if, else.
    # Note that blackslashes don't need to be escaped within single quoted
    # strings in YAML. When using single quoted strings, only single quotes
    # need to be escaped: this is done by using two single quotes next to each
    # other.
    - match: '\b(?:if|else|and|or|not|in)\b'
      scope: keyword.python

    - match: '\b(?:True|False|None)\b'
      scope: constant.language
    
    - match: '\b(?:spell|morph|morphex|stem|textarea0?\w*|before0?\w*|after0?\w*|word|option|define|select|exclude|analysex?|apposition|is[A-Z]\w+|rewriteSubject|checkD\w+|getD\w+|has[A-Z]\w+|sugg[A-Z]\w+|switch[A-Z]\w+|ceOrCet|formatN\w+|mbUnit)\b'
      scope: entity.name.function

    - match: '\b(?:replace|endswith|startswith|search|upper|lower|capitalize|strip|rstrip|is(?:upper|lower|digit|title))\b'
      scope: support.function

    - match: '\becho\b'
      scope: support.function.debug

    - match: '\bre\b'
      scope: support.class

    # Rule options
    - match: '^__[\[<]([isu])[\]>](/\w+|)(\(\w+\)|)(![0-9]|)__|</?js>'
      scope: rule.options
      captures:
        1: rule.casing
        2: rule.optionname
        3: rule.rulename
        4: rule.priority













    # Definitions and options
    - match: '^OPT(?:GROUP|LANG|PRIORITY)/|^OPTSOFTWARE:'
      scope: options.command

    - match: '^OPT(?:LABEL|)/'
      scope: options.parameter








>
>
>
>
>
>
>
>
>
>
>
>
>











|











|








>
>
>
>
>
>
>
>
>
>
>
>







22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
    - match: '\b(-)?[0-9.]+\b'
      scope: constant.numeric

    # Bookmarks
    - match: '^!!.*|^\[\+\+\].*'
      scope: bookmark

    # Bookmarks
    - match: '^GRAPH_NAME:.*'
      scope: bookmark

    # Graph
    - match: '^@@@@GRAPH: *(\w+) *'
      scope: graphline
      captures:
        1: string.graphname

    - match: '^@@@@(?:END_GRAPH *| *)'
      scope: graphline

    # Keywords are if, else.
    # Note that blackslashes don't need to be escaped within single quoted
    # strings in YAML. When using single quoted strings, only single quotes
    # need to be escaped: this is done by using two single quotes next to each
    # other.
    - match: '\b(?:if|else|and|or|not|in)\b'
      scope: keyword.python

    - match: '\b(?:True|False|None)\b'
      scope: constant.language
    
    - match: '\b(?:spell|morph|morphex|stem|textarea0?\w*|before0?\w*|after0?\w*|word|option|define|select|exclude|analysex?|tag_|apposition|is[A-Z]\w+|rewriteSubject|checkD\w+|getD\w+|has[A-Z]\w+|sugg[A-Z]\w+|switch[A-Z]\w+|ceOrCet|formatN\w+|mbUnit)\b'
      scope: entity.name.function

    - match: '\b(?:replace|endswith|startswith|search|upper|lower|capitalize|strip|rstrip|is(?:upper|lower|digit|title))\b'
      scope: support.function

    - match: '\becho\b'
      scope: support.function.debug

    - match: '\bre\b'
      scope: support.class

    # Regex rule option
    - match: '^__[\[<]([isu])[\]>](/\w+|)(\(\w+\)|)(![0-9]|)__|</?js>'
      scope: rule.options
      captures:
        1: rule.casing
        2: rule.optionname
        3: rule.rulename
        4: rule.priority

    # Graph rules option
    - match: '^__(\w+)(![0-9]|)__'
      scope: rule.options
      captures:
        1: rule.rulename2
        2: rule.priority

    - match: '/(\w+)/'
      scope: rule.options
      captures:
        1: rule.optionname

    # Definitions and options
    - match: '^OPT(?:GROUP|LANG|PRIORITY)/|^OPTSOFTWARE:'
      scope: options.command

    - match: '^OPT(?:LABEL|)/'
      scope: options.parameter

82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98




























99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
    # rule delimiters
    - match: '<<-|>>>'
      scope: keyword.action
    - match: '__also__'
      scope: keyword.condition.green
    - match: '__else__'
      scope: keyword.condition.red
    - match: '-(\d*)>>'
      scope: keyword.error
      captures:
        1: keyword.error.group
    - match: '~(\d*)>>'
      scope: keyword.textprocessor
      captures:
        1: keyword.textprocessor.group
    - match: '=>>'
      scope: keyword.disambiguator





























    # Escaped chars
    - match: '\\(?:\d+|w|d|b|n|s|t)'
      scope: constant.character.escape

    # URL
    - match: '\| ?https?://[\w./%?&=#+-]+'
      scope: string.other

    # Example errors
    - match: '{{.+?}}'
      scope: message.error

    # special chars
    - match: '[@=*^?!:+<>]'
      scope: keyword.other

    - match: '\(\?(?:[:=!]|<!)|[(|)]'
      scope: keyword.parenthesis

    - match: '\[|[]{}]'
      scope: keyword.brackets







|



|





>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>














|







107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
    # rule delimiters
    - match: '<<-|>>>'
      scope: keyword.action
    - match: '__also__'
      scope: keyword.condition.green
    - match: '__else__'
      scope: keyword.condition.red
    - match: '-(\d*(?::\d+|))>>'
      scope: keyword.error
      captures:
        1: keyword.error.group
    - match: '~(\d*(?::\d+|))>>'
      scope: keyword.textprocessor
      captures:
        1: keyword.textprocessor.group
    - match: '=>>'
      scope: keyword.disambiguator
    - match: '/(\d*)>>'
      scope: keyword.tag
      captures:
        1: keyword.tag.group


    # Tokens
    - match: '(>)\w+'
      scope: string.lemma
      captures:
        1: keyword.valid

    - match: '(~)(?!(?:\d+(?::\d+|)|)>>)[^\s]+'
      scope: string.regex
      captures:
        1: keyword.valid

    - match: '(@)([^@][^\s¬]+)'
      scope: string.morph
      captures:
        1: keyword.valid
        2: string.morph.pattern

    - match: '(¬)(\S+)'
      scope: string.morph
      captures:
        1: keyword.invalid
        2: string.morph.antipattern  

    # Escaped chars
    - match: '\\(?:\d+|w|d|b|n|s|t)'
      scope: constant.character.escape

    # URL
    - match: '\| ?https?://[\w./%?&=#+-]+'
      scope: string.other

    # Example errors
    - match: '{{.+?}}'
      scope: message.error

    # special chars
    - match: '[@=*^?¿!:+<>~]'
      scope: keyword.other

    - match: '\(\?(?:[:=!]|<!)|[(|)]'
      scope: keyword.parenthesis

    - match: '\[|[]{}]'
      scope: keyword.brackets

Modified misc/grammalecte.tmTheme from [7305de87f8] to [76e7f53b09].

64
65
66
67
68
69
70















71
72
73
74
75
76
77
			<dict>
				<key>foreground</key>
				<string>#A0F0FF</string>
				<key>background</key>
				<string>#0050A0</string>
			</dict>
		</dict>















		<dict>
			<key>name</key>
			<string>String</string>
			<key>scope</key>
			<string>string</string>
			<key>settings</key>
			<dict>







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
			<dict>
				<key>foreground</key>
				<string>#A0F0FF</string>
				<key>background</key>
				<string>#0050A0</string>
			</dict>
		</dict>
		<dict>
			<key>name</key>
			<string>Graphline</string>
			<key>scope</key>
			<string>graphline</string>
			<key>settings</key>
			<dict>
				<key>foreground</key>
				<string>hsl(0, 100%, 80%)</string>
				<key>background</key>
				<string>hsl(0, 100%, 20%</string>
				<key>fontStyle</key>
				<string>bold</string>
			</dict>
		</dict>
		<dict>
			<key>name</key>
			<string>String</string>
			<key>scope</key>
			<string>string</string>
			<key>settings</key>
			<dict>
231
232
233
234
235
236
237






























238
239
240
241
242
243
244
				<string>#F0F060</string>
				<key>background</key>
				<string>#602020</string>
				<key>fontStyle</key>
				<string>bold</string>
			</dict>
		</dict>






























		<dict>
			<key>name</key>
			<string>Keyword textprocessor</string>
			<key>scope</key>
			<string>keyword.textprocessor</string>
			<key>settings</key>
			<dict>







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
				<string>#F0F060</string>
				<key>background</key>
				<string>#602020</string>
				<key>fontStyle</key>
				<string>bold</string>
			</dict>
		</dict>
		<dict>
			<key>name</key>
			<string>Keyword tag</string>
			<key>scope</key>
			<string>keyword.tag</string>
			<key>settings</key>
			<dict>
				<key>foreground</key>
				<string>#FF70FF</string>
				<key>background</key>
				<string>#602060</string>
				<key>fontStyle</key>
				<string>bold</string>
			</dict>
		</dict>
		<dict>
			<key>name</key>
			<string>Keyword tag group</string>
			<key>scope</key>
			<string>keyword.tag.group</string>
			<key>settings</key>
			<dict>
				<key>foreground</key>
				<string>#F0B0F0</string>
				<key>background</key>
				<string>#602060</string>
				<key>fontStyle</key>
				<string>bold</string>
			</dict>
		</dict>
		<dict>
			<key>name</key>
			<string>Keyword textprocessor</string>
			<key>scope</key>
			<string>keyword.textprocessor</string>
			<key>settings</key>
			<dict>
289
290
291
292
293
294
295































296
297
298
299
300
301
302
			<key>settings</key>
			<dict>
				<key>foreground</key>
				<string>#A0A0A0</string>
			</dict>
		</dict>
































		<dict>
			<key>name</key>
			<string>Rule options</string>
			<key>scope</key>
			<string>rule.options</string>
			<key>settings</key>
			<dict>







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
			<key>settings</key>
			<dict>
				<key>foreground</key>
				<string>#A0A0A0</string>
			</dict>
		</dict>

		<dict>
			<key>name</key>
			<string>Keyword Valid</string>
			<key>scope</key>
			<string>keyword.valid</string>
			<key>settings</key>
			<dict>
				<key>fontStyle</key>
				<string>bold</string>
				<key>foreground</key>
				<string>hsl(150, 100%, 80%)</string>
				<key>background</key>
				<string>hsl(150, 100%, 20%)</string>
			</dict>
		</dict>
		<dict>
			<key>name</key>
			<string>Keyword Invalid</string>
			<key>scope</key>
			<string>keyword.invalid</string>
			<key>settings</key>
			<dict>
				<key>fontStyle</key>
				<string>bold</string>
				<key>foreground</key>
				<string>hsl(0, 100%, 80%)</string>
				<key>background</key>
				<string>hsl(0, 100%, 20%)</string>
			</dict>
		</dict>

		<dict>
			<key>name</key>
			<string>Rule options</string>
			<key>scope</key>
			<string>rule.options</string>
			<key>settings</key>
			<dict>
342
343
344
345
346
347
348











349
350
351
352
353
354
355
356
357
358
359
360





















































361
362
363
364
365
366
367
			<dict>
				<key>fontStyle</key>
				<string>italic</string>
				<key>foreground</key>
				<string>#A0A0A0</string>
			</dict>
		</dict>











		<dict>
			<key>name</key>
			<string>Rule priority</string>
			<key>scope</key>
			<string>rule.priority</string>
			<key>settings</key>
			<dict>
				<key>foreground</key>
				<string>#F06060</string>
			</dict>
		</dict>
		






















































		<dict>
			<key>name</key>
			<string>JavaScript Dollar</string>
			<key>scope</key>
			<string>variable.other.dollar.only.js</string>
			<key>settings</key>







>
>
>
>
>
>
>
>
>
>
>












>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
			<dict>
				<key>fontStyle</key>
				<string>italic</string>
				<key>foreground</key>
				<string>#A0A0A0</string>
			</dict>
		</dict>
		<dict>
			<key>name</key>
			<string>Rule name</string>
			<key>scope</key>
			<string>rule.rulename2</string>
			<key>settings</key>
			<dict>
				<key>foreground</key>
				<string>#F0D080</string>
			</dict>
		</dict>
		<dict>
			<key>name</key>
			<string>Rule priority</string>
			<key>scope</key>
			<string>rule.priority</string>
			<key>settings</key>
			<dict>
				<key>foreground</key>
				<string>#F06060</string>
			</dict>
		</dict>
		
		<dict>
			<key>name</key>
			<string>String lemma</string>
			<key>scope</key>
			<string>string.lemma</string>
			<key>settings</key>
			<dict>
				<key>foreground</key>
				<string>hsl(210, 100%, 80%)</string>
				<key>background</key>
				<string>hsl(210, 100%, 15%)</string>
			</dict>
		</dict>
		<dict>
			<key>name</key>
			<string>String regex</string>
			<key>scope</key>
			<string>string.regex</string>
			<key>settings</key>
			<dict>
				<key>foreground</key>
				<string>hsl(60, 100%, 80%)</string>
				<key>background</key>
				<string>hsl(60, 100%, 10%)</string>
			</dict>
		</dict>
		<dict>
			<key>name</key>
			<string>String morph pattern</string>
			<key>scope</key>
			<string>string.morph.pattern</string>
			<key>settings</key>
			<dict>
				<key>foreground</key>
				<string>hsl(150, 80%, 90%)</string>
				<key>background</key>
				<string>hsl(150, 80%, 10%)</string>
			</dict>
		</dict>
		<dict>
			<key>name</key>
			<string>String morph antipattern</string>
			<key>scope</key>
			<string>string.morph.antipattern</string>
			<key>settings</key>
			<dict>
				<key>foreground</key>
				<string>hsl(0, 80%, 90%)</string>
				<key>background</key>
				<string>hsl(0, 80%, 10%)</string>
			</dict>
		</dict>


		<dict>
			<key>name</key>
			<string>JavaScript Dollar</string>
			<key>scope</key>
			<string>variable.other.dollar.only.js</string>
			<key>settings</key>