#!python3
# Just a file for one-shot scripts

import os
import sys
import re

import grammalecte.ibdawg as ibdawg

oDict = ibdawg.IBDAWG("French.bdic")
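# oDict is the compiled French dictionary; the one-shot scripts below rely on
# its isValid() and stem() lookups.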


def readFile (spf):
    "generator: yield the lines of the text file <spf>, if it exists"
    if os.path.isfile(spf):
        with open(spf, "r", encoding="utf-8") as hSrc:
            for sLine in hSrc:
                yield sLine
    else:
        print("# Error: file not found.")


# --------------------------------------------------------------------------------------------------

def listUnknownWords (spf):
    "write in <spf>.res.txt every word of <spf> unknown to the dictionary"
    with open(spf+".res.txt", "w", encoding="utf-8") as hDst:
        for sLine in readFile(spf):
            sLine = sLine.strip()
            if sLine:
                for sWord in sLine.split():
                    if not oDict.isValid(sWord):
                        hDst.write(sWord+"\n")
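
# e.g. listUnknownWords("corpus.txt") would write the unknown words of a
# (hypothetical) corpus.txt into corpus.txt.res.txt, one per line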

# --------------------------------------------------------------------------------------------------

def createLexStatFile (spf, dStat):
    dWord = {}
    for i, sLine in enumerate(readFile(spf)):
        if not sLine.startswith("#"):
            sWord = sLine.strip()
            if sWord not in dWord:
                dWord[sWord] = i
    # … (the rest of this function is missing from the source;
    # presumably it writes each word of <dWord> with its count from <dStat>)


def readStatFile (spf, dStat):
    "add to <dStat> the counts read from <spf> (one <word count> pair per line)"
    for sLine in readFile(spf):
        if not sLine.startswith("#"):
            sWord, sCount = sLine.split()
            dStat[sWord] = dStat.get(sWord, 0) + int(sCount)
    return dStat
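
# one-shot task: merge the counts of the stat files and recompute the frequency
# of each word proposed in propositions.txt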
def readStatFilesAndCreateLexicon ():
    dStat = {}
    readStatFile("stats1.txt", dStat)
    readStatFile("stats2.txt", dStat)
    readStatFile("stats3.txt", dStat)
    createLexStatFile("propositions.txt", dStat)

# --------------------------------------------------------------------------------------------------

def isMoreThanOneSetInList (lSet):
    "return True if <lSet> contains at least two different sets (nota: pops its first element)"
    aFirst = lSet.pop(0)
    for aSet in lSet:
        if aSet != aFirst:
            return True
    return False

def filterLinesWithWordsWithDifferentStems (spf):
    "write in <spf>.res.txt the lines of <spf> whose words don't all share the same stems"
    with open(spf+".res.txt", "w", encoding="utf-8") as hDst:
        for sLine in readFile(spf):
            lStemSet = [ set(oDict.stem(sWord))  for sWord in sLine.strip().split() ]
            if isMoreThanOneSetInList(lStemSet):
                hDst.write(sLine)
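
# e.g. a line such as "ver verre vers vert" is kept: the words sound alike
# but have different stems, so the line deserves a review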

def filterHomophonicWords ():
    filterLinesWithWordsWithDifferentStems("homophones.txt")

# --------------------------------------------------------------------------------------------------

if __name__ == '__main__':
    filterHomophonicWords()
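    # replace the call above with whichever one-shot task is needed
    # (listUnknownWords, readStatFilesAndCreateLexicon, …)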