# Thesaurus builder
import os
import re
import json


def readFile (spf):
    if os.path.isfile(spf):
        with open(spf, "r", encoding="utf-8") as hSrc:
            for sLine in hSrc:
                yield sLine.strip()
    else:
        print("# Error. File not found or not loadable: " + spf)


class ThesaurusBuilder ():

    def __init__ (self):
        # synsets
        self.dSynEntry = {}     # {sWord: [(sPOS, iSynset), ...]}
        self.dSynset = {}       # {iSynset: lSynset}
        # thesaurus
        self.dThesaurus = {}    # {sWord: [(sPOS, lWord), ...]}

    def readSynsets (self, spf):
        # one synset per line: "POS|word1|word2|..."
        if not spf:
            return
        for i, sLine in enumerate(readFile(spf), 1):
            sPOS, *lSynset = sLine.split("|")
            lSynset = self._removeDuplicatesFrom(lSynset)
            # store the synset under its line number and index each word by (POS, synset id)
            self.dSynset[i] = lSynset
            for sWord in lSynset:
                self.dSynEntry.setdefault(sWord, []).append( (sPOS, i) )

    def showSynsetEntries (self):
        for sWord, lSynset in self.dSynEntry.items():
            for sPOS, iSynset in lSynset:
                print(sWord, sPOS, "|".join(self.dSynset[iSynset]))
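
    # Illustrative example (hypothetical data): after reading the synset line
    # "noun|automobile|voiture" as synset 1, the indexes hold:
    #   dSynset   == {1: ["automobile", "voiture"]}
    #   dSynEntry == {"automobile": [("noun", 1)], "voiture": [("noun", 1)]}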

    def readThesaurus (self, spf):
        # an entry line "word|n" followed by n lines "POS|synonym1|synonym2|..."
        if not spf:
            return
        sEntry = ""
        iEntryLine = 0
        nClass = 0
        nClassFound = 0
        for i, sLine in enumerate(readFile(spf), 1):
            sLine = sLine.strip()
            if re.search(r"^[^|]+\|[1-9][0-9]*$", sLine):
                # new entry: "word|number_of_synonym_lists"
                if nClass != nClassFound:
                    print("Line:", iEntryLine, ", incorrect number of lists")
                iEntryLine = i
                sEntry, sNum = sLine.split("|")
                self.dThesaurus[sEntry] = []
                nClass = int(sNum)
                nClassFound = 0
            else:
                # new list of synonyms
                nClassFound += 1
                sPOS, *lClass = sLine.split("|")
                lClass = self._removeDuplicatesFrom(lClass)
                self.dThesaurus[sEntry].append( (sPOS, lClass) )
        if nClass != nClassFound:
            # also check the last entry (the in-loop check only fires when the next entry starts)
            print("Line:", iEntryLine, ", incorrect number of lists")

    def showThesaurusEntries (self):
        for sWord, lClass in self.dThesaurus.items():
            for sPOS, lWord in lClass:
                print(sWord, sPOS, "|".join(lWord))

    def _removeDuplicatesFrom (self, lWord):
        # use <dict.fromkeys()> instead of <set()> to remove duplicates while keeping order
        return [ sWord.strip() for sWord in dict.fromkeys(lWord) ]
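
    # note: dict.fromkeys(["a", "b", "a"]) builds {"a": None, "b": None}, so
    # iterating over its keys yields each word once, in first-seen order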

    def merge (self):
        for sWord, lSynset in self.dSynEntry.items():
            for sPOS, iSynset in lSynset:
                if sWord in self.dThesaurus:
                    self.dThesaurus[sWord].append( (sPOS, self.dSynset[iSynset]) )
                else:
                    self.dThesaurus[sWord] = [ (sPOS, self.dSynset[iSynset]) ]
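
    # Illustrative example (hypothetical data): if dThesaurus already maps
    #   "voiture" -> [("noun", ["auto", "bagnole"])]
    # and "voiture" belongs to synset 1 above, merge() extends the entry to
    #   "voiture" -> [("noun", ["auto", "bagnole"]), ("noun", ["automobile", "voiture"])]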

    def write (self, spDest):
        nOffset = 0     # offsets are byte positions, hence line lengths are measured on the UTF-8 encoded text
        dOffset = {}
        with open(spDest + "/thes_fr.dat", "w", encoding="utf-8", newline="\n") as hThes:
            sHeader = "UTF-8\n"
            hThes.write(sHeader)
            nOffset = len(sHeader.encode("utf-8"))
            for sWord, lClass in self.dThesaurus.items():
                dOffset[sWord] = nOffset
                sWordLine = sWord + "|" + str(len(lClass)) + "\n"
                hThes.write(sWordLine)
                nOffset += len(sWordLine.encode("utf-8"))
                for sPOS, lWord in lClass:
                    sClassLine = sPOS + "|" + "|".join(lWord) + "\n"
                    hThes.write(sClassLine)
                    nOffset += len(sClassLine.encode("utf-8"))
        with open(spDest + "/thes_fr.idx", "w", encoding="utf-8", newline="\n") as hIndex:
            hIndex.write("UTF-8\n")
            hIndex.write(str(len(self.dThesaurus)) + "\n")
            for sWord, nOffset in sorted(dOffset.items()):
                hIndex.write(sWord + "|" + str(nOffset) + "\n")

    def writeAsJSON (self, spDest):
        with open(spDest + "/thes_fr.json", "w", encoding="utf-8", newline="\n") as hJsonThes:
            sJSON = json.dumps(self.dThesaurus, ensure_ascii=False)
            hJsonThes.write(sJSON)
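
    # note: json.dumps serializes the (sPOS, lWord) tuples as JSON arrays, so an
    # entry comes out as, e.g. (illustrative data): {"voiture": [["noun", ["auto", "bagnole"]]]}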


def build (spfThesaurus="", spfSynsets="", spDest="_build"):
    os.makedirs(spDest, exist_ok=True)      # make sure the destination folder exists before writing
    oThes = ThesaurusBuilder()
    oThes.readSynsets(spfSynsets)
    #oThes.showSynsetEntries()
    oThes.readThesaurus(spfThesaurus)
    #oThes.showThesaurusEntries()
    oThes.merge()
    oThes.write(spDest)
    oThes.writeAsJSON(spDest)


if __name__ == '__main__':
    build("thesaurus/thes_fr.dat", "thesaurus/synsets_fr.dat")