Grammalecte Diff

Differences From Artifact [7fe275114e] To Artifact [b66a405bcd]:

# Thesaurus builder

import os
import re
import json


def readFile (spf):
    if os.path.isfile(spf):
        with open(spf, "r", encoding="utf-8") as hSrc:
            for sLine in hSrc:
                yield sLine.strip()
    else:
        print("# Error. File not found or not loadable: " + spf)


class ThesaurusBuilder ():

    def __init__ (self):
        # synsets
        self.dSynEntry = {}     # {sWord: [(sPOS, iSynset), ...]}
        self.dSynset = {}       # {iSynset: lSynset}
        # thesaurus
        self.dThesaurus = {}    # {sWord: [(sPOS, lWord), ...]}

    def readSynsets (self, spf):
        if not spf:
            return
        for i, sLine in enumerate(readFile(spf), 1):
            sPOS, *lSynset = sLine.split("|")
            lSynset = self._removeDuplicatesFrom(lSynset)

[... unchanged lines elided ...]

            sLine = sLine.strip()
            if re.search(r"^[^|]+\|[1-9][0-9]*$", sLine):
                # new entry
                if nClass != nClassFound:
                    print("Ligne:", iEntryLine, ", nombre de liste incorrect")
                iEntryLine = i
                sEntry, sNum = sLine.split("|")
                self.dThesaurus[sEntry] = []
                nClass = int(sNum)
                nClassFound = 0
            else:
                # new list of synonyms
                nClassFound += 1
                sPOS, *lClass = sLine.split("|")
                lClass = self._removeDuplicatesFrom(lClass)
                self.dThesaurus[sEntry].append( (sPOS, lClass) )
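
    # Illustration (not part of the diff): readThesaurus expects each entry as a
    # "word|N" line followed by exactly N synonym classes, one "POS|syn1|syn2|..."
    # line per class. A hypothetical sample input:
    #   rapide|2
    #   (adj)|prompt|expéditif
    #   (adj)|véloce|leste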

    def showThesaurusEntries (self):
        for sWord, lClass in self.dThesaurus.items():
            for sPOS, lWord in lClass:
                print(sWord, sPOS, "|".join(lWord))

    def _removeDuplicatesFrom (self, lWord):
        return [ sWord.strip()  for sWord  in dict.fromkeys(lWord) ]  # remove duplicates: use <dict.fromkeys()> instead of <set()> to keep order

    def merge (self):
        for sWord, lSynset in self.dSynEntry.items():
            for sPOS, iSynset in lSynset:
                if sWord in self.dThesaurus:
                    self.dThesaurus[sWord].append( (sPOS, self.dSynset[iSynset]) )
                else:
                    self.dThesaurus[sWord] = [ (sPOS, self.dSynset[iSynset]) ]

    def write (self, spDest):
        nOffset = 0     # offsets are counted in bytes of the UTF-8 encoded text, so they stay valid for file.seek()
        dOffset = {}
        with open(spDest + "/thes_fr.dat", "w", encoding="utf-8", newline="\n") as hThes:
            sHeader = "UTF-8\n"
            hThes.write(sHeader)
            nOffset = len(sHeader.encode("utf-8"))
            for sWord, lClass in self.dThesaurus.items():
                dOffset[sWord] = nOffset
                sWordLine = sWord+"|"+str(len(lClass))+"\n"
                hThes.write(sWordLine)
                nOffset += len(sWordLine.encode("utf-8"))
                for sPOS, lWord in lClass:
                    sClassLine = sPOS+"|"+"|".join(lWord)+"\n"
                    hThes.write(sClassLine)
                    nOffset += len(sClassLine.encode("utf-8"))
        with open(spDest + "/thes_fr.idx", "w", encoding="utf-8", newline="\n") as hIndex:
            hIndex.write("UTF-8\n")
            hIndex.write(str(len(self.dThesaurus))+"\n")
            for sWord, nOffset in sorted(dOffset.items()):
                hIndex.write(sWord+"|"+str(nOffset)+"\n")
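
    # Illustration (not part of the diff): for the hypothetical "rapide" entry
    # above, write() produces a MyThes-style pair of files. thes_fr.dat:
    #   UTF-8
    #   rapide|2
    #   (adj)|prompt|expéditif
    #   (adj)|véloce|leste
    # and thes_fr.idx:
    #   UTF-8
    #   1
    #   rapide|6
    # where 6 is the byte offset of the "rapide|2" line ("UTF-8\n" is 6 bytes).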

    def writeAsJSON (self, spDest):
        with open(spDest + "/thes_fr.json", "w", encoding="utf-8", newline="\n") as hJsonThes:
            sJSON = json.dumps(self.dThesaurus, ensure_ascii=False)
            hJsonThes.write(sJSON)
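
    # Illustration (not part of the diff): the JSON mirrors dThesaurus, with the
    # (sPOS, lWord) tuples serialized as arrays, e.g. for the hypothetical entry:
    #   {"rapide": [["(adj)", ["prompt", "expéditif"]], ["(adj)", ["véloce", "leste"]]]}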


def build (spfThesaurus="", spfSynsets="", spDest="_build"):
    oThes = ThesaurusBuilder()
    oThes.readSynsets(spfSynsets)
    #oThes.showSynsetEntries()
    oThes.readThesaurus(spfThesaurus)
    #oThes.showThesaurusEntries()
    oThes.merge()
    oThes.write(spDest)
    oThes.writeAsJSON(spDest)


if __name__ == '__main__':
    build("thesaurus/thes_fr.dat", "thesaurus/synsets_fr.dat")