Grammalecte  Diff

Differences From Artifact [7fe275114e]:

To Artifact [b66a405bcd]:


1
2
3
4

5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
# Thesaurus builder

import os
import re



def readFile (spf):
    """Yield each line of the UTF-8 text file <spf>, stripped of surrounding whitespace.

    When <spf> is not an existing file, an error message is printed and nothing is yielded.
    This is a generator: the check and the message only happen once iteration starts.
    """
    if not os.path.isfile(spf):
        print("# Error. File not found or not loadable: " + spf)
        return
    with open(spf, "r", encoding="utf-8") as hSrc:
        yield from map(str.strip, hSrc)



class ThesaurusBuilder ():

    def __init__ (self):
        # synsets
        self.dSynEntry = {}     # {sWord: iSynset}
        self.dSynset = {}       # {iSynset: lSynset}
        # thesaurus
        self.dThesEntry = {}    # {sWord: lWord}

    def readSynsets (self, spf):
        if not spf:
            return
        for i, sLine in enumerate(readFile(spf), 1):
            sPOS, *lSynset = sLine.split("|")
            lSynset = self._removeDuplicatesFrom(lSynset)




>



















|







1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
# Thesaurus builder

import os
import re
import json


def readFile (spf):
    """Yield each line of the UTF-8 text file <spf>, stripped of surrounding whitespace.

    When <spf> is not an existing file, an error message is printed and nothing is yielded.
    This is a generator: the check and the message only happen once iteration starts.
    """
    if not os.path.isfile(spf):
        print("# Error. File not found or not loadable: " + spf)
        return
    with open(spf, "r", encoding="utf-8") as hSrc:
        yield from map(str.strip, hSrc)



class ThesaurusBuilder ():

    def __init__ (self):
        # synsets
        self.dSynEntry = {}     # {sWord: iSynset}
        self.dSynset = {}       # {iSynset: lSynset}
        # thesaurus
        self.dThesaurus = {}    # {sWord: lWord}

    def readSynsets (self, spf):
        if not spf:
            return
        for i, sLine in enumerate(readFile(spf), 1):
            sPOS, *lSynset = sLine.split("|")
            lSynset = self._removeDuplicatesFrom(lSynset)
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110





111
112
113
114
115
116
117
118
119
120

121
122
123
124
            sLine = sLine.strip()
            if re.search(r"^[^|]+\|[1-9][0-9]*$", sLine):
                # new entry
                if nClass != nClassFound:
                    print("Ligne:", iEntryLine, ", nombre de liste incorrect")
                iEntryLine = i
                sEntry, sNum = sLine.split("|")
                self.dThesEntry[sEntry] = []
                nClass = int(sNum)
                nClassFound = 0
            else:
                # new list of synonyms
                nClassFound += 1
                sPOS, *lClass = sLine.split("|")
                lClass = self._removeDuplicatesFrom(lClass)
                self.dThesEntry[sEntry].append( (sPOS, lClass) )

    def showThesaurusEntries (self):
        for sWord, lClass in self.dThesEntry.items():
            for sPOS, lWord in lClass:
                print(sWord, sPOS, "|".join(lWord))

    def _removeDuplicatesFrom (self, lWord):
        return [ sWord.strip()  for sWord  in dict.fromkeys(lWord) ]  # remove duplicates: use <dict.fromkeys()> instead of <set()> to keep order

    def merge (self):
        for sWord, lSynset in self.dSynEntry.items():
            for sPOS, iSynset in lSynset:
                if sWord in self.dThesEntry:
                    self.dThesEntry[sWord].append( (sPOS, self.dSynset[iSynset]) )
                else:
                    self.dThesEntry[sWord] = [ (sPOS, self.dSynset[iSynset]) ]

    def write (self, spDest):
        nOffset = 0     # the offset for finding data is the number of bytes (-> encoding("utf-8"))
        dOffset = {}
        with open(spDest + "/thes_fr.dat", "w", encoding="utf-8", newline="\n") as hThes:
            sHeader = "UTF-8\n"
            hThes.write(sHeader)
            nOffset = len(sHeader.encode("utf-8"))
            for sWord, lClass in self.dThesEntry.items():
                dOffset[sWord] = nOffset
                sWordLine = sWord+"|"+str(len(lClass))+"\n"
                hThes.write(sWordLine)
                nOffset += len(sWordLine.encode("utf-8"))
                for sPOS, lWord in lClass:
                    sClassLine = sPOS+"|"+"|".join(lWord)+"\n"
                    hThes.write(sClassLine)
                    nOffset += len(sClassLine.encode("utf-8"))
        with open(spDest + "/thes_fr.idx", "w", encoding="utf-8", newline="\n") as hIndex:
            hIndex.write("UTF-8\n")
            hIndex.write(str(len(self.dThesEntry))+"\n")
            for sWord, nOffset in sorted(dOffset.items()):
                hIndex.write(sWord+"|"+str(nOffset)+"\n")







def build (spfThesaurus="", spfSynsets="", spDest="_build"):
    """Build the thesaurus files in folder <spDest>.

    The synsets file <spfSynsets> is read first, then the thesaurus file <spfThesaurus>;
    the synsets are merged into the thesaurus entries and the result is written in <spDest>.
    """
    oBuilder = ThesaurusBuilder()
    oBuilder.readSynsets(spfSynsets)
    # debug: oBuilder.showSynsetEntries()
    oBuilder.readThesaurus(spfThesaurus)
    # debug: oBuilder.showThesaurusEntries()
    oBuilder.merge()
    oBuilder.write(spDest)



# When run as a script: build from the default source files, into the default destination folder.
if __name__ == "__main__":
    build(spfThesaurus="thesaurus/thes_fr.dat", spfSynsets="thesaurus/synsets_fr.dat")







|







|


|









|
|

|








|










|


>
>
>
>
>










>




58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
            sLine = sLine.strip()
            if re.search(r"^[^|]+\|[1-9][0-9]*$", sLine):
                # new entry
                if nClass != nClassFound:
                    print("Ligne:", iEntryLine, ", nombre de liste incorrect")
                iEntryLine = i
                sEntry, sNum = sLine.split("|")
                self.dThesaurus[sEntry] = []
                nClass = int(sNum)
                nClassFound = 0
            else:
                # new list of synonyms
                nClassFound += 1
                sPOS, *lClass = sLine.split("|")
                lClass = self._removeDuplicatesFrom(lClass)
                self.dThesaurus[sEntry].append( (sPOS, lClass) )

    def showThesaurusEntries (self):
        for sWord, lClass in self.dThesaurus.items():
            for sPOS, lWord in lClass:
                print(sWord, sPOS, "|".join(lWord))

    def _removeDuplicatesFrom (self, lWord):
        return [ sWord.strip()  for sWord  in dict.fromkeys(lWord) ]  # remove duplicates: use <dict.fromkeys()> instead of <set()> to keep order

    def merge (self):
        for sWord, lSynset in self.dSynEntry.items():
            for sPOS, iSynset in lSynset:
                if sWord in self.dThesaurus:
                    self.dThesaurus[sWord].append( (sPOS, self.dSynset[iSynset]) )
                else:
                    self.dThesaurus[sWord] = [ (sPOS, self.dSynset[iSynset]) ]

    def write (self, spDest):
        nOffset = 0     # the offset for finding data is the number of bytes (-> encoding("utf-8"))
        dOffset = {}
        with open(spDest + "/thes_fr.dat", "w", encoding="utf-8", newline="\n") as hThes:
            sHeader = "UTF-8\n"
            hThes.write(sHeader)
            nOffset = len(sHeader.encode("utf-8"))
            for sWord, lClass in self.dThesaurus.items():
                dOffset[sWord] = nOffset
                sWordLine = sWord+"|"+str(len(lClass))+"\n"
                hThes.write(sWordLine)
                nOffset += len(sWordLine.encode("utf-8"))
                for sPOS, lWord in lClass:
                    sClassLine = sPOS+"|"+"|".join(lWord)+"\n"
                    hThes.write(sClassLine)
                    nOffset += len(sClassLine.encode("utf-8"))
        with open(spDest + "/thes_fr.idx", "w", encoding="utf-8", newline="\n") as hIndex:
            hIndex.write("UTF-8\n")
            hIndex.write(str(len(self.dThesaurus))+"\n")
            for sWord, nOffset in sorted(dOffset.items()):
                hIndex.write(sWord+"|"+str(nOffset)+"\n")

    def writeAsJSON (self, spDest):
        with open(spDest + "/thes_fr.json", "w", encoding="utf-8", newline="\n") as hJsonThes:
            sJSON = json.dumps(self.dThesaurus, ensure_ascii=False)
            hJsonThes.write(sJSON)


def build (spfThesaurus="", spfSynsets="", spDest="_build"):
    """Build the thesaurus files in folder <spDest>.

    The synsets file <spfSynsets> is read first, then the thesaurus file <spfThesaurus>;
    the synsets are merged into the thesaurus entries and the result is written in <spDest>,
    both in the data/index format and as JSON.
    """
    oBuilder = ThesaurusBuilder()
    oBuilder.readSynsets(spfSynsets)
    # debug: oBuilder.showSynsetEntries()
    oBuilder.readThesaurus(spfThesaurus)
    # debug: oBuilder.showThesaurusEntries()
    oBuilder.merge()
    oBuilder.write(spDest)
    oBuilder.writeAsJSON(spDest)


# When run as a script: build from the default source files, into the default destination folder.
if __name__ == "__main__":
    build(spfThesaurus="thesaurus/thes_fr.dat", spfSynsets="thesaurus/synsets_fr.dat")