Grammalecte  Check-in [c3cefaaf03]

Overview
Comment:[fr] màj: reader.py (filtre pour la liste des homophones tirée de Wiktionnaire (merci à Benoit S.))
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | trunk
Files: files | file ages | folders
SHA3-256: c3cefaaf03d8555a1a36d0bc146f0178ccae919b1fcdcd0a428afcd9f51d4bef
User & Date: olr on 2017-06-11 18:32:54
Original Comment: [fr] màj: reader.py (filtre pour la liste des homophnes tirée de Wikisource(merci à Benoit S.))
Other Links: manifest | tags
Context
2017-06-11
21:03
[fr] phonet_simil: <bore/bord> check-in: 8003eff469 user: olr tags: trunk, fr
18:32
[fr] màj: reader.py (filtre pour la liste des homophones tirée de Wiktionnaire (merci à Benoit S.)) check-in: c3cefaaf03 user: olr tags: trunk
16:24
[fr] màj: règles typographiques sur les nombres ordinaux check-in: b798f77c08 user: olr tags: trunk, fr
Changes

Modified reader.py from [510a35b7df] to [0c60e1da75].

1

2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19

20
21
22
23
24
25
26
27
28
29

30
31
32
33
34
35
36
#!python3


import os
import sys
import re

import grammalecte.ibdawg as ibdawg

oDict = ibdawg.IBDAWG("French.bdic")


def readFile (spf):
    if os.path.isfile(spf):
        with open(spf, "r", encoding="utf-8") as hSrc:
            for sLine in hSrc:
                yield sLine
    else:
        print("# Error: file not found.")



def createFile (spfSrc, spfDst):
    with open(spfDst, "w", encoding="utf-8") as hDst:
        for sLine in readFile(spfSrc):
            sLine = sLine.strip()
            if sLine:
                for sWord in sLine.split():
                    if not oDict.isValid(sWord): 
                        hDst.write(sWord+"\n")



def createLexStatFile (spf, dStat):
    dWord = {}
    for i, sLine in enumerate(readFile(spf)):
        if not sLine.startswith("#"):
            sWord = sLine.strip()
            if sWord not in dWord:

>


















>

|
|







>







1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
#!python3
# Just a file for one-shot scripts

import os
import sys
import re

import grammalecte.ibdawg as ibdawg

oDict = ibdawg.IBDAWG("French.bdic")


def readFile (spf):
    if os.path.isfile(spf):
        with open(spf, "r", encoding="utf-8") as hSrc:
            for sLine in hSrc:
                yield sLine
    else:
        print("# Error: file not found.")

# --------------------------------------------------------------------------------------------------

def listUnknownWords (spf):
    with open(spf+".res.txt", "w", encoding="utf-8") as hDst:
        for sLine in readFile(spfSrc):
            sLine = sLine.strip()
            if sLine:
                for sWord in sLine.split():
                    if not oDict.isValid(sWord): 
                        hDst.write(sWord+"\n")

# --------------------------------------------------------------------------------------------------

def createLexStatFile (spf, dStat):
    dWord = {}
    for i, sLine in enumerate(readFile(spf)):
        if not sLine.startswith("#"):
            sWord = sLine.strip()
            if sWord not in dWord:
48
49
50
51
52
53
54
55
56
57
58
59
60
61

62



















63
64

    for sLine in readFile(spf):
        if not sLine.startswith("#"):
            sWord, sCount = sLine.split()
            dStat[sWord] = dStat.get(sWord, 0) + int(sCount)
    return dStat


def main ():
    dStat = {}
    readStatFile("stats1.txt", dStat)
    readStatFile("stats2.txt", dStat)
    readStatFile("stats3.txt", dStat)
    createLexStatFile("propositions.txt", dStat)
    





















if __name__ == '__main__' :
    main()








|





|
>

>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>

<
>
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86

87
    for sLine in readFile(spf):
        if not sLine.startswith("#"):
            sWord, sCount = sLine.split()
            dStat[sWord] = dStat.get(sWord, 0) + int(sCount)
    return dStat


def readStatFilesAndCreateLexicon ():
    dStat = {}
    readStatFile("stats1.txt", dStat)
    readStatFile("stats2.txt", dStat)
    readStatFile("stats3.txt", dStat)
    createLexStatFile("propositions.txt", dStat)

# --------------------------------------------------------------------------------------------------

def isMoreThanOneSetInList (lSet):
    aFirst = lSet.pop(0)
    for aSet in lSet:
        if aSet != aFirst:
            return True
    return False

def filterLinesWithWordsWithDifferentStems (spf):
    with open(spf+".res.txt", "w", encoding="utf-8") as hDst:
        for sLine in readFile(spf):
            lStemSet = [ set(oDict.stem(sWord))  for sWord in sLine.strip().split()]
            if isMoreThanOneSetInList(lStemSet):
                hDst.write(sLine)

def filterHomophonicWords ():
    filterLinesWithWordsWithDifferentStems("homophones.txt")

# --------------------------------------------------------------------------------------------------

if __name__ == '__main__' :

    filterHomophonicWords()