Overview
| Comment: | [fr] màj: reader.py (filtre pour la liste des homophones tirée de Wiktionnaire (merci à Benoit S.)) |
|---|---|
| Downloads: | Tarball | ZIP archive | SQL archive |
| Timelines: | family | ancestors | descendants | both | trunk |
| Files: | files | file ages | folders |
| SHA3-256: |
c3cefaaf03d8555a1a36d0bc146f0178 |
| User & Date: | olr on 2017-06-11 18:32:54 |
| Original Comment: | [fr] màj: reader.py (filtre pour la liste des homophnes tirée de Wikisource (merci à Benoit S.)) |
| Other Links: | manifest | tags |
Context
|
2017-06-11
| ||
| 21:03 | [fr] phonet_simil: <bore/bord> check-in: 8003eff469 user: olr tags: trunk, fr | |
| 18:32 | [fr] màj: reader.py (filtre pour la liste des homophones tirée de Wiktionnaire (merci à Benoit S.)) check-in: c3cefaaf03 user: olr tags: trunk | |
| 16:24 | [fr] màj: règles typographiques sur les nombres ordinaux check-in: b798f77c08 user: olr tags: trunk, fr | |
Changes
Modified reader.py from [510a35b7df] to [0c60e1da75].
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 |
#!python3
import os
import sys
import re
import grammalecte.ibdawg as ibdawg
oDict = ibdawg.IBDAWG("French.bdic")
def readFile (spf):
if os.path.isfile(spf):
with open(spf, "r", encoding="utf-8") as hSrc:
for sLine in hSrc:
yield sLine
else:
print("# Error: file not found.")
| > > | | > | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 |
#!python3
# Just a file for one-shot scripts
import os
import sys
import re
import grammalecte.ibdawg as ibdawg
oDict = ibdawg.IBDAWG("French.bdic")
def readFile (spf):
    "Generator: yield the lines of file <spf>; print an error message if the file is missing."
    if not os.path.isfile(spf):
        # Note: as this is a generator, the message is only printed on first iteration.
        print("# Error: file not found.")
        return
    with open(spf, "r", encoding="utf-8") as hSrc:
        yield from hSrc
# --------------------------------------------------------------------------------------------------
def listUnknownWords (spf):
    "Read <spf> and write in <spf>.res.txt every whitespace-separated token not recognized by oDict."
    with open(spf+".res.txt", "w", encoding="utf-8") as hDst:
        # BUGFIX: was readFile(spfSrc) — <spfSrc> is undefined in this scope
        # (NameError at runtime); the parameter is named <spf>.
        for sLine in readFile(spf):
            sLine = sLine.strip()
            if sLine:
                for sWord in sLine.split():
                    if not oDict.isValid(sWord):
                        hDst.write(sWord+"\n")
# --------------------------------------------------------------------------------------------------
def createLexStatFile (spf, dStat):
dWord = {}
for i, sLine in enumerate(readFile(spf)):
if not sLine.startswith("#"):
sWord = sLine.strip()
if sWord not in dWord:
|
| ︙ | ︙ | |||
48 49 50 51 52 53 54 |
for sLine in readFile(spf):
if not sLine.startswith("#"):
sWord, sCount = sLine.split()
dStat[sWord] = dStat.get(sWord, 0) + int(sCount)
return dStat
| | | > > > > > > > > > > > > > > > > > > > > < > | 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 |
for sLine in readFile(spf):
if not sLine.startswith("#"):
sWord, sCount = sLine.split()
dStat[sWord] = dStat.get(sWord, 0) + int(sCount)
return dStat
def readStatFilesAndCreateLexicon ():
    "Merge the word counts of the three stats files and write the resulting lexicon proposals."
    dStat = {}
    for sStatFile in ("stats1.txt", "stats2.txt", "stats3.txt"):
        readStatFile(sStatFile, dStat)
    createLexStatFile("propositions.txt", dStat)
# --------------------------------------------------------------------------------------------------
def isMoreThanOneSetInList (lSet):
    """Return True if <lSet> contains at least two different sets.

    Fixes two defects of the previous version:
    - it no longer mutates the caller's list (the old code used lSet.pop(0));
    - it no longer raises IndexError on an empty list (an empty or
      single-element list trivially holds at most one distinct set).
    """
    if not lSet:
        return False
    aFirst = lSet[0]
    # any() stops at the first set that differs from the first one.
    return any(aSet != aFirst for aSet in lSet)
def filterLinesWithWordsWithDifferentStems (spf):
    "Copy in <spf>.res.txt the lines of <spf> whose words do not all share the same stem set."
    with open(spf+".res.txt", "w", encoding="utf-8") as hDst:
        for sLine in readFile(spf):
            lWords = sLine.strip().split()
            lStemSets = [ set(oDict.stem(sWord)) for sWord in lWords ]
            if isMoreThanOneSetInList(lStemSets):
                hDst.write(sLine)
def filterHomophonicWords ():
    "Keep only the homophone lines whose words actually have different stems."
    sHomophonesFile = "homophones.txt"
    filterLinesWithWordsWithDifferentStems(sHomophonesFile)
# --------------------------------------------------------------------------------------------------
# Script entry point: filter the homophone list when run directly.
if __name__ == '__main__' :
    filterHomophonicWords()
|