Overview
Comment: | [fr] màj: reader.py (filtre pour la liste des homophones tirée de Wiktionnaire (merci à Benoit S.)) |
---|---|
Downloads: | Tarball | ZIP archive | SQL archive |
Timelines: | family | ancestors | descendants | both | trunk |
Files: | files | file ages | folders |
SHA3-256: |
c3cefaaf03d8555a1a36d0bc146f0178 |
User & Date: | olr on 2017-06-11 18:32:54 |
Original Comment: | [fr] màj: reader.py (filtre pour la liste des homophnes tirée de Wikisource (merci à Benoit S.)) |
Other Links: | manifest | tags |
Context
2017-06-11
| ||
21:03 | [fr] phonet_simil: <bore/bord> check-in: 8003eff469 user: olr tags: trunk, fr | |
18:32 | [fr] màj: reader.py (filtre pour la liste des homophones tirée de Wiktionnaire (merci à Benoit S.)) check-in: c3cefaaf03 user: olr tags: trunk | |
16:24 | [fr] màj: règles typographiques sur les nombres ordinaux check-in: b798f77c08 user: olr tags: trunk, fr | |
Changes
Modified reader.py from [510a35b7df] to [0c60e1da75].
#!python3
# Just a file for one-shot scripts

import os
import sys
import re

import grammalecte.ibdawg as ibdawg

# Module-level French dictionary used by the helpers below.
oDict = ibdawg.IBDAWG("French.bdic")


def readFile (spf):
    """Yield the lines (newline included) of UTF-8 file <spf>.

    If the file does not exist, print an error marker and yield nothing.
    """
    if os.path.isfile(spf):
        with open(spf, "r", encoding="utf-8") as hSrc:
            for sLine in hSrc:
                yield sLine
    else:
        print("# Error: file not found.")

# --------------------------------------------------------------------------------------------------

def listUnknownWords (spf):
    """Write to <spf>.res.txt every whitespace-separated token of <spf>
    that the dictionary <oDict> does not recognize (one per line)."""
    with open(spf+".res.txt", "w", encoding="utf-8") as hDst:
        # BUG FIX: the original iterated readFile(spfSrc) — an undefined name
        # (NameError on first call); the parameter <spf> is what was meant.
        for sLine in readFile(spf):
            sLine = sLine.strip()
            if sLine:
                for sWord in sLine.split():
                    if not oDict.isValid(sWord):
                        hDst.write(sWord+"\n")

# createLexStatFile(...) follows in the original file (elided in this view).
︙ | ︙ | |||
def readStatFile (spf, dStat):
    # NOTE(review): the original "def" line is elided in this view; the signature
    # is reconstructed from the call sites readStatFile("stats1.txt", dStat)
    # below and the visible body/return — confirm against the full file.
    # Accumulate "word count" pairs of <spf> into <dStat> (comments start with #).
    for sLine in readFile(spf):
        if not sLine.startswith("#"):
            sWord, sCount = sLine.split()
            dStat[sWord] = dStat.get(sWord, 0) + int(sCount)
    return dStat


def readStatFilesAndCreateLexicon ():
    "Merge the word counts of stats1-3.txt, then build the lexicon from propositions.txt."
    dStat = {}
    readStatFile("stats1.txt", dStat)
    readStatFile("stats2.txt", dStat)
    readStatFile("stats3.txt", dStat)
    createLexStatFile("propositions.txt", dStat)

# --------------------------------------------------------------------------------------------------

def isMoreThanOneSetInList (lSet):
    """Return True if <lSet> contains at least two different sets.

    BUG FIX: the original popped the first element (mutating the caller's
    list) and raised IndexError on an empty list; an empty list now simply
    yields False and the argument is left untouched.
    """
    if not lSet:
        return False
    aFirst = lSet[0]
    return any(aSet != aFirst for aSet in lSet[1:])


def filterLinesWithWordsWithDifferentStems (spf):
    """Copy to <spf>.res.txt the lines of <spf> whose words do not all share
    the same set of stems according to <oDict>.

    Blank lines produce an empty stem-set list and are now skipped instead of
    crashing (see isMoreThanOneSetInList).
    """
    with open(spf+".res.txt", "w", encoding="utf-8") as hDst:
        for sLine in readFile(spf):
            lStemSet = [ set(oDict.stem(sWord))  for sWord in sLine.strip().split() ]
            if isMoreThanOneSetInList(lStemSet):
                hDst.write(sLine)


def filterHomophonicWords ():
    "Keep only the homophone lines whose words have different stem sets."
    filterLinesWithWordsWithDifferentStems("homophones.txt")

# --------------------------------------------------------------------------------------------------

if __name__ == '__main__' :
    filterHomophonicWords()