Overview
| Comment: | [graphspell] normalize characters before spell checking |
|---|---|
| Downloads: | Tarball | ZIP archive | SQL archive |
| Timelines: | family | ancestors | descendants | both | trunk | graphspell |
| Files: | files | file ages | folders |
| SHA3-256: |
3348432a36ef9de5c9093131397ae5eb |
| User & Date: | olr on 2018-02-19 09:07:58 |
| Other Links: | manifest | tags |
Context
|
2018-02-19
| ||
| 09:11 | [fr] version 0.6.2 check-in: 18027d1022 user: olr tags: trunk, fr | |
| 09:07 | [graphspell] normalize characters before spell checking check-in: 3348432a36 user: olr tags: trunk, graphspell | |
| 07:30 | [fr] fichier des affixes: ICONV ? -> s, REP: suppression de cad -> c’est-à-dire check-in: b9474152b2 user: olr tags: trunk, fr | |
Changes
Modified gc_lang/fr/perf_memo.txt from [cec037999d] to [190717e473].
| ︙ | ︙ | |||
19 20 21 22 23 24 25 | 0.5.12 2016.10.14 18:58 4.51895 1.0843 0.772805 0.22387 0.249411 0.261593 0.628802 0.339303 0.0570326 0.00805416 0.5.15 2017.01.22 11:44 4.85204 1.16134 0.770762 0.227874 0.244574 0.253305 0.58831 0.319987 0.0603996 0.00694786 0.5.15 2017.01.22 11:47 4.85593 1.15248 0.762924 0.22744 0.243461 0.254609 0.586741 0.317503 0.0588827 0.00701016 (unicode normalisation NFC) 0.5.15 2017.01.31 12:06 4.88227 1.18008 0.782217 0.232617 0.247672 0.257628 0.596903 0.32169 0.0603505 0.00695196 0.5.15 2017.02.05 10:10 4.90222 1.18444 0.786696 0.233413 0.25071 0.260214 0.602112 0.325235 0.0609932 0.00706897 0.5.16 2017.05.12 07:41 4.92201 1.19269 0.80639 0.239147 0.257518 0.266523 0.62111 0.33359 0.0634668 0.00757178 0.6.1 2018.02.12 09:58 5.25924 1.2649 0.878442 0.257465 0.280558 0.293903 0.686887 0.391275 0.0672474 0.00824723 | > | 19 20 21 22 23 24 25 26 | 0.5.12 2016.10.14 18:58 4.51895 1.0843 0.772805 0.22387 0.249411 0.261593 0.628802 0.339303 0.0570326 0.00805416 0.5.15 2017.01.22 11:44 4.85204 1.16134 0.770762 0.227874 0.244574 0.253305 0.58831 0.319987 0.0603996 0.00694786 0.5.15 2017.01.22 11:47 4.85593 1.15248 0.762924 0.22744 0.243461 0.254609 0.586741 0.317503 0.0588827 0.00701016 (unicode normalisation NFC) 0.5.15 2017.01.31 12:06 4.88227 1.18008 0.782217 0.232617 0.247672 0.257628 0.596903 0.32169 0.0603505 0.00695196 0.5.15 2017.02.05 10:10 4.90222 1.18444 0.786696 0.233413 0.25071 0.260214 0.602112 0.325235 0.0609932 0.00706897 0.5.16 2017.05.12 07:41 4.92201 1.19269 0.80639 0.239147 0.257518 0.266523 0.62111 0.33359 0.0634668 0.00757178 0.6.1 2018.02.12 09:58 5.25924 1.2649 0.878442 0.257465 0.280558 0.293903 0.686887 0.391275 0.0672474 0.00824723 0.6.1 2018.02.19 09:06 6.20116 1.44334 1.02936 0.272956 0.311561 0.362367 0.812705 0.419061 0.0773003 0.00845671 (spelling normalization) |
Modified graphspell-js/char_player.js from [c9b14a8774] to [c171c18615].
1 2 3 4 5 6 7 8 |
// list of similar chars
// useful for suggestion mechanism
${map}
var char_player = {
| | > > > > > > > > > > > > > | | | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 |
// list of similar chars
// useful for suggestion mechanism
${map}
var char_player = {
_xTransCharsForSpelling: new Map([
['ſ', 's'], ['ffi', 'ffi'], ['ffl', 'ffl'], ['ff', 'ff'], ['ſt', 'ft'], ['fi', 'fi'], ['fl', 'fl'], ['st', 'st']
]),
spellingNormalization: function (sWord) {
let sNewWord = "";
for (let c of sWord) {
sNewWord += this._xTransCharsForSpelling.gl_get(c, c);
}
return sNewWord.normalize("NFC");
},
_xTransCharsForSimplification: new Map([
['à', 'a'], ['é', 'e'], ['î', 'i'], ['ô', 'o'], ['û', 'u'], ['ÿ', 'i'], ['y', 'i'],
['â', 'a'], ['è', 'e'], ['ï', 'i'], ['ö', 'o'], ['ù', 'u'], ['ŷ', 'i'],
['ä', 'a'], ['ê', 'e'], ['í', 'i'], ['ó', 'o'], ['ü', 'u'], ['ý', 'i'],
['á', 'a'], ['ë', 'e'], ['ì', 'i'], ['ò', 'o'], ['ú', 'u'], ['ỳ', 'i'],
['ā', 'a'], ['ē', 'e'], ['ī', 'i'], ['ō', 'o'], ['ū', 'u'], ['ȳ', 'i'],
['ñ', 'n'], ['k', 'q'], ['w', 'v'],
['œ', 'oe'], ['æ', 'ae'],
['ſ', 's'], ['ffi', 'ffi'], ['ffl', 'ffl'], ['ff', 'ff'], ['ſt', 'ft'], ['fi', 'fi'], ['fl', 'fl'], ['st', 'st']
]),
simplifyWord: function (sWord) {
// word simplication before calculating distance between words
sWord = sWord.toLowerCase();
let sNewWord = "";
let i = 1;
for (let c of sWord) {
let cNew = this._xTransCharsForSimplification.gl_get(c, c);
let cNext = sWord.slice(i, i+1)
if (cNew != this._xTransCharsForSimplification.gl_get(cNext, cNext)) {
sNewWord += cNew;
}
i++;
}
return sNewWord.replace(/eau/g, "o").replace(/au/g, "o").replace(/ai/g, "e").replace(/ei/g, "e").replace(/ph/g, "f");
},
|
| ︙ | ︙ |
Modified graphspell-js/ibdawg.js from [08ad598b63] to [73e27f350e].
| ︙ | ︙ | |||
206 207 208 209 210 211 212 213 214 215 216 217 218 219 |
"sByDic": this.sByDic // binary word graph
};
return oJSON;
}
isValidToken (sToken) {
// checks if sToken is valid (if there is hyphens in sToken, sToken is split, each part is checked)
if (this.isValid(sToken)) {
return true;
}
if (sToken.includes("-")) {
if (sToken.gl_count("-") > 4) {
return true;
}
| > | 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 |
"sByDic": this.sByDic // binary word graph
};
return oJSON;
}
isValidToken (sToken) {
// checks if sToken is valid (if there is hyphens in sToken, sToken is split, each part is checked)
sToken = char_player.spellingNormalization(sToken)
if (this.isValid(sToken)) {
return true;
}
if (sToken.includes("-")) {
if (sToken.gl_count("-") > 4) {
return true;
}
|
| ︙ | ︙ | |||
276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 |
}
}
return Boolean(this._convBytesToInteger(this.byDic.slice(iAddr, iAddr+this.nBytesArc)) & this._finalNodeMask);
}
getMorph (sWord) {
// retrieves morphologies list, different casing allowed
let l = this.morph(sWord);
if (sWord[0].gl_isUpperCase()) {
l.push(...this.morph(sWord.toLowerCase()));
if (sWord.gl_isUpperCase() && sWord.length > 1) {
l.push(...this.morph(sWord.gl_toCapitalize()));
}
}
return l;
}
suggest (sWord, nSuggLimit=10) {
// returns a array of suggestions for <sWord>
let sPfx = "";
let sSfx = "";
[sPfx, sWord, sSfx] = char_player.cut(sWord);
let nMaxSwitch = Math.max(Math.floor(sWord.length / 3), 1);
let nMaxDel = Math.floor(sWord.length / 5);
let nMaxHardRepl = Math.max(Math.floor((sWord.length - 5) / 4), 1);
let oSuggResult = new SuggResult(sWord);
| > > | 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 |
}
}
return Boolean(this._convBytesToInteger(this.byDic.slice(iAddr, iAddr+this.nBytesArc)) & this._finalNodeMask);
}
getMorph (sWord) {
// retrieves morphologies list, different casing allowed
sWord = char_player.spellingNormalization(sWord)
let l = this.morph(sWord);
if (sWord[0].gl_isUpperCase()) {
l.push(...this.morph(sWord.toLowerCase()));
if (sWord.gl_isUpperCase() && sWord.length > 1) {
l.push(...this.morph(sWord.gl_toCapitalize()));
}
}
return l;
}
suggest (sWord, nSuggLimit=10) {
// returns a array of suggestions for <sWord>
sWord = char_player.spellingNormalization(sWord)
let sPfx = "";
let sSfx = "";
[sPfx, sWord, sSfx] = char_player.cut(sWord);
let nMaxSwitch = Math.max(Math.floor(sWord.length / 3), 1);
let nMaxDel = Math.floor(sWord.length / 5);
let nMaxHardRepl = Math.max(Math.floor((sWord.length - 5) / 4), 1);
let oSuggResult = new SuggResult(sWord);
|
| ︙ | ︙ |
Modified graphspell/char_player.py from [82e97eae54] to [e841b9211a].
1 2 3 4 5 6 | # list of similar chars # useful for suggestion mechanism import re | > | > > > > > > > > | > | | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 |
# list of similar chars
# useful for suggestion mechanism
import re
import unicodedata
_xTransCharsForSpelling = str.maketrans({
'ſ': 's', 'ffi': 'ffi', 'ffl': 'ffl', 'ff': 'ff', 'ſt': 'ft', 'fi': 'fi', 'fl': 'fl', 'st': 'st'
})
def spellingNormalization (sWord):
return unicodedata.normalize("NFC", sWord.translate(_xTransCharsForSpelling))
_xTransCharsForSimplification = str.maketrans({
'à': 'a', 'é': 'e', 'î': 'i', 'ô': 'o', 'û': 'u', 'ÿ': 'i', "y": "i",
'â': 'a', 'è': 'e', 'ï': 'i', 'ö': 'o', 'ù': 'u', 'ŷ': 'i',
'ä': 'a', 'ê': 'e', 'í': 'i', 'ó': 'o', 'ü': 'u', 'ý': 'i',
'á': 'a', 'ë': 'e', 'ì': 'i', 'ò': 'o', 'ú': 'u', 'ỳ': 'i',
'ā': 'a', 'ē': 'e', 'ī': 'i', 'ō': 'o', 'ū': 'u', 'ȳ': 'i',
'ñ': 'n', 'k': 'q', 'w': 'v',
'œ': 'oe', 'æ': 'ae',
'ſ': 's', 'ffi': 'ffi', 'ffl': 'ffl', 'ff': 'ff', 'ſt': 'ft', 'fi': 'fi', 'fl': 'fl', 'st': 'st',
})
def simplifyWord (sWord):
"word simplication before calculating distance between words"
sWord = sWord.lower().translate(_xTransCharsForSimplification)
sNewWord = ""
for i, c in enumerate(sWord, 1):
if c != sWord[i:i+1]:
sNewWord += c
return sNewWord.replace("eau", "o").replace("au", "o").replace("ai", "e").replace("ei", "e").replace("ph", "f")
|
| ︙ | ︙ |
Modified graphspell/ibdawg.py from [3bf18d8144] to [c41b426a86].
| ︙ | ︙ | |||
214 215 216 217 218 219 220 221 222 223 224 225 226 227 |
"sByDic": self.byDic.hex() if bBinaryDictAsHexString else [ e for e in self.byDic ]
}, ensure_ascii=False))
if bInJSModule:
hDst.write(";\n\nexports.dictionary = dictionary;\n")
def isValidToken (self, sToken):
"checks if <sToken> is valid (if there is hyphens in <sToken>, <sToken> is split, each part is checked)"
if self.isValid(sToken):
return True
if "-" in sToken:
if sToken.count("-") > 4:
return True
return all(self.isValid(sWord) for sWord in sToken.split("-"))
return False
| > | 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 |
"sByDic": self.byDic.hex() if bBinaryDictAsHexString else [ e for e in self.byDic ]
}, ensure_ascii=False))
if bInJSModule:
hDst.write(";\n\nexports.dictionary = dictionary;\n")
def isValidToken (self, sToken):
"checks if <sToken> is valid (if there is hyphens in <sToken>, <sToken> is split, each part is checked)"
sToken = cp.spellingNormalization(sToken)
if self.isValid(sToken):
return True
if "-" in sToken:
if sToken.count("-") > 4:
return True
return all(self.isValid(sWord) for sWord in sToken.split("-"))
return False
|
| ︙ | ︙ | |||
256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 |
iAddr = self._lookupArcNode(self.dChar[c], iAddr)
if iAddr == None:
return False
return bool(int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask)
def getMorph (self, sWord):
"retrieves morphologies list, different casing allowed"
l = self.morph(sWord)
if sWord[0:1].isupper():
l.extend(self.morph(sWord.lower()))
if sWord.isupper() and len(sWord) > 1:
l.extend(self.morph(sWord.capitalize()))
return l
#@timethis
def suggest (self, sWord, nSuggLimit=10):
"returns a set of suggestions for <sWord>"
sPfx, sWord, sSfx = cp.cut(sWord)
nMaxSwitch = max(len(sWord) // 3, 1)
nMaxDel = len(sWord) // 5
nMaxHardRepl = max((len(sWord) - 5) // 4, 1)
oSuggResult = SuggResult(sWord)
self._suggest(oSuggResult, sWord, nMaxSwitch=nMaxSwitch, nMaxDel=nMaxDel, nMaxHardRepl=nMaxHardRepl)
if sWord.istitle():
| > > | 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 |
iAddr = self._lookupArcNode(self.dChar[c], iAddr)
if iAddr == None:
return False
return bool(int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask)
def getMorph (self, sWord):
"retrieves morphologies list, different casing allowed"
sWord = cp.spellingNormalization(sWord)
l = self.morph(sWord)
if sWord[0:1].isupper():
l.extend(self.morph(sWord.lower()))
if sWord.isupper() and len(sWord) > 1:
l.extend(self.morph(sWord.capitalize()))
return l
#@timethis
def suggest (self, sWord, nSuggLimit=10):
"returns a set of suggestions for <sWord>"
sWord = cp.spellingNormalization(sWord)
sPfx, sWord, sSfx = cp.cut(sWord)
nMaxSwitch = max(len(sWord) // 3, 1)
nMaxDel = len(sWord) // 5
nMaxHardRepl = max((len(sWord) - 5) // 4, 1)
oSuggResult = SuggResult(sWord)
self._suggest(oSuggResult, sWord, nMaxSwitch=nMaxSwitch, nMaxDel=nMaxDel, nMaxHardRepl=nMaxHardRepl)
if sWord.istitle():
|
| ︙ | ︙ | |||
326 327 328 329 330 331 332 333 334 335 336 337 338 339 |
self._suggest(oSuggResult, "", nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, True) # remove last char and go on
for sRepl in cp.dFinal1.get(sRemain, ()):
self._suggest(oSuggResult, sRepl, nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, True)
#@timethis
def suggest2 (self, sWord, nMaxSugg=10):
"returns a set of suggestions for <sWord>"
sPfx, sWord, sSfx = cp.cut(sWord)
oSuggResult = SuggResult(sWord)
self._suggest2(oSuggResult)
aSugg = oSuggResult.getSuggestions()
if sSfx or sPfx:
# we add what we removed
return list(map(lambda sSug: sPfx + sSug + sSfx, aSugg))
| > | 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 |
self._suggest(oSuggResult, "", nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, True) # remove last char and go on
for sRepl in cp.dFinal1.get(sRemain, ()):
self._suggest(oSuggResult, sRepl, nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, True)
#@timethis
def suggest2 (self, sWord, nMaxSugg=10):
"returns a set of suggestions for <sWord>"
sWord = cp.spellingNormalization(sWord)
sPfx, sWord, sSfx = cp.cut(sWord)
oSuggResult = SuggResult(sWord)
self._suggest2(oSuggResult)
aSugg = oSuggResult.getSuggestions()
if sSfx or sPfx:
# we add what we removed
return list(map(lambda sSug: sPfx + sSug + sSfx, aSugg))
|
| ︙ | ︙ | |||
382 383 384 385 386 387 388 389 390 391 392 393 394 395 |
aTails.add(sTail + self.dCharVal[nVal])
if n and not aTails:
aTails.update(self._getTails(jAddr, sTail+self.dCharVal[nVal], n-1))
return aTails
def drawPath (self, sWord, iAddr=0):
"show the path taken by <sWord> in the graph"
c1 = sWord[0:1] if sWord else " "
iPos = -1
n = 0
print(c1 + ": ", end="")
for c2, jAddr in self._getCharArcs(iAddr):
print(c2, end="")
if c2 == sWord[0:1]:
| > | 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 |
aTails.add(sTail + self.dCharVal[nVal])
if n and not aTails:
aTails.update(self._getTails(jAddr, sTail+self.dCharVal[nVal], n-1))
return aTails
def drawPath (self, sWord, iAddr=0):
"show the path taken by <sWord> in the graph"
sWord = cp.spellingNormalization(sWord)
c1 = sWord[0:1] if sWord else " "
iPos = -1
n = 0
print(c1 + ": ", end="")
for c2, jAddr in self._getCharArcs(iAddr):
print(c2, end="")
if c2 == sWord[0:1]:
|
| ︙ | ︙ |