Overview
Comment: | [graphspell] move function from char_player to str_transform |
---|---|
Downloads: | Tarball | ZIP archive | SQL archive |
Timelines: | family | ancestors | descendants | both | trunk | graphspell |
Files: | files | file ages | folders |
SHA3-256: |
2381f7c9ae11b01adf4aaee8cfd9f4f7 |
User & Date: | olr on 2020-08-05 06:56:15 |
Other Links: | manifest | tags |
Context
2020-08-05
| ||
09:30 | [graphspell] move functions from char_player to str_transform and lexicographer check-in: 19fccd89d6 user: olr tags: trunk, graphspell | |
06:56 | [graphspell] move function from char_player to str_transform check-in: 2381f7c9ae user: olr tags: trunk, graphspell | |
06:46 | [graphspell][py] fix details check-in: 4617b0dfc4 user: olr tags: trunk, graphspell | |
Changes
Modified graphspell-js/char_player.js from [3caadd8250] to [ba78268468].
1 2 3 4 5 6 7 8 9 10 11 | // list of similar chars // useful for suggestion mechanism /* jshint esversion:6 */ /* jslint esversion:6 */ ${map} var char_player = { | < < < < < < < < < < < < | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 | // list of similar chars // useful for suggestion mechanism /* jshint esversion:6 */ /* jslint esversion:6 */ ${map} var char_player = { oDistanceBetweenChars: { "a": {}, "e": {"é": 0.5}, "é": {"e": 0.5}, "i": {"y": 0.2}, "o": {}, "u": {}, |
︙ | ︙ |
Modified graphspell-js/ibdawg.js from [189561924c] to [39524e288d].
︙ | ︙ | |||
234 235 236 237 238 239 240 | "l2grams": this.l2grams }; return oJSON; } isValidToken (sToken) { // checks if sToken is valid (if there is hyphens in sToken, sToken is split, each part is checked) | | | 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 | "l2grams": this.l2grams }; return oJSON; } isValidToken (sToken) { // checks if sToken is valid (if there is hyphens in sToken, sToken is split, each part is checked) sToken = str_transform.spellingNormalization(sToken); if (this.isValid(sToken)) { return true; } if (sToken.includes("-")) { if (sToken.gl_count("-") > 4) { return true; } |
︙ | ︙ | |||
307 308 309 310 311 312 313 | } } return Boolean(this._convBytesToInteger(this.byDic.slice(iAddr, iAddr+this.nBytesArc)) & this._finalNodeMask); } getMorph (sWord) { // retrieves morphologies list, different casing allowed | | | | 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 | } } return Boolean(this._convBytesToInteger(this.byDic.slice(iAddr, iAddr+this.nBytesArc)) & this._finalNodeMask); } getMorph (sWord) { // retrieves morphologies list, different casing allowed sWord = str_transform.spellingNormalization(sWord); let l = this.morph(sWord); if (sWord[0].gl_isUpperCase()) { l.push(...this.morph(sWord.toLowerCase())); if (sWord.gl_isUpperCase() && sWord.length > 1) { l.push(...this.morph(sWord.gl_toCapitalize())); } } return l; } suggest (sWord, nSuggLimit=10, bSplitTrailingNumbers=false) { // returns a array of suggestions for <sWord> //console.time("Suggestions for " + sWord); sWord = str_transform.spellingNormalization(sWord); let sPfx = ""; let sSfx = ""; [sPfx, sWord, sSfx] = char_player.cut(sWord); let nMaxSwitch = Math.max(Math.floor(sWord.length / 3), 1); let nMaxDel = Math.floor(sWord.length / 5); let nMaxHardRepl = Math.max(Math.floor((sWord.length - 5) / 4), 1); let nMaxJump = Math.max(Math.floor(sWord.length / 4), 1); |
︙ | ︙ |
Modified graphspell-js/str_transform.js from [9baf96ac7b] to [1ca4ee03ac].
︙ | ︙ | |||
20 21 22 23 24 25 26 27 28 29 30 31 32 33 | getNgrams: function (sWord, n=2) { let lNgrams = []; for (let i=0; i <= sWord.length - n; i++) { lNgrams.push(sWord.slice(i, i+n)); } return lNgrams; }, longestCommonSubstring: function (string1, string2) { // https://en.wikibooks.org/wiki/Algorithm_Implementation/Strings/Longest_common_substring // untested // init max value let longestCommonSubstring = 0; | > > > > > > > > > > > > | 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 | getNgrams: function (sWord, n=2) { let lNgrams = []; for (let i=0; i <= sWord.length - n; i++) { lNgrams.push(sWord.slice(i, i+n)); } return lNgrams; }, _xTransCharsForSpelling: new Map([ ['ſ', 's'], ['ﬃ', 'ffi'], ['ﬄ', 'ffl'], ['ﬀ', 'ff'], ['ﬅ', 'ft'], ['ﬁ', 'fi'], ['ﬂ', 'fl'], ['ﬆ', 'st'] ]), spellingNormalization: function (sWord) { let sNewWord = ""; for (let c of sWord) { sNewWord += this._xTransCharsForSpelling.gl_get(c, c); } return sNewWord.normalize("NFC"); }, longestCommonSubstring: function (string1, string2) { // https://en.wikibooks.org/wiki/Algorithm_Implementation/Strings/Longest_common_substring // untested // init max value let longestCommonSubstring = 0; |
︙ | ︙ |
Modified graphspell/char_player.py from [fa338bf2f3] to [5484ce7bef].
1 2 3 4 5 6 | """ List of similar chars useful for suggestion mechanism """ import re | < < < < < < < < < < | 1 2 3 4 5 6 7 8 9 10 11 12 13 | """ List of similar chars useful for suggestion mechanism """ import re dDistanceBetweenChars = { "a": {}, "e": {"é": 0.5}, "é": {"e": 0.5}, "i": {"y": 0.2}, |
︙ | ︙ |
Modified graphspell/ibdawg.py from [f3a55a8c6e] to [78af5c9dd3].
︙ | ︙ | |||
244 245 246 247 248 249 250 | "l2grams": list(self.a2grams) }, ensure_ascii=False)) if bInJSModule: hDst.write(";\n\nexports.dictionary = dictionary;\n") def isValidToken (self, sToken): "checks if <sToken> is valid (if there is hyphens in <sToken>, <sToken> is split, each part is checked)" | | | 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 | "l2grams": list(self.a2grams) }, ensure_ascii=False)) if bInJSModule: hDst.write(";\n\nexports.dictionary = dictionary;\n") def isValidToken (self, sToken): "checks if <sToken> is valid (if there is hyphens in <sToken>, <sToken> is split, each part is checked)" sToken = st.spellingNormalization(sToken) if self.isValid(sToken): return True if "-" in sToken: if sToken.count("-") > 4: return True return all(self.isValid(sWord) for sWord in sToken.split("-")) if "." in sToken or "·" in sToken: |
︙ | ︙ | |||
288 289 290 291 292 293 294 | iAddr = self._lookupArcNode(self.dChar[c], iAddr) if iAddr is None: return False return bool(int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask) def getMorph (self, sWord): "retrieves morphologies list, different casing allowed" | | | | 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 | iAddr = self._lookupArcNode(self.dChar[c], iAddr) if iAddr is None: return False return bool(int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask) def getMorph (self, sWord): "retrieves morphologies list, different casing allowed" sWord = st.spellingNormalization(sWord) l = self.morph(sWord) if sWord[0:1].isupper(): l.extend(self.morph(sWord.lower())) if sWord.isupper() and len(sWord) > 1: l.extend(self.morph(sWord.capitalize())) return l #@timethis def suggest (self, sWord, nSuggLimit=10, bSplitTrailingNumbers=False): "returns a set of suggestions for <sWord>" sWord = sWord.rstrip(".") # useful for LibreOffice sWord = st.spellingNormalization(sWord) sPfx, sWord, sSfx = cp.cut(sWord) nMaxSwitch = max(len(sWord) // 3, 1) nMaxDel = len(sWord) // 5 nMaxHardRepl = max((len(sWord) - 5) // 4, 1) nMaxJump = max(len(sWord) // 4, 1) oSuggResult = SuggResult(sWord) if bSplitTrailingNumbers: |
︙ | ︙ | |||
408 409 410 411 412 413 414 | aTails.add(sTail + self.dCharVal[nVal]) if n and not aTails: aTails.update(self._getTails(jAddr, sTail+self.dCharVal[nVal], n-1)) return aTails def drawPath (self, sWord, iAddr=0): "show the path taken by <sWord> in the graph" | | | 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 | aTails.add(sTail + self.dCharVal[nVal]) if n and not aTails: aTails.update(self._getTails(jAddr, sTail+self.dCharVal[nVal], n-1)) return aTails def drawPath (self, sWord, iAddr=0): "show the path taken by <sWord> in the graph" sWord = st.spellingNormalization(sWord) c1 = sWord[0:1] if sWord else " " iPos = -1 n = 0 echo(c1 + ": ", end="") for c2, jAddr in self._getCharArcs(iAddr): echo(c2, end="") if c2 == sWord[0:1]: |
︙ | ︙ |
Modified graphspell/str_transform.py from [452d0bdcef] to [e25e3e9b20].
1 2 3 4 5 6 7 8 9 | """ Operations on strings: - calculate distance between two strings - transform strings with transformation codes """ from .char_player import distanceBetweenChars | > > | > > > > > > > > > > > > | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 | """ Operations on strings: - calculate distance between two strings - transform strings with transformation codes """ import unicodedata from .char_player import distanceBetweenChars #### N-GRAMS def getNgrams (sWord, n=2): "return a list of Ngrams strings" return [ sWord[i:i+n] for i in range(len(sWord)-n+1) ] #### WORD NORMALIZATION _xTransCharsForSpelling = str.maketrans({ 'ſ': 's', 'ﬃ': 'ffi', 'ﬄ': 'ffl', 'ﬀ': 'ff', 'ﬅ': 'ft', 'ﬁ': 'fi', 'ﬂ': 'fl', 'ﬆ': 'st' }) def spellingNormalization (sWord): "nomalization NFC and removing ligatures" return unicodedata.normalize("NFC", sWord.translate(_xTransCharsForSpelling)) #### DISTANCE CALCULATIONS def longestCommonSubstring (s1, s2): "longest common substring" # http://en.wikipedia.org/wiki/Longest_common_substring_problem |
︙ | ︙ |