Overview
Comment: | [core] ibdawg: suggestion mechanism update + keyboard chars proximity |
---|---|
Downloads: | Tarball | ZIP archive | SQL archive |
Timelines: | family | ancestors | descendants | both | trunk | core |
Files: | files | file ages | folders |
SHA3-256: |
80ebc25208ff6959e1d14608fd053a5e |
User & Date: | olr on 2017-06-26 06:50:06 |
Other Links: | manifest | tags |
Context
2017-06-26
| ||
06:54 | [core] comment update check-in: 6288449780 user: olr tags: trunk, core | |
06:50 | [core] ibdawg: suggestion mechanism update + keyboard chars proximity check-in: 80ebc25208 user: olr tags: trunk, core | |
2017-06-25
| ||
23:41 | [core] ibdawg: suggestion mechanism update check-in: cee9fdd1aa user: olr tags: trunk, core | |
Changes
Modified gc_core/py/char_player.py from [faa9abdccc] to [b5981aec34].
︙ | ︙ | |||
167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 | "ô": ("aut", "ot", "os"), "ö": ("aut", "ot", "os"), "u": ("ut", "us"), } dFinal2 = { "an": ("ant", "ent"), "en": ("ent", "ant"), "ei": ("ait", "ais"), "on": ("ons", "ont"), "oi": ("ois", "oit", "oix"), } # Préfixes | > > > | | | < | < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < | < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < | 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 | "ô": ("aut", "ot", "os"), "ö": ("aut", "ot", "os"), "u": ("ut", "us"), } dFinal2 = { "ai": ("aient", "ais", "et"), "an": ("ant", "ent"), "en": ("ent", "ant"), "ei": ("ait", "ais"), "on": ("ons", "ont"), "oi": ("ois", "oit", "oix"), } # Préfixes aPfx1 = frozenset([ "anti", "archi", "contre", "hyper", "mé", "méta", "im", "in", "ir", "par", "proto", "pseudo", "pré", "re", "ré", "sans", "sous", "supra", "sur", "ultra" ]) aPfx2 = frozenset([ "belgo", "franco", "génito", "gynéco", "médico", "russo" ]) |
Modified gc_core/py/ibdawg.py from [18fa7e7c19] to [1a4cdd2a3d].
︙ | ︙ | |||
130 131 132 133 134 135 136 | "_addrBitMask": self._addrBitMask, "nBytesOffset": self.nBytesOffset }, ensure_ascii=False)) if bInJSModule: hDst.write(";\n\nexports.dictionary = dictionary;\n") def isValidToken (self, sToken): | | | | | | | | | | > > > > | | | | | 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 | "_addrBitMask": self._addrBitMask, "nBytesOffset": self.nBytesOffset }, ensure_ascii=False)) if bInJSModule: hDst.write(";\n\nexports.dictionary = dictionary;\n") def isValidToken (self, sToken): "checks if <sToken> is valid (if there is hyphens in <sToken>, <sToken> is split, each part is checked)" if self.isValid(sToken): return True if "-" in sToken: if sToken.count("-") > 4: return True return all(self.isValid(sWord) for sWord in sToken.split("-")) return False def isValid (self, sWord): "checks if <sWord> is valid (different casing tested if the first letter is a capital)" if not sWord: return None if "’" in sWord: # ugly hack sWord = sWord.replace("’", "'") if self.lookup(sWord): return True if sWord[0:1].isupper(): if len(sWord) > 1: if sWord.istitle(): return bool(self.lookup(sWord.lower())) if sWord.isupper(): if self.bOptNumSigle: return True return bool(self.lookup(sWord.lower()) or self.lookup(sWord.capitalize())) return bool(self.lookup(sWord[:1].lower() + sWord[1:])) else: return bool(self.lookup(sWord.lower())) return False def lookup (self, sWord): "returns True if <sWord> in dictionary (strict verification)" iAddr = 0 for c in sWord: if c not in self.dChar: return False iAddr = self._lookupArcNode(self.dChar[c], 
iAddr) if iAddr == None: return False return int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask def suggest (self, sWord): "returns a set of similar words" # first, we check for similar words #return set(self._suggestWithCrushedUselessChars(cp.clearWord(sWord))) lSugg = self._suggest(sWord) if not lSugg: lSugg.extend(self._suggest(sWord[1:])) lSugg.extend(self._suggest(sWord[:-1])) lSugg.extend(self._suggest(sWord[1:-1])) if not lSugg: lSugg.extend(self._suggestWithCrushedUselessChars(cp.clearWord(sWord))) return set(lSugg) def _suggest (self, sWord, nDeep=0, iAddr=0, sNewWord="", bAvoidLoop=False): # RECURSIVE FUNCTION if not sWord: if int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask: show(nDeep, "!!! " + sNewWord + " !!!") return [sNewWord] return [] #show(nDeep, "<" + sWord + "> ===> " + sNewWord) lSugg = [] cCurrent = sWord[0:1] for cChar, jAddr in self._getSimilarArcs(cCurrent, iAddr): #show(nDeep, cChar) lSugg.extend(self._suggest(sWord[1:], nDeep+1, jAddr, sNewWord+cChar)) if not bAvoidLoop: # avoid infinite loop #show(nDeep, ":no loop:") if cCurrent == sWord[1:2]: # same char, we remove 1 char without adding 1 to <sNewWord> lSugg.extend(self._suggest(sWord[1:], nDeep+1, iAddr, sNewWord)) for sRepl in cp.d1toX.get(cCurrent, ()): #show(nDeep, sRepl) lSugg.extend(self._suggest(sRepl + sWord[1:], nDeep+1, iAddr, sNewWord, True)) if len(sWord) == 2: for sRepl in cp.dFinal2.get(sWord, ()): #show(nDeep, sRepl) lSugg.extend(self._suggest(sRepl, nDeep+1, iAddr, sNewWord, True)) elif len(sWord) == 1: #show(nDeep, ":end of word:") # end of word for sRepl in cp.dFinal1.get(sWord, ()): #show(nDeep, sRepl) lSugg.extend(self._suggest(sRepl, nDeep+1, iAddr, sNewWord, True)) return lSugg def _getSimilarArcs (self, cChar, iAddr): "generator: yield similar char of <cChar> and address of the following node" for c in cp.d1to1.get(cChar, [cChar]): if c in self.dChar: jAddr = 
self._lookupArcNode(self.dChar[c], iAddr) if jAddr: yield (c, jAddr) def _suggestWithCrushedUselessChars (self, sWord, nDeep=0, iAddr=0, sNewWord="", bAvoidLoop=False): if not sWord: if int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask: show(nDeep, "!!! " + sNewWord + " !!!") return [sNewWord] return [] lSugg = [] cCurrent = sWord[0:1] for cChar, jAddr in self._getSimilarArcsAndCrushedChars(cCurrent, iAddr): show(nDeep, cChar) lSugg.extend(self._suggestWithCrushedUselessChars(sWord[1:], nDeep+1, jAddr, sNewWord+cChar)) return lSugg def _getSimilarArcsAndCrushedChars (self, cChar, iAddr): "generator: yield similar char of <cChar> and address of the following node" for nVal, jAddr in self._getArcs(iAddr): if self.dVal.get(nVal, "") in cp.aUselessChar: yield (self.dVal[nVal], jAddr) |
︙ | ︙ | |||
290 291 292 293 294 295 296 | l.append(sStem + " " + self.lArcVal[nRawArc2 & self._arcMask]) iAddr2 = iEndArcAddr2+self.nBytesNodeAddress iAddr = iEndArcAddr+self.nBytesNodeAddress return l return [] def _stem1 (self, sWord): | | | 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 | l.append(sStem + " " + self.lArcVal[nRawArc2 & self._arcMask]) iAddr2 = iEndArcAddr2+self.nBytesNodeAddress iAddr = iEndArcAddr+self.nBytesNodeAddress return l return [] def _stem1 (self, sWord): "returns stems list of <sWord>" iAddr = 0 for c in sWord: if c not in self.dChar: return [] iAddr = self._lookupArcNode(self.dChar[c], iAddr) if iAddr == None: return [] |
︙ | ︙ | |||
313 314 315 316 317 318 319 | # This value is not a char, this is a stemming code l.append(self.funcStemming(sWord, self.lArcVal[nArc])) iAddr = iEndArcAddr+self.nBytesNodeAddress return l return [] def _lookupArcNode1 (self, nVal, iAddr): | | | 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 | # This value is not a char, this is a stemming code l.append(self.funcStemming(sWord, self.lArcVal[nArc])) iAddr = iEndArcAddr+self.nBytesNodeAddress return l return [] def _lookupArcNode1 (self, nVal, iAddr): "looks if <nVal> is an arc at the node at <iAddr>, if yes, returns address of next node else None" while True: iEndArcAddr = iAddr+self.nBytesArc nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') if nVal == (nRawArc & self._arcMask): # the value we are looking for # we return the address of the next node return int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big') |
︙ | ︙ | |||
357 358 359 360 361 362 363 | iAddr = iEndArcAddr+self.nBytesNodeAddress if (nRawArc & self._lastArcMask) and iAddr < len(self.byDic): hDst.write("\ni{:_>10} -- #{:_>10}\n".format("?", iAddr)) hDst.close() # VERSION 2 def _morph2 (self, sWord): | | | 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 | iAddr = iEndArcAddr+self.nBytesNodeAddress if (nRawArc & self._lastArcMask) and iAddr < len(self.byDic): hDst.write("\ni{:_>10} -- #{:_>10}\n".format("?", iAddr)) hDst.close() # VERSION 2 def _morph2 (self, sWord): "returns morphologies of <sWord>" iAddr = 0 for c in sWord: if c not in self.dChar: return [] iAddr = self._lookupArcNode(self.dChar[c], iAddr) if iAddr == None: return [] |
︙ | ︙ | |||
395 396 397 398 399 400 401 | l.append(sStem + " " + self.lArcVal[nRawArc2 & self._arcMask]) iAddr2 = iEndArcAddr2+self.nBytesNodeAddress if not (nRawArc2 & self._addrBitMask) else iEndArcAddr2 iAddr = iEndArcAddr+self.nBytesNodeAddress if not (nRawArc & self._addrBitMask) else iEndArcAddr return l return [] def _stem2 (self, sWord): | | | 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 | l.append(sStem + " " + self.lArcVal[nRawArc2 & self._arcMask]) iAddr2 = iEndArcAddr2+self.nBytesNodeAddress if not (nRawArc2 & self._addrBitMask) else iEndArcAddr2 iAddr = iEndArcAddr+self.nBytesNodeAddress if not (nRawArc & self._addrBitMask) else iEndArcAddr return l return [] def _stem2 (self, sWord): "returns stems list of <sWord>" iAddr = 0 for c in sWord: if c not in self.dChar: return [] iAddr = self._lookupArcNode(self.dChar[c], iAddr) if iAddr == None: return [] |
︙ | ︙ | |||
427 428 429 430 431 432 433 | nRawArc = int.from_bytes(self.byDic[iAddr2:iAddr2+self.nBytesArc], byteorder='big') iAddr2 += self.nBytesArc + self.nBytesNodeAddress iAddr = iEndArcAddr+self.nBytesNodeAddress if not (nRawArc & self._addrBitMask) else iEndArcAddr return l return [] def _lookupArcNode2 (self, nVal, iAddr): | | | 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 | nRawArc = int.from_bytes(self.byDic[iAddr2:iAddr2+self.nBytesArc], byteorder='big') iAddr2 += self.nBytesArc + self.nBytesNodeAddress iAddr = iEndArcAddr+self.nBytesNodeAddress if not (nRawArc & self._addrBitMask) else iEndArcAddr return l return [] def _lookupArcNode2 (self, nVal, iAddr): "looks if <nVal> is an arc at the node at <iAddr>, if yes, returns address of next node else None" while True: iEndArcAddr = iAddr+self.nBytesArc nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') if nVal == (nRawArc & self._arcMask): # the value we are looking for if not (nRawArc & self._addrBitMask): # we return the address of the next node |
︙ | ︙ | |||
472 473 474 475 476 477 478 | iAddr = iEndArcAddr if (nRawArc & self._lastArcMask): hDst.write("\ni{:_>10} -- #{:_>10}\n".format("?", iAddr)) hDst.close() # VERSION 3 def _morph3 (self, sWord): | | | 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 | iAddr = iEndArcAddr if (nRawArc & self._lastArcMask): hDst.write("\ni{:_>10} -- #{:_>10}\n".format("?", iAddr)) hDst.close() # VERSION 3 def _morph3 (self, sWord): "returns morphologies of <sWord>" iAddr = 0 for c in sWord: if c not in self.dChar: return [] iAddr = self._lookupArcNode(self.dChar[c], iAddr) if iAddr == None: return [] |
︙ | ︙ | |||
507 508 509 510 511 512 513 | l.append(sStem + " " + self.lArcVal[nRawArc2 & self._arcMask]) iAddr2 = iEndArcAddr2+self.nBytesNodeAddress if not (nRawArc2 & self._addrBitMask) else iEndArcAddr2+self.nBytesOffset iAddr = iEndArcAddr+self.nBytesNodeAddress if not (nRawArc & self._addrBitMask) else iEndArcAddr+self.nBytesOffset return l return [] def _stem3 (self, sWord): | | | 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 | l.append(sStem + " " + self.lArcVal[nRawArc2 & self._arcMask]) iAddr2 = iEndArcAddr2+self.nBytesNodeAddress if not (nRawArc2 & self._addrBitMask) else iEndArcAddr2+self.nBytesOffset iAddr = iEndArcAddr+self.nBytesNodeAddress if not (nRawArc & self._addrBitMask) else iEndArcAddr+self.nBytesOffset return l return [] def _stem3 (self, sWord): "returns stems list of <sWord>" iAddr = 0 for c in sWord: if c not in self.dChar: return [] iAddr = self._lookupArcNode(self.dChar[c], iAddr) if iAddr == None: return [] |
︙ | ︙ | |||
531 532 533 534 535 536 537 | # This value is not a char, this is a stemming code l.append(self.funcStemming(sWord, self.lArcVal[nArc])) iAddr = iEndArcAddr+self.nBytesNodeAddress if not (nRawArc & self._addrBitMask) else iEndArcAddr+self.nBytesOffset return l return [] def _lookupArcNode3 (self, nVal, iAddr): | | | 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 | # This value is not a char, this is a stemming code l.append(self.funcStemming(sWord, self.lArcVal[nArc])) iAddr = iEndArcAddr+self.nBytesNodeAddress if not (nRawArc & self._addrBitMask) else iEndArcAddr+self.nBytesOffset return l return [] def _lookupArcNode3 (self, nVal, iAddr): "looks if <nVal> is an arc at the node at <iAddr>, if yes, returns address of next node else None" iAddrNode = iAddr while True: iEndArcAddr = iAddr+self.nBytesArc nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big') if nVal == (nRawArc & self._arcMask): # the value we are looking for if not (nRawArc & self._addrBitMask): |
︙ | ︙ |
Added gc_core/py/keyboard_chars_proximity.py version [d1b0d3e0b7].
# Keyboard chars proximity
#
# For each supported keyboard layout, maps a character to a string made of the
# characters that can be typed by mistake instead of it (see <_dKeyboardMap>).


def getKeyboardMap (sKeyboard):
    """returns the proximity map of the layout <sKeyboard> (case-insensitive)

    The returned dictionary maps a character to a string of neighbouring characters.
    An empty dictionary is returned if <sKeyboard> is unknown or is not a string.
    For a known layout, the returned dictionary is the one stored in <_dKeyboardMap>
    (not a copy): callers should not modify it.
    """
    if not isinstance(sKeyboard, str):
        # None or any other non-string value is handled like an unknown layout
        return {}
    return _dKeyboardMap.get(sKeyboard.lower(), {})


def getKeyboardList ():
    "returns the list of the names of the known keyboard layouts"
    # a new list is built, so that the caller doesn’t get a live view on the private table
    return list(_dKeyboardMap)


# bépo, colemak and dvorak users are assumed to make fewer typing errors.

_dKeyboardMap = {
    "azerty": {
        # fr
        # line 1
        "é": "az",
        "è": "yu",
        "ç": "àio",
        "à": "op",
        # line 2
        "a": "zéq",
        "z": "aesq",
        "e": "zrds",
        "r": "etfd",
        "t": "rygf",
        "y": "tuhg",
        "u": "yijh",
        "i": "uokj",
        "o": "iplk",
        "p": "oml",
        # line 3
        "q": "sawz",
        "s": "qdzwxe",
        "d": "sfexcr",
        "f": "dgrcvt",
        "g": "fhtvby",
        "h": "gjybnu",
        "j": "hkuni",
        "k": "jlio",
        "l": "kmop",
        "m": "lùp",
        "ù": "m",
        # line 4
        "w": "xqs",
        "x": "wcsd",
        "c": "xvdf",
        "v": "cbfg",
        "b": "vngh",
        "n": "bhj",
    },
    "bépo": {
        # fr
        # line 2
        "b": "éa",
        "é": "bpu",
        "p": "éoi",
        "o": "pèe",
        "è": "o",
        "v": "dt",
        "d": "vls",
        "l": "djr",
        "j": "lzn",
        "z": "jmw",
        # line 3
        "a": "ubà",
        "u": "aiéy",
        "i": "uepx",
        "e": "io",
        "c": "t",
        "t": "csvq",
        "s": "trdg",
        "r": "snlh",
        "n": "rmjf",
        "m": "nzç",
        # line 4
        "à": "yêa",
        "y": "àxu",
        "x": "ywi",
        "w": "z",
        "k": "c",
        "q": "gt",
        "g": "qhs",
        "h": "gfr",
        "f": "hçn",
        "ç": "fm",
    },
    "colemak": {
        # en, us, intl
        # line 2
        "q": "wa",
        "w": "qfr",
        "f": "wps",
        "p": "fgt",
        "g": "pjd",
        "j": "glh",
        "l": "jun",
        "u": "lye",
        "y": "ui",
        # line 3
        "a": "rqz",
        "r": "aswx",
        "s": "rtfc",
        "t": "sdpv",
        "d": "thgb",
        "h": "dnjk",
        "n": "helm",
        "e": "niu",
        "i": "eoy",
        "o": "i",
        # line 4
        "z": "xa",
        "x": "zcr",
        "c": "xvs",
        "v": "cbt",
        "b": "vkd",
        "k": "bmh",
        "m": "kn",
    },
    "dvorak": {
        # en, us, intl
        # line 2
        "p": "yu",
        "y": "pfi",
        "f": "ygd",
        "g": "fch",
        "c": "grt",
        "r": "cln",
        "l": "rs",
        # line 3
        "a": "o",
        "o": "aeq",
        "e": "ouj",
        "u": "eipk",
        "i": "udyx",
        "d": "ihfb",
        "h": "dtgm",
        "t": "hncw",
        "n": "tsrv",
        "s": "nlz",
        # line 4
        "q": "jo",
        "j": "qke",
        "k": "jxu",
        "x": "kbi",
        "b": "xmd",
        "m": "bwh",
        "w": "mvt",
        "v": "wzn",
        "z": "vs",
    },
    "qwerty": {
        # en, us, intl
        # line 2
        "q": "wa",
        "w": "qeas",
        "e": "wrds",
        "r": "etfd",
        "t": "rygf",
        "y": "tuhg",
        "u": "yijh",
        "i": "uokj",
        "o": "iplk",
        "p": "ol",
        # line 3
        "a": "sqzw",
        "s": "adwzxe",
        "d": "sfexcr",
        "f": "dgrcvt",
        "g": "fhtvby",
        "h": "gjybnu",
        "j": "hkunmi",
        "k": "jlimo",
        "l": "kop",
        # line 4
        "z": "xas",
        "x": "zcsd",
        "c": "xvdf",
        "v": "cbfg",
        "b": "vngh",
        "n": "bmhj",
        "m": "njk",
    },
    "qwertz": {
        # ge, au
        # line 2
        "q": "wa",
        "w": "qeas",
        "e": "wrds",
        "r": "etfd",
        "t": "rzgf",
        "z": "tuhg",
        "u": "zijh",
        "i": "uokj",
        "o": "iplk",
        "p": "oüöl",
        "ü": "päö",
        # line 3
        "a": "sqyw",
        "s": "adwyxe",
        "d": "sfexcr",
        "f": "dgrcvt",
        "g": "fhtvbz",
        "h": "gjzbnu",
        "j": "hkunmi",
        "k": "jlimo",
        "l": "köop",
        "ö": "läpü",
        "ä": "öü",
        # line 4
        "y": "xas",
        "x": "ycsd",
        "c": "xvdf",
        "v": "cbfg",
        "b": "vngh",
        "n": "bmhj",
        "m": "njk",
    },
}