Overview
Comment: [core] ibdawg: suggestion mechanism > split word function
SHA3-256: 388e8809cf91bc6ada30a3c24d23e8c1
User & Date: olr on 2017-10-25 09:41:56
Context
2017-10-25

11:37  [core] ibdawg: clean words before damerau-levenshtein comparison (check-in: 1329ae8f1c, user: olr, tags: trunk, core)

09:41  [core] ibdawg: suggestion mechanism > split word function (check-in: 388e8809cf, user: olr, tags: trunk, core)

2017-10-24

22:24  [core] ibdawg: suggestion mechanism > exclude some suffixes (ß) (check-in: 5fbb7ec853, user: olr, tags: trunk, core)
Changes
Modified gc_core/js/char_player.js from [56bb998588] to [c0ed55106f].
The hunk keeps the tail of the existing map and adds two prefix sets plus two new helpers, cut() and filterSugg():

        ["on", ["ons", "ont"]],
        ["ON", ["ONS", "ONT"]],
        ["oi", ["ois", "oit", "oix"]],
        ["OI", ["OIS", "OIT", "OIX"]],
    ]),

    // Préfixes et suffixes

    aPfx1: new Set([
        "anti", "archi", "contre", "hyper", "mé", "méta", "im", "in", "ir", "par", "proto",
        "pseudo", "pré", "re", "ré", "sans", "sous", "supra", "sur", "ultra"
    ]),

    aPfx2: new Set([
        "belgo", "franco", "génito", "gynéco", "médico", "russo"
    ]),

    cut: function (sWord) {
        // returns an array of strings (prefix, trimmed_word, suffix)
        let m = /^([a-zA-Zà-öÀ-Ö0-9_ø-ÿØ-ßĀ-ʯﬁ-ﬆ]+)(-(?:t-|)(?:ils?|elles|on|je|tu|nous|vous)$)/.exec(sWord);
        if (m) {
            return ["", m[1], m[2]];
        }
        return ["", sWord, ""];
    },

    // Other functions

    filterSugg: function (aSugg) {
        return aSugg.filter((sSugg) => { return !sSugg.endsWith("è") && !sSugg.endsWith("È"); });
    }
}
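For reference, the split performed by cut() can be sketched in Python with a simplified pattern; the \w character class below stands in for the explicit Unicode ranges above, and the sample words are purely illustrative. Note that cut() always returns an empty prefix for now: the aPfx1/aPfx2 sets are declared but not yet used by it.

import re

# Simplified sketch of char_player.cut(): split off an enclitic pronoun such as
# "-t-il" or "-elle" so the dictionary lookup runs on the bare verb.
_zPronoun = re.compile(r"(?i)^(\w+)(-(?:t-)?(?:ils?|elles?|on|je|tu|nous|vous))$")

def cut(sWord):
    m = _zPronoun.match(sWord)
    if m:
        return ("", m.group(1), m.group(2))
    return ("", sWord, "")

print(cut("donne-t-il"))   # ('', 'donne', '-t-il')
print(cut("vient-elle"))   # ('', 'vient', '-elle')
print(cut("maison"))       # ('', 'maison', '')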
Modified gc_core/js/ibdawg.js from [f9dd570de6] to [c871817c8f].
suggest() now splits the word with char_player.cut() before deriving its edit budgets from the trimmed length:

            }
        }
        return l;
    }

    suggest (sWord, nMaxSugg=10) {
        // returns an array of suggestions for <sWord>
        let sPfx = "";
        let sSfx = "";
        [sPfx, sWord, sSfx] = char_player.cut(sWord);
        let nMaxDel = Math.floor(sWord.length / 5);
        let nMaxHardRepl = Math.max(Math.floor((sWord.length - 5) / 4), 1);
        let aSugg = this._suggest(sWord, nMaxDel, nMaxHardRepl);
        if (sWord.gl_isTitle()) {
            aSugg.gl_update(this._suggest(sWord.toLowerCase(), nMaxDel, nMaxHardRepl));
        }
        else if (sWord.gl_isLowerCase()) {
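The two budgets are small integers. A quick check of the formulas, nMaxDel = floor(len / 5) and nMaxHardRepl = max(floor((len - 5) / 4), 1), for a few illustrative word lengths:

# Edit budgets computed by suggest(): at most one deletion per five characters,
# and always at least one "hard" replacement.
# Python's floor division matches Math.floor(), including the negative
# intermediate value produced by short words.
for sWord in ("ami", "bonjour", "anticonstitutionnellement"):
    nMaxDel = len(sWord) // 5
    nMaxHardRepl = max((len(sWord) - 5) // 4, 1)
    print(sWord, len(sWord), nMaxDel, nMaxHardRepl)
# ami 3 0 1
# bonjour 7 1 1
# anticonstitutionnellement 25 5 5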
At the end of suggest(), the candidates are ranked by Damerau-Levenshtein distance, truncated to nMaxSugg, and the part removed by cut() is re-attached; the recursive _suggest() follows:

        if (sWord.gl_isTitle()) {
            aSugg = aSugg.map((sSugg) => { return sSugg.gl_toCapitalize(); });
        }
        let dDistTemp = new Map();
        aSugg.forEach((sSugg) => { dDistTemp.set(sSugg, char_player.distanceDamerauLevenshtein(sWord, sSugg)); });
        aSugg = aSugg.sort((sA, sB) => { return dDistTemp.get(sA) - dDistTemp.get(sB); }).slice(0, nMaxSugg);
        dDistTemp.clear();
        if (sSfx || sPfx) {
            // we add what we removed
            return aSugg.map( (sSugg) => { return sPfx + sSugg + sSfx } );
        }
        return aSugg;
    }

    _suggest (sRemain, nMaxDel=0, nMaxHardRepl=0, nDeep=0, iAddr=0, sNewWord="", bAvoidLoop=false) {
        // returns a set of suggestions
        // recursive function
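A minimal Python sketch of that final step, with hypothetical distance values standing in for char_player.distanceDamerauLevenshtein():

# Rank, truncate, then re-attach what cut() removed.
# dDist holds made-up distances; the suggestion words are illustrative only.
sPfx, sSfx = "", "-t-il"
aSugg = {"donne", "donné", "données"}
dDist = {"donne": 1, "donné": 2, "données": 3}

nMaxSugg = 2
lSugg = sorted(aSugg, key=lambda sSugg: dDist[sSugg])[:nMaxSugg]
if sSfx or sPfx:
    # we add what we removed
    lSugg = [sPfx + sSugg + sSfx for sSugg in lSugg]
print(lSugg)  # ['donne-t-il', 'donné-t-il']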
Modified gc_core/py/char_player.py from [e5dd8880c3] to [2ac4c0eb20].
The module now imports re for the new word-splitting pattern:

# list of similar chars
# useful for suggestion mechanism

import re


def distanceDamerauLevenshtein (s1, s2):
    "distance of Damerau-Levenshtein between <s1> and <s2>"
    # https://fr.wikipedia.org/wiki/Distance_de_Damerau-Levenshtein
    d = {}
    nLen1 = len(s1)
    nLen2 = len(s2)
After the existing map entries, the Python side gains the same prefix sets, the pronoun pattern, and the cut() and filterSugg() helpers:

    "on": ("ons", "ont"),
    "ON": ("ONS", "ONT"),
    "oi": ("ois", "oit", "oix"),
    "OI": ("OIS", "OIT", "OIX"),
}


# Préfixes et suffixes

aPfx1 = frozenset([
    "anti", "archi", "contre", "hyper", "mé", "méta", "im", "in", "ir", "par", "proto",
    "pseudo", "pré", "re", "ré", "sans", "sous", "supra", "sur", "ultra"
])

aPfx2 = frozenset([
    "belgo", "franco", "génito", "gynéco", "médico", "russo"
])

_zMotAvecPronom = re.compile("(?i)^(\\w+)(-(?:t-|)(?:ils?|elles?|on|je|tu|nous|vous))$")

def cut (sWord):
    "returns a tuple of strings (prefix, trimmed_word, suffix)"
    m = _zMotAvecPronom.search(sWord)
    if m:
        return ("", m.group(1), m.group(2))
    return ("", sWord, "")


# Other functions

def filterSugg (aSugg):
    "exclude suggestions"
    return filter(lambda sSugg: not sSugg.endswith(("è", "È")), aSugg)
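filterSugg() drops candidates ending in "è"/"È" and, in Python 3, returns a lazy filter object that suggest() materializes when sorting. A small self-contained check (the function is inlined here and the sample words are illustrative):

# Same filter as above, inlined so the snippet runs on its own.
def filterSugg(aSugg):
    return filter(lambda sSugg: not sSugg.endswith(("è", "È")), aSugg)

aSugg = {"achète", "achetè", "achevé"}
print(sorted(filterSugg(aSugg)))  # ['achète', 'achevé']; "achetè" is excluded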
Modified gc_core/py/ibdawg.py from [f31919afcb] to [e132c3a736].
The Python suggest() mirrors the JavaScript version: cut the word, compute the edit budgets on the trimmed form, filter and rank the candidates, then re-attach what was removed:

            l.extend(self.morph(sWord.lower()))
        if sWord.isupper() and len(sWord) > 1:
            l.extend(self.morph(sWord.capitalize()))
        return l

    def suggest (self, sWord, nMaxSugg=10):
        "returns a set of suggestions for <sWord>"
        sPfx, sWord, sSfx = cp.cut(sWord)
        nMaxDel = len(sWord) // 5
        nMaxHardRepl = max((len(sWord) - 5) // 4, 1)
        aSugg = self._suggest(sWord, nMaxDel=nMaxDel, nMaxHardRepl=nMaxHardRepl)
        if sWord.istitle():
            aSugg.update(self._suggest(sWord.lower(), nMaxDel=nMaxDel, nMaxHardRepl=nMaxHardRepl))
            aSugg = set(map(lambda sSugg: sSugg.title(), aSugg))
        elif sWord.islower():
            aSugg.update(self._suggest(sWord.title(), nMaxDel=nMaxDel, nMaxHardRepl=nMaxHardRepl))
        if not aSugg:
            #print("crush useless chars")
            aSugg.update(self._suggestWithCrushedUselessChars(cp.clearWord(sWord)))
        aSugg = cp.filterSugg(aSugg)
        aSugg = sorted(aSugg, key=lambda sSugg: cp.distanceDamerauLevenshtein(sWord, sSugg))[:nMaxSugg]
        if sSfx or sPfx:
            # we add what we removed
            return list(map(lambda sSug: sPfx + sSug + sSfx, aSugg))
        return aSugg

    def _suggest (self, sRemain, nMaxDel=0, nMaxHardRepl=0, nDeep=0, iAddr=0, sNewWord="", bAvoidLoop=False):
        "returns a set of suggestions"
        # recursive function
        #show(nDeep, sNewWord + ":" + sRemain)
        aSugg = set()
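One detail worth illustrating is the casing logic: a capitalized input also triggers a lowercase lookup, and the merged suggestions are re-capitalized before ranking. A sketch of just that branch, with a stubbed-out _suggest() (the stub and the sample word are hypothetical):

def _suggest_stub(sWord):
    # stand-in for IBDAWG._suggest(): pretend the dictionary only knows "bonjour"
    return {"bonjour"} if sWord.lower().startswith("bonj") else set()

def suggest_casing(sWord):
    # mirrors the istitle()/islower() branches of suggest() above
    aSugg = _suggest_stub(sWord)
    if sWord.istitle():
        aSugg.update(_suggest_stub(sWord.lower()))
        aSugg = set(map(lambda sSugg: sSugg.title(), aSugg))
    elif sWord.islower():
        aSugg.update(_suggest_stub(sWord.title()))
    return aSugg

print(suggest_casing("Bonjuor"))  # {'Bonjour'}: title case is restored on the results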