Index: graphspell-js/char_player.js
==================================================================
--- graphspell-js/char_player.js
+++ graphspell-js/char_player.js
@@ -7,22 +7,10 @@
 ${map}
 
 
 var char_player = {
 
-    _xTransCharsForSpelling: new Map([
-        ['ſ', 's'], ['ﬃ', 'ffi'], ['ﬄ', 'ffl'], ['ﬀ', 'ff'], ['ﬅ', 'ft'], ['ﬁ', 'fi'], ['ﬂ', 'fl'], ['ﬆ', 'st']
-    ]),
-
-    spellingNormalization: function (sWord) {
-        let sNewWord = "";
-        for (let c of sWord) {
-            sNewWord += this._xTransCharsForSpelling.gl_get(c, c);
-        }
-        return sNewWord.normalize("NFC");
-    },
-
     oDistanceBetweenChars: {
         "a": {},
         "e": {"é": 0.5},
         "é": {"e": 0.5},
         "i": {"y": 0.2},

Index: graphspell-js/ibdawg.js
==================================================================
--- graphspell-js/ibdawg.js
+++ graphspell-js/ibdawg.js
@@ -236,11 +236,11 @@
         return oJSON;
     }
 
     isValidToken (sToken) {
         // checks if sToken is valid (if there is hyphens in sToken, sToken is split, each part is checked)
-        sToken = char_player.spellingNormalization(sToken);
+        sToken = str_transform.spellingNormalization(sToken);
         if (this.isValid(sToken)) {
             return true;
         }
         if (sToken.includes("-")) {
             if (sToken.gl_count("-") > 4) {
@@ -309,11 +309,11 @@
         return Boolean(this._convBytesToInteger(this.byDic.slice(iAddr, iAddr+this.nBytesArc)) & this._finalNodeMask);
     }
 
     getMorph (sWord) {
         // retrieves morphologies list, different casing allowed
-        sWord = char_player.spellingNormalization(sWord);
+        sWord = str_transform.spellingNormalization(sWord);
         let l = this.morph(sWord);
         if (sWord[0].gl_isUpperCase()) {
             l.push(...this.morph(sWord.toLowerCase()));
             if (sWord.gl_isUpperCase() && sWord.length > 1) {
                 l.push(...this.morph(sWord.gl_toCapitalize()));
@@ -323,11 +323,11 @@
     }
 
     suggest (sWord, nSuggLimit=10, bSplitTrailingNumbers=false) {
         // returns a array of suggestions for <sWord>
         //console.time("Suggestions for " + sWord);
-        sWord = char_player.spellingNormalization(sWord);
+        sWord = str_transform.spellingNormalization(sWord);
         let sPfx = "";
         let sSfx = "";
         [sPfx, sWord, sSfx] = char_player.cut(sWord);
         let nMaxSwitch = Math.max(Math.floor(sWord.length / 3), 1);
         let nMaxDel = Math.floor(sWord.length / 5);

Index: graphspell-js/str_transform.js
==================================================================
--- graphspell-js/str_transform.js
+++ graphspell-js/str_transform.js
@@ -22,10 +22,22 @@
         for (let i=0; i <= sWord.length - n; i++) {
             lNgrams.push(sWord.slice(i, i+n));
         }
         return lNgrams;
     },
+
+    _xTransCharsForSpelling: new Map([
+        ['ſ', 's'], ['ﬃ', 'ffi'], ['ﬄ', 'ffl'], ['ﬀ', 'ff'], ['ﬅ', 'ft'], ['ﬁ', 'fi'], ['ﬂ', 'fl'], ['ﬆ', 'st']
+    ]),
+
+    spellingNormalization: function (sWord) {
+        let sNewWord = "";
+        for (let c of sWord) {
+            sNewWord += this._xTransCharsForSpelling.gl_get(c, c);
+        }
+        return sNewWord.normalize("NFC");
+    },
 
     longestCommonSubstring: function (string1, string2) {
         // https://en.wikibooks.org/wiki/Algorithm_Implementation/Strings/Longest_common_substring
         // untested

Index: graphspell/char_player.py
==================================================================
--- graphspell/char_player.py
+++ graphspell/char_player.py
@@ -2,20 +2,10 @@
 List of similar chars
 useful for suggestion mechanism
 """
 
 import re
-import unicodedata
-
-
-_xTransCharsForSpelling = str.maketrans({
-    'ſ': 's', 'ﬃ': 'ffi', 'ﬄ': 'ffl', 'ﬀ': 'ff', 'ﬅ': 'ft', 'ﬁ': 'fi', 'ﬂ': 'fl', 'ﬆ': 'st'
-})
-
-def spellingNormalization (sWord):
-    "nomalization NFC and removing ligatures"
-    return unicodedata.normalize("NFC", sWord.translate(_xTransCharsForSpelling))
 
 
 dDistanceBetweenChars = {
     "a": {},
     "e": {"é": 0.5},

Index: graphspell/ibdawg.py
==================================================================
--- graphspell/ibdawg.py
+++ graphspell/ibdawg.py
@@ -246,11 +246,11 @@
         if bInJSModule:
             hDst.write(";\n\nexports.dictionary = dictionary;\n")
 
     def isValidToken (self, sToken):
         "checks if <sToken> is valid (if there is hyphens in <sToken>, <sToken> is split, each part is checked)"
-        sToken = cp.spellingNormalization(sToken)
+        sToken = st.spellingNormalization(sToken)
         if self.isValid(sToken):
             return True
         if "-" in sToken:
             if sToken.count("-") > 4:
                 return True
@@ -290,11 +290,11 @@
             return False
         return bool(int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask)
 
     def getMorph (self, sWord):
         "retrieves morphologies list, different casing allowed"
-        sWord = cp.spellingNormalization(sWord)
+        sWord = st.spellingNormalization(sWord)
         l = self.morph(sWord)
         if sWord[0:1].isupper():
             l.extend(self.morph(sWord.lower()))
             if sWord.isupper() and len(sWord) > 1:
                 l.extend(self.morph(sWord.capitalize()))
@@ -302,11 +302,11 @@
 
     #@timethis
     def suggest (self, sWord, nSuggLimit=10, bSplitTrailingNumbers=False):
         "returns a set of suggestions for <sWord>"
         sWord = sWord.rstrip(".")  # useful for LibreOffice
-        sWord = cp.spellingNormalization(sWord)
+        sWord = st.spellingNormalization(sWord)
         sPfx, sWord, sSfx = cp.cut(sWord)
         nMaxSwitch = max(len(sWord) // 3, 1)
         nMaxDel = len(sWord) // 5
         nMaxHardRepl = max((len(sWord) - 5) // 4, 1)
         nMaxJump = max(len(sWord) // 4, 1)
@@ -410,11 +410,11 @@
                 aTails.update(self._getTails(jAddr, sTail+self.dCharVal[nVal], n-1))
         return aTails
 
     def drawPath (self, sWord, iAddr=0):
         "show the path taken by <sWord> in the graph"
-        sWord = cp.spellingNormalization(sWord)
+        sWord = st.spellingNormalization(sWord)
         c1 = sWord[0:1] if sWord else " "
         iPos = -1
         n = 0
         echo(c1 + ": ", end="")
         for c2, jAddr in self._getCharArcs(iAddr):

Index: graphspell/str_transform.py
==================================================================
--- graphspell/str_transform.py
+++ graphspell/str_transform.py
@@ -1,20 +1,34 @@
 """
 Operations on strings:
 - calculate distance between two strings
 - transform strings with transformation codes
 """
+
+import unicodedata
 
 from .char_player import distanceBetweenChars
 
 
-#### Ngrams
+#### N-GRAMS
 
 def getNgrams (sWord, n=2):
     "return a list of Ngrams strings"
     return [ sWord[i:i+n] for i in range(len(sWord)-n+1) ]
+
+
+#### WORD NORMALIZATION
+
+_xTransCharsForSpelling = str.maketrans({
+    'ſ': 's', 'ﬃ': 'ffi', 'ﬄ': 'ffl', 'ﬀ': 'ff', 'ﬅ': 'ft', 'ﬁ': 'fi', 'ﬂ': 'fl', 'ﬆ': 'st'
+})
+
+def spellingNormalization (sWord):
+    "normalization NFC and removing ligatures"
+    return unicodedata.normalize("NFC", sWord.translate(_xTransCharsForSpelling))
+
 
 
 #### DISTANCE CALCULATIONS
 
 def longestCommonSubstring (s1, s2):
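
Note (not part of the patch): the sketch below reproduces the relocated Python helper in isolation, as a quick way to see what spellingNormalization does: expand typographic ligatures through a translation table, then recompose the string to NFC. The sample inputs and the __main__ block are illustrative only. The table keys are single ligature code points (e.g. 'ﬃ' is U+FB03), which is required because str.maketrans() rejects string keys longer than one character and the JS version walks the word code point by code point.

    import unicodedata

    # Same table as graphspell/str_transform.py above: map the long s and the
    # Unicode ligature characters to their plain-letter equivalents.
    _xTransCharsForSpelling = str.maketrans({
        'ſ': 's', 'ﬃ': 'ffi', 'ﬄ': 'ffl', 'ﬀ': 'ff', 'ﬅ': 'ft', 'ﬁ': 'fi', 'ﬂ': 'fl', 'ﬆ': 'st'
    })

    def spellingNormalization (sWord):
        "NFC normalization and removal of ligatures"
        return unicodedata.normalize("NFC", sWord.translate(_xTransCharsForSpelling))

    if __name__ == "__main__":
        # 'oﬃce' contains the single character U+FB03 (LATIN SMALL LIGATURE FFI)
        print(spellingNormalization("oﬃce"))             # -> office
        # a decomposed accent (e + U+0301) is recomposed by NFC
        print(spellingNormalization("caf" + "e\u0301"))   # -> café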