Index: graphspell-js/dawg.js ================================================================== --- graphspell-js/dawg.js +++ graphspell-js/dawg.js @@ -3,11 +3,11 @@ // FSA DICTIONARY BUILDER // // by Olivier R. // License: MPL 2 // -// This tool encodes lexicon into an indexable binary dictionary +// This tool encodes lexicon into an indexable binary dictionary // Input files MUST be encoded in UTF-8. "use strict"; @@ -39,19 +39,24 @@ case "N": funcStemmingGen = str_transform.noStemming; break; default: throw "Error. Unknown stemming code: " + cStemming; } - + let lEntry = []; let lChar = [''], dChar = new Map(), nChar = 1, dCharOccur = new Map(); let lAff = [], dAff = new Map(), nAff = 0, dAffOccur = new Map(); let lTag = [], dTag = new Map(), nTag = 0, dTagOccur = new Map(); let nErr = 0; - + + this.a2grams = new Set(); + // read lexicon for (let [sFlex, sStem, sTag] of lEntrySrc) { + for (let s2grams of str_transform.getNgrams(sFlex)) { + this.a2grams.add(s2grams); + } addWordToCharDict(sFlex); // chars for (let c of sFlex) { if (!dChar.get(c)) { dChar.set(c, nChar); @@ -82,11 +87,11 @@ } lEntry = [...new Set(lEntry.map(e => JSON.stringify(e)))].map(s => JSON.parse(s)); // Set can’t distinguish similar lists, so we transform list item in string given to the Set // then we transform items in list a new. - + // Preparing DAWG console.log(" > Preparing list of words"); let lVal = lChar.concat(lAff).concat(lTag); let lWord = []; for (let [sFlex, iAff, iTag] of lEntry) { @@ -97,11 +102,11 @@ lTemp.push(iAff+nChar); lTemp.push(iTag+nChar+nAff) lWord.push(lTemp); } lEntry.length = 0; // clear the array - + // Dictionary of arc values occurrency, to sort arcs of each node let lKeyVal = []; for (let c of dChar.keys()) { lKeyVal.push([dChar.get(c), dCharOccur.get(c)]); } for (let sAff of dAff.keys()) { lKeyVal.push([dAff.get(sAff)+nChar, dAffOccur.get(sAff)]); } for (let sTag of dTag.keys()) { lKeyVal.push([dTag.get(sTag)+nChar+nAff, dTagOccur.get(sTag)]); } @@ -131,11 +136,11 @@ } else if (cStemming == "S") { this.funcStemming = str_transform.changeWordWithSuffixCode; } else { this.funcStemming = str_transform.noStemming; } - + // build lWord.sort(); if (xProgressBarNode) { xProgressBarNode.value = 0; xProgressBarNode.max = lWord.length; @@ -220,11 +225,11 @@ this.nArc = 0; for (let oNode of this.dMinimizedNodes.values()) { this.nArc += oNode.arcs.size; } } - + sortNodeArcs (dValOccur) { console.log(" > Sort node arcs"); this.oRoot.sortArcs(dValOccur); for (let oNode of this.dMinimizedNodes.values()) { oNode.sortArcs(dValOccur); @@ -273,10 +278,11 @@ console.log("Affixes: " + this.nAff); console.log("Tags: " + this.nTag); console.log("Arc values: " + this.nArcVal); console.log("Nodes: " + this.nNode); console.log("Arcs: " + this.nArc); + console.log("2grams: " + this.a2grams.size); console.log("Stemming: " + this.cStemming + "FX"); } getArcStats () { let d = new Map(); @@ -394,11 +400,12 @@ "nArcVal": this.nArcVal, "nCompressionMethod": nCompressionMethod, "nBytesArc": this.nBytesArc, "nBytesNodeAddress": this.nBytesNodeAddress, "nBytesOffset": this.nBytesOffset, - "sByDic": sByDic // binary word graph + "sByDic": sByDic, // binary word graph + "l2grams": Array.from(this.a2grams) }; return oJSON; } _getDate () { @@ -486,11 +493,11 @@ convToBytes1 (nBytesArc, nBytesNodeAddress) { /* Node scheme: - Arc length is defined by nBytesArc - Address length is defined by nBytesNodeAddress - + | Arc | Address of next node | | | | /---------------\ /---------------\ /---------------\ /---------------\ /---------------\ /---------------\ | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | \---------------/ \---------------/ \---------------/ \---------------/ \---------------/ \---------------/ Index: graphspell-js/ibdawg.js ================================================================== --- graphspell-js/ibdawg.js +++ graphspell-js/ibdawg.js @@ -46,11 +46,11 @@ this.dSugg.get(nDist).push(sSugg); this.aSugg.add(sSugg); if (nDist < this.nMinDist) { this.nMinDist = nDist; } - this.nDistLimit = Math.min(this.nDistLimit, this.nMinDist+2); + this.nDistLimit = Math.min(this.nDistLimit, this.nMinDist+1); } } } getSuggestions (nSuggLimit=10, nDistLimit=-1) { @@ -137,10 +137,11 @@ throw RangeError("# Error. Unknown dictionary compression method: " + this.nCompressionMethod); } // to get the value of an arc, to get the char of an arc with its value this.dChar = helpers.objectToMap(this.dChar); this.dCharVal = this.dChar.gl_reverse(); + this.a2grams = new Set(this.l2grams); if (this.cStemming == "S") { this.funcStemming = str_transform.changeWordWithSuffixCode; } else if (this.cStemming == "A") { this.funcStemming = str_transform.changeWordWithAffixCode; @@ -212,11 +213,12 @@ "nArcVal": this.nArcVal, "nCompressionMethod": this.nCompressionMethod, "nBytesArc": this.nBytesArc, "nBytesNodeAddress": this.nBytesNodeAddress, "nBytesOffset": this.nBytesOffset, - "sByDic": this.sByDic // binary word graph + "sByDic": this.sByDic, // binary word graph + "l2grams": this.l2grams }; return oJSON; } isValidToken (sToken) { @@ -349,11 +351,11 @@ for (let [cChar, jAddr] of this._getCharArcs(iAddr)) { if (char_player.d1to1.gl_get(cCurrent, cCurrent).indexOf(cChar) != -1) { this._suggest(oSuggResult, sRemain.slice(1), nMaxSwitch, nMaxDel, nMaxHardRepl, nMaxJump, nDist, nDeep+1, jAddr, sNewWord+cChar); } else if (!bAvoidLoop) { - if (nMaxHardRepl) { + if (nMaxHardRepl && this.isNgramsOK(cChar+sRemain.slice(1,2))) { this._suggest(oSuggResult, sRemain.slice(1), nMaxSwitch, nMaxDel, nMaxHardRepl-1, nMaxJump, nDist+1, nDeep+1, jAddr, sNewWord+cChar, true); } if (nMaxJump) { this._suggest(oSuggResult, sRemain, nMaxSwitch, nMaxDel, nMaxHardRepl, nMaxJump-1, nDist+1, nDeep+1, jAddr, sNewWord+cChar, true); } @@ -365,15 +367,15 @@ // same char, we remove 1 char without adding 1 to this._suggest(oSuggResult, sRemain.slice(1), nMaxSwitch, nMaxDel, nMaxHardRepl, nMaxJump, nDist, nDeep+1, iAddr, sNewWord); } else { // switching chars - if (nMaxSwitch > 0) { + if (nMaxSwitch > 0 && this.isNgramsOK(sNewWord.slice(-1)+sRemain.slice(1,2)) && this.isNgramsOK(sRemain.slice(1,2)+sRemain.slice(0,1))) { this._suggest(oSuggResult, sRemain.slice(1, 2)+sRemain.slice(0, 1)+sRemain.slice(2), nMaxSwitch-1, nMaxDel, nMaxHardRepl, nMaxJump, nDist+1, nDeep+1, iAddr, sNewWord, true); } // delete char - if (nMaxDel > 0) { + if (nMaxDel > 0 && this.isNgramsOK(sNewWord.slice(-1)+sRemain.slice(1,2))) { this._suggest(oSuggResult, sRemain.slice(1), nMaxSwitch, nMaxDel-1, nMaxHardRepl, nMaxJump, nDist+1, nDeep+1, iAddr, sNewWord, true); } } // Phonetic replacements for (let sRepl of char_player.get1toXReplacement(sNewWord.slice(-1), cCurrent, sRemain.slice(1,2))) { @@ -395,10 +397,17 @@ this._suggest(oSuggResult, sRepl, nMaxSwitch, nMaxDel, nMaxHardRepl, nMaxJump, nDist, nDeep+1, iAddr, sNewWord, true); } } } } + + isNgramsOK (sChars) { + if (sChars.length != 2) { + return true; + } + return this.a2grams.has(sChars); + } * _getCharArcs (iAddr) { // generator: yield all chars and addresses from node at address for (let [nVal, jAddr] of this._getArcs(iAddr)) { if (nVal <= this.nChar) { Index: graphspell/dawg.py ================================================================== --- graphspell/dawg.py +++ graphspell/dawg.py @@ -61,10 +61,12 @@ lChar = ['']; dChar = {}; nChar = 1; dCharOccur = {} lAff = []; dAff = {}; nAff = 0; dAffOccur = {} lTag = []; dTag = {}; nTag = 0; dTagOccur = {} nErr = 0 + self.a2grams = set() + try: zFilter = re.compile(sSelectFilterRegex) if sSelectFilterRegex else None except: print(" # Error. Wrong filter regex. Filter ignored.") traceback.print_exc() @@ -75,10 +77,11 @@ iterable = readFile(src) else: iterable = src for sFlex, sStem, sTag in iterable: if not zFilter or zFilter.search(sTag): + self.a2grams.update(st.getNgrams(sFlex)) addWordToCharDict(sFlex) # chars for c in sFlex: if c not in dChar: dChar[c] = nChar @@ -283,10 +286,11 @@ print(" * {:<12} {:>16,}".format("Affixes:", self.nAff)) print(" * {:<12} {:>16,}".format("Tags:", self.nTag)) print(" * {:<12} {:>16,}".format("Arc values:", self.nArcVal)) print(" * {:<12} {:>16,}".format("Nodes:", self.nNode)) print(" * {:<12} {:>16,}".format("Arcs:", self.nArc)) + print(" * {:<12} {:>16,}".format("2grams:", len(self.a2grams))) print(" * {:<12} {:>16}".format("Stemming:", self.cStemming + "FX")) def getArcStats (self): "return a string with statistics about nodes and arcs" d = {} @@ -446,11 +450,12 @@ "nBytesNodeAddress": self.nBytesNodeAddress, "nBytesOffset": self.nBytesOffset, # Mozilla’s JS parser don’t like file bigger than 4 Mb! # So, if necessary, we use an hexadecimal string, that we will convert later in Firefox’s extension. # https://github.com/mozilla/addons-linter/issues/1361 - "sByDic": byDic.hex() if bBinaryDictAsHexString else [ e for e in byDic ] + "sByDic": byDic.hex() if bBinaryDictAsHexString else [ e for e in byDic ], + "l2grams": list(self.a2grams) } def writeAsJSObject (self, spfDst, nCompressionMethod, bInJSModule=False, bBinaryDictAsHexString=True): "write a file (JSON or JS module) with all the necessary data" if not spfDst.endswith(".json"): @@ -496,10 +501,13 @@ * a list of strings encoded in binary from utf-8, each value separated with a tabulation - Section Word Graph (nodes / arcs) * A list of nodes which are a list of arcs with an address of the next node. See DawgNode.convToBytes() for details. + + - Section 2grams: + * A list of 2grams (as strings: 2 chars) encoded in binary from utf-8, each value separated with a tabulation """ self._calculateBinary(nCompressionMethod) if not sPathFile.endswith(".bdic"): sPathFile += "."+str(nCompressionMethod)+".bdic" with open(sPathFile, 'wb') as hDst: @@ -513,10 +521,13 @@ hDst.write(sInfo.encode("utf-8")) hDst.write(b"\0\0\0\0") # lArcVal hDst.write("\t".join(self.lArcVal).encode("utf-8")) hDst.write(b"\0\0\0\0") + # 2grams + hDst.write("\t".join(self.a2grams).encode("utf-8")) + hDst.write(b"\0\0\0\0") # DAWG: nodes / arcs if nCompressionMethod == 1: hDst.write(self.oRoot.convToBytes1(self.nBytesArc, self.nBytesNodeAddress)) for oNode in self.lMinimizedNodes: hDst.write(oNode.convToBytes1(self.nBytesArc, self.nBytesNodeAddress)) Index: graphspell/ibdawg.py ================================================================== --- graphspell/ibdawg.py +++ graphspell/ibdawg.py @@ -60,11 +60,11 @@ self.dSugg[nDist] = [] self.dSugg[nDist].append(sSugg) self.aSugg.add(sSugg) if nDist < self.nMinDist: self.nMinDist = nDist - self.nDistLimit = min(self.nDistLimit, self.nMinDist+2) + self.nDistLimit = min(self.nDistLimit, self.nMinDist+1) def getSuggestions (self, nSuggLimit=10): "return a list of suggestions" if self.dSugg[0]: # we sort the better results with the original word @@ -151,21 +151,22 @@ if self.by[0:17] != b"/grammalecte-fsa/": raise TypeError("# Error. Not a grammalecte-fsa binary dictionary. Header: {}".format(self.by[0:9])) if not(self.by[17:18] == b"1" or self.by[17:18] == b"2" or self.by[17:18] == b"3"): raise ValueError("# Error. Unknown dictionary version: {}".format(self.by[17:18])) try: - header, info, values, bdic = self.by.split(b"\0\0\0\0", 3) + byHeader, byInfo, byValues, by2grams, byDic = self.by.split(b"\0\0\0\0", 4) except Exception: raise Exception self.nCompressionMethod = int(self.by[17:18].decode("utf-8")) - self.sHeader = header.decode("utf-8") - self.lArcVal = values.decode("utf-8").split("\t") + self.sHeader = byHeader.decode("utf-8") + self.lArcVal = byValues.decode("utf-8").split("\t") self.nArcVal = len(self.lArcVal) - self.byDic = bdic + self.byDic = byDic + self.a2grams = set(by2grams.decode("utf-8").split("\t")) - l = info.decode("utf-8").split("//") + l = byInfo.decode("utf-8").split("//") self.sLangCode = l.pop(0) self.sLangName = l.pop(0) self.sDicName = l.pop(0) self.sDate = l.pop(0) self.nChar = int(l.pop(0)) @@ -187,10 +188,11 @@ def _initJSON (self, oJSON): "initialize with a JSON text file" self.__dict__.update(oJSON) self.byDic = binascii.unhexlify(self.sByDic) self.dCharVal = { v: k for k, v in self.dChar.items() } + self.a2grams = set(self.l2grams) def getInfo (self): "return string about the IBDAWG" return " Language: {0.sLangName} Lang code: {0.sLangCode} Dictionary name: {0.sDicName}" \ " Compression method: {0.nCompressionMethod:>2} Date: {0.sDate} Stemming: {0.cStemming}FX\n" \ @@ -225,11 +227,12 @@ "nBytesNodeAddress": self.nBytesNodeAddress, "nBytesOffset": self.nBytesOffset, # JavaScript is a pile of shit, so Mozilla’s JS parser don’t like file bigger than 4 Mb! # So, if necessary, we use an hexadecimal string, that we will convert later in Firefox’s extension. # https://github.com/mozilla/addons-linter/issues/1361 - "sByDic": self.byDic.hex() if bBinaryDictAsHexString else [ e for e in self.byDic ] + "sByDic": self.byDic.hex() if bBinaryDictAsHexString else [ e for e in self.byDic ], + "l2grams": list(self.a2grams) }, ensure_ascii=False)) if bInJSModule: hDst.write(";\n\nexports.dictionary = dictionary;\n") def isValidToken (self, sToken): @@ -321,11 +324,11 @@ cCurrent = sRemain[0:1] for cChar, jAddr in self._getCharArcs(iAddr): if cChar in cp.d1to1.get(cCurrent, cCurrent): self._suggest(oSuggResult, sRemain[1:], nMaxSwitch, nMaxDel, nMaxHardRepl, nMaxJump, nDist, nDeep+1, jAddr, sNewWord+cChar) elif not bAvoidLoop: - if nMaxHardRepl: + if nMaxHardRepl and self.isNgramsOK(cChar+sRemain[1:2]): self._suggest(oSuggResult, sRemain[1:], nMaxSwitch, nMaxDel, nMaxHardRepl-1, nMaxJump, nDist+1, nDeep+1, jAddr, sNewWord+cChar, True) if nMaxJump: self._suggest(oSuggResult, sRemain, nMaxSwitch, nMaxDel, nMaxHardRepl, nMaxJump-1, nDist+1, nDeep+1, jAddr, sNewWord+cChar, True) if not bAvoidLoop: # avoid infinite loop if len(sRemain) > 1: @@ -332,14 +335,14 @@ if cCurrent == sRemain[1:2]: # same char, we remove 1 char without adding 1 to self._suggest(oSuggResult, sRemain[1:], nMaxSwitch, nMaxDel, nMaxHardRepl, nMaxJump, nDist, nDeep+1, iAddr, sNewWord) else: # switching chars - if nMaxSwitch: + if nMaxSwitch and self.isNgramsOK(sNewWord[-1:]+sRemain[1:2]) and self.isNgramsOK(sRemain[1:2]+sRemain[0:1]): self._suggest(oSuggResult, sRemain[1:2]+sRemain[0:1]+sRemain[2:], nMaxSwitch-1, nMaxDel, nMaxHardRepl, nMaxJump, nDist+1, nDeep+1, iAddr, sNewWord, True) # delete char - if nMaxDel: + if nMaxDel and self.isNgramsOK(sNewWord[-1:]+sRemain[1:2]): self._suggest(oSuggResult, sRemain[1:], nMaxSwitch, nMaxDel-1, nMaxHardRepl, nMaxJump, nDist+1, nDeep+1, iAddr, sNewWord, True) # Phonetic replacements for sRepl in cp.get1toXReplacement(sNewWord[-1:], cCurrent, sRemain[1:2]): self._suggest(oSuggResult, sRepl + sRemain[1:], nMaxSwitch, nMaxDel, nMaxHardRepl, nMaxJump, nDist, nDeep+1, iAddr, sNewWord, True) for sRepl in cp.d2toX.get(sRemain[0:2], ()): @@ -351,10 +354,15 @@ elif len(sRemain) == 1: self._suggest(oSuggResult, "", nMaxSwitch, nMaxDel, nMaxHardRepl, nMaxJump, nDist, nDeep+1, iAddr, sNewWord, True) # remove last char and go on for sRepl in cp.dFinal1.get(sRemain, ()): self._suggest(oSuggResult, sRepl, nMaxSwitch, nMaxDel, nMaxHardRepl, nMaxJump, nDist, nDeep+1, iAddr, sNewWord, True) + def isNgramsOK (self, sChars): + if len(sChars) != 2: + return True + return sChars in self.a2grams + #@timethis def suggest2 (self, sWord, nSuggLimit=10): "returns a set of suggestions for " sWord = cp.spellingNormalization(sWord) sPfx, sWord, sSfx = cp.cut(sWord)