Grammalecte  Check-in [b76283a876]

Overview
Comment:[graphpspell] ibdawg: fix dictionary building
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | trunk | graphspell
Files: files | file ages | folders
SHA3-256: b76283a8765751df5e32991b706ad0afcc010f1400b25c19de47addc81100ac5
User & Date: olr on 2020-10-03 19:40:02
Other Links: manifest | tags
Context
2020-10-03
19:59
[fr] mise à jour du dictionnaire check-in: f11700d693 user: olr tags: trunk, fr
19:40
[graphpspell] ibdawg: fix dictionary building check-in: b76283a876 user: olr tags: trunk, graphspell
19:20
[fr] gendicfr: update tags check-in: 80224c9502 user: olr tags: trunk, fr
Changes

Modified graphspell-js/ibdawg.js from [5c5b2cad28] to [280555859f].

177
178
179
180
181
182
183

184
185
186
187
188
189
190
191

192
193
194
195

196
197
198
199
200
201
202
203
204
205
206
207
208
177
178
179
180
181
182
183
184
185
186

187
188
189
190

191
192
193
194

195
196
197
198
199
200

201
202
203
204
205
206
207







+


-




-
+



-
+





-







            https://github.com/mozilla/addons-linter/issues/1361
        */
        /*
            Performance trick:
            Instead of converting bytes to integers each times we parse the binary dictionary,
            we do it once, then parse the array
        */
        this.lByDic = [];
        let nAcc = 0;
        let lBytesBuffer = [];
        let lTemp = [];
        let nDivisor = (this.nBytesArc + this.nBytesNodeAddress) / 2;
        for (let i = 0;  i < this.sByDic.length;  i+=2) {
            lBytesBuffer.push(parseInt(this.sByDic.slice(i, i+2), 16));
            if (nAcc == (this.nBytesArc - 1)) {
                lTemp.push(this._convBytesToInteger(lBytesBuffer));
                this.lByDic.push(this._convBytesToInteger(lBytesBuffer));
                lBytesBuffer = [];
            }
            else if (nAcc == (this.nBytesArc + this.nBytesNodeAddress - 1)) {
                lTemp.push(Math.round(this._convBytesToInteger(lBytesBuffer) / nDivisor));  // Math.round should be useless, BUT with JS who knowns what can happen…
                this.lByDic.push(Math.round(this._convBytesToInteger(lBytesBuffer) / nDivisor));  // Math.round should be useless, BUT with JS who knowns what can happen…
                lBytesBuffer = [];
                nAcc = -1;
            }
            nAcc = nAcc + 1;
        }
        this.byDic = lTemp;
        /* end of bug workaround */

        this._arcMask = (2 ** ((this.nBytesArc * 8) - 3)) - 1;
        this._finalNodeMask = 1 << ((this.nBytesArc * 8) - 1);
        this._lastArcMask = 1 << ((this.nBytesArc * 8) - 2);

        //console.log(this.getInfo());
317
318
319
320
321
322
323
324

325
326
327
328
329
330
331
316
317
318
319
320
321
322

323
324
325
326
327
328
329
330







-
+







                return false;
            }
            iAddr = this._lookupArcNode(this.dChar.get(c), iAddr);
            if (iAddr === null) {
                return false;
            }
        }
        return Boolean(this.byDic[iAddr] & this._finalNodeMask);
        return Boolean(this.lByDic[iAddr] & this._finalNodeMask);
    }

    getMorph (sWord) {
        // retrieves morphologies list, different casing allowed
        if (!sWord) {
            return [];
        }
390
391
392
393
394
395
396
397

398
399
400
401
402
403
404
389
390
391
392
393
394
395

396
397
398
399
400
401
402
403







-
+







            }
        }
    }

    _suggest (oSuggResult, sRemain, nMaxSwitch=0, nMaxDel=0, nMaxHardRepl=0, nMaxJump=0, nDist=0, nDeep=0, iAddr=0, sNewWord="", bAvoidLoop=false) {
        // returns a set of suggestions
        // recursive function
        if (this.byDic[iAddr] & this._finalNodeMask) {
        if (this.lByDic[iAddr] & this._finalNodeMask) {
            if (sRemain == "") {
                oSuggResult.addSugg(sNewWord);
                for (let sTail of this._getTails(iAddr)) {
                    oSuggResult.addSugg(sNewWord+sTail);
                }
                return;
            }
500
501
502
503
504
505
506
507

508
509
510
511
512
513
514
499
500
501
502
503
504
505

506
507
508
509
510
511
512
513







-
+







    }

    _getTails (iAddr, sTail="", n=2) {
        // return a list of suffixes ending at a distance of <n> from <iAddr>
        let aTails = new Set();
        for (let [nVal, jAddr] of this._getArcs(iAddr)) {
            if (nVal <= this.nChar) {
                if (this.byDic[jAddr] & this._finalNodeMask) {
                if (this.lByDic[jAddr] & this._finalNodeMask) {
                    aTails.add(sTail + this.dCharVal.get(nVal));
                }
                if (n && aTails.size == 0) {
                    aTails.gl_update(this._getTails(jAddr, sTail+this.dCharVal.get(nVal), n-1));
                }
            }
        }
572
573
574
575
576
577
578
579

580
581
582
583
584

585
586
587
588
589
590

591
592
593
594

595
596
597
598
599
600
601
571
572
573
574
575
576
577

578
579
580
581
582

583
584
585
586
587
588

589
590
591
592

593
594
595
596
597
598
599
600







-
+




-
+





-
+



-
+







                return [];
            }
            iAddr = this._lookupArcNode(this.dChar.get(c), iAddr);
            if (iAddr === null) {
                return [];
            }
        }
        if (this.byDic[iAddr] & this._finalNodeMask) {
        if (this.lByDic[iAddr] & this._finalNodeMask) {
            let l = [];
            let nRawArc = 0;
            while (!(nRawArc & this._lastArcMask)) {
                let iEndArcAddr = iAddr + 1;
                nRawArc = this.byDic[iAddr];
                nRawArc = this.lByDic[iAddr];
                let nArc = nRawArc & this._arcMask;
                if (nArc > this.nChar) {
                    // This value is not a char, this is a stemming code
                    let sStem = ">" + this.funcStemming(sWord, this.lArcVal[nArc]);
                    // Now , we go to the next node and retrieve all following arcs values, all of them are tags
                    let iAddr2 = this.byDic[iEndArcAddr];
                    let iAddr2 = this.lByDic[iEndArcAddr];
                    let nRawArc2 = 0;
                    while (!(nRawArc2 & this._lastArcMask)) {
                        let iEndArcAddr2 = iAddr2 + 1;
                        nRawArc2 = this.byDic[iAddr2];
                        nRawArc2 = this.lByDic[iAddr2];
                        l.push(sStem + "/" + this.lArcVal[nRawArc2 & this._arcMask]);
                        iAddr2 = iEndArcAddr2 + 1;
                    }
                }
                iAddr = iEndArcAddr + 1;
            }
            return l;
611
612
613
614
615
616
617
618

619
620
621
622
623

624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640

641
642
643
644

645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661


662
663
664
665
666
667
668
669
670
671
672
673
610
611
612
613
614
615
616

617
618
619
620
621

622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638

639
640
641
642

643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658


659
660
661
662
663
664
665
666
667
668
669
670
671
672







-
+




-
+
















-
+



-
+















-
-
+
+












                return [];
            }
            iAddr = this._lookupArcNode(this.dChar.get(c), iAddr);
            if (iAddr === null) {
                return [];
            }
        }
        if (this.byDic[iAddr] & this._finalNodeMask) {
        if (this.lByDic[iAddr] & this._finalNodeMask) {
            let l = [];
            let nRawArc = 0;
            while (!(nRawArc & this._lastArcMask)) {
                let iEndArcAddr = iAddr + 1;
                nRawArc = this.byDic[iAddr];
                nRawArc = this.lByDic[iAddr];
                let nArc = nRawArc & this._arcMask;
                if (nArc > this.nChar) {
                    // This value is not a char, this is a stemming code
                    l.push(this.funcStemming(sWord, this.lArcVal[nArc]));
                }
                iAddr = iEndArcAddr + 1;
            }
            return l;
        }
        return [];
    }

    _lookupArcNode (nVal, iAddr) {
        // looks if nVal is an arc at the node at iAddr, if yes, returns address of next node else None
        while (true) {
            let iEndArcAddr = iAddr+1;
            let nRawArc = this.byDic[iAddr];
            let nRawArc = this.lByDic[iAddr];
            if (nVal == (nRawArc & this._arcMask)) {
                // the value we are looking for
                // we return the address of the next node
                return this.byDic[iEndArcAddr];
                return this.lByDic[iEndArcAddr];
            }
            else {
                // value not found
                if (nRawArc & this._lastArcMask) {
                    return null;
                }
                iAddr = iEndArcAddr + 1;
            }
        }
    }

    * _getArcs (iAddr) {
        // generator: return all arcs at <iAddr> as tuples of (nVal, iAddr)
        while (true) {
            let iEndArcAddr = iAddr+1;
            let nRawArc = this.byDic[iAddr];
            yield [nRawArc & this._arcMask, this.byDic[iEndArcAddr]];
            let nRawArc = this.lByDic[iAddr];
            yield [nRawArc & this._arcMask, this.lByDic[iEndArcAddr]];
            if (nRawArc & this._lastArcMask) {
                break;
            }
            iAddr = iEndArcAddr+1;
        }
    }
}


if (typeof(exports) !== 'undefined') {
    exports.IBDAWG = IBDAWG;
}

Modified graphspell/ibdawg.py from [cae8eca48c] to [2cf8f6a51c].

130
131
132
133
134
135
136
137

138
139
140
141
142

143
144
145

146
147
148
149
150
151
152
153
154
155
156
130
131
132
133
134
135
136

137
138
139
140
141

142
143
144

145
146
147
148

149
150
151
152
153
154
155







-
+




-
+


-
+



-







        self.sFileName = source  if isinstance(source, str)  else "[None]"

        # Performance trick:
        #     Instead of converting bytes to integers each times we parse the binary dictionary,
        #     we do it once, then parse the array
        nAcc = 0
        byBuffer = b""
        lTemp = []
        self.lByDic = []
        nDivisor = (self.nBytesArc + self.nBytesNodeAddress) / 2
        for i in range(0, len(self.byDic)):
            byBuffer += self.byDic[i:i+1]
            if nAcc == (self.nBytesArc - 1):
                lTemp.append(int.from_bytes(byBuffer, byteorder="big"))
                self.lByDic.append(int.from_bytes(byBuffer, byteorder="big"))
                byBuffer = b""
            elif nAcc == (self.nBytesArc + self.nBytesNodeAddress - 1):
                lTemp.append(round(int.from_bytes(byBuffer, byteorder="big") / nDivisor))
                self.lByDic.append(round(int.from_bytes(byBuffer, byteorder="big") / nDivisor))
                byBuffer = b""
                nAcc = -1
            nAcc = nAcc + 1
        self.byDic = lTemp;

        # masks
        self._arcMask = (2 ** ((self.nBytesArc * 8) - 3)) - 1
        self._finalNodeMask = 1 << ((self.nBytesArc * 8) - 1)
        self._lastArcMask = 1 << ((self.nBytesArc * 8) - 2)

        # function to decode the affix/suffix code
298
299
300
301
302
303
304
305

306
307
308
309
310
311
312
297
298
299
300
301
302
303

304
305
306
307
308
309
310
311







-
+







        iAddr = 0
        for c in sWord:
            if c not in self.dChar:
                return False
            iAddr = self._lookupArcNode(self.dChar[c], iAddr)
            if iAddr is None:
                return False
        return bool(self.byDic[iAddr] & self._finalNodeMask)
        return bool(self.lByDic[iAddr] & self._finalNodeMask)

    def getMorph (self, sWord):
        "retrieves morphologies list, different casing allowed"
        if not sWord:
            return []
        sWord = st.spellingNormalization(sWord)
        l = self._morph(sWord)
355
356
357
358
359
360
361
362

363
364
365
366
367
368
369
354
355
356
357
358
359
360

361
362
363
364
365
366
367
368







-
+







                sWord1, sWord2 = sWord.split(cSplitter, 1)
                if self.isValid(sWord1) and self.isValid(sWord2):
                    oSuggResult.addSugg(sWord1+" "+sWord2)

    def _suggest (self, oSuggResult, sRemain, nMaxSwitch=0, nMaxDel=0, nMaxHardRepl=0, nMaxJump=0, nDist=0, nDeep=0, iAddr=0, sNewWord="", bAvoidLoop=False):
        # recursive function
        #logging.info((nDeep * "  ") + sNewWord + ":" + sRemain)
        if self.byDic[iAddr] & self._finalNodeMask:
        if self.lByDic[iAddr] & self._finalNodeMask:
            if not sRemain:
                oSuggResult.addSugg(sNewWord, nDeep)
                for sTail in self._getTails(iAddr):
                    oSuggResult.addSugg(sNewWord+sTail, nDeep)
                return
            if (len(sNewWord) + len(sRemain) == len(oSuggResult.sWord)) and oSuggResult.sWord.lower().startswith(sNewWord.lower()) and self.isValid(sRemain):
                if self.sLangCode == "fr" and sNewWord.lower() in ("l", "d", "n", "m", "t", "s", "c", "j", "qu", "lorsqu", "puisqu", "quoiqu", "jusqu", "quelqu") and sRemain[0:1] in cp.aVowel:
422
423
424
425
426
427
428
429

430
431
432
433
434
435
436
421
422
423
424
425
426
427

428
429
430
431
432
433
434
435







-
+







                yield (self.dCharVal[nVal], jAddr)

    def _getTails (self, iAddr, sTail="", n=2):
        "return a list of suffixes ending at a distance of <n> from <iAddr>"
        aTails = set()
        for nVal, jAddr in self._getArcs(iAddr):
            if nVal <= self.nChar:
                if self.byDic[jAddr] & self._finalNodeMask:
                if self.lByDic[jAddr] & self._finalNodeMask:
                    aTails.add(sTail + self.dCharVal[nVal])
                if n and not aTails:
                    aTails.update(self._getTails(jAddr, sTail+self.dCharVal[nVal], n-1))
        return aTails

    def drawPath (self, sWord, iAddr=0):
        "show the path taken by <sWord> in the graph"
494
495
496
497
498
499
500
501

502
503
504
505
506

507
508
509
510
511
512

513
514
515
516

517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532

533
534
535
536
537

538
539
540
541
542
543
544
545
546
547
548
549
550

551
552
553
554

555
556
557
558
559
560
561
562
563
564
565


566
567
568
569
570
571
572
573
574
575
576

577
578

579
580

581
582

583
584
493
494
495
496
497
498
499

500
501
502
503
504

505
506
507
508
509
510

511
512
513
514

515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530

531
532
533
534
535

536
537
538
539
540
541
542
543
544
545
546
547
548

549
550
551
552

553
554
555
556
557
558
559
560
561
562


563
564
565
566
567
568
569
570
571
572
573
574

575
576

577
578

579
580

581
582
583







-
+




-
+





-
+



-
+















-
+




-
+












-
+



-
+









-
-
+
+










-
+

-
+

-
+

-
+


        iAddr = 0
        for c in sWord:
            if c not in self.dChar:
                return []
            iAddr = self._lookupArcNode(self.dChar[c], iAddr)
            if iAddr is None:
                return []
        if self.byDic[iAddr] & self._finalNodeMask:
        if self.lByDic[iAddr] & self._finalNodeMask:
            l = []
            nRawArc = 0
            while not nRawArc & self._lastArcMask:
                iEndArcAddr = iAddr + 1
                nRawArc = self.byDic[iAddr]
                nRawArc = self.lByDic[iAddr]
                nArc = nRawArc & self._arcMask
                if nArc > self.nChar:
                    # This value is not a char, this is a stemming code
                    sStem = ">" + self.funcStemming(sWord, self.lArcVal[nArc])
                    # Now , we go to the next node and retrieve all following arcs values, all of them are tags
                    iAddr2 = self.byDic[iEndArcAddr]
                    iAddr2 = self.lByDic[iEndArcAddr]
                    nRawArc2 = 0
                    while not nRawArc2 & self._lastArcMask:
                        iEndArcAddr2 = iAddr2 + 1
                        nRawArc2 = self.byDic[iAddr2]
                        nRawArc2 = self.lByDic[iAddr2]
                        l.append(sStem + "/" + self.lArcVal[nRawArc2 & self._arcMask])
                        iAddr2 = iEndArcAddr2 + 1
                iAddr = iEndArcAddr + 1
            return l
        return []

    def _stem (self, sWord):
        "returns stems list of <sWord>"
        iAddr = 0
        for c in sWord:
            if c not in self.dChar:
                return []
            iAddr = self._lookupArcNode(self.dChar[c], iAddr)
            if iAddr is None:
                return []
        if self.byDic[iAddr] & self._finalNodeMask:
        if self.lByDic[iAddr] & self._finalNodeMask:
            l = []
            nRawArc = 0
            while not nRawArc & self._lastArcMask:
                iEndArcAddr = iAddr + 1
                nRawArc = self.byDic[iAddr]
                nRawArc = self.lByDic[iAddr]
                nArc = nRawArc & self._arcMask
                if nArc > self.nChar:
                    # This value is not a char, this is a stemming code
                    l.append(self.funcStemming(sWord, self.lArcVal[nArc]))
                iAddr = iEndArcAddr + 1
            return l
        return []

    def _lookupArcNode (self, nVal, iAddr):
        "looks if <nVal> is an arc at the node at <iAddr>, if yes, returns address of next node else None"
        while True:
            iEndArcAddr = iAddr + 1
            nRawArc = self.byDic[iAddr]
            nRawArc = self.lByDic[iAddr]
            if nVal == (nRawArc & self._arcMask):
                # the value we are looking for
                # we return the address of the next node
                return self.byDic[iEndArcAddr]
                return self.lByDic[iEndArcAddr]
            # value not found
            if nRawArc & self._lastArcMask:
                return None
            iAddr = iEndArcAddr + 1

    def _getArcs (self, iAddr):
        "generator: return all arcs at <iAddr> as tuples of (nVal, iAddr)"
        while True:
            iEndArcAddr = iAddr + 1
            nRawArc = self.byDic[iAddr]
            yield nRawArc & self._arcMask, self.byDic[iEndArcAddr]
            nRawArc = self.lByDic[iAddr]
            yield nRawArc & self._arcMask, self.lByDic[iEndArcAddr]
            if nRawArc & self._lastArcMask:
                break
            iAddr = iEndArcAddr + 1

    def _writeNodes (self, spfDest):
        "for debugging only"
        print(" > Write binary nodes")
        with open(spfDest, 'w', 'utf-8', newline="\n") as hDst:
            iAddr = 0
            hDst.write("i{:_>10} -- #{:_>10}\n".format("0", iAddr))
            while iAddr < len(self.byDic):
            while iAddr < len(self.lByDic):
                iEndArcAddr = iAddr + 1
                nRawArc = self.byDic[iAddr]
                nRawArc = self.lByDic[iAddr]
                nArc = nRawArc & self._arcMask
                hDst.write("  {:<20}  {:0>16}  i{:>10}   #{:_>10}\n".format(self.lArcVal[nArc], bin(nRawArc)[2:], "?", self.byDic[iEndArcAddr]))
                hDst.write("  {:<20}  {:0>16}  i{:>10}   #{:_>10}\n".format(self.lArcVal[nArc], bin(nRawArc)[2:], "?", self.lByDic[iEndArcAddr]))
                iAddr = iEndArcAddr + 1
                if (nRawArc & self._lastArcMask) and iAddr < len(self.byDic):
                if (nRawArc & self._lastArcMask) and iAddr < len(self.lByDic):
                    hDst.write("\ni{:_>10} -- #{:_>10}\n".format("?", iAddr))
            hDst.close()