Grammalecte  Check-in [ae767aaff5]

Overview
Comment:[graphspell][py] ibdawg optimization: precalculate bytes in binary dictionary
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | graphspell | bdic_opt
Files: files | file ages | folders
SHA3-256: ae767aaff504b1b6ff830c62014977875f6db31a8bed7f656ee18f61e0915ecb
User & Date: olr on 2020-09-11 17:22:31
Other Links: branch diff | manifest | tags
Context
2020-09-11
19:20
merge trunk check-in: 43afb8b856 user: olr tags: bdic_opt
17:22
[graphspell][py] ibdawg optimization: precalculate bytes in binary dictionary check-in: ae767aaff5 user: olr tags: graphspell, bdic_opt
15:53
[graphspell][js] ibdawg optimization: precalculate bytes in binary dictionary check-in: 443f28094b user: olr tags: graphspell, bdic_opt
Changes

Modified gc_lang/fr/perf_memo.text from [6a0d81df00] to [ad156793c1].

26
27
28
29
30
31
32
33
34

0.6.2       2018.02.19 19:06    5.51302     1.29359     0.874157    0.260415    0.271596    0.290641    0.684754    0.376905    0.0815201   0.00919633  (spelling normalization)
1.0         2018.11.23 10:59    2.88577     0.702486    0.485648    0.139897    0.14079     0.148125    0.348751    0.201061    0.0360297   0.0043535   (x2, with new GC engine)
1.1         2019.05.16 09:42    1.50743     0.360923    0.261113    0.0749272   0.0763827   0.0771537   0.180504    0.102942    0.0182762   0.0021925   (×2, but new processor: AMD Ryzen 7 2700X)
1.2.1       2019.08.06 20:57    1.42886     0.358425    0.247356    0.0704405   0.0754886   0.0765604   0.177197    0.0988517   0.0188103   0.0020243
1.6.0       2020.01.03 20:22    1.38847     0.346214    0.240242    0.0709539   0.0737499   0.0748733   0.176477    0.0969171   0.0187857   0.0025143   (nouveau dictionnaire avec lemmes masculins)
1.9.0       2020.04.20 19:57    1.51183     0.369546    0.25681     0.0734314   0.0764396   0.0785668   0.183922    0.103674    0.0185812   0.002099    (NFC normalization)
1.9.2       2020.05.12 08:43    1.62465     0.398831    0.273012    0.0810811   0.080937    0.0845885   0.204133    0.114146    0.0212864   0.0029547
1.12.2      2020.09.09 13:34    1.50568     0.374504    0.233108    0.0798712   0.0804466   0.0769674   0.171519    0.0945132   0.0165344   0.0019474   
1.12.2      2020.09.09 13:35    1.41094     0.359093    0.236443    0.06968     0.0734418   0.0738087   0.169371    0.0946279   0.0167106   0.0019773   








|
|
>
26
27
28
29
30
31
32
33
34
35
0.6.2       2018.02.19 19:06    5.51302     1.29359     0.874157    0.260415    0.271596    0.290641    0.684754    0.376905    0.0815201   0.00919633  (spelling normalization)
1.0         2018.11.23 10:59    2.88577     0.702486    0.485648    0.139897    0.14079     0.148125    0.348751    0.201061    0.0360297   0.0043535   (x2, with new GC engine)
1.1         2019.05.16 09:42    1.50743     0.360923    0.261113    0.0749272   0.0763827   0.0771537   0.180504    0.102942    0.0182762   0.0021925   (×2, but new processor: AMD Ryzen 7 2700X)
1.2.1       2019.08.06 20:57    1.42886     0.358425    0.247356    0.0704405   0.0754886   0.0765604   0.177197    0.0988517   0.0188103   0.0020243
1.6.0       2020.01.03 20:22    1.38847     0.346214    0.240242    0.0709539   0.0737499   0.0748733   0.176477    0.0969171   0.0187857   0.0025143   (nouveau dictionnaire avec lemmes masculins)
1.9.0       2020.04.20 19:57    1.51183     0.369546    0.25681     0.0734314   0.0764396   0.0785668   0.183922    0.103674    0.0185812   0.002099    (NFC normalization)
1.9.2       2020.05.12 08:43    1.62465     0.398831    0.273012    0.0810811   0.080937    0.0845885   0.204133    0.114146    0.0212864   0.0029547
1.12.2      2020.09.09 13:34    1.50568     0.374504    0.233108    0.0798712   0.0804466   0.0769674   0.171519    0.0945132   0.0165344   0.0019474
1.12.2      2020.09.09 13:35    1.41094     0.359093    0.236443    0.06968     0.0734418   0.0738087   0.169371    0.0946279   0.0167106   0.0019773
1.12.2      2020.09.11 19:16    1.35297     0.330545    0.221731    0.0666998   0.0692539   0.0701707   0.160564    0.0891676   0.015807    0.0045998

Modified graphspell/ibdawg.py from [d16ed0d683] to [0fe5cbd03f].

116
117
118
119
120
121
122




















123
124
125
126
127
128
129
            else:
                raise OSError("# Error. Unknown file type: "+source)
        else:
            self._initJSON(source)

        self.sFileName = source  if isinstance(source, str)  else "[None]"





















        self._arcMask = (2 ** ((self.nBytesArc * 8) - 3)) - 1
        self._finalNodeMask = 1 << ((self.nBytesArc * 8) - 1)
        self._lastArcMask = 1 << ((self.nBytesArc * 8) - 2)
        self._addrBitMask = 1 << ((self.nBytesArc * 8) - 3)  # version 2

        # function to decode the affix/suffix code
        if self.cStemming == "S":







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
            else:
                raise OSError("# Error. Unknown file type: "+source)
        else:
            self._initJSON(source)

        self.sFileName = source  if isinstance(source, str)  else "[None]"

        # Performance trick:
        #     Instead of converting bytes to integers each times we parse the binary dictionary,
        #     we do it once, then parse the array
        nAcc = 0
        byBuffer = b""
        lTemp = []
        nDivisor = (self.nBytesArc + self.nBytesNodeAddress) / 2
        for i in range(0, len(self.byDic)):
            byBuffer += self.byDic[i:i+1]
            if nAcc == (self.nBytesArc - 1):
                lTemp.append(int.from_bytes(byBuffer, byteorder="big"))
                byBuffer = b""
            elif nAcc == (self.nBytesArc + self.nBytesNodeAddress - 1):
                lTemp.append(round(int.from_bytes(byBuffer, byteorder="big") / nDivisor))
                byBuffer = b""
                nAcc = -1
            nAcc = nAcc + 1
        self.byDic = lTemp;

        # masks
        self._arcMask = (2 ** ((self.nBytesArc * 8) - 3)) - 1
        self._finalNodeMask = 1 << ((self.nBytesArc * 8) - 1)
        self._lastArcMask = 1 << ((self.nBytesArc * 8) - 2)
        self._addrBitMask = 1 << ((self.nBytesArc * 8) - 3)  # version 2

        # function to decode the affix/suffix code
        if self.cStemming == "S":
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
        iAddr = 0
        for c in sWord:
            if c not in self.dChar:
                return False
            iAddr = self._lookupArcNode(self.dChar[c], iAddr)
            if iAddr is None:
                return False
        return bool(int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask)

    def getMorph (self, sWord):
        "retrieves morphologies list, different casing allowed"
        if not sWord:
            return []
        sWord = st.spellingNormalization(sWord)
        l = self.morph(sWord)







|







316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
        iAddr = 0
        for c in sWord:
            if c not in self.dChar:
                return False
            iAddr = self._lookupArcNode(self.dChar[c], iAddr)
            if iAddr is None:
                return False
        return bool(self.byDic[iAddr] & self._finalNodeMask)

    def getMorph (self, sWord):
        "retrieves morphologies list, different casing allowed"
        if not sWord:
            return []
        sWord = st.spellingNormalization(sWord)
        l = self.morph(sWord)
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
                sWord1, sWord2 = sWord.split(cSplitter, 1)
                if self.isValid(sWord1) and self.isValid(sWord2):
                    oSuggResult.addSugg(sWord1+" "+sWord2)

    def _suggest (self, oSuggResult, sRemain, nMaxSwitch=0, nMaxDel=0, nMaxHardRepl=0, nMaxJump=0, nDist=0, nDeep=0, iAddr=0, sNewWord="", bAvoidLoop=False):
        # recursive function
        #logging.info((nDeep * "  ") + sNewWord + ":" + sRemain)
        if int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask:
            if not sRemain:
                oSuggResult.addSugg(sNewWord, nDeep)
                for sTail in self._getTails(iAddr):
                    oSuggResult.addSugg(sNewWord+sTail, nDeep)
                return
            if (len(sNewWord) + len(sRemain) == len(oSuggResult.sWord)) and oSuggResult.sWord.lower().startswith(sNewWord.lower()) and self.isValid(sRemain):
                if self.sLangCode == "fr" and sNewWord.lower() in ("l", "d", "n", "m", "t", "s", "c", "j", "qu", "lorsqu", "puisqu", "quoiqu", "jusqu", "quelqu") and sRemain[0:1] in cp.aVowel:







|







372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
                sWord1, sWord2 = sWord.split(cSplitter, 1)
                if self.isValid(sWord1) and self.isValid(sWord2):
                    oSuggResult.addSugg(sWord1+" "+sWord2)

    def _suggest (self, oSuggResult, sRemain, nMaxSwitch=0, nMaxDel=0, nMaxHardRepl=0, nMaxJump=0, nDist=0, nDeep=0, iAddr=0, sNewWord="", bAvoidLoop=False):
        # recursive function
        #logging.info((nDeep * "  ") + sNewWord + ":" + sRemain)
        if self.byDic[iAddr] & self._finalNodeMask:
            if not sRemain:
                oSuggResult.addSugg(sNewWord, nDeep)
                for sTail in self._getTails(iAddr):
                    oSuggResult.addSugg(sNewWord+sTail, nDeep)
                return
            if (len(sNewWord) + len(sRemain) == len(oSuggResult.sWord)) and oSuggResult.sWord.lower().startswith(sNewWord.lower()) and self.isValid(sRemain):
                if self.sLangCode == "fr" and sNewWord.lower() in ("l", "d", "n", "m", "t", "s", "c", "j", "qu", "lorsqu", "puisqu", "quoiqu", "jusqu", "quelqu") and sRemain[0:1] in cp.aVowel:
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
                yield (self.dCharVal[nVal], jAddr)

    def _getTails (self, iAddr, sTail="", n=2):
        "return a list of suffixes ending at a distance of <n> from <iAddr>"
        aTails = set()
        for nVal, jAddr in self._getArcs(iAddr):
            if nVal <= self.nChar:
                if int.from_bytes(self.byDic[jAddr:jAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask:
                    aTails.add(sTail + self.dCharVal[nVal])
                if n and not aTails:
                    aTails.update(self._getTails(jAddr, sTail+self.dCharVal[nVal], n-1))
        return aTails

    def drawPath (self, sWord, iAddr=0):
        "show the path taken by <sWord> in the graph"







|







439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
                yield (self.dCharVal[nVal], jAddr)

    def _getTails (self, iAddr, sTail="", n=2):
        "return a list of suffixes ending at a distance of <n> from <iAddr>"
        aTails = set()
        for nVal, jAddr in self._getArcs(iAddr):
            if nVal <= self.nChar:
                if self.byDic[jAddr] & self._finalNodeMask:
                    aTails.add(sTail + self.dCharVal[nVal])
                if n and not aTails:
                    aTails.update(self._getTails(jAddr, sTail+self.dCharVal[nVal], n-1))
        return aTails

    def drawPath (self, sWord, iAddr=0):
        "show the path taken by <sWord> in the graph"
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
        iAddr = 0
        for c in sWord:
            if c not in self.dChar:
                return []
            iAddr = self._lookupArcNode(self.dChar[c], iAddr)
            if iAddr is None:
                return []
        if int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask:
            l = []
            nRawArc = 0
            while not nRawArc & self._lastArcMask:
                iEndArcAddr = iAddr + self.nBytesArc
                nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big')
                nArc = nRawArc & self._arcMask
                if nArc > self.nChar:
                    # This value is not a char, this is a stemming code
                    sStem = ">" + self.funcStemming(sWord, self.lArcVal[nArc])
                    # Now , we go to the next node and retrieve all following arcs values, all of them are tags
                    iAddr2 = int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big')
                    nRawArc2 = 0
                    while not nRawArc2 & self._lastArcMask:
                        iEndArcAddr2 = iAddr2 + self.nBytesArc
                        nRawArc2 = int.from_bytes(self.byDic[iAddr2:iEndArcAddr2], byteorder='big')
                        l.append(sStem + "/" + self.lArcVal[nRawArc2 & self._arcMask])
                        iAddr2 = iEndArcAddr2+self.nBytesNodeAddress
                iAddr = iEndArcAddr+self.nBytesNodeAddress
            return l
        return []

    def _stem1 (self, sWord):
        "returns stems list of <sWord>"
        iAddr = 0
        for c in sWord:
            if c not in self.dChar:
                return []
            iAddr = self._lookupArcNode(self.dChar[c], iAddr)
            if iAddr is None:
                return []
        if int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask:
            l = []
            nRawArc = 0
            while not nRawArc & self._lastArcMask:
                iEndArcAddr = iAddr + self.nBytesArc
                nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big')
                nArc = nRawArc & self._arcMask
                if nArc > self.nChar:
                    # This value is not a char, this is a stemming code
                    l.append(self.funcStemming(sWord, self.lArcVal[nArc]))
                iAddr = iEndArcAddr+self.nBytesNodeAddress
            return l
        return []

    def _lookupArcNode1 (self, nVal, iAddr):
        "looks if <nVal> is an arc at the node at <iAddr>, if yes, returns address of next node else None"
        while True:
            iEndArcAddr = iAddr+self.nBytesArc
            nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big')
            if nVal == (nRawArc & self._arcMask):
                # the value we are looking for
                # we return the address of the next node
                return int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big')
            # value not found
            if nRawArc & self._lastArcMask:
                return None
            iAddr = iEndArcAddr+self.nBytesNodeAddress

    def _getArcs1 (self, iAddr):
        "generator: return all arcs at <iAddr> as tuples of (nVal, iAddr)"
        while True:
            iEndArcAddr = iAddr+self.nBytesArc
            nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big')
            yield nRawArc & self._arcMask, int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big')
            if nRawArc & self._lastArcMask:
                break
            iAddr = iEndArcAddr+self.nBytesNodeAddress

    def _writeNodes1 (self, spfDest):
        "for debugging only"
        print(" > Write binary nodes")
        with open(spfDest, 'w', 'utf-8', newline="\n") as hDst:
            iAddr = 0
            hDst.write("i{:_>10} -- #{:_>10}\n".format("0", iAddr))
            while iAddr < len(self.byDic):
                iEndArcAddr = iAddr+self.nBytesArc
                nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big')
                nArc = nRawArc & self._arcMask
                hDst.write("  {:<20}  {:0>16}  i{:>10}   #{:_>10}\n".format(self.lArcVal[nArc], bin(nRawArc)[2:], "?", \
                                                                            int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], \
                                                                                           byteorder='big')))
                iAddr = iEndArcAddr+self.nBytesNodeAddress
                if (nRawArc & self._lastArcMask) and iAddr < len(self.byDic):
                    hDst.write("\ni{:_>10} -- #{:_>10}\n".format("?", iAddr))
            hDst.close()

    # VERSION 2
    def _morph2 (self, sWord):
        "returns morphologies of <sWord>"







|



|
|





|


|
|

|
|












|



|
|




|






|
|



|



|




|
|
|


|








|
|

|
<
<
|







515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601


602
603
604
605
606
607
608
609
        iAddr = 0
        for c in sWord:
            if c not in self.dChar:
                return []
            iAddr = self._lookupArcNode(self.dChar[c], iAddr)
            if iAddr is None:
                return []
        if self.byDic[iAddr] & self._finalNodeMask:
            l = []
            nRawArc = 0
            while not nRawArc & self._lastArcMask:
                iEndArcAddr = iAddr + 1
                nRawArc = self.byDic[iAddr]
                nArc = nRawArc & self._arcMask
                if nArc > self.nChar:
                    # This value is not a char, this is a stemming code
                    sStem = ">" + self.funcStemming(sWord, self.lArcVal[nArc])
                    # Now , we go to the next node and retrieve all following arcs values, all of them are tags
                    iAddr2 = self.byDic[iEndArcAddr]
                    nRawArc2 = 0
                    while not nRawArc2 & self._lastArcMask:
                        iEndArcAddr2 = iAddr2 + 1
                        nRawArc2 = self.byDic[iAddr2]
                        l.append(sStem + "/" + self.lArcVal[nRawArc2 & self._arcMask])
                        iAddr2 = iEndArcAddr2 + 1
                iAddr = iEndArcAddr + 1
            return l
        return []

    def _stem1 (self, sWord):
        "returns stems list of <sWord>"
        iAddr = 0
        for c in sWord:
            if c not in self.dChar:
                return []
            iAddr = self._lookupArcNode(self.dChar[c], iAddr)
            if iAddr is None:
                return []
        if self.byDic[iAddr] & self._finalNodeMask:
            l = []
            nRawArc = 0
            while not nRawArc & self._lastArcMask:
                iEndArcAddr = iAddr + 1
                nRawArc = self.byDic[iAddr]
                nArc = nRawArc & self._arcMask
                if nArc > self.nChar:
                    # This value is not a char, this is a stemming code
                    l.append(self.funcStemming(sWord, self.lArcVal[nArc]))
                iAddr = iEndArcAddr + 1
            return l
        return []

    def _lookupArcNode1 (self, nVal, iAddr):
        "looks if <nVal> is an arc at the node at <iAddr>, if yes, returns address of next node else None"
        while True:
            iEndArcAddr = iAddr + 1
            nRawArc = self.byDic[iAddr]
            if nVal == (nRawArc & self._arcMask):
                # the value we are looking for
                # we return the address of the next node
                return self.byDic[iEndArcAddr]
            # value not found
            if nRawArc & self._lastArcMask:
                return None
            iAddr = iEndArcAddr + 1

    def _getArcs1 (self, iAddr):
        "generator: return all arcs at <iAddr> as tuples of (nVal, iAddr)"
        while True:
            iEndArcAddr = iAddr + 1
            nRawArc = self.byDic[iAddr]
            yield nRawArc & self._arcMask, self.byDic[iEndArcAddr]
            if nRawArc & self._lastArcMask:
                break
            iAddr = iEndArcAddr + 1

    def _writeNodes1 (self, spfDest):
        "for debugging only"
        print(" > Write binary nodes")
        with open(spfDest, 'w', 'utf-8', newline="\n") as hDst:
            iAddr = 0
            hDst.write("i{:_>10} -- #{:_>10}\n".format("0", iAddr))
            while iAddr < len(self.byDic):
                iEndArcAddr = iAddr + 1
                nRawArc = self.byDic[iAddr]
                nArc = nRawArc & self._arcMask
                hDst.write("  {:<20}  {:0>16}  i{:>10}   #{:_>10}\n".format(self.lArcVal[nArc], bin(nRawArc)[2:], "?", self.byDic[iEndArcAddr]))


                iAddr = iEndArcAddr + 1
                if (nRawArc & self._lastArcMask) and iAddr < len(self.byDic):
                    hDst.write("\ni{:_>10} -- #{:_>10}\n".format("?", iAddr))
            hDst.close()

    # VERSION 2
    def _morph2 (self, sWord):
        "returns morphologies of <sWord>"