Grammalecte  Diff

Differences From Artifact [29dc50eb92]:

To Artifact [186d4c2e6e]:


41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65

# Les dictionnaires
dSUBDIC = { '*': 'Commun',
            'R': 'Réforme1990',
            'M': 'Moderne',
            'C': 'Classique',
            'A': 'Annexe',
            'P': 'Multimots',
            'X': 'Contributeurs' }

dMODERNE = { 'name': 'DICTIONNAIRE ORTHOGRAPHIQUE FRANÇAIS “MODERNE”',
             'shortname': '“Moderne”',
             'asciiName': 'fr-moderne',
             'mozAsciiName': 'fr-FR-modern',
             'subDicts': '*MX',
             'mozId': 'fr-dicollecte-moderne',
             'description': "Dictionnaire français “Moderne”" }

dCLASSIQUE = { 'name': 'DICTIONNAIRE ORTHOGRAPHIQUE FRANÇAIS “CLASSIQUE”',
               'shortname': '“Classique”',
               'asciiName': 'fr-classique',
               'mozAsciiName': 'fr-FR-classic',
               'subDicts': '*MCX',
               'mozId': 'fr-dicollecte-classique',
               'description': "Dictionnaire français “Classique”" }







<


<
<
<
<
<
<
<
<







41
42
43
44
45
46
47

48
49








50
51
52
53
54
55
56

# Les dictionnaires
dSUBDIC = { '*': 'Commun',
            'R': 'Réforme1990',
            'M': 'Moderne',
            'C': 'Classique',
            'A': 'Annexe',

            'X': 'Contributeurs' }









dCLASSIQUE = { 'name': 'DICTIONNAIRE ORTHOGRAPHIQUE FRANÇAIS “CLASSIQUE”',
               'shortname': '“Classique”',
               'asciiName': 'fr-classique',
               'mozAsciiName': 'fr-FR-classic',
               'subDicts': '*MCX',
               'mozId': 'fr-dicollecte-classique',
               'description': "Dictionnaire français “Classique”" }
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
        with open(spBuild+'/dictDecl.txt', 'w', encoding='utf-8', newline="\n") as hDst:
            for oEntry in self.lEntry:
                if re.match("[SXFWIA]", oEntry.flags) and (oEntry.po.startswith("nom") or oEntry.po.startswith("adj")):
                    hDst.write(oEntry.getDeclination())
        if spDestGL:
            echo("   Fichier de déclinaison copié dans Grammalecte...")
            file_util.copy_file(spBuild+'/dictDecl.txt', spDestGL)

    def generateSpellVariants (self, nReq, spBuild):
        if nReq < 1: nReq = 1
        if nReq > 2: nReq = 2
        echo(" * Lexique >> variantes par suppression... n = " + str(nReq))
        with open(spBuild+'/dictSpellVariants-'+str(nReq)+'.txt', 'w', encoding='utf-8', newline="\n") as hDst:
            for oFlex in frozenset(self.lFlexions):
                hDst.write(oFlex.sFlexion+"\t_\t_\n")
                if len(oFlex.sFlexion) <= 2:
                    n = 0
                elif len(oFlex.sFlexion) <= 5:
                    n = 1
                else:
                    n = nReq
                #lTup = self._generatePhonetVariants(oFlex.sFlexion)
                lTup = self._generateDeleteVariants(oFlex.sFlexion, oFlex.sFlexion, n)
                for t in lTup:
                    sTag = t[1]  if "\t" in t[1]  else t[1]+"\t_"
                    hDst.write(t[0]+"\t"+sTag+"\n")

    _lTupPhonet = [ ("ph", "f"), ("qu", "k"), ("ss", "c"), ("ss", "ç"), ("ct", "x"),
        ("oe", "œ"), ("ae", "æ"), ("ei", "é"), ("ai", "é"), ("au", "o"), ("eau", "o"),
    ]

    def _generatePhonetVariants (self, s):
        l = []
        for torep, rep in self._lTupPhonet():
            for m in torep.finditer(s):
                l.append( (s[:m.start(0)] + rep + s[m.end(0):], str(m.start(0))+":"+str(m.start(0)+len(rep))+">"+torep) )
        return l

    def _generateDeleteVariants (self, sWord0, sWordCur, n):
        "renvoie une liste de tuples : (forme dégradée de sWord, code de genèse de sWord)"
        # caution: recursive function
        if n == 0:
            return []
        lTup = []
        for i in range(len(sWordCur)):
            sNew = sWordCur[0:i]+sWordCur[i+1:]
            lTup.append( ( sNew, self._generateAddCode(sWord0, sNew) ) )
            lTup += self._generateDeleteVariants(sWord0, sNew, n-1)
        return lTup

    def _generateAddCode (self, sWord, sCrippled):
        "returns addCode to generate sWord from sCrippled"
        sAdd = ""
        for i in range(len(sWord)):
            if sWord[i] != sCrippled[i:i+1]:
                sCrippled = sCrippled[:i] + sWord[i] + sCrippled[i:]
                if sAdd:
                    sAdd += "\t"
                sAdd += str(i)+"+"+sWord[i]
        return sAdd  if sAdd  else "0"



class Entree:
    def __init__ (self, sLine):
        self.lemma = ''
        self.flags = ''
        # champs morphologiques Hunspell







<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<







634
635
636
637
638
639
640






















































641
642
643
644
645
646
647
        with open(spBuild+'/dictDecl.txt', 'w', encoding='utf-8', newline="\n") as hDst:
            for oEntry in self.lEntry:
                if re.match("[SXFWIA]", oEntry.flags) and (oEntry.po.startswith("nom") or oEntry.po.startswith("adj")):
                    hDst.write(oEntry.getDeclination())
        if spDestGL:
            echo("   Fichier de déclinaison copié dans Grammalecte...")
            file_util.copy_file(spBuild+'/dictDecl.txt', spDestGL)
























































class Entree:
    def __init__ (self, sLine):
        self.lemma = ''
        self.flags = ''
        # champs morphologiques Hunspell
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514

def main ():
    xParser = argparse.ArgumentParser()
    xParser.add_argument("-v", "--verdic", help="set dictionary version, i.e. 5.4", type=str, default="X.Y.z")
    xParser.add_argument("-m", "--mode", help="0: no tags,  1: Hunspell tags (default),  2: All tags", type=int, choices=[0, 1, 2], default=1)
    xParser.add_argument("-u", "--uncompress", help="do not use Hunspell compression", action="store_true")
    xParser.add_argument("-s", "--simplify", help="no virtual lemmas", action="store_true")
    xParser.add_argument("-sv", "--spellvariants", help="generate spell variants", action="store_true")
    xParser.add_argument("-gl", "--grammalecte", help="copy generated files to Grammalecte folders", action="store_true")
    xArgs = xParser.parse_args()

    if xArgs.simplify:
        xArgs.mode = 0
        xArgs.uncompress = True








<







1437
1438
1439
1440
1441
1442
1443

1444
1445
1446
1447
1448
1449
1450

def main ():
    xParser = argparse.ArgumentParser()
    xParser.add_argument("-v", "--verdic", help="set dictionary version, i.e. 5.4", type=str, default="X.Y.z")
    xParser.add_argument("-m", "--mode", help="0: no tags,  1: Hunspell tags (default),  2: All tags", type=int, choices=[0, 1, 2], default=1)
    xParser.add_argument("-u", "--uncompress", help="do not use Hunspell compression", action="store_true")
    xParser.add_argument("-s", "--simplify", help="no virtual lemmas", action="store_true")

    xParser.add_argument("-gl", "--grammalecte", help="copy generated files to Grammalecte folders", action="store_true")
    xArgs = xParser.parse_args()

    if xArgs.simplify:
        xArgs.mode = 0
        xArgs.uncompress = True

1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549

    ### Lexique
    oFrenchDict.generateFlexions()
    oFrenchDict.calcMetagraphe()
    oFrenchDict.calcMetaphone2()

    #oFrenchDict.createNgrams(spBuild, 3)
    if xArgs.spellvariants:
        oFrenchDict.generateSpellVariants(1, spBuild)

    ### Statistiques
    spfStats = spBuild+'/'+STATS_NAME+xArgs.verdic+'.txt'
    oStatsLex = StatsLex(oFrenchDict)
    oStatsLex.addLexFromFile('lexique/corpus_data/stats_google_ngram_1.txt', 'G', 'Google 1-grams')
    oStatsLex.addLexFromFile('lexique/corpus_data/stats_frwiki.txt', 'W', 'Wikipédia')
    oStatsLex.addLexFromFile('lexique/corpus_data/stats_frwikisource.txt', 'S', 'Wikisource')







<
<







1470
1471
1472
1473
1474
1475
1476


1477
1478
1479
1480
1481
1482
1483

    ### Lexique
    oFrenchDict.generateFlexions()
    oFrenchDict.calcMetagraphe()
    oFrenchDict.calcMetaphone2()

    #oFrenchDict.createNgrams(spBuild, 3)



    ### Statistiques
    spfStats = spBuild+'/'+STATS_NAME+xArgs.verdic+'.txt'
    oStatsLex = StatsLex(oFrenchDict)
    oStatsLex.addLexFromFile('lexique/corpus_data/stats_google_ngram_1.txt', 'G', 'Google 1-grams')
    oStatsLex.addLexFromFile('lexique/corpus_data/stats_frwiki.txt', 'W', 'Wikipédia')
    oStatsLex.addLexFromFile('lexique/corpus_data/stats_frwikisource.txt', 'S', 'Wikisource')