Grammalecte  str_transform.py at [4176701b46]

File gc_core/py/str_transform.py artifact e86906e5ce part of check-in 4176701b46


#!python3
# -*- coding: UTF-8 -*-


dSimilarChars = {
    "a": "aàâáä",
    "à": "aàâáä",
    "â": "aàâáä",
    "á": "aàâáä",
    "ä": "aàâáä",
    "c": "cç",
    "ç": "cç",
    "e": "eéêèë",
    "é": "eéêèë",
    "ê": "eéêèë",
    "è": "eéêèë",
    "ë": "eéêèë",
    "i": "iîïíì",
    "î": "iîïíì",
    "ï": "iîïíì",
    "í": "iîïíì",
    "ì": "iîïíì",
    "o": "oôóòö",
    "ô": "oôóòö",
    "ó": "oôóòö",
    "ò": "oôóòö",
    "ö": "oôóòö",
    "u": "uûùüú",
    "û": "uûùüú",
    "ù": "uûùüú",
    "ü": "uûùüú",
    "ú": "uûùüú",
}

## No stemming

def noStemming (sFlex, sStem):
    return sStem

def rebuildWord (sFlex, cmd1, cmd2):
    if cmd1 == "_":
        return sFlex
    n, c = cmd1.split(":")
    s = s[:n] + c + s[n:]
    if cmd2 == "_":
        return s
    n, c = cmd2.split(":")
    return s[:n] + c + s[n:]

    
## Define affixes for stemming

# Note: 48 is the ASCII code for "0"


# Suffix only
def defineSuffixCode (sFlex, sStem):
    """ Returns a string defining how to get stem from flexion
            "n(sfx)"
        with n: a char with numeric meaning, "0" = 0, "1" = 1, ... ":" = 10, etc. (See ASCII table.) Says how many letters to strip from flexion.
             sfx [optional]: string to add on flexion
        Examples:
            "0": strips nothing, adds nothing
            "1er": strips 1 letter, adds "er"
            "2": strips 2 letters, adds nothing
    """
    if sFlex == sStem:
        return "0"
    jSfx = 0
    for i in range(min(len(sFlex), len(sStem))):
        if sFlex[i] != sStem[i]:
            break
        jSfx += 1
    return chr(len(sFlex)-jSfx+48) + sStem[jSfx:]  

def getStemFromSuffixCode (sFlex, sSfxCode):
    if sSfxCode == "0":
        return sFlex
    return sFlex[:-(ord(sSfxCode[0])-48)] + sSfxCode[1:]  if sSfxCode[0] != '0'  else sFlex + sSfxCode[1:]


# Prefix and suffix
def defineAffixCode (sFlex, sStem):
    """ Returns a string defining how to get stem from flexion. Examples:
            "0" if stem = flexion
            "stem" if no common substring
            "n(pfx)/m(sfx)"
        with n and m: chars with numeric meaning, "0" = 0, "1" = 1, ... ":" = 10, etc. (See ASCII table.) Says how many letters to strip from flexion.
            pfx [optional]: string to add before the flexion 
            sfx [optional]: string to add after the flexion
    """
    if sFlex == sStem:
        return "0"
    # is stem a substring of flexion?
    n = sFlex.find(sStem)
    if n >= 0:
        return "{}/{}".format(chr(n+48), chr(len(sFlex)-(len(sStem)+n)+48))
    # no, so we are looking for common substring
    sSubs = longestCommonSubstring(sFlex, sStem)
    if len(sSubs) > 1:
        iPos = sStem.find(sSubs)
        sPfx = sStem[:iPos]
        sSfx = sStem[iPos+len(sSubs):]
        n = sFlex.find(sSubs)
        m = len(sFlex) - (len(sSubs)+n)
        sAff = "{}/".format(chr(n+48))  if not sPfx  else "{}{}/".format(chr(n+48), sPfx)
        sAff += chr(m+48)  if not sSfx  else "{}{}".format(chr(m+48), sSfx)
        return sAff
    return sStem

def longestCommonSubstring (s1, s2):
    # http://en.wikipedia.org/wiki/Longest_common_substring_problem
    # http://en.wikibooks.org/wiki/Algorithm_implementation/Strings/Longest_common_substring
    M = [ [0]*(1+len(s2)) for i in range(1+len(s1)) ]
    longest, x_longest = 0, 0
    for x in range(1, 1+len(s1)):
        for y in range(1, 1+len(s2)):
            if s1[x-1] == s2[y-1]:
                M[x][y] = M[x-1][y-1] + 1
                if M[x][y] > longest:
                    longest = M[x][y]
                    x_longest = x
            else:
                M[x][y] = 0
    return s1[x_longest-longest : x_longest]

def getStemFromAffixCode (sFlex, sAffCode):
    if sAffCode == "0":
        return sFlex
    if '/' not in sAffCode:
        return "# error #"
    sPfxCode, sSfxCode = sAffCode.split('/')
    sFlex = sPfxCode[1:] + sFlex[(ord(sPfxCode[0])-48):] 
    return sFlex[:-(ord(sSfxCode[0])-48)] + sSfxCode[1:]  if sSfxCode[0] != '0'  else sFlex + sSfxCode[1:]