Source code for corpustools.symbolsim.khorsi


from collections import defaultdict
from math import log

def lcs(x1, x2):
    """Returns the longest common sequence of two lists of characters
    and the remainder elements not in the longest common sequence

    Parameters
    ----------
    x1: list
        List of characters

    x2: list
        List of characters

    Returns
    -------
    list
        the list of the longest common sequence of two lists

    list
        the list of remaining elements of both x1 and x2 that are not in
        the longest common sequence
    """
    if len(x1) >= len(x2):
        longer = x1
        shorter = x2
    else:
        longer = x2
        shorter = x1
    stringMatches = []
    largestMatch = None
    for i in range(len(shorter),0,-1):
        #Get all possible substrings of word x1
        shorter_strings = substring_set(shorter, i)
        longer_strings = substring_set(longer, i)

        s = shorter_strings & longer_strings

        if len(s):
            break
    else:
        return [], longer+shorter

    leftover = []
    for i in range(len(shorter)):

        begin = i
        for lcs in s:
            lcs = lcs.split('.') #back to list
            end = i+len(lcs)
            if shorter[begin:end] == lcs:
                break
        else:
            lcs = None
        if lcs is not None:
            break
    if lcs is None:
        lcs = []
    leftover.extend(shorter[:begin])
    leftover.extend(shorter[end:])
    for i in range(len(longer)):
        begin = i
        end = i+len(lcs)
        if longer[begin:end] == lcs:
            break
    leftover.extend(longer[:begin])
    leftover.extend(longer[end:])
    return lcs,leftover

def substring_set(w, l):
    """Returns all substrings of a word w of length l

    Parameters
    ----------
    w: list
        List of characters representing a word

    l: int
        Length of substrings to generate

    Returns
    -------
    set
        A set of substrings of the specified length
    """
    #return all unique substrings of a word w of length l from 1 letter to the entire word
    substrings = set([])
    for i in range(len(w)):
        sub = w[i:(i+l)]
        if len(sub) < l:
            continue
        substrings.update(['.'.join(sub)])
    return substrings

[docs]def khorsi(word1, word2, freq_base, sequence_type, max_distance = None):
    """Calculate the string similarity of two words given a set of
    characters and their frequencies in a corpus based on Khorsi (2012)

    Parameters
    ----------
    word1: Word
        First Word object to compare

    word2: Word
        Second Word object to compare

    freq_base: dictionary
        a dictionary where each segment is mapped to its frequency of
        occurrence in a corpus

    sequence_type: string
        The type of segments to be used ('spelling' = Roman letters,
        'transcription' = IPA symbols)

    Returns
    -------
    float
        A number representing the relatedness of two words based on Khorsi (2012)
    """
    w1 = word1
    w2 = word2
    if sequence_type == 'spelling':
        w1 = list(w1)
        w2 = list(w2)
    longest, left_over = lcs(w1, w2)
    #print(w1,w2)
    #print(longest,left_over)

    #Khorsi's algorithm
    #print([(freq_base[x]/freq_base['total']) for x in longest])
    khorsi_sum = 0
    for x in longest:
        khorsi_sum += log(1/(freq_base[x]/freq_base['total']))
    for x in left_over:
        khorsi_sum -= log(1/(freq_base[x]/freq_base['total']))
        if max_distance is not None and khorsi_sum < max_distance:
            break
    return khorsi_sum