# Source code for corpustools.symbolsim.string_similarity

from functools import partial
from corpustools.corpus.classes import Word
from corpustools.symbolsim.khorsi import khorsi
from corpustools.symbolsim.edit_distance import edit_distance
from corpustools.symbolsim.phono_edit_distance import phono_edit_distance

from corpustools.exceptions import StringSimilarityError

def khorsi_wrapper(w1, w2, freq_base,sequence_type, max_distance):
    """
    Score two words with ``khorsi`` and keep the score only if it reaches
    a threshold.

    Parameters
    ----------
    w1, w2 : objects exposing the attribute named by ``sequence_type``
        The two words to compare.
    freq_base : object
        Passed straight through to ``khorsi`` as ``freq_base``.
    sequence_type : str
        Name of the attribute read from each word; also forwarded to
        ``khorsi``.
    max_distance : number
        Threshold. Despite the name, it is used as a *lower* bound here:
        scores greater than or equal to it are kept.

    Returns
    -------
    The score when ``score >= max_distance``, otherwise ``None``.
    """
    first_seq = getattr(w1, sequence_type)
    second_seq = getattr(w2, sequence_type)
    similarity = khorsi(first_seq, second_seq,
                        freq_base=freq_base, sequence_type=sequence_type)
    return similarity if similarity >= max_distance else None

def edit_distance_wrapper(w1, w2, sequence_type, max_distance):
    """
    Compute ``edit_distance`` between two words and discard results above
    a cutoff.

    Parameters
    ----------
    w1, w2 : objects exposing the attribute named by ``sequence_type``
        The two words to compare.
    sequence_type : str
        Name of the attribute read from each word; also passed
        positionally to ``edit_distance``.
    max_distance : number
        Largest distance that is still reported.

    Returns
    -------
    The distance when ``distance <= max_distance``, otherwise ``None``.
    """
    first_seq = getattr(w1, sequence_type)
    second_seq = getattr(w2, sequence_type)
    distance = edit_distance(first_seq, second_seq, sequence_type)
    return distance if distance <= max_distance else None

def phono_edit_distance_wrapper(w1, w2, sequence_type, features, max_distance):
    """
    Compute ``phono_edit_distance`` between two words and discard results
    above a cutoff.

    Parameters
    ----------
    w1, w2 : objects exposing the attribute named by ``sequence_type``
        The two words to compare.
    sequence_type : str
        Name of the attribute read from each word; also forwarded to
        ``phono_edit_distance``.
    features : object
        Passed straight through to ``phono_edit_distance`` as ``features``.
    max_distance : number
        Largest distance that is still reported.

    Returns
    -------
    The distance when ``distance <= max_distance``, otherwise ``None``.
    """
    first_seq = getattr(w1, sequence_type)
    second_seq = getattr(w2, sequence_type)
    distance = phono_edit_distance(first_seq, second_seq,
                                   sequence_type=sequence_type,
                                   features=features)
    return distance if distance <= max_distance else None

def string_similarity(corpus_context, query, algorithm, **kwargs):
    """
    This function computes similarity of pairs of words across a corpus.

    Parameters
    ----------
    corpus_context : CorpusContext
        Context manager for a corpus. It is iterated over, measured with
        ``len``, and read for ``sequence_type``, ``specifier`` and
        ``get_frequency_base()`` depending on the algorithm.
    query : Word, tuple, or iterable of pairs
        If this is a ``Word``, every word in the corpus will be compared
        to it; if this is a tuple, its first two elements will be compared
        to each other; any other iterable is treated as a collection of
        pairs, and each pair's members will be compared to each other.
    algorithm : string
        The algorithm of string similarity to be used, currently supports
        'khorsi', 'edit_distance', and 'phono_edit_distance'
    max_rel : double
        Filters out all words that are higher than max_rel from a
        relatedness measure (not applied to a single-tuple query)
    min_rel : double
        Filters out all words that are lower than min_rel from a
        relatedness measure (not applied to a single-tuple query)
    stop_check : callable or None
        Optional function to check whether to gracefully terminate early
    call_back : callable or None
        Optional function to supply progress information during the
        function

    Returns
    -------
    list of tuples or None
        The first two elements of each tuple are the words that were
        compared and the final element is their relatedness score. For an
        iterable-of-pairs query, a pair whose comparison raises an
        exception is reported with the score ``"N/A"``. ``None`` is
        returned if ``stop_check`` requests early termination.

    Raises
    ------
    StringSimilarityError
        If ``algorithm`` is not one of the supported names.
    """
    stop_check = kwargs.get('stop_check', None)
    call_back = kwargs.get('call_back', None)
    min_rel = kwargs.get('min_rel', None)
    max_rel = kwargs.get('max_rel', None)

    if algorithm == 'khorsi':
        freq_base = corpus_context.get_frequency_base()
        try:
            bound_count = freq_base['#']
            # Build a new dict without the '#' entry rather than deleting
            # it in place, so the mapping returned by the corpus context
            # is left untouched.
            freq_base = {k: v for k, v in freq_base.items() if k != '#'}
            freq_base['total'] -= bound_count
        except KeyError:
            pass
        relate_func = partial(khorsi, freq_base=freq_base,
                              sequence_type=corpus_context.sequence_type)
    elif algorithm == 'edit_distance':
        relate_func = partial(edit_distance,
                              sequence_type=corpus_context.sequence_type)
    elif algorithm == 'phono_edit_distance':
        relate_func = partial(phono_edit_distance,
                              sequence_type=corpus_context.sequence_type,
                              features=corpus_context.specifier)
    else:
        raise(StringSimilarityError('{} is not a possible string similarity algorithm.'.format(algorithm)))

    sequence_type = corpus_context.sequence_type
    related_data = []

    if isinstance(query, Word):
        # 'comparison type' option set to "compare one word to entire corpus"
        if call_back is not None:
            total = len(corpus_context)
            # NOTE(review): the total is doubled when a filter is active
            # even though only one pass over the corpus is made below;
            # kept as-is so progress reporting seen by callers is unchanged.
            if min_rel is not None or max_rel is not None:
                total *= 2
            cur = 0
            call_back('Calculating string similarity...')
            call_back(cur, total)
        targ_word = query
        # The target's sequence does not change between iterations.
        w1 = getattr(targ_word, sequence_type)
        for word in corpus_context:
            if stop_check is not None and stop_check():
                return None
            if call_back is not None:
                cur += 1
                if cur % 50 == 0:
                    call_back(cur)
            w2 = getattr(word, sequence_type)
            relatedness = relate_func(w1, w2)
            if min_rel is not None and relatedness < min_rel:
                continue
            if max_rel is not None and relatedness > max_rel:
                continue
            related_data.append((targ_word, word, relatedness))
        # Sort the list by most morphologically related: sort ascending by
        # score, then flip the order if the target word itself did not end
        # up first. The emptiness check avoids an IndexError when the
        # corpus is empty or every word was filtered out.
        related_data.sort(key=lambda t: t[-1])
        if related_data and related_data[0][1] != targ_word:
            related_data.reverse()
    elif isinstance(query, tuple):
        # 'comparison type' option set to "Compare a single pair of words to each other"
        word1 = query[0]
        word2 = query[1]
        w1 = getattr(word1, sequence_type)
        w2 = getattr(word2, sequence_type)
        relatedness = relate_func(w1, w2)
        related_data.append((word1, word2, relatedness))
    elif hasattr(query, '__iter__'):
        if call_back is not None:
            total = len(query)
            cur = 0
            call_back('Calculating string similarity...')
            if total:
                call_back(cur, total)
        for q1, q2 in query:
            if stop_check is not None and stop_check():
                return None
            if call_back is not None:
                cur += 1
                if cur % 50 == 0:
                    call_back(cur)
            try:
                w1 = getattr(q1, sequence_type)
                w2 = getattr(q2, sequence_type)
                relatedness = relate_func(w1, w2)
                if min_rel is not None and relatedness < min_rel:
                    continue
                if max_rel is not None and relatedness > max_rel:
                    continue
            except Exception:
                # Best-effort: a pair that cannot be compared is still
                # reported, with a placeholder score. Catching Exception
                # (not a bare except) lets KeyboardInterrupt and
                # SystemExit propagate.
                relatedness = "N/A"
            related_data.append((q1, q2, relatedness))

    return related_data