Source code for corpustools.freqalt.freq_of_alt

#fun times with morphological relatedness
import corpustools.symbolsim.phono_align as pam
from corpustools.symbolsim.string_similarity import string_similarity
from .io import print_freqalt_results


[docs]def calc_freq_of_alt(corpus_context, seg1, seg2, algorithm, output_filename = None,
                    min_rel = None, max_rel = None, phono_align = False,
                    min_pairs_okay = False, stop_check = None,
                    call_back = None):
    """Returns a double that is a measure of the frequency of
    alternation of two sounds in a given corpus

    Parameters
    ----------
    corpus_context : CorpusContext
        Context manager for a corpus
    seg1: char
        A sound segment, e.g. 's', 't'
    seg2: char
        A sound segment
    algorithm: string
        The string similarity algorithm
    max_rel: double
        Filters out all words that are higher than max_rel from a relatedness measure
    min_rel: double
        Filters out all words that are lower than min_rel from a relatedness measure
    phono_align: boolean (1 or 0), optional
        1 means 'only count alternations that are likely phonologically aligned,'
        defaults to not force phonological alignment
    min_pairs_okay: bool, optional
        True means allow minimal pairs (e.g. in English, 's' and 't' do not
        alternate in minimal pairs,
        so allowing minimal pairs may skew results)
    stop_check : callable, optional
        Optional function to check whether to gracefully terminate early
    call_back : callable, optional
        Optional function to supply progress information during the function

    Returns
    -------
    double
        The frequency of alternation of two sounds in a given corpus
    """

    list_seg1 = []
    list_seg2 = []
    all_words = set()
    if call_back is not None:
        call_back('Finding instances of segments...')
        call_back(0, len(corpus_context))
        cur = 0
    for w in corpus_context:
        if stop_check is not None and stop_check():
            return
        if call_back is not None:
            cur += 1
            if cur % 1000 == 0:
                call_back(cur)
        tier = getattr(w, corpus_context.sequence_type)
        if seg1 in tier:
            list_seg1.append(w)
            all_words.add(w.spelling)
        if seg2 in tier:
            list_seg2.append(w)
            all_words.add(w.spelling)

    if call_back is not None:
        call_back('Calculating string similarities...')
        call_back(0, len(list_seg1) * len(list_seg2))
        cur = 0
    related_list = []
    if phono_align:
        al = pam.Aligner(features = corpus_context.specifier)
    for w1 in list_seg1:
        for w2 in list_seg2:
            if stop_check is not None and stop_check():
                return
            if call_back is not None:
                cur += 1
                if cur % 1000 == 0:
                    #print(len(related_list))
                    call_back(cur)
            if w1 == w2:
                continue
            ss = string_similarity(corpus_context, (w1,w2), algorithm)
            if min_rel is not None and ss[0][-1] < min_rel:
                continue
            if max_rel is not None and ss[0][-1] > max_rel:
                continue
            if not min_pairs_okay:
                if len(w1.transcription) == len(w2.transcription):
                    count_diff = 0
                    for i in range(len(w1.transcription)):
                        if w1.transcription[i] != w2.transcription[i]:
                            count_diff += 1
                            if count_diff > 1:
                                break
                    if count_diff == 1:
                        continue
            if phono_align:
                alignment = al.align(w1.transcription, w2.transcription)
                if not al.morpho_related(alignment, seg1, seg2):
                    continue

            related_list.append(ss[0])

    words_with_alt = set()
    if call_back is not None:
        call_back('Calculating frequency of alternation...')
        call_back(0, len(related_list))
        cur = 0
    for w1, w2, score in related_list:
        if stop_check is not None and stop_check():
            return
        if call_back is not None:
            cur += 1
            if cur % 100 == 0:
                call_back(cur)
        words_with_alt.add(w1.spelling) #Hacks
        words_with_alt.add(w2.spelling)

    #Calculate frequency of alternation using sets to ensure no duplicates (i.e. words with both seg1 and seg2

    freq_of_alt = len(words_with_alt)/len(all_words)

    if output_filename:
        print_freqalt_results(output_filename, related_list)

    return len(all_words), len(words_with_alt), freq_of_alt