Source code for corpustools.freqalt.freq_of_alt

#fun times with morphological relatedness
import corpustools.symbolsim.phono_align as pam
from corpustools.symbolsim.string_similarity import string_similarity
from .io import print_freqalt_results


[docs]def calc_freq_of_alt(corpus_context, seg1, seg2, algorithm, output_filename = None, min_rel = None, max_rel = None, phono_align = False, min_pairs_okay = False, stop_check = None, call_back = None): """Returns a double that is a measure of the frequency of alternation of two sounds in a given corpus Parameters ---------- corpus_context : CorpusContext Context manager for a corpus seg1: char A sound segment, e.g. 's', 't' seg2: char A sound segment algorithm: string The string similarity algorithm max_rel: double Filters out all words that are higher than max_rel from a relatedness measure min_rel: double Filters out all words that are lower than min_rel from a relatedness measure phono_align: boolean (1 or 0), optional 1 means 'only count alternations that are likely phonologically aligned,' defaults to not force phonological alignment min_pairs_okay: bool, optional True means allow minimal pairs (e.g. in English, 's' and 't' do not alternate in minimal pairs, so allowing minimal pairs may skew results) stop_check : callable, optional Optional function to check whether to gracefully terminate early call_back : callable, optional Optional function to supply progress information during the function Returns ------- double The frequency of alternation of two sounds in a given corpus """ list_seg1 = [] list_seg2 = [] all_words = set() if call_back is not None: call_back('Finding instances of segments...') call_back(0, len(corpus_context)) cur = 0 for w in corpus_context: if stop_check is not None and stop_check(): return if call_back is not None: cur += 1 if cur % 1000 == 0: call_back(cur) tier = getattr(w, corpus_context.sequence_type) if seg1 in tier: list_seg1.append(w) all_words.add(w.spelling) if seg2 in tier: list_seg2.append(w) all_words.add(w.spelling) if call_back is not None: call_back('Calculating string similarities...') call_back(0, len(list_seg1) * len(list_seg2)) cur = 0 related_list = [] if phono_align: al = pam.Aligner(features = corpus_context.specifier) for w1 in list_seg1: for w2 in list_seg2: if stop_check is not None and stop_check(): return if call_back is not None: cur += 1 if cur % 1000 == 0: #print(len(related_list)) call_back(cur) if w1 == w2: continue ss = string_similarity(corpus_context, (w1,w2), algorithm) if min_rel is not None and ss[0][-1] < min_rel: continue if max_rel is not None and ss[0][-1] > max_rel: continue if not min_pairs_okay: if len(w1.transcription) == len(w2.transcription): count_diff = 0 for i in range(len(w1.transcription)): if w1.transcription[i] != w2.transcription[i]: count_diff += 1 if count_diff > 1: break if count_diff == 1: continue if phono_align: alignment = al.align(w1.transcription, w2.transcription) if not al.morpho_related(alignment, seg1, seg2): continue related_list.append(ss[0]) words_with_alt = set() if call_back is not None: call_back('Calculating frequency of alternation...') call_back(0, len(related_list)) cur = 0 for w1, w2, score in related_list: if stop_check is not None and stop_check(): return if call_back is not None: cur += 1 if cur % 100 == 0: call_back(cur) words_with_alt.add(w1.spelling) #Hacks words_with_alt.add(w2.spelling) #Calculate frequency of alternation using sets to ensure no duplicates (i.e. words with both seg1 and seg2 freq_of_alt = len(words_with_alt)/len(all_words) if output_filename: print_freqalt_results(output_filename, related_list) return len(all_words), len(words_with_alt), freq_of_alt