# Source code for corpustools.phonoprob.phonotactic_probability

# -*- coding: utf-8 -*-

from corpustools.corpus.classes import Word

from corpustools.exceptions import PhonoProbError

from corpustools.contextmanagers import ensure_context

def phonotactic_probability_all_words(corpus_context, algorithm,
                                    probability_type = 'unigram',
                                    num_cores = -1,
                                    stop_check = None, call_back = None):
    """Compute the phonotactic probability of every word in the corpus and
    store each result as an attribute of the word.

    The value is written onto each word's ``original`` object under the
    name ``corpus_context.attribute.name``.  If ``stop_check`` reports a
    stop once iteration has ended, the attribute is removed from the corpus
    again.

    Parameters
    ----------
    corpus_context : CorpusContext
        Context manager for a corpus
    algorithm : str
        Algorithm to use for calculating phonotactic probability (currently
        only 'vitevitch'; for any other value no attribute is written)
    probability_type : str
        Either 'unigram' or 'bigram' probability
    num_cores : int, optional
        Currently ignored: multiprocessing is not yet implemented, so the
        words are always processed sequentially
    stop_check : callable, optional
        Optional function to check whether to gracefully terminate early
    call_back : callable, optional
        Optional function to supply progress information during the function
    """
    ensure_context(corpus_context)

    reporting = call_back is not None
    cancellable = stop_check is not None

    if reporting:
        call_back('Calculating phonotactic probabilities...')
        call_back(0, len(corpus_context))

    # Multiprocessing not yet implemented: whatever ``num_cores`` was
    # passed, the corpus is walked sequentially.
    use_vitevitch = algorithm == 'vitevitch'
    for position, word in enumerate(corpus_context, start = 1):
        if cancellable and stop_check():
            break
        # Progress is only reported on every 20th word.
        if reporting and position % 20 == 0:
            call_back(position)
        if not use_vitevitch:
            continue
        value = phonotactic_probability_vitevitch(
                                    corpus_context, word,
                                    probability_type = probability_type,
                                    stop_check = stop_check)
        setattr(word.original, corpus_context.attribute.name, value)

    # A requested stop leaves the attribute only partially filled in, so it
    # is dropped from the corpus.
    if cancellable and stop_check():
        corpus_context.corpus.remove_attribute(corpus_context.attribute)

def phonotactic_probability(corpus_context, query, algorithm,
                                    probability_type = 'unigram',
                                    stop_check = None, call_back = None):
    """Compute the phonotactic probability of a single word by dispatching
    to the implementation selected by ``algorithm``.

    Parameters
    ----------
    corpus_context : CorpusContext
        Context manager for a corpus
    query : Word
        The word whose phonotactic probability to calculate.
    algorithm : str
        Algorithm to use for calculating phonotactic probability (currently
        only 'vitevitch')
    probability_type : str
        Either 'unigram' or 'bigram' probability
    stop_check : callable, optional
        Optional function to check whether to gracefully terminate early
    call_back : callable, optional
        Optional function to supply progress information during the function

    Returns
    -------
    float or None
        Phonotactic probability of the word, or None when ``algorithm`` is
        anything other than 'vitevitch'
    """
    if algorithm != 'vitevitch':
        # No other algorithm is handled here.
        return None
    return phonotactic_probability_vitevitch(corpus_context, query,
                                    probability_type,
                                    stop_check, call_back)

def phonotactic_probability_vitevitch(corpus_context, query,
                                    probability_type = 'unigram',
                                    stop_check = None, call_back = None):
    """Calculate the phonotactic_probability of a particular word using
    the Vitevitch & Luce algorithm

    The word is split into n-grams (single segments for 'unigram',
    adjacent pairs for 'bigram'); the value looked up for each n-gram at
    its position in the word is summed, and the sum is divided by the
    number of n-grams.

    Parameters
    ----------
    corpus_context : CorpusContext
        Context manager for a corpus
    query : Word
        The word whose phonotactic probability to calculate.
    probability_type : str
        Either 'unigram' or 'bigram' probability
    stop_check : callable or None
        Optional function to check whether to gracefully terminate early
        (accepted for interface compatibility; not used by this function)
    call_back : callable or None
        Optional function to supply progress information during the function
        (accepted for interface compatibility; not used by this function)

    Returns
    -------
    float
        Phonotactic probability of the word; 0 if the word yields no
        n-grams of the requested size

    Raises
    ------
    PhonoProbError
        If ``probability_type`` is neither 'unigram' nor 'bigram', or if an
        n-gram of the word has no entry for its position in the corpus'
        phone probabilities
    """
    ensure_context(corpus_context)

    if probability_type == 'unigram':
        gramsize = 1
    elif probability_type == 'bigram':
        gramsize = 2
    else:
        # Previously this fell through and crashed later with an
        # UnboundLocalError on ``gramsize``; fail early and clearly instead.
        raise PhonoProbError(
            "Probability type must be 'unigram' or 'bigram', "
            "not {!r}".format(probability_type))

    prob_dict = corpus_context.get_phone_probs(gramsize = gramsize)

    # Zipping the sequence with itself shifted by 0..gramsize-1 yields the
    # tuples of ``gramsize`` adjacent segments, in order.
    segments = getattr(query, corpus_context.sequence_type)
    sequence = zip(*[segments[i:] for i in range(gramsize)])

    totprob = 0
    tot = 0
    for i, s in enumerate(sequence):
        try:
            # Entries are keyed by (n-gram tuple, position in the word).
            totprob += prob_dict[s, i]
        except KeyError:
            # Tell apart segments unknown to the inventory from a known
            # n-gram that simply has no entry at this position.
            notfound = [seg for seg in s
                        if seg not in corpus_context.inventory]
            if notfound:
                raise PhonoProbError(
                    "Segments not found in the corpus: {}".format(
                        ', '.join(notfound)))
            raise PhonoProbError(
                "Segments not found in the corpus: {} at position: {}".format(
                    ', '.join(s), i))
        tot += 1

    # A word too short to produce any n-gram keeps the initial total of 0.
    if tot:
        totprob = totprob / tot
    return totprob