Source code for corpustools.corpus.classes.lexicon


import re
import random
import collections
import operator
import math
import locale

from corpustools.exceptions import CorpusIntegrityError

import pdb

class Segment(object):
    """
    Class for segment symbols

    Parameters
    ----------
    symbol : str
        Segment symbol

    Attributes
    ----------
    symbol : str
        Segment symbol
    features : dict
        Feature specification for the segment; keys are stored lower-cased
    """

    def __init__(self, symbol):
        self.symbol = symbol
        # Stays empty until specify() or item assignment fills it in
        self.features = {}

    def specify(self, feature_dict):
        """
        Specify a segment with a new feature specification

        Any previous specification is discarded; feature names are
        lower-cased on the way in.

        Parameters
        ----------
        feature_dict : dict
            Feature specification
        """
        self.features = {name.lower(): value
                         for name, value in feature_dict.items()}

    def minimal_difference(self, other, features):
        """
        Check if this segment is a minimal feature difference with another
        segment (ignoring some features)

        Parameters
        ----------
        other : Segment
            Segment to compare with
        features : list
            Features that are allowed to vary between the two segments

        Returns
        -------
        bool
            True if all features other than the specified ones match,
            False otherwise

        Raises
        ------
        KeyError
            If ``other`` lacks one of this segment's compared features
        """
        return all(value == other[name] if False else not (value != other[name])
                   for name, value in self.features.items()
                   if name not in features)

    def feature_match(self, specification):
        """
        Return true if segment matches specification, false otherwise.

        Parameters
        ----------
        specification : object
            Specification can be a single feature value '+feature', a list of
            feature values ['+feature1','-feature2'], or a dictionary of
            features and values {'feature1': '+', 'feature2': '-'}

        Returns
        -------
        bool
            True if this segment contains the feature values in the
            specification.  A feature the segment does not have counts as a
            mismatch.  An empty list/dict, or a specification of any other
            type, matches trivially.
        """
        if isinstance(specification, str):
            specification = [specification]
        if isinstance(specification, list):
            # '+voice' -> ('voice', '+'); f[:1] keeps an empty string from
            # raising so it falls through to the KeyError -> False path
            wanted = [(f[1:], f[:1]) for f in specification]
        elif isinstance(specification, dict):
            wanted = list(specification.items())
        else:
            return True
        for name, value in wanted:
            try:
                if self[name] != value:
                    return False
            except KeyError:
                return False
        return True

    @staticmethod
    def _symbol_of(other):
        """Return the value to compare a symbol against (Segment or raw)."""
        if isinstance(other, Segment):
            return other.symbol
        return other

    def __contains__(self, item):
        return item.lower() in self.features

    def __getitem__(self, key):
        return self.features[key.lower()]

    def __setitem__(self, key, value):
        self.features[key.lower()] = value

    def __repr__(self):
        return self.__str__()

    def __str__(self):
        return self.symbol

    def __eq__(self, other):
        """Two segments are considered equal if their symbol attributes match

        A non-Segment ``other`` is compared directly against the symbol.
        """
        return self.symbol == self._symbol_of(other)

    def __ne__(self, other):
        return not self.__eq__(other)

    def __hash__(self):
        # Defining __eq__ removes the default hash on Python 3; hashing the
        # symbol keeps hash consistent with equality (including equality
        # with the bare symbol string).
        return hash(self.symbol)

    def __lt__(self, other):
        return self.symbol < self._symbol_of(other)

    def __le__(self, other):
        # Accept bare symbols as well as Segments, like __lt__/__gt__ do
        symbol = self._symbol_of(other)
        return self.symbol == symbol or self.symbol < symbol

    def __ge__(self, other):
        # Accept bare symbols as well as Segments, like __lt__/__gt__ do
        symbol = self._symbol_of(other)
        return self.symbol == symbol or self.symbol > symbol

    def __gt__(self, other):
        return self.symbol > self._symbol_of(other)

    def __len__(self):
        return len(self.symbol)

class Transcription(object):
    """
    Transcription object, sequence of symbols

    Parameters
    ----------
    seg_list : list
        List of segments that form the transcription.
        Elements in the list, can be Segments, strings, or BaseAnnotations

    Attributes
    ----------
    _list : list
        List of strings representing segment symbols
    stress_pattern: dict
        Dictionary with keys of segment indices and values of the stress
        for that segment
    boundaries : dict
        Possible keys of 'morpheme' or 'tone' that keeps track of where
        morpheme or tone boundaries are inserted
    """

    def __init__(self, seg_list):
        self._list = []
        self.stress_pattern = {}
        self.boundaries = {}
        if seg_list is None:
            return
        cur_group = 0
        cur_tone = None
        for i, s in enumerate(seg_list):
            try:
                # Annotation-like elements expose label/stress/tone/group
                self._list.append(s.label)
                if s.stress is not None:
                    self.stress_pattern[i] = s.stress
                if s.tone is not None:
                    tones = self.boundaries.setdefault('tone', {})
                    # Only record the index where the tone changes
                    if s.tone != cur_tone:
                        tones[i] = s.tone
                        cur_tone = s.tone
                if s.group is not None:
                    morphemes = self.boundaries.setdefault('morpheme', [])
                    # A new group value starts a new morpheme at index i
                    if s.group != cur_group:
                        morphemes.append(i)
                        cur_group = s.group
            except AttributeError:
                # Plain str / dict / 3-item list elements
                self._list.append(self._plain_symbol(s))

    @staticmethod
    def _plain_symbol(s):
        """Extract the symbol from a str, dict or 3-item list element."""
        if isinstance(s, str):
            return s
        if isinstance(s, dict):
            try:
                return s['label']
            except KeyError:
                return s['symbol']
        if isinstance(s, list) and len(s) == 3:
            return s[0]
        raise NotImplementedError('That format for seg_list is not supported.')

    def with_word_boundaries(self):
        """
        Return the string of segments with word boundaries surrounding them

        Returns
        -------
        list
            Transcription with word boundaries
        """
        return ['#'] + self._list + ['#']

    def _windows(self, width):
        """Yield every run of ``width`` consecutive boundary-padded symbols."""
        padded = self.with_word_boundaries()
        return zip(*[padded[offset:] for offset in range(width)])

    def find(self, environment):
        """
        Find instances of an EnvironmentFilter in the Transcription

        Parameters
        ----------
        environment : EnvironmentFilter
            EnvironmentFilter to search for

        Returns
        -------
        list or None
            List of Environments that fit the EnvironmentFilter, or None
            when ``environment`` is not an EnvironmentFilter or nothing fits
        """
        if not isinstance(environment, EnvironmentFilter):
            return None
        if all(m not in self for m in environment._middle):
            return None
        lhs_num = environment.lhs_count()
        # The middle segment sits directly after the left-hand side
        middle_num = lhs_num
        rhs_num = middle_num + 1
        envs = []
        for i, p in enumerate(self._windows(len(environment))):
            if p in environment:
                envs.append(Environment(p[middle_num], i + middle_num,
                                        p[:lhs_num], p[rhs_num:]))
        return envs if envs else None

    def find_nonmatch(self, environment):
        """
        Find all instances of an EnvironmentFilter in the Transcription
        that match in the middle segments, but don't match on the sides

        Parameters
        ----------
        environment : EnvironmentFilter
            EnvironmentFilter to search for

        Returns
        -------
        list or None
            List of Environments that fit the EnvironmentFilter's middle
            but not the sides, or None when ``environment`` is not an
            EnvironmentFilter or there are no such instances
        """
        if not isinstance(environment, EnvironmentFilter):
            return None
        if all(m not in self for m in environment.middle):
            return None
        lhs_num = environment.lhs_count()
        # The middle segment sits directly after the left-hand side
        middle_num = lhs_num
        rhs_num = middle_num + 1
        envs = []
        for i, p in enumerate(self._windows(len(environment))):
            if p not in environment and p[middle_num] in environment.middle:
                envs.append(Environment(p[middle_num], i + middle_num,
                                        p[:lhs_num], p[rhs_num:]))
        return envs if envs else None

    def match_segments(self, segments):
        """
        Returns a matching segments from a list of segments

        Parameters
        ----------
        segments : list
            List of Segments or strings to filter the Transcription

        Returns
        -------
        list
            List of segments (in their original order) that match the
            segment parameter
        """
        return [s for s in self if s in segments]

    def __contains__(self, other):
        if isinstance(other, str):
            return other in self._list
        if isinstance(other, Segment):
            return other.symbol in self._list
        return False

    def __setstate__(self, state):
        # Fill in attributes that may be absent from the pickled state
        state.setdefault('stress_pattern', {})
        state.setdefault('boundaries', {})
        self.__dict__.update(state)

    def __hash__(self):
        return hash(str(self))

    def __getitem__(self, key):
        if isinstance(key, (int, slice)):
            return self._list[key]
        raise KeyError

    def __repr__(self):
        return self.__str__()

    def __str__(self):
        """
        Render symbols joined by '.', each followed by its stress and tone
        marks, with '-' between morphemes when morpheme boundaries exist.
        """
        rendered = []
        for i, s in enumerate(self._list):
            if self.stress_pattern and i in self.stress_pattern:
                s += self.stress_pattern[i]
            if 'tone' in self.boundaries and i in self.boundaries['tone']:
                s += self.boundaries['tone'][i]
            rendered.append(s)
        if 'morpheme' not in self.boundaries:
            return '.'.join(rendered)
        chunks = []
        start = 0
        for boundary in self.boundaries['morpheme']:
            chunks.append('.'.join(rendered[start:boundary]))
            # Next morpheme begins where this one ended
            start = boundary
        chunks.append('.'.join(rendered[start:]))
        return '-'.join(chunks)

    def __iter__(self):
        for s in self._list:
            yield s

    def __add__(self, other):
        """
        Allow for Transcriptions to be added to get all the segments in each

        Returns a plain list of symbols, not a Transcription.
        """
        if not isinstance(other, Transcription):
            raise TypeError
        return self._list + other._list

    def __eq__(self, other):
        if isinstance(other, list):
            # Against a plain list only the symbols are compared
            if len(other) != len(self):
                return False
            for i, s in enumerate(self):
                if s != other[i]:
                    return False
            return True
        if not isinstance(other, Transcription):
            return False
        return (self._list == other._list
                and self.stress_pattern == other.stress_pattern
                and self.boundaries == other.boundaries)

    def __ne__(self, other):
        return not self.__eq__(other)

    def __lt__(self, other):
        if isinstance(other, Transcription):
            return self._list < other._list
        return self._list < other

    def __le__(self, other):
        if isinstance(other, Transcription):
            return (self._list == other._list or self._list < other._list)
        return self._list <= other

    def __ge__(self, other):
        if isinstance(other, Transcription):
            return (self._list == other._list or self._list > other._list)
        return self._list >= other

    def __gt__(self, other):
        if isinstance(other, Transcription):
            return self._list > other._list
        return self._list > other

    def __len__(self):
        return len(self._list)

class FeatureMatrix(object):
    """
    An object that stores feature values for segments

    Parameters
    ----------
    name : str
        Name to give the FeatureMatrix
    feature_entries : list
        List of dict with one dictionary per segment, requires the key
        of symbol which identifies the segment

    Attributes
    ----------
    name : str
        An informative identifier for the feature matrix
    features : list
        Sorted list of feature names
    possible_values : set
        Set of values used in the FeatureMatrix
    default_value : str
        Default feature value, usually corresponding to unspecified features
    stresses : dict
        Mapping of stress values to segments that bear that stress
    places : dict
        Mapping from place of articulation labels to a feature specification
    manners : dict
        Mapping from manner of articulation labels to a feature specification
    height : dict
        Mapping from vowel height labels to a feature specification
    backness : dict
        Mapping from vowel backness labels to a feature specification
    vowel_feature : str
        Feature value (i.e., '+voc') that separates vowels from consonants
    voice_feature : str
        Feature value (i.e., '+voice') that codes voiced obstruents
    diph_feature : str
        Feature value (i.e., '+diphthong' or '.high') that separates
        diphthongs from monophthongs
    rounded_feature : str
        Feature value (i.e., '+round') that codes rounded vowels


    """

    def __init__(self, name, feature_entries):
        self.name = name
        self._features = None
        self.possible_values = set()
        self.matrix = {}
        self._default_value = 'n'
        for entry in feature_entries:
            spec = {k: v for k, v in entry.items() if k != 'symbol'}
            # The feature inventory is taken from the first entry only
            if self._features is None:
                self._features = set(spec)
            seg = Segment(entry['symbol'])
            seg.specify(spec)
            self.matrix[entry['symbol']] = seg
            self.possible_values.update(spec.values())
        if self._features is None:
            # No entries at all: keep an empty inventory so the ``features``
            # property (used just below) does not fail on None
            self._features = set()

        # Word-boundary symbol: present in the matrix, carries no features
        self.matrix['#'] = Segment('#')
        self.places = collections.OrderedDict()
        self.manners = collections.OrderedDict()
        self.backness = collections.OrderedDict()
        self.height = collections.OrderedDict()
        self.generate_generic_names()

    def generate_generic_names(self):
        """
        Fill the category tables and the vowel/voice/diphthong/rounded
        feature values, choosing the table set by which feature names exist
        ('consonantal' -> Hayes tables, 'voc' -> SPE tables, else empty).
        """
        if 'consonantal' in self.features:
            self.generate_generic_hayes()
            self.vowel_feature = '+syllabic'
            self.voice_feature = '+voice'
            self.diph_feature = '+diphthong'
            self.rounded_feature = '+round'
        elif 'voc' in self.features:
            self.generate_generic_spe()
            self.vowel_feature = '+voc'
            self.voice_feature = '+voice'
            self.diph_feature = '.high'
            self.rounded_feature = '+round'
        else:
            self.generate_generic()
            self.vowel_feature = []
            self.voice_feature = []
            self.diph_feature = []
            self.rounded_feature = []

    def generate_generic(self):
        """Register every category label with an empty specification."""
        for label in ('Labial', 'Labiodental', 'Dental', 'Alveolar',
                      'Alveopalatal', 'Palatal', 'Velar', 'Uvular',
                      'Pharyngeal', 'Glottal'):
            self.places[label] = {}

        for label in ('Stop', 'Nasal', 'Trill', 'Tap', 'Fricative',
                      'Affricate', 'Approximant', 'Lateral approximant'):
            self.manners[label] = {}

        for label in ('Front', 'Near front', 'Central', 'Near back', 'Back'):
            self.backness[label] = {}

        for label in ('Close', 'Near close', 'Close mid', 'Open mid', 'Open'):
            self.height[label] = {}

    def generate_generic_spe(self):
        """Fill the category tables using SPE-style feature names."""
        self.places['Labial'] = {'ant':'+', 'back': '-', 'cor':'-'}
        self.places['Labiodental'] =  {'ant':'+', 'back': '-', 'cor':'-'}
        self.places['Dental'] = {'ant':'+', 'back': '-', 'cor':'+'}
        self.places['Alveolar'] = {'ant':'-', 'back': '-', 'cor':'+', 'high': '-'}
        self.places['Alveopalatal'] = {'ant':'-', 'back': '-', 'cor':'+', 'high': '+'}
        self.places['Palatal'] = {'ant':'-', 'back': '-', 'cor':'-'}
        self.places['Velar'] = {'ant':'-', 'back': '+', 'cor':'-', 'high': '+'}
        self.places['Uvular'] = {'ant':'-', 'back': '+', 'cor':'-', 'high': '-'}
        self.places['Pharyngeal'] = {'low':'+', 'back': '+'}
        self.places['Glottal'] = {'low':'+', 'back': '-'}

        self.manners['Stop'] = {'son': '-','cont':'-','nasal':'-'}
        self.manners['Nasal'] = {'nasal': '+'}
        self.manners['Trill'] = {}
        self.manners['Tap'] = {}
        self.manners['Fricative'] = {'son': '-','cont':'+','nasal':'-'}
        self.manners['Affricate'] = {'del_rel':'+'}
        self.manners['Approximant'] = {'son':'+', 'nasal': '-', 'lat':'-'}
        self.manners['Lateral approximant'] = {'son':'+', 'nasal': '-', 'lat':'+'}

        self.backness['Front'] = {'back':'-', 'tense':'+'}
        self.backness['Near front'] = {'back': '-', 'tense': '-'}
        self.backness['Central'] = {'back': 'n'}
        self.backness['Near back'] = {'back': '+', 'tense':'-'}
        self.backness['Back'] = {'back':'+', 'tense':'+'}

        self.height['Close'] = {'high':'+', 'low':'-', 'tense':'+'}
        self.height['Near close'] = {'high':'+', 'low':'-', 'tense':'-'}
        self.height['Close mid'] = {'high':'-', 'low':'-', 'tense':'+'}
        self.height['Open mid'] = {'high':'-', 'low':'-', 'tense':'-'}
        self.height['Open'] = {'high':'-', 'low':'+'}

    def generate_generic_hayes(self):
        """Fill the category tables using Hayes-style feature names."""
        self.places['Labial'] = {'labial': '+', 'coronal':'-'}
        self.places['Labiodental'] = {'labiodental': '+',}
        self.places['Dental'] = {'anterior': '+', 'coronal':'+', 'labial':'-'}
        self.places['Alveolar'] = {}
        self.places['Alveopalatal'] = {'anterior': '-', 'coronal':'+', 'labial':'-'}
        self.places['Palatal'] = {'dorsal': '+', 'coronal':'+', 'labial':'-'}
        self.places['Velar'] = {'dorsal': '+', 'labial':'-'}
        self.places['Uvular'] = {'dorsal': '+', 'back':'+', 'labial':'-'}
        self.places['Pharyngeal'] = {}
        self.places['Glottal'] = {'dorsal': '-', 'coronal':'-', 'labial':'-', 'nasal': '-'}

        self.manners['Stop'] = {'sonorant': '-','continuant':'-','nasal':'-','delayed_release':'-'}
        self.manners['Nasal'] = {'nasal': '+'}
        self.manners['Trill'] = {'trill': '+'}
        self.manners['Tap'] = {'tap': '+'}
        self.manners['Fricative'] = {'sonorant': '-','continuant':'+'}
        self.manners['Affricate'] = {'sonorant': '-', 'continuant':'-','delayed_release':'+'}
        self.manners['Approximant'] = {'sonorant': '+', 'lateral':'-'}
        self.manners['Lateral approximant'] = {'sonorant': '+', 'lateral':'+'}

        self.backness['Front'] = {'front': '+', 'back':'-', 'tense':'+'}
        self.backness['Near front'] = {'front': '+', 'back': '-', 'tense': '-'}
        self.backness['Central'] = {'front': '-', 'back': '-'}
        # NOTE(review): any segment matching 'Near back' also matches
        # 'Central' above, which is checked first -- confirm 'back': '-'
        # is intended here.
        self.backness['Near back'] = {'front': '-', 'back': '-', 'tense':'-'}
        self.backness['Back'] = {'front':'-', 'back':'+', 'tense':'+'}

        self.height['Close'] = {'high':'+', 'low':'-', 'tense':'+'}
        self.height['Near close'] = {'high':'+', 'low':'-', 'tense':'-'}
        self.height['Close mid'] = {'high':'-', 'low':'-', 'tense':'+'}
        self.height['Open mid'] = {'high':'-', 'low':'-', 'tense':'-'}
        self.height['Open'] = {'high':'-', 'low':'+'}

    def __eq__(self, other):
        """Two matrices are equal when their segment matrices are equal."""
        if not isinstance(other, FeatureMatrix):
            return False
        return self.matrix == other.matrix

    def features_to_segments(self, feature_description):
        """
        Given a feature description, return the segments in the inventory
        that match that feature description

        Feature descriptions should be either lists, such as
        ['+feature1', '-feature2'] or strings that can be separated into
        lists by ',', such as '+feature1,-feature2'.

        Parameters
        ----------
        feature_description : str, list, or dict
            Feature values that specify the segments, see above for format

        Returns
        -------
        list of str
            Symbols of the segments that match the feature description

        """
        if isinstance(feature_description, str):
            feature_description = feature_description.split(',')
        return [symbol for symbol, seg in self.matrix.items()
                if seg.feature_match(feature_description)]

    def __setstate__(self, state):
        if '_features' not in state:
            state['_features'] = state['features']
        matrix = state['matrix']
        for symbol, value in matrix.items():
            if isinstance(value, Segment):
                # Re-specify so stored feature names get lower-cased
                value.specify(value.features)
            else:
                # Value is a raw feature dict: wrap it in a Segment
                seg = Segment(symbol)
                seg.specify(value)
                matrix[symbol] = seg
        self.__dict__.update(state)

        #Backwards compatability
        if '_default_value' not in state:
            self._default_value = 'n'
        if 'places' not in state:
            self.places = collections.OrderedDict()
            self.manners = collections.OrderedDict()
            self.backness = collections.OrderedDict()
            self.height = collections.OrderedDict()
            self.generate_generic_names()

    def __iter__(self):
        """Yield the Segments in sorted symbol order."""
        for symbol in sorted(self.matrix):
            yield self.matrix[symbol]

    def _fill_missing(self, value):
        """Give every segment ``value`` for each feature it does not have."""
        for seg in self.matrix.values():
            for feature in self._features:
                if feature not in seg:
                    seg[feature] = value

    def validate(self):
        """
        Make sure that all segments in the matrix have all the features.
        If not, add an unspecified value for that feature to them.
        """
        self._fill_missing(self._default_value)

    @property
    def default_value(self):
        return self._default_value

    @property
    def features(self):
        """
        Get a list of features that are used in this feature system

        Returns
        -------
        list
            Sorted list of the names of all features in the matrix
        """
        return sorted(list(self._features))

    def add_segment(self, seg, feat_spec):
        """
        Add a segment with a feature specification to the feature system

        Attributes
        ----------
        seg : str
            Segment symbol to add to the feature system
        feat_spec : dictionary
            Dictionary with features as keys and feature values as values

        Raises
        ------
        AttributeError
            If ``feat_spec`` names a feature this matrix does not define
        """

        #Validation
        for f in feat_spec.keys():
            if f not in self._features:
                raise AttributeError("The segment '%s' has a feature '%s' that is not defined for this feature matrix" % (seg, f))
        s = Segment(seg)
        s.specify(feat_spec)
        self.matrix[seg] = s

    def add_feature(self, feature, default = None):
        """
        Add a feature to the feature system

        Attributes
        ----------
        feature : str
            Name of the feature to add to the feature system
        default : str, optional
            If specified, set the value for all segments to this value,
            otherwise use the FeatureMatrix's ``default_value``
        """

        self._features.update({feature})
        if default is None:
            self.validate()
        else:
            self._fill_missing(default)

    def valid_feature_strings(self):
        """
        Get all combinations of ``possible_values`` and ``features``

        Returns
        -------
        list
            List of valid feature strings
        """
        names = self.features
        return [value + name for value in self.possible_values
                for name in names]

    @staticmethod
    def _first_label(seg, table):
        """Return the first label in ``table`` whose spec ``seg`` matches."""
        for label, spec in table.items():
            if seg.feature_match(spec):
                return label
        return None

    def categorize(self, seg):
        """
        Categorize a segment into consonant/vowel, place of articulation,
        manner of articulation, voicing, vowel height, vowel backness, and vowel
        rounding.

        For consonants, the category is of the format:

        ['Consonant', PLACE, MANNER, VOICING]

        For vowels, the category is of the format:

        ['Vowel', HEIGHT, BACKNESS, ROUNDED]

        Diphthongs are categorized differently:

        ['Diphthong', 'Vowel']

        Each table is scanned in insertion order and the first matching
        label wins.

        Parameters
        ----------
        seg : Segment
            Segment to categorize

        Returns
        -------
        list or None
            Returns categories according to the formats above, if any are
            unable to be calculated, returns None in those places.
            Returns None for the word-boundary symbol '#'.
        """
        if seg == '#':
            return None
        if seg.feature_match(self.vowel_feature):
            if seg.feature_match(self.diph_feature):
                return ['Diphthong', 'Vowel']
            if seg.feature_match(self.rounded_feature):
                rounding = 'Rounded'
            else:
                rounding = 'Unrounded'
            return ['Vowel',
                    self._first_label(seg, self.height),
                    self._first_label(seg, self.backness),
                    rounding]

        if seg.feature_match(self.voice_feature):
            voicing = 'Voiced'
        else:
            voicing = 'Voiceless'
        return ['Consonant',
                self._first_label(seg, self.places),
                self._first_label(seg, self.manners),
                voicing]

    @property
    def segments(self):
        """
        Return a list of segment symbols that are specified in the feature
        system

        Returns
        -------
        list
            List of all the segments with feature specifications
        """
        return list(self.matrix.keys())

    def seg_to_feat_line(self, symbol):
        """
        Get a list of feature values for a given segment in the order
        that features are returned by the ``features`` property

        Use for display purposes

        Attributes
        ----------
        symbol : str
            Segment symbol to look up

        Returns
        -------
        list
            The symbol itself followed by its feature values
        """
        seg = self.matrix[symbol]
        return [symbol] + [seg[feat] for feat in self.features]

    def __getitem__(self, item):
        """
        Look up a Segment by symbol, or one feature value by a
        (symbol, feature) tuple; any other key type yields None.
        """
        if isinstance(item, str):
            return self.matrix[item]
        elif isinstance(item, tuple):
            return self.matrix[item[0]][item[1]]

    def __delitem__(self, item):
        del self.matrix[item]

    def __contains__(self, item):
        # Compare by equality against the keys rather than by hash, so
        # items that merely compare equal to a symbol are still found
        return item in list(self.matrix)

    def __setitem__(self, key, value):
        self.matrix[key] = value

    def __len__(self):
        return len(self.matrix)

[docs]class Word(object):
    """An object representing a word in a corpus

    Information about the attributes are contained in the Corpus' ``attributes``.

    Attributes
    ----------
    spelling : str
        A representation of a word that lacks phonological information.

    transcription : Transcription
        A representation of a word that includes phonological information.

    frequency : float
        Token frequency in a corpus
    """

    _freq_names = ['abs_freq', 'freq_per_mil','sfreq',
        'lowercase_freq', 'log10_freq']

    def __init__(self, **kwargs):
        """
        Build a Word from keyword arguments, one per attribute.

        A tuple value is unpacked as ``(attribute, raw_value)`` and converted
        according to ``attribute.att_type``: 'numeric' values are parsed with
        ``locale.atof`` (NaN when parsing fails) and 'tier' values are
        wrapped in a Transcription.

        For any other value the key is lower-cased, keys listed in
        ``_freq_names`` are stored as 'frequency', list values are wrapped in
        a Transcription, and non-spelling values are stored as float when
        they convert to a finite number.  The key is then recorded in
        ``descriptors`` if not already there.

        Raises
        ------
        ValueError
            If neither a spelling nor a transcription was supplied
        """

        # NOTE(review): this binds a local that is never read; no instance
        # attribute is set here (__getstate__ writes a '_corpus' state key)
        # -- confirm whether ``self._corpus`` was intended.
        _corpus = None

        self.transcription = None
        self.spelling = None
        self.frequency = 0
        self.wordtokens = []
        self.descriptors = ['spelling','transcription', 'frequency']
        for key, value in kwargs.items():
            if isinstance(value, tuple):
                # (attribute, raw value) pair: convert by attribute type
                att, value = value
                if att.att_type == 'numeric':
                    try:
                        value = locale.atof(value)
                    except (ValueError, TypeError):
                        value = float('nan')
                elif att.att_type == 'tier':
                    value = Transcription(value)
            else:
                key = key.lower()
                # Every known frequency column name maps to 'frequency'
                if key in self._freq_names:
                    key = 'frequency'
                if isinstance(value,list):
                    #assume transcription type stuff
                    value = Transcription(value)
                elif key != 'spelling':
                    # Store as float only when finite; otherwise keep the
                    # value exactly as given
                    try:
                        f = float(value)
                        if not math.isnan(f) and not math.isinf(f):
                            value = f
                    except (ValueError, TypeError):
                        pass
                if key not in self.descriptors:
                    self.descriptors.append(key)
            setattr(self, key, value)
        if self.spelling is None and self.transcription is None:
            raise(ValueError('Words must be specified with at least a spelling or a transcription.'))
        if self.spelling is None:
            # Fall back to the concatenated transcription symbols
            self.spelling = ''.join(map(str,self.transcription))

    def __hash__(self):
        return hash((self.spelling,str(self.transcription)))

    def __getstate__(self):
        state = self.__dict__.copy()
        state['wordtokens'] = []
        state['_corpus'] = None
        #for k,v in state.items():
        #    if (k == 'transcription' or k in self.tiers) and v is not None:
        #        state[k] = [x.symbol for x in v] #Only store string symbols
        return state

    def __setstate__(self, state):
        self.transcription = []
        self.spelling = ''
        self.frequency = 0
        if 'wordtokens' not in state:
            state['wordtokens'] = []
        if 'descriptors' not in state:
            state['descriptors'] = ['spelling','transcription', 'frequency']
        if 'frequency' not in state['descriptors']:
            state['descriptors'].append('frequency')
        try:
            tiers = state.pop('tiers')
            for t in tiers:
                state['descriptors'].append(t)
        except KeyError:
            pass
        self.__dict__.update(state)

[docs]    def add_abstract_tier(self, tier_name, tier_segments):
        """
        Add an abstract tier to the Word

        Parameters
        ----------
        tier_name : str
            Attribute name
        tier_segments: dict
            Dictionary with keys of the abstract segments (i.e., 'C' or 'V')
            and values that are sets of segments
        """
        tier = []
        for s in self.transcription:
            for k,v in tier_segments.items():
                if s in v:
                    tier.append(k)
                    break
        setattr(self,tier_name,''.join(tier))

[docs]    def add_attribute(self, tier_name, value):
        """
        Add an arbitrary attribute to the Word

        Parameters
        ----------
        tier_name : str
            Attribute name
        value: object
            Attribute value
        """
        setattr(self, tier_name, value)

[docs]    def add_tier(self, tier_name, tier_segments):
        """Adds a new tier attribute to the Word

        Parameters
        ----------
        tier_name : str
            Name for the new tier

        tier_segments: list of segments
            Segments that count for inclusion in the tier
        """
        matching_segs = self.transcription.match_segments(tier_segments)
        new_tier = Transcription(matching_segs)
        setattr(self,tier_name,new_tier)
        for wt in self.wordtokens:
            matching_segs = wt.transcription.match_segments(tier_segments)
            new_tier = Transcription(matching_segs)
            setattr(wt,tier_name,new_tier)


[docs]    def remove_attribute(self, attribute_name):
        """Deletes a tier attribute from a Word

        Parameters
        ----------
        attribute_name : str
            Name of tier attribute to be deleted.

        Notes
        -----
        If attribute_name is not a valid attribute, this function does nothing. It
        does not raise an error.

        """
        if attribute_name.startswith('_'):
            return
        try:
            delattr(self, attribute_name)
        except ValueError:
            pass #attribute_name does not exist

[docs]    def variants(self, sequence_type = 'transcription'):
        """
        Get variants and frequencies for a Word

        Parameters
        ----------
        sequence_type : str, optional
            Tier name to get variants

        Returns
        -------
        dict
            Dictionary with keys of Transcriptions and values of their frequencies
        """
        return collections.Counter(getattr(x,sequence_type) for x in self.wordtokens)

    def __repr__(self):
        return '<Word: \'%s\'>' % self.spelling

    def __str__(self):
        return self.spelling

    def __eq__(self, other):
        if not isinstance(other,Word):
            return False
        if self.spelling != other.spelling:
            return False
        if self.transcription != other.transcription:
            return False
        return True

    def __ne__(self, other):
        return not self.__eq__(other)

    def __lt__(self, other):
        return self.spelling < other.spelling

    def __gt__(self, other):
        return self.spelling > other.spelling

    def __le__(self, other):
        return self.spelling <= other.spelling

    def __ge__(self, other):
        return self.spelling >= other.spelling

class Environment(object):
    """
    Specific sequence of segments that was a match for an EnvironmentFilter

    Parameters
    ----------
    middle : str
        Middle segment
    position : int
        Position of the middle segment in the word (to differentiate between
        repetitions of an environment in the same word)
    lhs : list, optional
        Segments to the left of the middle segment
    rhs : list, optional
        Segments to the right of the middle segment

    Attributes
    ----------
    lhs_string : str or None
        When set, used by ``str()`` in place of the joined ``lhs``
    rhs_string : str or None
        When set, used by ``str()`` in place of the joined ``rhs``
    middle_string : str or None
        Initialised to None; not read by this class
    """
    def __init__(self, middle, position, lhs = None, rhs = None):
        self.middle = middle
        self.position = position
        self.lhs = lhs
        self.rhs = rhs
        self.lhs_string = None
        self.rhs_string = None
        self.middle_string = None

    def __getitem__(self, key):
        """
        Index the environment as the sequence lhs + [middle] + rhs

        Raises
        ------
        KeyError
            If ``key`` points past the middle segment and there is no
            right hand side
        """
        # Number of segments that precede the middle segment
        offset = len(self.lhs) if self.lhs is not None else 0
        if key < offset:
            return self.lhs[key]
        if key == offset:
            return self.middle
        if self.rhs is not None:
            return self.rhs[key - offset - 1]
        raise KeyError('Index out of bounds')

    @staticmethod
    def _side_to_string(label, segments):
        """Render one side: explicit label, else joined segments, else ''."""
        if label is not None:
            return label
        if segments is not None:
            return ''.join(segments)
        return ''

    def __str__(self):
        left = self._side_to_string(self.lhs_string, self.lhs)
        right = self._side_to_string(self.rhs_string, self.rhs)
        return '_'.join([left, right])

    def __repr__(self):
        return self.__str__()

    def __hash__(self):
        # lhs/rhs are documented as lists, which are unhashable; hash
        # them as tuples.  Values that were already hashable are left
        # alone, so their hashes are unchanged.
        # NOTE(review): __eq__ treats an empty side of `other` as a
        # wildcard and ignores `middle`, so objects that compare equal
        # can still hash differently.
        lhs = tuple(self.lhs) if isinstance(self.lhs, list) else self.lhs
        rhs = tuple(self.rhs) if isinstance(self.rhs, list) else self.rhs
        return hash((lhs, self.position, self.middle, rhs))

    def __eq__(self,other):
        """
        Two Environments are equal if they share a position and a left
        AND right hand side.  An empty lhs or rhs on ``other`` is an
        automatic match for that side.
        """
        if not isinstance(other,Environment):
            return False

        if other.lhs and other.lhs != self.lhs:
            return False
        if other.rhs and other.rhs != self.rhs:
            return False
        if other.position != self.position:
            return False
        return True

    def __ne__(self,other):
        return not self.__eq__(other)

class EnvironmentFilter(object):
    """
    Filter to use for searching words to generate Environments that match

    Parameters
    ----------
    middle_segments : set
        Set of segments to center environments
    lhs : list, optional
        List of set of segments on the left of the middle
    rhs : list, optional
        List of set of segments on the right of the middle

    Notes
    -----
    ``lhs`` and ``rhs`` are stored as tuples of frozensets and the
    middle as a frozenset, which keeps the filter hashable.  Equality
    and hashing only consider ``lhs`` and ``rhs``, not the middle.
    """
    def __init__(self, middle_segments, lhs = None, rhs = None):
        self.original_middle = middle_segments
        self.lhs = lhs
        self.rhs = rhs

        self.lhs_string = None
        self.rhs_string = None
        self._sanitize()

    @property
    def middle(self):
        return self.original_middle

    @middle.setter
    def middle(self, middle_segments):
        self.original_middle = middle_segments
        self._sanitize()

    @staticmethod
    def _freeze_side(side):
        """Return ``side`` as a tuple of frozensets (None stays None)."""
        if side is None:
            return None
        return tuple(seg_set if isinstance(seg_set, frozenset)
                     else frozenset(seg_set)
                     for seg_set in side)

    def _sanitize(self):
        """Normalise lhs, rhs and the middle into hashable containers."""
        self.lhs = self._freeze_side(self.lhs)
        self.rhs = self._freeze_side(self.rhs)
        # Assign the backing attribute directly: going through the
        # `middle` setter would re-enter this method.
        if not isinstance(self.original_middle, frozenset):
            self.original_middle = frozenset(self.original_middle)
        # Flattened view of the middle, used for iteration and matching.
        # NOTE(review): frozenset members of the middle match neither
        # branch below and are therefore ignored -- confirm intended.
        flattened = set()
        for m in self.original_middle:
            if isinstance(m, str):
                flattened.add(m)
            elif isinstance(m, (list, tuple, set)):
                flattened.update(m)
        self._middle = flattened

    def is_applicable(self, sequence):
        """
        Check whether the Environment filter is applicable to the sequence
        (i.e., the sequence must be greater or equal in length to the
        EnvironmentFilter)

        Parameters
        ----------
        sequence : list
            Sequence to check applicability

        Returns
        -------
        bool
            True if the sequence is equal length or longer than the
            EnvironmentFilter
        """
        return len(sequence) >= len(self)

    def compile_re_pattern(self):
        # Placeholder hook invoked by set_lhs/set_rhs; does nothing here
        pass

    def lhs_count(self):
        """
        Get the number of elements on the left hand side

        Returns
        -------
        int
            Length of the left hand side
        """
        if self.lhs is None:
            return 0
        return len(self.lhs)

    def rhs_count(self):
        """
        Get the number of elements on the right hand side

        Returns
        -------
        int
            Length of the right hand side
        """
        if self.rhs is None:
            return 0
        return len(self.rhs)

    def set_lhs(self, lhs):
        """Replace the left hand side, normalised the same way as in __init__."""
        # Storing the raw value would leave a list of plain sets behind,
        # which breaks __hash__ and comparisons with other filters.
        self.lhs = self._freeze_side(lhs)
        self.compile_re_pattern()

    def set_rhs(self, rhs):
        """Replace the right hand side, normalised the same way as in __init__."""
        self.rhs = self._freeze_side(rhs)
        self.compile_re_pattern()

    def __iter__(self):
        # Yields one set of segments per position: lhs, middle, rhs
        if self.lhs is not None:
            for s in self.lhs:
                yield s
        yield self._middle
        if self.rhs is not None:
            for s in self.rhs:
                yield s

    def __len__(self):
        # The middle always counts as one position
        return 1 + self.lhs_count() + self.rhs_count()

    @staticmethod
    def _side_to_string(label, side):
        """Render one side: explicit label, else '{a,b}{c}' form, else ''."""
        if label is not None:
            return label
        if side is None:
            return ''
        # Sorted so the text does not depend on set iteration order
        return ''.join('{' + ','.join(sorted(x)) + '}' for x in side)

    def __str__(self):
        left = self._side_to_string(self.lhs_string, self.lhs)
        right = self._side_to_string(self.rhs_string, self.rhs)
        return '_'.join([left, right])

    def __eq__(self, other):
        # Duck-typed: anything exposing matching lhs and rhs is equal
        if not hasattr(other,'lhs'):
            return False
        if not hasattr(other,'rhs'):
            return False
        if self.lhs != other.lhs:
            return False
        if self.rhs != other.rhs:
            return False
        return True

    def __hash__(self):
        return hash((self.rhs, self.lhs))

    def __contains__(self, sequence):
        # Position-wise membership test; `sequence` must be indexable for
        # at least len(self) positions (see is_applicable).
        for i, s in enumerate(self):
            if sequence[i] not in s:
                return False
        return True

class Attribute(object):
    """
    Attributes are for collecting summary information about attributes of
    Words or WordTokens, with different types of attributes allowing for
    different behaviour

    Parameters
    ----------
    name : str
        Python-safe name for using `getattr` and `setattr` on Words and
        WordTokens

    att_type : str
        Either 'spelling', 'tier', 'numeric' or 'factor'

    display_name : str
        Human-readable name of the Attribute, defaults to None

    default_value : object
        Default value for initializing the attribute

    Attributes
    ----------
    name : string
        Python-readable name for the Attribute on Word and WordToken objects

    display_name : string
        Human-readable name for the Attribute

    default_value : object
        Default value for the Attribute.  The type of `default_value` is
        dependent on the attribute type.  Numeric Attributes have a float
        default value.  Factor and Spelling Attributes have a string
        default value.  Tier Attributes have a Transcription default value.

    range : object
        Range of the Attribute, type depends on the attribute type.  Numeric
        Attributes have a two-element list for the minimum and maximum
        (starting at ``[0, 0]``).  The range for Factor Attributes is a set
        of all factor levels.  The range for Tier Attributes is the set of
        segments in that tier across the corpus.  The range for Spelling
        Attributes is None.
    """
    ATT_TYPES = ['spelling', 'tier', 'numeric', 'factor']

    def __init__(self, name, att_type, display_name = None, default_value = None):
        self.name = name
        self.att_type = att_type
        self._display_name = display_name

        # A supplied default is only honoured when it has the type that
        # suits the attribute; otherwise a neutral default is used.
        if self.att_type == 'numeric':
            self._range = [0,0]
            if isinstance(default_value, (int, float)):
                self._default_value = default_value
            else:
                self._default_value = 0
        elif self.att_type == 'factor':
            if isinstance(default_value, str):
                self._default_value = default_value
            else:
                self._default_value = ''
            if default_value:
                self._range = set([default_value])
            else:
                self._range = set()
        elif self.att_type == 'spelling':
            self._range = None
            if isinstance(default_value, str):
                self._default_value = default_value
            else:
                self._default_value = ''
        elif self.att_type == 'tier':
            self._range = set()
            self._delim = None
            if isinstance(default_value, Transcription):
                self._default_value = default_value
            else:
                self._default_value = Transcription(None)

    @property
    def delimiter(self):
        # Only tier attributes carry a delimiter
        if self.att_type != 'tier':
            return None
        return self._delim

    @delimiter.setter
    def delimiter(self, value):
        self._delim = value

    @staticmethod
    def guess_type(values, trans_delimiters = None):
        """
        Guess the attribute type for a sequence of values

        Each value votes for one type: 'numeric' if ``float`` accepts it,
        otherwise 'tier' if it contains one of the delimiters, otherwise
        'factor' if the same value occurs more than once in ``values``,
        otherwise 'spelling'.

        Parameters
        ----------
        values : list
            List of strings to evaluate for the attribute type
        trans_delimiters : list, optional
            List of delimiters to look for in transcriptions, defaults
            to ``.``, a space, ``;``, and ``,``

        Returns
        -------
        str
            Attribute type that had the most success in parsing the
            values specified (ties go to the type listed first in
            ``ATT_TYPES``)
        """
        if trans_delimiters is None:
            trans_delimiters = ['.',' ', ';', ',']
        values = list(values)
        # Count every value once up front so the duplicate check below
        # is a lookup rather than a rescan of the whole list per value
        occurrences = collections.Counter(values)
        probable_values = {x: 0 for x in Attribute.ATT_TYPES}
        for v in values:
            try:
                float(v)
            except ValueError:
                pass
            else:
                probable_values['numeric'] += 1
                continue
            if any(d in v for d in trans_delimiters):
                probable_values['tier'] += 1
            elif occurrences[v] > 1:
                probable_values['factor'] += 1
            else:
                probable_values['spelling'] += 1
        return max(probable_values.items(), key=operator.itemgetter(1))[0]

    @staticmethod
    def sanitize_name(name):
        """
        Sanitize a display name into a Python-readable attribute name

        The name is lowercased and every non-word character is removed.

        Parameters
        ----------
        name : string
            Display name to sanitize

        Returns
        -------
        str
            Sanitized name
        """
        return re.sub(r'\W', '', name.lower())

    def __hash__(self):
        # Hashing on the name keeps Attributes interchangeable with
        # their name strings, matching __eq__
        return hash(self.name)

    def __repr__(self):
        return '<Attribute of type {} with name \'{}\'>'.format(self.att_type,self.name)

    def __str__(self):
        return self.display_name

    def __eq__(self,other):
        # Equal to another Attribute with the same name, or to the name
        # itself given as a string
        if isinstance(other,Attribute):
            if self.name == other.name:
                return True
        if isinstance(other,str):
            if self.name == other:
                return True
        return False

    @property
    def display_name(self):
        if self._display_name is not None:
            return self._display_name
        return self.name.title()

    @property
    def default_value(self):
        return self._default_value

    @default_value.setter
    def default_value(self, value):
        self._default_value = value
        # Only factor ranges are sets of levels.  Replacing the range of
        # the other types with a set breaks them (a numeric range must
        # stay an indexable [min, max] pair for update_range).
        if self.att_type == 'factor':
            self._range = set([value])

    @property
    def range(self):
        return self._range

    def update_range(self,value):
        """
        Update the range of the Attribute with the value specified.
        If the attribute is a Factor, the value is added to the set of levels.
        If the attribute is Numeric, the value expands the minimum and
        maximum values, if applicable.  If the attribute is a Tier, the
        value (a sequence of segments) has its elements added to the set
        of segments.  If the attribute is Spelling, nothing is done.

        A Numeric attribute that receives a string which cannot be
        converted to a float is turned into a Spelling attribute.

        Parameters
        ----------
        value : object
            Value to update range with, the type depends on the attribute
            type
        """
        if value is None:
            return
        if self.att_type == 'numeric':
            if isinstance(value, str):
                try:
                    value = float(value)
                except ValueError:
                    # Non-numeric data: demote the attribute to spelling
                    self.att_type = 'spelling'
                    self._range = None
                    return
            if value < self._range[0]:
                self._range[0] = value
            elif value > self._range[1]:
                self._range[1] = value
        elif self.att_type == 'factor':
            self._range.add(value)
        elif self.att_type == 'tier':
            # The range may be held as a list; convert it to a set first
            if isinstance(self._range, list):
                self._range = set(self._range)
            self._range.update(value)

class Inventory(object):
    """
    Inventories contain information about a Corpus' segmental inventory.
    In many cases, they are similar to FeatureMatrices, but more tailored
    to a specific corpus.  Where a FeatureMatrix would deal in feature
    specifications, inventories will deal primarily in sets of segments.

    Parameters
    ----------

    data : dict, optional
        Mapping from segment symbol to Segment objects

    Attributes
    ----------
    features : list
        List of all features used as specifications for segments
    possible_values : set
        Set of values that segments use for features
    stresses : dict
        Mapping of stress values to segments that bear that stress
    places : dict
        Mapping from place of articulation labels to sets of segments
    manners : dict
        Mapping from manner of articulation labels to sets of segments
    height : dict
        Mapping from vowel height labels to sets of segments
    backness : dict
        Mapping from vowel backness labels to sets of segments
    vowel_feature : str
        Feature value (i.e., '+voc') that separates vowels from consonants
    voice_feature : str
        Feature value (i.e., '+voice') that codes voiced obstruents
    diph_feature : str
        Feature value (i.e., '+diphthong' or '.high') that separates
        diphthongs from monophthongs
    rounded_feature : str
        Feature value (i.e., '+round') that codes rounded vowels
    """
    def __init__(self, data = None):
        if data is None:
            # An empty inventory still knows the word-boundary symbol
            self._data = {'#' : Segment('#')}
        else:
            self._data = data
        self.features = []
        self.possible_values = set()
        self.stresses = collections.defaultdict(set)
        self.places = collections.OrderedDict()
        self.manners = collections.OrderedDict()
        self.height = collections.OrderedDict()
        self.backness = collections.OrderedDict()
        self.vowel_feature = None
        self.voice_feature = None
        self.diph_feature = None
        self.rounded_feature = None

    def __setstate__(self, state):
        """Restore from a pickled state, filling in any missing attributes."""
        # Same container type as __init__, so missing stress values
        # default to an empty set after unpickling too
        if 'stresses' not in state:
            state['stresses'] = collections.defaultdict(set)
        for mapping_name in ('places', 'manners', 'height', 'backness'):
            if mapping_name not in state:
                state[mapping_name] = collections.OrderedDict()
        for feature_name in ('vowel_feature', 'voice_feature',
                             'diph_feature', 'rounded_feature'):
            if feature_name not in state:
                state[feature_name] = None
        self.__dict__.update(state)

    def __len__(self):
        return len(self._data)

    def keys(self):
        return self._data.keys()

    def values(self):
        return self._data.values()

    def items(self):
        return self._data.items()

    def __getitem__(self, key):
        # A slice returns symbols (in sorted order); a plain key returns
        # the stored segment
        if isinstance(key, slice):
            return sorted(self._data)[key]
        return self._data[key]

    def __setitem__(self, key, value):
        self._data[key] = value

    def __iter__(self):
        # Segments are yielded in sorted order of their symbols
        for k in sorted(self._data):
            yield self._data[k]

    def __contains__(self, item):
        if isinstance(item, str):
            return item in self._data
        elif isinstance(item, Segment):
            return item.symbol in self._data
        return False

    def valid_feature_strings(self):
        """
        Get all combinations of ``possible_values`` and ``features``

        Returns
        -------
        list
            List of valid feature strings
        """
        return [v + f for v in self.possible_values for f in self.features]

    def find_min_feature_pairs(self, features, others = None):
        """
        Find sets of segments that differ only in certain features,
        optionally limited by a feature specification

        Parameters
        ----------
        features : list
            List of features (i.e. 'back' or 'round')
        others : list, optional
            Feature specification to limit sets

        Returns
        -------
        dict
            Dictionary with keys that correspond to the values of ``features``
            and values that are the list of segments with those feature values
        """
        binary_values = set('+-')
        output = collections.defaultdict(list)
        redundant = self.get_redundant_features(features, others)
        for seg in self:
            # Only segments with a +/- value for every feature qualify
            try:
                if any(seg[f] not in binary_values for f in features):
                    continue
            except KeyError:
                continue
            if not seg.feature_match(others):
                continue
            # Look for a partner that differs only in the given (and
            # redundant) features; the for/else skips partnerless segments
            for seg2 in self:
                if seg == seg2:
                    continue
                try:
                    if seg.minimal_difference(seg2, features + redundant):
                        break
                except KeyError:
                    continue
            else:
                continue
            seg_key = tuple(seg[f] for f in features)
            if seg not in output[seg_key]:
                output[seg_key].append(seg)
            seg2_key = tuple(seg2[f] for f in features)
            if seg2 not in output[seg2_key]:
                output[seg2_key].append(seg2)
        return output

    def get_redundant_features(self, features, others = None):
        """
        Autodetects redundant features, with the ability to subset
        the segments

        A feature is reported as redundant when, among the segments
        considered, every combination of values for ``features`` goes
        with a single value of that feature.

        Parameters
        ----------
        features : list
            List of features to find other features that consistently
            covary with them
        others : list, optional
            Feature specification that specifies a subset to look at

        Returns
        -------
        list
            List of redundant features
        """
        redundant_features = []
        if isinstance(features, str):
            features = [features]
        if others is None:
            others = []
        # Strip the leading value character ('+', '-', ...) from each spec
        other_feature_names = [x[1:] for x in others]
        for f in self.features:
            if f in features:
                continue
            if f in other_feature_names:
                continue
            feature_values = collections.defaultdict(set)
            for seg in self:
                if not seg.feature_match(others):
                    continue
                if seg == '#':
                    continue
                try:
                    value = tuple(seg[x] for x in features)
                except KeyError:
                    continue
                feature_values[value].add(seg[f])
                # Two different values for one combination: f is not
                # predictable from `features`
                if len(feature_values[value]) > 1:
                    break
            else:
                redundant_features.append(f)
        return redundant_features

    def features_to_segments(self, feature_description):
        """
        Given a feature description, return the segments in the inventory
        that match that feature description

        Feature descriptions should be either lists, such as
        ['+feature1', '-feature2'] or strings that can be separated into
        lists by ',', such as '+feature1,-feature2'.

        Parameters
        ----------
        feature_description : string or list
            Feature values that specify the segments, see above for format

        Returns
        -------
        list
            Symbols (keys of the inventory) of the segments that match the
            feature description

        """
        if isinstance(feature_description, str):
            feature_description = feature_description.split(',')
        return [symbol for symbol, seg in self._data.items()
                if seg.feature_match(feature_description)]

    def _segments_by_label(self, label_specs):
        """Map each label to the set of symbols matching its feature spec."""
        by_label = collections.OrderedDict()
        for label, spec in label_specs.items():
            if len(spec) == 0:
                # An empty spec selects nothing rather than everything
                by_label[label] = set()
            else:
                by_label[label] = set(self.features_to_segments(spec))
        return by_label

    def specify(self, specifier):
        """
        Specify segments in the inventory using a FeatureMatrix

        Parameters
        ----------
        specifier : FeatureMatrix
            Specifier to use for updating feature specifications, or
            None to clear all feature information
        """
        if specifier is None:
            for seg in self._data.values():
                seg.specify({})
            self.features = list()
            self.possible_values = set()
            # Clear the category mappings too, otherwise they keep
            # describing the previous feature system
            self.places = collections.OrderedDict()
            self.manners = collections.OrderedDict()
            self.height = collections.OrderedDict()
            self.backness = collections.OrderedDict()
            # Not used by this class; still reset as before in case
            # other code reads them
            self.cons_columns = collections.OrderedDict()
            self.cons_rows = collections.OrderedDict()
            self.vow_columns = collections.OrderedDict()
            self.vow_rows = collections.OrderedDict()
            self.voice_feature = None
            self.vowel_feature = None
            self.diph_feature = None
            self.rounded_feature = None
        else:
            for k in self._data.keys():
                try:
                    self._data[k].specify(specifier[k].features)
                except KeyError:
                    # Segment unknown to the specifier: no features
                    self._data[k].specify({})
            self.features = specifier.features
            self.possible_values = specifier.possible_values

            self.voice_feature = specifier.voice_feature
            self.vowel_feature = specifier.vowel_feature
            self.diph_feature = specifier.diph_feature
            self.rounded_feature = specifier.rounded_feature

            # Precalculate which segments fall under which label
            self.places = self._segments_by_label(specifier.places)
            self.manners = self._segments_by_label(specifier.manners)
            self.height = self._segments_by_label(specifier.height)
            self.backness = self._segments_by_label(specifier.backness)

    @staticmethod
    def _first_label(mapping, symbol):
        """Return the first label whose segment set holds ``symbol``, else None."""
        for label, symbols in mapping.items():
            if symbol in symbols:
                return label
        return None

    def categorize(self, seg):
        """
        Categorize a segment into consonant/vowel, place of articulation,
        manner of articulation, voicing, vowel height, vowel backness, and vowel
        rounding.

        For consonants, the category is of the format:

        ['Consonant', PLACE, MANNER, VOICING]

        For vowels, the category is of the format:

        ['Vowel', HEIGHT, BACKNESS, ROUNDED]

        Diphthongs are categorized differently:

        ['Diphthong', 'Vowel']

        Parameters
        ----------
        seg : Segment
            Segment to categorize

        Returns
        -------
        list or None
            Returns categories according to the formats above, if any are
            unable to be calculated, returns None in those places.
            Returns None for the word-boundary symbol '#'.
        """
        if seg == '#':
            return None
        if seg.feature_match(self.vowel_feature):
            category = ['Vowel']

            if self.diph_feature != [] and seg.feature_match(self.diph_feature):
                category.insert(0,'Diphthong')
                return category

            category.append(self._first_label(self.height, seg.symbol))
            category.append(self._first_label(self.backness, seg.symbol))

            if seg.feature_match(self.rounded_feature):
                category.append('Rounded')
            else:
                category.append('Unrounded')
        else:
            category = ['Consonant']

            category.append(self._first_label(self.places, seg.symbol))
            category.append(self._first_label(self.manners, seg.symbol))

            if seg.feature_match(self.voice_feature):
                category.append('Voiced')
            else:
                category.append('Voiceless')
        return category

[docs]class Corpus(object):
    """
    Lexicon to store information about Words, such as transcriptions,
    spellings and frequencies

    Parameters
    ----------
    name : string
        Name to identify Corpus

    Attributes
    ----------

    name : str
        Name of the corpus, used only for ease of reference

    attributes : list of Attributes
        List of Attributes that Words in the Corpus have

    wordlist : dict
        Dictionary where every key is a unique string representing a word in a
        corpus, and each entry is a Word object

    words : list of strings
        All the keys for the wordlist of the Corpus

    specifier : FeatureSpecifier
        See the FeatureSpecifier object

    inventory : Inventory
        Inventory that contains information about segments in the Corpus
    """

    #__slots__ = ['name', 'wordlist', 'specifier',
    #            'inventory', 'orthography', 'custom', 'feature_system',
    #            'has_frequency_value','has_spelling_value','has_transcription_value']
    basic_attributes = ['spelling','transcription','frequency']
    def __init__(self, name):
        """
        Create an empty corpus.

        Parameters
        ----------
        name : str
            Name to identify the Corpus
        """
        self.name = name
        # Words are stored under their (possibly numbered) spelling
        self.wordlist = {}
        # No feature system is attached to a new corpus
        self.specifier = None
        self.inventory = Inventory()
        # Flags describing which kinds of information the corpus holds
        self.has_frequency = True
        self.has_spelling = False
        self.has_wordtokens = False
        # Every corpus starts out with the three basic attributes
        starting_attributes = (('spelling', 'spelling'),
                               ('transcription', 'tier'),
                               ('frequency', 'numeric'))
        self._attributes = [Attribute(att_name, att_type)
                            for att_name, att_type in starting_attributes]

    @property
    def has_transcription(self):
        for a in self.attributes:
            if a.att_type == 'tier' and len(a.range) > 0:
                return True
        return False

    def __eq__(self, other):
        if not isinstance(other,Corpus):
            return False
        if self.wordlist != other.wordlist:
            return False
        return True

    def __iadd__(self, other):
        for a in other.attributes:
            if a not in self.attributes:
                self.add_attribute(a)
        for w in other:
            try:
                sw = self.find(w.spelling)
                sw.frequency += w.frequency
                for a in self.attributes:
                    if getattr(sw, a.name) == a.default_value and getattr(w, a.name) != a.default_value:
                        setattr(sw, a.name, getattr(w, a.name))
                sw.wordtokens += w.wordtokens
            except KeyError:
                self.add_word(w)
        if self.specifier is None and other.specifier is not None:
            self.set_feature_matrix(other.specifier)
        return self

    def key(self, word):
        key = word.spelling
        if self[key] == word:
            return key
        count = 0
        while True:
            count += 1
            key = '{} ({})'.format(word.spelling,count)
            try:
                if self[key] == word:
                    return key
            except KeyError:
                break


    def keys(self):
        for k in sorted(self.wordlist.keys()):
            yield k

[docs]    def subset(self, filters):
        """
        Generate a subset of the corpus based on filters.

        Filters for Numeric Attributes should be tuples of an Attribute
        (of the Corpus), a comparison callable (``__eq__``, ``__neq__``,
        ``__gt__``, ``__gte__``, ``__lt__``, or ``__lte__``) and a value
        to compare all such attributes in the Corpus to.

        Filters for Factor Attributes should be tuples of an Attribute,
        and a set of levels for inclusion in the subset.

        Other attribute types cannot currently be the basis for filters.

        Parameters
        ----------
        filters : list of tuples
            See above for format

        Returns
        -------
        Corpus
            Subset of the corpus that matches the filter conditions
        """

        new_corpus = Corpus('')
        new_corpus._attributes = [Attribute(x.name, x.att_type, x.display_name)
                    for x in self.attributes]
        for word in self:
            for f in filters:
                if f[0].att_type == 'numeric':
                    op = f[1]
                    if not op(getattr(word,f[0].name), f[2]):
                        break
                elif f[0].att_type == 'factor':
                    if getattr(word,f[0].name) not in f[1]:
                        break
            else:
                new_corpus.add_word(word)
        return new_corpus

    @property
    def attributes(self):
        """
        list of Attributes : the corpus's attribute list itself (not a copy)
        """
        return self._attributes

    @property
    def words(self):
        return sorted(list(self.wordlist.keys()))

[docs]    def features_to_segments(self, feature_description):
        """
        Given a feature description, return the segments in the inventory
        that match that feature description

        Feature descriptions should be either lists, such as
        ['+feature1', '-feature2'] or strings that can be separated into
        lists by ',', such as '+feature1,-feature2'.

        Parameters
        ----------
        feature_description : string or list
            Feature values that specify the segments, see above for format

        Returns
        -------
        list of Segments
            Segments that match the feature description

        """
        segments = list()
        if isinstance(feature_description,str):
            feature_description = feature_description.split(',')
        for k,v in self.inventory.items():
            if v.feature_match(feature_description):
                segments.append(k)
        return segments

[docs]    def segment_to_features(self, seg):
        """
        Given a segment, return the features for that segment.

        Parameters
        ----------
        seg : string or Segment
            Segment or Segment symbol to look up

        Returns
        -------
        dict
            Dictionary with keys as features and values as featue values
        """
        try:
            features = self.specifier.matrix[seg]
        except TypeError:
            features = self.specifier.matrix[seg.symbol]
        return features

[docs]    def add_abstract_tier(self, attribute, spec):
        """
        Add a abstract tier (currently primarily for generating CV skeletons
        from tiers).

        Specifiers for abstract tiers should be dictionaries with keys that
        are the abstract symbol (such as 'C' or 'V') and the values are
        iterables of segments that should count as that abstract symbols
        (such as all consonants or all vowels).

        Currently only operates on the ``transcription`` of words.

        Parameters
        ----------
        attribute : Attribute
            Attribute to add/replace

        spec : dict
            Mapping for creating abstract tier
        """
        for i,a in enumerate(self._attributes):
            if attribute.name == a.name:
                self._attributes[i] = attribute
                break
        else:
            self._attributes.append(attribute)
        for word in self:
            word.add_abstract_tier(attribute.name,spec)
            attribute.update_range(getattr(word,attribute.name))

[docs]    def add_attribute(self, attribute, initialize_defaults = False):
        """
        Add an Attribute of any type to the Corpus or replace an existing Attribute.

        Parameters
        ----------
        attribute : Attribute
            Attribute to add or replace

        initialize_defaults : boolean
            If True, words will have this attribute set to the ``default_value``
            of the attribute, defaults to False
        """
        for i,a in enumerate(self._attributes):
            if attribute.name == a.name:
                self._attributes[i] = attribute
                break
        else:
            self._attributes.append(attribute)
        if initialize_defaults:
            for word in self:
                word.add_attribute(attribute.name,attribute.default_value)

[docs]    def add_count_attribute(self, attribute, sequence_type, spec):
        """
        Add an Numeric Attribute that is a count of a segments in a tier that
        match a given specification.

        The specification should be either a list of segments or a string of
        the format '+feature1,-feature2' that specifies the set of segments.

        Parameters
        ----------
        attribute : Attribute
            Attribute to add or replace

        sequence_type : string
            Specifies whether to use 'spelling', 'transcription' or the name of a
            transcription tier to use for comparisons

        spec : list or str
            Specification of what segments should be counted
        """
        if isinstance(attribute,str):
            attribute = Attribute(attribute,'numeric')
        for i,a in enumerate(self._attributes):
            if attribute.name == a.name:
                self._attributes[i] = attribute
                break
        else:
            self._attributes.append(attribute)
        if isinstance(spec, str):
            tier_segs = self.features_to_segments(spec)
        else:
            tier_segs = spec
        for word in self:
            v = sum([1 for x in getattr(word, sequence_type) if x in tier_segs])
            setattr(word, attribute.name, v)
            attribute.update_range(v)

[docs]    def add_tier(self, attribute, spec):
        """
        Add a Tier Attribute based on the transcription of words as a new Attribute
        that includes all segments that match the specification.

        The specification should be either a list of segments or a string of
        the format '+feature1,-feature2' that specifies the set of segments.

        Parameters
        ----------
        attribute : Attribute
            Attribute to add or replace

        spec : list or str
            Specification of what segments should be counted
        """
        if isinstance(attribute,str):
            attribute = Attribute(attribute, 'tier')
        for i,a in enumerate(self._attributes):
            if attribute.name == a.name:
                self._attributes[i] = attribute
                break
        else:
            self._attributes.append(attribute)
        if isinstance(spec, str):
            tier_segs = self.features_to_segments(spec)
        else:
            tier_segs = spec
        attribute._range = tier_segs
        for word in self:
            word.add_tier(attribute.name,tier_segs)

[docs]    def remove_word(self, word_key):
        """
        Remove a Word from the Corpus using its identifier in the Corpus.

        If the identifier is not found, nothing happens.

        Parameters
        ----------
        word_key : string
            Identifier to use to remove the Word
        """
        try:
            del self.wordlist[word_key]
        except KeyError:
            pass

[docs]    def remove_attribute(self, attribute):
        """
        Remove an Attribute from the Corpus and from all its Word objects.

        Parameters
        ----------
        attribute : Attribute
            Attribute to remove
        """
        if isinstance(attribute,str):
            name = attribute
        else:
            name = attribute.name
        if name in self.basic_attributes:
            return
        for i in range(len(self._attributes)):
            if self._attributes[i].name == name:
                del self._attributes[i]
                break
        else:
            return
        for word in self:
            word.remove_attribute(name)

    def __getstate__(self):
        state = self.__dict__.copy()
        return state

    def __setstate__(self,state):
        """
        Restore a Corpus from pickled state, upgrading state saved in older
        layouts.

        Parameters
        ----------
        state : dict
            Instance dictionary as stored in the pickle; it is modified in
            place before being applied
        """
        try:
            # Older layouts kept the inventory under '_inventory'
            if 'inventory' not in state:
                state['inventory'] = state['_inventory']
            # Wrap anything that is not already an Inventory
            if not isinstance(state['inventory'], Inventory):
                state['inventory'] = Inventory(state['inventory'])
            # Older name of the spelling flag
            if 'has_spelling' not in state:
                state['has_spelling'] = state['has_spelling_value']
            # has_transcription is a property on this class, so a stored
            # value is dropped
            if 'has_transcription' in state:
                del state['has_transcription']
            if 'has_wordtokens' not in state:
                state['has_wordtokens'] = False
            # '_freq_base' is not set anywhere in this class; discard it
            if '_freq_base' in state:
                del state['_freq_base']
            # Without an attribute list, rebuild the three basic attributes
            # plus one tier Attribute per entry of the old '_tiers' value
            if '_attributes' not in state:
                state['_attributes'] = [Attribute('spelling','spelling'),
                                        Attribute('transcription','tier'),
                                        Attribute('frequency','numeric')]
                try:
                    tiers = state.pop('_tiers')
                    for t in tiers:
                        state['_attributes'].append(Attribute(t,'tier'))
                except KeyError:
                    pass
            self.__dict__.update(state)
            self._specify_features()
            # Backwards compatibility: re-link every word to this corpus,
            # convert tier values to Transcription and refresh the ranges
            # of the non-tier attributes
            for k,w in self.wordlist.items():
                w._corpus = self
                for a in self.attributes:
                    if a.att_type == 'tier':
                        if not isinstance(getattr(w,a.name), Transcription):
                            setattr(w,a.name,Transcription(getattr(w,a.name)))
                    else:
                        try:
                            a.update_range(getattr(w,a.name))
                        except AttributeError as e:
                            # Debug output identifying the offending word
                            print(k)
                            print(w.__dict__)
                            raise(e)
        except Exception as e:
            # NOTE(review): the original exception is re-raised here, so the
            # CorpusIntegrityError below is unreachable. Confirm which of
            # the two behaviours is intended before changing either line.
            raise(e)
            raise(CorpusIntegrityError("An error occurred while loading the corpus: {}.\nPlease redownload or recreate the corpus.".format(str(e))))

    def _specify_features(self):
        """
        Pass the corpus's current specifier to the inventory's ``specify``.
        """
        self.inventory.specify(self.specifier)

[docs]    def check_coverage(self):
        """
        Checks the coverage of the specifier (FeatureMatrix) of the Corpus over the
        inventory of the Corpus

        Returns
        -------
        list
            List of segments in the inventory that are not in the specifier
        """
        if not self.specifier is not None:
            return []
        return [x for x in self.inventory.keys() if x not in self.specifier]

[docs]    def iter_words(self):
        """
        Sorts the keys in the corpus dictionary,
        then yields the values in that order

        Returns
        -------
        generator
            Sorted Words in the corpus
        """
        sorted_list = sorted(self.wordlist.keys())
        for word in sorted_list:
            yield self.wordlist[word]

[docs]    def iter_sort(self):
        """
        Sorts the keys in the corpus dictionary, then yields the
        values in that order

        Returns
        -------
        generator
            Sorted Words in the corpus

        """
        sorted_list = sorted(self.wordlist.keys())
        for word in sorted_list:
            yield self.wordlist[word]

[docs]    def set_feature_matrix(self,matrix):
        """
        Set the feature system to be used by the corpus and make sure
        every word is using it too.

        Parameters
        ----------
        matrix : FeatureMatrix
            New feature system to use in the corpus
        """
        self.specifier = matrix
        self._specify_features()

[docs]    def get_random_subset(self, size, new_corpus_name='randomly_generated'):
        """Get a new corpus consisting a random selection from the current corpus

        Parameters
        ----------
        size : int
            Size of new corpus

        new_corpus_name : str

        Returns
        -------
        new_corpus : Corpus
            New corpus object with len(new_corpus) == size
        """
        new_corpus = Corpus(new_corpus_name)
        while len(new_corpus) < size:
            word = self.random_word()
            new_corpus.add_word(word, allow_duplicates=False)
        new_corpus.specifier = self.specifier
        return new_corpus

[docs]    def add_word(self, word, allow_duplicates=True):
        """Add a word to the Corpus.
        If allow_duplicates is True, then words with identical spelling can
        be added. They are kept sepearate by adding a "silent" number to them
        which is never displayed to the user. If allow_duplicates is False,
        then duplicates are simply ignored.

        Parameters
        ----------
        word : Word
            Word object to be added

        allow_duplicates : bool
            If False, duplicate Words with the same spelling as an existing
            word in the corpus will not be added

        """
        word._corpus = self
        #If the word doesn't exist, add it
        try:
            check = self.find(word.spelling, keyerror=True)
            if allow_duplicates:
                #Some words have more than one entry in a corpus, e.g. "live" and "live"
                #so they need to be assigned unique keys

                n = 0
                while True:
                    n += 1
                    #key = '{} ({})'.format(word.spelling.lower(),n)
                    key = '{} ({})'.format(word.spelling,n)
                    try:
                        check = self.find(key, keyerror=True)
                    except KeyError:
                    #if isinstance(check, EmptyWord):
                        self.wordlist[key] = word
                        break
            else:
                return
        except KeyError:
            self.wordlist[word.spelling] = word
            if word.spelling is not None:
                #self.orthography.update(word.spelling)
                if not self.has_spelling:
                    self.has_spelling = True

        if word.transcription is not None:
            self.update_inventory(word.transcription)
            word.transcription._list = [self.inventory[x].symbol for x in word.transcription._list]
        for d in word.descriptors:
            if d not in self.attributes:
                if isinstance(getattr(word,d),str):
                    self._attributes.append(Attribute(d,'factor'))
                elif isinstance(getattr(word,d),Transcription):
                    self._attributes.append(Attribute(d,'tier'))
                elif isinstance(getattr(word,d),(int, float)):
                    self._attributes.append(Attribute(d,'numeric'))
        for a in self.attributes:
            if not hasattr(word,a.name):
                word.add_attribute(a.name, a.default_value)
            a.update_range(getattr(word,a.name))

[docs]    def update_inventory(self, transcription):
        """
        Update the inventory of the Corpus to ensure it contains all
        the segments in the given transcription

        Parameters
        ----------
        transcription : list
            Segment symbols to add to the inventory if needed
        """
        for s in transcription:
            if isinstance(s, str):
                if s not in self.inventory:
                    self.inventory[s] = Segment(s)
        if transcription.stress_pattern:
            for k,v in transcription.stress_pattern.items():
                self.inventory.stresses[v].add(transcription[k])

[docs]    def get_or_create_word(self, **kwargs):
        """
        Get a Word object that has the spelling and transcription
        specified or create that Word, add it to the Corpus and return it.

        Parameters
        ----------
        spelling : string
            Spelling to search for

        transcription : list
            Transcription to search for

        Returns
        -------
        Word
            Existing or newly created Word with the spelling and transcription
            specified
        """
        try:
            spelling = kwargs['spelling']
            if isinstance(spelling,tuple):
                spelling = spelling[1]
        except KeyError:
            return None

        words = self.find_all(spelling)
        for w in words:
            for k,v in kwargs.items():
                if isinstance(v,tuple):
                    v = v[1]
                if isinstance(v,list):
                    v = Transcription(v)
                if getattr(w,k) != v:
                    break
            else:
                return w
        else:
            word = Word(**kwargs)
            self.add_word(word)
        return word

[docs]    def random_word(self):
        """Return a randomly selected Word

        Returns
        -------
        Word
            Random Word
        """
        word = random.choice(list(self.wordlist.keys()))
        return self.wordlist[word]

[docs]    def get_features(self):
        """
        Get a list of the features used to describe Segments

        Returns
        ----------
        list of str

        """
        return self.specifier.features

[docs]    def find(self, word, keyerror=True, ignore_case = False):
        """Search for a Word in the corpus
        If keyerror == True, then raise a KeyError if the word is not found
        If keyerror == False, then return an EmptyWord if the word is not found

        Parameters
        ----------
        word : str
            String representing the spelling of the word (not transcription)
        keyerror : bool
            Set whether a KeyError should be raised if a word is not found

        Returns
        -------
        Word
            Word that matches the spelling specified

        Raises
        ------
        KeyError
            If keyerror == True and word is not found
        """
        patterns = [word]
        if ignore_case:
            patterns.append(word.lower())
            patterns.append(word.title())
        for w in patterns:
            key = w
            try:
                result = self.wordlist[w]
                return result
            except KeyError:
                try:
                    key = '{} (1)'.format(w)
                    result = [self.wordlist[key]]
                    return result
                except KeyError:
                    pass

        raise KeyError('The word \"{}\" is not in the corpus'.format(word))

[docs]    def find_all(self, spelling):
        """
        Find all Word objects with the specified spelling

        Parameters
        ----------
        spelling : string
            Spelling to look up

        Returns
        -------
        list of Words
            Words that have the specified spelling
        """
        words = list()
        try:
            words.append(self.wordlist[spelling])
            count = 0
            while True:
                count += 1
                try:
                    words.append(self.wordlist['{} ({})'.format(spelling,count)])
                except KeyError:
                    break
        except KeyError:
            pass
        return words

    def __contains__(self,item):
        return self.wordlist.__contains__(item)

    def __len__(self):
        """
        Return the number of entries in the wordlist.
        """
        return len(self.wordlist)

    def __setitem__(self,item,value):
        """
        Store ``value`` in the wordlist under the key ``item``.

        This writes to the wordlist directly; none of the bookkeeping done
        by ``add_word`` is performed.
        """
        self.wordlist[item] = value

    def __getitem__(self,item):
        """
        Return the wordlist entry stored under the exact key ``item``.

        Unlike ``find``, no numbered duplicate key is tried.
        """
        return self.wordlist[item]

    def __iter__(self):
        """
        Iterate over the values of the wordlist, in the wordlist's own
        order (no sorting is applied).
        """
        return iter(self.wordlist.values())