Source code for corpustools.corpus.classes.spontaneous

from collections import OrderedDict
from .lexicon import Transcription, Corpus, Attribute, Word
import os
import wave
import random

class Speaker(object):
    """
    Speaker objects contain information about the producers of WordTokens
    or Discourses

    Parameters
    ----------
    name : string
        Name to identify the Speaker
    **kwargs
        Every additional keyword argument is stored on the Speaker as an
        attribute of the same name (this is how ``gender`` and ``age``
        can be supplied)

    Attributes
    ----------
    name : string
        Name of Speaker
    gender : string
        Gender of Speaker, None unless supplied as a keyword argument
    age : int or string
        Age of Speaker, None unless supplied as a keyword argument
    """

    def __init__(self, name, **kwargs):
        self.name = name
        self.gender = None
        self.age = None
        # Keyword arguments override the defaults above and may also add
        # arbitrary extra attributes.
        for k, v in kwargs.items():
            setattr(self, k, v)

    def __repr__(self):
        # The name is wrapped in a balanced pair of single quotes; the
        # previous literal contained a misplaced backslash that produced
        # "...'name\>" and relied on an invalid escape sequence.
        return "<Speaker object with name '{}'>".format(self.name)

    def __str__(self):
        return str(self.name)

    def __hash__(self):
        # Hash on the name only, so it stays consistent with __eq__.
        return hash(self.name)

    def __eq__(self, other):
        # A Speaker compares equal to another Speaker with the same name,
        # and also to a bare name value.
        if isinstance(other, Speaker):
            return self.name == other.name
        return self.name == other

    def __ne__(self, other):
        return not self.__eq__(other)

    # Ordering comparisons are by name and expect ``other`` to expose a
    # ``name`` attribute.
    def __lt__(self, other):
        return self.name < other.name

    def __gt__(self, other):
        return self.name > other.name

    def __le__(self, other):
        return self.name <= other.name

    def __ge__(self, other):
        return self.name >= other.name
class SpontaneousSpeechCorpus(object):
    """
    A SpontaneousSpeechCorpus groups Discourse objects together with a
    Corpus object that holds frequency information.

    Parameters
    ----------
    name : str
        Name to identify the SpontaneousSpeechCorpus
    directory : str
        Directory associated with the SpontaneousSpeechCorpus

    Attributes
    ----------
    lexicon : Corpus
        Corpus object with token frequencies from its Discourses
    discourses : dict
        Discourses of the SpontaneousSpeechCorpus indexed by the names
        of the Discourses
    """

    def __init__(self, name, directory):
        self.name = name
        self.directory = directory
        lexicon = Corpus(name + ' lexicon')
        lexicon.has_wordtokens = True
        self.lexicon = lexicon
        # Ordered so that iteration follows the order discourses were added.
        self.discourses = OrderedDict()

    def __iter__(self):
        # Iterating the corpus yields its Discourse objects, not their names.
        yield from self.discourses.values()

    def __getstate__(self):
        return vars(self)

    def __setstate__(self, state):
        vars(self).update(state)

    def add_discourse(self, discourse):
        """
        Add a discourse to the SpontaneousSpeechCorpus

        The discourse is stored under ``str(discourse)`` and its lexicon
        is merged into this corpus' lexicon with ``+=``.

        Parameters
        ----------
        discourse : Discourse
            Discourse to be added
        """
        key = str(discourse)
        self.discourses[key] = discourse
        self.lexicon += discourse.lexicon
[docs]class Discourse(object): """ Discourse objects are collections of linear text with word tokens Parameters ---------- name : str Identifier for the Discourse speaker : Speaker Speaker producing the tokens/text (defaults to an empty Speaker) Attributes ---------- attributes : list of Attributes The Discourse object tracks all of the attributes used by its WordToken objects words : dict of WordTokens The keys are the beginning times of the WordTokens (or their place in a text if it's not a speech discourse) and the values are the WordTokens """ def __init__(self, kwargs): self.name = kwargs['name'] self.speaker = Speaker(None) self.wav_path = kwargs['wav_path'] self._attributes = [kwargs['spelling_name'], kwargs['transcription_name'], Attribute('begin','numeric','Begin'), Attribute('end','numeric', 'End')] try: for att in kwargs['other_attributes']: self._attributes.append(att) except KeyError: pass self.words = dict() self.lexicon = Corpus(self.name + ' lexicon') self.lexicon.has_wordtokens = True self.lexicon.discourse = self def random_word(self): word = random.choice(list(self.words.keys())) return self.words[word] @property def attributes(self): #return [a for a in self._attributes if not a.name in ('spelling', 'transcription')] att_list = list() for a in self._attributes: if not a.display_name in [at.display_name for at in att_list]: att_list.append(a) #att_list.sort() return att_list #return self._attributes
[docs] def keys(self): """ Returns a sorted list of keys for looking up WordTokens Returns ------- list List of begin times or indices of WordTokens in the Discourse """ return sorted(self.words.keys())
def __len__(self): return len(self.words.keys()) def __eq__(self, other): if not isinstance(other,Discourse): return False if self.name != other.name: return False if self.speaker != other.speaker: return False return True def __ne__(self, other): return not self.__eq__(other) def __lt__(self, other): return self.name < other.name def __gt__(self, other): return self.name > other.name def __le__(self, other): return self.name <= other.name def __ge__(self, other): return self.name >= other.name def __str__(self): return self.name
[docs] def add_word(self, wordtoken): """ Adds a WordToken to the Discourse Parameters ---------- wordtoken : WordToken WordToken to be added """ wordtoken.discourse = self self.words[wordtoken.begins] = wordtoken for a in self.attributes: if not hasattr(wordtoken,a.name): wordtoken.add_attribute(a.name, a.default_value) a.update_range(getattr(wordtoken,a.name))
[docs] def add_attribute(self, attribute, initialize_defaults = False): """ Add an Attribute of any type to the Discourse or replace an existing Attribute. Parameters ---------- attribute : Attribute Attribute to add or replace initialize_defaults : bool If True, word tokens will have this attribute set to the ``default_value`` of the attribute, defaults to False """ for i,a in enumerate(self._attributes): if attribute.display_name == a.display_name: self._attributes[i] = attribute break else: self._attributes.append(attribute) if initialize_defaults: for word in self: word.add_attribute(attribute.name,attribute.default_value)
def __getitem__(self, key): if isinstance(key, float) or isinstance(key, int): #Find the word token at a given time keys = filter(lambda x: x >= key,self.words.keys()) t = min(keys,key = lambda x: x - key) return self.words[t] raise(TypeError) @property def has_audio(self): """ Checks whether the Discourse is associated with a .wav file Returns ------- bool True if a .wav file is associated and if that file exists, False otherwise """ if self.wav_path is not None and os.path.exists(self.wav_path): return True return False def __getstate__(self): return self.__dict__ def __setstate__(self,state): if 'wav_path' not in state: state['wav_path'] = None self.__dict__.update(state) if hasattr(self,'lexicon'): self.lexicon.has_wordtokens = True # for wt in self: # self.lexicon[str(wt.wordtype)].wordtokens.append(wt) # wt.wordtype.wordtokens.append(wt) def __iter__(self): for k in sorted(self.words.keys()): yield self.words[k] def _extract_tokens(self, tokens, output_dir): if not self.has_audio(): return filenames = [] with wave.open(self.wav_path,'r') as w_in: sr = w_in.getframerate() bitdepth = w_in.getsampwidth() for t in tokens: wt = self[t] name = '{}_{}.wav'.format(self.name,wt.begin) wt.wav_path = os.path.join(output_dir,name) filenames.append(wt.wav_path) if os.path.exists(wt.wav_path): continue begpos = int(wt.begin * sr) endpos = int(wt.end * sr) duration = endpos - begpos w_in.setpos(begpos) data = w_in.readframes(duration) with wave.open(wt.wav_path,'w') as w_out: w_out.setnchannels(1) w_out.setframerate(sr) w_out.setsampwidth(bitdepth) w_out.writeframes(data) return filenames
[docs] def create_lexicon(self): """ Create a Corpus object from the Discourse Returns ------- Corpus Corpus with spelling and transcription from previous Corpus and token frequency from the Discourse """ corpus = Corpus(self.name + ' lexicon') corpus.has_wordtokens = True for token in self: word = corpus.get_or_create_word(spelling=token.wordtype.spelling, transcription=token.wordtype.transcription) word.frequency += 1 token.wordtype = word word.wordtokens.append(token) return corpus
[docs] def find_wordtype(self, wordtype): """ Look up all WordTokens that are instances of a Word Parameters ---------- wordtype : Word Word to look up Returns ------- list of WordTokens List of the given Word's WordTokens in this Discourse """ return list(x for x in self if x.wordtype == wordtype)
class WordToken():
    """
    WordToken objects are individual productions of Words

    Parameters
    ----------
    update : WordToken or False
        If given, the new WordToken is built by copying this token (see
        ``update``) and all other arguments are ignored
    word : Word
        Word that the WordToken is associated with
    begin : float or int
        Beginning of the WordToken (either in seconds of time or in
        position from the beginning of the Discourse); required unless
        ``update`` is given
    end : float or int
        End of the WordToken (either in seconds of time or in position
        from the beginning of the Discourse); required unless ``update``
        is given
    **kwargs
        Any further keyword is stored as an attribute.  Names that are
        not entirely upper case are capitalized first.  A value may be a
        plain value or an ``(Attribute, value)`` tuple; lists and 'tier'
        values are converted to Transcription, and strings that do not
        parse as a number are taken to be the spelling.

    Attributes
    ----------
    transcription : Transcription
        The WordToken's transcription, or the word type's transcription
        if the WordToken's transcription is None
    spelling : str
        The WordToken's spelling, or the word type's spelling if the
        WordToken's spelling is None
    discourse : Discourse
        Parent Discourse object, None until assigned
    speaker : Speaker
        The Speaker that produced the token, None until assigned
    duration : float
        The duration of the WordToken
    """

    # Attributes that ``update`` always sets on a copy, with the value
    # used when the token being copied lacks them.
    wordtoken_attributes = {'wordtype': None, 'discourse': None,
                            'speaker': None, 'wavpath': None,
                            'begin': None, 'end': None,
                            '_spelling': None, '_transcription': None,
                            '_freq_names': None}

    def __init__(self, update=False, **kwargs):
        if update:
            self.update(update)
            return

        self.wordtype = kwargs.pop('word', None)
        self.discourse = None
        self.speaker = None
        self.wavpath = None
        self.begin = kwargs.pop('begin')
        self.end = kwargs.pop('end')
        self.begins = self.begin
        self.ends = self.end
        self._spelling = None
        self._transcription = None
        self._freq_names = ['abs_freq', 'freq_per_mil', 'sfreq',
                            'lowercase_freq', 'log10_freq']

        for raw_name, raw_value in kwargs.items():
            # Names are capitalized unless every character is upper case.
            if all(ch.isupper() for ch in raw_name):
                attr_name = raw_name
            else:
                attr_name = raw_name.capitalize()

            if isinstance(raw_value, tuple):
                # (Attribute, value) pair: the attribute type decides
                # how the value is converted.
                att, converted = raw_value
                if att.att_type == 'numeric':
                    try:
                        converted = float(converted)
                    except (ValueError, TypeError):
                        converted = float('nan')
                    if attr_name == 'Frequency':
                        setattr(self, '_frequency', converted)
                elif att.att_type == 'tier':
                    converted = Transcription(converted)
                    self._transcription = converted
                setattr(self, attr_name, converted)
            elif isinstance(raw_value, list):
                # probably a transcription
                converted = Transcription(raw_value)
                setattr(self, attr_name, converted)
                self._transcription = converted
            elif isinstance(raw_value, str):
                converted = raw_value
                try:
                    converted = float(raw_value)
                except ValueError:
                    # not a number, so it's spelling; keep the string
                    self._spelling = raw_value
                setattr(self, attr_name, converted)
            else:
                setattr(self, attr_name, raw_value)

        if not self._transcription:
            # Fall back to the first Transcription stored on the token,
            # then to the word type's transcription.
            stored = (v for v in vars(self).values()
                      if isinstance(v, Transcription))
            first = next(stored, None)
            if first is not None:
                self._transcription = first
            else:
                try:
                    self._transcription = self.wordtype._transcription
                except AttributeError:
                    self._transcription = None

        if not self._spelling:
            # Fall back to the word type's spelling, then to the text of
            # the transcription.
            try:
                self._spelling = self.wordtype._spelling
            except AttributeError:
                if self._transcription is None:
                    self._spelling = None
                else:
                    self._spelling = str(self._transcription)

    def __copy__(self):
        return WordToken(update=self)

    def update(self, old_token):
        """
        Copy the state of another WordToken onto this one

        Parameters
        ----------
        old_token : WordToken
            Token to copy from
        """
        # First take over anything this token does not have yet ...
        for attribute, value in vars(old_token).items():
            if not hasattr(self, attribute):
                setattr(self, attribute, value)
        # ... then force the core attributes to the old token's values,
        # or to their defaults where the old token lacks them.
        for attribute, default_value in WordToken.wordtoken_attributes.items():
            setattr(self, attribute,
                    getattr(old_token, attribute, default_value))

    def __getstate__(self):
        # The state is a copy with 'wavpath' reset to None.
        state = dict(self.__dict__)
        state['wavpath'] = None
        return state

    def __setstate__(self, state):
        self.__dict__.update(state)

    def __eq__(self, other):
        if not isinstance(other, WordToken):
            return False
        compared = ('wordtype', 'begin', 'end', 'discourse', 'speaker')
        return not any(getattr(self, field) != getattr(other, field)
                       for field in compared)

    def __str__(self):
        return str(self.wordtype)

    def __repr__(self):
        return '<WordToken: {}, {}, {}-{}>'.format(str(self.wordtype),
                                                  str(self.transcription),
                                                  self.begin, self.end)

    def add_attribute(self, tier_name, default_value):
        """Set the attribute ``tier_name`` on the token to ``default_value``."""
        setattr(self, tier_name, default_value)

    # previous_token / following_token lookups through the parent
    # Discourse are currently disabled:
    #@property
    #def previous_token(self):
    #    if self.discourse is not None and self.previous_token_time is not None:
    #        return self.discourse[self.previous_token_time]
    #    return None

    #@property
    #def following_token(self):
    #    if self.discourse is not None and self.following_token_time is not None:
    #        return self.discourse[self.following_token_time]
    #    return None

    @property
    def duration(self):
        return self.end - self.begin

    @property
    def spelling(self):
        # Token-specific spelling wins over the word type's spelling.
        own = self._spelling
        if own is not None:
            return own
        if self.wordtype is None:
            return None
        return self.wordtype.spelling

    @spelling.setter
    def spelling(self, value):
        self._spelling = value

    @spelling.deleter
    def spelling(self):
        del self._spelling

    @property
    def transcription(self):
        # Token-specific transcription wins over the word type's one.
        own = self._transcription
        if own is not None:
            return own
        if self.wordtype is None:
            return None
        return self.wordtype.transcription

    @transcription.setter
    def transcription(self, value):
        self._transcription = value

    @transcription.deleter
    def transcription(self):
        del self._transcription