Source code for corpustools.corpus.classes.spontaneous


from collections import OrderedDict

from .lexicon import Transcription, Corpus, Attribute

import os
import wave
import math

class Speaker(object):
    """
    Speaker objects contain information about the producers of WordTokens
    or Discourses

    Parameters
    ----------
    name : string
        Name to identify the Speaker

    Attributes
    ----------
    name : string
        Name of Speaker

    gender : string
        Gender of Speaker

    age : int or string
        Age of Speaker
    """

    def __init__(self, name, **kwargs):
        self.name = name
        self.gender = None
        self.age = None

        # Any extra keyword becomes an attribute; this may override the
        # defaults set above (including ``name``).
        for k, v in kwargs.items():
            setattr(self, k, v)

    def __repr__(self):
        # The name is wrapped in a matching pair of single quotes.
        return '<Speaker object with name \'{}\'>'.format(self.name)

    def __str__(self):
        return str(self.name)

    def __hash__(self):
        # Hash on the name so that it stays consistent with __eq__.
        return hash(self.name)

    def __eq__(self, other):
        # A Speaker compares equal to another Speaker with the same name,
        # and also to a bare value equal to its name.
        if isinstance(other, Speaker):
            return self.name == other.name
        return self.name == other

    def __ne__(self, other):
        return not self.__eq__(other)

    # Ordering is by name; ``other`` must expose a ``name`` attribute.
    def __lt__(self, other):
        return self.name < other.name

    def __gt__(self, other):
        return self.name > other.name

    def __le__(self, other):
        return self.name <= other.name

    def __ge__(self, other):
        return self.name >= other.name
class SpontaneousSpeechCorpus(object):
    """
    SpontaneousSpeechCorpus objects are a collection of Discourse objects
    together with a Corpus object that holds frequency information.

    Parameters
    ----------
    name : str
        Name to identify the SpontaneousSpeechCorpus

    directory : str
        Directory associated with the SpontaneousSpeechCorpus

    Attributes
    ----------
    lexicon : Corpus
        Corpus object with token frequencies from its Discourses

    discourses : dict
        Discourses of the SpontaneousSpeechCorpus indexed by the names of
        the Discourses
    """

    def __init__(self, name, directory):
        self.name = name
        self.directory = directory

        lexicon = Corpus(name + ' lexicon')
        lexicon.has_wordtokens = True
        self.lexicon = lexicon

        # Insertion-ordered so iteration follows the order of addition.
        self.discourses = OrderedDict()

    def __iter__(self):
        # Iterating the corpus yields its Discourse objects, not their keys.
        yield from self.discourses.values()

    def __setstate__(self, state):
        # Restore the saved attributes, then re-assert the flag on the
        # restored lexicon.
        self.__dict__.update(state)
        self.lexicon.has_wordtokens = True

    def add_discourse(self, discourse):
        """
        Add a discourse to the SpontaneousSpeechCorpus

        An existing entry with the same ``str(discourse)`` key is replaced.

        Parameters
        ----------
        discourse : Discourse
            Discourse to be added
        """
        key = str(discourse)
        self.discourses[key] = discourse
        #self.lexicon += discourse.lexicon
class Discourse(object):
    """
    Discourse objects are collections of linear text with word tokens

    Parameters
    ----------
    name : str
        Identifier for the Discourse

    speaker : Speaker
        Speaker producing the tokens/text (defaults to an empty Speaker)

    Attributes
    ----------
    attributes : list of Attributes
        The Discourse object tracks all of the attributes used by its
        WordToken objects

    words : dict of WordTokens
        The keys are the beginning times of the WordTokens (or their
        place in a text if it's not a speech discourse) and the values
        are the WordTokens
    """

    def __init__(self, **kwargs):
        self.name = ''
        self.speaker = Speaker(None)
        self.wav_path = None

        # Keyword arguments override the defaults above.
        for k, v in kwargs.items():
            setattr(self, k, v)

        self._attributes = [Attribute('spelling', 'spelling', 'Spelling'),
                            Attribute('transcription', 'tier', 'Transcription'),
                            Attribute('begin', 'numeric', 'Begin'),
                            Attribute('end', 'numeric', 'End')]
        self.words = dict()
        self.lexicon = Corpus(self.name + ' lexicon')
        self.lexicon.has_wordtokens = True

    @property
    def attributes(self):
        return self._attributes

    def keys(self):
        """
        Returns a sorted list of keys for looking up WordTokens

        Returns
        -------
        list
            List of begin times or indices of WordTokens in the Discourse
        """
        return sorted(self.words.keys())

    def __len__(self):
        return len(self.words)

    def __eq__(self, other):
        # Two Discourses are equal when both name and speaker match.
        if not isinstance(other, Discourse):
            return False
        return self.name == other.name and self.speaker == other.speaker

    def __ne__(self, other):
        return not self.__eq__(other)

    # Ordering is by name; ``other`` must expose a ``name`` attribute.
    def __lt__(self, other):
        return self.name < other.name

    def __gt__(self, other):
        return self.name > other.name

    def __le__(self, other):
        return self.name <= other.name

    def __ge__(self, other):
        return self.name >= other.name

    def __str__(self):
        return self.name

    def add_word(self, wordtoken):
        """
        Adds a WordToken to the Discourse

        The token is stored under its ``begin`` value, so a token with the
        same ``begin`` as an existing one replaces it.

        Parameters
        ----------
        wordtoken : WordToken
            WordToken to be added
        """
        wordtoken.discourse = self
        self.words[wordtoken.begin] = wordtoken
        for a in self.attributes:
            # Fill in any attribute the token lacks with the default value.
            if not hasattr(wordtoken, a.name):
                wordtoken.add_attribute(a.name, a.default_value)
            a.update_range(getattr(wordtoken, a.name))

    def add_attribute(self, attribute, initialize_defaults=False):
        """
        Add an Attribute of any type to the Discourse or replace an
        existing Attribute.

        Parameters
        ----------
        attribute : Attribute
            Attribute to add or replace

        initialize_defaults : bool
            If True, word tokens will have this attribute set to the
            ``default_value`` of the attribute, defaults to False
        """
        for i, a in enumerate(self._attributes):
            if attribute.name == a.name:
                self._attributes[i] = attribute
                break
        else:
            # No attribute with that name yet, so add it at the end.
            self._attributes.append(attribute)

        if initialize_defaults:
            for word in self:
                word.add_attribute(attribute.name, attribute.default_value)

    def __getitem__(self, key):
        """
        Return the WordToken whose key is the smallest one that is greater
        than or equal to ``key``.

        Raises
        ------
        TypeError
            If ``key`` is not an int or float
        ValueError
            If no WordToken key is greater than or equal to ``key``
        """
        if isinstance(key, (int, float)):
            # Find the word token at a given time
            candidates = [t for t in self.words if t >= key]
            nearest = min(candidates, key=lambda t: t - key)
            return self.words[nearest]
        raise TypeError

    @property
    def has_audio(self):
        """
        Checks whether the Discourse is associated with a .wav file

        Returns
        -------
        bool
            True if a .wav file is associated and if that file exists,
            False otherwise
        """
        return self.wav_path is not None and os.path.exists(self.wav_path)

    def __setstate__(self, state):
        # States saved without a ``wav_path`` entry get one set to None.
        if 'wav_path' not in state:
            state['wav_path'] = None
        self.__dict__.update(state)
        if hasattr(self, 'lexicon'):
            self.lexicon.has_wordtokens = True
        # Re-register each token with its word type.
        for wt in self:
            wt.wordtype.wordtokens.append(wt)

    def __iter__(self):
        # Yield WordTokens in ascending key order.
        for k in sorted(self.words.keys()):
            yield self.words[k]

    def _extract_tokens(self, tokens, output_dir):
        """
        Write the audio of the given tokens to individual .wav files.

        Parameters
        ----------
        tokens : iterable of float or int
            Keys used to look up the WordTokens in this Discourse

        output_dir : str
            Directory the per-token .wav files are written to

        Returns
        -------
        list of str or None
            Paths of the per-token files (existing files are reused and
            still listed), or None if the Discourse has no audio
        """
        # ``has_audio`` is a property, so it must not be called.
        if not self.has_audio:
            return None

        filenames = []
        with wave.open(self.wav_path, 'r') as w_in:
            sr = w_in.getframerate()
            bitdepth = w_in.getsampwidth()
            # Frames read from the source hold every channel, so the output
            # must declare the same channel count to stay decodable.
            nchannels = w_in.getnchannels()
            for t in tokens:
                wt = self[t]
                name = '{}_{}.wav'.format(self.name, wt.begin)
                wt.wav_path = os.path.join(output_dir, name)
                filenames.append(wt.wav_path)
                if os.path.exists(wt.wav_path):
                    continue

                # Convert begin/end (seconds) to frame positions.
                begpos = int(wt.begin * sr)
                endpos = int(wt.end * sr)
                w_in.setpos(begpos)
                data = w_in.readframes(endpos - begpos)
                with wave.open(wt.wav_path, 'w') as w_out:
                    w_out.setnchannels(nchannels)
                    w_out.setframerate(sr)
                    w_out.setsampwidth(bitdepth)
                    w_out.writeframes(data)
        return filenames

    def create_lexicon(self):
        """
        Create a Corpus object from the Discourse

        Each token's ``wordtype`` is re-pointed at the word in the new
        Corpus.

        Returns
        -------
        Corpus
            Corpus with spelling and transcription from previous Corpus
            and token frequency from the Discourse
        """
        corpus = Corpus(self.name + ' lexicon')
        corpus.has_wordtokens = True
        for token in self:
            word = corpus.get_or_create_word(token.wordtype.spelling,
                                             token.wordtype.transcription)
            word.frequency += 1
            token.wordtype = word
            word.wordtokens.append(token)
        return corpus

    def find_wordtype(self, wordtype):
        """
        Look up all WordTokens that are instances of a Word

        Parameters
        ----------
        wordtype : Word
            Word to look up

        Returns
        -------
        list of WordTokens
            List of the given Word's WordTokens in this Discourse
        """
        return [x for x in self if x.wordtype == wordtype]

    def _calc_frequency(self, query):
        """
        Count occurrences of a word type or of a sequence of word types.

        Parameters
        ----------
        query : Word or tuple of Word
            A single word type, or a tuple of word types that must occur
            as consecutive tokens

        Returns
        -------
        int
            Number of matching tokens (or token sequences) in the Discourse
        """
        if not isinstance(query, tuple):
            # A single word type: count its tokens directly.
            return len(self.find_wordtype(query))

        count = 0
        for start in self.find_wordtype(query[0]):
            # Walk forward from each token of the first word type and check
            # that the following tokens match the rest of the query.
            cur = start
            for expected in query[1:]:
                cur = getattr(cur, 'following_token', None)
                if cur is None or cur.wordtype != expected:
                    break
            else:
                count += 1
        return count
class WordToken(object):
    """
    WordToken objects are individual productions of Words

    Parameters
    ----------
    word : Word
        Word that the WordToken is associated with

    transcription : iterable of str
        Transcription for the WordToken (can be different than the
        transcription of the Word type). Defaults to None if not
        specified

    spelling : str
        Spelling for the WordToken (can be different than the spelling
        of the Word type). Defaults to None if not specified

    begin : float or int
        Beginning of the WordToken (can be specified as either in
        seconds of time or in position from the beginning of the
        Discourse)

    end : float or int
        End of the WordToken (can be specified as either in seconds of
        time or in position from the beginning of the Discourse)

    previous_token : WordToken
        The preceding WordToken in the Discourse, defaults to None if
        not specified

    following_token : WordToken
        The following WordToken in the Discourse, defaults to None if
        not specified

    discourse : Discourse
        Parent Discourse object that the WordToken belongs to

    speaker : Speaker
        The Speaker that produced the token

    Attributes
    ----------
    transcription : Transcription
        The WordToken's transcription, or the word type's
        transcription if the WordToken's transcription is None

    spelling : str
        The WordToken's spelling, or the word type's
        spelling if the WordToken's spelling is None

    previous_token : WordToken
        The previous WordToken in the Discourse

    following_token : WordToken
        The following WordToken in the Discourse

    duration : float
        The duration of the WordToken
    """

    def __init__(self, **kwargs):
        self.wordtype = kwargs.pop('word', None)
        self.discourse = None
        self.speaker = None
        self.wavpath = None
        self._spelling = None
        self._transcription = None
        # Documented as defaulting to None when not specified; keyword
        # arguments below override these.
        self.previous_token = None
        self.following_token = None

        for key, value in kwargs.items():
            # Token-level spelling/transcription live in private slots so
            # the properties can fall back to the word type.
            if key == 'transcription':
                key = '_transcription'
            elif key == 'spelling':
                key = '_spelling'

            if isinstance(value, tuple):
                # An (Attribute, raw value) pair: convert by attribute type.
                att, value = value
                if att.att_type == 'numeric':
                    try:
                        value = float(value)
                    except (ValueError, TypeError):
                        value = float('nan')
                elif att.att_type == 'tier':
                    value = Transcription(value)
            else:
                key = key.lower()
                if isinstance(value, list):
                    #assume transcription type stuff
                    value = Transcription(value)
                elif key != '_spelling':
                    # Store as a float when the value is a finite number;
                    # otherwise keep it unchanged.
                    try:
                        f = float(value)
                        if not math.isnan(f) and not math.isinf(f):
                            value = f
                    except (ValueError, TypeError):
                        pass
            setattr(self, key, value)

    def __getstate__(self):
        state = self.__dict__.copy()
        # The saved state never carries a ``wavpath`` value.
        # NOTE(review): Discourse._extract_tokens stores the extracted
        # file path under ``wav_path`` (with underscore), which is not
        # cleared here -- confirm which spelling is intended.
        state['wavpath'] = None
        return state

    def __eq__(self, other):
        if not isinstance(other, WordToken):
            return False
        if self.wordtype != other.wordtype:
            return False
        if self.begin != other.begin:
            return False
        if self.end != other.end:
            return False
        if self.discourse != other.discourse:
            return False
        if self.speaker != other.speaker:
            return False
        return True

    def __str__(self):
        return str(self.wordtype)

    def __repr__(self):
        return '<WordToken: {}, {}, {}-{}>'.format(str(self.wordtype),
                                                   str(self.transcription),
                                                   self.begin, self.end)

    def add_attribute(self, tier_name, default_value):
        setattr(self, tier_name, default_value)

    #@property
    #def previous_token(self):
    #    if self.discourse is not None and self.previous_token_time is not None:
    #        return self.discourse[self.previous_token_time]
    #    return None

    #@property
    #def following_token(self):
    #    if self.discourse is not None and self.following_token_time is not None:
    #        return self.discourse[self.following_token_time]
    #    return None

    @property
    def duration(self):
        return self.end - self.begin

    @property
    def spelling(self):
        # Token-specific spelling wins; otherwise use the word type's.
        if self._spelling is not None:
            return self._spelling
        if self.wordtype is not None:
            return self.wordtype.spelling
        return None

    @property
    def transcription(self):
        # Token-specific transcription wins; otherwise use the word type's.
        if self._transcription is not None:
            return self._transcription
        if self.wordtype is not None:
            return self.wordtype.transcription
        return None

    #@property
    #def previous_conditional_probability(self):
    #    if self.previous_token is not None:
    #        return self.discourse.calc_frequency(
    #                        (self.previous_token.wordtype,self.wordtype)
    #                        ) / self.discourse.calc_frequency(self.previous_token.wordtype)
    #    return None