# Source code for corpustools.corpus.io.pct_textgrid

import os
import codecs
from corpustools.corpus.io.textgrid11_pct import TextGrid, IntervalTier, readFile, Interval, Point, PointTier, _getMark
#from corpustools.corpus.io.textgrid import Interval, Point, PointTier , _getMark

from corpustools.corpus.classes import SpontaneousSpeechCorpus, Attribute, Corpus
from corpustools.corpus.classes.spontaneous import Discourse
from corpustools.exceptions import PCTError, TextGridIOError

from corpustools.corpus.io.binary import load_binary
import corpustools.gui.modernize as modernize

from .helper import (parse_transcription, DiscourseData,
                    AnnotationType, data_to_discourse2, find_wav_path,
                    Annotation,)

class PCTTextGrid(TextGrid):
    """TextGrid subclass whose reader normalizes tier names to PCT's
    conventions (initial capital, sanitized for use as Attribute names)
    while parsing a Praat long-format TextGrid file line by line."""

    def __init__(self):
        super().__init__()

    def name_filter(self, name):
        """
        Capitalize the initial letter to match the specifications in PCT.

        Names that are entirely uppercase (e.g. acronyms) are returned
        unchanged; note an empty string also passes through unchanged,
        since ``all([])`` is True.
        """
        return name.capitalize() if not all([x.isupper() for x in name]) else name

    def read(self, f, round_digits=15):
        """
        Read the tiers contained in the Praat-formatted TextGrid file
        indicated by string f.

        Parameters
        ----------
        f : str
            Path to the TextGrid file (passed to ``readFile``)
        round_digits : int
            Number of decimal digits interval/point boundary times are
            rounded to

        The parser assumes the long ("verbose") TextGrid text format and
        reads values positionally: the third whitespace token of each
        ``xmin = ...`` style line.
        """
        source = readFile(f)
        # Grid-level time bounds; NOTE(review): these are rounded to 5
        # digits regardless of round_digits — confirm that asymmetry is
        # intentional.
        self.minTime = round(float(source.readline().split()[2]), 5)
        self.maxTime = round(float(source.readline().split()[2]), 5)
        source.readline() # more header junk
        m = int(source.readline().rstrip().split()[2]) # will be self.n
        
        source.readline()

        for i in range(m): # loop over grids
            source.readline()
            if source.readline().rstrip().split()[2] == '"IntervalTier"':
                # Tier name is the quoted text after ' = '; normalize it
                # for PCT (capitalize + sanitize as an Attribute name).
                inam = source.readline().rstrip().split(' = ')[1].strip('"')
                inam = self.name_filter(inam)
                inam = Attribute.sanitize_name(inam)
                imin = round(float(source.readline().rstrip().split()[2]),
                             round_digits)
                imax = round(float(source.readline().rstrip().split()[2]),
                             round_digits)
                itie = IntervalTier(inam)
                # Interval count is the fourth token of the "intervals:
                # size = N" line.
                for j in range(int(source.readline().rstrip().split()[3])):
                    source.readline().rstrip().split() # header junk
                    jmin = round(float(source.readline().rstrip().split()[2]),
                                 round_digits)
                    jmax = round(float(source.readline().rstrip().split()[2]),
                                 round_digits)
                    jmrk = _getMark(source)
                    if jmin < jmax: # non-null
                        itie.addInterval(Interval(jmin, jmax, jmrk))
                self.append(itie)
            else: # pointTier
                # NOTE(review): point-tier names are NOT passed through
                # Attribute.sanitize_name, unlike interval tiers —
                # confirm whether that is deliberate.
                inam = source.readline().rstrip().split(' = ')[1].strip('"')
                inam = self.name_filter(inam)
                imin = round(float(source.readline().rstrip().split()[2]),
                             round_digits)
                imax = round(float(source.readline().rstrip().split()[2]),
                             round_digits)
                itie = PointTier(inam)
                n = int(source.readline().rstrip().split()[3])
                for j in range(n):
                    source.readline().rstrip() # header junk
                    jtim = round(float(source.readline().rstrip().split()[2]),
                                 round_digits)
                    jmrk = _getMark(source)
                    itie.addPoint(Point(jtim, jmrk))
                self.append(itie)
        source.close()

def uniqueLabels(tier):
    """Return the set of distinct interval labels (marks) on *tier*."""
    marks = set()
    for interval in tier.intervals:
        marks.add(interval.mark)
    return marks

def averageLabelLen(tier):
    """
    Return the mean character length of the distinct labels on *tier*.

    Parameters
    ----------
    tier : IntervalTier
        Tier whose interval marks are measured

    Returns
    -------
    float
        Average length of the unique labels; 0 for a tier with no
        intervals (previously this raised ZeroDivisionError)
    """
    labels = uniqueLabels(tier)
    if not labels:
        # Guard: an empty tier has no labels to average over.
        return 0
    return sum(len(lab) for lab in labels) / len(labels)


def inspect_discourse_textgrid(path):
    """
    Generate a list of AnnotationTypes for a specified TextGrid file

    Parameters
    ----------
    path : str
        Full path to a TextGrid file, or to a directory that is searched
        recursively for ``.TextGrid`` files

    Returns
    -------
    list of AnnotationTypes
        Autodetected AnnotationTypes for the TextGrid file

    Raises
    ------
    PCTError
        If several TextGrids are inspected and they do not all have the
        same number of interval tiers
    """
    trans_delimiters = ['.', ' ', ';', ',']
    textgrids = []
    if os.path.isdir(path):
        # Collect every TextGrid file under the directory tree.
        for root, subdirs, files in os.walk(path):
            for filename in files:
                if not filename.lower().endswith('.textgrid'):
                    continue
                textgrids.append(os.path.join(root, filename))
    else:
        textgrids.append(path)
    anno_types = []
    for t in textgrids:
        tg = load_textgrid(t)
        spellings, segments, attributes = guess_tiers(tg)
        # First guessed segment tier (if any) is the transcription base;
        # first guessed spelling tier (if any) is the anchor.
        if len(segments) == 0:
            base = None
        else:
            base = segments[0]
        if len(spellings) == 0:
            anchor = None
        else:
            anchor = spellings[0]
        interval_tiers = [x for x in tg.tiers if isinstance(x, IntervalTier)]
        if len(anno_types) == 0:
            # First TextGrid seen: build one AnnotationType per tier.
            for ti in interval_tiers:
                if ti.name in spellings:
                    a = AnnotationType(ti.name, base, None,
                                       anchor=True, token=False)
                elif ti.name in segments:
                    a = AnnotationType(ti.name, None, anchor,
                                       base=True, token=True)
                else:
                    labels = uniqueLabels(ti)
                    cat = Attribute.guess_type(labels, trans_delimiters)
                    att = Attribute(Attribute.sanitize_name(ti.name), cat, ti.name)
                    a = AnnotationType(ti.name, None, anchor,
                                       token=False, attribute=att)
                    if cat == 'tier':
                        # Detect which delimiter the transcriptions use;
                        # stop at the first label containing one.
                        for l in labels:
                            for delim in trans_delimiters:
                                if delim in l:
                                    a.trans_delimiter = delim
                                    break
                            if a.trans_delimiter is not None:
                                break
                a.add((x.mark for x in ti), save=False)
                anno_types.append(a)
        else:
            # Subsequent TextGrids must match the first one's layout.
            if len(anno_types) != len(interval_tiers):
                raise(PCTError("The TextGrids must have the same number of tiers."))
            for i, ti in enumerate(interval_tiers):
                anno_types[i].add((x.mark for x in ti), save=False)
    return anno_types
def load_textgrid(path):
    """Read the TextGrid file at *path* into a PCTTextGrid object."""
    tg = PCTTextGrid()
    tg.read(path)
    return tg


def guess_tiers(tg):
    """
    Classify a TextGrid's interval tiers as spelling, segment
    (transcription) or attribute tiers.

    Heuristic: the tier whose unique labels are longest on average is
    taken to be the segment tier (only if that maximum is unique); the
    earliest remaining tier is taken to be the spelling tier; every
    non-segment tier also becomes an attribute tier.

    Parameters
    ----------
    tg : PCTTextGrid
        TextGrid to inspect

    Returns
    -------
    tuple of (list, list, list)
        Names of the spelling, segment and attribute tiers
    """
    segment_tiers = []
    spelling_tiers = []
    attribute_tiers = []
    tier_properties = {}
    interval_tiers = [x for x in tg.tiers if isinstance(x, IntervalTier)]
    for i, t in enumerate(interval_tiers):
        tier_properties[t.name] = (i, len(t), averageLabelLen(t),
                                   len(uniqueLabels(t)))
    # Tier with the longest average label length is probably the
    # transcription tier.
    max_labels = max(tier_properties.values(), key=lambda x: x[2])
    likely_segment = [k for k, v in tier_properties.items() if v == max_labels]
    if len(likely_segment) == 1:
        # BUG FIX: append the tier *name* (likely_segment[0]), not the
        # one-element list itself; previously every later membership
        # test against tier names (e.g. ``ti.name in segments``) failed.
        segment_tiers.append(likely_segment[0])
    likely_spelling = min((x for x in tier_properties.keys()
                           if x not in segment_tiers),
                          key=lambda x: tier_properties[x][0])
    spelling_tiers.append(likely_spelling)
    for k in tier_properties.keys():
        if k in segment_tiers:
            continue
        # NOTE(review): the spelling tier is intentionally(?) also
        # listed as an attribute tier — confirm with callers.
        attribute_tiers.append(k)
    return spelling_tiers, segment_tiers, attribute_tiers


def textgrid_to_data(corpus_name, path, annotation_types,
                     stop_check=None, call_back=None):
    """
    Convert a TextGrid file into a DiscourseData object.

    Parameters
    ----------
    corpus_name : str
        Name for the resulting DiscourseData
    path : str
        Full path to the TextGrid file
    annotation_types : list of AnnotationType
        Parsing specification (typically from
        ``inspect_discourse_textgrid``); reset and mutated in place
    stop_check : callable or None
        Optional early-termination check
    call_back : callable or None
        Optional progress callback

    Returns
    -------
    DiscourseData or None
        None if stop_check triggered an early return

    Raises
    ------
    TextGridIOError
        If no annotation type received any content (empty corpus)
    """
    tg = load_textgrid(path)
    name = corpus_name
    for a in annotation_types:
        a.reset()
    data = DiscourseData(name, annotation_types)
    if call_back is not None:
        call_back('Loading...')
    cur = 0
    # Flags to keep track of whether we should keep adding
    # transcriptions to data.
    transcription_flag = {n: True for n in data.base_levels}
    # data.word_levels holds the names of the spelling tiers; in most
    # cases there is exactly one.
    for word_name in data.word_levels:
        if stop_check is not None and stop_check():
            return
        if call_back is not None:
            cur += 1
            call_back(cur)
        spelling_tier = tg.getFirst(data.data[word_name].output_name)
        for si in spelling_tier:
            if not si.mark:
                # Empty interval: no word here.
                continue
            annotations = dict()
            # si.mark is the actual text, e.g. the spelling of a word.
            word = Annotation(si.mark)
            # data.base_levels holds the names of transcription-type
            # tiers (compare with data.word_levels above).
            for n in data.base_levels:
                if (data[word_name].speaker != data[n].speaker
                        and data[n].speaker is not None):
                    continue
                t = tg.getFirst(data[n].output_name)  # list of Intervals
                tier_elements = list()
                for ti in t:
                    # Keep only phone intervals overlapping this word,
                    # clipped to the word's time span.
                    if ti.maxTime <= si.minTime:
                        continue
                    if ti.minTime >= si.maxTime:
                        break
                    phoneBegin = ti.minTime
                    phoneEnd = ti.maxTime
                    if phoneBegin < si.minTime:
                        phoneBegin = si.minTime
                    if phoneEnd > si.maxTime:
                        phoneEnd = si.maxTime
                    parsed = parse_transcription(ti.mark, data[n])
                    if parsed:
                        parsed[0].begin = phoneBegin
                        parsed[-1].end = phoneEnd
                        tier_elements.extend(parsed)
                if len(tier_elements) > 1:
                    # Only the first element keeps its begin and the
                    # last keeps its end; interior boundaries are
                    # cleared.
                    for j, _ in enumerate(tier_elements):
                        if j == 0:
                            tier_elements[j].end = None
                        elif j == len(tier_elements) - 1:
                            tier_elements[j].begin = None
                        else:
                            tier_elements[j].begin = None
                            tier_elements[j].end = None
                level_count = data.level_length(n)
                word.references.append(n)
                word.begins.append(level_count)
                word.ends.append(level_count + len(tier_elements))
                annotations[n] = tier_elements
            mid_point = (si.maxTime + si.minTime) / 2
            # This catches only things marked as "Other (character)".
            for at in annotation_types:
                if at.ignored:
                    continue
                if at.base:
                    continue
                if at.anchor:
                    continue
                t = tg.getFirst(at.attribute.name)
                ti = t.intervalContaining(mid_point)
                if ti is None:
                    continue
                else:
                    value = ti.mark
                if not value:
                    continue
                value = [Annotation(value)]
                if at.delimited:
                    value = parse_transcription(ti.mark, at)
                if at.token:
                    word.token[at.attribute.name] = value
                else:
                    word.additional[at.attribute.name] = value
                annotations[at.attribute.name] = value
            annotations[word_name] = [word]
            data.add_annotations(transcription_flag=transcription_flag,
                                 **annotations)
            # After the first word is added, stop re-adding
            # transcriptions for each base level.
            transcription_flag = {trans: False
                                  for trans, flag in transcription_flag.items()}
    if all([len(at) == 0 for at in annotation_types]):
        raise(TextGridIOError('Empty corpus',
                              'Currently PCT is not able to import a TextGrid corpus with only a Transcription tier. '
                              'You must select another tier to be Orthography. If your TextGrid file only has one tier, '
                              'please duplicate the tier and select the duplicated tier as Orthography when importing '
                              'the file.',
                              'PCT only supports TextGrid with at least two tiers, and when importing the file, '
                              'there must be at least one Orthography column, in addition to the Transcription column.'))
    return data
def load_discourse_textgrid(corpus_name, path, annotation_types,
                            feature_system_path=None,
                            support_corpus_path=None,
                            stop_check=None, call_back=None):
    """
    Load a discourse from a TextGrid file

    Parameters
    ----------
    corpus_name : str
        Informative identifier to refer to corpus
    path : str
        Full path to TextGrid file
    annotation_types : list of AnnotationType
        List of AnnotationType specifying how to parse the TextGrids.
        Can be generated through ``inspect_discourse_textgrid``.
    feature_system_path : str, optional
        Full path to pickled FeatureMatrix to use with the Corpus
    support_corpus_path : str or Corpus, optional
        Corpus (or path to a pickled Corpus) supplying word information
    stop_check : callable or None
        Optional function to check whether to gracefully terminate early
    call_back : callable or None
        Optional function to supply progress information during the loading

    Returns
    -------
    Discourse
        Discourse object generated from the TextGrid file
    """
    # textgrid_to_data has side-effects that change annotation_types.
    data = textgrid_to_data(corpus_name, path, annotation_types,
                            call_back=call_back, stop_check=stop_check)
    wav_path = find_wav_path(path)
    if support_corpus_path is not None:
        if isinstance(support_corpus_path, Corpus):
            # The corpus is 'preloaded' if this function is called by
            # load_directory_textgrid; otherwise the corpus would have
            # to be loaded once per file in a directory, which could be
            # slow.
            support = support_corpus_path
        else:
            # Otherwise, it's a string path to a pickled corpus.
            support = load_binary(support_corpus_path)
    else:
        support = None
    discourse = data_to_discourse2(corpus_name, wav_path,
                                   annotation_types=annotation_types,
                                   support_corpus=support,
                                   stop_check=stop_check,
                                   call_back=call_back)
    if feature_system_path is not None:
        feature_matrix = load_binary(feature_system_path)
        discourse.lexicon.set_feature_matrix(feature_matrix)
        discourse.lexicon.specifier = modernize.modernize_specifier(
            discourse.lexicon.specifier)
    return discourse
def load_directory_textgrid(corpus_name, path, annotation_types,
                            feature_system_path=None,
                            support_corpus_path=None,
                            stop_check=None, call_back=None):
    """
    Loads a directory of TextGrid files

    Parameters
    ----------
    corpus_name : str
        Name of corpus
    path : str
        Path to directory of TextGrid files
    annotation_types : list of AnnotationType
        List of AnnotationType specifying how to parse the TextGrids.
        Can be generated through ``inspect_discourse_textgrid``.
    feature_system_path : str, optional
        File path of FeatureMatrix binary to specify segments
    support_corpus_path : str, optional
        Path to a pickled support Corpus; loaded once and shared across
        all files
    stop_check : callable or None
        Optional function to check whether to gracefully terminate early
    call_back : callable or None
        Optional function to supply progress information during the loading

    Returns
    -------
    SpontaneousSpeechCorpus
        Corpus containing Discourses corresponding to the TextGrid files
    """
    if call_back is not None:
        call_back('Finding files...')
        call_back(0, 0)
    file_tuples = []
    for root, subdirs, files in os.walk(path):
        for filename in files:
            if stop_check is not None and stop_check():
                return
            if not filename.lower().endswith('.textgrid'):
                continue
            file_tuples.append((root, filename))
    if call_back is not None:
        call_back('Parsing files...')
        call_back(0, len(file_tuples))
        cur = 0
    corpus = SpontaneousSpeechCorpus(corpus_name, path)
    # Load the support corpus once here so each file doesn't reload it.
    if support_corpus_path is not None:
        support = load_binary(support_corpus_path)
    else:
        support = None
    for i, t in enumerate(file_tuples):
        if stop_check is not None and stop_check():
            return
        if call_back is not None:
            call_back('Parsing file {} of {}...'.format(i + 1, len(file_tuples)))
            call_back(i)
        root, filename = t
        name = os.path.splitext(filename)[0]
        # Copy the annotation types so per-file side effects don't leak
        # between files.
        at = annotation_types[:]
        d = load_discourse_textgrid(name, os.path.join(root, filename),
                                    annotation_types=at,
                                    support_corpus_path=support,
                                    stop_check=stop_check,
                                    call_back=call_back)
        corpus.add_discourse(d)
    if feature_system_path is not None:
        feature_matrix = load_binary(feature_system_path)
        corpus.lexicon.set_feature_matrix(feature_matrix)
        corpus.lexicon.specifier = modernize.modernize_specifier(
            corpus.lexicon.specifier)
    return corpus