import os
import re
import corpustools.gui.modernize as modernize
from corpustools.corpus.classes import SpontaneousSpeechCorpus, Corpus, Word, Discourse, WordToken, Attribute
from corpustools.exceptions import DelimiterError, PCTOSError
from .helper import (compile_digraphs, parse_transcription, DiscourseData,
data_to_discourse2, AnnotationType, text_to_lines,
Annotation, BaseAnnotation)
from .binary import load_binary


def inspect_discourse_transcription(path):
"""
Generate a list of AnnotationTypes for a specified text file for parsing
it as a transcribed text
Parameters
----------
path : str
Full path to text file
Returns
-------
list of AnnotationTypes
Autodetected AnnotationTypes for the text file
"""
trans_delimiters = ['.', ';', ',']
    att = Attribute('transcription', 'tier', 'Transcription')
    a = AnnotationType('Transcription', None, None, attribute=att,
                       base=True)
if os.path.isdir(path):
for root, subdirs, files in os.walk(path):
for filename in files:
if not filename.lower().endswith('.txt'):
continue
                with open(os.path.join(root, filename),
                          encoding='utf-8-sig', mode='r') as f:
for line in f.readlines():
trial = line.strip().split()
                        if a.trans_delimiter is None:
                            for t in trial:
                                for delim in trans_delimiters:
                                    if delim in t:
                                        a.trans_delimiter = delim
                                        break
                                # stop scanning once a delimiter has been
                                # found, so a later token cannot overwrite it
                                if a.trans_delimiter is not None:
                                    break
                        a.add(trial, save=False)
else:
with open(path, encoding='utf-8-sig', mode='r') as f:
for line in f.readlines():
trial = line.strip().split()
                if a.trans_delimiter is None:
                    for t in trial:
                        for delim in trans_delimiters:
                            if delim in t:
                                a.trans_delimiter = delim
                                break
                        # stop scanning once a delimiter has been found,
                        # so a later token cannot overwrite it
                        if a.trans_delimiter is not None:
                            break
                a.add(trial, save=False)
annotation_types = [a]
return annotation_types
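

# Usage sketch (the file name 'mycorpus.txt' is hypothetical):
#
#   anno_types = inspect_discourse_transcription('mycorpus.txt')
#   # anno_types[0].trans_delimiter holds the autodetected segment
#   # delimiter, e.g. '.' for input like 'k.a.t k.a.t.s'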


def transcription_text_to_data(corpus_name, path, annotation_types=None,
                               stop_check=None, call_back=None):
name = corpus_name
if annotation_types is None:
annotation_types = inspect_discourse_transcription(path)
for a in annotation_types:
a.reset()
    a = AnnotationType('Spelling', None, None,
                       attribute=Attribute('Spelling', 'spelling', 'Spelling'),
                       anchor=True)
annotation_types.append(a)
data = DiscourseData(name, annotation_types)
lines = text_to_lines(path)
if call_back is not None:
call_back('Processing file...')
call_back(0, len(lines))
cur = 0
    n = 'Transcription'
for line in lines:
if stop_check is not None and stop_check():
return
if call_back is not None:
cur += 1
if cur % 20 == 0:
call_back(cur)
if not line or line == '\n':
continue
        for word in line:
            annotations = dict()
            trans = parse_transcription(word, data[n])
            spell = ''.join(x.label for x in trans)
            if spell == '':
                continue
            # build the Spelling annotation under a distinct name so the
            # loop variable is not shadowed
            word_anno = Annotation(spell)
            tier_elements = trans
            level_count = data.level_length(n)
            word_anno.references.append(n)
            word_anno.begins.append(level_count)
            word_anno.ends.append(level_count + len(tier_elements))
            tier_elements[0].begin = level_count
            tier_elements[-1].end = level_count + len(tier_elements)
            annotations[n] = tier_elements
            annotations['Spelling'] = [word_anno]
            data.add_annotations(**annotations)
return data
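

# Sketch of the intermediate structure this produces (the arguments are
# illustrative):
#
#   data = transcription_text_to_data('demo', 'mycorpus.txt')
#   # data is a DiscourseData with a base 'Transcription' tier and an
#   # anchor 'Spelling' tier, one Annotation per word in the file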


def load_directory_transcription(corpus_name, path, annotation_types=None,
                                 feature_system_path=None,
                                 stop_check=None, call_back=None):
"""
Loads a directory of transcribed texts.
Parameters
----------
corpus_name : str
Name of corpus
path : str
Path to directory of text files
annotation_types : list of AnnotationType, optional
List of AnnotationType specifying how to parse text files
feature_system_path : str, optional
File path of FeatureMatrix binary to specify segments
stop_check : callable, optional
Optional function to check whether to gracefully terminate early
call_back : callable, optional
Optional function to supply progress information during the loading
Returns
-------
SpontaneousSpeechCorpus
Corpus containing Discourses corresponding to the text files
"""
if call_back is not None:
call_back('Finding files...')
call_back(0, 0)
file_tuples = []
for root, subdirs, files in os.walk(path):
for filename in files:
if not filename.lower().endswith('.txt'):
continue
file_tuples.append((root, filename))
if call_back is not None:
call_back('Parsing files...')
        call_back(0, len(file_tuples))
cur = 0
corpus = SpontaneousSpeechCorpus(corpus_name, path)
for i, t in enumerate(file_tuples):
if stop_check is not None and stop_check():
return
if call_back is not None:
            call_back('Parsing file {} of {}...'.format(i + 1, len(file_tuples)))
call_back(i)
root, filename = t
name = os.path.splitext(filename)[0]
d = load_discourse_transcription(name, os.path.join(root,filename),
annotation_types=annotation_types,
lexicon=corpus.lexicon, feature_system_path=feature_system_path,
stop_check=stop_check, call_back=call_back)
corpus.add_discourse(d)
return corpus
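

# Example call (a sketch; the directory path is hypothetical):
#
#   corpus = load_directory_transcription('demo', '/path/to/texts')
#   # every .txt file under the directory becomes a Discourse that is
#   # added to the returned SpontaneousSpeechCorpus via add_discourse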


def load_discourse_transcription(corpus_name, path, annotation_types=None,
                                 lexicon=None, feature_system_path=None,
                                 stop_check=None, call_back=None):
"""
Load a discourse from a text file containing running transcribed text
Parameters
----------
corpus_name : str
Informative identifier to refer to corpus
path : str
Full path to text file
annotation_types : list of AnnotationType, optional
List of AnnotationType specifying how to parse text files
lexicon : Corpus, optional
Corpus to store Discourse word information
feature_system_path : str, optional
Full path to pickled FeatureMatrix to use with the Corpus
stop_check : callable, optional
Optional function to check whether to gracefully terminate early
call_back : callable, optional
Optional function to supply progress information during the loading
Returns
-------
Discourse
Discourse object generated from the text file
"""
if feature_system_path is not None:
if not os.path.exists(feature_system_path):
            raise PCTOSError("The feature path specified ({}) does not exist".format(feature_system_path))
data = transcription_text_to_data(corpus_name, path, annotation_types, stop_check=stop_check, call_back=call_back)
discourse = data_to_discourse2(corpus_name=corpus_name, wav_path=data.wav_path, annotation_types=annotation_types,
stop_check=stop_check, call_back=call_back)
if feature_system_path is not None:
feature_matrix = load_binary(feature_system_path)
discourse.lexicon.set_feature_matrix(feature_matrix)
return discourse
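

# Example (a sketch; both paths are hypothetical):
#
#   d = load_discourse_transcription('demo', 'mycorpus.txt',
#                                    feature_system_path='spe.feature')
#   # d.lexicon has the pickled FeatureMatrix attached afterwards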


def export_discourse_transcription(discourse, path, trans_delim='.',
                                   single_line=False):
"""
Export an transcribed discourse to a text file
Parameters
----------
discourse : Discourse
Discourse object to export
path : str
Path to export to
trans_delim : str, optional
Delimiter for segments, defaults to ``.``
single_line : bool, optional
Flag to enforce all text to be on a single line, defaults to False.
If False, lines are 10 words long.
"""
    with open(path, encoding='utf-8-sig', mode='w') as f:
        count = 0
        for i, wt in enumerate(discourse):
            count += 1
            f.write(trans_delim.join(wt.transcription))
            if i != len(discourse) - 1:
                # break the line after every tenth word unless the caller
                # asked for a single line
                if single_line or count < 10:
                    f.write(' ')
                else:
                    count = 0
                    f.write('\n')
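
# Round-trip sketch (the output path 'exported.txt' is hypothetical):
#
#   export_discourse_transcription(d, 'exported.txt', trans_delim='.')
#   # reloading 'exported.txt' with load_discourse_transcription should
#   # reproduce the same transcriptions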