# Source code for corpustools.corpus.io.text_spelling

import os

from corpustools.corpus.classes import SpontaneousSpeechCorpus, Discourse, Attribute, Corpus

from corpustools.exceptions import PCTOSError
from .binary import load_binary

from .helper import (DiscourseData, Annotation, BaseAnnotation,
                     data_to_discourse, data_to_discourse2, AnnotationType, text_to_lines)

def inspect_discourse_spelling(path, support_corpus_path = None):
    """
    Generate a list of AnnotationTypes for a specified text file for parsing
    it as an orthographic text

    Parameters
    ----------
    path : str
        Full path to text file
    support_corpus_path : str, optional
        Full path to a corpus to look up transcriptions from spellings
        in the text

    Returns
    -------
    list of AnnotationTypes
        Autodetected AnnotationTypes for the text file
    """
    a = AnnotationType('Spelling', None, None, anchor = True, token = False)
    if os.path.isdir(path):
        for root, subdirs, files in os.walk(path):
            for filename in files:
                if not filename.lower().endswith('.txt'):
                    continue
                with open(os.path.join(root, filename), encoding='utf-8-sig', mode='r') as f:
                    for line in f.readlines():
                        trial = line.strip().split()
                        a.add(trial, save = False)
    else:
        with open(path, encoding='utf-8-sig', mode='r') as f:
            for line in f.readlines():
                trial = line.strip().split()
                a.add(trial, save = False)
    annotation_types = [a]
    if support_corpus_path is not None:
        annotation_types += [AnnotationType('Transcription', None, None, base = True)]
    return annotation_types
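
# A minimal usage sketch ('story.txt' and 'lexicon.corpus' below are
# hypothetical paths, not files shipped with the library). Without a support
# corpus only the anchoring 'Spelling' type is returned; with one, a base
# 'Transcription' type is added as well:
#
#     anno_types = inspect_discourse_spelling('story.txt')
#     # -> [AnnotationType('Spelling', ...)]
#     anno_types = inspect_discourse_spelling('story.txt',
#                                             support_corpus_path = 'lexicon.corpus')
#     # -> [AnnotationType('Spelling', ...), AnnotationType('Transcription', ...)]
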
def spelling_text_to_data(corpus_name, path, annotation_types = None,
                          support_corpus_path = None, ignore_case = True,
                          stop_check = None, call_back = None):
    """
    Parse an orthographic text file into a DiscourseData object, optionally
    looking up a transcription for each spelling in a support corpus.
    """
    name = corpus_name
    if annotation_types is None:
        annotation_types = inspect_discourse_spelling(path, support_corpus_path)

    if support_corpus_path is not None:
        if isinstance(support_corpus_path, Corpus):
            support = support_corpus_path
        else:
            if not os.path.exists(support_corpus_path):
                raise PCTOSError("The corpus path specified ({}) does not exist".format(support_corpus_path))
            support = load_binary(support_corpus_path)
        a = AnnotationType('Transcription', None, None,
                           attribute = Attribute('Transcription', 'transcription', 'Transcription'),
                           base = True, is_default = True)
        annotation_types.append(a)

    for a in annotation_types:
        a.reset()

    data = DiscourseData(name, annotation_types)

    lines = text_to_lines(path)
    if call_back is not None:
        call_back('Processing file...')
        call_back(0, len(lines))
        cur = 0
    for line in lines:
        if stop_check is not None and stop_check():
            return
        if call_back is not None:
            cur += 1
            if cur % 20 == 0:
                call_back(cur)
        if not line or line == '\n':
            continue
        annotations = {}
        for word in line:
            spell = word.strip()
            spell = ''.join(x for x in spell if x not in data['Spelling'].ignored_characters)
            if spell == '':
                continue
            word = Annotation(spell)
            if support_corpus_path is not None:
                try:
                    trans = support.find(spell, ignore_case = ignore_case).transcription
                except KeyError:
                    # Spellings missing from the support corpus get an empty
                    # transcription rather than raising an error.
                    trans = []
                n = data.base_levels[0]
                tier_elements = [BaseAnnotation(x) for x in trans]
                level_count = data.level_length(n)
                word.references.append(n)
                word.begins.append(level_count)
                word.ends.append(level_count + len(tier_elements))
                annotations[n] = tier_elements
            annotations['Spelling'] = [word]
            data.add_annotations(**annotations)
    return data
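
# Internal-helper sketch (hypothetical paths): the returned DiscourseData
# holds one Annotation per whitespace-delimited word and, when a support
# corpus is given, one BaseAnnotation per segment of each looked-up
# transcription.
#
#     data = spelling_text_to_data('mycorpus', 'story.txt',
#                                  support_corpus_path = 'lexicon.corpus')
#     # data is turned into a Discourse by load_discourse_spelling below.
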
def load_directory_spelling(corpus_name, path, annotation_types = None,
                            support_corpus_path = None, ignore_case = False,
                            stop_check = None, call_back = None):
    """
    Load a directory of orthographic texts

    Parameters
    ----------
    corpus_name : str
        Name of corpus
    path : str
        Path to directory of text files
    annotation_types : list of AnnotationType, optional
        List of AnnotationType specifying how to parse text files
    support_corpus_path : str, optional
        File path of corpus binary to load transcriptions from
    ignore_case : bool, optional
        Specifies whether lookups in the support corpus should ignore case
    stop_check : callable, optional
        Optional function to check whether to gracefully terminate early
    call_back : callable, optional
        Optional function to supply progress information during the function

    Returns
    -------
    SpontaneousSpeechCorpus
        Corpus containing Discourses corresponding to the text files
    """
    if call_back is not None:
        call_back('Finding files...')
        call_back(0, 0)
    file_tuples = []
    for root, subdirs, files in os.walk(path):
        for filename in files:
            if not filename.lower().endswith('.txt'):
                continue
            file_tuples.append((root, filename))
    if call_back is not None:
        call_back('Parsing files...')
        call_back(0, len(file_tuples))
    corpus = SpontaneousSpeechCorpus(corpus_name, path)
    if support_corpus_path is not None:
        # Load the support corpus binary once here, rather than once per file.
        support = load_binary(support_corpus_path)
    else:
        support = None
    for i, t in enumerate(file_tuples):
        if stop_check is not None and stop_check():
            return
        if call_back is not None:
            call_back('Parsing file {} of {}...'.format(i + 1, len(file_tuples)))
            call_back(i)
        root, filename = t
        name = os.path.splitext(filename)[0]
        # Take a copy of the annotation types, because they might be altered
        # inside load_discourse_spelling; if so, the change would carry over
        # to later iterations of this loop.
        at = annotation_types[:] if annotation_types is not None else None
        d = load_discourse_spelling(name, os.path.join(root, filename),
                                    annotation_types = at,
                                    support_corpus_path = support,
                                    ignore_case = ignore_case,
                                    stop_check = stop_check,
                                    call_back = call_back)
        corpus.add_discourse(d)
    return corpus
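
# Usage sketch (directory and corpus paths are hypothetical):
#
#     corpus = load_directory_spelling('mycorpus', '/path/to/texts',
#                                      support_corpus_path = 'lexicon.corpus')
#     # corpus is a SpontaneousSpeechCorpus with one Discourse per .txt file
#     # found anywhere under /path/to/texts.
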
def load_discourse_spelling(corpus_name, path, annotation_types = None,
                            support_corpus_path = None, ignore_case = False,
                            stop_check = None, call_back = None):
    """
    Load a discourse from a text file containing running orthographic text

    Parameters
    ----------
    corpus_name : str
        Informative identifier to refer to corpus
    path : str
        Full path to text file
    annotation_types : list of AnnotationType, optional
        List of AnnotationType specifying how to parse text files
    support_corpus_path : str or Corpus, optional
        Full path to a corpus (or an already loaded Corpus) to look up
        transcriptions from spellings in the text
    ignore_case : bool, optional
        Specify whether to ignore case when using spellings in the text
        to look up transcriptions
    stop_check : callable, optional
        Callable that returns a boolean for whether to exit before
        finishing full calculation
    call_back : callable, optional
        Function that can handle strings (text updates of progress),
        tuples of two integers (0, total number of steps) and an integer
        for updating progress out of the total set by a tuple

    Returns
    -------
    Discourse
        Discourse object generated from the text file
    """
    data = spelling_text_to_data(corpus_name, path, annotation_types,
                                 support_corpus_path, ignore_case,
                                 stop_check, call_back)
    if data is None:
        return
    if support_corpus_path is not None:
        if isinstance(support_corpus_path, Corpus):
            # The corpus is 'preloaded' when this function is called by
            # load_directory_spelling; otherwise it would have to be loaded
            # once per file in a directory, which could be slow.
            support = support_corpus_path
        else:
            # Otherwise it is a string representing a path to the corpus.
            support = load_binary(support_corpus_path)
    else:
        support = None
    #discourse = data_to_discourse(data, lexicon, stop_check=stop_check, call_back=call_back)
    discourse = data_to_discourse2(corpus_name = data.name, wav_path = data.wav_path,
                                   annotation_types = annotation_types,
                                   support_corpus = support, ignore_case = ignore_case,
                                   stop_check = stop_check, call_back = call_back)
    if support_corpus_path is not None:
        discourse.lexicon.specifier = support.specifier
    return discourse
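
# Usage sketch (paths hypothetical):
#
#     d = load_discourse_spelling('story', 'story.txt',
#                                 support_corpus_path = 'lexicon.corpus')
#     # d is a Discourse; spellings found in the support corpus carry its
#     # transcriptions, while out-of-lexicon spellings get empty ones.
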
def export_discourse_spelling(discourse, path, single_line = False):
    """
    Export an orthographic discourse to a text file

    Parameters
    ----------
    discourse : Discourse
        Discourse object to export
    path : str
        Path to export to
    single_line : bool, optional
        Flag to put all text on a single line, defaults to False.
        If False, lines are 10 words long.
    """
    with open(path, encoding='utf-8', mode='w') as f:
        count = 0
        for i, wt in enumerate(discourse):
            count += 1
            f.write(wt.spelling)
            if i != len(discourse) - 1:
                # Separate words with spaces; outside single-line mode, start
                # a new line after every tenth word.
                if single_line or count < 10:
                    f.write(' ')
                else:
                    count = 0
                    f.write('\n')
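
# Usage sketch ('story_out.txt' and 'oneline.txt' are hypothetical output
# paths, and d is a previously loaded Discourse):
#
#     export_discourse_spelling(d, 'story_out.txt')                    # 10-word lines
#     export_discourse_spelling(d, 'oneline.txt', single_line = True)  # one line
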