# Source code for corpustools.corpus.io.text_ilg

import os
import re
from collections import Counter

from corpustools.corpus.classes import SpontaneousSpeechCorpus
from corpustools.corpus.classes import Corpus, Word, Discourse, WordToken, Attribute

from corpustools.exceptions import (DelimiterError, ILGError, ILGLinesMismatchError,
                                ILGWordMismatchError)

from .helper import (compile_digraphs, parse_transcription,
                    DiscourseData, AnnotationType, data_to_discourse,
                    Annotation, BaseAnnotation)

# load_binary is used by the loaders below to read pickled FeatureMatrix files
from .binary import load_binary

def calculate_lines_per_gloss(lines):
    """Guess how many lines make up a single gloss unit, based on runs of
    lines that share the same number of words."""
    line_counts = [len(x[1]) for x in lines]
    equaled = list()
    number = 1
    for i, line in enumerate(line_counts):
        if i == 0:
            equaled.append(False)
        else:
            equaled.append(line == line_counts[i-1])
    if False not in equaled[1:]:
        # All lines happen to have the same length; fall back to the largest
        # divisor between 2 and 5 of the total number of lines (no break, so
        # later divisors overwrite earlier ones)
        for i in range(2, 6):
            if len(lines) % i == 0:
                number = i
    else:
        # Measure the distances between lines whose word count differs from
        # the previous line; the most common interval is the gloss size
        false_intervals = list()
        ind = 0
        for i, e in enumerate(equaled):
            if i == 0:
                continue
            if not e:
                false_intervals.append(i - ind)
                ind = i
        false_intervals.append(i + 1 - ind)
        counter = Counter(false_intervals)
        number = max(counter.keys(), key = lambda x: (counter[x], x))
        if number > 10:
            # An implausibly large estimate falls back to the next largest
            # interval actually observed
            prev_maxes = set([number])
            while number > 10:
                prev_maxes.add(number)
                number = max(x for x in false_intervals if x not in prev_maxes)
    return number
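# A minimal sketch of the heuristic above (hypothetical input, not from any
# corpus): four non-empty lines whose word counts run 2, 2, 3, 3 give equal-
# length runs of size 2, so two lines per gloss is inferred:
#
#     lines = [(0, ['the', 'dog']), (1, ['DET', 'N']),
#              (2, ['a', 'cat', 'ran']), (3, ['DET', 'N', 'V'])]
#     calculate_lines_per_gloss(lines)  # -> 2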

def most_frequent_value(dictionary):
    """Return the most common value in a dictionary (ties broken arbitrarily)."""
    c = Counter(dictionary.values())
    return max(c.keys(), key = lambda x: c[x])
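# For example (hypothetical values), if two files are detected at two lines
# per gloss and one at three, the consensus is two:
#
#     most_frequent_value({'a.txt': 2, 'b.txt': 2, 'c.txt': 3})  # -> 2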

def inspect_discourse_ilg(path, number = None):
    """
    Generate a list of AnnotationTypes for a specified text file for
    parsing it as an interlinear gloss text file

    Parameters
    ----------
    path : str
        Full path to text file
    number : int, optional
        Number of lines per gloss; if not supplied, it is auto-detected

    Returns
    -------
    list of AnnotationTypes
        Autodetected AnnotationTypes for the text file
    """
    trans_delimiters = ['.', ';', ',']

    lines = {}
    if os.path.isdir(path):
        numbers = {}
        for root, subdirs, files in os.walk(path):
            for filename in files:
                if not filename.lower().endswith('.txt'):
                    continue
                p = os.path.join(root, filename)
                lines[p] = text_to_lines(p)
                numbers[p] = calculate_lines_per_gloss(lines[p])
        if number is None:
            number = most_frequent_value(numbers)
    else:
        lines[path] = text_to_lines(path)
        if number is None:
            number = calculate_lines_per_gloss(lines[path])
        p = path
    annotation_types = []
    for i in range(number):
        name = 'Line {}'.format(i + 1)
        if i == 0:
            # The first line of each gloss anchors the others as spelling
            att = Attribute('spelling', 'spelling', 'Spelling')
            a = AnnotationType(name, None, None, anchor = True,
                                token = False, attribute = att)
        else:
            labels = lines[p][i][1]
            cat = Attribute.guess_type(labels, trans_delimiters)
            att = Attribute(Attribute.sanitize_name(name), cat, name)
            a = AnnotationType(name, None, annotation_types[0].name,
                                token = False, attribute = att)
            if cat == 'tier' and a.trans_delimiter is None:
                # Guess the transcription delimiter from the labels themselves
                for l in labels:
                    for delim in trans_delimiters:
                        if delim in l:
                            a.trans_delimiter = delim
                            break
                    if a.trans_delimiter is not None:
                        break
        a.add(lines[p][i][1], save = False)
        annotation_types.append(a)
    for k, v in lines.items():
        if k == p:
            continue
        for i in range(number):
            labels = lines[k][i][1]
            annotation_types[i].add(labels, save = False)
    return annotation_types
def text_to_lines(path):
    # A None delimiter makes str.split() split on any run of whitespace
    delimiter = None
    with open(path, encoding='utf-8-sig', mode='r') as f:
        text = f.read()
        if delimiter is not None and delimiter not in text:
            e = DelimiterError('The delimiter specified does not create multiple words. Please specify another delimiter.')
            raise e
    lines = enumerate(text.splitlines())
    lines = [(x[0], x[1].strip().split(delimiter)) for x in lines
                if x[1].strip() != '']
    return lines

def ilg_to_data(path, annotation_types, stop_check = None, call_back = None):
    #if 'spelling' not in line_names:
    #    raise(PCTError('Spelling required for parsing interlinear gloss files.'))
    lines = text_to_lines(path)

    if len(lines) % len(annotation_types) != 0:
        raise ILGLinesMismatchError(lines)

    if call_back is not None:
        call_back('Processing file...')
        call_back(0, len(lines))
    index = 0

    name = os.path.splitext(os.path.split(path)[1])[0]

    for a in annotation_types:
        a.reset()
    data = DiscourseData(name, annotation_types)

    mismatching_lines = list()
    while index < len(lines):
        cur_line = {}
        mismatch = False
        for line_ind, annotation_type in enumerate(annotation_types):
            if annotation_type.name == 'ignore':
                continue
            actual_line_ind, line = lines[index + line_ind]
            # Every line within a gloss should have the same number of words
            if len(cur_line.values()) != 0 and \
                    len(list(cur_line.values())[-1]) != len(line):
                mismatch = True
            if annotation_type.delimited:
                line = [parse_transcription(x, annotation_type) for x in line]
            cur_line[annotation_type.name] = line
        if mismatch:
            start_line = lines[index][0]
            end_line = start_line + len(annotation_types)
            mismatching_lines.append(((start_line, end_line), cur_line))
        if len(mismatching_lines) > 0:
            # Once a mismatch is found, keep scanning to collect all of them
            index += len(annotation_types)
            continue

        for word_name in data.word_levels:
            for i, s in enumerate(cur_line[word_name]):
                annotations = {}
                word = Annotation(s)
                for n in data.base_levels:
                    tier_elements = cur_line[n][i]
                    level_count = data.level_length(n)
                    word.references.append(n)
                    word.begins.append(level_count)
                    word.ends.append(level_count + len(tier_elements))
                    annotations[n] = tier_elements
                for line_type in cur_line.keys():
                    if data[line_type].ignored:
                        continue
                    if data[line_type].base:
                        continue
                    if data[line_type].anchor:
                        continue
                    if data[line_type].token:
                        word.token[line_type] = cur_line[line_type][i]
                    else:
                        word.additional[line_type] = cur_line[line_type][i]
                annotations[word_name] = [word]
                data.add_annotations(**annotations)
        index += len(annotation_types)
    if len(mismatching_lines) > 0:
        raise ILGWordMismatchError(mismatching_lines)
    return data
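# text_to_lines() returns (line_number, words) pairs with blank lines
# dropped; for the hypothetical 'glosses.txt' above it would produce:
#
#     [(0, ['the', 'dog']), (1, ['D.i', 'd.O.g'])]
#
# ilg_to_data() then consumes the lines in groups of len(annotation_types),
# pairing each word on the anchor line with the same-index item on the
# dependent lines.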
def load_discourse_ilg(corpus_name, path, annotation_types,
                        lexicon = None,
                        feature_system_path = None,
                        stop_check = None, call_back = None):
    """
    Load a discourse from a text file containing interlinear glosses

    Parameters
    ----------
    corpus_name : str
        Informative identifier to refer to corpus
    path : str
        Full path to text file
    annotation_types : list of AnnotationType
        List of AnnotationType specifying how to parse the glosses.
        Can be generated through ``inspect_discourse_ilg``.
    lexicon : Corpus, optional
        Corpus to store Discourse word information
    feature_system_path : str, optional
        Full path to pickled FeatureMatrix to use with the Corpus
    stop_check : callable or None
        Optional function to check whether to gracefully terminate early
    call_back : callable or None
        Optional function to supply progress information during the loading

    Returns
    -------
    Discourse
        Discourse object generated from the text file
    """
    data = ilg_to_data(path, annotation_types, stop_check, call_back)

    discourse = data_to_discourse(data, lexicon)

    if feature_system_path is not None:
        feature_matrix = load_binary(feature_system_path)
        discourse.lexicon.set_feature_matrix(feature_matrix)
    return discourse
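# A usage sketch (hypothetical paths), combining inspection and loading:
#
#     annotation_types = inspect_discourse_ilg('glosses.txt')
#     discourse = load_discourse_ilg('my_corpus', 'glosses.txt',
#                                    annotation_types)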
def load_directory_ilg(corpus_name, path, annotation_types,
                        feature_system_path = None,
                        stop_check = None, call_back = None):
    """
    Loads a directory of interlinear gloss text files

    Parameters
    ----------
    corpus_name : str
        Name of corpus
    path : str
        Path to directory of text files
    annotation_types : list of AnnotationType
        List of AnnotationType specifying how to parse the glosses.
        Can be generated through ``inspect_discourse_ilg``.
    feature_system_path : str, optional
        File path of FeatureMatrix binary to specify segments
    stop_check : callable or None
        Optional function to check whether to gracefully terminate early
    call_back : callable or None
        Optional function to supply progress information during the loading

    Returns
    -------
    SpontaneousSpeechCorpus
        Corpus containing Discourses corresponding to the text files
    """
    if call_back is not None:
        call_back('Finding files...')
        call_back(0, 0)
    file_tuples = []
    for root, subdirs, files in os.walk(path):
        for filename in files:
            if not filename.lower().endswith('.txt'):
                continue
            file_tuples.append((root, filename))
    if call_back is not None:
        call_back('Parsing files...')
        call_back(0, len(file_tuples))
    corpus = SpontaneousSpeechCorpus(corpus_name, path)
    for i, t in enumerate(file_tuples):
        if stop_check is not None and stop_check():
            return
        if call_back is not None:
            call_back('Parsing file {} of {}...'.format(i + 1, len(file_tuples)))
            call_back(i)
        root, filename = t
        name = os.path.splitext(filename)[0]
        d = load_discourse_ilg(name, os.path.join(root, filename),
                                annotation_types,
                                corpus.lexicon, None,
                                stop_check, call_back)
        corpus.add_discourse(d)

    if feature_system_path is not None:
        feature_matrix = load_binary(feature_system_path)
        corpus.lexicon.set_feature_matrix(feature_matrix)
    return corpus
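# The directory loader applies the same annotation types to every .txt file
# under the path (hypothetical directory name):
#
#     corpus = load_directory_ilg('my_corpus', '/data/glosses',
#                                 annotation_types)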
[docs] """ Export a discourse to an interlinear gloss text file, with a maximal line size of 10 words Parameters ---------- discourse : Discourse Discourse object to export path : str Path to export to trans_delim : str, optional Delimiter for segments, defaults to ``.`` """ with open(path, encoding='utf-8', mode='w') as f: spellings = list() transcriptions = list() for wt in discourse: spellings.append(wt.spelling) transcriptions.append(trans_delim.join(wt.transcription)) if len(spellings) > 10: f.write(' '.join(spellings)) f.write('\n') f.write(' '.join(transcriptions)) f.write('\n') spellings = list() transcriptions = list() if spellings: f.write(' '.join(spellings)) f.write('\n') f.write(' '.join(transcriptions)) f.write('\n')