Source code for corpustools.corpus.io.csv

from csv import DictReader, DictWriter
import os

from corpustools.corpus.classes.lexicon import Corpus, FeatureMatrix, Word, Attribute, Transcription
from corpustools.corpus.io.binary import save_binary, load_binary

from .helper import parse_transcription, AnnotationType, SyllableBaseAnnotation

from corpustools.exceptions import DelimiterError, PCTEncodingError, PCTError
import corpustools.gui.modernize as modernize


def inspect_csv(path, num_lines=10, coldelim=None, transdelim=None):
    """
    Generate a list of AnnotationTypes for parsing a specified text file
    as a column-delimited file

    Parameters
    ----------
    path : str
        Full path to text file
    num_lines : int, optional
        The number of lines to parse from the file
    coldelim : str, optional
        A prespecified column delimiter to use; autodetected if not
        supplied
    transdelim : list, optional
        A prespecified set of transcription delimiters to look for;
        autodetected if not supplied

    Returns
    -------
    list of AnnotationType
        Autodetected AnnotationTypes for the text file
    str
        The column delimiter that was supplied or detected
    """
    if coldelim is not None:
        common_delimiters = [coldelim]
    else:
        common_delimiters = [',','\t',':','|']
    if transdelim is not None:
        trans_delimiters = [transdelim]
    else:
        trans_delimiters = ['.',' ', ';', ',']

    try:
        with open(path, 'r', encoding='utf-8-sig') as f:
            lines = []
            head = f.readline().strip()
            for line in f.readlines():
                if line != '\n':
                    lines.append(line.strip())
    except UnicodeDecodeError:
        raise PCTEncodingError("PCT cannot decode your text file. Make sure it is in UTF-8.\n\n"
                               "To convert your file to UTF-8, please open it in Notepad or TextEdit "
                               "and then 'Save as' with the encoding set to 'UTF-8.'")

    best = ''  # best guess for the column delimiter (candidates: ',', '\t', ':', and '|')
    num = 1
    for d in common_delimiters:
        trial = len(head.split(d))
        if trial > num:
            num = trial
            best = d
    if best == '':
        raise DelimiterError('The column delimiter specified did not create multiple columns.')

    head = head.split(best)
    vals = {h: list() for h in head}     # map each header (the first row of the file) to a list of its column values

    for line in lines:
        l = line.strip().split(best)     # split each remaining row on the detected delimiter
        if len(l) != len(head):
            raise PCTError('Row {} does not have the same number of columns as the header {}'.format(l, head))
        for i in range(len(head)):
            vals[head[i]].append(l[i])
    atts = list()
    freq_flag = False  # the first numeric column becomes the corpus's single frequency column
    for h in head:
        if h in ['Transcription', 'transcription']:  # an explicit 'Transcription' header is always a tier
            cat = 'tier'
        else:  # otherwise guess the attribute type (numeric, tier, factor, or spelling) from the contents
            cat = Attribute.guess_type(vals[h][:num_lines], trans_delimiters)
        if cat == 'numeric' and not freq_flag:
            cat = 'freq'
            freq_flag = True
        att = Attribute(Attribute.sanitize_name(h), cat, h)
        a = AnnotationType(h, None, None, token=False, attribute=att)
        if cat == 'tier':
            for t in trans_delimiters:
                if t in vals[h][0] or t in vals[h][-1]:
                    a.trans_delimiter = t
                    break
        a.add(vals[h], save=False)
        atts.append(a)

    return atts, best
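
# Illustrative usage (a sketch, not part of the module; 'lexicon.csv' is a
# hypothetical file). inspect_csv returns the autodetected AnnotationTypes
# together with the winning column delimiter, which the loaders below consume:
#
#     atts, coldelim = inspect_csv('lexicon.csv')
#     for a in atts:
#         print(a.name, a.attribute.att_type, a.trans_delimiter)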

def check_feature_coverage_csv(corpus_name, path, delimiter, annotation_types=None, feature_system_path=None,
                               stop_check=None, call_back=None):
    """
    Collect the set of segments in a column-delimited text file that are
    not covered by the specified feature system
    """
    if feature_system_path is None or not os.path.exists(feature_system_path):
        # Without a feature system there is nothing to check coverage against
        return set()
    feature_matrix = load_binary(feature_system_path)
    feature_matrix = modernize.modernize_specifier(feature_matrix)

    if annotation_types is None:
        annotation_types, delimiter = inspect_csv(path, coldelim=delimiter)

    for a in annotation_types:
        a.reset()

    missing = set()

    with open(path, encoding='utf-8-sig') as f:
        headers = f.readline()
        headers = headers.split(delimiter)
        if len(headers) == 1:
            e = DelimiterError(('Could not parse the corpus.\nCheck that the column delimiter you typed '
                                'in matches the one used in the file.'))
            raise e
        headers = annotation_types  # iterate with AnnotationTypes in place of the raw header strings

        for line in f.readlines():
            line = line.strip()
            if not line:
                continue

            for k, v in zip(headers, line.split(delimiter)):
                v = v.strip()
                if k.attribute.att_type == 'tier':
                    ignored = k.ignored_characters
                    if ignored is not None:
                        v = ''.join(x for x in v if x not in ignored)

                    sd = k.syllable_delimiter
                    if sd is not None:
                        syllables = v.split(sd)
                    else:
                        syllables = [v]

                    td = k.trans_delimiter
                    stress_spec = set(k.stress_specification.keys())
                    tone_spec = set(k.tone_specification.keys())
                    supra_spec = stress_spec.union(tone_spec)
                    for syllable in syllables:
                        syllable = ''.join(x for x in syllable if x not in supra_spec)

                        if td is None:
                            if k.digraph_pattern is not None:
                                string = k.digraph_pattern.findall(syllable)
                            else:
                                string = [x for x in syllable]
                        else:
                            string = syllable.split(td)

                        for seg in string:
                            if seg == '':
                                continue

                            if seg not in feature_matrix.segments:
                                missing.add(seg)

    # Report back any segments the feature system does not cover
    return missing

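# Illustrative usage (a sketch; both paths are hypothetical): collect the
# segments in the file that the chosen feature system does not cover before
# committing to a full load:
#
#     missing = check_feature_coverage_csv('lexicon', 'lexicon.csv', ',',
#                                          feature_system_path='ipa2spe.feature')
#     if missing:
#         print('Segments without feature specifications:', sorted(missing))
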
def load_corpus_csv(corpus_name, path, delimiter, annotation_types=None,
                    feature_system_path=None, stop_check=None, call_back=None):
    """
    Load a corpus from a column-delimited text file

    Parameters
    ----------
    corpus_name : str
        Informative identifier to refer to corpus
    path : str
        Full path to text file
    delimiter : str
        Character to use for splitting lines into columns
    annotation_types : list of AnnotationType, optional
        List of AnnotationType specifying how to parse text files
    feature_system_path : str
        Full path to pickled FeatureMatrix to use with the Corpus
    stop_check : callable, optional
        Optional function to check whether to gracefully terminate early
    call_back : callable, optional
        Optional function to supply progress information during the function

    Returns
    -------
    Corpus
        Corpus object generated from the text file
    """
    check_feature_coverage_csv(corpus_name, path, delimiter, annotation_types,
                               feature_system_path, stop_check, call_back)

    corpus = Corpus(corpus_name)
    feature_matrix = None  # stays None when no feature system is supplied
    if feature_system_path is not None and os.path.exists(feature_system_path):
        feature_matrix = load_binary(feature_system_path)
        feature_matrix = modernize.modernize_specifier(feature_matrix)
        corpus.set_feature_matrix(feature_matrix)

    if annotation_types is None:
        annotation_types, delimiter = inspect_csv(path, coldelim=delimiter)

    for a in annotation_types:
        a.reset()

    if call_back is not None:
        call_back('Loading...')
        call_back(0, 0)
        cur = 0

    with open(path, encoding='utf-8-sig') as f:
        headers = f.readline()
        headers = headers.split(delimiter)
        if len(headers) == 1:
            e = DelimiterError(('Could not parse the corpus.\nCheck that the column delimiter you typed '
                                'in matches the one used in the file.'))
            raise e

        headers = annotation_types
        for a in headers:
            corpus.add_attribute(a.attribute)

        trans_check = True
        for line in f.readlines():
            if stop_check is not None and stop_check():
                return
            if call_back is not None:
                cur += 1
                call_back(cur)
            line = line.strip()
            if not line:  # blank or just a newline
                continue

            d = {}  # d is the dictionary to be fed as the argument of Word()
            for k, v in zip(headers, line.split(delimiter)):
                v = v.strip()
                if k.attribute.att_type == 'tier':  # if dealing with a transcription column
                    # trans is a list of BaseAnnotation
                    trans = parse_transcription(v, k, feature_matrix=feature_matrix, corpus=corpus)
                    if not trans_check and len(trans) > 1:
                        trans_check = True
                    d[k.attribute.name] = (k.attribute, trans)
                else:
                    d[k.attribute.name] = (k.attribute, v)

            word = Word(**d)
            if word.transcription:
                # transcriptions can have phonetic symbol delimiters
                if not word.spelling:
                    word.spelling = ''.join(map(str, word.transcription))
            corpus.add_word(word, allow_duplicates=True)

    if corpus.specifier is not None:
        corpus.inventory.update_features(corpus.specifier)

    if corpus.has_transcription and any(len(word.transcription) > 1 for word in corpus):
        if not trans_check:
            e = DelimiterError(('Could not parse transcriptions with that delimiter. '
                                '\nCheck that the transcription delimiter you typed '
                                'in matches the one used in the file.'))
            raise e

    if stop_check is not None and stop_check():
        return

    return corpus
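
# Illustrative usage (a sketch; file paths are hypothetical). A loaded corpus
# can be saved with save_binary, imported at the top of this module:
#
#     corpus = load_corpus_csv('lexicon', 'lexicon.csv', ',',
#                              feature_system_path='ipa2spe.feature')
#     save_binary(corpus, 'lexicon.corpus')
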
def load_feature_matrix_csv(name, path, delimiter, stop_check=None, call_back=None):
    """
    Load a FeatureMatrix from a column-delimited text file

    Parameters
    ----------
    name : str
        Informative identifier to refer to feature system
    path : str
        Full path to text file
    delimiter : str
        Character to use for splitting lines into columns
    stop_check : callable, optional
        Optional function to check whether to gracefully terminate early
    call_back : callable, optional
        Optional function to supply progress information during the function

    Returns
    -------
    FeatureMatrix
        FeatureMatrix generated from the text file
    """
    text_input = []
    with open(path, encoding='utf-8-sig', mode='r') as f:
        reader = DictReader(f, delimiter=delimiter)
        lines = list(reader)

    if call_back is not None:
        call_back('Reading file...')
        call_back(0, len(lines))

    for i, line in enumerate(lines):
        if stop_check is not None and stop_check():
            return
        if call_back is not None:
            call_back(i)
        if line:
            if len(line.keys()) == 1:
                raise DelimiterError('Only one column was found; check that the delimiter matches the file')
            if 'symbol' not in line:
                raise KeyError('The feature file has no "symbol" column')
            # Compat: keep only the first character of each feature value
            newline = {}
            for k, v in line.items():
                if k == 'symbol':
                    newline[k] = v
                elif v is not None:
                    newline[k] = v[0]
            text_input.append(newline)

    feature_matrix = FeatureMatrix(name, text_input)
    feature_matrix.validate()
    return feature_matrix
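
# Illustrative usage (a sketch; 'spe.txt' is a hypothetical tab-delimited file
# with a 'symbol' column followed by one column per feature):
#
#     fm = load_feature_matrix_csv('spe', 'spe.txt', '\t')
#     save_binary(fm, 'spe.feature')
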
def make_safe(value, seg_delimiter, syll_delimiter=None):
    """
    Recursively parse transcription lists into strings for saving

    Parameters
    ----------
    value : object
        Object to make into string
    seg_delimiter : str
        Character to mark boundaries between segments
    syll_delimiter : str, optional
        Character to mark boundaries between syllables

    Returns
    -------
    str
        Safe string
    """
    if isinstance(value, Transcription):
        if syll_delimiter is not None:
            return syll_delimiter.join(map(lambda x: make_safe(list(x), seg_delimiter),
                                           value._syllable_list))
        else:
            return seg_delimiter.join(map(lambda x: make_safe(x, seg_delimiter), value.list))
    elif isinstance(value, list):
        return seg_delimiter.join(map(lambda x: make_safe(x, seg_delimiter), value))
    return str(value)
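
# Illustrative behavior (a sketch): plain values pass through str(), while lists
# and Transcription objects are joined with the segment delimiter:
#
#     make_safe(42, '.')               # '42'
#     make_safe(['k', 'a', 't'], '.')  # 'k.a.t'
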
def export_corpus_csv(corpus, path, delimiter=',', trans_delimiter='.',
                      syll_delimiter=None, variant_behavior=None):
    """
    Save a corpus as a column-delimited text file

    Parameters
    ----------
    corpus : Corpus
        Corpus to save to text file
    path : str
        Full path to write text file
    delimiter : str
        Character to mark boundaries between columns. Defaults to ','
    trans_delimiter : str
        Character to mark boundaries in transcriptions. Defaults to '.'
    syll_delimiter : str, optional
        Character to mark boundaries between syllables. Defaults to None
        and is only used when the corpus has syllables
    variant_behavior : str, optional
        How to treat variants: 'token' will have a line for each variant,
        'column' will have a single column for all variants for a word,
        and the default will not include variants in the output
    """
    header = []
    for a in corpus.attributes:
        header.append(str(a))

    if variant_behavior == 'token':
        for a in corpus.attributes:
            if a.att_type == 'tier':
                header.append('Token_' + str(a))
        header.append('Token_Frequency')
    elif variant_behavior == 'column':
        header += ['Variants']

    with open(path, encoding='utf-8-sig', mode='w') as f:
        print(delimiter.join(header), file=f)

        for word in corpus.iter_sort():
            word_outline = []
            for a in corpus.attributes:
                word_outline.append(make_safe(getattr(word, a.name), trans_delimiter, syll_delimiter))

            if variant_behavior == 'token':
                var = word.variants()
                for v, freq in var.items():
                    token_line = []
                    for a in corpus.attributes:
                        if a.att_type == 'tier':
                            if a.name == 'transcription':
                                token_line.append(make_safe(v, trans_delimiter))
                            else:
                                segs = a.range
                                t = v.match_segments(segs)
                                token_line.append(make_safe(v, trans_delimiter))
                    token_line.append(make_safe(freq, trans_delimiter))
                    print(delimiter.join(word_outline + token_line), file=f)
                continue
            elif variant_behavior == 'column':
                var = word.variants()
                d = ', '
                if delimiter == ',':
                    d = '; '
                var = d.join(make_safe(x, trans_delimiter)
                             for x in sorted(var.keys(), key=lambda y: var[y]))
                word_outline.append(var)

            print(delimiter.join(word_outline), file=f)
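
# Illustrative usage (a sketch; the output path is hypothetical). Writing each
# pronunciation variant of a word on its own line:
#
#     export_corpus_csv(corpus, 'lexicon_out.csv', delimiter=',',
#                       trans_delimiter='.', variant_behavior='token')
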
def export_feature_matrix_csv(feature_matrix, path, delimiter=','):
    """
    Save a FeatureMatrix as a column-delimited text file

    Parameters
    ----------
    feature_matrix : FeatureMatrix
        FeatureMatrix to save to text file
    path : str
        Full path to write text file
    delimiter : str
        Character to mark boundaries between columns. Defaults to ','
    """
    with open(path, encoding='utf-8-sig', mode='w', newline='') as f:
        header = ['symbol'] + feature_matrix.features
        writer = DictWriter(f, header, delimiter=delimiter)
        writer.writeheader()  # write the header row ('symbol' plus the feature names)
        for seg in feature_matrix.segments:
            # write one row of feature values per segment in the inventory,
            # skipping the word-boundary and empty-string placeholder symbols
            if seg in ['#', '']:
                continue
            featline = feature_matrix.seg_to_feat_line(seg)
            outdict = {header[i]: featline[i] for i in range(len(header))}
            writer.writerow(outdict)
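
# Illustrative usage (a sketch; the output path is hypothetical). A corpus's
# attached feature system lives on corpus.specifier:
#
#     export_feature_matrix_csv(corpus.specifier, 'features_out.csv', delimiter=',')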