# Corpus.py

from GEODE.Metadata import fromKey, relativePath
from GEODE.Store.Tabular import tabular, toTSV
import pandas
from os import makedirs
from os.path import dirname, isdir

def abstract(f):
    """
    Decorator marking a function as abstract: the returned replacement ignores
    its arguments and always raises NotImplementedError carrying the name of
    the decorated function.

    :param f: the placeholder function to disable
    :return: a function with the same name and docstring as f which raises
    NotImplementedError whenever it is called
    """
    # imported here so the decorator does not depend on the module's import block
    from functools import wraps

    # keep the name and docstring of f so that introspection and tracebacks
    # show the abstract method rather than a generic 'wrapped'
    @wraps(f)
    def wrapped(*args, **kwargs):
        raise NotImplementedError(f.__name__)
    return wrapped

class Corpus:
    """
    The interface shared by the corpus classes of this module.

    Every method declared here is wrapped by the abstract decorator, so calling
    it (including instantiating Corpus itself) raises NotImplementedError with
    the name of the method; subclasses provide the actual implementations.
    """
    @abstract
    def __init__(self):
        pass

    @abstract
    def get_text(self, primary_key):
        """
        Retrieve the text of one record.

        :param primary_key: the key identifying the record
        """
        pass

    @abstract
    def get_all(self):
        """
        Iterate over the records of the corpus.
        """
        pass

    @abstract
    def save(self, iterator):
        """
        Store records into the corpus.

        :param iterator: the records to store
        """
        pass

class TSVIndexed(Corpus):
    """
    Base class for the corpora described by a TSV file whose rows are
    identified by a set of key columns.

    The table is only read when first needed (see load) and is then indexed by
    the key columns. Subclasses must say where the text of a row comes from by
    implementing content.
    """
    # the columns always used to identify a row
    default_keys = ['work', 'volume', 'article']
    # the method names get_all accepts as a projector given by name
    projectors = ['key', 'content', 'full']

    def __init__(self, tsv_path, column_name):
        """
        Positional arguments
        :param tsv_path: the path to the TSV file describing the corpus
        :param column_name: the name under which the text content appears in
        the records
        """
        self.tsv_path = tsv_path
        self.column_name = column_name
        # filled by load the first time the table is needed
        self.data = None

    def load(self):
        """
        Read the TSV file if it hasn't been read yet, detect its key columns
        and index the table by them. Does nothing once data is set.
        """
        if self.data is None:
            self.data = tabular(self.tsv_path)
            self.detect_keys()
            # drop=False: the key columns must remain readable in each row
            # because the key projector reads them back from the row
            self.data = self.data.set_index(self.keys, drop=False)

    def detect_keys(self):
        """
        Set self.keys to the default keys, extended with 'paragraph' when the
        current data contains it.
        """
        self.keys = self.default_keys.copy()
        if 'paragraph' in self.data:
            self.keys.append('paragraph')

    @abstract
    def content(self, key, row):
        """
        Projector giving the text of a row.

        :param key: the index value of the row
        :param row: the row itself
        """
        pass

    def key(self, _, row):
        """
        Projector giving the key columns of a row as a dict.

        :param _: the index value of the row (unused)
        :param row: the row itself
        """
        return row[self.keys].to_dict()

    def full(self, key, row):
        """
        Projector giving the key columns of a row plus its stripped text,
        stored under column_name, as a dict.

        :param key: the index value of the row
        :param row: the row itself
        """
        return {**self.key(key, row),
                self.column_name: self.content(key, row).strip()}

    def get_all(self, projector=None, where=None):
        """
        Generate the projection of every row of the corpus, loading the table
        first if needed.

        Keyword arguments
        :param projector: what to generate for each row: either a callable
        taking the index value and the row, or one of the names listed in
        projectors; defaults to the full projector
        :param where: an optional predicate taking the index value and the
        row; only the rows for which it holds are projected
        :raises ValueError: when projector is a name not listed in projectors
        (raised when the iteration starts, this method being a generator)
        """
        if projector is None:
            projector = self.full
        elif isinstance(projector, str):
            # fail with an explicit message instead of letting the loop below
            # attempt to call a string
            if projector not in self.projectors:
                raise ValueError(
                    f"unknown projector {projector!r},"
                    f" expected one of {self.projectors}")
            projector = getattr(self, projector)
        self.load()
        # iterrows yields (index value, row) pairs, unpacked as the two
        # arguments of both the predicate and the projector
        for row in self.data.iterrows():
            if where is None or where(*row):
                yield projector(*row)

class SelfContained(TSVIndexed):
    """
    A corpus held entirely in one TSV file: each row carries the key columns
    and the text itself, in the column named column_name.
    """
    def __init__(self, tsv_path, column_name='content'):
        """
        Positional arguments
        :param tsv_path: the path to a TSV dataset containing a primary key and
        a text content on every line

        Keyword arguments
        :param column_name: the name of the column where the text content is
        stored
        """
        TSVIndexed.__init__(self, tsv_path, column_name)

    def get_text(self, primary_key):
        """
        Look up the text content matching a primary key, loading the table
        first if needed.

        :param primary_key: either a dict mapping key names to values (key
        names it doesn't contain are skipped) or an iterable of key values in
        the order of self.keys
        :return: whatever the column_name column holds for the selected part
        of the table
        """
        self.load()
        if isinstance(primary_key, dict):
            # order the values like the index levels
            primary_key = [primary_key[k] for k in self.keys if k in primary_key]
        # xs expects a tuple to address several index levels at once
        primary_key = tuple(primary_key)
        return self.data.xs(primary_key)[self.column_name]

    def content(self, _, row):
        """
        Projector giving the text of a row, read from its column_name column.

        :param _: the index value of the row (unused)
        :param row: the row itself
        """
        return row[self.column_name]

    def save(self, iterator):
        """
        Replace the corpus by the given records and write them to the TSV
        file.

        :param iterator: the records to store, in any form accepted by the
        pandas.DataFrame constructor
        """
        records = pandas.DataFrame(iterator)
        self.data = records
        self.detect_keys()
        toTSV(self.tsv_path, records)
        # leave the table indexed as load would, otherwise get_text cannot
        # find anything right after a save; skipped when the key columns are
        # missing (e.g. nothing was saved) so that such a save still succeeds
        if all(k in records.columns for k in self.keys):
            self.data = records.set_index(self.keys, drop=False)

class Directory(TSVIndexed):
    """
    A corpus stored as a folder: a TSV file lists the primary keys of the
    records and the text of each record is kept in its own file under the
    'Text' sub-folder.
    """
    def __init__(self, root_path, tsv_filename="files", column_name='content'):
        """
        Positional arguments
        :param root_path: the path to a GÉODE-style folder containing the text
        version of the corpus

        Keyword arguments
        :param tsv_filename: the name, without its '.tsv' extension, of the
        file inside root_path listing the records
        :param column_name: the name under which the text content appears in
        the records
        """
        self.text_path = f"{root_path}/Text"
        TSVIndexed.__init__(self, f"{root_path}/{tsv_filename}.tsv", column_name)

    def path_to(self, primary_key):
        """
        Compute the path of the text file of a record.

        :param primary_key: the key of the record, in any form accepted by
        dict_primary_key
        """
        record = self.dict_primary_key(primary_key)
        return f"{self.text_path}/{relativePath(record, 'txt')}"

    def dict_primary_key(self, primary_key):
        """
        Normalise a primary key into the record form expected by relativePath.

        :param primary_key: a pandas Series (converted to a dict), a dict
        (returned unchanged) or anything else, which is handed to fromKey
        """
        if isinstance(primary_key, pandas.Series):
            return dict(primary_key)
        elif isinstance(primary_key, dict):
            return primary_key
        else:
            return fromKey(primary_key)

    def get_text(self, primary_key):
        """
        Read the whole text file of a record.

        :param primary_key: the key of the record, in any form accepted by
        dict_primary_key
        """
        with open(self.path_to(primary_key), 'r') as file:
            return file.read()

    def content(self, key, _):
        """
        Projector giving the text of a row, read from its file.

        :param key: the index value of the row
        :param _: the row itself (unused)
        """
        return self.get_text(key)

    def write_text(self, primary_key, content):
        """
        Write the text file of a record, creating the folders leading to it
        when they are missing.

        :param primary_key: the key of the record, in any form accepted by
        dict_primary_key
        :param content: the text to write
        """
        path = self.path_to(primary_key)
        makedirs(dirname(path), exist_ok=True)
        with open(path, 'w') as file:
            file.write(content)

    def save(self, iterator):
        """
        Replace the corpus by the given records: write the text of each one to
        its own file and the key columns of all of them to the TSV file.

        :param iterator: the records to store, in any form accepted by the
        pandas.DataFrame constructor; each must provide the key columns and
        the text under column_name
        """
        self.data = pandas.DataFrame(iterator)
        self.detect_keys()
        for _, row in self.data.iterrows():
            self.write_text(row, row[self.column_name])
        # only the keys go to the TSV file, the texts live in their own files
        toTSV(self.tsv_path, self.data[self.keys])
        # leave the table indexed as load would: get_all hands the index value
        # of each row to content, which needs the key of the record there and
        # not a row number
        self.data = self.data.set_index(self.keys, drop=False)

def corpus(path, **kwargs):
    """
    Build the corpus object suited to a path.

    Positional arguments
    :param path: a path ending with '/' or naming an existing directory gives
    a Directory, a path ending with '.tsv' gives a SelfContained

    Keyword arguments are passed on to the constructor of the chosen class.

    :raises FileNotFoundError: when the path matches neither case
    """
    # a trailing slash is enough to choose a Directory, the folder doesn't
    # have to exist yet
    names_a_folder = path.endswith('/') or isdir(path)
    if names_a_folder:
        return Directory(path, **kwargs)
    if path.endswith('.tsv'):
        return SelfContained(path, **kwargs)
    raise FileNotFoundError(path)