Skip to content
Snippets Groups Projects
Corpus.py 4.73 KiB
Newer Older
from GEODE.Metadata import fromKey, relativePath
from GEODE.Store.Tabular import tabular, toTSV
import pandas
from os import makedirs
from os.path import dirname, isdir

def abstract(f):
    """
    Decorator marking a function as abstract.

    The decorated function's body is never executed: any call to the returned
    wrapper raises NotImplementedError carrying the name of the original
    function, whatever arguments it receives.

    :param f: the function to mark as abstract
    :return: a wrapper with the same name and docstring as `f`
    :raises NotImplementedError: on every call of the wrapper
    """
    # Imported locally so that the decorator does not depend on the
    # module-level import block.
    from functools import wraps

    # `wraps` keeps __name__ / __doc__ of the abstract method, so that
    # introspection and tracebacks do not show every such method as "wrapped".
    @wraps(f)
    def wrapped(*args, **kwargs):
        raise NotImplementedError(f.__name__)
    return wrapped

class Corpus:
    """
    Interface shared by every corpus backend.

    All the methods below are decorated with `abstract`: calling one of them
    on a class which did not override it raises NotImplementedError with the
    name of the method.
    """
    @abstract
    def __init__(self):
        # `self` is declared explicitly: this is an instance method even
        # though the `abstract` wrapper accepts any argument list.
        pass

    @abstract
    def get_text(self, primary_key):
        """
        Retrieve the text designated by `primary_key`.

        :param primary_key: the identifier of the text within the corpus
        """
        pass

    @abstract
    def get_all(self):
        """
        Enumerate the content of the corpus.
        """
        pass

    @abstract
    def save(self, iterator):
        """
        Store the records produced by `iterator` in the corpus.

        :param iterator: an iterable of records to persist
        """
        pass

class TSVIndexed(Corpus):
    """
    Base class for the corpora described by a TSV table.

    The table is read lazily (see `load`) and indexed by its key columns.
    Subclasses must provide `content`, which tells where the text of a given
    row comes from.
    """
    # Columns always used as (part of) the primary key.
    default_keys = ['work', 'volume', 'article']
    # Names of the methods which can be passed as a string to `get_all`.
    projectors = ['key', 'content', 'full']

    def __init__(self, tsv_path, column_name):
        """
        Positional arguments
        :param tsv_path: the path to the TSV table
        :param column_name: the name under which the text content appears in
        the records
        """
        self.tsv_path = tsv_path
        self.column_name = column_name
        # Filled on first use by `load` (or by `save` in the subclasses).
        self.data = None

    def load(self):
        """
        Read the TSV table and index it by its key columns, unless some data
        is already held in memory.
        """
        if self.data is not None:
            return
        self.data = tabular(self.tsv_path)
        self.detect_keys()
        # drop=False: the key columns remain available as regular columns,
        # which `key` relies on.
        self.data = self.data.set_index(self.keys, drop=False)

    def detect_keys(self):
        """
        Compute `self.keys` from `self.data`: the default keys, followed by
        'paragraph' when the data has such a column.
        """
        # Work on a copy so the class-level list is never modified.
        self.keys = self.default_keys.copy()
        if 'paragraph' in self.data:
            self.keys.append('paragraph')

    @abstract
    def content(self, key, row):
        """
        Return the text content associated to a row (to be implemented by
        the subclasses).

        :param key: the index label of the row
        :param row: the row itself
        """
        pass

    def key(self, _, row):
        """
        Project a row on its key columns.

        :param row: the row to project
        :return: a dict mapping each key column to its value in `row`
        """
        return row[self.keys].to_dict()

    def full(self, key, row):
        """
        Project a row on its key columns plus its text content.

        :param key: the index label of the row
        :param row: the row to project
        :return: the dict returned by `key`, extended with the stripped text
        content stored under `self.column_name`
        """
        return {**self.key(key, row),
                self.column_name: self.content(key, row).strip()}

    def get_all(self, projector=None, where=None):
        """
        Generate a projection of every row of the corpus.

        Keyword arguments
        :param projector: either a callable taking the index label and the
        row, or one of the names listed in `projectors`; defaults to `full`
        :param where: an optional predicate taking the index label and the
        row; only the rows for which it is true are generated
        :raises TypeError: if `projector` is a string which does not name a
        known projector
        """
        if projector is None:
            projector = self.full
        elif isinstance(projector, str):
            if projector not in self.projectors:
                # Previously such a string went through unchanged and only
                # failed when called on the first row; TypeError is kept so
                # the exception class is unchanged.
                raise TypeError(
                    f"unknown projector {projector!r}, expected a callable"
                    f" or one of {self.projectors}")
            projector = getattr(self, projector)
        self.load()
        for key, row in self.data.iterrows():
            if where is None or where(key, row):
                yield projector(key, row)

class SelfContained(TSVIndexed):
    """
    A corpus entirely stored in a single TSV file: each line holds the key
    columns along with the text content itself.
    """
    def __init__(self, tsv_path, column_name='content'):
        """
        Positional arguments
        :param tsv_path: the path to a TSV dataset containing a primary key and
        a text content on every line

        Keyword arguments
        :param column_name: the name of the column where the text content is
        stored
        """
        TSVIndexed.__init__(self, tsv_path, column_name)

    def get_text(self, primary_key):
        """
        Look a text up by its primary key.

        :param primary_key: either a dict (the values of the key columns it
        contains are used, in the order of `self.keys`) or any iterable of
        key values
        :return: the `self.column_name` part of the cross-section of the
        table selected by the key
        """
        self.load()
        if isinstance(primary_key, dict):
            primary_key = [primary_key[k] for k in self.keys if k in primary_key]
        # The key is always handed over to `xs` as a plain tuple.
        primary_key = tuple(primary_key)
        return self.data.xs(primary_key)[self.column_name]

    def content(self, _, row):
        """
        Return the text content of a row, read from its `self.column_name`
        column.
        """
        return row[self.column_name]

    def save(self, iterator):
        """
        Replace the content of the corpus by the given records and write it
        to `self.tsv_path`.

        :param iterator: an iterable of records, passed to pandas.DataFrame
        """
        frame = pandas.DataFrame(iterator)
        # `detect_keys` inspects `self.data`, so it must be set beforehand.
        self.data = frame
        self.detect_keys()
        toTSV(self.tsv_path, frame)
        # `load` does nothing once `self.data` is set, so the in-memory table
        # must be indexed here, the same way `load` does; otherwise a
        # `get_text` following a `save` would query a default integer index.
        # Frames lacking the key columns (e.g. empty ones) are left as is.
        if set(self.keys).issubset(frame.columns):
            self.data = frame.set_index(self.keys, drop=False)

class Directory(TSVIndexed):
    """
    A corpus stored as a folder: a TSV table lists the keys of the records,
    and the text of each record lives in its own file under the `Text`
    subfolder, at the path computed by `relativePath`.
    """
    def __init__(self, root_path, tsv_filename="files", column_name='content'):
        """
        Positional arguments
        :param root_path: the path to a GÉODE-style folder containing the text
        version of the corpus

        Keyword arguments
        :param tsv_filename: the name (without extension) of the TSV table
        inside `root_path`
        :param column_name: the name under which the text content appears in
        the records
        """
        self.text_path = f"{root_path}/Text"
        TSVIndexed.__init__(self, f"{root_path}/{tsv_filename}.tsv", column_name)

    def path_to(self, primary_key):
        """
        Compute the path of the text file associated to a primary key.

        :param primary_key: anything accepted by `dict_primary_key`
        :return: the path of the 'txt' file under `self.text_path`
        """
        record = self.dict_primary_key(primary_key)
        return f"{self.text_path}/{relativePath(record, 'txt')}"

    def dict_primary_key(self, primary_key):
        """
        Normalise a primary key into a dict.

        :param primary_key: a pandas Series (converted with `dict`), a dict
        (returned unchanged) or any other value, which is delegated to
        `fromKey`
        """
        if isinstance(primary_key, pandas.Series):
            return dict(primary_key)
        if isinstance(primary_key, dict):
            return primary_key
        return fromKey(primary_key)

    def get_text(self, primary_key):
        """
        Read the text file associated to a primary key.

        :param primary_key: anything accepted by `dict_primary_key`
        :return: the whole content of the file
        """
        # NOTE(review): no encoding is given, so the platform default applies
        # both here and in `write_text` — confirm this is intended.
        with open(self.path_to(primary_key), 'r') as file:
            return file.read()

    def content(self, key, _):
        """
        Return the text content of a row, read from the file designated by
        its index label.
        """
        return self.get_text(key)

    def write_text(self, primary_key, content):
        """
        Write `content` to the text file associated to a primary key,
        creating the missing parent folders.

        :param primary_key: anything accepted by `dict_primary_key`
        :param content: the text to write
        """
        path = self.path_to(primary_key)
        makedirs(dirname(path), exist_ok=True)
        with open(path, 'w') as file:
            file.write(content)

    def save(self, iterator):
        """
        Replace the content of the corpus by the given records: each text is
        written to its own file and the key columns to the TSV table.

        :param iterator: an iterable of records, passed to pandas.DataFrame
        """
        frame = pandas.DataFrame(iterator)
        # `detect_keys` inspects `self.data`, so it must be set beforehand.
        self.data = frame
        self.detect_keys()
        for _, row in frame.iterrows():
            self.write_text(row, row[self.column_name])
        # Only the keys go to the table, the texts live in the files.
        toTSV(self.tsv_path, frame[self.keys])
        # `load` does nothing once `self.data` is set, so the in-memory table
        # must be indexed here, the same way `load` does; otherwise `get_all`
        # would pass integer row labels instead of keys to `content`.
        self.data = frame.set_index(self.keys, drop=False)

def corpus(path, **kwargs):
    """
    Build the corpus object matching the kind of `path`.

    :param path: a path ending with '/' or designating an existing folder
    yields a Directory, a path ending with '.tsv' yields a SelfContained
    :param kwargs: forwarded unchanged to the chosen constructor
    :raises FileNotFoundError: when `path` matches neither case
    """
    # Folders take precedence, even when their name ends with '.tsv'.
    is_folder = path[-1:] == '/' or isdir(path)
    if is_folder:
        return Directory(path, **kwargs)
    if path[-4:] != '.tsv':
        raise FileNotFoundError(path)
    return SelfContained(path, **kwargs)