from functools import wraps
from os import makedirs
from os.path import dirname, isdir

import pandas

from GEODE.Metadata import fromKey, relativePath
from GEODE.store.TSV import toTSV


def abstract(f):
    """
    Decorator marking a method as abstract.

    The decorated function's body is never run: calling it always raises
    NotImplementedError carrying the name of the function, so subclasses have
    to override it.
    """
    @wraps(f)  # keep the name/docstring of the decorated method
    def wrapped(*args, **kwargs):
        raise NotImplementedError(f.__name__)
    return wrapped


class Corpus:
    """
    Interface shared by all corpus back-ends. Every method raises
    NotImplementedError until a subclass overrides it.
    """

    @abstract
    def __init__(self):
        pass

    @abstract
    def get_text(self, primary_key):
        pass

    @abstract
    def get_all(self):
        pass

    @abstract
    def save(self, iterator):
        pass


class TSVIndexed(Corpus):
    """
    A corpus whose records are listed in a TSV file, lazily loaded into a
    pandas DataFrame indexed by the primary-key columns.
    """

    # Columns always part of the primary key ('paragraph' is added by
    # detect_keys when that column exists in the data)
    default_keys = ['work', 'volume', 'article']
    # Names of the methods that get_all accepts as a string projector
    projectors = ['key', 'content', 'full']

    def __init__(self, tsv_path, column_name):
        """
        Positional arguments
        :param tsv_path: the path to the TSV file listing the records
        :param column_name: the name under which the text content is stored
        """
        self.tsv_path = tsv_path
        self.column_name = column_name
        self.data = None

    def load(self):
        """
        Read the TSV file and index it by the primary key, unless some data
        is already held in memory (in which case nothing happens).
        """
        if self.data is not None:
            return
        self.data = pandas.read_csv(self.tsv_path, sep='\t')
        self.detect_keys()
        # drop=False keeps the key columns available in each row
        self.data = self.data.set_index(self.keys, drop=False)

    def detect_keys(self):
        """
        Set self.keys to the default keys, plus 'paragraph' when the current
        data has a column with that name.
        """
        self.keys = self.default_keys.copy()
        if 'paragraph' in self.data:
            self.keys.append('paragraph')

    @abstract
    def content(self, key, row):
        pass

    def key(self, _, row):
        """Projector returning the primary-key columns of a row as a dict."""
        return row[self.keys].to_dict()

    def full(self, key, row):
        """
        Projector returning the primary key of a row as a dict, extended with
        its stripped text content under the name self.column_name.
        """
        record = self.key(key, row)
        record[self.column_name] = self.content(key, row).strip()
        return record

    def get_all(self, projector=None, where=None):
        """
        Generate the projection of every record of the corpus.

        Keyword arguments
        :param projector: a callable taking (key, row), or one of the names
        listed in self.projectors; defaults to self.full
        :param where: an optional predicate taking (key, row); only the
        records for which it is true are generated

        Raises ValueError (on first iteration, since this is a generator)
        when projector is a string which isn't a known projector name.
        """
        if projector is None:
            projector = self.full
        elif isinstance(projector, str):
            if projector not in self.projectors:
                # Fail explicitly instead of trying to call the string later
                raise ValueError(
                    f"unknown projector {projector!r}, "
                    f"expected one of {self.projectors}")
            projector = getattr(self, projector)
        self.load()
        for key, row in self.data.iterrows():
            if where is None or where(key, row):
                yield projector(key, row)


class SelfContained(TSVIndexed):
    """
    A class to handle the dataset TSV normalised path used in the project and
    loading the actual text input as a generator from records when they are
    needed
    """

    def __init__(self, tsv_path, column_name='content'):
        """
        Positional arguments
        :param tsv_path: the path to a TSV dataset containing a primary key
        and a text content on every line

        Keyword arguments
        :param column_name: the name of the column where the text content is
        stored
        """
        TSVIndexed.__init__(self, tsv_path, column_name)

    def get_text(self, primary_key):
        """
        Return the content column of the record(s) matching primary_key.

        :param primary_key: a tuple (or any iterable) of key values in the
        order of self.keys, or a dict / pandas Series from which the values
        of the key columns it contains are extracted
        """
        self.load()
        if isinstance(primary_key, (dict, pandas.Series)):
            primary_key = [primary_key[k]
                           for k in self.keys if k in primary_key]
        return self.data.xs(tuple(primary_key))[self.column_name]

    def content(self, _, row):
        """Projector returning the text stored in the row itself."""
        return row[self.column_name]

    def save(self, iterator):
        """
        Build the dataset from an iterable of records and write it to
        self.tsv_path.
        """
        self.data = pandas.DataFrame(iterator)
        self.detect_keys()
        toTSV(self.tsv_path, self.data)
        # load() does nothing once self.data is set, so index the frame here
        # the same way load() does to keep get_text / get_all usable after a
        # save (skipped when the key columns are missing, e.g. no records)
        if set(self.keys).issubset(self.data.columns):
            self.data = self.data.set_index(self.keys, drop=False)


class Directory(TSVIndexed):
    """
    A class to handle the normalised path used in the project and loading the
    actual text input as a generator from records when they are needed
    """

    def __init__(self, root_path, tsv_filename="files", column_name='content'):
        """
        Positional arguments
        :param root_path: the path to a GÉODE-style folder containing the text
        version of the corpus on which to predict the classes

        Keyword arguments
        :param tsv_filename: the name (without extension) of the TSV file
        under root_path listing the records
        :param column_name: the name under which the text content is exposed
        """
        self.text_path = f"{root_path}/Text"
        TSVIndexed.__init__(self, f"{root_path}/{tsv_filename}.tsv", column_name)

    def path_to(self, primary_key):
        """Return the path of the text file holding the given record."""
        record = self.dict_primary_key(primary_key)
        return f"{self.text_path}/{relativePath(record, 'txt')}"

    def dict_primary_key(self, primary_key):
        """
        Normalise a primary key to a dict: a pandas Series is converted, a
        dict is returned as is, anything else is handed to fromKey.
        """
        if isinstance(primary_key, pandas.Series):
            return dict(primary_key)
        if isinstance(primary_key, dict):
            return primary_key
        return fromKey(primary_key)

    def get_text(self, primary_key):
        """Read and return the whole text file of the given record."""
        # Explicit encoding so the result doesn't depend on the locale
        with open(self.path_to(primary_key), 'r', encoding='utf-8') as file:
            return file.read()

    def content(self, key, _):
        """Projector reading the text of a record from its file."""
        return self.get_text(key)

    def write_text(self, primary_key, content):
        """
        Write content to the text file of the given record, creating the
        parent directories when needed.
        """
        path = self.path_to(primary_key)
        makedirs(dirname(path), exist_ok=True)
        with open(path, 'w', encoding='utf-8') as file:
            file.write(content)

    def save(self, iterator):
        """
        Build the dataset from an iterable of records, write the text of each
        one to its own file and the primary keys only to self.tsv_path.
        """
        self.data = pandas.DataFrame(iterator)
        self.detect_keys()
        for _, row in self.data.iterrows():
            self.write_text(row, row[self.column_name])
        toTSV(self.tsv_path, self.data[self.keys])
        # load() does nothing once self.data is set, so index the frame here
        # the same way load() does to keep get_all usable after a save
        self.data = self.data.set_index(self.keys, drop=False)


def corpus(path, **kwargs):
    """
    Build the corpus handler fitting the given path: a Directory for a path
    ending with '/' or pointing to an existing directory, a SelfContained for
    a path ending with '.tsv'. The keyword arguments are passed on to the
    constructor.

    Raises FileNotFoundError for any other path.
    """
    if path.endswith('/') or isdir(path):
        return Directory(path, **kwargs)
    if path.endswith('.tsv'):
        return SelfContained(path, **kwargs)
    raise FileNotFoundError(path)