diff --git a/scripts/ML/Corpus.py b/scripts/ML/Corpus.py
new file mode 100644
index 0000000000000000000000000000000000000000..b01a7c8181c49a64ec9ef433b0011b154c7b0e8a
--- /dev/null
+++ b/scripts/ML/Corpus.py
@@ -0,0 +1,140 @@
+import pandas
+import os
+
+def abstract(f):
+    def wrapped(*args, **kwargs):
+        raise NotImplementedError(f.__name__)
+    return wrapped
+
+class Corpus:
+    @abstract
+    def __init__(self):
+        pass
+
+    @abstract
+    def get_text(self, primary_key):
+        pass
+
+    @abstract
+    def get_all(self):
+        pass
+
+    @abstract
+    def save(self, iterator):
+        pass
+
+class TSVIndexed(Corpus):
+    default_keys = ['work', 'volume', 'article']
+    def __init__(self, tsv_path, column_name):
+        self.tsv_path = tsv_path
+        self.column_name = column_name
+        self.data = None
+
+    def load(self):
+        if self.data is None:
+            self.data = pandas.read_csv(self.tsv_path, sep='\t')
+            self.detect_keys()
+            self.data = self.data.set_index(self.keys, drop=False)
+
+    def detect_keys(self):
+        self.keys = self.default_keys.copy()
+        if 'paragraph' in self.data:
+            self.keys.append('paragraph')
+
+    @abstract
+    def get_content(self, key, row):
+        pass
+
+    def get_all(self):
+        self.load()
+        for key, row in self.data.iterrows():
+            keys = self.keys + [self.column_name]
+            values = key + (self.get_content(key, row).strip() + '\n',)
+            yield dict(zip(keys, values))
+
+class SelfContained(TSVIndexed):
+    """
+    A corpus stored in a single TSV dataset where each line carries both the
+    primary key and the text content of a record
+    """
+    def __init__(self, tsv_path, column_name='content'):
+        """
+        Positional arguments
+        :param tsv_path: the path to a TSV dataset containing a primary key and
+        the text content on every line
+
+        Keyword arguments
+        :param column_name: the name of the column where the text content is
+        stored
+        """
+        TSVIndexed.__init__(self, tsv_path, column_name)
+
+    def get_text(self, primary_key):
+        self.load()
+        if type(primary_key) == dict:
+            primary_key = [primary_key[k] for k in self.keys if k in primary_key]
+        if type(primary_key) != tuple:
+            primary_key = tuple(primary_key)
+        return self.data.xs(primary_key)[self.column_name]
+
+    def get_content(self, _, row):
+        return row[self.column_name]
+
+    def save(self, iterator):
+        self.data = pandas.DataFrame(iterator)
+        self.detect_keys()
+        self.data.to_csv(self.tsv_path, sep='\t', index=False)
+
+class Directory(TSVIndexed):
+    """
+    A corpus laid out as a GÉODE-style directory tree: a files.tsv index and
+    one text file per record, loaded lazily when records are needed
+    """
+    def __init__(self, root_path, column_name='content'):
+        """
+        Positional arguments
+        :param root_path: the path to a GÉODE-style folder containing the text
+        version of the corpus on which to predict the classes
+        """
+        self.text_path = f"{root_path}/Text"
+        TSVIndexed.__init__(self, f"{root_path}/files.tsv", column_name)
+
+    def path_to(self, primary_key):
+        record = self.dict_primary_key(primary_key)
+        article_relative_path = "{work}/T{volume}/{article}".format(**record)
+        prefix = f"{self.text_path}/{article_relative_path}"
+        if 'paragraph' in record:
+            return f"{prefix}/{record['paragraph']}.txt"
+        else:
+            return f"{prefix}.txt"
+
+    def dict_primary_key(self, primary_key):
+        if type(primary_key) == pandas.core.series.Series:
+            return dict(primary_key)
+        elif type(primary_key) != dict:
+            keys = self.default_keys.copy()
+            if len(primary_key) == 4:
+                keys.append('paragraph')
+            return dict(zip(keys, primary_key))
+        else:
+            return primary_key
+
+    def get_text(self, primary_key):
+        with open(self.path_to(primary_key), 'r') as file:
+            return file.read()
+
+    def get_content(self, key, _):
+        return self.get_text(key)
+
+    def write_text(self, primary_key, content):
+        path = self.path_to(primary_key)
+        os.makedirs(os.path.dirname(path), exist_ok=True)
+        with open(path, 'w') as file:
+            file.write(content)
+
+    def save(self, iterator):
+        self.data = pandas.DataFrame(iterator)
+        self.detect_keys()
+        for _, row in self.data.iterrows():
+            self.write_text(row, row[self.column_name])
+        self.data[self.keys].to_csv(self.tsv_path, sep='\t', index=False)
diff --git a/scripts/ML/Source.py b/scripts/ML/Source.py
deleted file mode 100644
index 88007605267cc83c48925f0a424c57bfc8e79419..0000000000000000000000000000000000000000
--- a/scripts/ML/Source.py
+++ /dev/null
@@ -1,29 +0,0 @@
-class Source:
-    """
-    A class to handle the normalised path used in the project and loading the
-    actual text input as a generator from records when they are needed
-    """
-    def __init__(self, root_path):
-        """
-        Positional arguments
-        :param root_path: the path to a GÉODE-style folder containing the text
-        version of the corpus on which to predict the classes
-        """
-        self.root_path = root_path
-
-    def path_to(self, record):
-        article_relative_path = "{work}/T{volume}/{article}".format(**record)
-        prefix = f"{self.root_path}/{article_relative_path}"
-        if 'paragraph' in record:
-            return f"{prefix}/{record.paragraph}.txt"
-        else:
-            return f"{prefix}.txt"
-
-    def load_text(self, record):
-        with open(self.path_to(record), 'r') as file:
-            return file.read()
-
-    def iterate(self, records):
-        for _, record in records.iterrows():
-            yield self.load_text(record)
-
diff --git a/scripts/ML/convert-corpus.py b/scripts/ML/convert-corpus.py
new file mode 100755
index 0000000000000000000000000000000000000000..98135d464ab54b81efe9eade5f297b30670788a1
--- /dev/null
+++ b/scripts/ML/convert-corpus.py
@@ -0,0 +1,15 @@
+#!/usr/bin/env python3
+import Corpus
+from os.path import isdir
+import sys
+
+def detect(path):
+    if isdir(path):
+        return Corpus.Directory(path)
+    else:
+        return Corpus.SelfContained(path)
+
+if __name__ == '__main__':
+    source = detect(sys.argv[1])
+    destination = detect(sys.argv[2])
+    destination.save(source.get_all())
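
Usage note: Directory and SelfContained expose the same get_all()/save() interface, so converting between the two layouts is just a matter of piping one into the other, which is all convert-corpus.py does, e.g. ./convert-corpus.py EDdA-directory EDdA.tsv (both paths are placeholders). Below is a minimal sketch of the same round trip from Python; the paths and the (work, volume, article) key values are purely illustrative, not files shipped with this repository.

    # Sketch only: '/data/EDdA' and '/data/EDdA.tsv' are hypothetical paths,
    # and ('EDdA', 1, 3) is an invented (work, volume, article) primary key.
    import Corpus

    source = Corpus.Directory('/data/EDdA')               # expects /data/EDdA/files.tsv and /data/EDdA/Text/
    destination = Corpus.SelfContained('/data/EDdA.tsv')
    destination.save(source.get_all())                    # streams every record into one TSV, text inlined

    # Re-open the TSV to query it: load() indexes the frame on the detected keys,
    # so a record can be fetched back by its primary key tuple.
    reader = Corpus.SelfContained('/data/EDdA.tsv')
    print(reader.get_text(('EDdA', 1, 3)))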