Add a tool to bridge the gap between corpora stored as directories and as datasets

cb64e745 · Alice Brenon · 87022d9f · cb64e745 · 87022d9f · cb64e745
Commit cb64e745 authored 1 year ago by Alice Brenon
--- a/scripts/ML/Corpus.py
+++ b/scripts/ML/Corpus.py
+import functools
+import os
+import pandas
+def abstract(f):
+    """
+    Mark a function as abstract: it is replaced by a stub which raises
+    NotImplementedError (with the name of the function as message) on every
+    call, whatever the arguments
+    Positional arguments
+    :param f: the function or method to mark as abstract
+    :return: the stub, which keeps the name and docstring of f
+    """
+    # wraps() copies __name__ / __doc__ so the stub is still identifiable
+    @functools.wraps(f)
+    def wrapped(*args, **kwargs):
+        raise NotImplementedError(f.__name__)
+    return wrapped
+class Corpus:
+    """
+    The interface shared by the corpus classes: every method below is wrapped
+    by @abstract, so calling it raises NotImplementedError until a subclass
+    overrides it
+    """
+    @abstract
+    def __init__(self):
+        pass
+    @abstract
+    def get_text(self, primary_key):
+        """
+        Positional arguments
+        :param primary_key: the key identifying the text to retrieve
+        """
+        pass
+    @abstract
+    def get_all(self):
+        """
+        Enumerate the records of the corpus
+        """
+        pass
+    @abstract
+    def save(self, iterator):
+        """
+        Positional arguments
+        :param iterator: the records to store in the corpus
+        """
+        pass
+class TSVIndexed(Corpus):
+    """
+    A corpus whose records are listed in a TSV file, read lazily into a pandas
+    DataFrame indexed by the primary key columns
+    """
+    # columns always used in the primary key; 'paragraph' is added by
+    # detect_keys when the data has such a column
+    default_keys = ['work', 'volume', 'article']
+    def __init__(self, tsv_path, column_name):
+        """
+        Positional arguments
+        :param tsv_path: the path to the TSV file
+        :param column_name: the name of the key under which the text content
+        appears in the records
+        """
+        self.tsv_path = tsv_path
+        self.column_name = column_name
+        self.data = None
+    def load(self):
+        """
+        Read the TSV file and index it by primary key, unless some data is
+        already held by the instance
+        """
+        if self.data is not None:
+            return
+        self.data = pandas.read_csv(self.tsv_path, sep='\t')
+        self.detect_keys()
+        # drop=False keeps the key columns available in each row too
+        self.data = self.data.set_index(self.keys, drop=False)
+    def detect_keys(self):
+        """
+        Set self.keys to the default key columns, plus 'paragraph' when the
+        current data has a column of that name
+        """
+        self.keys = self.default_keys.copy()
+        if 'paragraph' in self.data:
+            self.keys.append('paragraph')
+    @abstract
+    def get_content(self, key, row):
+        """
+        Positional arguments
+        :param key: the primary key of the record, as found in the index
+        :param row: the row of the record in the data
+        """
+        pass
+    def get_all(self):
+        """
+        Generate one dict per record, mapping the key columns to their values
+        and column_name to the text content, stripped then terminated by a
+        single newline
+        """
+        self.load()
+        # the same for every record, so built once before the loop
+        columns = self.keys + [self.column_name]
+        for key, row in self.data.iterrows():
+            content = self.get_content(key, row).strip() + '\n'
+            yield dict(zip(columns, key + (content,)))
+class SelfContained(TSVIndexed):
+    """
+    A class to handle a corpus stored as a single TSV dataset, each line
+    holding both the primary key of a text and the text content itself
+    """
+    def __init__(self, tsv_path, column_name='content'):
+        """
+        Positional arguments
+        :param tsv_path: the path to a TSV dataset containing a primary key and
+        a text content on every line
+        Keyword arguments
+        :param column_name: the name of the column where the text content is
+        stored
+        """
+        super().__init__(tsv_path, column_name)
+    def get_text(self, primary_key):
+        """
+        Positional arguments
+        :param primary_key: the key to look up, either a dict (read by column
+        name, entries which are not key columns being ignored) or a sequence
+        of values in the order of the key columns
+        :return: what the content column holds for that key
+        """
+        self.load()
+        # isinstance also accepts dict subclasses, which would otherwise be
+        # turned into a tuple of their keys instead of their values
+        if isinstance(primary_key, dict):
+            primary_key = [primary_key[k] for k in self.keys if k in primary_key]
+        return self.data.xs(tuple(primary_key))[self.column_name]
+    def get_content(self, _, row):
+        """
+        Positional arguments
+        :param _: the primary key of the record (unused, the row is enough)
+        :param row: the row of the record in the data
+        :return: the value of the content column in that row
+        """
+        return row[self.column_name]
+    def save(self, iterator):
+        """
+        Positional arguments
+        :param iterator: the records (dicts) to write to the TSV file, which
+        also become the data held by the instance
+        """
+        self.data = pandas.DataFrame(iterator)
+        self.detect_keys()
+        self.data.to_csv(self.tsv_path, sep='\t', index=False)
+        # index the data the way load() does, so that get_text and get_all
+        # keep working on this instance after saving; skipped when a key
+        # column is missing (e.g. nothing was saved)
+        if all(key in self.data for key in self.keys):
+            self.data = self.data.set_index(self.keys, drop=False)
+class Directory(TSVIndexed):
+    """
+    A class to handle a corpus stored as a folder: a 'files.tsv' file listing
+    the primary keys and a 'Text' subfolder with one text file per record
+    """
+    def __init__(self, root_path, column_name='content'):
+        """
+        Positional arguments
+        :param root_path: the path to a GÉODE-style folder containing the text
+        version of the corpus on which to predict the classes
+        Keyword arguments
+        :param column_name: the name of the key under which the text content
+        appears in the records
+        """
+        self.text_path = f"{root_path}/Text"
+        super().__init__(f"{root_path}/files.tsv", column_name)
+    def path_to(self, primary_key):
+        """
+        Positional arguments
+        :param primary_key: the key of a text, in any form accepted by
+        dict_primary_key
+        :return: the path of the file for that text, one level deeper when the
+        key has a 'paragraph' component
+        """
+        record = self.dict_primary_key(primary_key)
+        article_relative_path = "{work}/T{volume}/{article}".format(**record)
+        prefix = f"{self.text_path}/{article_relative_path}"
+        if 'paragraph' in record:
+            return f"{prefix}/{record['paragraph']}.txt"
+        return f"{prefix}.txt"
+    def dict_primary_key(self, primary_key):
+        """
+        Positional arguments
+        :param primary_key: a dict (returned as is), a pandas Series (converted
+        to a dict) or a sequence of values in the order of the key columns, a
+        4th value being taken as the paragraph
+        :return: the key as a dict
+        """
+        if isinstance(primary_key, dict):
+            return primary_key
+        if isinstance(primary_key, pandas.Series):
+            return dict(primary_key)
+        keys = self.default_keys.copy()
+        if len(primary_key) == 4:
+            keys.append('paragraph')
+        return dict(zip(keys, primary_key))
+    def get_text(self, primary_key):
+        """
+        Positional arguments
+        :param primary_key: the key of the text to read
+        :return: the whole content of the corresponding file
+        """
+        # explicit encoding: do not depend on the locale of the machine
+        with open(self.path_to(primary_key), 'r', encoding='utf-8') as file:
+            return file.read()
+    def get_content(self, key, _):
+        """
+        Positional arguments
+        :param key: the primary key of the record
+        :param _: the row of the record (unused, the text is read from its file)
+        :return: the content of the file for that key
+        """
+        return self.get_text(key)
+    def write_text(self, primary_key, content):
+        """
+        Positional arguments
+        :param primary_key: the key of the text to write
+        :param content: the text to put in the corresponding file
+        """
+        path = self.path_to(primary_key)
+        # the folders leading to the file are created when missing
+        os.makedirs(os.path.dirname(path), exist_ok=True)
+        with open(path, 'w', encoding='utf-8') as file:
+            file.write(content)
+    def save(self, iterator):
+        """
+        Positional arguments
+        :param iterator: the records (dicts) to store: the content of each one
+        goes to its own text file and the key columns to the TSV file
+        """
+        self.data = pandas.DataFrame(iterator)
+        self.detect_keys()
+        for _, row in self.data.iterrows():
+            self.write_text(row, row[self.column_name])
+        self.data[self.keys].to_csv(self.tsv_path, sep='\t', index=False)
+        # index the data the way load() does, so that get_all keeps working
+        # on this instance after saving
+        self.data = self.data.set_index(self.keys, drop=False)
--- a/scripts/ML/Source.py
+++ b/scripts/ML/Source.py
-class Source:
-    """
-    A class to handle the normalised path used in the project and loading the
-    actual text input as a generator from records when they are needed
-    """
-    def __init__(self, root_path):
-        """
-        Positional arguments
-        :param root_path: the path to a GÉODE-style folder containing the text
-        version of the corpus on which to predict the classes
-        """
-        self.root_path = root_path
-    def path_to(self, record):
-        article_relative_path = "{work}/T{volume}/{article}".format(**record)
-        prefix = f"{self.root_path}/{article_relative_path}"
-        if 'paragraph' in record:
-            return f"{prefix}/{record.paragraph}.txt"
-        else:
-            return f"{prefix}.txt"
-    def load_text(self, record):
-        with open(self.path_to(record), 'r') as file:
-            return file.read()
-    def iterate(self, records):
-        for _, record in records.iterrows():
-            yield self.load_text(record)
--- a/scripts/ML/convert-corpus.py
+++ b/scripts/ML/convert-corpus.py
+#!/usr/bin/env python3
+import Corpus
+from os.path import isdir
+import sys
+def detect(path):
+    """
+    Pick the corpus class matching a path: Corpus.Directory when the path is
+    an existing directory, Corpus.SelfContained in every other case (which
+    includes paths that do not exist yet)
+    Positional arguments
+    :param path: the path to the corpus
+    :return: an instance of the chosen class built on that path
+    """
+    corpus_class = Corpus.Directory if isdir(path) else Corpus.SelfContained
+    return corpus_class(path)
+if __name__ == '__main__':
+    # both paths are needed: exit with a usage message instead of an
+    # IndexError when one is missing (extra arguments are still ignored)
+    if len(sys.argv) < 3:
+        sys.exit(f"Usage: {sys.argv[0]} SOURCE DESTINATION")
+    source = detect(sys.argv[1])
+    destination = detect(sys.argv[2])
+    # get_all() is a generator, consumed by save() on the destination side
+    destination.save(source.get_all())