Skip to content
Snippets Groups Projects
Commit cb64e745 authored by Alice Brenon's avatar Alice Brenon
Browse files

Add a tool to level the gap between corpora stored as directories and as datasets

parent 87022d9f
No related branches found
No related tags found
No related merge requests found
import pandas
import os
def abstract(f):
    """
    Mark a function as abstract: calling the result always fails

    Positional arguments
    :param f: the function to mark as abstract; its body is never executed

    :return: a replacement function accepting any arguments and raising
    NotImplementedError with the name of f as its message
    """
    # Imported locally so that this decorator does not depend on the
    # module-level import block
    from functools import wraps

    # wraps copies the name and docstring of f onto the replacement, which
    # keeps tracebacks and help() readable
    @wraps(f)
    def wrapped(*args, **kwargs):
        raise NotImplementedError(f.__name__)
    return wrapped
class Corpus:
    """
    The interface shared by every corpus storage format

    Every method below is replaced by the abstract decorator, so each one
    raises NotImplementedError until a subclass overrides it (this includes
    the constructor: Corpus itself cannot be instantiated).
    """
    @abstract
    def __init__(self):
        # 'self' is declared like on any other method; the body is never run
        pass

    @abstract
    def get_text(self, primary_key):
        """
        Positional arguments
        :param primary_key: the key identifying one record of the corpus
        """
        pass

    @abstract
    def get_all(self):
        """
        Iterate over the records of the corpus
        """
        pass

    @abstract
    def save(self, iterator):
        """
        Positional arguments
        :param iterator: the records to store in the corpus
        """
        pass
class TSVIndexed(Corpus):
    """
    A corpus whose records are listed in a TSV file and identified by a
    primary key made of the 'work', 'volume' and 'article' columns, plus
    'paragraph' when that column is present

    Subclasses must implement get_content to say where the text of a record
    comes from.
    """
    # The columns which are always part of the primary key
    default_keys = ['work', 'volume', 'article']

    def __init__(self, tsv_path, column_name):
        """
        Positional arguments
        :param tsv_path: the path to the TSV file
        :param column_name: the name under which the text content of a
        record is exposed
        """
        self.tsv_path = tsv_path
        self.column_name = column_name
        # The TSV file is only read on first use, see load
        self.data = None

    def load(self):
        """
        Read the TSV file and index it by primary key, unless some data is
        already present
        """
        if self.data is None:
            self.data = pandas.read_csv(self.tsv_path, sep='\t')
            self.detect_keys()
            # drop=False keeps the key columns available in each row
            self.data = self.data.set_index(self.keys, drop=False)

    def detect_keys(self):
        """
        Set self.keys to the primary key columns matching the current data
        """
        self.keys = self.default_keys.copy()
        if 'paragraph' in self.data:
            self.keys.append('paragraph')

    @abstract
    def get_content(self, key, row):
        """
        Positional arguments
        :param key: the primary key of a record (its index entry)
        :param row: the corresponding row of the data
        """
        pass

    def get_all(self):
        """
        Yield every record as a dict mapping the primary key columns to
        their values and column_name to the text content, stripped and
        terminated by a single newline
        """
        self.load()
        # The output columns are the same for every row: build them once
        keys = self.keys + [self.column_name]
        for key, row in self.data.iterrows():
            content = self.get_content(key, row).strip() + '\n'
            yield dict(zip(keys, key + (content,)))
class SelfContained(TSVIndexed):
    """
    A corpus stored as a single TSV dataset which holds, on every line, both
    the primary key of a record and its text content
    """
    def __init__(self, tsv_path, column_name='content'):
        """
        Positional arguments
        :param tsv_path: the path to a TSV dataset containing a primary key and
        a text content on every line
        Keyword arguments
        :param column_name: the name of the column where the text content is
        stored
        """
        TSVIndexed.__init__(self, tsv_path, column_name)

    def get_text(self, primary_key):
        """
        Positional arguments
        :param primary_key: the key of the record, either as a dict indexed
        by key column names or as a sequence of values in the order of
        self.keys

        :return: the value of the content column for that key
        """
        self.load()
        # isinstance also accepts dict subclasses, which an exact type
        # comparison would have turned into a tuple of their keys below
        if isinstance(primary_key, dict):
            primary_key = [primary_key[k] for k in self.keys if k in primary_key]
        # A no-op on an actual tuple, a conversion for anything else
        primary_key = tuple(primary_key)
        return self.data.xs(primary_key)[self.column_name]

    def get_content(self, _, row):
        """
        Return the text content found in the row itself
        """
        return row[self.column_name]

    def save(self, iterator):
        """
        Positional arguments
        :param iterator: the records to write, each providing the primary key
        columns and the content column
        """
        self.data = pandas.DataFrame(iterator)
        self.detect_keys()
        self.data.to_csv(self.tsv_path, sep='\t', index=False)
        # Index the data the way load does, so that get_text and get_all keep
        # working on this object after a save; skipped when the key columns
        # are missing (e.g. nothing was saved)
        if all(key in self.data for key in self.keys):
            self.data = self.data.set_index(self.keys, drop=False)
class Directory(TSVIndexed):
    """
    A corpus stored as a folder: a 'files.tsv' file lists the primary keys
    of the records and the text of each record is kept in its own file under
    the 'Text' subfolder
    """
    def __init__(self, root_path, column_name='content'):
        """
        Positional arguments
        :param root_path: the path to a GÉODE-style folder containing the text
        version of the corpus
        Keyword arguments
        :param column_name: the name under which the text content of a
        record is exposed
        """
        self.text_path = f"{root_path}/Text"
        TSVIndexed.__init__(self, f"{root_path}/files.tsv", column_name)

    def path_to(self, primary_key):
        """
        Positional arguments
        :param primary_key: the key of a record, in any form accepted by
        dict_primary_key

        :return: the path to the text file of that record, which is one
        level deeper when the key has a 'paragraph'
        """
        record = self.dict_primary_key(primary_key)
        article_relative_path = "{work}/T{volume}/{article}".format(**record)
        prefix = f"{self.text_path}/{article_relative_path}"
        if 'paragraph' in record:
            return f"{prefix}/{record['paragraph']}.txt"
        return f"{prefix}.txt"

    def dict_primary_key(self, primary_key):
        """
        Positional arguments
        :param primary_key: a pandas Series, a dict or a sequence of key
        values (a sequence of 4 values is taken to end with the paragraph)

        :return: the primary key as a dict indexed by column names
        """
        # isinstance rather than an exact type comparison, so that subclasses
        # of Series and dict are not mistaken for plain sequences of values
        if isinstance(primary_key, pandas.Series):
            return dict(primary_key)
        if isinstance(primary_key, dict):
            return primary_key
        keys = self.default_keys.copy()
        if len(primary_key) == 4:
            keys.append('paragraph')
        return dict(zip(keys, primary_key))

    def get_text(self, primary_key):
        """
        Return the whole content of the text file of a record
        """
        with open(self.path_to(primary_key), 'r') as file:
            return file.read()

    def get_content(self, key, _):
        """
        Return the text content read from the file matching the key
        """
        return self.get_text(key)

    def write_text(self, primary_key, content):
        """
        Write content to the text file of a record, creating the parent
        folders when needed
        """
        path = self.path_to(primary_key)
        os.makedirs(os.path.dirname(path), exist_ok=True)
        with open(path, 'w') as file:
            file.write(content)

    def save(self, iterator):
        """
        Positional arguments
        :param iterator: the records to write, each providing the primary key
        columns and the content column
        """
        self.data = pandas.DataFrame(iterator)
        self.detect_keys()
        for _, row in self.data.iterrows():
            self.write_text(row, row[self.column_name])
        # Only the primary keys go to the TSV file, the texts are in the files
        self.data[self.keys].to_csv(self.tsv_path, sep='\t', index=False)
        # Index the data the way load does, so that get_all keeps working on
        # this object after a save
        self.data = self.data.set_index(self.keys, drop=False)
class Source:
    """
    A class to locate and read the text files of a corpus from the records
    which identify them, loading the texts lazily as they are needed
    """
    def __init__(self, root_path):
        """
        Positional arguments
        :param root_path: the path to a GÉODE-style folder containing the text
        version of the corpus
        """
        self.root_path = root_path

    def path_to(self, record):
        """
        Positional arguments
        :param record: a mapping (dict or pandas Series) with the 'work',
        'volume' and 'article' keys, and optionally 'paragraph'

        :return: the path to the text file of that record, which is one
        level deeper when the record has a 'paragraph'
        """
        article_relative_path = "{work}/T{volume}/{article}".format(**record)
        prefix = f"{self.root_path}/{article_relative_path}"
        if 'paragraph' in record:
            # Item access works for dicts as well as for pandas Series,
            # unlike attribute access which only works for the latter
            return f"{prefix}/{record['paragraph']}.txt"
        return f"{prefix}.txt"

    def load_text(self, record):
        """
        Return the whole content of the text file of a record
        """
        with open(self.path_to(record), 'r') as file:
            return file.read()

    def iterate(self, records):
        """
        Positional arguments
        :param records: a pandas DataFrame with one record per row

        :return: a generator of the text of each record, read one at a time
        """
        for _, record in records.iterrows():
            yield self.load_text(record)
#!/usr/bin/env python3
import Corpus
from os.path import isdir
import sys
def detect(path):
    """
    Pick the corpus format matching a path

    Positional arguments
    :param path: the path to a corpus

    :return: a Corpus.Directory when the path is an existing directory, a
    Corpus.SelfContained otherwise (which includes paths that do not exist
    yet)
    """
    corpus_class = Corpus.Directory if isdir(path) else Corpus.SelfContained
    return corpus_class(path)
if __name__ == '__main__':
    # Two paths are needed: the corpus to read and the one to write. Fail
    # with a usage message rather than a bare IndexError when one is missing
    if len(sys.argv) < 3:
        sys.exit(f"Usage: {sys.argv[0]} SOURCE DESTINATION")
    source = detect(sys.argv[1])
    destination = detect(sys.argv[2])
    # get_all is a generator, the records are passed on to save lazily
    destination.save(source.get_all())
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment