Newer
Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
from GEODE.Metadata import fromKey, relativePath
from GEODE.Store.Tabular import tabular, toTSV
import pandas
from os import makedirs
from os.path import dirname, isdir
def abstract(f):
    """
    Mark a function as abstract by replacing it with a stub that always raises.

    :param f: the function to replace; its body is never executed
    :return: a function accepting any arguments and raising
    NotImplementedError with the name of f as its message
    """
    # Imported locally so the decorator does not rely on the module header.
    from functools import wraps

    # wraps copies the name and docstring of f onto the stub, so tracebacks
    # and introspection show the real method name instead of "wrapped".
    @wraps(f)
    def wrapped(*args, **kwargs):
        raise NotImplementedError(f.__name__)
    return wrapped
class Corpus:
    """
    Common interface of the corpus classes.

    Every method is wrapped by abstract, so calling any of them raises
    NotImplementedError until a subclass overrides it; this includes the
    constructor, hence Corpus itself cannot be instantiated.
    """

    @abstract
    def __init__(self):
        pass

    @abstract
    def get_text(self, primary_key):
        """
        Return the text designated by primary_key (to be provided by
        subclasses).
        """
        pass

    @abstract
    def get_all(self):
        """
        Iterate over the records of the corpus (to be provided by
        subclasses).
        """
        pass

    @abstract
    def save(self, iterator):
        """
        Store the records produced by iterator (to be provided by
        subclasses).
        """
        pass
class TSVIndexed(Corpus):
    """
    A corpus described by a TSV file which is read on first use and indexed
    by its key columns.

    Subclasses must provide content (and the get_text / save methods declared
    by Corpus).
    """

    # Key columns always used for the index; detect_keys adds 'paragraph'
    # when the data has such a column.
    default_keys = ['work', 'volume', 'article']
    # Method names that get_all accepts as a string for its projector.
    projectors = ['key', 'content', 'full']

    def __init__(self, tsv_path, column_name):
        """
        Positional arguments
        :param tsv_path: the path to the TSV file read by load
        :param column_name: the name under which the text content appears in
        the records built by full
        """
        self.tsv_path = tsv_path
        self.column_name = column_name
        # Stays None until load (or a subclass' save) fills it.
        self.data = None

    def load(self):
        """
        Read the TSV file and index it by the detected keys; does nothing
        when data is already present.
        """
        if self.data is not None:
            return
        self.data = tabular(self.tsv_path)
        self.detect_keys()
        # drop=False keeps the key columns readable from each row as well.
        self.data = self.data.set_index(self.keys, drop=False)

    def detect_keys(self):
        """
        Set self.keys to the default keys, followed by 'paragraph' when the
        loaded data has a column of that name.
        """
        self.keys = list(self.default_keys)
        if 'paragraph' in self.data:
            self.keys.append('paragraph')

    @abstract
    def content(self, key, row):
        """
        Return the text of one record given its index key and its row (to be
        provided by subclasses).
        """
        pass

    def key(self, _, row):
        """
        Return the key columns of row as a dict; the index key is ignored.
        """
        return row[self.keys].to_dict()

    def full(self, key, row):
        """
        Return the key columns of the record together with its stripped text
        content stored under column_name.
        """
        return {**self.key(key, row),
                self.column_name: self.content(key, row).strip()}

    def get_all(self, projector=None, where=None):
        """
        Generate one projected value per row of the data.

        Keyword arguments
        :param projector: a callable taking (key, row), or one of the names
        listed in projectors; defaults to full
        :param where: an optional predicate taking (key, row); rows for which
        it is false are skipped
        :raises ValueError: when projector is a string which is not listed in
        projectors (raised when the iteration starts)
        """
        if projector is None:
            projector = self.full
        elif isinstance(projector, str):
            # Fail with an explicit message instead of the later
            # "'str' object is not callable".
            if projector not in self.projectors:
                raise ValueError(
                    f"unknown projector {projector!r},"
                    f" expected one of {self.projectors}")
            projector = getattr(self, projector)
        self.load()
        for index_key, row in self.data.iterrows():
            if where is None or where(index_key, row):
                yield projector(index_key, row)
class SelfContained(TSVIndexed):
    """
    A class to handle a TSV dataset which holds the text content itself in
    one of its columns, next to the primary key.
    """

    def __init__(self, tsv_path, column_name='content'):
        """
        Positional arguments
        :param tsv_path: the path to a TSV dataset containing a primary key and
        a text content on every line

        Keyword arguments
        :param column_name: the name of the column where the text content is
        stored
        """
        TSVIndexed.__init__(self, tsv_path, column_name)

    def get_text(self, primary_key):
        """
        Return what the column_name column holds for primary_key.

        :param primary_key: either a dict, from which the values of the keys
        of the corpus are taken in order (entries under other names are
        ignored), or a sequence of key values
        """
        self.load()
        # isinstance also accepts dict subclasses, which would otherwise be
        # turned into a tuple of their key names below.
        if isinstance(primary_key, dict):
            primary_key = [primary_key[k] for k in self.keys if k in primary_key]
        # Returns the same object when primary_key already is a plain tuple.
        primary_key = tuple(primary_key)
        return self.data.xs(primary_key)[self.column_name]

    def content(self, _, row):
        """
        Return the text stored in row; the index key is ignored.
        """
        return row[self.column_name]

    def save(self, iterator):
        """
        Build the data from the records produced by iterator and write it to
        the TSV file.

        :param iterator: the records, in any form pandas.DataFrame accepts
        """
        self.data = pandas.DataFrame(iterator)
        self.detect_keys()
        toTSV(self.tsv_path, self.data)
        # Index the data the way load does, so that get_text and get_all work
        # on this object right after a save. Skipped when a key column is
        # missing (e.g. no record at all) so that saving still succeeds.
        if set(self.keys).issubset(self.data.columns):
            self.data = self.data.set_index(self.keys, drop=False)
class Directory(TSVIndexed):
    """
    A class to handle a corpus stored as a folder: a TSV file at its root
    lists the records and the text of each record is read from its own file
    under the "Text" sub-folder when it is needed.
    """

    def __init__(self, root_path, tsv_filename="files", column_name='content'):
        """
        Positional arguments
        :param root_path: the path to a GÉODE-style folder containing the text
        version of the corpus on which to predict the classes

        Keyword arguments
        :param tsv_filename: the name, without its ".tsv" extension, of the
        TSV file located directly under root_path
        :param column_name: the name under which the text content appears in
        the records
        """
        self.text_path = f"{root_path}/Text"
        TSVIndexed.__init__(self, f"{root_path}/{tsv_filename}.tsv", column_name)

    def path_to(self, primary_key):
        """
        Return the path of the text file of the record designated by
        primary_key.
        """
        record = self.dict_primary_key(primary_key)
        return f"{self.text_path}/{relativePath(record, 'txt')}"

    def dict_primary_key(self, primary_key):
        """
        Return primary_key as a dict: a pandas Series is converted, a dict is
        returned as is and anything else is handed to fromKey.
        """
        if isinstance(primary_key, pandas.Series):
            return dict(primary_key)
        if isinstance(primary_key, dict):
            return primary_key
        return fromKey(primary_key)

    def get_text(self, primary_key):
        """
        Return the whole content of the text file of a record.
        """
        # NOTE(review): no explicit encoding, so the platform default applies
        # here and in write_text — confirm whether UTF-8 should be enforced.
        with open(self.path_to(primary_key), 'r') as file:
            return file.read()

    def content(self, key, _):
        """
        Return the text of a record from its index key; the row is ignored.
        """
        return self.get_text(key)

    def write_text(self, primary_key, content):
        """
        Write content to the text file of a record, creating the parent
        folders when they are missing.
        """
        path = self.path_to(primary_key)
        makedirs(dirname(path), exist_ok=True)
        with open(path, 'w') as file:
            file.write(content)

    def save(self, iterator):
        """
        Build the data from the records produced by iterator, write the text
        of each record to its own file and the key columns to the TSV file.

        :param iterator: the records, in any form pandas.DataFrame accepts
        """
        self.data = pandas.DataFrame(iterator)
        self.detect_keys()
        for _, row in self.data.iterrows():
            self.write_text(row, row[self.column_name])
        toTSV(self.tsv_path, self.data[self.keys])
        # Index the data the way load does: otherwise get_all would hand
        # integer row numbers instead of keys to content / get_text.
        self.data = self.data.set_index(self.keys, drop=False)
def corpus(path, **kwargs):
    """
    Build the corpus object matching the kind of path received.

    Positional arguments
    :param path: a str or os.PathLike object; a path ending with '/' or
    naming an existing directory gives a Directory, a path ending with
    '.tsv' gives a SelfContained

    Keyword arguments
    :param kwargs: passed unchanged to the constructor of the selected class

    :raises FileNotFoundError: when path matches none of the cases above
    """
    # Imported locally so the function does not rely on the module header.
    from os import fspath

    # Normalise path-like objects (e.g. pathlib.Path) to a string so the
    # suffix tests below work; a str comes back unchanged.
    path = fspath(path)
    if path.endswith('/') or isdir(path):
        return Directory(path, **kwargs)
    if path.endswith('.tsv'):
        return SelfContained(path, **kwargs)
    raise FileNotFoundError(path)