def map_content(self, f):
    """
    Map a function over the content of the corpus, with filter capability.

    Positional arguments
    :param function f: the function to apply to each text of the corpus. It
        must accept a `row` as input (i.e. a `dict` containing the primary
        key for the text and its content, stored under the `column_name`
        attribute of the corpus on which this method is called) and return
        the transformed content (or `None` if the text shouldn't be kept).

    :return: a generator over the transformed corpus, skipping the elements
        for which `f` returns `None`. Each yielded item is a new `dict`: a
        copy of the row with the entry at `column_name` replaced by the
        value returned by `f`.
    """
    for row in self.get_all():
        new_content = f(row)
        # Only `None` means "drop this text": other falsy results such as
        # an empty string are legitimate content and must be kept.
        if new_content is not None:
            # Build a fresh dict so the row produced by `get_all` is never
            # mutated.
            yield {**row, self.column_name: new_content}