import glob
import json
import os


class RO_Crate:
    """Build and write an RO-Crate metadata file (JSON-LD) for a workflow.

    The crate describes every file found next to the workflow's main file.
    The `workflow` object is expected to expose the getters used below
    (`get_file_address`, `get_name`, `get_authors`, `get_output_dir`, ...).
    """

    NEXTFLOW_LANGUAGE_ID = "https://w3id.org/workflowhub/workflow-ro-crate#nextflow"

    def __init__(self, workflow):
        """Remember the workflow and the directory that contains its main file."""
        self.workflow = workflow
        # A bare file name ("main.nf") has no directory part; fall back to "."
        # so the recursive glob below never starts from the filesystem root.
        self.directory = os.path.dirname(workflow.get_file_address()) or "."
        self.files = []
        self.dico = {}

    def _entity_id(self, file):
        """Return `file` as a '/'-separated path relative to the crate root."""
        return os.path.relpath(file, self.directory).replace(os.sep, "/")

    def _metadata_filename(self):
        """Return the name of the metadata file describing this crate."""
        return f"ro-crate-metadata-{self.workflow.get_name()}.json"

    def get_files(self):
        """Collect the crate's files and return them as `{"@id": ...}` references.

        Side effect: `self.files` is replaced by the sorted list of matching
        paths. Only regular files whose name contains a dot are kept; the
        pattern `*.*` also matches dotted directory names, which must not be
        described as files.
        """
        pattern = os.path.join(self.directory, "**", "*.*")
        self.files = sorted(
            path for path in glob.glob(pattern, recursive=True)
            if os.path.isfile(path)
        )
        return [{"@id": self._entity_id(path)} for path in self.files]

    def initialise_dico(self):
        """(Re)build the `@context`, metadata descriptor and root dataset entities."""
        self.dico["@context"] = "https://w3id.org/ro/crate/1.1/context"
        self.dico["@graph"] = []

        # GENERAL: the metadata file descriptor.
        general = {
            "@id": self._metadata_filename(),
            "@type": "CreativeWork",
            "about": {"@id": "./"},
            # The workflow-ro-crate profile is deliberately not declared:
            # this description does not conform to it.
            "conformsTo": [{"@id": "https://w3id.org/ro/crate/1.1"}],
        }
        self.dico["@graph"].append(general)

        # ROOT: the dataset itself.
        root = {}
        root["@id"] = "./"
        root["@type"] = "Dataset"
        root["name"] = self.workflow.get_name()
        root["datePublished"] = self.workflow.get_datePublished()
        root["description"] = self.workflow.get_description()
        # A file is not typed as "ComputationalWorkflow" since multiple
        # (sub)workflows can be defined in the same file.
        root["mainEntity"] = {
            "@id": self.workflow.get_main_file(),
            "@type": ["File", "SoftwareSourceCode"],
        }
        root["license"] = {"@id": self.workflow.get_license()}
        tab_authors = [
            {"@id": author["@id"]} for author in self.workflow.get_authors()
        ]
        root["author"] = tab_authors
        # For now every author is assumed to also be a maintainer.
        root["maintainer"] = tab_authors
        root["hasPart"] = self.get_files()
        root["publisher"] = {"@id": self.workflow.get_publisher()}
        # TODO: subjectOf is not determined yet.
        root["subjectOf"] = None
        root["creativeWorkStatus"] = self.workflow.get_creativeWorkStatus()
        # "version" is the schema.org property; "@version" is a JSON-LD
        # keyword that is only meaningful inside a context definition.
        root["version"] = self.workflow.get_version()
        root["keywords"] = self.workflow.get_keywords()
        root["producer"] = self.workflow.get_producer()
        self.dico["@graph"].append(root)

    # TODO: only Nextflow files are recognised.
    def get_programming_language(self, file):
        """Return the language identifier for `file`, or None when unknown."""
        if file.endswith(".nf"):
            return self.NEXTFLOW_LANGUAGE_ID
        return None

    def get_contentSize(self, file):
        """Return the size of `file` on disk, in kilobytes (bytes / 1000)."""
        return os.stat(file).st_size / 1e3

    # TODO
    def get_dateCreated(self, file):
        """Placeholder: creation date of `file`."""
        return "TODO"

    # TODO
    def get_dateModified(self, file):
        """Placeholder: last modification date of `file`."""
        return "TODO"

    # TODO
    def get_url(self, file):
        """Placeholder: URL of `file`."""
        return "TODO"

    # TODO
    def get_creators(self, file):
        """Placeholder: creators of `file`, as a list of `{"@id": ...}` dicts."""
        return [{"@id": "George"}]

    # TODO
    def get_types(self, file):
        """Return the `@type` list for `file`; `.nf` files are also source code."""
        types = ["File"]
        if file.endswith(".nf"):
            types.append("SoftwareSourceCode")
        return types

    def initialise_file(self, file):
        """Append the entity describing `file` to the crate's `@graph`."""
        key = self._entity_id(file)
        dico = {}
        dico["@id"] = key
        dico["name"] = key
        dico["@type"] = self.get_types(file)
        # Omit the property for unknown languages: {"@id": null} is not a
        # valid JSON-LD reference.
        language = self.get_programming_language(file)
        if language is not None:
            dico["programmingLanguage"] = {"@id": language}
        dico["contentSize"] = self.get_contentSize(file)
        dico["dateCreated"] = self.get_dateCreated(file)
        dico["dateModified"] = self.get_dateModified(file)
        dico["url"] = self.get_url(file)
        dico["creator"] = [
            {"@id": creator["@id"]} for creator in self.get_creators(file)
        ]
        dico["isPartOf"] = []
        dico["hasPart"] = []
        self.dico["@graph"].append(dico)

    def initialise(self):
        """Build the full crate description and write it to the output directory."""
        # initialise_dico() calls get_files(), which populates self.files.
        self.initialise_dico()
        for file in self.files:
            self.initialise_file(file)

        output_path = os.path.join(
            self.workflow.get_output_dir(), self._metadata_filename()
        )
        with open(output_path, "w", encoding="utf-8") as output_file:
            json.dump(self.dico, output_file, indent=2)