# Post-patch form of the RO_Crate methods (src/ro_crate.py) that are complete in
# this change set, re-expanded from the collapsed patch text.
#
# Changes from the same patch that are not reproduced as code here, because the
# enclosing class or method is not fully visible:
#   * Nextflow_File.add_subworkflows_2_rocrate (src/nextflow_file.py) appends
#     {"@id": sub_key} to file_dico["hasPart"] instead of the bare key;
#   * the author entries of the root entity are built from author["@id"] and
#     author["email"] (previously author["name"]);
#   * the per-file getters below are called with `key` instead of `file`.
import os
import re
import subprocess


class RO_Crate:
    """Per-file metadata getters backed by ``git log`` and GitHub metadata.

    The methods read ``self.workflow``; they use its
    ``nextflow_file.get_file_address()``, ``dico`` and ``get_address()``.
    The constructor and the remaining methods of the class are outside the
    reviewed hunks and are therefore not defined here.
    """

    # Month abbreviations as printed by ``git log``.  A local table keeps the
    # parsing independent of the process locale (unlike ``strptime('%b')``).
    _MONTH_NUMBERS = {
        abbreviation: f"{number:02d}"
        for number, abbreviation in enumerate(
            ("Jan", "Feb", "Mar", "Apr", "May", "Jun",
             "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"),
            start=1,
        )
    }

    # Both patterns are anchored at the start of a line: commit messages are
    # indented in ``git log`` output, so only real header lines can match.
    _DATE_HEADER = re.compile(
        r"^Date: +\w+ +(\w+) +(\d+) +\d+:\d+:\d+ +(\d+)", re.MULTILINE
    )
    _AUTHOR_HEADER = re.compile(r"^Author: +(.+?) +<([^>]*)>", re.MULTILINE)

    @classmethod
    def _first_commit_date(cls, log_text):
        """Return the first ``Date:`` header of *log_text* as ``yyyy-mm-dd``.

        Returns ``None`` when no header with a known month is present.
        """
        for header in cls._DATE_HEADER.finditer(log_text):
            month = cls._MONTH_NUMBERS.get(header.group(1))
            if month is None:
                continue
            # git prints the day without a leading zero; pad it.
            return f"{header.group(3)}-{month}-{int(header.group(2)):02d}"
        return None

    @classmethod
    def _first_commit_author(cls, log_text):
        """Return the name in the first ``Author:`` header, or ``None``."""
        header = cls._AUTHOR_HEADER.search(log_text)
        return header.group(1) if header else None

    def fill_log_file(self, file, reverse=True):
        """Return the ``git log`` output restricted to *file*.

        The command runs in the directory of the workflow's main Nextflow
        file.  With ``reverse`` true the oldest commit comes first.

        Returns an empty string when git cannot be started, the directory
        does not exist, or git exits with an error.
        """
        main_file = self.workflow.nextflow_file.get_file_address()
        command = ["git", "log"]
        if reverse:
            command.append("--reverse")
        # "--" makes git treat the argument as a path, never as an option.
        command += ["--", str(file)]
        try:
            # Argument list + cwd: no shell, no temp file, no global chdir.
            completed = subprocess.run(
                command,
                cwd=os.path.dirname(main_file) or None,
                stdout=subprocess.PIPE,
            )
        except OSError:
            return ""
        if completed.returncode != 0:
            return ""
        return completed.stdout.decode("utf-8", errors="replace")

    def get_dateCreated(self, file):
        """Return the date of the oldest commit touching *file*, or ``None``."""
        return self._first_commit_date(self.fill_log_file(file, reverse=True))

    def get_dateModified(self, file):
        """Return the date of the newest commit touching *file*, or ``None``."""
        return self._first_commit_date(self.fill_log_file(file, reverse=False))

    def get_url(self, file):
        """Return the GitHub ``blob`` URL of *file*.

        Returns ``None`` when the workflow holds no repository metadata.  The
        branch is the metadata's ``default_branch`` when present, else
        ``main``.
        """
        metadata = self.workflow.dico
        if not metadata:
            return None
        branch = "main"
        if isinstance(metadata, dict):
            branch = metadata.get("default_branch") or branch
        address = self.workflow.get_address()
        return f"https://github.com/{address}/blob/{branch}/{file}"

    def get_creators(self, file):
        """Return ``[{"@id": name}]`` for the author of *file*'s oldest commit.

        Returns an empty list (not ``None``) when no author is found, so the
        result can always be iterated.
        """
        name = self._first_commit_author(self.fill_log_file(file, reverse=True))
        if name is None:
            return []
        return [{"@id": name}]
# Post-patch form of the Workflow methods (src/workflow.py) that are complete in
# this change set, re-expanded from the collapsed patch text.
#
# Not reproduced, because they are only partly visible in the patch: the
# constructor (it stores keywords/producer/publisher, initialises log="",
# address="" and dico={} and then calls fill_log(), set_address() and
# get_dico()), get_datePublished and get_creativeWorkStatus.
import http.client
import json
import os
import re
import subprocess
import urllib.request


class Workflow:
    """Workflow metadata taken from explicit values, git and the GitHub API.

    Explicit values stored on the instance (``description``, ``license``,
    ``authors``, ``keywords``, ``producer``, ``publisher``) win; when one is
    ``None`` the getter falls back to the git log (``self.log``) or to the
    repository metadata (``self.dico``).
    """

    # Seconds to wait for the GitHub API before giving up.
    _GITHUB_TIMEOUT_SECONDS = 10

    # owner/repository part of an https or ssh GitHub remote; the ".git"
    # suffix is optional and repository names may contain dots.
    _GITHUB_REMOTE = re.compile(
        r"(?:^|[/@])github\.com[:/]([^/\s]+/[^/\s]+?)(?:\.git)?/?\s*$"
    )

    # Anchored at line start so indented commit-message text cannot match.
    _AUTHOR_HEADER = re.compile(r"^Author: +(.+?) +<([^>]+)>", re.MULTILINE)

    @classmethod
    def _github_slug(cls, remote_url):
        """Return ``owner/repository`` for a GitHub remote URL, else ``""``."""
        found = cls._GITHUB_REMOTE.search(remote_url.strip())
        return found.group(1) if found else ""

    def _run_git(self, *arguments):
        """Run git in the main file's directory and return its stdout.

        Returns an empty string when git cannot be started, the directory
        does not exist, or git exits with an error.
        """
        directory = os.path.dirname(self.nextflow_file.get_file_address())
        try:
            # Argument list + cwd: no shell, no temp file, no global chdir.
            completed = subprocess.run(
                ("git",) + arguments,
                cwd=directory or None,
                stdout=subprocess.PIPE,
            )
        except OSError:
            return ""
        if completed.returncode != 0:
            return ""
        return completed.stdout.decode("utf-8", errors="replace")

    def _repository_field(self, *keys):
        """Return the nested value ``self.dico[k1][k2]...`` or ``None``."""
        value = self.dico
        for key in keys:
            if not isinstance(value, dict) or key not in value:
                return None
            value = value[key]
        return value

    def fill_log(self):
        """Store the oldest-first ``git log`` of the repository in ``self.log``."""
        self.log = self._run_git("log", "--reverse")

    def get_address(self):
        """Return the ``owner/repository`` address set by ``set_address``."""
        return self.address

    def set_address(self):
        """Set ``self.address`` from the ``origin`` remote.

        The address is ``owner/repository`` for a GitHub remote and ``""``
        for anything else; raw git output is never kept.
        """
        remote = self._run_git("ls-remote", "--get-url", "origin")
        self.address = self._github_slug(remote)

    def get_dico(self):
        """Load the GitHub repository description into ``self.dico``.

        Despite its name this method returns nothing.  ``self.dico`` is left
        untouched when there is no GitHub address, the request fails, or the
        answer is not a JSON object.
        """
        if not self.address:
            return
        request = urllib.request.Request(
            f"https://api.github.com/repos/{self.address}",
            headers={
                "Accept": "application/vnd.github+json",
                "User-Agent": "python-urllib",
            },
        )
        try:
            with urllib.request.urlopen(
                request, timeout=self._GITHUB_TIMEOUT_SECONDS
            ) as response:
                payload = json.loads(response.read().decode("utf-8"))
        except (OSError, ValueError, http.client.HTTPException):
            # Best effort: offline use or an unknown repository is not an error.
            return
        if isinstance(payload, dict):
            self.dico = payload

    def get_description(self):
        """Return the explicit description, else the repository's, else ``None``."""
        if self.description is None:
            return self._repository_field("description")
        return self.description

    def get_main_file(self):
        """Return the file name (last ``/`` component) of the main Nextflow file."""
        return self.nextflow_file.get_file_address().split("/")[-1]

    def get_license(self):
        """Return the explicit license, else the repository's license key, else ``None``."""
        if self.license is None:
            return self._repository_field("license", "key")
        return self.license

    #TODO
    def get_version(self):
        """Placeholder: always returns the literal ``"TODO"``."""
        return "TODO"

    def get_authors(self):
        """Return the explicit authors, else those found in the git log.

        Log authors are returned as ``{"@id": name, "email": email}``, one
        entry per distinct e-mail in order of first appearance; when an
        e-mail occurs with several names the last name wins.
        """
        if self.authors is not None:
            return self.authors
        name_by_email = {}
        for header in self._AUTHOR_HEADER.finditer(self.log):
            name_by_email[header.group(2)] = header.group(1)
        return [
            {"@id": name, "email": email}
            for email, name in name_by_email.items()
        ]

    #Need to follow this format : "rna-seq, nextflow, bioinformatics, reproducibility, workflow, reproducible-research, bioinformatics-pipeline"
    def get_keywords(self):
        """Return the explicit keywords, else the repository topics joined by ``", "``.

        Returns ``None`` when neither is available.
        """
        if self.keywords is not None:
            return self.keywords
        topics = self._repository_field("topics")
        if not isinstance(topics, (list, tuple)):
            return None
        try:
            return ", ".join(topics)
        except TypeError:
            return None

    def get_producer(self):
        """Return the explicit producer, else ``{"@id": <owner login>}``, else ``None``."""
        if self.producer is not None:
            return self.producer
        login = self._repository_field("owner", "login")
        if login is None:
            return None
        return {"@id": login}

    def get_publisher(self):
        """Return the explicit publisher, else GitHub's URL, else ``None``.

        GitHub is reported only when repository metadata was loaded.
        """
        if self.publisher is not None:
            return self.publisher
        if self.dico != {}:
            return "https://github.com/"
        return None

    def get_output_dir(self):
        """Return the output directory reported by the main Nextflow file."""
        return self.nextflow_file.get_output_dir()