diff --git a/src/include.py b/src/include.py index 8bddcbfdac7728653d3d54a42fd3a88c800b5777..3ff4fff9e145511f6216b3a3d048ef99c7be40b3 100644 --- a/src/include.py +++ b/src/include.py @@ -20,10 +20,9 @@ def clean_string(txt): return txt class Include(Nextflow_Building_Blocks): - def __init__(self, code, file, importing, origin, duplicate): + def __init__(self, code, file, importing, origin): self.origin = origin self.importing = importing - self.duplicate = duplicate self.code = Code(code = code, origin = self) self.file = None self.address = file @@ -50,7 +49,7 @@ class Include(Nextflow_Building_Blocks): def get_list_name_includes(self): - if(self.duplicate): + if(self.get_duplicate_status()): names = [] for ele in self.defines: names.append(ele.get_alias()) @@ -86,14 +85,14 @@ class Include(Nextflow_Building_Blocks): found_file = True if(not found_file and os.path.isfile(address[:-3]+"/main.nf")): - self.file = Nextflow_File(address[:-3]+"/main.nf", origin=self, duplicate=self.duplicate, DSL="DSL2") + self.file = Nextflow_File(address[:-3]+"/main.nf", origin=self) #TODO -> check if the nextflow_file is defined somewhere else? #In the cas the nextflow file is imported multiple times else: if(os.path.isfile(address)): - self.file = Nextflow_File(address, origin=self, duplicate=self.duplicate, DSL="DSL2") + self.file = Nextflow_File(address, origin=self) else: address = os.path.normpath(address) raise BioFlowInsightError(f"Something went wrong in an include{self.get_string_line(self.get_code())}. No such file: '{address}'.", num = 10,origin=self) @@ -101,7 +100,7 @@ class Include(Nextflow_Building_Blocks): #If not duplicate -> we need to see if there is another include which has already defined the file #TODO -> if you wanna generalise this to all include (inbetween files -> you just need to update get_include() ) - if(not self.duplicate): + if(not self.get_duplicate_status()): #other_includes = self.origin.get_all_includes() other_includes = self.origin.get_includes() for other in other_includes: @@ -116,7 +115,7 @@ class Include(Nextflow_Building_Blocks): found = False if(include!=''): if(re.fullmatch(constant.WORD, include)): - if(self.duplicate): + if(self.get_duplicate_status()): self.defines.append(self.file.get_element_from_name(include)) else: self.aliases[include] = self.file.get_element_from_name(include) @@ -125,7 +124,7 @@ class Include(Nextflow_Building_Blocks): pattern_as = constant.INCLUDE_AS for match in re.finditer(pattern_as, include): found = True - if(self.duplicate): + if(self.get_duplicate_status()): #TODO -> try shallow copy too #thing_as = copy.copy(self.file.get_element_from_name(match.group(1))) thing_as = copy.deepcopy(self.file.get_element_from_name(match.group(1))) diff --git a/src/nextflow_building_blocks.py b/src/nextflow_building_blocks.py index 0c600056b09d2a1acee8413b0126147c2f505a61..ae780ae65ce6e658edf9d38e5eecdf3857cfb119 100644 --- a/src/nextflow_building_blocks.py +++ b/src/nextflow_building_blocks.py @@ -41,6 +41,9 @@ class Nextflow_Building_Blocks: def get_processes_annotation(self): return self.origin.get_processes_annotation() + def get_duplicate_status(self): + return self.origin.get_duplicate_status() + def get_file_address(self): return self.origin.get_file_address() diff --git a/src/nextflow_file.py b/src/nextflow_file.py index b12bea69ca93e0b5338f5a7f0453792e4aa75ee6..bc296df628a9d40ade5628d1256b14a18946d401 100644 --- a/src/nextflow_file.py +++ b/src/nextflow_file.py @@ -13,7 +13,7 @@ from . import constant warnings.filterwarnings("ignore") from .nextflow_building_blocks import Nextflow_Building_Blocks -from .outils import extract_curly, get_curly_count, get_parenthese_count, get_dico_from_tab_from_id +from .outils import extract_curly, get_curly_count, get_parenthese_count, get_dico_from_tab_from_id, check_file_exists from .bioflowinsighterror import BioFlowInsightError @@ -21,34 +21,26 @@ from .bioflowinsighterror import BioFlowInsightError class Nextflow_File(Nextflow_Building_Blocks): - def __init__(self, address, duplicate = True, DSL="", author = None, name = None, origin=None, output_dir='./results', display_info = True, - workflow = None): + def __init__(self, address, origin): self.file = address if(self.get_file_address().find('/')==-1): raise BioFlowInsightError(f"BioFlow-Insight cannot directly analyse a workflow from its directory. Please analyse the workflow from the parent directory instead.", num = -1) - self.output_dir = Path(output_dir) - contents = '' - try: - with open(self.get_file_address(), 'r') as f: - contents = f.read() - except Exception: - raise BioFlowInsightError(f"No such file: '{self.get_file_address()}'.", num = 10,origin=self) - + contents = check_file_exists(self.get_file_address(), self) Nextflow_Building_Blocks.__init__(self, contents) - self.workflow_name = name - self.author = author - self.duplicate = duplicate + self.origin = origin - self.DSL = "" - self.workflow = workflow - self.first_file = DSL=="" - self.graph = None - self.display_info = display_info self.all_includes = [] + + from src.workflow import Workflow + self.first_file = type(origin)==Workflow + if(self.first_file==True): + self.origin.set_DSL(self.which_DSL()) + self.graph = None + self.added_2_rocrate = False self.check_file_correctness() - self.set_DSL(DSL=DSL) + self.do_start_stuff() #self.extract_metadata() self.check_file_correctness_after_DSL() self.set_null() @@ -89,54 +81,31 @@ class Nextflow_File(Nextflow_Building_Blocks): if(not found_main): raise BioFlowInsightError(f"No 'main' workflow was found.", num = 16, origin=self) - def get_output_dir(self): - if(self.first_file): - return self.output_dir - else: - if(self.origin==None): - return self.output_dir - else: - return self.origin.get_output_dir() def get_processes_annotation(self): if(self.first_file): - return self.workflow.get_processes_annotation() + return self.origin.get_processes_annotation() else: if(self.origin==None): return None else: return self.origin.get_processes_annotation() - - def get_display_info(self): - if (self.first_file): - return self.display_info - else: - if(self.origin==None): - return self.display_info - else: - return self.origin.get_display_info() - + def get_workflow_address(self): if(self.origin==None): - return self.workflow.get_workflow_directory() + return self.origin.get_workflow_directory() else: return self.origin.get_workflow_address() def set_name(self): - if self.first_file and self.workflow_name is None: + if self.first_file and self.origin.get_name() is None: address = self.get_file_address() - self.workflow_name = address.split('/')[-2] + self.origin.set_name(address.split('/')[-2]) + - def set_author(self): - if self.first_file and self.author is None: - address = self.get_file_address() - try: - self.author = address.split('/')[-3] - except: - self.author="Unknown" def get_channels(self): return self.channels @@ -163,8 +132,7 @@ class Nextflow_File(Nextflow_Building_Blocks): self.set_name() self.set_author() dico_wf = {} - dico_wf["workflow name"] = self.workflow_name - dico_wf["author"] = self.author + dico_wf["workflow name"] = self.origin.get_name() dico_wf["date analysis"] = date.today().strftime("%m/%d/%y")#m/d/y dico_wf["DSL"] = self.DSL dico_wf["link"] = "TODO" @@ -195,13 +163,13 @@ class Nextflow_File(Nextflow_Building_Blocks): # ##Number of process used processes_used = {} - with open(self.output_dir / "debug" / "processes_used.json", "w") as outfile: + with open(self.get_output_dir() / "debug" / "processes_used.json", "w") as outfile: json.dump(processes_used, outfile, indent=4) else: raise Exception(f"The workflow's DSL is '{self.DSL}' -> I don't know what this is!") - with open(self.output_dir / "general.json", "w") as outfile: + with open(self.get_output_dir() / "general.json", "w") as outfile: json.dump(dico_wf, outfile, indent=4) def get_type(self): @@ -214,28 +182,24 @@ class Nextflow_File(Nextflow_Building_Blocks): def get_string_line(self, bit_of_code): return self.code.get_string_line(bit_of_code) - def set_DSL(self, DSL=""): + def do_start_stuff(self): #Set the DSL - if(DSL==""): - - - os.makedirs(self.output_dir, exist_ok=True) - os.makedirs(self.output_dir / 'debug', exist_ok=True) - os.makedirs(self.output_dir / 'graphs', exist_ok=True) - - with open(self.output_dir / "debug" / "operations.nf",'w') as file: + if(self.first_file): + os.makedirs(self.get_output_dir(), exist_ok=True) + os.makedirs(self.get_output_dir() / 'debug', exist_ok=True) + os.makedirs(self.get_output_dir() / 'graphs', exist_ok=True) + with open(self.get_output_dir() / "debug" / "operations.nf",'w') as file: pass - with open(self.output_dir / "debug" / "calls.nf",'w') as file: + with open(self.get_output_dir() / "debug" / "calls.nf",'w') as file: pass - with open(self.output_dir / "debug" / "operations_in_call.nf",'w') as file: + with open(self.get_output_dir() / "debug" / "operations_in_call.nf",'w') as file: pass self.DSL = self.which_DSL() self.set_null() if(self.get_display_info()): print(f"The workflow is written in '{self.get_DSL()}'") - else: - self.DSL = DSL + #---------------------- @@ -262,9 +226,6 @@ class Nextflow_File(Nextflow_Building_Blocks): if(name==fun.get_name()): return fun raise BioFlowInsightError(f"'{name}' is expected to be defined in the file, but it could not be found.", num = 18, origin=self) - - def get_DSL(self): - return self.DSL #Method which returns the DSL of the workflow -> by default it's DSL2 @@ -298,7 +259,7 @@ class Nextflow_File(Nextflow_Building_Blocks): for process in self.processes: if(process.get_name()==name): return process - if(self.duplicate): + if(self.get_duplicate_status()): for include in self.includes: defines = include.get_defines() for d in defines: @@ -377,7 +338,7 @@ class Nextflow_File(Nextflow_Building_Blocks): for sub in self.subworkflows: if(sub.get_name()==name): return sub - if(self.duplicate): + if(self.get_duplicate_status()): for include in self.includes: defines = include.get_defines() for d in defines: @@ -450,7 +411,7 @@ class Nextflow_File(Nextflow_Building_Blocks): #address = match.group(0).split('from')[1].strip() address = match.group(6).strip() if(address[1:].split('/')[0] not in ['plugin']): - include = Include(code =match.group(0), file = address, importing = includes, origin=self, duplicate = self.duplicate) + include = Include(code =match.group(0), file = address, importing = includes, origin=self) self.includes.append(include) self.add_include_to_all_includes(include) @@ -485,7 +446,7 @@ class Nextflow_File(Nextflow_Building_Blocks): if(fun.get_name()==name): return fun - if(self.duplicate): + if(self.get_duplicate_status()): for include in self.includes: defines = include.get_defines() for d in defines: @@ -583,17 +544,17 @@ class Nextflow_File(Nextflow_Building_Blocks): #if(self.first_file): # number_process_used = 0 - # with open(self.output_dir / 'debug/processes_used.json') as json_file: + # with open(self.get_output_dir() / 'debug/processes_used.json') as json_file: # dict = json.load(json_file) # for file in dict: # number_process_used+=len(set(dict[file])) # - # with open(self.output_dir / "general.json") as json_file: + # with open(self.get_output_dir() / "general.json") as json_file: # dico_wf = json.load(json_file) # # #dico_wf["processes"]["number used"] = number_process_used # - # with open(self.output_dir / "general.json", "w") as outfile: + # with open(self.get_output_dir() / "general.json", "w") as outfile: # json.dump(dico_wf, outfile, indent=4) diff --git a/src/outils.py b/src/outils.py index f8d60deabc399b1b8b61916f4650d3f9a91711d8..ad30ce715c71655a2cee49dcaa1f3416440a5935 100644 --- a/src/outils.py +++ b/src/outils.py @@ -1,4 +1,6 @@ import re +import subprocess +import os #============================================================= # THESE A JUST UTILITY FUNCTIONS TO BE ABLE TO MANIPULATE CODE @@ -899,3 +901,17 @@ def get_perl_modules(script): for match in re.finditer(r"(package|use)\s+([^\s;]+)\s*;", script): libraries.append(match.group(2)) return libraries + + +def check_file_exists(address, origin): + from .bioflowinsighterror import BioFlowInsightError + try: + with open(address, 'r') as f: + contents = f.read() + return contents + except Exception: + raise BioFlowInsightError(f"No such file: '{address}'.", num = 10,origin=origin) + + +def is_git_directory(path = '.'): + return subprocess.call(['git', '-C', path, 'status'], stderr=subprocess.STDOUT, stdout = open(os.devnull, 'w')) == 0 \ No newline at end of file diff --git a/src/outils_annotate.py b/src/outils_annotate.py index 324e7259f6ba6c87cfb3706791c63e0908083c41..b517390d4a3c08f6fa052b0cdc77bd5c986fad24 100644 --- a/src/outils_annotate.py +++ b/src/outils_annotate.py @@ -1,5 +1,6 @@ -import copy -import numpy as np +#Import dependencies + +#Outside packages import re tools = [] @@ -12,9 +13,7 @@ def get_propositions(process, tools = -1, commands = -1): temp = [] if(tools!=-1): for tool in tools: - #for character in char_after_tool: for match in re.finditer(r"(\s|\(|\/|\|)"+tool+r"(\s|\-|\\|\.)", process): - #if(f"{tool}{character}" in process): temp.append(tool) if(commands!=-1): for c in commands: diff --git a/src/ro_crate.py b/src/ro_crate.py index 804ba3406ec1c768b38e3e255758b0afcd631404..cc8e3947d9e60f9b3e77c600104c31c055b55a11 100644 --- a/src/ro_crate.py +++ b/src/ro_crate.py @@ -5,6 +5,19 @@ import re from . import constant +#Need to add these things here +# self.datePublished = datePublished +# self.description = description +# self.license = license +# self.creativeWorkStatus = creativeWorkStatus +# self.authors = authors +# self.version = version +# self.keywords = keywords +# self.producer = producer +# self.publisher = publisher + + + class RO_Crate: def __init__(self, workflow): self.workflow = workflow diff --git a/src/workflow.py b/src/workflow.py index 7de7c82221aab1e44b298b508e6789947fe0a4a4..d2ab751c74d35bd1e51558b7b5204e788975b72a 100644 --- a/src/workflow.py +++ b/src/workflow.py @@ -3,6 +3,7 @@ from .nextflow_file import Nextflow_File from .ro_crate import RO_Crate from . import constant +from .outils import is_git_directory from .outils_graph import flatten_dico, initia_link_dico_rec, get_number_cycles from .outils_annotate import get_tools_commands_from_user_for_process from .bioflowinsighterror import BioFlowInsightError @@ -28,31 +29,20 @@ class Workflow: display_info: A boolean indicating if the analysis information should be printed output_dir: A string indicating where the results will be saved name: A string indicating the name of the workflow - datePublished: A string indicating the date of publication of the workflow - description: A string indicating the description of the workflow - license: A string indicating the license of the workflow - creativeWorkStatus: A string indicating the creative work statuts of the workflow - authors: A string inidcating the authors of the workflow - version: A string indicating the version of the workflow - keywords: A string indicating the keywords of the workflow - producer: A string indicating the producer of the workflow - publisher: A string indicating the publisher of the workflow processes_2_remove: A string indicating the processes to remove from the workflow - processes_annotation: A dictionnary containing processes 2 annotations + processes_annotation: A dictionnary containing processes 2 annotations (tools, commands and modules) + personnal_acces_token: The Github personnal access token (this is to use the Github API with more requests per hour) """ def __init__(self, file, duplicate=False, display_info=True, output_dir = './results', - name = None, datePublished=None, description=None, - license = None, creativeWorkStatus = None, authors = None, - version = None, keywords = None, producer = None, - publisher = None, processes_2_remove = None, + name = None, processes_2_remove = None, processes_annotation = None, - personnal_acces_token = None, - processes_2_tools = None): + personnal_acces_token = None): if(not os.path.isfile(file)): nextflow_files = glob.glob(f'{file}/*.nf') if(len(nextflow_files)==0): raise BioFlowInsightError("No Nextflow files ('.nf') are in the directory!", num = -1) + #Try to read the main.nf file -> if this cannot be found then the first nextflow file is used try: file = '/'.join(nextflow_files[0].split('/')[:-1])+"/main.nf" with open(file, 'r') as f: @@ -60,42 +50,55 @@ class Workflow: except: file =nextflow_files[0] - self.processes_annotation = processes_annotation + + self.duplicate = duplicate + self.DSL = "" + self.display_info = display_info + self.output_dir = Path(output_dir) self.nextflow_file = Nextflow_File( file, - duplicate=duplicate, - display_info=display_info, - output_dir=output_dir, - workflow = self + origin = self ) + + self.workflow_directory = '/'.join(file.split('/')[:-1]) - self.output_dir = Path(output_dir) + + self.processes_annotation = processes_annotation self.rocrate = None - self.display_info = display_info + self.name = name - self.datePublished = datePublished - self.description = description - self.license = license - self.creativeWorkStatus = creativeWorkStatus - self.authors = authors - self.version = version - self.keywords = keywords - self.producer = producer - self.publisher = publisher self.tab_processes_2_remove = None self.personnal_acces_token = personnal_acces_token - self.processes_2_tools = processes_2_tools if(processes_2_remove==""): processes_2_remove = None self.processes_2_remove = processes_2_remove self.log = "" + self.fill_log() self.address = "" self.set_address() self.dico = {} self.get_dico() + def get_duplicate_status(self): + return self.duplicate + + def get_display_info(self): + return self.display_info + + def get_output_dir(self): + return Path(self.output_dir) + + def get_DSL(self): + return self.DSL + + def set_DSL(self, DSL): + self.DSL = DSL + + def get_is_a_git_repo(self): + return is_git_directory(path = self.get_repo_adress()) + def get_repo_adress(self): """Method that returns the adress of the workflow repository @@ -122,16 +125,18 @@ class Workflow: Keyword arguments: """ - current_directory = os.getcwd() - os.chdir(self.get_repo_adress()) - try: - os.system(f"git log --reverse > temp_{id(self)}.txt") - with open(f'temp_{id(self)}.txt') as f: - self.log = f.read() - os.system(f"rm temp_{id(self)}.txt") - except: - None - os.chdir(current_directory) + if(self.get_is_a_git_repo()): + current_directory = os.getcwd() + os.chdir(self.get_repo_adress()) + + try: + os.system(f"git log --reverse > temp_{id(self)}.txt") + with open(f'temp_{id(self)}.txt') as f: + self.log = f.read() + os.system(f"rm temp_{id(self)}.txt") + except: + None + os.chdir(current_directory) def get_address(self): """Method that returns the adress of the workflow main @@ -175,23 +180,26 @@ class Workflow: Keyword arguments: """ - current_directory = os.getcwd() - os.chdir(self.get_repo_adress()) - try: - if(self.personnal_acces_token!=None): - command = f'curl --silent --request GET --url "https://api.github.com/repos/{self.address}" --header "Authorization: Bearer {self.personnal_acces_token}" --header "X-GitHub-Api-Version: 2022-11-28" > temp_dico_{id(self)}.json' - else: - command = f'curl --silent --request GET --url "https://api.github.com/repos/{self.address}" > temp_dico_{id(self)}.json' - _ = os.system(command) - with open(f'temp_dico_{id(self)}.json') as json_file: - self.dico = json.load(json_file) - os.system(f"rm temp_dico_{id(self)}.json") - - except: - _ = os.system(f"rm temp_dico_{id(self)}.json") - os.chdir(current_directory) + if(self.get_is_a_git_repo()): + current_directory = os.getcwd() + os.chdir(self.get_repo_adress()) + try: + if(self.personnal_acces_token!=None): + command = f'curl --silent --request GET --url "https://api.github.com/repos/{self.address}" --header "Authorization: Bearer {self.personnal_acces_token}" --header "X-GitHub-Api-Version: 2022-11-28" > temp_dico_{id(self)}.json' + else: + command = f'curl --silent --request GET --url "https://api.github.com/repos/{self.address}" > temp_dico_{id(self)}.json' + _ = os.system(command) + with open(f'temp_dico_{id(self)}.json') as json_file: + self.dico = json.load(json_file) + os.system(f"rm temp_dico_{id(self)}.json") + + except: + _ = os.system(f"rm temp_dico_{id(self)}.json") + os.chdir(current_directory) + def set_name(self, name): + self.name = name def get_name(self): """Method that returns the name of the workflow @@ -200,7 +208,8 @@ class Workflow: """ if(self.name==None): - return self.nextflow_file.get_file_address().split("/")[-2] + self.set_name(self.nextflow_file.get_file_address().split("/")[-2]) + return self.name else: return self.name @@ -341,13 +350,6 @@ class Workflow: else: return None - def get_output_dir(self): - """Method that returns the output directory - - Keyword arguments: - - """ - return self.nextflow_file.get_output_dir() def get_file_address(self): """Method that returns the adress of the workflow main @@ -559,28 +561,26 @@ George Marchment, Bryan Brancotte, Marie Schmit, Frédéric Lemoine, Sarah Cohen def build_processes_2_tools(self): - if(self.processes_2_tools==None): - print() - print("Let's extarct the tools from the processes") - print("------------------------------------------") - print() - exiting_tools, existing_commands = [], [] - processes = self.get_processes_called() - dico = {} - index=0 - for p in processes: - print(f"* {index/len(processes)*100:.2f}% ({index}) processes annotated") - tools_found, commands_found, exiting_tools, existing_commands = get_tools_commands_from_user_for_process(p, exiting_tools, existing_commands) - dico[p.get_code()] = {} - dico[p.get_code()]["tools"] = tools_found - dico[p.get_code()]["commands"] = commands_found - index+=1 - self.processes_2_tools = dico - with open(f"{self.get_output_dir()}/processes_2_tools.json", 'w') as output_file : - json.dump(self.processes_2_tools, output_file, indent=2) - return self.processes_2_tools - else: - return self.processes_2_tools + print() + print("Let's extarct the tools from the processes") + print("------------------------------------------") + print() + exiting_tools, existing_commands = [], [] + processes = self.get_processes_called() + dico = {} + index=0 + for p in processes: + print(f"* {index/len(processes)*100:.2f}% ({index}) processes annotated") + tools_found, commands_found, exiting_tools, existing_commands = get_tools_commands_from_user_for_process(p, exiting_tools, existing_commands) + dico[p.get_code()] = {} + dico[p.get_code()]["tools"] = tools_found + dico[p.get_code()]["commands"] = commands_found + index+=1 + + with open(f"{self.get_output_dir()}/processes_2_tools.json", 'w') as output_file : + json.dump(dico, output_file, indent=2) + return dico + def get_number_subworkflows_process_dependency_graph(self):