diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000000000000000000000000000000000000..c326a12e2677df85e7d98cd125e191b595b6f004 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,40 @@ +[build-system] +requires = ["setuptools >= 61.0"] +build-backend = "setuptools.build_meta" + +[project] +name = "bioflow-insight" +authors = [ + { name="George Marchment", email="author@example.com" }, +] +description = "A tool to extract and analyze the structure and associated metadata from a Nextflow workflow." +readme = "README.md" +requires-python = ">=3.8" +classifiers = [ + "Programming Language :: Python :: 3", +# "License :: OSI Approved :: MIT License", + "Operating System :: OS Independent", +] +dynamic = ["version", "dependencies"] + +[tool.setuptools.dynamic] +dependencies = {file = ["requirements.txt"]} +optional-dependencies = {dev = { file = ["requirements.dev.txt"] }} +version = {attr = "src.__version__"} + +[project.scripts] +bioflow-insight = "bioflow_insight_cli.main:cli_command" + +[project.urls] +Homepage = "https://github.com/George-Marchment/Newtflow-Structure" +Issues = "https://github.com/George-Marchment/Newtflow-Structure/issues" + +[tool.setuptools] +packages = [ + 'src', + 'bioflow_insight_cli', +] + +[tool.black] +line-length = 120 +skip-string-normalization = true \ No newline at end of file diff --git a/requirements.dev.txt b/requirements.dev.txt new file mode 100644 index 0000000000000000000000000000000000000000..3377e1ec1d82ac347fec2427f3cc0478c5aeb99f --- /dev/null +++ b/requirements.dev.txt @@ -0,0 +1,4 @@ +build +twine +coverage +black~=23.12.0 \ No newline at end of file diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..e187593eac9c086b4e02971e376215fcfa42f7f9 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,18 @@ +#Python 3.10.12 + +graphviz==0.20.1 +networkx~=3.2.1; python_version >= '3.9' +networkx~=3.1; python_version == '3.8' +numpy~=1.26.1; python_version >= '3.9' +numpy~=1.24.4; python_version == '3.8' +click + +#Default Python packages +#os +#copy +#glob +#warnings +#datetime +#unittest +#re +#json diff --git a/run_tests.py b/run_tests.py new file mode 100644 index 0000000000000000000000000000000000000000..ddee4fd255619f7f1013cf83d04f264a9334602d --- /dev/null +++ b/run_tests.py @@ -0,0 +1,8 @@ +import unittest + +#Run all tests +if __name__ == '__main__': + test_loader = unittest.TestLoader() + test_suite = test_loader.discover('tests', pattern='test_*.py') + runner = unittest.TextTestRunner() + runner.run(test_suite) diff --git a/setup.py b/setup.py new file mode 100644 index 0000000000000000000000000000000000000000..25536050b29963bbfbb07ad400439bb16afbb82a --- /dev/null +++ b/setup.py @@ -0,0 +1,6 @@ +#!/usr/bin/env python + +from setuptools import setup + +if __name__ == "__main__": + setup() diff --git a/src/__init__.py b/src/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..f648d9b525971be1d7bcd6c1c3f077cd6190340d --- /dev/null +++ b/src/__init__.py @@ -0,0 +1 @@ +__version__ = 'v0.0.1-dev' diff --git a/src/bioflowinsighterror.py b/src/bioflowinsighterror.py new file mode 100644 index 0000000000000000000000000000000000000000..83ca4b7f3f5e5fa3d59e772f6897640d4359f780 --- /dev/null +++ b/src/bioflowinsighterror.py @@ -0,0 +1,49 @@ +# Custom exception raised by BioFlow-Insight +class BioFlowInsightError(Exception): + def __init__(self, error, num, origin = None): + self.origin = origin + #TODO -> add message at the end + if(origin!=None):
super().__init__(f"[{num}] Error in the file '{self.origin.get_file_address()}': "+error) + else: + super().__init__(f"[{num}] {error}") + +#To handle the different type of errors; I'm gonna add numbers to the errors +#Pair numbers if it's the users fault +#Odd if it's the bioflow-insight's fault +#This is really just to be able to do stats + +#In the case something can disputed between the two, i categorise it in the users fault +#Since in futur updates i could handle when the tool makes a mistake, but i won't have +#to update the errors -> for example the numnber of parameters for a call +#In the current version, i can't handle implicit parameter (eg. multiple values in the channel) +#In any case, there is always a different way of writing it. + +######################## +# PAIR +######################## +#* [2] -> not the same number of parameters given for a process or a subworkflow +#* [4] -> a channel is trying to be created with a name already given to something else +#* [6] -> multiple channels were given by an emit eventhough only expecting one +#* [8] -> tried to acces an emit even though the thing has not been called +#* [10] -> tried to include a file which doesn't exist +#* [12] -> an include was present in a main or subworkflow +#* [14] -> in a pipe operator, the first thing called is unknown +#* [16] -> syntaxe error in the code +#* [18] -> something is expected to be defined in a file but is not +#* [20] -> The sibworkflow either emits nothing or to many values for a use in an operation +#* [22] -> a subworkflow or process defined was defined badly + + +######################## +# ODD +######################## +#* [1] -> presence of an import java or groovy (NOT USED RIGHT NOW) +#* [3] -> unkonwn thing in a pipe operator +#* [5] -> A ternary conditional operator was used with an tuple + + + + + + \ No newline at end of file diff --git a/src/call.py b/src/call.py new file mode 100644 index 0000000000000000000000000000000000000000..7d2eeea6ee5763e822c97a05ef2d7ba7139286f8 --- /dev/null +++ b/src/call.py @@ -0,0 +1,424 @@ +import re +import json + +from .code_ import Code +from .outils import get_next_param +from .executor import Executor +from .bioflowinsighterror import BioFlowInsightError +from . 
+ + +class Call(Executor): + def __init__(self, code, origin, OG_code = ''): + self.code = Code(code = code, origin = self) + self.origin = origin + self.called = [] + self.first_element_called = None + self.parameters = []#These are kept in the order they are given + self.OG_code = OG_code + + def __str__(self): + return f"Call_{id(self)}" + + + + def get_code(self, clean_pipe = False, get_OG=False): + if(get_OG): + if(self.OG_code==''): + return self.code.get_code() + return self.OG_code + if(clean_pipe): + return self.clean_pipe_operator(self.code.get_code()) + else: + return self.code.get_code() + + + def get_type(self): + return "Call" + + + def get_first_element_called(self): + return self.first_element_called + + def get_elements_called(self, tab = None): + #Default to None to avoid the shared mutable default argument pitfall + if(tab==None): + tab = [] + tab += [self.first_element_called] + for para in self.parameters: + if(para.get_type()=="Call"): + tab = para.get_elements_called(tab) + return list(set(tab)) + + + def get_code_split_space(self, code): + to_add_spaces = ['(', ')', '}', '{'] + for character in to_add_spaces: + code = code.replace(f'{character}', f' {character} ') + return code.split() + + def analye_parameters(self, param): + + #Step 1 -> get parameters + tab_params, start, next_param = [], 0, None + temp_param = param + while(start!=-1): + temp_param = temp_param[start:] + next_param, start = get_next_param(temp_param) + tab_params.append(next_param.strip()) + + #Step 2 -> analyse parameters + for param in tab_params: + analysed_param = False + + if param!='': + #Case it's a channel + if(re.fullmatch(constant.WORD, param) and not analysed_param): + #if(re.fullmatch(constant.WORD, param) and not analysed_param or param in ['[]'] or param[:7]=="params."): + from .channel import Channel + channel = Channel(name=param, origin=self.origin) + if(not self.origin.check_in_channels(channel)): + self.origin.add_channel(channel) + else: + channel = self.origin.get_channel_from_name(param) + #TODO -> check this + channel.add_sink(self) + self.parameters.append(channel) + analysed_param = True + else: + from .executor import Executor + executor = Executor(param, self) + executor = executor.return_type() + if(executor.get_type()=="Call"): + temp_call = executor + temp_call.initialise() + self.parameters.append(temp_call) + elif(executor.get_type()=="Operation"): + ope = executor + ope.initialise_from_call() + #Case is an Emitted -> there's only one value given and it's an emitted + if(ope.check_if_operation_is_an_full_emitted() and len(ope.get_gives())==1 and ope.get_gives()[0].get_type()=="Emitted"): + emit = ope.get_gives()[0] + self.parameters.append(emit) + else: + self.parameters.append(ope) + else: + raise Exception(f"I don't know what type '{param}' is!") + + + def get_nb_outputs(self): + first=self.get_first_element_called() + if(first.get_type()=="Process"): + return first.get_nb_outputs() + elif(first.get_type()=="Subworkflow"): + return first.get_nb_emit() + raise Exception("This shouldn't happen!") + + + def get_structure(self, dico): + if(self.get_first_element_called().get_type()=="Process"): + process = self.get_first_element_called() + dico['nodes'].append({'id':str(process), 'name':process.get_alias(), "shape":"ellipse", 'xlabel':"", "fillcolor":""}) + + def add_parameter(p): + #Case parameter is a channel + if(p.get_type()=="Channel"): + channel = p + channel.get_structure(dico, B=process) + + #Case parameter is an Emitted + elif(p.get_type()=="Emitted"): + emitted = p + emitted.get_structure(dico, B=process) + + #Case parameter is an Operation
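+                        #(an Operation passed as an argument is drawn as an incoming edge of the process node)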
elif(p.get_type()=="Operation"): + operation = p + if(operation.show_in_structure): + operation.get_structure(dico) + dico["edges"].append({'A':str(operation), 'B':str(process), "label":""}) + + #Case parameter is a Call + elif(p.get_type()=="Call"): + call = p + call.get_structure(dico) + #Case the first call is a process + if(call.get_first_element_called().get_type()=="Process"): + for output in call.get_first_element_called().get_outputs(): + dico["edges"].append({'A':str(call.get_first_element_called()), 'B':str(process), "label":""})#TODO check name of channel + #Case the first call is a subworkflow + elif(call.get_first_element_called().get_type()=="Subworkflow"): + for emit in call.get_first_element_called().get_emit(): + dico["edges"].append({'A':str(emit), 'B':str(process), "label":""})#TODO check name of channel + + else: + raise Exception(f"Type '{p.get_type()}' was given as a parameter -> I don't know how to handle this!") + + #If the name number of parameters are given + if(len(self.parameters)==process.get_nb_inputs()): + for p in self.parameters: + add_parameter(p) + #If they are not -> we check that the right number isn't implied + else: + num_inputs = 0 + for p in self.parameters: + if(p.get_type()=="Call"): + num_inputs+= p.get_nb_outputs() + elif(p.get_type()=="Emitted"): + emitted = p + if(emitted.get_emitted_by().get_type()=="Subworkflow"): + if(emitted.get_emits()==None): + num_inputs+= emitted.get_emitted_by().get_nb_emit() + else: + num_inputs+=1 + elif(emitted.get_emitted_by().get_type()=="Process"): + if(emitted.get_emits()==None): + num_inputs+= emitted.get_emitted_by().get_nb_outputs() + else: + num_inputs+=1 + else: + raise Exception("This shoudn't happen") + else: + #Cause in case channel, operation or emit, it is only one channel given + num_inputs+=1 + if(num_inputs==process.get_nb_inputs()): + for p in self.parameters: + add_parameter(p) + + else: + raise BioFlowInsightError(f"Not the same number of parameters given as input for the process '{process.get_alias()}'{self.get_string_line(self.get_code(get_OG=True))}.", num=2, origin=self) + + elif(self.get_first_element_called().get_type()=="Subworkflow"): + sub = self.get_first_element_called() + + temp_dico = {} + temp_dico['nodes'] = [] + temp_dico['edges'] = [] + temp_dico['subworkflows'] = {} + sub.get_structure(temp_dico) + dico['subworkflows'][sub.get_alias()] = temp_dico + param_index = 0 + + def add_parameter(p, param_index): + sub_input = sub.get_takes()[param_index] + #Case parameter is a channel + if(p.get_type()=="Channel"): + channel = p + channel.get_structure(dico, B=sub_input) + + #Case parameter is a Emitted + elif(p.get_type()=="Emitted"): + emitted = p + emitted.get_structure(dico, B=sub_input) + + #Case parameter is a Operation + elif(p.get_type()=="Operation"): + operation = p + if(operation.show_in_structure): + operation.get_structure(dico) + dico["edges"].append({'A':str(operation), 'B':str(sub_input), "label":""}) + + #Case parameter is a Call + elif(p.get_type()=="Call"): + call = p + call.get_structure(dico) + #Case the first call is a process + if(call.get_first_element_called().get_type()=="Process"): + for output in call.get_first_element_called().get_outputs(): + dico["edges"].append({'A':str(call.get_first_element_called()), 'B':str(sub_input), "label":""})#TODO check name of channel + #Case the first call is a subworkflow + elif(call.get_first_element_called().get_type()=="Subworkflow"): + for emit in call.get_first_element_called().get_emit(): + 
dico["edges"].append({'A':str(emit), 'B':str(sub_input), "label":""})#TODO check name of channel + + else: + raise Exception(f"Type '{p.get_type()}' was given as a parameter -> I don't know how to handle this!") + param_index+=1 + return param_index + + #If the name number of parameters are given + if(len(self.parameters)==sub.get_nb_takes()): + for p in self.parameters: + param_index = add_parameter(p, param_index ) + #If they are not -> we check that the right number isn't implied + else: + num_inputs = 0 + for p in self.parameters: + if(p.get_type()=="Call"): + num_inputs+= p.get_nb_outputs() + else: + #Cause in case channel, operation or emit, it is only one channel given + num_inputs+=1 + if(num_inputs==sub.get_nb_takes()): + for p in self.parameters: + param_index = add_parameter(p, param_index ) + + else: + raise BioFlowInsightError(f"Not the same number of parameters given as input for the subworklfow '{sub.get_alias()}' in the call{self.get_string_line(self.get_code())}.", num = 2, origin=self) + + + elif(self.get_first_element_called().get_type()=="Function"): + None + + else: + raise Exception(f"This shoudn't happen! is type") + + #This function synthaxes the one above -> needs to be rechecked + def get_structure_2(self, dico): + + def add_parameter(p, to_link): + + #Case parameter is a channel + if(p.get_type()=="Channel"): + channel = p + channel.get_structure(dico, B=to_link) + + #Case parameter is a Emitted + elif(p.get_type()=="Emitted"): + emitted = p + emitted.get_structure(dico, B=to_link) + + #Case parameter is a Operation + elif(p.get_type()=="Operation"): + operation = p + operation.get_structure(dico) + dico["edges"].append({'A':str(operation), 'B':str(to_link), "label":""}) + + #Case parameter is a Call + elif(p.get_type()=="Call"): + call = p + call.get_structure(dico) + #Case the first call is a process + if(call.get_first_element_called().get_type()=="Process"): + for output in call.get_first_element_called().get_outputs(): + dico["edges"].append({'A':str(call.get_first_element_called()), 'B':str(to_link), "label":""})#TODO check name of channel + #Case the first call is a subworkflow + elif(call.get_first_element_called().get_type()=="Subworkflow"): + for emit in call.get_first_element_called().get_emit(): + dico["edges"].append({'A':str(emit), 'B':str(to_link), "label":""})#TODO check name of channel + + else: + raise Exception(f"Type '{p.get_type()}' was given as a parameter -> I don't know how to handle this!") + + + first_call = self.get_first_element_called() + param_index = 0 + if(first_call.get_type()=="Process" or first_call.get_type()=="Subworkflow"): + if(first_call.get_type()=="Process"): + dico['nodes'].append({'id':str(first_call), 'name':first_call.get_alias(), "shape":"ellipse", 'xlabel':"", 'fillcolor':''}) + else: + temp_dico = {} + temp_dico['nodes'] = [] + temp_dico['edges'] = [] + temp_dico['subworkflows'] = {} + first_call.get_structure(temp_dico) + dico['subworkflows'][first_call.get_alias()] = temp_dico + + #If the name number of parameters are given + if(len(self.parameters)==first_call.get_nb_inputs()): + for p in self.parameters: + if(first_call.get_type()=="Subworklow"): + sub_input = first_call.get_takes()[param_index] + add_parameter(p, sub_input) + param_index+=1 + else: + add_parameter(p, first_call) + #If they are not -> we check that the right number isn't implied + else: + num_inputs = 0 + for p in self.parameters: + if(p.get_type()=="Call"): + num_inputs+= p.get_nb_outputs() + else: + #Cause in case channel, operation or emit, 
+ num_inputs+=1 + if(num_inputs==first_call.get_nb_inputs()): + for p in self.parameters: + if(first_call.get_type()=="Subworkflow"): + sub_input = first_call.get_takes()[param_index] + add_parameter(p, sub_input) + param_index+=1 + else: + add_parameter(p, first_call) + + else: + raise Exception(f"Not the same number of parameters given as input for the process '{first_call.get_alias()}' in the call ('{self.get_code()}')") + + + + def analyse_call(self, call): + tab_call = self.get_code_split_space(call) + if(re.fullmatch(constant.WORD, tab_call[0]) and tab_call[1]=='('): + #params1 = ' '.join(tab_call[2:-1]) + start = re.findall(tab_call[0]+constant.END_CALL, call)[0] + params = call.replace(start, "") + if(params[-1]==')'): + params = params[:-1] + else: + print(self.get_code()) + raise Exception("This shouldn't happen") + + self.analye_parameters(params) + process = self.get_process_from_name(tab_call[0]) + subworkflow = self.get_subworkflow_from_name(tab_call[0]) + fun = self.get_function_from_name(tab_call[0]) + if(process!=None and subworkflow==None and fun==None): + self.first_element_called = process + if(process==None and subworkflow!=None and fun==None): + self.first_element_called = subworkflow + if(process==None and subworkflow==None and fun!=None): + self.first_element_called = fun + if(process==None and subworkflow==None and fun==None): + raise Exception("No first call found!!") + self.called.append(self.first_element_called) + else: + print(self.get_file_address()) + raise Exception(f"Call didn't match pattern '{call}'") + + + def get_called(self): + tab = self.called + for params in self.parameters: + if(isinstance(params, Call)): + tab += params.get_called() + #TODO -> check this + tab = list(set(tab)) + return tab + + + def write_summary(self, tab=0): + #Use a context manager so the file handle is closed properly + with open(f"{self.get_output_dir()}/debug/calls.nf", "a") as file: + file.write(" "*tab+f"{self}"+"\n") + file.write(" "*(tab+1)+"* Called "+str(self.get_called())+"\n") + file.write(" "*(tab+1)+"* Code : "+ str(self.get_code())+"\n") + file.write(" "*(tab+1)+"* Parameters"+"\n") + for p in self.parameters: + file.write(" "*(tab+3)+p.get_code()+f" '{p.get_type()}'"+"\n") + file.write("\n") + + def add_call_count(self): + if(self.get_first_element_called().get_type()=="Process"): + process = self.get_first_element_called() + with open(f"{self.get_output_dir()}/debug/processes_used.json") as json_file: + dico = json.load(json_file) #renamed from 'dict' to avoid shadowing the built-in + try: + dico[process.get_file_address()] + except KeyError: + dico[process.get_file_address()] = [] + dico[process.get_file_address()].append(process.get_code()) + with open(f"{self.get_output_dir()}/debug/processes_used.json", "w") as outfile: + json.dump(dico, outfile, indent=4) + elif(self.get_first_element_called().get_type()=="Subworkflow"): + None + #TODO + elif(self.get_first_element_called().get_type()=="Function"): + None + #TODO + else: + raise Exception(f"I don't know what to do with '{self.get_first_element_called().get_type()}' in the call '{self.get_code()}' (in file '{self.get_file_address()}')") + + def initialise(self): + self.analyse_call(self.get_code(clean_pipe = True)) + self.write_summary() + self.add_call_count() + + + + diff --git a/src/channel.py b/src/channel.py new file mode 100644 index 0000000000000000000000000000000000000000..244cef9739b61561fa97e330cc6d0b7aeb3aea43 --- /dev/null +++ b/src/channel.py @@ -0,0 +1,50 @@ +from .nextflow_building_blocks import Nextflow_Building_Blocks +from .bioflowinsighterror import BioFlowInsightError + +class Channel(Nextflow_Building_Blocks):
+ def __init__(self, name, origin): + self.name = name + self.origin = origin + to_call = self.get_name_processes_subworkflows() + if(self.name in to_call): + raise BioFlowInsightError(f"'{self.name}' is trying to be created as a channel{self.get_string_line(self.origin.get_code())}. It already exists as a process or a subworkflow in the nextflow file.", num = 4, origin=self) + self.source = [] + self.sink = [] + + + def get_code(self): + return self.name.strip() + + def add_source(self, source): + self.source.append(source) + + def add_sink(self, sink): + self.sink.append(sink) + + def set_sink_null(self): + self.sink = [] + + def get_type(self): + return "Channel" + + def equal(self, channel): + #Compare against the other channel's origin (was a no-op self comparison) + return (self.name==channel.name and self.origin==channel.origin) + + def get_source(self): + return self.source + + def remove_element_from_sink(self, ele): + self.sink.remove(ele) + + def get_sink(self): + return self.sink + + def get_name(self): + return self.name + + def get_structure(self, dico, B): + for source in self.get_source(): + dico["edges"].append({'A':str(source), 'B':str(B), "label":self.get_name()}) + + + diff --git a/src/code_.py b/src/code_.py new file mode 100644 index 0000000000000000000000000000000000000000..5db4f1b98ceb70eadd4909a4bee35bb15186ecc7 --- /dev/null +++ b/src/code_.py @@ -0,0 +1,57 @@ +from .outils import remove_comments +from .bioflowinsighterror import BioFlowInsightError +import re +from . import constant + +class Code: + def __init__(self, code, origin): + self.code = code + self.code_wo_comments = "" + self.origin = origin + self.initialise() + #self.check_its_nextflow() + + + def initialise(self): + #I do this just to avoid out-of-file problems later on + self.code = '\n'+self.code+'\n' + self.code_wo_comments = remove_comments(self.code) + self.code_wo_comments = re.sub(constant.BACKSLAPSH_JUMP, ' ', self.code_wo_comments) + self.code_wo_comments = self.code_wo_comments.replace("||", "$OR$") + + + def check_its_nextflow(self): + for illegal in constant.ILLEGAL_IMPORTS: + for match in re.finditer(constant.START_IMPORT+illegal, self.get_code()): + bit_of_code = match.group(0) + raise BioFlowInsightError(f"The presence of '{bit_of_code}' is detected{self.get_string_line(bit_of_code)}.", num = 1,origin=self) + + + def get_line(self, bit_of_code): + code = remove_comments(self.code) + index = code.find(bit_of_code) + if(index!=-1): + line = code[:index].count('\n') + if(line==0): + return 1 + return line + return -1 + + def get_string_line(self, bit_of_code): + line = self.get_line(bit_of_code) + line_error = '' + if(line!=-1): + line_error = f", possibly at line {line}" + return line_error + + + #Returns the code without comments + def get_code(self, get_OG =False): + if(get_OG): + return self.code.strip() + else: + return self.code_wo_comments.strip() + + def get_file_address(self): + return self.origin.get_file_address() + \ No newline at end of file diff --git a/src/constant.py b/src/constant.py new file mode 100644 index 0000000000000000000000000000000000000000..9badf13580651123d61360192ce8f05f7b5ec716 --- /dev/null +++ b/src/constant.py @@ -0,0 +1,124 @@ +#========================== +# CONSTANT VARIABLES +#========================== + +ERROR_WORDS = ['null', "params", "log", "workflow", "it", "config"] + +ERROR_WORDS_ORIGINS = ['channel', 'Channel', 'null', "params", "logs", "workflow", "log", + "false", "true", "False", "True", + "it", "config"] + +ILLEGAL_IMPORTS = ["groovy", "java"] + +LIST_AS = ["as ", "As ", "AS ", "aS "]
"] + +LIST_OPERATORS = ["distinct", "filter", "first", "last", "randomSample", "take", "unique", + "until","buffer","collate","collect","flatten","flatMap","groupBy","groupTuple","map","reduce","toList","toSortedList","transpose", + "splitCsv","splitFasta","splitFastq","splitText", + "cross","collectFile","combine","concat","join","merge","mix","phase","spread","tap", + "branch","choice","multiMap","into","separate","tap", + "count","countBy","min","max","sum","toInteger", + "close","dump","ifEmpty","print","println","set","view", + "empty", "of", "fromPath", "fromList", "subscribe", "value", "from"]#This last line is added by me:) + + +#========================== +# PATTERNS +#========================== + +# CALLS +#-------------------------- +BEGINNING_CALL = r"(\w+)\s*\(" +CALL_ID = r"Call_\d+" +END_CALL = r'\s*\(' + +# CHANNEL +#-------------------------- +CHANNEL_TAB = r"(\w+) *\[[ \d\'\"]+\]" + + +# EMIT +#-------------------------- +EMIT_ALONE = r"(\w+)\s*\.\s*(output|out)[^\w]" +EMIT_ALONE_2 = r"(\w+)\s*\.\s*(output|out)[^\w]" +EMIT_EQUALS = r"\w+\s*=\s*((\w+)\s*\.\s*(output|out))[^\w]" +EMIT_NAME = r'(\w+)\s*\.\s*(output|out)\s*\.\s*(\w+)' +EMIT_OPERATION = r"(\w+)\s*\.\s*(output|out)\s*[^\w]" +EMIT_TAB = r'(\w+)\s*\.\s*(output|out)\s*\[\s*(\d+)\s*\]' + + +# FUNCTION +#-------------------------- +HEADER_FUNCTION = r"(def)\s*(\w+)\s*\([^,)]*(,[^,)]+)*\)\s*{" + +# GENERAL +#-------------------------- +BACKSLAPSH_JUMP = r"\\\s*\n\s*" +JUMP_DOT = r"\s*\n\s*\." +NUMBER = r"\d+" +TUPLE_EQUALS = r"(\n|;)\s*(\( *\w+( *, *\w+)+ *\) *=)" +WORD = r'\w+' +WORD_EQUALS = r"(\w+)\s*=" +WORD_EQUALS_2 = r"(\n|;)\s*(\w+ *=)" +WORD_DOT = r'\w+\s*\.' + +LIST_EQUALS = [TUPLE_EQUALS, WORD_EQUALS_2] + +# IMPORTS +#-------------------------- +START_IMPORT = r'import\s+' + +# INLUCES +#-------------------------- +FULL_INCLUDE = r"include *({([^\}]+)}| +(\w+)) +from +([^\n ]+)" +FULL_INLCUDE_2 = r"include *({([^\}]+)}| +(\w+)| +(\w+ +(as|As|AS|aS) +\w+)) +from +([^\n ]+)" +INCLUDE_AS = r"(\w+) +(as|AS|As|aS) +(\w+)" + +# OPERATION +#-------------------------- +CHANNEL_EQUALS = r'\w+\s*=\s*(\w+)' +CHANNEL_EQUALS_LIST = r'\w+\s*=\s*\[(.+)\]' +CHANNEL_EQUALS_OPERATION = r'\w+\s*=\s*(\w+)\s*\.' +CHANNEL_EQUALS_SOMETHING = r"\w+\s*=(.|\s)+" +DOT_OPERATOR = r"\.\s*(\w+)\s*(\(|{)" +DOUBLE_DOT = r"(\w+)\s*=\s*([^\?\n]+)\s*\?([^\n]+)" +DOUBLE_DOT_TUPLE = r"\(\s*\w+\s*(,\s*\w+\s*)+\)\s*=\s*([^\?\n]+)\s*\?([^\n]+)" +END_OPERATOR = r' *(\(|{)' +ILLEGAL_CHARCTER_BEFORE_POTENTIAL_CHANNELS = r"\w|\'|\"|\." +MERGE_OPERATIONS = r'\.\s*((merge|mix|concat|spread|join|phase|cross|combine|fromList|collect|fromPath|value|from)\s*(\(|\{))' +OPERATOR_IN_PIPE = r"\w+ *{[^}]*}|\w+ *\([^\)]*\)|\w+" +SET_OPERATORS = ["choice", "separate", "tap", "into", "set"] +TUPLE_EQUALS = r'\( *\w+( *, *\w+)+ *\) *=\s*(\w+)\s*\.' +TUPLE_EQUALS_SOMETHING = r"(\( *\w+( *, *\w+)+ *\)) *=(.|\s)+" + +# PIPE +#-------------------------- +BEGINNING_PIPE_OPERATOR = r"[\w\.\[\]]+(\s+\|\s+\w+)+" +END_PIPE_OPERATOR = r"\s*(\s*\|\s*\w+)+" + + +# PROCESS +#-------------------------- +FILE = r'file +(\w+) *\n' +FROM = r'from([^\n]+)\n' +INPUT = r"\n\s*input *:" +INTO = r'into +([\w, ]+)' +INTO_2 = r'into +\(?( *\w+ *(, *\w+)*) *\)?' 
+OUTPUT = r"\n\s*output *:" +PROCESS_HEADER = r'process\s+(\w+|\'[\w ]+\'|\"[\w ]+\")\s*{' +SCRIPT = r"\n\s*script *:|shell *:|exec *:|\"\"\"|\'\'\'" +WHEN = r"\n\s*when *:" + + +# SUBWORKFLOW +#-------------------------- +EMIT_SUBWORKFLOW = r"emit *\:" +MAIN = r"\smain *\:\s" +TAKE = r"take *\:" +SUBWORKFLOW_HEADER = r'workflow +(\w+|\'[\w ]+\'|\"[\w ]+\") *{' + +# WORKFLOW +#-------------------------- +WORKFLOW_HEADER = r"workflow\s*\{" +WORKFLOW_HEADER_2 = r'[^\w](workflow\s*{)' + diff --git a/src/emitted.py b/src/emitted.py new file mode 100644 index 0000000000000000000000000000000000000000..3779c95585e84426590e67d4bbac6fd4c72c89a5 --- /dev/null +++ b/src/emitted.py @@ -0,0 +1,95 @@ +import re +from .channel import Channel +from .bioflowinsighterror import BioFlowInsightError +from . import constant + + +class Emitted(Channel): + + def __init__(self, name, origin, emitted_by): + Channel.__init__(self, name=name, origin=origin) + + self.emitted_by = emitted_by + if(not emitted_by.is_initialised()): + emitted_by.initialise() + + self.source.append(emitted_by) + self.emits = None #-> this is the channel it emits -> in the case of a subworkflow + + def get_emitted_by(self): + return self.emitted_by + + def get_emits(self): + return self.emits + + def get_type(self): + return "Emitted" + + def set_emits_decimal(self, decimal): + self.emits = self.emitted_by.get_emit()[decimal] + + def set_emits_name(self, name): + emitted = self.emitted_by.get_emit() + + for o in emitted: + code = o.get_code() + if(code[:len("emit:")]=="emit:"): + code = code[len("emit:"):].strip() + if(name==code): + self.emits = o + else: + for match in re.finditer(constant.WORD_EQUALS, code): + if(name==match.group(1)): + self.emits = o + + if(self.emits==None): + print(self.get_code()) + raise Exception(f"No emitted matched with '{name}' (in file '{self.get_file_address()}'). Should match with emits from '{self.emitted_by.get_name()}' (in file '{self.emitted_by.get_file_address()}')") + + def set_emits(self, input): + if(input!=""): + try: + input = int(input) + self.set_emits_decimal(decimal=input) + except: + self.set_emits_name(name=input) + else: + #TODO -> check this + if(self.emitted_by.get_type()=='Process'): + #self.emits = self.emitted_by + None + elif(self.emitted_by.get_type()=='Subworkflow'): + if(len(self.emitted_by.emit)!=1): + raise BioFlowInsightError(f"One channel was expected in the emit '{self.get_code()}'. However, multiple emits are defined for the subworkflow '{self.emitted_by.get_name()}'.", num=6, origin=self)
+ self.emits = self.emitted_by.emit[0] + else: + raise Exception("This shouldn't happen!") + + def get_structure(self, dico, B): + emits = self.get_emitted_by() + if(not emits.is_called(self)): + end = "in the file" + if(self.origin.get_type()=="Subworkflow"): + end = f"in the subworkflow '{self.origin.get_name()}'" + raise BioFlowInsightError(f"Tried to access the emit '{self.get_code()}' but the {emits.get_type()} '{emits.get_name()}' has not been called {end}.", num = 8, origin=self) + + + #Case if the emit emits a process + if(emits.get_type()=="Process"): + if(self.emits==None): + #for i in range(emits.get_nb_outputs()): + # print("here") + # #I don't need to add the process (node) to the structure -> cause it's either there or will be added later on + dico["edges"].append({'A':str(emits), 'B':str(B), "label":self.get_code()}) + else: + dico["edges"].append({'A':str(self.emits), 'B':str(B), "label":self.get_code()}) + #Case if the emit emits a subworkflow + elif(emits.get_type()=="Subworkflow"): + if(self.emits==None): + raise Exception("Just a check") + for ope in emits.get_emit(): + dico["edges"].append({'A':str(ope), 'B':str(B), "label":self.get_code()}) + else: + dico["edges"].append({'A':str(self.emits), 'B':str(B), "label":self.get_name()}) + + diff --git a/src/executor.py b/src/executor.py new file mode 100644 index 0000000000000000000000000000000000000000..15a7c58c785ea21ab930d52aedbf58e8fe5a0335 --- /dev/null +++ b/src/executor.py @@ -0,0 +1,197 @@ +import re + +from . import constant + +from .code_ import Code +from .outils import update_parameters, get_curly_count, get_parenthese_count, checks_in_string +from .nextflow_building_blocks import Nextflow_Building_Blocks +from .bioflowinsighterror import BioFlowInsightError + + + + +#TODO +#- make everything uniform here +#- add a list of words illegal for channels e.g. [true, process, workflow...]
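+#A rough sketch (assumed inputs, not taken from the codebase) of what clean_pipe_operator below rewrites:
+#    "ch | unique | FOO"  ->  "FOO(ch.unique())"   (FOO being a known process)
+#    "out = a | FOO"      ->  "out = FOO(a)"       (a 'word =' head is kept as-is)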
+ + +class Executor(Nextflow_Building_Blocks): + def __init__(self, code, origin): + self.origin = origin + self.code = Code(code = code, origin = self) + + + #--------------------------------- + #AUXILIARY METHODS FOR ALL CLASSES + #--------------------------------- + + def get_list_name_processes(self): + return self.origin.get_list_name_processes() + + def get_process_from_name(self, name): + return self.origin.get_process_from_name(name) + + def get_subworkflow_from_name(self, name): + return self.origin.get_subworkflow_from_name(name) + + def get_function_from_name(self, name): + return self.origin.get_function_from_name(name) + + def get_list_name_subworkflows(self): + return self.origin.get_list_name_subworkflows() + + def get_list_name_includes(self): + return self.origin.get_list_name_includes() + + def add_channel(self, channel): + self.origin.add_channel(channel) + + def check_in_channels(self, channel): + return self.origin.check_in_channels(channel) + + def get_channel_from_name(self, channel): + return self.origin.get_channel_from_name(channel) + + def get_executors(self): + return self.origin.get_executors() + + + + def get_file_address(self): + return self.origin.get_file_address() + + + def get_code(self, get_OG=False): + if(get_OG): + if(self.OG_code==""): + return self.code.get_code() + else: + return self.OG_code + else: + return self.code.get_code() + + def clean_pipe_operator(self, pipe): + + #Replace the || temporarily because we don't want to analyse them + to_replace_double_pipe = [] + found_or = True + while(found_or): + found_or = False + if(pipe.find("||")!=-1): + new_tag = f'{str(self)}_OR_{len(to_replace_double_pipe)}' + pipe = pipe.replace('||', new_tag, 1) + to_replace_double_pipe.append(new_tag) + found_or = True + + + head = '' + if(pipe.find("=")!=-1): + if(bool(re.fullmatch(constant.WORD, pipe.split("=")[0].strip()))): + head = f'{pipe.split("=")[0].strip()} = ' + pipe = "=".join(pipe.split("=")[1:]) + + + to_call = self.get_list_name_processes()+self.get_list_name_subworkflows()+self.get_list_name_includes() + searching = True + to_replace = [] + while(searching): + if(pipe.find('|')==-1): + searching=False + else: + #If the pipe operator is in a string we replace it with something temporary + if(checks_in_string(pipe, '|')):#It selects the first one + new_tag = f'{str(self)}_{len(to_replace)}' + pipe = pipe.replace('|', new_tag, 1) + to_replace.append(new_tag) + #If it is not in a string + else: + pipe_split = pipe.split('|') + first_executor = pipe_split[0].strip() + first_pipe = pipe_split[1] + left_side = first_executor + right_side = "|".join(pipe_split[1:]) + thing = first_pipe.strip() + #This is to test if it's actually a pipe operator and not just an || + if(get_parenthese_count(left_side)==0 and get_parenthese_count(right_side)==0 and get_curly_count(left_side)==0 and get_curly_count(right_side)==0): + #thing needs to follow the pattern for the pipe operator + + if(thing in to_call): + if(len(pipe_split[2:])==0): + pipe = f"{thing}({first_executor})" + searching = False + else: + pipe = f"{thing}({first_executor})" + '|'+ '|'.join(pipe_split[2:]) + elif(thing in constant.LIST_OPERATORS): + if(len(pipe_split[2:])==0): + pipe = f"{first_executor}.{thing}()" + searching = False + else: + pipe = f"{first_executor}.{thing}()" + '|'+'|'.join(pipe_split[2:]) + else: + added = False + for operator in constant.LIST_OPERATORS: + if(thing[:len(operator)]==operator and not added): + added = True + #This is in the case "channel | map {dfvfdvd}" + pipe = f"{first_executor}.{thing}"
f"{first_executor}.{thing}" + #If there is still operations remaining we add them + if('|'.join(pipe_split[2:])!=""): + pipe = pipe + '|'+'|'.join(pipe_split[2:]) + if not added: + if(re.fullmatch(constant.OPERATOR_IN_PIPE, thing)): + print(pipe, self.get_file_address()) + print(f"'{thing}'") + raise Exception('problem') + raise BioFlowInsightError(f"Don't know how to handle '{thing}' in a pipe operator{self.get_string_line(thing)}. Try using the recommended operator composition.", num=3,origin = self) + + else: + pipe = str(self).join([left_side, right_side]) + + for tag in to_replace: + pipe = pipe.replace(tag, '|') + for tag in to_replace_double_pipe: + pipe = pipe.replace(tag, '||') + return (head+pipe).replace(str(self), '|', 1) + + + #This method analyses if the executor if an operation or a call, and returns + #the correct object corresponding to it + #TO do this we search if an operator is in parenthes or not + #If it's an operation the executor should be outside the parentheses + #If it's a call the operator should be inside the parentheses + def return_type(self): + list_things_to_call = self.get_name_processes_subworkflows() + is_operation =False + code = self.get_code() + code = code.replace(' ', '') + #Case for sure operation (it doesn't start with a call) + if(code.split('(')[0] not in list_things_to_call): + is_operation = True + + if(not is_operation): + curly_count, parenthese_count = 0, 0 + quote_single, quote_double = False, False + end=0 + while(end<len(code)): + curly_count, parenthese_count, quote_single, quote_double = update_parameters(code, end, curly_count, parenthese_count, quote_single, quote_double) + + if(curly_count==0 and parenthese_count==0 and quote_single==False and quote_double==False): + if(code[end]=="."): + for operator in constant.LIST_OPERATORS: + try: + if(code[end:end+len(operator)+1]=="."+operator): + is_operation=True + except: + None + end+=1 + + #If it is type operation -> the funtion returns the operation + if(is_operation): + from .operation import Operation + return Operation(self.get_code(), self.origin) + #Else it is an operation + else: + from .call import Call + return Call(self.get_code(), self.origin) + diff --git a/src/function.py b/src/function.py new file mode 100644 index 0000000000000000000000000000000000000000..9c55e969ae75cf2c9765c3ad81c3a0bb9bddb2a5 --- /dev/null +++ b/src/function.py @@ -0,0 +1,27 @@ + +from .code_ import Code +from .nextflow_building_blocks import Nextflow_Building_Blocks + + + +class Function(Nextflow_Building_Blocks): + def __init__(self, code, name, origin): + self.origin = origin + self.code = Code(code, origin = self) + self.name = name + self.alias = name + + def set_alias(self, alias): + self.alias = alias + + def get_alias(self): + return self.alias + + def get_type(self): + return "Function" + + def get_name(self): + return self.name + + + diff --git a/src/graph.py b/src/graph.py new file mode 100644 index 0000000000000000000000000000000000000000..73cca9fb6e1f10617376faef9bfdf96175e855ca --- /dev/null +++ b/src/graph.py @@ -0,0 +1,499 @@ + +import json +import networkx as nx +import numpy as np +import copy + +from .outils_graph import * + +class Graph(): + def __init__(self, nextflow_file): + self.workflow = nextflow_file + self.full_dico = nextflow_file.get_structure() + with open(f"{self.get_output_dir()}/graphs/full_graph_dico_format.json", 'w') as output_file : + json.dump(self.full_dico, output_file, indent=4) + #This dico give for the nodes its sister nodes + self.link_dico = None + #Dico to 
graph without operations + self.dico_wo_operation = {} + self.dico_wo_branch_operation = {} + + #Dictionaries for metadata + #Dico flattened (without any subworkflows) + self.dico_flattened = {} + + + def initialise(self): + self.get_graph_wo_branch_operations() + self.get_graph_wo_operations() + + self.get_graph_wo_operations_mermaid() + + + #self.networkX_wo_operations = self.get_networkx_graph(self.dico_wo_operation, self.networkX_wo_operations) + self.dico_flattened["nodes"] = [] + self.dico_flattened["edges"] = [] + #This will stay empty -> it's just so we can use the same function + self.dico_flattened["subworkflows"] = [] + + def get_output_dir(self): + return self.workflow.get_output_dir() + + #Creates the networkX graph + def get_networkx_graph(self, graph, networkX, first_call=True): + if(first_call): + networkX = nx.MultiDiGraph() + for node in graph['nodes']: + #Case node is process + if(is_process(node['id'])): + networkX.add_node(node['id'], type='Process', code=node['name']) + #Case node is operation + elif(is_operation(node['id'])): + networkX.add_node(node['id'], type='Operation', code=node['xlabel']) + elif(node['id']=="source"): + networkX.add_node("source", type='source', code="source") + elif(node['id']=="sink"): + networkX.add_node("sink", type='sink', code="sink") + else: + raise Exception("This shouldn't happen!") + + for edge in graph['edges']: + if(is_process(edge['A']) and is_process(edge['B'])): + networkX.add_edge(edge['A'], edge['B'], label = edge['label'], edge_type='process_2_process') + elif(is_process(edge['A']) and is_operation(edge['B'])): + networkX.add_edge(edge['A'], edge['B'], label = edge['label'], edge_type='process_2_operation') + elif(is_operation(edge['A']) and is_process(edge['B'])): + networkX.add_edge(edge['A'], edge['B'], label = edge['label'], edge_type='operation_2_process') + elif(is_operation(edge['A']) and is_operation(edge['B'])): + networkX.add_edge(edge['A'], edge['B'], label = edge['label'], edge_type='operation_2_operation') + else: + networkX.add_edge(edge['A'], edge['B'], label = "", edge_type='') + for subworkflow in graph['subworkflows']: + networkX = self.get_networkx_graph(graph['subworkflows'][subworkflow], networkX, first_call=False) + return networkX + + + + #Method that initialises the link dico + def intia_link_dico(self): + if(self.link_dico==None): + self.link_dico = initia_link_dico_rec(self.full_dico) + + def get_full_graph(self, filename = "full_graph", render_graphs = True): + generate_graph(self.get_output_dir()/'graphs'/filename, self.full_dico, render_graphs = render_graphs) + + def get_full_graph_wo_lables(self, filename = "full_graph_wo_labels", render_graphs = True): + generate_graph(self.get_output_dir()/'graphs'/filename, self.full_dico, label_edge=False, label_node=False, render_graphs = render_graphs) + + def get_graph_wo_orphan_operations(self, filename = "graph_wo_orphan_operations", render_graphs = True): + generate_graph(self.get_output_dir()/'graphs'/filename, graph_dico_wo_orphan_operations(self.full_dico), render_graphs = render_graphs) + + def get_graph_wo_orphan_operations_wo_lables(self, filename = "graph_wo_orphan_operations_wo_labels", render_graphs = True): + generate_graph(self.get_output_dir()/'graphs'/filename, graph_dico_wo_orphan_operations(self.full_dico), label_edge=False, label_node=False, render_graphs = render_graphs) + + def get_graph_wo_operations(self): + self.intia_link_dico() + + #Function that replicates the workflow's structure without the operations in the nodes + def replicate_dico_wo_operations(dico_struct):
+ dico = {} + dico['nodes'] = [] + dico['edges'] = [] + dico['subworkflows'] = {} + for node in dico_struct["nodes"]: + if(is_process(node['id'])): + dico['nodes'].append(node) + for sub in dico_struct['subworkflows']: + dico['subworkflows'][sub] = replicate_dico_wo_operations(dico_struct['subworkflows'][sub]) + return dico + + dico = replicate_dico_wo_operations(self.full_dico) + + #This is a dictionary which links every node to its connected processes + node_2_processes = copy.deepcopy(self.link_dico) + already_searched = {} + for node in node_2_processes: + already_searched[node] = [node] + changed = True + while(changed): + changed = False + for node in node_2_processes: + temp = node_2_processes[node].copy() + for give in node_2_processes[node]: + if(is_operation(give)): + temp.remove(give) + if(node!=give and give not in already_searched[node]): + already_searched[node] += [give] #append the node id itself, not its characters + temp_temp = node_2_processes[give] + for node_temp in already_searched[node]: + try: + temp_temp.remove(node_temp) + except: + None + temp+=temp_temp + changed = True + node_2_processes[node] = list(set(temp)) + + + links_added = [] + def add_edges(dico): + for node in dico['nodes']: + edges = node_2_processes[node['id']] + for B in edges: + link = f"{node['id']} -> {B}" + if(link not in links_added): + dico['edges'].append({'A': node['id'], 'B': B, 'label': ''}) + links_added.append(link) + for sub in dico['subworkflows']: + add_edges(dico["subworkflows"][sub]) + + + add_edges(dico) + + + self.dico_wo_operation = dico + + def get_graph_wo_operations_mermaid(self): + dico_nodes = {} + def fill_dico_node(dico): + for node in dico["nodes"]: + dico_nodes[node["id"]] = node['name'] + for subworkflow in dico["subworkflows"]: + fill_dico_node(dico["subworkflows"][subworkflow]) + fill_dico_node(self.dico_wo_operation) + + #txt = "```mermaid\n\t" + txt= "graph TB;\n" + + def get_id(txt): + import re + for match in re.finditer(r"object at (\w+)>", txt): + return match.group(1) + + def get_graph_wo_operations_mermaid_temp(dico, txt, count): + count+=1 + for node in dico["nodes"]: + tab= count*"\t" + txt+=f"{tab}{get_id(node['id'])}[{node['name']}];\n" + + for edge in dico["edges"]: + tab= count*"\t" + txt+=f"{tab}{get_id(edge['A'])}-->{get_id(edge['B'])};\n" + for subworkflow in dico["subworkflows"]: + tab= count*"\t" + txt += f"{tab}subgraph {subworkflow}\n{tab}\tdirection TB;\n" + count+=1 + txt = get_graph_wo_operations_mermaid_temp(dico["subworkflows"][subworkflow], txt, count) + count-=1 + txt += f"{tab}end\n" + return txt + txt = get_graph_wo_operations_mermaid_temp(self.dico_wo_operation, txt, 0) + #txt += """```""" + #print(txt) + with open(f"{self.get_output_dir()}/graphs/mermaid_wo_operations.md", "w") as text_file: + text_file.write(txt) + + + def render_graph_wo_operations(self, filename = "graph_wo_operations", render_graphs = True): + generate_graph(self.get_output_dir()/'graphs'/filename, self.dico_wo_operation, render_graphs = render_graphs) + + + #def get_graph_wo_branch_operations(self): + # self.intia_link_dico() + # nodes_in_graph = [] + # #Function that replicates the workflow's structure wo the operations in the nodes + # def replicate_dico_wo_branch_operations(dico_struct): + # dico = {} + # dico['nodes'] = [] + # dico['edges'] = [] + # dico['subworkflows'] = {} + # for node in dico_struct["nodes"]: + # if(get_type_node(node)!="Branch Operation"): + # dico['nodes'].append(node) + # nodes_in_graph.append(node['id']) + # for sub in dico_struct['subworkflows']:
# dico['subworkflows'][sub] = replicate_dico_wo_branch_operations(dico_struct['subworkflows'][sub]) + # return dico + # + # dico = replicate_dico_wo_branch_operations(self.full_dico) + # + # #Function that takes a node and gives all the nodes in which is it connected to (simplifying by the branch operations) + # def get_nodes_linked(element, already_searched = {}): + # try: + # temp = already_searched[element] + # except: + # already_searched[element] = [] + # tab = [] + # #It's possible the node wasn't added to link_dico + # try: + # gives = self.link_dico[element] + # except: + # gives = [] + # for ele in gives: + # if(ele in nodes_in_graph): + # tab.append(ele) + # else: + # if(ele!=element and ele not in already_searched[element]): + # already_searched[element].append(ele) + # tab += get_nodes_linked(ele, already_searched) + # return tab + # + # def add_edges(dico, links_added = []): + # for node in dico['nodes']: + # edges = get_nodes_linked(node['id']) + # for B in edges: + # link = f"{node['id']} -> {B}" + # if(link not in links_added): + # dico['edges'].append({'A': node['id'], 'B': B, 'label': ''}) + # links_added.append(link) + # for sub in dico['subworkflows']: + # add_edges(dico["subworkflows"][sub], links_added) + # + # add_edges(dico) + # self.dico_wo_branch_operation = dico + + def get_graph_wo_branch_operations(self): + self.intia_link_dico() + nodes_in_graph = [] + branch_operation_ids = [] + #Function that replicates the workflow's structure without the branch operations in the nodes + def replicate_dico_wo_branch_operations(dico_struct): + dico = {} + dico['nodes'] = [] + dico['edges'] = [] + dico['subworkflows'] = {} + for node in dico_struct["nodes"]: + if(get_type_node(node)!="Branch Operation"): + dico['nodes'].append(node) + nodes_in_graph.append(node['id']) + for sub in dico_struct['subworkflows']: + dico['subworkflows'][sub] = replicate_dico_wo_branch_operations(dico_struct['subworkflows'][sub]) + return dico + + dico = replicate_dico_wo_branch_operations(self.full_dico) + + #This is a dictionary which links every node to its connected non-branch nodes + node_2_none_branch = copy.deepcopy(self.link_dico) + already_searched = {} + for node in node_2_none_branch: + already_searched[node] = [node] + changed = True + while(changed): + changed = False + for node in node_2_none_branch: + temp = node_2_none_branch[node].copy() + for give in node_2_none_branch[node]: + if(is_operation(give) and give not in nodes_in_graph): + temp.remove(give) + if(node!=give and give not in already_searched[node]): + already_searched[node] += [give] #append the node id itself, not its characters + temp_temp = node_2_none_branch[give] + for node_temp in already_searched[node]: + try: + temp_temp.remove(node_temp) + except: + None + temp+=temp_temp + changed = True + node_2_none_branch[node] = list(set(temp)) + + + links_added = [] + def add_edges(dico): + for node in dico['nodes']: + edges = node_2_none_branch[node['id']] + for B in edges: + link = f"{node['id']} -> {B}" + if(link not in links_added): + dico['edges'].append({'A': node['id'], 'B': B, 'label': ''}) + links_added.append(link) + for sub in dico['subworkflows']: + add_edges(dico["subworkflows"][sub]) + + add_edges(dico) + self.dico_wo_branch_operation = dico + + + def render_graph_wo_branch_operations(self, filename = "graph_wo_branch_operations", render_graphs = True): + generate_graph(self.get_output_dir()/'graphs'/filename, self.dico_wo_branch_operation, render_graphs = render_graphs) + + def get_graph_wo_branch_operations_wo_lables(self, filename = "graph_wo_branch_operations_wo_lables", render_graphs = True):
"graph_wo_branch_operations_wo_lables", render_graphs = True): + generate_graph(self.get_output_dir()/'graphs'/filename, self.dico_wo_branch_operation, label_edge=False, label_node=False, render_graphs = render_graphs) + + def get_graph_wo_branch_operations_wo_orphan_operations(self, filename = "graph_wo_branch_operations_wo_orphan_operations", render_graphs = True): + generate_graph(self.get_output_dir()/'graphs'/filename, graph_dico_wo_orphan_operations(self.dico_wo_branch_operation), render_graphs = render_graphs) + + def get_graph_wo_branch_operations_wo_orphan_operations_wo_lables(self, filename = "graph_wo_branch_operations_wo_orphan_operations_wo_lables", render_graphs = True): + generate_graph(self.get_output_dir()/'graphs'/filename, graph_dico_wo_orphan_operations(self.dico_wo_branch_operation), label_edge=False, label_node=False, render_graphs = render_graphs) + + + #============================ + #METADATA FROM GRAPH + #============================ + + def initialise_flattened_dico(self, dico): + for node in dico["nodes"]: + self.dico_flattened["nodes"].append(node) + for edge in dico["edges"]: + self.dico_flattened["edges"].append(edge) + for subworkflow in dico["subworkflows"]: + self.initialise_flattened_dico(dico["subworkflows"][subworkflow]) + + def get_metadata(self, graph): + G = self.get_networkx_graph(graph, None) + dico = {} + for node in G.nodes(data=True): + if(node[1]=={}): + print(node) + process_nodes = [node for node, data in G.nodes(data=True) if data['type'] == 'Process'] + operation_nodes = [node for node, data in G.nodes(data=True) if data['type'] == 'Operation'] + + dico['number_of_processes'] = len(process_nodes) + dico['number_of_operations'] = len(operation_nodes) + dico['number_of_nodes'] = dico['number_of_processes']+dico['number_of_operations'] + + dico['number_of_edges_process_2_process'] = sum(1 for _, _, data in G.edges(data=True) if data['edge_type']=="process_2_process") + dico['number_of_edges_process_2_operation'] = sum(1 for _, _, data in G.edges(data=True) if data['edge_type']=="process_2_operation") + dico['number_of_edges_operation_2_process'] = sum(1 for _, _, data in G.edges(data=True) if data['edge_type']=="operation_2_process") + dico['number_of_edges_operation_2_operation'] = sum(1 for _, _, data in G.edges(data=True) if data['edge_type']=="operation_2_operation") + + dico['number_of_edges_source_process'] = dico['number_of_edges_process_2_process'] + dico['number_of_edges_process_2_operation'] + dico['number_of_edges_source_operation'] = dico['number_of_edges_operation_2_process'] + dico['number_of_edges_operation_2_operation'] + dico['number_of_edges_sink_process'] = dico['number_of_edges_process_2_process'] + dico['number_of_edges_operation_2_process'] + dico['number_of_edges_sink_operation'] = dico['number_of_edges_process_2_operation'] + dico['number_of_edges_operation_2_operation'] + dico['number_of_edges'] = dico['number_of_edges_process_2_process'] + dico['number_of_edges_process_2_operation'] + dico['number_of_edges_operation_2_process'] + dico['number_of_edges_operation_2_operation'] + + dico["number_of_simple_loops"] = nx.number_of_selfloops(G) + + distribution_in_degrees_for_processes = list(dict(G.in_degree(process_nodes)).values()) + distribution_out_degrees_for_processes = list(dict(G.out_degree(process_nodes)).values()) + distribution_in_degrees_for_operations= list(dict(G.in_degree(operation_nodes)).values()) + distribution_out_degrees_for_operations= list(dict(G.out_degree(operation_nodes)).values()) + + 
dico["distribution_in_degrees_for_processes"] = distribution_in_degrees_for_processes + dico["distribution_out_degrees_for_processes"] = distribution_out_degrees_for_processes + dico["distribution_in_degrees_for_operations"] = distribution_in_degrees_for_operations + dico["distribution_out_degrees_for_operations"] = distribution_out_degrees_for_operations + + dico["distribution_in_degrees_for_all"] = dico["distribution_in_degrees_for_processes"]+dico["distribution_in_degrees_for_operations"] + dico["distribution_out_degrees_for_all"] = dico["distribution_out_degrees_for_processes"]+dico["distribution_out_degrees_for_operations"] + + dico["average_in_degrees_for_processes"] = np.array(distribution_in_degrees_for_processes).mean() + dico["average_out_degrees_for_processes"] = np.array(distribution_out_degrees_for_processes).mean() + dico["average_in_degrees_for_operations"] = np.array(distribution_in_degrees_for_operations).mean() + dico["average_out_degrees_for_operations"] = np.array(distribution_out_degrees_for_operations).mean() + dico["average_in_degrees_for_all"] = np.array(dico["distribution_in_degrees_for_all"] ).mean() + dico["average_out_degrees_for_all"] = np.array(dico["distribution_out_degrees_for_all"] ).mean() + + + dico["median_in_degrees_for_processes"] = np.median(np.array(distribution_in_degrees_for_processes)) + dico["median_out_degrees_for_processes"] = np.median(np.array(distribution_out_degrees_for_processes)) + dico["median_in_degrees_for_operations"] = np.median(np.array(distribution_in_degrees_for_operations)) + dico["median_out_degrees_for_operations"] = np.median(np.array(distribution_out_degrees_for_operations)) + dico["median_in_degrees_for_all"] = np.median(np.array(dico["distribution_in_degrees_for_all"])) + dico["median_out_degrees_for_all"] = np.median(np.array(dico["distribution_out_degrees_for_all"])) + + #DEsnity = m/n(n-1), where n is the number of nodes and m is the number of edges + dico['density'] = nx.density(G) + weakly_connected_components = list(nx.weakly_connected_components(G)) + dico['number_of_weakly_connected_components'] = len(weakly_connected_components) + + components_with_over_2_nodes = [comp for comp in weakly_connected_components if len(comp) >= 2] + dico['number_of_weakly_connected_components_with_2_or_more_nodes'] = len(components_with_over_2_nodes) + + #Getting the number of cycles + self.initialise_flattened_dico(graph) + links_flattened = initia_link_dico_rec(self.dico_flattened) + not_source_2_sink = [] + node_2_sink = [] + + for node in links_flattened: + if(links_flattened[node]==[]): + node_2_sink.append(node) + else: + not_source_2_sink+=links_flattened[node] + not_source_2_sink = set(not_source_2_sink) + source_2_node = list(set(links_flattened.keys()).difference(not_source_2_sink)) + links_flattened_source_sink = links_flattened.copy() + links_flattened_source_sink["source"], links_flattened_source_sink["sink"] = source_2_node, [] + for node in node_2_sink: + links_flattened_source_sink[node].append("sink") + + #The simple loops are included in this + dico['number_of_cycles'], edges_create_cycles = get_number_cycles(links_flattened_source_sink) + + #Remove the edges which create the cycles + #Since the number of paths from Source 2 sink and the longest path depend on the + #Topological ordering + #A topological ordering is possible if and only if the graph has no directed cycles, that is, if it is a directed acyclic graph (DAG) + #We turn the CDG (cyclic directed graphs) into a DAG (directed acyclic graph) + for A, B in 
+ links_flattened_source_sink[A].remove(B) + + structure_type = "" + if(len(edges_create_cycles)==0): + structure_type = "DAG" + else: + structure_type = "CDG" + + dico['structure_type'] = structure_type + + dico['number_of_paths_source_2_sink'] = get_number_paths_source_2_sink(links_flattened_source_sink) + dico['shortest_path'] = dijkstra(links_flattened_source_sink) + dico['longest_path'] = get_longest_distance(links_flattened_source_sink) + + + """#Check that the values calculated are the same as what networkX gives + dico_check = {} + dico_check['nodes'] = [] + dico_check['edges'] = [] + dico_check['subworkflows'] = {} + for node in links_flattened_source_sink: + dico_check["nodes"].append({'id':node, 'xlabel':"", 'name':""}) + for B in links_flattened_source_sink[node]: + dico_check["edges"].append({'A':node, "B":B, "label":""}) + + G_DAG = self.get_networkx_graph(dico_check, None) + #===================================== + #ADDING SINK AND SOURCE TO THE GRAPH + #===================================== + source_node = "source" + sink_node = "sink" + + if(dico['shortest_path']!=nx.shortest_path_length(G_DAG, source=source_node, target=sink_node)): + raise Exception(f"{dico['shortest_path']}, {nx.shortest_path_length(G_DAG, source=source_node, target=sink_node)}") + #print("test1") + if(dico['longest_path']+1!=len(nx.dag_longest_path(G_DAG))): + raise Exception(f"{dico['longest_path']}, {len(nx.dag_longest_path(G_DAG))}") + #print("test2") + + #if(len(list(nx.all_simple_paths(G_DAG, source=source_node, target=sink_node)))!=dico['number_of_paths_source_2_sink']): + # raise Exception(f"{len(list(nx.all_simple_paths(G_DAG, source=source_node, target=sink_node)))}, {dico['number_of_paths_source_2_sink']}") + #print("test3")""" + + return dico + + + def get_metadata_fullgraph(self): + + dico = self.get_metadata(self.full_dico) + with open(self.get_output_dir()/ "graphs/metadata_full_graph.json", 'w') as output_file : + json.dump(dico, output_file, indent=4) + + def get_metadata_graph_wo_branch_operations(self): + + dico = self.get_metadata(self.dico_wo_branch_operation) + with open(self.get_output_dir()/ "graphs/metadata_graph_wo_branch_operations.json", 'w') as output_file : + json.dump(dico, output_file, indent=4) + + def get_metadata_graph_wo_operations(self): + + dico = self.get_metadata(self.dico_wo_operation) + with open(self.get_output_dir()/ "graphs/metadata_graph_wo_operations.json", 'w') as output_file : + json.dump(dico, output_file, indent=4) + + #def get_metadata_graph_wo_operations(self): + # G = self.networkX_wo_operations + # dico = self.get_metadata(G) + # with open(self.get_output_dir() / "graphs/metadata_graph_wo_operations.json", 'w') as output_file : + # json.dump(dico, output_file, indent=4) diff --git a/src/include.py b/src/include.py new file mode 100644 index 0000000000000000000000000000000000000000..8bddcbfdac7728653d3d54a42fd3a88c800b5777 --- /dev/null +++ b/src/include.py @@ -0,0 +1,152 @@ + +import re +import os +import copy + +from . import constant
import constant + +from .code_ import Code +from .nextflow_building_blocks import Nextflow_Building_Blocks +from .bioflowinsighterror import BioFlowInsightError + + + + + +#Remove ' and " from a given string +def clean_string(txt): + txt = txt.replace("'", "") + txt = txt.replace('"', "") + return txt + +class Include(Nextflow_Building_Blocks): + def __init__(self, code, file, importing, origin, duplicate): + self.origin = origin + self.importing = importing + self.duplicate = duplicate + self.code = Code(code = code, origin = self) + self.file = None + self.address = file + self.define_file(file) + self.aliases = {} + self.defines = [] + #self.initialise() + + + def get_aliases(self): + return self.aliases + + def get_defines(self): + return self.defines + + def get_file(self): + return self.file + + def get_address(self): + return self.address + + def get_root_directory(self): + return self.origin.get_root_directory() + + + def get_list_name_includes(self): + if(self.duplicate): + names = [] + for ele in self.defines: + names.append(ele.get_alias()) + return names + else: + return list(self.aliases.keys()) + + def define_file(self, file): + from .nextflow_file import Nextflow_File + address = clean_string(file) + root = self.origin.get_file_address() + root = '/'.join(root.split('/')[:-1]) + found_file = False + + if(os.path.isfile(address)): + found_file = True + + if(not found_file): + if(address[-1]in [';']): + address = address[:-1] + + if(address.split('/')[0] in ["$projectDir", "${projectDir}", "${baseDir}", "$baseDir"]): + address = '/'.join(address.split('/')[1:]) + root = self.get_root_directory() + address = root+'/'+address + if(os.path.isfile(address)): + found_file = True + + if(not found_file): + if(address[-3:]!=".nf"): + address+=".nf" + if(os.path.isfile(address)): + found_file = True + + if(not found_file and os.path.isfile(address[:-3]+"/main.nf")): + self.file = Nextflow_File(address[:-3]+"/main.nf", origin=self, duplicate=self.duplicate, DSL="DSL2") + + #TODO -> check if the nextflow_file is defined somewhere else? + #In the cas the nextflow file is imported multiple times + + else: + if(os.path.isfile(address)): + self.file = Nextflow_File(address, origin=self, duplicate=self.duplicate, DSL="DSL2") + else: + address = os.path.normpath(address) + raise BioFlowInsightError(f"Something went wrong in an include{self.get_string_line(self.get_code())}. 
No such file: '{address}'.", num = 10,origin=self) + + + #If not duplicate -> we need to see if there is another include which has already defined the file + #TODO -> if you wanna generalise this to all include (inbetween files -> you just need to update get_include() ) + if(not self.duplicate): + #other_includes = self.origin.get_all_includes() + other_includes = self.origin.get_includes() + for other in other_includes: + if(self.get_address()==other.get_address()): + self.file = other.get_file() + + def initialise(self): + self.file.initialise() + + for include in self.importing: + include = include.strip() + found = False + if(include!=''): + if(re.fullmatch(constant.WORD, include)): + if(self.duplicate): + self.defines.append(self.file.get_element_from_name(include)) + else: + self.aliases[include] = self.file.get_element_from_name(include) + found = True + else: + pattern_as = constant.INCLUDE_AS + for match in re.finditer(pattern_as, include): + found = True + if(self.duplicate): + #TODO -> try shallow copy too + #thing_as = copy.copy(self.file.get_element_from_name(match.group(1))) + thing_as = copy.deepcopy(self.file.get_element_from_name(match.group(1))) + thing_as.set_alias(match.group(3)) + self.defines.append(thing_as) + else: + #other_includes = self.origin.get_includes() + #added_from_other = False + #for other in other_includes: + # if(self.get_address()==other.get_address()): + # self.aliases[match.group(3)] = other.file.get_element_from_name(match.group(1)) + # added_from_other = True + #if(not added_from_other): + self.aliases[match.group(3)] = self.file.get_element_from_name(match.group(1)) + + if(not found): + raise Exception(f"I was not able to import '{include}' from {self.file.get_file_address()}") + + + + + + + diff --git a/src/main_DSL2.py b/src/main_DSL2.py new file mode 100644 index 0000000000000000000000000000000000000000..a0c2d56ae29232cac774d0725247d55a091263d8 --- /dev/null +++ b/src/main_DSL2.py @@ -0,0 +1,184 @@ +from .nextflow_building_blocks import Nextflow_Building_Blocks +from .bioflowinsighterror import BioFlowInsightError +import re + +from . 
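+
+ #--------------------------------------------------------------------------
+ #Editor's sketch (illustrative): define_file above tries several candidate
+ #paths for an include. A condensed, hypothetical equivalent of that
+ #resolution order (the helper name and simplified candidate list are made
+ #up; the real code also strips quotes and a trailing ';'):
+ import os
+
+ def resolve_include_address(address, root_dir):
+     #1) as written; 2) with $projectDir/$baseDir mapped to the workflow root;
+     #3) with '.nf' appended; 4) as a directory holding a 'main.nf'
+     if address.split('/')[0] in ("$projectDir", "${projectDir}", "$baseDir", "${baseDir}"):
+         address = root_dir + '/' + '/'.join(address.split('/')[1:])
+     for candidate in (address, address + ".nf", address + "/main.nf"):
+         if os.path.isfile(candidate):
+             return os.path.normpath(candidate)
+     return None   #the caller raises BioFlowInsightError [10]
+ #--------------------------------------------------------------------------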
import constant + + +class Main_DSL2(Nextflow_Building_Blocks): + def __init__(self, code, origin): + Nextflow_Building_Blocks.__init__(self, code) + self.origin = origin + self.calls = [] + self.initialised = False + + def get_channels(self): + return self.channels + + def get_type(self): + return "Main DSL2" + + def get_calls(self): + return self.calls + + def is_initialised(self): + return self.initialised + + + def get_processes(self): + return self.origin.get_processes()+super().get_processes() + + def get_process_from_name(self, name): + return self.origin.get_process_from_name(name) + + def get_function_from_name(self, name): + return self.origin.get_function_from_name(name) + + def get_list_name_subworkflows(self): + return self.origin.get_list_name_subworkflows() + + def get_list_name_includes(self): + return self.origin.get_list_name_includes() + + + def get_channel_from_name(self, name): + channel_file = self.origin.get_channel_from_name(name) + if(channel_file!=None): + return channel_file + return super().get_channel_from_name(name) + + + """def get_added_operations_structure(self): + return self.origin.get_added_operations_structure()""" + + def check_in_channels(self, channel): + found = super().check_in_channels(channel) + if(not found): + if(self.origin.get_type()=="Nextflow File"): + return self.origin.check_in_channels(channel) + else: + raise Exception(f"The origin is a '{self.origin.get_type()}' it should be a 'Nextflow File'") + return found + + + def get_subworkflow_from_name(self, name): + return self.origin.get_subworkflow_from_name(name) + + def check_includes(self): + code = self.get_code() + + pattern = constant.FULL_INCLUDE + for match in re.finditer(pattern, code): + if(self.get_type()=="Main DSL2"): + raise BioFlowInsightError(f"An include ('{match.group(0)}') was found in the main in the file '{self.get_file_address()}'. FlowInsight does not support this -> see specification list.", num = 12,origin=self) + elif(self.get_type()=="Subworkflow"): + raise BioFlowInsightError(f"An include ('{match.group(0)}') was found in the subworkflow '{self.get_name()}' in the file '{self.get_file_address()}'. FlowInsight does not support this -> see specification list.", num = 12, origin=self) + else: + raise Exception("This shouldn't happen!") + + + def initialise(self): + if(not self.initialised): + + self.initialised=True + + #Check that includes are not defined in the main or subworkflows + self.check_includes() + + #Extract Executors + self.extract_executors() + + + #Analyse Executors + for e in self.executors: + e.initialise() + + + + """def add_channels_structure(self, dot): + return self.add_channels_structure_temp(dot, self.origin.get_added_operations_structure()) + """ + def get_origin(self): + return self.origin + + def check_same_origin(self, sub): + return self.get_origin()== sub.get_origin() + + #Add "global" channels and operation to the structure defined in the file + def get_structure_DSL2(self, dico): + self.origin.get_structure_DSL2(dico) + + + def get_structure(self, dico): + #Add "global" channels and operation to the structure defined in the file + self.get_structure_DSL2(dico) + + + for e in self.executors: + if(e.get_type()=="Operation"): + e.get_structure(dico) + elif(e.get_type()=="Call"): + e.get_structure(dico) + else: + raise Exception(f"Executor of type '{e.get_type()}' was extracted in a DSL2 workflow! I don't know what this is! 
The code is '{e.get_code()}'") + + + # + #nodes_added = [] + # + ##Add operation + #for o in self.get_operations(): + # dico['nodes'].append({'id':str(o), 'name':"", "shape":"point", 'xlabel':o.get_code()}) + # nodes_added.append(str(o)) + # + # #Need to check for cases where the origin is a process or a subworkflow + # for origin in o.get_origins(): + # + # if(origin.get_type()=="Process"): + # #Here i'm not adding the node but an edge -> the node is add when the call happens + # dico["edges"].append({'A':str(origin), 'B':str(o), "label":""}) + # + # elif(origin.get_type()=="Subworkflow"): + # emits = origin.get_emit() + # #TODO -> i'm only doing one parameter for now + # if(len(emits)==1): + # for source in emits[0].get_source(): + # dico["edges"].append({'A':str(source), 'B':str(o), "label":""}) + # else: + # raise Exception(f'TO much to unpack for "{o.get_code()}"') + # + # elif(origin.get_type()=="Emitted"): + # if(origin.get_emitted_by().get_type()=="Process"): + # dico["edges"].append({'A':str(origin.get_emitted_by()), 'B':str(o), "label":origin.get_name()}) + # + # elif(origin.get_emitted_by().get_type()=="Subworkflow"): + # for source in origin.get_emits().get_source(): + # dico["edges"].append({'A':str(source), 'B':str(o), "label":origin.get_name()}) + # + # else: + # raise Exception(f"I don't know how to handle {origin.get_emitted_by()}") + # + # + # elif(origin.get_type()=="Channel"): + # None + # #Here we do nothing since the channels are gonna be added below + # + # else: + # raise Exception(f"George I don't know if this should be an error or not -> i don't think it should be") + # #TODO check this -> it should be added by the channel here below + # + # + ##Adding channels + #for c in self.get_channels(): + # for source in c.get_source(): + # for sink in c.get_sink(): + # #Here we check that the operation exists (already added to the structure) -> it's to avoid showing the operation for the emited channel + # if(str(sink) in nodes_added): + # dico["edges"].append({'A':str(source), 'B':str(sink), "label":c.get_name()}) + # + # + #for c in self.get_calls(): + # c.get_structure(dico) + # + ##return dico + diff --git a/src/nextflow_building_blocks.py b/src/nextflow_building_blocks.py new file mode 100644 index 0000000000000000000000000000000000000000..79607e14bf5c500d2eaf3e64107eaee8770d05ba --- /dev/null +++ b/src/nextflow_building_blocks.py @@ -0,0 +1,449 @@ +import os +import re +from pathlib import Path + +from . 
import constant + +from .outils import extract_curly, extract_end_operation, extract_executor_from_middle, get_end_call, expand_call_to_operation, get_curly_count, get_parenthese_count, expand_pipe_operator, checks_in_condition_if, checks_in_string +from .code_ import Code +from .bioflowinsighterror import BioFlowInsightError + + + +class Nextflow_Building_Blocks: + def __init__(self, code): + self.code = Code(code = code, origin = self) + + self.processes = [] + self.channels = [] + self.DSL = "" + #DSL2 + self.includes = [] + self.main = None + self.executors = [] + self.subworkflows = [] + self.functions=[] + + + + #--------------------------------- + #AUXILIARY METHODS FOR ALL CLASSES + #--------------------------------- + def get_code(self, get_OG = False): + return self.code.get_code(get_OG = get_OG) + + def get_output_dir(self): + return self.origin.get_output_dir() + + def get_DSL(self): + return self.origin.get_DSL() + + def get_file_address(self): + return self.origin.get_file_address() + + def get_display_info(self): + return self.origin.get_display_info() + + def get_name_processes_subworkflows(self): + return self.origin.get_list_name_subworkflows()+self.origin.get_list_name_includes()+ self.origin.get_list_name_processes() + + #Only used by the process or subworkflow + def is_called(self, called_from): + if(self.get_type() in ["Process", "Subworkflow"]): + + executors = called_from.origin.get_executors() + for exe in executors: + if(exe.get_type()=="Call"): + if(self in exe.get_elements_called()): + return True + #Case operation + else: + for o in exe.get_origins(): + if(o.get_type()=="Call"): + if(self in o.get_elements_called()): + return True + return False + raise Exception("You can't do this!") + + def get_line(self, bit_of_code): + return self.origin.get_line(bit_of_code) + + def get_string_line(self, bit_of_code): + return self.origin.get_string_line(bit_of_code) + + def get_name_file(self): + return self.origin.get_name_file() + + + + + #---------------------- + #PROCESSES + #---------------------- + def extract_processes(self): + from .process import Process + code = self.get_code() + #Find pattern + for match in re.finditer(constant.PROCESS_HEADER, code): + start = match.span(0)[0] + end = extract_curly(code, match.span(0)[1])#This function is defined in the functions file + p = Process(code=code[start:end], origin=self) + self.processes.append(p) + + def get_list_name_processes(self): + tab = [] + for p in self.get_processes(): + tab.append(p.get_name()) + return tab + + def get_process_from_name(self, name): + for p in self.get_processes(): + if(p.get_name()==name): + return p + return None + + def get_channels(self): + return self.origin.get_channels() + + def get_processes(self): + return self.processes + + #---------------------- + #CHANNELS + #---------------------- + + #Check if a channel given in parameters is already in channels + def check_in_channels(self, channel): + for c in self.channels: + if(c.equal(channel)): + return True + return False + + def get_channel_from_name(self, name): + for c in self.channels: + if(name == c.get_name()): + return c + #raise Exception(f"{name} is not in the list of channels") + return None + + #Method that adds channel into the lists of channels + def add_channel(self, channel): + if(not self.check_in_channels(channel)): + self.channels.append(channel) + else: + raise Exception("This shoudn't happen!") + + + """def add_channels_structure_temp(self, dico, added_operations): + for c in self.get_channels(): + for source in 
c.get_source():
+ for sink in c.get_sink():
+ if(not(isinstance(source, Operation)) or not(isinstance(sink, Operation))):
+ raise Exception("Not operations!!")
+
+ if(source not in added_operations):
+ #dot.node(str(source), "", shape="point", xlabel= source.get_code())
+ dico["nodes"].append({"id":str(source), "name":'', "shape":"point", "xlabel": source.get_code()})
+ added_operations.append(source)
+ if(sink not in added_operations):
+ #dot.node(str(sink), "", shape="point", xlabel= sink.get_code())
+ dico["nodes"].append({"id":str(sink), "name":'', "shape":"point", "xlabel": sink.get_code()})
+ added_operations.append(sink)
+
+ #dot.edge(str(source), str(sink), label= c.get_name())
+ dico["edges"].append({"A":str(source), "B":str(sink), "label": c.get_name()})
+ return dico"""
+
+
+ #----------------------
+ #EXECUTORS
+ #----------------------
+
+
+
+ def get_executors(self):
+ return self.executors
+
+ def extract_executors(self):
+ from .operation import Operation
+ from .call import Call
+
+ #https://github.com/nextflow-io/nextflow/blob/45ceadbdba90b0b7a42a542a9fc241fb04e3719d/docs/operator.rst
+ #TODO This list needs to be checked to make sure it is exhaustive
+
+ if(self.get_type()=="Subworkflow"):
+ code = self.get_work()
+ elif(self.get_type()=="Main DSL2"):
+ code = self.get_code()
+ code = re.sub(constant.WORKFLOW_HEADER, "", code)
+ if(code[-1]!='}'):
+ raise Exception("This shouldn't happen")
+ code = code[:-1]
+
+ else:
+ code = self.get_code()
+
+ things_to_remove = []
+ things_to_remove+= self.processes+self.includes+self.subworkflows+self.functions
+ if(self.main!=None):
+ things_to_remove+=[self.main]
+
+ for to_remove in things_to_remove:
+ code = code.replace(to_remove.get_code(get_OG = True), "", 1)
+
+ #We add this to simplify the search for the executors
+ code = "start\n"+code+"\nend"
+
+ #This function takes an executor (already found) and expands it to include the pipe operators
+ def expand_to_pipe_operators(text, executor):
+ #If the executor ends with the pipe operator -> we remove it so that it can be detected by the pattern
+ if(executor[-1]=="|"):
+ executor = executor[:-1].strip()
+ start = text.find(executor)+len(executor)
+ for match in re.finditer(constant.END_PIPE_OPERATOR, text[start:]):
+ beginning, end = match.span(0)
+ if(beginning==0):
+ return expand_pipe_operator(text, executor+match.group(0))
+ break
+ return executor
+
+
+
+ #---------------------------------------------------------------
+ #STEP1 - Extract equal operations, e.g. 
+ # *Case "channel = something" + # *Case "(channel1, channel2) = something" + #--------------------------------------------------------------- + pattern_equal = constant.LIST_EQUALS + + searching = True + while(searching): + searching= False + text = code + for e in self.executors: + text = text.replace(e.get_code(), "", 1) + + for pattern in pattern_equal: + for match in re.finditer(pattern, text): + + start, end = match.span(2) + ope = extract_end_operation(text, start, end) + ope = expand_to_pipe_operators(text, ope) + + #If the thing which is extracted is not in the conditon of an if + if(not checks_in_condition_if(text, ope) and not checks_in_string(text, ope)): + operation = Operation(ope, self) + self.executors.append(operation) + searching= True + break + + #I switched step 2 and step 3 -> cause there were cases where there was operations in the paramters of a call -> they were extracted and removed + #----------------------------------- + #STEP3 - Extract the remaining calls + #----------------------------------- + #These are the processes and subworkflows we need to check are called + if(self.get_DSL()=="DSL2"): + to_call = self.get_list_name_processes()+self.get_list_name_subworkflows()+self.get_list_name_includes() + pattern_call = constant.BEGINNING_CALL + searching = True + while(searching): + searching= False + text = code + for e in self.executors: + text = text.replace(e.get_code(), "", 1) + + for match in re.finditer(pattern_call, text): + if(match.group(1) in to_call): + + start, end = match.span(0) + txt_call = get_end_call(text, start, end) + txt_call = expand_to_pipe_operators(text, txt_call) + #If the thing which is extracted is not in the conditon of an if + if(not checks_in_condition_if(text, txt_call) and not checks_in_string(text, txt_call)): + if(txt_call.find("|")!=-1 and txt_call[txt_call.find("|")-1]!="|" and txt_call[txt_call.find("|")+1]!="|"): + first_thing_called = txt_call.split('|')[-1].strip() + if(first_thing_called in to_call): + call = Call(code =txt_call, origin =self) + self.executors.append(call) + else: + added = True + if(first_thing_called in constant.LIST_OPERATORS): + added = True + if(not added): + for operator in constant.LIST_OPERATORS: + for match in re.finditer(operator+constant.END_OPERATOR, txt_call.split('|')[-1].strip()): + start, end = match.span(0) + if(start==0): + added = True + if(not added): + raise BioFlowInsightError(f"In the executor '{txt_call}', '{first_thing_called}' is neither a process, subworkflow or an operator{self.get_string_line(txt_call)}", num = 14, origin=self) + else: + ope = Operation(code =txt_call, origin =self) + self.executors.append(ope) + else: + #We need to see if we can expand the call to a operation perhaps process().set{ch} + expanded = expand_call_to_operation(text, txt_call)#TODO update this + if(txt_call==expanded): + call = Call(code =txt_call, origin =self) + self.executors.append(call) + else: + ope = Operation(code =expanded, origin =self) + self.executors.append(ope) + + searching = True + break + + + #------------------------------------------------- + #STEP2 - Extract the terms which use the operators + #------------------------------------------------- + pattern_dot = constant.DOT_OPERATOR + searching = True + searched = [] + + + while(searching): + searching= False + text = code + for e in self.executors: + text = text.replace(e.get_code(), "", 1) + + for match in re.finditer(pattern_dot, text): + start, end = match.span(1) + + if(match.group(1) not in constant.ERROR_WORDS): + 
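+
+ #--------------------------------------------------------------------------
+ #Editor's sketch (illustrative): every extraction step above follows the
+ #same fixed-point scheme -- strip the executors found so far from the text,
+ #re-run the pattern, keep the first new match, and repeat until a full pass
+ #finds nothing. 'extract_one' is a hypothetical stand-in for
+ #extract_end_operation / get_end_call combined with expand_to_pipe_operators:
+ import re
+
+ def extract_all(code, pattern, extract_one):
+     found = []
+     while True:
+         text = code
+         for e in found:
+             text = text.replace(e, "", 1)   #ignore what was already extracted
+         match = re.search(pattern, text)
+         if match is None:
+             return found
+         found.append(extract_one(text, match))
+ #--------------------------------------------------------------------------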
if(match.group(1) in constant.LIST_OPERATORS): + #TODO -> the function below might not work perfectly but i don't have any other ideas + + #TODO -> IMPORTANT find another way of doing this -> for example if there isn't the same number of curlies/parentheses + #Use if there is an operator called right before opening the curlies/parenthse + curly_left, curly_right = get_curly_count(text[:start]), get_curly_count(text[end:]) + parenthese_left, parenthese_right = get_parenthese_count(text[:start]), get_parenthese_count(text[end:]) + + #if(curly_left==0 and curly_right==0 and parenthese_left==0 and parenthese_right==0 and (start, end) not in searched): + if(parenthese_left==0 and parenthese_right==0 and (start, end) not in searched): + searched.append((start, end)) + + pot = extract_executor_from_middle(text, start, end) + pot = expand_to_pipe_operators(text, pot) + + #If the thing which is extracted is not in the conditon of an if + if(not checks_in_condition_if(text, pot) and not checks_in_string(text, pot)): + if(self.get_DSL()=="DSL2"): + to_call = self.get_list_name_processes()+self.get_list_name_subworkflows()+self.get_list_name_includes() + if(pot.find("|")!=-1): + if(not checks_in_condition_if(pot, '|') and not checks_in_string(pot, '|')):#TODO checks_in_string is the first occurance + first_thing_called = pot.split('|')[-1].strip() + if(first_thing_called in to_call): + call = Call(code =pot, origin =self) + self.executors.append(call) + elif(first_thing_called in constant.LIST_OPERATORS): + ope = Operation(code =pot, origin =self) + self.executors.append(ope) + else: + raise BioFlowInsightError(f"'{first_thing_called}' is neither a process, subworkflow or an operator. In the executor '{pot}'{self.get_string_line(pot)}.", num=14,origin=self)#TODO -> try rewriting the operation using the standard syntaxe + + else: + from .executor import Executor + executor = Executor(pot, self) + self.executors.append(executor.return_type()) + + else: + from .executor import Executor + executor = Executor(pot, self) + self.executors.append(executor.return_type()) + else: + ope = Operation(pot, self) + self.executors.append(ope) + searching = True + break + + #--------------------------------------------------------------- + #STEP4 - Extract the Executors which only use the pipe operators (which start with a channel) + #--------------------------------------------------------------- + to_call = self.get_list_name_processes()+self.get_list_name_subworkflows()+self.get_list_name_includes() + + searching = True + while(searching): + searching= False + text = code + for e in self.executors: + text = text.replace(e.get_code(get_OG=True), "", 1) + pattern = constant.BEGINNING_PIPE_OPERATOR + + for match in re.finditer(pattern, text): + txt_call = expand_pipe_operator(text, match.group(0)) + full_executor = txt_call + + #start, end = match.span(0) + ## Check to see if a parameter is given such as in the example 'splitLetters | flatten | convertToUpper | view { it.trim() }' + #params, full_executor = check_if_parameter_is_given_pipe(text, start, end) + #if(params!=''): + # tab_to_call = txt_call.split('|') + # start = f"{tab_to_call[0]}({params})" + # txt_call = start + '|' + '|'.join(tab_to_call[1:]) + # print(start) + #print(params, full_executor) + + #If the thing which is extracted is not in the conditon of an if + if(not checks_in_condition_if(text, full_executor) and not checks_in_string(text, full_executor)): + tab_to_call = txt_call.split('|') + if(tab_to_call[0].strip() in to_call): + start = 
f"{tab_to_call[0]}()" + txt_call = start + '|' + '|'.join(tab_to_call[1:]) + first_thing_called = txt_call.split('|')[-1].strip() + + if(first_thing_called in to_call): + call = Call(code =txt_call, origin =self, OG_code= full_executor) + self.executors.append(call) + searching = True + break + elif(first_thing_called in constant.LIST_OPERATORS): + ope = Operation(code =txt_call, origin =self, OG_code= full_executor) + self.executors.append(ope) + searching = True + break + else: + added = False + #This is in the case "channel | map {dfvfdvd}" + for ope in constant.LIST_OPERATORS: + if(first_thing_called[:len(ope)]==ope and not added): + ope = Operation(code =txt_call, origin =self, OG_code= full_executor) + self.executors.append(ope) + added = True + searching = True + if(added): + break + elif(not added): + raise BioFlowInsightError(f"In the executor '{txt_call}', '{first_thing_called}' is neither a process, subworkflow or an operator (in the file '{self.get_file_address()}')", num = 14,origin=self) + + #--------------------------------------------------------------------- + #STEP5 - We remove the things which were falsy extracted as executors + #--------------------------------------------------------------------- + to_remove = [] + starting_by_to_remove = ["System.out"] + for e in self.executors: + for r in starting_by_to_remove: + if(e.get_code()[:len(r)]==r): + to_remove.append(e) + for e in to_remove: + self.executors.remove(e) + + + #---------------------- + #OPERATIONS + #---------------------- + + #Method that adds operation into the lists of operations + def add_operation(self, operation): + self.operations.append(operation) + + #---------------------- + #INCLUDES + #---------------------- + def get_all_includes(self): + return self.origin.get_all_includes() + + def add_include_to_all_includes(self, include): + self.origin.add_include_to_all_includes(include) + diff --git a/src/nextflow_file.py b/src/nextflow_file.py new file mode 100644 index 0000000000000000000000000000000000000000..c49737e6817ac84b83d28aa27036f443bee1617c --- /dev/null +++ b/src/nextflow_file.py @@ -0,0 +1,662 @@ + +import re +import os +import json +import glob +from datetime import date + +#TODO -> check this or either change the warnings to nothing +import warnings +from pathlib import Path + +from . 
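+
+ #--------------------------------------------------------------------------
+ #Editor's note (illustrative) on STEP4 above: executors that are pure pipe
+ #chains starting with a callable, e.g. "FOO | flatten | view", are rewritten
+ #so the head becomes an explicit call, "FOO() | flatten | view"; the last
+ #element of the chain then decides whether the whole executor is a Call or
+ #an Operation, and STEP5 drops false positives such as "System.out...".
+ #--------------------------------------------------------------------------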
import constant + +warnings.filterwarnings("ignore") +from .nextflow_building_blocks import Nextflow_Building_Blocks +from .outils import extract_curly, get_curly_count, get_parenthese_count +from .bioflowinsighterror import BioFlowInsightError + + + + + +class Nextflow_File(Nextflow_Building_Blocks): + def __init__(self, address, duplicate = True, DSL="", author = None, name = None, origin=None, output_dir='./results', display_info = True): + self.file = address + self.output_dir = Path(output_dir) + contents = '' + try: + with open(self.get_file_address(), 'r') as f: + contents = f.read() + except Exception: + raise BioFlowInsightError(f"No such file: '{self.get_file_address()}'.", num = 10,origin=self) + + Nextflow_Building_Blocks.__init__(self, contents) + self.workflow_name = name + self.author = author + self.duplicate = duplicate + self.origin = origin + self.DSL = "" + self.first_file = DSL=="" + self.graph = None + self.display_info = display_info + self.all_includes = [] + self.check_file_correctness() + self.set_DSL(DSL=DSL) + self.extract_metadata() + self.check_file_correctness_after_DSL() + self.set_null() + + def get_name_file(self): + name = self.get_file_address().split('/')[-1] + return name[:-3] + + + def check_file_correctness(self): + code = self.get_code() + if(code.count("{")!=code.count("}")): + curly_count = get_curly_count(code) + if(curly_count!=0): + raise BioFlowInsightError(f"Not the same number of opening and closing curlies '{'{}'}' in the file.", num = 16,origin=self) + if(code.count("(")!=code.count(")")): + parenthese_count = get_parenthese_count(code) + if(parenthese_count!=0): + raise BioFlowInsightError(f"Not the same number of opening and closing parentheses '()' in the file.", num = 16, origin=self) + + if(code.count('"""')%2!=0): + raise BioFlowInsightError(f"An odd number of '\"\"\"' was found in the code.", num = 16, origin=self) + + #if(code.count("'''")!=code.count("'''")): + # raise BioFlowInsightError(f"Not the same number of ''' in the file '{self.get_file_address()}'") + # + #if(code.count('"""')!=code.count('"""')): + # raise BioFlowInsightError(f'Not the same number of """ in the file "{self.get_file_address()}"') + + #TODO -> finish function + def check_file_correctness_after_DSL(self): + if(self.first_file): + if(self.DSL=="DSL2"): + code = "\n"+self.get_code()+"\n" + found_main = False + for match in re.finditer(constant.WORKFLOW_HEADER_2, code): + found_main = True + if(not found_main): + raise BioFlowInsightError(f"No 'main' workflow was found.", num = 16, origin=self) + + def get_output_dir(self): + if(self.first_file): + return self.output_dir + else: + return self.origin.get_output_dir() + + def get_display_info(self): + if (self.first_file): + return self.display_info + else: + return self.origin.get_display_info() + + + def set_name(self): + if self.first_file and self.workflow_name is None: + address = self.get_file_address() + self.workflow_name = address.split('/')[-2] + + def set_author(self): + if self.first_file and self.author is None: + address = self.get_file_address() + try: + self.author = address.split('/')[-3] + except: + self.author="Unknown" + + def get_channels(self): + return self.channels + + def set_null(self): + self.processes = [] + self.channels = [] + self.functions = [] + #DSL2 + self.includes = [] + self.main = None + self.executors = [] + self.subworkflows = [] + self.already_added_structure = False + self.graph = None + self.all_includes = [] + + def extract_metadata(self): + + #When the start=="" it 
means it's the first analysis + if(self.first_file): + self.set_null() + self.set_name() + self.set_author() + dico_wf = {} + dico_wf["workflow name"] = self.workflow_name + dico_wf["author"] = self.author + dico_wf["date analysis"] = date.today().strftime("%m/%d/%y")#m/d/y + dico_wf["DSL"] = self.DSL + dico_wf["link"] = "TODO" + dico_wf["publish date"] = "TODO" + dico_wf["file given"] = self.get_file_address() + #dico_wf["processes"] = {} + + if(self.DSL=="DSL1"): + #self.extract_processes() + #dico_wf["processes"]["number defined"] = len(self.processes) + #dico_wf["processes"]["number used"] = len(self.processes) + None + + elif(self.DSL=="DSL2"): + dico_wf["number nextflow files from root"] = "TODO" + + ##Number of process defined + #root = '/'.join(self.get_file_address().split('/')[:-1]) + #nextflow_files = glob.glob(f'{root}/**/*.nf', recursive=True) + #number_defined=0 + # + #for file in nextflow_files: + # + # wf = Nextflow_File(file, DSL="DSL2") + # wf.extract_processes() + # number_defined+=wf.get_number_processes() + #dico_wf["processes"]["number defined"] = number_defined + # + ##Number of process used + processes_used = {} + with open(self.output_dir / "debug" / "processes_used.json", "w") as outfile: + json.dump(processes_used, outfile, indent=4) + + else: + raise Exception(f"The workflow's DSL is '{self.DSL}' -> I don't know what this is!") + + with open(self.output_dir / "general.json", "w") as outfile: + json.dump(dico_wf, outfile, indent=4) + + def get_type(self): + return "Nextflow File" + + + def get_line(self, bit_of_code): + return self.code.get_line(bit_of_code) + + def get_string_line(self, bit_of_code): + return self.code.get_string_line(bit_of_code) + + def set_DSL(self, DSL=""): + #Set the DSL + if(DSL==""): + + + os.makedirs(self.output_dir, exist_ok=True) + os.makedirs(self.output_dir / 'debug', exist_ok=True) + os.makedirs(self.output_dir / 'graphs', exist_ok=True) + + with open(self.output_dir / "debug" / "operations.nf",'w') as file: + pass + with open(self.output_dir / "debug" / "calls.nf",'w') as file: + pass + with open(self.output_dir / "debug" / "operations_in_call.nf",'w') as file: + pass + with open(self.output_dir / "graphs" / "full_graph_dico_format.json",'w') as file: + pass + with open(self.output_dir / "graphs" / "metadata_full_graph.json",'w') as file: + pass + + self.DSL = self.which_DSL() + self.set_null() + if(self.get_display_info()): + print(f"The workflow is written in '{self.get_DSL()}'") + else: + self.DSL = DSL + + + #---------------------- + #GENERAL + #---------------------- + def get_file_address(self): + return os.path.normpath(self.file) + + def get_root_directory(self): + if(self.origin==None): + return '/'.join(self.get_file_address().split('/')[:-1]) + else: + return self.origin.get_root_directory() + + #Returns either a subworkflow or process from the name + def get_element_from_name(self, name): + for process in self.processes: + if(name==process.get_name()): + return process + for subworkflow in self.subworkflows: + if(name==subworkflow.get_name()): + return subworkflow + for fun in self.functions: + if(name==fun.get_name()): + return fun + raise BioFlowInsightError(f"'{name}' is expected to be defined in the file, but it could not be found.", num = 18, origin=self) + + def get_DSL(self): + return self.DSL + + #Method which returns the DSL of the workflow -> by default it's DSL2 + #I use the presence of include, subworkflows and into/from in processes as a proxy + def which_DSL(self): + DSL = "DSL2" + #If there are include + 
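+
+ #--------------------------------------------------------------------------
+ #Editor's sketch (illustrative): the detection proxy described above, in
+ #miniature. The real method relies on the extract_* calls below; these
+ #regexes are simplified stand-ins, not the tool's constants:
+ import re
+
+ def which_dsl_sketch(code):
+     if re.search(r"^\s*include\b", code, re.M):    #any include -> DSL2
+         return "DSL2"
+     if re.search(r"^\s*workflow\b", code, re.M):   #main/subworkflow -> DSL2
+         return "DSL2"
+     if re.search(r"\b(into|from)\b", code):        #process into/from -> DSL1
+         return "DSL1"
+     return "DSL2"                                  #default
+ #--------------------------------------------------------------------------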
self.extract_includes() + if(len(self.includes)>0): + return DSL + #If there are subworkflows + self.extract_subworkflows() + if(len(self.subworkflows)>0): + return DSL + #If there is the main + self.extract_main() + if(self.main!=None): + return DSL + #Analyse the processes + self.extract_processes() + for p in self.processes: + DSL = p.which_DSL() + if(DSL=="DSL1"): + return DSL + return DSL + + #---------------------- + #PROCESS + #---------------------- + def get_process_from_name(self, name): + for process in self.processes: + if(process.get_name()==name): + return process + if(self.duplicate): + for include in self.includes: + defines = include.get_defines() + for d in defines: + if(d.get_alias()==name and d.get_type()=="Process"): + return d + else: + for include in self.includes: + aliases = include.get_aliases() + for a in aliases: + if(a==name and aliases[a].get_type()=="Process"): + return aliases[a] + + return None + raise Exception(f"Process '{name}' couldn't be found in '{self.get_file_address()}'") + + def get_number_processes(self): + return len(self.processes) + + + #---------------------- + #MAIN WORKFLOW + #---------------------- + #This method extracts the "main" workflow from the file + def extract_main(self): + from .main_DSL2 import Main_DSL2 + #This returns the code without the comments + code = "\n"+self.get_code()+"\n" + #Find pattern + twice = False + for match in re.finditer(constant.WORKFLOW_HEADER_2, code): + start = match.span(1)[0] + end = extract_curly(code, match.span(1)[1])#This function is defined in the functions file + self.main = Main_DSL2(code= code[start:end], origin=self) + if(twice): + raise Exception(f"Found multiple 'main workflows' in {self.get_file_address()}") + twice = True + + #---------------------- + #SUBWORKFLOW (ones found in the file) + #---------------------- + def extract_subworkflows(self): + from .subworkflow import Subworkflow + #Get code without comments + code = self.get_code() + #Find pattern + for match in re.finditer(constant.SUBWORKFLOW_HEADER, code): + start = match.span(0)[0] + end = extract_curly(code, match.span(0)[1])#This function is defined in the functions file + sub = Subworkflow(code=code[start:end], origin=self, name=match.group(1)) + self.subworkflows.append(sub) + + def get_list_name_subworkflows(self): + names = [] + for sub in self.subworkflows: + names.append(sub.get_name()) + return names + + def get_subworkflow_from_name(self, name): + for sub in self.subworkflows: + if(sub.get_name()==name): + return sub + if(self.duplicate): + for include in self.includes: + defines = include.get_defines() + for d in defines: + if(d.get_alias()==name and d.get_type()=="Subworkflow"): + return d + else: + for include in self.includes: + aliases = include.get_aliases() + for a in aliases: + if(a==name and aliases[a].get_type()=="Subworkflow"): + return aliases[a] + return None + raise Exception(f"Subworkflow '{name}' couldn't be found in '{self.get_file_address()}'") + + + #---------------------- + #INCLUDES + #---------------------- + def extract_includes(self): + from .include import Include + + code = self.get_code() + + #pattern = r"include +{([^\}]+)} +from +([^\n ]+)" + #pattern = r"include +({([^\}]+)}|(\w+)) +from +([^\n ]+)" + pattern = constant.FULL_INLCUDE_2 + + for match in re.finditer(pattern, code): + + includes = match.group(1).replace('{', '').replace('}', '').strip() + + #We do this if there are multiple includes + #TODO -> this in a nicer way + #To take into account + #include { + #PAIRTOOLS_SELECT + # as 
PAIRTOOLS_SELECT_VP; + #PAIRTOOLS_SELECT + # as PAIRTOOLS_SELECT_LONG + found_semi, found_n = bool(includes.find(";")+1), bool(includes.find("\n")+1) + if(found_semi and found_n): + temp = includes.split(";") + tab = [] + for temp_include in temp: + temp_include = temp_include.replace("\n", ' ').strip() + if(temp_include[:3] in constant.LIST_AS): + tab[-1] = tab[-1]+" "+temp_include + else: + tab.append(temp_include) + includes = tab + elif(found_semi): + includes = includes.split(";") + elif(found_n): + temp = includes.split("\n") + tab = [] + for temp_include in temp: + temp_include = temp_include.strip() + if(temp_include[:3]in constant.LIST_AS): + tab[-1] = tab[-1]+" "+temp_include + else: + tab.append(temp_include) + includes = tab + else: + includes = [includes] + + + #TODO -> check this + #https://www.nextflow.io/docs/latest/plugins.html#plugins + #https://github.com/nextflow-io/nf-validation + #address = match.group(0).split('from')[1].strip() + address = match.group(6).strip() + if(address[1:].split('/')[0] not in ['plugin']): + include = Include(code =match.group(0), file = address, importing = includes, origin=self, duplicate = self.duplicate) + self.includes.append(include) + self.add_include_to_all_includes(include) + + + def get_list_name_includes(self): + names = [] + for include in self.includes: + names+=include.get_list_name_includes() + return names + + #---------------------- + #FUNCTIONS + #---------------------- + + #Method that extracts the functions from a file -> we don't analyse them + #since they don't structurally change the workflow + def extract_functions(self): + from .function import Function + #pattern_function = r"(def|String|void|Void|byte|short|int|long|float|double|char|Boolean) *(\w+) *\([^,)]*(,[^,)]+)*\)\s*{" + pattern_function = constant.HEADER_FUNCTION + code = self.get_code() + #Find pattern + for match in re.finditer(pattern_function, code): + start = match.span(0)[0] + end = extract_curly(code, match.span(0)[1])#This function is defined in the functions file + #f = Code(code=code[start:end], origin=self) + f = Function(code = code[start:end], name = match.group(2), origin =self) + self.functions.append(f) + + def get_function_from_name(self, name): + for fun in self.functions: + if(fun.get_name()==name): + return fun + + if(self.duplicate): + for include in self.includes: + defines = include.get_defines() + for d in defines: + if(d.get_alias()==name and d.get_type()=="Function"): + return d + else: + for include in self.includes: + aliases = include.get_aliases() + for a in aliases: + if(a==name and aliases[a].get_type()=="Function"): + return aliases[a] + return None + + def get_includes(self): + return self.includes + + def get_all_includes(self): + if(self.first_file): + return self.all_includes + else: + return self.origin.get_all_includes() + + def add_include_to_all_includes(self, include): + if(self.first_file): + self.all_includes.append(include) + else: + self.origin.add_include_to_all_includes(include) + + #---------------------- + #INITIALISE + #---------------------- + + #Method that initialises the nextflow file + def initialise(self): + + + if(self.get_DSL()=="DSL2"): + if(self.get_display_info()): + print(self.get_file_address()) + + #Extarct Processes + self.extract_processes() + #print("Extract processes :", self.processes) + + #CODE without processes + code = self.get_code() + for proecess in self.processes: + code = code.replace(proecess.get_code(), "") + #for match in re.finditer(r"\\\s*\n\s*\|", code): + # #TODO add line + # 
print(code)
+ # raise BioFlowInsightError(f"The use of backslash '\\' and pipe operator '|' was found in the file '{self.get_file_address()}.' ", origin=self)
+
+
+ #Analyse Processes
+ #TODO analyse processes
+
+ #Extract includes
+ self.extract_includes()
+ #print("Extract includes :", self.includes)
+
+ #Analyse Includes
+ for include in self.includes:
+ include.initialise()
+
+ #Extract subworkflows
+ self.extract_subworkflows()
+ #print("Extract subworkflows :", self.subworkflows)
+
+ #Extract main
+ self.extract_main()
+ #print("Extract main :", self.main)
+
+ #Extract functions
+ self.extract_functions()
+
+ #Extract Executors
+ self.extract_executors()
+
+ #Analyse Executors
+ for e in self.executors:
+ e.initialise()
+
+
+
+ #Analyse Main
+ if(self.main!=None and self.first_file):
+ self.main.initialise()
+
+ #Analyse subworkflows
+ indice=1
+ for sub in self.subworkflows:
+ sub.initialise()
+ indice+=1
+
+ if(self.first_file):
+ number_process_used = 0
+ with open(self.output_dir / 'debug/processes_used.json') as json_file:
+ dict = json.load(json_file)
+ for file in dict:
+ number_process_used+=len(set(dict[file]))
+
+ with open(self.output_dir / "general.json") as json_file:
+ dico_wf = json.load(json_file)
+
+ #dico_wf["processes"]["number used"] = number_process_used
+
+ with open(self.output_dir / "general.json", "w") as outfile:
+ json.dump(dico_wf, outfile, indent=4)
+
+
+ elif(self.get_DSL()=="DSL1"):
+ if(self.get_display_info()):
+ print(self.get_file_address())
+ self.extract_processes()
+ self.extract_functions()
+ self.extract_executors()
+ for e in self.executors:
+ e.initialise()
+
+ else:
+ raise Exception(f"I don't know what to do with this DSL: '{self.get_DSL()}'")
+
+ if(self.first_file):
+ self.initialise_graph()
+
+
+ #The start parameter is for when we call 'get_structure_DSL2' for the first time
+ def get_structure_DSL2(self, dico, start = False):
+ if(not self.already_added_structure):
+ self.already_added_structure = True
+ #Add the operations found in the file (outside of main or subworkflow) to the structure
+ for o in self.executors:
+ if(o.get_type()=="Operation"):
+ o.get_structure(dico)
+ else:
+ if(o.get_first_element_called().get_type()!="Function"):
+ raise Exception(f"Executor of type '{o.get_type()}' was extracted in a DSL2 workflow (outside of a subworkflow or main)! This shouldn't happen! The code is '{o.get_code()}' -> it was called in file '{o.get_file_address()}'")
+
+ #for c in self.get_channels():
+ # for source in c.get_source():
+ # for sink in c.get_sink():
+ # dico["edges"].append({'A':str(source), 'B':str(sink), "label":c.get_name()})
+
+ if(start):
+ if(self.main!=None):
+ self.main.get_structure(dico)
+ if(not start and self.main!=None):
+ warnings.warn(f"Another main was detected in the file '{self.get_file_address()}' (it is not represented in the graph)")
+ #raise Exception(f'There was a second main which was detected in the workflow in the file {self.get_file_address()}')
+ return dico
+
+
+
+ def get_structure_DSL1(self, dico):
+ for p in self.get_processes():
+ p.get_structure(dico)
+
+ for o in self.get_executors():
+ if(o.get_type()=="Operation"):
+ o.get_structure(dico)
+ else:
+ raise Exception(f"Executor of type '{o.get_type()}' was extracted in a DSL1 workflow! This shouldn't happen! 
The code is '{o.get_code()}'") + + for c in self.get_channels(): + for source in c.get_source(): + for sink in c.get_sink(): + #If the sink an operation then the edge has already been added in the get_structure method for the operation + if(sink.get_type()=="Process"): + dico["edges"].append({'A':str(source), 'B':str(sink), "label":c.get_name()}) + + return dico + + + def get_structure(self): + dico = {} + dico['nodes'] = [] + dico['edges'] = [] + dico['subworkflows'] = {} + + if(self.DSL == "DSL1"): + return self.get_structure_DSL1(dico=dico) + elif(self.DSL == "DSL2"): + return self.get_structure_DSL2(dico=dico, start = True) + else: + raise Exception(f"The workflow's DSL is '{self.DSL}' -> I don't know what this is!") + + + def initialise_graph(self): + from .graph import Graph + if(self.graph==None): + self.graph = Graph(self) + + def generate_all_graphs(self, render_graphs = True): + #Initialisation (obligatory) + self.graph.initialise() + + #Generate the different graphs + self.graph.get_full_graph(render_graphs = render_graphs) + self.graph.get_full_graph_wo_lables(render_graphs = render_graphs) + self.graph.render_graph_wo_operations(render_graphs = render_graphs) + self.graph.get_graph_wo_orphan_operations(render_graphs = render_graphs) + self.graph.get_graph_wo_orphan_operations_wo_lables(render_graphs = render_graphs) + self.graph.render_graph_wo_branch_operations(render_graphs = render_graphs) + self.graph.get_graph_wo_branch_operations_wo_lables(render_graphs = render_graphs) + self.graph.get_graph_wo_branch_operations_wo_orphan_operations(render_graphs = render_graphs) + self.graph.get_graph_wo_branch_operations_wo_orphan_operations_wo_lables(render_graphs = render_graphs) + + + #Generate the different metadata associated with the graphs + self.graph.get_metadata_fullgraph() + self.graph.get_metadata_graph_wo_branch_operations() + self.graph.get_metadata_graph_wo_operations() + + #def get_metadata_graph_wo_operations(self): + # self.graph.get_metadata_graph_wo_operations() + + diff --git a/src/operation.py b/src/operation.py new file mode 100644 index 0000000000000000000000000000000000000000..00244d6a21d3434cecfc78ef4407844898c2fb86 --- /dev/null +++ b/src/operation.py @@ -0,0 +1,802 @@ + + +import warnings + +#TODO +#- uniform eveything here +#- add a list of words illegal for channel eg. [true, process, workflow...] + +import re +from .outils import get_end_operator, get_end_call +from .code_ import Code +from .executor import Executor +from .bioflowinsighterror import BioFlowInsightError +from . 
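+
+ #--------------------------------------------------------------------------
+ #Editor's note (illustrative): an Operation models one executor line and
+ #records the channels it reads ('origins') and the channels it creates
+ #('gives'). For the Nextflow line
+ #
+ #    ch_out = ch_in.map { it * 2 }
+ #
+ #initialise_origins() adds 'ch_in' to origins and initialise_gives() adds
+ #'ch_out' to gives. Origins can also be process/subworkflow outputs written
+ #as 'FOO.out', 'FOO.out[0]' or 'FOO.out.name', which are modelled as
+ #Emitted objects rather than channels.
+ #--------------------------------------------------------------------------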
import constant + + +class Operation(Executor): + def __init__(self, code, origin, OG_code = ''): + self.origin = origin + self.code = Code(code, origin = self) + self.origins = [] + self.gives = [] + self.label = "" + self.calls = {} + self.OG_code = OG_code + self.show_in_structure = True + self.operation_type = None + + def change_code(self, code): + self.code = Code(code, origin = self) + + + + def add_element_gives(self, ele): + self.gives.append(ele) + + def add_element_origins(self, ele): + self.origins.append(ele) + + + def is_defined_in_process(self, process): + if(len(self.gives)!=0): + raise Exception(f"This operation is defined in {process.get_name()} shoudn't be able to give a channel") + #Don't need to remove it from the list of operations cause it was never added (that's done outside of the operation) + for c in self.origins: + c.remove_element_from_sink(self) + c.add_sink(process) + + def get_name(self): + return str(self) + + + def get_gives(self): + return self.gives + + def get_type(self): + return "Operation" + + def check_in_channels(self, channel): + return self.origin.check_in_channels(channel) + + def add_channel(self, channel): + self.origin.add_channel(channel) + + + + def add_origin_channel(self, name): + from .channel import Channel + #Check that the name is not the list of illegal words + #and Check that the thing extarcted is not WorkflowNameFile like 'WorkflowHgtseq' in nf-core/hgtseq + if(name not in constant.ERROR_WORDS_ORIGINS and name.lower()!=f"workflow{self.get_name_file().lower()}"): + channel = Channel(name=name, origin=self.origin) + #TODO -> this needs to be checked + if(not self.origin.check_in_channels(channel)): + self.origin.add_channel(channel) + else: + channel = self.origin.get_channel_from_name(name) + self.origins.append(channel) + #channel.initialise() + channel.add_sink(self) + + + + #I don't need to define the equivalent gives -> cause it's not possible:) + def add_origin_emits(self, full_code, name_called, name_emitted): + from .emitted import Emitted + #full_code, name_called, name_emitted = match.group(1), match.group(2), match.group(3) + IGNORE_NAMES = ['params'] + #In the cas an operator is extracted at the end of the emit + if(full_code.count('.')>=2): + splited = full_code.split('.') + if( splited[-1] in constant.LIST_OPERATORS): + full_code = '.'.join(splited[:-1]) + if(name_called not in IGNORE_NAMES): + process = self.origin.get_process_from_name(name_called) + subworkflow = self.origin.get_subworkflow_from_name(name_called) + + if(process!=None and subworkflow!=None): + raise Exception(f"Problem in get_element -> {name_called} exists as process and subworkflow") + #Case subworkflow + if(process==None and subworkflow!=None): + emitted = Emitted(name=full_code, origin=self.origin, emitted_by=subworkflow) + emitted.set_emits(name_emitted) + #Case Process + if(process!=None and subworkflow==None): + emitted = Emitted(name=full_code, origin=self.origin, emitted_by=process) + #TODO -> analyse the outputs of the process + + if(process==None and subworkflow==None): + if(name_called[:5]=="Call_"): + name_called = self.calls[name_called].get_code() + raise BioFlowInsightError(f"The call for '{name_called}' coudn't be found, before its use in the operation '{self.get_code(get_OG=True)}'{self.get_string_line(self.get_code(get_OG=True))}. 
Either because the call wasn't made before the operation or that the element it is calling doesn't exist.", num =8, origin=self) + + emitted.add_sink(self) + self.origins.append(emitted) + + #This methods checks if the input is an emit and adds it if it's the case, it also returns T/F if it's an emit + def check_is_emit(self, name): + pattern_emit_tab = constant.EMIT_TAB + pattern_emit_name = constant.EMIT_NAME + patterns = [pattern_emit_tab, pattern_emit_name] + found_an_emit = False + for pattern in patterns: + for match in re.finditer(pattern, name): + found_an_emit = True + full_code, name_called, name_emitted = match.group(0), match.group(1), match.group(3) + if(name_emitted not in constant.LIST_OPERATORS): + self.add_origin_emits(full_code, name_called, name_emitted) + else: + self.add_origin_emits(full_code, name_called, "") + + if(not found_an_emit): + for match in re.finditer(constant.EMIT_ALONE, name+' '): + found_an_emit = True + full_code, name_called = match.group(0).strip(), match.group(1) + self.add_origin_emits(full_code, name_called, "") + + return found_an_emit + + #Function that returns if an operation is a create or a branch + def get_operation_type(self): + if(self.operation_type==None): + if(len(self.origins)!=0 and len(self.gives)!=0): + return 'Branch' + else: + return 'Create' + return self.operation_type + + def set_operation_type(self, type): + self.operation_type = type + + + #Here since the operation "gives" a channel -> we don't check + #if it's a global channel since we are defining a new one + def add_gives(self, name): + from .channel import Channel + #Case it's a call and it's been replaced + if(re.fullmatch(constant.CALL_ID, name)): + self.gives.append(self.calls[name]) + raise Exception("This shoudn't happen! -> a call is taking a value") + + else: + channel = Channel(name=name, origin=self.origin) + if(not self.origin.check_in_channels(channel)): + self.origin.add_channel(channel) + else: + channel = self.origin.get_channel_from_name(name) + + self.gives.append(channel) + #channel.initialise() + channel.add_source(self) + + def add_origin(self, name): + if(self.origin.get_DSL()=="DSL2"): + #Case it's a call and it's been replaced + if(re.fullmatch(constant.CALL_ID, name)): + self.origins.append(self.calls[name]) + else: + ##Case it's a subworkflow + #subworkflow = self.origin.get_subworkflow_from_name(name) + #process = self.origin.get_process_from_name(name) + #if(subworkflow!=None): + # print("George it's a subworkflow") + # #Case suborkflow + # self.origins.append(subworkflow) + ##Case process + #elif(process!=None): + # print("George it's a process") + # #Case process + # self.origins.append(process) + ##In this case it's a channel + #else: + self.add_origin_channel(name) + else: + self.add_origin_channel(name) + + + #Function that from an operation gives the origin ('input') channels + #For every case i have checked by compilying the nextflow code for each operator + def initialise_origins(self): + operation = self.get_code(clean_pipe = True)+" " + ERROR_WORDS = constant.ERROR_WORDS_ORIGINS + + #Replace the channels written like "ch[0]" to "ch" -> since in anycase it's just a + #subpart of the channel (we can't analyse what's in the channel) + replacing_tab = True + while(replacing_tab): + replacing_tab = False + pattern_channel_tab = constant.CHANNEL_TAB + for match in re.finditer(pattern_channel_tab, operation): + if(match.group(1) not in ["out", "output"]): + operation = operation.replace(match.group(0), match.group(1)) + replacing_tab = True + 
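+
+ #--------------------------------------------------------------------------
+ #Editor's sketch (illustrative): the loop above collapses indexed channel
+ #accesses so the base channel is analysed. Assuming constant.CHANNEL_TAB
+ #matches a word followed by a bracketed index (guarding 'out'/'output'),
+ #the rewrite behaves like:
+ #
+ #    >>> re.sub(r"(\w+)\s*\[[^\]]+\]", r"\1", "ch[0].combine(meta)")
+ #    'ch.combine(meta)'
+ #--------------------------------------------------------------------------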
break + + #pattern= r'([^\=\n]+)\s*=\s*([^\?\n]+)\s*\?([^\n]+)' + #if(bool(re.fullmatch(pattern, operation))): + # for match in re.finditer(pattern, operation): + # origin_temp = match.group(3).split(":") + # for o in origin_temp: + # origin.append(o.strip()) + + + #else: + + #If the first word after an '=' is not the channel key word than it is an actual channel + #TODO -> check if this condition actually works!! + #temp = operation.split('.')[0].split('=')[-1].strip() + #if(get_first_word(temp)!='Channel' and get_first_word(temp)!='channel'): + # origin.append(get_first_word(temp)) + + #TODO -> does this case ever exist? + #================================ + #Case channel1 + #================================ + if(bool(re.fullmatch(constant.WORD, operation))): + self.add_origin(operation) + + + case_operation_starts_with_emit = False + #--------------------------- + #Check emits + #--------------------------- + #================================ + #Case call.out[num] + #================================ + #TODO -> here i assume that the [] is a decimal not necessary the case + pattern_emit_tab = constant.EMIT_TAB + #================================ + #Case channel1 = call.out.something + #================================ + pattern_emit_name = constant.EMIT_NAME + patterns = [pattern_emit_tab, pattern_emit_name] + first_call = True + for pattern in patterns: + #================================ + #Case channel1 = emits + #================================ + pattern_equals = r"\w+\s*=\s*\(?\s*("+pattern+r")" + for match in re.finditer(pattern_equals, operation): + full_code, name_called, name_emitted = match.group(1), match.group(2), match.group(4) + if(name_emitted in constant.LIST_OPERATORS): + self.add_origin_emits(full_code, name_called, "") + else: + self.add_origin_emits(full_code, name_called, name_emitted) + case_operation_starts_with_emit = True + + #================================ + #Case call.out[].something().. or call.out.channel.something().. -> we want to extract that emits + #================================ + for match in re.finditer(pattern, operation): + if(first_call): + full_code, name_called, name_emitted = match.group(0), match.group(1), match.group(3) + + #Check that it's a the begining of the operation + code = operation + operation_until_out = code[:code.find("out")] + if(operation_until_out==full_code[:full_code.find("out")]): + code_wo_spaces = code.replace(' ', '') + #We check that the term after the out is an operator or a channel + is_operator = True + try: + if(code_wo_spaces[len(match.group(0).replace(' ', ''))] in ['(', '{']): + is_operator=True + else: + is_operator=False + except: + is_operator=False + if(is_operator): + + self.add_origin_emits(full_code, name_called, "") + else: + self.add_origin_emits(full_code, name_called, name_emitted) + case_operation_starts_with_emit = True + first_call = False + + + #Here i the case where we assume the emit looks like "call.out" + if(not case_operation_starts_with_emit): + #================================ + #Case channel1 = emits + #================================ + pattern_equals = constant.EMIT_EQUALS + for match in re.finditer(pattern_equals, operation): + full_code, name_called = match.group(1), match.group(2) + self.add_origin_emits(full_code, name_called, "") + case_operation_starts_with_emit = True + + #================================ + #Case call.out.something().. 
we want to extract that emits + #================================ + #for match in re.finditer(r"(\w+)\s*\.\s*out", operation): + #TODO -> check this + #I've changed this to avoid problems like this : "ch_svdb_dbs.out_occs.toList()" + for match in re.finditer(constant.EMIT_OPERATION, operation+" "): + full_code, name_called = match.group(0), match.group(1) + #Check that it's a the begining of the operation + operation_until_out = operation[:operation.find("out")] + if(operation_until_out==full_code[:full_code.find("out")]): + self.add_origin_emits(full_code, name_called, "") + case_operation_starts_with_emit = True + + + + + if(not case_operation_starts_with_emit): + + #================================ + #Case channel1 = channel2.something + #================================ + pattern= constant.CHANNEL_EQUALS_OPERATION + for match in re.finditer(pattern, operation): + if(match.group(1) not in ERROR_WORDS): + #Here we create the channel from the name -> checks if it already exists in the workflow + name = match.group(1) + if(bool(re.fullmatch(constant.WORD, name))): + self.add_origin(name) + + #================================ + #Case channel1 = [.., ..] + #================================ + pattern= constant.CHANNEL_EQUALS_LIST + if(bool(re.fullmatch(pattern, operation.strip()))): + for match in re.finditer(pattern, operation): + origin_possibilities = match.group(1).split(",") + for o in origin_possibilities: + name = o.strip() + if(name not in ERROR_WORDS): + #Here we create the channel from the name -> checks if it already exists in the workflow + if(bool(re.fullmatch(constant.WORD, name))): + self.add_origin(name) + + #================================ + #Case (ch1, ch2, ...) = channel.something + #================================ + pattern= constant.TUPLE_EQUALS + for match in re.finditer(pattern, operation): + if(match.group(2) not in ERROR_WORDS): + #Here we create the channel from the name -> checks if it already exists in the workflow + name = match.group(2) + if(bool(re.fullmatch(constant.WORD, name))): + self.add_origin(name) + + #================================ + #Case channel1 = channel2 + #================================ + if(bool(re.fullmatch(constant.CHANNEL_EQUALS, operation.strip()))): + temp = operation.split('=')[-1].strip() + if(temp not in ERROR_WORDS and bool(re.fullmatch(constant.WORD, temp))): + #Here we create the channel from the name -> checks if it already exists in the workflow + self.add_origin(temp) + + + #================================ + #Case (ch1, ch2, ...) = (ch1_1, ch2_1, ...) + #================================ + #Nextflow doesn't allow this + #TODO -> double check + + #================================ + #Case channel.something().. 
-> we want to extract that channel + #================================ + index_dot = operation.find(".") + if(index_dot!=-1): + if(bool(re.fullmatch(constant.WORD_DOT, operation[:index_dot+1].strip()))): + temp = operation[:index_dot].strip() + if(temp not in ERROR_WORDS and bool(re.fullmatch(constant.WORD, temp))): + #Here we create the channel from the name -> checks if it already exists in the workflow + name = temp + if(bool(re.fullmatch(constant.WORD, name))): + self.add_origin(name) + + + ##================================ + ##join/ phase/ cross/ combine + ##================================ + #pattern= r'\.\s*(join|phase|cross|combine)\s*\(([^\)]+)\)' + #for match in re.finditer(pattern, operation): + # name = match.group(2).strip() + # #Case channel + # if(bool(re.fullmatch(r'\w+', name))): + # self.add_origin(name) + # else: + # #check and add if it's an emitted value + # emited = self.check_is_emit(name) + # if(not emited): + # raise Exception(f"I don't know what i'm looking at {name} in {self.get_code()}") + + + #================================ + #merge/ mix/ concat/ spread/ join/ phase/ cross/ combine + #================================ + pattern= constant.MERGE_OPERATIONS + for match in re.finditer(pattern, operation): + start, end, beginning_character= match.span(1)[0], match.span(1)[1], match.group(3) + operator_call, operator_params = get_end_operator(operation, start, end, beginning_character) + spliting_param = '' + if(beginning_character=="("): + spliting_param="," + if(beginning_character=="{"): + spliting_param=";" + temp= operator_params.split(spliting_param) + for t in temp: + name = t.strip() + #Case channel + if(bool(re.fullmatch(constant.WORD, name))): + self.add_origin(name) + else: + #check and add if it's an emitted value + emited = self.check_is_emit(name) + if(not emited): + #TODO -> check at what extend this is used + channels = self.get_channels() + for c in channels: + if(c.get_name() in name): + pos = [m.start() for m in re.finditer(c.get_name(), operation)] + to_add = True + for p in pos: + if(p>0): + if(bool(re.fullmatch(constant.ILLEGAL_CHARCTER_BEFORE_POTENTIAL_CHANNELS, operation[p-1]))): + to_add = False + if(to_add): + self.add_origin(c.get_name()) + #TODO update this -> it's an operation itselfs + warnings.warn(f"I don't know what i'm looking at '{name}' in '{self.get_code()}'\n") + + ##================================ + ##merge/ mix/ concat + ##================================ + #pattern= r'\.\s*(merge|mix|concat)\s*\((\s*\w+\s*\,\s*(\w+\s*\,\s*)*\w+\s*|\s*(\w+)\s*)\)' + #for match in re.finditer(pattern, operation): + # temp=match.group(2) + # temp= temp.split(',') + # for t in temp: + # t= t.strip() + # #Here we create the channel from the name -> checks if it already exists in the workflow + # name = t + # if(bool(re.fullmatch(r'\w+', name))): + # self.add_origin(name) + # + ##================================ + ##spread + ##================================ + #pattern= r'\.\s*spread\s*\(([\s\w\.(),\"\'\{\}\[\]+-]+)\)' + #for match in re.finditer(pattern, operation): + # #Here we create the channel from the name -> checks if it already exists in the workflow + # name = match.group(1).strip() + # if(bool(re.fullmatch(r'\w+', name))): + # self.add_origin(name) + + #print(self.origin) + #self.origins = list(set(origin)) + + + + + #Method that intialises the gives (the outputs) of an opeartion + def initialise_gives(self): + code = self.get_code(clean_pipe = True) + #Case channel1 = something -> then channel1 is added to the gives + 
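+
+#Worked example for the merge/mix/concat handling above (a sketch, hypothetical
+#names): in "ch = ch_a.join(ch_b)" the operator opens with "(", so
+#get_end_operator returns operator_params="ch_b" and the parameters are split
+#on ','; in a closure form such as ".merge{ ch_b; ch_c }" the opening character
+#is "{" and the split is on ';'. Each extracted name is then added as an origin
+#or resolved as an emitted value via check_is_emit.
+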
if(bool(re.fullmatch(constant.CHANNEL_EQUALS_SOMETHING, code))): + first_gives = code.split("=")[0].strip() + if(bool(re.fullmatch(constant.WORD, first_gives))): + self.add_gives(first_gives) + + #Case (ch1, ch2, ...) = something -> then ch1, ch2, ... is added to the gives + elif(bool(re.fullmatch(constant.TUPLE_EQUALS_SOMETHING, code))): + for match in re.finditer(constant.TUPLE_EQUALS_SOMETHING, code): + to_give = match.group(1)[1:-1]#Removing the parentheses + to_give = to_give.split(",") + for g in to_give: + g = g.strip() + if(bool(re.fullmatch(constant.WORD, g))): + self.add_gives(g) + else: + raise Exception("Something unexpected") + + #else: + # print(self.get_code()) + # raise Exception("Something unexpected!") + + + #Cases we use the "set" operators + set_operators = constant.SET_OPERATORS + start_end = [["(", ")"], ["{", "}"]] + for operator in set_operators: + for start, end in start_end: + pattern = f"{operator}\s*\{start}([^\{end}]+)\{end}" + for match in re.finditer(pattern, code): + channels = match.group(1) + #Add channel + gives = re.split(';|,|\n', channels) + for g in gives: + c = g.strip() + if(c!=""): + if(bool(re.fullmatch(constant.WORD, c))): + if(not bool(re.fullmatch(constant.NUMBER, c))): + self.add_gives(c) + else: + #check and add if it's an emitted value + emited = self.check_is_emit(c) + #TODO -> do i not need to add it in the gives? + if(not emited): + raise Exception(f"I don't know what i'm looking at '{c}' in '{self.get_code()}'") + + + #print(operator, s, start, end) + + + + def get_origins(self): + return self.origins + + #def get_origins(self): + # tab = [] + # for o in self.origins: + # #tab.append(o.get_name()) + # tab.append(o) + # return tab + + def get_gives(self): + tab = [] + for g in self.gives: + #tab.append(g.get_name()) + tab.append(g) + return tab + + def print_operation(self): + print(self.get_code()) + + #TODO -> put this somewhere + def check_loop(self): + for g in self.gives: + for o in self.origins: + if(g==o): + print(f"Loop here:\n- {self.get_code()}\n- For the {o.get_name()} channel") + + #This method checks if an operation is just a full emited + #This is in the case of a parameter in a call + def check_if_operation_is_an_full_emitted(self): + #george_here + pattern_emit_tab = constant.EMIT_TAB + pattern_emit_name = constant.EMIT_NAME + pattern_emit_full = constant.EMIT_ALONE_2 + patterns = [pattern_emit_tab, pattern_emit_name, pattern_emit_full] + for pattern in patterns: + if(bool(re.fullmatch(pattern, self.get_code(clean_pipe = True)))): + return True + return False + + + #def print_summary(self, tab = 0, print_code=False): + # print(" "*tab+f"{self}") + # if(print_code): + # print(" "*(tab+1)+"* Code :", self.get_code()) + # print(" "*(tab+1)+"* Origins") + # for o in self.get_origins(): + # print(" "*(tab+1+2)+o.get_code(), f"'{o.get_type()}'") + # print(" "*(tab+1)+"* Gives") + # for g in self.get_gives(): + # print(" "*(tab+1+2)+g.get_code(), f"'{g.get_type()}'") + + def write_summary(self, address, tab = 0): + file = open(address, "a") + file.write(" "*tab+f"{self}\n") + file.write(" "*(tab+1)+"* Code : "+str(self.get_code())+ "\n") + file.write(" "*(tab+1)+"* Origins"+ "\n") + for o in self.get_origins(): + file.write(" "*(tab+1+2)+o.get_code()+ f" '{o.get_type()}'\n") + file.write(" "*(tab+1)+"* Gives\n") + for g in self.get_gives(): + file.write(" "*(tab+1+2)+g.get_code()+ f" '{g.get_type()}'\n") + + file.write("\n") + + # Closing the opened file + file.close() + + + def get_code(self, replace_calls = True, clean_pipe = 
False, get_OG=False): + code = self.code.get_code() + if(get_OG): + if(self.OG_code!=""): + return self.OG_code + else: + return code + + + + if(clean_pipe): + code = self.clean_pipe_operator(code) + + if(replace_calls): + for call in self.calls: + code = code.replace(self.calls[call].get_code(), str(call)) + + return code + + def initialise_double_dot(self): + self.extract_calls(clean_pipe=False) + code = self.get_code(clean_pipe = False) + pattern = constant.DOUBLE_DOT + for match in re.finditer(pattern, code): + double_dot = match.group(0).strip() + + c = double_dot.split("=")[0].strip() + self.add_gives(c) + + possibilities = double_dot[double_dot.rfind('?')+1:].split(":") + for p in possibilities: + p = p.strip() + if(p!=""): + name = p + #print(name) + if(bool(re.fullmatch(constant.WORD, name))): + self.add_origin(name) + elif(self.check_is_emit(name)): + None + #else: + # raise Exception(f"Don't know what i'm looking at '{name}' in operation '{self.get_code()}', in file '{self.get_file_address()}'!") + + + + + def extract_calls(self, clean_pipe = True): + from .call import Call + to_call = self.get_name_processes_subworkflows() + pattern_call = constant.BEGINNING_CALL + searching = True + while(searching): + searching= False + text = self.get_code(clean_pipe = clean_pipe) + + for c in self.calls: + text = text.replace(self.calls[c].get_code(), "") + for match in re.finditer(pattern_call, text): + if(match.group(1) in to_call): + searching=True + start, end = match.span(0) + call = Call(code =get_end_call(text, start, end), origin =self) + call.initialise() + self.calls[str(call)] = call + break + + #pattern_call_pipe = r"\|\s*(\w+)" + #searching = True + #while(searching): + # searching= False + # text = self.get_code(clean_pipe = clean_pipe) + # + # for c in self.calls: + # text = text.replace(self.calls[c].get_code(), "") + # for match in re.finditer(pattern_call_pipe, text): + # if(match.group(1) in to_call): + # print(match.group(1), text) + # start, end = match.span(0) + # from .outils import checks_in_condition_if, checks_in_string, extract_inside_parentheses + # if(not checks_in_condition_if(text, match.group(1)) and not checks_in_string(text, match.group(1))): + # searching=True + # call = Call(code =extract_inside_parentheses(text, match.group(1)), origin =self) + # call.initialise() + # self.calls[str(call)] = call + # break + + + + def initialise(self): + pattern = constant.DOUBLE_DOT + #If the operation is a double dot consition thing + if(bool(re.fullmatch(pattern, self.get_code(clean_pipe = False)))): + self.initialise_double_dot() + elif(bool(re.fullmatch(constant.DOUBLE_DOT_TUPLE, self.get_code(clean_pipe = False)))): + raise BioFlowInsightError(f"A ternary conditional operator was used with an tuple{self.get_string_line(self.get_code(clean_pipe = False))}. BioFlow-Insight doesn't support this yet (see specification list), try defining the operation in a different way.", num=5, origin=self) + else: + self.extract_calls() + self.initialise_origins() + self.initialise_gives() + + + + self.write_summary(self.get_output_dir() / "debug/operations.nf") + + def check_if_empty_call(self): + return self.get_code()=="" + + + + def initialise_from_call(self): + if(self.get_code()!="" and self.get_code()[0]=="[" and self.get_code()[-1]=="]"): + #print("in list", self.get_code()) + None + #TODO + #basically in this code -> i want to do the same thing for analye_parameters for a call + #But instead of adding them to the params, adding them to the gives.. 
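+
+#Worked example for initialise_double_dot above (a sketch, hypothetical names):
+#for the operation "ch = use_bwa ? ch_bwa : ch_star", the part before the '='
+#is added to the gives ("ch"), and the alternatives after the last '?' are
+#split on ':' so that add_origin is called for both "ch_bwa" and "ch_star".
+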
+ #Cause in the list you can put anything really + + #Working here + if(self.get_code()!=""): + self.extract_calls() + self.initialise_gives() + self.initialise_origins() + self.gives+=self.origins + self.gives = list(set(self.gives)) + #TODO -> this was originally uncommented, check it doesn't add any other bugs + #self.origins = [] + warnings.warn(f"TO CHECK !! From this : '{self.get_code()}'. I extracted to give (for a call) '{self.gives}' (in file '{self.get_file_address()}')\n") + #TODO + #We check that the operation is an actuel operation and not just a string for example + #if(len(self.get_origins())==0 and len(self.get_gives())==0): + # self.show_in_structure = False + + self.write_summary(self.get_output_dir() / "debug/operations_in_call.nf") + + def get_structure(self, dico, to_remove = False): + if(self.show_in_structure): + code = self.get_code(replace_calls=False) + #Need to replace /* and */ by /\* and *\/ so graphivz doesn't think it's a comment + #Same for // -> replace it by /\/\ + code = code.replace("/*", "/\*").replace("*/", "*\/").replace("//", "/\/\\") + code = code.replace('"', "'") + if(self.get_operation_type()=="Branch"): + fillcolor = "white" + else: + fillcolor = "" + + #TODO check this -> IMPORTANT + if(not to_remove): + dico['nodes'].append({'id':str(self), 'name':"", "shape":"point", 'xlabel': code, 'fillcolor':fillcolor}) + else: + #The ones which have the 'to_remove' name is because they are used in as takes and emits in subworkflow (they will be removed in post) + dico['nodes'].append({'id':str(self), 'name':"to_remove", "shape":"point", 'xlabel': code, 'fillcolor':fillcolor}) + + + for o in self.origins: + #Case origins is a channel + if(o.get_type()=="Channel"): + channel = o + channel.get_structure(dico, B=self) + + #Case origins is a call + elif(o.get_type()=="Call"): + call = o + call.get_structure(dico) + #Case the first call is a process + if(call.get_first_element_called().get_type()=="Process"): + dico["edges"].append({'A':str(call.get_first_element_called()), 'B':str(self), "label":""})#TODO check name of channel + #Case the first call is a subworkflow + elif(call.get_first_element_called().get_type()=="Subworkflow"): + sub = call.get_first_element_called() + if(sub.get_nb_emit()==0): + raise BioFlowInsightError(f"The subworkflow '{sub.get_name()}' doesn't emit anything. It is given to an operation{self.get_string_line(call.get_code())}.", num=20, origin=self) + elif(sub.get_nb_emit()>1): + raise BioFlowInsightError(f"To much to unpack : The subworkflow '{sub.get_name()}' emits over one channel in a operation{self.get_string_line(call.get_code())}.", num=20, origin=self) + #TODO recommendation -> try using an emit subworkflow.out + else: + emit = sub.get_emit()[0] + dico["edges"].append({'A':str(emit), 'B':str(self), "label":emit.get_code()}) + #for out in sub.get_emit(): + # print(out, out.get_code()) + # #These are channels + # #TODO check this -> it was the one line 644 before + # #out.get_structure(dico, B=self) + # out.get_structure(dico) + + elif(call.get_first_element_called().get_type()=="Function"): + #TODO check if this is actually the cas + None + else: + raise Exception("This souldn't happen!") + + + #Case origins is a Emmited + elif(o.get_type()=="Emitted"): + emitted = o + emitted.get_structure(dico, B=self) + + else: + print(self.get_code()) + print(self.get_file_address()) + raise Exception(f"This souldn't happen! The origin of an operation is of type '{o.get_type()}'. 
It's code is '{o.get_code()}'") + + diff --git a/src/outils.py b/src/outils.py new file mode 100644 index 0000000000000000000000000000000000000000..4309b82245032372d0eb3e2654e2fea5bb24c3c2 --- /dev/null +++ b/src/outils.py @@ -0,0 +1,820 @@ +import re + +#============================================================= +# THESE A JUST UTILITY FUNCTIONS TO BE ABLE TO MANIPULATE CODE +#============================================================= + +#Function that returns the next character (+ it's index) +def get_next_element_caracter(string, i): + while(i+1<len(string)): + i+=1 + if(string[i]!=' ' and string[i]!='\n'and string[i]!='\t'): + return string[i], i + return -1, -1 + +#Function that returns the character before (+ it's index) +def get_before_element_caracter(string, i): + while(i>0): + i-=1 + if(string[i]!=' ' and string[i]!='\n'and string[i]!='\t'): + return string[i], i + return -1, -1 + +def get_curly_count(code): + curly_count = 0 + quote_single, quote_double = False, False + triple_single, triple_double = False, False + for end in range(len(code)): + checked_triple = False + if(end+3<=len(code)): + if(code[end:end+3]=="'''" and not quote_single and not quote_double and not triple_single and not triple_double): + triple_single = True + end+=3 + checked_triple = True + elif(code[end:end+3]=="'''" and not quote_single and not quote_double and triple_single and not triple_double): + triple_single = False + end+=3 + checked_triple = True + + if(code[end:end+3]=='"""' and not quote_single and not quote_double and not triple_single and not triple_double): + triple_double = True + end+=3 + checked_triple = True + elif(code[end:end+3]=='"""' and not quote_single and not quote_double and not triple_single and triple_double): + triple_double = False + end+=3 + checked_triple = True + + if(not checked_triple): + if(code[end]=="{" and not quote_single and not quote_double and not triple_double): + curly_count+=1 + if(code[end]=="}" and not quote_single and not quote_double and not triple_double): + curly_count-=1 + + if(code[end]=="'" and not quote_single and not quote_double and not triple_double): + if(code[end-1]!="\\" or (code[end-1]=="\\" and code[end-2]=="\\")): + quote_single=True + elif(code[end]=="'" and quote_single and not quote_double and not triple_double): + if(code[end-1]!="\\" or (code[end-1]=="\\" and code[end-2]=="\\")): + quote_single=False + + if(code[end]=='"' and not quote_single and not quote_double and not triple_double): + if(code[end-1]!="\\" or (code[end-1]=="\\" and code[end-2]=="\\")): + quote_double=True + elif(code[end]=='"' and not quote_single and quote_double and not triple_double): + if(code[end-1]!="\\" or (code[end-1]=="\\" and code[end-2]=="\\")): + quote_double=False + return curly_count + +def get_single_count(code): + single_count = 0 + quote_single, quote_double = False, False + for end in range(len(code)): + if(code[end]=="'" and not quote_single and not quote_double): + if(code[end-1]!="\\" or (code[end-1]=="\\" and code[end-2]=="\\")): + quote_single=True + single_count+=1 + elif(code[end]=="'" and quote_single and not quote_double): + if(code[end-1]!="\\" or (code[end-1]=="\\" and code[end-2]=="\\")): + quote_single=False + single_count-=1 + + if(code[end]=='"' and not quote_single and not quote_double): + if(code[end-1]!="\\" or (code[end-1]=="\\" and code[end-2]=="\\")): + quote_double=True + elif(code[end]=='"' and not quote_single and quote_double): + if(code[end-1]!="\\" or (code[end-1]=="\\" and code[end-2]=="\\")): + quote_double=False + 
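+
+#Illustration (a sketch): these counters only count quotes that open or close a
+#string, so get_single_count("'a'") returns 0 (the pair closes) while
+#get_single_count("it's") returns 1 (the quote is left open); checks_in_string
+#further down relies on this to decide whether a piece of code sits inside a
+#string literal.
+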
return single_count + +def get_double_count(code): + double_count = 0 + quote_single, quote_double = False, False + for end in range(len(code)): + if(code[end]=="'" and not quote_single and not quote_double): + if(code[end-1]!="\\" or (code[end-1]=="\\" and code[end-2]=="\\")): + quote_single=True + elif(code[end]=="'" and quote_single and not quote_double): + if(code[end-1]!="\\" or (code[end-1]=="\\" and code[end-2]=="\\")): + quote_single=False + + if(code[end]=='"' and not quote_single and not quote_double): + if(code[end-1]!="\\" or (code[end-1]=="\\" and code[end-2]=="\\")): + quote_double=True + double_count+=1 + elif(code[end]=='"' and not quote_single and quote_double): + if(code[end-1]!="\\" or (code[end-1]=="\\" and code[end-2]=="\\")): + quote_double=False + double_count-=1 + return double_count + + +#Function that returns the parenthese count of a bit of code +def get_parenthese_count(code): + parenthese_count = 0 + quote_single, quote_double = False, False + triple_single, triple_double = False, False + for end in range(len(code)): + checked_triple = False + if(end+3<=len(code)): + if(code[end:end+3]=="'''" and not quote_single and not quote_double and not triple_single and not triple_double): + triple_single = True + end+=3 + checked_triple = True + elif(code[end:end+3]=="'''" and not quote_single and not quote_double and triple_single and not triple_double): + triple_single = False + end+=3 + checked_triple = True + + if(code[end:end+3]=='"""' and not quote_single and not quote_double and not triple_single and not triple_double): + triple_double = True + end+=3 + checked_triple = True + elif(code[end:end+3]=='"""' and not quote_single and not quote_double and not triple_single and triple_double): + triple_double = False + end+=3 + checked_triple = True + + if(not checked_triple): + if(code[end]=="(" and not quote_single and not quote_double and not triple_single and not triple_double): + parenthese_count+=1 + if(code[end]==")" and not quote_single and not quote_double and not triple_single and not triple_double): + parenthese_count-=1 + + if(code[end]=="'" and not quote_single and not quote_double and not triple_single and not triple_double): + if(code[end-1]!="\\" or (code[end-1]=="\\" and code[end-2]=="\\")): + quote_single=True + elif(code[end]=="'" and quote_single and not quote_double and not triple_single and not triple_double): + if(code[end-1]!="\\" or (code[end-1]=="\\" and code[end-2]=="\\")): + quote_single=False + + if(code[end]=='"' and not quote_single and not quote_double and not triple_single and not triple_double): + if(code[end-1]!="\\" or (code[end-1]=="\\" and code[end-2]=="\\")): + quote_double=True + elif(code[end]=='"' and not quote_single and quote_double and not triple_single and not triple_double): + if(code[end-1]!="\\" or (code[end-1]=="\\" and code[end-2]=="\\")): + quote_double=False + return parenthese_count + + +#Function that returns a subpart of the code until the parenthse_count equals the given value +def get_code_until_parenthese_count(code, val, left_2_right = True): + parenthese_count = 0 + quote_single, quote_double = False, False + if(left_2_right): + tab = list(range(len(code))) + else: + tab = list(range(len(code)-1, -1, -1)) + for end in tab: + if(parenthese_count==val): + if(left_2_right): + return code[:end] + else: + return code[end:] + + if(code[end]=="(" and not quote_single and not quote_double): + parenthese_count+=1 + if(code[end]==")" and not quote_single and not quote_double): + parenthese_count-=1 + + if(code[end]=="'" 
and not quote_single and not quote_double): + if(code[end-1]!="\\" or (code[end-1]=="\\" and code[end-2]=="\\")): + quote_single=True + + elif(code[end]=="'" and quote_single and not quote_double): + if(code[end-1]!="\\" or (code[end-1]=="\\" and code[end-2]=="\\")): + quote_single=False + + if(code[end]=='"' and not quote_single and not quote_double): + if(code[end-1]!="\\" or (code[end-1]=="\\" and code[end-2]=="\\")): + quote_double=True + elif(code[end]=='"' and not quote_single and quote_double): + if(code[end-1]!="\\" or (code[end-1]=="\\" and code[end-2]=="\\")): + quote_double=False + + if(parenthese_count==val): + return code + return None + + +#This function takes some code, the begining of an operator and the end, then extracts +#the whole executor +def extract_executor_from_middle(code, start, end): + save_start, save_end = start, end + find_start, find_end = False, False + + #Basically the logic here is that at the start of operation curly or parenthese count can be positive but never negative (see example below) + curly_count, parenthese_count = 0, 0 + quote_single, quote_double = False, False + + + while(not find_start): + if(start<0): + raise Exception(f"Couldn't find the start of the executor : {code[save_start:save_end]}") + + if(code[end]=="{" and not quote_single and not quote_double): + curly_count+=1 + if(code[end]=="}" and not quote_single and not quote_double): + curly_count-=1 + if(code[end]=="(" and not quote_single and not quote_double): + parenthese_count+=1 + if(code[end]==")" and not quote_single and not quote_double): + parenthese_count-=1 + if(code[end]=="'" and not quote_single and not quote_double): + if(code[end-1]!="\\" or (code[end-1]=="\\" and code[end-2]=="\\")): + quote_single=True + elif(code[end]=="'" and quote_single and not quote_double): + if(code[end-1]!="\\" or (code[end-1]=="\\" and code[end-2]=="\\")): + quote_single=False + if(code[end]=='"' and not quote_single and not quote_double): + if(code[end-1]!="\\" or (code[end-1]=="\\" and code[end-2]=="\\")): + quote_double=True + elif(code[end]=='"' and not quote_single and quote_double): + if(code[end-1]!="\\" or (code[end-1]=="\\" and code[end-2]=="\\")): + quote_double=False + + next_character, _ = get_next_element_caracter(code, start) + character_before, _ = get_before_element_caracter(code, start) + + + if(code[start]=='\n' and (re.fullmatch("\w", next_character) or next_character in ['(']) and character_before not in ['(', '[', ',', '.', '|'] and curly_count>=0 and parenthese_count>=0 and not quote_single and not quote_single): + #if(code[start]=='\n' and character_before not in ['(', '[', ',', '.', '|'] and curly_count>=0 and parenthese_count>=0 and not quote_single and not quote_single): + find_start = True + else: + start-=1 + + + #Basically the logic here is that at the end of operation curly or parenthese count can be negative but never positive + #For example (.join is detected first): + #trim_reads + #.join(trim_log) + #.map { + # meta, reads, trim_log -> + # if (!meta.single_end) { + # trim_log = trim_log[-1] + # } + # if (getTrimGaloreReadsAfterFiltering(trim_log) > 0) { + # [ meta, reads ] + # } + #} + #.set { trim_reads } + + curly_count, parenthese_count = 0, 0 + quote_single, quote_double = False, False + + + while(not find_end): + if(end>=len(code)): + raise Exception(f"Couldn't find the end of the executor : {code[start:save_end]}") + + + if(code[end]=="{" and not quote_single and not quote_double): + curly_count+=1 + if(code[end]=="}" and not quote_single and not 
quote_double): + curly_count-=1 + if(code[end]=="(" and not quote_single and not quote_double): + parenthese_count+=1 + if(code[end]==")" and not quote_single and not quote_double): + parenthese_count-=1 + if(code[end]=="'" and not quote_single and not quote_double): + if(code[end-1]!="\\" or (code[end-1]=="\\" and code[end-2]=="\\")): + quote_single=True + elif(code[end]=="'" and quote_single and not quote_double): + if(code[end-1]!="\\" or (code[end-1]=="\\" and code[end-2]=="\\")): + quote_single=False + if(code[end]=='"' and not quote_single and not quote_double): + if(code[end-1]!="\\" or (code[end-1]=="\\" and code[end-2]=="\\")): + quote_double=True + elif(code[end]=='"' and not quote_single and quote_double): + if(code[end-1]!="\\" or (code[end-1]=="\\" and code[end-2]=="\\")): + quote_double=False + + + + next_character, next = get_next_element_caracter(code, end) + next_next_character, next = get_next_element_caracter(code, next) + character_before, _ = get_before_element_caracter(code, end) + #TODO -> my intuition tells me i need to add next_character in ['}', ')']) + #But it creates a problem in this example + #MERGED_LIBRARY_ATAQV_MKARV ( + # MERGED_LIBRARY_ATAQV_ATAQV.out.json.collect{it[1]} + #) + #v0 + #if(code[end]=='\n' and (re.fullmatch("\w", next_character) or next_character in ['}', '/', '|']) and character_before in [')', '}'] and curly_count<=0 and parenthese_count<=0 and not quote_single and not quote_single): + #v1 + #if(code[end]=='\n' and (re.fullmatch("\w", next_character) or next_character in ['}', '/', '|']) and curly_count<=0 and parenthese_count<=0 and not quote_single and not quote_single): + #v2 + if(code[end]=='\n' and (re.fullmatch("\w", next_character) or next_character in ['}', '/', '|']) and character_before not in [','] and next_next_character not in ['.', '|'] and curly_count<=0 and parenthese_count<=0 and not quote_single and not quote_single): + find_end = True + else: + end+=1 + + return code[start:end].strip() + + +def extract_end_operation(code, start, end): + curly_count, parenthese_count , bracket_count= 0, 0, 0 + quote_single, quote_double = False, False + finish = False + while(not finish): + if(end>=len(code)): + raise Exception('Unable to extract') + elif(code[end]=="{" and not quote_single and not quote_double): + curly_count+=1 + elif(code[end]=="}" and not quote_single and not quote_double): + curly_count-=1 + elif(code[end]=="(" and not quote_single and not quote_double): + parenthese_count+=1 + elif(code[end]==")" and not quote_single and not quote_double): + parenthese_count-=1 + elif(code[end]=="[" and not quote_single and not quote_double): + bracket_count+=1 + elif(code[end]=="]" and not quote_single and not quote_double): + bracket_count-=1 + elif(code[end]=="'" and not quote_single and not quote_double): + if(code[end-1]!="\\" or (code[end-1]=="\\" and code[end-2]=="\\")): + quote_single=True + elif(code[end]=='"' and not quote_single and not quote_double): + if(code[end-1]!="\\" or (code[end-1]=="\\" and code[end-2]=="\\")): + quote_double=True + elif(code[end]=="'" and quote_single and not quote_double): + if(code[end-1]!="\\" or (code[end-1]=="\\" and code[end-2]=="\\")): + quote_single=False + elif(code[end]=='"' and not quote_single and quote_double): + if(code[end-1]!="\\" or (code[end-1]=="\\" and code[end-2]=="\\")): + quote_double=False + + character_before, _ = get_before_element_caracter(code, end) + next_character, _ = get_next_element_caracter(code, end) + if(code[end]=='\n' and next_character not in ['.', "|"] and 
curly_count==0 and parenthese_count==0 and bracket_count==0 and not quote_single and not quote_double and character_before!="|"): + #if(next_character!='.' and curly_count==0 and parenthese_count==0 and not quote_single and not quote_double): + finish = True + elif((curly_count<0 or parenthese_count<0 or bracket_count<0) and character_before in [')', '}'] and not quote_single and not quote_double): + finish = True + else: + end+=1 + return code[start:end].strip() + +#Function that 'finds' the end of the process, when we give the start position +#So it follows the pattern 'process name {....}' +def extract_curly(text, start): + + end = start + code= text + curly_count, parenthese_count = 1, 0 + quote_single, quote_double = False, False + triple_single, triple_double = False, False + + + while(parenthese_count!=0 or curly_count!=0 or quote_single or quote_double or triple_single or triple_double): + #print(parenthese_count, curly_count, quote_single, quote_double, triple_single, triple_double) + + + checked_triple = False + if(end+3<=len(code)): + if(code[end:end+3]=="'''" and not quote_single and not quote_double and not triple_single and not triple_double): + triple_single = True + end+=3 + checked_triple = True + elif(code[end:end+3]=="'''" and not quote_single and not quote_double and triple_single and not triple_double): + triple_single = False + end+=3 + checked_triple = True + + if(code[end:end+3]=='"""' and not quote_single and not quote_double and not triple_single and not triple_double): + triple_double = True + end+=3 + checked_triple = True + elif(code[end:end+3]=='"""' and not quote_single and not quote_double and not triple_single and triple_double): + triple_double = False + end+=3 + checked_triple = True + + if(not checked_triple): + if(code[end]=="{" and not quote_single and not quote_double and not triple_single and not triple_double): + curly_count+=1 + elif(code[end]=="}" and not quote_single and not quote_double and not triple_single and not triple_double): + curly_count-=1 + + if(code[end]=="(" and not quote_single and not quote_double and not triple_single and not triple_double): + parenthese_count+=1 + elif(code[end]==")" and not quote_single and not quote_double and not triple_single and not triple_double): + parenthese_count-=1 + + if(code[end]=="'" and not quote_single and not quote_double and not triple_single and not triple_double): + if(code[end-1]!="\\" or (code[end-1]=="\\" and code[end-2]=="\\")): + quote_single=True + elif(code[end]=="'" and quote_single and not quote_double and not triple_single and not triple_double): + if(code[end-1]!="\\" or (code[end-1]=="\\" and code[end-2]=="\\")): + quote_single=False + + if(code[end]=='"' and not quote_single and not quote_double and not triple_single and not triple_double): + if(code[end-1]!="\\" or (code[end-1]=="\\" and code[end-2]=="\\")): + quote_double=True + elif(code[end]=='"' and not quote_single and quote_double and not triple_single and not triple_double): + if(code[end-1]!="\\" or (code[end-1]=="\\" and code[end-2]=="\\")): + quote_double=False + + end+=1 + + if(end>len(code)): + raise Exception('Unable to extract') + return end + + + +def get_end_operator(code, start, end, beginning_character): + curly_count, parenthese_count = 0, 0 + quote_single, quote_double = False, False + + start_param = end + if(beginning_character=="("): + parenthese_count+=1 + if(beginning_character=="{"): + curly_count+=1 + + while(parenthese_count!=0 or curly_count!=0 or quote_single or quote_double): + if(code[end]=="{" and 
not quote_single and not quote_double): + curly_count+=1 + if(code[end]=="}" and not quote_single and not quote_double): + curly_count-=1 + if(code[end]=="(" and not quote_single and not quote_double): + parenthese_count+=1 + if(code[end]==")" and not quote_single and not quote_double): + parenthese_count-=1 + if(code[end]=="'" and not quote_single and not quote_double): + if(code[end-1]!="\\" or (code[end-1]=="\\" and code[end-2]=="\\")): + quote_single=True + elif(code[end]=="'" and quote_single and not quote_double): + if(code[end-1]!="\\" or (code[end-1]=="\\" and code[end-2]=="\\")): + quote_single=False + if(code[end]=='"' and not quote_single and not quote_double): + if(code[end-1]!="\\" or (code[end-1]=="\\" and code[end-2]=="\\")): + quote_double=True + elif(code[end]=='"' and not quote_single and quote_double): + if(code[end-1]!="\\" or (code[end-1]=="\\" and code[end-2]=="\\")): + quote_double=False + end+=1 + if(end>len(code)): + raise Exception('Unable to extract') + + return code[start:end].strip(), code[start_param:end-1].strip() + + + +#===================================================== +#FUNCTIONS FOR THE CODE CLASS -> REMOVE COMMENTS ETC.. +#===================================================== + +def remove_comments(input_text): + + input_text= input_text+"\n\n\n" + #Remove the \" and \' in the code + input_text = re.sub(r'([^\\])\\"', r'\g<1>', input_text) + input_text = re.sub(r"([^\\])\\'", r'\g<1>', input_text) + #Remove the /'/ and /"/ in the code + input_text = re.sub(r'\/"\/', "", input_text) + input_text = re.sub(r"\/'\/", "", input_text) + ##replace the '"${...}"' by '"""${...}"""' + #input_text = re.sub(r'" *(\$ *{[^}]*}) *"', r'"""\g<1>"""', input_text) + #input_text = re.sub(r"' *(\$ *{[^}]*}) *'", r"'''\g<1>'''", input_text) + + + + #input_text = input_text.replace('/\/*', '"').replace('\/*$/', '"')#TODO check if i actually wanna do this -> Etjean/Long_project/masque.nf + #TO remove `/\/* ... 
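+
+#Illustration of the expected behaviour (a sketch): a "//" comment is replaced
+#by a single line jump and a "/*...*/" bloc by as many line jumps as it spans,
+#so line numbers are preserved, e.g. remove_comments("a = 1 //x\nb = 2")
+#returns "a = 1 \nb = 2" (plus the "\n\n\n" padding added at the top of the
+#function); a quoted "//" such as in "url = 'http://x'" is left untouched
+#thanks to the quote tracking below.
+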
\/*$/ and /[fasta|fa]$/ + input_text = re.sub(r'\/\\\/\*([^(\\\/\*\$\/)]+)\\\/\*\$\/', r'"\g<1>"', input_text) + + #input_text = re.sub(r'\/([^($\/)]+)\$\/', r'"\g<1>"', input_text) + #if(temp!=input_text): + # print("-",start) + + to_remove = [] + quote_single, quote_double = False, False + triple_single, triple_double = False, False + in_bloc, in_single_line = False, False + start, end = 0, 0 + i=0 + while(i<len(input_text)-3): + #for i in range(len(input_text)-3): + #Case single line comment "//" + if(input_text[i:i+2]=="//" and not quote_single and not quote_double and not in_bloc and not in_single_line and not triple_single and not triple_double): + is_comment_one_line = True + if(i-1>=0): + if(input_text[i-1]=='\\'): + is_comment_one_line=False + if(is_comment_one_line): + start = i + in_single_line = True + i+=2 + else: + i+=1 + elif(input_text[i]=="\n" and not quote_single and not quote_double and not in_bloc and in_single_line and not triple_single and not triple_double): + end = i + in_single_line = False + to_remove.append(input_text[start:end+1]) + i+=1 + #Case bloc comment "/*...*/" + elif(input_text[i:i+2]=="/*" and not quote_single and not quote_double and not in_bloc and not in_single_line and not triple_single and not triple_double): + start = i + in_bloc = True + i+=2 + elif(input_text[i:i+2]=="*/" and not quote_single and not quote_double and in_bloc and not in_single_line and not triple_single and not triple_double): + end = i+2 + in_bloc = False + to_remove.append(input_text[start:end]) + i+=2 + #ELSE + #Triple single + elif(input_text[i:i+3]=="'''" and not quote_single and not quote_double and not in_bloc and not in_single_line and not triple_single and not triple_double): + triple_single = True + i+=3 + elif(input_text[i:i+3]=="'''" and not quote_single and not quote_double and not in_bloc and not in_single_line and triple_single and not triple_double): + triple_single = False + i+=3 + #Triple double + elif(input_text[i:i+3]=='"""' and not quote_single and not quote_double and not in_bloc and not in_single_line and not triple_single and not triple_double): + triple_double = True + i+=3 + elif(input_text[i:i+3]=='"""' and not quote_single and not quote_double and not in_bloc and not in_single_line and not triple_single and triple_double): + triple_double = False + i+=3 + #Case single + elif(input_text[i]=="'" and not quote_single and not quote_double and not in_bloc and not in_single_line and not triple_single and not triple_double): + #if(input_text[i-1]!="\\"): + quote_single = True + i+=1 + elif(input_text[i]=="'" and quote_single and not quote_double and not in_bloc and not in_single_line and not triple_single and not triple_double): + #if(input_text[i-1]!="\\"): + quote_single = False + i+=1 + #Case double + elif(input_text[i]=='"' and not quote_single and not quote_double and not in_bloc and not in_single_line and not triple_single and not triple_double): + #if(input_text[i-1]!="\\"): + quote_double = True + i+=1 + elif(input_text[i]=='"' and not quote_single and quote_double and not in_bloc and not in_single_line and not triple_single and not triple_double): + #if(input_text[i-1]!="\\"): + quote_double = False + i+=1 + else: + i+=1 + + for r in to_remove: + if(r[:2]=="//"): + input_text = input_text.replace(r, '\n', 1) + else: + nb_jumps = r.count('\n') + input_text = input_text.replace(r, '\n'*nb_jumps, 1) + + return input_text + + + +#---------------------- +#Calls +#---------------------- +def get_end_call(code, start, end): + curly_count, parenthese_count = 
0, 1 + quote_single, quote_double = False, False + #Before it was this + #while(parenthese_count!=0 or curly_count!=0 or quote_single or quote_double or code[end]!='\n'): + while(parenthese_count!=0 or curly_count!=0 or quote_single or quote_double): + if(end>=len(code)): + raise Exception('Unable to extract') + if(code[end]=="{" and not quote_single and not quote_double): + curly_count+=1 + if(code[end]=="}" and not quote_single and not quote_double): + curly_count-=1 + if(code[end]=="(" and not quote_single and not quote_double): + parenthese_count+=1 + if(code[end]==")" and not quote_single and not quote_double): + parenthese_count-=1 + if(code[end]=="'" and not quote_single and not quote_double): + if(code[end-1]!="\\" or (code[end-1]=="\\" and code[end-2]=="\\")): + quote_single=True + elif(code[end]=="'" and quote_single and not quote_double): + if(code[end-1]!="\\" or (code[end-1]=="\\" and code[end-2]=="\\")): + quote_single=False + if(code[end]=='"' and not quote_single and not quote_double): + if(code[end-1]!="\\" or (code[end-1]=="\\" and code[end-2]=="\\")): + quote_double=True + elif(code[end]=='"' and not quote_single and quote_double): + if(code[end-1]!="\\" or (code[end-1]=="\\" and code[end-2]=="\\")): + quote_double=False + end+=1 + return code[start:end].strip() + + +#This function takes a string "param" and returns the next parameter +def get_next_param(param): + curly_count, parenthese_count, bracket_count= 0, 0, 0 + quote_single, quote_double = False, False + end= 0 + while(True): + if(end>=len(param)): + return param, -1 + if(parenthese_count==0 and curly_count==0 and bracket_count==0 and not quote_single and not quote_double and param[end]==','): + return param[0:end], end+1 + + if(param[end]=="{" and not quote_single and not quote_double): + curly_count+=1 + elif(param[end]=="}" and not quote_single and not quote_double): + curly_count-=1 + elif(param[end]=="(" and not quote_single and not quote_double): + parenthese_count+=1 + elif(param[end]==")" and not quote_single and not quote_double): + parenthese_count-=1 + elif(param[end]=="[" and not quote_single and not quote_double): + bracket_count+=1 + elif(param[end]=="]" and not quote_single and not quote_double): + bracket_count-=1 + elif(param[end]=="'" and not quote_single and not quote_double): + if(param[end-1]!="\\"): + quote_single=True + elif(param[end]=='"' and not quote_single and not quote_double): + if(param[end-1]!="\\"): + quote_double=True + elif(param[end]=="'" and quote_single and not quote_double): + if(param[end-1]!="\\"): + quote_single=False + elif(param[end]=='"' and not quote_single and quote_double): + if(param[end-1]!="\\"): + quote_double=False + end+=1 + +def update_parameters(code, end, curly_count, parenthese_count, quote_single, quote_double) : + if(code[end]=="{" and not quote_single and not quote_double): + curly_count+=1 + elif(code[end]=="}" and not quote_single and not quote_double): + curly_count-=1 + elif(code[end]=="(" and not quote_single and not quote_double): + parenthese_count+=1 + elif(code[end]==")" and not quote_single and not quote_double): + parenthese_count-=1 + elif(code[end]=="'" and not quote_single and not quote_double): + if(code[end-1]!="\\" or (code[end-1]=="\\" and code[end-2]=="\\")): + quote_single=True + elif(code[end]=='"' and not quote_single and not quote_double): + if(code[end-1]!="\\" or (code[end-1]=="\\" and code[end-2]=="\\")): + quote_double=True + elif(code[end]=="'" and quote_single and not quote_double): + if(code[end-1]!="\\" or 
(code[end-1]=="\\" and code[end-2]=="\\")): + quote_single=False + elif(code[end]=='"' and not quote_single and quote_double): + if(code[end-1]!="\\" or (code[end-1]=="\\" and code[end-2]=="\\")): + quote_double=False + return curly_count, parenthese_count, quote_single, quote_double + + +def remove_jumps_inbetween_parentheses(code): + code = re.sub(',\s*\n\s*', ', ', code) + code = re.sub(';\s*\n\s*', '; ', code) + code = list(code) + parentheses_count = 0 + for i in range(len(code)): + if(code[i]=="("): + parentheses_count+=1 + elif(code[i]==")"): + parentheses_count-=1 + elif(code[i]=="\n" and parentheses_count!=0): + code[i] = " " + code = "".join(code) + code = re.sub(r", *\n", ", ", code) + return code + +#def check_if_parameter_is_given_pipe(code, OG_start, OG_end): +# char, end = get_next_element_caracter(code, OG_end-1) +# start = OG_end +# if(char in ['(', '{']): +# curly_count, parenthese_count = int(char=="{"), int(char=="(") +# quote_single, quote_double = False, False +# end+=1 +# #Before it was this +# #while(parenthese_count!=0 or curly_count!=0 or quote_single or quote_double or code[end]!='\n'): +# while(parenthese_count!=0 or curly_count!=0 or quote_single or quote_double): +# if(end>=len(code)): +# raise Exception('Unable to extract') +# if(code[end]=="{" and not quote_single and not quote_double): +# curly_count+=1 +# if(code[end]=="}" and not quote_single and not quote_double): +# curly_count-=1 +# if(code[end]=="(" and not quote_single and not quote_double): +# parenthese_count+=1 +# if(code[end]==")" and not quote_single and not quote_double): +# parenthese_count-=1 +# if(code[end]=="'" and not quote_single and not quote_double): +# if(code[end-1]!="\\" or (code[end-1]=="\\" and code[end-2]=="\\")): +# quote_single=True +# elif(code[end]=="'" and quote_single and not quote_double): +# if(code[end-1]!="\\" or (code[end-1]=="\\" and code[end-2]=="\\")): +# quote_single=False +# if(code[end]=='"' and not quote_single and not quote_double): +# if(code[end-1]!="\\" or (code[end-1]=="\\" and code[end-2]=="\\")): +# quote_double=True +# elif(code[end]=='"' and not quote_single and quote_double): +# if(code[end-1]!="\\" or (code[end-1]=="\\" and code[end-2]=="\\")): +# quote_double=False +# end+=1 +# return code[start:end].strip()[1:-1].strip(), code[OG_start:end] +# return '' + +def expand_call_to_operation(code, call): + start = code.find(call) + end = start+len(call) + char, _ = get_next_element_caracter(code, end-1) + #This means it's an operation + if(char=="."): + return extract_end_operation(code, start, end) + return call + +def expand_pipe_operator(code, operator): + start = code.find(operator) + end = start+len(operator) + expanding = True + while(expanding): + expanding = False + char, _ = get_next_element_caracter(code, end-1) + if(char in ['{', '|', '(']): + operator = extract_end_operation(code, start, end) + start = code.find(operator) + end = start+len(operator) + expanding = True + return operator + +#Function that checks if a bit of code given is in the condition of an if +def checks_in_condition_if(code, bit_of_code): + start = code.find(bit_of_code) + end = start+len(bit_of_code) + start_if, end_if = 0, 0 + for match in re.finditer(r"if\s*\(", code[:start]): + start_if, end_if = match.span(0) + parenthese_count_left_before_if = get_parenthese_count(code[start_if:start]) + if(parenthese_count_left_before_if>0 and get_parenthese_count(code[:start_if])==0): + code_end_if = get_code_until_parenthese_count(code[end:], -1*parenthese_count_left_before_if) + 
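+
+#Illustration (a sketch, hypothetical snippets): for
+#checks_in_condition_if("if (foo(y) == x){ y = 2 }", "foo(y)") the parenthesis
+#opened by "if (" is still unclosed before "foo(y)", so the function returns
+#True; for checks_in_condition_if("if (params.run){ FASTQC(ch) }", "FASTQC(ch)")
+#the condition is already closed before the call, so it returns False.
+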
if(code_end_if!=None): + code_right_after_if = code[code.find(code_end_if)+len(code_end_if):] + if(get_parenthese_count(code_right_after_if)==0): + return True + return False + + +#the function sort_and_filter takes two lists, positions and variables, and removes +#entries with positions equal to (0, 0). It then sorts the remaining entries based +#on positions and returns the sorted positions and corresponding variables. +def sort_and_filter(positions, variables): + combined_data = list(zip(positions, variables)) + combined_data = [(pos, var) for pos, var in combined_data if pos != (0, 0)] + combined_data.sort(key=lambda x: x[0]) + sorted_positions, sorted_variables = zip(*combined_data) + return list(sorted_positions), list(sorted_variables) + +#Function that checks if a bit of code given is in a string +def checks_in_string(code, bit_of_code): + start = code.find(bit_of_code) + end = start+len(bit_of_code) + + + #Start by single quote + first_quote_from_left, first_quote_from_right = -1, -1 + for i in range(start-1, -1, -1): + if(code[i]=="'"): + first_quote_from_left = i + break + for i in range(end, len(code)): + if(code[i]=="'"): + first_quote_from_right = i + break + if(first_quote_from_left!=-1 and first_quote_from_right!=-1): + if(get_single_count(code[:first_quote_from_left])==0 and get_single_count(code[first_quote_from_right+1:])==0): + return True + + #Do the same for double quote + first_quote_from_left, first_quote_from_right = -1, -1 + for i in range(start-1, -1, -1): + if(code[i]=='"'): + first_quote_from_left = i + break + if(first_quote_from_left==-1): + return False + for i in range(end, len(code)): + if(code[i]=='"'): + first_quote_from_right = i + break + if(first_quote_from_right==-1): + return False + if(first_quote_from_left!=-1 and first_quote_from_right!=-1): + if(get_double_count(code[:first_quote_from_left])==0 and get_double_count(code[first_quote_from_right+1:])==0): + return True + + return False + + +#This function extracts the rest of the inside of a parentheses given a +#bit of code (we assume that the bit of code is inside the code) +def extract_inside_parentheses(code, bit_of_code): + start = code.find(bit_of_code) + end = start+len(bit_of_code) + left = get_code_until_parenthese_count(code[:start], 1, left_2_right = False) + right = get_code_until_parenthese_count(code[end:], -1, left_2_right = True) + return (left[1:]+bit_of_code+right[:-1]).strip() diff --git a/src/outils_graph.py b/src/outils_graph.py new file mode 100644 index 0000000000000000000000000000000000000000..ca99e61f467b27c5b3904e3759d7961fdcbb99ac --- /dev/null +++ b/src/outils_graph.py @@ -0,0 +1,283 @@ +import graphviz +import copy +import numpy as np + +process_id = "<src.process.Process" +operation_id = "<src.operation.Operation" + +def is_process(node_id): + if(node_id[:len(process_id)]==process_id): + return True + return False + +def is_operation(node_id): + if(node_id[:len(operation_id)]==operation_id): + return True + return False + +def fill_dot(dot, dico, label_node = True, label_edge = True): + for n in dico["nodes"]: + if(label_node): + dot.node(n["id"], n["name"], shape=n["shape"], xlabel= n["xlabel"], fillcolor=n["fillcolor"]) + else: + dot.node(n["id"], n["name"], shape=n["shape"], fillcolor=n["fillcolor"]) + for e in dico["edges"]: + if(label_edge): + dot.edge(e['A'], e['B'], label= e['label']) + else: + dot.edge(e['A'], e['B']) + for sub in dico["subworkflows"]: + with dot.subgraph(name="cluster"+sub) as c: + fill_dot(c, dico["subworkflows"][sub], label_node, label_edge) 
+ c.attr(label=sub) + +def generate_graph(filename, dico, label_node = True, label_edge = True, render_graphs = True): + dot = graphviz.Digraph(filename=filename, format='png', comment="temp") + fill_dot(dot, dico, label_node, label_edge) + dot.save(filename=f'{filename}.dot') + if(render_graphs): + dot.render(filename=f'{filename}.dot', outfile=f'{filename}.png') + +#Function that merges to dictionnaries +def merge(x, y): + return { key:list(set(x.get(key,[])+y.get(key,[]))) for key in set(list(x.keys())+list(y.keys())) } + +#This function returns a listof the orphan operations in the graph +def get_id_orphan_operation(graph): + id_operations = [] + + def get_id_operations(graph): + for node in graph['nodes']: + if(is_operation(node['id'])): + id_operations.append(node['id']) + for subworkflow in graph["subworkflows"]: + get_id_operations(graph["subworkflows"][subworkflow]) + + def get_dico_operation_is_linked(graph, dico_operation_is_linked = {}): + #First call + if(dico_operation_is_linked == {}): + for id in id_operations: + dico_operation_is_linked[id] = False + for edge in graph["edges"]: + dico_operation_is_linked[edge["A"]] = True + dico_operation_is_linked[edge["B"]] = True + for subworkflow in graph["subworkflows"]: + get_dico_operation_is_linked(graph["subworkflows"][subworkflow], dico_operation_is_linked) + return dico_operation_is_linked + + + get_id_operations(graph) + dico = get_dico_operation_is_linked(graph) + tab = [] + for operation_id in dico: + if(not dico[operation_id]): + tab.append(operation_id) + return tab + +def graph_dico_wo_orphan_operations(graph_tmp): + graph = copy.deepcopy(graph_tmp) + orphans = get_id_orphan_operation(graph) + + def remove_orphans(graph, orphans): + to_remove = [] + for node in graph["nodes"]: + if(node["id"] in orphans): + to_remove.append(node) + for r in to_remove: + try: + graph["nodes"].remove(r) + except: + None + for subworkflow in graph["subworkflows"]: + remove_orphans(graph["subworkflows"][subworkflow], orphans) + remove_orphans(graph, orphans) + return graph + +#Function that returns the type of a given node +def get_type_node(node): + if(is_process(node['id'])): + return "Process" + else: + if(node["fillcolor"]=="white"): + return "Branch Operation" + else: + return "Create Operation" + +#Function that creates the link dico from a given graph dico +def initia_link_dico_rec(dico): + links = {} + for node in dico['nodes']: + try: + temp = links[node['id']] + except: + links[node['id']] = [] + for edge in dico['edges']: + A = edge['A'] + B = edge['B'] + try: + temp = links[A] + except: + links[A] = [] + links[A].append(B) + + for sub in dico['subworkflows']: + links = merge(links, initia_link_dico_rec(dico['subworkflows'][sub])) + return links + + + + + +#Returns the number of cycles in a graph (rootes with "Source" and "Sink") +#The input parameter is a links dico +#https://en.wikipedia.org/wiki/Cycle_(graph_theory)#Algorithm +def get_number_cycles(links): + dico_nb_cycles = {'nb':0} + dfs_dico = {} + for node in links: + dfs_dico[node] = {} + dfs_dico[node]['visited'] = False + dfs_dico[node]['finished'] = False + + edges_create_cycles = [] + + def DFS(mother): + if(dfs_dico[mother]["finished"]): + return + if(dfs_dico[mother]["visited"]): + dico_nb_cycles["nb"]+=1 + return "found cycle" + dfs_dico[mother]["visited"] = True + for daughter in links[mother]: + _ = DFS(daughter) + if(_ == "found cycle"): + edges_create_cycles.append((mother, daughter)) + dfs_dico[mother]["finished"] = True + + for node in links: + DFS(node) + 
return dico_nb_cycles['nb'], edges_create_cycles + + +#https://en.wikipedia.org/wiki/Topological_sorting#Depth-first_search +def topological_sort(graph): + L = [] # Empty list that will contain the sorted nodes + temporary_marks = set() + permanent_marks = set() + + def visit(node): + if node in permanent_marks: + return + + if node in temporary_marks: + None + #raise ValueError("Graph has at least one cycle") + else: + + temporary_marks.add(node) + + for neighbor in graph.get(node, []): + visit(neighbor) + + temporary_marks.remove(node) + permanent_marks.add(node) + L.insert(0, node) # add node to head of L + + while set(graph.keys()) - permanent_marks: + node = (set(graph.keys()) - permanent_marks).pop() + visit(node) + + return L + +#A variant of this answer https://stackoverflow.com/a/5164820 +def get_number_paths_source_2_sink(graph): + topo_sort = topological_sort(graph) + + dict_paths_from_node_2_sink = {} + for node in topo_sort: + dict_paths_from_node_2_sink[node] = 1 + + for i in range(len(topo_sort)-2, -1, -1): + sum= 0 + for y in range(i+1, len(topo_sort)): + sum += graph[topo_sort[i]].count(topo_sort[y])*dict_paths_from_node_2_sink[topo_sort[y]] + dict_paths_from_node_2_sink[topo_sort[i]] = sum + + return dict_paths_from_node_2_sink["source"] + + +#For the shortest path +#https://en.wikipedia.org/wiki/Dijkstra%27s_algorithm#Pseudocode +def dijkstra(graph): + dist, prev = {}, {} + Q = [] + for node in graph: + dist[node] = np.Infinity + prev[node] = None + Q.append(node) + dist['source'] = 0 + + def get_node_in_Q_min_dist(): + min, node_min = dist[Q[0]], Q[0] + for node in Q: + if(min>dist[node]): + min, node_min = dist[node], node + return node_min + + while(len(Q)>0): + u = get_node_in_Q_min_dist() + Q.remove(u) + for v in graph[u]: + if(v in Q): + alt = dist[u] + 1 + if(alt<dist[v]): + dist[v] = alt + prev[v] = u + return dist["sink"] + +#https://www.geeksforgeeks.org/find-longest-path-directed-acyclic-graph/ +def get_longest_distance(graph): + dist = {} + for node in graph: + dist[node] = -np.Infinity + dist["source"] = 0 + topo = topological_sort(graph) + for u in topo: + for v in graph[u]: + if(dist[v]<dist[u]+1): + dist[v] = dist[u]+1 + return dist["sink"] + +##Returns the of paths, the longest and the shortes (not counting the source and sink) +#def get_paths(links): +# PATHS = [] +# shortest_path = {"nb":0} +# longest_path = {"nb":0} +# nb_paths = {"nb":0} +# +# def get_paths_temp(links, mother, path_temp): +# path = path_temp.copy() +# path.append(mother) +# if(mother=="Sink"): +# nb_paths["nb"]+=1 +# if(shortest_path["nb"]==0): +# shortest_path["nb"] = len(path) +# if(longest_path["nb"]==0): +# longest_path["nb"] = len(path) +# if(longest_path["nb"]<len(path)): +# longest_path["nb"]=len(path) +# if(shortest_path["nb"]>len(path)): +# shortest_path["nb"]=len(path) +# return +# for daughter in links[mother]: +# if(daughter!=mother): +# if(daughter not in path): +# get_paths_temp(links, daughter, path) +# +# +# get_paths_temp(links, "Source", []) +# number_paths_source_2_sink = nb_paths["nb"] +# longest_path = longest_path["nb"] +# smallest_path = shortest_path["nb"] +# +# return number_paths_source_2_sink, longest_path, smallest_path \ No newline at end of file diff --git a/src/process.py b/src/process.py new file mode 100644 index 0000000000000000000000000000000000000000..8a94f275e165e62dc46a055742099efb06ec2dc8 --- /dev/null +++ b/src/process.py @@ -0,0 +1,257 @@ +import re + +from .code_ import Code +from .nextflow_building_blocks import Nextflow_Building_Blocks 
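+
+#Illustration of the graph metrics defined in outils_graph.py above (a sketch;
+#the toy links dico is hypothetical):
+#   graph = {"source": ["a", "b"], "a": ["sink"], "b": ["sink"], "sink": []}
+#get_number_paths_source_2_sink(graph) returns 2 (the two source->sink paths),
+#and dijkstra(graph) and get_longest_distance(graph) both return 2, since the
+#two paths have the same length.
+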
+from .outils import remove_jumps_inbetween_parentheses, sort_and_filter +from .bioflowinsighterror import BioFlowInsightError + +from . import constant + +class Process(Nextflow_Building_Blocks): + def __init__(self, code, origin): + self.origin = origin + self.code = Code(code, origin = self) + self.name = "" + self.alias = "" + self.inputs = [] + self.outputs = [] + self.input_code = "" + self.output_code = "" + self.when_code = "" + self.script_code = "" + self.initialise() + self.initialised = True + + def set_alias(self, alias): + self.alias = alias + + def get_alias(self): + return self.alias + + def get_name(self): + return self.name + + + #def get_source(self): + # return [self] + + #MEthod which returns the DSL type of a process, i use the presence + #of from and into as a proxy. By default it's DSL2 + def which_DSL(self): + DSL = "DSL2" + pattern = constant.FROM + for match in re.finditer(pattern, self.code.get_code()): + DSL = "DSL1" + pattern = constant.INTO + for match in re.finditer(pattern, self.code.get_code()): + DSL = "DSL1" + return DSL + + def is_initialised(self): + return self.initialised + + #def get_sink(self): + # return [self] + + def get_type(self): + return "Process" + + + + + def get_inputs(self): + return self.inputs + + def get_nb_inputs(self): + return len(self.inputs) + + def get_outputs(self): + return self.outputs + + def get_nb_outputs(self): + return len(self.outputs) + + + def initialise_parts(self): + code = self.get_code() + + #Check to see if the process is empty + temp_code = re.sub(constant.PROCESS_HEADER, "", code) + temp_code = temp_code[:-1].strip() + if(len(temp_code)==0): + raise BioFlowInsightError(f"The process '{self.get_name()}' defined in the file '{self.get_file_address()}' is an empty process!", num = 22, origin=self) + input_multiple, input_pos= False, (0, 0) + for match in re.finditer(constant.INPUT, code): + if(input_multiple): + raise BioFlowInsightError(f"Multiple 'input:' were found in the process '{self.get_name()}'.", num = 22, origin=self) + input_pos = match.span(0) + input_multiple = True + + output_multiple, output_pos= False, (0, 0) + for match in re.finditer(constant.OUTPUT, code): + if(output_multiple): + raise BioFlowInsightError(f"Multiple 'output:' were found in the process '{self.get_name()}'?", num = 22, origin=self) + output_pos = match.span(0) + output_multiple = True + + when_multiple, when_pos= False, (0, 0) + for match in re.finditer(constant.WHEN, code): + if(when_multiple): + raise BioFlowInsightError(f"Multiple 'when:' were found in the process '{self.get_name()}'.", num = 22, origin=self) + when_pos = match.span(0) + when_multiple = True + + script_pos= (0, 0) + for match in re.finditer(constant.SCRIPT, code): + script_pos = match.span(0) + break + + positions = [input_pos, output_pos, when_pos, script_pos] + variables_index = ['input', 'output', 'when', 'script'] + positions, variables_index = sort_and_filter(positions, variables_index) + + + for i in range(len(positions)): + temp_code = "" + if(i==len(positions)-1): + temp_code = code[positions[i][1]:code.rfind('}')].strip() + else: + temp_code = code[positions[i][1]:positions[i+1][0]].strip() + + if(variables_index[i]=='input'): + self.input_code = temp_code + elif(variables_index[i]=='output'): + self.output_code = temp_code + elif(variables_index[i]=='when'): + self.when_code = temp_code + elif(variables_index[i]=='script'): + self.script_code = temp_code + else: + raise Exception("This shoudn't happen!") + + + #Method that returns the input part of 
+    #Method that returns the input part of the process code
+    def get_input_code(self):
+        return self.input_code
+
+
+    #Function that extracts the inputs from a process (DSL1)
+    def initialise_inputs_DSL1(self):
+        code = "\n"+self.get_input_code()+"\n"
+        code = remove_jumps_inbetween_parentheses(code)
+        #Simplifying the inputs -> a line jump directly followed by a '.' is replaced by a simple '.'
+        code = re.sub(constant.JUMP_DOT, '.', code)
+
+        def add_channel(name):
+            from .channel import Channel
+            input = Channel(name=name, origin=self.origin)
+            if(not self.origin.check_in_channels(input)):
+                self.origin.add_channel(input)
+                input.add_sink(self)
+                self.inputs.append(input)
+            else:
+                input = self.origin.get_channel_from_name(name)
+                self.inputs.append(input)
+                input.add_sink(self)
+
+        #Case there is a single channel as an input -> 'from' isn't used to import the channel -> 'file' is used instead (see https://github.com/nextflow-io/nextflow/blob/45ceadbdba90b0b7a42a542a9fc241fb04e3719d/docs/process.rst)
+        pattern = constant.FILE
+        for match in re.finditer(pattern, code):
+            add_channel(match.group(1))
+
+
+        #Case there are multiple channels as input (e.g. channel1.mix(channel2))
+        pattern = constant.FROM
+        for match in re.finditer(pattern, code):
+            extracted = match.group(1).strip()
+            if(bool(re.fullmatch(constant.WORD, extracted))):
+                add_channel(extracted)
+            else:
+                from .operation import Operation
+                operation = Operation(code=extracted, origin=self.origin)
+                operation.initialise()
+                operation.is_defined_in_process(self)
+                self.inputs+=operation.get_origins()
+
+        #self.inputs = list(set(self.inputs))#TODO Check this
+
+    #Function that extracts the inputs from a process (for DSL2 workflows)
+    def initialise_inputs_DSL2(self):
+        code = self.get_input_code()
+        code = remove_jumps_inbetween_parentheses(code)
+        for input in code.split("\n"):
+            input = input.strip()
+            if(input!=""):
+                self.inputs.append(input)
+
+
+    #Method that returns the output part of the process code
+    def get_output_code(self):
+        return self.output_code
+
+
+    #Function that extracts the outputs from a process (DSL1)
+    def initialise_outputs_DSL1(self):
+        code = self.get_output_code()
+        code = remove_jumps_inbetween_parentheses(code)
+        def add_channel(name):
+            from .channel import Channel
+            output = Channel(name=name, origin=self.origin)
+            if(not self.origin.check_in_channels(output)):
+                self.origin.add_channel(output)
+                output.add_source(self)
+                self.outputs.append(output)
+            else:
+                output = self.origin.get_channel_from_name(name)
+                self.outputs.append(output)
+                output.add_source(self)
+
+
+        pattern = constant.INTO_2
+        for match in re.finditer(pattern, code):
+            outputs = match.group(1).split(',')
+            for i in range(len(outputs)):
+                add_channel(outputs[i].strip())
+
+        pattern = constant.FILE
+        for match in re.finditer(pattern, code):
+            add_channel(match.group(1))
+
+    #Function that extracts the outputs from a process (for DSL2 workflows)
+    def initialise_outputs_DSL2(self):
+        code = self.get_output_code()
+        code = remove_jumps_inbetween_parentheses(code)
+        for output in code.split("\n"):
+            output = output.strip()
+            if(output!=""):
+                self.outputs.append(output)
+
+
+    def initialise_name(self):
+        for match in re.finditer(constant.PROCESS_HEADER, self.code.get_code()):
+            self.name = match.group(1)
+            self.name = self.name.replace("'", "")
+            self.name = self.name.replace('"', '')
+            self.alias = self.name
+
+    def get_structure(self, dico):
+        dico['nodes'].append({'id':str(self), 'name':self.get_name(), "shape":"ellipse", 'xlabel':"", 'fillcolor':''})
+
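+    #Hedged illustration of the DSL1 extraction above (hypothetical output block):
+    #for a line such as
+    #    file "tree.nwk" into clean_tree
+    #initialise_outputs_DSL1() matches the 'into' clause and registers (or reuses)
+    #a channel named 'clean_tree' with this process as a source; 'from' clauses in
+    #the input block are handled symmetrically, with the process added as a sink.
+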
+    def initialise_inputs_outputs(self):
+        DSL = self.origin.get_DSL()
+        if(DSL=="DSL1"):
+            self.initialise_inputs_DSL1()
+            self.initialise_outputs_DSL1()
+        elif(DSL=="DSL2"):
+            self.initialise_inputs_DSL2()
+            self.initialise_outputs_DSL2()
+        #else:
+        #    raise Exception("Workflow is neither written in DSL1 nor DSL2!")
+
+
+    def initialise(self):
+        self.initialise_name()
+        self.initialise_parts()
+        self.initialise_inputs_outputs()
diff --git a/src/subworkflow.py b/src/subworkflow.py
new file mode 100644
index 0000000000000000000000000000000000000000..c37972ddeb76e029c15c5c292fea0df1a1515993
--- /dev/null
+++ b/src/subworkflow.py
@@ -0,0 +1,246 @@
+import re
+from . import constant
+from .code_ import Code
+from .main_DSL2 import Main_DSL2
+from .bioflowinsighterror import BioFlowInsightError
+from .outils import remove_jumps_inbetween_parentheses
+
+
+class Subworkflow(Main_DSL2):
+    def __init__(self, code, origin, name):
+        Main_DSL2.__init__(self, code, origin)
+        self.name = name.replace("'", "").replace('"', '')
+        self.alias = self.name
+        #These are the different parts of a subworkflow -> 'work' corresponds to the main part
+        self.take = None
+        self.work = None
+        self.emit = None
+
+        self.initialised = False
+
+    def print_summary(self, tab = 0):
+        print(" "*tab+f"* {self.name} ({self})")
+        super().print_summary(tab)
+
+    def set_alias(self, alias):
+        self.alias = alias
+
+    def get_alias(self):
+        return self.alias
+
+    def get_type(self):
+        return "Subworkflow"
+
+    def get_name(self):
+        return self.name
+
+    def get_work(self):
+        return self.work.get_code()
+
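+    #For reference (hypothetical snippet): the three parts handled below map onto
+    #the standard Nextflow subworkflow anatomy, e.g.
+    #
+    #    workflow MY_SUB {
+    #        take:
+    #        reads
+    #        main:
+    #        FASTQC(reads)
+    #        emit:
+    #        FASTQC.out
+    #    }
+    #
+    #'take:' is stored in self.take, 'main:' in self.work and 'emit:' in self.emit.
+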
", num = 22, origin=self) + emit_pos = match.span(0) + emit_multiple = True + + #Case everything is there + if(take_pos!=(0, 0) and main_pos!=(0, 0) and emit_pos!=(0, 0)): + if(take_pos[0]<main_pos[0] and main_pos[0]<emit_pos[0]): + self.take = Code(code[take_pos[1]:main_pos[0]].strip(), origin = self) + self.work = Code(code[main_pos[1]:emit_pos[0]].strip(), origin = self) + self.emit = Code(code[emit_pos[1]:code.rfind('}')].strip(), origin = self) + elif(take_pos[0]<emit_pos[0] and emit_pos[0]<main_pos[0]): + self.take = Code(code[take_pos[1]:emit_pos[0]].strip(), origin = self) + self.emit = Code(code[emit_pos[1]:main_pos[0]].strip(), origin = self) + self.work = Code(code[main_pos[1]:code.rfind('}')].strip(), origin = self) + else: + raise Exception('You need to add a case') + #Case nothing is there + if(take_pos==(0, 0) and main_pos==(0, 0) and emit_pos==(0, 0)): + #raise Exception(f"Subworkflow {code} doesn't have anything defined") + self.work = Code(code, origin = self) + #Case there is an input but no output + if(take_pos!=(0, 0) and main_pos!=(0, 0) and emit_pos==(0, 0)): + if(take_pos[0]<main_pos[0]): + self.take = Code(code[take_pos[1]:main_pos[0]].strip(), origin = self) + self.work = Code(code[main_pos[1]:code.rfind('}')].strip(), origin = self) + else: + raise Exception('You need to add a case') + #Case there is no input but an output + if(take_pos==(0, 0) and main_pos!=(0, 0) and emit_pos!=(0, 0)): + if(main_pos[0]<emit_pos[0]): + self.work = Code(code[main_pos[1]:emit_pos[0]].strip(), origin = self) + self.emit = Code(code[emit_pos[1]:code.rfind('}')].strip(), origin = self) + else: + raise Exception('You need to add a case') + #Case there is a main but no input and no output + if(take_pos==(0, 0) and main_pos!=(0, 0) and emit_pos==(0, 0)): + self.work = Code(code[main_pos[1]:code.rfind('}')].strip(), origin = self) + if( main_pos==(0, 0) and (take_pos!=(0, 0) or emit_pos!=(0, 0))): + if(take_pos!=(0, 0) and emit_pos!=(0, 0)): + raise Exception("TODO") + elif(take_pos!=(0, 0) and emit_pos==(0, 0)): + raise Exception("TODO") + elif(take_pos==(0, 0) and emit_pos!=(0, 0)): + self.emit = Code(code[emit_pos[1]:code.rfind('}')].strip(), origin = self) + firt_curly = code.find("{") + self.work = Code(code[firt_curly+1:emit_pos[0]].strip(), origin = self) + else: + raise Exception("Not possible!") + + def get_channel_from_name_takes(self, name): + for c in self.channels: + if(name == c.get_name()): + return c + return None + + def initialise_takes(self): + if(self.take!=None): + code = remove_jumps_inbetween_parentheses(self.take.get_code()).split('\n') + tab = [] + for i in range(len(code)): + code[i] = code[i].strip() + if(code[i]!=''): + channel = self.get_channel_from_name_takes(code[i]) + #In the case the channel doesn't exist + if(channel==None): + from .operation import Operation + ope = Operation(f"take: {code[i]}", self) + from .channel import Channel + channel = Channel(code[i], self) + ope.add_element_gives(channel) + channel.add_source(ope) + #ope.initialise_from_subworkflow_take() + else: + raise BioFlowInsightError(f"The channel '{code[i]}' is already defined somewhere else in the subworkflow ('{self.get_name()}') or in the file.", num=4, origin=self) + tab.append(ope) + for channel in ope.get_gives(): + self.channels.append(channel) + + self.take = tab + + #def initialise_emit(self): + # if(self.emit!=None): + # code = self.emit.get_code().split('\n') + # tab = [] + # for i in range(len(code)): + # code[i] = code[i].strip() + # channel = 
+    #def initialise_emit(self):
+    #    if(self.emit!=None):
+    #        code = self.emit.get_code().split('\n')
+    #        tab = []
+    #        for i in range(len(code)):
+    #            code[i] = code[i].strip()
+    #            channel = self.get_channel_from_name(code[i])
+    #            if(channel!=None):
+    #                tab.append(channel)
+    #                channel.add_sink(Operation(code=channel.get_name(), origin=self))
+    #
+    #            else:
+    #                #Case it's an operation
+    #                operation = Operation(code[i], self)
+    #                operation.initialise()
+    #                for gives in operation.get_gives():
+    #                    tab.append(gives)
+    #                    #TODO -> check that the origin is not added too!
+    #                    gives.add_sink(Operation(code=gives.get_name(), origin=self))
+    #            #self.add_operation(operation)
+    #            self.executors.append(operation)
+    #        self.emit = tab
+
+    def initialise_emit(self):
+        from .operation import Operation
+        if(self.emit!=None):
+            code = remove_jumps_inbetween_parentheses(self.emit.get_code()).split('\n')
+            tab = []
+            for i in range(len(code)):
+                code[i] = code[i].strip()
+                if(code[i]!=""):
+                    channel = self.get_channel_from_name(code[i])
+                    if(channel!=None):
+                        ope = Operation(code=f"emit: {code[i]}", origin=self)
+                        ope.add_element_origins(channel)
+                        channel.add_sink(ope)
+                        tab.append(ope)
+
+                    else:
+                        #raise Exception(f"I don't know how to handle '{code[i]}'")
+                        #Case it's an operation
+                        operation = Operation(code[i], self)
+                        operation.initialise()
+                        operation.change_code(f"emit: {code[i]}")
+                        tab.append(operation)
+                        #operation.add_gives(channel)
+                        #for gives in operation.get_gives():
+                        #    #TODO -> check that the origin is not added too!
+                        #    gives.add_sink(operation)
+                        #tab.append(operation)
+                        #print(operation)
+                        ##self.add_operation(operation)
+                        ##self.executors.append(operation)
+            self.emit = tab
+
+
+    def get_emit(self):
+        return self.emit
+
+    def get_nb_emit(self):
+        if(self.emit==None):
+            return 0
+        return len(self.emit)
+
+    def get_takes(self):
+        return self.take
+
+    def get_nb_takes(self):
+        if(self.take==None):
+            return 0
+        return len(self.take)
+
+    def get_nb_inputs(self):
+        return self.get_nb_takes()
+
+
+    def initialise(self):
+        if(not self.initialised):
+            self.initialise_parts()
+            self.initialise_takes()
+            super().initialise()
+            self.initialise_emit()
+            self.initialised = True
+
+    def get_structure(self, dico):
+        super().get_structure(dico)
+
+        if(self.take!=None):
+            for ope in self.get_takes():
+                #ope.set_operation_type("Branch")
+                ope.get_structure(dico, to_remove = True)
+
+        if(self.emit!=None):
+            for ope in self.get_emit():
+                #ope.set_operation_type("Branch")
+                ope.get_structure(dico, to_remove = True)
diff --git a/tests/ressources/call/calls_to_test.nf b/tests/ressources/call/calls_to_test.nf
new file mode 100644
index 0000000000000000000000000000000000000000..01b04229a3ce22d89b85f02ab9f38343040b5b10
--- /dev/null
+++ b/tests/ressources/call/calls_to_test.nf
@@ -0,0 +1,13 @@
+//GiantSpaceRobot/tsRNAsearch
+
+DESEQ2(COUNTS_TO_COLLAPSED_COUNTS.out.collapsed_count.collect(), "$layoutfile", PREPARE_NCRNA_GTF.out.ncRNA_gtf)
+DATA_TRANSFORMATIONS("$layoutfile", \
+    GENERATE_TRNA_DEPTH_FILES.out.depth_files.collect(), \
+    GENERATE_NCRNA_DEPTH_FILES.out.depth_files.collect(), \
+    GENERATE_MULTIMAPPER_TRNA_DEPTH_FILES.out.depth_files.collect(), \
+    SUM_COUNTS.out.sum_counts)
+DISTRIBUTION_SCORE(DATA_TRANSFORMATIONS.out.ncrna_stddev, DATA_TRANSFORMATIONS.out.trna_stddev, PREPARE_NCRNA_GTF.out.ncRNA_gtf)
+SLOPE_SCORE(DATA_TRANSFORMATIONS.out.depth_means, "$layoutfile", PREPARE_NCRNA_GTF.out.ncRNA_gtf)
+
+
+//Case where call.into{ch1, ch2}
\ No newline at end of file
diff --git a/tests/ressources/channel/empty_wf.nf b/tests/ressources/channel/empty_wf.nf
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git 
a/tests/ressources/outils/remove_comments_with.nf b/tests/ressources/outils/remove_comments_with.nf new file mode 100644 index 0000000000000000000000000000000000000000..b4aaab82dff21aee028c1e22708710f864042783 --- /dev/null +++ b/tests/ressources/outils/remove_comments_with.nf @@ -0,0 +1,30 @@ +/* + This is a sample Nextflow script to test the comment removal function. Just writing this to add an "'" like if i wrote "it's" +*/ + +/* +https://www.nextflow.io/ +*/ + +// Define a process that echoes a message +process echoMessage { + + // Input parameters + input: + // This is an inline comment + val message + + // Output + output: + // Single-line comment + path "/*.txt" + + // Script section + script: + """ + echo "Message: $message" > output.txt + #https://www.nextflow.io/ + """ + + +} \ No newline at end of file diff --git a/tests/ressources/outils/remove_comments_wo.nf b/tests/ressources/outils/remove_comments_wo.nf new file mode 100644 index 0000000000000000000000000000000000000000..ac965e70dee82f1fd0bc4c1054a8fd7b1f313861 --- /dev/null +++ b/tests/ressources/outils/remove_comments_wo.nf @@ -0,0 +1,28 @@ + + + + + +process echoMessage { + + + input: + + val message + + + output: + + path "/*.txt" + + + script: + """ + echo "Message: $message" > output.txt + #https://www.nextflow.io/ + """ + + +} + + diff --git a/tests/ressources/process/process_DSL1.nf b/tests/ressources/process/process_DSL1.nf new file mode 100644 index 0000000000000000000000000000000000000000..e67c592ccbef2dd4d5b10a75be4cbf75b080c846 --- /dev/null +++ b/tests/ressources/process/process_DSL1.nf @@ -0,0 +1,18 @@ +//Taken from https://github.com/maxemil/ALE-pipeline/blob/c8f17b11dd3496420cfcb4a5c29564d2257eabf4/main.nf +//+ modified + +process cleanSpeciesTree { + input: + file species_tree + file 'map_species.txt' from species_map.first() + + output: + file "${species_tree.baseName}_clean.tree" into clean_species_tree + file "${species_tree.baseName}_root.tree" into rooted_species_tree + + publishDir params.output_trees, mode: 'copy' + tag {"${species_tree.simpleName}"} + + script: + template 'cleanSpeciesTree.py' +} \ No newline at end of file diff --git a/tests/ressources/process/process_DSL2.nf b/tests/ressources/process/process_DSL2.nf new file mode 100644 index 0000000000000000000000000000000000000000..5dd75965c82dbfee0e3b68d6dac96bc85f8423cf --- /dev/null +++ b/tests/ressources/process/process_DSL2.nf @@ -0,0 +1,80 @@ +//Taken from https://github.com/nf-core/mhcquant/blob/b80a5a4fbf1ff4d409885d08ab09f6ceeb7fe4c9/modules/local/openms_falsediscoveryrate.nf +//+ modified + +process OPENMS_FALSEDISCOVERYRATE { + tag "$meta.id" + label 'process_single' + + conda "bioconda::openms=3.0.0" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/openms:3.0.0--h8964181_1' : + 'biocontainers/openms:3.0.0--h8964181_1' }" + + input: + tuple val(meta), path(idxml) + + output: + tuple val(meta), path("*.idXML"), emit: idxml + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def prefix = task.ext.prefix ?: "${idxml.baseName}_fdr" + + """ + FalseDiscoveryRate -in $idxml \\ + -protein 'false' \\ + -out ${prefix}.idXML \\ + -threads $task.cpus + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + openms: \$(echo \$(FileInfo --help 2>&1) | sed 's/^.*Version: //; s/-.*\$//' | sed 's/ -*//; s/ .*\$//') + END_VERSIONS + """ +} + +/* + +//wtsi-hgi/nf_cellbender/modules/core.nf + +input: + val(outdir_prev) + tuple( + val(experiment_id), + path(file_10x_barcodes), + path(file_10x_features), + path(file_10x_matrix), + val(ncells_expected), + val(ndroplets_include_cellbender) + ) + val(estimate_params_umis) + + output: + val(outdir, emit: outdir) + tuple( + val(experiment_id), + path(file_10x_barcodes), + path(file_10x_features), + path(file_10x_matrix), + path("${outfile}-expected_cells.txt"), + path("${outfile}-total_droplets_included.txt"), + emit: cb_input + ) + path( + "${outfile}-expected_cells.txt", + emit: expected_cells + ) + path( + "${outfile}-total_droplets_included.txt", + emit: total_droplets_include + ) + path("${outfile}-cell_estimate_cutoff.tsv.gz") + path("${outfile}-total_droplets_cutoff.tsv.gz") + path("plots/*.png") optional true + path("plots/*.pdf") optional true + +*/ + diff --git a/tests/test_call.py b/tests/test_call.py new file mode 100644 index 0000000000000000000000000000000000000000..a5821895c2b1a1c78628adb23a3d0dcf3f6c8852 --- /dev/null +++ b/tests/test_call.py @@ -0,0 +1,8 @@ +import unittest +from src.call import * + +class TestCall(unittest.TestCase): + + def test_check_everything_works(self): + self.assertTrue(True) + diff --git a/tests/test_channel.py b/tests/test_channel.py new file mode 100644 index 0000000000000000000000000000000000000000..b5c5547f7b3b336b32f05a52adcc053af4af27d9 --- /dev/null +++ b/tests/test_channel.py @@ -0,0 +1,78 @@ +import unittest +from src.channel import * +from src.nextflow_file import Nextflow_File + + +class TestChannel(unittest.TestCase): + + def test_get_code(self): + wf1 = Nextflow_File("tests/ressources/channel/empty_wf.nf", display_info=False) + ch1 = Channel(name = "ch1", origin = wf1) + self.assertIsInstance(ch1, Channel) + self.assertEqual(ch1.get_code(), "ch1") + + def test_get_name(self): + wf1 = Nextflow_File("tests/ressources/channel/empty_wf.nf", display_info=False) + ch1 = Channel(name = "ch1", origin = wf1) + self.assertEqual(ch1.get_name(), "ch1") + + def test_get_type(self): + wf1 = Nextflow_File("tests/ressources/channel/empty_wf.nf", display_info=False) + ch1 = Channel(name = "ch1", origin = wf1) + self.assertEqual(ch1.get_type(), "Channel") + + def test_add_source(self): + wf1 = Nextflow_File("tests/ressources/channel/empty_wf.nf", display_info=False) + ch1 = Channel(name = "ch1", origin = wf1) + self.assertEqual(ch1.get_source(), []) + ele = "This is a test" + ch1.add_source(ele) + self.assertEqual(ch1.get_source(), [ele]) + + def test_add_sink(self): + wf1 = Nextflow_File("tests/ressources/channel/empty_wf.nf", display_info=False) + ch1 = Channel(name = "ch1", origin = wf1) + self.assertEqual(ch1.get_sink(), []) + ele = "This is a test" + ch1.add_sink(ele) + self.assertEqual(ch1.get_sink(), [ele]) + + def test_set_sink_null(self): + wf1 = 
Nextflow_File("tests/ressources/channel/empty_wf.nf", display_info=False) + ch1 = Channel(name = "ch1", origin = wf1) + ele = "This is a test" + ch1.add_sink(ele) + self.assertEqual(ch1.get_sink(), [ele]) + ch1.set_sink_null() + self.assertEqual(ch1.get_sink(), []) + + def test_remove_element_from_sink(self): + wf1 = Nextflow_File("tests/ressources/channel/empty_wf.nf", display_info=False) + ch1 = Channel(name = "ch1", origin = wf1) + ele = "This is a test" + ch1.add_sink(ele) + self.assertEqual(ch1.get_sink(), [ele]) + ch1.remove_element_from_sink(ele = ele) + self.assertEqual(ch1.get_sink(), []) + + def test_equal(self): + wf1 = Nextflow_File("tests/ressources/channel/empty_wf.nf", display_info=False) + ch1 = Channel(name = "ch1", origin = wf1) + ch1_1 = Channel(name = "ch1", origin = wf1) + ch2 = Channel(name = "ch2", origin = wf1) + self.assertTrue(ch1.equal(channel=ch1_1)) + self.assertFalse(ch1.equal(channel=ch2)) + + def test_get_structure(self): + wf1 = Nextflow_File("tests/ressources/channel/empty_wf.nf", display_info=False) + ch1 = Channel(name = "ch1", origin = wf1) + dico = {} + dico['nodes'] = [] + dico['edges'] = [] + dico['subworkflows'] = {} + ch1.add_source("out1") + ch1.add_source("out2") + ch1.get_structure(dico, "in") + dico_true = {'nodes': [], 'edges': [{'A': 'out1', 'B': 'in', 'label': 'ch1'}, {'A': 'out2', 'B': 'in', 'label': 'ch1'}], 'subworkflows': {}} + self.assertEqual(dico, dico_true) + diff --git a/tests/test_code.py b/tests/test_code.py new file mode 100644 index 0000000000000000000000000000000000000000..b57f3515c438a53a130c59cc383a38dbadf44697 --- /dev/null +++ b/tests/test_code.py @@ -0,0 +1,27 @@ +import unittest +from src.code_ import * + +class TestCode(unittest.TestCase): + + def test_initialise(self): + with open("tests/ressources/outils/remove_comments_with.nf", 'r') as f: + code_with_comments = f.read() + + with open("tests/ressources/outils/remove_comments_wo.nf", 'r') as f: + code_wo_comments = f.read() + + code = Code(code_with_comments, origin=None) + self.assertIsInstance(code, Code) + self.assertEqual(code.code, '\n'+code_with_comments+'\n') + self.assertEqual(code.code_wo_comments, '\n'+code_wo_comments+'\n') + + def test_get_code(self): + with open("tests/ressources/outils/remove_comments_with.nf", 'r') as f: + code_with_comments = f.read() + + with open("tests/ressources/outils/remove_comments_wo.nf", 'r') as f: + code_wo_comments = f.read() + + code = Code(code_with_comments, origin=None) + self.assertEqual(code.get_code(), code_wo_comments.strip()) + diff --git a/tests/test_emitted.py b/tests/test_emitted.py new file mode 100644 index 0000000000000000000000000000000000000000..ea4a1287459f4a4d4cb4484be4691f99f853e55d --- /dev/null +++ b/tests/test_emitted.py @@ -0,0 +1,8 @@ +import unittest +from src.emitted import * + +class TestEmitted(unittest.TestCase): + + def test_check_everything_works(self): + self.assertTrue(True) + diff --git a/tests/test_executor.py b/tests/test_executor.py new file mode 100644 index 0000000000000000000000000000000000000000..350e7e22ef61fa8c1607968e59bb5651bb48d0e9 --- /dev/null +++ b/tests/test_executor.py @@ -0,0 +1,8 @@ +import unittest +from src.executor import * + +class TestExecutor(unittest.TestCase): + + def test_check_everything_works(self): + self.assertTrue(True) + diff --git a/tests/test_function.py b/tests/test_function.py new file mode 100644 index 0000000000000000000000000000000000000000..cd19970c738e0e458e73bfb991adf82754ff02db --- /dev/null +++ b/tests/test_function.py @@ -0,0 +1,8 @@ +import 
unittest +from src.function import * + +class TestFunction(unittest.TestCase): + + def test_check_everything_works(self): + self.assertTrue(True) + diff --git a/tests/test_graph.py b/tests/test_graph.py new file mode 100644 index 0000000000000000000000000000000000000000..7b35fdb97edcdcc1cc111c3993a2bf4de5a9ffac --- /dev/null +++ b/tests/test_graph.py @@ -0,0 +1,8 @@ +import unittest +from src.graph import * + +class TestGraph(unittest.TestCase): + + def test_check_everything_works(self): + self.assertTrue(True) + diff --git a/tests/test_include.py b/tests/test_include.py new file mode 100644 index 0000000000000000000000000000000000000000..80dd16f67b605c25e7b8ce0aafb3b6bce4879ca7 --- /dev/null +++ b/tests/test_include.py @@ -0,0 +1,8 @@ +import unittest +from src.include import * + +class TestInclude(unittest.TestCase): + + def test_check_everything_works(self): + self.assertTrue(True) + diff --git a/tests/test_main_DSL2.py b/tests/test_main_DSL2.py new file mode 100644 index 0000000000000000000000000000000000000000..6585066fa30ebe9217cadf9badf5d96172149b6e --- /dev/null +++ b/tests/test_main_DSL2.py @@ -0,0 +1,8 @@ +import unittest +from src.main_DSL2 import * + +class TestMain_DSL2(unittest.TestCase): + + def test_check_everything_works(self): + self.assertTrue(True) + diff --git a/tests/test_nextflow_building_blocks.py b/tests/test_nextflow_building_blocks.py new file mode 100644 index 0000000000000000000000000000000000000000..47937bdcd8a8d6272366e12599405191da59ef27 --- /dev/null +++ b/tests/test_nextflow_building_blocks.py @@ -0,0 +1,9 @@ +import unittest +from src.nextflow_building_blocks import * + +class TestNextflow_Building_Blocks(unittest.TestCase): + + def test_check_everything_works(self): + self.assertTrue(True) + + diff --git a/tests/test_nextflow_file.py b/tests/test_nextflow_file.py new file mode 100644 index 0000000000000000000000000000000000000000..7b8b3098787ddbcfa172f643c591539ecf947391 --- /dev/null +++ b/tests/test_nextflow_file.py @@ -0,0 +1,8 @@ +import unittest +from src.nextflow_file import * + +class TestNextflow_File(unittest.TestCase): + + def test_check_everything_works(self): + self.assertTrue(True) + diff --git a/tests/test_operation.py b/tests/test_operation.py new file mode 100644 index 0000000000000000000000000000000000000000..160b5fd8bbe2aeb7716ee52adac7d4a668d6f785 --- /dev/null +++ b/tests/test_operation.py @@ -0,0 +1,8 @@ +import unittest +from src.operation import * + +class TestOperation(unittest.TestCase): + + def test_check_everything_works(self): + self.assertTrue(True) + diff --git a/tests/test_outils.py b/tests/test_outils.py new file mode 100644 index 0000000000000000000000000000000000000000..9fe2c7d9772e917a13dd02095e8759e220e4c34c --- /dev/null +++ b/tests/test_outils.py @@ -0,0 +1,41 @@ +import unittest +from src.outils import * + +class TestOutils(unittest.TestCase): + + #TODO -> finish this + + def test_get_next_element_caracter(self): + test = """This is a test\n!""" + val, index = get_next_element_caracter(test, 3) + self.assertEqual(val, 'i') + self.assertEqual(index, 5) + val, index = get_next_element_caracter(test, 13) + self.assertEqual(val, '!') + self.assertEqual(index, 15) + val, index = get_next_element_caracter(test, 15) + self.assertEqual(val, -1) + self.assertEqual(index, -1) + + def test_get_before_element_caracter(self): + test = """This is a test\n!""" + val, index = get_before_element_caracter(test, 0) + self.assertEqual(val, -1) + self.assertEqual(index, -1) + val, index = get_before_element_caracter(test, 5) + 
self.assertEqual(val, 's') + self.assertEqual(index, 3) + val, index = get_before_element_caracter(test, 15) + self.assertEqual(val, 't') + self.assertEqual(index, 13) + + def test_remove_comments(self): + code_with_comments = '' + with open("tests/ressources/outils/remove_comments_with.nf", 'r') as f: + code_with_comments = f.read() + + with open("tests/ressources/outils/remove_comments_wo.nf", 'r') as f: + code_wo_comments = f.read() + self.assertEqual(remove_comments(code_with_comments), code_wo_comments) + + diff --git a/tests/test_process.py b/tests/test_process.py new file mode 100644 index 0000000000000000000000000000000000000000..4c2f6400190fc2127e1b0e6a909ba385ea9c6262 --- /dev/null +++ b/tests/test_process.py @@ -0,0 +1,104 @@ +import unittest +from src.process import * +from src.nextflow_file import Nextflow_File + +class TestProcess(unittest.TestCase): + + def test_check_everything_works(self): + self.assertTrue(True) + + + def test_initialise_name(self): + #DSL1 + file = Nextflow_File("tests/ressources/process/process_DSL1.nf", display_info=False) + file.initialise() + process_DSL1 = file.processes[0] + self.assertEqual(process_DSL1.get_name(), "cleanSpeciesTree") + self.assertEqual(process_DSL1.get_alias(), "cleanSpeciesTree") + #DSL2 + file = Nextflow_File("tests/ressources/process/process_DSL2.nf", display_info=False) + file.initialise() + process_DSL2 = file.processes[0] + self.assertEqual(process_DSL2.get_name(), "OPENMS_FALSEDISCOVERYRATE") + self.assertEqual(process_DSL2.get_alias(), "OPENMS_FALSEDISCOVERYRATE") + + def test_set_alias(self): + file = Nextflow_File("tests/ressources/process/process_DSL2.nf", display_info=False) + file.initialise() + process_DSL2 = file.processes[0] + self.assertEqual(process_DSL2.get_alias(), "OPENMS_FALSEDISCOVERYRATE") + new_alias = "new_alias" + process_DSL2.set_alias(new_alias) + self.assertEqual(process_DSL2.get_name(), "OPENMS_FALSEDISCOVERYRATE") + self.assertEqual(process_DSL2.get_alias(), new_alias) + + + def test_which_DSL(self): + #DSL1 + file = Nextflow_File("tests/ressources/process/process_DSL1.nf", display_info=False) + file.initialise() + process_DSL1 = file.processes[0] + self.assertEqual(process_DSL1.which_DSL(), "DSL1") + #DSL2 + file = Nextflow_File("tests/ressources/process/process_DSL2.nf", display_info=False) + file.initialise() + process_DSL2 = file.processes[0] + self.assertEqual(process_DSL2.which_DSL(), "DSL2") + + def test_is_initialised(self): + #DSL1 + file = Nextflow_File("tests/ressources/process/process_DSL1.nf", display_info=False) + file.initialise() + process_DSL1 = file.processes[0] + self.assertTrue(process_DSL1.is_initialised()) + #DSL2 + file = Nextflow_File("tests/ressources/process/process_DSL2.nf", display_info=False) + file.initialise() + process_DSL2 = file.processes[0] + self.assertTrue(process_DSL2.is_initialised()) + + + def test_get_type(self): + #DSL1 + file = Nextflow_File("tests/ressources/process/process_DSL1.nf", display_info=False) + file.initialise() + process_DSL1 = file.processes[0] + self.assertEqual(process_DSL1.get_type(), "Process") + #DSL2 + file = Nextflow_File("tests/ressources/process/process_DSL2.nf", display_info=False) + file.initialise() + process_DSL2 = file.processes[0] + self.assertEqual(process_DSL2.get_type(), "Process") + + #TODO define the tests for the inputs and outputs + + def test_get_structure(self): + #DSL1 + file = Nextflow_File("tests/ressources/process/process_DSL1.nf", display_info=False) + file.initialise() + process_DSL1 = file.processes[0] + dico = 
{}
+        dico['nodes'] = []
+        dico['edges'] = []
+        dico['subworkflows'] = {}
+        process_DSL1.get_structure(dico)
+        dico_true = {'nodes': [{'id': str(process_DSL1), 'name': 'cleanSpeciesTree', 'shape': 'ellipse', 'xlabel': '', 'fillcolor': ''}], 'edges': [], 'subworkflows': {}}
+        self.assertEqual(dico, dico_true)
+        #DSL2
+        file = Nextflow_File("tests/ressources/process/process_DSL2.nf", display_info=False)
+        file.initialise()
+        process_DSL2 = file.processes[0]
+        dico = {}
+        dico['nodes'] = []
+        dico['edges'] = []
+        dico['subworkflows'] = {}
+        process_DSL2.get_structure(dico)
+        dico_true = {'nodes': [{'id': str(process_DSL2), 'name': 'OPENMS_FALSEDISCOVERYRATE', 'shape': 'ellipse', 'xlabel': '', 'fillcolor': ''}], 'edges': [], 'subworkflows': {}}
+        self.assertEqual(dico, dico_true)
+
+    #TODO -> finish this test
+    def test_(self):
+        file = Nextflow_File("tests/ressources/process/process_DSL1.nf", display_info=False)
+        file.initialise()
+        process_DSL1 = file.processes[0]
+
+
diff --git a/tests/test_subworkflow.py b/tests/test_subworkflow.py
new file mode 100644
index 0000000000000000000000000000000000000000..93f1bc0e19d0f87543cff5e67482d1f04f1f94ee
--- /dev/null
+++ b/tests/test_subworkflow.py
@@ -0,0 +1,8 @@
+import unittest
+from src.subworkflow import *
+
+class TestSubworkflow(unittest.TestCase):
+
+    def test_check_everything_works(self):
+        self.assertTrue(True)
+