Commit 67a4eeb5 authored by George Marchment

Added the concordance criteria for the automatic selection of relevant processes

parent 6600a0e7
Pipeline #14617 failed in 2 minutes and 9 seconds
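In short, the new get_relevant_which_minizes_the_number_of_conditions method samples random sets of relevant processes, clusters the workflow with convert_workflow_2_user_view, and keeps the set whose worst cluster needs the fewest conditions, while constraining the number of clusters to lie between reduction_alpha and reduction_beta times the number of called processes. A minimal usage sketch follows; the import path, the Workflow class instantiation and the file name are assumptions, only the two method calls appear in this diff.

# Hypothetical usage sketch -- import path, class name and constructor
# arguments are assumptions; only the two method calls below appear in this commit.
from src.workflow import Workflow

w = Workflow("main.nf")
# New in this commit: pick the relevant processes whose clustering needs
# the fewest conditions per cluster.
relevant = w.get_relevant_which_minizes_the_number_of_conditions(
    reduction_alpha=0.2, reduction_beta=0.8, number_of_tries=50)
# Also changed in this commit: convert_workflow_2_user_view now returns the
# rewritten code plus a dict describing each cluster
# ({"nb_executors": ..., "nb_conditions": ...}).
code, organisation = w.convert_workflow_2_user_view(
    relevant_processes=relevant, render_graphs=False)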
@@ -455,7 +455,9 @@
import numpy as np
min_nb_clusters, min_relevant_processes = np.inf, []
already_tried = []
print('-'*number_of_tries+">")
for i in range(number_of_tries):
print('.', end='')
random_relevant_processes = self.generate_random_relevant_processes()
escape = 0
while(escape<100 and set(random_relevant_processes) in already_tried):
@@ -483,7 +485,9 @@
import numpy as np
min_uniform_score, min_relevant_processes = np.inf, []
already_tried = []
print('-'*number_of_tries+">")
for i in range(number_of_tries):
print('.', end='')
random_relevant_processes = self.generate_random_relevant_processes()
escape = 0
while(escape<100 and set(random_relevant_processes) in already_tried):
@@ -514,6 +518,48 @@
min_uniform_score = score
return min_relevant_processes
#reduction_alpha is the same as above
#reduction_beta is the same as above
def get_relevant_which_minizes_the_number_of_conditions(self, reduction_alpha = 0.2, reduction_beta = 0.8, number_of_tries = 50):
import numpy as np
import copy
min_condition_score, min_relevant_processes = np.inf, []
already_tried = []
w_save = copy.deepcopy(self)
number_processes_called = len(self.get_processes_called())
print('-'*number_of_tries+">")
for i in range(number_of_tries):
print('.', end='')
w = copy.deepcopy(w_save)
random_relevant_processes = w.generate_random_relevant_processes()
escape = 0
while(escape<100 and set(random_relevant_processes) in already_tried):
escape+=1
random_relevant_processes = w.generate_random_relevant_processes()
#Hitting the escape limit means we've already searched the majority of the possibilities
if(escape>=100):
return min_relevant_processes
already_tried.append(set(random_relevant_processes))
_, cluster_organisation = w.convert_workflow_2_user_view(relevant_processes=random_relevant_processes, render_graphs = False)
tab_nb_executors_per_cluster, tab_nb_conditions_per_cluster = [], []
for c in cluster_organisation:
tab_nb_executors_per_cluster.append(cluster_organisation[c]["nb_executors"])
tab_nb_conditions_per_cluster.append(cluster_organisation[c]["nb_conditions"])
score = np.max(tab_nb_conditions_per_cluster)
#score = np.mean(tab_nb_conditions_per_cluster)
#score = np.median(tab_nb_conditions_per_cluster)
#Ratio
#score = np.max(np.array(tab_nb_conditions_per_cluster)/np.array(tab_nb_executors_per_cluster))
if(len(cluster_organisation)>=reduction_alpha*number_processes_called and
len(cluster_organisation)<=reduction_beta*number_processes_called and
score<min_condition_score):
min_relevant_processes = random_relevant_processes
min_condition_score = score
return min_relevant_processes
#Method that returns the order of execution for each executor
def get_order_execution_executors(self):
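To make the acceptance rule in the new method above concrete, here is a small self-contained sketch of a single scoring step; the cluster names and counts are invented for illustration.

import numpy as np

# Invented example of what convert_workflow_2_user_view's second return
# value looks like for one candidate set of relevant processes.
cluster_organisation = {
    "cluster_a": {"nb_executors": 4, "nb_conditions": 1},
    "cluster_b": {"nb_executors": 3, "nb_conditions": 3},
    "cluster_c": {"nb_executors": 5, "nb_conditions": 2},
}
number_processes_called = 10
reduction_alpha, reduction_beta = 0.2, 0.8

# Score = worst-case number of conditions over the clusters.
score = np.max([c["nb_conditions"] for c in cluster_organisation.values()])
# The candidate is only kept if the number of clusters stays between
# alpha and beta times the number of called processes (here 2 <= 3 <= 8)
# and if it improves on the best score found so far.
acceptable = (reduction_alpha * number_processes_called
              <= len(cluster_organisation)
              <= reduction_beta * number_processes_called)
print(score, acceptable)  # -> 3 True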
@@ -1076,11 +1122,16 @@
calls_in_operations = []
non_relevant_name = 1
subworkflow_clusters_to_add, subworkflow_cluster_calls_to_add = [], []
#Dictionary mapping each cluster to information about its number of executors and conditions
clusters_2_organisation = {}
#subworkflow_clusters_to_add, subworkflow_cluster_calls_to_add = [], []
index_cluster = len(clusters)
#We replace the last clusters first -> by definition, the outputs of the last clusters aren't used anywhere else in the workflow
for elements in list(reversed(clusters)):
nb_executors = 0
channels_to_replace_outside_of_cluster = []
#Check that there is at least one process in the cluster
@@ -1093,11 +1144,12 @@
processes_added = []
things_added_in_cluster = []
if(len(elements)>=1 and at_least_one_process):
name, body, take, emit = "", "", "", ""
first_element = True
for ele in elements:
nb_executors+=1
if(ele.get_type()=="Process"):
#Determine the name of the created subworkflow cluster
@@ -1263,6 +1315,7 @@
channels_to_replace_outside_of_cluster.append((old_output_names[i], param_out_name))
#If there is only a single condition in the subworkflow cluster -> then we add it when the call is made
if(len(conditions_in_subworkflow)==1):
#TODO -> I think the "else" case needs to be removed, because the empty channel created may sometimes overwrite an existing one
subworkfow_call = f"if({conditions_in_subworkflow[0].split('$$__$$')[0]}) {{\n{subworkfow_call_case_true}\n}} else {{\n{subworkfow_call_case_false}\n}}"
else:
subworkfow_call = subworkfow_call_case_true
@@ -1298,6 +1351,9 @@
code = replace_group1(code, pattern, new)
#code = code.replace(old, new)
#Since I've added the conditions myself -> I can just count them by searching for this simple pattern
clusters_2_organisation[subworkflow_code] = {"nb_executors":nb_executors, "nb_conditions":subworkflow_code.count("if(")}
#Add the subworkflow definitions
#-------------------------------------
code = code.replace(f'{subworkflow_section}', f"{subworkflow_code}\n\n{subworkflow_section}")
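For illustration, this is roughly what one recorded entry looks like; the generated subworkflow snippet below is invented, only the dict layout and the count("if(") trick come from this commit.

# Invented example of generated subworkflow code; in the tool it is produced
# by the rewriting logic above.
subworkflow_code = (
    "workflow cluster_1 {\n"
    "    take: reads\n"
    "    main:\n"
    "    if(params.run_qc) {\n"
    "        fastqc(reads)\n"
    "    }\n"
    "}\n"
)
nb_executors = 1
clusters_2_organisation = {}
# The conditions were emitted by this code itself, so counting the literal
# "if(" pattern is enough to know how many conditions the cluster holds.
clusters_2_organisation[subworkflow_code] = {
    "nb_executors": nb_executors,
    "nb_conditions": subworkflow_code.count("if("),  # -> 1
}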
@@ -1332,7 +1388,7 @@
f.write(code)
f.close()
self.rewrite_and_initialise(code, self.processes_2_remove, render_graphs=render_graphs)
return code
return code, clusters_2_organisation
#return code
#
##So basically when retrieving a thing (process or subworkflow)