From 67a4eeb5b2ad4517dd40aa3fe06919b2332cf1eb Mon Sep 17 00:00:00 2001
From: George Marchment <georgemarchment@yahoo.fr>
Date: Tue, 15 Apr 2025 10:10:06 +0200
Subject: [PATCH] Added the concordance criteria for the automatic selection of
 relevant processes

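The new criterion selects, among randomly drawn sets of relevant processes,
the one whose user view minimizes the number of conditions per cluster. To
support it, convert_workflow_2_user_view now also returns, for each generated
subworkflow cluster, the number of executors and conditions it contains.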
---
 src/workflow.py | 67 ++++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 64 insertions(+), 3 deletions(-)

diff --git a/src/workflow.py b/src/workflow.py
index 0e156e0..02d0650 100644
--- a/src/workflow.py
+++ b/src/workflow.py
@@ -455,7 +455,9 @@ George Marchment, Bryan Brancotte, Marie Schmit, Frédéric Lemoine, Sarah Cohen
         import numpy as np
         min_nb_clusters, min_relevant_processes = np.inf, []
         already_tried = []
+        print('-'*number_of_tries+">") #Progress bar outline: one '.' per try is printed below
         for i in range(number_of_tries):
+            print('.', end='')
             random_relevant_processes = self.generate_random_relevant_processes()
             escape = 0
             while(escape<100 and set(random_relevant_processes) in already_tried):
@@ -483,7 +485,9 @@ George Marchment, Bryan Brancotte, Marie Schmit, Frédéric Lemoine, Sarah Cohen
         import numpy as np
         min_uniform_score, min_relevant_processes = np.inf, []
         already_tried = []
+        print('-'*number_of_tries+">") #Progress bar outline: one '.' per try is printed below
         for i in range(number_of_tries):
+            print('.', end='')
             random_relevant_processes = self.generate_random_relevant_processes()
             escape = 0
             while(escape<100 and set(random_relevant_processes) in already_tried):
@@ -514,6 +518,53 @@ George Marchment, Bryan Brancotte, Marie Schmit, Frédéric Lemoine, Sarah Cohen
                 min_uniform_score = score
         return min_relevant_processes
     
+    
+    #reduction_alpha and reduction_beta play the same role as in the methods above:
+    #they bound the accepted number of clusters relative to the number of processes called
+    def get_relevant_which_minimizes_the_number_of_conditions(self, reduction_alpha = 0.2, reduction_beta = 0.8, number_of_tries = 50):
+        import numpy as np
+        import copy
+        min_condition_score, min_relevant_processes = np.inf, []
+        already_tried = []
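+        #Work on deep copies of the workflow: convert_workflow_2_user_view rewrites it in place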
+        w_save = copy.deepcopy(self)
+        number_processes_called = len(self.get_processes_called())
+        print('-'*number_of_tries+">") #Progress bar outline: one '.' per try is printed below
+        for i in range(number_of_tries):
+            print('.', end='')
+            w = copy.deepcopy(w_save)
+            random_relevant_processes = w.generate_random_relevant_processes()
+            escape = 0
+            while(escape<100 and set(random_relevant_processes) in already_tried):
+                escape+=1
+                random_relevant_processes = w.generate_random_relevant_processes()
+            #If 100 draws in a row have already been tried, we've most likely searched the majority of the possibilities
+            if(escape>=100):
+                return min_relevant_processes
+            already_tried.append(set(random_relevant_processes))
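+            #convert_workflow_2_user_view returns the rewritten code and, for each cluster, its number of executors and conditions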
+            _, cluster_organisation = w.convert_workflow_2_user_view(relevant_processes=random_relevant_processes, render_graphs = False)
+            
+            tab_nb_executors_per_cluster, tab_nb_conditions_per_cluster = [], []
+            for c in cluster_organisation:
+                tab_nb_executors_per_cluster.append(cluster_organisation[c]["nb_executors"])
+                tab_nb_conditions_per_cluster.append(cluster_organisation[c]["nb_conditions"])
+            
+            #The candidate's score is the largest number of conditions found in a single cluster
+            score = np.max(tab_nb_conditions_per_cluster)
+            #Alternative scores that could be used instead:
+            #score = np.mean(tab_nb_conditions_per_cluster)
+            #score = np.median(tab_nb_conditions_per_cluster)
+            #Ratio of conditions to executors per cluster
+            #score = np.max(np.array(tab_nb_conditions_per_cluster)/np.array(tab_nb_executors_per_cluster))
+
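+            #Keep the candidate only if its number of clusters lies within the reduction bounds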
+            if(len(cluster_organisation)>=reduction_alpha*number_processes_called and 
+               len(cluster_organisation)<=reduction_beta*number_processes_called and 
+               score<min_condition_score):
+                min_relevant_processes = random_relevant_processes
+                min_condition_score = score
+        return min_relevant_processes
 
     #Method that returns the order of execution for each executor
     def get_order_execution_executors(self):
@@ -1076,11 +1127,16 @@ George Marchment, Bryan Brancotte, Marie Schmit, Frédéric Lemoine, Sarah Cohen
             calls_in_operations = []
             non_relevant_name = 1
             
-            subworkflow_clusters_to_add, subworkflow_cluster_calls_to_add = [], []
+            #Dict mapping each cluster's subworkflow code to its number of executors and conditions
+            clusters_2_organisation = {}
+
+            #subworkflow_clusters_to_add, subworkflow_cluster_calls_to_add = [], []
             index_cluster = len(clusters)
             #We replace the last clusters first -> this is cause the outputs of the last clusters aren't used anywhere else in the workflow by definition 
             for elements in list(reversed(clusters)):
 
+                nb_executors = 0 #Number of executors added to this cluster
+
                 channels_to_replace_outside_of_cluster = []
 
                 #Check that there is at least one process in cluster
@@ -1093,11 +1149,12 @@ George Marchment, Bryan Brancotte, Marie Schmit, Frédéric Lemoine, Sarah Cohen
                 processes_added = []
                 things_added_in_cluster = []
                 if(len(elements)>=1 and at_least_one_process):
+                    
                     name, body, take, emit = "", "", "", ""
                     first_element = True
 
                     for ele in elements:
-            
+                        nb_executors+=1
                         if(ele.get_type()=="Process"):
                             
                             #Determine the name of the created subworkflow cluster
@@ -1263,6 +1320,7 @@ George Marchment, Bryan Brancotte, Marie Schmit, Frédéric Lemoine, Sarah Cohen
                             channels_to_replace_outside_of_cluster.append((old_output_names[i], param_out_name))
                     #If there was only one single condition in the subworkflow cluster -> then we add it when the call is done
                     if(len(conditions_in_subworkflow)==1):
+                        #TODO -> the "else" case probably needs to be removed because the empty channel it creates may sometimes overwrite an existing one
                         subworkfow_call = f"if({conditions_in_subworkflow[0].split('$$__$$')[0]}) {{\n{subworkfow_call_case_true}\n}} else {{\n{subworkfow_call_case_false}\n}}"
                     else:
                         subworkfow_call = subworkfow_call_case_true
@@ -1298,6 +1356,9 @@ George Marchment, Bryan Brancotte, Marie Schmit, Frédéric Lemoine, Sarah Cohen
                         code = replace_group1(code, pattern, new)
                         #code = code.replace(old, new)
 
+                    #Since the conditions were added by this method itself, they can be counted by simply searching for the "if(" pattern
+                    clusters_2_organisation[subworkflow_code] = {"nb_executors":nb_executors, "nb_conditions":subworkflow_code.count("if(")}
+
                     #Add the subworkflow defintions
                     #-------------------------------------                
                     code = code.replace(f'{subworkflow_section}', f"{subworkflow_code}\n\n{subworkflow_section}")
@@ -1332,7 +1393,7 @@ George Marchment, Bryan Brancotte, Marie Schmit, Frédéric Lemoine, Sarah Cohen
             f.write(code)
             f.close()
             self.rewrite_and_initialise(code, self.processes_2_remove, render_graphs=render_graphs)
-            return code
+            return code, clusters_2_organisation
             #return code
             #
             ##So basically when retriving a thing (process or subworkflow)
-- 
GitLab