From 9c657b980e8a5f5dcc5325bc1be9d54bd04ab9c0 Mon Sep 17 00:00:00 2001
From: George Marchment <georgemarchment@yahoo.fr>
Date: Wed, 23 Apr 2025 16:31:56 +0200
Subject: [PATCH] Started creating profiles + added the option when a non
 relevant process is being created -> it takes the name from the process with
 the least used bioinfo tool

---
 ressources/tool_2_nb_usage.json | 737 ++++++++++++++++++++++++++++++++
 src/workflow.py                 | 129 +++++-
 2 files changed, 846 insertions(+), 20 deletions(-)
 create mode 100644 ressources/tool_2_nb_usage.json

diff --git a/ressources/tool_2_nb_usage.json b/ressources/tool_2_nb_usage.json
new file mode 100644
index 0000000..ac614ed
--- /dev/null
+++ b/ressources/tool_2_nb_usage.json
@@ -0,0 +1,737 @@
+{
+    "canu": 2,
+    "quast": 5,
+    "multiqc": 114,
+    "fastqc": 132,
+    "pilon": 2,
+    "minimap2": 28,
+    "python": 658,
+    "echo": 460,
+    "nanoplot": 5,
+    "mv": 191,
+    "samtools": 428,
+    "masurca": 1,
+    "rm": 94,
+    "fastq-dump": 9,
+    "wget": 26,
+    "tar": 56,
+    "cat": 1205,
+    "bwa": 62,
+    "touch": 621,
+    "wc": 54,
+    "tail": 28,
+    "head": 79,
+    "zerone": 1,
+    "cp": 143,
+    "trim_galore": 26,
+    "star": 44,
+    "sed": 904,
+    "mkdir": 356,
+    "grep": 196,
+    "awk": 195,
+    "sort": 103,
+    "featurecounts": 21,
+    "tr": 47,
+    "r": 229,
+    "gs": 1,
+    "uniq": 46,
+    "paste": 15,
+    "wait": 2,
+    "fastp": 31,
+    "kraken2": 14,
+    "shovill": 4,
+    "prokka": 12,
+    "ln": 59,
+    "cellbender": 3,
+    "salmon": 18,
+    "filtlong": 4,
+    "porechop": 3,
+    "flye": 3,
+    "racon": 3,
+    "medaka_consensus": 1,
+    "homopolish": 1,
+    "kallisto": 11,
+    "diamond": 8,
+    "cd": 43,
+    "vsearch": 10,
+    "seqtk": 11,
+    "clustalo": 3,
+    "cd-hit": 3,
+    "ls": 25,
+    "set": 53,
+    "muscle": 3,
+    "trimal": 3,
+    "bc": 11,
+    "entropy-analysis": 2,
+    "o-trim-uninformative-columns-from-alignment": 1,
+    "seqfu": 4,
+    "fu-index": 1,
+    "pigz": 35,
+    "bracken": 6,
+    "megahit": 2,
+    "prodigal": 8,
+    "methylpy": 2,
+    "picard": 57,
+    "bshap": 1,
+    "genomeanalysistk": 14,
+    "bcftools": 79,
+    "kwip": 1,
+    "trinity": 8,
+    "find": 27,
+    "split": 15,
+    "singularity": 8,
+    "parallel": 11,
+    "chmod": 8,
+    "eof": 6,
+    "perl": 102,
+    "hello_cpu": 1,
+    "hello_gpu": 1,
+    "guppy_basecaller": 1,
+    "mini_assemble": 1,
+    "blastn": 8,
+    "blast_formatter": 3,
+    "blastdbcmd": 3,
+    "mini_align": 1,
+    "from": 1,
+    "outhandle.writenewick": 1,
+    "with": 2,
+    "aleobserve": 1,
+    "aleml_undated": 1,
+    "print.join[,": 1,
+    "malt-build": 2,
+    "malt-run": 1,
+    "rma2info": 3,
+    "printf": 59,
+    "spaced": 1,
+    "snp-pileup": 1,
+    "mafft": 4,
+    "cmd": 6,
+    "fasttree": 1,
+    "pangolin": 7,
+    "date": 23,
+    "tee": 27,
+    "zcat": 44,
+    "flexiplex": 3,
+    "sambamba": 10,
+    "delly_v1.1.6_linux_x86_64bit": 1,
+    "breakdancer-max": 1,
+    "tardis": 1,
+    "cnvnator": 1,
+    "gunzip": 65,
+    "lumpyexpress": 1,
+    "extractsplitreads_bwamem": 1,
+    "whamg": 1,
+    "rev": 12,
+    "tac": 1,
+    "curl": 2,
+    "freebayes": 9,
+    "tabix": 56,
+    "bgzip": 58,
+    "gffread": 12,
+    "pblat": 2,
+    "cutadapt": 20,
+    "seqkit": 10,
+    "gzip": 99,
+    "nanostat": 1,
+    "trimmomatic": 14,
+    "nanofilt": 2,
+    "ktimporttext": 3,
+    "mkfifo": 1,
+    "bcl2fastq": 2,
+    "pairtools": 5,
+    "cooler": 10,
+    "gatk": 71,
+    "bamtofastq": 4,
+    "java": 11,
+    "alfred": 1,
+    "baysor": 1,
+    "docker": 3,
+    "julia": 5,
+    "extra": 1,
+    "pip": 5,
+    "sleep": 11,
+    "mummichog": 2,
+    "sudo": 2,
+    "mem": 6,
+    "merge": 4,
+    "markduplicates": 4,
+    "index": 13,
+    "haplotypecaller": 4,
+    "indelqual": 2,
+    "call-parallel": 2,
+    "filter": 2,
+    "indexfeaturefile": 3,
+    "stats": 2,
+    "collectwgsmetrics": 2,
+    "flagstat": 2,
+    "baserecalibrator": 4,
+    "applybqsr": 4,
+    "profile": 4,
+    "collate": 4,
+    "call": 2,
+    "combinegvcfs": 1,
+    "genotypegvcfs": 1,
+    "variantrecalibrator": 6,
+    "bedtools": 90,
+    "metilene": 1,
+    "ivar": 5,
+    "command1": 9,
+    "contammix": 1,
+    "lofreq": 3,
+    "gatk3": 1,
+    "bwa-mem2": 15,
+    "vt": 1,
+    "vafator": 2,
+    "multiallelics-filter": 1,
+    "whatshap": 1,
+    "snpeff": 8,
+    "assert_eq": 1,
+    "assert_not_eq": 1,
+    "macs2": 7,
+    "bedmap": 2,
+    "sort-bed": 3,
+    "bam2bed": 1,
+    "wigtobigwig": 12,
+    "starch": 2,
+    "unstarch": 2,
+    "snpsift": 1,
+    "gawk": 5,
+    "occultercut": 1,
+    "gnuplot": 1,
+    "iqtree": 9,
+    "dnadiff": 1,
+    "crux": 2,
+    "bedtobigbed": 5,
+    "miniprot": 2,
+    "end": 3,
+    "paftools.js": 3,
+    "windowmasker": 4,
+    "bedgraphtobigwig": 12,
+    "pyfasta": 1,
+    "mummer": 1,
+    "find_telomere": 1,
+    "cram_filter": 1,
+    "pretextmap": 2,
+    "pretextsnapshot": 1,
+    "merge_bedgraph_files": 1,
+    "trycycler": 2,
+    "raven": 1,
+    "miniasm": 1,
+    "any2fasta": 1,
+    "minipolish": 1,
+    "mlst": 3,
+    "dos2unix": 2,
+    "md5sum": 3,
+    "params.app": 1,
+    "makeblastdb": 6,
+    "snakemake": 1,
+    "smalt": 2,
+    "count++": 1,
+    "phyml": 1,
+    "bwa_invoc": 1,
+    "picard_invoc": 1,
+    "pwd": 3,
+    "cmalign": 1,
+    "biom": 1,
+    "qiime": 6,
+    "vcf-sort": 2,
+    "vcf-concat": 2,
+    "bio-vcf": 1,
+    "plink2": 11,
+    "join": 7,
+    "comm": 2,
+    "gemma": 2,
+    "k++": 2,
+    "timeout": 1,
+    "ignore": 1,
+    "eliminates": 1,
+    "vk": 6,
+    "fastq-kmers": 1,
+    "bam": 2,
+    "gsub,": 1,
+    "print": 1,
+    "gsub,,": 1,
+    "plink": 9,
+    "rvtest": 2,
+    "sh": 1,
+    "gimme": 1,
+    "sortbed": 7,
+    "featurefindermultiplex": 2,
+    "highresprecursormasscorrector": 2,
+    "msgfplusadapter": 2,
+    "peptideindexer": 2,
+    "falsediscoveryrate": 2,
+    "mztabexporter": 4,
+    "idmapper": 2,
+    "featurelinkerunlabeledqt": 2,
+    "idconflictresolver": 2,
+    "consensusmapnormalizer": 2,
+    "idposteriorerrorprobability": 2,
+    "idmerger": 2,
+    "epifany": 2,
+    "proteinquantifier": 2,
+    "kraken-biom": 2,
+    "samblaster": 2,
+    "bowtie2": 35,
+    "kaiju": 2,
+    "kaijureport": 1,
+    "kaiju2krona": 2,
+    "bowtie2-build": 23,
+    "pilon-1.22": 1,
+    "abricate": 3,
+    "wtpoa-cns": 1,
+    "wtdbg2": 1,
+    "jupyter": 4,
+    "jupytext": 1,
+    "velocyto": 1,
+    "reportsrender": 3,
+    "expected_liftoff_integers.each": 1,
+    "throw": 2,
+    "expected_liftoff_floats.each": 1,
+    "expected_liftoff_strings.each": 1,
+    "expected_strings.each": 1,
+    "assert": 2,
+    "it.setpermissions": 1,
+    "dir.setpermissions": 1,
+    "try": 1,
+    "file.delete": 1,
+    "dir.deletedir": 1,
+    "dir.eachfile": 1,
+    "file.setpermissions": 1,
+    "it.deletedir": 1,
+    "repeatmasker": 3,
+    "liftoff": 1,
+    "bakta_db": 2,
+    "bakta": 4,
+    "conda": 2,
+    "kaiju-makedb": 1,
+    "kaiju2table": 1,
+    "ktimporttaxonomy": 2,
+    "aws": 5,
+    "pipspeak": 1,
+    "bustools": 11,
+    "scispeak": 1,
+    "colabfold_batch": 4,
+    "psiblast": 1,
+    "msa_filter_blast.awk": 1,
+    "prepare_receptor": 1,
+    "pymol": 2,
+    "prank": 1,
+    "prepare_ligand": 1,
+    "ligand": 1,
+    "srapath": 1,
+    "sha512sum": 1,
+    "sha256sum": 1,
+    "montage": 1,
+    "blastp": 1,
+    "metaphlan": 2,
+    "humann": 1,
+    "php": 3,
+    "file.fq": 1,
+    "kseq_test": 2,
+    "yacrd": 1,
+    "sam2rma": 2,
+    "micromamba": 1,
+    "bamaddrg": 2,
+    "vcfstats": 1,
+    "vcftools": 1,
+    "vcfrandomsample": 1,
+    "angsd": 1,
+    "pyscenic": 3,
+    "prefetch": 2,
+    "hisat2-build": 4,
+    "gtf2bed": 4,
+    "mc": 5,
+    "bamsort": 2,
+    "bam_stats": 1,
+    "bammarkduplicates": 2,
+    "hmmsearch": 5,
+    "zstdcat": 4,
+    "irma": 2,
+    "mqc_file.text": 3,
+    "contents": 3,
+    "mosdepth": 3,
+    "medaka_variant": 1,
+    "medaka": 1,
+    "pride-molecules-indexer-1.0.0-snapshot-bin": 6,
+    "maracluster": 1,
+    "extract_exons": 2,
+    "flattengtf": 2,
+    "hisat2": 7,
+    "jgi_summarize_bam_contig_depths": 1,
+    "metabat2": 1,
+    "das_tool": 1,
+    "checkm": 2,
+    "msmc2": 1,
+    "astral.5.7.4": 1,
+    "pgs-calc": 6,
+    "nvidia-smi": 1,
+    "nvcc": 1,
+    "igvtools": 8,
+    "parallel-fastq-dump": 4,
+    "genomecoveragebed": 10,
+    "process_atac": 1,
+    "bowtie-build": 4,
+    "javac": 1,
+    "mageck": 1,
+    "fastx_reverse_complement": 1,
+    "fastx_trimmer": 4,
+    "snippy": 2,
+    "cmd3": 2,
+    "cmd1": 2,
+    "cmd2": 2,
+    "cmd4": 2,
+    "extract_splice_sites": 1,
+    "rsem-prepare-reference": 2,
+    "umi_tools": 6,
+    "bowtie": 5,
+    "rungzip": 6,
+    "rsem-calculate-expression": 2,
+    "runsamtools": 10,
+    "tophat2": 1,
+    "rmdir": 1,
+    "memote": 2,
+    "groot": 1,
+    "kmc_tools": 1,
+    "kmc": 1,
+    "genomescope2": 5,
+    "csvtk": 4,
+    "nextclade": 3,
+    "augur": 24,
+    "gbrs": 2,
+    "export-genoprob-file": 1,
+    "alignmentsieve": 1,
+    "g2gtools": 1,
+    "bamcoverage": 1,
+    "cellranger": 8,
+    "mash": 10,
+    "b*": 8,
+    "wine": 2,
+    "kronik": 1,
+    "msstitch": 2,
+    "liftover": 1,
+    "unicycler": 2,
+    "git": 6,
+    "run": 5,
+    "varscan": 1,
+    "vardict-java": 1,
+    "view": 3,
+    "rmdup": 1,
+    "fixmate": 1,
+    "resistome": 2,
+    "rarefaction": 1,
+    "quasitools": 1,
+    "sierralocal": 1,
+    "synapse": 2,
+    "challengeutils": 6,
+    "model": 2,
+    "stansummary": 1,
+    "adapterremoval": 9,
+    "bracken-build": 1,
+    "centrifuge": 1,
+    "basta": 1,
+    "basta2krona": 1,
+    "ratt": 1,
+    "ffmpeg": 4,
+    "local": 1,
+    "add_subtitles": 1,
+    "add_bumper": 1,
+    "preseq": 6,
+    "unionbedgraphs": 1,
+    "bidir": 1,
+    "segment": 1,
+    "train": 1,
+    "bamtobed": 4,
+    "calc_maximal_isoform.bash": 1,
+    "fastasplitn": 1,
+    "builddatabase": 1,
+    "repeatmodeler": 1,
+    "test": 2,
+    "fastasort": 1,
+    "cdbfasta": 1,
+    "bam2hints": 1,
+    "fasomerecords": 1,
+    "mysql_create_options": 1,
+    "pasa_asmbls_to_training_set.dbi": 1,
+    "augustus": 2,
+    "etraining": 1,
+    "options": 1,
+    "fastaexplode": 1,
+    "srst2": 9,
+    "segemehl.x": 2,
+    "circos": 2,
+    "declare": 2,
+    "let": 1,
+    "bamtools": 5,
+    "bamcompare": 1,
+    "hicup_digester": 1,
+    "hicup_truncater": 1,
+    "hicup_mapper": 2,
+    "hicup_filter": 1,
+    "hicup_deduplicator": 2,
+    "juicer_tools_1.22.01": 1,
+    "pairix": 2,
+    "mlr": 1,
+    "egrep": 2,
+    "cobalt": 1,
+    "amber": 1,
+    "purple": 1,
+    "octopus": 1,
+    "nextflow": 1,
+    "shasum": 1,
+    "bandage": 1,
+    "nanoq": 4,
+    "dragonflye": 2,
+    "gotree": 4,
+    "goalign": 4,
+    "clipkit": 1,
+    "raxml-ng": 2,
+    "nw_order": 1,
+    "convert2bed": 1,
+    "silva-preprocess": 1,
+    "silva-run": 1,
+    "might": 1,
+    "mafsinregion": 2,
+    "msa_view": 4,
+    "mafspeciessubset": 2,
+    "iget": 1,
+    "fastq_screen": 3,
+    "otherwise": 1,
+    "sniffles": 1,
+    "purge_haplotigs": 2,
+    "chromap": 2,
+    "yahs": 1,
+    "juicer": 1,
+    "assembly-stats": 1,
+    "shopt": 4,
+    "delly": 1,
+    "amrfinder": 3,
+    "run_deepvariant": 1,
+    "glnexus_cli": 1,
+    "qualimap": 1,
+    "smoove": 1,
+    "tiddit": 2,
+    "survivor": 2,
+    "annotsv": 1,
+    "lcep-package": 1,
+    "system-intelligence": 1,
+    "raxmlhpc-pthreads": 2,
+    "zstd": 8,
+    "clodius": 1,
+    "unlink": 4,
+    "vcf-consensus": 1,
+    "ncbi-acc-download": 1,
+    "unzip": 2,
+    "vdb-validate": 1,
+    "vdb-config": 1,
+    "efetch": 1,
+    "aria2c": 16,
+    "xmlstarlet": 1,
+    "kb": 2,
+    "mykrobe": 1,
+    "tb-profiler": 3,
+    "snippy-core": 1,
+    "kvarq": 1,
+    "rclone": 1,
+    "ukbunpack": 1,
+    "dconvert": 2,
+    "blacklist_filter": 2,
+    "bedops": 2,
+    "plotprofile": 3,
+    "computematrix": 3,
+    "plotheatmap": 1,
+    "ncrf": 1,
+    "graphaligner": 1,
+    "spoa": 1,
+    "fgrep": 2,
+    "vg": 3,
+    "kpal": 1,
+    "37": 1,
+    "quantitative": 1,
+    "binary": 1,
+    "38": 1,
+    "finemap_v1.4_x86_64": 2,
+    "locuszoom": 2,
+    "score-client": 1,
+    "time": 2,
+    "call_variants": 1,
+    "postprocess_variants": 1,
+    "density-peaks.bash": 1,
+    "selenoprofiles": 1,
+    "strling": 5,
+    "trimmomatic-0.36": 1,
+    "kmergenie": 1,
+    "abyss-pe": 1,
+    "velveth": 1,
+    "velvetg": 1,
+    "fq2fa": 1,
+    "idba_ud": 1,
+    "command": 2,
+    "ngs-extract-consensus-stdout": 1,
+    "fasta2gfe_nextflow": 1,
+    "razers3": 2,
+    "params.mira": 1,
+    "params.mitobim": 1,
+    "vcffilter": 1,
+    "files": 5,
+    "regenievalidateinput": 4,
+    "regenie": 12,
+    "regenielogparser": 6,
+    "genomic-utils": 9,
+    "icount": 4,
+    "which": 1,
+    "sortmerna": 3,
+    "paraclu-cut": 2,
+    "paraclu": 2,
+    "intersectbed": 1,
+    "yara_indexer": 1,
+    "yara_mapper": 1,
+    "cutsite_trimming": 1,
+    "build_matrix": 1,
+    "ice": 1,
+    "cooltools": 2,
+    "hicfindtads": 1,
+    "hicplotdistvscounts": 1,
+    "import": 2,
+    "os.renameinterval,": 2,
+    "code": 2,
+    "directory,": 2,
+    "samtools_cram_convert": 1,
+    "fileinfo": 1,
+    "pyprophet": 4,
+    "idfileconverter": 1,
+    "easypqp": 1,
+    "openswathassaygenerator": 1,
+    "targetedfileconverter": 6,
+    "openswathdecoygenerator": 1,
+    "openswathworkflow": 1,
+    "fileconverter": 1,
+    "openswathmzmlfilecacher": 1,
+    "ampcombi": 1,
+    "amplify": 1,
+    "macrel": 1,
+    "hamronize": 12,
+    "amrfinder_update": 2,
+    "fargene": 2,
+    "rgi": 2,
+    "deeparg": 4,
+    "download-antismash-databases": 1,
+    "cp_cmd": 1,
+    "antismash": 3,
+    "deepbgc": 2,
+    "gecco": 1,
+    "bioawk": 1,
+    "pyrodigal": 1,
+    "get_decoy_ids": 1,
+    "switch": 16,
+    "break": 16,
+    "mv_unsorted_bam": 6,
+    "index_gff": 1,
+    "miso": 1,
+    "sashimi_plot": 1,
+    "bedclip": 2,
+    "readlist.eachwithindex": 4,
+    "sb26": 1,
+    "ab339": 1,
+    "chr1": 1,
+    "chrom": 1,
+    "seq_batches": 1,
+    "amp_batch": 1,
+    "well_id": 1,
+    "wells_cells": 1,
+    "gene_intervals": 1,
+    "seq_batch_id": 1,
+    "tw1": 1,
+    "amp_batch_id": 1,
+    "ska": 1,
+    "sourmash": 4,
+    "bam2fasta": 1,
+    "orpheum": 1,
+    "spaceranger": 1,
+    "quarto": 3,
+    "bismark_genome_preparation": 1,
+    "bismark": 1,
+    "bismark_methylation_extractor": 1,
+    "bismark2report": 1,
+    "bismark2summary": 1,
+    "deduplicate_bismark": 1,
+    "coverage2cytosine": 1,
+    "meryl": 12,
+    "fastk": 4,
+    "fastmerge": 4,
+    "histex": 4,
+    "ploidyplot": 4,
+    "katgc": 4,
+    "busco": 1,
+    "merquryfk": 4,
+    "ccs": 1,
+    "lima": 1,
+    "isoseq3": 1,
+    "ultra": 2,
+    "hmmbuild": 1,
+    "easel": 5,
+    "esl-reformat": 3,
+    "hmmalign": 2,
+    "esl-alimask": 2,
+    "epa-ng": 2,
+    "gappa": 3,
+    "cat_input": 1,
+    "transdecoder.longorfs": 1,
+    "transdecoder.predict": 1,
+    "zgrep": 1,
+    "exec_annotation": 1,
+    "eukulele": 1,
+    "unpigz": 1,
+    "transrate": 1,
+    "fgbio": 8,
+    "pygmentize": 1,
+    "pureclip": 2,
+    "meme": 1,
+    "piranha": 2,
+    "shuf": 4,
+    "dreme": 4,
+    "supernova": 4,
+    "rsync": 2,
+    "rcode": 1,
+    "vep": 2,
+    "pigz:echo": 2,
+    "kaiju-mkfmi": 1,
+    "kaiju-mkbwt": 1,
+    "centrifuge-build": 1,
+    "7za": 1,
+    "pmdtools": 3,
+    "sourcepredict": 2,
+    "normalizedreadcount": 2,
+    "damageprofiler": 3,
+    "mmseqs": 7,
+    "colabfold_search": 2,
+    "rasusa": 2,
+    "name_sort_bam": 1,
+    "plotfingerprint": 1,
+    "crispresso": 1,
+    "cellprofiler": 3,
+    "querylca.tsv": 1,
+    "csvtotable": 1,
+    "ktimportec": 1,
+    "flexbar": 1,
+    "novoalign": 1,
+    "bamqc": 1,
+    "stress": 1,
+    "tagdust": 1,
+    "zip": 1,
+    "fastq-scan": 2,
+    "snp-sites": 1,
+    "pgdspider2-cli": 1,
+    "ne2-1l": 1,
+    "0.02": 1,
+    "kma": 2,
+    "aligncov": 1,
+    "dorado": 3,
+    "run_pepper_margin_deepvariant": 1,
+    "pod5": 1,
+    "pycoqc": 1,
+    "modkit": 1,
+    "fasterq-dump": 1,
+    "complementbed": 1,
+    "mergebed": 2
+}
\ No newline at end of file
diff --git a/src/workflow.py b/src/workflow.py
index e993caf..0cbfb61 100644
--- a/src/workflow.py
+++ b/src/workflow.py
@@ -19,6 +19,7 @@ from pathlib import Path
 import glob
 import ctypes
 import time
+import numpy as np
 
 
 
@@ -81,7 +82,6 @@ class Workflow:
         self.ternary_operation_dico = {}
         self.map_element_dico = {}
 
-
         OG_file = Nextflow_File(file, workflow = self, first_file = True)
         self.DSL = OG_file.find_DSL()
         self.create_empty_results()
@@ -162,8 +162,7 @@ George Marchment, Bryan Brancotte, Marie Schmit, Frédéric Lemoine, Sarah Cohen
 
         if(self.graph==None):
             self.graph = Graph(self)
-
-
+        
 
     def iniatilise_tab_processes_2_remove(self):
         if(self.processes_2_remove==None):
@@ -425,8 +424,20 @@ George Marchment, Bryan Brancotte, Marie Schmit, Frédéric Lemoine, Sarah Cohen
             searching = False
             if(random_pool):
                 alpha = get_value()
-            nb_2_select = int(alpha*len(pool))
-            sampled = random.sample(set(pool), nb_2_select)
+            nb_2_select = int(alpha*len(set(pool)))
+
+            #Picking the processes one at a time until we've reached the required number
+            sampled = []
+            while(len(sampled)<nb_2_select):
+                element = random.sample(pool, 1)
+                sampled+=element
+                #Removing all occurrences of the element from the pool
+                #We do this because there can be multiple copies of the same element
+                #when the pool is weighted by tool frequency
+                pool = list(filter(lambda a: a != element[0], pool))
+            ##This was the 'simple' way of doing it (in the case there weren't any duplicates in the pool)
+            ##The new method works both when there are duplicates and when there aren't
+            #sampled = random.sample(set(pool), nb_2_select)
             
             sampled_str = []
             for s in sampled:
@@ -462,9 +473,7 @@ George Marchment, Bryan Brancotte, Marie Schmit, Frédéric Lemoine, Sarah Cohen
         else:
             raise BioFlowInsightError("Trying to generate random relevant processes however option 'duplicate' is not activated.")
 
-        
-    
-    #TODO -> do this in a bit of a smarter way -> looking at popularity of the tools
+         
     def get_random_relevant_processes_which_use_bioinformatics_tools(self, processes_2_tools = {}):
         if(self.duplicate):
             processes_called = []
@@ -488,11 +497,59 @@ George Marchment, Bryan Brancotte, Marie Schmit, Frédéric Lemoine, Sarah Cohen
             raise BioFlowInsightError("Trying to generate random relevant processes however option 'duplicate' is not activated.")
 
 
+    def get_random_relevant_processes_which_use_bioinformatics_tools_considering_their_frequency(self, processes_2_tools = {}):
+        """Draw random relevant processes, favouring processes whose tools are rarely used.
+
+        Each called process that uses at least one tool is weighted by the usage count
+        (read from ressources/tool_2_nb_usage.json) of its least used tool: the rarer the
+        tool, the more often the process is repeated in the pool that is handed to
+        draw_pool_and_check_dependencies. Tools missing from the JSON count as used once.
+
+        processes_2_tools: optional dict mapping a process code to its list of tools;
+            when empty, the tools are taken from p.get_tools() instead.
+        Returns whatever draw_pool_and_check_dependencies returns for the weighted pool.
+        Raises BioFlowInsightError if the option 'duplicate' is not activated.
+        """
+        #Fail early, before touching the disk, when the option is not activated
+        if(not self.duplicate):
+            raise BioFlowInsightError("Trying to generate random relevant processes however option 'duplicate' is not activated.")
+
+        #Locate the JSON relative to this file instead of changing the working directory
+        #(a failed open would otherwise leave the process in the wrong directory)
+        usage_path = Path(__file__).parent / ".." / "ressources" / "tool_2_nb_usage.json"
+        with open(usage_path, 'r') as file:
+            tool_2_nb_usage = json.load(file)
+
+        processes_called = []
+        if(self.get_DSL()=="DSL2"):
+            for c in self.get_workflow_main().get_all_calls_in_workflow():
+                p = c.get_first_element_called()
+                if(p.get_type()=="Process"):
+                    processes_called.append(p)
+        else:
+            processes_called = self.get_first_file().get_processes()
+
+        #Usage count of the least used tool of each process (processes without tools are skipped)
+        process_to_min_frequency = {}
+        for p in processes_called:
+            tools = processes_2_tools[p.get_code()] if(processes_2_tools!={}) else p.get_tools()
+            if(len(tools)>0):
+                process_to_min_frequency[p] = min(tool_2_nb_usage.get(t, 1) for t in tools)
+
+        #default=0: with no tool-using process the pool is simply empty instead of max() raising
+        max_nb = max(process_to_min_frequency.values(), default=0)
+        sample_of_processes = []
+        for p, freq in process_to_min_frequency.items():
+            #The rarer the tool the more copies of the process are put in the pool (at least one)
+            sample_of_processes+=((max_nb-freq)+1)*[p]
+        return self.draw_pool_and_check_dependencies(sample_of_processes)
+
+
     #The reduction alpha is the minimun number cluster depending on the percentage ofprocesses
     #For example if there are 10 processes and reduction_alpha = 0.2 -> we want at least 2 clusters
     #In the same idea if reduction_alpha = 0.4 -> we want at least 4 clusters
     def get_relevant_processes_which_minimize_nb_of_clusters(self, reduction_alpha = 0.2, number_of_tries = 50):
-        import numpy as np
+        
         min_nb_clusters, min_relevant_processes = np.inf, []
         already_tried = []
         print('-'*number_of_tries+">")
@@ -522,7 +579,7 @@ George Marchment, Bryan Brancotte, Marie Schmit, Frédéric Lemoine, Sarah Cohen
     #In the same idea if reduction_beta = 0.6 -> we want a maximum of 6 clusters
     #reduction_alpha is the same as above
     def get_relevant_processes_which_uniformizes_cluster_distribution(self, reduction_alpha = 0.2, reduction_beta = 0.8, number_of_tries = 50):
-        import numpy as np
+        
         min_uniform_score, min_relevant_processes = np.inf, []
         already_tried = []
         print('-'*number_of_tries+">")
@@ -564,7 +621,6 @@ George Marchment, Bryan Brancotte, Marie Schmit, Frédéric Lemoine, Sarah Cohen
     #reduction_alpha is the same as above
     #reduction_beta is the same as above
     def get_relevant_which_minizes_the_number_of_conditions(self, reduction_alpha = 0.2, reduction_beta = 0.8, number_of_tries = 50):
-        import numpy as np
         import copy
         min_condition_score, min_relevant_processes = np.inf, []
         already_tried = []
@@ -609,12 +665,12 @@ George Marchment, Bryan Brancotte, Marie Schmit, Frédéric Lemoine, Sarah Cohen
                                                   reduction_alpha = 0.2, 
                                                   reduction_beta = 0.8, 
                                                   number_of_tries = 50,
-                                                  using_processes_with_bioinfo_tools = True,
+                                                  process_pre_selection = "bioinfo_freq",
                                                   concordance_factor = 1,
                                                   uniformity_factor = 1,
                                                   min_nb_clusters_factor = 1,
-                                                  min_nb_non_relevant_cluster = 1):
-        import numpy as np
+                                                  min_nb_non_relevant_cluster_factor = 1):
+        
         import copy
         min_score, min_processes = np.inf, []
         already_tried = []
@@ -628,10 +684,15 @@ George Marchment, Bryan Brancotte, Marie Schmit, Frédéric Lemoine, Sarah Cohen
         for i in range(number_of_tries):
             print('.', end='')
             w = copy.deepcopy(w_save)
-            if(using_processes_with_bioinfo_tools):
+            
+            if(process_pre_selection == "bioinfo"):
                 random_relevant_processes = w.get_random_relevant_processes_which_use_bioinformatics_tools(processes_2_tools = processes_2_tools)
-            else:
+            elif(process_pre_selection == "bioinfo_freq"):
+                random_relevant_processes = w.get_random_relevant_processes_which_use_bioinformatics_tools_considering_their_frequency(processes_2_tools = processes_2_tools)
+            elif(process_pre_selection == "None"):
                 random_relevant_processes = w.generate_random_relevant_processes()
+            else:
+                raise Exception('process_pre_selection option not recognised')
             escape = 0
             while(escape<100 and set(random_relevant_processes) in already_tried):
                 escape+=1
@@ -660,14 +721,20 @@ George Marchment, Bryan Brancotte, Marie Schmit, Frédéric Lemoine, Sarah Cohen
             for x in tab_nb_processes_per_cluster:
                 uniformity_variance += (average_number_of_process_per_cluster-x)**2/nb_clusters
 
-            score = concordance_factor * np.mean(np.array(tab_nb_conditions_per_cluster)/np.array(tab_nb_executors_per_cluster)) + \
+            score = concordance_factor * np.max(np.array(tab_nb_conditions_per_cluster)/np.array(tab_nb_executors_per_cluster)) + \
                     uniformity_factor * (uniformity_variance / number_processes_called) + \
                     min_nb_clusters_factor * (nb_clusters / number_processes_called) + \
-                    min_nb_non_relevant_cluster * (nb_non_relevant_clusters / nb_clusters)
+                    min_nb_non_relevant_cluster_factor * (nb_non_relevant_clusters / nb_clusters)
 
             if(len(cluster_organisation)>=reduction_alpha*number_processes_called and 
                len(cluster_organisation)<=reduction_beta*number_processes_called and 
                score<min_score):
+                print()
+                print("concordance",  np.max(np.array(tab_nb_conditions_per_cluster)/np.array(tab_nb_executors_per_cluster)) )
+                print("uniformity",   (uniformity_variance / number_processes_called) )
+                print("min_nb_clusters",  (nb_clusters / number_processes_called) )
+                print("min_nb_non_relevant_cluster",  (nb_non_relevant_clusters / nb_clusters))
+                print("score", score)
                 min_processes = random_relevant_processes
                 min_score = score
         return min_processes
@@ -1070,7 +1137,15 @@ George Marchment, Bryan Brancotte, Marie Schmit, Frédéric Lemoine, Sarah Cohen
 
     #Method which rewrites the workflow follwong the user view
     #Conert workflow to user_view only makes sense when the option duplicate is activated -> otherwise is doesn't make sense + it makes the analysis way more complicated
-    def convert_workflow_2_user_view(self, relevant_processes = [], render_graphs = True):
+    def convert_workflow_2_user_view(self, relevant_processes = [], render_graphs = True, extract_tools = False):
+        if(extract_tools):
+            OG_path = os.getcwd()
+            #Change working directory to the one of the file
+            os.chdir("/".join((str(__file__).split("/")[:-1])))
+            with open("../ressources/tool_2_nb_usage.json", 'r') as file:
+                tool_2_nb_usage = json.load(file)
+            os.chdir(OG_path)
+        
         self.iniatilise_tab_processes_2_remove()
         self.graph.initialise(processes_2_remove = self.processes_2_remove)
         self.check_something_illegal_for_rewrite()
@@ -1364,9 +1439,23 @@ George Marchment, Bryan Brancotte, Marie Schmit, Frédéric Lemoine, Sarah Cohen
                     #If the name=="" -> it means there isn't any relevant processes in the cluster -> it means it's a cluster of non relevant nodes
                     if(name==""):
                         #If there are processes called we are going to use them
+                        #We use the process with the least used tool
                         if(len(processes_added)>0):
+                            
+                            min_tool, min_process = np.inf, processes_added[0]
+                            #If the tools have been extracted they can be used to name the non relevant processes (NOTE(review): the loop below iterates p.get_code() where p.get_tools() looks intended -- confirm)
+                            if(extract_tools):
+                                for p in processes_added:
+                                    for t in p.get_code():
+                                        try:
+                                            val = tool_2_nb_usage[t]
+                                        except:
+                                            val = 1
+                                        if(val<min_tool):
+                                            min_tool = val
+                                            min_process = p
                             #TODO find a better naming system
-                            name = f"non_relevant_cluster_{processes_added[0].get_alias()}"
+                            name = f"non_relevant_cluster_{min_process.get_alias()}"
                         else:
                             #TODO find a better naming system
                             name = f"non_relevant_cluster_{non_relevant_name}"
-- 
GitLab