From 9c657b980e8a5f5dcc5325bc1be9d54bd04ab9c0 Mon Sep 17 00:00:00 2001 From: George Marchment <georgemarchment@yahoo.fr> Date: Wed, 23 Apr 2025 16:31:56 +0200 Subject: [PATCH] Started creating profiles + added the option when a non relavant process is being created -> it takes the name from the process with the least used bioinfo tool process --- ressources/tool_2_nb_usage.json | 737 ++++++++++++++++++++++++++++++++ src/workflow.py | 129 +++++- 2 files changed, 846 insertions(+), 20 deletions(-) create mode 100644 ressources/tool_2_nb_usage.json diff --git a/ressources/tool_2_nb_usage.json b/ressources/tool_2_nb_usage.json new file mode 100644 index 0000000..ac614ed --- /dev/null +++ b/ressources/tool_2_nb_usage.json @@ -0,0 +1,737 @@ +{ + "canu": 2, + "quast": 5, + "multiqc": 114, + "fastqc": 132, + "pilon": 2, + "minimap2": 28, + "python": 658, + "echo": 460, + "nanoplot": 5, + "mv": 191, + "samtools": 428, + "masurca": 1, + "rm": 94, + "fastq-dump": 9, + "wget": 26, + "tar": 56, + "cat": 1205, + "bwa": 62, + "touch": 621, + "wc": 54, + "tail": 28, + "head": 79, + "zerone": 1, + "cp": 143, + "trim_galore": 26, + "star": 44, + "sed": 904, + "mkdir": 356, + "grep": 196, + "awk": 195, + "sort": 103, + "featurecounts": 21, + "tr": 47, + "r": 229, + "gs": 1, + "uniq": 46, + "paste": 15, + "wait": 2, + "fastp": 31, + "kraken2": 14, + "shovill": 4, + "prokka": 12, + "ln": 59, + "cellbender": 3, + "salmon": 18, + "filtlong": 4, + "porechop": 3, + "flye": 3, + "racon": 3, + "medaka_consensus": 1, + "homopolish": 1, + "kallisto": 11, + "diamond": 8, + "cd": 43, + "vsearch": 10, + "seqtk": 11, + "clustalo": 3, + "cd-hit": 3, + "ls": 25, + "set": 53, + "muscle": 3, + "trimal": 3, + "bc": 11, + "entropy-analysis": 2, + "o-trim-uninformative-columns-from-alignment": 1, + "seqfu": 4, + "fu-index": 1, + "pigz": 35, + "bracken": 6, + "megahit": 2, + "prodigal": 8, + "methylpy": 2, + "picard": 57, + "bshap": 1, + "genomeanalysistk": 14, + "bcftools": 79, + "kwip": 1, + "trinity": 
8, + "find": 27, + "split": 15, + "singularity": 8, + "parallel": 11, + "chmod": 8, + "eof": 6, + "perl": 102, + "hello_cpu": 1, + "hello_gpu": 1, + "guppy_basecaller": 1, + "mini_assemble": 1, + "blastn": 8, + "blast_formatter": 3, + "blastdbcmd": 3, + "mini_align": 1, + "from": 1, + "outhandle.writenewick": 1, + "with": 2, + "aleobserve": 1, + "aleml_undated": 1, + "print.join[,": 1, + "malt-build": 2, + "malt-run": 1, + "rma2info": 3, + "printf": 59, + "spaced": 1, + "snp-pileup": 1, + "mafft": 4, + "cmd": 6, + "fasttree": 1, + "pangolin": 7, + "date": 23, + "tee": 27, + "zcat": 44, + "flexiplex": 3, + "sambamba": 10, + "delly_v1.1.6_linux_x86_64bit": 1, + "breakdancer-max": 1, + "tardis": 1, + "cnvnator": 1, + "gunzip": 65, + "lumpyexpress": 1, + "extractsplitreads_bwamem": 1, + "whamg": 1, + "rev": 12, + "tac": 1, + "curl": 2, + "freebayes": 9, + "tabix": 56, + "bgzip": 58, + "gffread": 12, + "pblat": 2, + "cutadapt": 20, + "seqkit": 10, + "gzip": 99, + "nanostat": 1, + "trimmomatic": 14, + "nanofilt": 2, + "ktimporttext": 3, + "mkfifo": 1, + "bcl2fastq": 2, + "pairtools": 5, + "cooler": 10, + "gatk": 71, + "bamtofastq": 4, + "java": 11, + "alfred": 1, + "baysor": 1, + "docker": 3, + "julia": 5, + "extra": 1, + "pip": 5, + "sleep": 11, + "mummichog": 2, + "sudo": 2, + "mem": 6, + "merge": 4, + "markduplicates": 4, + "index": 13, + "haplotypecaller": 4, + "indelqual": 2, + "call-parallel": 2, + "filter": 2, + "indexfeaturefile": 3, + "stats": 2, + "collectwgsmetrics": 2, + "flagstat": 2, + "baserecalibrator": 4, + "applybqsr": 4, + "profile": 4, + "collate": 4, + "call": 2, + "combinegvcfs": 1, + "genotypegvcfs": 1, + "variantrecalibrator": 6, + "bedtools": 90, + "metilene": 1, + "ivar": 5, + "command1": 9, + "contammix": 1, + "lofreq": 3, + "gatk3": 1, + "bwa-mem2": 15, + "vt": 1, + "vafator": 2, + "multiallelics-filter": 1, + "whatshap": 1, + "snpeff": 8, + "assert_eq": 1, + "assert_not_eq": 1, + "macs2": 7, + "bedmap": 2, + "sort-bed": 3, + "bam2bed": 1, + 
"wigtobigwig": 12, + "starch": 2, + "unstarch": 2, + "snpsift": 1, + "gawk": 5, + "occultercut": 1, + "gnuplot": 1, + "iqtree": 9, + "dnadiff": 1, + "crux": 2, + "bedtobigbed": 5, + "miniprot": 2, + "end": 3, + "paftools.js": 3, + "windowmasker": 4, + "bedgraphtobigwig": 12, + "pyfasta": 1, + "mummer": 1, + "find_telomere": 1, + "cram_filter": 1, + "pretextmap": 2, + "pretextsnapshot": 1, + "merge_bedgraph_files": 1, + "trycycler": 2, + "raven": 1, + "miniasm": 1, + "any2fasta": 1, + "minipolish": 1, + "mlst": 3, + "dos2unix": 2, + "md5sum": 3, + "params.app": 1, + "makeblastdb": 6, + "snakemake": 1, + "smalt": 2, + "count++": 1, + "phyml": 1, + "bwa_invoc": 1, + "picard_invoc": 1, + "pwd": 3, + "cmalign": 1, + "biom": 1, + "qiime": 6, + "vcf-sort": 2, + "vcf-concat": 2, + "bio-vcf": 1, + "plink2": 11, + "join": 7, + "comm": 2, + "gemma": 2, + "k++": 2, + "timeout": 1, + "ignore": 1, + "eliminates": 1, + "vk": 6, + "fastq-kmers": 1, + "bam": 2, + "gsub,": 1, + "print": 1, + "gsub,,": 1, + "plink": 9, + "rvtest": 2, + "sh": 1, + "gimme": 1, + "sortbed": 7, + "featurefindermultiplex": 2, + "highresprecursormasscorrector": 2, + "msgfplusadapter": 2, + "peptideindexer": 2, + "falsediscoveryrate": 2, + "mztabexporter": 4, + "idmapper": 2, + "featurelinkerunlabeledqt": 2, + "idconflictresolver": 2, + "consensusmapnormalizer": 2, + "idposteriorerrorprobability": 2, + "idmerger": 2, + "epifany": 2, + "proteinquantifier": 2, + "kraken-biom": 2, + "samblaster": 2, + "bowtie2": 35, + "kaiju": 2, + "kaijureport": 1, + "kaiju2krona": 2, + "bowtie2-build": 23, + "pilon-1.22": 1, + "abricate": 3, + "wtpoa-cns": 1, + "wtdbg2": 1, + "jupyter": 4, + "jupytext": 1, + "velocyto": 1, + "reportsrender": 3, + "expected_liftoff_integers.each": 1, + "throw": 2, + "expected_liftoff_floats.each": 1, + "expected_liftoff_strings.each": 1, + "expected_strings.each": 1, + "assert": 2, + "it.setpermissions": 1, + "dir.setpermissions": 1, + "try": 1, + "file.delete": 1, + "dir.deletedir": 1, + 
"dir.eachfile": 1, + "file.setpermissions": 1, + "it.deletedir": 1, + "repeatmasker": 3, + "liftoff": 1, + "bakta_db": 2, + "bakta": 4, + "conda": 2, + "kaiju-makedb": 1, + "kaiju2table": 1, + "ktimporttaxonomy": 2, + "aws": 5, + "pipspeak": 1, + "bustools": 11, + "scispeak": 1, + "colabfold_batch": 4, + "psiblast": 1, + "msa_filter_blast.awk": 1, + "prepare_receptor": 1, + "pymol": 2, + "prank": 1, + "prepare_ligand": 1, + "ligand": 1, + "srapath": 1, + "sha512sum": 1, + "sha256sum": 1, + "montage": 1, + "blastp": 1, + "metaphlan": 2, + "humann": 1, + "php": 3, + "file.fq": 1, + "kseq_test": 2, + "yacrd": 1, + "sam2rma": 2, + "micromamba": 1, + "bamaddrg": 2, + "vcfstats": 1, + "vcftools": 1, + "vcfrandomsample": 1, + "angsd": 1, + "pyscenic": 3, + "prefetch": 2, + "hisat2-build": 4, + "gtf2bed": 4, + "mc": 5, + "bamsort": 2, + "bam_stats": 1, + "bammarkduplicates": 2, + "hmmsearch": 5, + "zstdcat": 4, + "irma": 2, + "mqc_file.text": 3, + "contents": 3, + "mosdepth": 3, + "medaka_variant": 1, + "medaka": 1, + "pride-molecules-indexer-1.0.0-snapshot-bin": 6, + "maracluster": 1, + "extract_exons": 2, + "flattengtf": 2, + "hisat2": 7, + "jgi_summarize_bam_contig_depths": 1, + "metabat2": 1, + "das_tool": 1, + "checkm": 2, + "msmc2": 1, + "astral.5.7.4": 1, + "pgs-calc": 6, + "nvidia-smi": 1, + "nvcc": 1, + "igvtools": 8, + "parallel-fastq-dump": 4, + "genomecoveragebed": 10, + "process_atac": 1, + "bowtie-build": 4, + "javac": 1, + "mageck": 1, + "fastx_reverse_complement": 1, + "fastx_trimmer": 4, + "snippy": 2, + "cmd3": 2, + "cmd1": 2, + "cmd2": 2, + "cmd4": 2, + "extract_splice_sites": 1, + "rsem-prepare-reference": 2, + "umi_tools": 6, + "bowtie": 5, + "rungzip": 6, + "rsem-calculate-expression": 2, + "runsamtools": 10, + "tophat2": 1, + "rmdir": 1, + "memote": 2, + "groot": 1, + "kmc_tools": 1, + "kmc": 1, + "genomescope2": 5, + "csvtk": 4, + "nextclade": 3, + "augur": 24, + "gbrs": 2, + "export-genoprob-file": 1, + "alignmentsieve": 1, + "g2gtools": 1, + 
"bamcoverage": 1, + "cellranger": 8, + "mash": 10, + "b*": 8, + "wine": 2, + "kronik": 1, + "msstitch": 2, + "liftover": 1, + "unicycler": 2, + "git": 6, + "run": 5, + "varscan": 1, + "vardict-java": 1, + "view": 3, + "rmdup": 1, + "fixmate": 1, + "resistome": 2, + "rarefaction": 1, + "quasitools": 1, + "sierralocal": 1, + "synapse": 2, + "challengeutils": 6, + "model": 2, + "stansummary": 1, + "adapterremoval": 9, + "bracken-build": 1, + "centrifuge": 1, + "basta": 1, + "basta2krona": 1, + "ratt": 1, + "ffmpeg": 4, + "local": 1, + "add_subtitles": 1, + "add_bumper": 1, + "preseq": 6, + "unionbedgraphs": 1, + "bidir": 1, + "segment": 1, + "train": 1, + "bamtobed": 4, + "calc_maximal_isoform.bash": 1, + "fastasplitn": 1, + "builddatabase": 1, + "repeatmodeler": 1, + "test": 2, + "fastasort": 1, + "cdbfasta": 1, + "bam2hints": 1, + "fasomerecords": 1, + "mysql_create_options": 1, + "pasa_asmbls_to_training_set.dbi": 1, + "augustus": 2, + "etraining": 1, + "options": 1, + "fastaexplode": 1, + "srst2": 9, + "segemehl.x": 2, + "circos": 2, + "declare": 2, + "let": 1, + "bamtools": 5, + "bamcompare": 1, + "hicup_digester": 1, + "hicup_truncater": 1, + "hicup_mapper": 2, + "hicup_filter": 1, + "hicup_deduplicator": 2, + "juicer_tools_1.22.01": 1, + "pairix": 2, + "mlr": 1, + "egrep": 2, + "cobalt": 1, + "amber": 1, + "purple": 1, + "octopus": 1, + "nextflow": 1, + "shasum": 1, + "bandage": 1, + "nanoq": 4, + "dragonflye": 2, + "gotree": 4, + "goalign": 4, + "clipkit": 1, + "raxml-ng": 2, + "nw_order": 1, + "convert2bed": 1, + "silva-preprocess": 1, + "silva-run": 1, + "might": 1, + "mafsinregion": 2, + "msa_view": 4, + "mafspeciessubset": 2, + "iget": 1, + "fastq_screen": 3, + "otherwise": 1, + "sniffles": 1, + "purge_haplotigs": 2, + "chromap": 2, + "yahs": 1, + "juicer": 1, + "assembly-stats": 1, + "shopt": 4, + "delly": 1, + "amrfinder": 3, + "run_deepvariant": 1, + "glnexus_cli": 1, + "qualimap": 1, + "smoove": 1, + "tiddit": 2, + "survivor": 2, + "annotsv": 1, + 
"lcep-package": 1, + "system-intelligence": 1, + "raxmlhpc-pthreads": 2, + "zstd": 8, + "clodius": 1, + "unlink": 4, + "vcf-consensus": 1, + "ncbi-acc-download": 1, + "unzip": 2, + "vdb-validate": 1, + "vdb-config": 1, + "efetch": 1, + "aria2c": 16, + "xmlstarlet": 1, + "kb": 2, + "mykrobe": 1, + "tb-profiler": 3, + "snippy-core": 1, + "kvarq": 1, + "rclone": 1, + "ukbunpack": 1, + "dconvert": 2, + "blacklist_filter": 2, + "bedops": 2, + "plotprofile": 3, + "computematrix": 3, + "plotheatmap": 1, + "ncrf": 1, + "graphaligner": 1, + "spoa": 1, + "fgrep": 2, + "vg": 3, + "kpal": 1, + "37": 1, + "quantitative": 1, + "binary": 1, + "38": 1, + "finemap_v1.4_x86_64": 2, + "locuszoom": 2, + "score-client": 1, + "time": 2, + "call_variants": 1, + "postprocess_variants": 1, + "density-peaks.bash": 1, + "selenoprofiles": 1, + "strling": 5, + "trimmomatic-0.36": 1, + "kmergenie": 1, + "abyss-pe": 1, + "velveth": 1, + "velvetg": 1, + "fq2fa": 1, + "idba_ud": 1, + "command": 2, + "ngs-extract-consensus-stdout": 1, + "fasta2gfe_nextflow": 1, + "razers3": 2, + "params.mira": 1, + "params.mitobim": 1, + "vcffilter": 1, + "files": 5, + "regenievalidateinput": 4, + "regenie": 12, + "regenielogparser": 6, + "genomic-utils": 9, + "icount": 4, + "which": 1, + "sortmerna": 3, + "paraclu-cut": 2, + "paraclu": 2, + "intersectbed": 1, + "yara_indexer": 1, + "yara_mapper": 1, + "cutsite_trimming": 1, + "build_matrix": 1, + "ice": 1, + "cooltools": 2, + "hicfindtads": 1, + "hicplotdistvscounts": 1, + "import": 2, + "os.renameinterval,": 2, + "code": 2, + "directory,": 2, + "samtools_cram_convert": 1, + "fileinfo": 1, + "pyprophet": 4, + "idfileconverter": 1, + "easypqp": 1, + "openswathassaygenerator": 1, + "targetedfileconverter": 6, + "openswathdecoygenerator": 1, + "openswathworkflow": 1, + "fileconverter": 1, + "openswathmzmlfilecacher": 1, + "ampcombi": 1, + "amplify": 1, + "macrel": 1, + "hamronize": 12, + "amrfinder_update": 2, + "fargene": 2, + "rgi": 2, + "deeparg": 4, + 
"download-antismash-databases": 1, + "cp_cmd": 1, + "antismash": 3, + "deepbgc": 2, + "gecco": 1, + "bioawk": 1, + "pyrodigal": 1, + "get_decoy_ids": 1, + "switch": 16, + "break": 16, + "mv_unsorted_bam": 6, + "index_gff": 1, + "miso": 1, + "sashimi_plot": 1, + "bedclip": 2, + "readlist.eachwithindex": 4, + "sb26": 1, + "ab339": 1, + "chr1": 1, + "chrom": 1, + "seq_batches": 1, + "amp_batch": 1, + "well_id": 1, + "wells_cells": 1, + "gene_intervals": 1, + "seq_batch_id": 1, + "tw1": 1, + "amp_batch_id": 1, + "ska": 1, + "sourmash": 4, + "bam2fasta": 1, + "orpheum": 1, + "spaceranger": 1, + "quarto": 3, + "bismark_genome_preparation": 1, + "bismark": 1, + "bismark_methylation_extractor": 1, + "bismark2report": 1, + "bismark2summary": 1, + "deduplicate_bismark": 1, + "coverage2cytosine": 1, + "meryl": 12, + "fastk": 4, + "fastmerge": 4, + "histex": 4, + "ploidyplot": 4, + "katgc": 4, + "busco": 1, + "merquryfk": 4, + "ccs": 1, + "lima": 1, + "isoseq3": 1, + "ultra": 2, + "hmmbuild": 1, + "easel": 5, + "esl-reformat": 3, + "hmmalign": 2, + "esl-alimask": 2, + "epa-ng": 2, + "gappa": 3, + "cat_input": 1, + "transdecoder.longorfs": 1, + "transdecoder.predict": 1, + "zgrep": 1, + "exec_annotation": 1, + "eukulele": 1, + "unpigz": 1, + "transrate": 1, + "fgbio": 8, + "pygmentize": 1, + "pureclip": 2, + "meme": 1, + "piranha": 2, + "shuf": 4, + "dreme": 4, + "supernova": 4, + "rsync": 2, + "rcode": 1, + "vep": 2, + "pigz:echo": 2, + "kaiju-mkfmi": 1, + "kaiju-mkbwt": 1, + "centrifuge-build": 1, + "7za": 1, + "pmdtools": 3, + "sourcepredict": 2, + "normalizedreadcount": 2, + "damageprofiler": 3, + "mmseqs": 7, + "colabfold_search": 2, + "rasusa": 2, + "name_sort_bam": 1, + "plotfingerprint": 1, + "crispresso": 1, + "cellprofiler": 3, + "querylca.tsv": 1, + "csvtotable": 1, + "ktimportec": 1, + "flexbar": 1, + "novoalign": 1, + "bamqc": 1, + "stress": 1, + "tagdust": 1, + "zip": 1, + "fastq-scan": 2, + "snp-sites": 1, + "pgdspider2-cli": 1, + "ne2-1l": 1, + "0.02": 1, + 
"kma": 2, + "aligncov": 1, + "dorado": 3, + "run_pepper_margin_deepvariant": 1, + "pod5": 1, + "pycoqc": 1, + "modkit": 1, + "fasterq-dump": 1, + "complementbed": 1, + "mergebed": 2 +} \ No newline at end of file diff --git a/src/workflow.py b/src/workflow.py index e993caf..0cbfb61 100644 --- a/src/workflow.py +++ b/src/workflow.py @@ -19,6 +19,7 @@ from pathlib import Path import glob import ctypes import time +import numpy as np @@ -81,7 +82,6 @@ class Workflow: self.ternary_operation_dico = {} self.map_element_dico = {} - OG_file = Nextflow_File(file, workflow = self, first_file = True) self.DSL = OG_file.find_DSL() self.create_empty_results() @@ -162,8 +162,7 @@ George Marchment, Bryan Brancotte, Marie Schmit, Frédéric Lemoine, Sarah Cohen if(self.graph==None): self.graph = Graph(self) - - + def iniatilise_tab_processes_2_remove(self): if(self.processes_2_remove==None): @@ -425,8 +424,20 @@ George Marchment, Bryan Brancotte, Marie Schmit, Frédéric Lemoine, Sarah Cohen searching = False if(random_pool): alpha = get_value() - nb_2_select = int(alpha*len(pool)) - sampled = random.sample(set(pool), nb_2_select) + nb_2_select = int(alpha*len(set(pool))) + + #Taking one from one the processes until we've reached the number + sampled = [] + while(len(sampled)<nb_2_select): + element = random.sample(pool, 1) + sampled+=element + #Removing all occurances of element in the list + #We do this cause their can be mulitple of the same element + #In the case we are searching with the frequency + pool = list(filter(lambda a: a != element[0], pool)) + ##This was 'simple' way of doing it (in the case there wasn't any duplicates in the pool) + ##The new method in the case there are multiples and also in the case there aren't + #sampled = random.sample(set(pool), nb_2_select) sampled_str = [] for s in sampled: @@ -462,9 +473,7 @@ George Marchment, Bryan Brancotte, Marie Schmit, Frédéric Lemoine, Sarah Cohen else: raise BioFlowInsightError("Trying to generate random relevant 
processes however option 'duplicate' is not activated.") - - - #TODO -> do this in a bit of a smarter way -> looking at popularity of the tools + def get_random_relevant_processes_which_use_bioinformatics_tools(self, processes_2_tools = {}): if(self.duplicate): processes_called = [] @@ -488,11 +497,59 @@ George Marchment, Bryan Brancotte, Marie Schmit, Frédéric Lemoine, Sarah Cohen raise BioFlowInsightError("Trying to generate random relevant processes however option 'duplicate' is not activated.") + def get_random_relevant_processes_which_use_bioinformatics_tools_considering_their_frequency(self, processes_2_tools = {}): + + OG_path = os.getcwd() + #Change working directory to the one of the file + os.chdir("/".join((str(__file__).split("/")[:-1]))) + with open("../ressources/tool_2_nb_usage.json", 'r') as file: + tool_2_nb_usage = json.load(file) + os.chdir(OG_path) + + if(self.duplicate): + processes_called = [] + if(self.get_DSL()=="DSL2"): + for c in self.get_workflow_main().get_all_calls_in_workflow(): + p = c.get_first_element_called() + if(p.get_type()=="Process"): + processes_called.append(p) + else: + processes_called = self.get_first_file().get_processes() + process_to_min_frequency = {} + for p in processes_called: + if(processes_2_tools!={}): + tools = processes_2_tools[p.get_code()] + else: + tools = p.get_tools() + + if(len(tools)>0): + min_value = np.inf + for t in tools: + try: + val = tool_2_nb_usage[t] + except KeyError: #A tool absent from the usage table is considered as used only once + val = 1 + if(val<min_value): + min_value = val + process_to_min_frequency[p] = min_value + sample_of_processes = [] + total_nb = np.sum(list(process_to_min_frequency.values())) + max_nb = np.max(list(process_to_min_frequency.values())) + for p in process_to_min_frequency: + freq = process_to_min_frequency[p] + nb_to_add = (max_nb-freq)+1 + #nb_to_add = int(total_nb*(1-(freq/total_nb)**4)) + sample_of_processes+=nb_to_add*[p] + return self.draw_pool_and_check_dependencies(sample_of_processes) + else: + raise BioFlowInsightError("Trying
to generate random relevant processes however option 'duplicate' is not activated.") + + #The reduction alpha is the minimun number cluster depending on the percentage ofprocesses #For example if there are 10 processes and reduction_alpha = 0.2 -> we want at least 2 clusters #In the same idea if reduction_alpha = 0.4 -> we want at least 4 clusters def get_relevant_processes_which_minimize_nb_of_clusters(self, reduction_alpha = 0.2, number_of_tries = 50): - import numpy as np + min_nb_clusters, min_relevant_processes = np.inf, [] already_tried = [] print('-'*number_of_tries+">") @@ -522,7 +579,7 @@ George Marchment, Bryan Brancotte, Marie Schmit, Frédéric Lemoine, Sarah Cohen #In the same idea if reduction_beta = 0.6 -> we want a maximum of 6 clusters #reduction_alpha is the same as above def get_relevant_processes_which_uniformizes_cluster_distribution(self, reduction_alpha = 0.2, reduction_beta = 0.8, number_of_tries = 50): - import numpy as np + min_uniform_score, min_relevant_processes = np.inf, [] already_tried = [] print('-'*number_of_tries+">") @@ -564,7 +621,6 @@ George Marchment, Bryan Brancotte, Marie Schmit, Frédéric Lemoine, Sarah Cohen #reduction_alpha is the same as above #reduction_beta is the same as above def get_relevant_which_minizes_the_number_of_conditions(self, reduction_alpha = 0.2, reduction_beta = 0.8, number_of_tries = 50): - import numpy as np import copy min_condition_score, min_relevant_processes = np.inf, [] already_tried = [] @@ -609,12 +665,12 @@ George Marchment, Bryan Brancotte, Marie Schmit, Frédéric Lemoine, Sarah Cohen reduction_alpha = 0.2, reduction_beta = 0.8, number_of_tries = 50, - using_processes_with_bioinfo_tools = True, + process_pre_selection = "bioinfo_freq", concordance_factor = 1, uniformity_factor = 1, min_nb_clusters_factor = 1, - min_nb_non_relevant_cluster = 1): - import numpy as np + min_nb_non_relevant_cluster_factor = 1): + import copy min_score, min_processes = np.inf, [] already_tried = [] @@ -628,10 +684,15 
@@ George Marchment, Bryan Brancotte, Marie Schmit, Frédéric Lemoine, Sarah Cohen for i in range(number_of_tries): print('.', end='') w = copy.deepcopy(w_save) - if(using_processes_with_bioinfo_tools): + + if(process_pre_selection == "bioinfo"): random_relevant_processes = w.get_random_relevant_processes_which_use_bioinformatics_tools(processes_2_tools = processes_2_tools) - else: + elif(process_pre_selection == "bioinfo_freq"): + random_relevant_processes = w.get_random_relevant_processes_which_use_bioinformatics_tools_considering_their_frequency(processes_2_tools = processes_2_tools) + elif(process_pre_selection == "None"): random_relevant_processes = w.generate_random_relevant_processes() + else: + raise Exception('process_pre_selection option not recognised') escape = 0 while(escape<100 and set(random_relevant_processes) in already_tried): escape+=1 @@ -660,14 +721,20 @@ George Marchment, Bryan Brancotte, Marie Schmit, Frédéric Lemoine, Sarah Cohen for x in tab_nb_processes_per_cluster: uniformity_variance += (average_number_of_process_per_cluster-x)**2/nb_clusters - score = concordance_factor * np.mean(np.array(tab_nb_conditions_per_cluster)/np.array(tab_nb_executors_per_cluster)) + \ + score = concordance_factor * np.max(np.array(tab_nb_conditions_per_cluster)/np.array(tab_nb_executors_per_cluster)) + \ uniformity_factor * (uniformity_variance / number_processes_called) + \ min_nb_clusters_factor * (nb_clusters / number_processes_called) + \ - min_nb_non_relevant_cluster * (nb_non_relevant_clusters / nb_clusters) + min_nb_non_relevant_cluster_factor * (nb_non_relevant_clusters / nb_clusters) if(len(cluster_organisation)>=reduction_alpha*number_processes_called and len(cluster_organisation)<=reduction_beta*number_processes_called and score<min_score): + print() + print("concordance", np.max(np.array(tab_nb_conditions_per_cluster)/np.array(tab_nb_executors_per_cluster)) ) + print("uniformity", (uniformity_variance / number_processes_called) ) + 
print("min_nb_clusters", (nb_clusters / number_processes_called) ) + print("min_nb_non_relevant_cluster", (nb_non_relevant_clusters / nb_clusters)) + print("score", score) min_processes = random_relevant_processes min_score = score return min_processes @@ -1070,7 +1137,15 @@ George Marchment, Bryan Brancotte, Marie Schmit, Frédéric Lemoine, Sarah Cohen #Method which rewrites the workflow follwong the user view #Conert workflow to user_view only makes sense when the option duplicate is activated -> otherwise is doesn't make sense + it makes the analysis way more complicated - def convert_workflow_2_user_view(self, relevant_processes = [], render_graphs = True): + def convert_workflow_2_user_view(self, relevant_processes = [], render_graphs = True, extract_tools = False): + if(extract_tools): + OG_path = os.getcwd() + #Change working directory to the one of the file + os.chdir("/".join((str(__file__).split("/")[:-1]))) + with open("../ressources/tool_2_nb_usage.json", 'r') as file: + tool_2_nb_usage = json.load(file) + os.chdir(OG_path) + self.iniatilise_tab_processes_2_remove() self.graph.initialise(processes_2_remove = self.processes_2_remove) self.check_something_illegal_for_rewrite() @@ -1364,9 +1439,23 @@ George Marchment, Bryan Brancotte, Marie Schmit, Frédéric Lemoine, Sarah Cohen #If the name=="" -> it means there isn't any relevant processes in the cluster -> it means it's a cluster of non relevant nodes if(name==""): #If there are processes called we are going to use them + #We use the process with the least used tool if(len(processes_added)>0): + + min_tool, min_process = np.inf, processes_added[0] + #If the tools have been extracted they can be used to name the none relevant processes + if(extract_tools): + for p in processes_added: + for t in p.get_tools(): #Looking up the tools of the process, as the frequency-based selection does + try: + val = tool_2_nb_usage[t] + except KeyError: #A tool absent from the usage table is considered as used only once + val = 1 + if(val<min_tool): + min_tool = val + min_process = p #TODO find a better naming system - name =
f"non_relevant_cluster_{processes_added[0].get_alias()}" + name = f"non_relevant_cluster_{min_process.get_alias()}" else: #TODO find a better naming system name = f"non_relevant_cluster_{non_relevant_name}" -- GitLab