From 794d5f106c0105759ccef29093d2d31892b855b8 Mon Sep 17 00:00:00 2001
From: Schneider Leo <leo.schneider@etu.ec-lyon.fr>
Date: Fri, 21 Mar 2025 16:53:48 +0100
Subject: [PATCH] =?UTF-8?q?modelisation=20biais=20collecte=20de=20donn?=
 =?UTF-8?q?=C3=A9es?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
Reviewer notes (this area is ignored when the patch is applied):
 * Line breaks and indentation were reconstructed from a whitespace-collapsed
   copy of this patch. Indentation of the unchanged context lines of main.py
   is a best guess -- confirm against the base file before applying.
 * modelisation_flyer.py: plt.savefig() now runs before plt.show() for both
   histograms, so the PNG files are written from the live figure. The blob id
   on its "index" line still refers to the original content.
 * main.py adds test_model(), which loads fine-tuned DetectabilityModel
   weights, predicts on the "test" split of a Hugging Face dataset and builds
   a DetectabilityReport. Nothing in this patch calls it.
 * modelisation_flyer.py is a flat simulation script: it draws peptide counts
   and intensities, labels each retained peptide by tercile under a global and
   a per-protein normalisation, and compares the two labelings.

 main.py               | 65 +++++++++++++++++++++++++++++++++++++++-
 modelisation_flyer.py | 70 +++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 134 insertions(+), 1 deletion(-)
 create mode 100644 modelisation_flyer.py

diff --git a/main.py b/main.py
index cc98a85..76475bd 100644
--- a/main.py
+++ b/main.py
@@ -1,8 +1,15 @@
 import numpy as np
 import pandas as pd
+import tensorflow as tf
+import dlomix
+import sys
+import os
 from dlomix.models import DetectabilityModel
 from dlomix.constants import CLASSES_LABELS, alphabet, aa_to_int_dict
 from dlomix.data import DetectabilityDataset
+from datasets import load_dataset, DatasetDict
+from dlomix.reports.DetectabilityReport import DetectabilityReport, predictions_report
+WANDB_REPORT_API_DISABLE_MESSAGE=True  # NOTE(review): plain Python variable; if meant for wandb it presumably has to be an os.environ entry - confirm
 
 def fasta_like_to_data(path):
     file = open(path, "r")
@@ -19,6 +26,61 @@ def strip_lines(s):
         s = s.split(' ')[1]
     return s
 
+def test_model():
+    total_num_classes = len(CLASSES_LABELS)
+    input_dimension = len(alphabet)  # not used again in this function
+    num_cells = 64
+
+    model = DetectabilityModel(num_units=num_cells, num_clases=total_num_classes)
+    model.built = True  # presumably lets load_weights run without calling the model first - confirm
+    model_save_path = 'pretrained_model/original_detectability_fine_tuned_model_FINAL'
+    model.load_weights(model_save_path)
+
+    # hf_data_name = "Wilhelmlab/detectability-sinitcyn"
+    # hf_data_name = "Wilhelmlab/detectability-wang"
+    hf_data_name = "Wilhelmlab/detectability-proteometools"
+
+    hf_dataset_split = load_dataset(hf_data_name, split="test")
+    hf_dataset = DatasetDict({"test": hf_dataset_split})  # wrap the single split under the "test" key
+    max_pep_length = 40
+    BATCH_SIZE = 128
+
+    detectability_data = DetectabilityDataset(data_source=hf_dataset,
+                                              data_format='hf',
+                                              max_seq_len=max_pep_length,
+                                              label_column="Classes",
+                                              sequence_column="Sequences",
+                                              dataset_columns_to_keep=None,
+                                              batch_size=BATCH_SIZE,
+                                              with_termini=False,
+                                              alphabet=aa_to_int_dict)
+    predictions = model.predict(detectability_data.tensor_test_data)
+    test_targets = detectability_data["test"]["Classes"]
+    test_data_df = pd.DataFrame(
+        {
+            "Sequences": detectability_data["test"]["_parsed_sequence"],  # get the raw parsed sequences
+            "Classes": test_targets,  # get the test targets from above
+            # NOTE(review): no Proteins column is added here although rank_by_prot=True is passed below - confirm
+        }
+    )
+
+    test_data_df.Sequences = test_data_df.Sequences.apply(
+        lambda x: "".join(x))  # join the sequences since they are a list of string amino acids.
+    num_classes = np.max(test_targets) + 1
+    test_targets_one_hot = np.eye(num_classes)[test_targets]  # row i of the identity matrix is the one-hot vector of class i
+    test_targets_one_hot.shape, len(test_targets)  # expression statement; its value is discarded
+    report = DetectabilityReport(targets=test_targets_one_hot,
+                                 predictions=predictions,
+                                 input_data_df=test_data_df,
+                                 output_path="./output/report_on_Sinitcyn_2000_proteins_test_set_labeled",  # NOTE(review): says Sinitcyn, hf_data_name is proteometools - confirm
+                                 history=None,
+                                 rank_by_prot=True,
+                                 threshold=None,
+                                 name_of_dataset='Sinitcyn 2000 proteins test set',
+                                 name_of_model='Fine-tuned model (Original)')
+    results_df = report.detectability_report_table  # assigned but neither returned nor used
+
+
 def main(input_data_path):
     print('Reading file')
     file = open(input_data_path, "r")
@@ -102,4 +164,5 @@ def main(input_data_path):
     new_file.close()
 
 if __name__ == '__main__':
-    main('250107_FASTA_RP_GroEL_GroES_Tuf_5pct_assemble_peptides_list.txt')
+    # main()
+    main('241205_list_test_peptide_detectability.txt')
diff --git a/modelisation_flyer.py b/modelisation_flyer.py
new file mode 100644
index 0000000..c04ccdd
--- /dev/null
+++ b/modelisation_flyer.py
@@ -0,0 +1,70 @@
+import matplotlib.pyplot as plt
+import numpy as np
+import numpy.random
+import pandas as pd
+from tensorflow_probability.python.layers.distribution_layer import sample  # NOTE(review): `sample` is not used in this file
+
+rng = np.random.default_rng()  # unseeded, so every run gives different draws
+
+PROT_NUM = 1000
+
+peptide_per_prot = rng.normal(10,3,PROT_NUM)  # real-valued draws; int() is applied where a count is needed
+plt.hist(peptide_per_prot)
+plt.savefig('dist_peptide_prot.png')  # save before show(): saving after the window is closed can write a blank image
+plt.show()
+plt.clf()
+
+log_intensity_test = rng.normal(8,0.8,1000)  # only plotted; the loop below draws its own intensities
+plt.hist(log_intensity_test)
+plt.savefig('dist_log_intensity.png')  # save before show(), same reason as above
+plt.show()
+plt.clf()
+
+no_id=0
+int_total = np.array([])
+int_normalized_first = np.array([])
+for num_peptides in peptide_per_prot :
+    if int(num_peptides)>0:
+        coverage = rng.binomial(num_peptides,0.4)
+        log_intensities = rng.normal(8,2,coverage)
+        if coverage > 3 and coverage > 0.2*num_peptides:  # a protein contributes only when both conditions hold
+            no_id+=int(num_peptides)-coverage  # peptides of a retained protein that were not drawn
+            intensities = np.exp(log_intensities)
+            int_total = np.concatenate((int_total, intensities))
+            int_normalized = intensities/np.max(intensities)  # scaled by this protein's own maximum
+            int_normalized_first = np.concatenate((int_normalized_first, int_normalized))
+
+int_normalized_after = int_total/np.max(int_total)  # scaled by the maximum over all retained peptides
+
+threshold_true_weak_flyer = sorted(int_normalized_after)[int(len(int_normalized_after)/3)]  # value at one third of the sorted array
+threshold_true_medium_flyer = sorted(int_normalized_after)[int(2*len(int_normalized_after)/3)]  # value at two thirds
+
+threshold_measure_weak_flyer = sorted(int_normalized_first)[int(len(int_normalized_first)/3)]
+threshold_measure_medium_flyer = sorted(int_normalized_first)[int(2*len(int_normalized_first)/3)]
+
+# Label every peptide twice: 'true' uses the global scaling, 'measure' the per-protein scaling.
+label_true = []
+label_measure = []
+
+for i in range(int_normalized_after.size):  # both arrays grow together in the loop above, so index i is valid for each
+    if int_normalized_after[i] < threshold_true_weak_flyer :
+        label_true.append('Weak flyer')
+    elif int_normalized_after[i] < threshold_true_medium_flyer:
+        label_true.append('Medium flyer')
+    else:
+        label_true.append('Strong flyer')
+
+    # Same three-way split, applied to the per-protein normalised value.
+    if int_normalized_first[i] < threshold_measure_weak_flyer :
+        label_measure.append('Weak flyer')
+    elif int_normalized_first[i] < threshold_measure_medium_flyer:
+        label_measure.append('Medium flyer')
+    else:
+        label_measure.append('Strong flyer')
+
+data = {'True label':label_true, 'Measured label':label_measure}
+
+df = pd.DataFrame(data=data)
+
+acc_measure = df[df['True label']==df['Measured label']].shape[0]/df.shape[0]  # fraction of rows where the two labels agree
+acc_with_no_id = (no_id + df[df['True label']==df['Measured label']].shape[0])/(no_id + df.shape[0])  # no_id is added to both numerator and denominator
\ No newline at end of file
-- 
GitLab