Commit 794d5f10 authored by Schneider Leo

Modeling of data collection bias

parent 26bffa3f
import numpy as np
import pandas as pd
import tensorflow as tf
import dlomix
import sys
import os
from dlomix.models import DetectabilityModel
from dlomix.constants import CLASSES_LABELS, alphabet, aa_to_int_dict
from dlomix.data import DetectabilityDataset
from datasets import load_dataset, DatasetDict
from dlomix.reports.DetectabilityReport import DetectabilityReport, predictions_report
os.environ['WANDB_REPORT_API_DISABLE_MESSAGE'] = 'True'  # env var read by wandb; a bare Python assignment has no effect

def fasta_like_to_data(path):
    file = open(path, "r")
@@ -19,6 +26,61 @@ def strip_lines(s):
    s = s.split(' ')[1]
    return s

def test_model():
    total_num_classes = len(CLASSES_LABELS)
    input_dimension = len(alphabet)
    num_cells = 64

    # note: 'num_clases' (sic) is the parameter name as spelled in dlomix
    model = DetectabilityModel(num_units=num_cells, num_clases=total_num_classes)
    model.built = True  # mark the model as built so load_weights accepts the checkpoint
    model_save_path = 'pretrained_model/original_detectability_fine_tuned_model_FINAL'
    model.load_weights(model_save_path)

    # hf_data_name = "Wilhelmlab/detectability-sinitcyn"
    # hf_data_name = "Wilhelmlab/detectability-wang"
    hf_data_name = "Wilhelmlab/detectability-proteometools"
    hf_dataset_split = load_dataset(hf_data_name, split="test")
    hf_dataset = DatasetDict({"test": hf_dataset_split})

    max_pep_length = 40
    BATCH_SIZE = 128

    detectability_data = DetectabilityDataset(data_source=hf_dataset,
                                              data_format='hf',
                                              max_seq_len=max_pep_length,
                                              label_column="Classes",
                                              sequence_column="Sequences",
                                              dataset_columns_to_keep=None,
                                              batch_size=BATCH_SIZE,
                                              with_termini=False,
                                              alphabet=aa_to_int_dict)

    predictions = model.predict(detectability_data.tensor_test_data)
    test_targets = detectability_data["test"]["Classes"]

    test_data_df = pd.DataFrame(
        {
            "Sequences": detectability_data["test"]["_parsed_sequence"],  # the raw parsed sequences
            "Classes": test_targets,  # the test targets from above
            # the Proteins column is not kept here (dataset_columns_to_keep=None)
        }
    )
    # join the sequences, since they are parsed as lists of single amino acids
    test_data_df.Sequences = test_data_df.Sequences.apply(lambda x: "".join(x))

    num_classes = np.max(test_targets) + 1
    test_targets_one_hot = np.eye(num_classes)[test_targets]  # shape (len(test_targets), num_classes)

    report = DetectabilityReport(targets=test_targets_one_hot,
                                 predictions=predictions,
                                 input_data_df=test_data_df,
                                 output_path="./output/report_on_proteometools_test_set_labeled",
                                 history=None,
                                 rank_by_prot=True,
                                 threshold=None,
                                 name_of_dataset='ProteomeTools test set',
                                 name_of_model='Fine-tuned model (Original)')
    results_df = report.detectability_report_table
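    # --- Editor's sketch (not in the original commit): a quick argmax accuracy
    # check on the raw outputs, assuming `predictions` is an
    # (n_samples, n_classes) array of class probabilities in CLASSES_LABELS order.
    predicted_classes = np.argmax(predictions, axis=1)
    accuracy = np.mean(predicted_classes == np.asarray(test_targets))
    print(f'Test accuracy (argmax vs. Classes): {accuracy:.3f}')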

def main(input_data_path):
    print('Reading file')
    file = open(input_data_path, "r")
@@ -102,4 +164,5 @@ def main(input_data_path):
    new_file.close()


if __name__ == '__main__':
    # main()
    main('241205_list_test_peptide_detectability.txt')
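# ---------------------------------------------------------------------------
# Second script in this commit: the simulation of data collection bias named
# in the commit message (the file name itself is not visible in this capture).
# ---------------------------------------------------------------------------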
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
rng = np.random.default_rng()

PROT_NUM = 1000
# peptides per protein, drawn from a normal distribution (mean 10, sd 3)
peptide_per_prot = rng.normal(10, 3, PROT_NUM)

plt.hist(peptide_per_prot)
plt.savefig('dist_peptide_prot.png')  # save before show(), which clears the figure
plt.show()
plt.clf()

log_intensity_test = rng.normal(8, 0.8, 1000)
plt.hist(log_intensity_test)
plt.savefig('dist_log_intensity.png')
plt.show()
plt.clf()
no_id = 0
int_total = np.array([])
int_normalized_first = np.array([])

for num_peptides in peptide_per_prot:
    if int(num_peptides) > 0:
        # each peptide is observed with probability 0.4
        coverage = rng.binomial(int(num_peptides), 0.4)
        log_intensities = rng.normal(8, 2, coverage)
        # a protein counts as identified only with more than 3 observed
        # peptides covering more than 20% of its peptides
        if coverage > 3 and coverage > 0.2 * num_peptides:
            no_id += int(num_peptides) - coverage  # unobserved peptides of identified proteins
            intensities = np.exp(log_intensities)
            int_total = np.concatenate((int_total, intensities))
            # per-protein normalization ("measured" scheme)
            int_normalized = intensities / np.max(intensities)
            int_normalized_first = np.concatenate((int_normalized_first, int_normalized))
# global normalization across all retained intensities ("true" scheme)
int_normalized_after = int_total / np.max(int_total)

# empirical tertiles: bottom third weak, middle third medium, top third strong
threshold_true_weak_flyer = sorted(int_normalized_after)[int(len(int_normalized_after) / 3)]
threshold_true_medium_flyer = sorted(int_normalized_after)[int(2 * len(int_normalized_after) / 3)]
threshold_measure_weak_flyer = sorted(int_normalized_first)[int(len(int_normalized_first) / 3)]
threshold_measure_medium_flyer = sorted(int_normalized_first)[int(2 * len(int_normalized_first) / 3)]
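# Editor's sketch (not in the original): the four cut-offs above are empirical
# tertiles, which np.quantile expresses directly; values can differ slightly
# because np.quantile interpolates between order statistics.
q_weak_true, q_medium_true = np.quantile(int_normalized_after, [1 / 3, 2 / 3])
q_weak_measure, q_medium_measure = np.quantile(int_normalized_first, [1 / 3, 2 / 3])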
label_true = []
label_measure = []
for i in range(int_normalized_after.size):
    # label under the global ("true") normalization
    if int_normalized_after[i] < threshold_true_weak_flyer:
        label_true.append('Weak flyer')
    elif int_normalized_after[i] < threshold_true_medium_flyer:
        label_true.append('Medium flyer')
    else:
        label_true.append('Strong flyer')
    # label under the per-protein ("measured") normalization
    if int_normalized_first[i] < threshold_measure_weak_flyer:
        label_measure.append('Weak flyer')
    elif int_normalized_first[i] < threshold_measure_medium_flyer:
        label_measure.append('Medium flyer')
    else:
        label_measure.append('Strong flyer')
data = {'True label': label_true, 'Measured label': label_measure}
df = pd.DataFrame(data=data)

# fraction of peptides labeled identically by both schemes
acc_measure = df[df['True label'] == df['Measured label']].shape[0] / df.shape[0]
# same agreement, with never-observed peptides (no_id) counted as matches
acc_with_no_id = (no_id + df[df['True label'] == df['Measured label']].shape[0]) / (no_id + df.shape[0])
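# Editor's sketch (not in the original): report the agreement rates and a
# cross-tabulation showing where the two normalization schemes disagree.
print(f'Label agreement: {acc_measure:.3f}')
print(f'Label agreement counting unobserved peptides: {acc_with_no_id:.3f}')
print(pd.crosstab(df['True label'], df['Measured label']))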