Commit 794d5f10 authored by Schneider Leo

Modeling of data collection bias

parent 26bffa3f
import numpy as np
import pandas as pd
import tensorflow as tf
import dlomix
import sys
import os
from dlomix.models import DetectabilityModel
from dlomix.constants import CLASSES_LABELS, alphabet, aa_to_int_dict
from dlomix.data import DetectabilityDataset
from datasets import load_dataset, DatasetDict
from dlomix.reports.DetectabilityReport import DetectabilityReport, predictions_report
os.environ['WANDB_REPORT_API_DISABLE_MESSAGE'] = 'True'  # env var read by wandb; a bare Python assignment has no effect

def fasta_like_to_data(path):
    file = open(path, "r")
@@ -19,6 +26,61 @@ def strip_lines(s):
    s = s.split(' ')[1]
    return s

def test_model():
    total_num_classes = len(CLASSES_LABELS)
    input_dimension = len(alphabet)
    num_cells = 64

    # note: 'num_clases' (sic) is the parameter name as spelled in dlomix
    model = DetectabilityModel(num_units=num_cells, num_clases=total_num_classes)
    model.built = True  # mark the model as built so load_weights accepts the checkpoint
    model_save_path = 'pretrained_model/original_detectability_fine_tuned_model_FINAL'
    model.load_weights(model_save_path)

    # hf_data_name = "Wilhelmlab/detectability-sinitcyn"
    # hf_data_name = "Wilhelmlab/detectability-wang"
    hf_data_name = "Wilhelmlab/detectability-proteometools"
    hf_dataset_split = load_dataset(hf_data_name, split="test")
    hf_dataset = DatasetDict({"test": hf_dataset_split})

    max_pep_length = 40
    BATCH_SIZE = 128

    detectability_data = DetectabilityDataset(data_source=hf_dataset,
                                              data_format='hf',
                                              max_seq_len=max_pep_length,
                                              label_column="Classes",
                                              sequence_column="Sequences",
                                              dataset_columns_to_keep=None,
                                              batch_size=BATCH_SIZE,
                                              with_termini=False,
                                              alphabet=aa_to_int_dict)

    predictions = model.predict(detectability_data.tensor_test_data)
    test_targets = detectability_data["test"]["Classes"]

    test_data_df = pd.DataFrame(
        {
            "Sequences": detectability_data["test"]["_parsed_sequence"],  # the raw parsed sequences
            "Classes": test_targets,  # the test targets from above
            # the Proteins column is not kept here (dataset_columns_to_keep=None)
        }
    )
    # join the sequences, since they are parsed as lists of single amino acids
    test_data_df.Sequences = test_data_df.Sequences.apply(lambda x: "".join(x))

    num_classes = np.max(test_targets) + 1
    test_targets_one_hot = np.eye(num_classes)[test_targets]  # shape (len(test_targets), num_classes)

    report = DetectabilityReport(targets=test_targets_one_hot,
                                 predictions=predictions,
                                 input_data_df=test_data_df,
                                 output_path="./output/report_on_proteometools_test_set_labeled",
                                 history=None,
                                 rank_by_prot=True,
                                 threshold=None,
                                 name_of_dataset='ProteomeTools test set',
                                 name_of_model='Fine-tuned model (Original)')
    results_df = report.detectability_report_table
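    # --- Editor's sketch (not in the original commit): a quick argmax accuracy
    # check on the raw outputs, assuming `predictions` is an
    # (n_samples, n_classes) array of class probabilities in CLASSES_LABELS order.
    predicted_classes = np.argmax(predictions, axis=1)
    accuracy = np.mean(predicted_classes == np.asarray(test_targets))
    print(f'Test accuracy (argmax vs. Classes): {accuracy:.3f}')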

def main(input_data_path):
    print('Reading file')
    file = open(input_data_path, "r")
@@ -102,4 +164,5 @@ def main(input_data_path):
    new_file.close()


if __name__ == '__main__':
    # main()
    main('241205_list_test_peptide_detectability.txt')
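# ---------------------------------------------------------------------------
# Second script in this commit: the simulation of data collection bias named
# in the commit message (the file name itself is not visible in this capture).
# ---------------------------------------------------------------------------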
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
rng = np.random.default_rng()

PROT_NUM = 1000
# peptides per protein, drawn from a normal distribution (mean 10, sd 3)
peptide_per_prot = rng.normal(10, 3, PROT_NUM)

plt.hist(peptide_per_prot)
plt.savefig('dist_peptide_prot.png')  # save before show(), which clears the figure
plt.show()
plt.clf()

log_intensity_test = rng.normal(8, 0.8, 1000)
plt.hist(log_intensity_test)
plt.savefig('dist_log_intensity.png')
plt.show()
plt.clf()
no_id = 0
int_total = np.array([])
int_normalized_first = np.array([])

for num_peptides in peptide_per_prot:
    if int(num_peptides) > 0:
        # each peptide is observed with probability 0.4
        coverage = rng.binomial(int(num_peptides), 0.4)
        log_intensities = rng.normal(8, 2, coverage)
        # a protein counts as identified only with more than 3 observed
        # peptides covering more than 20% of its peptides
        if coverage > 3 and coverage > 0.2 * num_peptides:
            no_id += int(num_peptides) - coverage  # unobserved peptides of identified proteins
            intensities = np.exp(log_intensities)
            int_total = np.concatenate((int_total, intensities))
            # per-protein normalization ("measured" scheme)
            int_normalized = intensities / np.max(intensities)
            int_normalized_first = np.concatenate((int_normalized_first, int_normalized))
# global normalization across all retained intensities ("true" scheme)
int_normalized_after = int_total / np.max(int_total)

# empirical tertiles: bottom third weak, middle third medium, top third strong
threshold_true_weak_flyer = sorted(int_normalized_after)[int(len(int_normalized_after) / 3)]
threshold_true_medium_flyer = sorted(int_normalized_after)[int(2 * len(int_normalized_after) / 3)]
threshold_measure_weak_flyer = sorted(int_normalized_first)[int(len(int_normalized_first) / 3)]
threshold_measure_medium_flyer = sorted(int_normalized_first)[int(2 * len(int_normalized_first) / 3)]
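# Editor's sketch (not in the original): the four cut-offs above are empirical
# tertiles, which np.quantile expresses directly; values can differ slightly
# because np.quantile interpolates between order statistics.
q_weak_true, q_medium_true = np.quantile(int_normalized_after, [1 / 3, 2 / 3])
q_weak_measure, q_medium_measure = np.quantile(int_normalized_first, [1 / 3, 2 / 3])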
label_true = []
label_measure = []
for i in range(int_normalized_after.size):
    # label under the global ("true") normalization
    if int_normalized_after[i] < threshold_true_weak_flyer:
        label_true.append('Weak flyer')
    elif int_normalized_after[i] < threshold_true_medium_flyer:
        label_true.append('Medium flyer')
    else:
        label_true.append('Strong flyer')
    # label under the per-protein ("measured") normalization
    if int_normalized_first[i] < threshold_measure_weak_flyer:
        label_measure.append('Weak flyer')
    elif int_normalized_first[i] < threshold_measure_medium_flyer:
        label_measure.append('Medium flyer')
    else:
        label_measure.append('Strong flyer')
data = {'True label': label_true, 'Measured label': label_measure}
df = pd.DataFrame(data=data)

# fraction of peptides labeled identically by both schemes
acc_measure = df[df['True label'] == df['Measured label']].shape[0] / df.shape[0]
# same agreement, with never-observed peptides (no_id) counted as matches
acc_with_no_id = (no_id + df[df['True label'] == df['Measured label']].shape[0]) / (no_id + df.shape[0])
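# Editor's sketch (not in the original): report the agreement rates and a
# cross-tabulation showing where the two normalization schemes disagree.
print(f'Label agreement: {acc_measure:.3f}')
print(f'Label agreement counting unobserved peptides: {acc_with_no_id:.3f}')
print(pd.crosstab(df['True label'], df['Measured label']))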