Skip to content
Snippets Groups Projects
Commit 10195944 authored by Schneider Leo
Browse files

fix

parent f64de84a
No related branches found
No related tags found
No related merge requests found
......@@ -39,6 +39,7 @@ def numerical_to_alphabetical_str(s):
seq+=ALPHABET_UNMOD_REV[arr[i]]
return seq
if __name__ == '__main__':
#extract seq from .msp
......
source diff could not be displayed: it is too large. Options to address this: view the blob.
......@@ -5,7 +5,7 @@ import pyarrow.parquet as pq
import pyarrow as pa
import torch
import matplotlib.pyplot as plt
# from loess.loess_1d import loess_1d
from loess.loess_1d import loess_1d
from model.model import ModelTransformer
from config import load_args
......@@ -95,10 +95,9 @@ def predict(data_pred, model, output_path):
if __name__ =='__main__':
# df = load_lib('spectral_lib/first_lib_CITBASE_try_contaminent.parquet')
# df = extract_sequence(df)
#
# df.to_csv('spec_lib/data_CITBASE_try_contaminant.csv')
df = load_lib('spectral_lib/1-240711_ident_resistance_idbioriv_fluoroquinolones_conta_human_sang.parquet')
df = extract_sequence(df)
df.to_csv('spectral_lib/1-240711_ident_resistance_idbioriv_fluoroquinolones_conta_human_sang.csv')
# plt.hist(df['RT'])
# plt.savefig('test.png')
#
......@@ -110,8 +109,8 @@ if __name__ =='__main__':
#
# df_2 = extract_sequence(df).reset_index(drop=True)
#
# pred = pd.read_csv('../output/out_uniprot_base.csv')
# pred = pd.read_csv('../output/out_lib_CITBASE_try_contaminant.csv')
#
# pred['seq']=pred['seq'].map(numerical_to_alphabetical_str)
#
# pred['Modified.Sequence']=pred['seq']
......@@ -171,23 +170,23 @@ if __name__ =='__main__':
#
# table = pa.Table.from_pandas(result)
#
# pq.write_table(table, 'spectral_lib/custom_first_lib_prosit_aligned.parquet')
args = load_args()
model = ModelTransformer(encoder_ff=args.encoder_ff, decoder_rt_ff=args.decoder_rt_ff,
n_head=args.n_head, encoder_num_layer=args.encoder_num_layer,
decoder_rt_num_layer=args.decoder_rt_num_layer, drop_rate=args.drop_rate,
embedding_dim=args.embedding_dim, acti=args.activation, norm=args.norm_first, seq_length=30)
if torch.cuda.is_available():
model = model.cuda()
model.load_state_dict(torch.load(args.model_weigh, weights_only=True))
# pq.write_table(table, 'spectral_lib/first_lib_contaminant_prosit_aligned.parquet')
#
data_test = load_data(data_source=args.dataset_test, batch_size=args.batch_size, length=30, mode=args.split_test,
seq_col=args.seq_test)
predict(data_test, model, args.output)
# args = load_args()
#
# model = ModelTransformer(encoder_ff=args.encoder_ff, decoder_rt_ff=args.decoder_rt_ff,
# n_head=args.n_head, encoder_num_layer=args.encoder_num_layer,
# decoder_rt_num_layer=args.decoder_rt_num_layer, drop_rate=args.drop_rate,
# embedding_dim=args.embedding_dim, acti=args.activation, norm=args.norm_first, seq_length=30)
#
# if torch.cuda.is_available():
# model = model.cuda()
#
# model.load_state_dict(torch.load(args.model_weigh, weights_only=True))
#
# data_test = load_data(data_source=args.dataset_test, batch_size=args.batch_size, length=30, mode=args.split_test,
# seq_col=args.seq_test)
#
# predict(data_test, model, args.output)
import numpy as np
import pandas as pd
from matplotlib_venn import venn2
from matplotlib_venn import venn3
import matplotlib.pyplot as plt
def compare_id(path_1, path_2, path_3=None, sample_name=None):
    """Draw Venn diagrams of the peptides and proteins found in each report.

    Every report is read as a tab-separated, latin-1 encoded table and must
    contain the 'Stripped.Sequence' and 'Protein.Ids' columns. Two images are
    written to the current working directory:
    'venn_diag_pep_<sample_name>.png' and 'venn_diag_prot_<sample_name>.png'.

    Args:
        path_1: Report drawn as 'custom lib' (green).
        path_2: Report drawn as 'base lib' (red).
        path_3: Optional report drawn as 'fine tuned' (blue). When omitted,
            two-set diagrams (venn2) are drawn instead of three-set ones
            (venn3), so the older two-report call shape keeps working.
        sample_name: Sample identifier used in the titles and file names.

    Raises:
        TypeError: If sample_name is not supplied.
    """
    # sample_name only has a default so that path_3 can be optional; it is
    # still mandatory, and omitting it fails the same way a missing argument
    # would.
    if sample_name is None:
        raise TypeError("compare_id() missing required argument: 'sample_name'")

    if path_3 is None:
        paths = (path_1, path_2)
        draw_venn = venn2
    else:
        paths = (path_1, path_2, path_3)
        draw_venn = venn3
    labels = ('custom lib', 'base lib', 'fine tuned')[:len(paths)]
    colors = ('g', 'r', 'b')[:len(paths)]

    reports = [pd.read_csv(path, sep='\t', encoding='latin-1') for path in paths]

    # Build every set before plotting so a missing column fails before any
    # image is written.
    peptides = tuple(set(df['Stripped.Sequence'].tolist()) for df in reports)
    proteins = tuple(set(df['Protein.Ids'].tolist()) for df in reports)

    for groups, kind, tag in ((peptides, 'Peptide', 'pep'),
                              (proteins, 'Protein', 'prot')):
        draw_venn(groups, labels, set_colors=colors)
        plt.title('{} identifications on {} sample'.format(kind, sample_name))
        plt.savefig('venn_diag_{}_{}.png'.format(tag, sample_name))
        # Clear after each save (including the last one) so neither the next
        # diagram nor any later plot is drawn on top of this one.
        plt.clf()
# NOTE(review): this call sits at module level, so it runs on import, and it
# passes only two reports plus sample_name. The guarded entry point further
# down looks like its replacement -- confirm whether it should still be here.
compare_id('CITFRE_ANA_69/report_custom.tsv','CITFRE_ANA_69/report_first_lib.tsv',sample_name='CITFRE_ANA_69')
def compare_error(path_1, path_2):
    """Histogram the absolute iRT prediction error of two reports.

    Both reports are read as tab-separated, latin-1 encoded tables and must
    contain the 'Stripped.Sequence', 'iRT' and 'Predicted.iRT' columns. The
    first report is restricted to the sequences that also occur in the
    second one; the second report is used in full. The histograms are saved
    as 'error1.png' and 'error2.png' in the current working directory.

    Args:
        path_1: Path of the first report (filtered to shared sequences).
        path_2: Path of the second report.

    Returns:
        Tuple ``(error_1, error_2)`` holding ``|iRT - Predicted.iRT|`` for
        the filtered first report and for the second report.
    """
    df_1 = pd.read_csv(path_1, sep='\t', encoding='latin-1')
    df_2 = pd.read_csv(path_2, sep='\t', encoding='latin-1')

    # Keep only the rows of the first report whose sequence the second
    # report also contains.
    shared_sequences = set(df_2['Stripped.Sequence'].tolist())
    df_1 = df_1[df_1['Stripped.Sequence'].isin(shared_sequences)]

    error_1 = (df_1['iRT'] - df_1['Predicted.iRT']).abs()
    error_2 = (df_2['iRT'] - df_2['Predicted.iRT']).abs()

    for errors, out_png in ((error_1, 'error1.png'), (error_2, 'error2.png')):
        # Start from an empty figure: whatever was plotted before this call
        # (or the previous histogram) must not end up in the image.
        plt.clf()
        plt.hist(errors)
        plt.savefig(out_png)
    # Leave no histogram behind on the current figure for later plots.
    plt.clf()

    return error_1, error_2
if __name__ == '__main__':
    # Script entry point: plot the iRT error histograms for the CITCRO_ANA_3
    # reports. The three-report identification comparison is kept disabled.
    # compare_id('CITCRO_ANA_3/report_custom.tsv', 'CITCRO_ANA_3/report_first_lib.tsv', 'CITCRO_ANA_3/report_finetune.tsv','CITCRO_ANA_3')
    e1, e2 = compare_error(
        'CITCRO_ANA_3/report_custom.tsv',
        'CITCRO_ANA_3/report_first_lib.tsv',
    )
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment