Skip to content
Snippets Groups Projects
Commit c04e9f28 authored by Schneider Leo's avatar Schneider Leo
Browse files

df oktoberfest

parent 239cf539
No related branches found
No related tags found
No related merge requests found
...@@ -33,45 +33,45 @@ ALPHABET_UNMOD = { ...@@ -33,45 +33,45 @@ ALPHABET_UNMOD = {
ALPHABET_UNMOD_REV = {v: k for k, v in ALPHABET_UNMOD.items()} ALPHABET_UNMOD_REV = {v: k for k, v in ALPHABET_UNMOD.items()}
# def align(dataset, reference, column_dataset, column_ref, seq_data, seq_ref): def align(dataset, reference, column_dataset, column_ref, seq_data, seq_ref):
# dataset_ref=dataset[dataset['state']=='train'] dataset_ref=dataset[dataset['state']=='train']
# dataset_unique = dataset_ref[[seq_data,column_dataset]].groupby(seq_data).mean() dataset_unique = dataset_ref[[seq_data,column_dataset]].groupby(seq_data).mean()
# print('unique',len(dataset_unique)) print('unique',len(dataset_unique))
# reference_unique = reference[[seq_ref,column_ref]].groupby(seq_ref).mean() reference_unique = reference[[seq_ref,column_ref]].groupby(seq_ref).mean()
# seq_ref = reference_unique.index seq_ref = reference_unique.index
# seq_common = dataset_unique.index seq_common = dataset_unique.index
# seq_ref = seq_ref.tolist() seq_ref = seq_ref.tolist()
# seq_common = seq_common.tolist() seq_common = seq_common.tolist()
#
# seq_ref = [tuple(l) for l in seq_ref] seq_ref = [tuple(l) for l in seq_ref]
# seq_common = [tuple(l) for l in seq_common] seq_common = [tuple(l) for l in seq_common]
#
# ind_dict_ref = dict((k, i) for i, k in enumerate(seq_ref)) ind_dict_ref = dict((k, i) for i, k in enumerate(seq_ref))
# inter = set(ind_dict_ref).intersection(seq_common) inter = set(ind_dict_ref).intersection(seq_common)
# print(len(inter)) print(len(inter))
#
# ind_dict_ref = [ind_dict_ref[x] for x in inter] ind_dict_ref = [ind_dict_ref[x] for x in inter]
#
# indices_common = dict((k, i) for i, k in enumerate(seq_common)) indices_common = dict((k, i) for i, k in enumerate(seq_common))
# indices_common = [indices_common[x] for x in inter] indices_common = [indices_common[x] for x in inter]
#
#
# rt_ref = reference_unique[column_ref][ind_dict_ref].reset_index() rt_ref = reference_unique[column_ref][ind_dict_ref].reset_index()
# rt_data = dataset_unique[column_dataset][indices_common].reset_index() rt_data = dataset_unique[column_dataset][indices_common].reset_index()
#
# plt.scatter(rt_data[column_dataset].tolist(),rt_ref[column_ref].tolist(),s=0.1) plt.scatter(rt_data[column_dataset].tolist(),rt_ref[column_ref].tolist(),s=0.1)
# plt.savefig('test.png') plt.savefig('test.png')
#
# xout, yout, wout = loess_1d(np.array(rt_data[column_dataset].tolist()), np.array(rt_ref[column_ref].tolist()), xout, yout, wout = loess_1d(np.array(rt_data[column_dataset].tolist()), np.array(rt_ref[column_ref].tolist()),
# xnew=dataset[column_dataset], xnew=dataset[column_dataset],
# degree=1, frac=0.25, degree=1, frac=0.25,
# npoints=None, rotate=False, sigy=None) npoints=None, rotate=False, sigy=None)
#
# plt.scatter(xout, yout, s=0.1) plt.scatter(xout, yout, s=0.1)
# plt.savefig('test_2.png') plt.savefig('test_2.png')
#
# dataset[column_dataset] = yout dataset[column_dataset] = yout
# return dataset return dataset
def get_number_unique_peptide(dataset): def get_number_unique_peptide(dataset):
seq = dataset['sequence'] seq = dataset['sequence']
......
...@@ -2,14 +2,53 @@ import numpy as np ...@@ -2,14 +2,53 @@ import numpy as np
import pandas as pb import pandas as pb
import pandas as pd import pandas as pd
import pyarrow.parquet as pq import pyarrow.parquet as pq
import pyarrow as pa
import torch import torch
import matplotlib.pyplot as plt
from model.model import ModelTransformer from model.model import ModelTransformer
from config import load_args from config import load_args
from data.dataset import load_data from data.dataset import load_data
# Token id for every residue the model knows about; "" (id 0) is the padding token.
ALPHABET_UNMOD = {
    "": 0,
    "A": 1,
    "C": 2,
    "D": 3,
    "E": 4,
    "F": 5,
    "G": 6,
    "H": 7,
    "I": 8,
    "K": 9,
    "L": 10,
    "M": 11,
    "N": 12,
    "P": 13,
    "Q": 14,
    "R": 15,
    "S": 16,
    "T": 17,
    "V": 18,
    "W": 19,
    "Y": 20,
    "M(UniMod:35)": 21,
    "CaC": 22
}

# Reverse lookup: token id -> residue string.
ALPHABET_UNMOD_REV = {v: k for k, v in ALPHABET_UNMOD.items()}


def numerical_to_alphabetical_str(s):
    """Decode a stringified list of token ids (e.g. "[1, 2, 3]") into a
    peptide sequence string.

    Padding ids (0) contribute the empty string, so they are effectively
    dropped from the decoded sequence.

    Raises:
        ValueError: if a token is not an integer literal.
        KeyError: if an id is not present in ALPHABET_UNMOD.
    """
    body = s.replace('[', '').replace(']', '')
    # an empty list string ("[]" or "") decodes to the empty sequence;
    # the original int('') call would have raised ValueError here
    if not body.strip():
        return ''
    # join at C speed instead of quadratic string concatenation in a loop
    return ''.join(ALPHABET_UNMOD_REV[int(tok)] for tok in body.split(','))
def load_lib(path):
    """Read the spectral-library parquet file at `path` into a pandas DataFrame."""
    return pq.read_table(path).to_pandas()
...@@ -54,26 +93,54 @@ def predict(data_pred, model, output_path): ...@@ -54,26 +93,54 @@ def predict(data_pred, model, output_path):
if __name__ =='__main__':
    # Quick visual sanity check: compare the RT distribution of the spectral
    # library against the iRT distribution of the Prosit training data.
    # NOTE(review): reconstructed from a mangled diff view — confirm against the repo.
    df = load_lib('spectral_lib/first_lib.parquet')

    plt.hist(df['RT'])
    plt.savefig('test.png')
    df_2 = pd.read_csv('data_prosit/data.csv')

    plt.clf()
    plt.hist(df_2['irt'])
    plt.savefig('test2.png')
    # --- earlier experiment kept for reference: inject model-predicted RTs
    # --- back into the library and write a custom parquet ---
    # df_2 = extract_sequence(df).reset_index(drop=True)
    #
    # pred = pd.read_csv('../output/out_uniprot_base.csv')
    #
    # pred['seq']=pred['seq'].map(numerical_to_alphabetical_str)
    #
    # pred['Modified.Sequence']=pred['seq']
    #
    # result = pd.merge(df,pred[['Modified.Sequence','rt pred']],on='Modified.Sequence',how='left')
    #
    # result['RT']=result['rt pred']
    #
    # result = result.drop('rt pred', axis=1)
    #
    # table = pa.Table.from_pandas(result)
    #
    # pq.write_table(table, 'spectral_lib/custom_first_lib.parquet')
    # args = load_args()
    #
    # model = ModelTransformer(encoder_ff=args.encoder_ff, decoder_rt_ff=args.decoder_rt_ff,
    #                          n_head=args.n_head, encoder_num_layer=args.encoder_num_layer,
    #                          decoder_rt_num_layer=args.decoder_rt_num_layer, drop_rate=args.drop_rate,
    #                          embedding_dim=args.embedding_dim, acti=args.activation, norm=args.norm_first, seq_length=30)
    #
    # if torch.cuda.is_available():
    #     model = model.cuda()
    #
    # model.load_state_dict(torch.load(args.model_weigh, weights_only=True))
    #
    # print(args.dataset_test)
    # data_test = load_data(data_source='data/spectral_lib/data_uniprot_base.csv', batch_size=args.batch_size, length=30, mode=args.split_test,
    #                       seq_col=args.seq_test)
    #
    # predict(data_test, model, args.output)
    # NOTE(review): this hist is drawn on top of the df_2 hist (no clf() since
    # test2.png) and overwrites the test.png saved above — confirm intentional.
    plt.hist(df['RT'])
    plt.savefig('test.png')
\ No newline at end of file
import pandas as pd
if __name__ == '__main__':
    # Build a minimal dataset CSV from an Oktoberfest-predicted .msp library:
    # every "Name: SEQ/charge" entry contributes its sequence.
    seq = []
    # `with` guarantees the handle is closed even if parsing raises
    # (original opened/closed manually and slurped the file via readlines)
    with open("spectral_lib/predicted_library.msp", "r") as msp_file:
        for line in msp_file:
            if 'Name:' in line:
                # "Name: PEPTIDE/2" -> " PEPTIDE" (charge suffix dropped;
                # any leading space is preserved, matching original behavior)
                seq.append(line.split(':')[1].split('/')[0])
    df = pd.DataFrame(seq, columns=['sequence'])
    # placeholder columns expected by the downstream dataset loader
    df['irt_scaled'] = 0
    df['state'] = 'holdout'
    df.to_csv('spectral_lib/df_predicted_library_oktoberfest.csv', index=False)
\ No newline at end of file
File suppressed by a .gitattributes entry or the file's encoding is unsupported.
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment