Skip to content
Snippets Groups Projects
Commit 945682f5 authored by Schneider Leo's avatar Schneider Leo
Browse files

df oktoberfest

parent e34578b4
No related branches found
No related tags found
No related merge requests found
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
import numpy as np import numpy as np
import pandas as pd import pandas as pd
# from loess.loess_1d import loess_1d from loess.loess_1d import loess_1d
import time import time
ALPHABET_UNMOD = { ALPHABET_UNMOD = {
......
...@@ -71,10 +71,31 @@ if __name__ == '__main__': ...@@ -71,10 +71,31 @@ if __name__ == '__main__':
# df.to_csv('spectral_lib/df_predicted_library_oktoberfest.csv',index=False) # df.to_csv('spectral_lib/df_predicted_library_oktoberfest.csv',index=False)
# #
# #
# #write new .msp with new RT #write new .msp with new RT
# seq=[]
#
df= pd.read_csv('spectral_lib/df_predicted_library_oktoberfest.csv') file = open("spectral_lib/predicted_library.msp", "r")
content=file.readlines()
file.close()
remove = False
predicted_lib=pd.read_csv('../output/out_lib_oktoberfest.csv')
pred_rt=predicted_lib['rt pred']
for i in range(len(content)) :
if remove:
if 'Name:' in content[i]:
remove = False
else :
pass
if 'Name:'in content[i]:
s=content[i].split(': ')[1].split('/')[0]
if 'C' in s or len(s)>30:
remove=True
else :
seq.append(s)
df = pd.DataFrame(seq,columns=['sequence'])
predicted_lib=pd.read_csv('../output/out_lib_oktoberfest.csv') predicted_lib=pd.read_csv('../output/out_lib_oktoberfest.csv')
......
...@@ -5,6 +5,8 @@ import pyarrow.parquet as pq ...@@ -5,6 +5,8 @@ import pyarrow.parquet as pq
import pyarrow as pa import pyarrow as pa
import torch import torch
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
from loess.loess_1d import loess_1d
from model.model import ModelTransformer from model.model import ModelTransformer
from config import load_args from config import load_args
from data.dataset import load_data from data.dataset import load_data
...@@ -93,34 +95,81 @@ def predict(data_pred, model, output_path): ...@@ -93,34 +95,81 @@ def predict(data_pred, model, output_path):
if __name__ =='__main__': if __name__ =='__main__':
# df = load_lib('data/spectral_lib/first_lib.parquet') df = load_lib('spectral_lib/first_lib.parquet')
#
# plt.hist(df['RT']) # plt.hist(df['RT'])
# plt.savefig('test.png') # plt.savefig('test.png')
# #
# df_2 = pd.read_csv('data/data_prosit/data.csv') # df_2 = pd.read_csv('data_prosit/data.csv')
# #
# plt.clf() # plt.clf()
# plt.hist(df_2['irt']) # plt.hist(df_2['irt'])
# plt.savefig('test2.png') # plt.savefig('test2.png')
#
# df_2 = extract_sequence(df).reset_index(drop=True) # df_2 = extract_sequence(df).reset_index(drop=True)
# #
# pred = pd.read_csv('../output/out_uniprot_base.csv') # pred = pd.read_csv('../output/out_uniprot_base.csv')
#
# pred['seq']=pred['seq'].map(numerical_to_alphabetical_str) # pred['seq']=pred['seq'].map(numerical_to_alphabetical_str)
# #
# pred['Modified.Sequence']=pred['seq'] # pred['Modified.Sequence']=pred['seq']
# #
# result = pd.merge(df,pred[['Modified.Sequence','rt pred']],on='Modified.Sequence',how='left') # result = pd.merge(df,pred[['Modified.Sequence','rt pred']],on='Modified.Sequence',how='left')
# #
# result['RT']=result['rt pred'] #
#
# #alignement
#
# ref = pd.read_csv('data_prosit/data_noc.csv')
# df_ISA = pd.read_csv('data_ISA/data_aligned_isa_noc.csv')
#
# dataset, reference, column_dataset, column_ref, seq_data, seq_ref = df_ISA, ref, 'irt_scaled', 'irt', 'sequence','sequence',
#
# dataset_ref=dataset[dataset['state']=='train']
# dataset_unique = dataset_ref[[seq_data,column_dataset]].groupby(seq_data).mean()
# print('unique',len(dataset_unique))
# reference_unique = reference[[seq_ref,column_ref]].groupby(seq_ref).mean()
# seq_ref = reference_unique.index
# seq_common = dataset_unique.index
# seq_ref = seq_ref.tolist()
# seq_common = seq_common.tolist()
#
# seq_ref = [tuple(l) for l in seq_ref]
# seq_common = [tuple(l) for l in seq_common]
#
# ind_dict_ref = dict((k, i) for i, k in enumerate(seq_ref))
# inter = set(ind_dict_ref).intersection(seq_common)
# print(len(inter))
#
# ind_dict_ref = [ind_dict_ref[x] for x in inter]
#
# indices_common = dict((k, i) for i, k in enumerate(seq_common))
# indices_common = [indices_common[x] for x in inter]
#
#
# rt_ref = reference_unique[column_ref][ind_dict_ref].reset_index()
# rt_data = dataset_unique[column_dataset][indices_common].reset_index()
#
# plt.scatter(rt_data[column_dataset].tolist(),rt_ref[column_ref].tolist(),s=0.1)
# plt.savefig('test.png')
#
# #présence de NAN qui casse le réalignement (solution temporaire : remplacer par 0.
# result['rt pred']=result['rt pred'].fillna(value=0)
# xout, yout, wout = loess_1d(np.array(rt_data[column_dataset].tolist()), np.array(rt_ref[column_ref].tolist()),
# xnew=result['rt pred'],
# degree=1,
# npoints=None, rotate=False, sigy=None)
#
#
# #writing results
#
# result['RT'] = yout
# #
# result = result.drop('rt pred', axis=1) # result = result.drop('rt pred', axis=1)
# #
# table = pa.Table.from_pandas(result) # table = pa.Table.from_pandas(result)
# #
# pq.write_table(table, 'spectral_lib/custom_first_lib.parquet') # pq.write_table(table, 'spectral_lib/custom_first_lib_prosit_aligned.parquet')
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment