Skip to content
Snippets Groups Projects
Commit 3da55a91 authored by Schneider Leo's avatar Schneider Leo
Browse files

rename output

parent d3de7130
No related branches found
No related tags found
No related merge requests found
...@@ -20,8 +20,8 @@ ALPHABET_UNMOD = { ...@@ -20,8 +20,8 @@ ALPHABET_UNMOD = {
"V": 18, "V": 18,
"W": 19, "W": 19,
"Y": 20, "Y": 20,
"CaC": 22, "OxM": 21,
"OxM": 21 "CaC": 22
} }
ALPHABET_UNMOD_REV = {v: k for k, v in ALPHABET_UNMOD.items()} ALPHABET_UNMOD_REV = {v: k for k, v in ALPHABET_UNMOD.items()}
\ No newline at end of file
import numpy as np import numpy as np
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
import pandas as pd import pandas as pd
from numpy.ma.core import shape
from constant import ALPHABET_UNMOD, ALPHABET_UNMOD_REV
from constant import ALPHABET_UNMOD
def length_distribution(data, plot=False, save=False, f_name='fig/data_exploration/length_distribution.png'): def length_distribution(data, plot=False, save=False, f_name='fig/data_exploration/length_distribution.png'):
maximum = 31 maximum = 31
...@@ -36,6 +38,9 @@ def aa_distribution(data, plot=False, save=False, f_name='fig/data_exploration/a ...@@ -36,6 +38,9 @@ def aa_distribution(data, plot=False, save=False, f_name='fig/data_exploration/a
freq = 100 * freq / freq.sum() freq = 100 * freq / freq.sum()
for i in range(len(freq)) :
print(freq[i],ALPHABET_UNMOD_REV[i])
dict_freq = ALPHABET_UNMOD.copy() dict_freq = ALPHABET_UNMOD.copy()
for aa in list(ALPHABET_UNMOD.keys()): for aa in list(ALPHABET_UNMOD.keys()):
dict_freq[aa] = freq[ALPHABET_UNMOD[aa]] dict_freq[aa] = freq[ALPHABET_UNMOD[aa]]
...@@ -62,25 +67,25 @@ def retention_time_distribution(data, plot=False, save=False, f_name='fig/data_e ...@@ -62,25 +67,25 @@ def retention_time_distribution(data, plot=False, save=False, f_name='fig/data_e
def main(): def main():
#data prosit #data prosit
df = pd.read_csv('data_prosit/data.csv') # df = pd.read_csv('data_prosit/data.csv')
_ = length_distribution(df['sequence'],False ,True, '../fig/data_exploration/length_distribution_prosit.png') # _ = length_distribution(df['sequence'],False ,True, '../fig/data_exploration/length_distribution_prosit.png')
_ = aa_distribution(df['mod_sequence'], False, True, '../fig/data_exploration/aa_distribution_prosit.png') # _ = aa_distribution(df['mod_sequence'], False, True, '../fig/data_exploration/aa_distribution_prosit.png')
retention_time_distribution(df['irt_scaled'], False, True, '../fig/data_exploration/retention_time_distribution_prosit.png') # retention_time_distribution(df['irt_scaled'], False, True, '../fig/data_exploration/retention_time_distribution_prosit.png')
df_unique = df[['mod_sequence','irt_scaled']].groupby('mod_sequence').mean() # df_unique = df[['mod_sequence','irt_scaled']].groupby('mod_sequence').mean()
_ = length_distribution(df_unique.index, False, True, '../fig/data_exploration/length_distribution_prosit_unique.png') # _ = length_distribution(df_unique.index, False, True, '../fig/data_exploration/length_distribution_prosit_unique.png')
_ = aa_distribution(df_unique.index, False, True, '../fig/data_exploration/aa_distribution_prosit_unique.png') # _ = aa_distribution(df_unique.index, False, True, '../fig/data_exploration/aa_distribution_prosit_unique.png')
retention_time_distribution(df_unique['irt_scaled'], False, True, # retention_time_distribution(df_unique['irt_scaled'], False, True,
'../fig/data_exploration/retention_time_distribution_prosit_unique.png') # '../fig/data_exploration/retention_time_distribution_prosit_unique.png')
#prosit no cysteine #prosit no cysteine
df = pd.read_csv('data_prosit/data_noc.csv') # df = pd.read_csv('data_prosit/data_noc.csv')
_ = length_distribution(df['sequence'],False ,True, '../fig/data_exploration/length_distribution_prosit_noc.png') # _ = length_distribution(df['sequence'],False ,True, '../fig/data_exploration/length_distribution_prosit_noc.png')
_ = aa_distribution(df['mod_sequence'], False, True, '../fig/data_exploration/aa_distribution_prosit_noc.png') # _ = aa_distribution(df['mod_sequence'], False, True, '../fig/data_exploration/aa_distribution_prosit_noc.png')
retention_time_distribution(df['irt_scaled'], False, True, '../fig/data_exploration/retention_time_distribution_prosit_noc.png') # retention_time_distribution(df['irt_scaled'], False, True, '../fig/data_exploration/retention_time_distribution_prosit_noc.png')
df_unique = df[['mod_sequence','irt_scaled']].groupby('mod_sequence').mean() # df_unique = df[['mod_sequence','irt_scaled']].groupby('mod_sequence').mean()
_ = length_distribution(df_unique.index,False ,True, '../fig/data_exploration/length_distribution_prosit_noc_unique.png') # _ = length_distribution(df_unique.index,False ,True, '../fig/data_exploration/length_distribution_prosit_noc_unique.png')
_ = aa_distribution(df_unique.index, False, True, '../fig/data_exploration/aa_distribution_prosit_noc_unique.png') # _ = aa_distribution(df_unique.index, False, True, '../fig/data_exploration/aa_distribution_prosit_noc_unique.png')
retention_time_distribution(df_unique['irt_scaled'], False, True, '../fig/data_exploration/retention_time_distribution_prosit_noc_unique.png') # retention_time_distribution(df_unique['irt_scaled'], False, True, '../fig/data_exploration/retention_time_distribution_prosit_noc_unique.png')
#isa #isa
# df = pd.read_csv('data_ISA/data_aligned_isa.csv') # df = pd.read_csv('data_ISA/data_aligned_isa.csv')
...@@ -102,6 +107,16 @@ def main(): ...@@ -102,6 +107,16 @@ def main():
# _ = aa_distribution(df_unique.index, False, True, '../fig/data_exploration/aa_distribution_isa_noc_unique.png') # _ = aa_distribution(df_unique.index, False, True, '../fig/data_exploration/aa_distribution_isa_noc_unique.png')
# retention_time_distribution(df_unique['irt_scaled'], False, True, '../fig/data_exploration/retention_time_distribution_isa_noc_unique.png') # retention_time_distribution(df_unique['irt_scaled'], False, True, '../fig/data_exploration/retention_time_distribution_isa_noc_unique.png')
#isa mox
df = pd.read_csv('data_ISA_mox/data_aligned_isa_noc.csv')
_ = length_distribution(df['sequence'],False ,True, '../fig/data_exploration/length_distribution_isa_noc_mox.png')
_ = aa_distribution(df['sequence'], False, True, '../fig/data_exploration/aa_distribution_isa_noc_mox.png')
retention_time_distribution(df['irt_scaled'], False, True, '../fig/data_exploration/retention_time_distribution_isa_noc_mox.png')
df_unique = df[['sequence', 'irt_scaled']].groupby('sequence').mean()
_ = length_distribution(df_unique.index,False ,True, '../fig/data_exploration/length_distribution_isa_noc_mox_unique.png')
_ = aa_distribution(df_unique.index, False, True, '../fig/data_exploration/aa_distribution_isa_noc_mox_unique.png')
retention_time_distribution(df_unique['irt_scaled'], False, True, '../fig/data_exploration/retention_time_distribution_isa_noc_mox_unique.png')
if __name__ == '__main__': if __name__ == '__main__':
main() main()
...@@ -18,24 +18,26 @@ def align(dataset, reference, column_dataset, column_ref, seq_data, seq_ref): ...@@ -18,24 +18,26 @@ def align(dataset, reference, column_dataset, column_ref, seq_data, seq_ref):
ind_dict_ref = dict((k, i) for i, k in enumerate(seq_ref)) ind_dict_ref = dict((k, i) for i, k in enumerate(seq_ref))
inter = set(ind_dict_ref).intersection(seq_common) inter = set(ind_dict_ref).intersection(seq_common)
print(len(inter))
ind_dict_ref = [ind_dict_ref[x] for x in inter] ind_dict_ref = [ind_dict_ref[x] for x in inter]
indices_common = dict((k, i) for i, k in enumerate(seq_common)) indices_common = dict((k, i) for i, k in enumerate(seq_common))
indices_common = [indices_common[x] for x in inter] indices_common = [indices_common[x] for x in inter]
rt_ref = reference_unique[column_ref][ind_dict_ref].reset_index() rt_ref = reference_unique[column_ref][ind_dict_ref].reset_index()
rt_data = dataset_unique[column_dataset][indices_common].reset_index() rt_data = dataset_unique[column_dataset][indices_common].reset_index()
plt.scatter(rt_data[column_dataset].tolist(),rt_ref[column_ref].tolist()) plt.scatter(rt_data[column_dataset].tolist(),rt_ref[column_ref].tolist(),s=0.1)
plt.savefig('test.png') plt.savefig('test.png')
xout, yout, wout = loess_1d(np.array(rt_data[column_dataset].tolist()), np.array(rt_ref[column_ref].tolist()), xout, yout, wout = loess_1d(np.array(rt_data[column_dataset].tolist()), np.array(rt_ref[column_ref].tolist()),
xnew=dataset[column_dataset], xnew=dataset[column_dataset],
degree=1, frac=0.5, degree=1, frac=0.25,
npoints=None, rotate=False, sigy=None) npoints=None, rotate=False, sigy=None)
plt.scatter(xout, yout) plt.scatter(xout, yout, s=0.1)
plt.savefig('test_2.png') plt.savefig('test_2.png')
dataset[column_dataset] = yout dataset[column_dataset] = yout
...@@ -115,46 +117,41 @@ def numerical_to_alphabetical_str(s): ...@@ -115,46 +117,41 @@ def numerical_to_alphabetical_str(s):
def main(): def main():
ref = pd.read_csv('data_prosit/data.csv') ref = pd.read_csv('data_prosit/data.csv')
df_ISA = pd.read_csv('data_ISA_mox/data_isa.csv') df_ISA = pd.read_csv('data_PXD006109/data_plasma.csv')
df_ISA_aligned = align(df_ISA, ref, 'irt_scaled', 'irt_scaled','sequence', 'mod_sequence')
df_ISA_aligned.to_csv('data_ISA_mox/data_aligned_isa.csv', index=False)
ref = pd.read_csv('data_prosit/data_noc.csv')
df_ISA = pd.read_csv('data_ISA_mox/data_isa_noc.csv')
df_ISA_aligned = align(df_ISA, ref, 'irt_scaled', 'irt_scaled','sequence', 'mod_sequence') df_ISA_aligned = align(df_ISA, ref, 'irt_scaled', 'irt_scaled','sequence', 'mod_sequence')
df_ISA_aligned.to_csv('data_ISA_mox/data_aligned_isa_noc.csv', index=False) df_ISA_aligned.to_csv('data_PXD006109/data_aligned_plasma.csv', index=False)
if __name__ == '__main__': if __name__ == '__main__':
# main() main()
df_base = pd.read_csv('data_ISA/data_aligned_isa_noc.csv') # df_base = pd.read_csv('data_ISA/data_aligned_isa_noc.csv')
df_base = df_base[['sequence', 'irt_scaled','state']] # df_base = df_base[['sequence', 'irt_scaled','state']]
t = [0.7,1,10] # t = [0.7,1,10]
#reste 07 1 et all # #reste 07 1 et all
name = ['07','1','all'] # name = ['07','1','all']
for i in range(len(name)): # for i in range(len(name)):
#
#
#creating augmented datasets # #creating augmented datasets
print('thresold {} en cours'.format(name[i])) # print('thresold {} en cours'.format(name[i]))
# # #
df_0 = pd.read_csv('../output/out_ISA_noc_mox_prosit_mod_0.csv') # df_0 = pd.read_csv('../output/out_ISA_noc_mox_prosit_mod_0.csv')
df_1 = pd.read_csv('../output/out_ISA_noc_mox_prosit_mod_1.csv') # df_1 = pd.read_csv('../output/out_ISA_noc_mox_prosit_mod_1.csv')
df_2 = pd.read_csv('../output/out_ISA_noc_mox_prosit_mod_2.csv') # df_2 = pd.read_csv('../output/out_ISA_noc_mox_prosit_mod_2.csv')
df_3 = pd.read_csv('../output/out_ISA_noc_mox_prosit_mod_3.csv') # df_3 = pd.read_csv('../output/out_ISA_noc_mox_prosit_mod_3.csv')
df_4 = pd.read_csv('../output/out_ISA_noc_mox_prosit_mod_4.csv') # df_4 = pd.read_csv('../output/out_ISA_noc_mox_prosit_mod_4.csv')
#
list_df = [df_0,df_1,df_2,df_3,df_4] # list_df = [df_0,df_1,df_2,df_3,df_4]
df = select_best_data(list_df, t[i]) # df = select_best_data(list_df, t[i])
df.to_pickle('data_ISA_mox/data_ISA_additionnal_{}.pkl'.format(name[i])) # df.to_pickle('data_ISA_mox/data_ISA_additionnal_{}.pkl'.format(name[i]))
df = pd.read_pickle('data_ISA_mox/data_ISA_additionnal_{}.pkl'.format(name[i])) # df = pd.read_pickle('data_ISA_mox/data_ISA_additionnal_{}.pkl'.format(name[i]))
df['state'] = 'train' # df['state'] = 'train'
df['sequence'] = df['sequence'].map(numerical_to_alphabetical_str) # df['sequence'] = df['sequence'].map(numerical_to_alphabetical_str)
df_augmented_1 = pd.concat([df, df_base], axis=0).reset_index(drop=True) # df_augmented_1 = pd.concat([df, df_base], axis=0).reset_index(drop=True)
df_augmented_1.columns = ['sequence', 'irt_scaled','state'] # df_augmented_1.columns = ['sequence', 'irt_scaled','state']
#
df_augmented_1.to_csv('data_ISA_mox/isa_data_augmented_{}.csv'.format(name[i]), index=False) # df_augmented_1.to_csv('data_ISA_mox/isa_data_augmented_{}.csv'.format(name[i]), index=False)
print(df_augmented_1.shape) # print(df_augmented_1.shape)
\ No newline at end of file \ No newline at end of file
...@@ -166,9 +166,10 @@ def plot_res(): ...@@ -166,9 +166,10 @@ def plot_res():
def calc_and_plot_res(): def calc_and_plot_res():
all_data=[] all_data=[]
base = 'out_early_stop_' base = 'out_early_stop_'
for name in ['ISA_noc_ISA_noc','ISA_aug_005_ISA_noc','ISA_aug_01_ISA_noc','ISA_aug_02_ISA_noc','ISA_aug_03_ISA_noc', for name in ['ISA_noc_mox_ISA_noc_mox','ISA_noc_aug_005_mox_ISA_noc_mox','ISA_noc_aug_01_mox_ISA_noc_mox',
'ISA_aug_04_ISA_noc','ISA_aug_05_ISA_noc','ISA_aug_07_ISA_noc','ISA_aug_1_ISA_noc', 'ISA_noc_aug_02_mox_ISA_noc_mox','ISA_noc_aug_03_mox_ISA_noc_mox','ISA_noc_aug_04_mox_ISA_noc_mox',
'ISA_aug_all_ISA_noc','prosit_ISA_noc']: 'ISA_noc_aug_05_mox_ISA_noc_mox','ISA_noc_aug_07_mox_ISA_noc_mox','ISA_noc_aug_1_mox_ISA_noc_mox',
'ISA_noc_aug_all_mox_ISA_noc_mox','prosit_mod_ISA_noc_mox']:
print(name) print(name)
r2_list=[] r2_list=[]
for index in range(10): for index in range(10):
...@@ -184,7 +185,7 @@ def calc_and_plot_res(): ...@@ -184,7 +185,7 @@ def calc_and_plot_res():
axs.set_xticks([y + 1 for y in range(len(all_data))], axs.set_xticks([y + 1 for y in range(len(all_data))],
labels=[ 'ISA_noc', 'Augm 0.05', 'Augm 0.1', 'Augm 0.2', 'Augm 0.3', 'Augm 0.4', 'Augm 0.5', 'Augm 0.7', labels=[ 'ISA_noc', 'Augm 0.05', 'Augm 0.1', 'Augm 0.2', 'Augm 0.3', 'Augm 0.4', 'Augm 0.5', 'Augm 0.7',
'Augm 1', 'Augm all', 'Prosit', ]) 'Augm 1', 'Augm all', 'Prosit', ])
plt.savefig('../fig/model perf/summary_early_stop.png') plt.savefig('../fig/model perf/summary_early_stop_mox.png')
def error_by_methionine(dataframe): def error_by_methionine(dataframe):
def fonc(a): def fonc(a):
...@@ -206,6 +207,7 @@ def error_by_methionine(dataframe): ...@@ -206,6 +207,7 @@ def error_by_methionine(dataframe):
if __name__ == '__main__' : if __name__ == '__main__' :
calc_and_plot_res()
# base = ['ISA_noc_ISA_noc','prosit_ISA_noc', 'ISA_noc_prosit', 'prosit_prosit'] # base = ['ISA_noc_ISA_noc','prosit_ISA_noc', 'ISA_noc_prosit', 'prosit_prosit']
# augmented = ['ISA_aug_07_ISA_noc','ISA_aug_1_ISA_noc','ISA_aug_all_ISA_noc'] # augmented = ['ISA_aug_07_ISA_noc','ISA_aug_1_ISA_noc','ISA_aug_all_ISA_noc']
# for f_suffix_name in augmented: # for f_suffix_name in augmented:
...@@ -218,6 +220,6 @@ if __name__ == '__main__' : ...@@ -218,6 +220,6 @@ if __name__ == '__main__' :
# histo_length_by_error(df, bins=10, display=False, save=True, path='../fig/model perf/histo_length_{}_{}.png'.format(f_suffix_name,number)) # histo_length_by_error(df, bins=10, display=False, save=True, path='../fig/model perf/histo_length_{}_{}.png'.format(f_suffix_name,number))
# calc_and_plot_res() # calc_and_plot_res()
for number in range(10): # for number in range(10):
df = pd.read_csv('../output/out_{}_{}.csv'.format('early_stop_ISA_noc_prosit',str(number))) # df = pd.read_csv('../output/out_{}_{}.csv'.format('early_stop_ISA_noc_prosit',str(number)))
error_by_methionine(df) # error_by_methionine(df)
...@@ -3,7 +3,7 @@ import numpy as np ...@@ -3,7 +3,7 @@ import numpy as np
import random import random
from constant import ALPHABET_UNMOD, ALPHABET_UNMOD_REV from constant import ALPHABET_UNMOD, ALPHABET_UNMOD_REV
def load_data(msms_filet_path='data/msms.txt', score_treshold=70): def load_data(msms_filet_path='data/msms_plasma.txt', score_treshold=70):
data = pd.read_csv(msms_filet_path, sep='\t') data = pd.read_csv(msms_filet_path, sep='\t')
data_compact = data[['Modified sequence', 'Length', 'Charge', 'Retention time', 'Score', 'Matches', 'Intensities']] data_compact = data[['Modified sequence', 'Length', 'Charge', 'Retention time', 'Score', 'Matches', 'Intensities']]
data_filtered = data_compact[data_compact['Score'] > score_treshold] data_filtered = data_compact[data_compact['Score'] > score_treshold]
...@@ -23,7 +23,7 @@ def convert(l): ...@@ -23,7 +23,7 @@ def convert(l):
num_str)] num_str)]
def convert_mod_to_prosit(s): def convert_mod_to_prosit(s):
return s.replace('(Oxidation (M))','-OxM-').replace('(Acetyl (Protein N-term))','') #acetyl reliquat d'une mauvaise analyse, a enlever a terme return s.replace('M(ox)','-OxM-').replace('(ac)','@')
def numerical_to_alphabetical(arr): def numerical_to_alphabetical(arr):
seq = '' seq = ''
...@@ -31,6 +31,14 @@ def numerical_to_alphabetical(arr): ...@@ -31,6 +31,14 @@ def numerical_to_alphabetical(arr):
seq+=ALPHABET_UNMOD_REV[arr[i]] seq+=ALPHABET_UNMOD_REV[arr[i]]
return seq return seq
def filter_acetyl(df):
    """Remove peptides carrying an N-terminal acetylation marker.

    Rows whose 'sequence' string contains '@' (the acetylation token
    inserted upstream by convert_mod_to_prosit) are dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'sequence' column of strings.

    Returns
    -------
    pandas.DataFrame
        New frame without acetylated sequences, with a fresh 0..n-1 index.
        The input frame is left untouched.
    """
    # A boolean mask replaces the original temp-column add/filter/drop
    # sequence: same result, no mutation of a copied frame.
    keep = df['sequence'].map(lambda s: '@' not in s)
    return df[keep].reset_index(drop=True)
def filter_cysteine(df): def filter_cysteine(df):
df2 = df.copy() df2 = df.copy()
df2['cysteine']=df['sequence'].map(lambda x: 'C' in x) df2['cysteine']=df['sequence'].map(lambda x: 'C' in x)
...@@ -79,19 +87,12 @@ def add_split_column(data, split=(0.7,0.15,0.15)): ...@@ -79,19 +87,12 @@ def add_split_column(data, split=(0.7,0.15,0.15)):
return data_split return data_split
def main(): def main():
df_03_02 = load_data('data_ISA_mox/msms_03_02.txt', 70) df_plasma = load_data('data_PXD006109/msms_plasma.txt', 70)
df_16_01 = load_data('data_ISA_mox/msms_16_01.txt', 70) merged_df = pd.concat([df_plasma], ignore_index=True)
df_20_01 = load_data('data_ISA_mox/msms_20_01.txt', 70)
df_30_01 = load_data('data_ISA_mox/msms_30_01.txt', 70)
merged_df = pd.concat([df_20_01, df_30_01, df_16_01, df_03_02], ignore_index=True)
final_df = add_split_column(merged_df) final_df = add_split_column(merged_df)
final_df.to_csv('data_ISA_mox/data_isa.csv', index=False) final_df = filter_acetyl(final_df)
df2 = filter_cysteine(final_df) final_df.to_csv('data_PXD006109/data_plasma.csv', index=False)
df2.to_csv('data_ISA_mox/data_isa_noc.csv', index=False)
# final_df= pd.read_csv('data_prosit/data.csv')
# df2 = filter_cysteine(final_df)
# df2.to_csv('data_prosit/data_noc.csv', index=False)
if __name__ == '__main__': if __name__ == '__main__':
main() main()
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment