Skip to content
Snippets Groups Projects
Commit 3da55a91 authored by Schneider Leo's avatar Schneider Leo
Browse files

rename output

parent d3de7130
No related branches found
No related tags found
No related merge requests found
......@@ -20,8 +20,8 @@ ALPHABET_UNMOD = {
"V": 18,
"W": 19,
"Y": 20,
"CaC": 22,
"OxM": 21
"OxM": 21,
"CaC": 22
}
# Inverse lookup table: integer token -> amino-acid / modification symbol.
ALPHABET_UNMOD_REV = dict(zip(ALPHABET_UNMOD.values(), ALPHABET_UNMOD.keys()))
\ No newline at end of file
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from numpy.ma.core import shape
from constant import ALPHABET_UNMOD, ALPHABET_UNMOD_REV
from constant import ALPHABET_UNMOD
def length_distribution(data, plot=False, save=False, f_name='fig/data_exploration/length_distribution.png'):
maximum = 31
......@@ -36,6 +38,9 @@ def aa_distribution(data, plot=False, save=False, f_name='fig/data_exploration/a
freq = 100 * freq / freq.sum()
for i in range(len(freq)) :
print(freq[i],ALPHABET_UNMOD_REV[i])
dict_freq = ALPHABET_UNMOD.copy()
for aa in list(ALPHABET_UNMOD.keys()):
dict_freq[aa] = freq[ALPHABET_UNMOD[aa]]
......@@ -62,25 +67,25 @@ def retention_time_distribution(data, plot=False, save=False, f_name='fig/data_e
def main():
#data prosit
df = pd.read_csv('data_prosit/data.csv')
_ = length_distribution(df['sequence'],False ,True, '../fig/data_exploration/length_distribution_prosit.png')
_ = aa_distribution(df['mod_sequence'], False, True, '../fig/data_exploration/aa_distribution_prosit.png')
retention_time_distribution(df['irt_scaled'], False, True, '../fig/data_exploration/retention_time_distribution_prosit.png')
df_unique = df[['mod_sequence','irt_scaled']].groupby('mod_sequence').mean()
_ = length_distribution(df_unique.index, False, True, '../fig/data_exploration/length_distribution_prosit_unique.png')
_ = aa_distribution(df_unique.index, False, True, '../fig/data_exploration/aa_distribution_prosit_unique.png')
retention_time_distribution(df_unique['irt_scaled'], False, True,
'../fig/data_exploration/retention_time_distribution_prosit_unique.png')
# df = pd.read_csv('data_prosit/data.csv')
# _ = length_distribution(df['sequence'],False ,True, '../fig/data_exploration/length_distribution_prosit.png')
# _ = aa_distribution(df['mod_sequence'], False, True, '../fig/data_exploration/aa_distribution_prosit.png')
# retention_time_distribution(df['irt_scaled'], False, True, '../fig/data_exploration/retention_time_distribution_prosit.png')
# df_unique = df[['mod_sequence','irt_scaled']].groupby('mod_sequence').mean()
# _ = length_distribution(df_unique.index, False, True, '../fig/data_exploration/length_distribution_prosit_unique.png')
# _ = aa_distribution(df_unique.index, False, True, '../fig/data_exploration/aa_distribution_prosit_unique.png')
# retention_time_distribution(df_unique['irt_scaled'], False, True,
# '../fig/data_exploration/retention_time_distribution_prosit_unique.png')
#prosit no cysteine
df = pd.read_csv('data_prosit/data_noc.csv')
_ = length_distribution(df['sequence'],False ,True, '../fig/data_exploration/length_distribution_prosit_noc.png')
_ = aa_distribution(df['mod_sequence'], False, True, '../fig/data_exploration/aa_distribution_prosit_noc.png')
retention_time_distribution(df['irt_scaled'], False, True, '../fig/data_exploration/retention_time_distribution_prosit_noc.png')
df_unique = df[['mod_sequence','irt_scaled']].groupby('mod_sequence').mean()
_ = length_distribution(df_unique.index,False ,True, '../fig/data_exploration/length_distribution_prosit_noc_unique.png')
_ = aa_distribution(df_unique.index, False, True, '../fig/data_exploration/aa_distribution_prosit_noc_unique.png')
retention_time_distribution(df_unique['irt_scaled'], False, True, '../fig/data_exploration/retention_time_distribution_prosit_noc_unique.png')
# df = pd.read_csv('data_prosit/data_noc.csv')
# _ = length_distribution(df['sequence'],False ,True, '../fig/data_exploration/length_distribution_prosit_noc.png')
# _ = aa_distribution(df['mod_sequence'], False, True, '../fig/data_exploration/aa_distribution_prosit_noc.png')
# retention_time_distribution(df['irt_scaled'], False, True, '../fig/data_exploration/retention_time_distribution_prosit_noc.png')
# df_unique = df[['mod_sequence','irt_scaled']].groupby('mod_sequence').mean()
# _ = length_distribution(df_unique.index,False ,True, '../fig/data_exploration/length_distribution_prosit_noc_unique.png')
# _ = aa_distribution(df_unique.index, False, True, '../fig/data_exploration/aa_distribution_prosit_noc_unique.png')
# retention_time_distribution(df_unique['irt_scaled'], False, True, '../fig/data_exploration/retention_time_distribution_prosit_noc_unique.png')
#isa
# df = pd.read_csv('data_ISA/data_aligned_isa.csv')
......@@ -102,6 +107,16 @@ def main():
# _ = aa_distribution(df_unique.index, False, True, '../fig/data_exploration/aa_distribution_isa_noc_unique.png')
# retention_time_distribution(df_unique['irt_scaled'], False, True, '../fig/data_exploration/retention_time_distribution_isa_noc_unique.png')
#isa mox
df = pd.read_csv('data_ISA_mox/data_aligned_isa_noc.csv')
_ = length_distribution(df['sequence'],False ,True, '../fig/data_exploration/length_distribution_isa_noc_mox.png')
_ = aa_distribution(df['sequence'], False, True, '../fig/data_exploration/aa_distribution_isa_noc_mox.png')
retention_time_distribution(df['irt_scaled'], False, True, '../fig/data_exploration/retention_time_distribution_isa_noc_mox.png')
df_unique = df[['sequence', 'irt_scaled']].groupby('sequence').mean()
_ = length_distribution(df_unique.index,False ,True, '../fig/data_exploration/length_distribution_isa_noc_mox_unique.png')
_ = aa_distribution(df_unique.index, False, True, '../fig/data_exploration/aa_distribution_isa_noc_mox_unique.png')
retention_time_distribution(df_unique['irt_scaled'], False, True, '../fig/data_exploration/retention_time_distribution_isa_noc_mox_unique.png')
if __name__ == '__main__':
main()
......@@ -18,24 +18,26 @@ def align(dataset, reference, column_dataset, column_ref, seq_data, seq_ref):
ind_dict_ref = dict((k, i) for i, k in enumerate(seq_ref))
inter = set(ind_dict_ref).intersection(seq_common)
print(len(inter))
ind_dict_ref = [ind_dict_ref[x] for x in inter]
indices_common = dict((k, i) for i, k in enumerate(seq_common))
indices_common = [indices_common[x] for x in inter]
rt_ref = reference_unique[column_ref][ind_dict_ref].reset_index()
rt_data = dataset_unique[column_dataset][indices_common].reset_index()
plt.scatter(rt_data[column_dataset].tolist(),rt_ref[column_ref].tolist())
plt.scatter(rt_data[column_dataset].tolist(),rt_ref[column_ref].tolist(),s=0.1)
plt.savefig('test.png')
xout, yout, wout = loess_1d(np.array(rt_data[column_dataset].tolist()), np.array(rt_ref[column_ref].tolist()),
xnew=dataset[column_dataset],
degree=1, frac=0.5,
degree=1, frac=0.25,
npoints=None, rotate=False, sigy=None)
plt.scatter(xout, yout)
plt.scatter(xout, yout, s=0.1)
plt.savefig('test_2.png')
dataset[column_dataset] = yout
......@@ -115,46 +117,41 @@ def numerical_to_alphabetical_str(s):
def main():
    """Align external retention-time datasets onto the Prosit iRT scale.

    First aligns the ISA (mox) dataset onto the full Prosit reference,
    then aligns the PXD006109 plasma dataset onto the cysteine-free
    Prosit reference, writing each aligned table to CSV.

    NOTE(review): this page is a unified diff; the pre-commit lines that
    read 'data_ISA_mox/data_isa_noc.csv' and wrote
    'data_ISA_mox/data_aligned_isa_noc.csv' were replaced by the plasma
    paths in this commit, so only the post-commit statements are kept.
    """
    # Full Prosit reference for the ISA (mox) dataset.
    ref = pd.read_csv('data_prosit/data.csv')
    df_ISA = pd.read_csv('data_ISA_mox/data_isa.csv')
    df_ISA_aligned = align(df_ISA, ref, 'irt_scaled', 'irt_scaled', 'sequence', 'mod_sequence')
    df_ISA_aligned.to_csv('data_ISA_mox/data_aligned_isa.csv', index=False)

    # Cysteine-free Prosit reference for the PXD006109 plasma dataset.
    ref = pd.read_csv('data_prosit/data_noc.csv')
    df_ISA = pd.read_csv('data_PXD006109/data_plasma.csv')
    df_ISA_aligned = align(df_ISA, ref, 'irt_scaled', 'irt_scaled', 'sequence', 'mod_sequence')
    df_ISA_aligned.to_csv('data_PXD006109/data_aligned_plasma.csv', index=False)
if __name__ == '__main__':
    # Post-commit entry point: the one-off dataset-augmentation pipeline
    # that previously ran here (select_best_data over the
    # out_ISA_noc_mox_prosit_mod_* outputs, concatenated with
    # data_aligned_isa_noc.csv) was disabled in this commit; the unified
    # diff left both the live pre-commit copy and a commented-out copy,
    # which are removed here as dead code.
    main()
\ No newline at end of file
......@@ -166,9 +166,10 @@ def plot_res():
def calc_and_plot_res():
all_data=[]
base = 'out_early_stop_'
for name in ['ISA_noc_ISA_noc','ISA_aug_005_ISA_noc','ISA_aug_01_ISA_noc','ISA_aug_02_ISA_noc','ISA_aug_03_ISA_noc',
'ISA_aug_04_ISA_noc','ISA_aug_05_ISA_noc','ISA_aug_07_ISA_noc','ISA_aug_1_ISA_noc',
'ISA_aug_all_ISA_noc','prosit_ISA_noc']:
for name in ['ISA_noc_mox_ISA_noc_mox','ISA_noc_aug_005_mox_ISA_noc_mox','ISA_noc_aug_01_mox_ISA_noc_mox',
'ISA_noc_aug_02_mox_ISA_noc_mox','ISA_noc_aug_03_mox_ISA_noc_mox','ISA_noc_aug_04_mox_ISA_noc_mox',
'ISA_noc_aug_05_mox_ISA_noc_mox','ISA_noc_aug_07_mox_ISA_noc_mox','ISA_noc_aug_1_mox_ISA_noc_mox',
'ISA_noc_aug_all_mox_ISA_noc_mox','prosit_mod_ISA_noc_mox']:
print(name)
r2_list=[]
for index in range(10):
......@@ -184,7 +185,7 @@ def calc_and_plot_res():
axs.set_xticks([y + 1 for y in range(len(all_data))],
labels=[ 'ISA_noc', 'Augm 0.05', 'Augm 0.1', 'Augm 0.2', 'Augm 0.3', 'Augm 0.4', 'Augm 0.5', 'Augm 0.7',
'Augm 1', 'Augm all', 'Prosit', ])
plt.savefig('../fig/model perf/summary_early_stop.png')
plt.savefig('../fig/model perf/summary_early_stop_mox.png')
def error_by_methionine(dataframe):
def fonc(a):
......@@ -206,6 +207,7 @@ def error_by_methionine(dataframe):
if __name__ == '__main__' :
calc_and_plot_res()
# base = ['ISA_noc_ISA_noc','prosit_ISA_noc', 'ISA_noc_prosit', 'prosit_prosit']
# augmented = ['ISA_aug_07_ISA_noc','ISA_aug_1_ISA_noc','ISA_aug_all_ISA_noc']
# for f_suffix_name in augmented:
......@@ -218,6 +220,6 @@ if __name__ == '__main__' :
# histo_length_by_error(df, bins=10, display=False, save=True, path='../fig/model perf/histo_length_{}_{}.png'.format(f_suffix_name,number))
# calc_and_plot_res()
for number in range(10):
df = pd.read_csv('../output/out_{}_{}.csv'.format('early_stop_ISA_noc_prosit',str(number)))
error_by_methionine(df)
# for number in range(10):
# df = pd.read_csv('../output/out_{}_{}.csv'.format('early_stop_ISA_noc_prosit',str(number)))
# error_by_methionine(df)
......@@ -3,7 +3,7 @@ import numpy as np
import random
from constant import ALPHABET_UNMOD, ALPHABET_UNMOD_REV
def load_data(msms_filet_path='data/msms.txt', score_treshold=70):
def load_data(msms_filet_path='data/msms_plasma.txt', score_treshold=70):
data = pd.read_csv(msms_filet_path, sep='\t')
data_compact = data[['Modified sequence', 'Length', 'Charge', 'Retention time', 'Score', 'Matches', 'Intensities']]
data_filtered = data_compact[data_compact['Score'] > score_treshold]
......@@ -23,7 +23,7 @@ def convert(l):
num_str)]
def convert_mod_to_prosit(s):
    """Translate MaxQuant-style modification tags into the Prosit alphabet.

    'M(ox)' (oxidised methionine) becomes '-OxM-', and the acetyl tag
    '(ac)' becomes '@' (rows carrying '@' are later removed by
    filter_acetyl).

    The unified diff left the pre-commit return (mapping
    '(Oxidation (M))' / '(Acetyl (Protein N-term))') in place first,
    which made the new mapping unreachable; only the post-commit
    return is kept.
    """
    return s.replace('M(ox)', '-OxM-').replace('(ac)', '@')
def numerical_to_alphabetical(arr):
seq = ''
......@@ -31,6 +31,14 @@ def numerical_to_alphabetical(arr):
seq+=ALPHABET_UNMOD_REV[arr[i]]
return seq
def filter_acetyl(df):
    """Return a copy of *df* without acetylated sequences.

    Rows whose 'sequence' value contains the acetylation marker '@' are
    dropped and the index of the result is reset. The input frame is not
    modified.
    """
    keep_mask = df['sequence'].map(lambda seq: '@' not in seq)
    result = df[keep_mask].copy()
    result.reset_index(drop=True, inplace=True)
    return result
def filter_cysteine(df):
df2 = df.copy()
df2['cysteine']=df['sequence'].map(lambda x: 'C' in x)
......@@ -79,19 +87,12 @@ def add_split_column(data, split=(0.7,0.15,0.15)):
return data_split
def main():
    """Build the PXD006109 plasma retention-time dataset.

    Loads the MaxQuant msms results for the plasma run (score threshold
    70), adds the split column, drops acetylated sequences and writes
    the result to 'data_PXD006109/data_plasma.csv'.

    NOTE(review): the unified diff left the pre-commit ISA pipeline
    (four msms_* loads merged and written to data_ISA_mox/data_isa*.csv)
    live above its plasma replacement; only the post-commit statements
    are kept here.
    """
    df_plasma = load_data('data_PXD006109/msms_plasma.txt', 70)
    # Single frame: resetting the index is equivalent to the original
    # pd.concat([df_plasma], ignore_index=True).
    merged_df = df_plasma.reset_index(drop=True)
    final_df = add_split_column(merged_df)
    final_df = filter_acetyl(final_df)
    final_df.to_csv('data_PXD006109/data_plasma.csv', index=False)


if __name__ == '__main__':
    main()
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment