import numpy as np import matplotlib.pyplot as plt import matplotlib import pandas as pd matplotlib.use('agg') length = 30 ALPHABET_UNMOD = { "A": 0, "C": 1, "D": 2, "E": 3, "F": 4, "G": 5, "H": 6, "I": 7, "K": 8, "L": 9, "M": 10, "N": 11, "P": 12, "Q": 13, "R": 14, "S": 15, "T": 16, "V": 17, "W": 18, "Y": 19, "CaC": 20, "OxM": 21 } def separe_by_length(X, Y, base_name='out'): max = 30 datasets = [[[], []] for i in range(max)] for i in range(X.shape[0]): try: datasets[list(X[i]).index(0)-X[i].count('-')*2][0].append(X[i]) datasets[list(X[i]).index(0)-X[i].count('-')*2][1].append(Y[i]) except: datasets[max - 1][0].append(X[i][:]) datasets[max - 1][1].append(Y[i]) for length in range(max): print('Cutting ', length) if datasets[length][0]: print('data/X_' + base_name + '_' + str(length) + '.npy') X_cut = np.array(datasets[length][0]) Y_cut = np.array(datasets[length][1]) np.save('data/RT/X_' + base_name + '_' + str(length) + '.npy', X_cut) np.save('data/RT/Y_' + base_name + '_' + str(length) + '.npy', Y_cut) def dist_long(X, plot=False, save=False, f_name='out.png'): max = 31 dist = np.zeros(max) for seq in X: try: dist[len(list(seq)) - seq.count('-') * 2] += 1 except: dist[-1] += 1 if plot or save: plt.stairs(dist, range(max + 1), fill=True) if plot: plt.show() if save: plt.savefig(f_name) plt.clf() plt.close() return 100 * dist / X.shape[0] def feq_aa(X, plot=False, save=False, f_name='out.png'): freq = np.zeros(22) for seq in X: dec = 0 for i in range(len(seq)-2*seq.count('-')): if seq[dec+i] != '-': freq[ALPHABET_UNMOD[seq[dec+i]]] += 1 elif seq[i+dec+1:i+dec+4] == 'CaC': dec += 4 freq[ALPHABET_UNMOD['CaC']] += 1 elif seq[i+dec+1:i+dec+4] == 'OxM': dec += 4 freq[ALPHABET_UNMOD['OxM']] += 1 freq = 100 * freq / freq.sum() dict_freq = ALPHABET_UNMOD.copy() for aa in list(ALPHABET_UNMOD.keys()): dict_freq[aa] = freq[ALPHABET_UNMOD[aa]] if plot or save: plt.bar(list(ALPHABET_UNMOD.keys()), freq, label=list(ALPHABET_UNMOD.keys())) if plot: plt.show() if save: plt.savefig(f_name) plt.clf() plt.close() return dict_freq def intersection(A, B): C = np.intersect1d(A, B) return C.shape[0], C def RT_variance(X, Y): unique_rows, unique_indices, unique_counts = np.unique(X, axis=0, return_inverse=True, return_counts=True) variances = np.zeros(len(unique_rows)) for i in range(len(unique_rows)): ind = np.where(unique_indices == i) variances[i] = np.var(Y[ind]) return np.mean(variances) def RT_distrib(Y, f_name): plt.hist(Y,bins = 50) plt.title("RT distribution") plt.savefig(f_name) plt.clf() # data = pd.read_csv('database/data_ptms.csv') # data_train = data[data.state == 'train'] # data_train_2 = data_train.drop([data_train.columns[0] ,'sequence','irt_scaled','state'], axis = 1) # data_train_2.to_csv('data/RT/data_ptms_train.csv', index= False) # data_test = data[data.state == 'holdout'] # data_validation = data[data.state == 'validation'] # mean_unique_train = data_train.groupby(['mod_sequence'])['irt'].mean() # var_unique_train = data_train.groupby(['mod_sequence'])['irt'].var() # avg_train = pd.concat([mean_unique_train,var_unique_train], axis=1).reset_index() # avg_train['state'] = 'train' # # mean_unique_test = data_test.groupby(['mod_sequence'])['irt'].mean() # var_unique_test = data_test.groupby(['mod_sequence'])['irt'].var() # avg_test = pd.concat([mean_unique_test,var_unique_test], axis=1).reset_index() # avg_test['state'] = 'holdout' # # mean_unique_validation = data_validation.groupby(['mod_sequence'])['irt'].mean() # var_unique_validation = data_validation.groupby(['mod_sequence'])['irt'].var() # avg_validation = pd.concat([mean_unique_validation, var_unique_validation], axis=1).reset_index() # avg_validation['state'] = 'validation' # # avg = pd.concat([avg_train, avg_test, avg_validation]) # # avg.columns.values[0] = 'mod_sequence' # avg.columns.values[1] = 'irt' # avg.columns.values[2] = 'var' # avg = avg.fillna(0) # data_unique = avg.query('var <= 1') # # avg.to_csv('database/data_unique.csv', index=False) # # data_unique = pd.read_csv('database/data_unique.csv') # # data_train = data_unique[data_unique.state == 'train'] # data_test= data_unique[data_unique.state == 'holdout'] # data_validation = data_unique[data_unique.state == 'validation'] # # # np.save('data/RT/Y_unique_train.npy',data_train['irt'],allow_pickle=True) # np.save('data/RT/Y_unique_holdout.npy',data_test['irt'],allow_pickle=True) # np.save('data/RT/Y_unique_validation.npy',data_validation['irt'],allow_pickle=True) # np.save('data/RT/X_unique_train.npy',data_train['mod_sequence'],allow_pickle=True) # np.save('data/RT/X_unique_holdout.npy',data_test['mod_sequence'],allow_pickle=True) # np.save('data/RT/X_unique_validation.npy',data_validation['mod_sequence'],allow_pickle=True) # # # # X_train = np.load('data/RT/X_unique_train.npy',allow_pickle=True) # Y_train = np.load('data/RT/Y_unique_train.npy',allow_pickle=True) # X_validation = np.load('data/RT/X_unique_validation.npy',allow_pickle=True) # Y_validation = np.load('data/RT/Y_unique_validation.npy',allow_pickle=True) # X_test = np.load('data/RT/X_unique_holdout.npy',allow_pickle=True) # Y_test = np.load('data/RT/Y_unique_holdout.npy',allow_pickle=True) # # X_train = np.array(X_train.tolist()) # X_validation = np.array(X_validation.tolist()) # X_test = np.array(X_test.tolist()) # # # print('\n Tailles des données') # print('\n Train : ', Y_train.size) # print('Validation : ', Y_validation.size) # print('Test: ', Y_test.size) # # print('\n Longueurs des séquences') # print('\n Train : ', dist_long(X_train, plot=False, save=True, f_name='fig/histo_length_train_unique.png')) # print('Validation : ', dist_long(X_validation, plot=False, save=True, f_name='fig/histo_length_validation_unique.png')) # print('Test : ', dist_long(X_test, plot=False, save=True, f_name='fig/histo_length_test_unique.png')) # # print('\n Fréquences des acides aminés') # print('\n Train : ', feq_aa(X_train, plot=False, save=True, f_name='fig/histo_aa_train_unique.png')) # print('Validation : ', feq_aa(X_validation, plot=False, save=True, f_name='fig/histo_aa_validation_unique.png')) # print('Test : ', feq_aa(X_test, plot=False, save=True, f_name='fig/histo_aa_test_unique.png')) # # # l1, c1 = intersection(X_train, X_validation) # l2, c2 = intersection(X_train, X_test) # l3, c3 = intersection(X_test, X_validation) # # print('\n Intersection checking') # print('\n Train x Validation : ', l1, ' intersection(s)') # print('Train x Test : ', l2, ' intersection(s)') # print('Test x validation : ', l3, ' intersection(s)') # # # print('\n RT Variance') # # print('\n Train : ', RT_variance(X_train, Y_train)) # # print('\n Validation : ', RT_variance(X_validation, Y_validation)) # # print('\n Test : ', RT_variance(X_test, Y_test)) # # RT_distrib(Y_train,'fig/histo_RT_train_unique.png' ) # RT_distrib(Y_test,'fig/histo_RT_test_unique.png' ) # RT_distrib(Y_validation,'fig/histo_RT_validation_unique.png' ) # # data_train = data[data.state == 'train'] # data_test= data[data.state == 'holdout'] # data_validation = data[data.state == 'validation'] # # # np.save('data/RT/Y_train.npy',data_train['irt'],allow_pickle=True) # np.save('data/RT/Y_holdout.npy',data_test['irt'],allow_pickle=True) # np.save('data/RT/Y_validation.npy',data_validation['irt'],allow_pickle=True) # np.save('data/RT/X_train.npy',data_train['mod_sequence'],allow_pickle=True) # np.save('data/RT/X_holdout.npy',data_test['mod_sequence'],allow_pickle=True) # np.save('data/RT/X_validation.npy',data_validation['mod_sequence'],allow_pickle=True) # # # # X_train = np.load('data/RT/X_train.npy',allow_pickle=True) # Y_train = np.load('data/RT/Y_train.npy',allow_pickle=True) # X_validation = np.load('data/RT/X_validation.npy',allow_pickle=True) # Y_validation = np.load('data/RT/Y_validation.npy',allow_pickle=True) # X_test = np.load('data/RT/X_holdout.npy',allow_pickle=True) # Y_test = np.load('data/RT/Y_holdout.npy',allow_pickle=True) # # X_train = np.array(X_train.tolist()) # X_validation = np.array(X_validation.tolist()) # X_test = np.array(X_test.tolist()) # # # print('\n Tailles des données') # print('\n Train : ', Y_train.size) # print('Validation : ', Y_validation.size) # print('Test: ', Y_test.size) # # print('\n Longueurs des séquences') # print('\n Train : ', dist_long(X_train, plot=False, save=True, f_name='fig/histo_length_train.png')) # print('Validation : ', dist_long(X_validation, plot=False, save=True, f_name='fig/histo_length_validation.png')) # print('Test : ', dist_long(X_test, plot=False, save=True, f_name='fig/histo_length_test.png')) # # print('\n Fréquences des acides aminés') # print('\n Train : ', feq_aa(X_train, plot=False, save=True, f_name='fig/histo_aa_train.png')) # print('Validation : ', feq_aa(X_validation, plot=False, save=True, f_name='fig/histo_aa_validation.png')) # print('Test : ', feq_aa(X_test, plot=False, save=True, f_name='fig/histo_aa_test.png')) # # # # l1, c1 = intersection(X_train, X_validation) # l2, c2 = intersection(X_train, X_test) # l3, c3 = intersection(X_test, X_validation) # # print('\n Intersection checking') # print('\n Train x Validation : ', l1, ' intersection(s)') # print('Train x Test : ', l2, ' intersection(s)') # print('Test x validation : ', l3, ' intersection(s)') # # print('\n RT Variance') # print('\n Train : ', RT_variance(X_train, Y_train)) # print('\n Validation : ', RT_variance(X_validation, Y_validation)) # print('\n Test : ', RT_variance(X_test, Y_test)) # # RT_distrib(Y_train,'fig/histo_RT_train.png' ) # RT_distrib(Y_test,'fig/histo_RT_test.png' ) # RT_distrib(Y_validation,'fig/histo_RT_validation.png' ) # #