"""Build a train/validation/holdout retention-time table from an msms export.

Run as a script, the module reads ``data_PXD006109/msms_plasma.txt``,
filters and renames its columns, assigns every row to a split, drops the
sequences carrying the ``@`` marker and writes the result to
``data_PXD006109/data_plasma.csv``.
"""

import random

import numpy as np
import pandas as pd

from constant import ALPHABET_UNMOD, ALPHABET_UNMOD_REV


def load_data(msms_filet_path='data/msms_plasma.txt', score_treshold=70,
              length_limit=26):
    """Load a tab-separated msms table and keep the high-scoring short rows.

    Only the columns that are actually used ('Modified sequence', 'Length',
    'Retention time', 'Score') have to be present in the file.

    Args:
        msms_filet_path: Path of the tab-separated input file.
        score_treshold: Rows are kept only if 'Score' is strictly greater.
        length_limit: Rows are kept only if 'Length' is strictly smaller.

    Returns:
        A new DataFrame with the columns 'sequence', 'irt_scaled' (the
        'Retention time' column, renamed but not rescaled here) and 'score'.
        The index still holds the row positions of the input file.
    """
    data = pd.read_csv(msms_filet_path, sep='\t')

    keep = (data['Score'] > score_treshold) & (data['Length'] < length_limit)
    # Explicit copy so the assignment below never writes into a view of `data`.
    filtered = data.loc[
        keep, ['Modified sequence', 'Retention time', 'Score']
    ].copy()

    # The first and last characters are dropped before converting the
    # modification tags (presumably the delimiters wrapped around each
    # modified sequence -- confirm against the input format).
    filtered['Modified sequence'] = filtered['Modified sequence'].map(
        lambda seq: convert_mod_to_prosit(seq[1:-1])
    )

    return filtered.rename(columns={
        'Modified sequence': 'sequence',
        'Retention time': 'irt_scaled',
        'Score': 'score',
    })


def convert(l):
    """Parse a whitespace-separated, optionally bracketed list of numbers.

    Square brackets are ignored.  A token is converted with ``float`` only
    when, after removing '.', '[', ']' and 'e+', nothing but digits is left;
    every other token (negative values, 'e-' exponents, words, ...) is
    silently skipped.

    Args:
        l: The string to parse, e.g. ``'[1.5 2e+3 abc]'``.

    Returns:
        The accepted tokens as a list of floats, in input order.
    """
    values = []
    for token in l.split():
        number = token.replace('[', '').replace(']', '')
        if number.replace('.', '').replace('e+', '').isdigit():
            values.append(float(number))
    return values


def convert_mod_to_prosit(s):
    """Rewrite modification tags: 'M(ox)' becomes '-OxM-' and '(ac)' '@'."""
    return s.replace('M(ox)', '-OxM-').replace('(ac)', '@')


def numerical_to_alphabetical(arr):
    """Decode a sequence of codes into a string via ``ALPHABET_UNMOD_REV``.

    Args:
        arr: Iterable of keys of ``ALPHABET_UNMOD_REV`` (defined in
            ``constant``; presumably integer codes mapped to residue
            letters -- confirm there).

    Returns:
        The concatenation of the looked-up values.
    """
    return ''.join(ALPHABET_UNMOD_REV[code] for code in arr)


def _drop_sequences_containing(df, marker):
    """Return the rows of *df* whose 'sequence' does not contain *marker*."""
    # astype(bool) keeps the mask boolean even when `df` has no rows.
    keep = df['sequence'].map(lambda seq: marker not in seq).astype(bool)
    return df.loc[keep].reset_index(drop=True)


def filter_acetyl(df):
    """Return a copy of *df* without the sequences that contain '@'.

    '@' is what ``convert_mod_to_prosit`` substitutes for '(ac)'.  The input
    frame is left untouched and the result has a fresh 0..n-1 index.
    """
    return _drop_sequences_containing(df, '@')


def filter_cysteine(df):
    """Return a copy of *df* without the sequences that contain 'C'.

    The input frame is left untouched and the result has a fresh 0..n-1
    index.
    """
    return _drop_sequences_containing(df, 'C')


def add_split_column(data, split=(0.7, 0.15, 0.15), seed=None):
    """Label every row as 'train', 'validation' or 'holdout'.

    Rows are grouped by 'sequence' and the groups are shuffled, so all rows
    sharing a sequence always land in the same split.  Groups are then
    consumed in shuffled order while a running row count is compared with
    the thresholds derived from *split*; the count already includes the
    current group, so a group that reaches a threshold goes to the next
    split.

    Args:
        data: DataFrame with a 'sequence' column.
        split: Fractions for train and validation (``split[0]`` and
            ``split[1]``).  Holdout receives whatever remains; ``split[2]``
            is not read.
        seed: Optional seed for a private shuffle.  With the default
            ``None`` the module-level ``random`` state is used.

    Returns:
        A new DataFrame with a fresh index and an added 'state' column,
        ordered train, validation, holdout.  A split that receives no group
        is simply absent, and input without any group yields an empty frame
        that still has the 'state' column.
    """
    total_rows = data.shape[0]
    train_limit = np.floor(total_rows * split[0])
    val_limit = np.floor(total_rows * (split[0] + split[1]))

    groups = [group for _, group in data.groupby('sequence')]
    if seed is None:
        random.shuffle(groups)
    else:
        random.Random(seed).shuffle(groups)

    # The running count only grows, so the labels come out already ordered
    # train -> validation -> holdout and a single concat is enough.  That
    # also avoids pd.concat() failing on a split that received no group.
    labelled = []
    rows_seen = 0
    for group in groups:
        rows_seen += group.shape[0]
        if rows_seen < train_limit:
            state = 'train'
        elif rows_seen < val_limit:
            state = 'validation'
        else:
            state = 'holdout'
        labelled.append(group.assign(state=state))

    if not labelled:
        empty = data.iloc[:0].assign(state=pd.Series(dtype=object))
        return empty.reset_index(drop=True)
    return pd.concat(labelled, ignore_index=True)


def main():
    """Build ``data_PXD006109/data_plasma.csv`` from the plasma msms file."""
    df_plasma = load_data('data_PXD006109/msms_plasma.txt', 70)
    merged_df = pd.concat([df_plasma], ignore_index=True)
    # NOTE: the split is assigned before the '@' sequences are removed, so
    # the split thresholds are computed on the unfiltered row count.
    final_df = add_split_column(merged_df)
    final_df = filter_acetyl(final_df)
    final_df.to_csv('data_PXD006109/data_plasma.csv', index=False)


if __name__ == '__main__':
    main()