# Schneider Leo authored 3da55a91
# msms_processing.py (3.41 KiB)
import pandas as pd
import numpy as np
import random
from constant import ALPHABET_UNMOD, ALPHABET_UNMOD_REV
def load_data(msms_filet_path='data/msms_plasma.txt', score_treshold=70):
    """Load a tab-separated MS/MS table and return its filtered peptide rows.

    A row is kept when its ``Score`` is strictly greater than
    ``score_treshold`` and its ``Length`` is strictly below 26.  The first
    and last character of every ``Modified sequence`` are removed
    (presumably the ``_`` delimiters written by MaxQuant -- confirm against
    the input file) and the remaining text is passed through
    ``convert_mod_to_prosit``.

    Only the columns ``Modified sequence``, ``Length``, ``Retention time``
    and ``Score`` have to be present in the file.

    Args:
        msms_filet_path: Path of the tab-separated input file.
        score_treshold: Exclusive lower bound applied to the ``Score`` column.

    Returns:
        A new DataFrame with the columns ``sequence``, ``irt_scaled`` and
        ``score``.  The row labels are those of the surviving rows in the
        input file (the index is not reset).
    """
    data = pd.read_csv(msms_filet_path, sep='\t')

    keep = (data['Score'] > score_treshold) & (data['Length'] < 26)
    # .copy() detaches the selection from `data`, so the column assignment
    # below writes to an independent frame rather than to a slice (which is
    # what raises SettingWithCopyWarning on pandas < 3).
    data_filtered = data.loc[
        keep, ['Modified sequence', 'Retention time', 'Score']
    ].copy()

    data_filtered['Modified sequence'] = data_filtered['Modified sequence'].map(
        lambda seq: convert_mod_to_prosit(seq[1:-1])
    )

    return data_filtered.rename(
        columns={
            'Modified sequence': 'sequence',
            'Retention time': 'irt_scaled',
            'Score': 'score',
        }
    )
def convert(l):
    """Parse a whitespace-separated string of numbers into a list of floats.

    Square brackets are stripped from every token.  A token is accepted only
    if, after additionally removing every ``.`` and every ``e+``, nothing but
    digits remains; all other tokens (words, negative numbers, ``e-``
    exponents, ...) are skipped silently.

    Args:
        l: The string to parse, e.g. ``'[1.0 2.5 3e+2]'``.

    Returns:
        The accepted tokens converted with ``float``, in input order.

    Raises:
        ValueError: If a token passes the digit check but is still not a
            valid float literal (for example ``'1.2.3'``).
    """
    numbers = []
    for token in l.split():
        unbracketed = token.replace('[', '').replace(']', '')
        # Digits-only probe: dots and positive-exponent markers are allowed
        # in the literal, so they are removed before the isdigit() check.
        probe = unbracketed.replace('.', '').replace('e+', '')
        if probe.isdigit():
            numbers.append(float(unbracketed))
    return numbers
def convert_mod_to_prosit(s):
    """Rewrite the modification tags of a peptide string.

    ``'M(ox)'`` becomes ``'-OxM-'`` and ``'(ac)'`` becomes ``'@'``; every
    other character is left untouched.

    Args:
        s: Peptide sequence string containing the source modification tags.

    Returns:
        The string with both substitutions applied.
    """
    # Applied in this order: oxidised methionine first, then acetylation.
    substitutions = (
        ('M(ox)', '-OxM-'),
        ('(ac)', '@'),
    )
    converted = s
    for source_tag, target_tag in substitutions:
        converted = converted.replace(source_tag, target_tag)
    return converted
def numerical_to_alphabetical(arr):
    """Translate a sequence of numeric codes into the string they encode.

    Every element ``arr[0] .. arr[len(arr) - 1]`` is looked up in
    ``ALPHABET_UNMOD_REV`` and the results are concatenated in order.

    Args:
        arr: Indexable, sized collection of keys of ``ALPHABET_UNMOD_REV``.

    Returns:
        The concatenated string (``''`` for an empty ``arr``).
    """
    # Elements are fetched by position-style indexing, exactly as before,
    # so containers whose iteration differs from indexing behave the same.
    return ''.join(ALPHABET_UNMOD_REV[arr[pos]] for pos in range(len(arr)))
def filter_acetyl(df):
    """Return the rows of ``df`` whose ``sequence`` does not contain ``'@'``.

    The input frame is not modified.  A column named ``acetyl`` is never
    part of the result.

    Args:
        df: DataFrame with a ``sequence`` column of strings.

    Returns:
        A new DataFrame holding the remaining rows with a fresh 0..n-1 index.
    """
    has_acetyl = df['sequence'].map(lambda seq: '@' in seq)
    # astype(bool) keeps the mask boolean even when the frame is empty.
    keep = ~has_acetyl.astype(bool)
    remaining = df.drop(columns='acetyl', errors='ignore')[keep]
    return remaining.reset_index(drop=True)
def filter_cysteine(df):
    """Return the rows of ``df`` whose ``sequence`` does not contain ``'C'``.

    The input frame is not modified.  A column named ``cysteine`` is never
    part of the result.

    Args:
        df: DataFrame with a ``sequence`` column of strings.

    Returns:
        A new DataFrame holding the remaining rows with a fresh 0..n-1 index.
    """
    has_cysteine = df['sequence'].map(lambda seq: 'C' in seq)
    # astype(bool) keeps the mask boolean even when the frame is empty.
    keep = ~has_cysteine.astype(bool)
    remaining = df.drop(columns='cysteine', errors='ignore')[keep]
    return remaining.reset_index(drop=True)
def add_split_column(data, split=(0.7,0.15,0.15), seed=None):
    """Label every row as train / validation / holdout, keeping sequences whole.

    Rows are grouped by ``sequence`` and the groups are shuffled.  Walking the
    shuffled groups, a running row count decides the label of each group:
    ``'train'`` while the count stays below ``floor(n * split[0])``,
    ``'validation'`` while it stays below ``floor(n * (split[0] + split[1]))``
    and ``'holdout'`` afterwards, where ``n`` is the number of rows of
    ``data``.  All rows sharing a sequence therefore receive the same label.

    Partitions that end up without any group are simply absent from the
    result; an input without groups yields an empty frame that still has the
    ``state`` column.

    Args:
        data: DataFrame with a ``sequence`` column.  It is not modified.
        split: Fractions ``(train, validation, holdout)``.  Only the first two
            are read; the holdout partition receives whatever remains.
        seed: Optional seed for a private shuffle.  With ``None`` the
            module-level ``random`` generator is used, so ``random.seed()``
            set by the caller still controls the result.

    Returns:
        A new DataFrame with a fresh 0..n-1 index and an added ``state``
        column, ordered train rows first, then validation, then holdout.
    """
    total_rows = data.shape[0]
    train_limit = np.floor(total_rows * split[0])
    val_limit = np.floor(total_rows * (split[0] + split[1]))

    groups = [gr for _, gr in data.groupby('sequence')]
    if not groups:
        # Nothing to split; pd.concat would raise on an empty list.
        empty = data.iloc[0:0].assign(state=pd.Series([], dtype=object))
        return empty.reset_index(drop=True)

    if seed is None:
        random.shuffle(groups)
    else:
        random.Random(seed).shuffle(groups)

    # The running count only grows, so the labels come out as a block of
    # 'train', then 'validation', then 'holdout'; one concat of the labelled
    # groups therefore gives the partition-ordered frame directly.
    labelled = []
    seen_rows = 0
    for gr in groups:
        seen_rows += gr.shape[0]
        if seen_rows < train_limit:
            state = 'train'
        elif seen_rows < val_limit:
            state = 'validation'
        else:
            state = 'holdout'
        labelled.append(gr.assign(state=state))

    return pd.concat(labelled, ignore_index=True)
def main():
    """Build the plasma dataset CSV from the raw MS/MS table.

    Loads the plasma table with a score threshold of 70, assigns the
    train/validation/holdout labels, removes sequences containing ``'@'``
    and writes the result without the index column.  The split labels are
    assigned before that removal, so the partition sizes refer to the
    unfiltered rows.
    """
    source_path = 'data_PXD006109/msms_plasma.txt'
    target_path = 'data_PXD006109/data_plasma.csv'

    plasma = load_data(source_path, 70)
    # Single-element concat: yields a frame with a fresh 0..n-1 index.
    combined = pd.concat([plasma], ignore_index=True)
    labelled = add_split_column(combined)
    filter_acetyl(labelled).to_csv(target_path, index=False)


# Run the pipeline only when executed as a script, not on import.
if __name__ == '__main__':
    main()