diff --git a/data/msp_file_extraction.py b/data/msp_file_extraction.py
index 46188a2d71f6a333c6e8fb79ca3fd8123936cedf..ebd4b9ded7b0682e346372e183ea1a11d54a87d1 100644
--- a/data/msp_file_extraction.py
+++ b/data/msp_file_extraction.py
@@ -1,5 +1,43 @@
 import pandas as pd
+from sympy import false
+ALPHABET_UNMOD = {
+    "": 0,
+    "A": 1,
+    "C": 2,
+    "D": 3,
+    "E": 4,
+    "F": 5,
+    "G": 6,
+    "H": 7,
+    "I": 8,
+    "K": 9,
+    "L": 10,
+    "M": 11,
+    "N": 12,
+    "P": 13,
+    "Q": 14,
+    "R": 15,
+    "S": 16,
+    "T": 17,
+    "V": 18,
+    "W": 19,
+    "Y": 20,
+    "M(UniMod:35)": 21,
+    "CaC": 22
+}
+
+ALPHABET_UNMOD_REV = {v: k for k, v in ALPHABET_UNMOD.items()}
+
+def numerical_to_alphabetical_str(s):
+    seq = ''
+    s = s.replace('[','')
+    s = s.replace(']', '')
+    arr = s.split(',')
+    arr = list(map(int, arr))
+    for i in range(len(arr)):
+        seq+=ALPHABET_UNMOD_REV[arr[i]]
+    return seq
 
 if __name__ == '__main__':
     seq=[]
@@ -8,12 +46,17 @@ if __name__ == '__main__':
     file.close()
     remove = False
     index_to_remove=[]
+    updated_content=[]
+    ind=0
+    predicted_lib=pd.read_csv('../output/out_lib_oktoberfest.csv')
+    pred_rt=predicted_lib['rt pred']
+
     for i in range(len(content)) :
         if remove:
             if 'Name:' in content[i]:
                 remove = False
             else :
-                index_to_remove.append(i)
+                pass
 
         if 'Name:'in content[i]:
             s=content[i].split(':')[1].split('/')[0]
@@ -27,4 +70,16 @@ if __name__ == '__main__':
     df['state'] = 'holdout'
 
     df.to_csv('spectral_lib/df_predicted_library_oktoberfest.csv',index=False)
+    updated_content=[]
+    ind=0
+    predicted_lib=pd.read_csv('../output/out_lib_oktoberfest.csv')
+
+    predicted_lib['seq'] = predicted_lib['seq'].map(numerical_to_alphabetical_str)
+
+    predicted_lib['sequence']=predicted_lib['seq']
+    pred_rt=predicted_lib['rt pred']
+
+    df_joined = pd.merge(df,predicted_lib[['rt pred','sequence']],on='sequence',how='left')
+
+    # 1787661 with C, 15104040 without
\ No newline at end of file
diff --git a/diann_lib_processing.py b/diann_lib_processing.py
index 90ef00290874e107ea563fa4627f2280dfa263c2..2c0ff1f32540dd624a9c2cc4db9c764a8fdf5f53 100644
--- a/diann_lib_processing.py
+++ b/diann_lib_processing.py
@@ -136,8 +136,7 @@ if __name__ =='__main__':
 
     model.load_state_dict(torch.load(args.model_weigh, weights_only=True))
 
-    print(args.dataset_test)
-    data_test = load_data(data_source='data/spectral_lib/data_uniprot_base.csv', batch_size=args.batch_size, length=30, mode=args.split_test,
+    data_test = load_data(data_source=args.dataset_test, batch_size=args.batch_size, length=30, mode=args.split_test,
                           seq_col=args.seq_test)
     predict(data_test, model, args.output)
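
Illustration (not part of the patch): a minimal sketch of what the new helper and merge in msp_file_extraction.py do, using toy data and only a slice of the real ALPHABET_UNMOD. The column names 'seq', 'rt pred', and 'sequence' are taken from the diff; the toy values are made up for demonstration.

import pandas as pd

# Reduced mapping: only a few residues shown here; the patched file covers
# all 20 amino acids plus the modified entries.
ALPHABET_UNMOD = {"": 0, "A": 1, "C": 2, "D": 3, "E": 4}
ALPHABET_UNMOD_REV = {v: k for k, v in ALPHABET_UNMOD.items()}

def numerical_to_alphabetical_str(s):
    # Decode a string like '[1, 2, 3]' back into a residue sequence.
    tokens = s.replace('[', '').replace(']', '').split(',')
    return ''.join(ALPHABET_UNMOD_REV[int(t)] for t in tokens)

# Toy stand-ins for the two DataFrames the patch joins: 'df' built from the
# .msp file and 'predicted_lib' read from out_lib_oktoberfest.csv.
df = pd.DataFrame({'sequence': ['ACD', 'ADE']})
predicted_lib = pd.DataFrame({'seq': ['[1, 2, 3]', '[1, 3, 4]'],
                              'rt pred': [12.3, 45.6]})

# Decode the integer-encoded sequences, then attach predicted retention
# times to the library rows by matching on 'sequence'.
predicted_lib['sequence'] = predicted_lib['seq'].map(numerical_to_alphabetical_str)
df_joined = pd.merge(df, predicted_lib[['rt pred', 'sequence']],
                     on='sequence', how='left')
print(df_joined)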