From ac0134505a3a5530d36aeb01ba221e3c4352a302 Mon Sep 17 00:00:00 2001 From: Schneider Leo <leo.schneider@etu.ec-lyon.fr> Date: Fri, 20 Sep 2024 17:25:49 +0200 Subject: [PATCH] fix vocab + update local integration --- common_dataset.py | 2 +- data_viz.py | 6 ++-- local_integration_msms.py | 70 ++++++++++++++++++++++++++++++++------- mzml_exploration.py | 2 +- 4 files changed, 63 insertions(+), 17 deletions(-) diff --git a/common_dataset.py b/common_dataset.py index 6e46845..5ab8a5f 100644 --- a/common_dataset.py +++ b/common_dataset.py @@ -95,7 +95,7 @@ def alphabetical_to_numerical(seq, vocab): else : for i in range(len(seq) - 2 * seq.count('-')): if seq[i + dec] != '-': - num.append(IUPAC_VOCAB[seq[i + dec]]) + num.append(ALPHABET_UNMOD[seq[i + dec]]) else: if seq[i + dec + 1:i + dec + 4] == 'CaC': num.append(21) diff --git a/data_viz.py b/data_viz.py index 23dec3e..ba16bc0 100644 --- a/data_viz.py +++ b/data_viz.py @@ -229,9 +229,9 @@ def add_length(dataframe): dataframe['length']=dataframe['seq'].map(fonc) -df = pd.read_csv('output/out_ISA_no_tape.csv') +df = pd.read_csv('output/out_prosit_common.csv') add_length(df) df['abs_error'] = np.abs(df['rt pred']-df['true rt']) -histo_abs_error(df, display=False, save=True, path='temp.png') +# histo_abs_error(df, display=False, save=True, path='temp.png') # scatter_rt(df, display=False, save=True, path='temp.png') -# histo_length_by_error(df, 10, save=True, path='temp.png') \ No newline at end of file +histo_length_by_error(df, 10, save=True, path='temp.png') \ No newline at end of file diff --git a/local_integration_msms.py b/local_integration_msms.py index 7cc384b..984e4c2 100644 --- a/local_integration_msms.py +++ b/local_integration_msms.py @@ -1,6 +1,7 @@ import pyopenms as oms import numpy as np import matplotlib.pyplot as plt +import pandas as pd def compute_chromatograms(rt, mz, intensity, start_c, end_c): value=[] @@ -12,29 +13,74 @@ def compute_chromatograms(rt, mz, intensity, start_c, end_c): return value +def get_df(expe, long: bool = False): + """Generates a pandas DataFrame with all peaks in the MSExperiment -if __name__ == "__main__": - e = oms.MSExperiment() - oms.MzMLFile().load("data/Staph140.mzML", e) - e.updateRanges() + Parameters: + long: set to True if you want to have a long/expanded/melted dataframe with one row per peak. Faster but + replicated RT information. If False, returns rows in the style: rt, _np.array(mz), _np.array(int) + + Returns: + pandas.DataFrame: feature information stored in a DataFrame + """ + if long: + cols = ["RT", "mz", "inty", 'MSlevel'] + expe.updateRanges() + spectraarrs2d = expe.get2DPeakDataLong(expe.getMinRT(), expe.getMaxRT(), expe.getMinMZ(), expe.getMaxMZ()) + return pd.DataFrame(dict(zip(cols, spectraarrs2d))) #TODO ajouter MSlevel + + cols = ["RT", "mzarray", "intarray", 'MSlevel','MS1 MZ'] + + return pd.DataFrame(data=((spec.getRT(), *spec.get_peaks(), spec.getMSLevel(), spec.getPrecursors()[0].getMZ() if spec.getMSLevel() ==2 else None) for spec in expe), columns=cols) + +def generate_RT_int_imgs(exp,star_mz,stop_mz): + exp.updateRanges() rt = [] - charge = [] + mz = [] intensity = [] - for s in e : + for s in exp : if s.getMSLevel() == 1: rt.append(s.getRT()) - charge.append(s.get_peaks()[0]) + mz.append(s.get_peaks()[0]) intensity.append(s.get_peaks()[1]) - mz_range = np.linspace(350,1250,4000) + mz_range = np.linspace(star_mz,stop_mz,1000) for i in range(len(mz_range)-1): - print(mz_range[i],'/1250') - val = compute_chromatograms(rt, charge, intensity, mz_range[i] ,mz_range[i+1]) + print(mz_range[i],'/{}'.format(stop_mz)) + val = compute_chromatograms(rt, mz, intensity, mz_range[i] ,mz_range[i+1]) fig, ax = plt.subplots() - ax.plot(val) + ax.plot(rt,val) ax.set_xlabel('Retention time') ax.set_ylabel('Intensity') ax.set_title('mz : {} to {}'.format(mz_range[i] ,mz_range[i+1])) plt.savefig('fig/rt_local/{}_to_{}.png'.format(mz_range[i] ,mz_range[i+1])) plt.clf() -df = e.get_df() + +def integrate_ms_ms(time_start, time_end, df): + df_useful = df[(df['MS1 RT']>time_start) & (df['MS1 RT']<time_end) & (df['MSlevel']==2)].reset_index(inplace=True) + + + + + return value + +if __name__ == "__main__": + e = oms.MSExperiment() + oms.MzMLFile().load("data/Staph140.mzML", e) + # generate_RT_int_imgs(e, 350, 1250) + + df = get_df(e) + df1 = df[df['MSlevel'] == 1] + df1.reset_index(inplace=True, drop=True) + for i in range(len(df1)): + fig, ax = plt.subplots() + ax.plot(df1['mzarray'][i], df1['intarray'][i],linewidth=0.1) + ax.set_xlabel('mz') + ax.set_xlim(350,750) + ax.set_ylabel('Intensity') + ax.set_title('RT : {}'.format(df1['RT'][i])) + plt.savefig('fig/rt_local/RT{}.png'.format(df1['RT'][i])) + plt.close() + + + #358.1 358.32 \ No newline at end of file diff --git a/mzml_exploration.py b/mzml_exploration.py index 7e63464..15d80c2 100644 --- a/mzml_exploration.py +++ b/mzml_exploration.py @@ -301,5 +301,5 @@ if __name__ == "__main__": # b = [np.pad(array, (0, max_len - len(array)), mode='constant', constant_values=default_value) for array in res] - +s = oms.MSSpectrum -- GitLab