From ac0134505a3a5530d36aeb01ba221e3c4352a302 Mon Sep 17 00:00:00 2001
From: Schneider Leo <leo.schneider@etu.ec-lyon.fr>
Date: Fri, 20 Sep 2024 17:25:49 +0200
Subject: [PATCH] fix vocab + update local integration

---
 common_dataset.py         |  2 +-
 data_viz.py               |  6 ++--
 local_integration_msms.py | 70 ++++++++++++++++++++++++++++++++-------
 mzml_exploration.py       |  2 +-
 4 files changed, 63 insertions(+), 17 deletions(-)

diff --git a/common_dataset.py b/common_dataset.py
index 6e46845..5ab8a5f 100644
--- a/common_dataset.py
+++ b/common_dataset.py
@@ -95,7 +95,7 @@ def alphabetical_to_numerical(seq, vocab):
     else :
         for i in range(len(seq) - 2 * seq.count('-')):
             if seq[i + dec] != '-':
-                num.append(IUPAC_VOCAB[seq[i + dec]])
+                num.append(ALPHABET_UNMOD[seq[i + dec]])
             else:
                 if seq[i + dec + 1:i + dec + 4] == 'CaC':
                     num.append(21)
diff --git a/data_viz.py b/data_viz.py
index 23dec3e..ba16bc0 100644
--- a/data_viz.py
+++ b/data_viz.py
@@ -229,9 +229,9 @@ def add_length(dataframe):
     dataframe['length']=dataframe['seq'].map(fonc)
 
 
-df = pd.read_csv('output/out_ISA_no_tape.csv')
+df = pd.read_csv('output/out_prosit_common.csv')
 add_length(df)
 df['abs_error'] =  np.abs(df['rt pred']-df['true rt'])
-histo_abs_error(df, display=False, save=True, path='temp.png')
+# histo_abs_error(df, display=False, save=True, path='temp.png')
 # scatter_rt(df, display=False, save=True, path='temp.png')
-# histo_length_by_error(df, 10, save=True, path='temp.png')
\ No newline at end of file
+histo_length_by_error(df, 10, save=True, path='temp.png')
\ No newline at end of file
diff --git a/local_integration_msms.py b/local_integration_msms.py
index 7cc384b..984e4c2 100644
--- a/local_integration_msms.py
+++ b/local_integration_msms.py
@@ -1,6 +1,7 @@
 import pyopenms as oms
 import numpy as np
 import matplotlib.pyplot as plt
+import pandas as pd
 
 def compute_chromatograms(rt, mz, intensity, start_c, end_c):
     value=[]
@@ -12,29 +13,74 @@ def compute_chromatograms(rt, mz, intensity, start_c, end_c):
 
     return value
 
+def get_df(expe, long: bool = False):
+    """Generates a pandas DataFrame with all peaks in the MSExperiment
 
-if __name__ == "__main__":
-    e = oms.MSExperiment()
-    oms.MzMLFile().load("data/Staph140.mzML", e)
-    e.updateRanges()
+    Parameters:
+    expe: experiment to convert; it is iterated spectrum by spectrum when long is False
+    long: set to True if you want a long/expanded/melted dataframe with one row per peak (RT is repeated on every row).
+        If False, returns one row per spectrum: RT, mz array, intensity array, MS level, precursor m/z (None unless MS2)
+    Returns:
+    pandas.DataFrame: peak (long) or spectrum (wide) information stored in a DataFrame
+    """
+    if long:
+        cols = ["RT", "mz", "inty", 'MSlevel']  # TODO add MSlevel: zip() below stops at the shorter of cols / returned arrays
+        expe.updateRanges()
+        spectraarrs2d = expe.get2DPeakDataLong(expe.getMinRT(), expe.getMaxRT(), expe.getMinMZ(), expe.getMaxMZ())
+        return pd.DataFrame(dict(zip(cols, spectraarrs2d)))
+
+    cols = ["RT", "mzarray", "intarray", 'MSlevel','MS1 MZ']
+    # An MS2 spectrum with an empty precursor list gets None instead of raising IndexError on [0].
+    return pd.DataFrame(data=((spec.getRT(), *spec.get_peaks(), spec.getMSLevel(), spec.getPrecursors()[0].getMZ() if spec.getMSLevel() == 2 and spec.getPrecursors() else None) for spec in expe), columns=cols)
+
+def generate_RT_int_imgs(exp,star_mz,stop_mz):  # save one RT-vs-intensity PNG per m/z bin between star_mz and stop_mz
+    exp.updateRanges()
     rt = []
-    charge = []
+    mz = []  # first array of get_peaks() for each MS1 spectrum
     intensity = []
-    for s in e :
+    for s in exp :
         if s.getMSLevel() == 1:
             rt.append(s.getRT())
-            charge.append(s.get_peaks()[0])
+            mz.append(s.get_peaks()[0])
             intensity.append(s.get_peaks()[1])
-    mz_range = np.linspace(350,1250,4000)
+    mz_range = np.linspace(star_mz,stop_mz,1000)  # 1000 edges -> 999 consecutive bins
     for i in range(len(mz_range)-1):
-        print(mz_range[i],'/1250')
-        val = compute_chromatograms(rt, charge, intensity, mz_range[i] ,mz_range[i+1])
+        print(mz_range[i],'/{}'.format(stop_mz))  # progress: current bin start / upper bound
+        val = compute_chromatograms(rt, mz, intensity, mz_range[i] ,mz_range[i+1])
         fig, ax = plt.subplots()
-        ax.plot(val)
+        ax.plot(rt,val)
         ax.set_xlabel('Retention time')
         ax.set_ylabel('Intensity')
         ax.set_title('mz : {} to {}'.format(mz_range[i] ,mz_range[i+1]))
         plt.savefig('fig/rt_local/{}_to_{}.png'.format(mz_range[i] ,mz_range[i+1]))
-        plt.clf()
+        plt.close(fig)  # clf() only clears the figure; close it so one figure is not kept alive per bin
-df = e.get_df()
+
+def integrate_ms_ms(time_start, time_end, df):
+    """Return the MS2 rows of df whose 'MS1 RT' lies strictly between time_start and time_end, re-indexed from 0."""
+    # NOTE(review): get_df() names its retention-time column 'RT', not 'MS1 RT' -- confirm which frame is passed in.
+    in_window = (df['MS1 RT'] > time_start) & (df['MS1 RT'] < time_end)
+    # reset_index(inplace=True) returns None, so keep the re-indexed copy instead of assigning that result.
+    df_useful = df[in_window & (df['MSlevel'] == 2)].reset_index(drop=True)
+    return df_useful  # TODO: the integration itself is not written yet; the previous `return value` was an undefined name
+
+if __name__ == "__main__":
+    e = oms.MSExperiment()
+    oms.MzMLFile().load("data/Staph140.mzML", e)
+    # generate_RT_int_imgs(e, 350, 1250)
+
+    # Save one intensity-vs-m/z plot per MS1 spectrum, named after its retention time.
+    df = get_df(e)
+    df1 = df[df['MSlevel'] == 1].reset_index(drop=True)
+    for i in df1.index:
+        spectrum_rt = df1['RT'][i]
+        fig, ax = plt.subplots()
+        ax.plot(df1['mzarray'][i], df1['intarray'][i], linewidth=0.1)
+        # the x axis is restricted to the 350-750 window
+        ax.set(xlabel='mz', xlim=(350, 750), ylabel='Intensity', title='RT : {}'.format(spectrum_rt))
+        plt.savefig('fig/rt_local/RT{}.png'.format(spectrum_rt))
+        # release the figure before the next spectrum is drawn
+        plt.close()
+
+
+
 #358.1 358.32
\ No newline at end of file
diff --git a/mzml_exploration.py b/mzml_exploration.py
index 7e63464..15d80c2 100644
--- a/mzml_exploration.py
+++ b/mzml_exploration.py
@@ -301,5 +301,5 @@ if __name__ == "__main__":
     # b = [np.pad(array, (0, max_len - len(array)), mode='constant', constant_values=default_value) for array in res]
 
 
-
+s = oms.MSSpectrum
 
-- 
GitLab