fix image ref creation

c766cc82 · Schneider Leo · ea776796 · c766cc82 · c766cc82 · c766cc82
Commit c766cc82 authored 3 months ago by Schneider Leo
--- a/image_processing/build_image.py
+++ b/image_processing/build_image.py
@@ -44,6 +44,7 @@ def build_image_ms1(path, bin_mz):
            break
    total_ms1_mz = ms1_end_mz - ms1_start_mz
+    print('start',ms1_start_mz,'end',ms1_end_mz)
    n_bin_ms1 = int(total_ms1_mz//bin_mz)
    size_bin_ms1 = total_ms1_mz / n_bin_ms1
    for spec in e:  # data structure

--- a/image_ref/analyse_diann_digestion.py
+++ b/image_ref/analyse_diann_digestion.py
+import pandas as pd
+import pyarrow.parquet as pq
+import matplotlib.pyplot as plt
+import numpy as np
+from matplotlib_venn import venn2
+def load_lib(path):
+    """Read the parquet file at ``path`` and return it as a pandas DataFrame."""
+    return pq.read_table(path).to_pandas()
+# NOTE(review): these statements run at import time, so any module that imports
+# load_lib from this file also loads both parquet files and opens the plot.
+# Consider moving them under an `if __name__ == '__main__':` guard.
+df1 = load_lib('fasta/steigerwaltii variants/uniparc_proteome_UP000033376_2025_03_14.predicted.parquet')
+df2 = load_lib('fasta/steigerwaltii variants/uniparc_proteome_UP000033499_2025_03_14.predicted.parquet')
+# Compare the two tables by the distinct values of their 'Stripped.Sequence' column.
+set1 = set(df1['Stripped.Sequence'].to_list())
+set2 = set(df2['Stripped.Sequence'].to_list())
+venn2((set1, set2), ('Group1', 'Group2'))
+# Save before showing: once the window opened by plt.show() is closed, the
+# figure is gone and a later savefig() would write an empty image.
+plt.savefig('fasta_similarity_diann.png')
+plt.show()
\ No newline at end of file
--- a/image_ref/utils.py
+++ b/image_ref/utils.py
@@ -2,8 +2,8 @@ import fastapy
 import matplotlib.pyplot as plt
 import numpy as np
 from matplotlib_venn import venn2
+from analyse_diann_digestion import load_lib
+import matplotlib.image as mpimg
 ALPHABET_UNMOD = {
@@ -109,9 +109,9 @@ def fasta_similarity(path_fasta_1, path_fasta_2):
    list_seq_1=[]
    list_seq_2 = []
    for record in fastapy.parse(path_fasta_1):
-        list_seq_1.append(record.seq)
+        list_seq_1.extend(digest(record.seq))
    for record in fastapy.parse(path_fasta_2):
-        list_seq_2.append(record.seq)
+        list_seq_2.extend(digest(record.seq))
    set1 = set(list_seq_1)
    set2 = set(list_seq_2)
@@ -143,12 +143,16 @@ def build_ref_image(path_fasta, possible_charge, ms1_end_mz, ms1_start_mz, bin_m
    #compute m/z ration
    mz_ratio={}
+    i=0
+    list_peptides = list(set(list_peptides))
    for seq in list_peptides:
        mz_ratio['seq']=[]
        for charge in possible_charge:
            ratio = compute_mass(seq,'avg')/charge
            if ms1_end_mz > ratio > ms1_start_mz:
                mz_ratio['seq'].append(ratio)
+                i+=1
+    print(i)
    #assocy predict rt
    data=[]
@@ -167,7 +171,30 @@ def build_ref_image(path_fasta, possible_charge, ms1_end_mz, ms1_start_mz, bin_m
    return im
+def build_ref_image_from_diann(path_parqet, ms1_end_mz, ms1_start_mz, bin_mz, max_cycle, rt_pred):
+    """Build a binary (RT x m/z) reference image from a parquet library.
+
+    The table is loaded with ``load_lib`` and reduced to its distinct
+    (sequence, charge, RT, precursor m/z) rows. Each row whose precursor m/z
+    lies inside the image's m/z range sets one pixel to 1.
+
+    Args:
+        path_parqet: path of the parquet file handed to ``load_lib``.
+        ms1_end_mz: upper m/z bound of the image.
+        ms1_start_mz: lower m/z bound of the image.
+        bin_mz: requested m/z bin width; the image has
+            ``int((ms1_end_mz - ms1_start_mz) // bin_mz)`` columns.
+        max_cycle: number of image rows; the RT range found in the table is
+            rescaled linearly onto the row indices.
+        rt_pred: not used by this function; kept so existing calls still work.
+
+    Returns:
+        A numpy array of shape ``(max_cycle, n_bin_ms1)`` containing 0 and 1.
+    """
+    df = load_lib(path_parqet)
+    df = df[['Stripped.Sequence', 'Precursor.Charge', 'RT', 'Precursor.Mz']]
+    df_unique = df.drop_duplicates()
+    # build image
+    total_ms1_mz = ms1_end_mz - ms1_start_mz
+    n_bin_ms1 = int(total_ms1_mz // bin_mz)
+    im = np.zeros([max_cycle, n_bin_ms1])
+    max_rt = np.max(df_unique['RT'])
+    min_rt = np.min(df_unique['RT'])
+    # The small epsilon keeps the row index of the largest RT below max_cycle.
+    total_rt = max_rt - min_rt + 1e-3
+    for rt, mz in zip(df_unique['RT'], df_unique['Precursor.Mz']):
+        # Test the m/z itself against the lower bound: int() truncates toward
+        # zero, so a value just below ms1_start_mz would otherwise land in bin 0.
+        if mz < ms1_start_mz:
+            continue
+        mz_bin = int((mz - ms1_start_mz) / total_ms1_mz * n_bin_ms1)
+        # Bound by the actual image width rather than a hard-coded 900, which
+        # was only correct for a 900-column image.
+        if mz_bin >= n_bin_ms1:
+            continue
+        rt_bin = int((rt - min_rt) / total_rt * max_cycle)
+        im[rt_bin, mz_bin] = 1
+    return im
 if __name__ == '__main__':
-    # fasta_similarity('fasta/uniprotkb_proteome_UP000742934_2025_03_12.fasta','fasta/uniprotkb_proteome_UP001182277_2025_03_12.fasta')
+    # fasta_similarity('fasta/uniparc_proteome_UP000033376_2025_03_14.fasta','fasta/uniparc_proteome_UP000033499_2025_03_14.fasta')
-    # mass = build_ref_image('fasta/uniprotkb_proteome_UP000742934_2025_03_12.fasta')
+    im = build_ref_image_from_diann('fasta/steigerwaltii variants/uniparc_proteome_UP000033376_2025_03_14.predicted.parquet', ms1_end_mz=1250, ms1_start_mz=350, bin_mz=1, max_cycle=663, rt_pred=[])
-    pass
+    plt.clf()
\ No newline at end of file
+    mpimg.imsave('test_img.png', im)