def load_lib(path):
    """Read a parquet file and return its content as a pandas DataFrame.

    Args:
        path: Location of the parquet file, passed unchanged to
            ``pyarrow.parquet.read_table``.

    Returns:
        The table converted with ``Table.to_pandas()``.
    """
    return pq.read_table(path).to_pandas()


def main():
    """Plot and save a Venn diagram of the peptide sequences of two libraries.

    The two input paths and the output file name are fixed; the compared
    values are the entries of the ``Stripped.Sequence`` column.
    """
    df1 = load_lib('fasta/steigerwaltii variants/uniparc_proteome_UP000033376_2025_03_14.predicted.parquet')
    df2 = load_lib('fasta/steigerwaltii variants/uniparc_proteome_UP000033499_2025_03_14.predicted.parquet')

    set1 = set(df1['Stripped.Sequence'].to_list())
    set2 = set(df2['Stripped.Sequence'].to_list())

    venn2((set1, set2), ('Group1', 'Group2'))
    # Save BEFORE show(): once a blocking show() returns, the figure has been
    # closed, so a later savefig() would write an empty image.
    plt.savefig('fasta_similarity_diann.png')
    plt.show()


# Other modules import ``load_lib`` from this file; the guard keeps that import
# from loading the libraries and opening a plot window as a side effect.
if __name__ == '__main__':
    main()
def build_ref_image_from_diann(path_parqet, ms1_end_mz, ms1_start_mz, bin_mz, max_cycle, rt_pred):
    """Rasterise the precursors of a library into a binary RT x m/z image.

    Every distinct (sequence, charge, RT, m/z) row sets one pixel to 1. The
    row index is the retention time rescaled linearly from the library's
    [min RT, max RT] range onto ``max_cycle`` rows; the column index is the
    m/z bin inside [``ms1_start_mz``, ``ms1_end_mz``).

    Args:
        path_parqet: Path of the parquet library (read with ``load_lib``), or
            an already loaded DataFrame holding the columns
            ``Stripped.Sequence``, ``Precursor.Charge``, ``RT`` and
            ``Precursor.Mz``.
        ms1_end_mz: Upper m/z limit of the image (exclusive).
        ms1_start_mz: Lower m/z limit of the image (inclusive).
        bin_mz: Requested bin width; the number of columns is
            ``int((ms1_end_mz - ms1_start_mz) // bin_mz)``.
        max_cycle: Number of rows of the image.
        rt_pred: Currently unused; kept so existing callers keep working.

    Returns:
        A ``numpy`` array of shape ``(max_cycle, n_bin)`` filled with 0 and 1.
        Precursors whose m/z falls outside the window, or whose RT or m/z is
        missing, are ignored.
    """
    # Anything exposing ``columns`` is treated as an already-loaded table;
    # everything else (str, Path, file object) is handed to the reader.
    if hasattr(path_parqet, 'columns'):
        df = path_parqet
    else:
        df = load_lib(path_parqet)
    df = df[['Stripped.Sequence', 'Precursor.Charge', 'RT', 'Precursor.Mz']]
    df_unique = df.drop_duplicates()

    # build image
    total_ms1_mz = ms1_end_mz - ms1_start_mz
    n_bin_ms1 = int(total_ms1_mz // bin_mz)
    im = np.zeros([max_cycle, n_bin_ms1])

    rt = df_unique['RT'].to_numpy(dtype=float)
    mz = df_unique['Precursor.Mz'].to_numpy(dtype=float)

    known_rt = rt[~np.isnan(rt)]
    if known_rt.size == 0:
        # Empty library (or no usable RT): nothing to draw.
        return im

    min_rt = known_rt.min()
    # The small epsilon keeps the latest RT strictly below row ``max_cycle``.
    total_rt = known_rt.max() - min_rt + 1e-3

    # floor (not int truncation) so values just below the window get a
    # negative bin and are rejected instead of being folded into bin 0.
    mz_bin = np.floor((mz - ms1_start_mz) / total_ms1_mz * n_bin_ms1)
    # Bound by the real number of columns rather than a fixed 900.
    in_window = (mz_bin >= 0) & (mz_bin < n_bin_ms1) & ~np.isnan(rt)

    rows = ((rt[in_window] - min_rt) / total_rt * max_cycle).astype(int)
    cols = mz_bin[in_window].astype(int)
    im[rows, cols] = 1

    return im


if __name__ == '__main__':
    # fasta_similarity('fasta/uniparc_proteome_UP000033376_2025_03_14.fasta','fasta/uniparc_proteome_UP000033499_2025_03_14.fasta')
    im = build_ref_image_from_diann('fasta/steigerwaltii variants/uniparc_proteome_UP000033376_2025_03_14.predicted.parquet', ms1_end_mz=1250, ms1_start_mz=350, bin_mz=1, max_cycle=663, rt_pred=[])
    plt.clf()
    mpimg.imsave('test_img.png', im)