diff --git a/image_ref/utils.py b/image_ref/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..e22fccbf3bb0ecfc4a94c293eda01f9d8e7b964a --- /dev/null +++ b/image_ref/utils.py @@ -0,0 +1,173 @@ +import fastapy +import matplotlib.pyplot as plt +import numpy as np +from matplotlib_venn import venn2 + + + + +ALPHABET_UNMOD = { + "A": 1, + "C": 2, + "D": 3, + "E": 4, + "F": 5, + "G": 6, + "H": 7, + "I": 8, + "K": 9, + "L": 10, + "M": 11, + "N": 12, + "P": 13, + "Q": 14, + "R": 15, + "S": 16, + "T": 17, + "V": 18, + "W": 19, + "Y": 20, +} + + +MASSES_MONO = { + "A": 71.03711, + "C": 103.00919, + "D": 115.02694, + "E": 129.04259, + "F": 147.06841, + "G": 57.02146, + "H": 137.05891, + "I": 113.08406, + "K": 128.09496, + "L": 113.08406, + "M": 131.04049, + "N": 114.04293, + "P": 97.05276, + "Q": 128.05858, + "R": 156.1875, + "S": 87.03203, + "T": 101.04768, + "V": 99.06841, + "W": 186.07931, + "Y": 163.06333, +} + + +MASSES_AVG = { + "A": 71.0788, + "C": 103.1388, + "D": 115.0886, + "E": 129.1155, + "F": 147.1766, + "G": 57.0519, + "H": 137.1411, + "I": 113.1594, + "K": 128.1741, + "L": 113.1594, + "M": 131.1926, + "N": 114.1038, + "P": 97.1167, + "Q": 128.1307, + "R": 156.1875, + "S": 87.0782, + "T": 101.1051, + "V": 99.1326, + "W": 186.2132, + "Y": 163.1760, +} + +# trypsin cut after K or R (if not followed by P) + +def cut(seq): + cuts = [] + l = len(seq) + for i in range(l): + if seq[i] == 'R' or seq[i] == 'K': + if i < l - 1 and seq[i + 1] != 'P': + cuts.append(i + 1) + return cuts + + +def cut_with_ind(seq, ind_list): + l = [] + size = len(seq) + ind_list.append(size) + + for i in range(len(ind_list) - 1): + if i == 0: + l.append(seq[:ind_list[i]]) + l.append(seq[ind_list[i]:ind_list[i + 1]]) + + return l + +def digest(seq): + ind = cut(seq) + return cut_with_ind(seq, ind) + +def fasta_similarity(path_fasta_1, path_fasta_2): + list_seq_1=[] + list_seq_2 = [] + for record in fastapy.parse(path_fasta_1): + list_seq_1.append(record.seq) + for record in fastapy.parse(path_fasta_2): + list_seq_2.append(record.seq) + + set1 = set(list_seq_1) + set2 = set(list_seq_2) + + venn2((set1, set2), ('Group1', 'Group2')) + plt.show() + plt.savefig('fasta_similarity.png') + +def compute_mass(seq, isotop): + m = 0 + if isotop == 'mono': + for char in MASSES_MONO.keys(): + m += MASSES_MONO[char] * seq.count(char) + if isotop == 'avg': + for char in MASSES_AVG.keys(): + m += MASSES_AVG[char] * seq.count(char) + return m + +def build_ref_image(path_fasta, possible_charge, ms1_end_mz, ms1_start_mz, bin_mz, max_cycle, rt_pred): + + #build peptide list + list_seq_1 = [] + for record in fastapy.parse(path_fasta): + list_seq_1.append(record.seq) + + list_peptides = [] + for prot in list_seq_1 : + list_peptides.extend(digest(prot)) + + #compute m/z ration + mz_ratio={} + for seq in list_peptides: + mz_ratio['seq']=[] + for charge in possible_charge: + ratio = compute_mass(seq,'avg')/charge + if ms1_end_mz > ratio > ms1_start_mz: + mz_ratio['seq'].append(ratio) + + #assocy predict rt + data=[] + + #predict detectability (optional) + + #build image + total_ms1_mz = ms1_end_mz - ms1_start_mz + n_bin_ms1 = int(total_ms1_mz // bin_mz) + im = np.zeros([max_cycle, n_bin_ms1]) + max_rt = np.max(rt_pred) + min_rt = np.min(rt_pred) + total_rt = max_rt - min_rt + for (rt,mz_ratio) in data : + im[int((rt-min_rt/total_rt)*max_cycle),int(((mz_ratio-ms1_start_mz)/total_ms1_mz)*n_bin_ms1)]=1 + + return im + +if __name__ == '__main__': + # fasta_similarity('fasta/uniprotkb_proteome_UP000742934_2025_03_12.fasta','fasta/uniprotkb_proteome_UP001182277_2025_03_12.fasta') + # mass = build_ref_image('fasta/uniprotkb_proteome_UP000742934_2025_03_12.fasta') + pass \ No newline at end of file