Skip to content
Snippets Groups Projects
Commit 498b38c4 authored by Schneider Leo's avatar Schneider Leo
Browse files

clean : utils.py

parent 10edb8ba
No related branches found
No related tags found
No related merge requests found
...@@ -15,76 +15,6 @@ SPECIES = ['Citrobacter cronae', 'Citrobacter portucalensis', 'Enterobacter horm ...@@ -15,76 +15,6 @@ SPECIES = ['Citrobacter cronae', 'Citrobacter portucalensis', 'Enterobacter horm
'Citrobacter koseri', 'Enterobacter cloacae', 'Hafnia alvei', 'Klebsiella pneumoniae', 'Citrobacter koseri', 'Enterobacter cloacae', 'Hafnia alvei', 'Klebsiella pneumoniae',
'Providencia stuartii'] 'Providencia stuartii']
ALPHABET_UNMOD = {
"A": 1,
"C": 2,
"D": 3,
"E": 4,
"F": 5,
"G": 6,
"H": 7,
"I": 8,
"K": 9,
"L": 10,
"M": 11,
"N": 12,
"P": 13,
"Q": 14,
"R": 15,
"S": 16,
"T": 17,
"V": 18,
"W": 19,
"Y": 20,
}
MASSES_MONO = {
"A": 71.03711,
"C": 103.00919,
"D": 115.02694,
"E": 129.04259,
"F": 147.06841,
"G": 57.02146,
"H": 137.05891,
"I": 113.08406,
"K": 128.09496,
"L": 113.08406,
"M": 131.04049,
"N": 114.04293,
"P": 97.05276,
"Q": 128.05858,
"R": 156.1875,
"S": 87.03203,
"T": 101.04768,
"V": 99.06841,
"W": 186.07931,
"Y": 163.06333,
}
MASSES_AVG = {
"A": 71.0788,
"C": 103.1388,
"D": 115.0886,
"E": 129.1155,
"F": 147.1766,
"G": 57.0519,
"H": 137.1411,
"I": 113.1594,
"K": 128.1741,
"L": 113.1594,
"M": 131.1926,
"N": 114.1038,
"P": 97.1167,
"Q": 128.1307,
"R": 156.1875,
"S": 87.0782,
"T": 101.1051,
"V": 99.1326,
"W": 186.2132,
"Y": 163.1760,
}
# trypsin cut after K or R (if not followed by P) # trypsin cut after K or R (if not followed by P)
def cut(seq): def cut(seq):
...@@ -142,6 +72,9 @@ def build_database_ref_peptide(): ...@@ -142,6 +72,9 @@ def build_database_ref_peptide():
for line in f: for line in f:
if line != '\n': if line != '\n':
if '>' in line: if '>' in line:
#remove new line
line = line.replace('\n', '')
# typo ?? # typo ??
line = line.replace('no_family', 'No_family') line = line.replace('no_family', 'No_family')
line = line.replace('no_order', 'No_order') line = line.replace('no_order', 'No_order')
...@@ -157,66 +90,46 @@ def build_database_ref_peptide(): ...@@ -157,66 +90,46 @@ def build_database_ref_peptide():
o = split_line[3].replace('_', ' ') o = split_line[3].replace('_', ' ')
else: else:
seq = line.split(' ')[1] seq = line.split(' ')[1]
#remove new lines
seq = seq.replace('\n','')
l.append({'Sequence': seq, 'Protein code': prot, 'Error treshold': err, 'Prevalance': prev, l.append({'Sequence': seq, 'Protein code': prot, 'Error treshold': err, 'Prevalance': prev,
'Specie': spe, 'Genus': gen, 'Family': fam, 'Order': o}) 'Specie': spe, 'Genus': gen, 'Family': fam, 'Order': o})
return pd.DataFrame(l) df = pd.DataFrame(l)
return df.drop_duplicates()
def compute_mass(seq, isotop):
m = 0
if isotop == 'mono':
for char in MASSES_MONO.keys():
m += MASSES_MONO[char] * seq.count(char)
if isotop == 'avg':
for char in MASSES_AVG.keys():
m += MASSES_AVG[char] * seq.count(char)
return m
def build_ref_image(path_fasta, possible_charge, ms1_end_mz, ms1_start_mz, bin_mz, max_cycle, rt_pred): def compute_common_peptide(df_path,species):
# build peptide list df = pd.read_csv(df_path)
list_seq_1 = [] df_filter = df[df['Specie'].isin(species)]
for record in fastapy.parse(path_fasta): df_filter = df_filter[['Sequence','Specie']].drop_duplicates()
list_seq_1.append(record.seq) df_filter['Count'] = df_filter.groupby('Sequence')['Sequence'].transform('count')
df_count = df_filter[['Sequence','Count']].drop_duplicates()
list_peptides = [] return df_count
for prot in list_seq_1:
list_peptides.extend(digest(prot))
# compute m/z ration
mz_ratio = {}
i = 0
list_peptides = list(set(list_peptides))
for seq in list_peptides:
mz_ratio['seq'] = []
for charge in possible_charge:
ratio = compute_mass(seq, 'avg') / charge
if ms1_end_mz > ratio > ms1_start_mz:
mz_ratio['seq'].append(ratio)
i += 1
print(i)
# assocy predict rt
data = []
# predict detectability (optional)
def build_ref_image_from_diann(path_parqet, ms1_end_mz, ms1_start_mz, bin_mz, max_cycle, min_rt=None, max_rt=None):
df = load_lib(path_parqet)
df = df[['Stripped.Sequence', 'Precursor.Charge', 'RT', 'Precursor.Mz']]
df_unique = df.drop_duplicates()
# build image # build image
total_ms1_mz = ms1_end_mz - ms1_start_mz total_ms1_mz = ms1_end_mz - ms1_start_mz
n_bin_ms1 = int(total_ms1_mz // bin_mz) n_bin_ms1 = int(total_ms1_mz // bin_mz)
im = np.zeros([max_cycle, n_bin_ms1]) im = np.zeros([max_cycle, n_bin_ms1])
max_rt = np.max(rt_pred) if max_rt is None:
min_rt = np.min(rt_pred) max_rt = np.max(df_unique['RT'])
total_rt = max_rt - min_rt if min_rt is None:
for (rt, mz_ratio) in data: min_rt = np.min(df_unique['RT'])
im[int((rt - min_rt / total_rt) * max_cycle), int(((mz_ratio - ms1_start_mz) / total_ms1_mz) * n_bin_ms1)] = 1 total_rt = max_rt - min_rt + 1e-3
for row in df_unique.iterrows():
if 900 > int(((row[1]['Precursor.Mz'] - ms1_start_mz) / total_ms1_mz) * n_bin_ms1) >= 0:
im[int((row[1]['RT'] - min_rt) / total_rt * max_cycle), int(
((row[1]['Precursor.Mz'] - ms1_start_mz) / total_ms1_mz) * n_bin_ms1)] = 1
return im return im
'Klebsiella michiganensis', def build_ref_image_from_diann_global(path_parqet, target_seq, ms1_end_mz, ms1_start_mz, bin_mz, max_cycle, min_rt=None, max_rt=None):
def build_ref_image_from_diann(path_parqet, ms1_end_mz, ms1_start_mz, bin_mz, max_cycle, min_rt=None, max_rt=None):
df = load_lib(path_parqet) df = load_lib(path_parqet)
df = df[['Stripped.Sequence', 'Precursor.Charge', 'RT', 'Precursor.Mz']] df = df[['Stripped.Sequence', 'Precursor.Charge', 'RT', 'Precursor.Mz']]
df = df[df['Stripped.Sequence'].isin(target_seq)]
df_unique = df.drop_duplicates() df_unique = df.drop_duplicates()
# build image # build image
total_ms1_mz = ms1_end_mz - ms1_start_mz total_ms1_mz = ms1_end_mz - ms1_start_mz
...@@ -236,29 +149,31 @@ def build_ref_image_from_diann(path_parqet, ms1_end_mz, ms1_start_mz, bin_mz, ma ...@@ -236,29 +149,31 @@ def build_ref_image_from_diann(path_parqet, ms1_end_mz, ms1_start_mz, bin_mz, ma
if __name__ == '__main__': if __name__ == '__main__':
df = build_database_ref_peptide() # df = build_database_ref_peptide()
#Write fasta file # df.to_csv("dataset_species_ref_peptides.csv", index=False)
# for spe in SPECIES: # #Write fasta file
# # with open('fasta/optimal peptide set/gobal_peptide_set.fasta', "w") as f:
# df_spe = df[df['Specie']==spe] # for spe in SPECIES:
# spe_list = df_spe['Sequence'].to_list() # df_spe = df[df['Specie']==spe]
# with open('fasta/optimal peptide set/'+spe+'.fasta',"w") as f: # spe_list = df_spe['Sequence'].drop_duplicate().to_list()
# if not spe_list: # if not spe_list:
# print('Empty : ',spe) # print('Empty : ',spe)
# for pep in spe_list : # for pep in spe_list :
# f.write(pep) # f.write(pep)
df_count = compute_common_peptide("dataset_species_ref_peptides.csv", SPECIES)
# #
#Create ref img #Create ref img
df_full = load_lib( # df_full = load_lib(
'fasta/full proteom/steigerwaltii variants/uniparc_proteome_UP000033376_2025_03_14.predicted.parquet') # 'fasta/full proteom/steigerwaltii variants/uniparc_proteome_UP000033376_2025_03_14.predicted.parquet')
min_rt = df_full['RT'].min() # min_rt = df_full['RT'].min()
max_rt = df_full['RT'].max() # max_rt = df_full['RT'].max()
#
for spe in SPECIES: # for spe in SPECIES:
im = build_ref_image_from_diann( # im = build_ref_image_from_diann(
'fasta/optimal peptide set/' + spe + '.parquet', ms1_end_mz=1250, # 'fasta/optimal peptide set/' + spe + '.parquet', ms1_end_mz=1250,
ms1_start_mz=350, bin_mz=1, max_cycle=663, min_rt=min_rt, max_rt=max_rt) # ms1_start_mz=350, bin_mz=1, max_cycle=663, min_rt=min_rt, max_rt=max_rt)
plt.clf() # plt.clf()
mpimg.imsave('img_ref/' + spe + '.png', im) # mpimg.imsave('img_ref/' + spe + '.png', im)
np.save(spe + '.npy', im) # np.save(spe + '.npy', im)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment