Skip to content
Snippets Groups Projects
Commit 498b38c4 authored by Schneider Leo's avatar Schneider Leo
Browse files

clean : utils.py

parent 10edb8ba
No related branches found
No related tags found
Loading
......@@ -15,76 +15,6 @@ SPECIES = ['Citrobacter cronae', 'Citrobacter portucalensis', 'Enterobacter horm
'Citrobacter koseri', 'Enterobacter cloacae', 'Hafnia alvei', 'Klebsiella pneumoniae',
'Providencia stuartii']
ALPHABET_UNMOD = {
"A": 1,
"C": 2,
"D": 3,
"E": 4,
"F": 5,
"G": 6,
"H": 7,
"I": 8,
"K": 9,
"L": 10,
"M": 11,
"N": 12,
"P": 13,
"Q": 14,
"R": 15,
"S": 16,
"T": 17,
"V": 18,
"W": 19,
"Y": 20,
}
MASSES_MONO = {
"A": 71.03711,
"C": 103.00919,
"D": 115.02694,
"E": 129.04259,
"F": 147.06841,
"G": 57.02146,
"H": 137.05891,
"I": 113.08406,
"K": 128.09496,
"L": 113.08406,
"M": 131.04049,
"N": 114.04293,
"P": 97.05276,
"Q": 128.05858,
"R": 156.1875,
"S": 87.03203,
"T": 101.04768,
"V": 99.06841,
"W": 186.07931,
"Y": 163.06333,
}
MASSES_AVG = {
"A": 71.0788,
"C": 103.1388,
"D": 115.0886,
"E": 129.1155,
"F": 147.1766,
"G": 57.0519,
"H": 137.1411,
"I": 113.1594,
"K": 128.1741,
"L": 113.1594,
"M": 131.1926,
"N": 114.1038,
"P": 97.1167,
"Q": 128.1307,
"R": 156.1875,
"S": 87.0782,
"T": 101.1051,
"V": 99.1326,
"W": 186.2132,
"Y": 163.1760,
}
# trypsin cut after K or R (if not followed by P)
def cut(seq):
......@@ -142,6 +72,9 @@ def build_database_ref_peptide():
for line in f:
if line != '\n':
if '>' in line:
#remove new line
line = line.replace('\n', '')
# typo ??
line = line.replace('no_family', 'No_family')
line = line.replace('no_order', 'No_order')
......@@ -157,66 +90,46 @@ def build_database_ref_peptide():
o = split_line[3].replace('_', ' ')
else:
seq = line.split(' ')[1]
#remove new lines
seq = seq.replace('\n','')
l.append({'Sequence': seq, 'Protein code': prot, 'Error treshold': err, 'Prevalance': prev,
'Specie': spe, 'Genus': gen, 'Family': fam, 'Order': o})
return pd.DataFrame(l)
def compute_mass(seq, isotop):
m = 0
if isotop == 'mono':
for char in MASSES_MONO.keys():
m += MASSES_MONO[char] * seq.count(char)
if isotop == 'avg':
for char in MASSES_AVG.keys():
m += MASSES_AVG[char] * seq.count(char)
return m
df = pd.DataFrame(l)
return df.drop_duplicates()
def build_ref_image(path_fasta, possible_charge, ms1_end_mz, ms1_start_mz, bin_mz, max_cycle, rt_pred):
# build peptide list
list_seq_1 = []
for record in fastapy.parse(path_fasta):
list_seq_1.append(record.seq)
list_peptides = []
for prot in list_seq_1:
list_peptides.extend(digest(prot))
# compute m/z ration
mz_ratio = {}
i = 0
list_peptides = list(set(list_peptides))
for seq in list_peptides:
mz_ratio['seq'] = []
for charge in possible_charge:
ratio = compute_mass(seq, 'avg') / charge
if ms1_end_mz > ratio > ms1_start_mz:
mz_ratio['seq'].append(ratio)
i += 1
print(i)
# assocy predict rt
data = []
def compute_common_peptide(df_path,species):
df = pd.read_csv(df_path)
df_filter = df[df['Specie'].isin(species)]
df_filter = df_filter[['Sequence','Specie']].drop_duplicates()
df_filter['Count'] = df_filter.groupby('Sequence')['Sequence'].transform('count')
df_count = df_filter[['Sequence','Count']].drop_duplicates()
return df_count
# predict detectability (optional)
def build_ref_image_from_diann(path_parqet, ms1_end_mz, ms1_start_mz, bin_mz, max_cycle, min_rt=None, max_rt=None):
df = load_lib(path_parqet)
df = df[['Stripped.Sequence', 'Precursor.Charge', 'RT', 'Precursor.Mz']]
df_unique = df.drop_duplicates()
# build image
total_ms1_mz = ms1_end_mz - ms1_start_mz
n_bin_ms1 = int(total_ms1_mz // bin_mz)
im = np.zeros([max_cycle, n_bin_ms1])
max_rt = np.max(rt_pred)
min_rt = np.min(rt_pred)
total_rt = max_rt - min_rt
for (rt, mz_ratio) in data:
im[int((rt - min_rt / total_rt) * max_cycle), int(((mz_ratio - ms1_start_mz) / total_ms1_mz) * n_bin_ms1)] = 1
if max_rt is None:
max_rt = np.max(df_unique['RT'])
if min_rt is None:
min_rt = np.min(df_unique['RT'])
total_rt = max_rt - min_rt + 1e-3
for row in df_unique.iterrows():
if 900 > int(((row[1]['Precursor.Mz'] - ms1_start_mz) / total_ms1_mz) * n_bin_ms1) >= 0:
im[int((row[1]['RT'] - min_rt) / total_rt * max_cycle), int(
((row[1]['Precursor.Mz'] - ms1_start_mz) / total_ms1_mz) * n_bin_ms1)] = 1
return im
'Klebsiella michiganensis',
def build_ref_image_from_diann(path_parqet, ms1_end_mz, ms1_start_mz, bin_mz, max_cycle, min_rt=None, max_rt=None):
def build_ref_image_from_diann_global(path_parqet, target_seq, ms1_end_mz, ms1_start_mz, bin_mz, max_cycle, min_rt=None, max_rt=None):
df = load_lib(path_parqet)
df = df[['Stripped.Sequence', 'Precursor.Charge', 'RT', 'Precursor.Mz']]
df = df[df['Stripped.Sequence'].isin(target_seq)]
df_unique = df.drop_duplicates()
# build image
total_ms1_mz = ms1_end_mz - ms1_start_mz
......@@ -236,29 +149,31 @@ def build_ref_image_from_diann(path_parqet, ms1_end_mz, ms1_start_mz, bin_mz, ma
if __name__ == '__main__':
df = build_database_ref_peptide()
#Write fasta file
# for spe in SPECIES:
#
# df_spe = df[df['Specie']==spe]
# spe_list = df_spe['Sequence'].to_list()
# with open('fasta/optimal peptide set/'+spe+'.fasta',"w") as f:
# df = build_database_ref_peptide()
# df.to_csv("dataset_species_ref_peptides.csv", index=False)
# #Write fasta file
# with open('fasta/optimal peptide set/gobal_peptide_set.fasta', "w") as f:
# for spe in SPECIES:
# df_spe = df[df['Specie']==spe]
# spe_list = df_spe['Sequence'].drop_duplicate().to_list()
# if not spe_list:
# print('Empty : ',spe)
# for pep in spe_list :
# f.write(pep)
df_count = compute_common_peptide("dataset_species_ref_peptides.csv", SPECIES)
#
#Create ref img
df_full = load_lib(
'fasta/full proteom/steigerwaltii variants/uniparc_proteome_UP000033376_2025_03_14.predicted.parquet')
min_rt = df_full['RT'].min()
max_rt = df_full['RT'].max()
for spe in SPECIES:
im = build_ref_image_from_diann(
'fasta/optimal peptide set/' + spe + '.parquet', ms1_end_mz=1250,
ms1_start_mz=350, bin_mz=1, max_cycle=663, min_rt=min_rt, max_rt=max_rt)
plt.clf()
mpimg.imsave('img_ref/' + spe + '.png', im)
np.save(spe + '.npy', im)
# df_full = load_lib(
# 'fasta/full proteom/steigerwaltii variants/uniparc_proteome_UP000033376_2025_03_14.predicted.parquet')
# min_rt = df_full['RT'].min()
# max_rt = df_full['RT'].max()
#
# for spe in SPECIES:
# im = build_ref_image_from_diann(
# 'fasta/optimal peptide set/' + spe + '.parquet', ms1_end_mz=1250,
# ms1_start_mz=350, bin_mz=1, max_cycle=663, min_rt=min_rt, max_rt=max_rt)
# plt.clf()
# mpimg.imsave('img_ref/' + spe + '.png', im)
# np.save(spe + '.npy', im)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment