Skip to content
Snippets Groups Projects
Commit 19fc70ca authored by Schneider Leo's avatar Schneider Leo
Browse files

extracting peptide set from txt

parent f42b778d
No related branches found
No related tags found
No related merge requests found
import fastapy import fastapy
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
import numpy as np import numpy as np
import pandas as pd
from matplotlib_venn import venn2 from matplotlib_venn import venn2
from analyse_diann_digestion import load_lib from analyse_diann_digestion import load_lib
import matplotlib.image as mpimg import matplotlib.image as mpimg
import re
ALPHABET_UNMOD = { ALPHABET_UNMOD = {
"A": 1, "A": 1,
...@@ -120,6 +121,35 @@ def fasta_similarity(path_fasta_1, path_fasta_2): ...@@ -120,6 +121,35 @@ def fasta_similarity(path_fasta_1, path_fasta_2):
plt.show() plt.show()
plt.savefig('fasta_similarity.png') plt.savefig('fasta_similarity.png')
def split_string(input_string):
    """Split ``input_string`` at every underscore followed by ``A``-``Z`` or ``c``.

    The matching underscore is dropped; the character after it starts the
    next piece. Underscores followed by any other character are kept.
    Note that the lookahead accepts the lowercase letter ``c`` in addition
    to uppercase letters.
    """
    boundary = re.compile(r'_(?=[A-Zc])')
    return boundary.split(input_string)
def build_database_ref_peptide(
        path='../data/label_raw/250107_FASTA_RP_GroEL_GroES_Tuf_5pct_assemble_peptides_list.txt'):
    """Parse a peptide list text file into a DataFrame with one row per peptide.

    The file is read line by line:

    * lines equal to a bare newline are skipped;
    * a line containing ``'>'`` is a header. It is split on ``'_'`` to get the
      protein code (first field without its leading character), the error
      threshold (second field) and the prevalence (third field). Its second
      space-separated token is split with ``split_string`` into species,
      genus, family and order, and underscores inside each are replaced by
      spaces;
    * any other line is a peptide line whose second space-separated token is
      the sequence. It is stored together with the values of the most recent
      header.

    Args:
        path: Text file to parse. Defaults to the peptide list that was
            previously hard-coded, so existing calls without arguments
            behave as before.

    Returns:
        pandas.DataFrame with columns ``Sequence``, ``Protein code``,
        ``Error treshold``, ``Prevalance``, ``Specie``, ``Genus``, ``Family``
        and ``Order`` (column names kept as-is, typos included, because
        callers index by them). Empty DataFrame if no peptide line is found.

    Raises:
        ValueError: if a peptide line appears before any header line.
        IndexError: if a line has fewer fields than described above.
    """
    records = []
    header = None  # taxonomy/protein fields of the most recent '>' line
    with open(path, 'r') as f:
        for line in f:
            if line == '\n':
                continue
            if '>' in line:
                # Some headers use lowercase placeholders; capitalise them so
                # split_string() cuts in front of them like any other rank.
                line = line.replace('no_family', 'No_family')
                line = line.replace('no_order', 'No_order')
                fields = line.split('_')
                taxonomy = split_string(line.split(' ')[1])
                header = {
                    'Protein code': fields[0][1:],
                    'Error treshold': fields[1],
                    'Prevalance': fields[2],
                    'Specie': taxonomy[0].replace('_', ' '),
                    'Genus': taxonomy[1].replace('_', ' '),
                    'Family': taxonomy[2].replace('_', ' '),
                    'Order': taxonomy[3].replace('_', ' '),
                }
            else:
                if header is None:
                    raise ValueError(
                        'peptide line found before any header line: %r' % line)
                # The token is stored exactly as split from the line (a
                # trailing newline, if present, is kept) so existing
                # consumers see the same values as before.
                records.append({'Sequence': line.split(' ')[1], **header})
    return pd.DataFrame(records)
def compute_mass(seq, isotop): def compute_mass(seq, isotop):
m = 0 m = 0
if isotop == 'mono': if isotop == 'mono':
...@@ -195,6 +225,13 @@ def build_ref_image_from_diann(path_parqet, ms1_end_mz, ms1_start_mz, bin_mz, ma ...@@ -195,6 +225,13 @@ def build_ref_image_from_diann(path_parqet, ms1_end_mz, ms1_start_mz, bin_mz, ma
if __name__ == '__main__':
    # Earlier experiments, kept for reference:
    # fasta_similarity('fasta/uniparc_proteome_UP000033376_2025_03_14.fasta','fasta/uniparc_proteome_UP000033499_2025_03_14.fasta')
    # im = build_ref_image_from_diann('fasta/steigerwaltii variants/uniparc_proteome_UP000033376_2025_03_14.predicted.parquet', ms1_end_mz=1250, ms1_start_mz=350, bin_mz=1, max_cycle=663, rt_pred=[])
    # plt.clf()
    # mpimg.imsave('test_img.png', im)

    # Write the reference peptide sequences of each selected species to
    # '<species>.fasta' in the current working directory.
    df = build_database_ref_peptide()
    selected_species = (
        'Proteus mirabilis',
        'Klebsiella pneumoniae',
        'Klebsiella oxytoca',
        'Enterobacter hormaechei',
        'Citrobacter freundii',
    )
    for species in selected_species:
        species_rows = df[df['Specie'] == species]
        with open(species + '.fasta', 'w') as out:
            for _, row in species_rows.iterrows():
                # Sequences are written as stored; no separator is added here.
                out.write(row['Sequence'])
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment