diff --git a/main.py b/main.py index 21f2abd0c79cf657b9d5ee96a63e6714b2537e8b..54f2c2858492c768b5aecf714b58782fc963d15c 100644 --- a/main.py +++ b/main.py @@ -12,8 +12,11 @@ def fasta_like_to_data(path): def strip_lines(s): s = s.strip('\n') - s = s.split(' ')[1] - return s + try : + s = s.split(' ')[1] + return s + except: + return 'unidentifid seq error' def main(input_data_path): print('Reading file') @@ -24,6 +27,8 @@ def main(input_data_path): data = pd.DataFrame(content,columns=['Sequences']) data = data[~data['Sequences'].str.contains(">")] data['Sequences']=data['Sequences'].map(strip_lines) + data = data[~data['Sequences'].str.contains('unidentifid seq error')] + data['Classes']=[0]*data.shape[0] data['Proteins']=[0]*data.shape[0] @@ -95,4 +100,4 @@ def main(input_data_path): new_file.close() if __name__ == '__main__': - main('241211_FASTA_RP_GroEL_GroES_Tuf_assemble_peptides_list.txt') \ No newline at end of file + main('250107_FASTA_RP_GroEL_GroES_Tuf_5pct_assemble_peptides_list.txt') \ No newline at end of file