diff --git a/data/data_processing.py b/data/data_processing.py index 9f7fd3e5c8dbc7234e7ec6d877e4e35ba63c1215..b31655122660562d396357e9cdf91d19bc5f23ea 100644 --- a/data/data_processing.py +++ b/data/data_processing.py @@ -100,15 +100,18 @@ def select_best_data(df_list,threshold): num = len(df_list) l=[] i=0 + print('Error calc') for df in df_list : df['abs err {}'.format(i)] = abs(df['rt pred'] - df['true rt']) df_group = df.groupby(['seq'])['abs err {}'.format(i)].mean().to_frame().reset_index() l.append(df_group) i += 1 + print(str(i)+'/'+str(num)) df = pd.concat(l, axis=1) df['mean'] = df['abs err 0'] for i in range(1,num): df['mean']=df['mean']+df['abs err {}'.format(i)] + print('filtering') df['mean'] = df['mean']/num df_res = df[df['mean']<threshold] c_name=['seq{}'.format(i) for i in range(num)]+['mean'] @@ -117,10 +120,14 @@ def select_best_data(df_list,threshold): df_res = df_res[['seq0']] good_seq=[] good_rt=[] + print('selecting') + i=0 for r in df_list[0].iterrows() : + print(str(i) + '/' + str(len(df_list[0]))) if r[1]['seq'] in df_res.values : good_rt.append(r[1]['true rt']) good_seq.append(r[1]['seq']) + print('merging') return pd.DataFrame({'sequence' : good_seq, 'irt_scaled': good_rt}) @@ -156,16 +163,16 @@ def numerical_to_alphabetical_str(s): if __name__ == '__main__': # main() - df_base = pd.read_csv('/lustre/fswork/projects/rech/bun/ucg81ws/these/dia-augmentation/data/data_PXD006109/e_coli/data_aligned_train_coli.csv') + df_base = pd.read_csv('./data_PXD006109/e_coli/data_aligned_train_coli.csv') df_base = df_base[['sequence', 'irt_scaled','state']] t = [0.05,0.1,0.2,0.3,0.4,0.5,0.7,1,10] #reste 07 1 et all name = ['005','01','02','03','04','05','07','1','all'] - df_0 = pd.read_csv('/lustre/fswork/projects/rech/bun/ucg81ws/these/dia-augmentation/output/out_coli_aligned_train_0.csv') - df_1 = pd.read_csv('/lustre/fswork/projects/rech/bun/ucg81ws/these/dia-augmentation/output/out_coli_aligned_train_1.csv') - df_2 = pd.read_csv('/lustre/fswork/projects/rech/bun/ucg81ws/these/dia-augmentation/output/out_coli_aligned_train_2.csv') - df_3 = pd.read_csv('/lustre/fswork/projects/rech/bun/ucg81ws/these/dia-augmentation/output/out_coli_aligned_train_3.csv') - df_4 = pd.read_csv('/lustre/fswork/projects/rech/bun/ucg81ws/these/dia-augmentation/output/out_coli_aligned_train_4.csv') + df_0 = pd.read_csv('../output/out_coli_aligned_train_0.csv') + df_1 = pd.read_csv('../output/out_coli_aligned_train_1.csv') + df_2 = pd.read_csv('../output/out_coli_aligned_train_2.csv') + df_3 = pd.read_csv('../output/out_coli_aligned_train_3.csv') + df_4 = pd.read_csv('../output/out_coli_aligned_train_4.csv') list_df = [df_0, df_1, df_2, df_3, df_4] for i in range(len(name)): @@ -173,12 +180,12 @@ if __name__ == '__main__': print('thresold {} en cours'.format(name[i])) # df = select_best_data(list_df, t[i]) - df.to_pickle('/lustre/fswork/projects/rech/bun/ucg81ws/these/dia-augmentation/data/data_PXD006109/e_coli/data_ISA_additionnal_{}.pkl'.format(name[i])) - df = pd.read_pickle('/lustre/fswork/projects/rech/bun/ucg81ws/these/dia-augmentation/data/data_PXD006109/e_coli/data_ISA_additionnal_{}.pkl'.format(name[i])) + df.to_pickle('./data_PXD006109/e_coli/data_ISA_additionnal_{}.pkl'.format(name[i])) + df = pd.read_pickle('./data_PXD006109/e_coli/data_ISA_additionnal_{}.pkl'.format(name[i])) df['state'] = 'train' df['sequence'] = df['sequence'].map(numerical_to_alphabetical_str) df_augmented_1 = pd.concat([df, df_base], axis=0).reset_index(drop=True) df_augmented_1.columns = ['sequence', 'irt_scaled','state'] - df_augmented_1.to_csv('/lustre/fswork/projects/rech/bun/ucg81ws/these/dia-augmentation/data/data_PXD006109/e_coli/plasma_data_augmented_{}.csv'.format(name[i]), index=False) + df_augmented_1.to_csv('./data_PXD006109/e_coli/plasma_data_augmented_{}.csv'.format(name[i]), index=False) print(df_augmented_1.shape) \ No newline at end of file