Skip to content
Snippets Groups Projects
Commit 02f95633 authored by Schneider Leo's avatar Schneider Leo
Browse files

del raysesult

parent b844726e
No related branches found
No related tags found
No related merge requests found
Showing
with 0 additions and 119 deletions
loss,timestamp,checkpoint_dir_name,should_checkpoint,done,training_iteration,trial_id,date,time_this_iter_s,time_total_s,pid,hostname,node_ip,time_since_restore,iterations_since_restore
266.66766681821326,1719346308,checkpoint_000000,True,False,1,18c11cbc,2024-06-25_22-11-48,74.13704776763916,74.13704776763916,2195253,r8i6n2,10.159.28.60,74.13704776763916,1
191.86041752372202,1719346367,checkpoint_000001,True,False,2,18c11cbc,2024-06-25_22-12-47,59.13420367240906,133.27125144004822,2195253,r8i6n2,10.159.28.60,133.27125144004822,2
172.28332663708784,1719346427,checkpoint_000002,True,False,3,18c11cbc,2024-06-25_22-13-47,59.52952766418457,192.8007791042328,2195253,r8i6n2,10.159.28.60,192.8007791042328,3
165.87816860168937,1719346487,checkpoint_000003,True,False,4,18c11cbc,2024-06-25_22-14-47,59.658318519592285,252.45909762382507,2195253,r8i6n2,10.159.28.60,252.45909762382507,4
149.45840237835262,1719346547,checkpoint_000004,True,False,5,18c11cbc,2024-06-25_22-15-47,60.179141998291016,312.6382396221161,2195253,r8i6n2,10.159.28.60,312.6382396221161,5
142.80532779843787,1719354718,checkpoint_000005,True,False,6,18c11cbc,2024-06-26_00-31-58,73.47930812835693,386.117547750473,2850562,r8i6n2,10.159.28.60,73.47930812835693,1
147.61887368630238,1719354776,checkpoint_000006,True,False,7,18c11cbc,2024-06-26_00-32-56,58.23622250556946,444.3537702560425,2850562,r8i6n2,10.159.28.60,131.7155306339264,2
136.93548496877114,1719354835,checkpoint_000007,True,False,8,18c11cbc,2024-06-26_00-33-55,59.08127737045288,503.43504762649536,2850562,r8i6n2,10.159.28.60,190.79680800437927,3
133.39135180495856,1719354893,checkpoint_000008,True,False,9,18c11cbc,2024-06-26_00-34-53,58.395625591278076,561.8306732177734,2850562,r8i6n2,10.159.28.60,249.19243359565735,4
130.94450414462352,1719354952,checkpoint_000009,True,False,10,18c11cbc,2024-06-26_00-35-52,58.66959762573242,620.5002708435059,2850562,r8i6n2,10.159.28.60,307.86203122138977,5
128.02527375108613,1719357338,checkpoint_000010,True,False,11,18c11cbc,2024-06-26_01-15-38,72.75415802001953,693.2544288635254,2850562,r8i6n2,10.159.28.60,72.75415802001953,1
130.49067126296637,1719357397,checkpoint_000011,True,False,12,18c11cbc,2024-06-26_01-16-37,59.34819006919861,752.602618932724,2850562,r8i6n2,10.159.28.60,132.10234808921814,2
125.55984971654696,1719357457,checkpoint_000012,True,False,13,18c11cbc,2024-06-26_01-17-37,59.262863636016846,811.8654825687408,2850562,r8i6n2,10.159.28.60,191.36521172523499,3
124.00354583620087,1719357516,checkpoint_000013,True,False,14,18c11cbc,2024-06-26_01-18-36,59.35755896568298,871.2230415344238,2850562,r8i6n2,10.159.28.60,250.72277069091797,4
122.43110686775267,1719357575,checkpoint_000014,True,False,15,18c11cbc,2024-06-26_01-19-35,59.15378403663635,930.3768255710602,2850562,r8i6n2,10.159.28.60,309.8765547275543,5
121.44499277129887,1719357635,checkpoint_000015,True,False,16,18c11cbc,2024-06-26_01-20-35,59.61450457572937,989.9913301467896,2850562,r8i6n2,10.159.28.60,369.4910593032837,6
124.11241438257413,1719357694,checkpoint_000016,True,False,17,18c11cbc,2024-06-26_01-21-34,59.14637589454651,1049.137706041336,2850562,r8i6n2,10.159.28.60,428.6374351978302,7
121.52879060159518,1719357753,checkpoint_000017,True,False,18,18c11cbc,2024-06-26_01-22-33,59.11753058433533,1108.2552366256714,2850562,r8i6n2,10.159.28.60,487.7549657821655,8
118.6914559612124,1719357812,checkpoint_000018,True,False,19,18c11cbc,2024-06-26_01-23-32,59.181042432785034,1167.4362790584564,2850562,r8i6n2,10.159.28.60,546.9360082149506,9
118.33789474006713,1719357871,checkpoint_000019,True,False,20,18c11cbc,2024-06-26_01-24-31,59.315247774124146,1226.7515268325806,2850562,r8i6n2,10.159.28.60,606.2512559890747,10
{"loss": 266.66766681821326, "timestamp": 1719346308, "checkpoint_dir_name": "checkpoint_000000", "should_checkpoint": true, "done": false, "training_iteration": 1, "trial_id": "18c11cbc", "date": "2024-06-25_22-11-48", "time_this_iter_s": 74.13704776763916, "time_total_s": 74.13704776763916, "pid": 2195253, "hostname": "r8i6n2", "node_ip": "10.159.28.60", "config": {"encoder_num_layer": 4, "decoder_rt_num_layer": 1, "decoder_int_num_layer": 1, "embedding_dim": 16, "encoder_ff": 512, "decoder_rt_ff": 2048, "decoder_int_ff": 512, "n_head": 8, "drop_rate": 0.3688667410196246, "lr": 0.0018036063294034547, "batch_size": 1024}, "time_since_restore": 74.13704776763916, "iterations_since_restore": 1}
{"loss": 191.86041752372202, "timestamp": 1719346367, "checkpoint_dir_name": "checkpoint_000001", "should_checkpoint": true, "done": false, "training_iteration": 2, "trial_id": "18c11cbc", "date": "2024-06-25_22-12-47", "time_this_iter_s": 59.13420367240906, "time_total_s": 133.27125144004822, "pid": 2195253, "hostname": "r8i6n2", "node_ip": "10.159.28.60", "config": {"encoder_num_layer": 4, "decoder_rt_num_layer": 1, "decoder_int_num_layer": 1, "embedding_dim": 16, "encoder_ff": 512, "decoder_rt_ff": 2048, "decoder_int_ff": 512, "n_head": 8, "drop_rate": 0.3688667410196246, "lr": 0.0018036063294034547, "batch_size": 1024}, "time_since_restore": 133.27125144004822, "iterations_since_restore": 2}
{"loss": 172.28332663708784, "timestamp": 1719346427, "checkpoint_dir_name": "checkpoint_000002", "should_checkpoint": true, "done": false, "training_iteration": 3, "trial_id": "18c11cbc", "date": "2024-06-25_22-13-47", "time_this_iter_s": 59.52952766418457, "time_total_s": 192.8007791042328, "pid": 2195253, "hostname": "r8i6n2", "node_ip": "10.159.28.60", "config": {"encoder_num_layer": 4, "decoder_rt_num_layer": 1, "decoder_int_num_layer": 1, "embedding_dim": 16, "encoder_ff": 512, "decoder_rt_ff": 2048, "decoder_int_ff": 512, "n_head": 8, "drop_rate": 0.3688667410196246, "lr": 0.0018036063294034547, "batch_size": 1024}, "time_since_restore": 192.8007791042328, "iterations_since_restore": 3}
{"loss": 165.87816860168937, "timestamp": 1719346487, "checkpoint_dir_name": "checkpoint_000003", "should_checkpoint": true, "done": false, "training_iteration": 4, "trial_id": "18c11cbc", "date": "2024-06-25_22-14-47", "time_this_iter_s": 59.658318519592285, "time_total_s": 252.45909762382507, "pid": 2195253, "hostname": "r8i6n2", "node_ip": "10.159.28.60", "config": {"encoder_num_layer": 4, "decoder_rt_num_layer": 1, "decoder_int_num_layer": 1, "embedding_dim": 16, "encoder_ff": 512, "decoder_rt_ff": 2048, "decoder_int_ff": 512, "n_head": 8, "drop_rate": 0.3688667410196246, "lr": 0.0018036063294034547, "batch_size": 1024}, "time_since_restore": 252.45909762382507, "iterations_since_restore": 4}
{"loss": 149.45840237835262, "timestamp": 1719346547, "checkpoint_dir_name": "checkpoint_000004", "should_checkpoint": true, "done": false, "training_iteration": 5, "trial_id": "18c11cbc", "date": "2024-06-25_22-15-47", "time_this_iter_s": 60.179141998291016, "time_total_s": 312.6382396221161, "pid": 2195253, "hostname": "r8i6n2", "node_ip": "10.159.28.60", "config": {"encoder_num_layer": 4, "decoder_rt_num_layer": 1, "decoder_int_num_layer": 1, "embedding_dim": 16, "encoder_ff": 512, "decoder_rt_ff": 2048, "decoder_int_ff": 512, "n_head": 8, "drop_rate": 0.3688667410196246, "lr": 0.0018036063294034547, "batch_size": 1024}, "time_since_restore": 312.6382396221161, "iterations_since_restore": 5}
{"loss": 142.80532779843787, "timestamp": 1719354718, "checkpoint_dir_name": "checkpoint_000005", "should_checkpoint": true, "done": false, "training_iteration": 6, "trial_id": "18c11cbc", "date": "2024-06-26_00-31-58", "time_this_iter_s": 73.47930812835693, "time_total_s": 386.117547750473, "pid": 2850562, "hostname": "r8i6n2", "node_ip": "10.159.28.60", "config": {"encoder_num_layer": 4, "decoder_rt_num_layer": 1, "decoder_int_num_layer": 1, "embedding_dim": 16, "encoder_ff": 512, "decoder_rt_ff": 2048, "decoder_int_ff": 512, "n_head": 8, "drop_rate": 0.3688667410196246, "lr": 0.0018036063294034547, "batch_size": 1024}, "time_since_restore": 73.47930812835693, "iterations_since_restore": 1}
{"loss": 147.61887368630238, "timestamp": 1719354776, "checkpoint_dir_name": "checkpoint_000006", "should_checkpoint": true, "done": false, "training_iteration": 7, "trial_id": "18c11cbc", "date": "2024-06-26_00-32-56", "time_this_iter_s": 58.23622250556946, "time_total_s": 444.3537702560425, "pid": 2850562, "hostname": "r8i6n2", "node_ip": "10.159.28.60", "config": {"encoder_num_layer": 4, "decoder_rt_num_layer": 1, "decoder_int_num_layer": 1, "embedding_dim": 16, "encoder_ff": 512, "decoder_rt_ff": 2048, "decoder_int_ff": 512, "n_head": 8, "drop_rate": 0.3688667410196246, "lr": 0.0018036063294034547, "batch_size": 1024}, "time_since_restore": 131.7155306339264, "iterations_since_restore": 2}
{"loss": 136.93548496877114, "timestamp": 1719354835, "checkpoint_dir_name": "checkpoint_000007", "should_checkpoint": true, "done": false, "training_iteration": 8, "trial_id": "18c11cbc", "date": "2024-06-26_00-33-55", "time_this_iter_s": 59.08127737045288, "time_total_s": 503.43504762649536, "pid": 2850562, "hostname": "r8i6n2", "node_ip": "10.159.28.60", "config": {"encoder_num_layer": 4, "decoder_rt_num_layer": 1, "decoder_int_num_layer": 1, "embedding_dim": 16, "encoder_ff": 512, "decoder_rt_ff": 2048, "decoder_int_ff": 512, "n_head": 8, "drop_rate": 0.3688667410196246, "lr": 0.0018036063294034547, "batch_size": 1024}, "time_since_restore": 190.79680800437927, "iterations_since_restore": 3}
{"loss": 133.39135180495856, "timestamp": 1719354893, "checkpoint_dir_name": "checkpoint_000008", "should_checkpoint": true, "done": false, "training_iteration": 9, "trial_id": "18c11cbc", "date": "2024-06-26_00-34-53", "time_this_iter_s": 58.395625591278076, "time_total_s": 561.8306732177734, "pid": 2850562, "hostname": "r8i6n2", "node_ip": "10.159.28.60", "config": {"encoder_num_layer": 4, "decoder_rt_num_layer": 1, "decoder_int_num_layer": 1, "embedding_dim": 16, "encoder_ff": 512, "decoder_rt_ff": 2048, "decoder_int_ff": 512, "n_head": 8, "drop_rate": 0.3688667410196246, "lr": 0.0018036063294034547, "batch_size": 1024}, "time_since_restore": 249.19243359565735, "iterations_since_restore": 4}
{"loss": 130.94450414462352, "timestamp": 1719354952, "checkpoint_dir_name": "checkpoint_000009", "should_checkpoint": true, "done": false, "training_iteration": 10, "trial_id": "18c11cbc", "date": "2024-06-26_00-35-52", "time_this_iter_s": 58.66959762573242, "time_total_s": 620.5002708435059, "pid": 2850562, "hostname": "r8i6n2", "node_ip": "10.159.28.60", "config": {"encoder_num_layer": 4, "decoder_rt_num_layer": 1, "decoder_int_num_layer": 1, "embedding_dim": 16, "encoder_ff": 512, "decoder_rt_ff": 2048, "decoder_int_ff": 512, "n_head": 8, "drop_rate": 0.3688667410196246, "lr": 0.0018036063294034547, "batch_size": 1024}, "time_since_restore": 307.86203122138977, "iterations_since_restore": 5}
{"loss": 128.02527375108613, "timestamp": 1719357338, "checkpoint_dir_name": "checkpoint_000010", "should_checkpoint": true, "done": false, "training_iteration": 11, "trial_id": "18c11cbc", "date": "2024-06-26_01-15-38", "time_this_iter_s": 72.75415802001953, "time_total_s": 693.2544288635254, "pid": 2850562, "hostname": "r8i6n2", "node_ip": "10.159.28.60", "config": {"encoder_num_layer": 4, "decoder_rt_num_layer": 1, "decoder_int_num_layer": 1, "embedding_dim": 16, "encoder_ff": 512, "decoder_rt_ff": 2048, "decoder_int_ff": 512, "n_head": 8, "drop_rate": 0.3688667410196246, "lr": 0.0018036063294034547, "batch_size": 1024}, "time_since_restore": 72.75415802001953, "iterations_since_restore": 1}
{"loss": 130.49067126296637, "timestamp": 1719357397, "checkpoint_dir_name": "checkpoint_000011", "should_checkpoint": true, "done": false, "training_iteration": 12, "trial_id": "18c11cbc", "date": "2024-06-26_01-16-37", "time_this_iter_s": 59.34819006919861, "time_total_s": 752.602618932724, "pid": 2850562, "hostname": "r8i6n2", "node_ip": "10.159.28.60", "config": {"encoder_num_layer": 4, "decoder_rt_num_layer": 1, "decoder_int_num_layer": 1, "embedding_dim": 16, "encoder_ff": 512, "decoder_rt_ff": 2048, "decoder_int_ff": 512, "n_head": 8, "drop_rate": 0.3688667410196246, "lr": 0.0018036063294034547, "batch_size": 1024}, "time_since_restore": 132.10234808921814, "iterations_since_restore": 2}
{"loss": 125.55984971654696, "timestamp": 1719357457, "checkpoint_dir_name": "checkpoint_000012", "should_checkpoint": true, "done": false, "training_iteration": 13, "trial_id": "18c11cbc", "date": "2024-06-26_01-17-37", "time_this_iter_s": 59.262863636016846, "time_total_s": 811.8654825687408, "pid": 2850562, "hostname": "r8i6n2", "node_ip": "10.159.28.60", "config": {"encoder_num_layer": 4, "decoder_rt_num_layer": 1, "decoder_int_num_layer": 1, "embedding_dim": 16, "encoder_ff": 512, "decoder_rt_ff": 2048, "decoder_int_ff": 512, "n_head": 8, "drop_rate": 0.3688667410196246, "lr": 0.0018036063294034547, "batch_size": 1024}, "time_since_restore": 191.36521172523499, "iterations_since_restore": 3}
{"loss": 124.00354583620087, "timestamp": 1719357516, "checkpoint_dir_name": "checkpoint_000013", "should_checkpoint": true, "done": false, "training_iteration": 14, "trial_id": "18c11cbc", "date": "2024-06-26_01-18-36", "time_this_iter_s": 59.35755896568298, "time_total_s": 871.2230415344238, "pid": 2850562, "hostname": "r8i6n2", "node_ip": "10.159.28.60", "config": {"encoder_num_layer": 4, "decoder_rt_num_layer": 1, "decoder_int_num_layer": 1, "embedding_dim": 16, "encoder_ff": 512, "decoder_rt_ff": 2048, "decoder_int_ff": 512, "n_head": 8, "drop_rate": 0.3688667410196246, "lr": 0.0018036063294034547, "batch_size": 1024}, "time_since_restore": 250.72277069091797, "iterations_since_restore": 4}
{"loss": 122.43110686775267, "timestamp": 1719357575, "checkpoint_dir_name": "checkpoint_000014", "should_checkpoint": true, "done": false, "training_iteration": 15, "trial_id": "18c11cbc", "date": "2024-06-26_01-19-35", "time_this_iter_s": 59.15378403663635, "time_total_s": 930.3768255710602, "pid": 2850562, "hostname": "r8i6n2", "node_ip": "10.159.28.60", "config": {"encoder_num_layer": 4, "decoder_rt_num_layer": 1, "decoder_int_num_layer": 1, "embedding_dim": 16, "encoder_ff": 512, "decoder_rt_ff": 2048, "decoder_int_ff": 512, "n_head": 8, "drop_rate": 0.3688667410196246, "lr": 0.0018036063294034547, "batch_size": 1024}, "time_since_restore": 309.8765547275543, "iterations_since_restore": 5}
{"loss": 121.44499277129887, "timestamp": 1719357635, "checkpoint_dir_name": "checkpoint_000015", "should_checkpoint": true, "done": false, "training_iteration": 16, "trial_id": "18c11cbc", "date": "2024-06-26_01-20-35", "time_this_iter_s": 59.61450457572937, "time_total_s": 989.9913301467896, "pid": 2850562, "hostname": "r8i6n2", "node_ip": "10.159.28.60", "config": {"encoder_num_layer": 4, "decoder_rt_num_layer": 1, "decoder_int_num_layer": 1, "embedding_dim": 16, "encoder_ff": 512, "decoder_rt_ff": 2048, "decoder_int_ff": 512, "n_head": 8, "drop_rate": 0.3688667410196246, "lr": 0.0018036063294034547, "batch_size": 1024}, "time_since_restore": 369.4910593032837, "iterations_since_restore": 6}
{"loss": 124.11241438257413, "timestamp": 1719357694, "checkpoint_dir_name": "checkpoint_000016", "should_checkpoint": true, "done": false, "training_iteration": 17, "trial_id": "18c11cbc", "date": "2024-06-26_01-21-34", "time_this_iter_s": 59.14637589454651, "time_total_s": 1049.137706041336, "pid": 2850562, "hostname": "r8i6n2", "node_ip": "10.159.28.60", "config": {"encoder_num_layer": 4, "decoder_rt_num_layer": 1, "decoder_int_num_layer": 1, "embedding_dim": 16, "encoder_ff": 512, "decoder_rt_ff": 2048, "decoder_int_ff": 512, "n_head": 8, "drop_rate": 0.3688667410196246, "lr": 0.0018036063294034547, "batch_size": 1024}, "time_since_restore": 428.6374351978302, "iterations_since_restore": 7}
{"loss": 121.52879060159518, "timestamp": 1719357753, "checkpoint_dir_name": "checkpoint_000017", "should_checkpoint": true, "done": false, "training_iteration": 18, "trial_id": "18c11cbc", "date": "2024-06-26_01-22-33", "time_this_iter_s": 59.11753058433533, "time_total_s": 1108.2552366256714, "pid": 2850562, "hostname": "r8i6n2", "node_ip": "10.159.28.60", "config": {"encoder_num_layer": 4, "decoder_rt_num_layer": 1, "decoder_int_num_layer": 1, "embedding_dim": 16, "encoder_ff": 512, "decoder_rt_ff": 2048, "decoder_int_ff": 512, "n_head": 8, "drop_rate": 0.3688667410196246, "lr": 0.0018036063294034547, "batch_size": 1024}, "time_since_restore": 487.7549657821655, "iterations_since_restore": 8}
{"loss": 118.6914559612124, "timestamp": 1719357812, "checkpoint_dir_name": "checkpoint_000018", "should_checkpoint": true, "done": false, "training_iteration": 19, "trial_id": "18c11cbc", "date": "2024-06-26_01-23-32", "time_this_iter_s": 59.181042432785034, "time_total_s": 1167.4362790584564, "pid": 2850562, "hostname": "r8i6n2", "node_ip": "10.159.28.60", "config": {"encoder_num_layer": 4, "decoder_rt_num_layer": 1, "decoder_int_num_layer": 1, "embedding_dim": 16, "encoder_ff": 512, "decoder_rt_ff": 2048, "decoder_int_ff": 512, "n_head": 8, "drop_rate": 0.3688667410196246, "lr": 0.0018036063294034547, "batch_size": 1024}, "time_since_restore": 546.9360082149506, "iterations_since_restore": 9}
{"loss": 118.33789474006713, "timestamp": 1719357871, "checkpoint_dir_name": "checkpoint_000019", "should_checkpoint": true, "done": false, "training_iteration": 20, "trial_id": "18c11cbc", "date": "2024-06-26_01-24-31", "time_this_iter_s": 59.315247774124146, "time_total_s": 1226.7515268325806, "pid": 2850562, "hostname": "r8i6n2", "node_ip": "10.159.28.60", "config": {"encoder_num_layer": 4, "decoder_rt_num_layer": 1, "decoder_int_num_layer": 1, "embedding_dim": 16, "encoder_ff": 512, "decoder_rt_ff": 2048, "decoder_int_ff": 512, "n_head": 8, "drop_rate": 0.3688667410196246, "lr": 0.0018036063294034547, "batch_size": 1024}, "time_since_restore": 606.2512559890747, "iterations_since_restore": 10}
{
"batch_size": 1024,
"decoder_int_ff": 1024,
"decoder_int_num_layer": 2,
"decoder_rt_ff": 2048,
"decoder_rt_num_layer": 1,
"drop_rate": 0.922082156004742,
"embedding_dim": 64,
"encoder_ff": 2048,
"encoder_num_layer": 4,
"lr": 0.0016525285062566051,
"n_head": 1
}
\ No newline at end of file
loss,timestamp,checkpoint_dir_name,should_checkpoint,done,training_iteration,trial_id,date,time_this_iter_s,time_total_s,pid,hostname,node_ip,time_since_restore,iterations_since_restore
2102.196185254675,1719325314,checkpoint_000000,True,False,1,1920f493,2024-06-25_16-21-54,79.38143539428711,79.38143539428711,69318,r3i5n6,10.159.8.159,79.38143539428711,1
{"loss": 2102.196185254675, "timestamp": 1719325314, "checkpoint_dir_name": "checkpoint_000000", "should_checkpoint": true, "done": false, "training_iteration": 1, "trial_id": "1920f493", "date": "2024-06-25_16-21-54", "time_this_iter_s": 79.38143539428711, "time_total_s": 79.38143539428711, "pid": 69318, "hostname": "r3i5n6", "node_ip": "10.159.8.159", "config": {"encoder_num_layer": 4, "decoder_rt_num_layer": 1, "decoder_int_num_layer": 2, "embedding_dim": 64, "encoder_ff": 2048, "decoder_rt_ff": 2048, "decoder_int_ff": 1024, "n_head": 1, "drop_rate": 0.922082156004742, "lr": 0.0016525285062566051, "batch_size": 1024}, "time_since_restore": 79.38143539428711, "iterations_since_restore": 1}
{
"batch_size": 2048,
"decoder_int_ff": 512,
"decoder_int_num_layer": 1,
"decoder_rt_ff": 1024,
"decoder_rt_num_layer": 4,
"drop_rate": 0.28239791341845644,
"embedding_dim": 1024,
"encoder_ff": 512,
"encoder_num_layer": 2,
"lr": 0.0010916820440167283,
"n_head": 1
}
\ No newline at end of file
loss,timestamp,checkpoint_dir_name,should_checkpoint,done,training_iteration,trial_id,date,time_this_iter_s,time_total_s,pid,hostname,node_ip,time_since_restore,iterations_since_restore
2012.1308699480192,1719499938,checkpoint_000000,True,False,1,1cd059c8,2024-06-27_16-52-19,525.3962342739105,525.3962342739105,242786,r8i6n8,10.159.28.66,525.3962342739105,1
2025.7421327125369,1719500447,checkpoint_000001,True,False,2,1cd059c8,2024-06-27_17-00-47,508.4716944694519,1033.8679287433624,242786,r8i6n8,10.159.28.66,1033.8679287433624,2
{"loss": 2012.1308699480192, "timestamp": 1719499938, "checkpoint_dir_name": "checkpoint_000000", "should_checkpoint": true, "done": false, "training_iteration": 1, "trial_id": "1cd059c8", "date": "2024-06-27_16-52-19", "time_this_iter_s": 525.3962342739105, "time_total_s": 525.3962342739105, "pid": 242786, "hostname": "r8i6n8", "node_ip": "10.159.28.66", "config": {"encoder_num_layer": 2, "decoder_rt_num_layer": 4, "decoder_int_num_layer": 1, "embedding_dim": 1024, "encoder_ff": 512, "decoder_rt_ff": 1024, "decoder_int_ff": 512, "n_head": 1, "drop_rate": 0.28239791341845644, "lr": 0.0010916820440167283, "batch_size": 2048}, "time_since_restore": 525.3962342739105, "iterations_since_restore": 1}
{"loss": 2025.7421327125369, "timestamp": 1719500447, "checkpoint_dir_name": "checkpoint_000001", "should_checkpoint": true, "done": false, "training_iteration": 2, "trial_id": "1cd059c8", "date": "2024-06-27_17-00-47", "time_this_iter_s": 508.4716944694519, "time_total_s": 1033.8679287433624, "pid": 242786, "hostname": "r8i6n8", "node_ip": "10.159.28.66", "config": {"encoder_num_layer": 2, "decoder_rt_num_layer": 4, "decoder_int_num_layer": 1, "embedding_dim": 1024, "encoder_ff": 512, "decoder_rt_ff": 1024, "decoder_int_ff": 512, "n_head": 1, "drop_rate": 0.28239791341845644, "lr": 0.0010916820440167283, "batch_size": 2048}, "time_since_restore": 1033.8679287433624, "iterations_since_restore": 2}
{
"batch_size": 1024,
"decoder_int_ff": 1024,
"decoder_int_num_layer": 8,
"decoder_rt_ff": 512,
"decoder_rt_num_layer": 4,
"drop_rate": 0.03175079062321118,
"embedding_dim": 16,
"encoder_ff": 2048,
"encoder_num_layer": 2,
"lr": 0.003436670599863372,
"n_head": 1
}
\ No newline at end of file
loss,timestamp,checkpoint_dir_name,should_checkpoint,done,training_iteration,trial_id,date,time_this_iter_s,time_total_s,pid,hostname,node_ip,time_since_restore,iterations_since_restore
1961.4144263079786,1719325387,checkpoint_000000,True,False,1,1de460a8,2024-06-25_16-23-07,73.56608295440674,73.56608295440674,69318,r3i5n6,10.159.8.159,73.56608295440674,1
1955.3101080947035,1719327763,checkpoint_000001,True,False,2,1de460a8,2024-06-25_17-02-43,72.80412983894348,146.37021279335022,130020,r3i5n6,10.159.8.159,72.80412983894348,1
{"loss": 1961.4144263079786, "timestamp": 1719325387, "checkpoint_dir_name": "checkpoint_000000", "should_checkpoint": true, "done": false, "training_iteration": 1, "trial_id": "1de460a8", "date": "2024-06-25_16-23-07", "time_this_iter_s": 73.56608295440674, "time_total_s": 73.56608295440674, "pid": 69318, "hostname": "r3i5n6", "node_ip": "10.159.8.159", "config": {"encoder_num_layer": 2, "decoder_rt_num_layer": 4, "decoder_int_num_layer": 8, "embedding_dim": 16, "encoder_ff": 2048, "decoder_rt_ff": 512, "decoder_int_ff": 1024, "n_head": 1, "drop_rate": 0.03175079062321118, "lr": 0.003436670599863372, "batch_size": 1024}, "time_since_restore": 73.56608295440674, "iterations_since_restore": 1}
{"loss": 1955.3101080947035, "timestamp": 1719327763, "checkpoint_dir_name": "checkpoint_000001", "should_checkpoint": true, "done": false, "training_iteration": 2, "trial_id": "1de460a8", "date": "2024-06-25_17-02-43", "time_this_iter_s": 72.80412983894348, "time_total_s": 146.37021279335022, "pid": 130020, "hostname": "r3i5n6", "node_ip": "10.159.8.159", "config": {"encoder_num_layer": 2, "decoder_rt_num_layer": 4, "decoder_int_num_layer": 8, "embedding_dim": 16, "encoder_ff": 2048, "decoder_rt_ff": 512, "decoder_int_ff": 1024, "n_head": 1, "drop_rate": 0.03175079062321118, "lr": 0.003436670599863372, "batch_size": 1024}, "time_since_restore": 72.80412983894348, "iterations_since_restore": 1}
Failure # 1 (occurred at 2024-06-25_17-03-02)
ray::ImplicitFunc.train() (pid=130020, ip=10.159.8.159, actor_id=595d5a321c4a0818d73ea9fa01000000, repr=train_model)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/gpfslocalsup/pub/anaconda-py3/2023.09/envs/pytorch-gpu-2.2.0+py3.11.7/lib/python3.11/site-packages/ray/tune/trainable/trainable.py", line 342, in train
raise skipped from exception_cause(skipped)
File "/gpfslocalsup/pub/anaconda-py3/2023.09/envs/pytorch-gpu-2.2.0+py3.11.7/lib/python3.11/site-packages/ray/air/_internal/util.py", line 88, in run
self._ret = self._target(*self._args, **self._kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/gpfslocalsup/pub/anaconda-py3/2023.09/envs/pytorch-gpu-2.2.0+py3.11.7/lib/python3.11/site-packages/ray/tune/trainable/function_trainable.py", line 249, in <lambda>
training_func=lambda: self._trainable_func(self.config),
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/gpfslocalsup/pub/anaconda-py3/2023.09/envs/pytorch-gpu-2.2.0+py3.11.7/lib/python3.11/site-packages/ray/tune/trainable/function_trainable.py", line 332, in _trainable_func
output = fn()
^^^^
File "/gpfslocalsup/pub/anaconda-py3/2023.09/envs/pytorch-gpu-2.2.0+py3.11.7/lib/python3.11/site-packages/ray/tune/trainable/util.py", line 138, in inner
return trainable(config, **fn_kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/gpfswork/rech/ute/ucg81ws/these/LC-MS-RT-prediction/main_ray_tune.py", line 98, in train_model
loss.backward()
File "/gpfslocalsup/pub/anaconda-py3/2023.09/envs/pytorch-gpu-2.2.0+py3.11.7/lib/python3.11/site-packages/torch/_tensor.py", line 522, in backward
torch.autograd.backward(
File "/gpfslocalsup/pub/anaconda-py3/2023.09/envs/pytorch-gpu-2.2.0+py3.11.7/lib/python3.11/site-packages/torch/autograd/__init__.py", line 266, in backward
Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 258.00 MiB. GPU 0 has a total capacity of 15.77 GiB of which 216.69 MiB is free. Including non-PyTorch memory, this process has 15.55 GiB memory in use. Of the allocated memory 15.02 GiB is allocated by PyTorch, and 161.15 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment