Skip to content
Snippets Groups Projects
Commit 02f95633 authored by Schneider Leo's avatar Schneider Leo
Browse files

del raysesult

parent b844726e
No related branches found
No related tags found
No related merge requests found
Showing
with 0 additions and 154 deletions
{"loss": 1986.2198524775467, "timestamp": 1719348983, "checkpoint_dir_name": "checkpoint_000000", "should_checkpoint": true, "done": false, "training_iteration": 1, "trial_id": "0f80a6d7", "date": "2024-06-25_22-56-24", "time_this_iter_s": 363.18635082244873, "time_total_s": 363.18635082244873, "pid": 2195253, "hostname": "r8i6n2", "node_ip": "10.159.28.60", "config": {"encoder_num_layer": 2, "decoder_rt_num_layer": 2, "decoder_int_num_layer": 1, "embedding_dim": 1024, "encoder_ff": 1024, "decoder_rt_ff": 512, "decoder_int_ff": 512, "n_head": 16, "drop_rate": 0.023920599186226577, "lr": 0.0024687399238039302, "batch_size": 2048}, "time_since_restore": 363.18635082244873, "iterations_since_restore": 1}
{"loss": 1979.5493135226993, "timestamp": 1719349334, "checkpoint_dir_name": "checkpoint_000001", "should_checkpoint": true, "done": false, "training_iteration": 2, "trial_id": "0f80a6d7", "date": "2024-06-25_23-02-14", "time_this_iter_s": 350.295019865036, "time_total_s": 713.4813706874847, "pid": 2195253, "hostname": "r8i6n2", "node_ip": "10.159.28.60", "config": {"encoder_num_layer": 2, "decoder_rt_num_layer": 2, "decoder_int_num_layer": 1, "embedding_dim": 1024, "encoder_ff": 1024, "decoder_rt_ff": 512, "decoder_int_ff": 512, "n_head": 16, "drop_rate": 0.023920599186226577, "lr": 0.0024687399238039302, "batch_size": 2048}, "time_since_restore": 713.4813706874847, "iterations_since_restore": 2}
{"loss": 1964.9348182978592, "timestamp": 1719349683, "checkpoint_dir_name": "checkpoint_000002", "should_checkpoint": true, "done": false, "training_iteration": 3, "trial_id": "0f80a6d7", "date": "2024-06-25_23-08-04", "time_this_iter_s": 349.6246259212494, "time_total_s": 1063.1059966087341, "pid": 2195253, "hostname": "r8i6n2", "node_ip": "10.159.28.60", "config": {"encoder_num_layer": 2, "decoder_rt_num_layer": 2, "decoder_int_num_layer": 1, "embedding_dim": 1024, "encoder_ff": 1024, "decoder_rt_ff": 512, "decoder_int_ff": 512, "n_head": 16, "drop_rate": 0.023920599186226577, "lr": 0.0024687399238039302, "batch_size": 2048}, "time_since_restore": 1063.1059966087341, "iterations_since_restore": 3}
{"loss": 1955.6751497523992, "timestamp": 1719350034, "checkpoint_dir_name": "checkpoint_000003", "should_checkpoint": true, "done": false, "training_iteration": 4, "trial_id": "0f80a6d7", "date": "2024-06-25_23-13-54", "time_this_iter_s": 350.46285796165466, "time_total_s": 1413.5688545703888, "pid": 2195253, "hostname": "r8i6n2", "node_ip": "10.159.28.60", "config": {"encoder_num_layer": 2, "decoder_rt_num_layer": 2, "decoder_int_num_layer": 1, "embedding_dim": 1024, "encoder_ff": 1024, "decoder_rt_ff": 512, "decoder_int_ff": 512, "n_head": 16, "drop_rate": 0.023920599186226577, "lr": 0.0024687399238039302, "batch_size": 2048}, "time_since_restore": 1413.5688545703888, "iterations_since_restore": 4}
{"loss": 1956.1783514548474, "timestamp": 1719350384, "checkpoint_dir_name": "checkpoint_000004", "should_checkpoint": true, "done": false, "training_iteration": 5, "trial_id": "0f80a6d7", "date": "2024-06-25_23-19-44", "time_this_iter_s": 350.18478202819824, "time_total_s": 1763.753636598587, "pid": 2195253, "hostname": "r8i6n2", "node_ip": "10.159.28.60", "config": {"encoder_num_layer": 2, "decoder_rt_num_layer": 2, "decoder_int_num_layer": 1, "embedding_dim": 1024, "encoder_ff": 1024, "decoder_rt_ff": 512, "decoder_int_ff": 512, "n_head": 16, "drop_rate": 0.023920599186226577, "lr": 0.0024687399238039302, "batch_size": 2048}, "time_since_restore": 1763.753636598587, "iterations_since_restore": 5}
{"loss": 1971.8988527312993, "timestamp": 1719355866, "checkpoint_dir_name": "checkpoint_000005", "should_checkpoint": true, "done": false, "training_iteration": 6, "trial_id": "0f80a6d7", "date": "2024-06-26_00-51-07", "time_this_iter_s": 365.2865264415741, "time_total_s": 2129.040163040161, "pid": 2850562, "hostname": "r8i6n2", "node_ip": "10.159.28.60", "config": {"encoder_num_layer": 2, "decoder_rt_num_layer": 2, "decoder_int_num_layer": 1, "embedding_dim": 1024, "encoder_ff": 1024, "decoder_rt_ff": 512, "decoder_int_ff": 512, "n_head": 16, "drop_rate": 0.023920599186226577, "lr": 0.0024687399238039302, "batch_size": 2048}, "time_since_restore": 365.2865264415741, "iterations_since_restore": 1}
{"loss": 1955.6729476808564, "timestamp": 1719356216, "checkpoint_dir_name": "checkpoint_000006", "should_checkpoint": true, "done": false, "training_iteration": 7, "trial_id": "0f80a6d7", "date": "2024-06-26_00-56-56", "time_this_iter_s": 349.18421268463135, "time_total_s": 2478.2243757247925, "pid": 2850562, "hostname": "r8i6n2", "node_ip": "10.159.28.60", "config": {"encoder_num_layer": 2, "decoder_rt_num_layer": 2, "decoder_int_num_layer": 1, "embedding_dim": 1024, "encoder_ff": 1024, "decoder_rt_ff": 512, "decoder_int_ff": 512, "n_head": 16, "drop_rate": 0.023920599186226577, "lr": 0.0024687399238039302, "batch_size": 2048}, "time_since_restore": 714.4707391262054, "iterations_since_restore": 2}
{"loss": 1957.4972913847196, "timestamp": 1719356565, "checkpoint_dir_name": "checkpoint_000007", "should_checkpoint": true, "done": false, "training_iteration": 8, "trial_id": "0f80a6d7", "date": "2024-06-26_01-02-46", "time_this_iter_s": 349.7124717235565, "time_total_s": 2827.936847448349, "pid": 2850562, "hostname": "r8i6n2", "node_ip": "10.159.28.60", "config": {"encoder_num_layer": 2, "decoder_rt_num_layer": 2, "decoder_int_num_layer": 1, "embedding_dim": 1024, "encoder_ff": 1024, "decoder_rt_ff": 512, "decoder_int_ff": 512, "n_head": 16, "drop_rate": 0.023920599186226577, "lr": 0.0024687399238039302, "batch_size": 2048}, "time_since_restore": 1064.183210849762, "iterations_since_restore": 3}
{"loss": 1967.2758471871923, "timestamp": 1719356915, "checkpoint_dir_name": "checkpoint_000008", "should_checkpoint": true, "done": false, "training_iteration": 9, "trial_id": "0f80a6d7", "date": "2024-06-26_01-08-35", "time_this_iter_s": 349.3587329387665, "time_total_s": 3177.2955803871155, "pid": 2850562, "hostname": "r8i6n2", "node_ip": "10.159.28.60", "config": {"encoder_num_layer": 2, "decoder_rt_num_layer": 2, "decoder_int_num_layer": 1, "embedding_dim": 1024, "encoder_ff": 1024, "decoder_rt_ff": 512, "decoder_int_ff": 512, "n_head": 16, "drop_rate": 0.023920599186226577, "lr": 0.0024687399238039302, "batch_size": 2048}, "time_since_restore": 1413.5419437885284, "iterations_since_restore": 4}
{"loss": 1959.91755063515, "timestamp": 1719357265, "checkpoint_dir_name": "checkpoint_000009", "should_checkpoint": true, "done": true, "training_iteration": 10, "trial_id": "0f80a6d7", "date": "2024-06-26_01-14-25", "time_this_iter_s": 349.87603521347046, "time_total_s": 3527.171615600586, "pid": 2850562, "hostname": "r8i6n2", "node_ip": "10.159.28.60", "config": {"encoder_num_layer": 2, "decoder_rt_num_layer": 2, "decoder_int_num_layer": 1, "embedding_dim": 1024, "encoder_ff": 1024, "decoder_rt_ff": 512, "decoder_int_ff": 512, "n_head": 16, "drop_rate": 0.023920599186226577, "lr": 0.0024687399238039302, "batch_size": 2048}, "time_since_restore": 1763.417979001999, "iterations_since_restore": 5}
{
"batch_size": 2048,
"decoder_int_ff": 512,
"decoder_int_num_layer": 1,
"decoder_rt_ff": 512,
"decoder_rt_num_layer": 2,
"drop_rate": 0.005508735010543275,
"embedding_dim": 1024,
"encoder_ff": 1024,
"encoder_num_layer": 4,
"lr": 0.0011880094129501908,
"n_head": 1
}
\ No newline at end of file
loss,timestamp,checkpoint_dir_name,should_checkpoint,done,training_iteration,trial_id,date,time_this_iter_s,time_total_s,pid,hostname,node_ip,time_since_restore,iterations_since_restore
1956.0084007443406,1719335168,checkpoint_000000,True,False,1,0f9cc97d,2024-06-25_19-06-08,519.6532871723175,519.6532871723175,2195253,r8i6n2,10.159.28.60,519.6532871723175,1
1970.2912184347317,1719335674,checkpoint_000001,True,False,2,0f9cc97d,2024-06-25_19-14-34,505.92808413505554,1025.581371307373,2195253,r8i6n2,10.159.28.60,1025.581371307373,2
1954.4228304164617,1719339260,checkpoint_000002,True,False,3,0f9cc97d,2024-06-25_20-14-21,522.2363362312317,1547.8177075386047,2195253,r8i6n2,10.159.28.60,522.2363362312317,1
1969.7449739711492,1719339766,checkpoint_000003,True,False,4,0f9cc97d,2024-06-25_20-22-47,506.13422203063965,2053.9519295692444,2195253,r8i6n2,10.159.28.60,1028.3705582618713,2
1951.9356997031866,1719341192,checkpoint_000004,True,False,5,0f9cc97d,2024-06-25_20-46-33,522.4360599517822,2576.3879895210266,2195253,r8i6n2,10.159.28.60,522.4360599517822,1
1952.9474155696357,1719341699,checkpoint_000005,True,False,6,0f9cc97d,2024-06-25_20-54-59,506.28402161598206,3082.6720111370087,2195253,r8i6n2,10.159.28.60,1028.7200815677643,2
1954.9041440468134,1719342205,checkpoint_000006,True,False,7,0f9cc97d,2024-06-25_21-03-25,506.20089197158813,3588.872903108597,2195253,r8i6n2,10.159.28.60,1534.9209735393524,3
1954.9512458861343,1719342711,checkpoint_000007,True,False,8,0f9cc97d,2024-06-25_21-11-51,505.91624569892883,4094.7891488075256,2195253,r8i6n2,10.159.28.60,2040.8372192382812,4
1955.8870253675566,1719343217,checkpoint_000008,True,False,9,0f9cc97d,2024-06-25_21-20-17,505.6786599159241,4600.46780872345,2195253,r8i6n2,10.159.28.60,2546.5158791542053,5
1953.494582769439,1719343722,checkpoint_000009,True,False,10,0f9cc97d,2024-06-25_21-28-43,505.8313031196594,5106.299111843109,2195253,r8i6n2,10.159.28.60,3052.3471822738647,6
1953.025947150283,1719344228,checkpoint_000010,True,False,11,0f9cc97d,2024-06-25_21-37-09,505.74103355407715,5612.040145397186,2195253,r8i6n2,10.159.28.60,3558.088215827942,7
1956.2824668583908,1719344735,checkpoint_000011,True,False,12,0f9cc97d,2024-06-25_21-45-35,506.4504108428955,6118.490556240082,2195253,r8i6n2,10.159.28.60,4064.5386266708374,8
1955.5371737742987,1719345241,checkpoint_000012,True,False,13,0f9cc97d,2024-06-25_21-54-01,506.19851207733154,6624.689068317413,2195253,r8i6n2,10.159.28.60,4570.737138748169,9
1956.4829832062007,1719345747,checkpoint_000013,True,False,14,0f9cc97d,2024-06-25_22-02-27,506.25228786468506,7130.941356182098,2195253,r8i6n2,10.159.28.60,5076.989426612854,10
{"loss": 1956.0084007443406, "timestamp": 1719335168, "checkpoint_dir_name": "checkpoint_000000", "should_checkpoint": true, "done": false, "training_iteration": 1, "trial_id": "0f9cc97d", "date": "2024-06-25_19-06-08", "time_this_iter_s": 519.6532871723175, "time_total_s": 519.6532871723175, "pid": 2195253, "hostname": "r8i6n2", "node_ip": "10.159.28.60", "config": {"encoder_num_layer": 4, "decoder_rt_num_layer": 2, "decoder_int_num_layer": 1, "embedding_dim": 1024, "encoder_ff": 1024, "decoder_rt_ff": 512, "decoder_int_ff": 512, "n_head": 1, "drop_rate": 0.005508735010543275, "lr": 0.0011880094129501908, "batch_size": 2048}, "time_since_restore": 519.6532871723175, "iterations_since_restore": 1}
{"loss": 1970.2912184347317, "timestamp": 1719335674, "checkpoint_dir_name": "checkpoint_000001", "should_checkpoint": true, "done": false, "training_iteration": 2, "trial_id": "0f9cc97d", "date": "2024-06-25_19-14-34", "time_this_iter_s": 505.92808413505554, "time_total_s": 1025.581371307373, "pid": 2195253, "hostname": "r8i6n2", "node_ip": "10.159.28.60", "config": {"encoder_num_layer": 4, "decoder_rt_num_layer": 2, "decoder_int_num_layer": 1, "embedding_dim": 1024, "encoder_ff": 1024, "decoder_rt_ff": 512, "decoder_int_ff": 512, "n_head": 1, "drop_rate": 0.005508735010543275, "lr": 0.0011880094129501908, "batch_size": 2048}, "time_since_restore": 1025.581371307373, "iterations_since_restore": 2}
{"loss": 1954.4228304164617, "timestamp": 1719339260, "checkpoint_dir_name": "checkpoint_000002", "should_checkpoint": true, "done": false, "training_iteration": 3, "trial_id": "0f9cc97d", "date": "2024-06-25_20-14-21", "time_this_iter_s": 522.2363362312317, "time_total_s": 1547.8177075386047, "pid": 2195253, "hostname": "r8i6n2", "node_ip": "10.159.28.60", "config": {"encoder_num_layer": 4, "decoder_rt_num_layer": 2, "decoder_int_num_layer": 1, "embedding_dim": 1024, "encoder_ff": 1024, "decoder_rt_ff": 512, "decoder_int_ff": 512, "n_head": 1, "drop_rate": 0.005508735010543275, "lr": 0.0011880094129501908, "batch_size": 2048}, "time_since_restore": 522.2363362312317, "iterations_since_restore": 1}
{"loss": 1969.7449739711492, "timestamp": 1719339766, "checkpoint_dir_name": "checkpoint_000003", "should_checkpoint": true, "done": false, "training_iteration": 4, "trial_id": "0f9cc97d", "date": "2024-06-25_20-22-47", "time_this_iter_s": 506.13422203063965, "time_total_s": 2053.9519295692444, "pid": 2195253, "hostname": "r8i6n2", "node_ip": "10.159.28.60", "config": {"encoder_num_layer": 4, "decoder_rt_num_layer": 2, "decoder_int_num_layer": 1, "embedding_dim": 1024, "encoder_ff": 1024, "decoder_rt_ff": 512, "decoder_int_ff": 512, "n_head": 1, "drop_rate": 0.005508735010543275, "lr": 0.0011880094129501908, "batch_size": 2048}, "time_since_restore": 1028.3705582618713, "iterations_since_restore": 2}
{"loss": 1951.9356997031866, "timestamp": 1719341192, "checkpoint_dir_name": "checkpoint_000004", "should_checkpoint": true, "done": false, "training_iteration": 5, "trial_id": "0f9cc97d", "date": "2024-06-25_20-46-33", "time_this_iter_s": 522.4360599517822, "time_total_s": 2576.3879895210266, "pid": 2195253, "hostname": "r8i6n2", "node_ip": "10.159.28.60", "config": {"encoder_num_layer": 4, "decoder_rt_num_layer": 2, "decoder_int_num_layer": 1, "embedding_dim": 1024, "encoder_ff": 1024, "decoder_rt_ff": 512, "decoder_int_ff": 512, "n_head": 1, "drop_rate": 0.005508735010543275, "lr": 0.0011880094129501908, "batch_size": 2048}, "time_since_restore": 522.4360599517822, "iterations_since_restore": 1}
{"loss": 1952.9474155696357, "timestamp": 1719341699, "checkpoint_dir_name": "checkpoint_000005", "should_checkpoint": true, "done": false, "training_iteration": 6, "trial_id": "0f9cc97d", "date": "2024-06-25_20-54-59", "time_this_iter_s": 506.28402161598206, "time_total_s": 3082.6720111370087, "pid": 2195253, "hostname": "r8i6n2", "node_ip": "10.159.28.60", "config": {"encoder_num_layer": 4, "decoder_rt_num_layer": 2, "decoder_int_num_layer": 1, "embedding_dim": 1024, "encoder_ff": 1024, "decoder_rt_ff": 512, "decoder_int_ff": 512, "n_head": 1, "drop_rate": 0.005508735010543275, "lr": 0.0011880094129501908, "batch_size": 2048}, "time_since_restore": 1028.7200815677643, "iterations_since_restore": 2}
{"loss": 1954.9041440468134, "timestamp": 1719342205, "checkpoint_dir_name": "checkpoint_000006", "should_checkpoint": true, "done": false, "training_iteration": 7, "trial_id": "0f9cc97d", "date": "2024-06-25_21-03-25", "time_this_iter_s": 506.20089197158813, "time_total_s": 3588.872903108597, "pid": 2195253, "hostname": "r8i6n2", "node_ip": "10.159.28.60", "config": {"encoder_num_layer": 4, "decoder_rt_num_layer": 2, "decoder_int_num_layer": 1, "embedding_dim": 1024, "encoder_ff": 1024, "decoder_rt_ff": 512, "decoder_int_ff": 512, "n_head": 1, "drop_rate": 0.005508735010543275, "lr": 0.0011880094129501908, "batch_size": 2048}, "time_since_restore": 1534.9209735393524, "iterations_since_restore": 3}
{"loss": 1954.9512458861343, "timestamp": 1719342711, "checkpoint_dir_name": "checkpoint_000007", "should_checkpoint": true, "done": false, "training_iteration": 8, "trial_id": "0f9cc97d", "date": "2024-06-25_21-11-51", "time_this_iter_s": 505.91624569892883, "time_total_s": 4094.7891488075256, "pid": 2195253, "hostname": "r8i6n2", "node_ip": "10.159.28.60", "config": {"encoder_num_layer": 4, "decoder_rt_num_layer": 2, "decoder_int_num_layer": 1, "embedding_dim": 1024, "encoder_ff": 1024, "decoder_rt_ff": 512, "decoder_int_ff": 512, "n_head": 1, "drop_rate": 0.005508735010543275, "lr": 0.0011880094129501908, "batch_size": 2048}, "time_since_restore": 2040.8372192382812, "iterations_since_restore": 4}
{"loss": 1955.8870253675566, "timestamp": 1719343217, "checkpoint_dir_name": "checkpoint_000008", "should_checkpoint": true, "done": false, "training_iteration": 9, "trial_id": "0f9cc97d", "date": "2024-06-25_21-20-17", "time_this_iter_s": 505.6786599159241, "time_total_s": 4600.46780872345, "pid": 2195253, "hostname": "r8i6n2", "node_ip": "10.159.28.60", "config": {"encoder_num_layer": 4, "decoder_rt_num_layer": 2, "decoder_int_num_layer": 1, "embedding_dim": 1024, "encoder_ff": 1024, "decoder_rt_ff": 512, "decoder_int_ff": 512, "n_head": 1, "drop_rate": 0.005508735010543275, "lr": 0.0011880094129501908, "batch_size": 2048}, "time_since_restore": 2546.5158791542053, "iterations_since_restore": 5}
{"loss": 1953.494582769439, "timestamp": 1719343722, "checkpoint_dir_name": "checkpoint_000009", "should_checkpoint": true, "done": false, "training_iteration": 10, "trial_id": "0f9cc97d", "date": "2024-06-25_21-28-43", "time_this_iter_s": 505.8313031196594, "time_total_s": 5106.299111843109, "pid": 2195253, "hostname": "r8i6n2", "node_ip": "10.159.28.60", "config": {"encoder_num_layer": 4, "decoder_rt_num_layer": 2, "decoder_int_num_layer": 1, "embedding_dim": 1024, "encoder_ff": 1024, "decoder_rt_ff": 512, "decoder_int_ff": 512, "n_head": 1, "drop_rate": 0.005508735010543275, "lr": 0.0011880094129501908, "batch_size": 2048}, "time_since_restore": 3052.3471822738647, "iterations_since_restore": 6}
{"loss": 1953.025947150283, "timestamp": 1719344228, "checkpoint_dir_name": "checkpoint_000010", "should_checkpoint": true, "done": false, "training_iteration": 11, "trial_id": "0f9cc97d", "date": "2024-06-25_21-37-09", "time_this_iter_s": 505.74103355407715, "time_total_s": 5612.040145397186, "pid": 2195253, "hostname": "r8i6n2", "node_ip": "10.159.28.60", "config": {"encoder_num_layer": 4, "decoder_rt_num_layer": 2, "decoder_int_num_layer": 1, "embedding_dim": 1024, "encoder_ff": 1024, "decoder_rt_ff": 512, "decoder_int_ff": 512, "n_head": 1, "drop_rate": 0.005508735010543275, "lr": 0.0011880094129501908, "batch_size": 2048}, "time_since_restore": 3558.088215827942, "iterations_since_restore": 7}
{"loss": 1956.2824668583908, "timestamp": 1719344735, "checkpoint_dir_name": "checkpoint_000011", "should_checkpoint": true, "done": false, "training_iteration": 12, "trial_id": "0f9cc97d", "date": "2024-06-25_21-45-35", "time_this_iter_s": 506.4504108428955, "time_total_s": 6118.490556240082, "pid": 2195253, "hostname": "r8i6n2", "node_ip": "10.159.28.60", "config": {"encoder_num_layer": 4, "decoder_rt_num_layer": 2, "decoder_int_num_layer": 1, "embedding_dim": 1024, "encoder_ff": 1024, "decoder_rt_ff": 512, "decoder_int_ff": 512, "n_head": 1, "drop_rate": 0.005508735010543275, "lr": 0.0011880094129501908, "batch_size": 2048}, "time_since_restore": 4064.5386266708374, "iterations_since_restore": 8}
{"loss": 1955.5371737742987, "timestamp": 1719345241, "checkpoint_dir_name": "checkpoint_000012", "should_checkpoint": true, "done": false, "training_iteration": 13, "trial_id": "0f9cc97d", "date": "2024-06-25_21-54-01", "time_this_iter_s": 506.19851207733154, "time_total_s": 6624.689068317413, "pid": 2195253, "hostname": "r8i6n2", "node_ip": "10.159.28.60", "config": {"encoder_num_layer": 4, "decoder_rt_num_layer": 2, "decoder_int_num_layer": 1, "embedding_dim": 1024, "encoder_ff": 1024, "decoder_rt_ff": 512, "decoder_int_ff": 512, "n_head": 1, "drop_rate": 0.005508735010543275, "lr": 0.0011880094129501908, "batch_size": 2048}, "time_since_restore": 4570.737138748169, "iterations_since_restore": 9}
{"loss": 1956.4829832062007, "timestamp": 1719345747, "checkpoint_dir_name": "checkpoint_000013", "should_checkpoint": true, "done": false, "training_iteration": 14, "trial_id": "0f9cc97d", "date": "2024-06-25_22-02-27", "time_this_iter_s": 506.25228786468506, "time_total_s": 7130.941356182098, "pid": 2195253, "hostname": "r8i6n2", "node_ip": "10.159.28.60", "config": {"encoder_num_layer": 4, "decoder_rt_num_layer": 2, "decoder_int_num_layer": 1, "embedding_dim": 1024, "encoder_ff": 1024, "decoder_rt_ff": 512, "decoder_int_ff": 512, "n_head": 1, "drop_rate": 0.005508735010543275, "lr": 0.0011880094129501908, "batch_size": 2048}, "time_since_restore": 5076.989426612854, "iterations_since_restore": 10}
{
"batch_size": 2048,
"decoder_int_ff": 512,
"decoder_int_num_layer": 1,
"decoder_rt_ff": 1024,
"decoder_rt_num_layer": 2,
"drop_rate": 0.025073968761468723,
"embedding_dim": 256,
"encoder_ff": 2048,
"encoder_num_layer": 2,
"lr": 0.0010259835678295768,
"n_head": 16
}
\ No newline at end of file
loss,timestamp,checkpoint_dir_name,should_checkpoint,done,training_iteration,trial_id,date,time_this_iter_s,time_total_s,pid,hostname,node_ip,time_since_restore,iterations_since_restore
1956.1862581508367,1719330289,checkpoint_000000,True,False,1,110f0a27,2024-06-25_17-44-49,110.66666960716248,110.66666960716248,2195253,r8i6n2,10.159.28.60,110.66666960716248,1
1952.48307740219,1719331057,checkpoint_000001,True,False,2,110f0a27,2024-06-25_17-57-37,110.16742134094238,220.83409094810486,2195253,r8i6n2,10.159.28.60,110.16742134094238,1
{"loss": 1956.1862581508367, "timestamp": 1719330289, "checkpoint_dir_name": "checkpoint_000000", "should_checkpoint": true, "done": false, "training_iteration": 1, "trial_id": "110f0a27", "date": "2024-06-25_17-44-49", "time_this_iter_s": 110.66666960716248, "time_total_s": 110.66666960716248, "pid": 2195253, "hostname": "r8i6n2", "node_ip": "10.159.28.60", "config": {"encoder_num_layer": 2, "decoder_rt_num_layer": 2, "decoder_int_num_layer": 1, "embedding_dim": 256, "encoder_ff": 2048, "decoder_rt_ff": 1024, "decoder_int_ff": 512, "n_head": 16, "drop_rate": 0.025073968761468723, "lr": 0.0010259835678295768, "batch_size": 2048}, "time_since_restore": 110.66666960716248, "iterations_since_restore": 1}
{"loss": 1952.48307740219, "timestamp": 1719331057, "checkpoint_dir_name": "checkpoint_000001", "should_checkpoint": true, "done": false, "training_iteration": 2, "trial_id": "110f0a27", "date": "2024-06-25_17-57-37", "time_this_iter_s": 110.16742134094238, "time_total_s": 220.83409094810486, "pid": 2195253, "hostname": "r8i6n2", "node_ip": "10.159.28.60", "config": {"encoder_num_layer": 2, "decoder_rt_num_layer": 2, "decoder_int_num_layer": 1, "embedding_dim": 256, "encoder_ff": 2048, "decoder_rt_ff": 1024, "decoder_int_ff": 512, "n_head": 16, "drop_rate": 0.025073968761468723, "lr": 0.0010259835678295768, "batch_size": 2048}, "time_since_restore": 110.16742134094238, "iterations_since_restore": 1}
Failure # 1 (occurred at 2024-06-27_14-37-55)
ray::ImplicitFunc.train() (pid=35196, ip=10.159.28.66, actor_id=76f13cb1c39b1e46c8a07d1101000000, repr=train_model)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/gpfslocalsup/pub/anaconda-py3/2023.09/envs/pytorch-gpu-2.2.0+py3.11.7/lib/python3.11/site-packages/ray/tune/trainable/trainable.py", line 342, in train
raise skipped from exception_cause(skipped)
File "/gpfslocalsup/pub/anaconda-py3/2023.09/envs/pytorch-gpu-2.2.0+py3.11.7/lib/python3.11/site-packages/ray/air/_internal/util.py", line 88, in run
self._ret = self._target(*self._args, **self._kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/gpfslocalsup/pub/anaconda-py3/2023.09/envs/pytorch-gpu-2.2.0+py3.11.7/lib/python3.11/site-packages/ray/tune/trainable/function_trainable.py", line 249, in <lambda>
training_func=lambda: self._trainable_func(self.config),
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/gpfslocalsup/pub/anaconda-py3/2023.09/envs/pytorch-gpu-2.2.0+py3.11.7/lib/python3.11/site-packages/ray/tune/trainable/function_trainable.py", line 332, in _trainable_func
output = fn()
^^^^
File "/gpfslocalsup/pub/anaconda-py3/2023.09/envs/pytorch-gpu-2.2.0+py3.11.7/lib/python3.11/site-packages/ray/tune/trainable/util.py", line 138, in inner
return trainable(config, **fn_kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/gpfswork/rech/ute/ucg81ws/these/LC-MS-RT-prediction/main_ray_tune.py", line 66, in train_model
pred_rt = net.module.forward_rt(seq)
^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/gpfsdswork/projects/rech/ute/ucg81ws/these/LC-MS-RT-prediction/model_custom.py", line 111, in forward_rt
out_rt = self.decoder_RT(enc)
^^^^^^^^^^^^^^^^^^^^
File "/gpfslocalsup/pub/anaconda-py3/2023.09/envs/pytorch-gpu-2.2.0+py3.11.7/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/gpfslocalsup/pub/anaconda-py3/2023.09/envs/pytorch-gpu-2.2.0+py3.11.7/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/gpfslocalsup/pub/anaconda-py3/2023.09/envs/pytorch-gpu-2.2.0+py3.11.7/lib/python3.11/site-packages/torch/nn/modules/container.py", line 217, in forward
input = module(input)
^^^^^^^^^^^^^
File "/gpfslocalsup/pub/anaconda-py3/2023.09/envs/pytorch-gpu-2.2.0+py3.11.7/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/gpfslocalsup/pub/anaconda-py3/2023.09/envs/pytorch-gpu-2.2.0+py3.11.7/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/gpfslocalsup/pub/anaconda-py3/2023.09/envs/pytorch-gpu-2.2.0+py3.11.7/lib/python3.11/site-packages/torch/nn/modules/transformer.py", line 391, in forward
output = mod(output, src_mask=mask, is_causal=is_causal, src_key_padding_mask=src_key_padding_mask_for_layers)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/gpfslocalsup/pub/anaconda-py3/2023.09/envs/pytorch-gpu-2.2.0+py3.11.7/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/gpfslocalsup/pub/anaconda-py3/2023.09/envs/pytorch-gpu-2.2.0+py3.11.7/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/gpfslocalsup/pub/anaconda-py3/2023.09/envs/pytorch-gpu-2.2.0+py3.11.7/lib/python3.11/site-packages/torch/nn/modules/transformer.py", line 714, in forward
x = self.norm1(x + self._sa_block(x, src_mask, src_key_padding_mask, is_causal=is_causal))
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/gpfslocalsup/pub/anaconda-py3/2023.09/envs/pytorch-gpu-2.2.0+py3.11.7/lib/python3.11/site-packages/torch/nn/modules/transformer.py", line 722, in _sa_block
x = self.self_attn(x, x, x,
^^^^^^^^^^^^^^^^^^^^^^^
File "/gpfslocalsup/pub/anaconda-py3/2023.09/envs/pytorch-gpu-2.2.0+py3.11.7/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/gpfslocalsup/pub/anaconda-py3/2023.09/envs/pytorch-gpu-2.2.0+py3.11.7/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/gpfslocalsup/pub/anaconda-py3/2023.09/envs/pytorch-gpu-2.2.0+py3.11.7/lib/python3.11/site-packages/torch/nn/modules/activation.py", line 1241, in forward
attn_output, attn_output_weights = F.multi_head_attention_forward(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/gpfslocalsup/pub/anaconda-py3/2023.09/envs/pytorch-gpu-2.2.0+py3.11.7/lib/python3.11/site-packages/torch/nn/functional.py", line 5336, in multi_head_attention_forward
q, k, v = _in_projection_packed(query, key, value, in_proj_weight, in_proj_bias)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/gpfslocalsup/pub/anaconda-py3/2023.09/envs/pytorch-gpu-2.2.0+py3.11.7/lib/python3.11/site-packages/torch/nn/functional.py", line 4857, in _in_projection_packed
proj = linear(q, w, b)
^^^^^^^^^^^^^^^
torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 600.00 MiB. GPU 0 has a total capacity of 31.73 GiB of which 64.69 MiB is free. Including non-PyTorch memory, this process has 31.67 GiB memory in use. Of the allocated memory 31.05 GiB is allocated by PyTorch, and 253.16 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
{
"batch_size": 2048,
"decoder_int_ff": 512,
"decoder_int_num_layer": 1,
"decoder_rt_ff": 2048,
"decoder_rt_num_layer": 8,
"drop_rate": 0.18892533975584302,
"embedding_dim": 1024,
"encoder_ff": 2048,
"encoder_num_layer": 4,
"lr": 0.0023772931833339287,
"n_head": 2
}
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment