Commit 915e419e authored by Léo Schneider, committed by Schneider Leo

res

parent a790c5c2
Showing 178 additions and 0 deletions
{
"batch_size": 2048,
"decoder_int_ff": 512,
"decoder_int_num_layer": 1,
"decoder_rt_ff": 512,
"decoder_rt_num_layer": 2,
"drop_rate": 0.6143131375049876,
"embedding_dim": 64,
"encoder_ff": 2048,
"encoder_num_layer": 4,
"lr": 0.0007003169863053682,
"n_head": 4
}
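The block above is the sampled hyperparameter configuration for one trial; the keys describe a transformer with an encoder and two decoders (retention time and intensity). As a hedged illustration only, a Ray Tune search space that could produce such a sample might look like the sketch below: the value ranges are guesses, only the key names come from the file, and train_model is the trainable referenced in the traceback further down (main_ray_tune.py).

from ray import tune

# Hypothetical search space; keys mirror the config above, ranges are assumed.
search_space = {
    "encoder_num_layer": tune.choice([1, 2, 4, 8]),
    "decoder_rt_num_layer": tune.choice([1, 2, 4, 8]),
    "decoder_int_num_layer": tune.choice([1, 2, 4, 8]),
    "embedding_dim": tune.choice([64, 256, 1024]),
    "encoder_ff": tune.choice([512, 1024, 2048]),
    "decoder_rt_ff": tune.choice([512, 1024, 2048]),
    "decoder_int_ff": tune.choice([512, 1024, 2048]),
    "n_head": tune.choice([1, 2, 4, 16]),
    "drop_rate": tune.uniform(0.1, 0.7),
    "lr": tune.loguniform(1e-4, 1e-1),
    "batch_size": tune.choice([1024, 2048]),
}

# tuner = tune.Tuner(train_model, param_space=search_space)  # train_model: see main_ray_tune.py
# results = tuner.fit()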
loss,timestamp,checkpoint_dir_name,should_checkpoint,done,training_iteration,trial_id,date,time_this_iter_s,time_total_s,pid,hostname,node_ip,time_since_restore,iterations_since_restore
1980.4066219780389,1719514079,checkpoint_000000,True,False,1,0cc69cc0,2024-06-27_20-47-59,78.72657775878906,78.72657775878906,242786,r8i6n8,10.159.28.66,78.72657775878906,1
1974.7192046398252,1719514144,checkpoint_000001,True,False,2,0cc69cc0,2024-06-27_20-49-04,64.93273973464966,143.65931749343872,242786,r8i6n8,10.159.28.66,143.65931749343872,2
1979.3855787862942,1719514209,checkpoint_000002,True,False,3,0cc69cc0,2024-06-27_20-50-09,65.05311489105225,208.71243238449097,242786,r8i6n8,10.159.28.66,208.71243238449097,3
1979.0269044891118,1719514274,checkpoint_000003,True,False,4,0cc69cc0,2024-06-27_20-51-14,65.01725125312805,273.729683637619,242786,r8i6n8,10.159.28.66,273.729683637619,4
1976.4739701879307,1719514339,checkpoint_000004,True,False,5,0cc69cc0,2024-06-27_20-52-19,65.06425142288208,338.7939350605011,242786,r8i6n8,10.159.28.66,338.7939350605011,5
1978.0683257335752,1719514404,checkpoint_000005,True,False,6,0cc69cc0,2024-06-27_20-53-24,65.00006556510925,403.79400062561035,242786,r8i6n8,10.159.28.66,403.79400062561035,6
1451.6436892531988,1719514469,checkpoint_000006,True,False,7,0cc69cc0,2024-06-27_20-54-29,64.97408652305603,468.7680871486664,242786,r8i6n8,10.159.28.66,468.7680871486664,7
1978.8701690914124,1719514535,checkpoint_000007,True,False,8,0cc69cc0,2024-06-27_20-55-35,65.56082820892334,534.3289153575897,242786,r8i6n8,10.159.28.66,534.3289153575897,8
1973.6112147053395,1719514600,checkpoint_000008,True,False,9,0cc69cc0,2024-06-27_20-56-40,64.95659232139587,599.2855076789856,242786,r8i6n8,10.159.28.66,599.2855076789856,9
1973.0043147530143,1719514665,checkpoint_000009,True,False,10,0cc69cc0,2024-06-27_20-57-45,65.37610912322998,664.6616168022156,242786,r8i6n8,10.159.28.66,664.6616168022156,10
1978.763697826956,1719519614,checkpoint_000010,True,False,11,0cc69cc0,2024-06-27_22-20-14,79.15366196632385,743.8152787685394,242786,r8i6n8,10.159.28.66,79.15366196632385,1
1975.8241879921259,1719519680,checkpoint_000011,True,False,12,0cc69cc0,2024-06-27_22-21-20,65.74563002586365,809.5609087944031,242786,r8i6n8,10.159.28.66,144.8992919921875,2
1974.2569397453249,1719519746,checkpoint_000012,True,False,13,0cc69cc0,2024-06-27_22-22-26,65.7228844165802,875.2837932109833,242786,r8i6n8,10.159.28.66,210.6221764087677,3
1976.7626645546259,1719519812,checkpoint_000013,True,False,14,0cc69cc0,2024-06-27_22-23-32,66.2012448310852,941.4850380420685,242786,r8i6n8,10.159.28.66,276.8234212398529,4
1976.9357179656743,1719519878,checkpoint_000014,True,False,15,0cc69cc0,2024-06-27_22-24-38,65.79811668395996,1007.2831547260284,242786,r8i6n8,10.159.28.66,342.62153792381287,5
1976.9357958215428,1719519944,checkpoint_000015,True,False,16,0cc69cc0,2024-06-27_22-25-44,65.78404641151428,1073.0672011375427,242786,r8i6n8,10.159.28.66,408.40558433532715,6
1976.7361752968134,1719520009,checkpoint_000016,True,False,17,0cc69cc0,2024-06-27_22-26-49,65.76862573623657,1138.8358268737793,242786,r8i6n8,10.159.28.66,474.1742100715637,7
1977.8280586783342,1719520075,checkpoint_000017,True,False,18,0cc69cc0,2024-06-27_22-27-55,65.81831574440002,1204.6541426181793,242786,r8i6n8,10.159.28.66,539.9925258159637,8
1975.8057380736343,1719520141,checkpoint_000018,True,False,19,0cc69cc0,2024-06-27_22-29-01,66.07627272605896,1270.7304153442383,242786,r8i6n8,10.159.28.66,606.0687985420227,9
1976.771424781619,1719520207,checkpoint_000019,True,False,20,0cc69cc0,2024-06-27_22-30-07,65.2650773525238,1335.995492696762,242786,r8i6n8,10.159.28.66,671.3338758945465,10
{"loss": 1980.4066219780389, "timestamp": 1719514079, "checkpoint_dir_name": "checkpoint_000000", "should_checkpoint": true, "done": false, "training_iteration": 1, "trial_id": "0cc69cc0", "date": "2024-06-27_20-47-59", "time_this_iter_s": 78.72657775878906, "time_total_s": 78.72657775878906, "pid": 242786, "hostname": "r8i6n8", "node_ip": "10.159.28.66", "config": {"encoder_num_layer": 4, "decoder_rt_num_layer": 2, "decoder_int_num_layer": 1, "embedding_dim": 64, "encoder_ff": 2048, "decoder_rt_ff": 512, "decoder_int_ff": 512, "n_head": 4, "drop_rate": 0.6143131375049876, "lr": 0.0007003169863053682, "batch_size": 2048}, "time_since_restore": 78.72657775878906, "iterations_since_restore": 1}
{"loss": 1974.7192046398252, "timestamp": 1719514144, "checkpoint_dir_name": "checkpoint_000001", "should_checkpoint": true, "done": false, "training_iteration": 2, "trial_id": "0cc69cc0", "date": "2024-06-27_20-49-04", "time_this_iter_s": 64.93273973464966, "time_total_s": 143.65931749343872, "pid": 242786, "hostname": "r8i6n8", "node_ip": "10.159.28.66", "config": {"encoder_num_layer": 4, "decoder_rt_num_layer": 2, "decoder_int_num_layer": 1, "embedding_dim": 64, "encoder_ff": 2048, "decoder_rt_ff": 512, "decoder_int_ff": 512, "n_head": 4, "drop_rate": 0.6143131375049876, "lr": 0.0007003169863053682, "batch_size": 2048}, "time_since_restore": 143.65931749343872, "iterations_since_restore": 2}
{"loss": 1979.3855787862942, "timestamp": 1719514209, "checkpoint_dir_name": "checkpoint_000002", "should_checkpoint": true, "done": false, "training_iteration": 3, "trial_id": "0cc69cc0", "date": "2024-06-27_20-50-09", "time_this_iter_s": 65.05311489105225, "time_total_s": 208.71243238449097, "pid": 242786, "hostname": "r8i6n8", "node_ip": "10.159.28.66", "config": {"encoder_num_layer": 4, "decoder_rt_num_layer": 2, "decoder_int_num_layer": 1, "embedding_dim": 64, "encoder_ff": 2048, "decoder_rt_ff": 512, "decoder_int_ff": 512, "n_head": 4, "drop_rate": 0.6143131375049876, "lr": 0.0007003169863053682, "batch_size": 2048}, "time_since_restore": 208.71243238449097, "iterations_since_restore": 3}
{"loss": 1979.0269044891118, "timestamp": 1719514274, "checkpoint_dir_name": "checkpoint_000003", "should_checkpoint": true, "done": false, "training_iteration": 4, "trial_id": "0cc69cc0", "date": "2024-06-27_20-51-14", "time_this_iter_s": 65.01725125312805, "time_total_s": 273.729683637619, "pid": 242786, "hostname": "r8i6n8", "node_ip": "10.159.28.66", "config": {"encoder_num_layer": 4, "decoder_rt_num_layer": 2, "decoder_int_num_layer": 1, "embedding_dim": 64, "encoder_ff": 2048, "decoder_rt_ff": 512, "decoder_int_ff": 512, "n_head": 4, "drop_rate": 0.6143131375049876, "lr": 0.0007003169863053682, "batch_size": 2048}, "time_since_restore": 273.729683637619, "iterations_since_restore": 4}
{"loss": 1976.4739701879307, "timestamp": 1719514339, "checkpoint_dir_name": "checkpoint_000004", "should_checkpoint": true, "done": false, "training_iteration": 5, "trial_id": "0cc69cc0", "date": "2024-06-27_20-52-19", "time_this_iter_s": 65.06425142288208, "time_total_s": 338.7939350605011, "pid": 242786, "hostname": "r8i6n8", "node_ip": "10.159.28.66", "config": {"encoder_num_layer": 4, "decoder_rt_num_layer": 2, "decoder_int_num_layer": 1, "embedding_dim": 64, "encoder_ff": 2048, "decoder_rt_ff": 512, "decoder_int_ff": 512, "n_head": 4, "drop_rate": 0.6143131375049876, "lr": 0.0007003169863053682, "batch_size": 2048}, "time_since_restore": 338.7939350605011, "iterations_since_restore": 5}
{"loss": 1978.0683257335752, "timestamp": 1719514404, "checkpoint_dir_name": "checkpoint_000005", "should_checkpoint": true, "done": false, "training_iteration": 6, "trial_id": "0cc69cc0", "date": "2024-06-27_20-53-24", "time_this_iter_s": 65.00006556510925, "time_total_s": 403.79400062561035, "pid": 242786, "hostname": "r8i6n8", "node_ip": "10.159.28.66", "config": {"encoder_num_layer": 4, "decoder_rt_num_layer": 2, "decoder_int_num_layer": 1, "embedding_dim": 64, "encoder_ff": 2048, "decoder_rt_ff": 512, "decoder_int_ff": 512, "n_head": 4, "drop_rate": 0.6143131375049876, "lr": 0.0007003169863053682, "batch_size": 2048}, "time_since_restore": 403.79400062561035, "iterations_since_restore": 6}
{"loss": 1451.6436892531988, "timestamp": 1719514469, "checkpoint_dir_name": "checkpoint_000006", "should_checkpoint": true, "done": false, "training_iteration": 7, "trial_id": "0cc69cc0", "date": "2024-06-27_20-54-29", "time_this_iter_s": 64.97408652305603, "time_total_s": 468.7680871486664, "pid": 242786, "hostname": "r8i6n8", "node_ip": "10.159.28.66", "config": {"encoder_num_layer": 4, "decoder_rt_num_layer": 2, "decoder_int_num_layer": 1, "embedding_dim": 64, "encoder_ff": 2048, "decoder_rt_ff": 512, "decoder_int_ff": 512, "n_head": 4, "drop_rate": 0.6143131375049876, "lr": 0.0007003169863053682, "batch_size": 2048}, "time_since_restore": 468.7680871486664, "iterations_since_restore": 7}
{"loss": 1978.8701690914124, "timestamp": 1719514535, "checkpoint_dir_name": "checkpoint_000007", "should_checkpoint": true, "done": false, "training_iteration": 8, "trial_id": "0cc69cc0", "date": "2024-06-27_20-55-35", "time_this_iter_s": 65.56082820892334, "time_total_s": 534.3289153575897, "pid": 242786, "hostname": "r8i6n8", "node_ip": "10.159.28.66", "config": {"encoder_num_layer": 4, "decoder_rt_num_layer": 2, "decoder_int_num_layer": 1, "embedding_dim": 64, "encoder_ff": 2048, "decoder_rt_ff": 512, "decoder_int_ff": 512, "n_head": 4, "drop_rate": 0.6143131375049876, "lr": 0.0007003169863053682, "batch_size": 2048}, "time_since_restore": 534.3289153575897, "iterations_since_restore": 8}
{"loss": 1973.6112147053395, "timestamp": 1719514600, "checkpoint_dir_name": "checkpoint_000008", "should_checkpoint": true, "done": false, "training_iteration": 9, "trial_id": "0cc69cc0", "date": "2024-06-27_20-56-40", "time_this_iter_s": 64.95659232139587, "time_total_s": 599.2855076789856, "pid": 242786, "hostname": "r8i6n8", "node_ip": "10.159.28.66", "config": {"encoder_num_layer": 4, "decoder_rt_num_layer": 2, "decoder_int_num_layer": 1, "embedding_dim": 64, "encoder_ff": 2048, "decoder_rt_ff": 512, "decoder_int_ff": 512, "n_head": 4, "drop_rate": 0.6143131375049876, "lr": 0.0007003169863053682, "batch_size": 2048}, "time_since_restore": 599.2855076789856, "iterations_since_restore": 9}
{"loss": 1973.0043147530143, "timestamp": 1719514665, "checkpoint_dir_name": "checkpoint_000009", "should_checkpoint": true, "done": false, "training_iteration": 10, "trial_id": "0cc69cc0", "date": "2024-06-27_20-57-45", "time_this_iter_s": 65.37610912322998, "time_total_s": 664.6616168022156, "pid": 242786, "hostname": "r8i6n8", "node_ip": "10.159.28.66", "config": {"encoder_num_layer": 4, "decoder_rt_num_layer": 2, "decoder_int_num_layer": 1, "embedding_dim": 64, "encoder_ff": 2048, "decoder_rt_ff": 512, "decoder_int_ff": 512, "n_head": 4, "drop_rate": 0.6143131375049876, "lr": 0.0007003169863053682, "batch_size": 2048}, "time_since_restore": 664.6616168022156, "iterations_since_restore": 10}
{"loss": 1978.763697826956, "timestamp": 1719519614, "checkpoint_dir_name": "checkpoint_000010", "should_checkpoint": true, "done": false, "training_iteration": 11, "trial_id": "0cc69cc0", "date": "2024-06-27_22-20-14", "time_this_iter_s": 79.15366196632385, "time_total_s": 743.8152787685394, "pid": 242786, "hostname": "r8i6n8", "node_ip": "10.159.28.66", "config": {"encoder_num_layer": 4, "decoder_rt_num_layer": 2, "decoder_int_num_layer": 1, "embedding_dim": 64, "encoder_ff": 2048, "decoder_rt_ff": 512, "decoder_int_ff": 512, "n_head": 4, "drop_rate": 0.6143131375049876, "lr": 0.0007003169863053682, "batch_size": 2048}, "time_since_restore": 79.15366196632385, "iterations_since_restore": 1}
{"loss": 1975.8241879921259, "timestamp": 1719519680, "checkpoint_dir_name": "checkpoint_000011", "should_checkpoint": true, "done": false, "training_iteration": 12, "trial_id": "0cc69cc0", "date": "2024-06-27_22-21-20", "time_this_iter_s": 65.74563002586365, "time_total_s": 809.5609087944031, "pid": 242786, "hostname": "r8i6n8", "node_ip": "10.159.28.66", "config": {"encoder_num_layer": 4, "decoder_rt_num_layer": 2, "decoder_int_num_layer": 1, "embedding_dim": 64, "encoder_ff": 2048, "decoder_rt_ff": 512, "decoder_int_ff": 512, "n_head": 4, "drop_rate": 0.6143131375049876, "lr": 0.0007003169863053682, "batch_size": 2048}, "time_since_restore": 144.8992919921875, "iterations_since_restore": 2}
{"loss": 1974.2569397453249, "timestamp": 1719519746, "checkpoint_dir_name": "checkpoint_000012", "should_checkpoint": true, "done": false, "training_iteration": 13, "trial_id": "0cc69cc0", "date": "2024-06-27_22-22-26", "time_this_iter_s": 65.7228844165802, "time_total_s": 875.2837932109833, "pid": 242786, "hostname": "r8i6n8", "node_ip": "10.159.28.66", "config": {"encoder_num_layer": 4, "decoder_rt_num_layer": 2, "decoder_int_num_layer": 1, "embedding_dim": 64, "encoder_ff": 2048, "decoder_rt_ff": 512, "decoder_int_ff": 512, "n_head": 4, "drop_rate": 0.6143131375049876, "lr": 0.0007003169863053682, "batch_size": 2048}, "time_since_restore": 210.6221764087677, "iterations_since_restore": 3}
{"loss": 1976.7626645546259, "timestamp": 1719519812, "checkpoint_dir_name": "checkpoint_000013", "should_checkpoint": true, "done": false, "training_iteration": 14, "trial_id": "0cc69cc0", "date": "2024-06-27_22-23-32", "time_this_iter_s": 66.2012448310852, "time_total_s": 941.4850380420685, "pid": 242786, "hostname": "r8i6n8", "node_ip": "10.159.28.66", "config": {"encoder_num_layer": 4, "decoder_rt_num_layer": 2, "decoder_int_num_layer": 1, "embedding_dim": 64, "encoder_ff": 2048, "decoder_rt_ff": 512, "decoder_int_ff": 512, "n_head": 4, "drop_rate": 0.6143131375049876, "lr": 0.0007003169863053682, "batch_size": 2048}, "time_since_restore": 276.8234212398529, "iterations_since_restore": 4}
{"loss": 1976.9357179656743, "timestamp": 1719519878, "checkpoint_dir_name": "checkpoint_000014", "should_checkpoint": true, "done": false, "training_iteration": 15, "trial_id": "0cc69cc0", "date": "2024-06-27_22-24-38", "time_this_iter_s": 65.79811668395996, "time_total_s": 1007.2831547260284, "pid": 242786, "hostname": "r8i6n8", "node_ip": "10.159.28.66", "config": {"encoder_num_layer": 4, "decoder_rt_num_layer": 2, "decoder_int_num_layer": 1, "embedding_dim": 64, "encoder_ff": 2048, "decoder_rt_ff": 512, "decoder_int_ff": 512, "n_head": 4, "drop_rate": 0.6143131375049876, "lr": 0.0007003169863053682, "batch_size": 2048}, "time_since_restore": 342.62153792381287, "iterations_since_restore": 5}
{"loss": 1976.9357958215428, "timestamp": 1719519944, "checkpoint_dir_name": "checkpoint_000015", "should_checkpoint": true, "done": false, "training_iteration": 16, "trial_id": "0cc69cc0", "date": "2024-06-27_22-25-44", "time_this_iter_s": 65.78404641151428, "time_total_s": 1073.0672011375427, "pid": 242786, "hostname": "r8i6n8", "node_ip": "10.159.28.66", "config": {"encoder_num_layer": 4, "decoder_rt_num_layer": 2, "decoder_int_num_layer": 1, "embedding_dim": 64, "encoder_ff": 2048, "decoder_rt_ff": 512, "decoder_int_ff": 512, "n_head": 4, "drop_rate": 0.6143131375049876, "lr": 0.0007003169863053682, "batch_size": 2048}, "time_since_restore": 408.40558433532715, "iterations_since_restore": 6}
{"loss": 1976.7361752968134, "timestamp": 1719520009, "checkpoint_dir_name": "checkpoint_000016", "should_checkpoint": true, "done": false, "training_iteration": 17, "trial_id": "0cc69cc0", "date": "2024-06-27_22-26-49", "time_this_iter_s": 65.76862573623657, "time_total_s": 1138.8358268737793, "pid": 242786, "hostname": "r8i6n8", "node_ip": "10.159.28.66", "config": {"encoder_num_layer": 4, "decoder_rt_num_layer": 2, "decoder_int_num_layer": 1, "embedding_dim": 64, "encoder_ff": 2048, "decoder_rt_ff": 512, "decoder_int_ff": 512, "n_head": 4, "drop_rate": 0.6143131375049876, "lr": 0.0007003169863053682, "batch_size": 2048}, "time_since_restore": 474.1742100715637, "iterations_since_restore": 7}
{"loss": 1977.8280586783342, "timestamp": 1719520075, "checkpoint_dir_name": "checkpoint_000017", "should_checkpoint": true, "done": false, "training_iteration": 18, "trial_id": "0cc69cc0", "date": "2024-06-27_22-27-55", "time_this_iter_s": 65.81831574440002, "time_total_s": 1204.6541426181793, "pid": 242786, "hostname": "r8i6n8", "node_ip": "10.159.28.66", "config": {"encoder_num_layer": 4, "decoder_rt_num_layer": 2, "decoder_int_num_layer": 1, "embedding_dim": 64, "encoder_ff": 2048, "decoder_rt_ff": 512, "decoder_int_ff": 512, "n_head": 4, "drop_rate": 0.6143131375049876, "lr": 0.0007003169863053682, "batch_size": 2048}, "time_since_restore": 539.9925258159637, "iterations_since_restore": 8}
{"loss": 1975.8057380736343, "timestamp": 1719520141, "checkpoint_dir_name": "checkpoint_000018", "should_checkpoint": true, "done": false, "training_iteration": 19, "trial_id": "0cc69cc0", "date": "2024-06-27_22-29-01", "time_this_iter_s": 66.07627272605896, "time_total_s": 1270.7304153442383, "pid": 242786, "hostname": "r8i6n8", "node_ip": "10.159.28.66", "config": {"encoder_num_layer": 4, "decoder_rt_num_layer": 2, "decoder_int_num_layer": 1, "embedding_dim": 64, "encoder_ff": 2048, "decoder_rt_ff": 512, "decoder_int_ff": 512, "n_head": 4, "drop_rate": 0.6143131375049876, "lr": 0.0007003169863053682, "batch_size": 2048}, "time_since_restore": 606.0687985420227, "iterations_since_restore": 9}
{"loss": 1976.771424781619, "timestamp": 1719520207, "checkpoint_dir_name": "checkpoint_000019", "should_checkpoint": true, "done": false, "training_iteration": 20, "trial_id": "0cc69cc0", "date": "2024-06-27_22-30-07", "time_this_iter_s": 65.2650773525238, "time_total_s": 1335.995492696762, "pid": 242786, "hostname": "r8i6n8", "node_ip": "10.159.28.66", "config": {"encoder_num_layer": 4, "decoder_rt_num_layer": 2, "decoder_int_num_layer": 1, "embedding_dim": 64, "encoder_ff": 2048, "decoder_rt_ff": 512, "decoder_int_ff": 512, "n_head": 4, "drop_rate": 0.6143131375049876, "lr": 0.0007003169863053682, "batch_size": 2048}, "time_since_restore": 671.3338758945465, "iterations_since_restore": 10}
{
"batch_size": 1024,
"decoder_int_ff": 512,
"decoder_int_num_layer": 1,
"decoder_rt_ff": 2048,
"decoder_rt_num_layer": 4,
"drop_rate": 0.5738068254184514,
"embedding_dim": 64,
"encoder_ff": 2048,
"encoder_num_layer": 4,
"lr": 0.009136259111296272,
"n_head": 1
}
loss,timestamp,checkpoint_dir_name,should_checkpoint,done,training_iteration,trial_id,date,time_this_iter_s,time_total_s,pid,hostname,node_ip,time_since_restore,iterations_since_restore
2003.1058138148992,1719493727,checkpoint_000000,True,True,1,0d034d62,2024-06-27_15-08-47,94.90786528587341,94.90786528587341,69992,r8i6n8,10.159.28.66,94.90786528587341,1
{"loss": 2003.1058138148992, "timestamp": 1719493727, "checkpoint_dir_name": "checkpoint_000000", "should_checkpoint": true, "done": true, "training_iteration": 1, "trial_id": "0d034d62", "date": "2024-06-27_15-08-47", "time_this_iter_s": 94.90786528587341, "time_total_s": 94.90786528587341, "pid": 69992, "hostname": "r8i6n8", "node_ip": "10.159.28.66", "config": {"encoder_num_layer": 4, "decoder_rt_num_layer": 4, "decoder_int_num_layer": 1, "embedding_dim": 64, "encoder_ff": 2048, "decoder_rt_ff": 2048, "decoder_int_ff": 512, "n_head": 1, "drop_rate": 0.5738068254184514, "lr": 0.009136259111296272, "batch_size": 1024}, "time_since_restore": 94.90786528587341, "iterations_since_restore": 1}
Failure # 1 (occurred at 2024-06-27_14-37-55)
ray::ImplicitFunc.train() (pid=35196, ip=10.159.28.66, actor_id=76f13cb1c39b1e46c8a07d1101000000, repr=train_model)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/gpfslocalsup/pub/anaconda-py3/2023.09/envs/pytorch-gpu-2.2.0+py3.11.7/lib/python3.11/site-packages/ray/tune/trainable/trainable.py", line 342, in train
raise skipped from exception_cause(skipped)
File "/gpfslocalsup/pub/anaconda-py3/2023.09/envs/pytorch-gpu-2.2.0+py3.11.7/lib/python3.11/site-packages/ray/air/_internal/util.py", line 88, in run
self._ret = self._target(*self._args, **self._kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/gpfslocalsup/pub/anaconda-py3/2023.09/envs/pytorch-gpu-2.2.0+py3.11.7/lib/python3.11/site-packages/ray/tune/trainable/function_trainable.py", line 249, in <lambda>
training_func=lambda: self._trainable_func(self.config),
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/gpfslocalsup/pub/anaconda-py3/2023.09/envs/pytorch-gpu-2.2.0+py3.11.7/lib/python3.11/site-packages/ray/tune/trainable/function_trainable.py", line 332, in _trainable_func
output = fn()
^^^^
File "/gpfslocalsup/pub/anaconda-py3/2023.09/envs/pytorch-gpu-2.2.0+py3.11.7/lib/python3.11/site-packages/ray/tune/trainable/util.py", line 138, in inner
return trainable(config, **fn_kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/gpfswork/rech/ute/ucg81ws/these/LC-MS-RT-prediction/main_ray_tune.py", line 66, in train_model
pred_rt = net.module.forward_rt(seq)
^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/gpfsdswork/projects/rech/ute/ucg81ws/these/LC-MS-RT-prediction/model_custom.py", line 111, in forward_rt
out_rt = self.decoder_RT(enc)
^^^^^^^^^^^^^^^^^^^^
File "/gpfslocalsup/pub/anaconda-py3/2023.09/envs/pytorch-gpu-2.2.0+py3.11.7/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/gpfslocalsup/pub/anaconda-py3/2023.09/envs/pytorch-gpu-2.2.0+py3.11.7/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/gpfslocalsup/pub/anaconda-py3/2023.09/envs/pytorch-gpu-2.2.0+py3.11.7/lib/python3.11/site-packages/torch/nn/modules/container.py", line 217, in forward
input = module(input)
^^^^^^^^^^^^^
File "/gpfslocalsup/pub/anaconda-py3/2023.09/envs/pytorch-gpu-2.2.0+py3.11.7/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/gpfslocalsup/pub/anaconda-py3/2023.09/envs/pytorch-gpu-2.2.0+py3.11.7/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/gpfslocalsup/pub/anaconda-py3/2023.09/envs/pytorch-gpu-2.2.0+py3.11.7/lib/python3.11/site-packages/torch/nn/modules/transformer.py", line 391, in forward
output = mod(output, src_mask=mask, is_causal=is_causal, src_key_padding_mask=src_key_padding_mask_for_layers)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/gpfslocalsup/pub/anaconda-py3/2023.09/envs/pytorch-gpu-2.2.0+py3.11.7/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/gpfslocalsup/pub/anaconda-py3/2023.09/envs/pytorch-gpu-2.2.0+py3.11.7/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/gpfslocalsup/pub/anaconda-py3/2023.09/envs/pytorch-gpu-2.2.0+py3.11.7/lib/python3.11/site-packages/torch/nn/modules/transformer.py", line 714, in forward
x = self.norm1(x + self._sa_block(x, src_mask, src_key_padding_mask, is_causal=is_causal))
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/gpfslocalsup/pub/anaconda-py3/2023.09/envs/pytorch-gpu-2.2.0+py3.11.7/lib/python3.11/site-packages/torch/nn/modules/transformer.py", line 722, in _sa_block
x = self.self_attn(x, x, x,
^^^^^^^^^^^^^^^^^^^^^^^
File "/gpfslocalsup/pub/anaconda-py3/2023.09/envs/pytorch-gpu-2.2.0+py3.11.7/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/gpfslocalsup/pub/anaconda-py3/2023.09/envs/pytorch-gpu-2.2.0+py3.11.7/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/gpfslocalsup/pub/anaconda-py3/2023.09/envs/pytorch-gpu-2.2.0+py3.11.7/lib/python3.11/site-packages/torch/nn/modules/activation.py", line 1241, in forward
attn_output, attn_output_weights = F.multi_head_attention_forward(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/gpfslocalsup/pub/anaconda-py3/2023.09/envs/pytorch-gpu-2.2.0+py3.11.7/lib/python3.11/site-packages/torch/nn/functional.py", line 5336, in multi_head_attention_forward
q, k, v = _in_projection_packed(query, key, value, in_proj_weight, in_proj_bias)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/gpfslocalsup/pub/anaconda-py3/2023.09/envs/pytorch-gpu-2.2.0+py3.11.7/lib/python3.11/site-packages/torch/nn/functional.py", line 4857, in _in_projection_packed
proj = linear(q, w, b)
^^^^^^^^^^^^^^^
torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 600.00 MiB. GPU 0 has a total capacity of 31.73 GiB of which 64.69 MiB is free. Including non-PyTorch memory, this process has 31.67 GiB memory in use. Of the allocated memory 31.05 GiB is allocated by PyTorch, and 253.16 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
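The traceback above shows a trial running out of GPU memory inside the encoder's self-attention (multi_head_attention_forward). The error message itself points at the usual first mitigation; as an illustrative sketch only (not code from this repository), the allocator hint can be set before PyTorch is imported, and the largest memory drivers (batch_size, embedding_dim) can be capped in the search space so such trials are not sampled again:

import os

# Follow the hint in the OOM message: enable expandable segments before any CUDA allocation.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

import torch  # imported after setting the env var so the allocator picks it up

if torch.cuda.is_available():
    free, total = torch.cuda.mem_get_info()
    print(f"free: {free / 2**30:.2f} GiB of {total / 2**30:.2f} GiB")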
{
"batch_size": 2048,
"decoder_int_ff": 512,
"decoder_int_num_layer": 1,
"decoder_rt_ff": 2048,
"decoder_rt_num_layer": 8,
"drop_rate": 0.18892533975584302,
"embedding_dim": 1024,
"encoder_ff": 2048,
"encoder_num_layer": 4,
"lr": 0.0023772931833339287,
"n_head": 2
}
{
"batch_size": 2048,
"decoder_int_ff": 512,
"decoder_int_num_layer": 1,
"decoder_rt_ff": 1024,
"decoder_rt_num_layer": 1,
"drop_rate": 0.3380953528716497,
"embedding_dim": 256,
"encoder_ff": 1024,
"encoder_num_layer": 8,
"lr": 0.07187916796871326,
"n_head": 16
}
loss,timestamp,checkpoint_dir_name,should_checkpoint,done,training_iteration,trial_id,date,time_this_iter_s,time_total_s,pid,hostname,node_ip,time_since_restore,iterations_since_restore
3017.182736374262,1719510089,checkpoint_000000,True,False,1,12269c59,2024-06-27_19-41-29,184.32761549949646,184.32761549949646,242786,r8i6n8,10.159.28.66,184.32761549949646,1
2949.607454555241,1719510259,checkpoint_000001,True,False,2,12269c59,2024-06-27_19-44-19,170.74375820159912,355.0713737010956,242786,r8i6n8,10.159.28.66,355.0713737010956,2
2856.6661463459645,1719510430,checkpoint_000002,True,False,3,12269c59,2024-06-27_19-47-11,171.93422675132751,527.0056004524231,242786,r8i6n8,10.159.28.66,527.0056004524231,3
2757.570174089567,1719510602,checkpoint_000003,True,False,4,12269c59,2024-06-27_19-50-02,170.7144296169281,697.7200300693512,242786,r8i6n8,10.159.28.66,697.7200300693512,4
2631.3529908187747,1719510773,checkpoint_000004,True,False,5,12269c59,2024-06-27_19-52-53,170.7047655582428,868.424795627594,242786,r8i6n8,10.159.28.66,868.424795627594,5
{"loss": 3017.182736374262, "timestamp": 1719510089, "checkpoint_dir_name": "checkpoint_000000", "should_checkpoint": true, "done": false, "training_iteration": 1, "trial_id": "12269c59", "date": "2024-06-27_19-41-29", "time_this_iter_s": 184.32761549949646, "time_total_s": 184.32761549949646, "pid": 242786, "hostname": "r8i6n8", "node_ip": "10.159.28.66", "config": {"encoder_num_layer": 8, "decoder_rt_num_layer": 1, "decoder_int_num_layer": 1, "embedding_dim": 256, "encoder_ff": 1024, "decoder_rt_ff": 1024, "decoder_int_ff": 512, "n_head": 16, "drop_rate": 0.3380953528716497, "lr": 0.07187916796871326, "batch_size": 2048}, "time_since_restore": 184.32761549949646, "iterations_since_restore": 1}
{"loss": 2949.607454555241, "timestamp": 1719510259, "checkpoint_dir_name": "checkpoint_000001", "should_checkpoint": true, "done": false, "training_iteration": 2, "trial_id": "12269c59", "date": "2024-06-27_19-44-19", "time_this_iter_s": 170.74375820159912, "time_total_s": 355.0713737010956, "pid": 242786, "hostname": "r8i6n8", "node_ip": "10.159.28.66", "config": {"encoder_num_layer": 8, "decoder_rt_num_layer": 1, "decoder_int_num_layer": 1, "embedding_dim": 256, "encoder_ff": 1024, "decoder_rt_ff": 1024, "decoder_int_ff": 512, "n_head": 16, "drop_rate": 0.3380953528716497, "lr": 0.07187916796871326, "batch_size": 2048}, "time_since_restore": 355.0713737010956, "iterations_since_restore": 2}
{"loss": 2856.6661463459645, "timestamp": 1719510430, "checkpoint_dir_name": "checkpoint_000002", "should_checkpoint": true, "done": false, "training_iteration": 3, "trial_id": "12269c59", "date": "2024-06-27_19-47-11", "time_this_iter_s": 171.93422675132751, "time_total_s": 527.0056004524231, "pid": 242786, "hostname": "r8i6n8", "node_ip": "10.159.28.66", "config": {"encoder_num_layer": 8, "decoder_rt_num_layer": 1, "decoder_int_num_layer": 1, "embedding_dim": 256, "encoder_ff": 1024, "decoder_rt_ff": 1024, "decoder_int_ff": 512, "n_head": 16, "drop_rate": 0.3380953528716497, "lr": 0.07187916796871326, "batch_size": 2048}, "time_since_restore": 527.0056004524231, "iterations_since_restore": 3}
{"loss": 2757.570174089567, "timestamp": 1719510602, "checkpoint_dir_name": "checkpoint_000003", "should_checkpoint": true, "done": false, "training_iteration": 4, "trial_id": "12269c59", "date": "2024-06-27_19-50-02", "time_this_iter_s": 170.7144296169281, "time_total_s": 697.7200300693512, "pid": 242786, "hostname": "r8i6n8", "node_ip": "10.159.28.66", "config": {"encoder_num_layer": 8, "decoder_rt_num_layer": 1, "decoder_int_num_layer": 1, "embedding_dim": 256, "encoder_ff": 1024, "decoder_rt_ff": 1024, "decoder_int_ff": 512, "n_head": 16, "drop_rate": 0.3380953528716497, "lr": 0.07187916796871326, "batch_size": 2048}, "time_since_restore": 697.7200300693512, "iterations_since_restore": 4}
{"loss": 2631.3529908187747, "timestamp": 1719510773, "checkpoint_dir_name": "checkpoint_000004", "should_checkpoint": true, "done": false, "training_iteration": 5, "trial_id": "12269c59", "date": "2024-06-27_19-52-53", "time_this_iter_s": 170.7047655582428, "time_total_s": 868.424795627594, "pid": 242786, "hostname": "r8i6n8", "node_ip": "10.159.28.66", "config": {"encoder_num_layer": 8, "decoder_rt_num_layer": 1, "decoder_int_num_layer": 1, "embedding_dim": 256, "encoder_ff": 1024, "decoder_rt_ff": 1024, "decoder_int_ff": 512, "n_head": 16, "drop_rate": 0.3380953528716497, "lr": 0.07187916796871326, "batch_size": 2048}, "time_since_restore": 868.424795627594, "iterations_since_restore": 5}