Commit 915e419e authored by Léo Schneider, committed by Schneider Leo

res

parent a790c5c2
Showing 161 additions and 0 deletions
{
"batch_size": 2048,
"decoder_int_ff": 512,
"decoder_int_num_layer": 1,
"decoder_rt_ff": 2048,
"decoder_rt_num_layer": 4,
"drop_rate": 0.2474625892184107,
"embedding_dim": 1024,
"encoder_ff": 1024,
"encoder_num_layer": 2,
"lr": 0.09815210368028619,
"n_head": 16
}
\ No newline at end of file
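
The params.json above records one sampled hyperparameter set for trial 261349af. As a minimal sketch (the actual search space is defined in main_ray_tune.py and is not part of this diff), a Ray Tune space that could produce values like these might look like:

# Hypothetical sketch of a Ray Tune search space; the real definition
# lives in main_ray_tune.py and is not shown in this commit.
from ray import tune

search_space = {
    "encoder_num_layer": tune.choice([1, 2, 4, 8]),
    "decoder_rt_num_layer": tune.choice([1, 2, 4, 8]),
    "decoder_int_num_layer": tune.choice([1, 2, 4]),
    "embedding_dim": tune.choice([256, 512, 1024]),
    "encoder_ff": tune.choice([512, 1024, 2048]),
    "decoder_rt_ff": tune.choice([512, 1024, 2048]),
    "decoder_int_ff": tune.choice([512, 1024, 2048]),
    "n_head": tune.choice([2, 4, 8, 16]),
    "drop_rate": tune.uniform(0.1, 0.9),      # sampled dropout probability
    "lr": tune.loguniform(1e-4, 1e-1),        # sampled learning rate
    "batch_size": tune.choice([1024, 2048]),
}
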
loss,timestamp,checkpoint_dir_name,should_checkpoint,done,training_iteration,trial_id,date,time_this_iter_s,time_total_s,pid,hostname,node_ip,time_since_restore,iterations_since_restore
5146.327555979331,1719552388,checkpoint_000000,True,False,1,261349af,2024-06-28_07-26-29,641.6589081287384,641.6589081287384,242786,r8i6n8,10.159.28.66,641.6589081287384,1
5048.819774237205,1719553016,checkpoint_000001,True,False,2,261349af,2024-06-28_07-36-56,627.6112632751465,1269.270171403885,242786,r8i6n8,10.159.28.66,1269.270171403885,2
4911.544510488435,1719553644,checkpoint_000002,True,False,3,261349af,2024-06-28_07-47-24,627.9598400592804,1897.2300114631653,242786,r8i6n8,10.159.28.66,1897.2300114631653,3
4737.393370140256,1719554272,checkpoint_000003,True,False,4,261349af,2024-06-28_07-57-52,628.01415848732,2525.2441699504852,242786,r8i6n8,10.159.28.66,2525.2441699504852,4
4523.467796505905,1719554899,checkpoint_000004,True,False,5,261349af,2024-06-28_08-08-20,627.539436340332,3152.7836062908173,242786,r8i6n8,10.159.28.66,3152.7836062908173,5
4273.132577971211,1719555527,checkpoint_000005,True,False,6,261349af,2024-06-28_08-18-47,627.5760381221771,3780.3596444129944,242786,r8i6n8,10.159.28.66,3780.3596444129944,6
3991.0541396253693,1719556155,checkpoint_000006,True,False,7,261349af,2024-06-28_08-29-15,627.5778048038483,4407.937449216843,242786,r8i6n8,10.159.28.66,4407.937449216843,7
3685.58821166031,1719556782,checkpoint_000007,True,False,8,261349af,2024-06-28_08-39-43,627.666775226593,5035.604224443436,242786,r8i6n8,10.159.28.66,5035.604224443436,8
3365.655848225271,1719557410,checkpoint_000008,True,False,9,261349af,2024-06-28_08-50-10,627.4417996406555,5663.046024084091,242786,r8i6n8,10.159.28.66,5663.046024084091,9
3048.0907587967517,1719558037,checkpoint_000009,True,False,10,261349af,2024-06-28_09-00-38,627.6434020996094,6290.689426183701,242786,r8i6n8,10.159.28.66,6290.689426183701,10
{"loss": 5146.327555979331, "timestamp": 1719552388, "checkpoint_dir_name": "checkpoint_000000", "should_checkpoint": true, "done": false, "training_iteration": 1, "trial_id": "261349af", "date": "2024-06-28_07-26-29", "time_this_iter_s": 641.6589081287384, "time_total_s": 641.6589081287384, "pid": 242786, "hostname": "r8i6n8", "node_ip": "10.159.28.66", "config": {"encoder_num_layer": 2, "decoder_rt_num_layer": 4, "decoder_int_num_layer": 1, "embedding_dim": 1024, "encoder_ff": 1024, "decoder_rt_ff": 2048, "decoder_int_ff": 512, "n_head": 16, "drop_rate": 0.2474625892184107, "lr": 0.09815210368028619, "batch_size": 2048}, "time_since_restore": 641.6589081287384, "iterations_since_restore": 1}
{"loss": 5048.819774237205, "timestamp": 1719553016, "checkpoint_dir_name": "checkpoint_000001", "should_checkpoint": true, "done": false, "training_iteration": 2, "trial_id": "261349af", "date": "2024-06-28_07-36-56", "time_this_iter_s": 627.6112632751465, "time_total_s": 1269.270171403885, "pid": 242786, "hostname": "r8i6n8", "node_ip": "10.159.28.66", "config": {"encoder_num_layer": 2, "decoder_rt_num_layer": 4, "decoder_int_num_layer": 1, "embedding_dim": 1024, "encoder_ff": 1024, "decoder_rt_ff": 2048, "decoder_int_ff": 512, "n_head": 16, "drop_rate": 0.2474625892184107, "lr": 0.09815210368028619, "batch_size": 2048}, "time_since_restore": 1269.270171403885, "iterations_since_restore": 2}
{"loss": 4911.544510488435, "timestamp": 1719553644, "checkpoint_dir_name": "checkpoint_000002", "should_checkpoint": true, "done": false, "training_iteration": 3, "trial_id": "261349af", "date": "2024-06-28_07-47-24", "time_this_iter_s": 627.9598400592804, "time_total_s": 1897.2300114631653, "pid": 242786, "hostname": "r8i6n8", "node_ip": "10.159.28.66", "config": {"encoder_num_layer": 2, "decoder_rt_num_layer": 4, "decoder_int_num_layer": 1, "embedding_dim": 1024, "encoder_ff": 1024, "decoder_rt_ff": 2048, "decoder_int_ff": 512, "n_head": 16, "drop_rate": 0.2474625892184107, "lr": 0.09815210368028619, "batch_size": 2048}, "time_since_restore": 1897.2300114631653, "iterations_since_restore": 3}
{"loss": 4737.393370140256, "timestamp": 1719554272, "checkpoint_dir_name": "checkpoint_000003", "should_checkpoint": true, "done": false, "training_iteration": 4, "trial_id": "261349af", "date": "2024-06-28_07-57-52", "time_this_iter_s": 628.01415848732, "time_total_s": 2525.2441699504852, "pid": 242786, "hostname": "r8i6n8", "node_ip": "10.159.28.66", "config": {"encoder_num_layer": 2, "decoder_rt_num_layer": 4, "decoder_int_num_layer": 1, "embedding_dim": 1024, "encoder_ff": 1024, "decoder_rt_ff": 2048, "decoder_int_ff": 512, "n_head": 16, "drop_rate": 0.2474625892184107, "lr": 0.09815210368028619, "batch_size": 2048}, "time_since_restore": 2525.2441699504852, "iterations_since_restore": 4}
{"loss": 4523.467796505905, "timestamp": 1719554899, "checkpoint_dir_name": "checkpoint_000004", "should_checkpoint": true, "done": false, "training_iteration": 5, "trial_id": "261349af", "date": "2024-06-28_08-08-20", "time_this_iter_s": 627.539436340332, "time_total_s": 3152.7836062908173, "pid": 242786, "hostname": "r8i6n8", "node_ip": "10.159.28.66", "config": {"encoder_num_layer": 2, "decoder_rt_num_layer": 4, "decoder_int_num_layer": 1, "embedding_dim": 1024, "encoder_ff": 1024, "decoder_rt_ff": 2048, "decoder_int_ff": 512, "n_head": 16, "drop_rate": 0.2474625892184107, "lr": 0.09815210368028619, "batch_size": 2048}, "time_since_restore": 3152.7836062908173, "iterations_since_restore": 5}
{"loss": 4273.132577971211, "timestamp": 1719555527, "checkpoint_dir_name": "checkpoint_000005", "should_checkpoint": true, "done": false, "training_iteration": 6, "trial_id": "261349af", "date": "2024-06-28_08-18-47", "time_this_iter_s": 627.5760381221771, "time_total_s": 3780.3596444129944, "pid": 242786, "hostname": "r8i6n8", "node_ip": "10.159.28.66", "config": {"encoder_num_layer": 2, "decoder_rt_num_layer": 4, "decoder_int_num_layer": 1, "embedding_dim": 1024, "encoder_ff": 1024, "decoder_rt_ff": 2048, "decoder_int_ff": 512, "n_head": 16, "drop_rate": 0.2474625892184107, "lr": 0.09815210368028619, "batch_size": 2048}, "time_since_restore": 3780.3596444129944, "iterations_since_restore": 6}
{"loss": 3991.0541396253693, "timestamp": 1719556155, "checkpoint_dir_name": "checkpoint_000006", "should_checkpoint": true, "done": false, "training_iteration": 7, "trial_id": "261349af", "date": "2024-06-28_08-29-15", "time_this_iter_s": 627.5778048038483, "time_total_s": 4407.937449216843, "pid": 242786, "hostname": "r8i6n8", "node_ip": "10.159.28.66", "config": {"encoder_num_layer": 2, "decoder_rt_num_layer": 4, "decoder_int_num_layer": 1, "embedding_dim": 1024, "encoder_ff": 1024, "decoder_rt_ff": 2048, "decoder_int_ff": 512, "n_head": 16, "drop_rate": 0.2474625892184107, "lr": 0.09815210368028619, "batch_size": 2048}, "time_since_restore": 4407.937449216843, "iterations_since_restore": 7}
{"loss": 3685.58821166031, "timestamp": 1719556782, "checkpoint_dir_name": "checkpoint_000007", "should_checkpoint": true, "done": false, "training_iteration": 8, "trial_id": "261349af", "date": "2024-06-28_08-39-43", "time_this_iter_s": 627.666775226593, "time_total_s": 5035.604224443436, "pid": 242786, "hostname": "r8i6n8", "node_ip": "10.159.28.66", "config": {"encoder_num_layer": 2, "decoder_rt_num_layer": 4, "decoder_int_num_layer": 1, "embedding_dim": 1024, "encoder_ff": 1024, "decoder_rt_ff": 2048, "decoder_int_ff": 512, "n_head": 16, "drop_rate": 0.2474625892184107, "lr": 0.09815210368028619, "batch_size": 2048}, "time_since_restore": 5035.604224443436, "iterations_since_restore": 8}
{"loss": 3365.655848225271, "timestamp": 1719557410, "checkpoint_dir_name": "checkpoint_000008", "should_checkpoint": true, "done": false, "training_iteration": 9, "trial_id": "261349af", "date": "2024-06-28_08-50-10", "time_this_iter_s": 627.4417996406555, "time_total_s": 5663.046024084091, "pid": 242786, "hostname": "r8i6n8", "node_ip": "10.159.28.66", "config": {"encoder_num_layer": 2, "decoder_rt_num_layer": 4, "decoder_int_num_layer": 1, "embedding_dim": 1024, "encoder_ff": 1024, "decoder_rt_ff": 2048, "decoder_int_ff": 512, "n_head": 16, "drop_rate": 0.2474625892184107, "lr": 0.09815210368028619, "batch_size": 2048}, "time_since_restore": 5663.046024084091, "iterations_since_restore": 9}
{"loss": 3048.0907587967517, "timestamp": 1719558037, "checkpoint_dir_name": "checkpoint_000009", "should_checkpoint": true, "done": false, "training_iteration": 10, "trial_id": "261349af", "date": "2024-06-28_09-00-38", "time_this_iter_s": 627.6434020996094, "time_total_s": 6290.689426183701, "pid": 242786, "hostname": "r8i6n8", "node_ip": "10.159.28.66", "config": {"encoder_num_layer": 2, "decoder_rt_num_layer": 4, "decoder_int_num_layer": 1, "embedding_dim": 1024, "encoder_ff": 1024, "decoder_rt_ff": 2048, "decoder_int_ff": 512, "n_head": 16, "drop_rate": 0.2474625892184107, "lr": 0.09815210368028619, "batch_size": 2048}, "time_since_restore": 6290.689426183701, "iterations_since_restore": 10}
Failure # 1 (occurred at 2024-06-27_16-33-47)
ray::ImplicitFunc.train() (pid=69992, ip=10.159.28.66, actor_id=4f23fe319defb9cd61c97da301000000, repr=train_model)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/gpfslocalsup/pub/anaconda-py3/2023.09/envs/pytorch-gpu-2.2.0+py3.11.7/lib/python3.11/site-packages/ray/tune/trainable/trainable.py", line 342, in train
raise skipped from exception_cause(skipped)
File "/gpfslocalsup/pub/anaconda-py3/2023.09/envs/pytorch-gpu-2.2.0+py3.11.7/lib/python3.11/site-packages/ray/air/_internal/util.py", line 88, in run
self._ret = self._target(*self._args, **self._kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/gpfslocalsup/pub/anaconda-py3/2023.09/envs/pytorch-gpu-2.2.0+py3.11.7/lib/python3.11/site-packages/ray/tune/trainable/function_trainable.py", line 249, in <lambda>
training_func=lambda: self._trainable_func(self.config),
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/gpfslocalsup/pub/anaconda-py3/2023.09/envs/pytorch-gpu-2.2.0+py3.11.7/lib/python3.11/site-packages/ray/tune/trainable/function_trainable.py", line 332, in _trainable_func
output = fn()
^^^^
File "/gpfslocalsup/pub/anaconda-py3/2023.09/envs/pytorch-gpu-2.2.0+py3.11.7/lib/python3.11/site-packages/ray/tune/trainable/util.py", line 138, in inner
return trainable(config, **fn_kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/gpfswork/rech/ute/ucg81ws/these/LC-MS-RT-prediction/main_ray_tune.py", line 66, in train_model
pred_rt = net.module.forward_rt(seq)
^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/gpfsdswork/projects/rech/ute/ucg81ws/these/LC-MS-RT-prediction/model_custom.py", line 111, in forward_rt
out_rt = self.decoder_RT(enc)
^^^^^^^^^^^^^^^^^^^^
File "/gpfslocalsup/pub/anaconda-py3/2023.09/envs/pytorch-gpu-2.2.0+py3.11.7/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/gpfslocalsup/pub/anaconda-py3/2023.09/envs/pytorch-gpu-2.2.0+py3.11.7/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/gpfslocalsup/pub/anaconda-py3/2023.09/envs/pytorch-gpu-2.2.0+py3.11.7/lib/python3.11/site-packages/torch/nn/modules/container.py", line 217, in forward
input = module(input)
^^^^^^^^^^^^^
File "/gpfslocalsup/pub/anaconda-py3/2023.09/envs/pytorch-gpu-2.2.0+py3.11.7/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/gpfslocalsup/pub/anaconda-py3/2023.09/envs/pytorch-gpu-2.2.0+py3.11.7/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/gpfslocalsup/pub/anaconda-py3/2023.09/envs/pytorch-gpu-2.2.0+py3.11.7/lib/python3.11/site-packages/torch/nn/modules/transformer.py", line 391, in forward
output = mod(output, src_mask=mask, is_causal=is_causal, src_key_padding_mask=src_key_padding_mask_for_layers)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/gpfslocalsup/pub/anaconda-py3/2023.09/envs/pytorch-gpu-2.2.0+py3.11.7/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/gpfslocalsup/pub/anaconda-py3/2023.09/envs/pytorch-gpu-2.2.0+py3.11.7/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/gpfslocalsup/pub/anaconda-py3/2023.09/envs/pytorch-gpu-2.2.0+py3.11.7/lib/python3.11/site-packages/torch/nn/modules/transformer.py", line 715, in forward
x = self.norm2(x + self._ff_block(x))
^^^^^^^^^^^^^^^^^
File "/gpfslocalsup/pub/anaconda-py3/2023.09/envs/pytorch-gpu-2.2.0+py3.11.7/lib/python3.11/site-packages/torch/nn/modules/transformer.py", line 730, in _ff_block
x = self.linear2(self.dropout(self.activation(self.linear1(x))))
^^^^^^^^^^^^^^^
File "/gpfslocalsup/pub/anaconda-py3/2023.09/envs/pytorch-gpu-2.2.0+py3.11.7/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/gpfslocalsup/pub/anaconda-py3/2023.09/envs/pytorch-gpu-2.2.0+py3.11.7/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/gpfslocalsup/pub/anaconda-py3/2023.09/envs/pytorch-gpu-2.2.0+py3.11.7/lib/python3.11/site-packages/torch/nn/modules/linear.py", line 116, in forward
return F.linear(input, self.weight, self.bias)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 400.00 MiB. GPU 0 has a total capacity of 31.73 GiB of which 116.69 MiB is free. Including non-PyTorch memory, this process has 31.62 GiB memory in use. Of the allocated memory 30.71 GiB is allocated by PyTorch, and 541.03 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
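
The traceback above ends in a CUDA out-of-memory error inside the feed-forward block of the RT decoder. As a hedged sketch of two mitigations suggested by the error message itself (neither change is part of this commit), one could enable expandable segments and shrink the sampled batch sizes:

# Sketch only: values are illustrative, not taken from this commit.
import os

# 1) Follow the allocator hint from the traceback to reduce fragmentation.
#    Must be set before the first CUDA allocation (i.e. before torch touches the GPU).
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# 2) Cap the batch sizes Ray Tune can sample so the largest configurations
#    still fit on a 32 GiB GPU.
from ray import tune
search_space_update = {"batch_size": tune.choice([256, 512, 1024])}
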
{
"batch_size": 2048,
"decoder_int_ff": 512,
"decoder_int_num_layer": 1,
"decoder_rt_ff": 2048,
"decoder_rt_num_layer": 8,
"drop_rate": 0.8248444707020037,
"embedding_dim": 1024,
"encoder_ff": 1024,
"encoder_num_layer": 4,
"lr": 0.027832194286055118,
"n_head": 16
}
\ No newline at end of file
{
"batch_size": 2048,
"decoder_int_ff": 512,
"decoder_int_num_layer": 1,
"decoder_rt_ff": 2048,
"decoder_rt_num_layer": 1,
"drop_rate": 0.4976920968263533,
"embedding_dim": 1024,
"encoder_ff": 512,
"encoder_num_layer": 1,
"lr": 0.013704737381480214,
"n_head": 4
}
\ No newline at end of file
loss,timestamp,checkpoint_dir_name,should_checkpoint,done,training_iteration,trial_id,date,time_this_iter_s,time_total_s,pid,hostname,node_ip,time_since_restore,iterations_since_restore
2700.2572838490405,1719534417,checkpoint_000000,True,False,1,32772c28,2024-06-28_02-26-57,219.04653239250183,219.04653239250183,242786,r8i6n8,10.159.28.66,219.04653239250183,1
2609.489275113804,1719534622,checkpoint_000001,True,False,2,32772c28,2024-06-28_02-30-22,204.8529450893402,423.89947748184204,242786,r8i6n8,10.159.28.66,423.89947748184204,2
2558.6601024237207,1719534827,checkpoint_000002,True,False,3,32772c28,2024-06-28_02-33-48,205.22449278831482,629.1239702701569,242786,r8i6n8,10.159.28.66,629.1239702701569,3
2442.0073280634842,1719535032,checkpoint_000003,True,False,4,32772c28,2024-06-28_02-37-13,205.06548023223877,834.1894505023956,242786,r8i6n8,10.159.28.66,834.1894505023956,4
2364.7526893916092,1719535238,checkpoint_000004,True,False,5,32772c28,2024-06-28_02-40-38,205.07774353027344,1039.267194032669,242786,r8i6n8,10.159.28.66,1039.267194032669,5
2269.5375149944634,1719535443,checkpoint_000005,True,False,6,32772c28,2024-06-28_02-44-03,205.03923749923706,1244.3064315319061,242786,r8i6n8,10.159.28.66,1244.3064315319061,6
2195.085433839813,1719535648,checkpoint_000006,True,False,7,32772c28,2024-06-28_02-47-28,205.1729245185852,1449.4793560504913,242786,r8i6n8,10.159.28.66,1449.4793560504913,7
2150.467501422552,1719535853,checkpoint_000007,True,False,8,32772c28,2024-06-28_02-50-53,204.98054313659668,1654.459899187088,242786,r8i6n8,10.159.28.66,1654.459899187088,8
2097.3525179164617,1719536057,checkpoint_000008,True,False,9,32772c28,2024-06-28_02-54-18,204.72472834587097,1859.184627532959,242786,r8i6n8,10.159.28.66,1859.184627532959,9
2048.298097625492,1719536262,checkpoint_000009,True,False,10,32772c28,2024-06-28_02-57-43,204.9549217224121,2064.139549255371,242786,r8i6n8,10.159.28.66,2064.139549255371,10
{"loss": 2700.2572838490405, "timestamp": 1719534417, "checkpoint_dir_name": "checkpoint_000000", "should_checkpoint": true, "done": false, "training_iteration": 1, "trial_id": "32772c28", "date": "2024-06-28_02-26-57", "time_this_iter_s": 219.04653239250183, "time_total_s": 219.04653239250183, "pid": 242786, "hostname": "r8i6n8", "node_ip": "10.159.28.66", "config": {"encoder_num_layer": 1, "decoder_rt_num_layer": 1, "decoder_int_num_layer": 1, "embedding_dim": 1024, "encoder_ff": 512, "decoder_rt_ff": 2048, "decoder_int_ff": 512, "n_head": 4, "drop_rate": 0.4976920968263533, "lr": 0.013704737381480214, "batch_size": 2048}, "time_since_restore": 219.04653239250183, "iterations_since_restore": 1}
{"loss": 2609.489275113804, "timestamp": 1719534622, "checkpoint_dir_name": "checkpoint_000001", "should_checkpoint": true, "done": false, "training_iteration": 2, "trial_id": "32772c28", "date": "2024-06-28_02-30-22", "time_this_iter_s": 204.8529450893402, "time_total_s": 423.89947748184204, "pid": 242786, "hostname": "r8i6n8", "node_ip": "10.159.28.66", "config": {"encoder_num_layer": 1, "decoder_rt_num_layer": 1, "decoder_int_num_layer": 1, "embedding_dim": 1024, "encoder_ff": 512, "decoder_rt_ff": 2048, "decoder_int_ff": 512, "n_head": 4, "drop_rate": 0.4976920968263533, "lr": 0.013704737381480214, "batch_size": 2048}, "time_since_restore": 423.89947748184204, "iterations_since_restore": 2}
{"loss": 2558.6601024237207, "timestamp": 1719534827, "checkpoint_dir_name": "checkpoint_000002", "should_checkpoint": true, "done": false, "training_iteration": 3, "trial_id": "32772c28", "date": "2024-06-28_02-33-48", "time_this_iter_s": 205.22449278831482, "time_total_s": 629.1239702701569, "pid": 242786, "hostname": "r8i6n8", "node_ip": "10.159.28.66", "config": {"encoder_num_layer": 1, "decoder_rt_num_layer": 1, "decoder_int_num_layer": 1, "embedding_dim": 1024, "encoder_ff": 512, "decoder_rt_ff": 2048, "decoder_int_ff": 512, "n_head": 4, "drop_rate": 0.4976920968263533, "lr": 0.013704737381480214, "batch_size": 2048}, "time_since_restore": 629.1239702701569, "iterations_since_restore": 3}
{"loss": 2442.0073280634842, "timestamp": 1719535032, "checkpoint_dir_name": "checkpoint_000003", "should_checkpoint": true, "done": false, "training_iteration": 4, "trial_id": "32772c28", "date": "2024-06-28_02-37-13", "time_this_iter_s": 205.06548023223877, "time_total_s": 834.1894505023956, "pid": 242786, "hostname": "r8i6n8", "node_ip": "10.159.28.66", "config": {"encoder_num_layer": 1, "decoder_rt_num_layer": 1, "decoder_int_num_layer": 1, "embedding_dim": 1024, "encoder_ff": 512, "decoder_rt_ff": 2048, "decoder_int_ff": 512, "n_head": 4, "drop_rate": 0.4976920968263533, "lr": 0.013704737381480214, "batch_size": 2048}, "time_since_restore": 834.1894505023956, "iterations_since_restore": 4}
{"loss": 2364.7526893916092, "timestamp": 1719535238, "checkpoint_dir_name": "checkpoint_000004", "should_checkpoint": true, "done": false, "training_iteration": 5, "trial_id": "32772c28", "date": "2024-06-28_02-40-38", "time_this_iter_s": 205.07774353027344, "time_total_s": 1039.267194032669, "pid": 242786, "hostname": "r8i6n8", "node_ip": "10.159.28.66", "config": {"encoder_num_layer": 1, "decoder_rt_num_layer": 1, "decoder_int_num_layer": 1, "embedding_dim": 1024, "encoder_ff": 512, "decoder_rt_ff": 2048, "decoder_int_ff": 512, "n_head": 4, "drop_rate": 0.4976920968263533, "lr": 0.013704737381480214, "batch_size": 2048}, "time_since_restore": 1039.267194032669, "iterations_since_restore": 5}
{"loss": 2269.5375149944634, "timestamp": 1719535443, "checkpoint_dir_name": "checkpoint_000005", "should_checkpoint": true, "done": false, "training_iteration": 6, "trial_id": "32772c28", "date": "2024-06-28_02-44-03", "time_this_iter_s": 205.03923749923706, "time_total_s": 1244.3064315319061, "pid": 242786, "hostname": "r8i6n8", "node_ip": "10.159.28.66", "config": {"encoder_num_layer": 1, "decoder_rt_num_layer": 1, "decoder_int_num_layer": 1, "embedding_dim": 1024, "encoder_ff": 512, "decoder_rt_ff": 2048, "decoder_int_ff": 512, "n_head": 4, "drop_rate": 0.4976920968263533, "lr": 0.013704737381480214, "batch_size": 2048}, "time_since_restore": 1244.3064315319061, "iterations_since_restore": 6}
{"loss": 2195.085433839813, "timestamp": 1719535648, "checkpoint_dir_name": "checkpoint_000006", "should_checkpoint": true, "done": false, "training_iteration": 7, "trial_id": "32772c28", "date": "2024-06-28_02-47-28", "time_this_iter_s": 205.1729245185852, "time_total_s": 1449.4793560504913, "pid": 242786, "hostname": "r8i6n8", "node_ip": "10.159.28.66", "config": {"encoder_num_layer": 1, "decoder_rt_num_layer": 1, "decoder_int_num_layer": 1, "embedding_dim": 1024, "encoder_ff": 512, "decoder_rt_ff": 2048, "decoder_int_ff": 512, "n_head": 4, "drop_rate": 0.4976920968263533, "lr": 0.013704737381480214, "batch_size": 2048}, "time_since_restore": 1449.4793560504913, "iterations_since_restore": 7}
{"loss": 2150.467501422552, "timestamp": 1719535853, "checkpoint_dir_name": "checkpoint_000007", "should_checkpoint": true, "done": false, "training_iteration": 8, "trial_id": "32772c28", "date": "2024-06-28_02-50-53", "time_this_iter_s": 204.98054313659668, "time_total_s": 1654.459899187088, "pid": 242786, "hostname": "r8i6n8", "node_ip": "10.159.28.66", "config": {"encoder_num_layer": 1, "decoder_rt_num_layer": 1, "decoder_int_num_layer": 1, "embedding_dim": 1024, "encoder_ff": 512, "decoder_rt_ff": 2048, "decoder_int_ff": 512, "n_head": 4, "drop_rate": 0.4976920968263533, "lr": 0.013704737381480214, "batch_size": 2048}, "time_since_restore": 1654.459899187088, "iterations_since_restore": 8}
{"loss": 2097.3525179164617, "timestamp": 1719536057, "checkpoint_dir_name": "checkpoint_000008", "should_checkpoint": true, "done": false, "training_iteration": 9, "trial_id": "32772c28", "date": "2024-06-28_02-54-18", "time_this_iter_s": 204.72472834587097, "time_total_s": 1859.184627532959, "pid": 242786, "hostname": "r8i6n8", "node_ip": "10.159.28.66", "config": {"encoder_num_layer": 1, "decoder_rt_num_layer": 1, "decoder_int_num_layer": 1, "embedding_dim": 1024, "encoder_ff": 512, "decoder_rt_ff": 2048, "decoder_int_ff": 512, "n_head": 4, "drop_rate": 0.4976920968263533, "lr": 0.013704737381480214, "batch_size": 2048}, "time_since_restore": 1859.184627532959, "iterations_since_restore": 9}
{"loss": 2048.298097625492, "timestamp": 1719536262, "checkpoint_dir_name": "checkpoint_000009", "should_checkpoint": true, "done": false, "training_iteration": 10, "trial_id": "32772c28", "date": "2024-06-28_02-57-43", "time_this_iter_s": 204.9549217224121, "time_total_s": 2064.139549255371, "pid": 242786, "hostname": "r8i6n8", "node_ip": "10.159.28.66", "config": {"encoder_num_layer": 1, "decoder_rt_num_layer": 1, "decoder_int_num_layer": 1, "embedding_dim": 1024, "encoder_ff": 512, "decoder_rt_ff": 2048, "decoder_int_ff": 512, "n_head": 4, "drop_rate": 0.4976920968263533, "lr": 0.013704737381480214, "batch_size": 2048}, "time_since_restore": 2064.139549255371, "iterations_since_restore": 10}
{
"batch_size": 1024,
"decoder_int_ff": 512,
"decoder_int_num_layer": 1,
"decoder_rt_ff": 512,
"decoder_rt_num_layer": 2,
"drop_rate": 0.3031602001882847,
"embedding_dim": 1024,
"encoder_ff": 1024,
"encoder_num_layer": 1,
"lr": 0.019401170360006482,
"n_head": 2
}
\ No newline at end of file
loss,timestamp,checkpoint_dir_name,should_checkpoint,done,training_iteration,trial_id,date,time_this_iter_s,time_total_s,pid,hostname,node_ip,time_since_restore,iterations_since_restore
4021.740253598671,1719492153,checkpoint_000000,True,False,1,3c9a1c0c,2024-06-27_14-42-33,270.95838141441345,270.95838141441345,69992,r8i6n8,10.159.28.66,270.95838141441345,1