Commit a790c5c2 authored by Léo Schneider, committed by Schneider Leo

res

parent efd6839c
Showing 76 additions and 0 deletions
{
"batch_size": 1024,
"decoder_int_ff": 1024,
"decoder_int_num_layer": 2,
"decoder_rt_ff": 2048,
"decoder_rt_num_layer": 1,
"drop_rate": 0.922082156004742,
"embedding_dim": 64,
"encoder_ff": 2048,
"encoder_num_layer": 4,
"lr": 0.0016525285062566051,
"n_head": 1
}
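Each params.json block like the one above records the hyperparameters Ray Tune sampled for one trial. A minimal sketch of how such a search space might be declared: only the parameter names come from params.json; the ranges and choices below are illustrative assumptions, not the repository's actual tuning configuration.

```python
from ray import tune

# Hypothetical search space. Parameter names match params.json; the
# concrete choices/ranges are assumptions for illustration only.
search_space = {
    "encoder_num_layer": tune.choice([2, 4, 8]),
    "decoder_rt_num_layer": tune.choice([1, 2, 4]),
    "decoder_int_num_layer": tune.choice([2, 4, 8]),
    "embedding_dim": tune.choice([16, 64, 256, 1024]),
    "encoder_ff": tune.choice([1024, 2048]),
    "decoder_rt_ff": tune.choice([512, 1024, 2048]),
    "decoder_int_ff": tune.choice([512, 1024, 2048]),
    "n_head": tune.choice([1, 2, 4]),
    "drop_rate": tune.uniform(0.0, 1.0),
    "lr": tune.loguniform(1e-4, 5e-3),
    "batch_size": tune.choice([256, 512, 1024]),
}
```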
loss,timestamp,checkpoint_dir_name,should_checkpoint,done,training_iteration,trial_id,date,time_this_iter_s,time_total_s,pid,hostname,node_ip,time_since_restore,iterations_since_restore
2102.196185254675,1719325314,checkpoint_000000,True,False,1,1920f493,2024-06-25_16-21-54,79.38143539428711,79.38143539428711,69318,r3i5n6,10.159.8.159,79.38143539428711,1
{"loss": 2102.196185254675, "timestamp": 1719325314, "checkpoint_dir_name": "checkpoint_000000", "should_checkpoint": true, "done": false, "training_iteration": 1, "trial_id": "1920f493", "date": "2024-06-25_16-21-54", "time_this_iter_s": 79.38143539428711, "time_total_s": 79.38143539428711, "pid": 69318, "hostname": "r3i5n6", "node_ip": "10.159.8.159", "config": {"encoder_num_layer": 4, "decoder_rt_num_layer": 1, "decoder_int_num_layer": 2, "embedding_dim": 64, "encoder_ff": 2048, "decoder_rt_ff": 2048, "decoder_int_ff": 1024, "n_head": 1, "drop_rate": 0.922082156004742, "lr": 0.0016525285062566051, "batch_size": 1024}, "time_since_restore": 79.38143539428711, "iterations_since_restore": 1}
{
"batch_size": 1024,
"decoder_int_ff": 1024,
"decoder_int_num_layer": 8,
"decoder_rt_ff": 512,
"decoder_rt_num_layer": 4,
"drop_rate": 0.03175079062321118,
"embedding_dim": 16,
"encoder_ff": 2048,
"encoder_num_layer": 2,
"lr": 0.003436670599863372,
"n_head": 1
}
loss,timestamp,checkpoint_dir_name,should_checkpoint,done,training_iteration,trial_id,date,time_this_iter_s,time_total_s,pid,hostname,node_ip,time_since_restore,iterations_since_restore
1961.4144263079786,1719325387,checkpoint_000000,True,False,1,1de460a8,2024-06-25_16-23-07,73.56608295440674,73.56608295440674,69318,r3i5n6,10.159.8.159,73.56608295440674,1
1955.3101080947035,1719327763,checkpoint_000001,True,False,2,1de460a8,2024-06-25_17-02-43,72.80412983894348,146.37021279335022,130020,r3i5n6,10.159.8.159,72.80412983894348,1
{"loss": 1961.4144263079786, "timestamp": 1719325387, "checkpoint_dir_name": "checkpoint_000000", "should_checkpoint": true, "done": false, "training_iteration": 1, "trial_id": "1de460a8", "date": "2024-06-25_16-23-07", "time_this_iter_s": 73.56608295440674, "time_total_s": 73.56608295440674, "pid": 69318, "hostname": "r3i5n6", "node_ip": "10.159.8.159", "config": {"encoder_num_layer": 2, "decoder_rt_num_layer": 4, "decoder_int_num_layer": 8, "embedding_dim": 16, "encoder_ff": 2048, "decoder_rt_ff": 512, "decoder_int_ff": 1024, "n_head": 1, "drop_rate": 0.03175079062321118, "lr": 0.003436670599863372, "batch_size": 1024}, "time_since_restore": 73.56608295440674, "iterations_since_restore": 1}
{"loss": 1955.3101080947035, "timestamp": 1719327763, "checkpoint_dir_name": "checkpoint_000001", "should_checkpoint": true, "done": false, "training_iteration": 2, "trial_id": "1de460a8", "date": "2024-06-25_17-02-43", "time_this_iter_s": 72.80412983894348, "time_total_s": 146.37021279335022, "pid": 130020, "hostname": "r3i5n6", "node_ip": "10.159.8.159", "config": {"encoder_num_layer": 2, "decoder_rt_num_layer": 4, "decoder_int_num_layer": 8, "embedding_dim": 16, "encoder_ff": 2048, "decoder_rt_ff": 512, "decoder_int_ff": 1024, "n_head": 1, "drop_rate": 0.03175079062321118, "lr": 0.003436670599863372, "batch_size": 1024}, "time_since_restore": 72.80412983894348, "iterations_since_restore": 1}
Failure # 1 (occurred at 2024-06-25_17-03-02)
ray::ImplicitFunc.train() (pid=130020, ip=10.159.8.159, actor_id=595d5a321c4a0818d73ea9fa01000000, repr=train_model)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/gpfslocalsup/pub/anaconda-py3/2023.09/envs/pytorch-gpu-2.2.0+py3.11.7/lib/python3.11/site-packages/ray/tune/trainable/trainable.py", line 342, in train
raise skipped from exception_cause(skipped)
File "/gpfslocalsup/pub/anaconda-py3/2023.09/envs/pytorch-gpu-2.2.0+py3.11.7/lib/python3.11/site-packages/ray/air/_internal/util.py", line 88, in run
self._ret = self._target(*self._args, **self._kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/gpfslocalsup/pub/anaconda-py3/2023.09/envs/pytorch-gpu-2.2.0+py3.11.7/lib/python3.11/site-packages/ray/tune/trainable/function_trainable.py", line 249, in <lambda>
training_func=lambda: self._trainable_func(self.config),
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/gpfslocalsup/pub/anaconda-py3/2023.09/envs/pytorch-gpu-2.2.0+py3.11.7/lib/python3.11/site-packages/ray/tune/trainable/function_trainable.py", line 332, in _trainable_func
output = fn()
^^^^
File "/gpfslocalsup/pub/anaconda-py3/2023.09/envs/pytorch-gpu-2.2.0+py3.11.7/lib/python3.11/site-packages/ray/tune/trainable/util.py", line 138, in inner
return trainable(config, **fn_kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/gpfswork/rech/ute/ucg81ws/these/LC-MS-RT-prediction/main_ray_tune.py", line 98, in train_model
loss.backward()
File "/gpfslocalsup/pub/anaconda-py3/2023.09/envs/pytorch-gpu-2.2.0+py3.11.7/lib/python3.11/site-packages/torch/_tensor.py", line 522, in backward
torch.autograd.backward(
File "/gpfslocalsup/pub/anaconda-py3/2023.09/envs/pytorch-gpu-2.2.0+py3.11.7/lib/python3.11/site-packages/torch/autograd/__init__.py", line 266, in backward
Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 258.00 MiB. GPU 0 has a total capacity of 15.77 GiB of which 216.69 MiB is free. Including non-PyTorch memory, this process has 15.55 GiB memory in use. Of the allocated memory 15.02 GiB is allocated by PyTorch, and 161.15 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
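The trial fails in loss.backward() with a CUDA out-of-memory error on a 16 GiB GPU. The message itself suggests the allocator setting shown below; reducing the effective batch size with gradient accumulation is another common mitigation. This is a generic sketch, not the repository's actual fix, and the MSE loss is only a placeholder.

```python
import os

# Allocator hint suggested by the error message; it must be set before the
# first CUDA allocation, so ideally before torch is imported in the worker.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

import torch
import torch.nn.functional as F

def train_step_with_accumulation(model, optimizer, batches, accum_steps=4):
    """Split one large batch into `accum_steps` chunks and accumulate
    gradients, trading extra steps for lower peak GPU memory."""
    optimizer.zero_grad()
    for i, (x, y) in enumerate(batches):
        loss = F.mse_loss(model(x), y) / accum_steps  # placeholder loss
        loss.backward()
        if (i + 1) % accum_steps == 0:
            optimizer.step()
            optimizer.zero_grad()
```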
{
"batch_size": 1024,
"decoder_int_ff": 1024,
"decoder_int_num_layer": 4,
"decoder_rt_ff": 512,
"decoder_rt_num_layer": 4,
"drop_rate": 0.14711494497891564,
"embedding_dim": 1024,
"encoder_ff": 1024,
"encoder_num_layer": 8,
"lr": 0.00018307057253269556,
"n_head": 2
}
loss,timestamp,checkpoint_dir_name,should_checkpoint,done,training_iteration,trial_id,date,time_this_iter_s,time_total_s,pid,hostname,node_ip,time_since_restore,iterations_since_restore
1963.8436942513533,1719326402,checkpoint_000000,True,False,1,1ef7c88b,2024-06-25_16-40-02,1014.7334928512573,1014.7334928512573,69318,r3i5n6,10.159.8.159,1014.7334928512573,1
{"loss": 1963.8436942513533, "timestamp": 1719326402, "checkpoint_dir_name": "checkpoint_000000", "should_checkpoint": true, "done": false, "training_iteration": 1, "trial_id": "1ef7c88b", "date": "2024-06-25_16-40-02", "time_this_iter_s": 1014.7334928512573, "time_total_s": 1014.7334928512573, "pid": 69318, "hostname": "r3i5n6", "node_ip": "10.159.8.159", "config": {"encoder_num_layer": 8, "decoder_rt_num_layer": 4, "decoder_int_num_layer": 4, "embedding_dim": 1024, "encoder_ff": 1024, "decoder_rt_ff": 512, "decoder_int_ff": 1024, "n_head": 2, "drop_rate": 0.14711494497891564, "lr": 0.00018307057253269556, "batch_size": 1024}, "time_since_restore": 1014.7334928512573, "iterations_since_restore": 1}