Commit b286dd0d authored by Alexandre Chapin

Remove useless params and clean some code

parent 1e34fe2b
@@ -9,16 +9,14 @@ model:
hidden_dim: 128
iters: 3
training:
num_workers: 2
num_gpus: 1
batch_size: 8
max_it: 333000000
num_workers: 48
num_gpus: 8
batch_size: 256
max_it: 1000000
warmup_it: 10000
lr_warmup: 5000
decay_rate: 0.5
decay_it: 100000
visualize_every: 5000
validate_every: 5000
checkpoint_every: 1000
backup_every: 25000
@@ -9,16 +9,14 @@ model:
hidden_dim: 128
iters: 3
training:
num_workers: 2
num_gpus: 1
batch_size: 8
max_it: 333000000
num_workers: 48
num_gpus: 8
batch_size: 256
max_it: 1000000
warmup_it: 10000
lr_warmup: 5000
decay_rate: 0.5
decay_it: 100000
visualize_every: 5000
validate_every: 5000
checkpoint_every: 1000
backup_every: 25000
@@ -12,13 +12,11 @@ training:
num_workers: 2
num_gpus: 1
batch_size: 8
max_it: 333000000
max_it: 1000000
warmup_it: 10000
lr_warmup: 5000
decay_rate: 0.5
decay_it: 100000
visualize_every: 5000
validate_every: 5000
checkpoint_every: 1000
backup_every: 25000
@@ -12,13 +12,11 @@ training:
num_workers: 2
num_gpus: 1
batch_size: 8
max_it: 333000000
max_it: 1000000
warmup_it: 10000
lr_warmup: 5000
decay_rate: 0.5
decay_it: 100000
visualize_every: 5000
validate_every: 5000
checkpoint_every: 1000
backup_every: 25000
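For orientation, a rough sketch of the arithmetic behind the new multi-GPU settings. It assumes batch_size is the per-process batch that each DDP rank's DataLoader receives (the config itself does not say) and reuses the worker split from train_sa.py below; the names are illustrative, not part of the repository.

# Back-of-the-envelope check of the updated config (assumption: batch_size is
# per DDP process, as the plain DataLoader usage in train_sa.py suggests).
num_gpus = 8
batch_size = 256      # per-process batch from the updated config
num_workers = 48      # dataloader workers requested in the updated config

global_batch = num_gpus * batch_size                        # 2048 samples per optimizer step
train_workers = num_workers - 8 if num_workers > 8 else 0   # 40, mirrors train_sa.py
val_workers = 8 if num_workers > 8 else 0                   # 8, mirrors train_sa.py
print(global_batch, train_workers, val_workers)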
File moved
#!/bin/bash
#SBATCH --job-name=slot_att_clevr
#SBATCH --output=logs/job.%j.out
#SBATCH --error=logs/job.%j.err
#SBATCH --account=uli@v100
#SBATCH --partition=gpu_p2
#SBATCH --gres=gpu:8
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=8 #number of MPI tasks per node (=number of GPUs per node)
#SBATCH --exclusive
#SBATCH --hint=nomultithread
#SBATCH -t 20:00:00
#SBATCH --mail-user=alexandre.chapin@ec-lyon.fr
#SBATCH --mail-type=FAIL
#SBATCH --qos=qos_gpu-t3
module purge
echo ${SLURM_NODELIST}
#module load cudnn/8.5.0.96-11.7-cuda
module load pytorch-gpu/py3/2.0.0
srun python train_sa.py runs/clevr/slot_att/config.yaml --wandb
#!/bin/bash
#SBATCH --job-name=slot_att_ycb
#SBATCH --output=logs/job.%j.out
#SBATCH --error=logs/job.%j.err
#SBATCH --account=uli@v100
#SBATCH --partition=gpu_p2
#SBATCH --gres=gpu:8
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=8 #number of MPI tasks per node (=number of GPUs per node)
#SBATCH --exclusive
#SBATCH --hint=nomultithread
#SBATCH -t 20:00:00
#SBATCH --mail-user=alexandre.chapin@ec-lyon.fr
#SBATCH --mail-type=FAIL
#SBATCH --qos=qos_gpu-t3
module purge
echo ${SLURM_NODELIST}
#module load cudnn/8.5.0.96-11.7-cuda
module load pytorch-gpu/py3/2.0.0
srun python train_sa.py runs/ycb/slot_att/config.yaml --wandb
File moved
File moved
#!/bin/bash
#SBATCH --job-name=trans_slot_att_clevr
#SBATCH --output=logs/job.%j.out
#SBATCH --error=logs/job.%j.err
#SBATCH --account=uli@v100
#SBATCH --partition=gpu_p2
#SBATCH --gres=gpu:8
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=8 #number of MPI tasks per node (=number of GPUs per node)
#SBATCH --exclusive
#SBATCH --hint=nomultithread
#SBATCH -t 20:00:00
#SBATCH --mail-user=alexandre.chapin@ec-lyon.fr
#SBATCH --mail-type=FAIL
#SBATCH --qos=qos_gpu-t3
module purge
echo ${SLURM_NODELIST}
#module load cudnn/8.5.0.96-11.7-cuda
module load pytorch-gpu/py3/2.0.0
srun python train_sa.py runs/clevr/slot_att/config_tsa.yaml --wandb
#!/bin/bash
#SBATCH --job-name=trans_slot_att_ycb
#SBATCH --output=logs/job.%j.out
#SBATCH --error=logs/job.%j.err
#SBATCH --account=uli@v100
#SBATCH --partition=gpu_p2
#SBATCH --gres=gpu:8
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=8 #number of MPI tasks per node (=number of GPUs per node)
#SBATCH --exclusive
#SBATCH --hint=nomultithread
#SBATCH -t 20:00:00
#SBATCH --mail-user=alexandre.chapin@ec-lyon.fr
#SBATCH --mail-typ=FAIL
#SBATCH --qos=qos_gpu-t3
module purge
echo ${SLURM_NODELIST}
#module load cudnn/8.5.0.96-11.7-cuda
module load pytorch-gpu/py3/2.0.0
srun python train_sa.py runs/ycb/slot_att/config_tsa.yaml --wandb
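All four launch scripts start one srun task per GPU, and PyTorch Lightning's SLURM integration derives the DDP rank and world size from the environment rather than from command-line flags. A minimal sketch, not part of the repository, of the variables involved when running under srun as above:

import os

# Inspect the SLURM variables that Lightning's SLURMEnvironment reads to set up
# DDP; with --ntasks-per-node=8 and --gres=gpu:8 there is one task per GPU.
def describe_slurm_task():
    rank = int(os.environ.get("SLURM_PROCID", 0))         # global rank of this task
    local_rank = int(os.environ.get("SLURM_LOCALID", 0))  # GPU index on this node
    world_size = int(os.environ.get("SLURM_NTASKS", 1))   # 8 with the scripts above
    return rank, local_rank, world_size

if __name__ == "__main__":
    print(describe_slurm_task())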
@@ -32,6 +32,7 @@ def main():
parser.add_argument('--wandb', action='store_true', help='Log run to Weights and Biases.')
parser.add_argument('--seed', type=int, default=0, help='Random seed.')
parser.add_argument('--ckpt', type=str, default=None, help='Model checkpoint path')
parser.add_argument('--profiler', action='store_true', help='Activate the profiler.')
args = parser.parse_args()
with open(args.config, 'r') as f:
@@ -53,12 +54,12 @@ def main():
#### Create datasets
train_dataset = data.get_dataset('train', cfg['data'])
train_loader = DataLoader(
train_dataset, batch_size=batch_size, num_workers=num_workers-9 if num_workers > 9 else 0,
train_dataset, batch_size=batch_size, num_workers=num_workers-8 if num_workers > 8 else 0,
shuffle=True, worker_init_fn=data.worker_init_fn, pin_memory=True)
val_dataset = data.get_dataset('val', cfg['data'])
val_loader = DataLoader(
val_dataset, batch_size=batch_size, num_workers=8 if num_workers > 9 else 0,
val_dataset, batch_size=batch_size, num_workers=8 if num_workers > 8 else 0,
shuffle=True, worker_init_fn=data.worker_init_fn, pin_memory=True)
#### Create model
@@ -73,31 +74,37 @@ def main():
mode="max",
dirpath="./checkpoints",
filename="ckpt-" + str(cfg["data"]["dataset"])+ "-slots:"+ str(cfg["model"]["num_slots"]) + "-" + str(cfg["model"]["model_type"]) +"-{epoch:02d}-psnr{val_psnr:.2f}",
save_weights_only=True, # don't save optimizer states nor lr-scheduler, ...
every_n_train_steps=cfg["training"]["checkpoint_every"]
save_weights_only=True # don't save optimizer states nor lr-scheduler, ...
)
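# Without every_n_train_steps (or any other saving interval), ModelCheckpoint
# should fall back to saving after each validation pass (here every
# cfg["training"]["validate_every"] steps via val_check_interval), keeping only
# the best val_psnr weights.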
early_stopping = EarlyStopping(monitor="val_psnr", mode="max")
trainer = pl.Trainer(accelerator="gpu",
devices=num_gpus,
profiler="simple",
profiler="simple" if args.proffiler else None,
default_root_dir="./logs",
logger=WandbLogger(project="slot-att", offline=True) if args.wandb else None,
strategy="ddp_find_unused_parameters_true" if num_gpus > 1 else "auto",
callbacks=[checkpoint_callback, early_stopping],
log_every_n_steps=100,
val_check_interval=cfg["training"]["validate_every"],
check_val_every_n_epoch=None,
max_steps=num_train_steps,
enable_model_summary=True)
trainer.fit(model, train_loader, val_loader)
#### Evaluate the model
print(f"Begin testing : ")
test_dataset = data.get_dataset('test', cfg['data'])
test_loader = DataLoader(
test_dataset, batch_size=batch_size, num_workers=8 if num_workers > 8 else 0,
shuffle=True, worker_init_fn=data.worker_init_fn, pin_memory=True)
trainer.test(ckpt_path="best", dataloaders=test_loader, verbose=True)
print(f"Begin visualization : ")
#### Create datasets
vis_dataset = data.get_dataset('train', cfg['data'])
vis_loader = DataLoader(
vis_dataset, batch_size=1, num_workers=1,
vis_dataset, batch_size=1, num_workers=0,
shuffle=True, worker_init_fn=data.worker_init_fn)
device = model.device
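The dataloader change above splits the configured workers between the train and validation loaders. A small self-contained sketch of that rule, using a hypothetical split_workers helper that is not part of train_sa.py:

# Mirrors the num_workers logic in train_sa.py: reserve 8 workers for the
# validation loader, give the rest to training, and fall back to loading in the
# main process (0 workers) when 8 or fewer workers are configured.
def split_workers(num_workers: int) -> tuple[int, int]:
    if num_workers > 8:
        return num_workers - 8, 8
    return 0, 0

assert split_workers(48) == (40, 8)  # multi-GPU config above
assert split_workers(2) == (0, 0)    # single-GPU config above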