Commit 24f3fb70 authored by Jiwen Tang

Merge branch 'det' into main

parents 42f71b6a cfb8760a
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
# Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
import os

import numpy as np

__all__ = ["mkdir", "nms", "multiclass_nms", "demo_postprocess"]


def mkdir(path):
    if not os.path.exists(path):
        os.makedirs(path)


def nms(boxes, scores, nms_thr):
    """Single class NMS implemented in Numpy."""
    x1 = boxes[:, 0]
    y1 = boxes[:, 1]
    x2 = boxes[:, 2]
    y2 = boxes[:, 3]

    areas = (x2 - x1 + 1) * (y2 - y1 + 1)
    order = scores.argsort()[::-1]

    keep = []
    while order.size > 0:
        i = order[0]
        keep.append(i)
        xx1 = np.maximum(x1[i], x1[order[1:]])
        yy1 = np.maximum(y1[i], y1[order[1:]])
        xx2 = np.minimum(x2[i], x2[order[1:]])
        yy2 = np.minimum(y2[i], y2[order[1:]])

        w = np.maximum(0.0, xx2 - xx1 + 1)
        h = np.maximum(0.0, yy2 - yy1 + 1)
        inter = w * h
        ovr = inter / (areas[i] + areas[order[1:]] - inter)

        inds = np.where(ovr <= nms_thr)[0]
        order = order[inds + 1]

    return keep


def multiclass_nms(boxes, scores, nms_thr, score_thr, class_agnostic=True):
    """Multiclass NMS implemented in Numpy."""
    if class_agnostic:
        nms_method = multiclass_nms_class_agnostic
    else:
        nms_method = multiclass_nms_class_aware
    return nms_method(boxes, scores, nms_thr, score_thr)


def multiclass_nms_class_aware(boxes, scores, nms_thr, score_thr):
    """Multiclass NMS implemented in Numpy. Class-aware version."""
    final_dets = []
    num_classes = scores.shape[1]
    for cls_ind in range(num_classes):
        cls_scores = scores[:, cls_ind]
        valid_score_mask = cls_scores > score_thr
        if valid_score_mask.sum() == 0:
            continue
        valid_scores = cls_scores[valid_score_mask]
        valid_boxes = boxes[valid_score_mask]
        keep = nms(valid_boxes, valid_scores, nms_thr)
        if len(keep) > 0:
            cls_inds = np.ones((len(keep), 1)) * cls_ind
            dets = np.concatenate([valid_boxes[keep], valid_scores[keep, None], cls_inds], 1)
            final_dets.append(dets)
    if len(final_dets) == 0:
        return None
    return np.concatenate(final_dets, 0)


def multiclass_nms_class_agnostic(boxes, scores, nms_thr, score_thr):
    """Multiclass NMS implemented in Numpy. Class-agnostic version."""
    cls_inds = scores.argmax(1)
    cls_scores = scores[np.arange(len(cls_inds)), cls_inds]

    valid_score_mask = cls_scores > score_thr
    if valid_score_mask.sum() == 0:
        return None
    valid_scores = cls_scores[valid_score_mask]
    valid_boxes = boxes[valid_score_mask]
    valid_cls_inds = cls_inds[valid_score_mask]
    keep = nms(valid_boxes, valid_scores, nms_thr)
    if not keep:
        return None
    return np.concatenate([valid_boxes[keep], valid_scores[keep, None], valid_cls_inds[keep, None]], 1)


def demo_postprocess(outputs, img_size, p6=False):
    grids = []
    expanded_strides = []

    strides = [8, 16, 32] if not p6 else [8, 16, 32, 64]

    hsizes = [img_size[0] // stride for stride in strides]
    wsizes = [img_size[1] // stride for stride in strides]

    for hsize, wsize, stride in zip(hsizes, wsizes, strides):
        xv, yv = np.meshgrid(np.arange(wsize), np.arange(hsize))
        grid = np.stack((xv, yv), 2).reshape(1, -1, 2)
        grids.append(grid)
        shape = grid.shape[:2]
        expanded_strides.append(np.full((*shape, 1), stride))

    grids = np.concatenate(grids, 1)
    expanded_strides = np.concatenate(expanded_strides, 1)
    outputs[..., :2] = (outputs[..., :2] + grids) * expanded_strides
    outputs[..., 2:4] = np.exp(outputs[..., 2:4]) * expanded_strides

    return outputs
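
Taken together, `demo_postprocess` decodes the raw grid predictions and `multiclass_nms` filters them. A minimal decode sketch, assuming a 640x640 export with 80 classes; the random input, shapes, and thresholds below are illustrative, not part of this commit:

# Illustrative decode path (8400 = 80*80 + 40*40 + 20*20 anchors for strides 8/16/32).
import numpy as np

raw = np.random.rand(1, 8400, 85).astype(np.float32)  # stand-in for the ONNX model output
preds = demo_postprocess(raw, (640, 640))[0]          # grid offsets and strides applied

boxes_xywh = preds[:, :4]
scores = preds[:, 4:5] * preds[:, 5:]                 # obj_conf * cls_conf

# convert (cx, cy, w, h) -> (x1, y1, x2, y2), the format nms() expects
boxes_xyxy = np.empty_like(boxes_xywh)
boxes_xyxy[:, 0] = boxes_xywh[:, 0] - boxes_xywh[:, 2] / 2
boxes_xyxy[:, 1] = boxes_xywh[:, 1] - boxes_xywh[:, 3] / 2
boxes_xyxy[:, 2] = boxes_xywh[:, 0] + boxes_xywh[:, 2] / 2
boxes_xyxy[:, 3] = boxes_xywh[:, 1] + boxes_xywh[:, 3] / 2

dets = multiclass_nms(boxes_xyxy, scores, nms_thr=0.45, score_thr=0.1)
if dets is not None:
    final_boxes, final_scores, final_cls = dets[:, :4], dets[:, 4], dets[:, 5]
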
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
# This file mainly comes from
# https://github.com/facebookresearch/detectron2/blob/master/detectron2/utils/comm.py
# Copyright (c) Facebook, Inc. and its affiliates.
# Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
"""This file contains primitives for multi-gpu communication.
This is useful when doing distributed training.
"""
import functools
import os
import pickle
import time
from contextlib import contextmanager

from loguru import logger

import numpy as np
import torch
from torch import distributed as dist

__all__ = [
    "get_num_devices",
    "wait_for_the_master",
    "is_main_process",
    "synchronize",
    "get_world_size",
    "get_rank",
    "get_local_rank",
    "get_local_size",
    "time_synchronized",
    "gather",
    "all_gather",
]

_LOCAL_PROCESS_GROUP = None


def get_num_devices():
    gpu_list = os.getenv("CUDA_VISIBLE_DEVICES", None)
    if gpu_list is not None:
        return len(gpu_list.split(","))
    else:
        devices_list_info = os.popen("nvidia-smi -L")
        devices_list_info = devices_list_info.read().strip().split("\n")
        return len(devices_list_info)


@contextmanager
def wait_for_the_master(local_rank: int):
    """Make all processes wait for the master to finish its task first."""
    if local_rank > 0:
        dist.barrier()
    yield
    if local_rank == 0:
        if not dist.is_available():
            return
        if not dist.is_initialized():
            return
        else:
            dist.barrier()


def synchronize():
    """Helper function to synchronize (barrier) among all processes when using
    distributed training."""
    if not dist.is_available():
        return
    if not dist.is_initialized():
        return
    world_size = dist.get_world_size()
    if world_size == 1:
        return
    dist.barrier()


def get_world_size() -> int:
    if not dist.is_available():
        return 1
    if not dist.is_initialized():
        return 1
    return dist.get_world_size()


def get_rank() -> int:
    if not dist.is_available():
        return 0
    if not dist.is_initialized():
        return 0
    return dist.get_rank()


def get_local_rank() -> int:
    """
    Returns:
        The rank of the current process within the local (per-machine) process group.
    """
    if not dist.is_available():
        return 0
    if not dist.is_initialized():
        return 0
    assert _LOCAL_PROCESS_GROUP is not None
    return dist.get_rank(group=_LOCAL_PROCESS_GROUP)


def get_local_size() -> int:
    """
    Returns:
        The size of the per-machine process group, i.e. the number of processes per machine.
    """
    if not dist.is_available():
        return 1
    if not dist.is_initialized():
        return 1
    return dist.get_world_size(group=_LOCAL_PROCESS_GROUP)


def is_main_process() -> bool:
    return get_rank() == 0


@functools.lru_cache()
def _get_global_gloo_group():
    """Return a process group based on the gloo backend, containing all ranks.
    The result is cached."""
    if dist.get_backend() == "nccl":
        return dist.new_group(backend="gloo")
    else:
        return dist.group.WORLD


def _serialize_to_tensor(data, group):
    backend = dist.get_backend(group)
    assert backend in ["gloo", "nccl"]
    device = torch.device("cpu" if backend == "gloo" else "cuda")

    buffer = pickle.dumps(data)
    if len(buffer) > 1024**3:
        logger.warning(
            "Rank {} trying to all-gather {:.2f} GB of data on device {}".format(
                get_rank(), len(buffer) / (1024**3), device
            )
        )
    storage = torch.ByteStorage.from_buffer(buffer)
    tensor = torch.ByteTensor(storage).to(device=device)
    return tensor


def _pad_to_largest_tensor(tensor, group):
    """
    Returns:
        list[int]: size of the tensor, on each rank
        Tensor: padded tensor that has the max size
    """
    world_size = dist.get_world_size(group=group)
    assert world_size >= 1, "comm.gather/all_gather must be called from ranks within the given group!"
    local_size = torch.tensor([tensor.numel()], dtype=torch.int64, device=tensor.device)
    size_list = [torch.zeros([1], dtype=torch.int64, device=tensor.device) for _ in range(world_size)]
    dist.all_gather(size_list, local_size, group=group)
    size_list = [int(size.item()) for size in size_list]
    max_size = max(size_list)

    # pad the tensor because torch all_gather does not support
    # gathering tensors of different shapes
    if local_size != max_size:
        padding = torch.zeros((max_size - local_size,), dtype=torch.uint8, device=tensor.device)
        tensor = torch.cat((tensor, padding), dim=0)
    return size_list, tensor


def all_gather(data, group=None):
    """Run all_gather on arbitrary picklable data (not necessarily tensors).

    Args:
        data: any picklable object
        group: a torch process group. By default, will use a group which
            contains all ranks on the gloo backend.
    Returns:
        list[data]: list of data gathered from each rank
    """
    if get_world_size() == 1:
        return [data]
    if group is None:
        group = _get_global_gloo_group()
    if dist.get_world_size(group) == 1:
        return [data]

    tensor = _serialize_to_tensor(data, group)
    size_list, tensor = _pad_to_largest_tensor(tensor, group)
    max_size = max(size_list)

    # receiving Tensor from all ranks
    tensor_list = [torch.empty((max_size,), dtype=torch.uint8, device=tensor.device) for _ in size_list]
    dist.all_gather(tensor_list, tensor, group=group)

    data_list = []
    for size, tensor in zip(size_list, tensor_list):
        buffer = tensor.cpu().numpy().tobytes()[:size]
        data_list.append(pickle.loads(buffer))
    return data_list


def gather(data, dst=0, group=None):
    """Run gather on arbitrary picklable data (not necessarily tensors).

    Args:
        data: any picklable object
        dst (int): destination rank
        group: a torch process group. By default, will use a group which
            contains all ranks on the gloo backend.
    Returns:
        list[data]: on dst, a list of data gathered from each rank. Otherwise,
            an empty list.
    """
    if get_world_size() == 1:
        return [data]
    if group is None:
        group = _get_global_gloo_group()
    if dist.get_world_size(group=group) == 1:
        return [data]
    rank = dist.get_rank(group=group)

    tensor = _serialize_to_tensor(data, group)
    size_list, tensor = _pad_to_largest_tensor(tensor, group)

    # receiving Tensor from all ranks
    if rank == dst:
        max_size = max(size_list)
        tensor_list = [torch.empty((max_size,), dtype=torch.uint8, device=tensor.device) for _ in size_list]
        dist.gather(tensor, tensor_list, dst=dst, group=group)

        data_list = []
        for size, tensor in zip(size_list, tensor_list):
            buffer = tensor.cpu().numpy().tobytes()[:size]
            data_list.append(pickle.loads(buffer))
        return data_list
    else:
        dist.gather(tensor, [], dst=dst, group=group)
        return []


def shared_random_seed():
    """
    Returns:
        int: a random number that is the same across all workers.
            If workers need a shared RNG, they can use this shared seed to create one.

    All workers must call this function, otherwise it will deadlock.
    """
    ints = np.random.randint(2**31)
    all_ints = all_gather(ints)
    return all_ints[0]


def time_synchronized():
    """Accurate timing with pytorch: wait for pending CUDA kernels before reading the clock."""
    if torch.cuda.is_available():
        torch.cuda.synchronize()
    return time.perf_counter()
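
A typical use of `all_gather` is collecting per-rank evaluation results on every rank; a minimal sketch, where the `local_results` payload is hypothetical:

# Every rank contributes one picklable object; the result list is ordered by rank.
local_results = {"rank": get_rank(), "num_images": 1234}  # hypothetical payload
results_per_rank = all_gather(local_results)              # list of length world_size
if is_main_process():
    total_images = sum(r["num_images"] for r in results_per_rank)
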
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
# Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
import math
from copy import deepcopy
import logging

import torch
import torch.nn as nn

from lib.utils.setup_logger import log_first_n

__all__ = ["ModelEMA", "is_parallel"]


def is_parallel(model):
    """Check if the model is in parallel mode."""
    parallel_type = (nn.parallel.DataParallel, nn.parallel.DistributedDataParallel)
    return isinstance(model, parallel_type)


class ModelEMA:
    """Model Exponential Moving Average, from
    https://github.com/rwightman/pytorch-image-models.

    Keeps a moving average of everything in the model state_dict (parameters and
    buffers), in the spirit of
    https://www.tensorflow.org/api_docs/python/tf/train/ExponentialMovingAverage.
    A smoothed version of the weights is necessary for some training schemes to perform well.
    This class is sensitive to where it is initialized in the sequence of model init,
    GPU assignment and distributed training wrappers.
    """

    def __init__(self, model, decay=0.9999, updates=0):
        """
        Args:
            model (nn.Module): model to apply EMA to.
            decay (float): EMA decay rate.
            updates (int): counter of EMA updates.
        """
        # Create EMA (FP32)
        self.ema = deepcopy(model.module if is_parallel(model) else model).eval()
        self.updates = updates
        # decay exponential ramp (to help early epochs)
        self.decay = lambda x: decay * (1 - math.exp(-x / 2000))
        for p in self.ema.parameters():
            p.requires_grad_(False)

    def update(self, model):
        # Update EMA parameters
        with torch.no_grad():
            self.updates += 1
            d = self.decay(self.updates)
            msd = model.module.state_dict() if is_parallel(model) else model.state_dict()  # model state_dict
            for k, v in self.ema.state_dict().items():
                if v.dtype.is_floating_point:
                    v *= d
                    v += (1.0 - d) * msd[k].detach()
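
The usual pattern is to call `update` once per optimizer step and evaluate with the smoothed copy. A sketch, with hypothetical model/optimizer/loader names:

# EMA bookkeeping in a training loop (MyDetector, optimizer, train_loader are hypothetical).
model = MyDetector().cuda()
ema = ModelEMA(model, decay=0.9999)
for images, targets in train_loader:
    loss = model(images, targets)
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()
    ema.update(model)       # after every optimizer step
eval_model = ema.ema        # evaluate/checkpoint the smoothed weights
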
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
# Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
import inspect
import os
import sys

from loguru import logger


def get_caller_name(depth=0):
    """
    Args:
        depth (int): depth of the caller context; use 0 for the direct caller. Default value: 0.
    Returns:
        str: module name of the caller
    """
    # the following logic is a little bit faster than the inspect.stack() logic
    frame = inspect.currentframe().f_back
    for _ in range(depth):
        frame = frame.f_back
    return frame.f_globals["__name__"]


class StreamToLoguru:
    """Stream object that redirects writes to a logger instance."""

    def __init__(self, level="INFO", caller_names=("apex", "pycocotools")):
        """
        Args:
            level (str): log level string of loguru. Default value: "INFO".
            caller_names (tuple): caller names of redirected modules.
                Default value: ("apex", "pycocotools").
        """
        self.level = level
        self.linebuf = ""
        self.caller_names = caller_names

    def write(self, buf):
        full_name = get_caller_name(depth=1)
        module_name = full_name.rsplit(".", maxsplit=-1)[0]  # top-level package name
        if module_name in self.caller_names:
            for line in buf.rstrip().splitlines():
                # use caller level log
                logger.opt(depth=2).log(self.level, line.rstrip())
        else:
            sys.__stdout__.write(buf)

    def flush(self):
        pass


def redirect_sys_output(log_level="INFO"):
    redirect_logger = StreamToLoguru(log_level)
    sys.stderr = redirect_logger
    sys.stdout = redirect_logger


def setup_logger(save_dir, distributed_rank=0, filename="log.txt", mode="a"):
    """Set up the logger for training and testing.

    Args:
        save_dir (str): location to save the log file
        distributed_rank (int): device rank in a multi-gpu environment
        filename (str): log file name
        mode (str): log file write mode, "a" (append) or "o" (override). Default is "a".
    Return:
        logger instance.
    """
    loguru_format = (
        "<green>{time:YYYY-MM-DD HH:mm:ss}</green> | "
        "<level>{level: <8}</level> | "
        "<cyan>{name}</cyan>:<cyan>{line}</cyan> - <level>{message}</level>"
    )

    logger.remove()
    save_file = os.path.join(save_dir, filename)
    if mode == "o" and os.path.exists(save_file):
        os.remove(save_file)
    # only keep the logger in the rank0 process
    if distributed_rank == 0:
        logger.add(
            sys.stderr,
            format=loguru_format,
            level="INFO",
            enqueue=True,
        )
        logger.add(save_file)

    # redirect stdout/stderr to loguru
    redirect_sys_output("INFO")
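
A minimal sketch of how this setup is typically called at program start; the output path and filename are illustrative, and `get_rank` is assumed to come from the dist utilities above:

# Rank-aware logging: only rank 0 writes to stderr and the log file.
from loguru import logger

rank = get_rank()  # assumed import from the dist module
setup_logger("output/my_exp", distributed_rank=rank, filename="train_log.txt", mode="a")
logger.info("visible on rank 0 only; stdout/stderr now flow through loguru")
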
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
# Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
import math
from functools import partial


class LRScheduler:
    def __init__(self, name, lr, iters_per_epoch, total_epochs, **kwargs):
        """Supported lr schedulers: [cos, warmcos, yoloxwarmcos, yoloxsemiwarmcos, multistep]

        Args:
            lr (float): learning rate.
            iters_per_epoch (int): number of iterations in one epoch.
            total_epochs (int): number of epochs in training.
            kwargs (dict):
                - cos: None
                - warmcos: [warmup_epochs, warmup_lr_start (default 1e-6)]
                - yoloxwarmcos: [warmup_epochs, no_aug_epochs,
                  warmup_lr_start (default 0), min_lr_ratio (default 0.2)]
                - yoloxsemiwarmcos: as yoloxwarmcos, plus [semi_epoch, iters_per_epoch_semi]
                - multistep: [milestones (epochs), gamma (default 0.1)]
        """
        self.lr = lr
        self.iters_per_epoch = iters_per_epoch
        self.total_epochs = total_epochs
        self.total_iters = iters_per_epoch * total_epochs
        self.__dict__.update(kwargs)

        self.lr_func = self._get_lr_func(name)

    def update_lr(self, iters):
        return self.lr_func(iters)

    def _get_lr_func(self, name):
        if name == "cos":  # cosine lr schedule
            lr_func = partial(cos_lr, self.lr, self.total_iters)
        elif name == "warmcos":
            warmup_total_iters = self.iters_per_epoch * self.warmup_epochs
            warmup_lr_start = getattr(self, "warmup_lr_start", 1e-6)
            lr_func = partial(
                warm_cos_lr,
                self.lr,
                self.total_iters,
                warmup_total_iters,
                warmup_lr_start,
            )
        elif name == "yoloxwarmcos":
            warmup_total_iters = self.iters_per_epoch * self.warmup_epochs
            no_aug_iters = self.iters_per_epoch * self.no_aug_epochs
            warmup_lr_start = getattr(self, "warmup_lr_start", 0)
            min_lr_ratio = getattr(self, "min_lr_ratio", 0.2)
            lr_func = partial(
                yolox_warm_cos_lr,
                self.lr,
                min_lr_ratio,
                self.total_iters,
                warmup_total_iters,
                warmup_lr_start,
                no_aug_iters,
            )
        elif name == "yoloxsemiwarmcos":
            warmup_lr_start = getattr(self, "warmup_lr_start", 0)
            min_lr_ratio = getattr(self, "min_lr_ratio", 0.2)
            warmup_total_iters = self.iters_per_epoch * self.warmup_epochs
            no_aug_iters = self.iters_per_epoch * self.no_aug_epochs
            normal_iters = self.iters_per_epoch * self.semi_epoch
            semi_iters = self.iters_per_epoch_semi * (self.total_epochs - self.semi_epoch - self.no_aug_epochs)
            lr_func = partial(
                yolox_semi_warm_cos_lr,
                self.lr,
                min_lr_ratio,
                warmup_lr_start,
                self.total_iters,
                normal_iters,
                no_aug_iters,
                warmup_total_iters,
                semi_iters,
                self.iters_per_epoch,
                self.iters_per_epoch_semi,
            )
        elif name == "multistep":  # stepwise lr schedule
            milestones = [int(self.total_iters * milestone / self.total_epochs) for milestone in self.milestones]
            gamma = getattr(self, "gamma", 0.1)
            lr_func = partial(multistep_lr, self.lr, milestones, gamma)
        else:
            raise ValueError("Scheduler version {} not supported.".format(name))
        return lr_func


def cos_lr(lr, total_iters, iters):
    """Cosine learning rate."""
    lr *= 0.5 * (1.0 + math.cos(math.pi * iters / total_iters))
    return lr


def warm_cos_lr(lr, total_iters, warmup_total_iters, warmup_lr_start, iters):
    """Cosine learning rate with warm up."""
    if iters <= warmup_total_iters:
        lr = (lr - warmup_lr_start) * iters / float(warmup_total_iters) + warmup_lr_start
    else:
        lr *= 0.5 * (1.0 + math.cos(math.pi * (iters - warmup_total_iters) / (total_iters - warmup_total_iters)))
    return lr


def yolox_warm_cos_lr(
    lr,
    min_lr_ratio,
    total_iters,
    warmup_total_iters,
    warmup_lr_start,
    no_aug_iter,
    iters,
):
    """Cosine learning rate with warm up.

    iters: current iter
    """
    min_lr = lr * min_lr_ratio
    if iters <= warmup_total_iters:
        # quadratic warmup
        lr = (lr - warmup_lr_start) * pow(iters / float(warmup_total_iters), 2) + warmup_lr_start
    elif iters >= total_iters - no_aug_iter:
        lr = min_lr
    else:
        lr = min_lr + 0.5 * (lr - min_lr) * (
            1.0 + math.cos(math.pi * (iters - warmup_total_iters) / (total_iters - warmup_total_iters - no_aug_iter))
        )
    return lr


def yolox_semi_warm_cos_lr(
    lr,
    min_lr_ratio,
    warmup_lr_start,
    total_iters,
    normal_iters,
    no_aug_iters,
    warmup_total_iters,
    semi_iters,
    iters_per_epoch,
    iters_per_epoch_semi,
    iters,
):
    """Cosine learning rate with warm up."""
    min_lr = lr * min_lr_ratio
    if iters <= warmup_total_iters:
        # quadratic warmup
        lr = (lr - warmup_lr_start) * pow(iters / float(warmup_total_iters), 2) + warmup_lr_start
    elif iters >= normal_iters + semi_iters:
        lr = min_lr
    elif iters <= normal_iters:
        lr = min_lr + 0.5 * (lr - min_lr) * (
            1.0 + math.cos(math.pi * (iters - warmup_total_iters) / (total_iters - warmup_total_iters - no_aug_iters))
        )
    else:
        lr = min_lr + 0.5 * (lr - min_lr) * (
            1.0
            + math.cos(
                math.pi
                * (
                    normal_iters
                    - warmup_total_iters
                    + (iters - normal_iters) * iters_per_epoch * 1.0 / iters_per_epoch_semi
                )
                / (total_iters - warmup_total_iters - no_aug_iters)
            )
        )
    return lr


def multistep_lr(lr, milestones, gamma, iters):
    """MultiStep learning rate."""
    for milestone in milestones:
        lr *= gamma if iters >= milestone else 1.0
    return lr
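
The scheduler is driven per iteration rather than per epoch. A sketch with illustrative hyperparameters and a hypothetical optimizer:

# Cosine schedule with 5 warmup epochs; numbers are illustrative only.
scheduler = LRScheduler(
    "warmcos",
    lr=0.01,
    iters_per_epoch=500,
    total_epochs=30,
    warmup_epochs=5,
    warmup_lr_start=1e-6,
)
for it in range(scheduler.total_iters):
    lr = scheduler.update_lr(it)            # lr value for this iteration
    for pg in optimizer.param_groups:       # hypothetical optimizer
        pg["lr"] = lr
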
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
import functools
import os
import time
from collections import defaultdict, deque

import numpy as np
import torch

__all__ = [
    "AverageMeter",
    "MeterBuffer",
    "get_total_and_free_memory_in_Mb",
    "occupy_mem",
    "gpu_mem_usage",
]


def get_total_and_free_memory_in_Mb(cuda_device):
    devices_info_str = os.popen("nvidia-smi --query-gpu=memory.total,memory.used --format=csv,nounits,noheader")
    devices_info = devices_info_str.read().strip().split("\n")
    total, used = devices_info[int(cuda_device)].split(",")
    return int(total), int(used)


def occupy_mem(cuda_device, mem_ratio=0.9):
    """Pre-allocate GPU memory for training to avoid memory fragmentation."""
    total, used = get_total_and_free_memory_in_Mb(cuda_device)
    max_mem = int(total * mem_ratio)
    block_mem = max_mem - used
    # 256 * 1024 float32 values occupy exactly 1 MiB, so this tensor claims block_mem MiB
    x = torch.cuda.FloatTensor(256, 1024, block_mem)
    del x
    time.sleep(5)


def gpu_mem_usage():
    """Compute the GPU memory usage of the current device (MB)."""
    mem_usage_bytes = torch.cuda.max_memory_allocated()
    return mem_usage_bytes / (1024 * 1024)


class AverageMeter:
    """Track a series of values and provide access to smoothed values over a
    window, or the global series average."""

    def __init__(self, window_size=50):
        self._deque = deque(maxlen=window_size)
        self._total = 0.0
        self._count = 0

    def update(self, value):
        self._deque.append(value)
        self._count += 1
        self._total += value

    @property
    def median(self):
        d = np.array(list(self._deque))
        return np.median(d)

    @property
    def avg(self):
        # if the deque is empty, nan will be returned
        d = np.array(list(self._deque))
        return d.mean()

    @property
    def global_avg(self):
        return self._total / max(self._count, 1e-5)

    @property
    def latest(self):
        return self._deque[-1] if len(self._deque) > 0 else None

    @property
    def total(self):
        return self._total

    def reset(self):
        self._deque.clear()
        self._total = 0.0
        self._count = 0

    def clear(self):
        self._deque.clear()


class MeterBuffer(defaultdict):
    """Compute and store the average and current value of a set of named meters."""

    def __init__(self, window_size=20):
        factory = functools.partial(AverageMeter, window_size=window_size)
        super().__init__(factory)

    def reset(self):
        for v in self.values():
            v.reset()

    def get_filtered_meter(self, filter_key="time"):
        return {k: v for k, v in self.items() if filter_key in k}

    def update(self, values=None, **kwargs):
        if values is None:
            values = {}
        values.update(kwargs)
        for k, v in values.items():
            if isinstance(v, torch.Tensor):
                v = v.detach()
            self[k].update(v)

    def clear_meters(self):
        for v in self.values():
            v.clear()
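
Because `MeterBuffer` is a `defaultdict`, meters are created on first use. A short sketch with illustrative values:

# Track iteration time and loss; meters spring into existence on first update.
meters = MeterBuffer(window_size=20)
meters.update(iter_time=0.21, total_loss=3.7)
meters.update(iter_time=0.19, total_loss=3.5)
print(meters["total_loss"].avg)                         # windowed mean
for name, meter in meters.get_filtered_meter("time").items():
    print(name, meter.global_avg)                       # average over all updates
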
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
# Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
import math
from copy import deepcopy

import torch
import torch.nn as nn
import torch.nn.functional as F
from thop import profile

__all__ = [
    "fuse_conv_and_bn",
    "fuse_model",
    "get_model_info",
    "replace_module",
    "scale_img",
]


def get_model_info(model, tsize):
    stride = 64
    img = torch.zeros((1, 3, stride, stride), device=next(model.parameters()).device)
    flops, params = profile(deepcopy(model), inputs=(img,), verbose=False)
    params /= 1e6
    flops /= 1e9
    flops *= tsize[0] * tsize[1] / stride / stride * 2  # Gflops
    info = "Params: {:.2f}M, Gflops: {:.2f}".format(params, flops)
    return info


def fuse_conv_and_bn(conv, bn):
    # Fuse convolution and batchnorm layers, https://tehnokv.com/posts/fusing-batchnorm-and-conv/
    fusedconv = (
        nn.Conv2d(
            conv.in_channels,
            conv.out_channels,
            kernel_size=conv.kernel_size,
            stride=conv.stride,
            padding=conv.padding,
            groups=conv.groups,
            bias=True,
        )
        .requires_grad_(False)
        .to(conv.weight.device)
    )

    # prepare filters
    w_conv = conv.weight.clone().view(conv.out_channels, -1)
    w_bn = torch.diag(bn.weight.div(torch.sqrt(bn.eps + bn.running_var)))
    fusedconv.weight.copy_(torch.mm(w_bn, w_conv).view(fusedconv.weight.shape))

    # prepare spatial bias
    b_conv = torch.zeros(conv.weight.size(0), device=conv.weight.device) if conv.bias is None else conv.bias
    b_bn = bn.bias - bn.weight.mul(bn.running_mean).div(torch.sqrt(bn.running_var + bn.eps))
    fusedconv.bias.copy_(torch.mm(w_bn, b_conv.reshape(-1, 1)).reshape(-1) + b_bn)

    return fusedconv


def fuse_model(model):
    from det.yolox.models.network_blocks import BaseConv

    for m in model.modules():
        if type(m) is BaseConv and hasattr(m, "bn"):
            m.conv = fuse_conv_and_bn(m.conv, m.bn)  # update conv
            delattr(m, "bn")  # remove batchnorm
            m.forward = m.fuseforward  # update forward
    return model


def replace_module(module, replaced_module_type, new_module_type, replace_func=None):
    """Replace modules of a given type in a module with a new type. Mostly used for deployment.

    Args:
        module (nn.Module): model to apply the replace operation on.
        replaced_module_type (Type): module type to be replaced.
        new_module_type (Type): module type to replace with.
        replace_func (function): python function describing the replace logic. Default value: None.
    Returns:
        model (nn.Module): module with the replacement applied.
    """

    def default_replace_func(replaced_module_type, new_module_type):
        return new_module_type()

    if replace_func is None:
        replace_func = default_replace_func

    model = module
    if isinstance(module, replaced_module_type):
        model = replace_func(replaced_module_type, new_module_type)
    else:  # recursively replace
        for name, child in module.named_children():
            new_child = replace_module(child, replaced_module_type, new_module_type, replace_func)
            if new_child is not child:  # child has been replaced
                model.add_module(name, new_child)
    return model


def scale_img(img, ratio=1.0, same_shape=False, gs=32):  # img(16,3,256,416)
    # scales img(bs,3,y,x) by ratio, constrained to a gs-multiple
    if ratio == 1.0:
        return img
    else:
        h, w = img.shape[2:]
        s = (int(h * ratio), int(w * ratio))  # new size
        img = F.interpolate(img, size=s, mode="bilinear", align_corners=False)  # resize
        if not same_shape:  # pad/crop img
            h, w = [math.ceil(x * ratio / gs) * gs for x in (h, w)]
        return F.pad(img, [0, w - s[1], 0, h - s[0]], value=0.447 * 255)  # value = ImageNet mean
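
Fusion is an inference-time optimization, so it belongs after training and after switching to eval mode. A sketch, assuming a model built from this repo's `BaseConv` blocks (the `model` name is hypothetical):

# Report model size, then fold conv+bn for faster inference.
model.eval()
print(get_model_info(model, (640, 640)))   # prints params (M) and Gflops at the given input size
model = fuse_model(model)                  # conv+bn folded; forward switched to fuseforward
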
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
# Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
import os
import subprocess

from loguru import logger

import cv2

from .dist import get_world_size, is_main_process

__all__ = ["configure_nccl", "configure_module", "get_yolox_datadir", "configure_omp"]


def get_yolox_datadir():
    """Get the dataset dir of YOLOX.

    If an environment variable named `YOLOX_DATADIR` is set, this function
    returns its value. Otherwise, the `datasets` directory next to the
    package containing `det.yolox` is used.
    """
    yolox_datadir = os.getenv("YOLOX_DATADIR", None)
    if yolox_datadir is None:
        from det import yolox

        yolox_path = os.path.dirname(os.path.dirname(yolox.__file__))
        yolox_datadir = os.path.join(yolox_path, "../datasets")
    return yolox_datadir


def configure_nccl():
    """Configure multi-machine environment variables of NCCL."""
    os.environ["NCCL_LAUNCH_MODE"] = "PARALLEL"
    os.environ["NCCL_IB_HCA"] = subprocess.getoutput(
        "pushd /sys/class/infiniband/ > /dev/null; for i in mlx5_*; "
        "do cat $i/ports/1/gid_attrs/types/* 2>/dev/null "
        "| grep v >/dev/null && echo $i ; done; popd > /dev/null"
    )
    os.environ["NCCL_IB_GID_INDEX"] = "3"
    os.environ["NCCL_IB_TC"] = "106"


def configure_omp(num_threads=1):
    """If OMP_NUM_THREADS is not already configured and world_size is greater
    than 1, set the `OMP_NUM_THREADS` environment variable to `num_threads`.

    Args:
        num_threads (int): value of `OMP_NUM_THREADS` to set.
    """
    # We set OMP_NUM_THREADS=1 by default, which achieves the best speed on our machines;
    # feel free to change it for better performance.
    if "OMP_NUM_THREADS" not in os.environ and get_world_size() > 1:
        os.environ["OMP_NUM_THREADS"] = str(num_threads)
        if is_main_process():
            logger.info(
                "\n***************************************************************\n"
                "We set `OMP_NUM_THREADS` for each process to {} to speed up.\n"
                "please further tune the variable for optimal performance.\n"
                "***************************************************************".format(os.environ["OMP_NUM_THREADS"])
            )


def configure_module(ulimit_value=8192):
    """Configure the pytorch module environment: ulimit and cv2 settings.

    Args:
        ulimit_value (int): default open file number on linux. Default value: 8192.
    """
    # system setting
    try:
        import resource

        rlimit = resource.getrlimit(resource.RLIMIT_NOFILE)
        resource.setrlimit(resource.RLIMIT_NOFILE, (ulimit_value, rlimit[1]))
    except Exception:
        # An exception might be raised on Windows, or when the rlimit already
        # reaches its maximum; setting it is not strictly necessary, so ignore failures.
        pass

    # cv2
    # multithreaded cv2 might hurt the performance of the torch dataloader
    os.environ["OPENCV_OPENCL_RUNTIME"] = "disabled"
    try:
        cv2.setNumThreads(0)
        cv2.ocl.setUseOpenCL(False)
    except Exception:
        # a cv2 version mismatch might raise exceptions
        pass
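
A minimal sketch of the intended call order at program entry, before dataloaders or process groups are created (grounded in the functions above; the ordering itself is a reasonable assumption, not mandated by this commit):

# Environment setup at the top of a launcher script.
configure_module()   # raise the open-file ulimit, tame cv2 threading
configure_nccl()     # multi-machine NCCL knobs (harmless on a single machine)
configure_omp()      # pins OMP_NUM_THREADS=1 when world_size > 1
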
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
# Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
import cv2
import numpy as np
import os.path as osp
import time

import ref

__all__ = ["vis"]


def vis_train(inps, targets, cfg):
    for i in range(inps.shape[0]):
        # CHW -> HWC for cv2
        image = inps[i].cpu().numpy().transpose(1, 2, 0).astype(np.uint8).copy()
        target = targets[i].cpu().numpy().astype(np.int64).copy()
        bbox = target[:, 1:]
        out_file = osp.join(cfg.train["output_dir"], "{}.png".format(str(time.perf_counter())))
        # ground-truth boxes: give them full score so vis() draws all of them
        scores = np.ones(bbox.shape[0])
        cls_ids = target[:, 0]
        class_names = ref.hb.objects
        vis_image = vis(image, bbox, scores, cls_ids, 0.5, class_names)
        cv2.imwrite(out_file, vis_image)


def vis(img, boxes, scores, cls_ids, conf=0.5, class_names=None):
    for i in range(len(boxes)):
        box = boxes[i]
        cls_id = int(cls_ids[i])
        score = scores[i]
        if score < conf:
            continue
        x0 = int(box[0])
        y0 = int(box[1])
        x1 = int(box[2])
        y1 = int(box[3])

        color = (_COLORS[cls_id] * 255).astype(np.uint8).tolist()
        text = "{}:{:.1f}%".format(class_names[cls_id], score * 100)
        txt_color = (0, 0, 0) if np.mean(_COLORS[cls_id]) > 0.5 else (255, 255, 255)
        font = cv2.FONT_HERSHEY_SIMPLEX

        txt_size = cv2.getTextSize(text, font, 0.4, 1)[0]
        cv2.rectangle(img, (x0, y0), (x1, y1), color, 2)

        txt_bk_color = (_COLORS[cls_id] * 255 * 0.7).astype(np.uint8).tolist()
        cv2.rectangle(
            img,
            (x0, y0 + 1),
            (x0 + txt_size[0] + 1, y0 + int(1.5 * txt_size[1])),
            txt_bk_color,
            -1,
        )
        cv2.putText(img, text, (x0, y0 + txt_size[1]), font, 0.4, txt_color, thickness=1)

    return img
_COLORS = (
    np.array(
        [
            0.000, 0.447, 0.741,
            0.850, 0.325, 0.098,
            0.929, 0.694, 0.125,
            0.494, 0.184, 0.556,
            0.466, 0.674, 0.188,
            0.301, 0.745, 0.933,
            0.635, 0.078, 0.184,
            0.300, 0.300, 0.300,
            0.600, 0.600, 0.600,
            1.000, 0.000, 0.000,
            1.000, 0.500, 0.000,
            0.749, 0.749, 0.000,
            0.000, 1.000, 0.000,
            0.000, 0.000, 1.000,
            0.667, 0.000, 1.000,
            0.333, 0.333, 0.000,
            0.333, 0.667, 0.000,
            0.333, 1.000, 0.000,
            0.667, 0.333, 0.000,
            0.667, 0.667, 0.000,
            0.667, 1.000, 0.000,
            1.000, 0.333, 0.000,
            1.000, 0.667, 0.000,
            1.000, 1.000, 0.000,
            0.000, 0.333, 0.500,
            0.000, 0.667, 0.500,
            0.000, 1.000, 0.500,
            0.333, 0.000, 0.500,
            0.333, 0.333, 0.500,
            0.333, 0.667, 0.500,
            0.333, 1.000, 0.500,
            0.667, 0.000, 0.500,
            0.667, 0.333, 0.500,
            0.667, 0.667, 0.500,
            0.667, 1.000, 0.500,
            1.000, 0.000, 0.500,
            1.000, 0.333, 0.500,
            1.000, 0.667, 0.500,
            1.000, 1.000, 0.500,
            0.000, 0.333, 1.000,
            0.000, 0.667, 1.000,
            0.000, 1.000, 1.000,
            0.333, 0.000, 1.000,
            0.333, 0.333, 1.000,
            0.333, 0.667, 1.000,
            0.333, 1.000, 1.000,
            0.667, 0.000, 1.000,
            0.667, 0.333, 1.000,
            0.667, 0.667, 1.000,
            0.667, 1.000, 1.000,
            1.000, 0.000, 1.000,
            1.000, 0.333, 1.000,
            1.000, 0.667, 1.000,
            0.333, 0.000, 0.000,
            0.500, 0.000, 0.000,
            0.667, 0.000, 0.000,
            0.833, 0.000, 0.000,
            1.000, 0.000, 0.000,
            0.000, 0.167, 0.000,
            0.000, 0.333, 0.000,
            0.000, 0.500, 0.000,
            0.000, 0.667, 0.000,
            0.000, 0.833, 0.000,
            0.000, 1.000, 0.000,
            0.000, 0.000, 0.167,
            0.000, 0.000, 0.333,
            0.000, 0.000, 0.500,
            0.000, 0.000, 0.667,
            0.000, 0.000, 0.833,
            0.000, 0.000, 1.000,
            0.000, 0.000, 0.000,
            0.143, 0.143, 0.143,
            0.286, 0.286, 0.286,
            0.429, 0.429, 0.429,
            0.571, 0.571, 0.571,
            0.714, 0.714, 0.714,
            0.857, 0.857, 0.857,
            0.000, 0.447, 0.741,
            0.314, 0.717, 0.741,
            0.50, 0.5, 0,
        ]
    )
    .astype(np.float32)
    .reshape(-1, 3)
)
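
`vis` pairs naturally with the NMS output from the demo utilities earlier in this commit. A sketch, where the input image path and the `boxes_xyxy`/`scores` arrays are hypothetical and `multiclass_nms` is assumed imported from that module:

# Draw post-NMS detections on an image.
img = cv2.imread("demo.jpg")                          # hypothetical input image
dets = multiclass_nms(boxes_xyxy, scores, nms_thr=0.45, score_thr=0.3)
if dets is not None:
    img = vis(img, dets[:, :4], dets[:, 4], dets[:, 5], conf=0.3, class_names=ref.hb.objects)
    cv2.imwrite("demo_vis.jpg", img)
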