Commit 24f3fb70 authored by Jiwen Tang

Merge branch 'det' into main

parents 42f71b6a cfb8760a
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
# Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
import os

import numpy as np

__all__ = ["mkdir", "nms", "multiclass_nms", "demo_postprocess"]


def mkdir(path):
    if not os.path.exists(path):
        os.makedirs(path)


def nms(boxes, scores, nms_thr):
    """Single class NMS implemented in Numpy."""
    x1 = boxes[:, 0]
    y1 = boxes[:, 1]
    x2 = boxes[:, 2]
    y2 = boxes[:, 3]

    areas = (x2 - x1 + 1) * (y2 - y1 + 1)
    order = scores.argsort()[::-1]

    keep = []
    while order.size > 0:
        i = order[0]
        keep.append(i)
        xx1 = np.maximum(x1[i], x1[order[1:]])
        yy1 = np.maximum(y1[i], y1[order[1:]])
        xx2 = np.minimum(x2[i], x2[order[1:]])
        yy2 = np.minimum(y2[i], y2[order[1:]])

        w = np.maximum(0.0, xx2 - xx1 + 1)
        h = np.maximum(0.0, yy2 - yy1 + 1)
        inter = w * h
        ovr = inter / (areas[i] + areas[order[1:]] - inter)

        inds = np.where(ovr <= nms_thr)[0]
        order = order[inds + 1]

    return keep


def multiclass_nms(boxes, scores, nms_thr, score_thr, class_agnostic=True):
    """Multiclass NMS implemented in Numpy."""
    if class_agnostic:
        nms_method = multiclass_nms_class_agnostic
    else:
        nms_method = multiclass_nms_class_aware
    return nms_method(boxes, scores, nms_thr, score_thr)


def multiclass_nms_class_aware(boxes, scores, nms_thr, score_thr):
    """Multiclass NMS implemented in Numpy. Class-aware version."""
    final_dets = []
    num_classes = scores.shape[1]
    for cls_ind in range(num_classes):
        cls_scores = scores[:, cls_ind]
        valid_score_mask = cls_scores > score_thr
        if valid_score_mask.sum() == 0:
            continue
        valid_scores = cls_scores[valid_score_mask]
        valid_boxes = boxes[valid_score_mask]
        keep = nms(valid_boxes, valid_scores, nms_thr)
        if len(keep) > 0:
            cls_inds = np.ones((len(keep), 1)) * cls_ind
            dets = np.concatenate([valid_boxes[keep], valid_scores[keep, None], cls_inds], 1)
            final_dets.append(dets)
    if len(final_dets) == 0:
        return None
    return np.concatenate(final_dets, 0)


def multiclass_nms_class_agnostic(boxes, scores, nms_thr, score_thr):
    """Multiclass NMS implemented in Numpy. Class-agnostic version."""
    cls_inds = scores.argmax(1)
    cls_scores = scores[np.arange(len(cls_inds)), cls_inds]

    valid_score_mask = cls_scores > score_thr
    if valid_score_mask.sum() == 0:
        return None
    valid_scores = cls_scores[valid_score_mask]
    valid_boxes = boxes[valid_score_mask]
    valid_cls_inds = cls_inds[valid_score_mask]
    keep = nms(valid_boxes, valid_scores, nms_thr)
    if not keep:
        return None
    return np.concatenate([valid_boxes[keep], valid_scores[keep, None], valid_cls_inds[keep, None]], 1)


def demo_postprocess(outputs, img_size, p6=False):
    grids = []
    expanded_strides = []

    strides = [8, 16, 32] if not p6 else [8, 16, 32, 64]

    hsizes = [img_size[0] // stride for stride in strides]
    wsizes = [img_size[1] // stride for stride in strides]

    for hsize, wsize, stride in zip(hsizes, wsizes, strides):
        xv, yv = np.meshgrid(np.arange(wsize), np.arange(hsize))
        grid = np.stack((xv, yv), 2).reshape(1, -1, 2)
        grids.append(grid)
        shape = grid.shape[:2]
        expanded_strides.append(np.full((*shape, 1), stride))

    grids = np.concatenate(grids, 1)
    expanded_strides = np.concatenate(expanded_strides, 1)
    outputs[..., :2] = (outputs[..., :2] + grids) * expanded_strides
    outputs[..., 2:4] = np.exp(outputs[..., 2:4]) * expanded_strides

    return outputs
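
Taken together, `demo_postprocess` decodes the raw grid predictions and `multiclass_nms` filters them. A minimal decode sketch, assuming a 640x640 export with 80 classes; the random input, shapes, and thresholds below are illustrative, not part of this commit:

# Illustrative decode path (8400 = 80*80 + 40*40 + 20*20 anchors for strides 8/16/32).
import numpy as np

raw = np.random.rand(1, 8400, 85).astype(np.float32)  # stand-in for the ONNX model output
preds = demo_postprocess(raw, (640, 640))[0]          # grid offsets and strides applied

boxes_xywh = preds[:, :4]
scores = preds[:, 4:5] * preds[:, 5:]                 # obj_conf * cls_conf

# convert (cx, cy, w, h) -> (x1, y1, x2, y2), the format nms() expects
boxes_xyxy = np.empty_like(boxes_xywh)
boxes_xyxy[:, 0] = boxes_xywh[:, 0] - boxes_xywh[:, 2] / 2
boxes_xyxy[:, 1] = boxes_xywh[:, 1] - boxes_xywh[:, 3] / 2
boxes_xyxy[:, 2] = boxes_xywh[:, 0] + boxes_xywh[:, 2] / 2
boxes_xyxy[:, 3] = boxes_xywh[:, 1] + boxes_xywh[:, 3] / 2

dets = multiclass_nms(boxes_xyxy, scores, nms_thr=0.45, score_thr=0.1)
if dets is not None:
    final_boxes, final_scores, final_cls = dets[:, :4], dets[:, 4], dets[:, 5]
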
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
# This file mainly comes from
# https://github.com/facebookresearch/detectron2/blob/master/detectron2/utils/comm.py
# Copyright (c) Facebook, Inc. and its affiliates.
# Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
"""This file contains primitives for multi-gpu communication.
This is useful when doing distributed training.
"""
import functools
import os
import pickle
import time
from contextlib import contextmanager

from loguru import logger

import numpy as np
import torch
from torch import distributed as dist

__all__ = [
    "get_num_devices",
    "wait_for_the_master",
    "is_main_process",
    "synchronize",
    "get_world_size",
    "get_rank",
    "get_local_rank",
    "get_local_size",
    "time_synchronized",
    "gather",
    "all_gather",
]

_LOCAL_PROCESS_GROUP = None


def get_num_devices():
    gpu_list = os.getenv("CUDA_VISIBLE_DEVICES", None)
    if gpu_list is not None:
        return len(gpu_list.split(","))
    else:
        devices_list_info = os.popen("nvidia-smi -L")
        devices_list_info = devices_list_info.read().strip().split("\n")
        return len(devices_list_info)


@contextmanager
def wait_for_the_master(local_rank: int):
    """Make all processes wait for the master to finish its task first."""
    if local_rank > 0:
        dist.barrier()
    yield
    if local_rank == 0:
        if not dist.is_available():
            return
        if not dist.is_initialized():
            return
        else:
            dist.barrier()


def synchronize():
    """Helper function to synchronize (barrier) among all processes when using
    distributed training."""
    if not dist.is_available():
        return
    if not dist.is_initialized():
        return
    world_size = dist.get_world_size()
    if world_size == 1:
        return
    dist.barrier()


def get_world_size() -> int:
    if not dist.is_available():
        return 1
    if not dist.is_initialized():
        return 1
    return dist.get_world_size()


def get_rank() -> int:
    if not dist.is_available():
        return 0
    if not dist.is_initialized():
        return 0
    return dist.get_rank()


def get_local_rank() -> int:
    """
    Returns:
        The rank of the current process within the local (per-machine) process group.
    """
    if not dist.is_available():
        return 0
    if not dist.is_initialized():
        return 0
    assert _LOCAL_PROCESS_GROUP is not None
    return dist.get_rank(group=_LOCAL_PROCESS_GROUP)


def get_local_size() -> int:
    """
    Returns:
        The size of the per-machine process group, i.e. the number of processes per machine.
    """
    if not dist.is_available():
        return 1
    if not dist.is_initialized():
        return 1
    return dist.get_world_size(group=_LOCAL_PROCESS_GROUP)


def is_main_process() -> bool:
    return get_rank() == 0


@functools.lru_cache()
def _get_global_gloo_group():
    """Return a process group based on the gloo backend, containing all ranks.
    The result is cached."""
    if dist.get_backend() == "nccl":
        return dist.new_group(backend="gloo")
    else:
        return dist.group.WORLD


def _serialize_to_tensor(data, group):
    backend = dist.get_backend(group)
    assert backend in ["gloo", "nccl"]
    device = torch.device("cpu" if backend == "gloo" else "cuda")

    buffer = pickle.dumps(data)
    if len(buffer) > 1024**3:
        logger.warning(
            "Rank {} trying to all-gather {:.2f} GB of data on device {}".format(
                get_rank(), len(buffer) / (1024**3), device
            )
        )
    storage = torch.ByteStorage.from_buffer(buffer)
    tensor = torch.ByteTensor(storage).to(device=device)
    return tensor


def _pad_to_largest_tensor(tensor, group):
    """
    Returns:
        list[int]: size of the tensor, on each rank
        Tensor: padded tensor that has the max size
    """
    world_size = dist.get_world_size(group=group)
    assert world_size >= 1, "comm.gather/all_gather must be called from ranks within the given group!"
    local_size = torch.tensor([tensor.numel()], dtype=torch.int64, device=tensor.device)
    size_list = [torch.zeros([1], dtype=torch.int64, device=tensor.device) for _ in range(world_size)]
    dist.all_gather(size_list, local_size, group=group)
    size_list = [int(size.item()) for size in size_list]
    max_size = max(size_list)

    # pad the tensor because torch all_gather does not support
    # gathering tensors of different shapes
    if local_size != max_size:
        padding = torch.zeros((max_size - local_size,), dtype=torch.uint8, device=tensor.device)
        tensor = torch.cat((tensor, padding), dim=0)
    return size_list, tensor


def all_gather(data, group=None):
    """Run all_gather on arbitrary picklable data (not necessarily tensors).

    Args:
        data: any picklable object
        group: a torch process group. By default, will use a group which
            contains all ranks on the gloo backend.
    Returns:
        list[data]: list of data gathered from each rank
    """
    if get_world_size() == 1:
        return [data]
    if group is None:
        group = _get_global_gloo_group()
    if dist.get_world_size(group) == 1:
        return [data]

    tensor = _serialize_to_tensor(data, group)
    size_list, tensor = _pad_to_largest_tensor(tensor, group)
    max_size = max(size_list)

    # receiving Tensor from all ranks
    tensor_list = [torch.empty((max_size,), dtype=torch.uint8, device=tensor.device) for _ in size_list]
    dist.all_gather(tensor_list, tensor, group=group)

    data_list = []
    for size, tensor in zip(size_list, tensor_list):
        buffer = tensor.cpu().numpy().tobytes()[:size]
        data_list.append(pickle.loads(buffer))
    return data_list


def gather(data, dst=0, group=None):
    """Run gather on arbitrary picklable data (not necessarily tensors).

    Args:
        data: any picklable object
        dst (int): destination rank
        group: a torch process group. By default, will use a group which
            contains all ranks on the gloo backend.
    Returns:
        list[data]: on dst, a list of data gathered from each rank. Otherwise,
            an empty list.
    """
    if get_world_size() == 1:
        return [data]
    if group is None:
        group = _get_global_gloo_group()
    if dist.get_world_size(group=group) == 1:
        return [data]
    rank = dist.get_rank(group=group)

    tensor = _serialize_to_tensor(data, group)
    size_list, tensor = _pad_to_largest_tensor(tensor, group)

    # receiving Tensor from all ranks
    if rank == dst:
        max_size = max(size_list)
        tensor_list = [torch.empty((max_size,), dtype=torch.uint8, device=tensor.device) for _ in size_list]
        dist.gather(tensor, tensor_list, dst=dst, group=group)

        data_list = []
        for size, tensor in zip(size_list, tensor_list):
            buffer = tensor.cpu().numpy().tobytes()[:size]
            data_list.append(pickle.loads(buffer))
        return data_list
    else:
        dist.gather(tensor, [], dst=dst, group=group)
        return []


def shared_random_seed():
    """
    Returns:
        int: a random number that is the same across all workers.
            If workers need a shared RNG, they can use this shared seed to create one.

    All workers must call this function, otherwise it will deadlock.
    """
    ints = np.random.randint(2**31)
    all_ints = all_gather(ints)
    return all_ints[0]


def time_synchronized():
    """Accurate timing with pytorch: wait for pending CUDA kernels before reading the clock."""
    if torch.cuda.is_available():
        torch.cuda.synchronize()
    return time.perf_counter()
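
A typical use of `all_gather` is collecting per-rank evaluation results on every rank; a minimal sketch, where the `local_results` payload is hypothetical:

# Every rank contributes one picklable object; the result list is ordered by rank.
local_results = {"rank": get_rank(), "num_images": 1234}  # hypothetical payload
results_per_rank = all_gather(local_results)              # list of length world_size
if is_main_process():
    total_images = sum(r["num_images"] for r in results_per_rank)
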
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
# Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
import math
from copy import deepcopy
import logging

import torch
import torch.nn as nn

from lib.utils.setup_logger import log_first_n

__all__ = ["ModelEMA", "is_parallel"]


def is_parallel(model):
    """Check if the model is in parallel mode."""
    parallel_type = (nn.parallel.DataParallel, nn.parallel.DistributedDataParallel)
    return isinstance(model, parallel_type)


class ModelEMA:
    """Model Exponential Moving Average, from
    https://github.com/rwightman/pytorch-image-models.

    Keeps a moving average of everything in the model state_dict (parameters and
    buffers), in the spirit of
    https://www.tensorflow.org/api_docs/python/tf/train/ExponentialMovingAverage.
    A smoothed version of the weights is necessary for some training schemes to perform well.
    This class is sensitive to where it is initialized in the sequence of model init,
    GPU assignment and distributed training wrappers.
    """

    def __init__(self, model, decay=0.9999, updates=0):
        """
        Args:
            model (nn.Module): model to apply EMA to.
            decay (float): EMA decay rate.
            updates (int): counter of EMA updates.
        """
        # Create EMA (FP32)
        self.ema = deepcopy(model.module if is_parallel(model) else model).eval()
        self.updates = updates
        # decay exponential ramp (to help early epochs)
        self.decay = lambda x: decay * (1 - math.exp(-x / 2000))
        for p in self.ema.parameters():
            p.requires_grad_(False)

    def update(self, model):
        # Update EMA parameters
        with torch.no_grad():
            self.updates += 1
            d = self.decay(self.updates)
            msd = model.module.state_dict() if is_parallel(model) else model.state_dict()  # model state_dict
            for k, v in self.ema.state_dict().items():
                if v.dtype.is_floating_point:
                    v *= d
                    v += (1.0 - d) * msd[k].detach()
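
The usual pattern is to call `update` once per optimizer step and evaluate with the smoothed copy. A sketch, with hypothetical model/optimizer/loader names:

# EMA bookkeeping in a training loop (MyDetector, optimizer, train_loader are hypothetical).
model = MyDetector().cuda()
ema = ModelEMA(model, decay=0.9999)
for images, targets in train_loader:
    loss = model(images, targets)
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()
    ema.update(model)       # after every optimizer step
eval_model = ema.ema        # evaluate/checkpoint the smoothed weights
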
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
# Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
import inspect
import os
import sys

from loguru import logger


def get_caller_name(depth=0):
    """
    Args:
        depth (int): depth of the caller context; use 0 for the direct caller. Default value: 0.
    Returns:
        str: module name of the caller
    """
    # the following logic is a little bit faster than the inspect.stack() logic
    frame = inspect.currentframe().f_back
    for _ in range(depth):
        frame = frame.f_back
    return frame.f_globals["__name__"]


class StreamToLoguru:
    """Stream object that redirects writes to a logger instance."""

    def __init__(self, level="INFO", caller_names=("apex", "pycocotools")):
        """
        Args:
            level (str): log level string of loguru. Default value: "INFO".
            caller_names (tuple): caller names of redirected modules.
                Default value: ("apex", "pycocotools").
        """
        self.level = level
        self.linebuf = ""
        self.caller_names = caller_names

    def write(self, buf):
        full_name = get_caller_name(depth=1)
        module_name = full_name.rsplit(".", maxsplit=-1)[0]  # top-level package name
        if module_name in self.caller_names:
            for line in buf.rstrip().splitlines():
                # use caller level log
                logger.opt(depth=2).log(self.level, line.rstrip())
        else:
            sys.__stdout__.write(buf)

    def flush(self):
        pass


def redirect_sys_output(log_level="INFO"):
    redirect_logger = StreamToLoguru(log_level)
    sys.stderr = redirect_logger
    sys.stdout = redirect_logger


def setup_logger(save_dir, distributed_rank=0, filename="log.txt", mode="a"):
    """Set up the logger for training and testing.

    Args:
        save_dir (str): location to save the log file
        distributed_rank (int): device rank in a multi-gpu environment
        filename (str): log file name
        mode (str): log file write mode, "a" (append) or "o" (override). Default is "a".
    Return:
        logger instance.
    """
    loguru_format = (
        "<green>{time:YYYY-MM-DD HH:mm:ss}</green> | "
        "<level>{level: <8}</level> | "
        "<cyan>{name}</cyan>:<cyan>{line}</cyan> - <level>{message}</level>"
    )

    logger.remove()
    save_file = os.path.join(save_dir, filename)
    if mode == "o" and os.path.exists(save_file):
        os.remove(save_file)
    # only keep the logger in the rank0 process
    if distributed_rank == 0:
        logger.add(
            sys.stderr,
            format=loguru_format,
            level="INFO",
            enqueue=True,
        )
        logger.add(save_file)

    # redirect stdout/stderr to loguru
    redirect_sys_output("INFO")
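
A minimal sketch of how this setup is typically called at program start; the output path and filename are illustrative, and `get_rank` is assumed to come from the dist utilities above:

# Rank-aware logging: only rank 0 writes to stderr and the log file.
from loguru import logger

rank = get_rank()  # assumed import from the dist module
setup_logger("output/my_exp", distributed_rank=rank, filename="train_log.txt", mode="a")
logger.info("visible on rank 0 only; stdout/stderr now flow through loguru")
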
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
# Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
import math
from functools import partial


class LRScheduler:
    def __init__(self, name, lr, iters_per_epoch, total_epochs, **kwargs):
        """Supported lr schedulers: [cos, warmcos, yoloxwarmcos, yoloxsemiwarmcos, multistep]

        Args:
            lr (float): learning rate.
            iters_per_epoch (int): number of iterations in one epoch.
            total_epochs (int): number of epochs in training.
            kwargs (dict):
                - cos: None
                - warmcos: [warmup_epochs, warmup_lr_start (default 1e-6)]
                - yoloxwarmcos: [warmup_epochs, no_aug_epochs,
                  warmup_lr_start (default 0), min_lr_ratio (default 0.2)]
                - yoloxsemiwarmcos: as yoloxwarmcos, plus [semi_epoch, iters_per_epoch_semi]
                - multistep: [milestones (epochs), gamma (default 0.1)]
        """
        self.lr = lr
        self.iters_per_epoch = iters_per_epoch
        self.total_epochs = total_epochs
        self.total_iters = iters_per_epoch * total_epochs
        self.__dict__.update(kwargs)

        self.lr_func = self._get_lr_func(name)

    def update_lr(self, iters):
        return self.lr_func(iters)

    def _get_lr_func(self, name):
        if name == "cos":  # cosine lr schedule
            lr_func = partial(cos_lr, self.lr, self.total_iters)
        elif name == "warmcos":
            warmup_total_iters = self.iters_per_epoch * self.warmup_epochs
            warmup_lr_start = getattr(self, "warmup_lr_start", 1e-6)
            lr_func = partial(
                warm_cos_lr,
                self.lr,
                self.total_iters,
                warmup_total_iters,
                warmup_lr_start,
            )
        elif name == "yoloxwarmcos":
            warmup_total_iters = self.iters_per_epoch * self.warmup_epochs
            no_aug_iters = self.iters_per_epoch * self.no_aug_epochs
            warmup_lr_start = getattr(self, "warmup_lr_start", 0)
            min_lr_ratio = getattr(self, "min_lr_ratio", 0.2)
            lr_func = partial(
                yolox_warm_cos_lr,
                self.lr,
                min_lr_ratio,
                self.total_iters,
                warmup_total_iters,
                warmup_lr_start,
                no_aug_iters,
            )
        elif name == "yoloxsemiwarmcos":
            warmup_lr_start = getattr(self, "warmup_lr_start", 0)
            min_lr_ratio = getattr(self, "min_lr_ratio", 0.2)
            warmup_total_iters = self.iters_per_epoch * self.warmup_epochs
            no_aug_iters = self.iters_per_epoch * self.no_aug_epochs
            normal_iters = self.iters_per_epoch * self.semi_epoch
            semi_iters = self.iters_per_epoch_semi * (self.total_epochs - self.semi_epoch - self.no_aug_epochs)
            lr_func = partial(
                yolox_semi_warm_cos_lr,
                self.lr,
                min_lr_ratio,
                warmup_lr_start,
                self.total_iters,
                normal_iters,
                no_aug_iters,
                warmup_total_iters,
                semi_iters,
                self.iters_per_epoch,
                self.iters_per_epoch_semi,
            )
        elif name == "multistep":  # stepwise lr schedule
            milestones = [int(self.total_iters * milestone / self.total_epochs) for milestone in self.milestones]
            gamma = getattr(self, "gamma", 0.1)
            lr_func = partial(multistep_lr, self.lr, milestones, gamma)
        else:
            raise ValueError("Scheduler version {} not supported.".format(name))
        return lr_func


def cos_lr(lr, total_iters, iters):
    """Cosine learning rate."""
    lr *= 0.5 * (1.0 + math.cos(math.pi * iters / total_iters))
    return lr


def warm_cos_lr(lr, total_iters, warmup_total_iters, warmup_lr_start, iters):
    """Cosine learning rate with warm up."""
    if iters <= warmup_total_iters:
        lr = (lr - warmup_lr_start) * iters / float(warmup_total_iters) + warmup_lr_start
    else:
        lr *= 0.5 * (1.0 + math.cos(math.pi * (iters - warmup_total_iters) / (total_iters - warmup_total_iters)))
    return lr


def yolox_warm_cos_lr(
    lr,
    min_lr_ratio,
    total_iters,
    warmup_total_iters,
    warmup_lr_start,
    no_aug_iter,
    iters,
):
    """Cosine learning rate with warm up.

    iters: current iter
    """
    min_lr = lr * min_lr_ratio
    if iters <= warmup_total_iters:
        # quadratic warmup
        lr = (lr - warmup_lr_start) * pow(iters / float(warmup_total_iters), 2) + warmup_lr_start
    elif iters >= total_iters - no_aug_iter:
        lr = min_lr
    else:
        lr = min_lr + 0.5 * (lr - min_lr) * (
            1.0 + math.cos(math.pi * (iters - warmup_total_iters) / (total_iters - warmup_total_iters - no_aug_iter))
        )
    return lr


def yolox_semi_warm_cos_lr(
    lr,
    min_lr_ratio,
    warmup_lr_start,
    total_iters,
    normal_iters,
    no_aug_iters,
    warmup_total_iters,
    semi_iters,
    iters_per_epoch,
    iters_per_epoch_semi,
    iters,
):
    """Cosine learning rate with warm up."""
    min_lr = lr * min_lr_ratio
    if iters <= warmup_total_iters:
        # quadratic warmup
        lr = (lr - warmup_lr_start) * pow(iters / float(warmup_total_iters), 2) + warmup_lr_start
    elif iters >= normal_iters + semi_iters:
        lr = min_lr
    elif iters <= normal_iters:
        lr = min_lr + 0.5 * (lr - min_lr) * (
            1.0 + math.cos(math.pi * (iters - warmup_total_iters) / (total_iters - warmup_total_iters - no_aug_iters))
        )
    else:
        lr = min_lr + 0.5 * (lr - min_lr) * (
            1.0
            + math.cos(
                math.pi
                * (
                    normal_iters
                    - warmup_total_iters
                    + (iters - normal_iters) * iters_per_epoch * 1.0 / iters_per_epoch_semi
                )
                / (total_iters - warmup_total_iters - no_aug_iters)
            )
        )
    return lr


def multistep_lr(lr, milestones, gamma, iters):
    """MultiStep learning rate."""
    for milestone in milestones:
        lr *= gamma if iters >= milestone else 1.0
    return lr
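
The scheduler is driven per iteration rather than per epoch. A sketch with illustrative hyperparameters and a hypothetical optimizer:

# Cosine schedule with 5 warmup epochs; numbers are illustrative only.
scheduler = LRScheduler(
    "warmcos",
    lr=0.01,
    iters_per_epoch=500,
    total_epochs=30,
    warmup_epochs=5,
    warmup_lr_start=1e-6,
)
for it in range(scheduler.total_iters):
    lr = scheduler.update_lr(it)            # lr value for this iteration
    for pg in optimizer.param_groups:       # hypothetical optimizer
        pg["lr"] = lr
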
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
import functools
import os
import time
from collections import defaultdict, deque

import numpy as np
import torch

__all__ = [
    "AverageMeter",
    "MeterBuffer",
    "get_total_and_free_memory_in_Mb",
    "occupy_mem",
    "gpu_mem_usage",
]


def get_total_and_free_memory_in_Mb(cuda_device):
    devices_info_str = os.popen("nvidia-smi --query-gpu=memory.total,memory.used --format=csv,nounits,noheader")
    devices_info = devices_info_str.read().strip().split("\n")
    total, used = devices_info[int(cuda_device)].split(",")
    return int(total), int(used)


def occupy_mem(cuda_device, mem_ratio=0.9):
    """Pre-allocate GPU memory for training to avoid memory fragmentation."""
    total, used = get_total_and_free_memory_in_Mb(cuda_device)
    max_mem = int(total * mem_ratio)
    block_mem = max_mem - used
    # 256 * 1024 float32 values occupy exactly 1 MiB, so this tensor claims block_mem MiB
    x = torch.cuda.FloatTensor(256, 1024, block_mem)
    del x
    time.sleep(5)


def gpu_mem_usage():
    """Compute the GPU memory usage of the current device (MB)."""
    mem_usage_bytes = torch.cuda.max_memory_allocated()
    return mem_usage_bytes / (1024 * 1024)


class AverageMeter:
    """Track a series of values and provide access to smoothed values over a
    window, or the global series average."""

    def __init__(self, window_size=50):
        self._deque = deque(maxlen=window_size)
        self._total = 0.0
        self._count = 0

    def update(self, value):
        self._deque.append(value)
        self._count += 1
        self._total += value

    @property
    def median(self):
        d = np.array(list(self._deque))
        return np.median(d)

    @property
    def avg(self):
        # if the deque is empty, nan will be returned
        d = np.array(list(self._deque))
        return d.mean()

    @property
    def global_avg(self):
        return self._total / max(self._count, 1e-5)

    @property
    def latest(self):
        return self._deque[-1] if len(self._deque) > 0 else None

    @property
    def total(self):
        return self._total

    def reset(self):
        self._deque.clear()
        self._total = 0.0
        self._count = 0

    def clear(self):
        self._deque.clear()


class MeterBuffer(defaultdict):
    """Compute and store the average and current value of a set of named meters."""

    def __init__(self, window_size=20):
        factory = functools.partial(AverageMeter, window_size=window_size)
        super().__init__(factory)

    def reset(self):
        for v in self.values():
            v.reset()

    def get_filtered_meter(self, filter_key="time"):
        return {k: v for k, v in self.items() if filter_key in k}

    def update(self, values=None, **kwargs):
        if values is None:
            values = {}
        values.update(kwargs)
        for k, v in values.items():
            if isinstance(v, torch.Tensor):
                v = v.detach()
            self[k].update(v)

    def clear_meters(self):
        for v in self.values():
            v.clear()
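
Because `MeterBuffer` is a `defaultdict`, meters are created on first use. A short sketch with illustrative values:

# Track iteration time and loss; meters spring into existence on first update.
meters = MeterBuffer(window_size=20)
meters.update(iter_time=0.21, total_loss=3.7)
meters.update(iter_time=0.19, total_loss=3.5)
print(meters["total_loss"].avg)                         # windowed mean
for name, meter in meters.get_filtered_meter("time").items():
    print(name, meter.global_avg)                       # average over all updates
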
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
# Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
import math
from copy import deepcopy

import torch
import torch.nn as nn
import torch.nn.functional as F
from thop import profile

__all__ = [
    "fuse_conv_and_bn",
    "fuse_model",
    "get_model_info",
    "replace_module",
    "scale_img",
]


def get_model_info(model, tsize):
    stride = 64
    img = torch.zeros((1, 3, stride, stride), device=next(model.parameters()).device)
    flops, params = profile(deepcopy(model), inputs=(img,), verbose=False)
    params /= 1e6
    flops /= 1e9
    flops *= tsize[0] * tsize[1] / stride / stride * 2  # Gflops
    info = "Params: {:.2f}M, Gflops: {:.2f}".format(params, flops)
    return info


def fuse_conv_and_bn(conv, bn):
    # Fuse convolution and batchnorm layers, https://tehnokv.com/posts/fusing-batchnorm-and-conv/
    fusedconv = (
        nn.Conv2d(
            conv.in_channels,
            conv.out_channels,
            kernel_size=conv.kernel_size,
            stride=conv.stride,
            padding=conv.padding,
            groups=conv.groups,
            bias=True,
        )
        .requires_grad_(False)
        .to(conv.weight.device)
    )

    # prepare filters
    w_conv = conv.weight.clone().view(conv.out_channels, -1)
    w_bn = torch.diag(bn.weight.div(torch.sqrt(bn.eps + bn.running_var)))
    fusedconv.weight.copy_(torch.mm(w_bn, w_conv).view(fusedconv.weight.shape))

    # prepare spatial bias
    b_conv = torch.zeros(conv.weight.size(0), device=conv.weight.device) if conv.bias is None else conv.bias
    b_bn = bn.bias - bn.weight.mul(bn.running_mean).div(torch.sqrt(bn.running_var + bn.eps))
    fusedconv.bias.copy_(torch.mm(w_bn, b_conv.reshape(-1, 1)).reshape(-1) + b_bn)

    return fusedconv


def fuse_model(model):
    from det.yolox.models.network_blocks import BaseConv

    for m in model.modules():
        if type(m) is BaseConv and hasattr(m, "bn"):
            m.conv = fuse_conv_and_bn(m.conv, m.bn)  # update conv
            delattr(m, "bn")  # remove batchnorm
            m.forward = m.fuseforward  # update forward
    return model


def replace_module(module, replaced_module_type, new_module_type, replace_func=None):
    """Replace modules of a given type in a module with a new type. Mostly used for deployment.

    Args:
        module (nn.Module): model to apply the replace operation on.
        replaced_module_type (Type): module type to be replaced.
        new_module_type (Type): module type to replace with.
        replace_func (function): python function describing the replace logic. Default value: None.
    Returns:
        model (nn.Module): module with the replacement applied.
    """

    def default_replace_func(replaced_module_type, new_module_type):
        return new_module_type()

    if replace_func is None:
        replace_func = default_replace_func

    model = module
    if isinstance(module, replaced_module_type):
        model = replace_func(replaced_module_type, new_module_type)
    else:  # recursively replace
        for name, child in module.named_children():
            new_child = replace_module(child, replaced_module_type, new_module_type, replace_func)
            if new_child is not child:  # child has been replaced
                model.add_module(name, new_child)
    return model


def scale_img(img, ratio=1.0, same_shape=False, gs=32):  # img(16,3,256,416)
    # scales img(bs,3,y,x) by ratio, constrained to a gs-multiple
    if ratio == 1.0:
        return img
    else:
        h, w = img.shape[2:]
        s = (int(h * ratio), int(w * ratio))  # new size
        img = F.interpolate(img, size=s, mode="bilinear", align_corners=False)  # resize
        if not same_shape:  # pad/crop img
            h, w = [math.ceil(x * ratio / gs) * gs for x in (h, w)]
        return F.pad(img, [0, w - s[1], 0, h - s[0]], value=0.447 * 255)  # value = ImageNet mean
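
Fusion is an inference-time optimization, so it belongs after training and after switching to eval mode. A sketch, assuming a model built from this repo's `BaseConv` blocks (the `model` name is hypothetical):

# Report model size, then fold conv+bn for faster inference.
model.eval()
print(get_model_info(model, (640, 640)))   # prints params (M) and Gflops at the given input size
model = fuse_model(model)                  # conv+bn folded; forward switched to fuseforward
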
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
# Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
import os
import subprocess

from loguru import logger

import cv2

from .dist import get_world_size, is_main_process

__all__ = ["configure_nccl", "configure_module", "get_yolox_datadir", "configure_omp"]


def get_yolox_datadir():
    """Get the dataset dir of YOLOX.

    If an environment variable named `YOLOX_DATADIR` is set, this function
    returns its value. Otherwise, the `datasets` directory next to the
    package containing `det.yolox` is used.
    """
    yolox_datadir = os.getenv("YOLOX_DATADIR", None)
    if yolox_datadir is None:
        from det import yolox

        yolox_path = os.path.dirname(os.path.dirname(yolox.__file__))
        yolox_datadir = os.path.join(yolox_path, "../datasets")
    return yolox_datadir


def configure_nccl():
    """Configure multi-machine environment variables of NCCL."""
    os.environ["NCCL_LAUNCH_MODE"] = "PARALLEL"
    os.environ["NCCL_IB_HCA"] = subprocess.getoutput(
        "pushd /sys/class/infiniband/ > /dev/null; for i in mlx5_*; "
        "do cat $i/ports/1/gid_attrs/types/* 2>/dev/null "
        "| grep v >/dev/null && echo $i ; done; popd > /dev/null"
    )
    os.environ["NCCL_IB_GID_INDEX"] = "3"
    os.environ["NCCL_IB_TC"] = "106"


def configure_omp(num_threads=1):
    """If OMP_NUM_THREADS is not already configured and world_size is greater
    than 1, set the `OMP_NUM_THREADS` environment variable to `num_threads`.

    Args:
        num_threads (int): value of `OMP_NUM_THREADS` to set.
    """
    # We set OMP_NUM_THREADS=1 by default, which achieves the best speed on our machines;
    # feel free to change it for better performance.
    if "OMP_NUM_THREADS" not in os.environ and get_world_size() > 1:
        os.environ["OMP_NUM_THREADS"] = str(num_threads)
        if is_main_process():
            logger.info(
                "\n***************************************************************\n"
                "We set `OMP_NUM_THREADS` for each process to {} to speed up.\n"
                "please further tune the variable for optimal performance.\n"
                "***************************************************************".format(os.environ["OMP_NUM_THREADS"])
            )


def configure_module(ulimit_value=8192):
    """Configure the pytorch module environment: ulimit and cv2 settings.

    Args:
        ulimit_value (int): default open file number on linux. Default value: 8192.
    """
    # system setting
    try:
        import resource

        rlimit = resource.getrlimit(resource.RLIMIT_NOFILE)
        resource.setrlimit(resource.RLIMIT_NOFILE, (ulimit_value, rlimit[1]))
    except Exception:
        # An exception might be raised on Windows, or when the rlimit already
        # reaches its maximum; setting it is not strictly necessary, so ignore failures.
        pass

    # cv2
    # multithreaded cv2 might hurt the performance of the torch dataloader
    os.environ["OPENCV_OPENCL_RUNTIME"] = "disabled"
    try:
        cv2.setNumThreads(0)
        cv2.ocl.setUseOpenCL(False)
    except Exception:
        # a cv2 version mismatch might raise exceptions
        pass
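
A minimal sketch of the intended call order at program entry, before dataloaders or process groups are created (grounded in the functions above; the ordering itself is a reasonable assumption, not mandated by this commit):

# Environment setup at the top of a launcher script.
configure_module()   # raise the open-file ulimit, tame cv2 threading
configure_nccl()     # multi-machine NCCL knobs (harmless on a single machine)
configure_omp()      # pins OMP_NUM_THREADS=1 when world_size > 1
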
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
# Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
import cv2
import numpy as np
import os.path as osp
import time

import ref

__all__ = ["vis"]


def vis_train(inps, targets, cfg):
    for i in range(inps.shape[0]):
        # CHW -> HWC for cv2
        image = inps[i].cpu().numpy().transpose(1, 2, 0).astype(np.uint8).copy()
        target = targets[i].cpu().numpy().astype(np.int64).copy()
        bbox = target[:, 1:]
        out_file = osp.join(cfg.train["output_dir"], "{}.png".format(str(time.perf_counter())))
        # ground-truth boxes: give them full score so vis() draws all of them
        scores = np.ones(bbox.shape[0])
        cls_ids = target[:, 0]
        class_names = ref.hb.objects
        vis_image = vis(image, bbox, scores, cls_ids, 0.5, class_names)
        cv2.imwrite(out_file, vis_image)


def vis(img, boxes, scores, cls_ids, conf=0.5, class_names=None):
    for i in range(len(boxes)):
        box = boxes[i]
        cls_id = int(cls_ids[i])
        score = scores[i]
        if score < conf:
            continue
        x0 = int(box[0])
        y0 = int(box[1])
        x1 = int(box[2])
        y1 = int(box[3])

        color = (_COLORS[cls_id] * 255).astype(np.uint8).tolist()
        text = "{}:{:.1f}%".format(class_names[cls_id], score * 100)
        txt_color = (0, 0, 0) if np.mean(_COLORS[cls_id]) > 0.5 else (255, 255, 255)
        font = cv2.FONT_HERSHEY_SIMPLEX

        txt_size = cv2.getTextSize(text, font, 0.4, 1)[0]
        cv2.rectangle(img, (x0, y0), (x1, y1), color, 2)

        txt_bk_color = (_COLORS[cls_id] * 255 * 0.7).astype(np.uint8).tolist()
        cv2.rectangle(
            img,
            (x0, y0 + 1),
            (x0 + txt_size[0] + 1, y0 + int(1.5 * txt_size[1])),
            txt_bk_color,
            -1,
        )
        cv2.putText(img, text, (x0, y0 + txt_size[1]), font, 0.4, txt_color, thickness=1)

    return img
_COLORS = (
    np.array(
        [
            0.000, 0.447, 0.741,
            0.850, 0.325, 0.098,
            0.929, 0.694, 0.125,
            0.494, 0.184, 0.556,
            0.466, 0.674, 0.188,
            0.301, 0.745, 0.933,
            0.635, 0.078, 0.184,
            0.300, 0.300, 0.300,
            0.600, 0.600, 0.600,
            1.000, 0.000, 0.000,
            1.000, 0.500, 0.000,
            0.749, 0.749, 0.000,
            0.000, 1.000, 0.000,
            0.000, 0.000, 1.000,
            0.667, 0.000, 1.000,
            0.333, 0.333, 0.000,
            0.333, 0.667, 0.000,
            0.333, 1.000, 0.000,
            0.667, 0.333, 0.000,
            0.667, 0.667, 0.000,
            0.667, 1.000, 0.000,
            1.000, 0.333, 0.000,
            1.000, 0.667, 0.000,
            1.000, 1.000, 0.000,
            0.000, 0.333, 0.500,
            0.000, 0.667, 0.500,
            0.000, 1.000, 0.500,
            0.333, 0.000, 0.500,
            0.333, 0.333, 0.500,
            0.333, 0.667, 0.500,
            0.333, 1.000, 0.500,
            0.667, 0.000, 0.500,
            0.667, 0.333, 0.500,
            0.667, 0.667, 0.500,
            0.667, 1.000, 0.500,
            1.000, 0.000, 0.500,
            1.000, 0.333, 0.500,
            1.000, 0.667, 0.500,
            1.000, 1.000, 0.500,
            0.000, 0.333, 1.000,
            0.000, 0.667, 1.000,
            0.000, 1.000, 1.000,
            0.333, 0.000, 1.000,
            0.333, 0.333, 1.000,
            0.333, 0.667, 1.000,
            0.333, 1.000, 1.000,
            0.667, 0.000, 1.000,
            0.667, 0.333, 1.000,
            0.667, 0.667, 1.000,
            0.667, 1.000, 1.000,
            1.000, 0.000, 1.000,
            1.000, 0.333, 1.000,
            1.000, 0.667, 1.000,
            0.333, 0.000, 0.000,
            0.500, 0.000, 0.000,
            0.667, 0.000, 0.000,
            0.833, 0.000, 0.000,
            1.000, 0.000, 0.000,
            0.000, 0.167, 0.000,
            0.000, 0.333, 0.000,
            0.000, 0.500, 0.000,
            0.000, 0.667, 0.000,
            0.000, 0.833, 0.000,
            0.000, 1.000, 0.000,
            0.000, 0.000, 0.167,
            0.000, 0.000, 0.333,
            0.000, 0.000, 0.500,
            0.000, 0.000, 0.667,
            0.000, 0.000, 0.833,
            0.000, 0.000, 1.000,
            0.000, 0.000, 0.000,
            0.143, 0.143, 0.143,
            0.286, 0.286, 0.286,
            0.429, 0.429, 0.429,
            0.571, 0.571, 0.571,
            0.714, 0.714, 0.714,
            0.857, 0.857, 0.857,
            0.000, 0.447, 0.741,
            0.314, 0.717, 0.741,
            0.50, 0.5, 0,
        ]
    )
    .astype(np.float32)
    .reshape(-1, 3)
)
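
`vis` pairs naturally with the NMS output from the demo utilities earlier in this commit. A sketch, where the input image path and the `boxes_xyxy`/`scores` arrays are hypothetical and `multiclass_nms` is assumed imported from that module:

# Draw post-NMS detections on an image.
img = cv2.imread("demo.jpg")                          # hypothetical input image
dets = multiclass_nms(boxes_xyxy, scores, nms_thr=0.45, score_thr=0.3)
if dets is not None:
    img = vis(img, dets[:, :4], dets[:, 4], dets[:, 5], conf=0.3, class_names=ref.hb.objects)
    cv2.imwrite("demo_vis.jpg", img)
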