diff --git a/configs/gdrn/fruitbin/convnext_a6_AugCosyAAEGray_BG05_mlL1_DMask_amodalClipBox_classAware_fruitbin.py b/configs/gdrn/fruitbin/convnext_a6_AugCosyAAEGray_BG05_mlL1_DMask_amodalClipBox_classAware_fruitbin.py
new file mode 100644
index 0000000000000000000000000000000000000000..b167a6c65f29fdd8eb8dc32f0485f15a9f41ec7b
--- /dev/null
+++ b/configs/gdrn/fruitbin/convnext_a6_AugCosyAAEGray_BG05_mlL1_DMask_amodalClipBox_classAware_fruitbin.py
@@ -0,0 +1,143 @@
+# training takes about 3 days
+_base_ = ["../../_base_/gdrn_base.py"]
+
+OUTPUT_DIR = "output/gdrn/ycbv/convnext_a6_AugCosyAAEGray_BG05_mlL1_DMask_amodalClipBox_classAware_ycbv"
+INPUT = dict(
+    DZI_PAD_SCALE=1.5,
+    TRUNCATE_FG=True,
+    CHANGE_BG_PROB=0.5,
+    COLOR_AUG_PROB=0.8,
+    COLOR_AUG_TYPE="code",
+    COLOR_AUG_CODE=(
+        "Sequential(["
+        # Sometimes(0.5, PerspectiveTransform(0.05)),
+        # Sometimes(0.5, CropAndPad(percent=(-0.05, 0.1))),
+        # Sometimes(0.5, Affine(scale=(1.0, 1.2))),
+        "Sometimes(0.5, CoarseDropout( p=0.2, size_percent=0.05) ),"
+        "Sometimes(0.4, GaussianBlur((0., 3.))),"
+        "Sometimes(0.3, pillike.EnhanceSharpness(factor=(0., 50.))),"
+        "Sometimes(0.3, pillike.EnhanceContrast(factor=(0.2, 50.))),"
+        "Sometimes(0.5, pillike.EnhanceBrightness(factor=(0.1, 6.))),"
+        "Sometimes(0.3, pillike.EnhanceColor(factor=(0., 20.))),"
+        "Sometimes(0.5, Add((-25, 25), per_channel=0.3)),"
+        "Sometimes(0.3, Invert(0.2, per_channel=True)),"
+        "Sometimes(0.5, Multiply((0.6, 1.4), per_channel=0.5)),"
+        "Sometimes(0.5, Multiply((0.6, 1.4))),"
+        "Sometimes(0.1, AdditiveGaussianNoise(scale=10, per_channel=True)),"
+        "Sometimes(0.5, iaa.contrast.LinearContrast((0.5, 2.2), per_channel=0.3)),"
+        "Sometimes(0.5, Grayscale(alpha=(0.0, 1.0))),"  # maybe remove for det
+        "], random_order=True)"
+        # cosy+aae
+    ),
+)
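+# NOTE: COLOR_AUG_CODE is a string that the dataloader is expected to eval()
+# into an imgaug pipeline (Sometimes/CoarseDropout/pillike.* live in
+# imgaug.augmenters). A minimal sketch of that convention:
+#   import imgaug.augmenters as iaa
+#   from imgaug.augmenters import *  # Sequential, Sometimes, ...
+#   from imgaug.augmenters import pillike
+#   color_aug = eval(INPUT["COLOR_AUG_CODE"])  # -> iaa.Sequential(..., random_order=True)
+#   img_aug = color_aug(image=img)  # applied with probability COLOR_AUG_PROB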
+
+SOLVER = dict(
+    IMS_PER_BATCH=48,
+    TOTAL_EPOCHS=40,  # 10
+    LR_SCHEDULER_NAME="flat_and_anneal",
+    ANNEAL_METHOD="cosine",  # "cosine"
+    ANNEAL_POINT=0.72,
+    OPTIMIZER_CFG=dict(_delete_=True, type="Ranger", lr=8e-4, weight_decay=0.01),
+    WEIGHT_DECAY=0.0,
+    WARMUP_FACTOR=0.001,
+    WARMUP_ITERS=1000,
+)
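+# NOTE: _delete_=True makes OPTIMIZER_CFG replace, not merge with, the one
+# inherited from gdrn_base.py; Ranger is the RAdam + Lookahead optimizer
+# shipped in lib/torch_utils/solver/ranger.py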
+
+DATASETS = dict(
+    TRAIN=("ycbv_train_real", "ycbv_train_pbr"),
+    TEST=("ycbv_test",),
+    DET_FILES_TEST=("datasets/BOP_DATASETS/ycbv/test/test_bboxes/yolox_x_640_ycbv_real_pbr_ycbv_bop_test.json",),
+    SYM_OBJS=[
+        "024_bowl",
+        "036_wood_block",
+        "051_large_clamp",
+        "052_extra_large_clamp",
+        "061_foam_brick",
+    ],  # used for custom evaluator
+)
+
+DATALOADER = dict(
+    # Number of data loading threads
+    NUM_WORKERS=8,
+    FILTER_VISIB_THR=0.3,
+)
+
+MODEL = dict(
+    LOAD_DETS_TEST=True,
+    PIXEL_MEAN=[0.0, 0.0, 0.0],
+    PIXEL_STD=[255.0, 255.0, 255.0],
+    BBOX_TYPE="AMODAL_CLIP",  # VISIB or AMODAL
+    POSE_NET=dict(
+        NAME="GDRN_double_mask",
+        XYZ_ONLINE=True,
+        NUM_CLASSES=21,
+        BACKBONE=dict(
+            FREEZE=False,
+            PRETRAINED="timm",
+            INIT_CFG=dict(
+                type="timm/convnext_base",
+                pretrained=True,
+                in_chans=3,
+                features_only=True,
+                out_indices=(3,),
+            ),
+        ),
+        ## geo head: Mask, XYZ, Region
+        GEO_HEAD=dict(
+            FREEZE=False,
+            INIT_CFG=dict(
+                type="TopDownDoubleMaskXyzRegionHead",
+                in_dim=1024,  # this is num out channels of backbone conv feature
+            ),
+            NUM_REGIONS=64,
+            XYZ_CLASS_AWARE=True,
+            MASK_CLASS_AWARE=True,
+            REGION_CLASS_AWARE=True,
+        ),
+        PNP_NET=dict(
+            INIT_CFG=dict(norm="GN", act="gelu"),
+            REGION_ATTENTION=True,
+            WITH_2D_COORD=True,
+            ROT_TYPE="allo_rot6d",
+            TRANS_TYPE="centroid_z",
+        ),
+        LOSS_CFG=dict(
+            # xyz loss ----------------------------
+            XYZ_LOSS_TYPE="L1",  # L1 | CE_coor
+            XYZ_LOSS_MASK_GT="visib",  # trunc | visib | obj
+            XYZ_LW=1.0,
+            # mask loss ---------------------------
+            MASK_LOSS_TYPE="L1",  # L1 | BCE | CE
+            MASK_LOSS_GT="trunc",  # trunc | visib | gt
+            MASK_LW=1.0,
+            # full mask loss ---------------------------
+            FULL_MASK_LOSS_TYPE="L1",  # L1 | BCE | CE
+            FULL_MASK_LW=1.0,
+            # region loss -------------------------
+            REGION_LOSS_TYPE="CE",  # CE
+            REGION_LOSS_MASK_GT="visib",  # trunc | visib | obj
+            REGION_LW=1.0,
+            # pm loss --------------
+            PM_LOSS_SYM=True,  # NOTE: sym loss
+            PM_R_ONLY=True,  # only do R loss in PM
+            PM_LW=1.0,
+            # centroid loss -------
+            CENTROID_LOSS_TYPE="L1",
+            CENTROID_LW=1.0,
+            # z loss -----------
+            Z_LOSS_TYPE="L1",
+            Z_LW=1.0,
+        ),
+    ),
+)
+
+VAL = dict(
+    DATASET_NAME="ycbv",
+    SPLIT_TYPE="",
+    SCRIPT_PATH="lib/pysixd/scripts/eval_pose_results_more.py",
+    TARGETS_FILENAME="test_targets_bop19.json",
+    ERROR_TYPES="vsd,mspd,mssd",
+    USE_BOP=True,  # whether to use bop toolkit
+)
+
+TEST = dict(EVAL_PERIOD=0, VIS=False, TEST_BBOX_TYPE="est")  # gt | est
diff --git a/configs/gdrn/fruitbinPbrSOr/convnext_AugCosyAAEGray_DMask_amodalClipBox_fruitbin/banana1.py b/configs/gdrn/fruitbinPbrSOr/convnext_AugCosyAAEGray_DMask_amodalClipBox_fruitbin/banana1.py
new file mode 100644
index 0000000000000000000000000000000000000000..a7f363fbf837ed4dd4d4143e0a5f89f4efb498b1
--- /dev/null
+++ b/configs/gdrn/fruitbinPbrSOr/convnext_AugCosyAAEGray_DMask_amodalClipBox_fruitbin/banana1.py
@@ -0,0 +1,138 @@
+_base_ = ["../../../_base_/gdrn_base.py"]
+
+OUTPUT_DIR = "output/gdrn/ycbvPbrSO/convnext_AugCosyAAEGray_DMask_amodalClipBox_ycbv/002_master_chef_can"
+INPUT = dict(
+    DZI_PAD_SCALE=1.5,
+    TRUNCATE_FG=True,
+    CHANGE_BG_PROB=0.5,
+    COLOR_AUG_PROB=0.8,
+    COLOR_AUG_TYPE="code",
+    COLOR_AUG_CODE=(
+        "Sequential(["
+        # Sometimes(0.5, PerspectiveTransform(0.05)),
+        # Sometimes(0.5, CropAndPad(percent=(-0.05, 0.1))),
+        # Sometimes(0.5, Affine(scale=(1.0, 1.2))),
+        "Sometimes(0.5, CoarseDropout( p=0.2, size_percent=0.05) ),"
+        "Sometimes(0.4, GaussianBlur((0., 3.))),"
+        "Sometimes(0.3, pillike.EnhanceSharpness(factor=(0., 50.))),"
+        "Sometimes(0.3, pillike.EnhanceContrast(factor=(0.2, 50.))),"
+        "Sometimes(0.5, pillike.EnhanceBrightness(factor=(0.1, 6.))),"
+        "Sometimes(0.3, pillike.EnhanceColor(factor=(0., 20.))),"
+        "Sometimes(0.5, Add((-25, 25), per_channel=0.3)),"
+        "Sometimes(0.3, Invert(0.2, per_channel=True)),"
+        "Sometimes(0.5, Multiply((0.6, 1.4), per_channel=0.5)),"
+        "Sometimes(0.5, Multiply((0.6, 1.4))),"
+        "Sometimes(0.1, AdditiveGaussianNoise(scale=10, per_channel=True)),"
+        "Sometimes(0.5, iaa.contrast.LinearContrast((0.5, 2.2), per_channel=0.3)),"
+        "Sometimes(0.5, Grayscale(alpha=(0.0, 1.0))),"  # maybe remove for det
+        "], random_order=True)"
+        # cosy+aae
+    ),
+)
+
+SOLVER = dict(
+    IMS_PER_BATCH=36,
+    TOTAL_EPOCHS=100,
+    LR_SCHEDULER_NAME="flat_and_anneal",
+    ANNEAL_METHOD="cosine",  # "cosine"
+    ANNEAL_POINT=0.72,
+    OPTIMIZER_CFG=dict(_delete_=True, type="Ranger", lr=8e-4, weight_decay=0.01),
+    WEIGHT_DECAY=0.0,
+    WARMUP_FACTOR=0.001,
+    WARMUP_ITERS=1000,
+)
+
+DATASETS = dict(
+    TRAIN=("ycbv_002_master_chef_can_train_pbr",),
+    TEST=("ycbv_test",),
+    DET_FILES_TEST=("datasets/BOP_DATASETS/ycbv/test/test_bboxes/yolox_x_640_ycbv_pbr_ycbv_bop_test.json",),
+    SYM_OBJS=[
+        "024_bowl",
+        "036_wood_block",
+        "051_large_clamp",
+        "052_extra_large_clamp",
+        "061_foam_brick",
+    ],  # used for custom evaluator
+)
+
+DATALOADER = dict(
+    # Number of data loading threads
+    NUM_WORKERS=8,
+    FILTER_VISIB_THR=0.3,
+)
+
+MODEL = dict(
+    LOAD_DETS_TEST=True,
+    PIXEL_MEAN=[0.0, 0.0, 0.0],
+    PIXEL_STD=[255.0, 255.0, 255.0],
+    BBOX_TYPE="AMODAL_CLIP",  # VISIB or AMODAL
+    POSE_NET=dict(
+        NAME="GDRN_double_mask",
+        XYZ_ONLINE=True,
+        BACKBONE=dict(
+            FREEZE=False,
+            PRETRAINED="timm",
+            INIT_CFG=dict(
+                type="timm/convnext_base",
+                pretrained=True,
+                in_chans=3,
+                features_only=True,
+                out_indices=(3,),
+            ),
+        ),
+        ## geo head: Mask, XYZ, Region
+        GEO_HEAD=dict(
+            FREEZE=False,
+            INIT_CFG=dict(
+                type="TopDownDoubleMaskXyzRegionHead",
+                in_dim=1024,  # this is num out channels of backbone conv feature
+            ),
+            NUM_REGIONS=64,
+        ),
+        PNP_NET=dict(
+            INIT_CFG=dict(norm="GN", act="gelu"),
+            REGION_ATTENTION=True,
+            WITH_2D_COORD=True,
+            ROT_TYPE="allo_rot6d",
+            TRANS_TYPE="centroid_z",
+        ),
+        LOSS_CFG=dict(
+            # xyz loss ----------------------------
+            XYZ_LOSS_TYPE="L1",  # L1 | CE_coor
+            XYZ_LOSS_MASK_GT="visib",  # trunc | visib | obj
+            XYZ_LW=1.0,
+            # mask loss ---------------------------
+            MASK_LOSS_TYPE="L1",  # L1 | BCE | CE
+            MASK_LOSS_GT="trunc",  # trunc | visib | gt
+            MASK_LW=1.0,
+            # full mask loss ---------------------------
+            FULL_MASK_LOSS_TYPE="L1",  # L1 | BCE | CE
+            FULL_MASK_LW=1.0,
+            # region loss -------------------------
+            REGION_LOSS_TYPE="CE",  # CE
+            REGION_LOSS_MASK_GT="visib",  # trunc | visib | obj
+            REGION_LW=1.0,
+            # pm loss --------------
+            PM_LOSS_SYM=True,  # NOTE: sym loss
+            PM_R_ONLY=True,  # only do R loss in PM
+            PM_LW=1.0,
+            # centroid loss -------
+            CENTROID_LOSS_TYPE="L1",
+            CENTROID_LW=1.0,
+            # z loss -----------
+            Z_LOSS_TYPE="L1",
+            Z_LW=1.0,
+        ),
+    ),
+)
+
+VAL = dict(
+    DATASET_NAME="ycbv",
+    SPLIT_TYPE="",
+    SCRIPT_PATH="lib/pysixd/scripts/eval_pose_results_more.py",
+    TARGETS_FILENAME="test_targets_bop19.json",
+    ERROR_TYPES="vsd,mspd,mssd",
+    USE_BOP=True,  # whether to use bop toolkit
+)
+
+TEST = dict(EVAL_PERIOD=0, VIS=False, TEST_BBOX_TYPE="est")  # gt | est
diff --git a/configs/gdrn/fruitbinPbrSOr/convnext_AugCosyAAEGray_DMask_amodalClipBox_fruitbin/lemon2.py b/configs/gdrn/fruitbinPbrSOr/convnext_AugCosyAAEGray_DMask_amodalClipBox_fruitbin/lemon2.py
new file mode 100644
index 0000000000000000000000000000000000000000..59dc1d681ca12b9b51c48e8d03e52b822c7b8888
--- /dev/null
+++ b/configs/gdrn/fruitbinPbrSOr/convnext_AugCosyAAEGray_DMask_amodalClipBox_fruitbin/lemon2.py
@@ -0,0 +1,3 @@
+_base_ = "./banana1.py"
+OUTPUT_DIR = "output/gdrn/ycbvPbrSO/convnext_AugCosyAAEGray_DMask_amodalClipBox_ycbv/003_cracker_box"
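+# with mmcv-style _base_ inheritance, the dict below is merged into the base
+# config's DATASETS, so only TRAIN is overridden here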
+DATASETS = dict(TRAIN=("ycbv_003_cracker_box_train_pbr",))
diff --git a/configs/gdrn/fruitbinPbrSOr/convnext_AugCosyAAEGray_DMask_amodalClipBox_fruitbin/pear2.py b/configs/gdrn/fruitbinPbrSOr/convnext_AugCosyAAEGray_DMask_amodalClipBox_fruitbin/pear2.py
new file mode 100644
index 0000000000000000000000000000000000000000..d03bd4d1c6b09a755b29e730ef591d415a97b306
--- /dev/null
+++ b/configs/gdrn/fruitbinPbrSOr/convnext_AugCosyAAEGray_DMask_amodalClipBox_fruitbin/pear2.py
@@ -0,0 +1,3 @@
+_base_ = "./banana1.py"
+OUTPUT_DIR = "output/gdrn/ycbvPbrSO/convnext_AugCosyAAEGray_DMask_amodalClipBox_ycbv/004_sugar_box"
+DATASETS = dict(TRAIN=("ycbv_004_sugar_box_train_pbr",))
diff --git a/configs/gdrn/fruitbinSO/convnext_AugCosyAAEGray_DMask_amodalClipBox_fruitbin/002_master_chef_can.py b/configs/gdrn/fruitbinSO/convnext_AugCosyAAEGray_DMask_amodalClipBox_fruitbin/002_master_chef_can.py
new file mode 100644
index 0000000000000000000000000000000000000000..8551bf536e4cd2d79dbcc44b9606c3fd4bf0d3cc
--- /dev/null
+++ b/configs/gdrn/fruitbinSO/convnext_AugCosyAAEGray_DMask_amodalClipBox_fruitbin/002_master_chef_can.py
@@ -0,0 +1,138 @@
+_base_ = ["../../../_base_/gdrn_base.py"]
+
+OUTPUT_DIR = "output/gdrn/ycbvSO/convnext_AugCosyAAEGray_DMask_amodalClipBox_ycbv/002_master_chef_can"
+INPUT = dict(
+    DZI_PAD_SCALE=1.5,
+    TRUNCATE_FG=True,
+    CHANGE_BG_PROB=0.5,
+    COLOR_AUG_PROB=0.8,
+    COLOR_AUG_TYPE="code",
+    COLOR_AUG_CODE=(
+        "Sequential(["
+        # Sometimes(0.5, PerspectiveTransform(0.05)),
+        # Sometimes(0.5, CropAndPad(percent=(-0.05, 0.1))),
+        # Sometimes(0.5, Affine(scale=(1.0, 1.2))),
+        "Sometimes(0.5, CoarseDropout( p=0.2, size_percent=0.05) ),"
+        "Sometimes(0.4, GaussianBlur((0., 3.))),"
+        "Sometimes(0.3, pillike.EnhanceSharpness(factor=(0., 50.))),"
+        "Sometimes(0.3, pillike.EnhanceContrast(factor=(0.2, 50.))),"
+        "Sometimes(0.5, pillike.EnhanceBrightness(factor=(0.1, 6.))),"
+        "Sometimes(0.3, pillike.EnhanceColor(factor=(0., 20.))),"
+        "Sometimes(0.5, Add((-25, 25), per_channel=0.3)),"
+        "Sometimes(0.3, Invert(0.2, per_channel=True)),"
+        "Sometimes(0.5, Multiply((0.6, 1.4), per_channel=0.5)),"
+        "Sometimes(0.5, Multiply((0.6, 1.4))),"
+        "Sometimes(0.1, AdditiveGaussianNoise(scale=10, per_channel=True)),"
+        "Sometimes(0.5, iaa.contrast.LinearContrast((0.5, 2.2), per_channel=0.3)),"
+        "Sometimes(0.5, Grayscale(alpha=(0.0, 1.0))),"  # maybe remove for det
+        "], random_order=True)"
+        # cosy+aae
+    ),
+)
+
+SOLVER = dict(
+    IMS_PER_BATCH=36,
+    TOTAL_EPOCHS=100,
+    LR_SCHEDULER_NAME="flat_and_anneal",
+    ANNEAL_METHOD="cosine",  # "cosine"
+    ANNEAL_POINT=0.72,
+    OPTIMIZER_CFG=dict(_delete_=True, type="Ranger", lr=8e-4, weight_decay=0.01),
+    WEIGHT_DECAY=0.0,
+    WARMUP_FACTOR=0.001,
+    WARMUP_ITERS=1000,
+)
+
+DATASETS = dict(
+    TRAIN=("ycbv_002_master_chef_can_train_pbr", "ycbv_002_master_chef_can_train_real"),
+    TEST=("ycbv_test",),
+    DET_FILES_TEST=("datasets/BOP_DATASETS/ycbv/test/test_bboxes/yolox_x_640_ycbv_real_pbr_ycbv_bop_test.json",),
+    SYM_OBJS=[
+        "024_bowl",
+        "036_wood_block",
+        "051_large_clamp",
+        "052_extra_large_clamp",
+        "061_foam_brick",
+    ],  # used for custom evaluator
+)
+
+DATALOADER = dict(
+    # Number of data loading threads
+    NUM_WORKERS=8,
+    FILTER_VISIB_THR=0.3,
+)
+
+MODEL = dict(
+    LOAD_DETS_TEST=True,
+    PIXEL_MEAN=[0.0, 0.0, 0.0],
+    PIXEL_STD=[255.0, 255.0, 255.0],
+    BBOX_TYPE="AMODAL_CLIP",  # VISIB or AMODAL
+    POSE_NET=dict(
+        NAME="GDRN_double_mask",
+        XYZ_ONLINE=True,
+        BACKBONE=dict(
+            FREEZE=False,
+            PRETRAINED="timm",
+            INIT_CFG=dict(
+                type="timm/convnext_base",
+                pretrained=True,
+                in_chans=3,
+                features_only=True,
+                out_indices=(3,),
+            ),
+        ),
+        ## geo head: Mask, XYZ, Region
+        GEO_HEAD=dict(
+            FREEZE=False,
+            INIT_CFG=dict(
+                type="TopDownDoubleMaskXyzRegionHead",
+                in_dim=1024,  # this is num out channels of backbone conv feature
+            ),
+            NUM_REGIONS=64,
+        ),
+        PNP_NET=dict(
+            INIT_CFG=dict(norm="GN", act="gelu"),
+            REGION_ATTENTION=True,
+            WITH_2D_COORD=True,
+            ROT_TYPE="allo_rot6d",
+            TRANS_TYPE="centroid_z",
+        ),
+        LOSS_CFG=dict(
+            # xyz loss ----------------------------
+            XYZ_LOSS_TYPE="L1",  # L1 | CE_coor
+            XYZ_LOSS_MASK_GT="visib",  # trunc | visib | obj
+            XYZ_LW=1.0,
+            # mask loss ---------------------------
+            MASK_LOSS_TYPE="L1",  # L1 | BCE | CE
+            MASK_LOSS_GT="trunc",  # trunc | visib | gt
+            MASK_LW=1.0,
+            # full mask loss ---------------------------
+            FULL_MASK_LOSS_TYPE="L1",  # L1 | BCE | CE
+            FULL_MASK_LW=1.0,
+            # region loss -------------------------
+            REGION_LOSS_TYPE="CE",  # CE
+            REGION_LOSS_MASK_GT="visib",  # trunc | visib | obj
+            REGION_LW=1.0,
+            # pm loss --------------
+            PM_LOSS_SYM=True,  # NOTE: sym loss
+            PM_R_ONLY=True,  # only do R loss in PM
+            PM_LW=1.0,
+            # centroid loss -------
+            CENTROID_LOSS_TYPE="L1",
+            CENTROID_LW=1.0,
+            # z loss -----------
+            Z_LOSS_TYPE="L1",
+            Z_LW=1.0,
+        ),
+    ),
+)
+
+VAL = dict(
+    DATASET_NAME="ycbv",
+    SPLIT_TYPE="",
+    SCRIPT_PATH="lib/pysixd/scripts/eval_pose_results_more.py",
+    TARGETS_FILENAME="test_targets_bop19.json",
+    ERROR_TYPES="vsd,mspd,mssd",
+    USE_BOP=True,  # whether to use bop toolkit
+)
+
+TEST = dict(EVAL_PERIOD=0, VIS=False, TEST_BBOX_TYPE="est")  # gt | est
diff --git a/configs/gdrn/fruitbinSO/convnext_AugCosyAAEGray_DMask_amodalClipBox_fruitbin/lemon2.py b/configs/gdrn/fruitbinSO/convnext_AugCosyAAEGray_DMask_amodalClipBox_fruitbin/lemon2.py
new file mode 100644
index 0000000000000000000000000000000000000000..23e99a379bb037b8ad33df51c390f8f2ee0120b9
--- /dev/null
+++ b/configs/gdrn/fruitbinSO/convnext_AugCosyAAEGray_DMask_amodalClipBox_fruitbin/lemon2.py
@@ -0,0 +1,3 @@
+_base_ = "./002_master_chef_can.py"
+OUTPUT_DIR = "output/gdrn/ycbvSO/convnext_AugCosyAAEGray_DMask_amodalClipBox_ycbv/003_cracker_box"
+DATASETS = dict(TRAIN=("ycbv_003_cracker_box_train_pbr", "ycbv_003_cracker_box_train_real"))
diff --git a/configs/gdrn/fruitbinSO/convnext_AugCosyAAEGray_DMask_amodalClipBox_fruitbin/pear2.py b/configs/gdrn/fruitbinSO/convnext_AugCosyAAEGray_DMask_amodalClipBox_fruitbin/pear2.py
new file mode 100644
index 0000000000000000000000000000000000000000..efa07cb00fa8d4d1863dc6dce7ff5c9eb262407f
--- /dev/null
+++ b/configs/gdrn/fruitbinSO/convnext_AugCosyAAEGray_DMask_amodalClipBox_fruitbin/pear2.py
@@ -0,0 +1,3 @@
+_base_ = "./002_master_chef_can.py"
+OUTPUT_DIR = "output/gdrn/ycbvSO/convnext_AugCosyAAEGray_DMask_amodalClipBox_ycbv/004_sugar_box"
+DATASETS = dict(TRAIN=("ycbv_004_sugar_box_train_pbr", "ycbv_004_sugar_box_train_real"))
diff --git a/configs/yolox/bop_pbr/yolox_x_640_augCozyAAEhsv_ranger_30_epochs_fruitbin_pbr_fruitbin_bop_test.py b/configs/yolox/bop_pbr/yolox_x_640_augCozyAAEhsv_ranger_30_epochs_fruitbin_pbr_fruitbin_bop_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..729c297d4a926c1db49bc47780d732e864793f41
--- /dev/null
+++ b/configs/yolox/bop_pbr/yolox_x_640_augCozyAAEhsv_ranger_30_epochs_fruitbin_pbr_fruitbin_bop_test.py
@@ -0,0 +1,112 @@
+import os.path as osp
+
+import torch
+from detectron2.config import LazyCall as L
+from detectron2.solver.build import get_default_optimizer_params
+
+from .yolox_base import train, val, test, model, dataloader, optimizer, lr_config, DATASETS  # noqa
+from det.yolox.data import build_yolox_test_loader, ValTransform
+from det.yolox.data.datasets import Base_DatasetFromList
+from detectron2.data import get_detection_dataset_dicts
+from det.yolox.evaluators import YOLOX_COCOEvaluator
+from lib.torch_utils.solver.ranger import Ranger
+
+train.update(
+    output_dir=osp.abspath(__file__).replace("configs", "output", 1)[0:-3],
+    exp_name=osp.split(osp.abspath(__file__))[1][0:-3],  # .py
+)
+train.amp.enabled = True
+
+model.backbone.depth = 1.33
+model.backbone.width = 1.25
+
+model.head.num_classes = 21
+
+train.init_checkpoint = "pretrained_models/yolox/yolox_x.pth"
+
+# datasets
+DATASETS.TRAIN = ["ycbv_train_pbr"]
+DATASETS.TEST = ["ycbv_bop_test"]
+
+dataloader.train.dataset.lst.names = DATASETS.TRAIN
+dataloader.train.total_batch_size = 32
+
+# color aug
+dataloader.train.aug_wrapper.COLOR_AUG_PROB = 0.8
+dataloader.train.aug_wrapper.COLOR_AUG_TYPE = "code"
+dataloader.train.aug_wrapper.COLOR_AUG_CODE = (
+    "Sequential(["
+    # Sometimes(0.5, PerspectiveTransform(0.05)),
+    # Sometimes(0.5, CropAndPad(percent=(-0.05, 0.1))),
+    # Sometimes(0.5, Affine(scale=(1.0, 1.2))),
+    "Sometimes(0.5, CoarseDropout( p=0.2, size_percent=0.05) ),"
+    "Sometimes(0.4, GaussianBlur((0., 3.))),"
+    "Sometimes(0.3, pillike.EnhanceSharpness(factor=(0., 50.))),"
+    "Sometimes(0.3, pillike.EnhanceContrast(factor=(0.2, 50.))),"
+    "Sometimes(0.5, pillike.EnhanceBrightness(factor=(0.1, 6.))),"
+    "Sometimes(0.3, pillike.EnhanceColor(factor=(0., 20.))),"
+    "Sometimes(0.5, Add((-25, 25), per_channel=0.3)),"
+    "Sometimes(0.3, Invert(0.2, per_channel=True)),"
+    "Sometimes(0.5, Multiply((0.6, 1.4), per_channel=0.5)),"
+    "Sometimes(0.5, Multiply((0.6, 1.4))),"
+    "Sometimes(0.1, AdditiveGaussianNoise(scale=10, per_channel=True)),"
+    "Sometimes(0.5, iaa.contrast.LinearContrast((0.5, 2.2), per_channel=0.3)),"
+    # "Sometimes(0.5, Grayscale(alpha=(0.0, 1.0))),"  # maybe remove for det
+    "], random_order=True)"
+    # cosy+aae
+)
+
+# hsv color aug
+dataloader.train.aug_wrapper.AUG_HSV_PROB = 1.0
+dataloader.train.aug_wrapper.HSV_H = 0.015
+dataloader.train.aug_wrapper.HSV_S = 0.7
+dataloader.train.aug_wrapper.HSV_V = 0.4
+dataloader.train.aug_wrapper.FORMAT = "RGB"
+
+optimizer = L(Ranger)(
+    params=L(get_default_optimizer_params)(
+        # params.model is meant to be set to the model object, before instantiating
+        # the optimizer.
+        weight_decay_norm=0.0,
+        weight_decay_bias=0.0,
+    ),
+    lr=0.001,  # bs=64
+    # momentum=0.9,
+    weight_decay=0,
+    # nesterov=True,
+)
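+# L(...) is detectron2's LazyCall: it records the callable and its kwargs
+# without building anything. A sketch of how the trainer is expected to
+# finalize it (see the params.model note above):
+#   optimizer.params.model = model  # fill the placeholder
+#   opt = instantiate(optimizer)    # detectron2.config.instantiate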
+
+train.total_epochs = 30
+train.no_aug_epochs = 15
+train.checkpointer = dict(period=2, max_to_keep=10)
+
+test.test_dataset_names = DATASETS.TEST
+test.augment = True
+test.scales = (1, 0.75, 0.83, 1.12, 1.25)
+test.conf_thr = 0.001
+
+dataloader.test = [
+    L(build_yolox_test_loader)(
+        dataset=L(Base_DatasetFromList)(
+            split="test",
+            lst=L(get_detection_dataset_dicts)(names=test_dataset_name, filter_empty=False),
+            img_size="${test.test_size}",
+            preproc=L(ValTransform)(
+                legacy=False,
+            ),
+        ),
+        total_batch_size=1,
+        # total_batch_size=64,
+        num_workers=4,
+        pin_memory=True,
+    )
+    for test_dataset_name in test.test_dataset_names
+]
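+# one test loader per name in test.test_dataset_names; total_batch_size=1
+# since multi-scale test-time augmentation (test.augment / test.scales) is on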
+
+dataloader.evaluator = [
+    L(YOLOX_COCOEvaluator)(
+        dataset_name=test_dataset_name,
+        filter_scene=False,
+    )
+    for test_dataset_name in test.test_dataset_names
+]
diff --git a/core/gdrn_modeling/datasets/fruitbin_bop_test.py b/core/gdrn_modeling/datasets/fruitbin_bop_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..43dd74bf91817e068b844d52cdb2a9c7904218fd
--- /dev/null
+++ b/core/gdrn_modeling/datasets/fruitbin_bop_test.py
@@ -0,0 +1,455 @@
+import hashlib
+import logging
+import os
+import os.path as osp
+import sys
+
+cur_dir = osp.dirname(osp.abspath(__file__))
+PROJ_ROOT = osp.normpath(osp.join(cur_dir, "../../.."))
+sys.path.insert(0, PROJ_ROOT)
+import time
+from collections import OrderedDict
+import mmcv
+import numpy as np
+from tqdm import tqdm
+from transforms3d.quaternions import mat2quat, quat2mat
+import ref
+from detectron2.data import DatasetCatalog, MetadataCatalog
+from detectron2.structures import BoxMode
+from lib.pysixd import inout, misc
+from lib.utils.mask_utils import binary_mask_to_rle, cocosegm2mask
+from lib.utils.utils import dprint, iprint, lazy_property
+
+
+logger = logging.getLogger(__name__)
+DATASETS_ROOT = osp.normpath(osp.join(PROJ_ROOT, "datasets"))
+
+
+class YCBV_BOP_TEST_Dataset:
+    """ycbv bop test."""
+
+    def __init__(self, data_cfg):
+        """
+        with_depth and with_masks default to True; whether they are actually
+        loaded into the dataloader/network is decided later.
+        """
+        self.name = data_cfg["name"]
+        self.data_cfg = data_cfg
+
+        self.objs = data_cfg["objs"]  # selected objects
+        # all classes are self.objs, but this enables us to evaluate on selected objs
+        self.select_objs = data_cfg.get("select_objs", self.objs)
+
+        self.ann_file = data_cfg["ann_file"]  # json file with scene_id and im_id items
+
+        self.dataset_root = data_cfg["dataset_root"]  # BOP_DATASETS/ycbv/test
+        self.models_root = data_cfg["models_root"]  # BOP_DATASETS/ycbv/models
+        self.scale_to_meter = data_cfg["scale_to_meter"]  # 0.001
+
+        self.with_masks = data_cfg["with_masks"]  # True (load masks but may not use it)
+        self.with_depth = data_cfg["with_depth"]  # True (load depth path here, but may not use it)
+
+        self.height = data_cfg["height"]  # 480
+        self.width = data_cfg["width"]  # 640
+
+        self.cache_dir = data_cfg.get("cache_dir", osp.join(PROJ_ROOT, ".cache"))  # .cache
+        self.use_cache = data_cfg.get("use_cache", True)
+        self.num_to_load = data_cfg["num_to_load"]  # -1
+        self.filter_invalid = data_cfg["filter_invalid"]
+        ##################################################
+
+        # NOTE: careful! Only the selected objects
+        self.cat_ids = [cat_id for cat_id, obj_name in ref.ycbv.id2obj.items() if obj_name in self.objs]
+        # map selected objs to [0, num_objs-1]
+        self.cat2label = {v: i for i, v in enumerate(self.cat_ids)}  # id_map
+        self.label2cat = {label: cat for cat, label in self.cat2label.items()}
+        self.obj2label = OrderedDict((obj, obj_id) for obj_id, obj in enumerate(self.objs))
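+        # e.g. objs=["024_bowl", "061_foam_brick"] gives cat_ids=[13, 21] and
+        # cat2label={13: 0, 21: 1} (assuming the standard 1-based BOP ycbv ids)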
+        ##########################################################
+
+    def __call__(self):
+        """Load light-weight instance annotations of all images into a list of
+        dicts in Detectron2 format.
+
+        Do not load heavy data into memory in this file, since we will
+        load the annotations of all images into memory.
+        """
+        # cache the dataset_dicts to avoid loading masks from files
+        hashed_file_name = hashlib.md5(
+            (
+                "".join([str(fn) for fn in self.objs])
+                + "dataset_dicts_{}_{}_{}_{}_{}".format(
+                    self.name,
+                    self.dataset_root,
+                    self.with_masks,
+                    self.with_depth,
+                    __name__,
+                )
+            ).encode("utf-8")
+        ).hexdigest()
+        cache_path = osp.join(
+            self.cache_dir,
+            "dataset_dicts_{}_{}.pkl".format(self.name, hashed_file_name),
+        )
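+        # the md5 of the obj list plus name/root/flags keys the cache file, so
+        # changed settings do not silently reuse a stale cache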
+
+        if osp.exists(cache_path) and self.use_cache:
+            logger.info("load cached dataset dicts from {}".format(cache_path))
+            return mmcv.load(cache_path)
+
+        t_start = time.perf_counter()
+
+        logger.info("loading dataset dicts: {}".format(self.name))
+        self.num_instances_without_valid_segmentation = 0
+        self.num_instances_without_valid_box = 0
+        dataset_dicts = []  # ######################################################
+        im_id_global = 0
+
+        if True:
+            targets = mmcv.load(self.ann_file)
+            scene_im_ids = [(item["scene_id"], item["im_id"]) for item in targets]
+            scene_im_ids = sorted(list(set(scene_im_ids)))
+
+            # load infos for each scene
+            gt_dicts = {}
+            gt_info_dicts = {}
+            cam_dicts = {}
+            for scene_id, im_id in scene_im_ids:
+                scene_root = osp.join(self.dataset_root, f"{scene_id:06d}")
+                if scene_id not in gt_dicts:
+                    gt_dicts[scene_id] = mmcv.load(osp.join(scene_root, "scene_gt.json"))
+                if scene_id not in gt_info_dicts:
+                    gt_info_dicts[scene_id] = mmcv.load(
+                        osp.join(scene_root, "scene_gt_info.json")
+                    )  # bbox_obj, bbox_visib
+                if scene_id not in cam_dicts:
+                    cam_dicts[scene_id] = mmcv.load(osp.join(scene_root, "scene_camera.json"))
+
+            for scene_id, im_id in tqdm(scene_im_ids):
+                str_im_id = str(im_id)
+                scene_root = osp.join(self.dataset_root, f"{scene_id:06d}")
+                rgb_path = osp.join(scene_root, "rgb/{:06d}.png").format(im_id)
+                assert osp.exists(rgb_path), rgb_path
+
+                depth_path = osp.join(scene_root, "depth/{:06d}.png".format(im_id))
+
+                scene_id = int(rgb_path.split("/")[-3])
+
+                cam = np.array(cam_dicts[scene_id][str_im_id]["cam_K"], dtype=np.float32).reshape(3, 3)
+                depth_factor = 1000.0 / cam_dicts[scene_id][str_im_id]["depth_scale"]
+                record = {
+                    "dataset_name": self.name,
+                    "file_name": osp.relpath(rgb_path, PROJ_ROOT),
+                    "depth_file": osp.relpath(depth_path, PROJ_ROOT),
+                    "depth_factor": depth_factor,
+                    "height": self.height,
+                    "width": self.width,
+                    "image_id": im_id_global,  # unique image_id in the dataset, for coco evaluation
+                    "scene_im_id": "{}/{}".format(scene_id, im_id),  # for evaluation
+                    "cam": cam,
+                    "img_type": "real",
+                }
+                im_id_global += 1
+                insts = []
+                for anno_i, anno in enumerate(gt_dicts[scene_id][str_im_id]):
+                    obj_id = anno["obj_id"]
+                    if ref.ycbv.id2obj[obj_id] not in self.select_objs:
+                        continue
+                    cur_label = self.cat2label[obj_id]  # 0-based label
+                    R = np.array(anno["cam_R_m2c"], dtype="float32").reshape(3, 3)
+                    t = np.array(anno["cam_t_m2c"], dtype="float32") / 1000.0
+                    pose = np.hstack([R, t.reshape(3, 1)])
+                    quat = mat2quat(R).astype("float32")
+
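+                    # project the 3D centroid with K, then dehomogenize -> (cx, cy) in pixels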
+                    proj = (record["cam"] @ t.T).T
+                    proj = proj[:2] / proj[2]
+
+                    bbox_visib = gt_info_dicts[scene_id][str_im_id][anno_i]["bbox_visib"]
+                    bbox_obj = gt_info_dicts[scene_id][str_im_id][anno_i]["bbox_obj"]
+                    x1, y1, w, h = bbox_visib
+                    if self.filter_invalid:
+                        if h <= 1 or w <= 1:
+                            self.num_instances_without_valid_box += 1
+                            continue
+
+                    mask_file = osp.join(
+                        scene_root,
+                        "mask/{:06d}_{:06d}.png".format(im_id, anno_i),
+                    )
+                    mask_visib_file = osp.join(
+                        scene_root,
+                        "mask_visib/{:06d}_{:06d}.png".format(im_id, anno_i),
+                    )
+                    assert osp.exists(mask_file), mask_file
+                    assert osp.exists(mask_visib_file), mask_visib_file
+                    # load mask visib
+                    mask_single = mmcv.imread(mask_visib_file, "unchanged")
+                    area = mask_single.sum()
+                    if area < 3:  # filter out too small or nearly invisible instances
+                        self.num_instances_without_valid_segmentation += 1
+                        continue
+                    mask_rle = binary_mask_to_rle(mask_single, compressed=True)
+
+                    # load mask full
+                    mask_full = mmcv.imread(mask_file, "unchanged")
+                    mask_full = mask_full.astype("bool")
+                    mask_full_rle = binary_mask_to_rle(mask_full, compressed=True)
+
+                    inst = {
+                        "category_id": cur_label,  # 0-based label
+                        "bbox": bbox_visib,
+                        "bbox_obj": bbox_obj,
+                        "bbox_mode": BoxMode.XYWH_ABS,
+                        "pose": pose,
+                        "quat": quat,
+                        "trans": t,
+                        "centroid_2d": proj,  # absolute (cx, cy)
+                        "segmentation": mask_rle,
+                        "mask_full": mask_full_rle,  # TODO: load as mask_full, rle
+                    }
+
+                    model_info = self.models_info[str(obj_id)]
+                    inst["model_info"] = model_info
+                    # TODO: use the full mask and full xyz
+                    for key in ["bbox3d_and_center"]:
+                        inst[key] = self.models[cur_label][key]
+                    insts.append(inst)
+                if len(insts) == 0:  # filter im without anno
+                    continue
+                record["annotations"] = insts
+                dataset_dicts.append(record)
+
+        if self.num_instances_without_valid_segmentation > 0:
+            logger.warning(
+                "Filtered out {} instances without valid segmentation. "
+                "There might be issues in your dataset generation process.".format(
+                    self.num_instances_without_valid_segmentation
+                )
+            )
+        if self.num_instances_without_valid_box > 0:
+            logger.warning(
+                "Filtered out {} instances without valid box. "
+                "There might be issues in your dataset generation process.".format(self.num_instances_without_valid_box)
+            )
+        ##########################################################################
+        if self.num_to_load > 0:
+            self.num_to_load = min(int(self.num_to_load), len(dataset_dicts))
+            dataset_dicts = dataset_dicts[: self.num_to_load]
+        logger.info("loaded {} dataset dicts, using {}s".format(len(dataset_dicts), time.perf_counter() - t_start))
+
+        mmcv.mkdir_or_exist(osp.dirname(cache_path))
+        mmcv.dump(dataset_dicts, cache_path, protocol=4)
+        logger.info("Dumped dataset_dicts to {}".format(cache_path))
+        return dataset_dicts
+
+    @lazy_property
+    def models_info(self):
+        models_info_path = osp.join(self.models_root, "models_info.json")
+        assert osp.exists(models_info_path), models_info_path
+        models_info = mmcv.load(models_info_path)  # key is str(obj_id)
+        return models_info
+
+    @lazy_property
+    def models(self):
+        """Load models into a list."""
+        cache_path = osp.join(self.models_root, f"models_{self.name}.pkl")
+        if osp.exists(cache_path) and self.use_cache:
+            # dprint("{}: load cached object models from {}".format(self.name, cache_path))
+            return mmcv.load(cache_path)
+
+        models = []
+        for obj_name in self.objs:
+            model = inout.load_ply(
+                osp.join(
+                    self.models_root,
+                    f"obj_{ref.ycbv.obj2id[obj_name]:06d}.ply",
+                ),
+                vertex_scale=self.scale_to_meter,
+            )
+            # NOTE: bbox3d_and_center is computed from the raw vertices without
+            # re-centering; fine for BOP models since they are already centered
+            model["bbox3d_and_center"] = misc.get_bbox3d_and_center(model["pts"])
+
+            models.append(model)
+        logger.info("cache models to {}".format(cache_path))
+        mmcv.dump(models, cache_path, protocol=4)
+        return models
+
+    def image_aspect_ratio(self):
+        return self.width / self.height  # 4/3
+
+
+########### register datasets ############################################################
+
+
+def get_ycbv_metadata(obj_names, ref_key):
+    """task specific metadata."""
+    data_ref = ref.__dict__[ref_key]
+
+    cur_sym_infos = {}  # label based key
+    loaded_models_info = data_ref.get_models_info()
+
+    for i, obj_name in enumerate(obj_names):
+        obj_id = data_ref.obj2id[obj_name]
+        model_info = loaded_models_info[str(obj_id)]
+        if "symmetries_discrete" in model_info or "symmetries_continuous" in model_info:
+            sym_transforms = misc.get_symmetry_transformations(model_info, max_sym_disc_step=0.01)
+            sym_info = np.array([sym["R"] for sym in sym_transforms], dtype=np.float32)
+        else:
+            sym_info = None
+        cur_sym_infos[i] = sym_info
+
+    meta = {"thing_classes": obj_names, "sym_infos": cur_sym_infos}
+    return meta
+
+
+################################################################################
+
+SPLITS_YCBV = dict(
+    ycbv_bop_test=dict(
+        name="ycbv_bop_test",
+        dataset_root=osp.join(DATASETS_ROOT, "BOP_DATASETS/ycbv/test"),
+        models_root=osp.join(DATASETS_ROOT, "BOP_DATASETS/ycbv/models"),
+        objs=ref.ycbv.objects,  # selected objects
+        ann_file=osp.join(DATASETS_ROOT, "BOP_DATASETS/ycbv/test_targets_bop19.json"),
+        scale_to_meter=0.001,
+        with_masks=True,  # (load masks but may not use it)
+        with_depth=True,  # (load depth path here, but may not use it)
+        height=480,
+        width=640,
+        cache_dir=osp.join(PROJ_ROOT, ".cache"),
+        use_cache=True,
+        num_to_load=-1,
+        filter_invalid=False,
+        ref_key="ycbv",
+    )
+)
+
+
+# single objs (num_class is from all objs)
+for obj in ref.ycbv.objects:
+    name = "ycbv_bop_{}_test".format(obj)
+    select_objs = [obj]
+    if name not in SPLITS_YCBV:
+        SPLITS_YCBV[name] = dict(
+            name=name,
+            dataset_root=osp.join(DATASETS_ROOT, "BOP_DATASETS/ycbv/test"),
+            models_root=osp.join(DATASETS_ROOT, "BOP_DATASETS/ycbv/models"),
+            objs=[obj],  # only this obj
+            select_objs=select_objs,  # selected objects
+            ann_file=osp.join(DATASETS_ROOT, "BOP_DATASETS/ycbv/test_targets_bop19.json"),
+            scale_to_meter=0.001,
+            with_masks=True,  # (load masks but may not use it)
+            with_depth=True,  # (load depth path here, but may not use it)
+            height=480,
+            width=640,
+            cache_dir=osp.join(PROJ_ROOT, ".cache"),
+            use_cache=True,
+            num_to_load=-1,
+            filter_invalid=False,
+            ref_key="ycbv",
+        )
+
+
+def register_with_name_cfg(name, data_cfg=None):
+    """Assume pre-defined datasets live in `./datasets`.
+
+    Args:
+        name: dataset name
+        data_cfg: if name is in the existing SPLITS, the pre-defined data_cfg
+            is used; otherwise data_cfg is required
+            (it can be set via cfg.DATA_CFG.name)
+    """
+    dprint("register dataset: {}".format(name))
+    if name in SPLITS_YCBV:
+        used_cfg = SPLITS_YCBV[name]
+    else:
+        assert data_cfg is not None, f"dataset name {name} is not registered"
+        used_cfg = data_cfg
+    DatasetCatalog.register(name, YCBV_BOP_TEST_Dataset(used_cfg))
+    # something like eval_types
+    MetadataCatalog.get(name).set(
+        id="ycbv",  # NOTE: for pvnet to determine module
+        ref_key=used_cfg["ref_key"],
+        objs=used_cfg["objs"],
+        eval_error_types=["ad", "rete", "proj"],
+        evaluator_type="bop",
+        **get_ycbv_metadata(obj_names=used_cfg["objs"], ref_key=used_cfg["ref_key"]),
+    )
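+
+# typical usage (sketch): register a pre-defined split, then fetch its dicts
+#   register_with_name_cfg("ycbv_bop_test")
+#   dicts = DatasetCatalog.get("ycbv_bop_test")  # Detectron2-format dicts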
+
+
+def get_available_datasets():
+    return list(SPLITS_YCBV.keys())
+
+
+#### tests ###############################################
+def test_vis():
+    dset_name = sys.argv[1]
+    assert dset_name in DatasetCatalog.list()
+
+    meta = MetadataCatalog.get(dset_name)
+    dprint("MetadataCatalog: ", meta)
+    objs = meta.objs
+
+    t_start = time.perf_counter()
+    dicts = DatasetCatalog.get(dset_name)
+    logger.info("Done loading {} samples with {:.3f}s.".format(len(dicts), time.perf_counter() - t_start))
+
+    dirname = "output/{}-data-vis".format(dset_name)
+    os.makedirs(dirname, exist_ok=True)
+    for d in dicts:
+        img = read_image_mmcv(d["file_name"], format="BGR")
+        depth = mmcv.imread(d["depth_file"], "unchanged") / d["depth_factor"]
+
+        imH, imW = img.shape[:2]
+        annos = d["annotations"]
+        masks = [cocosegm2mask(anno["segmentation"], imH, imW) for anno in annos]
+        bboxes = [anno["bbox"] for anno in annos]
+        bbox_modes = [anno["bbox_mode"] for anno in annos]
+        bboxes_xyxy = np.array(
+            [BoxMode.convert(box, box_mode, BoxMode.XYXY_ABS) for box, box_mode in zip(bboxes, bbox_modes)]
+        )
+        kpts_3d_list = [anno["bbox3d_and_center"] for anno in annos]
+        quats = [anno["quat"] for anno in annos]
+        transes = [anno["trans"] for anno in annos]
+        Rs = [quat2mat(quat) for quat in quats]
+        # 0-based label
+        cat_ids = [anno["category_id"] for anno in annos]
+        K = d["cam"]
+        kpts_2d = [misc.project_pts(kpt3d, K, R, t) for kpt3d, R, t in zip(kpts_3d_list, Rs, transes)]
+        # TODO: visualize pose and keypoints
+        labels = [objs[cat_id] for cat_id in cat_ids]
+        # img_vis = vis_image_bboxes_cv2(img, bboxes=bboxes_xyxy, labels=labels)
+        img_vis = vis_image_mask_bbox_cv2(img, masks, bboxes=bboxes_xyxy, labels=labels)
+        img_vis_kpts2d = img.copy()
+        for anno_i in range(len(annos)):
+            img_vis_kpts2d = misc.draw_projected_box3d(img_vis_kpts2d, kpts_2d[anno_i])
+        grid_show(
+            [
+                img[:, :, [2, 1, 0]],
+                img_vis[:, :, [2, 1, 0]],
+                img_vis_kpts2d[:, :, [2, 1, 0]],
+                depth,
+            ],
+            [f"img:{d['file_name']}", "vis_img", "img_vis_kpts2d", "depth"],
+            row=2,
+            col=2,
+        )
+
+
+if __name__ == "__main__":
+    """Test the  dataset loader.
+
+    Usage:
+        python -m core.datasets.ycbv_bop_test dataset_name
+    """
+    from lib.vis_utils.image import grid_show
+    from lib.utils.setup_logger import setup_my_logger
+    import detectron2.data.datasets  # noqa # add pre-defined metadata
+    from core.utils.data_utils import read_image_mmcv
+    from lib.vis_utils.image import vis_image_mask_bbox_cv2
+
+    print("sys.argv:", sys.argv)
+    logger = setup_my_logger(name="core")
+    register_with_name_cfg(sys.argv[1])
+    print("dataset catalog: ", DatasetCatalog.list())
+    test_vis()
diff --git a/core/gdrn_modeling/datasets/fruitbin_d2.py b/core/gdrn_modeling/datasets/fruitbin_d2.py
new file mode 100755
index 0000000000000000000000000000000000000000..385f463603ce13afb9ac5c8e022a8fa3907b688e
--- /dev/null
+++ b/core/gdrn_modeling/datasets/fruitbin_d2.py
@@ -0,0 +1,741 @@
+import hashlib
+import copy
+import logging
+import os
+import os.path as osp
+import sys
+
+cur_dir = osp.dirname(osp.abspath(__file__))
+PROJ_ROOT = osp.normpath(osp.join(cur_dir, "../../.."))
+sys.path.insert(0, PROJ_ROOT)
+import time
+from collections import OrderedDict
+import mmcv
+import numpy as np
+from tqdm import tqdm
+from transforms3d.quaternions import mat2quat, quat2mat
+import ref
+from detectron2.data import DatasetCatalog, MetadataCatalog
+from detectron2.structures import BoxMode
+from lib.pysixd import inout, misc
+from lib.utils.mask_utils import binary_mask_to_rle, cocosegm2mask
+from lib.utils.utils import dprint, iprint, lazy_property
+
+
+logger = logging.getLogger(__name__)
+DATASETS_ROOT = osp.normpath(osp.join(PROJ_ROOT, "datasets"))
+
+
+class YCBV_Dataset:
+    """use image_sets(scene/image_id) and image root to get data; Here we use
+    bop models, which are center aligned and have some offsets compared to
+    original models."""
+
+    def __init__(self, data_cfg):
+        """
+        with_depth and with_masks default to True; whether they are actually
+        loaded into the dataloader/network is decided later.
+        """
+        self.name = data_cfg["name"]
+        self.data_cfg = data_cfg
+
+        self.objs = data_cfg["objs"]  # selected objects
+
+        self.ann_files = data_cfg["ann_files"]  # provide scene/im_id list
+        self.image_prefixes = data_cfg["image_prefixes"]  # image root
+
+        self.dataset_root = data_cfg["dataset_root"]  # BOP_DATASETS/ycbv/
+        assert osp.exists(self.dataset_root), self.dataset_root
+        self.models_root = data_cfg["models_root"]  # BOP_DATASETS/ycbv/models
+        self.scale_to_meter = data_cfg["scale_to_meter"]  # 0.001
+
+        self.with_masks = data_cfg["with_masks"]  # True (load masks but may not use it)
+        self.with_depth = data_cfg["with_depth"]  # True (load depth path here, but may not use it)
+        self.with_xyz = data_cfg["with_xyz"]
+
+        self.height = data_cfg["height"]  # 480
+        self.width = data_cfg["width"]  # 640
+
+        self.cache_dir = data_cfg.get("cache_dir", osp.join(PROJ_ROOT, ".cache"))  # .cache
+        self.use_cache = data_cfg.get("use_cache", True)
+        self.num_to_load = data_cfg["num_to_load"]  # -1
+        self.filter_invalid = data_cfg["filter_invalid"]
+
+        self.align_K_by_change_pose = data_cfg.get("align_K_by_change_pose", False)
+        # default: 0000~0059 and synt
+        self.cam = np.array(
+            [
+                [1066.778, 0.0, 312.9869],
+                [0.0, 1067.487, 241.3109],
+                [0.0, 0.0, 1.0],
+            ],
+            dtype="float32",
+        )
+        # 0060~0091
+        # cmu_cam = np.array([[1077.836, 0.0, 323.7872], [0.0, 1078.189, 279.6921], [0.0, 0.0, 1.0]], dtype='float32')
+        ##################################################
+
+        # NOTE: careful! Only the selected objects
+        self.cat_ids = [cat_id for cat_id, obj_name in ref.ycbv.id2obj.items() if obj_name in self.objs]
+        # map selected objs to [0, num_objs-1]
+        self.cat2label = {v: i for i, v in enumerate(self.cat_ids)}  # id_map
+        self.label2cat = {label: cat for cat, label in self.cat2label.items()}
+        self.obj2label = OrderedDict((obj, obj_id) for obj_id, obj in enumerate(self.objs))
+        ##########################################################
+
+    def _load_from_idx_file(self, idx_file, image_root):
+        """
+        idx_file: the scene/image ids
+        image_root/scene contains:
+            scene_gt.json
+            scene_gt_info.json
+            scene_camera.json
+        """
+        xyz_root = osp.join(image_root, "xyz_crop")
+        scene_gt_dicts = {}
+        scene_gt_info_dicts = {}
+        scene_cam_dicts = {}
+        scene_im_ids = []  # store tuples of (scene_id, im_id)
+        with open(idx_file, "r") as f:
+            for line in f:
+                line_split = line.strip("\r\n").split("/")
+                scene_id = int(line_split[0])
+                im_id = int(line_split[1])
+                scene_im_ids.append((scene_id, im_id))
+                if scene_id not in scene_gt_dicts:
+                    scene_gt_file = osp.join(image_root, f"{scene_id:06d}/scene_gt.json")
+                    assert osp.exists(scene_gt_file), scene_gt_file
+                    scene_gt_dicts[scene_id] = mmcv.load(scene_gt_file)
+
+                if scene_id not in scene_gt_info_dicts:
+                    scene_gt_info_file = osp.join(image_root, f"{scene_id:06d}/scene_gt_info.json")
+                    assert osp.exists(scene_gt_info_file), scene_gt_info_file
+                    scene_gt_info_dicts[scene_id] = mmcv.load(scene_gt_info_file)
+
+                if scene_id not in scene_cam_dicts:
+                    scene_cam_file = osp.join(image_root, f"{scene_id:06d}/scene_camera.json")
+                    assert osp.exists(scene_cam_file), scene_cam_file
+                    scene_cam_dicts[scene_id] = mmcv.load(scene_cam_file)
+        ######################################################
+        scene_im_ids = sorted(scene_im_ids)  # sort to make it reproducible
+        dataset_dicts = []
+
+        num_instances_without_valid_segmentation = 0
+        num_instances_without_valid_box = 0
+
+        for (scene_id, im_id) in tqdm(scene_im_ids):
+            rgb_path = osp.join(image_root, f"{scene_id:06d}/rgb/{im_id:06d}.png")
+            assert osp.exists(rgb_path), rgb_path
+            str_im_id = str(im_id)
+
+            scene_im_id = f"{scene_id}/{im_id}"
+
+            # for ycbv/tless, load cam K from image infos
+            cam_anno = np.array(scene_cam_dicts[scene_id][str_im_id]["cam_K"], dtype=np.float32).reshape(3, 3)
+            adapt_this_K = False
+            if self.align_K_by_change_pose:
+                if (cam_anno != self.cam).any():
+                    adapt_this_K = True
+                    cam_anno_ori = cam_anno.copy()
+                    cam_anno = self.cam
+
+            depth_factor = 1000.0 / scene_cam_dicts[scene_id][str_im_id]["depth_scale"]
+            # dprint(record['cam'])
+            if "/train_synt/" in rgb_path:
+                img_type = "syn"
+            else:
+                img_type = "real"
+            record = {
+                "dataset_name": self.name,
+                "file_name": osp.relpath(rgb_path, PROJ_ROOT),
+                "height": self.height,
+                "width": self.width,
+                "image_id": self._unique_im_id,
+                "scene_im_id": scene_im_id,  # for evaluation
+                "cam": cam_anno,  # self.cam,
+                "depth_factor": depth_factor,
+                "img_type": img_type,
+            }
+
+            if self.with_depth:
+                depth_file = osp.join(image_root, f"{scene_id:06d}/depth/{im_id:06d}.png")
+                assert osp.exists(depth_file), depth_file
+                record["depth_file"] = osp.relpath(depth_file, PROJ_ROOT)
+
+            insts = []
+            anno_dict_list = scene_gt_dicts[scene_id][str(im_id)]
+            info_dict_list = scene_gt_info_dicts[scene_id][str(im_id)]
+            for anno_i, anno in enumerate(anno_dict_list):
+                info = info_dict_list[anno_i]
+                obj_id = anno["obj_id"]
+                if obj_id not in self.cat_ids:
+                    continue
+                # 0-based label now
+                cur_label = self.cat2label[obj_id]
+                ################ pose ###########################
+                R = np.array(anno["cam_R_m2c"], dtype="float32").reshape(3, 3)
+                trans = np.array(anno["cam_t_m2c"], dtype="float32") / 1000.0  # mm->m
+                pose = np.hstack([R, trans.reshape(3, 1)])
+                if adapt_this_K:
+                    # pose_uw = inv(K_uw) @ K_cmu @ pose_cmu
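+                    # i.e. K_uw @ pose_new == K_cmu @ pose_old, so the 2D
+                    # projections are preserved (pose_new may not stay rigid)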
+                    pose = np.linalg.inv(cam_anno) @ cam_anno_ori @ pose
+                    # R = pose[:3, :3]
+                    trans = pose[:3, 3]
+
+                quat = mat2quat(pose[:3, :3])
+
+                ############# bbox ############################
+                bbox = info["bbox_obj"]
+                x1, y1, w, h = bbox
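+                # convert XYWH -> XYXY and clip to the image bounds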
+                x2 = x1 + w
+                y2 = y1 + h
+                x1 = max(min(x1, self.width), 0)
+                y1 = max(min(y1, self.height), 0)
+                x2 = max(min(x2, self.width), 0)
+                y2 = max(min(y2, self.height), 0)
+                bbox = [x1, y1, x2, y2]
+                if self.filter_invalid:
+                    bw = bbox[2] - bbox[0]
+                    bh = bbox[3] - bbox[1]
+                    if bh <= 1 or bw <= 1:
+                        num_instances_without_valid_box += 1
+                        continue
+
+                ############## mask #######################
+                if self.with_masks:  # either list[list[float]] or dict(RLE)
+                    mask_visib_file = osp.join(
+                        image_root,
+                        f"{scene_id:06d}/mask_visib/{im_id:06d}_{anno_i:06d}.png",
+                    )
+                    assert osp.exists(mask_visib_file), mask_visib_file
+                    mask = mmcv.imread(mask_visib_file, "unchanged")
+                    area = mask.sum()
+                    if area < 30 and self.filter_invalid:
+                        num_instances_without_valid_segmentation += 1
+                        continue
+                    mask_rle = binary_mask_to_rle(mask)
+
+                    mask_full_file = osp.join(
+                        image_root,
+                        f"{scene_id:06d}/mask/{im_id:06d}_{anno_i:06d}.png",
+                    )
+                    assert osp.exists(mask_full_file), mask_full_file
+
+                    # load mask full
+                    mask_full = mmcv.imread(mask_full_file, "unchanged")
+                    mask_full = mask_full.astype("bool")
+                    mask_full_rle = binary_mask_to_rle(mask_full, compressed=True)
+
+                proj = (self.cam @ trans.T).T  # NOTE: use self.cam here
+                proj = proj[:2] / proj[2]
+
+                inst = {
+                    "category_id": cur_label,  # 0-based label
+                    "bbox": bbox,
+                    "bbox_obj": bbox,
+                    "bbox_mode": BoxMode.XYXY_ABS,
+                    "pose": pose,
+                    "quat": quat,
+                    "trans": trans,
+                    "centroid_2d": proj,  # absolute (cx, cy)
+                    "segmentation": mask_rle,
+                    "mask_full": mask_full_rle,
+                }
+
+                if self.with_xyz:
+                    xyz_path = osp.join(
+                        xyz_root,
+                        f"{scene_id:06d}/{im_id:06d}_{anno_i:06d}-xyz.pkl",
+                    )
+                    # assert osp.exists(xyz_path), xyz_path
+                    inst["xyz_path"] = xyz_path
+
+                model_info = self.models_info[str(obj_id)]
+                inst["model_info"] = model_info
+                # TODO: use the full mask and full xyz
+                for key in ["bbox3d_and_center"]:
+                    inst[key] = self.models[cur_label][key]
+                insts.append(inst)
+            if len(insts) == 0:  # and self.filter_invalid:
+                continue
+            record["annotations"] = insts
+            dataset_dicts.append(record)
+            self._unique_im_id += 1
+
+        if num_instances_without_valid_segmentation > 0:
+            logger.warning(
+                "Filtered out {} instances without valid segmentation. "
+                "There might be issues in your dataset generation process.".format(
+                    num_instances_without_valid_segmentation
+                )
+            )
+        if num_instances_without_valid_box > 0:
+            logger.warning(
+                "Filtered out {} instances without valid box. "
+                "There might be issues in your dataset generation process.".format(num_instances_without_valid_box)
+            )
+        return dataset_dicts
+
+    def __call__(self):  # YCBV_Dataset
+        """Load light-weight instance annotations of all images into a list of
+        dicts in Detectron2 format.
+
+        Do not load heavy data into memory in this file, since we will
+        load the annotations of all images into memory.
+        """
+        # cache the dataset_dicts to avoid loading masks from files
+        hashed_file_name = hashlib.md5(
+            (
+                "".join([str(fn) for fn in self.objs])
+                + "dataset_dicts_{}_{}_{}_{}_{}_{}".format(
+                    self.name,
+                    self.dataset_root,
+                    self.with_masks,
+                    self.with_depth,
+                    self.with_xyz,
+                    __name__,
+                )
+            ).encode("utf-8")
+        ).hexdigest()
+        cache_path = osp.join(
+            self.cache_dir,
+            "dataset_dicts_{}_{}.pkl".format(self.name, hashed_file_name),
+        )
+
+        if osp.exists(cache_path) and self.use_cache:
+            logger.info("load cached dataset dicts from {}".format(cache_path))
+            return mmcv.load(cache_path)
+
+        logger.info("loading dataset dicts: {}".format(self.name))
+        t_start = time.perf_counter()
+        dataset_dicts = []
+        self._unique_im_id = 0
+        for ann_file, image_root in zip(self.ann_files, self.image_prefixes):
+            # logger.info("loading coco json: {}".format(ann_file))
+            dataset_dicts.extend(self._load_from_idx_file(ann_file, image_root))
+
+        ##########################################################################
+        if self.num_to_load > 0:
+            self.num_to_load = min(int(self.num_to_load), len(dataset_dicts))
+            dataset_dicts = dataset_dicts[: self.num_to_load]
+        logger.info("loaded {} dataset dicts, using {}s".format(len(dataset_dicts), time.perf_counter() - t_start))
+
+        mmcv.mkdir_or_exist(osp.dirname(cache_path))
+        mmcv.dump(dataset_dicts, cache_path, protocol=4)
+        logger.info("Dumped dataset_dicts to {}".format(cache_path))
+        return dataset_dicts
+
+    @lazy_property
+    def models_info(self):
+        models_info_path = osp.join(self.models_root, "models_info.json")
+        assert osp.exists(models_info_path), models_info_path
+        models_info = mmcv.load(models_info_path)  # key is str(obj_id)
+        return models_info
+
+    @lazy_property
+    def models(self):
+        """Load models into a list."""
+        cache_path = osp.join(self.models_root, "models_{}.pkl".format(self.name))
+        if osp.exists(cache_path) and self.use_cache:
+            # dprint("{}: load cached object models from {}".format(self.name, cache_path))
+            return mmcv.load(cache_path)
+
+        models = []
+        for obj_name in self.objs:
+            model = inout.load_ply(
+                osp.join(
+                    self.models_root,
+                    f"obj_{ref.ycbv.obj2id[obj_name]:06d}.ply",
+                ),
+                vertex_scale=self.scale_to_meter,
+            )
+            # NOTE: bbox3d_and_center is not computed from re-centered vertices;
+            # for BOP models this is fine since they are already centered
+            model["bbox3d_and_center"] = misc.get_bbox3d_and_center(model["pts"])
+
+            models.append(model)
+        logger.info("cache models to {}".format(cache_path))
+        mmcv.dump(models, cache_path, protocol=4)
+        return models
+
+    def image_aspect_ratio(self):
+        return self.width / self.height  # 4/3
+
+
+########### register datasets ############################################################
+
+
+def get_ycbv_metadata(obj_names, ref_key):
+    """task specific metadata."""
+    data_ref = ref.__dict__[ref_key]
+
+    cur_sym_infos = {}  # label based key
+    loaded_models_info = data_ref.get_models_info()
+
+    for i, obj_name in enumerate(obj_names):
+        obj_id = data_ref.obj2id[obj_name]
+        model_info = loaded_models_info[str(obj_id)]
+        if "symmetries_discrete" in model_info or "symmetries_continuous" in model_info:
+            sym_transforms = misc.get_symmetry_transformations(model_info, max_sym_disc_step=0.01)
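+            # continuous symmetries are discretized with step max_sym_disc_step;
+            # the rotation parts are stacked into an (N, 3, 3) float32 array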
+            sym_info = np.array([sym["R"] for sym in sym_transforms], dtype=np.float32)
+        else:
+            sym_info = None
+        cur_sym_infos[i] = sym_info
+
+    meta = {"thing_classes": obj_names, "sym_infos": cur_sym_infos}
+    return meta
+
+
+ycbv_model_root = "BOP_DATASETS/ycbv/models/"
+################################################################################
+default_cfg = dict(
+    # name="ycbv_train_real",
+    dataset_root=osp.join(DATASETS_ROOT, "BOP_DATASETS/ycbv/"),
+    models_root=osp.join(DATASETS_ROOT, "BOP_DATASETS/ycbv/models"),  # models_simple
+    objs=ref.ycbv.objects,  # all objects
+    # NOTE: this contains all classes
+    # ann_files=[osp.join(DATASETS_ROOT, "BOP_DATASETS/ycbv/image_sets/train.txt")],
+    # image_prefixes=[osp.join(DATASETS_ROOT, "BOP_DATASETS/ycbv/train_real")],
+    scale_to_meter=0.001,
+    with_masks=True,  # (load masks but may not use it)
+    with_depth=True,  # (load depth path here, but may not use it)
+    with_xyz=True,
+    height=480,
+    width=640,
+    align_K_by_change_pose=False,
+    cache_dir=osp.join(PROJ_ROOT, ".cache"),
+    use_cache=True,
+    num_to_load=-1,
+    filter_invalid=True,
+    ref_key="ycbv",
+)
+SPLITS_YCBV = {}
+update_cfgs = {
+    "ycbv_train_real": {
+        "ann_files": [osp.join(DATASETS_ROOT, "BOP_DATASETS/ycbv/image_sets/train.txt")],
+        "image_prefixes": [osp.join(DATASETS_ROOT, "BOP_DATASETS/ycbv/train_real")],
+    },
+    "ycbv_train_real_aligned_Kuw": {
+        "ann_files": [osp.join(DATASETS_ROOT, "BOP_DATASETS/ycbv/image_sets/train.txt")],
+        "image_prefixes": [osp.join(DATASETS_ROOT, "BOP_DATASETS/ycbv/train_real")],
+        "align_K_by_change_pose": True,
+    },
+    "ycbv_train_real_uw": {
+        "ann_files": [osp.join(DATASETS_ROOT, "BOP_DATASETS/ycbv/image_sets/train_real_uw.txt")],
+        "image_prefixes": [osp.join(DATASETS_ROOT, "BOP_DATASETS/ycbv/train_real")],
+    },
+    "ycbv_train_real_uw_every10": {
+        "ann_files": [
+            osp.join(
+                DATASETS_ROOT,
+                "BOP_DATASETS/ycbv/image_sets/train_real_uw_every10.txt",
+            )
+        ],
+        "image_prefixes": [osp.join(DATASETS_ROOT, "BOP_DATASETS/ycbv/train_real")],
+    },
+    "ycbv_train_real_cmu": {
+        "ann_files": [
+            osp.join(
+                DATASETS_ROOT,
+                "BOP_DATASETS/ycbv/image_sets/train_real_cmu.txt",
+            )
+        ],
+        "image_prefixes": [osp.join(DATASETS_ROOT, "BOP_DATASETS/ycbv/train_real")],
+    },
+    "ycbv_train_real_cmu_aligned_Kuw": {
+        "ann_files": [
+            osp.join(
+                DATASETS_ROOT,
+                "BOP_DATASETS/ycbv/image_sets/train_real_cmu.txt",
+            )
+        ],
+        "image_prefixes": [osp.join(DATASETS_ROOT, "BOP_DATASETS/ycbv/train_real")],
+        "align_K_by_change_pose": True,
+    },
+    "ycbv_train_synt": {
+        "ann_files": [osp.join(DATASETS_ROOT, "BOP_DATASETS/ycbv/image_sets/train_synt.txt")],
+        "image_prefixes": [osp.join(DATASETS_ROOT, "BOP_DATASETS/ycbv/train_synt")],
+    },
+    "ycbv_train_synt_50k": {
+        "ann_files": [
+            osp.join(
+                DATASETS_ROOT,
+                "BOP_DATASETS/ycbv/image_sets/train_synt_50k.txt",
+            )
+        ],
+        "image_prefixes": [osp.join(DATASETS_ROOT, "BOP_DATASETS/ycbv/train_synt")],
+    },
+    "ycbv_train_synt_30k": {
+        "ann_files": [
+            osp.join(
+                DATASETS_ROOT,
+                "BOP_DATASETS/ycbv/image_sets/train_synt_30k.txt",
+            )
+        ],
+        "image_prefixes": [osp.join(DATASETS_ROOT, "BOP_DATASETS/ycbv/train_synt")],
+    },
+    "ycbv_train_synt_100": {
+        "ann_files": [
+            osp.join(
+                DATASETS_ROOT,
+                "BOP_DATASETS/ycbv/image_sets/train_synt_100.txt",
+            )
+        ],
+        "image_prefixes": [osp.join(DATASETS_ROOT, "BOP_DATASETS/ycbv/train_synt")],
+    },
+    "ycbv_test": {
+        "ann_files": [osp.join(DATASETS_ROOT, "BOP_DATASETS/ycbv/image_sets/keyframe.txt")],
+        "image_prefixes": [osp.join(DATASETS_ROOT, "BOP_DATASETS/ycbv/test")],
+        "with_xyz": False,
+        "filter_invalid": False,
+    },
+}
+for name, update_cfg in update_cfgs.items():
+    used_cfg = copy.deepcopy(default_cfg)
+    used_cfg["name"] = name
+    used_cfg.update(update_cfg)
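+    # cap tiny debug splits (e.g., ycbv_train_synt_100) at their nominal size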
+    num_to_load = -1
+    if "_100" in name:
+        num_to_load = 100
+    used_cfg["num_to_load"] = num_to_load
+    SPLITS_YCBV[name] = used_cfg
+
+# single object splits ######################################################
+for obj in ref.ycbv.objects:
+    for split in [
+        "train_real",
+        "train_real_aligned_Kuw",
+        "train_real_uw",
+        "train_real_uw_every10",
+        "train_real_cmu",
+        "train_real_cmu_aligned_Kuw",
+        "train_synt",
+        "train_synt_30k",
+        "test",
+    ]:
+        name = "ycbv_{}_{}".format(obj, split)
+        if split in [
+            "train_real",
+            "train_real_aligned_Kuw",
+            "train_real_uw",
+            "train_real_uw_every10",
+            "train_real_cmu",
+            "train_real_cmu_aligned_Kuw",
+            "train_synt",
+            "train_synt_30k",
+        ]:
+            filter_invalid = True
+            with_xyz = True
+        elif split in ["test"]:
+            filter_invalid = False
+            with_xyz = False
+        else:
+            raise ValueError("{}".format(split))
+
+        if split in ["train_real_aligned_Kuw", "train_real_cmu_aligned_Kuw"]:
+            align_K_by_change_pose = True
+        else:
+            align_K_by_change_pose = False
+
+        split_idx_file_dict = {
+            "train_real": ("train_real", "train.txt"),
+            "train_real_aligned_Kuw": ("train_real", "train.txt"),
+            "train_real_uw": ("train_real", "train_real_uw.txt"),
+            "train_real_uw_every10": (
+                "train_real",
+                "train_real_uw_every10.txt",
+            ),
+            "train_real_cmu": ("train_real", "train_real_cmu.txt"),
+            "train_real_cmu_aligned_Kuw": ("train_real", "train_real_cmu.txt"),
+            "train_synt": ("train_synt", "train_synt.txt"),
+            "train_synt_30k": ("train_synt", "train_synt_30k.txt"),
+            "test": ("test", "keyframe.txt"),
+        }
+        root_name, idx_file = split_idx_file_dict[split]
+
+        if name not in SPLITS_YCBV:
+            SPLITS_YCBV[name] = dict(
+                name=name,
+                dataset_root=osp.join(DATASETS_ROOT, "BOP_DATASETS/ycbv/"),
+                models_root=osp.join(DATASETS_ROOT, "BOP_DATASETS/ycbv/models"),
+                objs=[obj],
+                ann_files=[
+                    osp.join(
+                        DATASETS_ROOT,
+                        "BOP_DATASETS/ycbv/image_sets/{}".format(idx_file),
+                    )
+                ],
+                image_prefixes=[osp.join(DATASETS_ROOT, "BOP_DATASETS/ycbv/{}".format(root_name))],
+                scale_to_meter=0.001,
+                with_masks=True,  # (load masks but may not use it)
+                with_depth=True,  # (load depth path here, but may not use it)
+                with_xyz=with_xyz,
+                height=480,
+                width=640,
+                align_K_by_change_pose=align_K_by_change_pose,
+                cache_dir=osp.join(PROJ_ROOT, ".cache"),
+                use_cache=True,
+                num_to_load=-1,
+                filter_invalid=filter_invalid,
+                ref_key="ycbv",
+            )
+
+
+def register_with_name_cfg(name, data_cfg=None):
+    """Assume pre-defined datasets live in `./datasets`.
+
+    Args:
+        name: dataset name; if it is in the existing SPLITS, the
+            pre-defined data_cfg is used.
+        data_cfg: required when name is not pre-registered;
+            it can be set in cfg.DATA_CFG.name.
+    """
+    dprint("register dataset: {}".format(name))
+    if name in SPLITS_YCBV:
+        used_cfg = SPLITS_YCBV[name]
+    else:
+        assert (
+            data_cfg is not None
+        ), f"dataset name {name} is not registered. available datasets: {list(SPLITS_YCBV.keys())}"
+        used_cfg = data_cfg
+    DatasetCatalog.register(name, YCBV_Dataset(used_cfg))
+    # something like eval_types
+    MetadataCatalog.get(name).set(
+        id="ycbv",  # NOTE: for pvnet to determine module
+        ref_key=used_cfg["ref_key"],
+        objs=used_cfg["objs"],
+        eval_error_types=["ad", "rete", "proj"],
+        evaluator_type="bop",
+        **get_ycbv_metadata(obj_names=used_cfg["objs"], ref_key=used_cfg["ref_key"]),
+    )
+
+
+def get_available_datasets():
+    return list(SPLITS_YCBV.keys())
+
+
+#### tests ###############################################
+def test_vis():
+    # python -m core.datasets.ycbv_d2 ycbv_test
+    dataset_name = sys.argv[1]
+    meta = MetadataCatalog.get(dataset_name)
+    t_start = time.perf_counter()
+    dicts = DatasetCatalog.get(dataset_name)
+    with_xyz = "test" not in dataset_name
+    logger.info("Done loading {} samples with {:.3f}s.".format(len(dicts), time.perf_counter() - t_start))
+
+    dirname = "output/ycbv_test-data-vis"
+    os.makedirs(dirname, exist_ok=True)
+    objs = meta.objs
+    for d in dicts:
+        img = read_image_mmcv(d["file_name"], format="BGR")
+        depth = mmcv.imread(d["depth_file"], "unchanged") / 1000.0
+
+        imH, imW = img.shape[:2]
+        annos = d["annotations"]
+        masks = [cocosegm2mask(anno["segmentation"], imH, imW) for anno in annos]
+        bboxes = [anno["bbox"] for anno in annos]
+        bbox_modes = [anno["bbox_mode"] for anno in annos]
+        bboxes_xyxy = np.array(
+            [BoxMode.convert(box, box_mode, BoxMode.XYXY_ABS) for box, box_mode in zip(bboxes, bbox_modes)]
+        )
+        kpts_3d_list = [anno["bbox3d_and_center"] for anno in annos]
+        quats = [anno["quat"] for anno in annos]
+        transes = [anno["trans"] for anno in annos]
+        Rs = [quat2mat(quat) for quat in quats]
+        # 0-based label
+        cat_ids = [anno["category_id"] for anno in annos]
+        K = d["cam"]
+        kpts_2d = [misc.project_pts(kpt3d, K, R, t) for kpt3d, R, t in zip(kpts_3d_list, Rs, transes)]
+        # TODO: visualize pose and keypoints
+        labels = [objs[cat_id] for cat_id in cat_ids]
+        for _i in range(len(annos)):
+            img_vis = vis_image_mask_bbox_cv2(
+                img,
+                masks[_i : _i + 1],
+                bboxes=bboxes_xyxy[_i : _i + 1],
+                labels=labels[_i : _i + 1],
+            )
+            img_vis_kpts2d = misc.draw_projected_box3d(img_vis.copy(), kpts_2d[_i])
+            if with_xyz:
+                xyz_path = annos[_i]["xyz_path"]
+                xyz_info = mmcv.load(xyz_path)
+                x1, y1, x2, y2 = xyz_info["xyxy"]
+                xyz_crop = xyz_info["xyz_crop"].astype(np.float32)
+                xyz = np.zeros((imH, imW, 3), dtype=np.float32)
+                xyz[y1 : y2 + 1, x1 : x2 + 1, :] = xyz_crop
+                xyz_show = get_emb_show(xyz)
+                xyz_crop_show = get_emb_show(xyz_crop)
+                img_xyz = img.copy() / 255.0
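+                # pixels where any xyz channel is nonzero form the foreground of
+                # the xyz map; only those pixels are blended with the image below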
+                mask_xyz = ((xyz[:, :, 0] != 0) | (xyz[:, :, 1] != 0) | (xyz[:, :, 2] != 0)).astype("uint8")
+                fg_idx = np.where(mask_xyz != 0)
+                img_xyz[fg_idx[0], fg_idx[1], :] = (
+                    0.5 * xyz_show[fg_idx[0], fg_idx[1], :3] + 0.5 * img_xyz[fg_idx[0], fg_idx[1], :]
+                )
+                img_xyz_crop = img_xyz[y1 : y2 + 1, x1 : x2 + 1, :]
+                img_vis_crop = img_vis[y1 : y2 + 1, x1 : x2 + 1, :]
+                # diff mask
+                diff_mask_xyz = np.abs(masks[_i] - mask_xyz)[y1 : y2 + 1, x1 : x2 + 1]
+
+                grid_show(
+                    [
+                        img[:, :, [2, 1, 0]],
+                        img_vis[:, :, [2, 1, 0]],
+                        img_vis_kpts2d[:, :, [2, 1, 0]],
+                        depth,
+                        # xyz_show,
+                        diff_mask_xyz,
+                        xyz_crop_show,
+                        img_xyz[:, :, [2, 1, 0]],
+                        img_xyz_crop[:, :, [2, 1, 0]],
+                        img_vis_crop[:, :, ::-1],
+                    ],
+                    [
+                        "img",
+                        "vis_img",
+                        "img_vis_kpts2d",
+                        "depth",
+                        "diff_mask_xyz",
+                        "xyz_crop_show",
+                        "img_xyz",
+                        "img_xyz_crop",
+                        "img_vis_crop",
+                    ],
+                    row=3,
+                    col=3,
+                )
+            else:
+                grid_show(
+                    [
+                        img[:, :, [2, 1, 0]],
+                        img_vis[:, :, [2, 1, 0]],
+                        img_vis_kpts2d[:, :, [2, 1, 0]],
+                        depth,
+                    ],
+                    ["img", "vis_img", "img_vis_kpts2d", "depth"],
+                    row=2,
+                    col=2,
+                )
+
+
+if __name__ == "__main__":
+    """Test the  dataset loader.
+
+    Usage:
+        python -m this_module dataset_name
+        "dataset_name" can be any pre-registered ones
+    """
+    from lib.vis_utils.image import grid_show
+    from lib.utils.setup_logger import setup_my_logger
+
+    import detectron2.data.datasets  # noqa # add pre-defined metadata
+    from lib.vis_utils.image import vis_image_mask_bbox_cv2
+    from core.utils.utils import get_emb_show
+    from core.utils.data_utils import read_image_mmcv
+
+    print("sys.argv:", sys.argv)
+    logger = setup_my_logger(name="core")
+    register_with_name_cfg(sys.argv[1])
+    print("dataset catalog: ", DatasetCatalog.list())
+    test_vis()
diff --git a/core/gdrn_modeling/datasets/fruitbin_pbr.py b/core/gdrn_modeling/datasets/fruitbin_pbr.py
new file mode 100644
index 0000000000000000000000000000000000000000..de7ecb2fa0340ebf2b7923385c0a83edce420ca8
--- /dev/null
+++ b/core/gdrn_modeling/datasets/fruitbin_pbr.py
@@ -0,0 +1,491 @@
+import hashlib
+import logging
+import os
+import os.path as osp
+import sys
+
+cur_dir = osp.dirname(osp.abspath(__file__))
+PROJ_ROOT = osp.normpath(osp.join(cur_dir, "../../.."))
+sys.path.insert(0, PROJ_ROOT)
+import time
+from collections import OrderedDict
+import mmcv
+import numpy as np
+from tqdm import tqdm
+from transforms3d.quaternions import mat2quat, quat2mat
+import ref
+from detectron2.data import DatasetCatalog, MetadataCatalog
+from detectron2.structures import BoxMode
+from lib.pysixd import inout, misc
+from lib.utils.mask_utils import binary_mask_to_rle, cocosegm2mask
+from lib.utils.utils import dprint, iprint, lazy_property
+
+
+logger = logging.getLogger(__name__)
+DATASETS_ROOT = osp.normpath(osp.join(PROJ_ROOT, "datasets"))
+
+
+class YCBV_PBR_Dataset:
+    def __init__(self, data_cfg):
+        """
+        with_depth and with_masks default to True; whether they are actually
+        fed to the dataloader/network is decided later.
+        """
+        self.name = data_cfg["name"]
+        self.data_cfg = data_cfg
+
+        self.objs = data_cfg["objs"]  # selected objects
+
+        self.dataset_root = data_cfg.get(
+            "dataset_root",
+            osp.join(DATASETS_ROOT, "BOP_DATASETS/ycbv/train_pbr"),
+        )
+        self.xyz_root = data_cfg.get("xyz_root", osp.join(self.dataset_root, "xyz_crop"))
+        assert osp.exists(self.dataset_root), self.dataset_root
+        self.models_root = data_cfg["models_root"]  # BOP_DATASETS/ycbv/models
+        self.scale_to_meter = data_cfg["scale_to_meter"]  # 0.001
+
+        self.with_masks = data_cfg["with_masks"]
+        self.with_depth = data_cfg["with_depth"]
+
+        self.height = data_cfg["height"]  # 480
+        self.width = data_cfg["width"]  # 640
+
+        self.cache_dir = data_cfg.get("cache_dir", osp.join(PROJ_ROOT, ".cache"))  # .cache
+        self.use_cache = data_cfg.get("use_cache", True)
+        self.num_to_load = data_cfg["num_to_load"]  # -1
+        self.filter_invalid = data_cfg.get("filter_invalid", True)
+        ##################################################
+
+        # NOTE: careful! Only the selected objects
+        self.cat_ids = [cat_id for cat_id, obj_name in ref.ycbv.id2obj.items() if obj_name in self.objs]
+        # map selected objs to [0, num_objs-1]
+        self.cat2label = {v: i for i, v in enumerate(self.cat_ids)}  # id_map
+        self.label2cat = {label: cat for cat, label in self.cat2label.items()}
+        self.obj2label = OrderedDict((obj, obj_id) for obj_id, obj in enumerate(self.objs))
+        ##########################################################
+
+        self.scenes = [f"{i:06d}" for i in range(50)]
+
+    def __call__(self):
+        """Load light-weight instance annotations of all images into a list of
+        dicts in Detectron2 format.
+
+        Do not load heavy data (e.g., images or masks) here, since the
+        annotations of all images are kept in memory.
+        """
+        # cache the dataset_dicts to avoid loading masks from files
+        hashed_file_name = hashlib.md5(
+            (
+                "".join([str(fn) for fn in self.objs])
+                + "dataset_dicts_{}_{}_{}_{}_{}".format(
+                    self.name,
+                    self.dataset_root,
+                    self.with_masks,
+                    self.with_depth,
+                    __name__,
+                )
+            ).encode("utf-8")
+        ).hexdigest()
+        cache_path = osp.join(
+            self.cache_dir,
+            "dataset_dicts_{}_{}.pkl".format(self.name, hashed_file_name),
+        )
+
+        if osp.exists(cache_path) and self.use_cache:
+            logger.info("load cached dataset dicts from {}".format(cache_path))
+            return mmcv.load(cache_path)
+
+        t_start = time.perf_counter()
+
+        logger.info("loading dataset dicts: {}".format(self.name))
+        self.num_instances_without_valid_segmentation = 0
+        self.num_instances_without_valid_box = 0
+        dataset_dicts = []  # ######################################################
+        # it is slow because of loading and converting masks to rle
+        for scene in tqdm(self.scenes):
+            scene_id = int(scene)
+            scene_root = osp.join(self.dataset_root, scene)
+
+            gt_dict = mmcv.load(osp.join(scene_root, "scene_gt.json"))
+            gt_info_dict = mmcv.load(osp.join(scene_root, "scene_gt_info.json"))
+            cam_dict = mmcv.load(osp.join(scene_root, "scene_camera.json"))
+
+            for str_im_id in tqdm(gt_dict, postfix=f"{scene_id}"):
+                int_im_id = int(str_im_id)
+                rgb_path = osp.join(scene_root, "rgb/{:06d}.jpg".format(int_im_id))
+                assert osp.exists(rgb_path), rgb_path
+
+                depth_path = osp.join(scene_root, "depth/{:06d}.png".format(int_im_id))
+
+                scene_im_id = f"{scene_id}/{int_im_id}"
+
+                K = np.array(cam_dict[str_im_id]["cam_K"], dtype=np.float32).reshape(3, 3)
+                depth_factor = 1000.0 / cam_dict[str_im_id]["depth_scale"]  # e.g., 10000.0 for depth_scale 0.1
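+                # raw depth PNG values divided by depth_factor give depth in meters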
+
+                record = {
+                    "dataset_name": self.name,
+                    "file_name": osp.relpath(rgb_path, PROJ_ROOT),
+                    "depth_file": osp.relpath(depth_path, PROJ_ROOT),
+                    "height": self.height,
+                    "width": self.width,
+                    "image_id": int_im_id,
+                    "scene_im_id": scene_im_id,  # for evaluation
+                    "cam": K,
+                    "depth_factor": depth_factor,
+                    "img_type": "syn_pbr",  # NOTE: has background
+                }
+                insts = []
+                for anno_i, anno in enumerate(gt_dict[str_im_id]):
+                    obj_id = anno["obj_id"]
+                    if obj_id not in self.cat_ids:
+                        continue
+                    cur_label = self.cat2label[obj_id]  # 0-based label
+                    R = np.array(anno["cam_R_m2c"], dtype="float32").reshape(3, 3)
+                    t = np.array(anno["cam_t_m2c"], dtype="float32") / 1000.0
+                    pose = np.hstack([R, t.reshape(3, 1)])
+                    quat = mat2quat(R).astype("float32")
+
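+                    # project the object center t into the image: K @ t, then
+                    # divide by depth to get the absolute 2D centroid (cx, cy)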
+                    proj = (record["cam"] @ t.T).T
+                    proj = proj[:2] / proj[2]
+
+                    bbox_visib = gt_info_dict[str_im_id][anno_i]["bbox_visib"]
+                    bbox_obj = gt_info_dict[str_im_id][anno_i]["bbox_obj"]
+                    x1, y1, w, h = bbox_visib
+                    if self.filter_invalid:
+                        if h <= 1 or w <= 1:
+                            self.num_instances_without_valid_box += 1
+                            continue
+
+                    mask_file = osp.join(
+                        scene_root,
+                        "mask/{:06d}_{:06d}.png".format(int_im_id, anno_i),
+                    )
+                    mask_visib_file = osp.join(
+                        scene_root,
+                        "mask_visib/{:06d}_{:06d}.png".format(int_im_id, anno_i),
+                    )
+                    assert osp.exists(mask_file), mask_file
+                    assert osp.exists(mask_visib_file), mask_visib_file
+                    # load mask visib  TODO: load both mask_visib and mask_full
+                    mask_single = mmcv.imread(mask_visib_file, "unchanged")
+                    area = mask_single.sum()
+                    if area <= 64:  # filter out too small or nearly invisible instances
+                        self.num_instances_without_valid_segmentation += 1
+                        continue
+                    mask_rle = binary_mask_to_rle(mask_single, compressed=True)
+
+                    # load mask full
+                    mask_full = mmcv.imread(mask_file, "unchanged")
+                    mask_full = mask_full.astype("bool")
+                    mask_full_rle = binary_mask_to_rle(mask_full, compressed=True)
+
+                    visib_fract = gt_info_dict[str_im_id][anno_i].get("visib_fract", 1.0)
+
+                    xyz_path = osp.join(self.xyz_root, f"{scene_id:06d}/{int_im_id:06d}_{anno_i:06d}-xyz.pkl")
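+                    # the -xyz.pkl is expected to hold {"xyxy": crop bbox,
+                    # "xyz_crop": per-pixel 3D model coordinates inside the crop}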
+                    # assert osp.exists(xyz_path), xyz_path
+                    inst = {
+                        "category_id": cur_label,  # 0-based label
+                        "bbox": bbox_visib,  # TODO: load both bbox_obj and bbox_visib
+                        "bbox_obj": bbox_obj,
+                        "bbox_mode": BoxMode.XYWH_ABS,
+                        "pose": pose,
+                        "quat": quat,
+                        "trans": t,
+                        "centroid_2d": proj,  # absolute (cx, cy)
+                        "segmentation": mask_rle,
+                        "mask_full": mask_full_rle,  # TODO: load as mask_full, rle
+                        "visib_fract": visib_fract,
+                        "xyz_path": xyz_path,
+                    }
+
+                    model_info = self.models_info[str(obj_id)]
+                    inst["model_info"] = model_info
+                    # TODO: using full mask and full xyz
+                    for key in ["bbox3d_and_center"]:
+                        inst[key] = self.models[cur_label][key]
+                    insts.append(inst)
+                if len(insts) == 0:  # filter im without anno
+                    continue
+                record["annotations"] = insts
+                dataset_dicts.append(record)
+
+        if self.num_instances_without_valid_segmentation > 0:
+            logger.warning(
+                "Filtered out {} instances without valid segmentation. "
+                "There might be issues in your dataset generation process.".format(
+                    self.num_instances_without_valid_segmentation
+                )
+            )
+        if self.num_instances_without_valid_box > 0:
+            logger.warning(
+                "Filtered out {} instances without valid box. "
+                "There might be issues in your dataset generation process.".format(self.num_instances_without_valid_box)
+            )
+        ##########################################################################
+        if self.num_to_load > 0:
+            self.num_to_load = min(int(self.num_to_load), len(dataset_dicts))
+            dataset_dicts = dataset_dicts[: self.num_to_load]
+        logger.info("loaded {} dataset dicts, using {}s".format(len(dataset_dicts), time.perf_counter() - t_start))
+
+        mmcv.mkdir_or_exist(osp.dirname(cache_path))
+        mmcv.dump(dataset_dicts, cache_path, protocol=4)
+        logger.info("Dumped dataset_dicts to {}".format(cache_path))
+        return dataset_dicts
+
+    @lazy_property
+    def models_info(self):
+        models_info_path = osp.join(self.models_root, "models_info.json")
+        assert osp.exists(models_info_path), models_info_path
+        models_info = mmcv.load(models_info_path)  # key is str(obj_id)
+        return models_info
+
+    @lazy_property
+    def models(self):
+        """Load models into a list."""
+        cache_path = osp.join(self.models_root, "models_{}.pkl".format(self.name))
+        if osp.exists(cache_path) and self.use_cache:
+            # dprint("{}: load cached object models from {}".format(self.name, cache_path))
+            return mmcv.load(cache_path)
+
+        models = []
+        for obj_name in self.objs:
+            model = inout.load_ply(
+                osp.join(
+                    self.models_root,
+                    f"obj_{ref.ycbv.obj2id[obj_name]:06d}.ply",
+                ),
+                vertex_scale=self.scale_to_meter,
+            )
+            # NOTE: bbox3d_and_center is not computed from re-centered vertices;
+            # for BOP models this is fine since they are already centered
+            model["bbox3d_and_center"] = misc.get_bbox3d_and_center(model["pts"])
+
+            models.append(model)
+        logger.info("cache models to {}".format(cache_path))
+        mmcv.dump(models, cache_path, protocol=4)
+        return models
+
+    def image_aspect_ratio(self):
+        return self.width / self.height  # 4/3
+
+
+########### register datasets ############################################################
+
+
+def get_ycbv_metadata(obj_names, ref_key):
+    """task specific metadata."""
+    data_ref = ref.__dict__[ref_key]
+
+    cur_sym_infos = {}  # label based key
+    loaded_models_info = data_ref.get_models_info()
+
+    for i, obj_name in enumerate(obj_names):
+        obj_id = data_ref.obj2id[obj_name]
+        model_info = loaded_models_info[str(obj_id)]
+        if "symmetries_discrete" in model_info or "symmetries_continuous" in model_info:
+            sym_transforms = misc.get_symmetry_transformations(model_info, max_sym_disc_step=0.01)
+            sym_info = np.array([sym["R"] for sym in sym_transforms], dtype=np.float32)
+        else:
+            sym_info = None
+        cur_sym_infos[i] = sym_info
+
+    meta = {"thing_classes": obj_names, "sym_infos": cur_sym_infos}
+    return meta
+
+
+ycbv_model_root = "BOP_DATASETS/ycbv/models/"
+################################################################################
+
+
+SPLITS_YCBV_PBR = dict(
+    ycbv_train_pbr=dict(
+        name="ycbv_train_pbr",
+        objs=ref.ycbv.objects,  # selected objects
+        dataset_root=osp.join(DATASETS_ROOT, "BOP_DATASETS/ycbv/train_pbr"),
+        models_root=osp.join(DATASETS_ROOT, "BOP_DATASETS/ycbv/models"),
+        xyz_root=osp.join(DATASETS_ROOT, "BOP_DATASETS/ycbv/train_pbr/xyz_crop"),
+        scale_to_meter=0.001,
+        with_masks=True,  # (load masks but may not use it)
+        with_depth=True,  # (load depth path here, but may not use it)
+        height=480,
+        width=640,
+        use_cache=True,
+        num_to_load=-1,
+        filter_invalid=True,
+        ref_key="ycbv",
+    )
+)
+
+# single obj splits
+for obj in ref.ycbv.objects:
+    for split in ["train_pbr"]:
+        name = "ycbv_{}_{}".format(obj, split)
+        if split in ["train_pbr"]:
+            filter_invalid = True
+        elif split in ["test"]:
+            filter_invalid = False
+        else:
+            raise ValueError("{}".format(split))
+        if name not in SPLITS_YCBV_PBR:
+            SPLITS_YCBV_PBR[name] = dict(
+                name=name,
+                objs=[obj],  # only this obj
+                dataset_root=osp.join(DATASETS_ROOT, "BOP_DATASETS/ycbv/train_pbr"),
+                models_root=osp.join(DATASETS_ROOT, "BOP_DATASETS/ycbv/models"),
+                xyz_root=osp.join(DATASETS_ROOT, "BOP_DATASETS/ycbv/train_pbr/xyz_crop"),
+                scale_to_meter=0.001,
+                with_masks=True,  # (load masks but may not use it)
+                with_depth=True,  # (load depth path here, but may not use it)
+                height=480,
+                width=640,
+                use_cache=True,
+                num_to_load=-1,
+                filter_invalid=filter_invalid,
+                ref_key="ycbv",
+            )
+
+
+def register_with_name_cfg(name, data_cfg=None):
+    """Assume pre-defined datasets live in `./datasets`.
+
+    Args:
+        name: dataset name; if it is in the existing SPLITS, the
+            pre-defined data_cfg is used.
+        data_cfg: required when name is not pre-registered;
+            it can be set in cfg.DATA_CFG.name.
+    """
+    dprint("register dataset: {}".format(name))
+    if name in SPLITS_YCBV_PBR:
+        used_cfg = SPLITS_YCBV_PBR[name]
+    else:
+        assert data_cfg is not None, f"dataset name {name} is not registered"
+        used_cfg = data_cfg
+    DatasetCatalog.register(name, YCBV_PBR_Dataset(used_cfg))
+    # something like eval_types
+    MetadataCatalog.get(name).set(
+        id="ycbv",  # NOTE: for pvnet to determine module
+        ref_key=used_cfg["ref_key"],
+        objs=used_cfg["objs"],
+        eval_error_types=["ad", "rete", "proj"],
+        evaluator_type="bop",
+        **get_ycbv_metadata(obj_names=used_cfg["objs"], ref_key=used_cfg["ref_key"]),
+    )
+
+
+def get_available_datasets():
+    return list(SPLITS_YCBV_PBR.keys())
+
+
+#### tests ###############################################
+def test_vis():
+    dset_name = sys.argv[1]
+    assert dset_name in DatasetCatalog.list()
+
+    meta = MetadataCatalog.get(dset_name)
+    dprint("MetadataCatalog: ", meta)
+    objs = meta.objs
+
+    t_start = time.perf_counter()
+    dicts = DatasetCatalog.get(dset_name)
+    logger.info("Done loading {} samples with {:.3f}s.".format(len(dicts), time.perf_counter() - t_start))
+
+    dirname = "output/{}-data-vis".format(dset_name)
+    os.makedirs(dirname, exist_ok=True)
+    for d in dicts:
+        img = read_image_mmcv(d["file_name"], format="BGR")
+        depth = mmcv.imread(d["depth_file"], "unchanged") / 10000.0
+
+        imH, imW = img.shape[:2]
+        annos = d["annotations"]
+        masks = [cocosegm2mask(anno["segmentation"], imH, imW) for anno in annos]
+        bboxes = [anno["bbox"] for anno in annos]
+        bbox_modes = [anno["bbox_mode"] for anno in annos]
+        bboxes_xyxy = np.array(
+            [BoxMode.convert(box, box_mode, BoxMode.XYXY_ABS) for box, box_mode in zip(bboxes, bbox_modes)]
+        )
+        kpts_3d_list = [anno["bbox3d_and_center"] for anno in annos]
+        quats = [anno["quat"] for anno in annos]
+        transes = [anno["trans"] for anno in annos]
+        Rs = [quat2mat(quat) for quat in quats]
+        # 0-based label
+        cat_ids = [anno["category_id"] for anno in annos]
+        K = d["cam"]
+        kpts_2d = [misc.project_pts(kpt3d, K, R, t) for kpt3d, R, t in zip(kpts_3d_list, Rs, transes)]
+
+        labels = [objs[cat_id] for cat_id in cat_ids]
+        for _i in range(len(annos)):
+            img_vis = vis_image_mask_bbox_cv2(
+                img,
+                masks[_i : _i + 1],
+                bboxes=bboxes_xyxy[_i : _i + 1],
+                labels=labels[_i : _i + 1],
+            )
+            img_vis_kpts2d = misc.draw_projected_box3d(img_vis.copy(), kpts_2d[_i])
+            xyz_path = annos[_i]["xyz_path"]
+            xyz_info = mmcv.load(xyz_path)
+            x1, y1, x2, y2 = xyz_info["xyxy"]
+            xyz_crop = xyz_info["xyz_crop"].astype(np.float32)
+            xyz = np.zeros((imH, imW, 3), dtype=np.float32)
+            xyz[y1 : y2 + 1, x1 : x2 + 1, :] = xyz_crop
+            xyz_show = get_emb_show(xyz)
+            xyz_crop_show = get_emb_show(xyz_crop)
+            img_xyz = img.copy() / 255.0
+            mask_xyz = ((xyz[:, :, 0] != 0) | (xyz[:, :, 1] != 0) | (xyz[:, :, 2] != 0)).astype("uint8")
+            fg_idx = np.where(mask_xyz != 0)
+            img_xyz[fg_idx[0], fg_idx[1], :] = xyz_show[fg_idx[0], fg_idx[1], :3]
+            img_xyz_crop = img_xyz[y1 : y2 + 1, x1 : x2 + 1, :]
+            img_vis_crop = img_vis[y1 : y2 + 1, x1 : x2 + 1, :]
+            # diff mask
+            diff_mask_xyz = np.abs(masks[_i] - mask_xyz)[y1 : y2 + 1, x1 : x2 + 1]
+
+            grid_show(
+                [
+                    img[:, :, [2, 1, 0]],
+                    img_vis[:, :, [2, 1, 0]],
+                    img_vis_kpts2d[:, :, [2, 1, 0]],
+                    depth,
+                    # xyz_show,
+                    diff_mask_xyz,
+                    xyz_crop_show,
+                    img_xyz[:, :, [2, 1, 0]],
+                    img_xyz_crop[:, :, [2, 1, 0]],
+                    img_vis_crop,
+                ],
+                [
+                    "img",
+                    "vis_img",
+                    "img_vis_kpts2d",
+                    "depth",
+                    "diff_mask_xyz",
+                    "xyz_crop_show",
+                    "img_xyz",
+                    "img_xyz_crop",
+                    "img_vis_crop",
+                ],
+                row=3,
+                col=3,
+            )
+
+
+if __name__ == "__main__":
+    """Test the  dataset loader.
+
+    Usage:
+        python -m this_module ycbv_pbr_train
+    """
+    from lib.vis_utils.image import grid_show
+    from lib.utils.setup_logger import setup_my_logger
+
+    import detectron2.data.datasets  # noqa # add pre-defined metadata
+    from lib.vis_utils.image import vis_image_mask_bbox_cv2
+    from core.utils.utils import get_emb_show
+    from core.utils.data_utils import read_image_mmcv
+
+    print("sys.argv:", sys.argv)
+    logger = setup_my_logger(name="core")
+    register_with_name_cfg(sys.argv[1])
+    print("dataset catalog: ", DatasetCatalog.list())
+
+    test_vis()
diff --git a/core/gdrn_modeling/tools/fruitbin/__init__.py b/core/gdrn_modeling/tools/fruitbin/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/core/gdrn_modeling/tools/fruitbin/convert_det_to_our_format.py b/core/gdrn_modeling/tools/fruitbin/convert_det_to_our_format.py
new file mode 100644
index 0000000000000000000000000000000000000000..e5de9ead58a7b171868b0c0344d3afb7d4b5e918
--- /dev/null
+++ b/core/gdrn_modeling/tools/fruitbin/convert_det_to_our_format.py
@@ -0,0 +1,69 @@
+import json
+
+import mmcv
+from tqdm import tqdm
+
+path = "/data2/lxy/Storage/bop22_results/yolovx_amodal/ycbv/yolox_x_640_ycbv_real_pbr_ycbv_bop_test.json"
+ds = mmcv.load(path)
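+# each entry is a per-detection dict with scene_id, image_id, category_id,
+# score, bbox and time fields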
+
+outs = {}
+for d in tqdm(ds):
+    scene_id = d["scene_id"]
+    image_id = d["image_id"]
+    scene_im_id = f"{scene_id}/{image_id}"
+
+    obj_id = d["category_id"]
+    score = d["score"]
+
+    bbox = d["bbox"]
+    time = d["time"]
+
+    cur_dict = {
+        "bbox_est": bbox,
+        "obj_id": obj_id,
+        "score": score,
+        "time": time,
+    }
+
+    outs.setdefault(scene_im_id, []).append(cur_dict)
+
+
+def save_json(path, content, sort=False):
+    """Saves the provided content to a JSON file.
+
+    :param path: Path to the output JSON file.
+    :param content: Dictionary/list to save.
+    :param sort: Whether to sort dictionary items by key before writing.
+    """
+    with open(path, "w") as f:
+        if isinstance(content, dict):
+            f.write("{\n")
+            if sort:
+                content_sorted = sorted(content.items(), key=lambda x: x[0])
+            else:
+                content_sorted = content.items()
+            for elem_id, (k, v) in enumerate(content_sorted):
+                f.write('  "{}": {}'.format(k, json.dumps(v, sort_keys=True)))
+                if elem_id != len(content) - 1:
+                    f.write(",")
+                f.write("\n")
+            f.write("}")
+
+        elif isinstance(content, list):
+            f.write("[\n")
+            for elem_id, elem in enumerate(content):
+                f.write("  {}".format(json.dumps(elem, sort_keys=True)))
+                if elem_id != len(content) - 1:
+                    f.write(",")
+                f.write("\n")
+            f.write("]")
+
+        else:
+            json.dump(content, f, sort_keys=True)
+
+
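+# the output json maps "scene_im_id" ("scene_id/im_id") to the list of
+# {"bbox_est", "obj_id", "score", "time"} dicts collected for that image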
+save_json("datasets/BOP_DATASETS/ycbv/test/test_bboxes/yolox_x_640_ycbv_real_pbr_ycbv_bop_test.json", outs)
diff --git a/det/yolox/data/datasets/fruitbin_bop_test.py b/det/yolox/data/datasets/fruitbin_bop_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..c9a9324ef3bfaeb485a7e798548fc6a0695615ad
--- /dev/null
+++ b/det/yolox/data/datasets/fruitbin_bop_test.py
@@ -0,0 +1,454 @@
+import hashlib
+import logging
+import os
+import os.path as osp
+import sys
+
+cur_dir = osp.dirname(osp.abspath(__file__))
+PROJ_ROOT = osp.normpath(osp.join(cur_dir, "../../../.."))
+sys.path.insert(0, PROJ_ROOT)
+import time
+from collections import OrderedDict
+import mmcv
+import numpy as np
+from tqdm import tqdm
+from transforms3d.quaternions import mat2quat, quat2mat
+import ref
+from detectron2.data import DatasetCatalog, MetadataCatalog
+from detectron2.structures import BoxMode
+from lib.pysixd import inout, misc
+from lib.utils.mask_utils import binary_mask_to_rle, cocosegm2mask
+from lib.utils.utils import dprint, iprint, lazy_property
+
+
+logger = logging.getLogger(__name__)
+DATASETS_ROOT = osp.normpath(osp.join(PROJ_ROOT, "datasets"))
+
+
+class YCBV_BOP_TEST_Dataset:
+    """ycbv bop test."""
+
+    def __init__(self, data_cfg):
+        """
+        with_depth and with_masks default to True; whether they are actually
+        fed to the dataloader/network is decided later.
+        """
+        self.name = data_cfg["name"]
+        self.data_cfg = data_cfg
+
+        self.objs = data_cfg["objs"]  # selected objects
+        # all classes are self.objs, but this enables us to evaluate on selected objs
+        self.select_objs = data_cfg.get("select_objs", self.objs)
+
+        self.ann_file = data_cfg["ann_file"]  # json file with scene_id and im_id items
+
+        self.dataset_root = data_cfg["dataset_root"]  # BOP_DATASETS/ycbv/test
+        self.models_root = data_cfg["models_root"]  # BOP_DATASETS/ycbv/models
+        self.scale_to_meter = data_cfg["scale_to_meter"]  # 0.001
+
+        self.with_masks = data_cfg["with_masks"]  # True (load masks but may not use it)
+        self.with_depth = data_cfg["with_depth"]  # True (load depth path here, but may not use it)
+
+        self.height = data_cfg["height"]  # 480
+        self.width = data_cfg["width"]  # 640
+
+        self.cache_dir = data_cfg.get("cache_dir", osp.join(PROJ_ROOT, ".cache"))  # .cache
+        self.use_cache = data_cfg.get("use_cache", True)
+        self.num_to_load = data_cfg["num_to_load"]  # -1
+        self.filter_invalid = data_cfg["filter_invalid"]
+        ##################################################
+
+        # NOTE: careful! Only the selected objects
+        self.cat_ids = [cat_id for cat_id, obj_name in ref.ycbv.id2obj.items() if obj_name in self.objs]
+        # map selected objs to [0, num_objs-1]
+        self.cat2label = {v: i for i, v in enumerate(self.cat_ids)}  # id_map
+        self.label2cat = {label: cat for cat, label in self.cat2label.items()}
+        self.obj2label = OrderedDict((obj, obj_id) for obj_id, obj in enumerate(self.objs))
+        ##########################################################
+
+    def __call__(self):
+        """Load light-weight instance annotations of all images into a list of
+        dicts in Detectron2 format.
+
+        Do not load heavy data (e.g., images or masks) here, since the
+        annotations of all images are kept in memory.
+        """
+        # cache the dataset_dicts to avoid loading masks from files
+        hashed_file_name = hashlib.md5(
+            (
+                "".join([str(fn) for fn in self.objs])
+                + "dataset_dicts_{}_{}_{}_{}_{}".format(
+                    self.name,
+                    self.dataset_root,
+                    self.with_masks,
+                    self.with_depth,
+                    __name__,
+                )
+            ).encode("utf-8")
+        ).hexdigest()
+        cache_path = osp.join(
+            self.cache_dir,
+            "dataset_dicts_{}_{}.pkl".format(self.name, hashed_file_name),
+        )
+
+        if osp.exists(cache_path) and self.use_cache:
+            logger.info("load cached dataset dicts from {}".format(cache_path))
+            return mmcv.load(cache_path)
+
+        t_start = time.perf_counter()
+
+        logger.info("loading dataset dicts: {}".format(self.name))
+        self.num_instances_without_valid_segmentation = 0
+        self.num_instances_without_valid_box = 0
+        dataset_dicts = []  # ######################################################
+        im_id_global = 0
+
+        if True:
+            targets = mmcv.load(self.ann_file)
+            scene_im_ids = [(item["scene_id"], item["im_id"]) for item in targets]
+            scene_im_ids = sorted(list(set(scene_im_ids)))
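+            # the targets file lists one entry per target object, so duplicate
+            # (scene_id, im_id) pairs are collapsed into unique images here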
+
+            # load infos for each scene
+            gt_dicts = {}
+            gt_info_dicts = {}
+            cam_dicts = {}
+            for scene_id, im_id in scene_im_ids:
+                scene_root = osp.join(self.dataset_root, f"{scene_id:06d}")
+                if scene_id not in gt_dicts:
+                    gt_dicts[scene_id] = mmcv.load(osp.join(scene_root, "scene_gt.json"))
+                if scene_id not in gt_info_dicts:
+                    gt_info_dicts[scene_id] = mmcv.load(
+                        osp.join(scene_root, "scene_gt_info.json")
+                    )  # bbox_obj, bbox_visib
+                if scene_id not in cam_dicts:
+                    cam_dicts[scene_id] = mmcv.load(osp.join(scene_root, "scene_camera.json"))
+
+            for scene_id, im_id in tqdm(scene_im_ids):
+                str_im_id = str(im_id)
+                scene_root = osp.join(self.dataset_root, f"{scene_id:06d}")
+                rgb_path = osp.join(scene_root, "rgb/{:06d}.png".format(im_id))
+                assert osp.exists(rgb_path), rgb_path
+
+                depth_path = osp.join(scene_root, "depth/{:06d}.png".format(im_id))
+
+                scene_id = int(rgb_path.split("/")[-3])
+
+                cam = np.array(cam_dicts[scene_id][str_im_id]["cam_K"], dtype=np.float32).reshape(3, 3)
+                depth_factor = 1000.0 / cam_dicts[scene_id][str_im_id]["depth_scale"]
+                record = {
+                    "dataset_name": self.name,
+                    "file_name": osp.relpath(rgb_path, PROJ_ROOT),
+                    "depth_file": osp.relpath(depth_path, PROJ_ROOT),
+                    "depth_factor": depth_factor,
+                    "height": self.height,
+                    "width": self.width,
+                    "image_id": im_id_global,  # unique image_id in the dataset, for coco evaluation
+                    "scene_im_id": "{}/{}".format(scene_id, im_id),  # for evaluation
+                    "cam": cam,
+                    "img_type": "real",
+                }
+                im_id_global += 1
+                insts = []
+                for anno_i, anno in enumerate(gt_dicts[scene_id][str_im_id]):
+                    obj_id = anno["obj_id"]
+                    if ref.ycbv.id2obj[obj_id] not in self.select_objs:
+                        continue
+                    cur_label = self.cat2label[obj_id]  # 0-based label
+                    R = np.array(anno["cam_R_m2c"], dtype="float32").reshape(3, 3)
+                    t = np.array(anno["cam_t_m2c"], dtype="float32") / 1000.0
+                    pose = np.hstack([R, t.reshape(3, 1)])
+                    quat = mat2quat(R).astype("float32")
+
+                    proj = (record["cam"] @ t.T).T
+                    proj = proj[:2] / proj[2]
+
+                    bbox_visib = gt_info_dicts[scene_id][str_im_id][anno_i]["bbox_visib"]
+                    bbox_obj = gt_info_dicts[scene_id][str_im_id][anno_i]["bbox_obj"]
+                    x1, y1, w, h = bbox_visib
+                    if self.filter_invalid:
+                        if h <= 1 or w <= 1:
+                            self.num_instances_without_valid_box += 1
+                            continue
+
+                    mask_file = osp.join(
+                        scene_root,
+                        "mask/{:06d}_{:06d}.png".format(im_id, anno_i),
+                    )
+                    mask_visib_file = osp.join(
+                        scene_root,
+                        "mask_visib/{:06d}_{:06d}.png".format(im_id, anno_i),
+                    )
+                    assert osp.exists(mask_file), mask_file
+                    assert osp.exists(mask_visib_file), mask_visib_file
+                    # load mask visib
+                    mask_single = mmcv.imread(mask_visib_file, "unchanged")
+                    area = mask_single.sum()
+                    if area < 3:  # filter out too small or nearly invisible instances
+                        self.num_instances_without_valid_segmentation += 1
+                        continue
+                    mask_rle = binary_mask_to_rle(mask_single, compressed=True)
+
+                    # load mask full
+                    mask_full = mmcv.imread(mask_file, "unchanged")
+                    mask_full = mask_full.astype("bool")
+                    mask_full_rle = binary_mask_to_rle(mask_full, compressed=True)
+
+                    inst = {
+                        "category_id": cur_label,  # 0-based label
+                        "bbox": bbox_obj,  # TODO: load both bbox_obj and bbox_visib
+                        "bbox_mode": BoxMode.XYWH_ABS,
+                        "pose": pose,
+                        "quat": quat,
+                        "trans": t,
+                        "centroid_2d": proj,  # absolute (cx, cy)
+                        "segmentation": mask_rle,
+                        "mask_full": mask_full_rle,  # TODO: load as mask_full, rle
+                    }
+
+                    model_info = self.models_info[str(obj_id)]
+                    inst["model_info"] = model_info
+                    # TODO: using full mask and full xyz
+                    for key in ["bbox3d_and_center"]:
+                        inst[key] = self.models[cur_label][key]
+                    insts.append(inst)
+                if len(insts) == 0:  # filter im without anno
+                    continue
+                record["annotations"] = insts
+                dataset_dicts.append(record)
+
+        if self.num_instances_without_valid_segmentation > 0:
+            logger.warning(
+                "Filtered out {} instances without valid segmentation. "
+                "There might be issues in your dataset generation process.".format(
+                    self.num_instances_without_valid_segmentation
+                )
+            )
+        if self.num_instances_without_valid_box > 0:
+            logger.warning(
+                "Filtered out {} instances without valid box. "
+                "There might be issues in your dataset generation process.".format(self.num_instances_without_valid_box)
+            )
+        ##########################################################################
+        if self.num_to_load > 0:
+            self.num_to_load = min(int(self.num_to_load), len(dataset_dicts))
+            dataset_dicts = dataset_dicts[: self.num_to_load]
+        logger.info("loaded {} dataset dicts, using {}s".format(len(dataset_dicts), time.perf_counter() - t_start))
+
+        mmcv.mkdir_or_exist(osp.dirname(cache_path))
+        mmcv.dump(dataset_dicts, cache_path, protocol=4)
+        logger.info("Dumped dataset_dicts to {}".format(cache_path))
+        return dataset_dicts
+
+    @lazy_property
+    def models_info(self):
+        models_info_path = osp.join(self.models_root, "models_info.json")
+        assert osp.exists(models_info_path), models_info_path
+        models_info = mmcv.load(models_info_path)  # key is str(obj_id)
+        return models_info
+
+    @lazy_property
+    def models(self):
+        """Load models into a list."""
+        cache_path = osp.join(self.models_root, f"models_{self.name}.pkl")
+        if osp.exists(cache_path) and self.use_cache:
+            # dprint("{}: load cached object models from {}".format(self.name, cache_path))
+            return mmcv.load(cache_path)
+
+        models = []
+        for obj_name in self.objs:
+            model = inout.load_ply(
+                osp.join(
+                    self.models_root,
+                    f"obj_{ref.ycbv.obj2id[obj_name]:06d}.ply",
+                ),
+                vertex_scale=self.scale_to_meter,
+            )
+            # NOTE: bbox3d_and_center is not computed from re-centered vertices;
+            # for BOP models this is fine since they are already centered
+            model["bbox3d_and_center"] = misc.get_bbox3d_and_center(model["pts"])
+
+            models.append(model)
+        logger.info("cache models to {}".format(cache_path))
+        mmcv.dump(models, cache_path, protocol=4)
+        return models
+
+    def image_aspect_ratio(self):
+        return self.width / self.height  # 4/3
+
+
+########### register datasets ############################################################
+
+
+def get_ycbv_metadata(obj_names, ref_key):
+    """task specific metadata."""
+    data_ref = ref.__dict__[ref_key]
+
+    cur_sym_infos = {}  # label based key
+    loaded_models_info = data_ref.get_models_info()
+
+    for i, obj_name in enumerate(obj_names):
+        obj_id = data_ref.obj2id[obj_name]
+        model_info = loaded_models_info[str(obj_id)]
+        if "symmetries_discrete" in model_info or "symmetries_continuous" in model_info:
+            sym_transforms = misc.get_symmetry_transformations(model_info, max_sym_disc_step=0.01)
+            sym_info = np.array([sym["R"] for sym in sym_transforms], dtype=np.float32)
+        else:
+            sym_info = None
+        cur_sym_infos[i] = sym_info
+
+    meta = {"thing_classes": obj_names, "sym_infos": cur_sym_infos}
+    return meta
+
+
+################################################################################
+
+SPLITS_YCBV = dict(
+    ycbv_bop_test=dict(
+        name="ycbv_bop_test",
+        dataset_root=osp.join(DATASETS_ROOT, "BOP_DATASETS/ycbv/test"),
+        models_root=osp.join(DATASETS_ROOT, "BOP_DATASETS/ycbv/models"),
+        objs=ref.ycbv.objects,  # selected objects
+        ann_file=osp.join(DATASETS_ROOT, "BOP_DATASETS/ycbv/test_targets_bop19.json"),
+        scale_to_meter=0.001,
+        with_masks=True,  # (load masks but may not use it)
+        with_depth=True,  # (load depth path here, but may not use it)
+        height=480,
+        width=640,
+        cache_dir=osp.join(PROJ_ROOT, ".cache"),
+        use_cache=True,
+        num_to_load=-1,
+        filter_invalid=False,
+        ref_key="ycbv",
+    )
+)
+
+
+# single objs (num_class is from all objs)
+for obj in ref.ycbv.objects:
+    name = "ycbv_bop_{}_test".format(obj)
+    select_objs = [obj]
+    if name not in SPLITS_YCBV:
+        SPLITS_YCBV[name] = dict(
+            name=name,
+            dataset_root=osp.join(DATASETS_ROOT, "BOP_DATASETS/ycbv/test"),
+            models_root=osp.join(DATASETS_ROOT, "BOP_DATASETS/ycbv/models"),
+            objs=[obj],  # only this obj
+            select_objs=select_objs,  # selected objects
+            ann_file=osp.join(DATASETS_ROOT, "BOP_DATASETS/ycbv/test_targets_bop19.json"),
+            scale_to_meter=0.001,
+            with_masks=True,  # (load masks but may not use it)
+            with_depth=True,  # (load depth path here, but may not use it)
+            height=480,
+            width=640,
+            cache_dir=osp.join(PROJ_ROOT, ".cache"),
+            use_cache=True,
+            num_to_load=-1,
+            filter_invalid=False,
+            ref_key="ycbv",
+        )
+
+
+def register_with_name_cfg(name, data_cfg=None):
+    """Assume pre-defined datasets live in `./datasets`.
+
+    Args:
+        name: dataset name; if it is in the existing SPLITS, the
+            pre-defined data_cfg is used.
+        data_cfg: required when name is not pre-registered;
+            it can be set in cfg.DATA_CFG.name.
+    """
+    dprint("register dataset: {}".format(name))
+    if name in SPLITS_YCBV:
+        used_cfg = SPLITS_YCBV[name]
+    else:
+        assert data_cfg is not None, f"dataset name {name} is not registered"
+        used_cfg = data_cfg
+    DatasetCatalog.register(name, YCBV_BOP_TEST_Dataset(used_cfg))
+    # something like eval_types
+    MetadataCatalog.get(name).set(
+        id="ycbv",  # NOTE: for pvnet to determine module
+        ref_key=used_cfg["ref_key"],
+        objs=used_cfg["objs"],
+        eval_error_types=["ad", "rete", "proj"],
+        evaluator_type="bop",
+        **get_ycbv_metadata(obj_names=used_cfg["objs"], ref_key=used_cfg["ref_key"]),
+    )
+
+
+def get_available_datasets():
+    return list(SPLITS_YCBV.keys())
+
+
+#### tests ###############################################
+def test_vis():
+    dset_name = sys.argv[1]
+    assert dset_name in DatasetCatalog.list()
+
+    meta = MetadataCatalog.get(dset_name)
+    dprint("MetadataCatalog: ", meta)
+    objs = meta.objs
+
+    t_start = time.perf_counter()
+    dicts = DatasetCatalog.get(dset_name)
+    logger.info("Done loading {} samples with {:.3f}s.".format(len(dicts), time.perf_counter() - t_start))
+
+    dirname = "output/{}-data-vis".format(dset_name)
+    os.makedirs(dirname, exist_ok=True)
+    for d in dicts:
+        img = read_image_mmcv(d["file_name"], format="BGR")
+        depth = mmcv.imread(d["depth_file"], "unchanged") / d["depth_factor"]
+
+        imH, imW = img.shape[:2]
+        annos = d["annotations"]
+        masks = [cocosegm2mask(anno["segmentation"], imH, imW) for anno in annos]
+        bboxes = [anno["bbox"] for anno in annos]
+        bbox_modes = [anno["bbox_mode"] for anno in annos]
+        bboxes_xyxy = np.array(
+            [BoxMode.convert(box, box_mode, BoxMode.XYXY_ABS) for box, box_mode in zip(bboxes, bbox_modes)]
+        )
+        kpts_3d_list = [anno["bbox3d_and_center"] for anno in annos]
+        quats = [anno["quat"] for anno in annos]
+        transes = [anno["trans"] for anno in annos]
+        Rs = [quat2mat(quat) for quat in quats]
+        # 0-based label
+        cat_ids = [anno["category_id"] for anno in annos]
+        K = d["cam"]
+        kpts_2d = [misc.project_pts(kpt3d, K, R, t) for kpt3d, R, t in zip(kpts_3d_list, Rs, transes)]
+        # # TODO: visualize pose and keypoints
+        labels = [objs[cat_id] for cat_id in cat_ids]
+        # img_vis = vis_image_bboxes_cv2(img, bboxes=bboxes_xyxy, labels=labels)
+        img_vis = vis_image_mask_bbox_cv2(img, masks, bboxes=bboxes_xyxy, labels=labels)
+        img_vis_kpts2d = img.copy()
+        for anno_i in range(len(annos)):
+            img_vis_kpts2d = misc.draw_projected_box3d(img_vis_kpts2d, kpts_2d[anno_i])
+        grid_show(
+            [
+                img[:, :, [2, 1, 0]],
+                img_vis[:, :, [2, 1, 0]],
+                img_vis_kpts2d[:, :, [2, 1, 0]],
+                depth,
+            ],
+            [f"img:{d['file_name']}", "vis_img", "img_vis_kpts2d", "depth"],
+            row=2,
+            col=2,
+        )
+
+
+if __name__ == "__main__":
+    """Test the  dataset loader.
+
+    Usage:
+        python -m core.datasets.ycbv_bop_test dataset_name
+    """
+    from lib.vis_utils.image import grid_show
+    from lib.utils.setup_logger import setup_my_logger
+    import detectron2.data.datasets  # noqa # add pre-defined metadata
+    from core.utils.data_utils import read_image_mmcv
+    from lib.vis_utils.image import vis_image_mask_bbox_cv2
+
+    print("sys.argv:", sys.argv)
+    logger = setup_my_logger(name="core")
+    register_with_name_cfg(sys.argv[1])
+    print("dataset catalog: ", DatasetCatalog.list())
+    test_vis()
diff --git a/det/yolox/data/datasets/fruitbin_d2.py b/det/yolox/data/datasets/fruitbin_d2.py
new file mode 100755
index 0000000000000000000000000000000000000000..95341811b3f03748d634bc133904a1806a18452b
--- /dev/null
+++ b/det/yolox/data/datasets/fruitbin_d2.py
@@ -0,0 +1,739 @@
+import hashlib
+import copy
+import logging
+import os
+import os.path as osp
+import sys
+
+cur_dir = osp.dirname(osp.abspath(__file__))
+PROJ_ROOT = osp.normpath(osp.join(cur_dir, "../../../.."))
+sys.path.insert(0, PROJ_ROOT)
+import time
+from collections import OrderedDict
+import mmcv
+import numpy as np
+from tqdm import tqdm
+from transforms3d.quaternions import mat2quat, quat2mat
+import ref
+from detectron2.data import DatasetCatalog, MetadataCatalog
+from detectron2.structures import BoxMode
+from lib.pysixd import inout, misc
+from lib.utils.mask_utils import binary_mask_to_rle, cocosegm2mask
+from lib.utils.utils import dprint, iprint, lazy_property
+
+
+logger = logging.getLogger(__name__)
+DATASETS_ROOT = osp.normpath(osp.join(PROJ_ROOT, "datasets"))
+
+
+class YCBV_Dataset:
+    """use image_sets(scene/image_id) and image root to get data; Here we use
+    bop models, which are center aligned and have some offsets compared to
+    original models."""
+
+    def __init__(self, data_cfg):
+        """
+        Set with_depth and with_masks default to True,
+        and decide whether to load them into dataloader/network later
+        with_masks:
+        """
+        self.name = data_cfg["name"]
+        self.data_cfg = data_cfg
+
+        self.objs = data_cfg["objs"]  # selected objects
+
+        self.ann_files = data_cfg["ann_files"]  # provide scene/im_id list
+        self.image_prefixes = data_cfg["image_prefixes"]  # image root
+
+        self.dataset_root = data_cfg["dataset_root"]  # BOP_DATASETS/ycbv/
+        assert osp.exists(self.dataset_root), self.dataset_root
+        self.models_root = data_cfg["models_root"]  # BOP_DATASETS/ycbv/models
+        self.scale_to_meter = data_cfg["scale_to_meter"]  # 0.001
+
+        self.with_masks = data_cfg["with_masks"]  # True (load masks but may not use it)
+        self.with_depth = data_cfg["with_depth"]  # True (load depth path here, but may not use it)
+        self.with_xyz = data_cfg["with_xyz"]
+
+        self.height = data_cfg["height"]  # 480
+        self.width = data_cfg["width"]  # 640
+
+        self.cache_dir = data_cfg.get("cache_dir", osp.join(PROJ_ROOT, ".cache"))  # .cache
+        self.use_cache = data_cfg.get("use_cache", True)
+        self.num_to_load = data_cfg["num_to_load"]  # -1
+        self.filter_invalid = data_cfg["filter_invalid"]
+
+        self.align_K_by_change_pose = data_cfg.get("align_K_by_change_pose", False)
+        # default: 0000~0059 and synt
+        self.cam = np.array(
+            [
+                [1066.778, 0.0, 312.9869],
+                [0.0, 1067.487, 241.3109],
+                [0.0, 0.0, 1.0],
+            ],
+            dtype="float32",
+        )
+        # 0060~0091
+        # cmu_cam = np.array([[1077.836, 0.0, 323.7872], [0.0, 1078.189, 279.6921], [0.0, 0.0, 1.0]], dtype='float32')
+        ##################################################
+
+        # NOTE: careful! Only the selected objects
+        self.cat_ids = [cat_id for cat_id, obj_name in ref.ycbv.id2obj.items() if obj_name in self.objs]
+        # map selected objs to [0, num_objs-1]
+        self.cat2label = {v: i for i, v in enumerate(self.cat_ids)}  # id_map
+        self.label2cat = {label: cat for cat, label in self.cat2label.items()}
+        self.obj2label = OrderedDict((obj, obj_id) for obj_id, obj in enumerate(self.objs))
+        ##########################################################
+
+    def _load_from_idx_file(self, idx_file, image_root):
+        """
+        idx_file: lists the scene/image ids, one "scene_id/im_id" per line
+        image_root/scene contains:
+            scene_gt.json
+            scene_gt_info.json
+            scene_camera.json
+        """
+        xyz_root = osp.join(image_root, "xyz_crop")
+        scene_gt_dicts = {}
+        scene_gt_info_dicts = {}
+        scene_cam_dicts = {}
+        scene_im_ids = []  # store tuples of (scene_id, im_id)
+        with open(idx_file, "r") as f:
+            for line in f:
+                line_split = line.strip("\r\n").split("/")
+                scene_id = int(line_split[0])
+                im_id = int(line_split[1])
+                scene_im_ids.append((scene_id, im_id))
+                if scene_id not in scene_gt_dicts:
+                    scene_gt_file = osp.join(image_root, f"{scene_id:06d}/scene_gt.json")
+                    assert osp.exists(scene_gt_file), scene_gt_file
+                    scene_gt_dicts[scene_id] = mmcv.load(scene_gt_file)
+
+                if scene_id not in scene_gt_info_dicts:
+                    scene_gt_info_file = osp.join(image_root, f"{scene_id:06d}/scene_gt_info.json")
+                    assert osp.exists(scene_gt_info_file), scene_gt_info_file
+                    scene_gt_info_dicts[scene_id] = mmcv.load(scene_gt_info_file)
+
+                if scene_id not in scene_cam_dicts:
+                    scene_cam_file = osp.join(image_root, f"{scene_id:06d}/scene_camera.json")
+                    assert osp.exists(scene_cam_file), scene_cam_file
+                    scene_cam_dicts[scene_id] = mmcv.load(scene_cam_file)
+        ######################################################
+        scene_im_ids = sorted(scene_im_ids)  # sort to make it reproducible
+        dataset_dicts = []
+
+        num_instances_without_valid_segmentation = 0
+        num_instances_without_valid_box = 0
+
+        for (scene_id, im_id) in tqdm(scene_im_ids):
+            rgb_path = osp.join(image_root, f"{scene_id:06d}/rgb/{im_id:06d}.png")
+            assert osp.exists(rgb_path), rgb_path
+            str_im_id = str(im_id)
+
+            scene_im_id = f"{scene_id}/{im_id}"
+
+            # for ycbv/tless, load cam K from image infos
+            cam_anno = np.array(scene_cam_dicts[scene_id][str_im_id]["cam_K"], dtype=np.float32).reshape(3, 3)
+            adapt_this_K = False
+            if self.align_K_by_change_pose:
+                if (cam_anno != self.cam).any():
+                    adapt_this_K = True
+                    cam_anno_ori = cam_anno.copy()
+                    cam_anno = self.cam
+
+            depth_factor = 1000.0 / scene_cam_dicts[scene_id][str_im_id]["depth_scale"]
+            # dprint(record['cam'])
+            if "/train_synt/" in rgb_path:
+                img_type = "syn"
+            else:
+                img_type = "real"
+            record = {
+                "dataset_name": self.name,
+                "file_name": osp.relpath(rgb_path, PROJ_ROOT),
+                "height": self.height,
+                "width": self.width,
+                "image_id": self._unique_im_id,
+                "scene_im_id": scene_im_id,  # for evaluation
+                "cam": cam_anno,  # self.cam,
+                "depth_factor": depth_factor,
+                "img_type": img_type,
+            }
+
+            if self.with_depth:
+                depth_file = osp.join(image_root, f"{scene_id:06d}/depth/{im_id:06d}.png")
+                assert osp.exists(depth_file), depth_file
+                record["depth_file"] = osp.relpath(depth_file, PROJ_ROOT)
+
+            insts = []
+            anno_dict_list = scene_gt_dicts[scene_id][str(im_id)]
+            info_dict_list = scene_gt_info_dicts[scene_id][str(im_id)]
+            for anno_i, anno in enumerate(anno_dict_list):
+                info = info_dict_list[anno_i]
+                obj_id = anno["obj_id"]
+                if obj_id not in self.cat_ids:
+                    continue
+                # 0-based label now
+                cur_label = self.cat2label[obj_id]
+                ################ pose ###########################
+                R = np.array(anno["cam_R_m2c"], dtype="float32").reshape(3, 3)
+                trans = np.array(anno["cam_t_m2c"], dtype="float32") / 1000.0  # mm->m
+                pose = np.hstack([R, trans.reshape(3, 1)])
+                if adapt_this_K:
+                    # pose_uw = inv(K_uw) @ K_cmu @ pose_cmu
+                    pose = np.linalg.inv(cam_anno) @ cam_anno_ori @ pose
+                    # R = pose[:3, :3]
+                    trans = pose[:3, 3]
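+                    # Added explanation: the transform above keeps the projection
+                    # K_uw @ pose_uw exactly equal to K_cmu @ pose_cmu, but its
+                    # 3x3 block is generally no longer orthonormal, so mat2quat
+                    # below only approximates the rotation.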
+
+                quat = mat2quat(pose[:3, :3])
+
+                ############# bbox ############################
+                bbox = info["bbox_obj"]
+                x1, y1, w, h = bbox
+                x2 = x1 + w
+                y2 = y1 + h
+                x1 = max(min(x1, self.width), 0)
+                y1 = max(min(y1, self.height), 0)
+                x2 = max(min(x2, self.width), 0)
+                y2 = max(min(y2, self.height), 0)
+                bbox = [x1, y1, x2, y2]
+                if self.filter_invalid:
+                    bw = bbox[2] - bbox[0]
+                    bh = bbox[3] - bbox[1]
+                    if bh <= 1 or bw <= 1:
+                        num_instances_without_valid_box += 1
+                        continue
+
+                ############## mask #######################
+                if self.with_masks:  # either list[list[float]] or dict(RLE)
+                    mask_visib_file = osp.join(
+                        image_root,
+                        f"{scene_id:06d}/mask_visib/{im_id:06d}_{anno_i:06d}.png",
+                    )
+                    assert osp.exists(mask_visib_file), mask_visib_file
+                    mask = mmcv.imread(mask_visib_file, "unchanged")
+                    area = mask.sum()
+                    if area < 30 and self.filter_invalid:
+                        num_instances_without_valid_segmentation += 1
+                        continue
+                    mask_rle = binary_mask_to_rle(mask)
+
+                    mask_full_file = osp.join(
+                        image_root,
+                        f"{scene_id:06d}/mask/{im_id:06d}_{anno_i:06d}.png",
+                    )
+                    assert osp.exists(mask_full_file), mask_full_file
+
+                    # load mask full
+                    mask_full = mmcv.imread(mask_full_file, "unchanged")
+                    mask_full = mask_full.astype("bool")
+                    mask_full_rle = binary_mask_to_rle(mask_full, compressed=True)
+
+                proj = (self.cam @ trans.T).T  # NOTE: use self.cam here
+                proj = proj[:2] / proj[2]
+
+                inst = {
+                    "category_id": cur_label,  # 0-based label
+                    "bbox": bbox,  # TODO: load both bbox_obj and bbox_visib
+                    "bbox_mode": BoxMode.XYXY_ABS,
+                    "pose": pose,
+                    "quat": quat,
+                    "trans": trans,
+                    "centroid_2d": proj,  # absolute (cx, cy)
+                    "segmentation": mask_rle,
+                    "mask_full": mask_full_rle,
+                }
+
+                if self.with_xyz:
+                    xyz_path = osp.join(
+                        xyz_root,
+                        f"{scene_id:06d}/{im_id:06d}_{anno_i:06d}-xyz.pkl",
+                    )
+                    inst["xyz_path"] = xyz_path
+
+                model_info = self.models_info[str(obj_id)]
+                inst["model_info"] = model_info
+                # TODO: using full mask and full xyz
+                for key in ["bbox3d_and_center"]:
+                    inst[key] = self.models[cur_label][key]
+                insts.append(inst)
+            if len(insts) == 0:  # and self.filter_invalid:
+                continue
+            record["annotations"] = insts
+            dataset_dicts.append(record)
+            self._unique_im_id += 1
+
+        if num_instances_without_valid_segmentation > 0:
+            logger.warning(
+                "Filtered out {} instances without valid segmentation. "
+                "There might be issues in your dataset generation process.".format(
+                    num_instances_without_valid_segmentation
+                )
+            )
+        if num_instances_without_valid_box > 0:
+            logger.warning(
+                "Filtered out {} instances without valid box. "
+                "There might be issues in your dataset generation process.".format(num_instances_without_valid_box)
+            )
+        return dataset_dicts
+
+    def __call__(self):  # YCBV_Dataset
+        """Load light-weight instance annotations of all images into a list of
+        dicts in Detectron2 format.
+
+        Do not load heavy data into memory in this file, since we will
+        load the annotations of all images into memory.
+        """
+        # cache the dataset_dicts to avoid loading masks from files
+        hashed_file_name = hashlib.md5(
+            (
+                "".join([str(fn) for fn in self.objs])
+                + "dataset_dicts_{}_{}_{}_{}_{}_{}".format(
+                    self.name,
+                    self.dataset_root,
+                    self.with_masks,
+                    self.with_depth,
+                    self.with_xyz,
+                    __name__,
+                )
+            ).encode("utf-8")
+        ).hexdigest()
+        cache_path = osp.join(
+            self.cache_dir,
+            "dataset_dicts_{}_{}.pkl".format(self.name, hashed_file_name),
+        )
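+        # NOTE (added): the md5 key covers objs/name/root/flags but not
+        # ann_files/image_prefixes, so delete the cached .pkl manually if the
+        # index files change while the split name stays the same.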
+
+        if osp.exists(cache_path) and self.use_cache:
+            logger.info("load cached dataset dicts from {}".format(cache_path))
+            return mmcv.load(cache_path)
+
+        logger.info("loading dataset dicts: {}".format(self.name))
+        t_start = time.perf_counter()
+        dataset_dicts = []
+        self._unique_im_id = 0
+        for ann_file, image_root in zip(self.ann_files, self.image_prefixes):
+            # logger.info("loading coco json: {}".format(ann_file))
+            dataset_dicts.extend(self._load_from_idx_file(ann_file, image_root))
+
+        ##########################################################################
+        if self.num_to_load > 0:
+            self.num_to_load = min(int(self.num_to_load), len(dataset_dicts))
+            dataset_dicts = dataset_dicts[: self.num_to_load]
+        logger.info("loaded {} dataset dicts, using {}s".format(len(dataset_dicts), time.perf_counter() - t_start))
+
+        mmcv.mkdir_or_exist(osp.dirname(cache_path))
+        mmcv.dump(dataset_dicts, cache_path, protocol=4)
+        logger.info("Dumped dataset_dicts to {}".format(cache_path))
+        return dataset_dicts
+
+    @lazy_property
+    def models_info(self):
+        models_info_path = osp.join(self.models_root, "models_info.json")
+        assert osp.exists(models_info_path), models_info_path
+        models_info = mmcv.load(models_info_path)  # key is str(obj_id)
+        return models_info
+
+    @lazy_property
+    def models(self):
+        """Load models into a list."""
+        cache_path = osp.join(self.models_root, "models_{}.pkl".format(self.name))
+        if osp.exists(cache_path) and self.use_cache:
+            # dprint("{}: load cached object models from {}".format(self.name, cache_path))
+            return mmcv.load(cache_path)
+
+        models = []
+        for obj_name in self.objs:
+            model = inout.load_ply(
+                osp.join(
+                    self.models_root,
+                    f"obj_{ref.ycbv.obj2id[obj_name]:06d}.ply",
+                ),
+                vertex_scale=self.scale_to_meter,
+            )
+            # NOTE: bbox3d_and_center is not computed from re-centered vertices;
+            # not a big problem for BOP models since they are already centered
+            model["bbox3d_and_center"] = misc.get_bbox3d_and_center(model["pts"])
+
+            models.append(model)
+        logger.info("cache models to {}".format(cache_path))
+        mmcv.dump(models, cache_path, protocol=4)
+        return models
+
+    def image_aspect_ratio(self):
+        return self.width / self.height  # 4/3
+
+
+########### register datasets ############################################################
+
+
+def get_ycbv_metadata(obj_names, ref_key):
+    """task specific metadata."""
+    data_ref = ref.__dict__[ref_key]
+
+    cur_sym_infos = {}  # label based key
+    loaded_models_info = data_ref.get_models_info()
+
+    for i, obj_name in enumerate(obj_names):
+        obj_id = data_ref.obj2id[obj_name]
+        model_info = loaded_models_info[str(obj_id)]
+        if "symmetries_discrete" in model_info or "symmetries_continuous" in model_info:
+            sym_transforms = misc.get_symmetry_transformations(model_info, max_sym_disc_step=0.01)
+            sym_info = np.array([sym["R"] for sym in sym_transforms], dtype=np.float32)
+        else:
+            sym_info = None
+        cur_sym_infos[i] = sym_info
+
+    meta = {"thing_classes": obj_names, "sym_infos": cur_sym_infos}
+    return meta
+
+
+ycbv_model_root = "BOP_DATASETS/ycbv/models/"
+################################################################################
+default_cfg = dict(
+    # name="ycbv_train_real",
+    dataset_root=osp.join(DATASETS_ROOT, "BOP_DATASETS/ycbv/"),
+    models_root=osp.join(DATASETS_ROOT, "BOP_DATASETS/ycbv/models"),  # models_simple
+    objs=ref.ycbv.objects,  # all objects
+    # NOTE: this contains all classes
+    # ann_files=[osp.join(DATASETS_ROOT, "BOP_DATASETS/ycbv/image_sets/train.txt")],
+    # image_prefixes=[osp.join(DATASETS_ROOT, "BOP_DATASETS/ycbv/train_real")],
+    scale_to_meter=0.001,
+    with_masks=True,  # (load masks but may not use it)
+    with_depth=True,  # (load depth path here, but may not use it)
+    with_xyz=True,
+    height=480,
+    width=640,
+    align_K_by_change_pose=False,
+    cache_dir=osp.join(PROJ_ROOT, ".cache"),
+    use_cache=True,
+    num_to_load=-1,
+    filter_invalid=True,
+    ref_key="ycbv",
+)
+SPLITS_YCBV = {}
+update_cfgs = {
+    "ycbv_train_real": {
+        "ann_files": [osp.join(DATASETS_ROOT, "BOP_DATASETS/ycbv/image_sets/train.txt")],
+        "image_prefixes": [osp.join(DATASETS_ROOT, "BOP_DATASETS/ycbv/train_real")],
+    },
+    "ycbv_train_real_aligned_Kuw": {
+        "ann_files": [osp.join(DATASETS_ROOT, "BOP_DATASETS/ycbv/image_sets/train.txt")],
+        "image_prefixes": [osp.join(DATASETS_ROOT, "BOP_DATASETS/ycbv/train_real")],
+        "align_K_by_change_pose": True,
+    },
+    "ycbv_train_real_uw": {
+        "ann_files": [osp.join(DATASETS_ROOT, "BOP_DATASETS/ycbv/image_sets/train_real_uw.txt")],
+        "image_prefixes": [osp.join(DATASETS_ROOT, "BOP_DATASETS/ycbv/train_real")],
+    },
+    "ycbv_train_real_uw_every10": {
+        "ann_files": [
+            osp.join(
+                DATASETS_ROOT,
+                "BOP_DATASETS/ycbv/image_sets/train_real_uw_every10.txt",
+            )
+        ],
+        "image_prefixes": [osp.join(DATASETS_ROOT, "BOP_DATASETS/ycbv/train_real")],
+    },
+    "ycbv_train_real_cmu": {
+        "ann_files": [
+            osp.join(
+                DATASETS_ROOT,
+                "BOP_DATASETS/ycbv/image_sets/train_real_cmu.txt",
+            )
+        ],
+        "image_prefixes": [osp.join(DATASETS_ROOT, "BOP_DATASETS/ycbv/train_real")],
+    },
+    "ycbv_train_real_cmu_aligned_Kuw": {
+        "ann_files": [
+            osp.join(
+                DATASETS_ROOT,
+                "BOP_DATASETS/ycbv/image_sets/train_real_cmu.txt",
+            )
+        ],
+        "image_prefixes": [osp.join(DATASETS_ROOT, "BOP_DATASETS/ycbv/train_real")],
+        "align_K_by_change_pose": True,
+    },
+    "ycbv_train_synt": {
+        "ann_files": [osp.join(DATASETS_ROOT, "BOP_DATASETS/ycbv/image_sets/train_synt.txt")],
+        "image_prefixes": [osp.join(DATASETS_ROOT, "BOP_DATASETS/ycbv/train_synt")],
+    },
+    "ycbv_train_synt_50k": {
+        "ann_files": [
+            osp.join(
+                DATASETS_ROOT,
+                "BOP_DATASETS/ycbv/image_sets/train_synt_50k.txt",
+            )
+        ],
+        "image_prefixes": [osp.join(DATASETS_ROOT, "BOP_DATASETS/ycbv/train_synt")],
+    },
+    "ycbv_train_synt_30k": {
+        "ann_files": [
+            osp.join(
+                DATASETS_ROOT,
+                "BOP_DATASETS/ycbv/image_sets/train_synt_30k.txt",
+            )
+        ],
+        "image_prefixes": [osp.join(DATASETS_ROOT, "BOP_DATASETS/ycbv/train_synt")],
+    },
+    "ycbv_train_synt_100": {
+        "ann_files": [
+            osp.join(
+                DATASETS_ROOT,
+                "BOP_DATASETS/ycbv/image_sets/train_synt_100.txt",
+            )
+        ],
+        "image_prefixes": [osp.join(DATASETS_ROOT, "BOP_DATASETS/ycbv/train_synt")],
+    },
+    "ycbv_test": {
+        "ann_files": [osp.join(DATASETS_ROOT, "BOP_DATASETS/ycbv/image_sets/keyframe.txt")],
+        "image_prefixes": [osp.join(DATASETS_ROOT, "BOP_DATASETS/ycbv/test")],
+        "with_xyz": False,
+        "filter_invalid": False,
+    },
+}
+for name, update_cfg in update_cfgs.items():
+    used_cfg = copy.deepcopy(default_cfg)
+    used_cfg["name"] = name
+    used_cfg.update(update_cfg)
+    num_to_load = -1
+    if "_100" in name:
+        num_to_load = 100
+    used_cfg["num_to_load"] = num_to_load
+    SPLITS_YCBV[name] = used_cfg
+
+# single object splits ######################################################
+for obj in ref.ycbv.objects:
+    for split in [
+        "train_real",
+        "train_real_aligned_Kuw",
+        "train_real_uw",
+        "train_real_uw_every10",
+        "train_real_cmu",
+        "train_real_cmu_aligned_Kuw",
+        "train_synt",
+        "train_synt_30k",
+        "test",
+    ]:
+        name = "ycbv_{}_{}".format(obj, split)
+        if split in [
+            "train_real",
+            "train_real_aligned_Kuw",
+            "train_real_uw",
+            "train_real_uw_every10",
+            "train_real_cmu",
+            "train_real_cmu_aligned_Kuw",
+            "train_synt",
+            "train_synt_30k",
+        ]:
+            filter_invalid = True
+            with_xyz = True
+        elif split in ["test"]:
+            filter_invalid = False
+            with_xyz = False
+        else:
+            raise ValueError("{}".format(split))
+
+        if split in ["train_real_aligned_Kuw", "train_real_cmu_aligned_Kuw"]:
+            align_K_by_change_pose = True
+        else:
+            align_K_by_change_pose = False
+
+        split_idx_file_dict = {
+            "train_real": ("train_real", "train.txt"),
+            "train_real_aligned_Kuw": ("train_real", "train.txt"),
+            "train_real_uw": ("train_real", "train_real_uw.txt"),
+            "train_real_uw_every10": (
+                "train_real",
+                "train_real_uw_every10.txt",
+            ),
+            "train_real_cmu": ("train_real", "train_real_cmu.txt"),
+            "train_real_cmu_aligned_Kuw": ("train_real", "train_real_cmu.txt"),
+            "train_synt": ("train_synt", "train_synt.txt"),
+            "train_synt_30k": ("train_synt", "train_synt_30k.txt"),
+            "test": ("test", "keyframe.txt"),
+        }
+        root_name, idx_file = split_idx_file_dict[split]
+
+        if name not in SPLITS_YCBV:
+            SPLITS_YCBV[name] = dict(
+                name=name,
+                dataset_root=osp.join(DATASETS_ROOT, "BOP_DATASETS/ycbv/"),
+                models_root=osp.join(DATASETS_ROOT, "BOP_DATASETS/ycbv/models"),
+                objs=[obj],
+                ann_files=[
+                    osp.join(
+                        DATASETS_ROOT,
+                        "BOP_DATASETS/ycbv/image_sets/{}".format(idx_file),
+                    )
+                ],
+                image_prefixes=[osp.join(DATASETS_ROOT, "BOP_DATASETS/ycbv/{}".format(root_name))],
+                scale_to_meter=0.001,
+                with_masks=True,  # (load masks but may not use it)
+                with_depth=True,  # (load depth path here, but may not use it)
+                with_xyz=with_xyz,
+                height=480,
+                width=640,
+                align_K_by_change_pose=align_K_by_change_pose,
+                cache_dir=osp.join(PROJ_ROOT, ".cache"),
+                use_cache=True,
+                num_to_load=-1,
+                filter_invalid=filter_invalid,
+                ref_key="ycbv",
+            )
+
+
+def register_with_name_cfg(name, data_cfg=None):
+    """Assume pre-defined datasets live in `./datasets`.
+
+    Args:
+        name: dataset name,
+        data_cfg: if name is in the existing SPLITS, the pre-defined data_cfg is used;
+            otherwise data_cfg is required;
+            data_cfg can be set in cfg.DATA_CFG.name
+    """
+    dprint("register dataset: {}".format(name))
+    if name in SPLITS_YCBV:
+        used_cfg = SPLITS_YCBV[name]
+    else:
+        assert (
+            data_cfg is not None
+        ), f"dataset name {name} is not registered. available datasets: {list(SPLITS_YCBV.keys())}"
+        used_cfg = data_cfg
+    DatasetCatalog.register(name, YCBV_Dataset(used_cfg))
+    # something like eval_types
+    MetadataCatalog.get(name).set(
+        id="ycbv",  # NOTE: for pvnet to determine module
+        ref_key=used_cfg["ref_key"],
+        objs=used_cfg["objs"],
+        eval_error_types=["ad", "rete", "proj"],
+        evaluator_type="bop",
+        **get_ycbv_metadata(obj_names=used_cfg["objs"], ref_key=used_cfg["ref_key"]),
+    )
+
+
+def get_available_datasets():
+    return list(SPLITS_YCBV.keys())
+
+
+#### tests ###############################################
+def test_vis():
+    # python -m core.datasets.ycbv_d2 ycbv_test
+    dataset_name = sys.argv[1]
+    meta = MetadataCatalog.get(dataset_name)
+    t_start = time.perf_counter()
+    dicts = DatasetCatalog.get(dataset_name)
+    with_xyz = False if "test" in dataset_name else True
+    logger.info("Done loading {} samples with {:.3f}s.".format(len(dicts), time.perf_counter() - t_start))
+
+    dirname = "output/ycbv_test-data-vis"
+    os.makedirs(dirname, exist_ok=True)
+    objs = meta.objs
+    for d in dicts:
+        img = read_image_mmcv(d["file_name"], format="BGR")
+        depth = mmcv.imread(d["depth_file"], "unchanged") / 1000.0
+
+        imH, imW = img.shape[:2]
+        annos = d["annotations"]
+        masks = [cocosegm2mask(anno["segmentation"], imH, imW) for anno in annos]
+        bboxes = [anno["bbox"] for anno in annos]
+        bbox_modes = [anno["bbox_mode"] for anno in annos]
+        bboxes_xyxy = np.array(
+            [BoxMode.convert(box, box_mode, BoxMode.XYXY_ABS) for box, box_mode in zip(bboxes, bbox_modes)]
+        )
+        kpts_3d_list = [anno["bbox3d_and_center"] for anno in annos]
+        quats = [anno["quat"] for anno in annos]
+        transes = [anno["trans"] for anno in annos]
+        Rs = [quat2mat(quat) for quat in quats]
+        # 0-based label
+        cat_ids = [anno["category_id"] for anno in annos]
+        K = d["cam"]
+        kpts_2d = [misc.project_pts(kpt3d, K, R, t) for kpt3d, R, t in zip(kpts_3d_list, Rs, transes)]
+        # # TODO: visualize pose and keypoints
+        labels = [objs[cat_id] for cat_id in cat_ids]
+        for _i in range(len(annos)):
+            img_vis = vis_image_mask_bbox_cv2(
+                img,
+                masks[_i : _i + 1],
+                bboxes=bboxes_xyxy[_i : _i + 1],
+                labels=labels[_i : _i + 1],
+            )
+            img_vis_kpts2d = misc.draw_projected_box3d(img_vis.copy(), kpts_2d[_i])
+            if with_xyz:
+                xyz_path = annos[_i]["xyz_path"]
+                xyz_info = mmcv.load(xyz_path)
+                x1, y1, x2, y2 = xyz_info["xyxy"]
+                xyz_crop = xyz_info["xyz_crop"].astype(np.float32)
+                xyz = np.zeros((imH, imW, 3), dtype=np.float32)
+                xyz[y1 : y2 + 1, x1 : x2 + 1, :] = xyz_crop
+                xyz_show = get_emb_show(xyz)
+                xyz_crop_show = get_emb_show(xyz_crop)
+                img_xyz = img.copy() / 255.0
+                mask_xyz = ((xyz[:, :, 0] != 0) | (xyz[:, :, 1] != 0) | (xyz[:, :, 2] != 0)).astype("uint8")
+                fg_idx = np.where(mask_xyz != 0)
+                img_xyz[fg_idx[0], fg_idx[1], :] = (
+                    0.5 * xyz_show[fg_idx[0], fg_idx[1], :3] + 0.5 * img_xyz[fg_idx[0], fg_idx[1], :]
+                )
+                img_xyz_crop = img_xyz[y1 : y2 + 1, x1 : x2 + 1, :]
+                img_vis_crop = img_vis[y1 : y2 + 1, x1 : x2 + 1, :]
+                # diff mask
+                diff_mask_xyz = np.abs(masks[_i] - mask_xyz)[y1 : y2 + 1, x1 : x2 + 1]
+
+                grid_show(
+                    [
+                        img[:, :, [2, 1, 0]],
+                        img_vis[:, :, [2, 1, 0]],
+                        img_vis_kpts2d[:, :, [2, 1, 0]],
+                        depth,
+                        # xyz_show,
+                        diff_mask_xyz,
+                        xyz_crop_show,
+                        img_xyz[:, :, [2, 1, 0]],
+                        img_xyz_crop[:, :, [2, 1, 0]],
+                        img_vis_crop[:, :, ::-1],
+                    ],
+                    [
+                        "img",
+                        "vis_img",
+                        "img_vis_kpts2d",
+                        "depth",
+                        "diff_mask_xyz",
+                        "xyz_crop_show",
+                        "img_xyz",
+                        "img_xyz_crop",
+                        "img_vis_crop",
+                    ],
+                    row=3,
+                    col=3,
+                )
+            else:
+                grid_show(
+                    [
+                        img[:, :, [2, 1, 0]],
+                        img_vis[:, :, [2, 1, 0]],
+                        img_vis_kpts2d[:, :, [2, 1, 0]],
+                        depth,
+                    ],
+                    ["img", "vis_img", "img_vis_kpts2d", "depth"],
+                    row=2,
+                    col=2,
+                )
+
+
+if __name__ == "__main__":
+    """Test the  dataset loader.
+
+    Usage:
+        python -m this_module dataset_name
+        "dataset_name" can be any pre-registered ones
+    """
+    from lib.vis_utils.image import grid_show
+    from lib.utils.setup_logger import setup_my_logger
+
+    import detectron2.data.datasets  # noqa # add pre-defined metadata
+    from lib.vis_utils.image import vis_image_mask_bbox_cv2
+    from core.utils.utils import get_emb_show
+    from core.utils.data_utils import read_image_mmcv
+
+    print("sys.argv:", sys.argv)
+    logger = setup_my_logger(name="core")
+    register_with_name_cfg(sys.argv[1])
+    print("dataset catalog: ", DatasetCatalog.list())
+    test_vis()
diff --git a/det/yolox/data/datasets/fruitbin_pbr.py b/det/yolox/data/datasets/fruitbin_pbr.py
new file mode 100644
index 0000000000000000000000000000000000000000..9295d4c9ac48572daee96041ec38e883f9f19cbc
--- /dev/null
+++ b/det/yolox/data/datasets/fruitbin_pbr.py
@@ -0,0 +1,492 @@
+import hashlib
+import logging
+import os
+import os.path as osp
+import sys
+
+cur_dir = osp.dirname(osp.abspath(__file__))
+PROJ_ROOT = osp.normpath(osp.join(cur_dir, "../../../.."))
+sys.path.insert(0, PROJ_ROOT)
+import time
+from collections import OrderedDict
+import mmcv
+import numpy as np
+from tqdm import tqdm
+from transforms3d.quaternions import mat2quat, quat2mat
+import ref
+from detectron2.data import DatasetCatalog, MetadataCatalog
+from detectron2.structures import BoxMode
+from lib.pysixd import inout, misc
+from lib.utils.mask_utils import binary_mask_to_rle, cocosegm2mask
+from lib.utils.utils import dprint, iprint, lazy_property
+
+
+logger = logging.getLogger(__name__)
+DATASETS_ROOT = osp.normpath(osp.join(PROJ_ROOT, "datasets"))
+
+
+class YCBV_PBR_Dataset:
+    def __init__(self, data_cfg):
+        """
+        Set with_depth and with_masks default to True,
+        and decide whether to load them into dataloader/network later
+        with_masks:
+        """
+        self.name = data_cfg["name"]
+        self.data_cfg = data_cfg
+
+        self.objs = data_cfg["objs"]  # selected objects
+
+        self.dataset_root = data_cfg.get(
+            "dataset_root",
+            osp.join(DATASETS_ROOT, "BOP_DATASETS/ycbv/train_pbr"),
+        )
+        self.xyz_root = data_cfg.get("xyz_root", osp.join(self.dataset_root, "xyz_crop"))
+        assert osp.exists(self.dataset_root), self.dataset_root
+        self.models_root = data_cfg["models_root"]  # BOP_DATASETS/ycbv/models
+        self.scale_to_meter = data_cfg["scale_to_meter"]  # 0.001
+
+        self.with_masks = data_cfg["with_masks"]
+        self.with_depth = data_cfg["with_depth"]
+
+        self.height = data_cfg["height"]  # 480
+        self.width = data_cfg["width"]  # 640
+
+        self.cache_dir = data_cfg.get("cache_dir", osp.join(PROJ_ROOT, ".cache"))  # .cache
+        self.use_cache = data_cfg.get("use_cache", True)
+        self.num_to_load = data_cfg["num_to_load"]  # -1
+        self.filter_invalid = data_cfg.get("filter_invalid", True)
+        ##################################################
+
+        # NOTE: careful! Only the selected objects
+        self.cat_ids = [cat_id for cat_id, obj_name in ref.ycbv.id2obj.items() if obj_name in self.objs]
+        # map selected objs to [0, num_objs-1]
+        self.cat2label = {v: i for i, v in enumerate(self.cat_ids)}  # id_map
+        self.label2cat = {label: cat for cat, label in self.cat2label.items()}
+        self.obj2label = OrderedDict((obj, obj_id) for obj_id, obj in enumerate(self.objs))
+        ##########################################################
+
+        self.scenes = [f"{i:06d}" for i in range(50)]
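+        # 50 pbr scenes, i.e. 000000-000049 (cf. train_pbr_scenes in ref)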
+
+    def __call__(self):
+        """Load light-weight instance annotations of all images into a list of
+        dicts in Detectron2 format.
+
+        Do not load heavy data into memory in this file, since we will
+        load the annotations of all images into memory.
+        """
+        # cache the dataset_dicts to avoid loading masks from files
+        hashed_file_name = hashlib.md5(
+            (
+                "".join([str(fn) for fn in self.objs])
+                + "dataset_dicts_{}_{}_{}_{}_{}".format(
+                    self.name,
+                    self.dataset_root,
+                    self.with_masks,
+                    self.with_depth,
+                    __name__,
+                )
+            ).encode("utf-8")
+        ).hexdigest()
+        cache_path = osp.join(
+            self.cache_dir,
+            "dataset_dicts_{}_{}.pkl".format(self.name, hashed_file_name),
+        )
+
+        if osp.exists(cache_path) and self.use_cache:
+            logger.info("load cached dataset dicts from {}".format(cache_path))
+            return mmcv.load(cache_path)
+
+        t_start = time.perf_counter()
+
+        logger.info("loading dataset dicts: {}".format(self.name))
+        self.num_instances_without_valid_segmentation = 0
+        self.num_instances_without_valid_box = 0
+        dataset_dicts = []
+        # it is slow because of loading and converting masks to rle
+        for scene in tqdm(self.scenes):
+            scene_id = int(scene)
+            scene_root = osp.join(self.dataset_root, scene)
+
+            gt_dict = mmcv.load(osp.join(scene_root, "scene_gt.json"))
+            gt_info_dict = mmcv.load(osp.join(scene_root, "scene_gt_info.json"))
+            cam_dict = mmcv.load(osp.join(scene_root, "scene_camera.json"))
+
+            for str_im_id in tqdm(gt_dict, postfix=f"{scene_id}"):
+                int_im_id = int(str_im_id)
+                rgb_path = osp.join(scene_root, "rgb/{:06d}.jpg").format(int_im_id)
+                assert osp.exists(rgb_path), rgb_path
+
+                depth_path = osp.join(scene_root, "depth/{:06d}.png".format(int_im_id))
+
+                scene_im_id = f"{scene_id}/{int_im_id}"
+
+                K = np.array(cam_dict[str_im_id]["cam_K"], dtype=np.float32).reshape(3, 3)
+                depth_factor = 1000.0 / cam_dict[str_im_id]["depth_scale"]  # 10000
+
+                record = {
+                    "dataset_name": self.name,
+                    "file_name": osp.relpath(rgb_path, PROJ_ROOT),
+                    "depth_file": osp.relpath(depth_path, PROJ_ROOT),
+                    "height": self.height,
+                    "width": self.width,
+                    "image_id": int_im_id,
+                    "scene_im_id": scene_im_id,  # for evaluation
+                    "cam": K,
+                    "depth_factor": depth_factor,
+                    "img_type": "syn_pbr",  # NOTE: has background
+                }
+                insts = []
+                for anno_i, anno in enumerate(gt_dict[str_im_id]):
+                    obj_id = anno["obj_id"]
+                    if obj_id not in self.cat_ids:
+                        continue
+                    cur_label = self.cat2label[obj_id]  # 0-based label
+                    R = np.array(anno["cam_R_m2c"], dtype="float32").reshape(3, 3)
+                    t = np.array(anno["cam_t_m2c"], dtype="float32") / 1000.0
+                    pose = np.hstack([R, t.reshape(3, 1)])
+                    quat = mat2quat(R).astype("float32")
+
+                    proj = (record["cam"] @ t.T).T
+                    proj = proj[:2] / proj[2]
+
+                    bbox_visib = gt_info_dict[str_im_id][anno_i]["bbox_visib"]
+                    bbox_obj = gt_info_dict[str_im_id][anno_i]["bbox_obj"]
+                    x1, y1, w, h = bbox_visib
+                    if self.filter_invalid:
+                        if h <= 1 or w <= 1:
+                            self.num_instances_without_valid_box += 1
+                            continue
+
+                    mask_file = osp.join(
+                        scene_root,
+                        "mask/{:06d}_{:06d}.png".format(int_im_id, anno_i),
+                    )
+                    mask_visib_file = osp.join(
+                        scene_root,
+                        "mask_visib/{:06d}_{:06d}.png".format(int_im_id, anno_i),
+                    )
+                    assert osp.exists(mask_file), mask_file
+                    assert osp.exists(mask_visib_file), mask_visib_file
+                    # load mask visib  TODO: load both mask_visib and mask_full
+                    mask_single = mmcv.imread(mask_visib_file, "unchanged")
+                    area = mask_single.sum()
+                    if area <= 64:  # filter out too small or nearly invisible instances
+                        self.num_instances_without_valid_segmentation += 1
+                        continue
+                    mask_rle = binary_mask_to_rle(mask_single, compressed=True)
+
+                    # load mask full
+                    mask_full = mmcv.imread(mask_file, "unchanged")
+                    mask_full = mask_full.astype("bool")
+                    mask_full_rle = binary_mask_to_rle(mask_full, compressed=True)
+
+                    visib_fract = gt_info_dict[str_im_id][anno_i].get("visib_fract", 1.0)
+
+                    xyz_path = osp.join(
+                        self.xyz_root,
+                        f"{scene_id:06d}/{int_im_id:06d}_{anno_i:06d}-xyz.pkl",
+                    )
+                    inst = {
+                        "category_id": cur_label,  # 0-based label
+                        "bbox": bbox_obj,  # TODO: load both bbox_obj and bbox_visib
+                        "bbox_mode": BoxMode.XYWH_ABS,
+                        "pose": pose,
+                        "quat": quat,
+                        "trans": t,
+                        "centroid_2d": proj,  # absolute (cx, cy)
+                        "segmentation": mask_rle,
+                        "mask_full": mask_full_rle,  # TODO: load as mask_full, rle
+                        "visib_fract": visib_fract,
+                        "xyz_path": xyz_path,
+                    }
+
+                    model_info = self.models_info[str(obj_id)]
+                    inst["model_info"] = model_info
+                    # TODO: using full mask and full xyz
+                    for key in ["bbox3d_and_center"]:
+                        inst[key] = self.models[cur_label][key]
+                    insts.append(inst)
+                if len(insts) == 0:  # filter im without anno
+                    continue
+                record["annotations"] = insts
+                dataset_dicts.append(record)
+
+        if self.num_instances_without_valid_segmentation > 0:
+            logger.warning(
+                "Filtered out {} instances without valid segmentation. "
+                "There might be issues in your dataset generation process.".format(
+                    self.num_instances_without_valid_segmentation
+                )
+            )
+        if self.num_instances_without_valid_box > 0:
+            logger.warning(
+                "Filtered out {} instances without valid box. "
+                "There might be issues in your dataset generation process.".format(self.num_instances_without_valid_box)
+            )
+        ##########################################################################
+        if self.num_to_load > 0:
+            self.num_to_load = min(int(self.num_to_load), len(dataset_dicts))
+            dataset_dicts = dataset_dicts[: self.num_to_load]
+        logger.info("loaded {} dataset dicts, using {}s".format(len(dataset_dicts), time.perf_counter() - t_start))
+
+        mmcv.mkdir_or_exist(osp.dirname(cache_path))
+        mmcv.dump(dataset_dicts, cache_path, protocol=4)
+        logger.info("Dumped dataset_dicts to {}".format(cache_path))
+        return dataset_dicts
+
+    @lazy_property
+    def models_info(self):
+        models_info_path = osp.join(self.models_root, "models_info.json")
+        assert osp.exists(models_info_path), models_info_path
+        models_info = mmcv.load(models_info_path)  # key is str(obj_id)
+        return models_info
+
+    @lazy_property
+    def models(self):
+        """Load models into a list."""
+        cache_path = osp.join(self.models_root, "models_{}.pkl".format(self.name))
+        if osp.exists(cache_path) and self.use_cache:
+            # dprint("{}: load cached object models from {}".format(self.name, cache_path))
+            return mmcv.load(cache_path)
+
+        models = []
+        for obj_name in self.objs:
+            model = inout.load_ply(
+                osp.join(
+                    self.models_root,
+                    f"obj_{ref.ycbv.obj2id[obj_name]:06d}.ply",
+                ),
+                vertex_scale=self.scale_to_meter,
+            )
+            # NOTE: bbox3d_and_center is not computed from re-centered vertices;
+            # not a big problem for BOP models since they are already centered
+            model["bbox3d_and_center"] = misc.get_bbox3d_and_center(model["pts"])
+
+            models.append(model)
+        logger.info("cache models to {}".format(cache_path))
+        mmcv.dump(models, cache_path, protocol=4)
+        return models
+
+    def image_aspect_ratio(self):
+        return self.width / self.height  # 4/3
+
+
+########### register datasets ############################################################
+
+
+def get_ycbv_metadata(obj_names, ref_key):
+    """task specific metadata."""
+    data_ref = ref.__dict__[ref_key]
+
+    cur_sym_infos = {}  # label based key
+    loaded_models_info = data_ref.get_models_info()
+
+    for i, obj_name in enumerate(obj_names):
+        obj_id = data_ref.obj2id[obj_name]
+        model_info = loaded_models_info[str(obj_id)]
+        if "symmetries_discrete" in model_info or "symmetries_continuous" in model_info:
+            sym_transforms = misc.get_symmetry_transformations(model_info, max_sym_disc_step=0.01)
+            sym_info = np.array([sym["R"] for sym in sym_transforms], dtype=np.float32)
+        else:
+            sym_info = None
+        cur_sym_infos[i] = sym_info
+
+    meta = {"thing_classes": obj_names, "sym_infos": cur_sym_infos}
+    return meta
+
+
+ycbv_model_root = "BOP_DATASETS/ycbv/models/"
+################################################################################
+
+
+SPLITS_YCBV_PBR = dict(
+    ycbv_train_pbr=dict(
+        name="ycbv_train_pbr",
+        objs=ref.ycbv.objects,  # selected objects
+        dataset_root=osp.join(DATASETS_ROOT, "BOP_DATASETS/ycbv/train_pbr"),
+        models_root=osp.join(DATASETS_ROOT, "BOP_DATASETS/ycbv/models"),
+        xyz_root=osp.join(DATASETS_ROOT, "BOP_DATASETS/ycbv/train_pbr/xyz_crop"),
+        scale_to_meter=0.001,
+        with_masks=True,  # (load masks but may not use it)
+        with_depth=True,  # (load depth path here, but may not use it)
+        height=480,
+        width=640,
+        use_cache=True,
+        num_to_load=-1,
+        filter_invalid=True,
+        ref_key="ycbv",
+    )
+)
+
+# single obj splits
+for obj in ref.ycbv.objects:
+    for split in ["train_pbr"]:
+        name = "ycbv_{}_{}".format(obj, split)
+        if split in ["train_pbr"]:
+            filter_invalid = True
+        elif split in ["test"]:
+            filter_invalid = False
+        else:
+            raise ValueError("{}".format(split))
+        if name not in SPLITS_YCBV_PBR:
+            SPLITS_YCBV_PBR[name] = dict(
+                name=name,
+                objs=[obj],  # only this obj
+                dataset_root=osp.join(DATASETS_ROOT, "BOP_DATASETS/ycbv/train_pbr"),
+                models_root=osp.join(DATASETS_ROOT, "BOP_DATASETS/ycbv/models"),
+                xyz_root=osp.join(DATASETS_ROOT, "BOP_DATASETS/ycbv/train_pbr/xyz_crop"),
+                scale_to_meter=0.001,
+                with_masks=True,  # (load masks but may not use it)
+                with_depth=True,  # (load depth path here, but may not use it)
+                height=480,
+                width=640,
+                use_cache=True,
+                num_to_load=-1,
+                filter_invalid=filter_invalid,
+                ref_key="ycbv",
+            )
+
+
+def register_with_name_cfg(name, data_cfg=None):
+    """Assume pre-defined datasets live in `./datasets`.
+
+    Args:
+        name: dataset name,
+        data_cfg: if name is in the existing SPLITS, the pre-defined data_cfg is used;
+            otherwise data_cfg is required;
+            data_cfg can be set in cfg.DATA_CFG.name
+    """
+    dprint("register dataset: {}".format(name))
+    if name in SPLITS_YCBV_PBR:
+        used_cfg = SPLITS_YCBV_PBR[name]
+    else:
+        assert data_cfg is not None, f"dataset name {name} is not registered"
+        used_cfg = data_cfg
+    DatasetCatalog.register(name, YCBV_PBR_Dataset(used_cfg))
+    # something like eval_types
+    MetadataCatalog.get(name).set(
+        id="ycbv",  # NOTE: for pvnet to determine module
+        ref_key=used_cfg["ref_key"],
+        objs=used_cfg["objs"],
+        eval_error_types=["ad", "rete", "proj"],
+        evaluator_type="bop",
+        **get_ycbv_metadata(obj_names=used_cfg["objs"], ref_key=used_cfg["ref_key"]),
+    )
+
+
+def get_available_datasets():
+    return list(SPLITS_YCBV_PBR.keys())
+
+
+#### tests ###############################################
+def test_vis():
+    dset_name = sys.argv[1]
+    assert dset_name in DatasetCatalog.list()
+
+    meta = MetadataCatalog.get(dset_name)
+    dprint("MetadataCatalog: ", meta)
+    objs = meta.objs
+
+    t_start = time.perf_counter()
+    dicts = DatasetCatalog.get(dset_name)
+    logger.info("Done loading {} samples with {:.3f}s.".format(len(dicts), time.perf_counter() - t_start))
+
+    dirname = "output/{}-data-vis".format(dset_name)
+    os.makedirs(dirname, exist_ok=True)
+    for d in dicts:
+        img = read_image_mmcv(d["file_name"], format="BGR")
+        depth = mmcv.imread(d["depth_file"], "unchanged") / d["depth_factor"]
+
+        imH, imW = img.shape[:2]
+        annos = d["annotations"]
+        masks = [cocosegm2mask(anno["segmentation"], imH, imW) for anno in annos]
+        bboxes = [anno["bbox"] for anno in annos]
+        bbox_modes = [anno["bbox_mode"] for anno in annos]
+        bboxes_xyxy = np.array(
+            [BoxMode.convert(box, box_mode, BoxMode.XYXY_ABS) for box, box_mode in zip(bboxes, bbox_modes)]
+        )
+        kpts_3d_list = [anno["bbox3d_and_center"] for anno in annos]
+        quats = [anno["quat"] for anno in annos]
+        transes = [anno["trans"] for anno in annos]
+        Rs = [quat2mat(quat) for quat in quats]
+        # 0-based label
+        cat_ids = [anno["category_id"] for anno in annos]
+        K = d["cam"]
+        kpts_2d = [misc.project_pts(kpt3d, K, R, t) for kpt3d, R, t in zip(kpts_3d_list, Rs, transes)]
+
+        labels = [objs[cat_id] for cat_id in cat_ids]
+        for _i in range(len(annos)):
+            img_vis = vis_image_mask_bbox_cv2(
+                img,
+                masks[_i : _i + 1],
+                bboxes=bboxes_xyxy[_i : _i + 1],
+                labels=labels[_i : _i + 1],
+            )
+            img_vis_kpts2d = misc.draw_projected_box3d(img_vis.copy(), kpts_2d[_i])
+            xyz_path = annos[_i]["xyz_path"]
+            xyz_info = mmcv.load(xyz_path)
+            x1, y1, x2, y2 = xyz_info["xyxy"]
+            xyz_crop = xyz_info["xyz_crop"].astype(np.float32)
+            xyz = np.zeros((imH, imW, 3), dtype=np.float32)
+            xyz[y1 : y2 + 1, x1 : x2 + 1, :] = xyz_crop
+            xyz_show = get_emb_show(xyz)
+            xyz_crop_show = get_emb_show(xyz_crop)
+            img_xyz = img.copy() / 255.0
+            mask_xyz = ((xyz[:, :, 0] != 0) | (xyz[:, :, 1] != 0) | (xyz[:, :, 2] != 0)).astype("uint8")
+            fg_idx = np.where(mask_xyz != 0)
+            img_xyz[fg_idx[0], fg_idx[1], :] = xyz_show[fg_idx[0], fg_idx[1], :3]
+            img_xyz_crop = img_xyz[y1 : y2 + 1, x1 : x2 + 1, :]
+            img_vis_crop = img_vis[y1 : y2 + 1, x1 : x2 + 1, :]
+            # diff mask
+            diff_mask_xyz = np.abs(masks[_i] - mask_xyz)[y1 : y2 + 1, x1 : x2 + 1]
+
+            grid_show(
+                [
+                    img[:, :, [2, 1, 0]],
+                    img_vis[:, :, [2, 1, 0]],
+                    img_vis_kpts2d[:, :, [2, 1, 0]],
+                    depth,
+                    # xyz_show,
+                    diff_mask_xyz,
+                    xyz_crop_show,
+                    img_xyz[:, :, [2, 1, 0]],
+                    img_xyz_crop[:, :, [2, 1, 0]],
+                    img_vis_crop,
+                ],
+                [
+                    "img",
+                    "vis_img",
+                    "img_vis_kpts2d",
+                    "depth",
+                    "diff_mask_xyz",
+                    "xyz_crop_show",
+                    "img_xyz",
+                    "img_xyz_crop",
+                    "img_vis_crop",
+                ],
+                row=3,
+                col=3,
+            )
+
+
+if __name__ == "__main__":
+    """Test the  dataset loader.
+
+    Usage:
+        python -m this_module ycbv_pbr_train
+    """
+    from lib.vis_utils.image import grid_show
+    from lib.utils.setup_logger import setup_my_logger
+
+    import detectron2.data.datasets  # noqa # add pre-defined metadata
+    from lib.vis_utils.image import vis_image_mask_bbox_cv2
+    from core.utils.utils import get_emb_show
+    from core.utils.data_utils import read_image_mmcv
+
+    print("sys.argv:", sys.argv)
+    logger = setup_my_logger(name="core")
+    register_with_name_cfg(sys.argv[1])
+    print("dataset catalog: ", DatasetCatalog.list())
+
+    test_vis()
diff --git a/ref/fruitbin.py b/ref/fruitbin.py
new file mode 100644
index 0000000000000000000000000000000000000000..83502f9b283287b79e81b55fb4e2017d813a01bc
--- /dev/null
+++ b/ref/fruitbin.py
@@ -0,0 +1,119 @@
+# encoding: utf-8
+"""This file includes necessary params, info."""
+import os
+import mmcv
+import os.path as osp
+
+import numpy as np
+
+# ---------------------------------------------------------------- #
+# ROOT PATH INFO
+# ---------------------------------------------------------------- #
+cur_dir = osp.abspath(osp.dirname(__file__))
+root_dir = osp.normpath(osp.join(cur_dir, ".."))
+# directory storing experiment data (result, model checkpoints, etc).
+output_dir = osp.join(root_dir, "output")
+
+data_root = osp.join(root_dir, "datasets")
+bop_root = osp.join(data_root, "BOP_DATASETS/")
+
+# ---------------------------------------------------------------- #
+# YCBV DATASET
+# ---------------------------------------------------------------- #
+dataset_root = osp.join(bop_root, "ycbv")
+
+train_real_dir = osp.join(dataset_root, "train_real")
+train_render_dir = osp.join(dataset_root, "train_synt")
+train_pbr_dir = osp.join(dataset_root, "train_pbr")
+
+test_dir = osp.join(dataset_root, "test")
+
+test_scenes = [i for i in range(48, 59 + 1)]
+train_real_scenes = [i for i in range(0, 91 + 1) if i not in test_scenes]
+train_synt_scenes = [i for i in range(0, 79 + 1)]
+train_pbr_scenes = [i for i in range(0, 49 + 1)]
+
+model_dir = osp.join(dataset_root, "models")
+fine_model_dir = osp.join(dataset_root, "models_fine")
+model_eval_dir = osp.join(dataset_root, "models_eval")
+model_scaled_simple_dir = osp.join(dataset_root, "models_rescaled")  # m, .obj
+vertex_scale = 0.001
+
+# object info
+id2obj = {
+    1: "002_master_chef_can",  # [1.3360, -0.5000, 3.5105]
+    2: "003_cracker_box",  # [0.5575, 1.7005, 4.8050]
+    3: "004_sugar_box",  # [-0.9520, 1.4670, 4.3645]
+    4: "005_tomato_soup_can",  # [-0.0240, -1.5270, 8.4035]
+    5: "006_mustard_bottle",  # [1.2995, 2.4870, -11.8290]
+    6: "007_tuna_fish_can",  # [-0.1565, 0.1150, 4.2625]
+    7: "008_pudding_box",  # [1.1645, -4.2015, 3.1190]
+    8: "009_gelatin_box",  # [1.4460, -0.5915, 3.6085]
+    9: "010_potted_meat_can",  # [2.4195, 0.3075, 8.0715]
+    10: "011_banana",  # [-18.6730, 12.1915, -1.4635]
+    11: "019_pitcher_base",  # [5.3370, 5.8855, 25.6115]
+    12: "021_bleach_cleanser",  # [4.9290, -2.4800, -13.2920]
+    13: "024_bowl",  # [-0.2270, 0.7950, -2.9675]
+    14: "025_mug",  # [-8.4675, -0.6995, -1.6145]
+    15: "035_power_drill",  # [9.0710, 20.9360, -2.1190]
+    16: "036_wood_block",  # [1.4265, -2.5305, 17.1890]
+    17: "037_scissors",  # [7.0535, -28.1320, 0.0420]
+    18: "040_large_marker",  # [0.0460, -2.1040, 0.3500]
+    19: "051_large_clamp",  # [10.5180, -1.9640, -0.4745]
+    20: "052_extra_large_clamp",  # [-0.3950, -10.4130, 0.1620]
+    21: "061_foam_brick",  # [-0.0805, 0.0805, -8.2435]
+}
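+# NOTE (assumption): the bracketed vectors above appear to be per-model center
+# offsets of the BOP models w.r.t. the original YCB models (the dataset class
+# docstrings mention such offsets); treat them as informational only.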
+objects = list(id2obj.values())
+
+obj_num = len(id2obj)
+obj2id = {_name: _id for _id, _name in id2obj.items()}
+
+model_paths = [osp.join(model_dir, "obj_{:06d}.ply".format(_id)) for _id in id2obj]  # TODO: check this
+texture_paths = [osp.join(model_dir, "obj_{:06d}.png".format(_id)) for _id in id2obj]
+model_colors = [((i + 1) * 10, (i + 1) * 10, (i + 1) * 10) for i in range(obj_num)]  # for renderer
+
+# yapf: disable
+diameters = np.array([172.063, 269.573, 198.377, 120.543, 196.463,
+                      89.797,  142.543, 114.053, 129.540, 197.796,
+                      259.534, 259.566, 161.922, 124.990, 226.170,
+                      237.299, 203.973, 121.365, 174.746, 217.094,
+                      102.903]) / 1000.0
+# yapf: enable
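+# diameters are in meters (mm / 1000); typically used for the 0.1 * diameter
+# threshold in ADD(-S) evaluation.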
+# Camera info
+width = 640
+height = 480
+zNear = 0.25
+zFar = 6.0
+center = (height / 2, width / 2)
+# default: 0000~0059 and synt
+camera_matrix = uw_camera_matrix = np.array([[1066.778, 0.0, 312.9869], [0.0, 1067.487, 241.3109], [0.0, 0.0, 1.0]])
+# 0060~0091
+cmu_camera_matrix = np.array([[1077.836, 0.0, 323.7872], [0.0, 1078.189, 279.6921], [0.0, 0.0, 1.0]])
+
+depth_factor = 10000.0
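+# A hedged sketch (names illustrative, added for clarity): converting a raw
+# depth map to meters and projecting a camera-frame point (x, y, z) with the
+# default (uw) intrinsics:
+#   depth_m = depth_raw / depth_factor          # raw uint16 -> meters
+#   uvw = camera_matrix @ np.array([x, y, z])
+#   u, v = uvw[:2] / uvw[2]                     # pixel coordinates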
+
+
+def get_models_info():
+    """key is str(obj_id)"""
+    models_info_path = osp.join(model_dir, "models_info.json")
+    assert osp.exists(models_info_path), models_info_path
+    models_info = mmcv.load(models_info_path)  # key is str(obj_id)
+    return models_info
+
+
+def get_fps_points():
+    """key is str(obj_id) generated by
+    core/gdrn_modeling/tools/ycbv/ycbv_1_compute_fps.py."""
+    fps_points_path = osp.join(model_dir, "fps_points.pkl")
+    assert osp.exists(fps_points_path), fps_points_path
+    fps_dict = mmcv.load(fps_points_path)
+    return fps_dict
+
+
+def get_keypoints_3d():
+    """key is str(obj_id) generated by
+    core/roi_pvnet/tools/ycbv/ycbv_1_compute_keypoints_3d.py."""
+    keypoints_3d_path = osp.join(model_dir, "keypoints_3d.pkl")
+    assert osp.exists(keypoints_3d_path), keypoints_3d_path
+    kpts_dict = mmcv.load(keypoints_3d_path)
+    return kpts_dict
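+
+
+# A hedged helper sketch (added, not part of the original file), mirroring the
+# symmetry handling in get_ycbv_metadata from the dataset registration modules:
+def get_sym_info(obj_id, max_sym_disc_step=0.01):
+    """Return stacked symmetry rotations (Nx3x3 float32) for one object, or
+    None if the object has no annotated symmetries."""
+    from lib.pysixd import misc  # local import, assumed available as elsewhere in the repo
+
+    model_info = get_models_info()[str(obj_id)]
+    if "symmetries_discrete" in model_info or "symmetries_continuous" in model_info:
+        sym_transforms = misc.get_symmetry_transformations(model_info, max_sym_disc_step=max_sym_disc_step)
+        return np.array([sym["R"] for sym in sym_transforms], dtype=np.float32)
+    return None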