import numpy as np
import imageio
import yaml
from torch.utils.data import Dataset
import os
from os import listdir
from os.path import isfile, join
from .core import downsample, crop_center
import torch
import torch.nn.functional as F
def get_extrinsic(camera_pos, rays=None, track_point=None, fourxfour=True):
""" Returns extrinsic matrix mapping world to camera coordinates.
Args:
        camera_pos (np array [3]): Camera position.
        rays (np array [h, w, 3]): Rays emanating from the camera. Used to determine track_point
            if it's not given.
        track_point (np array [3]): Point on which the camera is fixated.
fourxfour (bool): If true, a 4x4 matrix for homogeneous 3D coordinates is returned.
Otherwise, a 3x4 matrix is returned.
Returns:
extrinsic camera matrix (np array [4, 4] or [3, 4])
"""
if track_point is None:
h, w, _ = rays.shape
if h % 2 == 0:
center_rays = rays[h//2 - 1:h//2 + 1]
else:
center_rays = rays[h//2:h//2+1]
        if w % 2 == 0:
            center_rays = center_rays[:, w//2 - 1:w//2 + 1]
        else:
            center_rays = center_rays[:, w//2:w//2+1]
camera_z = center_rays.mean((0, 1))
else:
camera_z = track_point - camera_pos
camera_z = camera_z / np.linalg.norm(camera_z, axis=-1, keepdims=True)
    # We assume that (a) the z-axis is vertical, and (b) the camera's horizontal
    # axis (the x-axis) is orthogonal to the vertical, i.e., the camera is level.
vertical = np.array((0., 0., 1.))
camera_x = np.cross(camera_z, vertical)
camera_x = camera_x / np.linalg.norm(camera_x, axis=-1, keepdims=True)
camera_y = np.cross(camera_z, camera_x)
camera_matrix = np.stack((camera_x, camera_y, camera_z), -2)
translation = -np.einsum('...ij,...j->...i', camera_matrix, camera_pos)
camera_matrix = np.concatenate((camera_matrix, np.expand_dims(translation, -1)), -1)
if fourxfour:
filler = np.array([[0., 0., 0., 1.]])
camera_matrix = np.concatenate((camera_matrix, filler), 0)
return camera_matrix
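# Usage sketch: the extrinsic returned above maps world coordinates to camera coordinates,
# so the fixation point lands on the camera's positive z-axis at its world-space distance.
# The pose values are illustrative; transform_points is defined just below.
def _demo_get_extrinsic():
    cam_pos = np.array([2., 0., 1.])
    extrinsic = get_extrinsic(cam_pos, track_point=np.zeros(3))
    # Distance from the camera to the origin is sqrt(5) ~= 2.236.
    print(transform_points(np.zeros(3), extrinsic))  # approximately [0., 0., 2.236]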
def transform_points(points, transform, translate=True):
""" Apply linear transform to a np array of points.
Args:
points (np array [..., 3]): Points to transform.
transform (np array [3, 4] or [4, 4]): Linear map.
translate (bool): If false, do not apply translation component of transform.
Returns:
transformed points (np array [..., 3])
"""
    # Append ones or zeros to get homogeneous coordinates
if translate:
constant_term = np.ones_like(points[..., :1])
else:
constant_term = np.zeros_like(points[..., :1])
points = np.concatenate((points, constant_term), axis=-1)
points = np.einsum('nm,...m->...n', transform, points)
return points[..., :3]
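# Usage sketch: with translate=False only the rotational part of the transform is applied,
# which is how ray directions (rather than point locations) are mapped between frames in
# YCBVideo3D.__getitem__ below. The pose values are illustrative.
def _demo_transform_points():
    extrinsic = get_extrinsic(np.array([0., -3., 0.]), track_point=np.zeros(3))
    direction = np.array([0., 1., 0.])  # a direction, not a location
    # The camera's viewing direction maps onto the camera-frame z-axis.
    print(transform_points(direction, extrinsic, translate=False))  # [0., 0., 1.]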
def get_camera_rays(c_pos, c_rot, width=640, height=480, focal_length=0.035, sensor_width=0.032,
                    vertical=None):
    """ Computes normalized per-pixel ray directions for a pinhole camera at c_pos whose
    optical axis is the direction vector c_rot. Returns an np array [height, width, 3]. """
    if vertical is None:
        vertical = np.array((0., 0., 1.))
c_dir = c_rot
img_plane_center = c_pos + c_dir * focal_length
# The horizontal axis of the camera sensor is horizontal (z=0) and orthogonal to the view axis
img_plane_horizontal = np.cross(c_dir, vertical)
img_plane_horizontal = img_plane_horizontal / np.linalg.norm(img_plane_horizontal)
# The vertical axis is orthogonal to both the view axis and the horizontal axis
img_plane_vertical = np.cross(c_dir, img_plane_horizontal)
img_plane_vertical = img_plane_vertical / np.linalg.norm(img_plane_vertical)
# Double check that everything is orthogonal
def is_small(x, atol=1e-7):
return abs(x) < atol
assert(is_small(np.dot(img_plane_vertical, img_plane_horizontal)))
assert(is_small(np.dot(img_plane_vertical, c_dir)))
assert(is_small(np.dot(c_dir, img_plane_horizontal)))
# Sensor height is implied by sensor width and aspect ratio
sensor_height = (sensor_width / width) * height
# Compute pixel boundaries
horizontal_offsets = np.linspace(-1, 1, width+1) * sensor_width / 2
vertical_offsets = np.linspace(-1, 1, height+1) * sensor_height / 2
# Compute pixel centers
horizontal_offsets = (horizontal_offsets[:-1] + horizontal_offsets[1:]) / 2
vertical_offsets = (vertical_offsets[:-1] + vertical_offsets[1:]) / 2
horizontal_offsets = np.repeat(np.reshape(horizontal_offsets, (1, width)), height, 0)
vertical_offsets = np.repeat(np.reshape(vertical_offsets, (height, 1)), width, 1)
horizontal_offsets = (np.reshape(horizontal_offsets, (height, width, 1)) *
np.reshape(img_plane_horizontal, (1, 1, 3)))
vertical_offsets = (np.reshape(vertical_offsets, (height, width, 1)) *
np.reshape(img_plane_vertical, (1, 1, 3)))
image_plane = horizontal_offsets + vertical_offsets
image_plane = image_plane + np.reshape(img_plane_center, (1, 1, 3))
c_pos_exp = np.reshape(c_pos, (1, 1, 3))
rays = image_plane - c_pos_exp
ray_norms = np.linalg.norm(rays, axis=2, keepdims=True)
rays = rays / ray_norms
return rays.astype(np.float32)
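# Usage sketch: rays come back as a [height, width, 3] grid of unit vectors, and the central
# rays point approximately along the viewing direction c_rot. The pose values are illustrative.
def _demo_get_camera_rays():
    rays = get_camera_rays(np.array([0., -3., 1.]), np.array([0., 1., 0.]), width=320, height=240)
    print(rays.shape)      # (240, 320, 3)
    print(rays[120, 160])  # approximately [0., 1., 0.]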
def extract_images_path(global_path, mode, images, type_im="rgb"):
    """ Collects the sorted image paths of the given type from every scene under global_path,
    keeping the first 70% of each scene for 'train' and the remaining 30% otherwise. """
    scenes = listdir(global_path)
    for scene in scenes:
        path = join(global_path, scene, type_im)
        temp = np.array([join(path, f) for f in listdir(path) if isfile(join(path, f))])
        temp.sort()
        cut = int(len(temp) * 0.7)
        if mode == "train":
            temp = temp[:cut]
        else:
            temp = temp[cut:]
        images = np.concatenate((images, temp))
    return images
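# Usage sketch: extract_images_path assumes a <global_path>/<scene>/<type_im>/ layout and
# appends a sorted, per-scene 70/30 train/val split to the array it is given. The path
# below is a placeholder.
def _demo_extract_images_path():
    images = np.empty(shape=(0,), dtype=np.str_)
    images = extract_images_path('/path/to/ycbv/train_real/', 'train', images)
    print(len(images))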
class YCBVideo3D(Dataset):
def __init__(self, path, mode, max_views=None, points_per_item=2048, canonical_view=True,
max_len=None, full_scale=False, shapenet=False, downsample=None):
""" Loads the YCB-Video dataset that we have adapted.
Args:
path (str): Path to dataset.
mode (str): 'train', 'val', or 'test'.
points_per_item (int): Number of target points per scene.
max_len (int): Limit to the number of entries in the dataset.
canonical_view (bool): Return data in canonical camera coordinates (like in SRT), as opposed
to world coordinates.
full_scale (bool): Return all available target points, instead of sampling.
downsample (int): Downsample height and width of input image by a factor of 2**downsample
"""
self.path = path
self.mode = mode
self.points_per_item = points_per_item
self.max_len = max_len
self.canonical = canonical_view
self.full_scale = full_scale
self.shapenet = shapenet
self.downsample = downsample
self.max_num_entities = 21 # max number of objects in a scene
self.num_views = 3 # TODO : set this number for each scene
self.start_idx, self.end_idx = {'train': (0, 70000),
'val': (70000, 75000),
'test': (85000, 100000)}[mode]
self.metadata = np.load(os.path.join(path, 'metadata.npz'))
self.metadata = {k: v for k, v in self.metadata.items()}
self.idxs = np.arange(self.start_idx, self.end_idx)
dataset_name = 'YCB-Video'
print(f'Initialized {dataset_name} {mode} set, {len(self.idxs)} examples')
print(self.idxs)
self.render_kwargs = {
'min_dist': 0.035,
'max_dist': 35.}
def __len__(self):
if self.max_len is not None:
return self.max_len
return len(self.idxs) * self.num_views
def __getitem__(self, idx):
scene_idx = idx % len(self.idxs)
view_idx = idx // len(self.idxs)
scene_idx = self.idxs[scene_idx]
imgs = [np.asarray(imageio.imread(
os.path.join(self.path, 'images', f'img_{scene_idx}_{v}.png')))
for v in range(self.num_views)]
imgs = [img[..., :3].astype(np.float32) / 255 for img in imgs]
mask_idxs = [imageio.imread(os.path.join(self.path, 'masks', f'masks_{scene_idx}_{v}.png'))
for v in range(self.num_views)]
masks = np.zeros((self.num_views, 240, 320, self.max_num_entities), dtype=np.uint8)
np.put_along_axis(masks, np.expand_dims(mask_idxs, -1), 1, axis=-1)
input_image = downsample(imgs[view_idx], num_steps=self.downsample)
input_images = np.expand_dims(np.transpose(input_image, (2, 0, 1)), 0)
all_rays = []
# TODO : find a way to get the camera poses
all_camera_pos = self.metadata['camera_pos'][:self.num_views].astype(np.float32)
        all_camera_rot = self.metadata['camera_rot'][:self.num_views].astype(np.float32)
        for i in range(self.num_views):
            cur_rays = get_camera_rays(all_camera_pos[i], all_camera_rot[i])  # TODO : adapt function
            all_rays.append(cur_rays)
all_rays = np.stack(all_rays, 0).astype(np.float32)
input_camera_pos = all_camera_pos[view_idx]
if self.canonical:
track_point = np.zeros_like(input_camera_pos) # All cameras are pointed at the origin
canonical_extrinsic = get_extrinsic(input_camera_pos, track_point=track_point) # TODO : adapt function
canonical_extrinsic = canonical_extrinsic.astype(np.float32)
all_rays = transform_points(all_rays, canonical_extrinsic, translate=False) # TODO : adapt function
all_camera_pos = transform_points(all_camera_pos, canonical_extrinsic)
input_camera_pos = all_camera_pos[view_idx]
input_rays = all_rays[view_idx]
input_rays = downsample(input_rays, num_steps=self.downsample)
input_rays = np.expand_dims(input_rays, 0)
input_masks = masks[view_idx]
input_masks = downsample(input_masks, num_steps=self.downsample)
input_masks = np.expand_dims(input_masks, 0)
input_camera_pos = np.expand_dims(input_camera_pos, 0)
all_pixels = np.reshape(np.stack(imgs, 0), (self.num_views * 240 * 320, 3))
all_rays = np.reshape(all_rays, (self.num_views * 240 * 320, 3))
all_camera_pos = np.tile(np.expand_dims(all_camera_pos, 1), (1, 240 * 320, 1))
all_camera_pos = np.reshape(all_camera_pos, (self.num_views * 240 * 320, 3))
all_masks = np.reshape(masks, (self.num_views * 240 * 320, self.max_num_entities))
num_points = all_rays.shape[0]
if not self.full_scale:
# If we have fewer points than we want, sample with replacement
replace = num_points < self.points_per_item
sampled_idxs = np.random.choice(np.arange(num_points),
size=(self.points_per_item,),
replace=replace)
target_rays = all_rays[sampled_idxs]
target_camera_pos = all_camera_pos[sampled_idxs]
target_pixels = all_pixels[sampled_idxs]
target_masks = all_masks[sampled_idxs]
else:
target_rays = all_rays
target_camera_pos = all_camera_pos
target_pixels = all_pixels
target_masks = all_masks
result = {
'input_images': input_images, # [1, 3, h, w]
'input_camera_pos': input_camera_pos, # [1, 3]
'input_rays': input_rays, # [1, h, w, 3]
'input_masks': input_masks, # [1, h, w, self.max_num_entities]
'target_pixels': target_pixels, # [p, 3]
'target_camera_pos': target_camera_pos, # [p, 3]
'target_rays': target_rays, # [p, 3]
'target_masks': target_masks, # [p, self.max_num_entities]
'sceneid': idx, # int
}
if self.canonical:
result['transform'] = canonical_extrinsic # [3, 4] (optional)
return result
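# Usage sketch: typical use of the 3D loader with a PyTorch DataLoader. The dataset path is
# a placeholder; default collation adds a leading batch dimension to every entry of the
# result dict.
def _demo_ycb_video_3d_loader():
    from torch.utils.data import DataLoader
    dataset = YCBVideo3D('/path/to/ycbv3d', 'train', points_per_item=2048, downsample=1)
    loader = DataLoader(dataset, batch_size=4, shuffle=True)
    batch = next(iter(loader))
    print(batch['input_images'].shape)   # [4, 1, 3, h, w]
    print(batch['target_pixels'].shape)  # [4, 2048, 3]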
class YCBVideo2D(Dataset):
def __init__(self, path, mode, max_objects=6):
""" Loads the YCB dataset in the right format
Args:
path (str): Path to dataset.
mode (str): 'train', 'val', or 'test'.
full_scale (bool): Return all available target points, instead of sampling.
max_objects (int): Load only scenes with at most this many objects.
"""
self.path = path
print(f"Get path {path}")
self.mode = mode
self.max_objects = max_objects
self.max_num_entities = 22
self.rescale = 128
"""self.metadata = np.load(os.path.join(self.path, 'metadata.npz'))
self.metadata = {k: v for k, v in self.metadata.items()}
num_objs = (self.metadata['shape'][self.start_idx:self.end_idx] > 0).sum(1)
self.idxs = np.arange(self.start_idx, self.end_idx)[num_objs <= max_objects]"""
self.images = np.empty(shape=(0,), dtype=np.str_)
self.masks = np.empty(shape=(0,), dtype=np.str_)
if mode == "test":
self.path += "/test/"
            scenes = listdir(self.path)
            for scene in scenes:
                path = self.path + scene
                temp = np.array([join(path, f) for f in listdir(path) if isfile(join(path, f))])
                self.images = np.concatenate((self.images, temp))
else:
path_real = self.path + "/train_real/"
path_synth = self.path + "/train_synth/"
self.images = extract_images_path(path_real, mode, self.images)
self.images = extract_images_path(path_synth, mode, self.images)
self.masks = extract_images_path(path_real, mode, self.masks, "masks")
self.masks = extract_images_path(path_synth, mode, self.masks, "masks")
dataset_name = 'YCB'
print(f"Load dataset {dataset_name} in mode {self.mode}")
def __len__(self):
return len(self.images)
def __getitem__(self, idx, noisy=True):
"""scene_idx = idx % len(self.idxs)
scene_idx = self.idxs[scene_idx]"""
img_path = self.images[idx]
img = np.asarray(imageio.imread(img_path))
img = img[..., :3].astype(np.float32) / 255
input_image = crop_center(img, 440)
input_image = F.interpolate(torch.tensor(input_image).permute(2, 0, 1).unsqueeze(0), size=self.rescale)
input_image = input_image.squeeze(0)
mask_path = self.masks[idx]
mask_idxs = imageio.imread(mask_path)
masks = np.zeros((480, 640, self.max_num_entities), dtype=np.uint8)
np.put_along_axis(masks, np.expand_dims(mask_idxs, -1), 1, axis=-1)
        input_masks = crop_center(torch.tensor(masks), 440)
        # Interpolate on a float copy: nearest-neighbour interpolation is not supported for
        # uint8 tensors in all PyTorch versions.
        input_masks = F.interpolate(input_masks.permute(2, 0, 1).unsqueeze(0).float(), size=self.rescale)
        input_masks = input_masks.squeeze(0).permute(1, 2, 0)
        target_masks = input_masks.reshape(self.rescale * self.rescale, self.max_num_entities)
result = {
'input_images': input_image, # [3, h, w]
'input_masks': input_masks, # [h, w, self.max_num_entities]
'target_masks': target_masks, # [h*w, self.max_num_entities]
'sceneid': idx, # int
}
return result
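# Usage sketch: the 2D loader returns one cropped and resized image together with its
# one-hot masks. The dataset path is a placeholder.
def _demo_ycb_video_2d_loader():
    dataset = YCBVideo2D('/path/to/ycbv', 'train')
    sample = dataset[0]
    print(sample['input_images'].shape)  # [3, 128, 128]
    print(sample['target_masks'].shape)  # [128*128, 22]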