Commit 6071d486 authored by Alexandre Chapin

Add logs

parent 37c69f4c
@@ -7,6 +7,5 @@
outputs
logs/*
logs
data
results
*__pycache__
import numpy as np
import imageio
import yaml
from torch.utils.data import Dataset
import os
from os import listdir
from os.path import isfile, join
from .core import downsample, crop_center
import torch
import torch.nn.functional as F
def get_extrinsic(camera_pos, rays=None, track_point=None, fourxfour=True):
""" Returns extrinsic matrix mapping world to camera coordinates.
    Args:
        camera_pos (np array [3]): Camera position.
        rays (np array [h, w, 3]): Rays emanating from the camera. Used to determine track_point
            if it's not given.
        track_point (np array [3]): Point on which the camera is fixated.
        fourxfour (bool): If true, a 4x4 matrix for homogeneous 3D coordinates is returned.
            Otherwise, a 3x4 matrix is returned.
Returns:
extrinsic camera matrix (np array [4, 4] or [3, 4])
"""
if track_point is None:
h, w, _ = rays.shape
        # Take the ray(s) through the image center as the viewing direction
        if h % 2 == 0:
            center_rays = rays[h//2 - 1:h//2 + 1]
        else:
            center_rays = rays[h//2:h//2 + 1]
        if w % 2 == 0:
            center_rays = center_rays[:, w//2 - 1:w//2 + 1]
        else:
            center_rays = center_rays[:, w//2:w//2 + 1]
camera_z = center_rays.mean((0, 1))
else:
camera_z = track_point - camera_pos
camera_z = camera_z / np.linalg.norm(camera_z, axis=-1, keepdims=True)
    # We assume that (a) the z-axis is vertical, and (b) the camera's horizontal
    # x-axis is orthogonal to the vertical, i.e., the camera is level.
vertical = np.array((0., 0., 1.))
camera_x = np.cross(camera_z, vertical)
camera_x = camera_x / np.linalg.norm(camera_x, axis=-1, keepdims=True)
camera_y = np.cross(camera_z, camera_x)
camera_matrix = np.stack((camera_x, camera_y, camera_z), -2)
translation = -np.einsum('...ij,...j->...i', camera_matrix, camera_pos)
camera_matrix = np.concatenate((camera_matrix, np.expand_dims(translation, -1)), -1)
if fourxfour:
filler = np.array([[0., 0., 0., 1.]])
camera_matrix = np.concatenate((camera_matrix, filler), 0)
return camera_matrix
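# Illustrative sketch (not part of the original module): builds an extrinsic for a camera fixated
# on the origin and checks that it maps the camera position to the origin of the camera frame.
# The camera position is an arbitrary example value.
def _demo_get_extrinsic():
    camera_pos = np.array([2., -2., 1.5])
    extrinsic = get_extrinsic(camera_pos, track_point=np.zeros(3))  # 4x4, world -> camera
    cam_center_h = extrinsic @ np.append(camera_pos, 1.)  # homogeneous camera center in camera frame
    assert np.allclose(cam_center_h[:3], 0., atol=1e-6)
    return extrinsic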
def transform_points(points, transform, translate=True):
""" Apply linear transform to a np array of points.
Args:
points (np array [..., 3]): Points to transform.
transform (np array [3, 4] or [4, 4]): Linear map.
translate (bool): If false, do not apply translation component of transform.
Returns:
transformed points (np array [..., 3])
"""
# Append ones or zeros to get homogenous coordinates
if translate:
constant_term = np.ones_like(points[..., :1])
else:
constant_term = np.zeros_like(points[..., :1])
points = np.concatenate((points, constant_term), axis=-1)
points = np.einsum('nm,...m->...n', transform, points)
return points[..., :3]
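# Illustrative sketch (not part of the original module): with translate=False only the rotation
# part of the extrinsic is applied, which is how ray directions are mapped further below, while
# ordinary points also receive the translation. All values are example data.
def _demo_transform_points():
    extrinsic = get_extrinsic(np.array([1., 1., 1.]), track_point=np.zeros(3))
    point = np.array([0.5, -0.5, 0.2])
    direction = point / np.linalg.norm(point)
    point_cam = transform_points(point, extrinsic)                     # rotated and translated
    dir_cam = transform_points(direction, extrinsic, translate=False)  # rotated only
    assert np.isclose(np.linalg.norm(dir_cam), 1.0)  # rotations preserve length
    return point_cam, dir_cam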
def get_camera_rays(c_pos, c_rot, width=640, height=480, focal_length=0.035, sensor_width=0.032,
                    vertical=None):
    """ Returns unit rays through each pixel as a [height, width, 3] array.
    `c_rot` is used directly as the camera's viewing direction (see the TODO in
    YCBVideo3D.__getitem__ about adapting this function to the YCB-Video camera poses).
    """
    if vertical is None:
        vertical = np.array((0., 0., 1.))
    c_dir = c_rot
img_plane_center = c_pos + c_dir * focal_length
# The horizontal axis of the camera sensor is horizontal (z=0) and orthogonal to the view axis
img_plane_horizontal = np.cross(c_dir, vertical)
img_plane_horizontal = img_plane_horizontal / np.linalg.norm(img_plane_horizontal)
# The vertical axis is orthogonal to both the view axis and the horizontal axis
img_plane_vertical = np.cross(c_dir, img_plane_horizontal)
img_plane_vertical = img_plane_vertical / np.linalg.norm(img_plane_vertical)
# Double check that everything is orthogonal
def is_small(x, atol=1e-7):
return abs(x) < atol
assert(is_small(np.dot(img_plane_vertical, img_plane_horizontal)))
assert(is_small(np.dot(img_plane_vertical, c_dir)))
assert(is_small(np.dot(c_dir, img_plane_horizontal)))
# Sensor height is implied by sensor width and aspect ratio
sensor_height = (sensor_width / width) * height
# Compute pixel boundaries
horizontal_offsets = np.linspace(-1, 1, width+1) * sensor_width / 2
vertical_offsets = np.linspace(-1, 1, height+1) * sensor_height / 2
# Compute pixel centers
horizontal_offsets = (horizontal_offsets[:-1] + horizontal_offsets[1:]) / 2
vertical_offsets = (vertical_offsets[:-1] + vertical_offsets[1:]) / 2
horizontal_offsets = np.repeat(np.reshape(horizontal_offsets, (1, width)), height, 0)
vertical_offsets = np.repeat(np.reshape(vertical_offsets, (height, 1)), width, 1)
horizontal_offsets = (np.reshape(horizontal_offsets, (height, width, 1)) *
np.reshape(img_plane_horizontal, (1, 1, 3)))
vertical_offsets = (np.reshape(vertical_offsets, (height, width, 1)) *
np.reshape(img_plane_vertical, (1, 1, 3)))
image_plane = horizontal_offsets + vertical_offsets
image_plane = image_plane + np.reshape(img_plane_center, (1, 1, 3))
c_pos_exp = np.reshape(c_pos, (1, 1, 3))
rays = image_plane - c_pos_exp
ray_norms = np.linalg.norm(rays, axis=2, keepdims=True)
rays = rays / ray_norms
return rays.astype(np.float32)
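# Illustrative sketch (not part of the original module): generates rays for a small image with an
# example downward-tilted viewing direction and checks their shape and unit length. The direction
# passed as `c_rot` follows the c_dir = c_rot convention used above.
def _demo_get_camera_rays():
    c_pos = np.array([0., -2., 1.])
    view_dir = np.array([0., 2., -1.])
    view_dir = view_dir / np.linalg.norm(view_dir)
    rays = get_camera_rays(c_pos, view_dir, width=32, height=24)
    assert rays.shape == (24, 32, 3)
    assert np.allclose(np.linalg.norm(rays, axis=-1), 1., atol=1e-5)
    return rays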
def extract_images_path(global_path, mode, images, type_im="rgb"):
    """ Collects the sorted paths of `type_im` images from every scene directory under
    `global_path`, keeping the first 70% of each scene for 'train' and the rest otherwise,
    and appends them to `images`.
    """
    scenes = listdir(global_path)
for scene in scenes:
path = global_path + scene + f"/{type_im}/"
        temp = np.array([join(path, f) for f in listdir(path) if isfile(join(path, f))])
        temp.sort()
        cut = int(len(temp) * 0.7)  # 70/30 split of the frames in each scene
if mode == "train":
temp = temp[:cut]
else:
temp = temp[cut:]
images = np.concatenate((images, temp))
return images
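# Illustrative sketch (not part of the original module): extract_images_path expects a layout of
# the form <global_path>/<scene>/<type_im>/<frame>.png. The root path below is a hypothetical
# placeholder, not a path shipped with the dataset.
def _demo_extract_images_path(root="/data/ycb_video/train_real/"):
    rgb_paths = extract_images_path(root, "train", np.empty(shape=(0,), dtype=np.str_))
    mask_paths = extract_images_path(root, "train", np.empty(shape=(0,), dtype=np.str_), type_im="masks")
    return rgb_paths, mask_paths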
class YCBVideo3D(Dataset):
def __init__(self, path, mode, max_views=None, points_per_item=2048, canonical_view=True,
max_len=None, full_scale=False, shapenet=False, downsample=None):
""" Loads the YCB-Video dataset that we have adapted.
        Args:
            path (str): Path to dataset.
            mode (str): 'train', 'val', or 'test'.
            max_views (int): Not used by this loader.
            points_per_item (int): Number of target points per scene.
            canonical_view (bool): Return data in canonical camera coordinates (like in SRT),
                as opposed to world coordinates.
            max_len (int): Limit to the number of entries in the dataset.
            full_scale (bool): Return all available target points, instead of sampling.
            shapenet (bool): Stored but not used by this loader.
            downsample (int): Downsample height and width of the input image by a factor of 2**downsample.
        """
self.path = path
self.mode = mode
self.points_per_item = points_per_item
self.max_len = max_len
self.canonical = canonical_view
self.full_scale = full_scale
self.shapenet = shapenet
self.downsample = downsample
self.max_num_entities = 21 # max number of objects in a scene
self.num_views = 3 # TODO : set this number for each scene
self.start_idx, self.end_idx = {'train': (0, 70000),
'val': (70000, 75000),
'test': (85000, 100000)}[mode]
self.metadata = np.load(os.path.join(path, 'metadata.npz'))
self.metadata = {k: v for k, v in self.metadata.items()}
self.idxs = np.arange(self.start_idx, self.end_idx)
dataset_name = 'YCB-Video'
print(f'Initialized {dataset_name} {mode} set, {len(self.idxs)} examples')
print(self.idxs)
self.render_kwargs = {
'min_dist': 0.035,
'max_dist': 35.}
def __len__(self):
if self.max_len is not None:
return self.max_len
return len(self.idxs) * self.num_views
def __getitem__(self, idx):
scene_idx = idx % len(self.idxs)
view_idx = idx // len(self.idxs)
scene_idx = self.idxs[scene_idx]
imgs = [np.asarray(imageio.imread(
os.path.join(self.path, 'images', f'img_{scene_idx}_{v}.png')))
for v in range(self.num_views)]
imgs = [img[..., :3].astype(np.float32) / 255 for img in imgs]
mask_idxs = [imageio.imread(os.path.join(self.path, 'masks', f'masks_{scene_idx}_{v}.png'))
for v in range(self.num_views)]
masks = np.zeros((self.num_views, 240, 320, self.max_num_entities), dtype=np.uint8)
np.put_along_axis(masks, np.expand_dims(mask_idxs, -1), 1, axis=-1)
input_image = downsample(imgs[view_idx], num_steps=self.downsample)
input_images = np.expand_dims(np.transpose(input_image, (2, 0, 1)), 0)
all_rays = []
# TODO : find a way to get the camera poses
all_camera_pos = self.metadata['camera_pos'][:self.num_views].astype(np.float32)
        all_camera_rot = self.metadata['camera_rot'][:self.num_views].astype(np.float32)
for i in range(self.num_views):
            cur_rays = get_camera_rays(all_camera_pos[i], all_camera_rot[i])  # TODO : adapt function
all_rays.append(cur_rays)
all_rays = np.stack(all_rays, 0).astype(np.float32)
input_camera_pos = all_camera_pos[view_idx]
if self.canonical:
track_point = np.zeros_like(input_camera_pos) # All cameras are pointed at the origin
canonical_extrinsic = get_extrinsic(input_camera_pos, track_point=track_point) # TODO : adapt function
canonical_extrinsic = canonical_extrinsic.astype(np.float32)
all_rays = transform_points(all_rays, canonical_extrinsic, translate=False) # TODO : adapt function
all_camera_pos = transform_points(all_camera_pos, canonical_extrinsic)
input_camera_pos = all_camera_pos[view_idx]
input_rays = all_rays[view_idx]
input_rays = downsample(input_rays, num_steps=self.downsample)
input_rays = np.expand_dims(input_rays, 0)
input_masks = masks[view_idx]
input_masks = downsample(input_masks, num_steps=self.downsample)
input_masks = np.expand_dims(input_masks, 0)
input_camera_pos = np.expand_dims(input_camera_pos, 0)
all_pixels = np.reshape(np.stack(imgs, 0), (self.num_views * 240 * 320, 3))
all_rays = np.reshape(all_rays, (self.num_views * 240 * 320, 3))
all_camera_pos = np.tile(np.expand_dims(all_camera_pos, 1), (1, 240 * 320, 1))
all_camera_pos = np.reshape(all_camera_pos, (self.num_views * 240 * 320, 3))
all_masks = np.reshape(masks, (self.num_views * 240 * 320, self.max_num_entities))
num_points = all_rays.shape[0]
if not self.full_scale:
# If we have fewer points than we want, sample with replacement
replace = num_points < self.points_per_item
sampled_idxs = np.random.choice(np.arange(num_points),
size=(self.points_per_item,),
replace=replace)
target_rays = all_rays[sampled_idxs]
target_camera_pos = all_camera_pos[sampled_idxs]
target_pixels = all_pixels[sampled_idxs]
target_masks = all_masks[sampled_idxs]
else:
target_rays = all_rays
target_camera_pos = all_camera_pos
target_pixels = all_pixels
target_masks = all_masks
result = {
'input_images': input_images, # [1, 3, h, w]
'input_camera_pos': input_camera_pos, # [1, 3]
'input_rays': input_rays, # [1, h, w, 3]
'input_masks': input_masks, # [1, h, w, self.max_num_entities]
'target_pixels': target_pixels, # [p, 3]
'target_camera_pos': target_camera_pos, # [p, 3]
'target_rays': target_rays, # [p, 3]
'target_masks': target_masks, # [p, self.max_num_entities]
'sceneid': idx, # int
}
if self.canonical:
result['transform'] = canonical_extrinsic # [3, 4] (optional)
return result
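# Illustrative sketch (not part of the original module): wraps YCBVideo3D in a standard PyTorch
# DataLoader. The dataset root below is a hypothetical placeholder and the batch settings are
# example values only.
def _demo_ycb_video_3d(path="/data/ycb_video_3d/"):
    from torch.utils.data import DataLoader
    dataset = YCBVideo3D(path, mode="train", points_per_item=2048, canonical_view=True)
    loader = DataLoader(dataset, batch_size=4, shuffle=True, num_workers=2)
    batch = next(iter(loader))
    # e.g. batch['input_images'] has shape [4, 1, 3, h, w] and batch['target_pixels'] [4, 2048, 3]
    return batch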
class YCBVideo2D(Dataset):
def __init__(self, path, mode, max_objects=6):
""" Loads the YCB dataset in the right format
Args:
path (str): Path to dataset.
mode (str): 'train', 'val', or 'test'.
full_scale (bool): Return all available target points, instead of sampling.
max_objects (int): Load only scenes with at most this many objects.
"""
self.path = path
print(f"Get path {path}")
self.mode = mode
self.max_objects = max_objects
self.max_num_entities = 22
self.rescale = 128
"""self.metadata = np.load(os.path.join(self.path, 'metadata.npz'))
self.metadata = {k: v for k, v in self.metadata.items()}
num_objs = (self.metadata['shape'][self.start_idx:self.end_idx] > 0).sum(1)
self.idxs = np.arange(self.start_idx, self.end_idx)[num_objs <= max_objects]"""
self.images = np.empty(shape=(0,), dtype=np.str_)
self.masks = np.empty(shape=(0,), dtype=np.str_)
if mode == "test":
self.path += "/test/"
scenes = [f for f in listdir(self.path)]
for scene in scenes:
                path = self.path + scene
                temp = np.array([join(path, f) for f in listdir(path) if isfile(join(path, f))])
                self.images = np.concatenate((self.images, temp))
else:
path_real = self.path + "/train_real/"
path_synth = self.path + "/train_synth/"
self.images = extract_images_path(path_real, mode, self.images)
self.images = extract_images_path(path_synth, mode, self.images)
self.masks = extract_images_path(path_real, mode, self.masks, "masks")
self.masks = extract_images_path(path_synth, mode, self.masks, "masks")
dataset_name = 'YCB'
print(f"Load dataset {dataset_name} in mode {self.mode}")
def __len__(self):
return len(self.images)
def __getitem__(self, idx, noisy=True):
"""scene_idx = idx % len(self.idxs)
scene_idx = self.idxs[scene_idx]"""
img_path = self.images[idx]
img = np.asarray(imageio.imread(img_path))
img = img[..., :3].astype(np.float32) / 255
input_image = crop_center(img, 440)
input_image = F.interpolate(torch.tensor(input_image).permute(2, 0, 1).unsqueeze(0), size=self.rescale)
input_image = input_image.squeeze(0)
mask_path = self.masks[idx]
mask_idxs = imageio.imread(mask_path)
masks = np.zeros((480, 640, self.max_num_entities), dtype=np.uint8)
np.put_along_axis(masks, np.expand_dims(mask_idxs, -1), 1, axis=-1)
        input_masks = crop_center(torch.tensor(masks), 440)
        # F.interpolate does not operate on integer tensors, so cast the one-hot masks to float first
        input_masks = F.interpolate(input_masks.permute(2, 0, 1).unsqueeze(0).float(), size=self.rescale)
input_masks = input_masks.squeeze(0).permute(1, 2, 0)
target_masks = np.reshape(input_masks, (self.rescale*self.rescale, self.max_num_entities))
result = {
'input_images': input_image, # [3, h, w]
'input_masks': input_masks, # [h, w, self.max_num_entities]
'target_masks': target_masks, # [h*w, self.max_num_entities]
'sceneid': idx, # int
}
return result
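# Illustrative sketch (not part of the original module): loads a single 2D example and checks the
# shapes documented in the result dict. The dataset root below is a hypothetical placeholder.
def _demo_ycb_video_2d(path="/data/ycb_video_2d"):
    dataset = YCBVideo2D(path, mode="train", max_objects=6)
    sample = dataset[0]
    assert sample['input_images'].shape == (3, 128, 128)
    assert sample['target_masks'].shape == (128 * 128, dataset.max_num_entities)
    return sample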