diff --git a/scripts/record3d2nerf.py b/scripts/record3d2nerf.py
index 5e5f9316d21a0ab1a4fdbb0ea96dfe36a260138a..c148a05cb18414cfe5f33936462c654157376641 100644
--- a/scripts/record3d2nerf.py
+++ b/scripts/record3d2nerf.py
@@ -20,157 +20,157 @@ from tqdm import tqdm
 from PIL import Image
 
 def rotate_img(img_path, degree=90):
-    img = Image.open(img_path)
-    img = img.rotate(degree, expand=1)
-    img.save(img_path, quality=100, subsampling=0)
-
+	img = Image.open(img_path)
+	img = img.rotate(degree, expand=1)
+	img.save(img_path, quality=100, subsampling=0)
+
 def rotate_camera(c2w, degree=90):
-    rad = np.deg2rad(degree)
-    R = Quaternion(axis=[0, 0, -1], angle=rad)
-    T = R.transformation_matrix
-    return c2w @ T
+	rad = np.deg2rad(degree)
+	R = Quaternion(axis=[0, 0, -1], angle=rad)
+	T = R.transformation_matrix
+	return c2w @ T
 
 def swap_axes(c2w):
-    rad = np.pi / 2
-    R = Quaternion(axis=[1, 0, 0], angle=rad)
-    T = R.transformation_matrix
-    return T @ c2w
+	rad = np.pi / 2
+	R = Quaternion(axis=[1, 0, 0], angle=rad)
+	T = R.transformation_matrix
+	return T @ c2w
 
 # Automatic rescale & offset the poses.
 def find_transforms_center_and_scale(raw_transforms):
-    print("computing center of attention...")
-    frames = raw_transforms['frames']
-    for frame in frames:
-        frame['transform_matrix'] = np.array(frame['transform_matrix'])
-
-    rays_o = []
-    rays_d = []
-    for f in tqdm(frames):
-        mf = f["transform_matrix"][0:3,:]
-        rays_o.append(mf[:3,3:])
-        rays_d.append(mf[:3,2:3])
-    rays_o = np.asarray(rays_o)
-    rays_d = np.asarray(rays_d)
-
-    # Find the point that minimizes its distances to all rays.
-    def min_line_dist(rays_o, rays_d):
-        A_i = np.eye(3) - rays_d * np.transpose(rays_d, [0,2,1])
-        b_i = -A_i @ rays_o
-        pt_mindist = np.squeeze(-np.linalg.inv((np.transpose(A_i, [0,2,1]) @ A_i).mean(0)) @ (b_i).mean(0))
-        return pt_mindist
-
-    translation = min_line_dist(rays_o, rays_d)
-    normalized_transforms = copy.deepcopy(raw_transforms)
-    for f in normalized_transforms["frames"]:
-        f["transform_matrix"][0:3,3] -= translation
-
-    # Find the scale.
-    avglen = 0.
-    for f in normalized_transforms["frames"]:
-        avglen += np.linalg.norm(f["transform_matrix"][0:3,3])
-    nframes = len(normalized_transforms["frames"])
-    avglen /= nframes
-    print("avg camera distance from origin", avglen)
-    scale = 4.0 / avglen # scale to "nerf sized"
-
-    return translation, scale
+	print("computing center of attention...")
+	frames = raw_transforms['frames']
+	for frame in frames:
+		frame['transform_matrix'] = np.array(frame['transform_matrix'])
+
+	rays_o = []
+	rays_d = []
+	for f in tqdm(frames):
+		mf = f["transform_matrix"][0:3,:]
+		rays_o.append(mf[:3,3:])
+		rays_d.append(mf[:3,2:3])
+	rays_o = np.asarray(rays_o)
+	rays_d = np.asarray(rays_d)
+
+	# Find the point that minimizes its distances to all rays.
+	def min_line_dist(rays_o, rays_d):
+		A_i = np.eye(3) - rays_d * np.transpose(rays_d, [0,2,1])
+		b_i = -A_i @ rays_o
+		pt_mindist = np.squeeze(-np.linalg.inv((np.transpose(A_i, [0,2,1]) @ A_i).mean(0)) @ (b_i).mean(0))
+		return pt_mindist
+
+	translation = min_line_dist(rays_o, rays_d)
+	normalized_transforms = copy.deepcopy(raw_transforms)
+	for f in normalized_transforms["frames"]:
+		f["transform_matrix"][0:3,3] -= translation
+
+	# Find the scale.
+	avglen = 0.
+	for f in normalized_transforms["frames"]:
+		avglen += np.linalg.norm(f["transform_matrix"][0:3,3])
+	nframes = len(normalized_transforms["frames"])
+	avglen /= nframes
+	print("avg camera distance from origin", avglen)
+	scale = 4.0 / avglen # scale to "nerf sized"
+
+	return translation, scale
 
 def normalize_transforms(transforms, translation, scale):
-    normalized_transforms = copy.deepcopy(transforms)
-    for f in normalized_transforms["frames"]:
-        f["transform_matrix"] = np.asarray(f["transform_matrix"])
-        f["transform_matrix"][0:3,3] -= translation
-        f["transform_matrix"][0:3,3] *= scale
-        f["transform_matrix"] = f["transform_matrix"].tolist()
-    return normalized_transforms
+	normalized_transforms = copy.deepcopy(transforms)
+	for f in normalized_transforms["frames"]:
+		f["transform_matrix"] = np.asarray(f["transform_matrix"])
+		f["transform_matrix"][0:3,3] -= translation
+		f["transform_matrix"][0:3,3] *= scale
+		f["transform_matrix"] = f["transform_matrix"].tolist()
+	return normalized_transforms
 
 def parse_args():
-    parser = argparse.ArgumentParser(description="convert a Record3D capture to nerf format transforms.json")
-    parser.add_argument("--scene", default="", help="path to the Record3D capture")
-    parser.add_argument("--rotate", action="store_true", help="rotate the dataset")
-    parser.add_argument("--subsample", default=1, type=int, help="step size of subsampling")
-    args = parser.parse_args()
-    return args
+	parser = argparse.ArgumentParser(description="convert a Record3D capture to nerf format transforms.json")
+	parser.add_argument("--scene", default="", help="path to the Record3D capture")
+	parser.add_argument("--rotate", action="store_true", help="rotate the dataset")
+	parser.add_argument("--subsample", default=1, type=int, help="step size of subsampling")
+	args = parser.parse_args()
+	return args
 
 if __name__ == "__main__":
-    args = parse_args()
-    dataset_dir = Path(args.scene)
-    with open(dataset_dir / 'metadata') as f:
-        metadata = json.load(f)
-
-    frames = []
-    n_images = len(list((dataset_dir / 'rgbd').glob('*.jpg')))
-    poses = np.array(metadata['poses'])
-    for idx in tqdm(range(n_images)):
-        # Link the image.
-        img_name = f'{idx}.jpg'
-        img_path = dataset_dir / 'rgbd' / img_name
-
-        # Rotate the image.
-        if args.rotate:
-            # TODO: parallelize this step with joblib.
-            rotate_img(img_path)
-
-        # Extract c2w.
-        """ Each `pose` is a 7-element tuple which contains quaternion + world position.
-            [qx, qy, qz, qw, tx, ty, tz]
-        """
-        pose = poses[idx]
-        q = Quaternion(x=pose[0], y=pose[1], z=pose[2], w=pose[3])
-        c2w = np.eye(4)
-        c2w[:3, :3] = q.rotation_matrix
-        c2w[:3, -1] = [pose[4], pose[5], pose[6]]
-        if args.rotate:
-            c2w = rotate_camera(c2w)
-        c2w = swap_axes(c2w)
-
-        frames.append(
-            {
-                "file_path": f"./rgbd/{img_name}",
-                "transform_matrix": c2w.tolist(),
-            }
-        )
-
-    # Write intrinsics to `cameras.txt`.
-    if not args.rotate:
-        h = metadata['h']
-        w = metadata['w']
-        K = np.array(metadata['K']).reshape([3, 3]).T
-        fx = K[0, 0]
-        fy = K[1, 1]
-        cx = K[0, 2]
-        cy = K[1, 2]
-    else:
-        h = metadata['w']
-        w = metadata['h']
-        K = np.array(metadata['K']).reshape([3, 3]).T
-        fx = K[1, 1]
-        fy = K[0, 0]
-        cx = K[1, 2]
-        cy = h - K[0, 2]
-
-    transforms = {}
-    transforms['fl_x'] = fx
-    transforms['fl_y'] = fy
-    transforms['cx'] = cx
-    transforms['cy'] = cy
-    transforms['w'] = w
-    transforms['h'] = h
-    transforms['aabb_scale'] = 16
-    transforms['scale'] = 1.0
-    transforms['camera_angle_x'] = 2 * np.arctan(transforms['w'] / (2 * transforms['fl_x']))
-    transforms['camera_angle_y'] = 2 * np.arctan(transforms['h'] / (2 * transforms['fl_y']))
-    transforms['frames'] = frames
-
-    os.makedirs(dataset_dir / 'arkit_transforms', exist_ok=True)
-    with open(dataset_dir / 'arkit_transforms' / 'transforms.json', 'w') as fp:
-        json.dump(transforms, fp, indent=2)
-
-    # Normalize the poses.
-    transforms['frames'] = transforms['frames'][::args.subsample]
-    translation, scale = find_transforms_center_and_scale(transforms)
-    normalized_transforms = normalize_transforms(transforms, translation, scale)
-
-    output_path = dataset_dir / 'transforms.json'
-    with open(output_path, "w") as outfile:
-        json.dump(normalized_transforms, outfile, indent=2)
\ No newline at end of file
+	args = parse_args()
+	dataset_dir = Path(args.scene)
+	with open(dataset_dir / 'metadata') as f:
+		metadata = json.load(f)
+
+	frames = []
+	n_images = len(list((dataset_dir / 'rgbd').glob('*.jpg')))
+	poses = np.array(metadata['poses'])
+	for idx in tqdm(range(n_images)):
+		# Link the image.
+		img_name = f'{idx}.jpg'
+		img_path = dataset_dir / 'rgbd' / img_name
+
+		# Rotate the image.
+		if args.rotate:
+			# TODO: parallelize this step with joblib.
+			rotate_img(img_path)
+
+		# Extract c2w.
+		""" Each `pose` is a 7-element tuple which contains quaternion + world position.
+			[qx, qy, qz, qw, tx, ty, tz]
+		"""
+		pose = poses[idx]
+		q = Quaternion(x=pose[0], y=pose[1], z=pose[2], w=pose[3])
+		c2w = np.eye(4)
+		c2w[:3, :3] = q.rotation_matrix
+		c2w[:3, -1] = [pose[4], pose[5], pose[6]]
+		if args.rotate:
+			c2w = rotate_camera(c2w)
+		c2w = swap_axes(c2w)
+
+		frames.append(
+			{
+				"file_path": f"./rgbd/{img_name}",
+				"transform_matrix": c2w.tolist(),
+			}
+		)
+
+	# Write intrinsics to `cameras.txt`.
+	if not args.rotate:
+		h = metadata['h']
+		w = metadata['w']
+		K = np.array(metadata['K']).reshape([3, 3]).T
+		fx = K[0, 0]
+		fy = K[1, 1]
+		cx = K[0, 2]
+		cy = K[1, 2]
+	else:
+		h = metadata['w']
+		w = metadata['h']
+		K = np.array(metadata['K']).reshape([3, 3]).T
+		fx = K[1, 1]
+		fy = K[0, 0]
+		cx = K[1, 2]
+		cy = h - K[0, 2]
+
+	transforms = {}
+	transforms['fl_x'] = fx
+	transforms['fl_y'] = fy
+	transforms['cx'] = cx
+	transforms['cy'] = cy
+	transforms['w'] = w
+	transforms['h'] = h
+	transforms['aabb_scale'] = 16
+	transforms['scale'] = 1.0
+	transforms['camera_angle_x'] = 2 * np.arctan(transforms['w'] / (2 * transforms['fl_x']))
+	transforms['camera_angle_y'] = 2 * np.arctan(transforms['h'] / (2 * transforms['fl_y']))
+	transforms['frames'] = frames
+
+	os.makedirs(dataset_dir / 'arkit_transforms', exist_ok=True)
+	with open(dataset_dir / 'arkit_transforms' / 'transforms.json', 'w') as fp:
+		json.dump(transforms, fp, indent=2)
+
+	# Normalize the poses.
+	transforms['frames'] = transforms['frames'][::args.subsample]
+	translation, scale = find_transforms_center_and_scale(transforms)
+	normalized_transforms = normalize_transforms(transforms, translation, scale)
+
+	output_path = dataset_dir / 'transforms.json'
+	with open(output_path, "w") as outfile:
+		json.dump(normalized_transforms, outfile, indent=2)
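
Usage sketch (not part of the patch; the ./capture path is a placeholder for a Record3D export containing the metadata file and rgbd/ image folder that the script reads, and the flags are the ones defined in parse_args()):

    python scripts/record3d2nerf.py --scene ./capture --subsample 2
    # optionally add --rotate to rotate images and camera poses by 90 degrees via rotate_img()/rotate_camera()

The script writes the unnormalized poses to <scene>/arkit_transforms/transforms.json and the recentered, rescaled poses to <scene>/transforms.json.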