diff --git a/docs/nerf_dataset_tips.md b/docs/nerf_dataset_tips.md
index bf4581423fd24d9fbd23677d22c099ab81bc678e..2bfc2d52d5ff0f82d474efda3e2c9beec35fb816 100644
--- a/docs/nerf_dataset_tips.md
+++ b/docs/nerf_dataset_tips.md
@@ -50,7 +50,11 @@ You can set any of the following parameters, where the listed values are the def
 See [nerf_loader.cu](src/nerf_loader.cu) for implementation details and additional options.
 
 ## Preparing new NeRF datasets
+To train on self-captured data, one has to process the data into a format supported by Instant-NGP. We provide scripts for two complementary approaches:
+- [COLMAP](#COLMAP)
+- [Record3D](#Record3D) (based on ARKit)
+
+### COLMAP
 Make sure that you have installed [COLMAP](https://colmap.github.io/) and that it is available in your PATH. If you are using a video file as input, also be sure to install [FFmpeg](https://www.ffmpeg.org/) and make sure that it is available in your PATH.
 To check that this is the case, from a terminal window, you should be able to run `colmap` and `ffmpeg -?` and see some help text from each.
@@ -81,10 +85,26 @@ Assuming success, you can now train your NeRF model as follows, starting in the
 instant-ngp$ ./build/testbed --mode nerf --scene [path to training data folder containing transforms.json]
 ```
 
-### Tips for NeRF training data
+### Record3D
+With an iPhone 12 Pro or newer, one can use [Record3D](https://record3d.app/) to collect data and avoid COLMAP. Record3D is an iOS app that relies on ARKit to estimate each image's camera pose, and it is more robust than COLMAP for scenes that lack texture or contain repetitive patterns. To train Instant-NGP with Record3D data, follow these steps:
+
+1. Record a video and export it with the "Shareable/Internal format (.r3d)".
+2. Send the exported data to your computer.
+3. Replace the `.r3d` extension with `.zip` and unzip the file to get a directory `path/to/data`.
+4. Run the preprocessing script:
+    ```
+    python scripts/record3d2nerf.py --scene path/to/data
+    ```
+    If you captured the scene in landscape orientation, add `--rotate`.
+
+5. Launch Instant-NGP training:
+    ```
+    ./build/testbed --scene path/to/data
+    ```
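+
+To sanity-check the result, one can, for example, load the generated `transforms.json` and print a few of the fields that `scripts/record3d2nerf.py` writes (a minimal sketch; replace `path/to/data` with your capture directory):
+```
+import json
+
+# Load the transforms produced by record3d2nerf.py.
+with open("path/to/data/transforms.json") as f:
+    transforms = json.load(f)
+
+# Number of registered frames, image resolution, and horizontal field of view.
+print(len(transforms["frames"]), "frames")
+print("resolution:", transforms["w"], "x", transforms["h"])
+print("camera_angle_x (radians):", transforms["camera_angle_x"])
+```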
+
+## Tips for NeRF training data
 The NeRF model trains best with between 50-150 images which exhibit minimal scene movement, motion blur or other blurring artefacts. The quality of reconstruction is predicated on COLMAP being able to extract accurate camera parameters from the images. Review the earlier sections for information on how to verify this.
 
-The `colmap2nerf.py` script assumes that the training images are all pointing approximately at a shared point of interest, which it places at the origin. This point is found by taking a weighted average of the closest points of approach between the rays through the central pixel of all pairs of training images. In practice, this means that the script works best when the training images have been captured pointing inwards towards the object of interest, although they do not need to complete a full 360 view of it. Any background visible behind the object of interest will still be reconstructed if `aabb_scale` is set to a number larger than 1, as explained above.
-
+The `colmap2nerf.py` script assumes that the training images are all pointing approximately at a shared point of interest, which it places at the origin. This point is found by taking a weighted average of the closest points of approach between the rays through the central pixel of all pairs of training images. In practice, this means that the script works best when the training images have been captured pointing inwards towards the object of interest, although they do not need to complete a full 360 view of it. Any background visible behind the object of interest will still be reconstructed if `aabb_scale` is set to a number larger than 1, as explained above.
\ No newline at end of file
diff --git a/scripts/record3d2nerf.py b/scripts/record3d2nerf.py
new file mode 100644
index 0000000000000000000000000000000000000000..5e5f9316d21a0ab1a4fdbb0ea96dfe36a260138a
--- /dev/null
+++ b/scripts/record3d2nerf.py
@@ -0,0 +1,176 @@
+#!/usr/bin/env python3
+
+# Copyright (c) 2020-2022, NVIDIA CORPORATION. All rights reserved.
+#
+# NVIDIA CORPORATION and its licensors retain all intellectual property
+# and proprietary rights in and to this software, related documentation
+# and any modifications thereto. Any use, reproduction, disclosure or
+# distribution of this software and related documentation without an express
+# license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+import argparse
+import os
+from pathlib import Path
+
+import numpy as np
+import json
+import copy
+from pyquaternion import Quaternion
+from tqdm import tqdm
+from PIL import Image
+
+def rotate_img(img_path, degree=90):
+    img = Image.open(img_path)
+    img = img.rotate(degree, expand=1)
+    img.save(img_path, quality=100, subsampling=0)
+
+def rotate_camera(c2w, degree=90):
+    rad = np.deg2rad(degree)
+    R = Quaternion(axis=[0, 0, -1], angle=rad)
+    T = R.transformation_matrix
+    return c2w @ T
+
+def swap_axes(c2w):
+    rad = np.pi / 2
+    R = Quaternion(axis=[1, 0, 0], angle=rad)
+    T = R.transformation_matrix
+    return T @ c2w
+
+# Automatically rescale & offset the poses.
+def find_transforms_center_and_scale(raw_transforms):
+    print("computing center of attention...")
+    frames = raw_transforms['frames']
+    for frame in frames:
+        frame['transform_matrix'] = np.array(frame['transform_matrix'])
+
+    rays_o = []
+    rays_d = []
+    for f in tqdm(frames):
+        mf = f["transform_matrix"][0:3,:]
+        rays_o.append(mf[:3,3:])
+        rays_d.append(mf[:3,2:3])
+    rays_o = np.asarray(rays_o)
+    rays_d = np.asarray(rays_d)
+
+    # Find the point that minimizes its distances to all rays.
+    def min_line_dist(rays_o, rays_d):
+        A_i = np.eye(3) - rays_d * np.transpose(rays_d, [0,2,1])
+        b_i = -A_i @ rays_o
+        pt_mindist = np.squeeze(-np.linalg.inv((np.transpose(A_i, [0,2,1]) @ A_i).mean(0)) @ (b_i).mean(0))
+        return pt_mindist
+
+    translation = min_line_dist(rays_o, rays_d)
+    normalized_transforms = copy.deepcopy(raw_transforms)
+    for f in normalized_transforms["frames"]:
+        f["transform_matrix"][0:3,3] -= translation
+
+    # Find the scale.
+    avglen = 0.
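+    # Mean distance of the re-centered camera origins from the world origin; the
+    # poses are rescaled below so that this mean becomes 4.0 ("nerf sized"), the
+    # same scale that scripts/colmap2nerf.py targets.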
+    for f in normalized_transforms["frames"]:
+        avglen += np.linalg.norm(f["transform_matrix"][0:3,3])
+    nframes = len(normalized_transforms["frames"])
+    avglen /= nframes
+    print("avg camera distance from origin", avglen)
+    scale = 4.0 / avglen # scale to "nerf sized"
+
+    return translation, scale
+
+def normalize_transforms(transforms, translation, scale):
+    normalized_transforms = copy.deepcopy(transforms)
+    for f in normalized_transforms["frames"]:
+        f["transform_matrix"] = np.asarray(f["transform_matrix"])
+        f["transform_matrix"][0:3,3] -= translation
+        f["transform_matrix"][0:3,3] *= scale
+        f["transform_matrix"] = f["transform_matrix"].tolist()
+    return normalized_transforms
+
+def parse_args():
+    parser = argparse.ArgumentParser(description="convert a Record3D capture to nerf format transforms.json")
+    parser.add_argument("--scene", default="", help="path to the Record3D capture")
+    parser.add_argument("--rotate", action="store_true", help="rotate the dataset")
+    parser.add_argument("--subsample", default=1, type=int, help="step size of subsampling")
+    args = parser.parse_args()
+    return args
+
+if __name__ == "__main__":
+    args = parse_args()
+    dataset_dir = Path(args.scene)
+    with open(dataset_dir / 'metadata') as f:
+        metadata = json.load(f)
+
+    frames = []
+    n_images = len(list((dataset_dir / 'rgbd').glob('*.jpg')))
+    poses = np.array(metadata['poses'])
+    for idx in tqdm(range(n_images)):
+        # Path of the corresponding RGB image.
+        img_name = f'{idx}.jpg'
+        img_path = dataset_dir / 'rgbd' / img_name
+
+        # Rotate the image.
+        if args.rotate:
+            # TODO: parallelize this step with joblib.
+            rotate_img(img_path)
+
+        # Extract c2w. Each `pose` is a 7-element tuple containing quaternion + world position:
+        # [qx, qy, qz, qw, tx, ty, tz]
+        pose = poses[idx]
+        q = Quaternion(x=pose[0], y=pose[1], z=pose[2], w=pose[3])
+        c2w = np.eye(4)
+        c2w[:3, :3] = q.rotation_matrix
+        c2w[:3, -1] = [pose[4], pose[5], pose[6]]
+        if args.rotate:
+            c2w = rotate_camera(c2w)
+        c2w = swap_axes(c2w)
+
+        frames.append(
+            {
+                "file_path": f"./rgbd/{img_name}",
+                "transform_matrix": c2w.tolist(),
+            }
+        )
+
+    # Read the camera intrinsics from the Record3D metadata.
+    if not args.rotate:
+        h = metadata['h']
+        w = metadata['w']
+        K = np.array(metadata['K']).reshape([3, 3]).T
+        fx = K[0, 0]
+        fy = K[1, 1]
+        cx = K[0, 2]
+        cy = K[1, 2]
+    else:
+        h = metadata['w']
+        w = metadata['h']
+        K = np.array(metadata['K']).reshape([3, 3]).T
+        fx = K[1, 1]
+        fy = K[0, 0]
+        cx = K[1, 2]
+        cy = h - K[0, 2]
+
+    transforms = {}
+    transforms['fl_x'] = fx
+    transforms['fl_y'] = fy
+    transforms['cx'] = cx
+    transforms['cy'] = cy
+    transforms['w'] = w
+    transforms['h'] = h
+    transforms['aabb_scale'] = 16
+    transforms['scale'] = 1.0
+    transforms['camera_angle_x'] = 2 * np.arctan(transforms['w'] / (2 * transforms['fl_x']))
+    transforms['camera_angle_y'] = 2 * np.arctan(transforms['h'] / (2 * transforms['fl_y']))
+    transforms['frames'] = frames
+
+    os.makedirs(dataset_dir / 'arkit_transforms', exist_ok=True)
+    with open(dataset_dir / 'arkit_transforms' / 'transforms.json', 'w') as fp:
+        json.dump(transforms, fp, indent=2)
+
+    # Normalize the poses.
+    transforms['frames'] = transforms['frames'][::args.subsample]
+    translation, scale = find_transforms_center_and_scale(transforms)
+    normalized_transforms = normalize_transforms(transforms, translation, scale)
+
+    output_path = dataset_dir / 'transforms.json'
+    with open(output_path, "w") as outfile:
+        json.dump(normalized_transforms, outfile, indent=2)
\ No newline at end of file