diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 35b134c469547036aa442d37575445102bda76a1..b131e66da9bdb5748cd2102f273facab90558da0 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -27,6 +27,9 @@ jobs:
       - os: ubuntu-18.04
         cuda: "10.2"
         arch: 61
+      - os: ubuntu-18.04
+        cuda: "10.2"
+        arch: 53
       - os: ubuntu-18.04
         cuda: "10.2"
         arch: 37
@@ -74,6 +77,10 @@ jobs:
         visual_studio: "Visual Studio 16 2019"
         cuda: "11.5.1"
         arch: 61
+      - os: windows-2019
+        visual_studio: "Visual Studio 16 2019"
+        cuda: "11.5.1"
+        arch: 53
       - os: windows-2019
         visual_studio: "Visual Studio 16 2019"
         cuda: "11.5.1"
diff --git a/dependencies/tiny-cuda-nn b/dependencies/tiny-cuda-nn
index 8575542682cb67cddfc748cc3d3cfc12593799aa..ece9cdd88601a3e754ef82de11dd0114008d0fbc 160000
--- a/dependencies/tiny-cuda-nn
+++ b/dependencies/tiny-cuda-nn
@@ -1 +1 @@
-Subproject commit 8575542682cb67cddfc748cc3d3cfc12593799aa
+Subproject commit ece9cdd88601a3e754ef82de11dd0114008d0fbc
diff --git a/include/neural-graphics-primitives/envmap.cuh b/include/neural-graphics-primitives/envmap.cuh
index 273f66100c26047648337c9cb46b61717ae277f3..89820b01664b946d4fefc8249fdf818735c69e37 100644
--- a/include/neural-graphics-primitives/envmap.cuh
+++ b/include/neural-graphics-primitives/envmap.cuh
@@ -64,8 +64,8 @@ __device__ Eigen::Array4f read_envmap(const T* __restrict__ envmap_data, const E
 	return result;
 }
 
-template <typename T>
-__device__ void deposit_envmap_gradient(const tcnn::vector_t<T, 4>& value, T* __restrict__ envmap_gradient, const Eigen::Vector2i envmap_resolution, const Eigen::Vector3f& dir) {
+template <typename T, typename GRAD_T>
+__device__ void deposit_envmap_gradient(const tcnn::vector_t<T, 4>& value, GRAD_T* __restrict__ envmap_gradient, const Eigen::Vector2i envmap_resolution, const Eigen::Vector3f& dir) {
 	auto dir_cyl = dir_to_spherical_unorm({dir.z(), -dir.x(), dir.y()});
 	auto envmap_float = Eigen::Vector2f{dir_cyl.y() * (envmap_resolution.x()-1), dir_cyl.x() * (envmap_resolution.y()-1)};
@@ -83,8 +83,8 @@ __device__ void deposit_envmap_gradient(const tcnn::vector_t<T, 4>& value, T* __
 		Eigen::Array4f result;
 
-#if TCNN_MIN_GPU_ARCH >= 60 // atomicAdd(__half2) is only supported with compute capability 60 and above
-		if (std::is_same<T, __half>::value) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600 // atomicAdd(__half2) is only supported with compute capability 60 and above
+		if (std::is_same<GRAD_T, __half>::value) {
 			for (uint32_t c = 0; c < 4; c += 2) {
 				atomicAdd((__half2*)&envmap_gradient[(pos.x() + pos.y() * envmap_resolution.x()) * 4 + c], {value[c] * weight, value[c+1] * weight});
 			}
@@ -92,7 +92,7 @@ __device__ void deposit_envmap_gradient(const tcnn::vector_t<T, 4>& value, T* __
 #endif
 		{
 			for (uint32_t c = 0; c < 4; ++c) {
-				atomicAdd(&envmap_gradient[(pos.x() + pos.y() * envmap_resolution.x()) * 4 + c], value[c] * weight);
+				atomicAdd(&envmap_gradient[(pos.x() + pos.y() * envmap_resolution.x()) * 4 + c], (GRAD_T)(value[c] * weight));
 			}
 		}
 	};
diff --git a/include/neural-graphics-primitives/takikawa_encoding.cuh b/include/neural-graphics-primitives/takikawa_encoding.cuh
index 64ebd4ae1483c9b80db517c6fd92f4b4f6b782e3..5746ecaed851de7eaef46a36a9d8ae697746899e 100644
--- a/include/neural-graphics-primitives/takikawa_encoding.cuh
+++ b/include/neural-graphics-primitives/takikawa_encoding.cuh
@@ -183,7 +183,7 @@ __global__ void kernel_takikawa_backward_input(
 	dL_dx(i)[j] = result;
 }
 
-template <typename T, uint32_t N_FEATURES_PER_LEVEL>
+template <typename T, typename GRAD_T, uint32_t N_FEATURES_PER_LEVEL>
 __global__ void kernel_takikawa_backward(
 	const uint32_t num_elements,
 	const uint32_t n_levels,
@@ -191,7 +191,7 @@ __global__ void kernel_takikawa_backward(
 	const tcnn::InterpolationType interpolation_type,
 	const TriangleOctreeNode* octree_nodes,
 	const TriangleOctreeDualNode* octree_dual_nodes,
-	T* __restrict__ grid_gradient,
+	GRAD_T* __restrict__ params_gradient,
 	const tcnn::PitchedPtr<const float> data_in,
 	const tcnn::PitchedPtr<const T> dL_dy
 ) {
@@ -240,19 +240,24 @@ __global__ void kernel_takikawa_backward(
 
 				int param_idx = node.vertices[idx] * N_FEATURES_PER_LEVEL;
 
-#if TCNN_MIN_GPU_ARCH >= 60 // atomicAdd(__half2) is only supported with compute capability 60 and above
-				if (N_FEATURES_PER_LEVEL > 1 && std::is_same<T, __half>::value) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600 // atomicAdd(__half2) is only supported with compute capability 60 and above
+				if (N_FEATURES_PER_LEVEL > 1 && std::is_same<GRAD_T, __half>::value) {
 					#pragma unroll
 					for (uint32_t feature = 0; feature < N_FEATURES_PER_LEVEL; feature += 2) {
 						__half2 v = {(__half)((float)grad[feature] * weight), (__half)((float)grad[feature+1] * weight)};
-						atomicAdd((__half2*)&grid_gradient[param_idx + feature], v);
+						atomicAdd((__half2*)&params_gradient[param_idx + feature], v);
 					}
 				} else
 #endif
 				{
-					#pragma unroll
-					for (uint32_t f = 0; f < N_FEATURES_PER_LEVEL; ++f) {
-						atomicAdd(&grid_gradient[param_idx], (T)((float)grad[f] * weight));
+					if (std::is_same<GRAD_T, __half>::value) {
+						// Should never happen
+						//printf("Attempted to use atomicAdd(__half)\n")
+					} else {
+						#pragma unroll
+						for (uint32_t f = 0; f < N_FEATURES_PER_LEVEL; ++f) {
+							atomicAdd((float*)&params_gradient[param_idx], (float)grad[f] * weight);
+						}
 					}
 				}
 			}
@@ -263,6 +268,19 @@ __global__ void kernel_takikawa_backward(
 template <typename T, uint32_t N_FEATURES_PER_LEVEL=8>
 class TakikawaEncoding : public tcnn::Encoding<T> {
 public:
+#if TCNN_MIN_GPU_ARCH >= 60
+	// The GPUs that we tested this on do not have an efficient 1D fp16
+	// atomicAdd feature. Thus, we accumulate gradients at fp32 if we're
+	// forced to use 1D atomicAdds. As soon as 2D or higher is possible,
+	// we can make use of the efficient atomicAdd(half2) function.
+	using grad_t = std::conditional_t<N_FEATURES_PER_LEVEL == 1, float, T>;
+#else
+	// atomicAdd(__half2) is only supported with compute capability 60 and above.
+	// Since atomicAdd(__half) is relatively slow / doesn't exist for low compute
+	// capabilities, accumulate in fp32 instead.
+	using grad_t = float;
+#endif
+
 	TakikawaEncoding(uint32_t starting_level, bool sum_instead_of_concat, std::shared_ptr<TriangleOctree> octree, tcnn::InterpolationType interpolation_type)
 	: m_starting_level{starting_level}, m_sum_instead_of_concat{sum_instead_of_concat}, m_octree{octree}, m_interpolation_type{interpolation_type} {
@@ -315,21 +333,36 @@ public:
 		}
 
 		{
+			// We accumulate gradients with grad_t precision, which, for performance reasons, is not always T.
+			// If not, accumulate in a temporary buffer and cast later.
+			grad_t* params_gradient;
+			if (!std::is_same<grad_t, T>::value) {
+				params_gradient = (grad_t*)m_params_gradient_tmp.data();
+			} else {
+				params_gradient = (grad_t*)m_params_gradient;
+			}
+
 			if (!accumulate_param_gradients) {
-				CUDA_CHECK_THROW(cudaMemsetAsync(m_params_gradient, 0, n_params() * sizeof(T), stream));
+				CUDA_CHECK_THROW(cudaMemsetAsync(params_gradient, 0, n_params() * sizeof(grad_t), stream));
 			}
 
-			tcnn::linear_kernel(kernel_takikawa_backward<T, N_FEATURES_PER_LEVEL>, 0, stream,
+			tcnn::linear_kernel(kernel_takikawa_backward<T, grad_t, N_FEATURES_PER_LEVEL>, 0, stream,
 				num_elements,
 				n_levels(),
 				m_starting_level,
 				m_interpolation_type,
 				m_octree->nodes_gpu(),
 				m_octree->dual_nodes_gpu(),
-				m_params_gradient,
+				params_gradient,
 				inputs,
 				dL_dy
 			);
+
+			if (!std::is_same<grad_t, T>::value) {
+				parallel_for_gpu(stream, n_params(), [grad=m_params_gradient, grad_tmp=params_gradient] __device__ (size_t i) {
+					grad[i] = (T)grad_tmp[i];
+				});
+			}
 		}
 
 		// Gradient computation w.r.t. input
@@ -380,6 +413,11 @@ public:
 		// Initialize the encoding from the GPU, because the number of parameters can be quite large.
 		tcnn::generate_random_uniform<float>(rnd, n_params(), params_full_precision, -1e-4f, 1e-4f);
+
+		// Only needs temporary storage if gradients are computed with different precision from T.
+		if (!std::is_same<grad_t, T>::value) {
+			m_params_gradient_tmp.resize(n_params());
+		}
 	}
 
 	size_t n_params() const override {
@@ -410,6 +448,8 @@ private:
 	// Storage of params
 	T* m_params;
 	T* m_params_inference;
+
+	tcnn::GPUMemory<grad_t> m_params_gradient_tmp;
 	T* m_params_gradient;
 
 	std::shared_ptr<TriangleOctree> m_octree;
diff --git a/include/neural-graphics-primitives/testbed.h b/include/neural-graphics-primitives/testbed.h
index 5d224767101333a65fe033f83f9e6e85b0651326..a2294d10ef39fef69a03e4f1e2ae227c1f6d403c 100644
--- a/include/neural-graphics-primitives/testbed.h
+++ b/include/neural-graphics-primitives/testbed.h
@@ -85,7 +85,7 @@ public:
 		float floor_y,
 		float plane_z,
 		float dof,
-		const precision_t* envmap_data,
+		const float* envmap_data,
 		const Eigen::Vector2i& envmap_resolution,
 		Eigen::Array4f* frame_buffer,
 		const TriangleOctree* octree, cudaStream_t stream);
@@ -125,7 +125,7 @@ public:
 		float plane_z,
 		float dof,
 		const CameraDistortion& camera_distortion,
-		const precision_t* envmap_data,
+		const float* envmap_data,
 		const Eigen::Vector2i& envmap_resolution,
 		const float* distortion_data,
 		const Eigen::Vector2i& distortion_resolution,
@@ -718,14 +718,16 @@ public:
 	std::shared_ptr<tcnn::Encoding<precision_t>> m_encoding;
 	std::shared_ptr<tcnn::Network<float, precision_t>> m_network;
 	std::shared_ptr<tcnn::Trainer<float, precision_t, precision_t>> m_trainer;
+
 	struct TrainableEnvmap {
-		std::shared_ptr<tcnn::Optimizer<precision_t>> optimizer;
-		std::shared_ptr<TrainableBuffer<4, 2, precision_t>> envmap;
-		std::shared_ptr<tcnn::Trainer<float, precision_t, precision_t>> trainer;
+		std::shared_ptr<tcnn::Optimizer<float>> optimizer;
+		std::shared_ptr<TrainableBuffer<4, 2, float>> envmap;
+		std::shared_ptr<tcnn::Trainer<float, float, float>> trainer;
 
 		Eigen::Vector2i resolution;
 		ELossType loss_type;
 	} m_envmap;
+
 	struct TrainableDistortionMap {
 		std::shared_ptr<tcnn::Optimizer<float>> optimizer;
 		std::shared_ptr<TrainableBuffer<2, 2, float>> map;
diff --git a/src/testbed.cu b/src/testbed.cu
index 4a20e165de806b140d3864b78ec8dd349badfbbc..ab2bb281de64adc5ad2d7951d5e95f03d0cd1879 100644
--- a/src/testbed.cu
+++ b/src/testbed.cu
@@ -1693,9 +1693,9 @@ void Testbed::reset_network() {
 		m_envmap.loss_type = string_to_loss_type(envmap_loss_config.value("otype", "L2"));
 		m_envmap.resolution = m_nerf.training.dataset.envmap_resolution;
-		m_envmap.envmap = std::make_shared<TrainableBuffer<4, 2, precision_t>>(m_envmap.resolution);
-		m_envmap.optimizer.reset(create_optimizer<precision_t>(envmap_optimizer_config));
-		m_envmap.trainer = std::make_shared<Trainer<float, precision_t, precision_t>>(m_envmap.envmap, m_envmap.optimizer, m_loss, m_seed);
+		m_envmap.envmap = std::make_shared<TrainableBuffer<4, 2, float>>(m_envmap.resolution);
+		m_envmap.optimizer.reset(create_optimizer<float>(envmap_optimizer_config));
+		m_envmap.trainer = std::make_shared<Trainer<float, float, float>>(m_envmap.envmap, m_envmap.optimizer, std::shared_ptr<Loss<float>>{create_loss<float>(envmap_loss_config)}, m_seed);
 
 		if (m_nerf.training.dataset.envmap_data.data()) {
 			m_envmap.trainer->set_params_full_precision(m_nerf.training.dataset.envmap_data.data(), m_nerf.training.dataset.envmap_data.size());
diff --git a/src/testbed_nerf.cu b/src/testbed_nerf.cu
index ea41f988746cbd70da1ebf9aacfa6e7a01ff54b7..fd395a442d4734334ba4c45c542b2544e3f35e9c 100644
--- a/src/testbed_nerf.cu
+++ b/src/testbed_nerf.cu
@@ -1118,8 +1118,8 @@ __global__ void compute_loss_kernel_train_nerf(
 	const uint32_t* __restrict__ rays_counter,
 	float loss_scale,
 	int padded_output_width,
-	const tcnn::network_precision_t* __restrict__ envmap_data,
-	tcnn::network_precision_t* __restrict__ envmap_gradient,
+	const float* __restrict__ envmap_data,
+	float* __restrict__ envmap_gradient,
 	const Vector2i envmap_resolution,
 	ELossType envmap_loss_type,
 	Array3f background_color,
@@ -1569,7 +1569,7 @@ __global__ void init_rays_with_payload_kernel_nerf(
 	float plane_z,
 	float dof,
 	CameraDistortion camera_distortion,
-	const network_precision_t* __restrict__ envmap_data,
+	const float* __restrict__ envmap_data,
 	const Vector2i envmap_resolution,
 	Array4f* __restrict__ framebuffer,
 	const float* __restrict__ distortion_data,
@@ -1729,7 +1729,7 @@ void Testbed::NerfTracer::init_rays_from_camera(uint32_t spp,
 	float plane_z,
 	float dof,
 	const CameraDistortion& camera_distortion,
-	const network_precision_t* envmap_data,
+	const float* envmap_data,
 	const Vector2i& envmap_resolution,
 	const float* distortion_data,
 	const Vector2i& distortion_resolution,
@@ -2247,9 +2247,9 @@ void Testbed::train_nerf(uint32_t target_batch_size, uint32_t n_training_steps,
 		CUDA_CHECK_THROW(cudaMemsetAsync(m_nerf.training.error_map.data.data(), 0, m_nerf.training.error_map.data.get_bytes(), stream));
 	}
 
-	network_precision_t* envmap_gradient = m_nerf.training.train_envmap ? m_envmap.envmap->gradients() : nullptr;
+	float* envmap_gradient = m_nerf.training.train_envmap ? m_envmap.envmap->gradients() : nullptr;
 	if (envmap_gradient) {
-		CUDA_CHECK_THROW(cudaMemsetAsync(envmap_gradient, 0, sizeof(network_precision_t)*m_envmap.envmap->n_params(), stream));
+		CUDA_CHECK_THROW(cudaMemsetAsync(envmap_gradient, 0, sizeof(float)*m_envmap.envmap->n_params(), stream));
 	}
 
 	for (uint32_t i = 0; i < n_training_steps; ++i) {
@@ -2468,7 +2468,7 @@ void Testbed::train_nerf_step(uint32_t target_batch_size, uint32_t n_rays_per_ba
 	m_nerf.training.n_rays_since_error_map_update += n_rays_per_batch;
 
 	// If we have an envmap, prepare its gradient buffer
-	network_precision_t* envmap_gradient = m_nerf.training.train_envmap ? m_envmap.envmap->gradients() : nullptr;
+	float* envmap_gradient = m_nerf.training.train_envmap ? m_envmap.envmap->gradients() : nullptr;
 
 	bool sample_focal_plane_proportional_to_error = m_nerf.training.error_map.is_cdf_valid && m_nerf.training.sample_focal_plane_proportional_to_error;
 	bool sample_image_proportional_to_error = m_nerf.training.error_map.is_cdf_valid && m_nerf.training.sample_image_proportional_to_error;
diff --git a/src/testbed_sdf.cu b/src/testbed_sdf.cu
index 2a8d18ba5d08223a846791fcb4b24376f0748186..8fe55b13549b2315e20bf4d6f733c686a7abaaf0 100644
--- a/src/testbed_sdf.cu
+++ b/src/testbed_sdf.cu
@@ -502,7 +502,7 @@ __global__ void init_rays_with_payload_kernel_sdf(
 	float floor_y,
 	float plane_z,
 	float dof,
-	const network_precision_t* __restrict__ envmap_data,
+	const float* __restrict__ envmap_data,
 	const Vector2i envmap_resolution,
 	Array4f* __restrict__ framebuffer,
 	const TriangleOctreeNode* __restrict__ octree_nodes = nullptr,
@@ -594,7 +594,7 @@ void Testbed::SphereTracer::init_rays_from_camera(uint32_t spp,
 	float floor_y,
 	float plane_z,
 	float dof,
-	const network_precision_t* envmap_data,
+	const float* envmap_data,
 	const Vector2i& envmap_resolution,
 	Array4f* frame_buffer,
 	const TriangleOctree* octree, cudaStream_t stream
diff --git a/src/testbed_volume.cu b/src/testbed_volume.cu
index f39e7c369b16e2e20df122ee921d22afaba50938..36e3a84de25635fa5078bf6c2df6837a3bc6d6aa 100644
--- a/src/testbed_volume.cu
+++ b/src/testbed_volume.cu
@@ -224,7 +224,7 @@ __global__ void init_rays_volume(
 	BoundingBox aabb,
 	float plane_z,
 	float dof,
-	const network_precision_t* __restrict__ envmap_data,
+	const float* __restrict__ envmap_data,
 	const Vector2i envmap_resolution,
 	Array4f* __restrict__ framebuffer,
 	default_rng_t rng,
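For readers following the precision changes above, here is a minimal, self-contained sketch of the accumulation pattern the patch applies in `kernel_takikawa_backward` and `deposit_envmap_gradient`: choose a separate gradient-accumulation type (`grad_t`), use `atomicAdd(__half2*)` where the architecture supports it, and otherwise accumulate at fp32 and cast back to the parameter type afterwards. The names (`toy_backward`, `cast_gradients`), the toy `param_idx` indexing, the `main` driver, the build flags, and the use of C++17 `if constexpr` are illustrative assumptions, not code from this repository; the patch itself uses runtime `std::is_same` checks with pointer casts instead.

```cpp
// Standalone sketch (assumption, not part of the patch).
// Build, e.g.: nvcc -std=c++17 -arch=sm_61 precision_sketch.cu
#include <cuda_fp16.h>
#include <cstdint>
#include <type_traits>

// Same selection rule as TakikawaEncoding::grad_t above: pairwise fp16 atomics
// are fast, single fp16 atomics are not, so accumulate at fp32 when only
// 1D adds are possible.
template <typename T, uint32_t N_FEATURES>
using grad_t = std::conditional_t<N_FEATURES == 1, float, T>;

template <typename T, typename GRAD_T, uint32_t N_FEATURES>
__global__ void toy_backward(uint32_t n, const T* __restrict__ dL_dy, GRAD_T* __restrict__ params_gradient) {
	const uint32_t i = blockIdx.x * blockDim.x + threadIdx.x;
	if (i >= n) return;

	// Toy indexing: the real kernel derives param_idx from the octree vertices.
	const uint32_t param_idx = (i % 16) * N_FEATURES;

#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600
	if constexpr (N_FEATURES > 1 && std::is_same<GRAD_T, __half>::value) {
		// Fast path: add two fp16 gradients per atomic (N_FEATURES assumed even).
		#pragma unroll
		for (uint32_t f = 0; f < N_FEATURES; f += 2) {
			__half2 v = {(__half)(float)dL_dy[i * N_FEATURES + f], (__half)(float)dL_dy[i * N_FEATURES + f + 1]};
			atomicAdd((__half2*)&params_gradient[param_idx + f], v);
		}
	} else
#endif
	if constexpr (std::is_same<GRAD_T, float>::value) {
		// Fallback: scalar fp32 atomics. Because grad_t is float whenever only
		// 1D adds are possible, a scalar fp16 atomicAdd is never needed here.
		#pragma unroll
		for (uint32_t f = 0; f < N_FEATURES; ++f) {
			atomicAdd(&params_gradient[param_idx + f], (float)dL_dy[i * N_FEATURES + f]);
		}
	}
}

// Cast the fp32 accumulator back to the parameter type, analogous to the
// parallel_for_gpu pass that follows kernel_takikawa_backward in the patch.
template <typename T>
__global__ void cast_gradients(uint32_t n, const float* __restrict__ grad_tmp, T* __restrict__ grad) {
	const uint32_t i = blockIdx.x * blockDim.x + threadIdx.x;
	if (i < n) grad[i] = (T)grad_tmp[i];
}

int main() {
	constexpr uint32_t N_FEATURES = 8;
	using T = __half;
	using G = grad_t<T, N_FEATURES>; // __half here; would be float if N_FEATURES were 1

	const uint32_t n_rays = 1024, n_params = 16 * N_FEATURES;
	T* dL_dy; G* grads;
	cudaMalloc(&dL_dy, n_rays * N_FEATURES * sizeof(T));
	cudaMalloc(&grads, n_params * sizeof(G));
	cudaMemset(dL_dy, 0, n_rays * N_FEATURES * sizeof(T));
	cudaMemset(grads, 0, n_params * sizeof(G));

	toy_backward<T, G, N_FEATURES><<<(n_rays + 127) / 128, 128>>>(n_rays, dL_dy, grads);
	cudaDeviceSynchronize();

	cudaFree(dL_dy);
	cudaFree(grads);
	return 0;
}
```

The point mirrored from the patch: the accumulation type is fixed once per encoding (`grad_t`), so the kernels never have to fall back to a slow or missing scalar `__half` atomicAdd, and the extra fp32 buffer plus cast-back pass only exists when `grad_t` differs from `T`.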