diff --git a/include/neural-graphics-primitives/sdf.h b/include/neural-graphics-primitives/sdf.h index ddef63cab37c9824bfeaa5cb5bdf0a58e8e4c6e0..07a99bbd90e17dbac43d9875a9e6dc54c96ff489 100644 --- a/include/neural-graphics-primitives/sdf.h +++ b/include/neural-graphics-primitives/sdf.h @@ -29,34 +29,34 @@ struct SdfPayload { struct RaysSdfSoa { #if defined(__NVCC__) || (defined(__clang__) && defined(__CUDA__)) - void enlarge(size_t n_elements) { - pos.enlarge(n_elements); - normal.enlarge(n_elements); - distance.enlarge(n_elements); - prev_distance.enlarge(n_elements); - total_distance.enlarge(n_elements); - min_visibility.enlarge(n_elements); - payload.enlarge(n_elements); - } - void copy_from_other_async(uint32_t n_elements, const RaysSdfSoa& other, cudaStream_t stream) { - CUDA_CHECK_THROW(cudaMemcpyAsync(pos.data(), other.pos.data(), n_elements * sizeof(Eigen::Vector3f), cudaMemcpyDeviceToDevice, stream)); - CUDA_CHECK_THROW(cudaMemcpyAsync(normal.data(), other.normal.data(), n_elements * sizeof(Eigen::Vector3f), cudaMemcpyDeviceToDevice, stream)); - CUDA_CHECK_THROW(cudaMemcpyAsync(distance.data(), other.distance.data(), n_elements * sizeof(float), cudaMemcpyDeviceToDevice, stream)); - CUDA_CHECK_THROW(cudaMemcpyAsync(prev_distance.data(), other.prev_distance.data(), n_elements * sizeof(float), cudaMemcpyDeviceToDevice, stream)); - CUDA_CHECK_THROW(cudaMemcpyAsync(total_distance.data(), other.total_distance.data(), n_elements * sizeof(float), cudaMemcpyDeviceToDevice, stream)); - CUDA_CHECK_THROW(cudaMemcpyAsync(min_visibility.data(), other.min_visibility.data(), n_elements * sizeof(float), cudaMemcpyDeviceToDevice, stream)); - CUDA_CHECK_THROW(cudaMemcpyAsync(payload.data(), other.payload.data(), n_elements * sizeof(SdfPayload), cudaMemcpyDeviceToDevice, stream)); + CUDA_CHECK_THROW(cudaMemcpyAsync(pos, other.pos, n_elements * sizeof(Eigen::Vector3f), cudaMemcpyDeviceToDevice, stream)); + CUDA_CHECK_THROW(cudaMemcpyAsync(normal, other.normal, n_elements * sizeof(Eigen::Vector3f), cudaMemcpyDeviceToDevice, stream)); + CUDA_CHECK_THROW(cudaMemcpyAsync(distance, other.distance, n_elements * sizeof(float), cudaMemcpyDeviceToDevice, stream)); + CUDA_CHECK_THROW(cudaMemcpyAsync(prev_distance, other.prev_distance, n_elements * sizeof(float), cudaMemcpyDeviceToDevice, stream)); + CUDA_CHECK_THROW(cudaMemcpyAsync(total_distance, other.total_distance, n_elements * sizeof(float), cudaMemcpyDeviceToDevice, stream)); + CUDA_CHECK_THROW(cudaMemcpyAsync(min_visibility, other.min_visibility, n_elements * sizeof(float), cudaMemcpyDeviceToDevice, stream)); + CUDA_CHECK_THROW(cudaMemcpyAsync(payload, other.payload, n_elements * sizeof(SdfPayload), cudaMemcpyDeviceToDevice, stream)); } #endif - tcnn::GPUMemory<Eigen::Vector3f> pos; - tcnn::GPUMemory<Eigen::Vector3f> normal; - tcnn::GPUMemory<float> distance; - tcnn::GPUMemory<float> prev_distance; - tcnn::GPUMemory<float> total_distance; - tcnn::GPUMemory<float> min_visibility; - tcnn::GPUMemory<SdfPayload> payload; + void set(Eigen::Vector3f* pos, Eigen::Vector3f* normal, float* distance, float* prev_distance, float* total_distance, float* min_visibility, SdfPayload* payload) { + this->pos = pos; + this->normal = normal; + this->distance = distance; + this->prev_distance = prev_distance; + this->total_distance = total_distance; + this->min_visibility = min_visibility; + this->payload = payload; + } + + Eigen::Vector3f* pos; + Eigen::Vector3f* normal; + float* distance; + float* prev_distance; + float* total_distance; + float* min_visibility; + SdfPayload* payload; }; struct BRDFParams { diff --git a/include/neural-graphics-primitives/testbed.h b/include/neural-graphics-primitives/testbed.h index 9e40b6a3a16e805f65e80bdc15419a8b4ae7ffae..8e55eeb08c90198e5b45320549e89a0b789dbe82 100644 --- a/include/neural-graphics-primitives/testbed.h +++ b/include/neural-graphics-primitives/testbed.h @@ -69,12 +69,12 @@ public: void load_training_data(const std::string& data_path); void clear_training_data(); - using distance_fun_t = std::function<void(uint32_t, const tcnn::GPUMemory<Eigen::Vector3f>&, tcnn::GPUMemory<float>&, cudaStream_t)>; - using normals_fun_t = std::function<void(uint32_t, const tcnn::GPUMemory<Eigen::Vector3f>&, tcnn::GPUMemory<Eigen::Vector3f>&, cudaStream_t)>; + using distance_fun_t = std::function<void(uint32_t, const Eigen::Vector3f*, float*, cudaStream_t)>; + using normals_fun_t = std::function<void(uint32_t, const Eigen::Vector3f*, Eigen::Vector3f*, cudaStream_t)>; class SphereTracer { public: - SphereTracer() : m_hit_counter(1), m_alive_counter(1) {} + SphereTracer() {} void init_rays_from_camera( uint32_t spp, @@ -111,7 +111,7 @@ public: uint32_t n_octree_levels, cudaStream_t stream ); - void enlarge(size_t n_elements); + void enlarge(size_t n_elements, cudaStream_t stream); RaysSdfSoa& rays_hit() { return m_rays_hit; } RaysSdfSoa& rays_init() { return m_rays[0]; } uint32_t n_rays_initialized() const { return m_n_rays_initialized; } @@ -120,16 +120,19 @@ public: private: RaysSdfSoa m_rays[2]; RaysSdfSoa m_rays_hit; - tcnn::GPUMemory<uint32_t> m_hit_counter; - tcnn::GPUMemory<uint32_t> m_alive_counter; + uint32_t* m_hit_counter; + uint32_t* m_alive_counter; + uint32_t m_n_rays_initialized = 0; float m_shadow_sharpness = 2048.f; bool m_trace_shadow_rays = false; + + tcnn::GPUMemoryArena::Allocation m_scratch_alloc; }; class NerfTracer { public: - NerfTracer() : m_hit_counter(1), m_alive_counter(1) {} + NerfTracer() {} void init_rays_from_camera( uint32_t spp, @@ -193,38 +196,36 @@ public: RaysNerfSoa& rays_init() { return m_rays[0]; } uint32_t n_rays_initialized() const { return m_n_rays_initialized; } - void clear() { - m_scratch_alloc = {}; - } - private: RaysNerfSoa m_rays[2]; RaysNerfSoa m_rays_hit; precision_t* m_network_output; float* m_network_input; - tcnn::GPUMemory<uint32_t> m_hit_counter; - tcnn::GPUMemory<uint32_t> m_alive_counter; + uint32_t* m_hit_counter; + uint32_t* m_alive_counter; uint32_t m_n_rays_initialized = 0; tcnn::GPUMemoryArena::Allocation m_scratch_alloc; }; class FiniteDifferenceNormalsApproximator { public: - void enlarge(uint32_t n_elements); - void normal(uint32_t n_elements, const distance_fun_t& distance_function, tcnn::GPUMemory<Eigen::Vector3f>& pos, tcnn::GPUMemory<Eigen::Vector3f>& normal, float epsilon, cudaStream_t stream); + void enlarge(uint32_t n_elements, cudaStream_t stream); + void normal(uint32_t n_elements, const distance_fun_t& distance_function, const Eigen::Vector3f* pos, Eigen::Vector3f* normal, float epsilon, cudaStream_t stream); private: - tcnn::GPUMemory<Eigen::Vector3f> dx; - tcnn::GPUMemory<Eigen::Vector3f> dy; - tcnn::GPUMemory<Eigen::Vector3f> dz; + Eigen::Vector3f* dx; + Eigen::Vector3f* dy; + Eigen::Vector3f* dz; - tcnn::GPUMemory<float> dist_dx_pos; - tcnn::GPUMemory<float> dist_dy_pos; - tcnn::GPUMemory<float> dist_dz_pos; + float* dist_dx_pos; + float* dist_dy_pos; + float* dist_dz_pos; - tcnn::GPUMemory<float> dist_dx_neg; - tcnn::GPUMemory<float> dist_dy_neg; - tcnn::GPUMemory<float> dist_dz_neg; + float* dist_dx_neg; + float* dist_dy_neg; + float* dist_dz_neg; + + tcnn::GPUMemoryArena::Allocation m_scratch_alloc; }; struct LevelStats { @@ -532,8 +533,6 @@ public: bool m_gui_redraw = true; struct Nerf { - NerfTracer tracer; - struct Training { NerfDataset dataset; int n_images_for_training = 0; // how many images to train from, as a high watermark compared to the dataset size @@ -640,9 +639,6 @@ public: uint32_t max_cascade = 0; - tcnn::GPUMemory<float> vis_input; - tcnn::GPUMemory<Eigen::Array4f> vis_rgba; - ENerfActivation rgb_activation = ENerfActivation::Exponential; ENerfActivation density_activation = ENerfActivation::Exponential; @@ -666,8 +662,6 @@ public: } m_nerf; struct Sdf { - SphereTracer tracer; - SphereTracer shadow_tracer; float shadow_sharpness = 2048.0f; float maximum_distance = 0.00005f; float fd_normals_epsilon = 0.0005f; @@ -676,8 +670,6 @@ public: BRDFParams brdf; - FiniteDifferenceNormalsApproximator fd_normals; - // Mesh data EMeshSdfMode mesh_sdf_mode = EMeshSdfMode::Raystab; float mesh_scale; @@ -764,7 +756,7 @@ public: tcnn::GPUMemory<char> nanovdb_grid; tcnn::GPUMemory<uint8_t> bitgrid; float global_majorant = 1.f; - Eigen::Vector3f world2index_offset = {0,0,0}; + Eigen::Vector3f world2index_offset = {0, 0, 0}; float world2index_scale = 1.f; struct Training { diff --git a/src/testbed.cu b/src/testbed.cu index 57ea85330703b0b5bcbc19e300b0037232f7e524..6b4388169f206d4b577dfb9b616c70283dca8c74 100644 --- a/src/testbed.cu +++ b/src/testbed.cu @@ -2710,7 +2710,7 @@ void Testbed::render_frame(const Matrix<float, 3, 4>& camera_matrix0, const Matr } } distance_fun_t distance_fun = - m_render_ground_truth ? (distance_fun_t)[&](uint32_t n_elements, const GPUMemory<Vector3f>& positions, GPUMemory<float>& distances, cudaStream_t stream) { + m_render_ground_truth ? (distance_fun_t)[&](uint32_t n_elements, const Vector3f* positions, float* distances, cudaStream_t stream) { if (n_elements == 0) { return; } @@ -2730,35 +2730,35 @@ void Testbed::render_frame(const Matrix<float, 3, 4>& camera_matrix0, const Matr m_sdf.triangle_bvh->signed_distance_gpu( n_elements, m_sdf.mesh_sdf_mode, - (Vector3f*)positions.data(), - distances.data(), + (Vector3f*)positions, + distances, m_sdf.triangles_gpu.data(), false, m_stream.get() ); } - } : (distance_fun_t)[&](uint32_t n_elements, const GPUMemory<Vector3f>& positions, GPUMemory<float>& distances, cudaStream_t stream) { + } : (distance_fun_t)[&](uint32_t n_elements, const Vector3f* positions, float* distances, cudaStream_t stream) { if (n_elements == 0) { return; } n_elements = next_multiple(n_elements, tcnn::batch_size_granularity); - GPUMatrix<float> positions_matrix((float*)positions.data(), 3, n_elements); - GPUMatrix<float, RM> distances_matrix(distances.data(), 1, n_elements); + GPUMatrix<float> positions_matrix((float*)positions, 3, n_elements); + GPUMatrix<float, RM> distances_matrix(distances, 1, n_elements); m_network->inference(stream, positions_matrix, distances_matrix); }; normals_fun_t normals_fun = - m_render_ground_truth ? (normals_fun_t)[&](uint32_t n_elements, const GPUMemory<Vector3f>& positions, GPUMemory<Vector3f>& normals, cudaStream_t stream) { + m_render_ground_truth ? (normals_fun_t)[&](uint32_t n_elements, const Vector3f* positions, Vector3f* normals, cudaStream_t stream) { // NO-OP. Normals will automatically be populated by raytrace - } : (normals_fun_t)[&](uint32_t n_elements, const GPUMemory<Vector3f>& positions, GPUMemory<Vector3f>& normals, cudaStream_t stream) { + } : (normals_fun_t)[&](uint32_t n_elements, const Vector3f* positions, Vector3f* normals, cudaStream_t stream) { if (n_elements == 0) { return; } n_elements = next_multiple(n_elements, tcnn::batch_size_granularity); - GPUMatrix<float> positions_matrix((float*)positions.data(), 3, n_elements); - GPUMatrix<float> normals_matrix((float*)normals.data(), 3, n_elements); + GPUMatrix<float> positions_matrix((float*)positions, 3, n_elements); + GPUMatrix<float> normals_matrix((float*)normals, 3, n_elements); m_network->input_gradient(stream, 0, positions_matrix, normals_matrix); }; diff --git a/src/testbed_nerf.cu b/src/testbed_nerf.cu index 8390e34d7c3401026aac7eaa884308390944ac6d..2a1d50d045028b92b609bdcb7d75588ec3139dfb 100644 --- a/src/testbed_nerf.cu +++ b/src/testbed_nerf.cu @@ -2081,7 +2081,7 @@ uint32_t Testbed::NerfTracer::trace( return 0; } - CUDA_CHECK_THROW(cudaMemsetAsync(m_hit_counter.data(), 0, sizeof(uint32_t), stream)); + CUDA_CHECK_THROW(cudaMemsetAsync(m_hit_counter, 0, sizeof(uint32_t), stream)); uint32_t n_alive = m_n_rays_initialized; // m_n_rays_initialized = 0; @@ -2095,15 +2095,15 @@ uint32_t Testbed::NerfTracer::trace( // Compact rays that did not diverge yet { - CUDA_CHECK_THROW(cudaMemsetAsync(m_alive_counter.data(), 0, sizeof(uint32_t), stream)); + CUDA_CHECK_THROW(cudaMemsetAsync(m_alive_counter, 0, sizeof(uint32_t), stream)); linear_kernel(compact_kernel_nerf, 0, stream, n_alive, rays_tmp.rgba, rays_tmp.depth, rays_tmp.payload, rays_current.rgba, rays_current.depth, rays_current.payload, m_rays_hit.rgba, m_rays_hit.depth, m_rays_hit.payload, - m_alive_counter.data(), m_hit_counter.data() + m_alive_counter, m_hit_counter ); - CUDA_CHECK_THROW(cudaMemcpyAsync(&n_alive, m_alive_counter.data(), sizeof(uint32_t), cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK_THROW(cudaMemcpyAsync(&n_alive, m_alive_counter, sizeof(uint32_t), cudaMemcpyDeviceToHost, stream)); CUDA_CHECK_THROW(cudaStreamSynchronize(stream)); } @@ -2172,7 +2172,7 @@ uint32_t Testbed::NerfTracer::trace( } uint32_t n_hit; - CUDA_CHECK_THROW(cudaMemcpyAsync(&n_hit, m_hit_counter.data(), sizeof(uint32_t), cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK_THROW(cudaMemcpyAsync(&n_hit, m_hit_counter, sizeof(uint32_t), cudaMemcpyDeviceToHost, stream)); CUDA_CHECK_THROW(cudaStreamSynchronize(stream)); return n_hit; } @@ -2186,14 +2186,18 @@ void Testbed::NerfTracer::enlarge(size_t n_elements, uint32_t padded_output_widt Array4f, float, NerfPayload, // m_rays_hit network_precision_t, - float + float, + uint32_t, + uint32_t >( stream, &m_scratch_alloc, n_elements, n_elements, n_elements, n_elements, n_elements, n_elements, n_elements, n_elements, n_elements, n_elements * MAX_STEPS_INBETWEEN_COMPACTION * padded_output_width, - n_elements * MAX_STEPS_INBETWEEN_COMPACTION * num_floats + n_elements * MAX_STEPS_INBETWEEN_COMPACTION * num_floats, + 32, // 2 full cache lines to ensure no overlap + 32 // 2 full cache lines to ensure no overlap ); m_rays[0].set(std::get<0>(scratch), std::get<1>(scratch), std::get<2>(scratch), n_elements); @@ -2202,6 +2206,9 @@ void Testbed::NerfTracer::enlarge(size_t n_elements, uint32_t padded_output_widt m_network_output = std::get<9>(scratch); m_network_input = std::get<10>(scratch); + + m_hit_counter = std::get<11>(scratch); + m_alive_counter = std::get<12>(scratch); } void Testbed::Nerf::Training::reset_extra_dims(default_rng_t &rng) { @@ -2256,9 +2263,7 @@ void Testbed::render_nerf(CudaRenderBuffer& render_buffer, const Vector2i& max_r const float* extra_dims_gpu = get_inference_extra_dims(stream); - ScopeGuard tmp_memory_guard{[&]() { - m_nerf.tracer.clear(); - }}; + NerfTracer tracer; // Our motion vector code can't undo f-theta and grid distortions -- so don't render these if DLSS is enabled. bool render_opencv_lens = m_nerf.render_with_lens_distortion && (!render_buffer.dlss() || m_nerf.render_lens.mode == ELensMode::OpenCV); @@ -2267,7 +2272,7 @@ void Testbed::render_nerf(CudaRenderBuffer& render_buffer, const Vector2i& max_r Lens lens = render_opencv_lens ? m_nerf.render_lens : Lens{}; - m_nerf.tracer.init_rays_from_camera( + tracer.init_rays_from_camera( render_buffer.spp(), m_network->padded_output_width(), m_nerf_network->n_extra_dims(), @@ -2301,10 +2306,10 @@ void Testbed::render_nerf(CudaRenderBuffer& render_buffer, const Vector2i& max_r uint32_t n_hit; if (m_render_mode == ERenderMode::Slice) { - n_hit = m_nerf.tracer.n_rays_initialized(); + n_hit = tracer.n_rays_initialized(); } else { float depth_scale = 1.0f / m_nerf.training.dataset.scale; - n_hit = m_nerf.tracer.trace( + n_hit = tracer.trace( *m_nerf_network, m_render_aabb, m_render_aabb_to_local, @@ -2329,7 +2334,7 @@ void Testbed::render_nerf(CudaRenderBuffer& render_buffer, const Vector2i& max_r stream ); } - RaysNerfSoa& rays_hit = m_render_mode == ERenderMode::Slice ? m_nerf.tracer.rays_init() : m_nerf.tracer.rays_hit(); + RaysNerfSoa& rays_hit = m_render_mode == ERenderMode::Slice ? tracer.rays_init() : tracer.rays_hit(); if (m_render_mode == ERenderMode::Slice) { // Store colors in the normal buffer @@ -2337,23 +2342,21 @@ void Testbed::render_nerf(CudaRenderBuffer& render_buffer, const Vector2i& max_r const uint32_t floats_per_coord = sizeof(NerfCoordinate) / sizeof(float) + m_nerf_network->n_extra_dims(); const uint32_t extra_stride = m_nerf_network->n_extra_dims() * sizeof(float); // extra stride on top of base NerfCoordinate struct - m_nerf.vis_input.enlarge(n_elements * floats_per_coord); - m_nerf.vis_rgba.enlarge(n_elements); - linear_kernel(generate_nerf_network_inputs_at_current_position, 0, stream, n_hit, m_aabb, rays_hit.payload, PitchedPtr<NerfCoordinate>((NerfCoordinate*)m_nerf.vis_input.data(), 1, 0, extra_stride), extra_dims_gpu ); + GPUMatrix<float> positions_matrix{floats_per_coord, n_elements, stream}; + GPUMatrix<float> rgbsigma_matrix{4, n_elements, stream}; - GPUMatrix<float> positions_matrix((float*)m_nerf.vis_input.data(), floats_per_coord, n_elements); - GPUMatrix<float> rgbsigma_matrix((float*)m_nerf.vis_rgba.data(), 4, n_elements); + linear_kernel(generate_nerf_network_inputs_at_current_position, 0, stream, n_hit, m_aabb, rays_hit.payload, PitchedPtr<NerfCoordinate>((NerfCoordinate*)positions_matrix.data(), 1, 0, extra_stride), extra_dims_gpu ); if (m_visualized_dimension == -1) { m_network->inference(stream, positions_matrix, rgbsigma_matrix); - linear_kernel(compute_nerf_rgba, 0, stream, n_hit, m_nerf.vis_rgba.data(), m_nerf.rgb_activation, m_nerf.density_activation, 0.01f, false); + linear_kernel(compute_nerf_rgba, 0, stream, n_hit, (Array4f*)rgbsigma_matrix.data(), m_nerf.rgb_activation, m_nerf.density_activation, 0.01f, false); } else { m_network->visualize_activation(stream, m_visualized_layer, m_visualized_dimension, positions_matrix, rgbsigma_matrix); } linear_kernel(shade_kernel_nerf, 0, stream, n_hit, - m_nerf.vis_rgba.data(), + (Array4f*)rgbsigma_matrix.data(), nullptr, rays_hit.payload, m_render_mode, diff --git a/src/testbed_sdf.cu b/src/testbed_sdf.cu index 780dfb20200a034040582e08de01036d263b6153..bf3e5af148d0d8fb97ae502de21736fd0e177395 100644 --- a/src/testbed_sdf.cu +++ b/src/testbed_sdf.cu @@ -550,12 +550,13 @@ __global__ void init_rays_with_payload_kernel_sdf( if (octree_nodes && !TriangleOctree::contains(octree_nodes, max_depth, ray.o)) { t = max(0.0f, TriangleOctree::ray_intersect(octree_nodes, max_depth, ray.o, ray.d)); - if (ray.o.y() > floor_y && ray.d.y()<0.f) { - float floor_dist = -(ray.o.y() - floor_y)/ray.d.y(); - if (floor_dist>0.f) { - t=min(t,floor_dist); + if (ray.o.y() > floor_y && ray.d.y() < 0.f) { + float floor_dist = -(ray.o.y() - floor_y) / ray.d.y(); + if (floor_dist > 0.f) { + t = min(t, floor_dist); } } + ray.o = ray.o + (t + 1e-6f) * ray.d; } @@ -591,7 +592,8 @@ __global__ void sample_uniform_on_triangle_kernel(uint32_t n_elements, const flo sampled_positions[i] = triangles[tri_idx].sample_uniform_position(sample.tail<2>()); } -void Testbed::SphereTracer::init_rays_from_camera(uint32_t sample_index, +void Testbed::SphereTracer::init_rays_from_camera( + uint32_t sample_index, const Vector2i& resolution, const Vector2f& focal_length, const Matrix<float, 3, 4>& camera_matrix, @@ -613,15 +615,15 @@ void Testbed::SphereTracer::init_rays_from_camera(uint32_t sample_index, ) { // Make sure we have enough memory reserved to render at the requested resolution size_t n_pixels = (size_t)resolution.x() * resolution.y(); - enlarge(n_pixels); + enlarge(n_pixels, stream); const dim3 threads = { 16, 8, 1 }; const dim3 blocks = { div_round_up((uint32_t)resolution.x(), threads.x), div_round_up((uint32_t)resolution.y(), threads.y), 1 }; init_rays_with_payload_kernel_sdf<<<blocks, threads, 0, stream>>>( sample_index, - m_rays[0].pos.data(), - m_rays[0].distance.data(), - m_rays[0].payload.data(), + m_rays[0].pos, + m_rays[0].distance, + m_rays[0].payload, resolution, focal_length, camera_matrix, @@ -644,7 +646,7 @@ void Testbed::SphereTracer::init_rays_from_camera(uint32_t sample_index, } void Testbed::SphereTracer::init_rays_from_data(uint32_t n_elements, const RaysSdfSoa& data, cudaStream_t stream) { - enlarge(n_elements); + enlarge(n_elements, stream); m_rays[0].copy_from_other_async(n_elements, data, stream); m_n_rays_initialized = n_elements; } @@ -658,11 +660,11 @@ uint32_t Testbed::SphereTracer::trace_bvh(TriangleBvh* bvh, const Triangle* tria } // Abuse the normal buffer to temporarily hold ray directions - parallel_for_gpu(stream, n_alive, [payloads=m_rays[0].payload.data(), normals=m_rays[0].normal.data()] __device__ (size_t i) { + parallel_for_gpu(stream, n_alive, [payloads=m_rays[0].payload, normals=m_rays[0].normal] __device__ (size_t i) { normals[i] = payloads[i].dir; }); - bvh->ray_trace_gpu(n_alive, m_rays[0].pos.data(), m_rays[0].normal.data(), triangles, stream); + bvh->ray_trace_gpu(n_alive, m_rays[0].pos, m_rays[0].normal, triangles, stream); return n_alive; } @@ -681,7 +683,7 @@ uint32_t Testbed::SphereTracer::trace( return 0; } - CUDA_CHECK_THROW(cudaMemsetAsync(m_hit_counter.data(), 0, sizeof(uint32_t), stream)); + CUDA_CHECK_THROW(cudaMemsetAsync(m_hit_counter, 0, sizeof(uint32_t), stream)); const uint32_t STEPS_INBETWEEN_COMPACTION = 4; @@ -700,29 +702,29 @@ uint32_t Testbed::SphereTracer::trace( // Compact rays that did not diverge yet { - CUDA_CHECK_THROW(cudaMemsetAsync(m_alive_counter.data(), 0, sizeof(uint32_t), stream)); + CUDA_CHECK_THROW(cudaMemsetAsync(m_alive_counter, 0, sizeof(uint32_t), stream)); if (m_trace_shadow_rays) { linear_kernel(compact_kernel_shadow_sdf, 0, stream, n_alive, zero_offset, - rays_tmp.pos.data(), rays_tmp.distance.data(), rays_tmp.payload.data(), rays_tmp.prev_distance.data(), rays_tmp.total_distance.data(), rays_tmp.min_visibility.data(), - rays_current.pos.data(), rays_current.distance.data(), rays_current.payload.data(), rays_current.prev_distance.data(), rays_current.total_distance.data(), rays_current.min_visibility.data(), - m_rays_hit.pos.data(), m_rays_hit.distance.data(), m_rays_hit.payload.data(), m_rays_hit.prev_distance.data(), m_rays_hit.total_distance.data(), m_rays_hit.min_visibility.data(), + rays_tmp.pos, rays_tmp.distance, rays_tmp.payload, rays_tmp.prev_distance, rays_tmp.total_distance, rays_tmp.min_visibility, + rays_current.pos, rays_current.distance, rays_current.payload, rays_current.prev_distance, rays_current.total_distance, rays_current.min_visibility, + m_rays_hit.pos, m_rays_hit.distance, m_rays_hit.payload, m_rays_hit.prev_distance, m_rays_hit.total_distance, m_rays_hit.min_visibility, aabb, - m_alive_counter.data(), m_hit_counter.data() + m_alive_counter, m_hit_counter ); } else { linear_kernel(compact_kernel_sdf, 0, stream, n_alive, zero_offset, - rays_tmp.pos.data(), rays_tmp.distance.data(), rays_tmp.payload.data(), - rays_current.pos.data(), rays_current.distance.data(), rays_current.payload.data(), - m_rays_hit.pos.data(), m_rays_hit.distance.data(), m_rays_hit.payload.data(), + rays_tmp.pos, rays_tmp.distance, rays_tmp.payload, + rays_current.pos, rays_current.distance, rays_current.payload, + m_rays_hit.pos, m_rays_hit.distance, m_rays_hit.payload, aabb, - m_alive_counter.data(), m_hit_counter.data() + m_alive_counter, m_hit_counter ); } - CUDA_CHECK_THROW(cudaMemcpyAsync(&n_alive, m_alive_counter.data(), sizeof(uint32_t), cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK_THROW(cudaMemcpyAsync(&n_alive, m_alive_counter, sizeof(uint32_t), cudaMemcpyDeviceToHost, stream)); CUDA_CHECK_THROW(cudaStreamSynchronize(stream)); } @@ -735,9 +737,9 @@ uint32_t Testbed::SphereTracer::trace( linear_kernel(advance_pos_kernel_sdf, 0, stream, n_alive, zero_offset, - rays_current.pos.data(), - rays_current.distance.data(), - rays_current.payload.data(), + rays_current.pos, + rays_current.distance, + rays_current.payload, aabb, floor_y, octree ? octree->nodes_gpu() : nullptr, @@ -745,9 +747,9 @@ uint32_t Testbed::SphereTracer::trace( distance_scale, maximum_distance, m_shadow_sharpness, - m_trace_shadow_rays ? rays_current.prev_distance.data() : nullptr, - m_trace_shadow_rays ? rays_current.total_distance.data() : nullptr, - m_trace_shadow_rays ? rays_current.min_visibility.data() : nullptr + m_trace_shadow_rays ? rays_current.prev_distance : nullptr, + m_trace_shadow_rays ? rays_current.total_distance : nullptr, + m_trace_shadow_rays ? rays_current.min_visibility : nullptr ); } @@ -755,36 +757,67 @@ uint32_t Testbed::SphereTracer::trace( } uint32_t n_hit; - CUDA_CHECK_THROW(cudaMemcpyAsync(&n_hit, m_hit_counter.data(), sizeof(uint32_t), cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK_THROW(cudaMemcpyAsync(&n_hit, m_hit_counter, sizeof(uint32_t), cudaMemcpyDeviceToHost, stream)); CUDA_CHECK_THROW(cudaStreamSynchronize(stream)); return n_hit; } -void Testbed::SphereTracer::enlarge(size_t n_elements) { +void Testbed::SphereTracer::enlarge(size_t n_elements, cudaStream_t stream) { n_elements = next_multiple(n_elements, size_t(tcnn::batch_size_granularity)); - m_rays[0].enlarge(n_elements); - m_rays[1].enlarge(n_elements); - m_rays_hit.enlarge(n_elements); + auto scratch = allocate_workspace_and_distribute< + Vector3f, Vector3f, float, float, float, float, SdfPayload, // m_rays[0] + Vector3f, Vector3f, float, float, float, float, SdfPayload, // m_rays[1] + Vector3f, Vector3f, float, float, float, float, SdfPayload, // m_rays_hit + + uint32_t, + uint32_t + >( + stream, &m_scratch_alloc, + n_elements, n_elements, n_elements, n_elements, n_elements, n_elements, n_elements, + n_elements, n_elements, n_elements, n_elements, n_elements, n_elements, n_elements, + n_elements, n_elements, n_elements, n_elements, n_elements, n_elements, n_elements, + 32, // 2 full cache lines to ensure no overlap + 32 // 2 full cache lines to ensure no overlap + ); + + m_rays[0].set(std::get<0>(scratch), std::get<1>(scratch), std::get<2>(scratch), std::get<3>(scratch), std::get<4>(scratch), std::get<5>(scratch), std::get<6>(scratch)); + m_rays[1].set(std::get<7>(scratch), std::get<8>(scratch), std::get<9>(scratch), std::get<10>(scratch), std::get<11>(scratch), std::get<12>(scratch), std::get<13>(scratch)); + m_rays_hit.set(std::get<14>(scratch), std::get<15>(scratch), std::get<16>(scratch), std::get<17>(scratch), std::get<18>(scratch), std::get<19>(scratch), std::get<20>(scratch)); + + m_hit_counter = std::get<21>(scratch); + m_alive_counter = std::get<22>(scratch); } -void Testbed::FiniteDifferenceNormalsApproximator::enlarge(uint32_t n_elements) { - dx.enlarge(n_elements); - dy.enlarge(n_elements); - dz.enlarge(n_elements); +void Testbed::FiniteDifferenceNormalsApproximator::enlarge(uint32_t n_elements, cudaStream_t stream) { + n_elements = next_multiple(n_elements, tcnn::batch_size_granularity); + auto scratch = allocate_workspace_and_distribute< + Vector3f, Vector3f, Vector3f, + float, float, float, + float, float, float + >( + stream, &m_scratch_alloc, + n_elements, n_elements, n_elements, + n_elements, n_elements, n_elements, + n_elements, n_elements, n_elements + ); + + dx = std::get<0>(scratch); + dy = std::get<1>(scratch); + dz = std::get<2>(scratch); - dist_dx_pos.enlarge(n_elements); - dist_dy_pos.enlarge(n_elements); - dist_dz_pos.enlarge(n_elements); + dist_dx_pos = std::get<3>(scratch); + dist_dy_pos = std::get<4>(scratch); + dist_dz_pos = std::get<5>(scratch); - dist_dx_neg.enlarge(n_elements); - dist_dy_neg.enlarge(n_elements); - dist_dz_neg.enlarge(n_elements); + dist_dx_neg = std::get<6>(scratch); + dist_dy_neg = std::get<7>(scratch); + dist_dz_neg = std::get<8>(scratch); } -void Testbed::FiniteDifferenceNormalsApproximator::normal(uint32_t n_elements, const distance_fun_t& distance_function, GPUMemory<Vector3f>& pos, GPUMemory<Vector3f>& normal, float epsilon, cudaStream_t stream) { - enlarge(n_elements); +void Testbed::FiniteDifferenceNormalsApproximator::normal(uint32_t n_elements, const distance_fun_t& distance_function, const Vector3f* pos, Vector3f* normal, float epsilon, cudaStream_t stream) { + enlarge(n_elements, stream); - parallel_for_gpu(stream, n_elements, [pos=pos.data(), dx=dx.data(), dy=dy.data(), dz=dz.data(), epsilon] __device__ (size_t i) { + parallel_for_gpu(stream, n_elements, [pos=pos, dx=dx, dy=dy, dz=dz, epsilon] __device__ (size_t i) { Vector3f p = pos[i]; dx[i] = Vector3f{p.x() + epsilon, p.y(), p.z()}; dy[i] = Vector3f{p.x(), p.y() + epsilon, p.z()}; @@ -795,7 +828,7 @@ void Testbed::FiniteDifferenceNormalsApproximator::normal(uint32_t n_elements, c distance_function(n_elements, dy, dist_dy_pos, stream); distance_function(n_elements, dz, dist_dz_pos, stream); - parallel_for_gpu(stream, n_elements, [pos=pos.data(), dx=dx.data(), dy=dy.data(), dz=dz.data(), epsilon] __device__ (size_t i) { + parallel_for_gpu(stream, n_elements, [pos=pos, dx=dx, dy=dy, dz=dz, epsilon] __device__ (size_t i) { Vector3f p = pos[i]; dx[i] = Vector3f{p.x() - epsilon, p.y(), p.z()}; dy[i] = Vector3f{p.x(), p.y() - epsilon, p.z()}; @@ -806,7 +839,7 @@ void Testbed::FiniteDifferenceNormalsApproximator::normal(uint32_t n_elements, c distance_function(n_elements, dy, dist_dy_neg, stream); distance_function(n_elements, dz, dist_dz_neg, stream); - parallel_for_gpu(stream, n_elements, [normal=normal.data(), dist_dx_pos=dist_dx_pos.data(), dist_dx_neg=dist_dx_neg.data(), dist_dy_pos=dist_dy_pos.data(), dist_dy_neg=dist_dy_neg.data(), dist_dz_pos=dist_dz_pos.data(), dist_dz_neg=dist_dz_neg.data()] __device__ (size_t i) { + parallel_for_gpu(stream, n_elements, [normal=normal, dist_dx_pos=dist_dx_pos, dist_dx_neg=dist_dx_neg, dist_dy_pos=dist_dy_pos, dist_dy_neg=dist_dy_neg, dist_dz_pos=dist_dz_pos, dist_dz_neg=dist_dz_neg] __device__ (size_t i) { normal[i] = {dist_dx_pos[i] - dist_dx_neg[i], dist_dy_pos[i] - dist_dy_neg[i], dist_dz_pos[i] - dist_dz_neg[i]}; }); } @@ -827,8 +860,7 @@ void Testbed::render_sdf( } auto* octree_ptr = m_sdf.uses_takikawa_encoding || m_sdf.use_triangle_octree ? m_sdf.triangle_octree.get() : nullptr; - // Reserve the memory for max-res rendering to prevent stuttering - m_sdf.tracer.enlarge(max_res.x() * max_res.y()); + SphereTracer tracer; uint32_t n_octree_levels = octree_ptr ? octree_ptr->depth() : 0; if (m_render_ground_truth && m_sdf.groundtruth_mode == ESDFGroundTruthMode::SDFBricks) { @@ -837,7 +869,7 @@ void Testbed::render_sdf( BoundingBox sdf_bounding_box = m_aabb; sdf_bounding_box.inflate(m_sdf.zero_offset); - m_sdf.tracer.init_rays_from_camera( + tracer.init_rays_from_camera( render_buffer.spp(), render_buffer.in_resolution(), focal_length, @@ -881,22 +913,22 @@ void Testbed::render_sdf( uint32_t n_hit; if (m_render_mode == ERenderMode::Slice) { - n_hit = m_sdf.tracer.n_rays_initialized(); + n_hit = tracer.n_rays_initialized(); } else { - n_hit = trace(m_sdf.tracer); + n_hit = trace(tracer); } - RaysSdfSoa& rays_hit = m_render_mode == ERenderMode::Slice || gt_raytrace ? m_sdf.tracer.rays_init() : m_sdf.tracer.rays_hit(); + RaysSdfSoa& rays_hit = m_render_mode == ERenderMode::Slice || gt_raytrace ? tracer.rays_init() : tracer.rays_hit(); if (m_render_mode == ERenderMode::Slice) { if (m_visualized_dimension == -1) { distance_function(n_hit, rays_hit.pos, rays_hit.distance, stream); - extract_dimension_pos_neg_kernel<float><<<n_blocks_linear(n_hit*3), n_threads_linear, 0, stream>>>(n_hit*3, 0, 1, 3, rays_hit.distance.data(), CM, (float*)rays_hit.normal.data()); + extract_dimension_pos_neg_kernel<float><<<n_blocks_linear(n_hit*3), n_threads_linear, 0, stream>>>(n_hit*3, 0, 1, 3, rays_hit.distance, CM, (float*)rays_hit.normal); } else { // Store colors in the normal buffer uint32_t n_elements = next_multiple(n_hit, tcnn::batch_size_granularity); - GPUMatrix<float> positions_matrix((float*)rays_hit.pos.data(), 3, n_elements); - GPUMatrix<float> colors_matrix((float*)rays_hit.normal.data(), 3, n_elements); + GPUMatrix<float> positions_matrix((float*)rays_hit.pos, 3, n_elements); + GPUMatrix<float> colors_matrix((float*)rays_hit.normal, 3, n_elements); m_network->visualize_activation(stream, m_visualized_layer, m_visualized_dimension, positions_matrix, colors_matrix); } } @@ -906,45 +938,47 @@ void Testbed::render_sdf( if (m_sdf.analytic_normals || gt_raytrace) { normals_function(n_hit, rays_hit.pos, rays_hit.normal, stream); } else { - // Prevent spurious enlargements by reserving enough memory to hold a full-res image in any case. - m_sdf.fd_normals.enlarge(render_buffer.in_resolution().x() * render_buffer.in_resolution().y()); float fd_normals_epsilon = m_sdf.fd_normals_epsilon; if (m_render_ground_truth && m_sdf.groundtruth_mode == ESDFGroundTruthMode::SDFBricks && m_sdf.brick_smooth_normals) { fd_normals_epsilon = exp2f(-float(n_octree_levels)) * (1.f/(m_sdf.brick_res-1)); // in sdf brick mode, use one voxel as the normal central difference radius } - m_sdf.fd_normals.normal(n_hit, distance_function, rays_hit.pos, rays_hit.normal, fd_normals_epsilon, stream); + + FiniteDifferenceNormalsApproximator fd_normals; + fd_normals.normal(n_hit, distance_function, rays_hit.pos, rays_hit.normal, fd_normals_epsilon, stream); } if (render_mode == ERenderMode::Shade && n_hit > 0) { // Shadow rays towards the sun - m_sdf.shadow_tracer.init_rays_from_data(n_hit, rays_hit, stream); - m_sdf.shadow_tracer.set_trace_shadow_rays(true); - m_sdf.shadow_tracer.set_shadow_sharpness(m_sdf.shadow_sharpness); - RaysSdfSoa& shadow_rays_init = m_sdf.shadow_tracer.rays_init(); + SphereTracer shadow_tracer; + + shadow_tracer.init_rays_from_data(n_hit, rays_hit, stream); + shadow_tracer.set_trace_shadow_rays(true); + shadow_tracer.set_shadow_sharpness(m_sdf.shadow_sharpness); + RaysSdfSoa& shadow_rays_init = shadow_tracer.rays_init(); linear_kernel(prepare_shadow_rays, 0, stream, n_hit, m_sun_dir.normalized(), - shadow_rays_init.pos.data(), - shadow_rays_init.normal.data(), - shadow_rays_init.distance.data(), - shadow_rays_init.prev_distance.data(), - shadow_rays_init.total_distance.data(), - shadow_rays_init.min_visibility.data(), - shadow_rays_init.payload.data(), + shadow_rays_init.pos, + shadow_rays_init.normal, + shadow_rays_init.distance, + shadow_rays_init.prev_distance, + shadow_rays_init.total_distance, + shadow_rays_init.min_visibility, + shadow_rays_init.payload, sdf_bounding_box, octree_ptr ? octree_ptr->nodes_gpu() : nullptr, n_octree_levels ); - uint32_t n_hit_shadow = trace(m_sdf.shadow_tracer); - auto& shadow_rays_hit = gt_raytrace ? m_sdf.shadow_tracer.rays_init() : m_sdf.shadow_tracer.rays_hit(); + uint32_t n_hit_shadow = trace(shadow_tracer); + auto& shadow_rays_hit = gt_raytrace ? shadow_tracer.rays_init() : shadow_tracer.rays_hit(); linear_kernel(write_shadow_ray_result, 0, stream, n_hit_shadow, sdf_bounding_box, - shadow_rays_hit.pos.data(), - shadow_rays_hit.payload.data(), - shadow_rays_hit.min_visibility.data(), - rays_hit.distance.data() + shadow_rays_hit.pos, + shadow_rays_hit.payload, + shadow_rays_hit.min_visibility, + rays_hit.distance ); // todo: Reflection rays? @@ -953,8 +987,8 @@ void Testbed::render_sdf( // HACK: Store colors temporarily in the normal buffer uint32_t n_elements = next_multiple(n_hit, tcnn::batch_size_granularity); - GPUMatrix<float> positions_matrix((float*)rays_hit.pos.data(), 3, n_elements); - GPUMatrix<float> colors_matrix((float*)rays_hit.normal.data(), 3, n_elements); + GPUMatrix<float> positions_matrix((float*)rays_hit.pos, 3, n_elements); + GPUMatrix<float> colors_matrix((float*)rays_hit.normal, 3, n_elements); m_network->visualize_activation(stream, m_visualized_layer, m_visualized_dimension, positions_matrix, colors_matrix); } @@ -967,17 +1001,18 @@ void Testbed::render_sdf( m_sun_dir.normalized(), m_up_dir.normalized(), camera_matrix, - rays_hit.pos.data(), - rays_hit.normal.data(), - rays_hit.distance.data(), - rays_hit.payload.data(), + rays_hit.pos, + rays_hit.normal, + rays_hit.distance, + rays_hit.payload, render_buffer.frame_buffer(), render_buffer.depth_buffer() ); if (render_mode == ERenderMode::Cost) { std::vector<SdfPayload> payloads_final_cpu(n_hit); - rays_hit.payload.copy_to_host(payloads_final_cpu, n_hit); + CUDA_CHECK_THROW(cudaMemcpyAsync(payloads_final_cpu.data(), rays_hit.payload, n_hit * sizeof(SdfPayload), cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK_THROW(cudaStreamSynchronize(stream)); size_t total_n_steps = 0; for (uint32_t i = 0; i < n_hit; ++i) { total_n_steps += payloads_final_cpu[i].n_steps;