diff --git a/include/neural-graphics-primitives/dlss.h b/include/neural-graphics-primitives/dlss.h index 9f6dbdc0acdf3db3cca93c76a79ea3a1ead9c601..dbe86fccca1cd0d911535b340e8c2a05bd406a5b 100644 --- a/include/neural-graphics-primitives/dlss.h +++ b/include/neural-graphics-primitives/dlss.h @@ -26,6 +26,11 @@ class IDlss { public: virtual ~IDlss() {} + virtual void update_feature( + const Eigen::Vector2i& in_resolution, + bool is_hdr, + bool sharpen + ) = 0; virtual void run( const Eigen::Vector2i& in_resolution, bool is_hdr, @@ -42,8 +47,10 @@ public: virtual Eigen::Vector2i clamp_resolution(const Eigen::Vector2i& resolution) const = 0; virtual Eigen::Vector2i out_resolution() const = 0; + virtual Eigen::Vector2i max_out_resolution() const = 0; virtual bool is_hdr() const = 0; + virtual bool sharpen() const = 0; virtual EDlssQuality quality() const = 0; }; diff --git a/include/neural-graphics-primitives/render_buffer.h b/include/neural-graphics-primitives/render_buffer.h index 99756c6c559084a27d890d035ae323474e03ea28..73466ff3008d78863dea9a075240b5907d2d8552 100644 --- a/include/neural-graphics-primitives/render_buffer.h +++ b/include/neural-graphics-primitives/render_buffer.h @@ -95,11 +95,11 @@ public: GLuint texture(); - cudaSurfaceObject_t surface() override ; + cudaSurfaceObject_t surface() override; - cudaArray_t array() override ; + cudaArray_t array() override; - void blit_from_cuda_mapping() ; + void blit_from_cuda_mapping(); const std::string& texture_name() const { return m_texture_name; } @@ -162,7 +162,9 @@ public: CudaRenderBuffer(const std::shared_ptr<SurfaceProvider>& surf) : m_surface_provider{surf} {} CudaRenderBuffer(const CudaRenderBuffer& other) = delete; + CudaRenderBuffer& operator=(const CudaRenderBuffer& other) = delete; CudaRenderBuffer(CudaRenderBuffer&& other) = default; + CudaRenderBuffer& operator=(CudaRenderBuffer&& other) = default; cudaSurfaceObject_t surface() { return m_surface_provider->surface(); @@ -186,6 +188,10 @@ public: return m_spp; } + void set_spp(uint32_t value) { + m_spp = value; + } + Eigen::Array4f* frame_buffer() const { return m_frame_buffer.data(); } @@ -249,7 +255,7 @@ public: } } - void enable_dlss(const Eigen::Vector2i& out_res); + void enable_dlss(const Eigen::Vector2i& max_out_res); void disable_dlss(); void set_dlss_sharpening(float value) { m_dlss_sharpening = value; diff --git a/include/neural-graphics-primitives/sdf.h b/include/neural-graphics-primitives/sdf.h index ddef63cab37c9824bfeaa5cb5bdf0a58e8e4c6e0..07a99bbd90e17dbac43d9875a9e6dc54c96ff489 100644 --- a/include/neural-graphics-primitives/sdf.h +++ b/include/neural-graphics-primitives/sdf.h @@ -29,34 +29,34 @@ struct SdfPayload { struct RaysSdfSoa { #if defined(__NVCC__) || (defined(__clang__) && defined(__CUDA__)) - void enlarge(size_t n_elements) { - pos.enlarge(n_elements); - normal.enlarge(n_elements); - distance.enlarge(n_elements); - prev_distance.enlarge(n_elements); - total_distance.enlarge(n_elements); - min_visibility.enlarge(n_elements); - payload.enlarge(n_elements); - } - void copy_from_other_async(uint32_t n_elements, const RaysSdfSoa& other, cudaStream_t stream) { - CUDA_CHECK_THROW(cudaMemcpyAsync(pos.data(), other.pos.data(), n_elements * sizeof(Eigen::Vector3f), cudaMemcpyDeviceToDevice, stream)); - CUDA_CHECK_THROW(cudaMemcpyAsync(normal.data(), other.normal.data(), n_elements * sizeof(Eigen::Vector3f), cudaMemcpyDeviceToDevice, stream)); - CUDA_CHECK_THROW(cudaMemcpyAsync(distance.data(), other.distance.data(), n_elements * sizeof(float), cudaMemcpyDeviceToDevice, stream)); - CUDA_CHECK_THROW(cudaMemcpyAsync(prev_distance.data(), other.prev_distance.data(), n_elements * sizeof(float), cudaMemcpyDeviceToDevice, stream)); - CUDA_CHECK_THROW(cudaMemcpyAsync(total_distance.data(), other.total_distance.data(), n_elements * sizeof(float), cudaMemcpyDeviceToDevice, stream)); - CUDA_CHECK_THROW(cudaMemcpyAsync(min_visibility.data(), other.min_visibility.data(), n_elements * sizeof(float), cudaMemcpyDeviceToDevice, stream)); - CUDA_CHECK_THROW(cudaMemcpyAsync(payload.data(), other.payload.data(), n_elements * sizeof(SdfPayload), cudaMemcpyDeviceToDevice, stream)); + CUDA_CHECK_THROW(cudaMemcpyAsync(pos, other.pos, n_elements * sizeof(Eigen::Vector3f), cudaMemcpyDeviceToDevice, stream)); + CUDA_CHECK_THROW(cudaMemcpyAsync(normal, other.normal, n_elements * sizeof(Eigen::Vector3f), cudaMemcpyDeviceToDevice, stream)); + CUDA_CHECK_THROW(cudaMemcpyAsync(distance, other.distance, n_elements * sizeof(float), cudaMemcpyDeviceToDevice, stream)); + CUDA_CHECK_THROW(cudaMemcpyAsync(prev_distance, other.prev_distance, n_elements * sizeof(float), cudaMemcpyDeviceToDevice, stream)); + CUDA_CHECK_THROW(cudaMemcpyAsync(total_distance, other.total_distance, n_elements * sizeof(float), cudaMemcpyDeviceToDevice, stream)); + CUDA_CHECK_THROW(cudaMemcpyAsync(min_visibility, other.min_visibility, n_elements * sizeof(float), cudaMemcpyDeviceToDevice, stream)); + CUDA_CHECK_THROW(cudaMemcpyAsync(payload, other.payload, n_elements * sizeof(SdfPayload), cudaMemcpyDeviceToDevice, stream)); } #endif - tcnn::GPUMemory<Eigen::Vector3f> pos; - tcnn::GPUMemory<Eigen::Vector3f> normal; - tcnn::GPUMemory<float> distance; - tcnn::GPUMemory<float> prev_distance; - tcnn::GPUMemory<float> total_distance; - tcnn::GPUMemory<float> min_visibility; - tcnn::GPUMemory<SdfPayload> payload; + void set(Eigen::Vector3f* pos, Eigen::Vector3f* normal, float* distance, float* prev_distance, float* total_distance, float* min_visibility, SdfPayload* payload) { + this->pos = pos; + this->normal = normal; + this->distance = distance; + this->prev_distance = prev_distance; + this->total_distance = total_distance; + this->min_visibility = min_visibility; + this->payload = payload; + } + + Eigen::Vector3f* pos; + Eigen::Vector3f* normal; + float* distance; + float* prev_distance; + float* total_distance; + float* min_visibility; + SdfPayload* payload; }; struct BRDFParams { diff --git a/include/neural-graphics-primitives/testbed.h b/include/neural-graphics-primitives/testbed.h index 1503674304e944289d9534ca639cc6b38809c99f..8e55eeb08c90198e5b45320549e89a0b789dbe82 100644 --- a/include/neural-graphics-primitives/testbed.h +++ b/include/neural-graphics-primitives/testbed.h @@ -69,12 +69,12 @@ public: void load_training_data(const std::string& data_path); void clear_training_data(); - using distance_fun_t = std::function<void(uint32_t, const tcnn::GPUMemory<Eigen::Vector3f>&, tcnn::GPUMemory<float>&, cudaStream_t)>; - using normals_fun_t = std::function<void(uint32_t, const tcnn::GPUMemory<Eigen::Vector3f>&, tcnn::GPUMemory<Eigen::Vector3f>&, cudaStream_t)>; + using distance_fun_t = std::function<void(uint32_t, const Eigen::Vector3f*, float*, cudaStream_t)>; + using normals_fun_t = std::function<void(uint32_t, const Eigen::Vector3f*, Eigen::Vector3f*, cudaStream_t)>; class SphereTracer { public: - SphereTracer() : m_hit_counter(1), m_alive_counter(1) {} + SphereTracer() {} void init_rays_from_camera( uint32_t spp, @@ -111,7 +111,7 @@ public: uint32_t n_octree_levels, cudaStream_t stream ); - void enlarge(size_t n_elements); + void enlarge(size_t n_elements, cudaStream_t stream); RaysSdfSoa& rays_hit() { return m_rays_hit; } RaysSdfSoa& rays_init() { return m_rays[0]; } uint32_t n_rays_initialized() const { return m_n_rays_initialized; } @@ -120,16 +120,19 @@ public: private: RaysSdfSoa m_rays[2]; RaysSdfSoa m_rays_hit; - tcnn::GPUMemory<uint32_t> m_hit_counter; - tcnn::GPUMemory<uint32_t> m_alive_counter; + uint32_t* m_hit_counter; + uint32_t* m_alive_counter; + uint32_t m_n_rays_initialized = 0; float m_shadow_sharpness = 2048.f; bool m_trace_shadow_rays = false; + + tcnn::GPUMemoryArena::Allocation m_scratch_alloc; }; class NerfTracer { public: - NerfTracer() : m_hit_counter(1), m_alive_counter(1) {} + NerfTracer() {} void init_rays_from_camera( uint32_t spp, @@ -193,38 +196,36 @@ public: RaysNerfSoa& rays_init() { return m_rays[0]; } uint32_t n_rays_initialized() const { return m_n_rays_initialized; } - void clear() { - m_scratch_alloc = {}; - } - private: RaysNerfSoa m_rays[2]; RaysNerfSoa m_rays_hit; precision_t* m_network_output; float* m_network_input; - tcnn::GPUMemory<uint32_t> m_hit_counter; - tcnn::GPUMemory<uint32_t> m_alive_counter; + uint32_t* m_hit_counter; + uint32_t* m_alive_counter; uint32_t m_n_rays_initialized = 0; tcnn::GPUMemoryArena::Allocation m_scratch_alloc; }; class FiniteDifferenceNormalsApproximator { public: - void enlarge(uint32_t n_elements); - void normal(uint32_t n_elements, const distance_fun_t& distance_function, tcnn::GPUMemory<Eigen::Vector3f>& pos, tcnn::GPUMemory<Eigen::Vector3f>& normal, float epsilon, cudaStream_t stream); + void enlarge(uint32_t n_elements, cudaStream_t stream); + void normal(uint32_t n_elements, const distance_fun_t& distance_function, const Eigen::Vector3f* pos, Eigen::Vector3f* normal, float epsilon, cudaStream_t stream); private: - tcnn::GPUMemory<Eigen::Vector3f> dx; - tcnn::GPUMemory<Eigen::Vector3f> dy; - tcnn::GPUMemory<Eigen::Vector3f> dz; + Eigen::Vector3f* dx; + Eigen::Vector3f* dy; + Eigen::Vector3f* dz; - tcnn::GPUMemory<float> dist_dx_pos; - tcnn::GPUMemory<float> dist_dy_pos; - tcnn::GPUMemory<float> dist_dz_pos; + float* dist_dx_pos; + float* dist_dy_pos; + float* dist_dz_pos; - tcnn::GPUMemory<float> dist_dx_neg; - tcnn::GPUMemory<float> dist_dy_neg; - tcnn::GPUMemory<float> dist_dz_neg; + float* dist_dx_neg; + float* dist_dy_neg; + float* dist_dz_neg; + + tcnn::GPUMemoryArena::Allocation m_scratch_alloc; }; struct LevelStats { @@ -532,8 +533,6 @@ public: bool m_gui_redraw = true; struct Nerf { - NerfTracer tracer; - struct Training { NerfDataset dataset; int n_images_for_training = 0; // how many images to train from, as a high watermark compared to the dataset size @@ -640,9 +639,6 @@ public: uint32_t max_cascade = 0; - tcnn::GPUMemory<float> vis_input; - tcnn::GPUMemory<Eigen::Array4f> vis_rgba; - ENerfActivation rgb_activation = ENerfActivation::Exponential; ENerfActivation density_activation = ENerfActivation::Exponential; @@ -666,8 +662,6 @@ public: } m_nerf; struct Sdf { - SphereTracer tracer; - SphereTracer shadow_tracer; float shadow_sharpness = 2048.0f; float maximum_distance = 0.00005f; float fd_normals_epsilon = 0.0005f; @@ -676,8 +670,6 @@ public: BRDFParams brdf; - FiniteDifferenceNormalsApproximator fd_normals; - // Mesh data EMeshSdfMode mesh_sdf_mode = EMeshSdfMode::Raystab; float mesh_scale; @@ -764,7 +756,7 @@ public: tcnn::GPUMemory<char> nanovdb_grid; tcnn::GPUMemory<uint8_t> bitgrid; float global_majorant = 1.f; - Eigen::Vector3f world2index_offset = {0,0,0}; + Eigen::Vector3f world2index_offset = {0, 0, 0}; float world2index_scale = 1.f; struct Training { @@ -790,7 +782,7 @@ public: float m_render_near_distance = 0.0f; float m_slice_plane_z = 0.0f; bool m_floor_enable = false; - inline float get_floor_y() const { return m_floor_enable ? m_aabb.min.y()+0.001f : -10000.f; } + inline float get_floor_y() const { return m_floor_enable ? m_aabb.min.y() + 0.001f : -10000.f; } BoundingBox m_raw_aabb; BoundingBox m_aabb; BoundingBox m_render_aabb; diff --git a/include/neural-graphics-primitives/thread_pool.h b/include/neural-graphics-primitives/thread_pool.h index 47e9d7b268ee878fe3917fc210a93ddeef3b5fa9..37f7fd53b18373652e3138e65de96f84c3eda678 100644 --- a/include/neural-graphics-primitives/thread_pool.h +++ b/include/neural-graphics-primitives/thread_pool.h @@ -41,8 +41,6 @@ public: auto enqueueTask(F&& f, bool highPriority = false) -> std::future<std::result_of_t <F()>> { using return_type = std::result_of_t<F()>; - ++mNumTasksInSystem; - auto task = std::make_shared<std::packaged_task<return_type()>>(std::forward<F>(f)); auto res = task->get_future(); @@ -63,13 +61,8 @@ public: void startThreads(size_t num); void shutdownThreads(size_t num); + void setNThreads(size_t num); - size_t numTasksInSystem() const { - return mNumTasksInSystem; - } - - void waitUntilFinished(); - void waitUntilFinishedFor(const std::chrono::microseconds Duration); void flushQueue(); template <typename Int, typename F> @@ -109,10 +102,6 @@ private: std::deque<std::function<void()>> mTaskQueue; std::mutex mTaskQueueMutex; std::condition_variable mWorkerCondition; - - std::atomic<size_t> mNumTasksInSystem; - std::mutex mSystemBusyMutex; - std::condition_variable mSystemBusyCondition; }; NGP_NAMESPACE_END diff --git a/scripts/colmap2nerf.py b/scripts/colmap2nerf.py index b4f463e0a2bb8a024411ab6246b5a11454023c54..d015f5e1d1030484bc54bb9ae4c4f267ced4012f 100755 --- a/scripts/colmap2nerf.py +++ b/scripts/colmap2nerf.py @@ -33,7 +33,7 @@ def parse_args(): parser.add_argument("--colmap_camera_params", default="", help="intrinsic parameters, depending on the chosen model. Format: fx,fy,cx,cy,dist") parser.add_argument("--images", default="images", help="input path to the images") parser.add_argument("--text", default="colmap_text", help="input path to the colmap text files (set automatically if run_colmap is used)") - parser.add_argument("--aabb_scale", default=16, choices=["1","2","4","8","16"], help="large scene scale factor. 1=scene fits in unit cube; power of 2 up to 16") + parser.add_argument("--aabb_scale", default=16, choices=["1", "2", "4", "8", "16", "32", "64", "128"], help="large scene scale factor. 1=scene fits in unit cube; power of 2 up to 16") parser.add_argument("--skip_early", default=0, help="skip this many images from the start") parser.add_argument("--keep_colmap_coords", action="store_true", help="keep transforms.json in COLMAP's original frame of reference (this will avoid reorienting and repositioning the scene for preview and rendering)") parser.add_argument("--out", default="transforms.json", help="output path") diff --git a/src/dlss.cu b/src/dlss.cu index 9bb402c4d63f93f941fcf70993de48abd1da9995..627dbd986513a6c5575be575cd36e46ac22bb95d 100644 --- a/src/dlss.cu +++ b/src/dlss.cu @@ -417,7 +417,7 @@ void vulkan_and_ngx_init() { throw std::runtime_error{fmt::format("DLSS not available: {}", ngx_error_string(ngx_result))}; } - tlog::success() << "Initialized Vulkan and NGX on device #" << device_id << ": " << physical_device_properties.deviceName; + tlog::success() << "Initialized Vulkan and NGX on GPU #" << device_id << ": " << physical_device_properties.deviceName; } size_t dlss_allocated_bytes() { @@ -878,6 +878,10 @@ public: return m_specs.quality; } + Vector2i out_resolution() const { + return m_specs.out_resolution; + } + Vector2i clamp_resolution(const Vector2i& resolution) const { return m_specs.clamp_resolution(resolution); } @@ -895,21 +899,44 @@ private: class Dlss : public IDlss { public: - Dlss(const Eigen::Vector2i& out_resolution) + Dlss(const Eigen::Vector2i& max_out_resolution) : - m_out_resolution{out_resolution}, + m_max_out_resolution{max_out_resolution}, // Allocate all buffers at output resolution and use dynamic sub-rects // to use subsets of them. This avoids re-allocations when using DLSS // with dynamically changing input resolution. - m_frame_buffer{out_resolution, 4}, - m_depth_buffer{out_resolution, 1}, - m_mvec_buffer{out_resolution, 2}, + m_frame_buffer{max_out_resolution, 4}, + m_depth_buffer{max_out_resolution, 1}, + m_mvec_buffer{max_out_resolution, 2}, m_exposure_buffer{{1, 1}, 1}, - m_output_buffer{out_resolution, 4} + m_output_buffer{max_out_resolution, 4} { + // Various quality modes of DLSS for (int i = 0; i < (int)EDlssQuality::NumDlssQualitySettings; ++i) { try { - auto specs = dlss_feature_specs(out_resolution, (EDlssQuality)i); + auto specs = dlss_feature_specs(max_out_resolution, (EDlssQuality)i); + + // Only emplace the specs if the feature can be created in practice! + DlssFeature{specs, true, true}; + DlssFeature{specs, true, false}; + DlssFeature{specs, false, true}; + DlssFeature{specs, false, false}; + m_dlss_specs.emplace_back(specs); + } catch (...) {} + } + + // For super insane performance requirements (more than 3x upscaling) try UltraPerformance + // with reduced output resolutions for 4.5x, 6x, 9x. + std::vector<Vector2i> reduced_out_resolutions = { + max_out_resolution / 3 * 2, + max_out_resolution / 2, + max_out_resolution / 3, + // max_out_resolution / 4, + }; + + for (const auto& out_resolution : reduced_out_resolutions) { + try { + auto specs = dlss_feature_specs(out_resolution, EDlssQuality::UltraPerformance); // Only emplace the specs if the feature can be created in practice! DlssFeature{specs, true, true}; @@ -926,20 +953,14 @@ public: m_dlss_feature = nullptr; } - void run( - const Vector2i& in_resolution, - bool is_hdr, - float sharpening, - const Vector2f& jitter_offset, - bool shall_reset - ) override { + void update_feature(const Vector2i& in_resolution, bool is_hdr, bool sharpen) override { CUDA_CHECK_THROW(cudaDeviceSynchronize()); - EDlssQuality quality; + DlssFeatureSpecs specs; bool found = false; - for (const auto& specs : m_dlss_specs) { - if (specs.distance(in_resolution) == 0.0f) { - quality = specs.quality; + for (const auto& s : m_dlss_specs) { + if (s.distance(in_resolution) == 0.0f) { + specs = s; found = true; } } @@ -948,10 +969,21 @@ public: throw std::runtime_error{"Dlss::run called with invalid input resolution."}; } - bool sharpen = sharpening != 0.0f; - if (!m_dlss_feature || m_dlss_feature->is_hdr() != is_hdr || m_dlss_feature->sharpen() != sharpen || m_dlss_feature->quality() != quality) { - m_dlss_feature.reset(new DlssFeature{m_out_resolution, is_hdr, sharpen, quality}); + if (!m_dlss_feature || m_dlss_feature->is_hdr() != is_hdr || m_dlss_feature->sharpen() != sharpen || m_dlss_feature->quality() != specs.quality || m_dlss_feature->out_resolution() != specs.out_resolution) { + m_dlss_feature.reset(new DlssFeature{specs.out_resolution, is_hdr, sharpen, specs.quality}); } + } + + void run( + const Vector2i& in_resolution, + bool is_hdr, + float sharpening, + const Vector2f& jitter_offset, + bool shall_reset + ) override { + CUDA_CHECK_THROW(cudaDeviceSynchronize()); + + update_feature(in_resolution, is_hdr, sharpening != 0.0f); m_dlss_feature->run( in_resolution, @@ -1001,13 +1033,21 @@ public: } Vector2i out_resolution() const override { - return m_out_resolution; + return m_dlss_feature ? m_dlss_feature->out_resolution() : m_max_out_resolution; + } + + Vector2i max_out_resolution() const override { + return m_max_out_resolution; } bool is_hdr() const override { return m_dlss_feature && m_dlss_feature->is_hdr(); } + bool sharpen() const override { + return m_dlss_feature && m_dlss_feature->sharpen(); + } + EDlssQuality quality() const override { return m_dlss_feature ? m_dlss_feature->quality() : EDlssQuality::None; } @@ -1022,7 +1062,7 @@ private: VulkanTexture m_exposure_buffer; VulkanTexture m_output_buffer; - Vector2i m_out_resolution; + Vector2i m_max_out_resolution; }; std::shared_ptr<IDlss> dlss_init(const Eigen::Vector2i& out_resolution) { diff --git a/src/nerf_loader.cu b/src/nerf_loader.cu index ba21e6b58de959b1744bba77bce111fec3340c24..eb0555bf75a26edebdd87b267842a0e8b17864c9 100644 --- a/src/nerf_loader.cu +++ b/src/nerf_loader.cu @@ -582,7 +582,6 @@ NerfDataset load_nerf(const std::vector<filesystem::path>& jsonpaths, float shar throw std::runtime_error{"Could not open image file: "s + std::string{stbi_failure_reason()}}; } - fs::path alphapath = basepath / fmt::format("{}.alpha.{}", frame["file_path"], path.extension()); if (alphapath.exists()) { int wa = 0, ha = 0; @@ -613,7 +612,7 @@ NerfDataset load_nerf(const std::vector<filesystem::path>& jsonpaths, float shar } dst.mask_color = 0x00FF00FF; // HOT PINK for (int i = 0; i < dst.res.prod(); ++i) { - if (mask_img[i*4] != 0) { + if (mask_img[i*4] != 0 || mask_img[i*4+1] != 0 || mask_img[i*4+2] != 0) { *(uint32_t*)&img[i*4] = dst.mask_color; } } diff --git a/src/render_buffer.cu b/src/render_buffer.cu index a9a6d47724c53b6094782619bc3c725f8eb9ecbf..12ab7538f0ba475ec37684d1fbc85eb944ac6123 100644 --- a/src/render_buffer.cu +++ b/src/render_buffer.cu @@ -741,12 +741,15 @@ void CudaRenderBuffer::overlay_false_color(Vector2i training_resolution, bool to ); } -void CudaRenderBuffer::enable_dlss(const Eigen::Vector2i& out_res) { +void CudaRenderBuffer::enable_dlss(const Eigen::Vector2i& max_out_res) { #ifdef NGP_VULKAN - if (!m_dlss || m_dlss->out_resolution() != out_res) { - m_dlss = dlss_init(out_res); + if (!m_dlss || m_dlss->max_out_resolution() != max_out_res) { + m_dlss = dlss_init(max_out_res); + } + + if (m_dlss) { + resize(m_dlss->clamp_resolution(in_resolution())); } - resize(in_resolution()); #else throw std::runtime_error{"NGP was compiled without Vulkan/NGX/DLSS support."}; #endif diff --git a/src/testbed.cu b/src/testbed.cu index 886292c6bdc7d025053710f33c97ba2a704f01a9..b7bff31029fd531cbbc1b2f5eb3bc4f68a6d8d6e 100644 --- a/src/testbed.cu +++ b/src/testbed.cu @@ -53,8 +53,7 @@ # include <GL/glew.h> # endif # include <GLFW/glfw3.h> - - +# include <cuda_gl_interop.h> #endif // Windows.h is evil @@ -209,7 +208,7 @@ void Testbed::set_visualized_dim(int dim) { } void Testbed::translate_camera(const Vector3f& rel) { - m_camera.col(3) += m_camera.block<3,3>(0,0) * rel * m_bounding_radius; + m_camera.col(3) += m_camera.block<3, 3>(0, 0) * rel * m_bounding_radius; reset_accumulation(true); } @@ -524,7 +523,7 @@ void Testbed::imgui() { } ImGui::SameLine(); ImGui::PushItemWidth(400.f); - ImGui::InputText("File", opt_extr_filename_buf, sizeof(opt_extr_filename_buf)); + ImGui::InputText("File##Extrinsics file path", opt_extr_filename_buf, sizeof(opt_extr_filename_buf)); ImGui::PopItemWidth(); ImGui::SameLine(); ImGui::Checkbox("Quaternion format", &export_extrinsics_in_quat_format); @@ -1019,7 +1018,7 @@ void Testbed::imgui() { } ImGui::SameLine(); ImGui::Checkbox("w/ Optimizer State", &m_include_optimizer_state_in_snapshot); - ImGui::InputText("File", snapshot_filename_buf, sizeof(snapshot_filename_buf)); + ImGui::InputText("File##Snapshot file path", snapshot_filename_buf, sizeof(snapshot_filename_buf)); } if (m_testbed_mode == ETestbedMode::Nerf || m_testbed_mode == ETestbedMode::Sdf) { @@ -1451,7 +1450,7 @@ void Testbed::mouse_drag(const Vector2f& rel, int button) { m_image.pos += rel; if (m_fps_camera) { - m_camera.block<3,3>(0,0) = rot * m_camera.block<3,3>(0,0); + m_camera.block<3, 3>(0, 0) = rot * m_camera.block<3, 3>(0, 0); } else { // Turntable auto old_look_at = look_at(); @@ -1693,40 +1692,51 @@ void Testbed::train_and_render(bool skip_rendering) { auto& render_buffer = m_render_surfaces.front(); - if (m_dlss) { - render_buffer.enable_dlss(m_window_res); - m_aperture_size = 0.0f; - } else { - render_buffer.disable_dlss(); - } + { + // Don't count the time being spent allocating buffers and resetting DLSS as part of the frame time. + // Otherwise the dynamic resolution calculations for following frames will be thrown out of whack + // and may even start oscillating. + auto skip_start = std::chrono::steady_clock::now(); + ScopeGuard skip_timing_guard{[&]() { + start += std::chrono::steady_clock::now() - skip_start; + }}; + if (m_dlss) { + render_buffer.enable_dlss(m_window_res); + m_aperture_size = 0.0f; + } else { + render_buffer.disable_dlss(); + } - auto render_res = render_buffer.in_resolution(); - if (render_res.isZero() || (m_train && m_training_step == 0)) { - render_res = m_window_res/16; - } else { - render_res = render_res.cwiseMin(m_window_res); - } + auto render_res = render_buffer.in_resolution(); + if (render_res.isZero() || (m_train && m_training_step == 0)) { + render_res = m_window_res/16; + } else { + render_res = render_res.cwiseMin(m_window_res); + } - float render_time_per_fullres_frame = m_render_ms.val() / (float)render_res.x() / (float)render_res.y() * (float)m_window_res.x() * (float)m_window_res.y(); + float render_time_per_fullres_frame = m_render_ms.val() / (float)render_res.x() / (float)render_res.y() * (float)m_window_res.x() * (float)m_window_res.y(); - // Make sure we don't starve training with slow rendering - float factor = std::sqrt(1000.0f / m_dynamic_res_target_fps / render_time_per_fullres_frame); - if (!m_dynamic_res) { - factor = 8.f/(float)m_fixed_res_factor; - } + // Make sure we don't starve training with slow rendering + float factor = std::sqrt(1000.0f / m_dynamic_res_target_fps / render_time_per_fullres_frame); + if (!m_dynamic_res) { + factor = 8.f/(float)m_fixed_res_factor; + } - factor = tcnn::clamp(factor, 1.0f/16.0f, 1.0f); + factor = tcnn::clamp(factor, 1.0f/16.0f, 1.0f); - if (factor > m_last_render_res_factor * 1.2f || factor < m_last_render_res_factor * 0.8f || factor == 1.0f || !m_dynamic_res) { - render_res = (m_window_res.cast<float>() * factor).cast<int>().cwiseMin(m_window_res).cwiseMax(m_window_res/16); - m_last_render_res_factor = factor; - } + if (factor > m_last_render_res_factor * 1.2f || factor < m_last_render_res_factor * 0.8f || factor == 1.0f || !m_dynamic_res) { + render_res = (m_window_res.cast<float>() * factor).cast<int>().cwiseMin(m_window_res).cwiseMax(m_window_res/16); + m_last_render_res_factor = factor; + } + + if (render_buffer.dlss()) { + render_res = render_buffer.dlss()->clamp_resolution(render_res); + render_buffer.dlss()->update_feature(render_res, render_buffer.dlss()->is_hdr(), render_buffer.dlss()->sharpen()); + } - if (render_buffer.dlss()) { - render_res = render_buffer.dlss()->clamp_resolution(render_res); + render_buffer.resize(render_res); } - render_buffer.resize(render_res); render_frame(m_smoothed_camera, m_smoothed_camera, Eigen::Vector4f::Zero(), render_buffer); #ifdef NGP_GUI @@ -1880,6 +1890,9 @@ void Testbed::init_window(int resw, int resh, bool hidden, bool second_window) { try { vulkan_and_ngx_init(); m_dlss_supported = true; + if (m_testbed_mode == ETestbedMode::Nerf) { + m_dlss = true; + } } catch (const std::runtime_error& e) { tlog::warning() << "Could not initialize Vulkan and NGX. DLSS not supported. (" << e.what() << ")"; } @@ -2082,7 +2095,6 @@ bool Testbed::frame() { #ifdef NGP_GUI if (m_render_window) { if (m_gui_redraw) { - // Gather histogram statistics of the encoding in use if (m_gather_histograms) { gather_histograms(); } @@ -2469,6 +2481,35 @@ void Testbed::reset_network(bool clear_density_grid) { Testbed::Testbed(ETestbedMode mode) : m_testbed_mode(mode) { + if (!(__CUDACC_VER_MAJOR__ > 10 || (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 2))) { + throw std::runtime_error{"Testbed required CUDA 10.2 or later."}; + } + +#ifdef NGP_GUI + // Ensure we're running on the GPU that'll host our GUI. To do so, try creating a dummy + // OpenGL context, figure out the GPU it's running on, and then kill that context again. + if (glfwInit()) { + glfwWindowHint(GLFW_VISIBLE, GLFW_FALSE); + GLFWwindow* offscreen_context = glfwCreateWindow(640, 480, "", NULL, NULL); + + if (offscreen_context) { + glfwMakeContextCurrent(offscreen_context); + + int gl_device = -1; + unsigned int device_count = 0; + if (cudaGLGetDevices(&device_count, &gl_device, 1, cudaGLDeviceListAll) == cudaSuccess) { + if (device_count > 0 && gl_device != -1) { + set_cuda_device(gl_device); + } + } + + glfwDestroyWindow(offscreen_context); + } + + glfwTerminate(); + } +#endif + uint32_t compute_capability = cuda_compute_capability(); if (compute_capability < MIN_GPU_ARCH) { tlog::warning() << "Insufficient compute capability " << compute_capability << " detected."; @@ -2505,10 +2546,6 @@ Testbed::Testbed(ETestbedMode mode) reset_camera(); - if (!(__CUDACC_VER_MAJOR__ > 10 || (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 2))) { - throw std::runtime_error{"Testbed required CUDA 10.2 or later."}; - } - set_exposure(0); set_min_level(0.f); set_max_level(1.f); @@ -2711,7 +2748,7 @@ void Testbed::render_frame(const Matrix<float, 3, 4>& camera_matrix0, const Matr } } distance_fun_t distance_fun = - m_render_ground_truth ? (distance_fun_t)[&](uint32_t n_elements, const GPUMemory<Vector3f>& positions, GPUMemory<float>& distances, cudaStream_t stream) { + m_render_ground_truth ? (distance_fun_t)[&](uint32_t n_elements, const Vector3f* positions, float* distances, cudaStream_t stream) { if (n_elements == 0) { return; } @@ -2731,35 +2768,35 @@ void Testbed::render_frame(const Matrix<float, 3, 4>& camera_matrix0, const Matr m_sdf.triangle_bvh->signed_distance_gpu( n_elements, m_sdf.mesh_sdf_mode, - (Vector3f*)positions.data(), - distances.data(), + (Vector3f*)positions, + distances, m_sdf.triangles_gpu.data(), false, m_stream.get() ); } - } : (distance_fun_t)[&](uint32_t n_elements, const GPUMemory<Vector3f>& positions, GPUMemory<float>& distances, cudaStream_t stream) { + } : (distance_fun_t)[&](uint32_t n_elements, const Vector3f* positions, float* distances, cudaStream_t stream) { if (n_elements == 0) { return; } n_elements = next_multiple(n_elements, tcnn::batch_size_granularity); - GPUMatrix<float> positions_matrix((float*)positions.data(), 3, n_elements); - GPUMatrix<float, RM> distances_matrix(distances.data(), 1, n_elements); + GPUMatrix<float> positions_matrix((float*)positions, 3, n_elements); + GPUMatrix<float, RM> distances_matrix(distances, 1, n_elements); m_network->inference(stream, positions_matrix, distances_matrix); }; normals_fun_t normals_fun = - m_render_ground_truth ? (normals_fun_t)[&](uint32_t n_elements, const GPUMemory<Vector3f>& positions, GPUMemory<Vector3f>& normals, cudaStream_t stream) { + m_render_ground_truth ? (normals_fun_t)[&](uint32_t n_elements, const Vector3f* positions, Vector3f* normals, cudaStream_t stream) { // NO-OP. Normals will automatically be populated by raytrace - } : (normals_fun_t)[&](uint32_t n_elements, const GPUMemory<Vector3f>& positions, GPUMemory<Vector3f>& normals, cudaStream_t stream) { + } : (normals_fun_t)[&](uint32_t n_elements, const Vector3f* positions, Vector3f* normals, cudaStream_t stream) { if (n_elements == 0) { return; } n_elements = next_multiple(n_elements, tcnn::batch_size_granularity); - GPUMatrix<float> positions_matrix((float*)positions.data(), 3, n_elements); - GPUMatrix<float> normals_matrix((float*)normals.data(), 3, n_elements); + GPUMatrix<float> positions_matrix((float*)positions, 3, n_elements); + GPUMatrix<float> normals_matrix((float*)normals, 3, n_elements); m_network->input_gradient(stream, 0, positions_matrix, normals_matrix); }; diff --git a/src/testbed_nerf.cu b/src/testbed_nerf.cu index 2f20fadeb14ebb9d5444e83c7b0fc4b5f09e3cc2..2a1d50d045028b92b609bdcb7d75588ec3139dfb 100644 --- a/src/testbed_nerf.cu +++ b/src/testbed_nerf.cu @@ -376,8 +376,6 @@ __global__ void mark_untrained_density_grid(const uint32_t n_elements, float* _ uint32_t y = tcnn::morton3D_invert(pos_idx>>1); uint32_t z = tcnn::morton3D_invert(pos_idx>>2); - - Vector3f pos = ((Vector3f{(float)x+0.5f, (float)y+0.5f, (float)z+0.5f}) / NERF_GRIDSIZE() - Vector3f::Constant(0.5f)) * scalbnf(1.0f, level) + Vector3f::Constant(0.5f); float voxel_radius = 0.5f*SQRT3()*scalbnf(1.0f, level) / NERF_GRIDSIZE(); int count=0; @@ -643,7 +641,12 @@ __global__ void advance_pos_nerf( } dt = calc_dt(t, cone_angle); - uint32_t mip = max(min_mip, mip_from_dt(dt, pos)); + + // Use the mip level from the position rather than dt. Unlike training, + // for rendering there's no need to use coarser mip levels when the step + // size is large (rather, it reduces performance, because the network may be queried) + // more frequently than necessary. + uint32_t mip = max(min_mip, mip_from_pos(pos)); if (!density_grid || density_grid_occupied_at(pos, density_grid, mip)) { break; @@ -736,7 +739,12 @@ __global__ void generate_next_nerf_network_inputs( } dt = calc_dt(t, cone_angle); - uint32_t mip = max(min_mip, mip_from_dt(dt, pos)); + + // Use the mip level from the position rather than dt. Unlike training, + // for rendering there's no need to use coarser mip levels when the step + // size is large (rather, it reduces performance, because the network may be queried) + // more frequently than necessary. + uint32_t mip = max(min_mip, mip_from_pos(pos)); if (!density_grid || density_grid_occupied_at(pos, density_grid, mip)) { break; @@ -2073,7 +2081,7 @@ uint32_t Testbed::NerfTracer::trace( return 0; } - CUDA_CHECK_THROW(cudaMemsetAsync(m_hit_counter.data(), 0, sizeof(uint32_t), stream)); + CUDA_CHECK_THROW(cudaMemsetAsync(m_hit_counter, 0, sizeof(uint32_t), stream)); uint32_t n_alive = m_n_rays_initialized; // m_n_rays_initialized = 0; @@ -2087,15 +2095,15 @@ uint32_t Testbed::NerfTracer::trace( // Compact rays that did not diverge yet { - CUDA_CHECK_THROW(cudaMemsetAsync(m_alive_counter.data(), 0, sizeof(uint32_t), stream)); + CUDA_CHECK_THROW(cudaMemsetAsync(m_alive_counter, 0, sizeof(uint32_t), stream)); linear_kernel(compact_kernel_nerf, 0, stream, n_alive, rays_tmp.rgba, rays_tmp.depth, rays_tmp.payload, rays_current.rgba, rays_current.depth, rays_current.payload, m_rays_hit.rgba, m_rays_hit.depth, m_rays_hit.payload, - m_alive_counter.data(), m_hit_counter.data() + m_alive_counter, m_hit_counter ); - CUDA_CHECK_THROW(cudaMemcpyAsync(&n_alive, m_alive_counter.data(), sizeof(uint32_t), cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK_THROW(cudaMemcpyAsync(&n_alive, m_alive_counter, sizeof(uint32_t), cudaMemcpyDeviceToHost, stream)); CUDA_CHECK_THROW(cudaStreamSynchronize(stream)); } @@ -2164,7 +2172,7 @@ uint32_t Testbed::NerfTracer::trace( } uint32_t n_hit; - CUDA_CHECK_THROW(cudaMemcpyAsync(&n_hit, m_hit_counter.data(), sizeof(uint32_t), cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK_THROW(cudaMemcpyAsync(&n_hit, m_hit_counter, sizeof(uint32_t), cudaMemcpyDeviceToHost, stream)); CUDA_CHECK_THROW(cudaStreamSynchronize(stream)); return n_hit; } @@ -2178,14 +2186,18 @@ void Testbed::NerfTracer::enlarge(size_t n_elements, uint32_t padded_output_widt Array4f, float, NerfPayload, // m_rays_hit network_precision_t, - float + float, + uint32_t, + uint32_t >( stream, &m_scratch_alloc, n_elements, n_elements, n_elements, n_elements, n_elements, n_elements, n_elements, n_elements, n_elements, n_elements * MAX_STEPS_INBETWEEN_COMPACTION * padded_output_width, - n_elements * MAX_STEPS_INBETWEEN_COMPACTION * num_floats + n_elements * MAX_STEPS_INBETWEEN_COMPACTION * num_floats, + 32, // 2 full cache lines to ensure no overlap + 32 // 2 full cache lines to ensure no overlap ); m_rays[0].set(std::get<0>(scratch), std::get<1>(scratch), std::get<2>(scratch), n_elements); @@ -2194,6 +2206,9 @@ void Testbed::NerfTracer::enlarge(size_t n_elements, uint32_t padded_output_widt m_network_output = std::get<9>(scratch); m_network_input = std::get<10>(scratch); + + m_hit_counter = std::get<11>(scratch); + m_alive_counter = std::get<12>(scratch); } void Testbed::Nerf::Training::reset_extra_dims(default_rng_t &rng) { @@ -2248,9 +2263,7 @@ void Testbed::render_nerf(CudaRenderBuffer& render_buffer, const Vector2i& max_r const float* extra_dims_gpu = get_inference_extra_dims(stream); - ScopeGuard tmp_memory_guard{[&]() { - m_nerf.tracer.clear(); - }}; + NerfTracer tracer; // Our motion vector code can't undo f-theta and grid distortions -- so don't render these if DLSS is enabled. bool render_opencv_lens = m_nerf.render_with_lens_distortion && (!render_buffer.dlss() || m_nerf.render_lens.mode == ELensMode::OpenCV); @@ -2259,7 +2272,7 @@ void Testbed::render_nerf(CudaRenderBuffer& render_buffer, const Vector2i& max_r Lens lens = render_opencv_lens ? m_nerf.render_lens : Lens{}; - m_nerf.tracer.init_rays_from_camera( + tracer.init_rays_from_camera( render_buffer.spp(), m_network->padded_output_width(), m_nerf_network->n_extra_dims(), @@ -2293,10 +2306,10 @@ void Testbed::render_nerf(CudaRenderBuffer& render_buffer, const Vector2i& max_r uint32_t n_hit; if (m_render_mode == ERenderMode::Slice) { - n_hit = m_nerf.tracer.n_rays_initialized(); + n_hit = tracer.n_rays_initialized(); } else { float depth_scale = 1.0f / m_nerf.training.dataset.scale; - n_hit = m_nerf.tracer.trace( + n_hit = tracer.trace( *m_nerf_network, m_render_aabb, m_render_aabb_to_local, @@ -2321,7 +2334,7 @@ void Testbed::render_nerf(CudaRenderBuffer& render_buffer, const Vector2i& max_r stream ); } - RaysNerfSoa& rays_hit = m_render_mode == ERenderMode::Slice ? m_nerf.tracer.rays_init() : m_nerf.tracer.rays_hit(); + RaysNerfSoa& rays_hit = m_render_mode == ERenderMode::Slice ? tracer.rays_init() : tracer.rays_hit(); if (m_render_mode == ERenderMode::Slice) { // Store colors in the normal buffer @@ -2329,23 +2342,21 @@ void Testbed::render_nerf(CudaRenderBuffer& render_buffer, const Vector2i& max_r const uint32_t floats_per_coord = sizeof(NerfCoordinate) / sizeof(float) + m_nerf_network->n_extra_dims(); const uint32_t extra_stride = m_nerf_network->n_extra_dims() * sizeof(float); // extra stride on top of base NerfCoordinate struct - m_nerf.vis_input.enlarge(n_elements * floats_per_coord); - m_nerf.vis_rgba.enlarge(n_elements); - linear_kernel(generate_nerf_network_inputs_at_current_position, 0, stream, n_hit, m_aabb, rays_hit.payload, PitchedPtr<NerfCoordinate>((NerfCoordinate*)m_nerf.vis_input.data(), 1, 0, extra_stride), extra_dims_gpu ); + GPUMatrix<float> positions_matrix{floats_per_coord, n_elements, stream}; + GPUMatrix<float> rgbsigma_matrix{4, n_elements, stream}; - GPUMatrix<float> positions_matrix((float*)m_nerf.vis_input.data(), floats_per_coord, n_elements); - GPUMatrix<float> rgbsigma_matrix((float*)m_nerf.vis_rgba.data(), 4, n_elements); + linear_kernel(generate_nerf_network_inputs_at_current_position, 0, stream, n_hit, m_aabb, rays_hit.payload, PitchedPtr<NerfCoordinate>((NerfCoordinate*)positions_matrix.data(), 1, 0, extra_stride), extra_dims_gpu ); if (m_visualized_dimension == -1) { m_network->inference(stream, positions_matrix, rgbsigma_matrix); - linear_kernel(compute_nerf_rgba, 0, stream, n_hit, m_nerf.vis_rgba.data(), m_nerf.rgb_activation, m_nerf.density_activation, 0.01f, false); + linear_kernel(compute_nerf_rgba, 0, stream, n_hit, (Array4f*)rgbsigma_matrix.data(), m_nerf.rgb_activation, m_nerf.density_activation, 0.01f, false); } else { m_network->visualize_activation(stream, m_visualized_layer, m_visualized_dimension, positions_matrix, rgbsigma_matrix); } linear_kernel(shade_kernel_nerf, 0, stream, n_hit, - m_nerf.vis_rgba.data(), + (Array4f*)rgbsigma_matrix.data(), nullptr, rays_hit.payload, m_render_mode, diff --git a/src/testbed_sdf.cu b/src/testbed_sdf.cu index 780dfb20200a034040582e08de01036d263b6153..5a0065e8874545fddc6783ff3b0aa6dc057182d8 100644 --- a/src/testbed_sdf.cu +++ b/src/testbed_sdf.cu @@ -550,12 +550,13 @@ __global__ void init_rays_with_payload_kernel_sdf( if (octree_nodes && !TriangleOctree::contains(octree_nodes, max_depth, ray.o)) { t = max(0.0f, TriangleOctree::ray_intersect(octree_nodes, max_depth, ray.o, ray.d)); - if (ray.o.y() > floor_y && ray.d.y()<0.f) { - float floor_dist = -(ray.o.y() - floor_y)/ray.d.y(); - if (floor_dist>0.f) { - t=min(t,floor_dist); + if (ray.o.y() > floor_y && ray.d.y() < 0.f) { + float floor_dist = -(ray.o.y() - floor_y) / ray.d.y(); + if (floor_dist > 0.f) { + t = min(t, floor_dist); } } + ray.o = ray.o + (t + 1e-6f) * ray.d; } @@ -566,15 +567,10 @@ __global__ void init_rays_with_payload_kernel_sdf( } SdfPayload& payload = payloads[idx]; - if (!aabb.contains(ray.o)) { - payload.alive = false; - return; - } - payload.dir = ray.d; payload.idx = idx; payload.n_steps = 0; - payload.alive = true; + payload.alive = aabb.contains(ray.o); } __host__ __device__ uint32_t sample_discrete(float uniform_sample, const float* __restrict__ cdf, int length) { @@ -591,7 +587,8 @@ __global__ void sample_uniform_on_triangle_kernel(uint32_t n_elements, const flo sampled_positions[i] = triangles[tri_idx].sample_uniform_position(sample.tail<2>()); } -void Testbed::SphereTracer::init_rays_from_camera(uint32_t sample_index, +void Testbed::SphereTracer::init_rays_from_camera( + uint32_t sample_index, const Vector2i& resolution, const Vector2f& focal_length, const Matrix<float, 3, 4>& camera_matrix, @@ -613,15 +610,15 @@ void Testbed::SphereTracer::init_rays_from_camera(uint32_t sample_index, ) { // Make sure we have enough memory reserved to render at the requested resolution size_t n_pixels = (size_t)resolution.x() * resolution.y(); - enlarge(n_pixels); + enlarge(n_pixels, stream); const dim3 threads = { 16, 8, 1 }; const dim3 blocks = { div_round_up((uint32_t)resolution.x(), threads.x), div_round_up((uint32_t)resolution.y(), threads.y), 1 }; init_rays_with_payload_kernel_sdf<<<blocks, threads, 0, stream>>>( sample_index, - m_rays[0].pos.data(), - m_rays[0].distance.data(), - m_rays[0].payload.data(), + m_rays[0].pos, + m_rays[0].distance, + m_rays[0].payload, resolution, focal_length, camera_matrix, @@ -644,7 +641,7 @@ void Testbed::SphereTracer::init_rays_from_camera(uint32_t sample_index, } void Testbed::SphereTracer::init_rays_from_data(uint32_t n_elements, const RaysSdfSoa& data, cudaStream_t stream) { - enlarge(n_elements); + enlarge(n_elements, stream); m_rays[0].copy_from_other_async(n_elements, data, stream); m_n_rays_initialized = n_elements; } @@ -658,11 +655,11 @@ uint32_t Testbed::SphereTracer::trace_bvh(TriangleBvh* bvh, const Triangle* tria } // Abuse the normal buffer to temporarily hold ray directions - parallel_for_gpu(stream, n_alive, [payloads=m_rays[0].payload.data(), normals=m_rays[0].normal.data()] __device__ (size_t i) { + parallel_for_gpu(stream, n_alive, [payloads=m_rays[0].payload, normals=m_rays[0].normal] __device__ (size_t i) { normals[i] = payloads[i].dir; }); - bvh->ray_trace_gpu(n_alive, m_rays[0].pos.data(), m_rays[0].normal.data(), triangles, stream); + bvh->ray_trace_gpu(n_alive, m_rays[0].pos, m_rays[0].normal, triangles, stream); return n_alive; } @@ -681,7 +678,7 @@ uint32_t Testbed::SphereTracer::trace( return 0; } - CUDA_CHECK_THROW(cudaMemsetAsync(m_hit_counter.data(), 0, sizeof(uint32_t), stream)); + CUDA_CHECK_THROW(cudaMemsetAsync(m_hit_counter, 0, sizeof(uint32_t), stream)); const uint32_t STEPS_INBETWEEN_COMPACTION = 4; @@ -700,29 +697,29 @@ uint32_t Testbed::SphereTracer::trace( // Compact rays that did not diverge yet { - CUDA_CHECK_THROW(cudaMemsetAsync(m_alive_counter.data(), 0, sizeof(uint32_t), stream)); + CUDA_CHECK_THROW(cudaMemsetAsync(m_alive_counter, 0, sizeof(uint32_t), stream)); if (m_trace_shadow_rays) { linear_kernel(compact_kernel_shadow_sdf, 0, stream, n_alive, zero_offset, - rays_tmp.pos.data(), rays_tmp.distance.data(), rays_tmp.payload.data(), rays_tmp.prev_distance.data(), rays_tmp.total_distance.data(), rays_tmp.min_visibility.data(), - rays_current.pos.data(), rays_current.distance.data(), rays_current.payload.data(), rays_current.prev_distance.data(), rays_current.total_distance.data(), rays_current.min_visibility.data(), - m_rays_hit.pos.data(), m_rays_hit.distance.data(), m_rays_hit.payload.data(), m_rays_hit.prev_distance.data(), m_rays_hit.total_distance.data(), m_rays_hit.min_visibility.data(), + rays_tmp.pos, rays_tmp.distance, rays_tmp.payload, rays_tmp.prev_distance, rays_tmp.total_distance, rays_tmp.min_visibility, + rays_current.pos, rays_current.distance, rays_current.payload, rays_current.prev_distance, rays_current.total_distance, rays_current.min_visibility, + m_rays_hit.pos, m_rays_hit.distance, m_rays_hit.payload, m_rays_hit.prev_distance, m_rays_hit.total_distance, m_rays_hit.min_visibility, aabb, - m_alive_counter.data(), m_hit_counter.data() + m_alive_counter, m_hit_counter ); } else { linear_kernel(compact_kernel_sdf, 0, stream, n_alive, zero_offset, - rays_tmp.pos.data(), rays_tmp.distance.data(), rays_tmp.payload.data(), - rays_current.pos.data(), rays_current.distance.data(), rays_current.payload.data(), - m_rays_hit.pos.data(), m_rays_hit.distance.data(), m_rays_hit.payload.data(), + rays_tmp.pos, rays_tmp.distance, rays_tmp.payload, + rays_current.pos, rays_current.distance, rays_current.payload, + m_rays_hit.pos, m_rays_hit.distance, m_rays_hit.payload, aabb, - m_alive_counter.data(), m_hit_counter.data() + m_alive_counter, m_hit_counter ); } - CUDA_CHECK_THROW(cudaMemcpyAsync(&n_alive, m_alive_counter.data(), sizeof(uint32_t), cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK_THROW(cudaMemcpyAsync(&n_alive, m_alive_counter, sizeof(uint32_t), cudaMemcpyDeviceToHost, stream)); CUDA_CHECK_THROW(cudaStreamSynchronize(stream)); } @@ -735,9 +732,9 @@ uint32_t Testbed::SphereTracer::trace( linear_kernel(advance_pos_kernel_sdf, 0, stream, n_alive, zero_offset, - rays_current.pos.data(), - rays_current.distance.data(), - rays_current.payload.data(), + rays_current.pos, + rays_current.distance, + rays_current.payload, aabb, floor_y, octree ? octree->nodes_gpu() : nullptr, @@ -745,9 +742,9 @@ uint32_t Testbed::SphereTracer::trace( distance_scale, maximum_distance, m_shadow_sharpness, - m_trace_shadow_rays ? rays_current.prev_distance.data() : nullptr, - m_trace_shadow_rays ? rays_current.total_distance.data() : nullptr, - m_trace_shadow_rays ? rays_current.min_visibility.data() : nullptr + m_trace_shadow_rays ? rays_current.prev_distance : nullptr, + m_trace_shadow_rays ? rays_current.total_distance : nullptr, + m_trace_shadow_rays ? rays_current.min_visibility : nullptr ); } @@ -755,36 +752,67 @@ uint32_t Testbed::SphereTracer::trace( } uint32_t n_hit; - CUDA_CHECK_THROW(cudaMemcpyAsync(&n_hit, m_hit_counter.data(), sizeof(uint32_t), cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK_THROW(cudaMemcpyAsync(&n_hit, m_hit_counter, sizeof(uint32_t), cudaMemcpyDeviceToHost, stream)); CUDA_CHECK_THROW(cudaStreamSynchronize(stream)); return n_hit; } -void Testbed::SphereTracer::enlarge(size_t n_elements) { +void Testbed::SphereTracer::enlarge(size_t n_elements, cudaStream_t stream) { n_elements = next_multiple(n_elements, size_t(tcnn::batch_size_granularity)); - m_rays[0].enlarge(n_elements); - m_rays[1].enlarge(n_elements); - m_rays_hit.enlarge(n_elements); + auto scratch = allocate_workspace_and_distribute< + Vector3f, Vector3f, float, float, float, float, SdfPayload, // m_rays[0] + Vector3f, Vector3f, float, float, float, float, SdfPayload, // m_rays[1] + Vector3f, Vector3f, float, float, float, float, SdfPayload, // m_rays_hit + + uint32_t, + uint32_t + >( + stream, &m_scratch_alloc, + n_elements, n_elements, n_elements, n_elements, n_elements, n_elements, n_elements, + n_elements, n_elements, n_elements, n_elements, n_elements, n_elements, n_elements, + n_elements, n_elements, n_elements, n_elements, n_elements, n_elements, n_elements, + 32, // 2 full cache lines to ensure no overlap + 32 // 2 full cache lines to ensure no overlap + ); + + m_rays[0].set(std::get<0>(scratch), std::get<1>(scratch), std::get<2>(scratch), std::get<3>(scratch), std::get<4>(scratch), std::get<5>(scratch), std::get<6>(scratch)); + m_rays[1].set(std::get<7>(scratch), std::get<8>(scratch), std::get<9>(scratch), std::get<10>(scratch), std::get<11>(scratch), std::get<12>(scratch), std::get<13>(scratch)); + m_rays_hit.set(std::get<14>(scratch), std::get<15>(scratch), std::get<16>(scratch), std::get<17>(scratch), std::get<18>(scratch), std::get<19>(scratch), std::get<20>(scratch)); + + m_hit_counter = std::get<21>(scratch); + m_alive_counter = std::get<22>(scratch); } -void Testbed::FiniteDifferenceNormalsApproximator::enlarge(uint32_t n_elements) { - dx.enlarge(n_elements); - dy.enlarge(n_elements); - dz.enlarge(n_elements); +void Testbed::FiniteDifferenceNormalsApproximator::enlarge(uint32_t n_elements, cudaStream_t stream) { + n_elements = next_multiple(n_elements, tcnn::batch_size_granularity); + auto scratch = allocate_workspace_and_distribute< + Vector3f, Vector3f, Vector3f, + float, float, float, + float, float, float + >( + stream, &m_scratch_alloc, + n_elements, n_elements, n_elements, + n_elements, n_elements, n_elements, + n_elements, n_elements, n_elements + ); + + dx = std::get<0>(scratch); + dy = std::get<1>(scratch); + dz = std::get<2>(scratch); - dist_dx_pos.enlarge(n_elements); - dist_dy_pos.enlarge(n_elements); - dist_dz_pos.enlarge(n_elements); + dist_dx_pos = std::get<3>(scratch); + dist_dy_pos = std::get<4>(scratch); + dist_dz_pos = std::get<5>(scratch); - dist_dx_neg.enlarge(n_elements); - dist_dy_neg.enlarge(n_elements); - dist_dz_neg.enlarge(n_elements); + dist_dx_neg = std::get<6>(scratch); + dist_dy_neg = std::get<7>(scratch); + dist_dz_neg = std::get<8>(scratch); } -void Testbed::FiniteDifferenceNormalsApproximator::normal(uint32_t n_elements, const distance_fun_t& distance_function, GPUMemory<Vector3f>& pos, GPUMemory<Vector3f>& normal, float epsilon, cudaStream_t stream) { - enlarge(n_elements); +void Testbed::FiniteDifferenceNormalsApproximator::normal(uint32_t n_elements, const distance_fun_t& distance_function, const Vector3f* pos, Vector3f* normal, float epsilon, cudaStream_t stream) { + enlarge(n_elements, stream); - parallel_for_gpu(stream, n_elements, [pos=pos.data(), dx=dx.data(), dy=dy.data(), dz=dz.data(), epsilon] __device__ (size_t i) { + parallel_for_gpu(stream, n_elements, [pos=pos, dx=dx, dy=dy, dz=dz, epsilon] __device__ (size_t i) { Vector3f p = pos[i]; dx[i] = Vector3f{p.x() + epsilon, p.y(), p.z()}; dy[i] = Vector3f{p.x(), p.y() + epsilon, p.z()}; @@ -795,7 +823,7 @@ void Testbed::FiniteDifferenceNormalsApproximator::normal(uint32_t n_elements, c distance_function(n_elements, dy, dist_dy_pos, stream); distance_function(n_elements, dz, dist_dz_pos, stream); - parallel_for_gpu(stream, n_elements, [pos=pos.data(), dx=dx.data(), dy=dy.data(), dz=dz.data(), epsilon] __device__ (size_t i) { + parallel_for_gpu(stream, n_elements, [pos=pos, dx=dx, dy=dy, dz=dz, epsilon] __device__ (size_t i) { Vector3f p = pos[i]; dx[i] = Vector3f{p.x() - epsilon, p.y(), p.z()}; dy[i] = Vector3f{p.x(), p.y() - epsilon, p.z()}; @@ -806,7 +834,7 @@ void Testbed::FiniteDifferenceNormalsApproximator::normal(uint32_t n_elements, c distance_function(n_elements, dy, dist_dy_neg, stream); distance_function(n_elements, dz, dist_dz_neg, stream); - parallel_for_gpu(stream, n_elements, [normal=normal.data(), dist_dx_pos=dist_dx_pos.data(), dist_dx_neg=dist_dx_neg.data(), dist_dy_pos=dist_dy_pos.data(), dist_dy_neg=dist_dy_neg.data(), dist_dz_pos=dist_dz_pos.data(), dist_dz_neg=dist_dz_neg.data()] __device__ (size_t i) { + parallel_for_gpu(stream, n_elements, [normal=normal, dist_dx_pos=dist_dx_pos, dist_dx_neg=dist_dx_neg, dist_dy_pos=dist_dy_pos, dist_dy_neg=dist_dy_neg, dist_dz_pos=dist_dz_pos, dist_dz_neg=dist_dz_neg] __device__ (size_t i) { normal[i] = {dist_dx_pos[i] - dist_dx_neg[i], dist_dy_pos[i] - dist_dy_neg[i], dist_dz_pos[i] - dist_dz_neg[i]}; }); } @@ -827,8 +855,7 @@ void Testbed::render_sdf( } auto* octree_ptr = m_sdf.uses_takikawa_encoding || m_sdf.use_triangle_octree ? m_sdf.triangle_octree.get() : nullptr; - // Reserve the memory for max-res rendering to prevent stuttering - m_sdf.tracer.enlarge(max_res.x() * max_res.y()); + SphereTracer tracer; uint32_t n_octree_levels = octree_ptr ? octree_ptr->depth() : 0; if (m_render_ground_truth && m_sdf.groundtruth_mode == ESDFGroundTruthMode::SDFBricks) { @@ -837,7 +864,7 @@ void Testbed::render_sdf( BoundingBox sdf_bounding_box = m_aabb; sdf_bounding_box.inflate(m_sdf.zero_offset); - m_sdf.tracer.init_rays_from_camera( + tracer.init_rays_from_camera( render_buffer.spp(), render_buffer.in_resolution(), focal_length, @@ -881,22 +908,22 @@ void Testbed::render_sdf( uint32_t n_hit; if (m_render_mode == ERenderMode::Slice) { - n_hit = m_sdf.tracer.n_rays_initialized(); + n_hit = tracer.n_rays_initialized(); } else { - n_hit = trace(m_sdf.tracer); + n_hit = trace(tracer); } - RaysSdfSoa& rays_hit = m_render_mode == ERenderMode::Slice || gt_raytrace ? m_sdf.tracer.rays_init() : m_sdf.tracer.rays_hit(); + RaysSdfSoa& rays_hit = m_render_mode == ERenderMode::Slice || gt_raytrace ? tracer.rays_init() : tracer.rays_hit(); if (m_render_mode == ERenderMode::Slice) { if (m_visualized_dimension == -1) { distance_function(n_hit, rays_hit.pos, rays_hit.distance, stream); - extract_dimension_pos_neg_kernel<float><<<n_blocks_linear(n_hit*3), n_threads_linear, 0, stream>>>(n_hit*3, 0, 1, 3, rays_hit.distance.data(), CM, (float*)rays_hit.normal.data()); + extract_dimension_pos_neg_kernel<float><<<n_blocks_linear(n_hit*3), n_threads_linear, 0, stream>>>(n_hit*3, 0, 1, 3, rays_hit.distance, CM, (float*)rays_hit.normal); } else { // Store colors in the normal buffer uint32_t n_elements = next_multiple(n_hit, tcnn::batch_size_granularity); - GPUMatrix<float> positions_matrix((float*)rays_hit.pos.data(), 3, n_elements); - GPUMatrix<float> colors_matrix((float*)rays_hit.normal.data(), 3, n_elements); + GPUMatrix<float> positions_matrix((float*)rays_hit.pos, 3, n_elements); + GPUMatrix<float> colors_matrix((float*)rays_hit.normal, 3, n_elements); m_network->visualize_activation(stream, m_visualized_layer, m_visualized_dimension, positions_matrix, colors_matrix); } } @@ -906,45 +933,47 @@ void Testbed::render_sdf( if (m_sdf.analytic_normals || gt_raytrace) { normals_function(n_hit, rays_hit.pos, rays_hit.normal, stream); } else { - // Prevent spurious enlargements by reserving enough memory to hold a full-res image in any case. - m_sdf.fd_normals.enlarge(render_buffer.in_resolution().x() * render_buffer.in_resolution().y()); float fd_normals_epsilon = m_sdf.fd_normals_epsilon; if (m_render_ground_truth && m_sdf.groundtruth_mode == ESDFGroundTruthMode::SDFBricks && m_sdf.brick_smooth_normals) { fd_normals_epsilon = exp2f(-float(n_octree_levels)) * (1.f/(m_sdf.brick_res-1)); // in sdf brick mode, use one voxel as the normal central difference radius } - m_sdf.fd_normals.normal(n_hit, distance_function, rays_hit.pos, rays_hit.normal, fd_normals_epsilon, stream); + + FiniteDifferenceNormalsApproximator fd_normals; + fd_normals.normal(n_hit, distance_function, rays_hit.pos, rays_hit.normal, fd_normals_epsilon, stream); } if (render_mode == ERenderMode::Shade && n_hit > 0) { // Shadow rays towards the sun - m_sdf.shadow_tracer.init_rays_from_data(n_hit, rays_hit, stream); - m_sdf.shadow_tracer.set_trace_shadow_rays(true); - m_sdf.shadow_tracer.set_shadow_sharpness(m_sdf.shadow_sharpness); - RaysSdfSoa& shadow_rays_init = m_sdf.shadow_tracer.rays_init(); + SphereTracer shadow_tracer; + + shadow_tracer.init_rays_from_data(n_hit, rays_hit, stream); + shadow_tracer.set_trace_shadow_rays(true); + shadow_tracer.set_shadow_sharpness(m_sdf.shadow_sharpness); + RaysSdfSoa& shadow_rays_init = shadow_tracer.rays_init(); linear_kernel(prepare_shadow_rays, 0, stream, n_hit, m_sun_dir.normalized(), - shadow_rays_init.pos.data(), - shadow_rays_init.normal.data(), - shadow_rays_init.distance.data(), - shadow_rays_init.prev_distance.data(), - shadow_rays_init.total_distance.data(), - shadow_rays_init.min_visibility.data(), - shadow_rays_init.payload.data(), + shadow_rays_init.pos, + shadow_rays_init.normal, + shadow_rays_init.distance, + shadow_rays_init.prev_distance, + shadow_rays_init.total_distance, + shadow_rays_init.min_visibility, + shadow_rays_init.payload, sdf_bounding_box, octree_ptr ? octree_ptr->nodes_gpu() : nullptr, n_octree_levels ); - uint32_t n_hit_shadow = trace(m_sdf.shadow_tracer); - auto& shadow_rays_hit = gt_raytrace ? m_sdf.shadow_tracer.rays_init() : m_sdf.shadow_tracer.rays_hit(); + uint32_t n_hit_shadow = trace(shadow_tracer); + auto& shadow_rays_hit = gt_raytrace ? shadow_tracer.rays_init() : shadow_tracer.rays_hit(); linear_kernel(write_shadow_ray_result, 0, stream, n_hit_shadow, sdf_bounding_box, - shadow_rays_hit.pos.data(), - shadow_rays_hit.payload.data(), - shadow_rays_hit.min_visibility.data(), - rays_hit.distance.data() + shadow_rays_hit.pos, + shadow_rays_hit.payload, + shadow_rays_hit.min_visibility, + rays_hit.distance ); // todo: Reflection rays? @@ -953,8 +982,8 @@ void Testbed::render_sdf( // HACK: Store colors temporarily in the normal buffer uint32_t n_elements = next_multiple(n_hit, tcnn::batch_size_granularity); - GPUMatrix<float> positions_matrix((float*)rays_hit.pos.data(), 3, n_elements); - GPUMatrix<float> colors_matrix((float*)rays_hit.normal.data(), 3, n_elements); + GPUMatrix<float> positions_matrix((float*)rays_hit.pos, 3, n_elements); + GPUMatrix<float> colors_matrix((float*)rays_hit.normal, 3, n_elements); m_network->visualize_activation(stream, m_visualized_layer, m_visualized_dimension, positions_matrix, colors_matrix); } @@ -967,17 +996,18 @@ void Testbed::render_sdf( m_sun_dir.normalized(), m_up_dir.normalized(), camera_matrix, - rays_hit.pos.data(), - rays_hit.normal.data(), - rays_hit.distance.data(), - rays_hit.payload.data(), + rays_hit.pos, + rays_hit.normal, + rays_hit.distance, + rays_hit.payload, render_buffer.frame_buffer(), render_buffer.depth_buffer() ); if (render_mode == ERenderMode::Cost) { std::vector<SdfPayload> payloads_final_cpu(n_hit); - rays_hit.payload.copy_to_host(payloads_final_cpu, n_hit); + CUDA_CHECK_THROW(cudaMemcpyAsync(payloads_final_cpu.data(), rays_hit.payload, n_hit * sizeof(SdfPayload), cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK_THROW(cudaStreamSynchronize(stream)); size_t total_n_steps = 0; for (uint32_t i = 0; i < n_hit; ++i) { total_n_steps += payloads_final_cpu[i].n_steps; diff --git a/src/thread_pool.cpp b/src/thread_pool.cpp index 17551b101774bf580606de4a8e0de697b6cf6637..0289db40e1d14c2144cda61f3dcfa227ba1e0158 100644 --- a/src/thread_pool.cpp +++ b/src/thread_pool.cpp @@ -28,7 +28,6 @@ ThreadPool::ThreadPool(size_t maxNumThreads, bool force) { maxNumThreads = min((size_t)thread::hardware_concurrency(), maxNumThreads); } startThreads(maxNumThreads); - mNumTasksInSystem.store(0); } ThreadPool::~ThreadPool() { @@ -59,16 +58,6 @@ void ThreadPool::startThreads(size_t num) { lock.unlock(); task(); - - mNumTasksInSystem--; - - { - unique_lock<mutex> localLock{mSystemBusyMutex}; - - if (mNumTasksInSystem == 0) { - mSystemBusyCondition.notify_all(); - } - } } }); } @@ -90,30 +79,16 @@ void ThreadPool::shutdownThreads(size_t num) { } } -void ThreadPool::waitUntilFinished() { - unique_lock<mutex> lock{mSystemBusyMutex}; - - if (mNumTasksInSystem == 0) { - return; - } - - mSystemBusyCondition.wait(lock); -} - -void ThreadPool::waitUntilFinishedFor(const chrono::microseconds Duration) { - unique_lock<mutex> lock{mSystemBusyMutex}; - - if (mNumTasksInSystem == 0) { - return; +void ThreadPool::setNThreads(size_t num) { + if (mNumThreads > num) { + shutdownThreads(mNumThreads - num); + } else if (mNumThreads < num) { + startThreads(num - mNumThreads); } - - mSystemBusyCondition.wait_for(lock, Duration); } void ThreadPool::flushQueue() { lock_guard<mutex> lock{mTaskQueueMutex}; - - mNumTasksInSystem -= mTaskQueue.size(); mTaskQueue.clear(); }