diff --git a/include/neural-graphics-primitives/dlss.h b/include/neural-graphics-primitives/dlss.h
index 9f6dbdc0acdf3db3cca93c76a79ea3a1ead9c601..dbe86fccca1cd0d911535b340e8c2a05bd406a5b 100644
--- a/include/neural-graphics-primitives/dlss.h
+++ b/include/neural-graphics-primitives/dlss.h
@@ -26,6 +26,11 @@ class IDlss {
 public:
 	virtual ~IDlss() {}
 
+	virtual void update_feature(
+		const Eigen::Vector2i& in_resolution,
+		bool is_hdr,
+		bool sharpen
+	) = 0;
 	virtual void run(
 		const Eigen::Vector2i& in_resolution,
 		bool is_hdr,
@@ -42,8 +47,10 @@ public:
 
 	virtual Eigen::Vector2i clamp_resolution(const Eigen::Vector2i& resolution) const = 0;
 	virtual Eigen::Vector2i out_resolution() const = 0;
+	virtual Eigen::Vector2i max_out_resolution() const = 0;
 
 	virtual bool is_hdr() const = 0;
+	virtual bool sharpen() const = 0;
 	virtual EDlssQuality quality() const = 0;
 };
 
diff --git a/include/neural-graphics-primitives/render_buffer.h b/include/neural-graphics-primitives/render_buffer.h
index 99756c6c559084a27d890d035ae323474e03ea28..73466ff3008d78863dea9a075240b5907d2d8552 100644
--- a/include/neural-graphics-primitives/render_buffer.h
+++ b/include/neural-graphics-primitives/render_buffer.h
@@ -95,11 +95,11 @@ public:
 
 	GLuint texture();
 
-	cudaSurfaceObject_t surface() override ;
+	cudaSurfaceObject_t surface() override;
 
-	cudaArray_t array() override ;
+	cudaArray_t array() override;
 
-	void blit_from_cuda_mapping() ;
+	void blit_from_cuda_mapping();
 
 	const std::string& texture_name() const { return m_texture_name; }
 
@@ -162,7 +162,9 @@ public:
 	CudaRenderBuffer(const std::shared_ptr<SurfaceProvider>& surf) : m_surface_provider{surf} {}
 
 	CudaRenderBuffer(const CudaRenderBuffer& other) = delete;
+	CudaRenderBuffer& operator=(const CudaRenderBuffer& other) = delete;
 	CudaRenderBuffer(CudaRenderBuffer&& other) = default;
+	CudaRenderBuffer& operator=(CudaRenderBuffer&& other) = default;
 
 	cudaSurfaceObject_t surface() {
 		return m_surface_provider->surface();
@@ -186,6 +188,10 @@ public:
 		return m_spp;
 	}
 
+	void set_spp(uint32_t value) {
+		m_spp = value;
+	}
+
 	Eigen::Array4f* frame_buffer() const {
 		return m_frame_buffer.data();
 	}
@@ -249,7 +255,7 @@ public:
 		}
 	}
 
-	void enable_dlss(const Eigen::Vector2i& out_res);
+	void enable_dlss(const Eigen::Vector2i& max_out_res);
 	void disable_dlss();
 	void set_dlss_sharpening(float value) {
 		m_dlss_sharpening = value;
diff --git a/include/neural-graphics-primitives/sdf.h b/include/neural-graphics-primitives/sdf.h
index ddef63cab37c9824bfeaa5cb5bdf0a58e8e4c6e0..07a99bbd90e17dbac43d9875a9e6dc54c96ff489 100644
--- a/include/neural-graphics-primitives/sdf.h
+++ b/include/neural-graphics-primitives/sdf.h
@@ -29,34 +29,34 @@ struct SdfPayload {
 
 struct RaysSdfSoa {
 #if defined(__NVCC__) || (defined(__clang__) && defined(__CUDA__))
-	void enlarge(size_t n_elements) {
-		pos.enlarge(n_elements);
-		normal.enlarge(n_elements);
-		distance.enlarge(n_elements);
-		prev_distance.enlarge(n_elements);
-		total_distance.enlarge(n_elements);
-		min_visibility.enlarge(n_elements);
-		payload.enlarge(n_elements);
-	}
-
 	void copy_from_other_async(uint32_t n_elements, const RaysSdfSoa& other, cudaStream_t stream) {
-		CUDA_CHECK_THROW(cudaMemcpyAsync(pos.data(), other.pos.data(), n_elements * sizeof(Eigen::Vector3f), cudaMemcpyDeviceToDevice, stream));
-		CUDA_CHECK_THROW(cudaMemcpyAsync(normal.data(), other.normal.data(), n_elements * sizeof(Eigen::Vector3f), cudaMemcpyDeviceToDevice, stream));
-		CUDA_CHECK_THROW(cudaMemcpyAsync(distance.data(), other.distance.data(), n_elements * sizeof(float), cudaMemcpyDeviceToDevice, stream));
-		CUDA_CHECK_THROW(cudaMemcpyAsync(prev_distance.data(), other.prev_distance.data(), n_elements * sizeof(float), cudaMemcpyDeviceToDevice, stream));
-		CUDA_CHECK_THROW(cudaMemcpyAsync(total_distance.data(), other.total_distance.data(), n_elements * sizeof(float), cudaMemcpyDeviceToDevice, stream));
-		CUDA_CHECK_THROW(cudaMemcpyAsync(min_visibility.data(), other.min_visibility.data(), n_elements * sizeof(float), cudaMemcpyDeviceToDevice, stream));
-		CUDA_CHECK_THROW(cudaMemcpyAsync(payload.data(), other.payload.data(), n_elements * sizeof(SdfPayload), cudaMemcpyDeviceToDevice, stream));
+		CUDA_CHECK_THROW(cudaMemcpyAsync(pos, other.pos, n_elements * sizeof(Eigen::Vector3f), cudaMemcpyDeviceToDevice, stream));
+		CUDA_CHECK_THROW(cudaMemcpyAsync(normal, other.normal, n_elements * sizeof(Eigen::Vector3f), cudaMemcpyDeviceToDevice, stream));
+		CUDA_CHECK_THROW(cudaMemcpyAsync(distance, other.distance, n_elements * sizeof(float), cudaMemcpyDeviceToDevice, stream));
+		CUDA_CHECK_THROW(cudaMemcpyAsync(prev_distance, other.prev_distance, n_elements * sizeof(float), cudaMemcpyDeviceToDevice, stream));
+		CUDA_CHECK_THROW(cudaMemcpyAsync(total_distance, other.total_distance, n_elements * sizeof(float), cudaMemcpyDeviceToDevice, stream));
+		CUDA_CHECK_THROW(cudaMemcpyAsync(min_visibility, other.min_visibility, n_elements * sizeof(float), cudaMemcpyDeviceToDevice, stream));
+		CUDA_CHECK_THROW(cudaMemcpyAsync(payload, other.payload, n_elements * sizeof(SdfPayload), cudaMemcpyDeviceToDevice, stream));
 	}
 #endif
 
-	tcnn::GPUMemory<Eigen::Vector3f> pos;
-	tcnn::GPUMemory<Eigen::Vector3f> normal;
-	tcnn::GPUMemory<float> distance;
-	tcnn::GPUMemory<float> prev_distance;
-	tcnn::GPUMemory<float> total_distance;
-	tcnn::GPUMemory<float> min_visibility;
-	tcnn::GPUMemory<SdfPayload> payload;
+	void set(Eigen::Vector3f* pos, Eigen::Vector3f* normal, float* distance, float* prev_distance, float* total_distance, float* min_visibility, SdfPayload* payload) {
+		this->pos = pos;
+		this->normal = normal;
+		this->distance = distance;
+		this->prev_distance = prev_distance;
+		this->total_distance = total_distance;
+		this->min_visibility = min_visibility;
+		this->payload = payload;
+	}
+
+	Eigen::Vector3f* pos;
+	Eigen::Vector3f* normal;
+	float* distance;
+	float* prev_distance;
+	float* total_distance;
+	float* min_visibility;
+	SdfPayload* payload;
 };
 
 struct BRDFParams {
diff --git a/include/neural-graphics-primitives/testbed.h b/include/neural-graphics-primitives/testbed.h
index 1503674304e944289d9534ca639cc6b38809c99f..8e55eeb08c90198e5b45320549e89a0b789dbe82 100644
--- a/include/neural-graphics-primitives/testbed.h
+++ b/include/neural-graphics-primitives/testbed.h
@@ -69,12 +69,12 @@ public:
 	void load_training_data(const std::string& data_path);
 	void clear_training_data();
 
-	using distance_fun_t = std::function<void(uint32_t, const tcnn::GPUMemory<Eigen::Vector3f>&, tcnn::GPUMemory<float>&, cudaStream_t)>;
-	using normals_fun_t = std::function<void(uint32_t, const tcnn::GPUMemory<Eigen::Vector3f>&, tcnn::GPUMemory<Eigen::Vector3f>&, cudaStream_t)>;
+	using distance_fun_t = std::function<void(uint32_t, const Eigen::Vector3f*, float*, cudaStream_t)>;
+	using normals_fun_t = std::function<void(uint32_t, const Eigen::Vector3f*, Eigen::Vector3f*, cudaStream_t)>;
 
 	class SphereTracer {
 	public:
-		SphereTracer() : m_hit_counter(1), m_alive_counter(1) {}
+		SphereTracer() {}
 
 		void init_rays_from_camera(
 			uint32_t spp,
@@ -111,7 +111,7 @@ public:
 			uint32_t n_octree_levels,
 			cudaStream_t stream
 		);
-		void enlarge(size_t n_elements);
+		void enlarge(size_t n_elements, cudaStream_t stream);
 		RaysSdfSoa& rays_hit() { return m_rays_hit; }
 		RaysSdfSoa& rays_init() { return m_rays[0];	}
 		uint32_t n_rays_initialized() const { return m_n_rays_initialized; }
@@ -120,16 +120,19 @@ public:
 	private:
 		RaysSdfSoa m_rays[2];
 		RaysSdfSoa m_rays_hit;
-		tcnn::GPUMemory<uint32_t> m_hit_counter;
-		tcnn::GPUMemory<uint32_t> m_alive_counter;
+		uint32_t* m_hit_counter;
+		uint32_t* m_alive_counter;
+
 		uint32_t m_n_rays_initialized = 0;
 		float m_shadow_sharpness = 2048.f;
 		bool m_trace_shadow_rays = false;
+
+		tcnn::GPUMemoryArena::Allocation m_scratch_alloc;
 	};
 
 	class NerfTracer {
 	public:
-		NerfTracer() : m_hit_counter(1), m_alive_counter(1) {}
+		NerfTracer() {}
 
 		void init_rays_from_camera(
 			uint32_t spp,
@@ -193,38 +196,36 @@ public:
 		RaysNerfSoa& rays_init() { return m_rays[0]; }
 		uint32_t n_rays_initialized() const { return m_n_rays_initialized; }
 
-		void clear() {
-			m_scratch_alloc = {};
-		}
-
 	private:
 		RaysNerfSoa m_rays[2];
 		RaysNerfSoa m_rays_hit;
 		precision_t* m_network_output;
 		float* m_network_input;
-		tcnn::GPUMemory<uint32_t> m_hit_counter;
-		tcnn::GPUMemory<uint32_t> m_alive_counter;
+		uint32_t* m_hit_counter;
+		uint32_t* m_alive_counter;
 		uint32_t m_n_rays_initialized = 0;
 		tcnn::GPUMemoryArena::Allocation m_scratch_alloc;
 	};
 
 	class FiniteDifferenceNormalsApproximator {
 	public:
-		void enlarge(uint32_t n_elements);
-		void normal(uint32_t n_elements, const distance_fun_t& distance_function, tcnn::GPUMemory<Eigen::Vector3f>& pos, tcnn::GPUMemory<Eigen::Vector3f>& normal, float epsilon, cudaStream_t stream);
+		void enlarge(uint32_t n_elements, cudaStream_t stream);
+		void normal(uint32_t n_elements, const distance_fun_t& distance_function, const Eigen::Vector3f* pos, Eigen::Vector3f* normal, float epsilon, cudaStream_t stream);
 
 	private:
-		tcnn::GPUMemory<Eigen::Vector3f> dx;
-		tcnn::GPUMemory<Eigen::Vector3f> dy;
-		tcnn::GPUMemory<Eigen::Vector3f> dz;
+		Eigen::Vector3f* dx;
+		Eigen::Vector3f* dy;
+		Eigen::Vector3f* dz;
 
-		tcnn::GPUMemory<float> dist_dx_pos;
-		tcnn::GPUMemory<float> dist_dy_pos;
-		tcnn::GPUMemory<float> dist_dz_pos;
+		float* dist_dx_pos;
+		float* dist_dy_pos;
+		float* dist_dz_pos;
 
-		tcnn::GPUMemory<float> dist_dx_neg;
-		tcnn::GPUMemory<float> dist_dy_neg;
-		tcnn::GPUMemory<float> dist_dz_neg;
+		float* dist_dx_neg;
+		float* dist_dy_neg;
+		float* dist_dz_neg;
+
+		tcnn::GPUMemoryArena::Allocation m_scratch_alloc;
 	};
 
 	struct LevelStats {
@@ -532,8 +533,6 @@ public:
 	bool m_gui_redraw = true;
 
 	struct Nerf {
-		NerfTracer tracer;
-
 		struct Training {
 			NerfDataset dataset;
 			int n_images_for_training = 0; // how many images to train from, as a high watermark compared to the dataset size
@@ -640,9 +639,6 @@ public:
 
 		uint32_t max_cascade = 0;
 
-		tcnn::GPUMemory<float> vis_input;
-		tcnn::GPUMemory<Eigen::Array4f> vis_rgba;
-
 		ENerfActivation rgb_activation = ENerfActivation::Exponential;
 		ENerfActivation density_activation = ENerfActivation::Exponential;
 
@@ -666,8 +662,6 @@ public:
 	} m_nerf;
 
 	struct Sdf {
-		SphereTracer tracer;
-		SphereTracer shadow_tracer;
 		float shadow_sharpness = 2048.0f;
 		float maximum_distance = 0.00005f;
 		float fd_normals_epsilon = 0.0005f;
@@ -676,8 +670,6 @@ public:
 
 		BRDFParams brdf;
 
-		FiniteDifferenceNormalsApproximator fd_normals;
-
 		// Mesh data
 		EMeshSdfMode mesh_sdf_mode = EMeshSdfMode::Raystab;
 		float mesh_scale;
@@ -764,7 +756,7 @@ public:
 		tcnn::GPUMemory<char> nanovdb_grid;
 		tcnn::GPUMemory<uint8_t> bitgrid;
 		float global_majorant = 1.f;
-		Eigen::Vector3f world2index_offset = {0,0,0};
+		Eigen::Vector3f world2index_offset = {0, 0, 0};
 		float world2index_scale = 1.f;
 
 		struct Training {
@@ -790,7 +782,7 @@ public:
 	float m_render_near_distance = 0.0f;
 	float m_slice_plane_z = 0.0f;
 	bool m_floor_enable = false;
-	inline float get_floor_y() const { return m_floor_enable ? m_aabb.min.y()+0.001f : -10000.f; }
+	inline float get_floor_y() const { return m_floor_enable ? m_aabb.min.y() + 0.001f : -10000.f; }
 	BoundingBox m_raw_aabb;
 	BoundingBox m_aabb;
 	BoundingBox m_render_aabb;
diff --git a/include/neural-graphics-primitives/thread_pool.h b/include/neural-graphics-primitives/thread_pool.h
index 47e9d7b268ee878fe3917fc210a93ddeef3b5fa9..37f7fd53b18373652e3138e65de96f84c3eda678 100644
--- a/include/neural-graphics-primitives/thread_pool.h
+++ b/include/neural-graphics-primitives/thread_pool.h
@@ -41,8 +41,6 @@ public:
     auto enqueueTask(F&& f, bool highPriority = false) -> std::future<std::result_of_t <F()>> {
         using return_type = std::result_of_t<F()>;
 
-        ++mNumTasksInSystem;
-
         auto task = std::make_shared<std::packaged_task<return_type()>>(std::forward<F>(f));
 
         auto res = task->get_future();
@@ -63,13 +61,8 @@ public:
 
     void startThreads(size_t num);
     void shutdownThreads(size_t num);
+	void setNThreads(size_t num);
 
-    size_t numTasksInSystem() const {
-        return mNumTasksInSystem;
-    }
-
-    void waitUntilFinished();
-    void waitUntilFinishedFor(const std::chrono::microseconds Duration);
     void flushQueue();
 
     template <typename Int, typename F>
@@ -109,10 +102,6 @@ private:
     std::deque<std::function<void()>> mTaskQueue;
     std::mutex mTaskQueueMutex;
     std::condition_variable mWorkerCondition;
-
-    std::atomic<size_t> mNumTasksInSystem;
-    std::mutex mSystemBusyMutex;
-    std::condition_variable mSystemBusyCondition;
 };
 
 NGP_NAMESPACE_END
diff --git a/scripts/colmap2nerf.py b/scripts/colmap2nerf.py
index b4f463e0a2bb8a024411ab6246b5a11454023c54..d015f5e1d1030484bc54bb9ae4c4f267ced4012f 100755
--- a/scripts/colmap2nerf.py
+++ b/scripts/colmap2nerf.py
@@ -33,7 +33,7 @@ def parse_args():
 	parser.add_argument("--colmap_camera_params", default="", help="intrinsic parameters, depending on the chosen model.  Format: fx,fy,cx,cy,dist")
 	parser.add_argument("--images", default="images", help="input path to the images")
 	parser.add_argument("--text", default="colmap_text", help="input path to the colmap text files (set automatically if run_colmap is used)")
-	parser.add_argument("--aabb_scale", default=16, choices=["1","2","4","8","16"], help="large scene scale factor. 1=scene fits in unit cube; power of 2 up to 16")
+	parser.add_argument("--aabb_scale", default=16, choices=["1", "2", "4", "8", "16", "32", "64", "128"], help="large scene scale factor. 1=scene fits in unit cube; power of 2 up to 16")
 	parser.add_argument("--skip_early", default=0, help="skip this many images from the start")
 	parser.add_argument("--keep_colmap_coords", action="store_true", help="keep transforms.json in COLMAP's original frame of reference (this will avoid reorienting and repositioning the scene for preview and rendering)")
 	parser.add_argument("--out", default="transforms.json", help="output path")
diff --git a/src/dlss.cu b/src/dlss.cu
index 9bb402c4d63f93f941fcf70993de48abd1da9995..627dbd986513a6c5575be575cd36e46ac22bb95d 100644
--- a/src/dlss.cu
+++ b/src/dlss.cu
@@ -417,7 +417,7 @@ void vulkan_and_ngx_init() {
 		throw std::runtime_error{fmt::format("DLSS not available: {}", ngx_error_string(ngx_result))};
 	}
 
-	tlog::success() << "Initialized Vulkan and NGX on device #" << device_id << ": " << physical_device_properties.deviceName;
+	tlog::success() << "Initialized Vulkan and NGX on GPU #" << device_id << ": " << physical_device_properties.deviceName;
 }
 
 size_t dlss_allocated_bytes() {
@@ -878,6 +878,10 @@ public:
 		return m_specs.quality;
 	}
 
+	Vector2i out_resolution() const {
+		return m_specs.out_resolution;
+	}
+
 	Vector2i clamp_resolution(const Vector2i& resolution) const {
 		return m_specs.clamp_resolution(resolution);
 	}
@@ -895,21 +899,44 @@ private:
 
 class Dlss : public IDlss {
 public:
-	Dlss(const Eigen::Vector2i& out_resolution)
+	Dlss(const Eigen::Vector2i& max_out_resolution)
 	:
-	m_out_resolution{out_resolution},
+	m_max_out_resolution{max_out_resolution},
 	// Allocate all buffers at output resolution and use dynamic sub-rects
 	// to use subsets of them. This avoids re-allocations when using DLSS
 	// with dynamically changing input resolution.
-	m_frame_buffer{out_resolution, 4},
-	m_depth_buffer{out_resolution, 1},
-	m_mvec_buffer{out_resolution, 2},
+	m_frame_buffer{max_out_resolution, 4},
+	m_depth_buffer{max_out_resolution, 1},
+	m_mvec_buffer{max_out_resolution, 2},
 	m_exposure_buffer{{1, 1}, 1},
-	m_output_buffer{out_resolution, 4}
+	m_output_buffer{max_out_resolution, 4}
 	{
+		// Various quality modes of DLSS
 		for (int i = 0; i < (int)EDlssQuality::NumDlssQualitySettings; ++i) {
 			try {
-				auto specs = dlss_feature_specs(out_resolution, (EDlssQuality)i);
+				auto specs = dlss_feature_specs(max_out_resolution, (EDlssQuality)i);
+
+				// Only emplace the specs if the feature can be created in practice!
+				DlssFeature{specs, true, true};
+				DlssFeature{specs, true, false};
+				DlssFeature{specs, false, true};
+				DlssFeature{specs, false, false};
+				m_dlss_specs.emplace_back(specs);
+			} catch (...) {}
+		}
+
+		// For super insane performance requirements (more than 3x upscaling) try UltraPerformance
+		// with reduced output resolutions for 4.5x, 6x, 9x.
+		std::vector<Vector2i> reduced_out_resolutions = {
+			max_out_resolution / 3 * 2,
+			max_out_resolution / 2,
+			max_out_resolution / 3,
+			// max_out_resolution / 4,
+		};
+
+		for (const auto& out_resolution : reduced_out_resolutions) {
+			try {
+				auto specs = dlss_feature_specs(out_resolution, EDlssQuality::UltraPerformance);
 
 				// Only emplace the specs if the feature can be created in practice!
 				DlssFeature{specs, true, true};
@@ -926,20 +953,14 @@ public:
 		m_dlss_feature = nullptr;
 	}
 
-	void run(
-		const Vector2i& in_resolution,
-		bool is_hdr,
-		float sharpening,
-		const Vector2f& jitter_offset,
-		bool shall_reset
-	) override {
+	void update_feature(const Vector2i& in_resolution, bool is_hdr, bool sharpen) override {
 		CUDA_CHECK_THROW(cudaDeviceSynchronize());
 
-		EDlssQuality quality;
+		DlssFeatureSpecs specs;
 		bool found = false;
-		for (const auto& specs : m_dlss_specs) {
-			if (specs.distance(in_resolution) == 0.0f) {
-				quality = specs.quality;
+		for (const auto& s : m_dlss_specs) {
+			if (s.distance(in_resolution) == 0.0f) {
+				specs = s;
 				found = true;
 			}
 		}
@@ -948,10 +969,21 @@ public:
 			throw std::runtime_error{"Dlss::run called with invalid input resolution."};
 		}
 
-		bool sharpen = sharpening != 0.0f;
-		if (!m_dlss_feature || m_dlss_feature->is_hdr() != is_hdr || m_dlss_feature->sharpen() != sharpen || m_dlss_feature->quality() != quality) {
-			m_dlss_feature.reset(new DlssFeature{m_out_resolution, is_hdr, sharpen, quality});
+		if (!m_dlss_feature || m_dlss_feature->is_hdr() != is_hdr || m_dlss_feature->sharpen() != sharpen || m_dlss_feature->quality() != specs.quality || m_dlss_feature->out_resolution() != specs.out_resolution) {
+			m_dlss_feature.reset(new DlssFeature{specs.out_resolution, is_hdr, sharpen, specs.quality});
 		}
+	}
+
+	void run(
+		const Vector2i& in_resolution,
+		bool is_hdr,
+		float sharpening,
+		const Vector2f& jitter_offset,
+		bool shall_reset
+	) override {
+		CUDA_CHECK_THROW(cudaDeviceSynchronize());
+
+		update_feature(in_resolution, is_hdr, sharpening != 0.0f);
 
 		m_dlss_feature->run(
 			in_resolution,
@@ -1001,13 +1033,21 @@ public:
 	}
 
 	Vector2i out_resolution() const override {
-		return m_out_resolution;
+		return m_dlss_feature ? m_dlss_feature->out_resolution() : m_max_out_resolution;
+	}
+
+	Vector2i max_out_resolution() const override {
+		return m_max_out_resolution;
 	}
 
 	bool is_hdr() const override {
 		return m_dlss_feature && m_dlss_feature->is_hdr();
 	}
 
+	bool sharpen() const override {
+		return m_dlss_feature && m_dlss_feature->sharpen();
+	}
+
 	EDlssQuality quality() const override {
 		return m_dlss_feature ? m_dlss_feature->quality() : EDlssQuality::None;
 	}
@@ -1022,7 +1062,7 @@ private:
 	VulkanTexture m_exposure_buffer;
 	VulkanTexture m_output_buffer;
 
-	Vector2i m_out_resolution;
+	Vector2i m_max_out_resolution;
 };
 
 std::shared_ptr<IDlss> dlss_init(const Eigen::Vector2i& out_resolution) {
diff --git a/src/nerf_loader.cu b/src/nerf_loader.cu
index ba21e6b58de959b1744bba77bce111fec3340c24..eb0555bf75a26edebdd87b267842a0e8b17864c9 100644
--- a/src/nerf_loader.cu
+++ b/src/nerf_loader.cu
@@ -582,7 +582,6 @@ NerfDataset load_nerf(const std::vector<filesystem::path>& jsonpaths, float shar
 					throw std::runtime_error{"Could not open image file: "s + std::string{stbi_failure_reason()}};
 				}
 
-
 				fs::path alphapath = basepath / fmt::format("{}.alpha.{}", frame["file_path"], path.extension());
 				if (alphapath.exists()) {
 					int wa = 0, ha = 0;
@@ -613,7 +612,7 @@ NerfDataset load_nerf(const std::vector<filesystem::path>& jsonpaths, float shar
 					}
 					dst.mask_color = 0x00FF00FF; // HOT PINK
 					for (int i = 0; i < dst.res.prod(); ++i) {
-						if (mask_img[i*4] != 0) {
+						if (mask_img[i*4] != 0 || mask_img[i*4+1] != 0 || mask_img[i*4+2] != 0) {
 							*(uint32_t*)&img[i*4] = dst.mask_color;
 						}
 					}
diff --git a/src/render_buffer.cu b/src/render_buffer.cu
index a9a6d47724c53b6094782619bc3c725f8eb9ecbf..12ab7538f0ba475ec37684d1fbc85eb944ac6123 100644
--- a/src/render_buffer.cu
+++ b/src/render_buffer.cu
@@ -741,12 +741,15 @@ void CudaRenderBuffer::overlay_false_color(Vector2i training_resolution, bool to
 	);
 }
 
-void CudaRenderBuffer::enable_dlss(const Eigen::Vector2i& out_res) {
+void CudaRenderBuffer::enable_dlss(const Eigen::Vector2i& max_out_res) {
 #ifdef NGP_VULKAN
-	if (!m_dlss || m_dlss->out_resolution() != out_res) {
-		m_dlss = dlss_init(out_res);
+	if (!m_dlss || m_dlss->max_out_resolution() != max_out_res) {
+		m_dlss = dlss_init(max_out_res);
+	}
+
+	if (m_dlss) {
+		resize(m_dlss->clamp_resolution(in_resolution()));
 	}
-	resize(in_resolution());
 #else
 	throw std::runtime_error{"NGP was compiled without Vulkan/NGX/DLSS support."};
 #endif
diff --git a/src/testbed.cu b/src/testbed.cu
index 886292c6bdc7d025053710f33c97ba2a704f01a9..b7bff31029fd531cbbc1b2f5eb3bc4f68a6d8d6e 100644
--- a/src/testbed.cu
+++ b/src/testbed.cu
@@ -53,8 +53,7 @@
 #    include <GL/glew.h>
 #  endif
 #  include <GLFW/glfw3.h>
-
-
+#  include <cuda_gl_interop.h>
 #endif
 
 // Windows.h is evil
@@ -209,7 +208,7 @@ void Testbed::set_visualized_dim(int dim) {
 }
 
 void Testbed::translate_camera(const Vector3f& rel) {
-	m_camera.col(3) += m_camera.block<3,3>(0,0) * rel * m_bounding_radius;
+	m_camera.col(3) += m_camera.block<3, 3>(0, 0) * rel * m_bounding_radius;
 	reset_accumulation(true);
 }
 
@@ -524,7 +523,7 @@ void Testbed::imgui() {
 			}
 			ImGui::SameLine();
 			ImGui::PushItemWidth(400.f);
-			ImGui::InputText("File", opt_extr_filename_buf, sizeof(opt_extr_filename_buf));
+			ImGui::InputText("File##Extrinsics file path", opt_extr_filename_buf, sizeof(opt_extr_filename_buf));
 			ImGui::PopItemWidth();
 			ImGui::SameLine();
 			ImGui::Checkbox("Quaternion format", &export_extrinsics_in_quat_format);
@@ -1019,7 +1018,7 @@ void Testbed::imgui() {
 		}
 		ImGui::SameLine();
 		ImGui::Checkbox("w/ Optimizer State", &m_include_optimizer_state_in_snapshot);
-		ImGui::InputText("File", snapshot_filename_buf, sizeof(snapshot_filename_buf));
+		ImGui::InputText("File##Snapshot file path", snapshot_filename_buf, sizeof(snapshot_filename_buf));
 	}
 
 	if (m_testbed_mode == ETestbedMode::Nerf || m_testbed_mode == ETestbedMode::Sdf) {
@@ -1451,7 +1450,7 @@ void Testbed::mouse_drag(const Vector2f& rel, int button) {
 
 			m_image.pos += rel;
 			if (m_fps_camera) {
-				m_camera.block<3,3>(0,0) = rot * m_camera.block<3,3>(0,0);
+				m_camera.block<3, 3>(0, 0) = rot * m_camera.block<3, 3>(0, 0);
 			} else {
 				// Turntable
 				auto old_look_at = look_at();
@@ -1693,40 +1692,51 @@ void Testbed::train_and_render(bool skip_rendering) {
 
 		auto& render_buffer = m_render_surfaces.front();
 
-		if (m_dlss) {
-			render_buffer.enable_dlss(m_window_res);
-			m_aperture_size = 0.0f;
-		} else {
-			render_buffer.disable_dlss();
-		}
+		{
+			// Don't count the time being spent allocating buffers and resetting DLSS as part of the frame time.
+			// Otherwise the dynamic resolution calculations for following frames will be thrown out of whack
+			// and may even start oscillating.
+			auto skip_start = std::chrono::steady_clock::now();
+			ScopeGuard skip_timing_guard{[&]() {
+				start += std::chrono::steady_clock::now() - skip_start;
+			}};
+			if (m_dlss) {
+				render_buffer.enable_dlss(m_window_res);
+				m_aperture_size = 0.0f;
+			} else {
+				render_buffer.disable_dlss();
+			}
 
-		auto render_res = render_buffer.in_resolution();
-		if (render_res.isZero() || (m_train && m_training_step == 0)) {
-			render_res = m_window_res/16;
-		} else {
-			render_res = render_res.cwiseMin(m_window_res);
-		}
+			auto render_res = render_buffer.in_resolution();
+			if (render_res.isZero() || (m_train && m_training_step == 0)) {
+				render_res = m_window_res/16;
+			} else {
+				render_res = render_res.cwiseMin(m_window_res);
+			}
 
-		float render_time_per_fullres_frame = m_render_ms.val() / (float)render_res.x() / (float)render_res.y() * (float)m_window_res.x() * (float)m_window_res.y();
+			float render_time_per_fullres_frame = m_render_ms.val() / (float)render_res.x() / (float)render_res.y() * (float)m_window_res.x() * (float)m_window_res.y();
 
-		// Make sure we don't starve training with slow rendering
-		float factor = std::sqrt(1000.0f / m_dynamic_res_target_fps / render_time_per_fullres_frame);
-		if (!m_dynamic_res) {
-			factor = 8.f/(float)m_fixed_res_factor;
-		}
+			// Make sure we don't starve training with slow rendering
+			float factor = std::sqrt(1000.0f / m_dynamic_res_target_fps / render_time_per_fullres_frame);
+			if (!m_dynamic_res) {
+				factor = 8.f/(float)m_fixed_res_factor;
+			}
 
-		factor = tcnn::clamp(factor, 1.0f/16.0f, 1.0f);
+			factor = tcnn::clamp(factor, 1.0f/16.0f, 1.0f);
 
-		if (factor > m_last_render_res_factor * 1.2f || factor < m_last_render_res_factor * 0.8f || factor == 1.0f || !m_dynamic_res) {
-			render_res = (m_window_res.cast<float>() * factor).cast<int>().cwiseMin(m_window_res).cwiseMax(m_window_res/16);
-			m_last_render_res_factor = factor;
-		}
+			if (factor > m_last_render_res_factor * 1.2f || factor < m_last_render_res_factor * 0.8f || factor == 1.0f || !m_dynamic_res) {
+				render_res = (m_window_res.cast<float>() * factor).cast<int>().cwiseMin(m_window_res).cwiseMax(m_window_res/16);
+				m_last_render_res_factor = factor;
+			}
+
+			if (render_buffer.dlss()) {
+				render_res = render_buffer.dlss()->clamp_resolution(render_res);
+				render_buffer.dlss()->update_feature(render_res, render_buffer.dlss()->is_hdr(), render_buffer.dlss()->sharpen());
+			}
 
-		if (render_buffer.dlss()) {
-			render_res = render_buffer.dlss()->clamp_resolution(render_res);
+			render_buffer.resize(render_res);
 		}
 
-		render_buffer.resize(render_res);
 		render_frame(m_smoothed_camera, m_smoothed_camera, Eigen::Vector4f::Zero(), render_buffer);
 
 #ifdef NGP_GUI
@@ -1880,6 +1890,9 @@ void Testbed::init_window(int resw, int resh, bool hidden, bool second_window) {
 	try {
 		vulkan_and_ngx_init();
 		m_dlss_supported = true;
+		if (m_testbed_mode == ETestbedMode::Nerf) {
+			m_dlss = true;
+		}
 	} catch (const std::runtime_error& e) {
 		tlog::warning() << "Could not initialize Vulkan and NGX. DLSS not supported. (" << e.what() << ")";
 	}
@@ -2082,7 +2095,6 @@ bool Testbed::frame() {
 #ifdef NGP_GUI
 	if (m_render_window) {
 		if (m_gui_redraw) {
-			// Gather histogram statistics of the encoding in use
 			if (m_gather_histograms) {
 				gather_histograms();
 			}
@@ -2469,6 +2481,35 @@ void Testbed::reset_network(bool clear_density_grid) {
 Testbed::Testbed(ETestbedMode mode)
 : m_testbed_mode(mode)
 {
+	if (!(__CUDACC_VER_MAJOR__ > 10 || (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 2))) {
+		throw std::runtime_error{"Testbed required CUDA 10.2 or later."};
+	}
+
+#ifdef NGP_GUI
+	// Ensure we're running on the GPU that'll host our GUI. To do so, try creating a dummy
+	// OpenGL context, figure out the GPU it's running on, and then kill that context again.
+	if (glfwInit()) {
+		glfwWindowHint(GLFW_VISIBLE, GLFW_FALSE);
+		GLFWwindow* offscreen_context = glfwCreateWindow(640, 480, "", NULL, NULL);
+
+		if (offscreen_context) {
+			glfwMakeContextCurrent(offscreen_context);
+
+			int gl_device = -1;
+			unsigned int device_count = 0;
+			if (cudaGLGetDevices(&device_count, &gl_device, 1, cudaGLDeviceListAll) == cudaSuccess) {
+				if (device_count > 0 && gl_device != -1) {
+					set_cuda_device(gl_device);
+				}
+			}
+
+			glfwDestroyWindow(offscreen_context);
+		}
+
+		glfwTerminate();
+	}
+#endif
+
 	uint32_t compute_capability = cuda_compute_capability();
 	if (compute_capability < MIN_GPU_ARCH) {
 		tlog::warning() << "Insufficient compute capability " << compute_capability << " detected.";
@@ -2505,10 +2546,6 @@ Testbed::Testbed(ETestbedMode mode)
 
 	reset_camera();
 
-	if (!(__CUDACC_VER_MAJOR__ > 10 || (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 2))) {
-		throw std::runtime_error{"Testbed required CUDA 10.2 or later."};
-	}
-
 	set_exposure(0);
 	set_min_level(0.f);
 	set_max_level(1.f);
@@ -2711,7 +2748,7 @@ void Testbed::render_frame(const Matrix<float, 3, 4>& camera_matrix0, const Matr
 					}
 				}
 				distance_fun_t distance_fun =
-					m_render_ground_truth ? (distance_fun_t)[&](uint32_t n_elements, const GPUMemory<Vector3f>& positions, GPUMemory<float>& distances, cudaStream_t stream) {
+					m_render_ground_truth ? (distance_fun_t)[&](uint32_t n_elements, const Vector3f* positions, float* distances, cudaStream_t stream) {
 						if (n_elements == 0) {
 							return;
 						}
@@ -2731,35 +2768,35 @@ void Testbed::render_frame(const Matrix<float, 3, 4>& camera_matrix0, const Matr
 							m_sdf.triangle_bvh->signed_distance_gpu(
 								n_elements,
 								m_sdf.mesh_sdf_mode,
-								(Vector3f*)positions.data(),
-								distances.data(),
+								(Vector3f*)positions,
+								distances,
 								m_sdf.triangles_gpu.data(),
 								false,
 								m_stream.get()
 							);
 						}
-					} : (distance_fun_t)[&](uint32_t n_elements, const GPUMemory<Vector3f>& positions, GPUMemory<float>& distances, cudaStream_t stream) {
+					} : (distance_fun_t)[&](uint32_t n_elements, const Vector3f* positions, float* distances, cudaStream_t stream) {
 						if (n_elements == 0) {
 							return;
 						}
 						n_elements = next_multiple(n_elements, tcnn::batch_size_granularity);
-						GPUMatrix<float> positions_matrix((float*)positions.data(), 3, n_elements);
-						GPUMatrix<float, RM> distances_matrix(distances.data(), 1, n_elements);
+						GPUMatrix<float> positions_matrix((float*)positions, 3, n_elements);
+						GPUMatrix<float, RM> distances_matrix(distances, 1, n_elements);
 						m_network->inference(stream, positions_matrix, distances_matrix);
 					};
 
 				normals_fun_t normals_fun =
-					m_render_ground_truth ? (normals_fun_t)[&](uint32_t n_elements, const GPUMemory<Vector3f>& positions, GPUMemory<Vector3f>& normals, cudaStream_t stream) {
+					m_render_ground_truth ? (normals_fun_t)[&](uint32_t n_elements, const Vector3f* positions, Vector3f* normals, cudaStream_t stream) {
 						// NO-OP. Normals will automatically be populated by raytrace
-					} : (normals_fun_t)[&](uint32_t n_elements, const GPUMemory<Vector3f>& positions, GPUMemory<Vector3f>& normals, cudaStream_t stream) {
+					} : (normals_fun_t)[&](uint32_t n_elements, const Vector3f* positions, Vector3f* normals, cudaStream_t stream) {
 						if (n_elements == 0) {
 							return;
 						}
 
 						n_elements = next_multiple(n_elements, tcnn::batch_size_granularity);
 
-						GPUMatrix<float> positions_matrix((float*)positions.data(), 3, n_elements);
-						GPUMatrix<float> normals_matrix((float*)normals.data(), 3, n_elements);
+						GPUMatrix<float> positions_matrix((float*)positions, 3, n_elements);
+						GPUMatrix<float> normals_matrix((float*)normals, 3, n_elements);
 						m_network->input_gradient(stream, 0, positions_matrix, normals_matrix);
 					};
 
diff --git a/src/testbed_nerf.cu b/src/testbed_nerf.cu
index 2f20fadeb14ebb9d5444e83c7b0fc4b5f09e3cc2..2a1d50d045028b92b609bdcb7d75588ec3139dfb 100644
--- a/src/testbed_nerf.cu
+++ b/src/testbed_nerf.cu
@@ -376,8 +376,6 @@ __global__ void mark_untrained_density_grid(const uint32_t n_elements,  float* _
 	uint32_t y = tcnn::morton3D_invert(pos_idx>>1);
 	uint32_t z = tcnn::morton3D_invert(pos_idx>>2);
 
-
-
 	Vector3f pos = ((Vector3f{(float)x+0.5f, (float)y+0.5f, (float)z+0.5f}) / NERF_GRIDSIZE() - Vector3f::Constant(0.5f)) * scalbnf(1.0f, level) + Vector3f::Constant(0.5f);
 	float voxel_radius = 0.5f*SQRT3()*scalbnf(1.0f, level) / NERF_GRIDSIZE();
 	int count=0;
@@ -643,7 +641,12 @@ __global__ void advance_pos_nerf(
 		}
 
 		dt = calc_dt(t, cone_angle);
-		uint32_t mip = max(min_mip, mip_from_dt(dt, pos));
+
+		// Use the mip level from the position rather than dt. Unlike training,
+		// for rendering there's no need to use coarser mip levels when the step
+		// size is large (rather, it reduces performance, because the network may be queried)
+		// more frequently than necessary.
+		uint32_t mip = max(min_mip, mip_from_pos(pos));
 
 		if (!density_grid || density_grid_occupied_at(pos, density_grid, mip)) {
 			break;
@@ -736,7 +739,12 @@ __global__ void generate_next_nerf_network_inputs(
 			}
 
 			dt = calc_dt(t, cone_angle);
-			uint32_t mip = max(min_mip, mip_from_dt(dt, pos));
+
+			// Use the mip level from the position rather than dt. Unlike training,
+			// for rendering there's no need to use coarser mip levels when the step
+			// size is large (rather, it reduces performance, because the network may be queried)
+			// more frequently than necessary.
+			uint32_t mip = max(min_mip, mip_from_pos(pos));
 
 			if (!density_grid || density_grid_occupied_at(pos, density_grid, mip)) {
 				break;
@@ -2073,7 +2081,7 @@ uint32_t Testbed::NerfTracer::trace(
 		return 0;
 	}
 
-	CUDA_CHECK_THROW(cudaMemsetAsync(m_hit_counter.data(), 0, sizeof(uint32_t), stream));
+	CUDA_CHECK_THROW(cudaMemsetAsync(m_hit_counter, 0, sizeof(uint32_t), stream));
 
 	uint32_t n_alive = m_n_rays_initialized;
 	// m_n_rays_initialized = 0;
@@ -2087,15 +2095,15 @@ uint32_t Testbed::NerfTracer::trace(
 
 		// Compact rays that did not diverge yet
 		{
-			CUDA_CHECK_THROW(cudaMemsetAsync(m_alive_counter.data(), 0, sizeof(uint32_t), stream));
+			CUDA_CHECK_THROW(cudaMemsetAsync(m_alive_counter, 0, sizeof(uint32_t), stream));
 			linear_kernel(compact_kernel_nerf, 0, stream,
 				n_alive,
 				rays_tmp.rgba, rays_tmp.depth, rays_tmp.payload,
 				rays_current.rgba, rays_current.depth, rays_current.payload,
 				m_rays_hit.rgba, m_rays_hit.depth, m_rays_hit.payload,
-				m_alive_counter.data(), m_hit_counter.data()
+				m_alive_counter, m_hit_counter
 			);
-			CUDA_CHECK_THROW(cudaMemcpyAsync(&n_alive, m_alive_counter.data(), sizeof(uint32_t), cudaMemcpyDeviceToHost, stream));
+			CUDA_CHECK_THROW(cudaMemcpyAsync(&n_alive, m_alive_counter, sizeof(uint32_t), cudaMemcpyDeviceToHost, stream));
 			CUDA_CHECK_THROW(cudaStreamSynchronize(stream));
 		}
 
@@ -2164,7 +2172,7 @@ uint32_t Testbed::NerfTracer::trace(
 	}
 
 	uint32_t n_hit;
-	CUDA_CHECK_THROW(cudaMemcpyAsync(&n_hit, m_hit_counter.data(), sizeof(uint32_t), cudaMemcpyDeviceToHost, stream));
+	CUDA_CHECK_THROW(cudaMemcpyAsync(&n_hit, m_hit_counter, sizeof(uint32_t), cudaMemcpyDeviceToHost, stream));
 	CUDA_CHECK_THROW(cudaStreamSynchronize(stream));
 	return n_hit;
 }
@@ -2178,14 +2186,18 @@ void Testbed::NerfTracer::enlarge(size_t n_elements, uint32_t padded_output_widt
 		Array4f, float, NerfPayload, // m_rays_hit
 
 		network_precision_t,
-		float
+		float,
+		uint32_t,
+		uint32_t
 	>(
 		stream, &m_scratch_alloc,
 		n_elements, n_elements, n_elements,
 		n_elements, n_elements, n_elements,
 		n_elements, n_elements, n_elements,
 		n_elements * MAX_STEPS_INBETWEEN_COMPACTION * padded_output_width,
-		n_elements * MAX_STEPS_INBETWEEN_COMPACTION * num_floats
+		n_elements * MAX_STEPS_INBETWEEN_COMPACTION * num_floats,
+		32, // 2 full cache lines to ensure no overlap
+		32  // 2 full cache lines to ensure no overlap
 	);
 
 	m_rays[0].set(std::get<0>(scratch), std::get<1>(scratch), std::get<2>(scratch), n_elements);
@@ -2194,6 +2206,9 @@ void Testbed::NerfTracer::enlarge(size_t n_elements, uint32_t padded_output_widt
 
 	m_network_output = std::get<9>(scratch);
 	m_network_input = std::get<10>(scratch);
+
+	m_hit_counter = std::get<11>(scratch);
+	m_alive_counter = std::get<12>(scratch);
 }
 
 void Testbed::Nerf::Training::reset_extra_dims(default_rng_t &rng) {
@@ -2248,9 +2263,7 @@ void Testbed::render_nerf(CudaRenderBuffer& render_buffer, const Vector2i& max_r
 
 	const float* extra_dims_gpu = get_inference_extra_dims(stream);
 
-	ScopeGuard tmp_memory_guard{[&]() {
-		m_nerf.tracer.clear();
-	}};
+	NerfTracer tracer;
 
 	// Our motion vector code can't undo f-theta and grid distortions -- so don't render these if DLSS is enabled.
 	bool render_opencv_lens = m_nerf.render_with_lens_distortion && (!render_buffer.dlss() || m_nerf.render_lens.mode == ELensMode::OpenCV);
@@ -2259,7 +2272,7 @@ void Testbed::render_nerf(CudaRenderBuffer& render_buffer, const Vector2i& max_r
 	Lens lens = render_opencv_lens ? m_nerf.render_lens : Lens{};
 
 
-	m_nerf.tracer.init_rays_from_camera(
+	tracer.init_rays_from_camera(
 		render_buffer.spp(),
 		m_network->padded_output_width(),
 		m_nerf_network->n_extra_dims(),
@@ -2293,10 +2306,10 @@ void Testbed::render_nerf(CudaRenderBuffer& render_buffer, const Vector2i& max_r
 
 	uint32_t n_hit;
 	if (m_render_mode == ERenderMode::Slice) {
-		n_hit = m_nerf.tracer.n_rays_initialized();
+		n_hit = tracer.n_rays_initialized();
 	} else {
 		float depth_scale = 1.0f / m_nerf.training.dataset.scale;
-		n_hit = m_nerf.tracer.trace(
+		n_hit = tracer.trace(
 			*m_nerf_network,
 			m_render_aabb,
 			m_render_aabb_to_local,
@@ -2321,7 +2334,7 @@ void Testbed::render_nerf(CudaRenderBuffer& render_buffer, const Vector2i& max_r
 			stream
 		);
 	}
-	RaysNerfSoa& rays_hit = m_render_mode == ERenderMode::Slice ? m_nerf.tracer.rays_init() : m_nerf.tracer.rays_hit();
+	RaysNerfSoa& rays_hit = m_render_mode == ERenderMode::Slice ? tracer.rays_init() : tracer.rays_hit();
 
 	if (m_render_mode == ERenderMode::Slice) {
 		// Store colors in the normal buffer
@@ -2329,23 +2342,21 @@ void Testbed::render_nerf(CudaRenderBuffer& render_buffer, const Vector2i& max_r
 		const uint32_t floats_per_coord = sizeof(NerfCoordinate) / sizeof(float) + m_nerf_network->n_extra_dims();
 		const uint32_t extra_stride = m_nerf_network->n_extra_dims() * sizeof(float); // extra stride on top of base NerfCoordinate struct
 
-		m_nerf.vis_input.enlarge(n_elements * floats_per_coord);
-		m_nerf.vis_rgba.enlarge(n_elements);
-		linear_kernel(generate_nerf_network_inputs_at_current_position, 0, stream, n_hit, m_aabb, rays_hit.payload, PitchedPtr<NerfCoordinate>((NerfCoordinate*)m_nerf.vis_input.data(), 1, 0, extra_stride), extra_dims_gpu );
+		GPUMatrix<float> positions_matrix{floats_per_coord, n_elements, stream};
+		GPUMatrix<float> rgbsigma_matrix{4, n_elements, stream};
 
-		GPUMatrix<float> positions_matrix((float*)m_nerf.vis_input.data(), floats_per_coord, n_elements);
-		GPUMatrix<float> rgbsigma_matrix((float*)m_nerf.vis_rgba.data(), 4, n_elements);
+		linear_kernel(generate_nerf_network_inputs_at_current_position, 0, stream, n_hit, m_aabb, rays_hit.payload, PitchedPtr<NerfCoordinate>((NerfCoordinate*)positions_matrix.data(), 1, 0, extra_stride), extra_dims_gpu );
 
 		if (m_visualized_dimension == -1) {
 			m_network->inference(stream, positions_matrix, rgbsigma_matrix);
-			linear_kernel(compute_nerf_rgba, 0, stream, n_hit, m_nerf.vis_rgba.data(), m_nerf.rgb_activation, m_nerf.density_activation, 0.01f, false);
+			linear_kernel(compute_nerf_rgba, 0, stream, n_hit, (Array4f*)rgbsigma_matrix.data(), m_nerf.rgb_activation, m_nerf.density_activation, 0.01f, false);
 		} else {
 			m_network->visualize_activation(stream, m_visualized_layer, m_visualized_dimension, positions_matrix, rgbsigma_matrix);
 		}
 
 		linear_kernel(shade_kernel_nerf, 0, stream,
 			n_hit,
-			m_nerf.vis_rgba.data(),
+			(Array4f*)rgbsigma_matrix.data(),
 			nullptr,
 			rays_hit.payload,
 			m_render_mode,
diff --git a/src/testbed_sdf.cu b/src/testbed_sdf.cu
index 780dfb20200a034040582e08de01036d263b6153..5a0065e8874545fddc6783ff3b0aa6dc057182d8 100644
--- a/src/testbed_sdf.cu
+++ b/src/testbed_sdf.cu
@@ -550,12 +550,13 @@ __global__ void init_rays_with_payload_kernel_sdf(
 
 	if (octree_nodes && !TriangleOctree::contains(octree_nodes, max_depth, ray.o)) {
 		t = max(0.0f, TriangleOctree::ray_intersect(octree_nodes, max_depth, ray.o, ray.d));
-		if (ray.o.y() > floor_y && ray.d.y()<0.f) {
-			float floor_dist = -(ray.o.y() - floor_y)/ray.d.y();
-			if (floor_dist>0.f) {
-				t=min(t,floor_dist);
+		if (ray.o.y() > floor_y && ray.d.y() < 0.f) {
+			float floor_dist = -(ray.o.y() - floor_y) / ray.d.y();
+			if (floor_dist > 0.f) {
+				t = min(t, floor_dist);
 			}
 		}
+
 		ray.o = ray.o + (t + 1e-6f) * ray.d;
 	}
 
@@ -566,15 +567,10 @@ __global__ void init_rays_with_payload_kernel_sdf(
 	}
 
 	SdfPayload& payload = payloads[idx];
-	if (!aabb.contains(ray.o)) {
-		payload.alive = false;
-		return;
-	}
-
 	payload.dir = ray.d;
 	payload.idx = idx;
 	payload.n_steps = 0;
-	payload.alive = true;
+	payload.alive = aabb.contains(ray.o);
 }
 
 __host__ __device__ uint32_t sample_discrete(float uniform_sample, const float* __restrict__ cdf, int length) {
@@ -591,7 +587,8 @@ __global__ void sample_uniform_on_triangle_kernel(uint32_t n_elements, const flo
 	sampled_positions[i] = triangles[tri_idx].sample_uniform_position(sample.tail<2>());
 }
 
-void Testbed::SphereTracer::init_rays_from_camera(uint32_t sample_index,
+void Testbed::SphereTracer::init_rays_from_camera(
+	uint32_t sample_index,
 	const Vector2i& resolution,
 	const Vector2f& focal_length,
 	const Matrix<float, 3, 4>& camera_matrix,
@@ -613,15 +610,15 @@ void Testbed::SphereTracer::init_rays_from_camera(uint32_t sample_index,
 ) {
 	// Make sure we have enough memory reserved to render at the requested resolution
 	size_t n_pixels = (size_t)resolution.x() * resolution.y();
-	enlarge(n_pixels);
+	enlarge(n_pixels, stream);
 
 	const dim3 threads = { 16, 8, 1 };
 	const dim3 blocks = { div_round_up((uint32_t)resolution.x(), threads.x), div_round_up((uint32_t)resolution.y(), threads.y), 1 };
 	init_rays_with_payload_kernel_sdf<<<blocks, threads, 0, stream>>>(
 		sample_index,
-		m_rays[0].pos.data(),
-		m_rays[0].distance.data(),
-		m_rays[0].payload.data(),
+		m_rays[0].pos,
+		m_rays[0].distance,
+		m_rays[0].payload,
 		resolution,
 		focal_length,
 		camera_matrix,
@@ -644,7 +641,7 @@ void Testbed::SphereTracer::init_rays_from_camera(uint32_t sample_index,
 }
 
 void Testbed::SphereTracer::init_rays_from_data(uint32_t n_elements, const RaysSdfSoa& data, cudaStream_t stream) {
-	enlarge(n_elements);
+	enlarge(n_elements, stream);
 	m_rays[0].copy_from_other_async(n_elements, data, stream);
 	m_n_rays_initialized = n_elements;
 }
@@ -658,11 +655,11 @@ uint32_t Testbed::SphereTracer::trace_bvh(TriangleBvh* bvh, const Triangle* tria
 	}
 
 	// Abuse the normal buffer to temporarily hold ray directions
-	parallel_for_gpu(stream, n_alive, [payloads=m_rays[0].payload.data(), normals=m_rays[0].normal.data()] __device__ (size_t i) {
+	parallel_for_gpu(stream, n_alive, [payloads=m_rays[0].payload, normals=m_rays[0].normal] __device__ (size_t i) {
 		normals[i] = payloads[i].dir;
 	});
 
-	bvh->ray_trace_gpu(n_alive, m_rays[0].pos.data(), m_rays[0].normal.data(), triangles, stream);
+	bvh->ray_trace_gpu(n_alive, m_rays[0].pos, m_rays[0].normal, triangles, stream);
 	return n_alive;
 }
 
@@ -681,7 +678,7 @@ uint32_t Testbed::SphereTracer::trace(
 		return 0;
 	}
 
-	CUDA_CHECK_THROW(cudaMemsetAsync(m_hit_counter.data(), 0, sizeof(uint32_t), stream));
+	CUDA_CHECK_THROW(cudaMemsetAsync(m_hit_counter, 0, sizeof(uint32_t), stream));
 
 	const uint32_t STEPS_INBETWEEN_COMPACTION = 4;
 
@@ -700,29 +697,29 @@ uint32_t Testbed::SphereTracer::trace(
 
 		// Compact rays that did not diverge yet
 		{
-			CUDA_CHECK_THROW(cudaMemsetAsync(m_alive_counter.data(), 0, sizeof(uint32_t), stream));
+			CUDA_CHECK_THROW(cudaMemsetAsync(m_alive_counter, 0, sizeof(uint32_t), stream));
 			if (m_trace_shadow_rays) {
 				linear_kernel(compact_kernel_shadow_sdf, 0, stream,
 					n_alive,
 					zero_offset,
-					rays_tmp.pos.data(), rays_tmp.distance.data(), rays_tmp.payload.data(), rays_tmp.prev_distance.data(), rays_tmp.total_distance.data(), rays_tmp.min_visibility.data(),
-					rays_current.pos.data(), rays_current.distance.data(), rays_current.payload.data(), rays_current.prev_distance.data(), rays_current.total_distance.data(), rays_current.min_visibility.data(),
-					m_rays_hit.pos.data(), m_rays_hit.distance.data(), m_rays_hit.payload.data(), m_rays_hit.prev_distance.data(), m_rays_hit.total_distance.data(), m_rays_hit.min_visibility.data(),
+					rays_tmp.pos, rays_tmp.distance, rays_tmp.payload, rays_tmp.prev_distance, rays_tmp.total_distance, rays_tmp.min_visibility,
+					rays_current.pos, rays_current.distance, rays_current.payload, rays_current.prev_distance, rays_current.total_distance, rays_current.min_visibility,
+					m_rays_hit.pos, m_rays_hit.distance, m_rays_hit.payload, m_rays_hit.prev_distance, m_rays_hit.total_distance, m_rays_hit.min_visibility,
 					aabb,
-					m_alive_counter.data(), m_hit_counter.data()
+					m_alive_counter, m_hit_counter
 				);
 			} else {
 				linear_kernel(compact_kernel_sdf, 0, stream,
 					n_alive,
 					zero_offset,
-					rays_tmp.pos.data(), rays_tmp.distance.data(), rays_tmp.payload.data(),
-					rays_current.pos.data(), rays_current.distance.data(), rays_current.payload.data(),
-					m_rays_hit.pos.data(), m_rays_hit.distance.data(), m_rays_hit.payload.data(),
+					rays_tmp.pos, rays_tmp.distance, rays_tmp.payload,
+					rays_current.pos, rays_current.distance, rays_current.payload,
+					m_rays_hit.pos, m_rays_hit.distance, m_rays_hit.payload,
 					aabb,
-					m_alive_counter.data(), m_hit_counter.data()
+					m_alive_counter, m_hit_counter
 				);
 			}
-			CUDA_CHECK_THROW(cudaMemcpyAsync(&n_alive, m_alive_counter.data(), sizeof(uint32_t), cudaMemcpyDeviceToHost, stream));
+			CUDA_CHECK_THROW(cudaMemcpyAsync(&n_alive, m_alive_counter, sizeof(uint32_t), cudaMemcpyDeviceToHost, stream));
 			CUDA_CHECK_THROW(cudaStreamSynchronize(stream));
 		}
 
@@ -735,9 +732,9 @@ uint32_t Testbed::SphereTracer::trace(
 			linear_kernel(advance_pos_kernel_sdf, 0, stream,
 				n_alive,
 				zero_offset,
-				rays_current.pos.data(),
-				rays_current.distance.data(),
-				rays_current.payload.data(),
+				rays_current.pos,
+				rays_current.distance,
+				rays_current.payload,
 				aabb,
 				floor_y,
 				octree ? octree->nodes_gpu() : nullptr,
@@ -745,9 +742,9 @@ uint32_t Testbed::SphereTracer::trace(
 				distance_scale,
 				maximum_distance,
 				m_shadow_sharpness,
-				m_trace_shadow_rays ? rays_current.prev_distance.data() : nullptr,
-				m_trace_shadow_rays ? rays_current.total_distance.data() : nullptr,
-				m_trace_shadow_rays ? rays_current.min_visibility.data() : nullptr
+				m_trace_shadow_rays ? rays_current.prev_distance : nullptr,
+				m_trace_shadow_rays ? rays_current.total_distance : nullptr,
+				m_trace_shadow_rays ? rays_current.min_visibility : nullptr
 			);
 		}
 
@@ -755,36 +752,67 @@ uint32_t Testbed::SphereTracer::trace(
 	}
 
 	uint32_t n_hit;
-	CUDA_CHECK_THROW(cudaMemcpyAsync(&n_hit, m_hit_counter.data(), sizeof(uint32_t), cudaMemcpyDeviceToHost, stream));
+	CUDA_CHECK_THROW(cudaMemcpyAsync(&n_hit, m_hit_counter, sizeof(uint32_t), cudaMemcpyDeviceToHost, stream));
 	CUDA_CHECK_THROW(cudaStreamSynchronize(stream));
 	return n_hit;
 }
 
-void Testbed::SphereTracer::enlarge(size_t n_elements) {
+void Testbed::SphereTracer::enlarge(size_t n_elements, cudaStream_t stream) {
 	n_elements = next_multiple(n_elements, size_t(tcnn::batch_size_granularity));
-	m_rays[0].enlarge(n_elements);
-	m_rays[1].enlarge(n_elements);
-	m_rays_hit.enlarge(n_elements);
+	auto scratch = allocate_workspace_and_distribute<
+		Vector3f, Vector3f, float, float, float, float, SdfPayload, // m_rays[0]
+		Vector3f, Vector3f, float, float, float, float, SdfPayload, // m_rays[1]
+		Vector3f, Vector3f, float, float, float, float, SdfPayload, // m_rays_hit
+
+		uint32_t,
+		uint32_t
+	>(
+		stream, &m_scratch_alloc,
+		n_elements, n_elements, n_elements, n_elements, n_elements, n_elements, n_elements,
+		n_elements, n_elements, n_elements, n_elements, n_elements, n_elements, n_elements,
+		n_elements, n_elements, n_elements, n_elements, n_elements, n_elements, n_elements,
+		32, // 2 full cache lines to ensure no overlap
+		32  // 2 full cache lines to ensure no overlap
+	);
+
+	m_rays[0].set(std::get<0>(scratch), std::get<1>(scratch), std::get<2>(scratch), std::get<3>(scratch), std::get<4>(scratch), std::get<5>(scratch), std::get<6>(scratch));
+	m_rays[1].set(std::get<7>(scratch), std::get<8>(scratch), std::get<9>(scratch), std::get<10>(scratch), std::get<11>(scratch), std::get<12>(scratch), std::get<13>(scratch));
+	m_rays_hit.set(std::get<14>(scratch), std::get<15>(scratch), std::get<16>(scratch), std::get<17>(scratch), std::get<18>(scratch), std::get<19>(scratch), std::get<20>(scratch));
+
+	m_hit_counter = std::get<21>(scratch);
+	m_alive_counter = std::get<22>(scratch);
 }
 
-void Testbed::FiniteDifferenceNormalsApproximator::enlarge(uint32_t n_elements) {
-	dx.enlarge(n_elements);
-	dy.enlarge(n_elements);
-	dz.enlarge(n_elements);
+void Testbed::FiniteDifferenceNormalsApproximator::enlarge(uint32_t n_elements, cudaStream_t stream) {
+	n_elements = next_multiple(n_elements, tcnn::batch_size_granularity);
+	auto scratch = allocate_workspace_and_distribute<
+		Vector3f, Vector3f, Vector3f,
+		float, float, float,
+		float, float, float
+	>(
+		stream, &m_scratch_alloc,
+		n_elements, n_elements, n_elements,
+		n_elements, n_elements, n_elements,
+		n_elements, n_elements, n_elements
+	);
+
+	dx = std::get<0>(scratch);
+	dy = std::get<1>(scratch);
+	dz = std::get<2>(scratch);
 
-	dist_dx_pos.enlarge(n_elements);
-	dist_dy_pos.enlarge(n_elements);
-	dist_dz_pos.enlarge(n_elements);
+	dist_dx_pos = std::get<3>(scratch);
+	dist_dy_pos = std::get<4>(scratch);
+	dist_dz_pos = std::get<5>(scratch);
 
-	dist_dx_neg.enlarge(n_elements);
-	dist_dy_neg.enlarge(n_elements);
-	dist_dz_neg.enlarge(n_elements);
+	dist_dx_neg = std::get<6>(scratch);
+	dist_dy_neg = std::get<7>(scratch);
+	dist_dz_neg = std::get<8>(scratch);
 }
 
-void Testbed::FiniteDifferenceNormalsApproximator::normal(uint32_t n_elements, const distance_fun_t& distance_function, GPUMemory<Vector3f>& pos, GPUMemory<Vector3f>& normal, float epsilon, cudaStream_t stream) {
-	enlarge(n_elements);
+void Testbed::FiniteDifferenceNormalsApproximator::normal(uint32_t n_elements, const distance_fun_t& distance_function, const Vector3f* pos, Vector3f* normal, float epsilon, cudaStream_t stream) {
+	enlarge(n_elements, stream);
 
-	parallel_for_gpu(stream, n_elements, [pos=pos.data(), dx=dx.data(), dy=dy.data(), dz=dz.data(), epsilon] __device__ (size_t i) {
+	parallel_for_gpu(stream, n_elements, [pos=pos, dx=dx, dy=dy, dz=dz, epsilon] __device__ (size_t i) {
 		Vector3f p = pos[i];
 		dx[i] = Vector3f{p.x() + epsilon, p.y(), p.z()};
 		dy[i] = Vector3f{p.x(), p.y() + epsilon, p.z()};
@@ -795,7 +823,7 @@ void Testbed::FiniteDifferenceNormalsApproximator::normal(uint32_t n_elements, c
 	distance_function(n_elements, dy, dist_dy_pos, stream);
 	distance_function(n_elements, dz, dist_dz_pos, stream);
 
-	parallel_for_gpu(stream, n_elements, [pos=pos.data(), dx=dx.data(), dy=dy.data(), dz=dz.data(), epsilon] __device__ (size_t i) {
+	parallel_for_gpu(stream, n_elements, [pos=pos, dx=dx, dy=dy, dz=dz, epsilon] __device__ (size_t i) {
 		Vector3f p = pos[i];
 		dx[i] = Vector3f{p.x() - epsilon, p.y(), p.z()};
 		dy[i] = Vector3f{p.x(), p.y() - epsilon, p.z()};
@@ -806,7 +834,7 @@ void Testbed::FiniteDifferenceNormalsApproximator::normal(uint32_t n_elements, c
 	distance_function(n_elements, dy, dist_dy_neg, stream);
 	distance_function(n_elements, dz, dist_dz_neg, stream);
 
-	parallel_for_gpu(stream, n_elements, [normal=normal.data(), dist_dx_pos=dist_dx_pos.data(), dist_dx_neg=dist_dx_neg.data(), dist_dy_pos=dist_dy_pos.data(), dist_dy_neg=dist_dy_neg.data(), dist_dz_pos=dist_dz_pos.data(), dist_dz_neg=dist_dz_neg.data()] __device__ (size_t i) {
+	parallel_for_gpu(stream, n_elements, [normal=normal, dist_dx_pos=dist_dx_pos, dist_dx_neg=dist_dx_neg, dist_dy_pos=dist_dy_pos, dist_dy_neg=dist_dy_neg, dist_dz_pos=dist_dz_pos, dist_dz_neg=dist_dz_neg] __device__ (size_t i) {
 		normal[i] = {dist_dx_pos[i] - dist_dx_neg[i], dist_dy_pos[i] - dist_dy_neg[i], dist_dz_pos[i] - dist_dz_neg[i]};
 	});
 }
@@ -827,8 +855,7 @@ void Testbed::render_sdf(
 	}
 	auto* octree_ptr = m_sdf.uses_takikawa_encoding || m_sdf.use_triangle_octree ? m_sdf.triangle_octree.get() : nullptr;
 
-	// Reserve the memory for max-res rendering to prevent stuttering
-	m_sdf.tracer.enlarge(max_res.x() * max_res.y());
+	SphereTracer tracer;
 
 	uint32_t n_octree_levels = octree_ptr ? octree_ptr->depth() : 0;
 	if (m_render_ground_truth && m_sdf.groundtruth_mode == ESDFGroundTruthMode::SDFBricks) {
@@ -837,7 +864,7 @@ void Testbed::render_sdf(
 
 	BoundingBox sdf_bounding_box = m_aabb;
 	sdf_bounding_box.inflate(m_sdf.zero_offset);
-	m_sdf.tracer.init_rays_from_camera(
+	tracer.init_rays_from_camera(
 		render_buffer.spp(),
 		render_buffer.in_resolution(),
 		focal_length,
@@ -881,22 +908,22 @@ void Testbed::render_sdf(
 
 	uint32_t n_hit;
 	if (m_render_mode == ERenderMode::Slice) {
-		n_hit = m_sdf.tracer.n_rays_initialized();
+		n_hit = tracer.n_rays_initialized();
 	} else {
-		n_hit = trace(m_sdf.tracer);
+		n_hit = trace(tracer);
 	}
-	RaysSdfSoa& rays_hit = m_render_mode == ERenderMode::Slice || gt_raytrace ? m_sdf.tracer.rays_init() : m_sdf.tracer.rays_hit();
+	RaysSdfSoa& rays_hit = m_render_mode == ERenderMode::Slice || gt_raytrace ? tracer.rays_init() : tracer.rays_hit();
 
 	if (m_render_mode == ERenderMode::Slice) {
 		if (m_visualized_dimension == -1) {
 			distance_function(n_hit, rays_hit.pos, rays_hit.distance, stream);
-			extract_dimension_pos_neg_kernel<float><<<n_blocks_linear(n_hit*3), n_threads_linear, 0, stream>>>(n_hit*3, 0, 1, 3, rays_hit.distance.data(), CM, (float*)rays_hit.normal.data());
+			extract_dimension_pos_neg_kernel<float><<<n_blocks_linear(n_hit*3), n_threads_linear, 0, stream>>>(n_hit*3, 0, 1, 3, rays_hit.distance, CM, (float*)rays_hit.normal);
 		} else {
 			// Store colors in the normal buffer
 			uint32_t n_elements = next_multiple(n_hit, tcnn::batch_size_granularity);
 
-			GPUMatrix<float> positions_matrix((float*)rays_hit.pos.data(), 3, n_elements);
-			GPUMatrix<float> colors_matrix((float*)rays_hit.normal.data(), 3, n_elements);
+			GPUMatrix<float> positions_matrix((float*)rays_hit.pos, 3, n_elements);
+			GPUMatrix<float> colors_matrix((float*)rays_hit.normal, 3, n_elements);
 			m_network->visualize_activation(stream, m_visualized_layer, m_visualized_dimension, positions_matrix, colors_matrix);
 		}
 	}
@@ -906,45 +933,47 @@ void Testbed::render_sdf(
 		if (m_sdf.analytic_normals || gt_raytrace) {
 			normals_function(n_hit, rays_hit.pos, rays_hit.normal, stream);
 		} else {
-			// Prevent spurious enlargements by reserving enough memory to hold a full-res image in any case.
-			m_sdf.fd_normals.enlarge(render_buffer.in_resolution().x() * render_buffer.in_resolution().y());
 			float fd_normals_epsilon = m_sdf.fd_normals_epsilon;
 			if (m_render_ground_truth && m_sdf.groundtruth_mode == ESDFGroundTruthMode::SDFBricks && m_sdf.brick_smooth_normals) {
 				fd_normals_epsilon = exp2f(-float(n_octree_levels)) * (1.f/(m_sdf.brick_res-1)); // in sdf brick mode, use one voxel as the normal central difference radius
 			}
-			m_sdf.fd_normals.normal(n_hit, distance_function, rays_hit.pos, rays_hit.normal, fd_normals_epsilon, stream);
+
+			FiniteDifferenceNormalsApproximator fd_normals;
+			fd_normals.normal(n_hit, distance_function, rays_hit.pos, rays_hit.normal, fd_normals_epsilon, stream);
 		}
 
 		if (render_mode == ERenderMode::Shade && n_hit > 0) {
 			// Shadow rays towards the sun
-			m_sdf.shadow_tracer.init_rays_from_data(n_hit, rays_hit, stream);
-			m_sdf.shadow_tracer.set_trace_shadow_rays(true);
-			m_sdf.shadow_tracer.set_shadow_sharpness(m_sdf.shadow_sharpness);
-			RaysSdfSoa& shadow_rays_init = m_sdf.shadow_tracer.rays_init();
+			SphereTracer shadow_tracer;
+
+			shadow_tracer.init_rays_from_data(n_hit, rays_hit, stream);
+			shadow_tracer.set_trace_shadow_rays(true);
+			shadow_tracer.set_shadow_sharpness(m_sdf.shadow_sharpness);
+			RaysSdfSoa& shadow_rays_init = shadow_tracer.rays_init();
 			linear_kernel(prepare_shadow_rays, 0, stream,
 				n_hit,
 				m_sun_dir.normalized(),
-				shadow_rays_init.pos.data(),
-				shadow_rays_init.normal.data(),
-				shadow_rays_init.distance.data(),
-				shadow_rays_init.prev_distance.data(),
-				shadow_rays_init.total_distance.data(),
-				shadow_rays_init.min_visibility.data(),
-				shadow_rays_init.payload.data(),
+				shadow_rays_init.pos,
+				shadow_rays_init.normal,
+				shadow_rays_init.distance,
+				shadow_rays_init.prev_distance,
+				shadow_rays_init.total_distance,
+				shadow_rays_init.min_visibility,
+				shadow_rays_init.payload,
 				sdf_bounding_box,
 				octree_ptr ? octree_ptr->nodes_gpu() : nullptr,
 				n_octree_levels
 			);
-			uint32_t n_hit_shadow = trace(m_sdf.shadow_tracer);
-			auto& shadow_rays_hit = gt_raytrace ? m_sdf.shadow_tracer.rays_init() : m_sdf.shadow_tracer.rays_hit();
+			uint32_t n_hit_shadow = trace(shadow_tracer);
+			auto& shadow_rays_hit = gt_raytrace ? shadow_tracer.rays_init() : shadow_tracer.rays_hit();
 
 			linear_kernel(write_shadow_ray_result, 0, stream,
 				n_hit_shadow,
 				sdf_bounding_box,
-				shadow_rays_hit.pos.data(),
-				shadow_rays_hit.payload.data(),
-				shadow_rays_hit.min_visibility.data(),
-				rays_hit.distance.data()
+				shadow_rays_hit.pos,
+				shadow_rays_hit.payload,
+				shadow_rays_hit.min_visibility,
+				rays_hit.distance
 			);
 
 			// todo: Reflection rays?
@@ -953,8 +982,8 @@ void Testbed::render_sdf(
 		// HACK: Store colors temporarily in the normal buffer
 		uint32_t n_elements = next_multiple(n_hit, tcnn::batch_size_granularity);
 
-		GPUMatrix<float> positions_matrix((float*)rays_hit.pos.data(), 3, n_elements);
-		GPUMatrix<float> colors_matrix((float*)rays_hit.normal.data(), 3, n_elements);
+		GPUMatrix<float> positions_matrix((float*)rays_hit.pos, 3, n_elements);
+		GPUMatrix<float> colors_matrix((float*)rays_hit.normal, 3, n_elements);
 		m_network->visualize_activation(stream, m_visualized_layer, m_visualized_dimension, positions_matrix, colors_matrix);
 	}
 
@@ -967,17 +996,18 @@ void Testbed::render_sdf(
 		m_sun_dir.normalized(),
 		m_up_dir.normalized(),
 		camera_matrix,
-		rays_hit.pos.data(),
-		rays_hit.normal.data(),
-		rays_hit.distance.data(),
-		rays_hit.payload.data(),
+		rays_hit.pos,
+		rays_hit.normal,
+		rays_hit.distance,
+		rays_hit.payload,
 		render_buffer.frame_buffer(),
 		render_buffer.depth_buffer()
 	);
 
 	if (render_mode == ERenderMode::Cost) {
 		std::vector<SdfPayload> payloads_final_cpu(n_hit);
-		rays_hit.payload.copy_to_host(payloads_final_cpu, n_hit);
+		CUDA_CHECK_THROW(cudaMemcpyAsync(payloads_final_cpu.data(), rays_hit.payload, n_hit * sizeof(SdfPayload), cudaMemcpyDeviceToHost, stream));
+		CUDA_CHECK_THROW(cudaStreamSynchronize(stream));
 		size_t total_n_steps = 0;
 		for (uint32_t i = 0; i < n_hit; ++i) {
 			total_n_steps += payloads_final_cpu[i].n_steps;
diff --git a/src/thread_pool.cpp b/src/thread_pool.cpp
index 17551b101774bf580606de4a8e0de697b6cf6637..0289db40e1d14c2144cda61f3dcfa227ba1e0158 100644
--- a/src/thread_pool.cpp
+++ b/src/thread_pool.cpp
@@ -28,7 +28,6 @@ ThreadPool::ThreadPool(size_t maxNumThreads, bool force) {
 		maxNumThreads = min((size_t)thread::hardware_concurrency(), maxNumThreads);
 	}
 	startThreads(maxNumThreads);
-	mNumTasksInSystem.store(0);
 }
 
 ThreadPool::~ThreadPool() {
@@ -59,16 +58,6 @@ void ThreadPool::startThreads(size_t num) {
 				lock.unlock();
 
 				task();
-
-				mNumTasksInSystem--;
-
-				{
-					unique_lock<mutex> localLock{mSystemBusyMutex};
-
-					if (mNumTasksInSystem == 0) {
-						mSystemBusyCondition.notify_all();
-					}
-				}
 			}
 		});
 	}
@@ -90,30 +79,16 @@ void ThreadPool::shutdownThreads(size_t num) {
 	}
 }
 
-void ThreadPool::waitUntilFinished() {
-	unique_lock<mutex> lock{mSystemBusyMutex};
-
-	if (mNumTasksInSystem == 0) {
-		return;
-	}
-
-	mSystemBusyCondition.wait(lock);
-}
-
-void ThreadPool::waitUntilFinishedFor(const chrono::microseconds Duration) {
-	unique_lock<mutex> lock{mSystemBusyMutex};
-
-	if (mNumTasksInSystem == 0) {
-		return;
+void ThreadPool::setNThreads(size_t num) {
+	if (mNumThreads > num) {
+		shutdownThreads(mNumThreads - num);
+	} else if (mNumThreads < num) {
+		startThreads(num - mNumThreads);
 	}
-
-	mSystemBusyCondition.wait_for(lock, Duration);
 }
 
 void ThreadPool::flushQueue() {
 	lock_guard<mutex> lock{mTaskQueueMutex};
-
-	mNumTasksInSystem -= mTaskQueue.size();
 	mTaskQueue.clear();
 }