diff --git a/include/neural-graphics-primitives/sdf.h b/include/neural-graphics-primitives/sdf.h
index ddef63cab37c9824bfeaa5cb5bdf0a58e8e4c6e0..07a99bbd90e17dbac43d9875a9e6dc54c96ff489 100644
--- a/include/neural-graphics-primitives/sdf.h
+++ b/include/neural-graphics-primitives/sdf.h
@@ -29,34 +29,34 @@ struct SdfPayload {
 
 struct RaysSdfSoa {
 #if defined(__NVCC__) || (defined(__clang__) && defined(__CUDA__))
-	void enlarge(size_t n_elements) {
-		pos.enlarge(n_elements);
-		normal.enlarge(n_elements);
-		distance.enlarge(n_elements);
-		prev_distance.enlarge(n_elements);
-		total_distance.enlarge(n_elements);
-		min_visibility.enlarge(n_elements);
-		payload.enlarge(n_elements);
-	}
-
 	void copy_from_other_async(uint32_t n_elements, const RaysSdfSoa& other, cudaStream_t stream) {
-		CUDA_CHECK_THROW(cudaMemcpyAsync(pos.data(), other.pos.data(), n_elements * sizeof(Eigen::Vector3f), cudaMemcpyDeviceToDevice, stream));
-		CUDA_CHECK_THROW(cudaMemcpyAsync(normal.data(), other.normal.data(), n_elements * sizeof(Eigen::Vector3f), cudaMemcpyDeviceToDevice, stream));
-		CUDA_CHECK_THROW(cudaMemcpyAsync(distance.data(), other.distance.data(), n_elements * sizeof(float), cudaMemcpyDeviceToDevice, stream));
-		CUDA_CHECK_THROW(cudaMemcpyAsync(prev_distance.data(), other.prev_distance.data(), n_elements * sizeof(float), cudaMemcpyDeviceToDevice, stream));
-		CUDA_CHECK_THROW(cudaMemcpyAsync(total_distance.data(), other.total_distance.data(), n_elements * sizeof(float), cudaMemcpyDeviceToDevice, stream));
-		CUDA_CHECK_THROW(cudaMemcpyAsync(min_visibility.data(), other.min_visibility.data(), n_elements * sizeof(float), cudaMemcpyDeviceToDevice, stream));
-		CUDA_CHECK_THROW(cudaMemcpyAsync(payload.data(), other.payload.data(), n_elements * sizeof(SdfPayload), cudaMemcpyDeviceToDevice, stream));
+		CUDA_CHECK_THROW(cudaMemcpyAsync(pos, other.pos, n_elements * sizeof(Eigen::Vector3f), cudaMemcpyDeviceToDevice, stream));
+		CUDA_CHECK_THROW(cudaMemcpyAsync(normal, other.normal, n_elements * sizeof(Eigen::Vector3f), cudaMemcpyDeviceToDevice, stream));
+		CUDA_CHECK_THROW(cudaMemcpyAsync(distance, other.distance, n_elements * sizeof(float), cudaMemcpyDeviceToDevice, stream));
+		CUDA_CHECK_THROW(cudaMemcpyAsync(prev_distance, other.prev_distance, n_elements * sizeof(float), cudaMemcpyDeviceToDevice, stream));
+		CUDA_CHECK_THROW(cudaMemcpyAsync(total_distance, other.total_distance, n_elements * sizeof(float), cudaMemcpyDeviceToDevice, stream));
+		CUDA_CHECK_THROW(cudaMemcpyAsync(min_visibility, other.min_visibility, n_elements * sizeof(float), cudaMemcpyDeviceToDevice, stream));
+		CUDA_CHECK_THROW(cudaMemcpyAsync(payload, other.payload, n_elements * sizeof(SdfPayload), cudaMemcpyDeviceToDevice, stream));
 	}
 #endif
 
-	tcnn::GPUMemory<Eigen::Vector3f> pos;
-	tcnn::GPUMemory<Eigen::Vector3f> normal;
-	tcnn::GPUMemory<float> distance;
-	tcnn::GPUMemory<float> prev_distance;
-	tcnn::GPUMemory<float> total_distance;
-	tcnn::GPUMemory<float> min_visibility;
-	tcnn::GPUMemory<SdfPayload> payload;
+	void set(Eigen::Vector3f* pos, Eigen::Vector3f* normal, float* distance, float* prev_distance, float* total_distance, float* min_visibility, SdfPayload* payload) {
+		this->pos = pos;
+		this->normal = normal;
+		this->distance = distance;
+		this->prev_distance = prev_distance;
+		this->total_distance = total_distance;
+		this->min_visibility = min_visibility;
+		this->payload = payload;
+	}
+
+	Eigen::Vector3f* pos;
+	Eigen::Vector3f* normal;
+	float* distance;
+	float* prev_distance;
+	float* total_distance;
+	float* min_visibility;
+	SdfPayload* payload;
 };
 
 struct BRDFParams {
diff --git a/include/neural-graphics-primitives/testbed.h b/include/neural-graphics-primitives/testbed.h
index 9e40b6a3a16e805f65e80bdc15419a8b4ae7ffae..8e55eeb08c90198e5b45320549e89a0b789dbe82 100644
--- a/include/neural-graphics-primitives/testbed.h
+++ b/include/neural-graphics-primitives/testbed.h
@@ -69,12 +69,12 @@ public:
 	void load_training_data(const std::string& data_path);
 	void clear_training_data();
 
-	using distance_fun_t = std::function<void(uint32_t, const tcnn::GPUMemory<Eigen::Vector3f>&, tcnn::GPUMemory<float>&, cudaStream_t)>;
-	using normals_fun_t = std::function<void(uint32_t, const tcnn::GPUMemory<Eigen::Vector3f>&, tcnn::GPUMemory<Eigen::Vector3f>&, cudaStream_t)>;
+	using distance_fun_t = std::function<void(uint32_t, const Eigen::Vector3f*, float*, cudaStream_t)>;
+	using normals_fun_t = std::function<void(uint32_t, const Eigen::Vector3f*, Eigen::Vector3f*, cudaStream_t)>;
 
 	class SphereTracer {
 	public:
-		SphereTracer() : m_hit_counter(1), m_alive_counter(1) {}
+		SphereTracer() {}
 
 		void init_rays_from_camera(
 			uint32_t spp,
@@ -111,7 +111,7 @@ public:
 			uint32_t n_octree_levels,
 			cudaStream_t stream
 		);
-		void enlarge(size_t n_elements);
+		void enlarge(size_t n_elements, cudaStream_t stream);
 		RaysSdfSoa& rays_hit() { return m_rays_hit; }
 		RaysSdfSoa& rays_init() { return m_rays[0];	}
 		uint32_t n_rays_initialized() const { return m_n_rays_initialized; }
@@ -120,16 +120,19 @@ public:
 	private:
 		RaysSdfSoa m_rays[2];
 		RaysSdfSoa m_rays_hit;
-		tcnn::GPUMemory<uint32_t> m_hit_counter;
-		tcnn::GPUMemory<uint32_t> m_alive_counter;
+		uint32_t* m_hit_counter;
+		uint32_t* m_alive_counter;
+
 		uint32_t m_n_rays_initialized = 0;
 		float m_shadow_sharpness = 2048.f;
 		bool m_trace_shadow_rays = false;
+
+		tcnn::GPUMemoryArena::Allocation m_scratch_alloc;
 	};
 
 	class NerfTracer {
 	public:
-		NerfTracer() : m_hit_counter(1), m_alive_counter(1) {}
+		NerfTracer() {}
 
 		void init_rays_from_camera(
 			uint32_t spp,
@@ -193,38 +196,36 @@ public:
 		RaysNerfSoa& rays_init() { return m_rays[0]; }
 		uint32_t n_rays_initialized() const { return m_n_rays_initialized; }
 
-		void clear() {
-			m_scratch_alloc = {};
-		}
-
 	private:
 		RaysNerfSoa m_rays[2];
 		RaysNerfSoa m_rays_hit;
 		precision_t* m_network_output;
 		float* m_network_input;
-		tcnn::GPUMemory<uint32_t> m_hit_counter;
-		tcnn::GPUMemory<uint32_t> m_alive_counter;
+		uint32_t* m_hit_counter;
+		uint32_t* m_alive_counter;
 		uint32_t m_n_rays_initialized = 0;
 		tcnn::GPUMemoryArena::Allocation m_scratch_alloc;
 	};
 
 	class FiniteDifferenceNormalsApproximator {
 	public:
-		void enlarge(uint32_t n_elements);
-		void normal(uint32_t n_elements, const distance_fun_t& distance_function, tcnn::GPUMemory<Eigen::Vector3f>& pos, tcnn::GPUMemory<Eigen::Vector3f>& normal, float epsilon, cudaStream_t stream);
+		void enlarge(uint32_t n_elements, cudaStream_t stream);
+		void normal(uint32_t n_elements, const distance_fun_t& distance_function, const Eigen::Vector3f* pos, Eigen::Vector3f* normal, float epsilon, cudaStream_t stream);
 
 	private:
-		tcnn::GPUMemory<Eigen::Vector3f> dx;
-		tcnn::GPUMemory<Eigen::Vector3f> dy;
-		tcnn::GPUMemory<Eigen::Vector3f> dz;
+		Eigen::Vector3f* dx;
+		Eigen::Vector3f* dy;
+		Eigen::Vector3f* dz;
 
-		tcnn::GPUMemory<float> dist_dx_pos;
-		tcnn::GPUMemory<float> dist_dy_pos;
-		tcnn::GPUMemory<float> dist_dz_pos;
+		float* dist_dx_pos;
+		float* dist_dy_pos;
+		float* dist_dz_pos;
 
-		tcnn::GPUMemory<float> dist_dx_neg;
-		tcnn::GPUMemory<float> dist_dy_neg;
-		tcnn::GPUMemory<float> dist_dz_neg;
+		float* dist_dx_neg;
+		float* dist_dy_neg;
+		float* dist_dz_neg;
+
+		tcnn::GPUMemoryArena::Allocation m_scratch_alloc;
 	};
 
 	struct LevelStats {
@@ -532,8 +533,6 @@ public:
 	bool m_gui_redraw = true;
 
 	struct Nerf {
-		NerfTracer tracer;
-
 		struct Training {
 			NerfDataset dataset;
 			int n_images_for_training = 0; // how many images to train from, as a high watermark compared to the dataset size
@@ -640,9 +639,6 @@ public:
 
 		uint32_t max_cascade = 0;
 
-		tcnn::GPUMemory<float> vis_input;
-		tcnn::GPUMemory<Eigen::Array4f> vis_rgba;
-
 		ENerfActivation rgb_activation = ENerfActivation::Exponential;
 		ENerfActivation density_activation = ENerfActivation::Exponential;
 
@@ -666,8 +662,6 @@ public:
 	} m_nerf;
 
 	struct Sdf {
-		SphereTracer tracer;
-		SphereTracer shadow_tracer;
 		float shadow_sharpness = 2048.0f;
 		float maximum_distance = 0.00005f;
 		float fd_normals_epsilon = 0.0005f;
@@ -676,8 +670,6 @@ public:
 
 		BRDFParams brdf;
 
-		FiniteDifferenceNormalsApproximator fd_normals;
-
 		// Mesh data
 		EMeshSdfMode mesh_sdf_mode = EMeshSdfMode::Raystab;
 		float mesh_scale;
@@ -764,7 +756,7 @@ public:
 		tcnn::GPUMemory<char> nanovdb_grid;
 		tcnn::GPUMemory<uint8_t> bitgrid;
 		float global_majorant = 1.f;
-		Eigen::Vector3f world2index_offset = {0,0,0};
+		Eigen::Vector3f world2index_offset = {0, 0, 0};
 		float world2index_scale = 1.f;
 
 		struct Training {
diff --git a/src/testbed.cu b/src/testbed.cu
index 57ea85330703b0b5bcbc19e300b0037232f7e524..6b4388169f206d4b577dfb9b616c70283dca8c74 100644
--- a/src/testbed.cu
+++ b/src/testbed.cu
@@ -2710,7 +2710,7 @@ void Testbed::render_frame(const Matrix<float, 3, 4>& camera_matrix0, const Matr
 					}
 				}
 				distance_fun_t distance_fun =
-					m_render_ground_truth ? (distance_fun_t)[&](uint32_t n_elements, const GPUMemory<Vector3f>& positions, GPUMemory<float>& distances, cudaStream_t stream) {
+					m_render_ground_truth ? (distance_fun_t)[&](uint32_t n_elements, const Vector3f* positions, float* distances, cudaStream_t stream) {
 						if (n_elements == 0) {
 							return;
 						}
@@ -2730,35 +2730,35 @@ void Testbed::render_frame(const Matrix<float, 3, 4>& camera_matrix0, const Matr
 							m_sdf.triangle_bvh->signed_distance_gpu(
 								n_elements,
 								m_sdf.mesh_sdf_mode,
-								(Vector3f*)positions.data(),
-								distances.data(),
+								(Vector3f*)positions,
+								distances,
 								m_sdf.triangles_gpu.data(),
 								false,
 								m_stream.get()
 							);
 						}
-					} : (distance_fun_t)[&](uint32_t n_elements, const GPUMemory<Vector3f>& positions, GPUMemory<float>& distances, cudaStream_t stream) {
+					} : (distance_fun_t)[&](uint32_t n_elements, const Vector3f* positions, float* distances, cudaStream_t stream) {
 						if (n_elements == 0) {
 							return;
 						}
 						n_elements = next_multiple(n_elements, tcnn::batch_size_granularity);
-						GPUMatrix<float> positions_matrix((float*)positions.data(), 3, n_elements);
-						GPUMatrix<float, RM> distances_matrix(distances.data(), 1, n_elements);
+						GPUMatrix<float> positions_matrix((float*)positions, 3, n_elements);
+						GPUMatrix<float, RM> distances_matrix(distances, 1, n_elements);
 						m_network->inference(stream, positions_matrix, distances_matrix);
 					};
 
 				normals_fun_t normals_fun =
-					m_render_ground_truth ? (normals_fun_t)[&](uint32_t n_elements, const GPUMemory<Vector3f>& positions, GPUMemory<Vector3f>& normals, cudaStream_t stream) {
+					m_render_ground_truth ? (normals_fun_t)[&](uint32_t n_elements, const Vector3f* positions, Vector3f* normals, cudaStream_t stream) {
 						// NO-OP. Normals will automatically be populated by raytrace
-					} : (normals_fun_t)[&](uint32_t n_elements, const GPUMemory<Vector3f>& positions, GPUMemory<Vector3f>& normals, cudaStream_t stream) {
+					} : (normals_fun_t)[&](uint32_t n_elements, const Vector3f* positions, Vector3f* normals, cudaStream_t stream) {
 						if (n_elements == 0) {
 							return;
 						}
 
 						n_elements = next_multiple(n_elements, tcnn::batch_size_granularity);
 
-						GPUMatrix<float> positions_matrix((float*)positions.data(), 3, n_elements);
-						GPUMatrix<float> normals_matrix((float*)normals.data(), 3, n_elements);
+						GPUMatrix<float> positions_matrix((float*)positions, 3, n_elements);
+						GPUMatrix<float> normals_matrix((float*)normals, 3, n_elements);
 						m_network->input_gradient(stream, 0, positions_matrix, normals_matrix);
 					};
 
diff --git a/src/testbed_nerf.cu b/src/testbed_nerf.cu
index 8390e34d7c3401026aac7eaa884308390944ac6d..2a1d50d045028b92b609bdcb7d75588ec3139dfb 100644
--- a/src/testbed_nerf.cu
+++ b/src/testbed_nerf.cu
@@ -2081,7 +2081,7 @@ uint32_t Testbed::NerfTracer::trace(
 		return 0;
 	}
 
-	CUDA_CHECK_THROW(cudaMemsetAsync(m_hit_counter.data(), 0, sizeof(uint32_t), stream));
+	CUDA_CHECK_THROW(cudaMemsetAsync(m_hit_counter, 0, sizeof(uint32_t), stream));
 
 	uint32_t n_alive = m_n_rays_initialized;
 	// m_n_rays_initialized = 0;
@@ -2095,15 +2095,15 @@ uint32_t Testbed::NerfTracer::trace(
 
 		// Compact rays that did not diverge yet
 		{
-			CUDA_CHECK_THROW(cudaMemsetAsync(m_alive_counter.data(), 0, sizeof(uint32_t), stream));
+			CUDA_CHECK_THROW(cudaMemsetAsync(m_alive_counter, 0, sizeof(uint32_t), stream));
 			linear_kernel(compact_kernel_nerf, 0, stream,
 				n_alive,
 				rays_tmp.rgba, rays_tmp.depth, rays_tmp.payload,
 				rays_current.rgba, rays_current.depth, rays_current.payload,
 				m_rays_hit.rgba, m_rays_hit.depth, m_rays_hit.payload,
-				m_alive_counter.data(), m_hit_counter.data()
+				m_alive_counter, m_hit_counter
 			);
-			CUDA_CHECK_THROW(cudaMemcpyAsync(&n_alive, m_alive_counter.data(), sizeof(uint32_t), cudaMemcpyDeviceToHost, stream));
+			CUDA_CHECK_THROW(cudaMemcpyAsync(&n_alive, m_alive_counter, sizeof(uint32_t), cudaMemcpyDeviceToHost, stream));
 			CUDA_CHECK_THROW(cudaStreamSynchronize(stream));
 		}
 
@@ -2172,7 +2172,7 @@ uint32_t Testbed::NerfTracer::trace(
 	}
 
 	uint32_t n_hit;
-	CUDA_CHECK_THROW(cudaMemcpyAsync(&n_hit, m_hit_counter.data(), sizeof(uint32_t), cudaMemcpyDeviceToHost, stream));
+	CUDA_CHECK_THROW(cudaMemcpyAsync(&n_hit, m_hit_counter, sizeof(uint32_t), cudaMemcpyDeviceToHost, stream));
 	CUDA_CHECK_THROW(cudaStreamSynchronize(stream));
 	return n_hit;
 }
@@ -2186,14 +2186,18 @@ void Testbed::NerfTracer::enlarge(size_t n_elements, uint32_t padded_output_widt
 		Array4f, float, NerfPayload, // m_rays_hit
 
 		network_precision_t,
-		float
+		float,
+		uint32_t,
+		uint32_t
 	>(
 		stream, &m_scratch_alloc,
 		n_elements, n_elements, n_elements,
 		n_elements, n_elements, n_elements,
 		n_elements, n_elements, n_elements,
 		n_elements * MAX_STEPS_INBETWEEN_COMPACTION * padded_output_width,
-		n_elements * MAX_STEPS_INBETWEEN_COMPACTION * num_floats
+		n_elements * MAX_STEPS_INBETWEEN_COMPACTION * num_floats,
+		32, // 2 full cache lines to ensure no overlap
+		32  // 2 full cache lines to ensure no overlap
 	);
 
 	m_rays[0].set(std::get<0>(scratch), std::get<1>(scratch), std::get<2>(scratch), n_elements);
@@ -2202,6 +2206,9 @@ void Testbed::NerfTracer::enlarge(size_t n_elements, uint32_t padded_output_widt
 
 	m_network_output = std::get<9>(scratch);
 	m_network_input = std::get<10>(scratch);
+
+	m_hit_counter = std::get<11>(scratch);
+	m_alive_counter = std::get<12>(scratch);
 }
 
 void Testbed::Nerf::Training::reset_extra_dims(default_rng_t &rng) {
@@ -2256,9 +2263,7 @@ void Testbed::render_nerf(CudaRenderBuffer& render_buffer, const Vector2i& max_r
 
 	const float* extra_dims_gpu = get_inference_extra_dims(stream);
 
-	ScopeGuard tmp_memory_guard{[&]() {
-		m_nerf.tracer.clear();
-	}};
+	NerfTracer tracer;
 
 	// Our motion vector code can't undo f-theta and grid distortions -- so don't render these if DLSS is enabled.
 	bool render_opencv_lens = m_nerf.render_with_lens_distortion && (!render_buffer.dlss() || m_nerf.render_lens.mode == ELensMode::OpenCV);
@@ -2267,7 +2272,7 @@ void Testbed::render_nerf(CudaRenderBuffer& render_buffer, const Vector2i& max_r
 	Lens lens = render_opencv_lens ? m_nerf.render_lens : Lens{};
 
 
-	m_nerf.tracer.init_rays_from_camera(
+	tracer.init_rays_from_camera(
 		render_buffer.spp(),
 		m_network->padded_output_width(),
 		m_nerf_network->n_extra_dims(),
@@ -2301,10 +2306,10 @@ void Testbed::render_nerf(CudaRenderBuffer& render_buffer, const Vector2i& max_r
 
 	uint32_t n_hit;
 	if (m_render_mode == ERenderMode::Slice) {
-		n_hit = m_nerf.tracer.n_rays_initialized();
+		n_hit = tracer.n_rays_initialized();
 	} else {
 		float depth_scale = 1.0f / m_nerf.training.dataset.scale;
-		n_hit = m_nerf.tracer.trace(
+		n_hit = tracer.trace(
 			*m_nerf_network,
 			m_render_aabb,
 			m_render_aabb_to_local,
@@ -2329,7 +2334,7 @@ void Testbed::render_nerf(CudaRenderBuffer& render_buffer, const Vector2i& max_r
 			stream
 		);
 	}
-	RaysNerfSoa& rays_hit = m_render_mode == ERenderMode::Slice ? m_nerf.tracer.rays_init() : m_nerf.tracer.rays_hit();
+	RaysNerfSoa& rays_hit = m_render_mode == ERenderMode::Slice ? tracer.rays_init() : tracer.rays_hit();
 
 	if (m_render_mode == ERenderMode::Slice) {
 		// Store colors in the normal buffer
@@ -2337,23 +2342,21 @@ void Testbed::render_nerf(CudaRenderBuffer& render_buffer, const Vector2i& max_r
 		const uint32_t floats_per_coord = sizeof(NerfCoordinate) / sizeof(float) + m_nerf_network->n_extra_dims();
 		const uint32_t extra_stride = m_nerf_network->n_extra_dims() * sizeof(float); // extra stride on top of base NerfCoordinate struct
 
-		m_nerf.vis_input.enlarge(n_elements * floats_per_coord);
-		m_nerf.vis_rgba.enlarge(n_elements);
-		linear_kernel(generate_nerf_network_inputs_at_current_position, 0, stream, n_hit, m_aabb, rays_hit.payload, PitchedPtr<NerfCoordinate>((NerfCoordinate*)m_nerf.vis_input.data(), 1, 0, extra_stride), extra_dims_gpu );
+		GPUMatrix<float> positions_matrix{floats_per_coord, n_elements, stream};
+		GPUMatrix<float> rgbsigma_matrix{4, n_elements, stream};
 
-		GPUMatrix<float> positions_matrix((float*)m_nerf.vis_input.data(), floats_per_coord, n_elements);
-		GPUMatrix<float> rgbsigma_matrix((float*)m_nerf.vis_rgba.data(), 4, n_elements);
+		linear_kernel(generate_nerf_network_inputs_at_current_position, 0, stream, n_hit, m_aabb, rays_hit.payload, PitchedPtr<NerfCoordinate>((NerfCoordinate*)positions_matrix.data(), 1, 0, extra_stride), extra_dims_gpu );
 
 		if (m_visualized_dimension == -1) {
 			m_network->inference(stream, positions_matrix, rgbsigma_matrix);
-			linear_kernel(compute_nerf_rgba, 0, stream, n_hit, m_nerf.vis_rgba.data(), m_nerf.rgb_activation, m_nerf.density_activation, 0.01f, false);
+			linear_kernel(compute_nerf_rgba, 0, stream, n_hit, (Array4f*)rgbsigma_matrix.data(), m_nerf.rgb_activation, m_nerf.density_activation, 0.01f, false);
 		} else {
 			m_network->visualize_activation(stream, m_visualized_layer, m_visualized_dimension, positions_matrix, rgbsigma_matrix);
 		}
 
 		linear_kernel(shade_kernel_nerf, 0, stream,
 			n_hit,
-			m_nerf.vis_rgba.data(),
+			(Array4f*)rgbsigma_matrix.data(),
 			nullptr,
 			rays_hit.payload,
 			m_render_mode,
diff --git a/src/testbed_sdf.cu b/src/testbed_sdf.cu
index 780dfb20200a034040582e08de01036d263b6153..bf3e5af148d0d8fb97ae502de21736fd0e177395 100644
--- a/src/testbed_sdf.cu
+++ b/src/testbed_sdf.cu
@@ -550,12 +550,13 @@ __global__ void init_rays_with_payload_kernel_sdf(
 
 	if (octree_nodes && !TriangleOctree::contains(octree_nodes, max_depth, ray.o)) {
 		t = max(0.0f, TriangleOctree::ray_intersect(octree_nodes, max_depth, ray.o, ray.d));
-		if (ray.o.y() > floor_y && ray.d.y()<0.f) {
-			float floor_dist = -(ray.o.y() - floor_y)/ray.d.y();
-			if (floor_dist>0.f) {
-				t=min(t,floor_dist);
+		if (ray.o.y() > floor_y && ray.d.y() < 0.f) {
+			float floor_dist = -(ray.o.y() - floor_y) / ray.d.y();
+			if (floor_dist > 0.f) {
+				t = min(t, floor_dist);
 			}
 		}
+
 		ray.o = ray.o + (t + 1e-6f) * ray.d;
 	}
 
@@ -591,7 +592,8 @@ __global__ void sample_uniform_on_triangle_kernel(uint32_t n_elements, const flo
 	sampled_positions[i] = triangles[tri_idx].sample_uniform_position(sample.tail<2>());
 }
 
-void Testbed::SphereTracer::init_rays_from_camera(uint32_t sample_index,
+void Testbed::SphereTracer::init_rays_from_camera(
+	uint32_t sample_index,
 	const Vector2i& resolution,
 	const Vector2f& focal_length,
 	const Matrix<float, 3, 4>& camera_matrix,
@@ -613,15 +615,15 @@ void Testbed::SphereTracer::init_rays_from_camera(uint32_t sample_index,
 ) {
 	// Make sure we have enough memory reserved to render at the requested resolution
 	size_t n_pixels = (size_t)resolution.x() * resolution.y();
-	enlarge(n_pixels);
+	enlarge(n_pixels, stream);
 
 	const dim3 threads = { 16, 8, 1 };
 	const dim3 blocks = { div_round_up((uint32_t)resolution.x(), threads.x), div_round_up((uint32_t)resolution.y(), threads.y), 1 };
 	init_rays_with_payload_kernel_sdf<<<blocks, threads, 0, stream>>>(
 		sample_index,
-		m_rays[0].pos.data(),
-		m_rays[0].distance.data(),
-		m_rays[0].payload.data(),
+		m_rays[0].pos,
+		m_rays[0].distance,
+		m_rays[0].payload,
 		resolution,
 		focal_length,
 		camera_matrix,
@@ -644,7 +646,7 @@ void Testbed::SphereTracer::init_rays_from_camera(uint32_t sample_index,
 }
 
 void Testbed::SphereTracer::init_rays_from_data(uint32_t n_elements, const RaysSdfSoa& data, cudaStream_t stream) {
-	enlarge(n_elements);
+	enlarge(n_elements, stream);
 	m_rays[0].copy_from_other_async(n_elements, data, stream);
 	m_n_rays_initialized = n_elements;
 }
@@ -658,11 +660,11 @@ uint32_t Testbed::SphereTracer::trace_bvh(TriangleBvh* bvh, const Triangle* tria
 	}
 
 	// Abuse the normal buffer to temporarily hold ray directions
-	parallel_for_gpu(stream, n_alive, [payloads=m_rays[0].payload.data(), normals=m_rays[0].normal.data()] __device__ (size_t i) {
+	parallel_for_gpu(stream, n_alive, [payloads=m_rays[0].payload, normals=m_rays[0].normal] __device__ (size_t i) {
 		normals[i] = payloads[i].dir;
 	});
 
-	bvh->ray_trace_gpu(n_alive, m_rays[0].pos.data(), m_rays[0].normal.data(), triangles, stream);
+	bvh->ray_trace_gpu(n_alive, m_rays[0].pos, m_rays[0].normal, triangles, stream);
 	return n_alive;
 }
 
@@ -681,7 +683,7 @@ uint32_t Testbed::SphereTracer::trace(
 		return 0;
 	}
 
-	CUDA_CHECK_THROW(cudaMemsetAsync(m_hit_counter.data(), 0, sizeof(uint32_t), stream));
+	CUDA_CHECK_THROW(cudaMemsetAsync(m_hit_counter, 0, sizeof(uint32_t), stream));
 
 	const uint32_t STEPS_INBETWEEN_COMPACTION = 4;
 
@@ -700,29 +702,29 @@ uint32_t Testbed::SphereTracer::trace(
 
 		// Compact rays that did not diverge yet
 		{
-			CUDA_CHECK_THROW(cudaMemsetAsync(m_alive_counter.data(), 0, sizeof(uint32_t), stream));
+			CUDA_CHECK_THROW(cudaMemsetAsync(m_alive_counter, 0, sizeof(uint32_t), stream));
 			if (m_trace_shadow_rays) {
 				linear_kernel(compact_kernel_shadow_sdf, 0, stream,
 					n_alive,
 					zero_offset,
-					rays_tmp.pos.data(), rays_tmp.distance.data(), rays_tmp.payload.data(), rays_tmp.prev_distance.data(), rays_tmp.total_distance.data(), rays_tmp.min_visibility.data(),
-					rays_current.pos.data(), rays_current.distance.data(), rays_current.payload.data(), rays_current.prev_distance.data(), rays_current.total_distance.data(), rays_current.min_visibility.data(),
-					m_rays_hit.pos.data(), m_rays_hit.distance.data(), m_rays_hit.payload.data(), m_rays_hit.prev_distance.data(), m_rays_hit.total_distance.data(), m_rays_hit.min_visibility.data(),
+					rays_tmp.pos, rays_tmp.distance, rays_tmp.payload, rays_tmp.prev_distance, rays_tmp.total_distance, rays_tmp.min_visibility,
+					rays_current.pos, rays_current.distance, rays_current.payload, rays_current.prev_distance, rays_current.total_distance, rays_current.min_visibility,
+					m_rays_hit.pos, m_rays_hit.distance, m_rays_hit.payload, m_rays_hit.prev_distance, m_rays_hit.total_distance, m_rays_hit.min_visibility,
 					aabb,
-					m_alive_counter.data(), m_hit_counter.data()
+					m_alive_counter, m_hit_counter
 				);
 			} else {
 				linear_kernel(compact_kernel_sdf, 0, stream,
 					n_alive,
 					zero_offset,
-					rays_tmp.pos.data(), rays_tmp.distance.data(), rays_tmp.payload.data(),
-					rays_current.pos.data(), rays_current.distance.data(), rays_current.payload.data(),
-					m_rays_hit.pos.data(), m_rays_hit.distance.data(), m_rays_hit.payload.data(),
+					rays_tmp.pos, rays_tmp.distance, rays_tmp.payload,
+					rays_current.pos, rays_current.distance, rays_current.payload,
+					m_rays_hit.pos, m_rays_hit.distance, m_rays_hit.payload,
 					aabb,
-					m_alive_counter.data(), m_hit_counter.data()
+					m_alive_counter, m_hit_counter
 				);
 			}
-			CUDA_CHECK_THROW(cudaMemcpyAsync(&n_alive, m_alive_counter.data(), sizeof(uint32_t), cudaMemcpyDeviceToHost, stream));
+			CUDA_CHECK_THROW(cudaMemcpyAsync(&n_alive, m_alive_counter, sizeof(uint32_t), cudaMemcpyDeviceToHost, stream));
 			CUDA_CHECK_THROW(cudaStreamSynchronize(stream));
 		}
 
@@ -735,9 +737,9 @@ uint32_t Testbed::SphereTracer::trace(
 			linear_kernel(advance_pos_kernel_sdf, 0, stream,
 				n_alive,
 				zero_offset,
-				rays_current.pos.data(),
-				rays_current.distance.data(),
-				rays_current.payload.data(),
+				rays_current.pos,
+				rays_current.distance,
+				rays_current.payload,
 				aabb,
 				floor_y,
 				octree ? octree->nodes_gpu() : nullptr,
@@ -745,9 +747,9 @@ uint32_t Testbed::SphereTracer::trace(
 				distance_scale,
 				maximum_distance,
 				m_shadow_sharpness,
-				m_trace_shadow_rays ? rays_current.prev_distance.data() : nullptr,
-				m_trace_shadow_rays ? rays_current.total_distance.data() : nullptr,
-				m_trace_shadow_rays ? rays_current.min_visibility.data() : nullptr
+				m_trace_shadow_rays ? rays_current.prev_distance : nullptr,
+				m_trace_shadow_rays ? rays_current.total_distance : nullptr,
+				m_trace_shadow_rays ? rays_current.min_visibility : nullptr
 			);
 		}
 
@@ -755,36 +757,67 @@ uint32_t Testbed::SphereTracer::trace(
 	}
 
 	uint32_t n_hit;
-	CUDA_CHECK_THROW(cudaMemcpyAsync(&n_hit, m_hit_counter.data(), sizeof(uint32_t), cudaMemcpyDeviceToHost, stream));
+	CUDA_CHECK_THROW(cudaMemcpyAsync(&n_hit, m_hit_counter, sizeof(uint32_t), cudaMemcpyDeviceToHost, stream));
 	CUDA_CHECK_THROW(cudaStreamSynchronize(stream));
 	return n_hit;
 }
 
-void Testbed::SphereTracer::enlarge(size_t n_elements) {
+void Testbed::SphereTracer::enlarge(size_t n_elements, cudaStream_t stream) {
 	n_elements = next_multiple(n_elements, size_t(tcnn::batch_size_granularity));
-	m_rays[0].enlarge(n_elements);
-	m_rays[1].enlarge(n_elements);
-	m_rays_hit.enlarge(n_elements);
+	auto scratch = allocate_workspace_and_distribute<
+		Vector3f, Vector3f, float, float, float, float, SdfPayload, // m_rays[0]
+		Vector3f, Vector3f, float, float, float, float, SdfPayload, // m_rays[1]
+		Vector3f, Vector3f, float, float, float, float, SdfPayload, // m_rays_hit
+
+		uint32_t,
+		uint32_t
+	>(
+		stream, &m_scratch_alloc,
+		n_elements, n_elements, n_elements, n_elements, n_elements, n_elements, n_elements,
+		n_elements, n_elements, n_elements, n_elements, n_elements, n_elements, n_elements,
+		n_elements, n_elements, n_elements, n_elements, n_elements, n_elements, n_elements,
+		32, // 2 full cache lines to ensure no overlap
+		32  // 2 full cache lines to ensure no overlap
+	);
+
+	m_rays[0].set(std::get<0>(scratch), std::get<1>(scratch), std::get<2>(scratch), std::get<3>(scratch), std::get<4>(scratch), std::get<5>(scratch), std::get<6>(scratch));
+	m_rays[1].set(std::get<7>(scratch), std::get<8>(scratch), std::get<9>(scratch), std::get<10>(scratch), std::get<11>(scratch), std::get<12>(scratch), std::get<13>(scratch));
+	m_rays_hit.set(std::get<14>(scratch), std::get<15>(scratch), std::get<16>(scratch), std::get<17>(scratch), std::get<18>(scratch), std::get<19>(scratch), std::get<20>(scratch));
+
+	m_hit_counter = std::get<21>(scratch);
+	m_alive_counter = std::get<22>(scratch);
 }
 
-void Testbed::FiniteDifferenceNormalsApproximator::enlarge(uint32_t n_elements) {
-	dx.enlarge(n_elements);
-	dy.enlarge(n_elements);
-	dz.enlarge(n_elements);
+void Testbed::FiniteDifferenceNormalsApproximator::enlarge(uint32_t n_elements, cudaStream_t stream) {
+	n_elements = next_multiple(n_elements, tcnn::batch_size_granularity);
+	auto scratch = allocate_workspace_and_distribute<
+		Vector3f, Vector3f, Vector3f,
+		float, float, float,
+		float, float, float
+	>(
+		stream, &m_scratch_alloc,
+		n_elements, n_elements, n_elements,
+		n_elements, n_elements, n_elements,
+		n_elements, n_elements, n_elements
+	);
+
+	dx = std::get<0>(scratch);
+	dy = std::get<1>(scratch);
+	dz = std::get<2>(scratch);
 
-	dist_dx_pos.enlarge(n_elements);
-	dist_dy_pos.enlarge(n_elements);
-	dist_dz_pos.enlarge(n_elements);
+	dist_dx_pos = std::get<3>(scratch);
+	dist_dy_pos = std::get<4>(scratch);
+	dist_dz_pos = std::get<5>(scratch);
 
-	dist_dx_neg.enlarge(n_elements);
-	dist_dy_neg.enlarge(n_elements);
-	dist_dz_neg.enlarge(n_elements);
+	dist_dx_neg = std::get<6>(scratch);
+	dist_dy_neg = std::get<7>(scratch);
+	dist_dz_neg = std::get<8>(scratch);
 }
 
-void Testbed::FiniteDifferenceNormalsApproximator::normal(uint32_t n_elements, const distance_fun_t& distance_function, GPUMemory<Vector3f>& pos, GPUMemory<Vector3f>& normal, float epsilon, cudaStream_t stream) {
-	enlarge(n_elements);
+void Testbed::FiniteDifferenceNormalsApproximator::normal(uint32_t n_elements, const distance_fun_t& distance_function, const Vector3f* pos, Vector3f* normal, float epsilon, cudaStream_t stream) {
+	enlarge(n_elements, stream);
 
-	parallel_for_gpu(stream, n_elements, [pos=pos.data(), dx=dx.data(), dy=dy.data(), dz=dz.data(), epsilon] __device__ (size_t i) {
+	parallel_for_gpu(stream, n_elements, [pos=pos, dx=dx, dy=dy, dz=dz, epsilon] __device__ (size_t i) {
 		Vector3f p = pos[i];
 		dx[i] = Vector3f{p.x() + epsilon, p.y(), p.z()};
 		dy[i] = Vector3f{p.x(), p.y() + epsilon, p.z()};
@@ -795,7 +828,7 @@ void Testbed::FiniteDifferenceNormalsApproximator::normal(uint32_t n_elements, c
 	distance_function(n_elements, dy, dist_dy_pos, stream);
 	distance_function(n_elements, dz, dist_dz_pos, stream);
 
-	parallel_for_gpu(stream, n_elements, [pos=pos.data(), dx=dx.data(), dy=dy.data(), dz=dz.data(), epsilon] __device__ (size_t i) {
+	parallel_for_gpu(stream, n_elements, [pos=pos, dx=dx, dy=dy, dz=dz, epsilon] __device__ (size_t i) {
 		Vector3f p = pos[i];
 		dx[i] = Vector3f{p.x() - epsilon, p.y(), p.z()};
 		dy[i] = Vector3f{p.x(), p.y() - epsilon, p.z()};
@@ -806,7 +839,7 @@ void Testbed::FiniteDifferenceNormalsApproximator::normal(uint32_t n_elements, c
 	distance_function(n_elements, dy, dist_dy_neg, stream);
 	distance_function(n_elements, dz, dist_dz_neg, stream);
 
-	parallel_for_gpu(stream, n_elements, [normal=normal.data(), dist_dx_pos=dist_dx_pos.data(), dist_dx_neg=dist_dx_neg.data(), dist_dy_pos=dist_dy_pos.data(), dist_dy_neg=dist_dy_neg.data(), dist_dz_pos=dist_dz_pos.data(), dist_dz_neg=dist_dz_neg.data()] __device__ (size_t i) {
+	parallel_for_gpu(stream, n_elements, [normal=normal, dist_dx_pos=dist_dx_pos, dist_dx_neg=dist_dx_neg, dist_dy_pos=dist_dy_pos, dist_dy_neg=dist_dy_neg, dist_dz_pos=dist_dz_pos, dist_dz_neg=dist_dz_neg] __device__ (size_t i) {
 		normal[i] = {dist_dx_pos[i] - dist_dx_neg[i], dist_dy_pos[i] - dist_dy_neg[i], dist_dz_pos[i] - dist_dz_neg[i]};
 	});
 }
@@ -827,8 +860,7 @@ void Testbed::render_sdf(
 	}
 	auto* octree_ptr = m_sdf.uses_takikawa_encoding || m_sdf.use_triangle_octree ? m_sdf.triangle_octree.get() : nullptr;
 
-	// Reserve the memory for max-res rendering to prevent stuttering
-	m_sdf.tracer.enlarge(max_res.x() * max_res.y());
+	SphereTracer tracer;
 
 	uint32_t n_octree_levels = octree_ptr ? octree_ptr->depth() : 0;
 	if (m_render_ground_truth && m_sdf.groundtruth_mode == ESDFGroundTruthMode::SDFBricks) {
@@ -837,7 +869,7 @@ void Testbed::render_sdf(
 
 	BoundingBox sdf_bounding_box = m_aabb;
 	sdf_bounding_box.inflate(m_sdf.zero_offset);
-	m_sdf.tracer.init_rays_from_camera(
+	tracer.init_rays_from_camera(
 		render_buffer.spp(),
 		render_buffer.in_resolution(),
 		focal_length,
@@ -881,22 +913,22 @@ void Testbed::render_sdf(
 
 	uint32_t n_hit;
 	if (m_render_mode == ERenderMode::Slice) {
-		n_hit = m_sdf.tracer.n_rays_initialized();
+		n_hit = tracer.n_rays_initialized();
 	} else {
-		n_hit = trace(m_sdf.tracer);
+		n_hit = trace(tracer);
 	}
-	RaysSdfSoa& rays_hit = m_render_mode == ERenderMode::Slice || gt_raytrace ? m_sdf.tracer.rays_init() : m_sdf.tracer.rays_hit();
+	RaysSdfSoa& rays_hit = m_render_mode == ERenderMode::Slice || gt_raytrace ? tracer.rays_init() : tracer.rays_hit();
 
 	if (m_render_mode == ERenderMode::Slice) {
 		if (m_visualized_dimension == -1) {
 			distance_function(n_hit, rays_hit.pos, rays_hit.distance, stream);
-			extract_dimension_pos_neg_kernel<float><<<n_blocks_linear(n_hit*3), n_threads_linear, 0, stream>>>(n_hit*3, 0, 1, 3, rays_hit.distance.data(), CM, (float*)rays_hit.normal.data());
+			extract_dimension_pos_neg_kernel<float><<<n_blocks_linear(n_hit*3), n_threads_linear, 0, stream>>>(n_hit*3, 0, 1, 3, rays_hit.distance, CM, (float*)rays_hit.normal);
 		} else {
 			// Store colors in the normal buffer
 			uint32_t n_elements = next_multiple(n_hit, tcnn::batch_size_granularity);
 
-			GPUMatrix<float> positions_matrix((float*)rays_hit.pos.data(), 3, n_elements);
-			GPUMatrix<float> colors_matrix((float*)rays_hit.normal.data(), 3, n_elements);
+			GPUMatrix<float> positions_matrix((float*)rays_hit.pos, 3, n_elements);
+			GPUMatrix<float> colors_matrix((float*)rays_hit.normal, 3, n_elements);
 			m_network->visualize_activation(stream, m_visualized_layer, m_visualized_dimension, positions_matrix, colors_matrix);
 		}
 	}
@@ -906,45 +938,47 @@ void Testbed::render_sdf(
 		if (m_sdf.analytic_normals || gt_raytrace) {
 			normals_function(n_hit, rays_hit.pos, rays_hit.normal, stream);
 		} else {
-			// Prevent spurious enlargements by reserving enough memory to hold a full-res image in any case.
-			m_sdf.fd_normals.enlarge(render_buffer.in_resolution().x() * render_buffer.in_resolution().y());
 			float fd_normals_epsilon = m_sdf.fd_normals_epsilon;
 			if (m_render_ground_truth && m_sdf.groundtruth_mode == ESDFGroundTruthMode::SDFBricks && m_sdf.brick_smooth_normals) {
 				fd_normals_epsilon = exp2f(-float(n_octree_levels)) * (1.f/(m_sdf.brick_res-1)); // in sdf brick mode, use one voxel as the normal central difference radius
 			}
-			m_sdf.fd_normals.normal(n_hit, distance_function, rays_hit.pos, rays_hit.normal, fd_normals_epsilon, stream);
+
+			FiniteDifferenceNormalsApproximator fd_normals;
+			fd_normals.normal(n_hit, distance_function, rays_hit.pos, rays_hit.normal, fd_normals_epsilon, stream);
 		}
 
 		if (render_mode == ERenderMode::Shade && n_hit > 0) {
 			// Shadow rays towards the sun
-			m_sdf.shadow_tracer.init_rays_from_data(n_hit, rays_hit, stream);
-			m_sdf.shadow_tracer.set_trace_shadow_rays(true);
-			m_sdf.shadow_tracer.set_shadow_sharpness(m_sdf.shadow_sharpness);
-			RaysSdfSoa& shadow_rays_init = m_sdf.shadow_tracer.rays_init();
+			SphereTracer shadow_tracer;
+
+			shadow_tracer.init_rays_from_data(n_hit, rays_hit, stream);
+			shadow_tracer.set_trace_shadow_rays(true);
+			shadow_tracer.set_shadow_sharpness(m_sdf.shadow_sharpness);
+			RaysSdfSoa& shadow_rays_init = shadow_tracer.rays_init();
 			linear_kernel(prepare_shadow_rays, 0, stream,
 				n_hit,
 				m_sun_dir.normalized(),
-				shadow_rays_init.pos.data(),
-				shadow_rays_init.normal.data(),
-				shadow_rays_init.distance.data(),
-				shadow_rays_init.prev_distance.data(),
-				shadow_rays_init.total_distance.data(),
-				shadow_rays_init.min_visibility.data(),
-				shadow_rays_init.payload.data(),
+				shadow_rays_init.pos,
+				shadow_rays_init.normal,
+				shadow_rays_init.distance,
+				shadow_rays_init.prev_distance,
+				shadow_rays_init.total_distance,
+				shadow_rays_init.min_visibility,
+				shadow_rays_init.payload,
 				sdf_bounding_box,
 				octree_ptr ? octree_ptr->nodes_gpu() : nullptr,
 				n_octree_levels
 			);
-			uint32_t n_hit_shadow = trace(m_sdf.shadow_tracer);
-			auto& shadow_rays_hit = gt_raytrace ? m_sdf.shadow_tracer.rays_init() : m_sdf.shadow_tracer.rays_hit();
+			uint32_t n_hit_shadow = trace(shadow_tracer);
+			auto& shadow_rays_hit = gt_raytrace ? shadow_tracer.rays_init() : shadow_tracer.rays_hit();
 
 			linear_kernel(write_shadow_ray_result, 0, stream,
 				n_hit_shadow,
 				sdf_bounding_box,
-				shadow_rays_hit.pos.data(),
-				shadow_rays_hit.payload.data(),
-				shadow_rays_hit.min_visibility.data(),
-				rays_hit.distance.data()
+				shadow_rays_hit.pos,
+				shadow_rays_hit.payload,
+				shadow_rays_hit.min_visibility,
+				rays_hit.distance
 			);
 
 			// todo: Reflection rays?
@@ -953,8 +987,8 @@ void Testbed::render_sdf(
 		// HACK: Store colors temporarily in the normal buffer
 		uint32_t n_elements = next_multiple(n_hit, tcnn::batch_size_granularity);
 
-		GPUMatrix<float> positions_matrix((float*)rays_hit.pos.data(), 3, n_elements);
-		GPUMatrix<float> colors_matrix((float*)rays_hit.normal.data(), 3, n_elements);
+		GPUMatrix<float> positions_matrix((float*)rays_hit.pos, 3, n_elements);
+		GPUMatrix<float> colors_matrix((float*)rays_hit.normal, 3, n_elements);
 		m_network->visualize_activation(stream, m_visualized_layer, m_visualized_dimension, positions_matrix, colors_matrix);
 	}
 
@@ -967,17 +1001,18 @@ void Testbed::render_sdf(
 		m_sun_dir.normalized(),
 		m_up_dir.normalized(),
 		camera_matrix,
-		rays_hit.pos.data(),
-		rays_hit.normal.data(),
-		rays_hit.distance.data(),
-		rays_hit.payload.data(),
+		rays_hit.pos,
+		rays_hit.normal,
+		rays_hit.distance,
+		rays_hit.payload,
 		render_buffer.frame_buffer(),
 		render_buffer.depth_buffer()
 	);
 
 	if (render_mode == ERenderMode::Cost) {
 		std::vector<SdfPayload> payloads_final_cpu(n_hit);
-		rays_hit.payload.copy_to_host(payloads_final_cpu, n_hit);
+		CUDA_CHECK_THROW(cudaMemcpyAsync(payloads_final_cpu.data(), rays_hit.payload, n_hit * sizeof(SdfPayload), cudaMemcpyDeviceToHost, stream));
+		CUDA_CHECK_THROW(cudaStreamSynchronize(stream));
 		size_t total_n_steps = 0;
 		for (uint32_t i = 0; i < n_hit; ++i) {
 			total_n_steps += payloads_final_cpu[i].n_steps;