diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 35b134c469547036aa442d37575445102bda76a1..b131e66da9bdb5748cd2102f273facab90558da0 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -27,6 +27,9 @@ jobs:
           - os: ubuntu-18.04
             cuda: "10.2"
             arch: 61
+          - os: ubuntu-18.04
+            cuda: "10.2"
+            arch: 53
           - os: ubuntu-18.04
             cuda: "10.2"
             arch: 37
@@ -74,6 +77,10 @@ jobs:
             visual_studio: "Visual Studio 16 2019"
             cuda: "11.5.1"
             arch: 61
+          - os: windows-2019
+            visual_studio: "Visual Studio 16 2019"
+            cuda: "11.5.1"
+            arch: 53
           - os: windows-2019
             visual_studio: "Visual Studio 16 2019"
             cuda: "11.5.1"
diff --git a/dependencies/tiny-cuda-nn b/dependencies/tiny-cuda-nn
index 8575542682cb67cddfc748cc3d3cfc12593799aa..ece9cdd88601a3e754ef82de11dd0114008d0fbc 160000
--- a/dependencies/tiny-cuda-nn
+++ b/dependencies/tiny-cuda-nn
@@ -1 +1 @@
-Subproject commit 8575542682cb67cddfc748cc3d3cfc12593799aa
+Subproject commit ece9cdd88601a3e754ef82de11dd0114008d0fbc
diff --git a/include/neural-graphics-primitives/envmap.cuh b/include/neural-graphics-primitives/envmap.cuh
index 273f66100c26047648337c9cb46b61717ae277f3..89820b01664b946d4fefc8249fdf818735c69e37 100644
--- a/include/neural-graphics-primitives/envmap.cuh
+++ b/include/neural-graphics-primitives/envmap.cuh
@@ -64,8 +64,8 @@ __device__ Eigen::Array4f read_envmap(const T* __restrict__ envmap_data, const E
 	return result;
 }
 
-template <typename T>
-__device__ void deposit_envmap_gradient(const tcnn::vector_t<T, 4>& value, T* __restrict__ envmap_gradient, const Eigen::Vector2i envmap_resolution, const Eigen::Vector3f& dir) {
+template <typename T, typename GRAD_T>
+__device__ void deposit_envmap_gradient(const tcnn::vector_t<T, 4>& value, GRAD_T* __restrict__ envmap_gradient, const Eigen::Vector2i envmap_resolution, const Eigen::Vector3f& dir) {
 	auto dir_cyl = dir_to_spherical_unorm({dir.z(), -dir.x(), dir.y()});
 
 	auto envmap_float = Eigen::Vector2f{dir_cyl.y() * (envmap_resolution.x()-1), dir_cyl.x() * (envmap_resolution.y()-1)};
@@ -83,8 +83,8 @@ __device__ void deposit_envmap_gradient(const tcnn::vector_t<T, 4>& value, T* __
 
 		Eigen::Array4f result;
 
-#if TCNN_MIN_GPU_ARCH >= 60 // atomicAdd(__half2) is only supported with compute capability 60 and above
-		if (std::is_same<T, __half>::value) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600 // atomicAdd(__half2) is only supported with compute capability 60 and above
+		if (std::is_same<GRAD_T, __half>::value) {
 			for (uint32_t c = 0; c < 4; c += 2) {
 				atomicAdd((__half2*)&envmap_gradient[(pos.x() + pos.y() * envmap_resolution.x()) * 4 + c], {value[c] * weight, value[c+1] * weight});
 			}
@@ -92,7 +92,8 @@ __device__ void deposit_envmap_gradient(const tcnn::vector_t<T, 4>& value, T* __
 #endif
 		{
 			for (uint32_t c = 0; c < 4; ++c) {
-				atomicAdd(&envmap_gradient[(pos.x() + pos.y() * envmap_resolution.x()) * 4 + c], value[c] * weight);
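+				// Scalar fallback: cast to GRAD_T (fp32 when __half atomics are unavailable) before accumulating.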
+				atomicAdd(&envmap_gradient[(pos.x() + pos.y() * envmap_resolution.x()) * 4 + c], (GRAD_T)(value[c] * weight));
 			}
 		}
 	};
diff --git a/include/neural-graphics-primitives/takikawa_encoding.cuh b/include/neural-graphics-primitives/takikawa_encoding.cuh
index 64ebd4ae1483c9b80db517c6fd92f4b4f6b782e3..5746ecaed851de7eaef46a36a9d8ae697746899e 100644
--- a/include/neural-graphics-primitives/takikawa_encoding.cuh
+++ b/include/neural-graphics-primitives/takikawa_encoding.cuh
@@ -183,7 +183,7 @@ __global__ void kernel_takikawa_backward_input(
 	dL_dx(i)[j] = result;
 }
 
-template <typename T, uint32_t N_FEATURES_PER_LEVEL>
+template <typename T, typename GRAD_T, uint32_t N_FEATURES_PER_LEVEL>
 __global__ void kernel_takikawa_backward(
 	const uint32_t num_elements,
 	const uint32_t n_levels,
@@ -191,7 +191,7 @@ __global__ void kernel_takikawa_backward(
 	const tcnn::InterpolationType interpolation_type,
 	const TriangleOctreeNode* octree_nodes,
 	const TriangleOctreeDualNode* octree_dual_nodes,
-	T* __restrict__ grid_gradient,
+	GRAD_T* __restrict__ params_gradient,
 	const tcnn::PitchedPtr<const float> data_in,
 	const tcnn::PitchedPtr<const T> dL_dy
 ) {
@@ -240,19 +240,25 @@
 
 				int param_idx = node.vertices[idx] * N_FEATURES_PER_LEVEL;
 
-#if TCNN_MIN_GPU_ARCH >= 60 // atomicAdd(__half2) is only supported with compute capability 60 and above
-				if (N_FEATURES_PER_LEVEL > 1 && std::is_same<T, __half>::value) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600 // atomicAdd(__half2) is only supported with compute capability 60 and above
+				if (N_FEATURES_PER_LEVEL > 1 && std::is_same<GRAD_T, __half>::value) {
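+					// Vectorized path: each __half2 atomicAdd accumulates two adjacent fp16 features at once.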
 					#pragma unroll
 					for (uint32_t feature = 0; feature < N_FEATURES_PER_LEVEL; feature += 2) {
 						__half2 v = {(__half)((float)grad[feature] * weight), (__half)((float)grad[feature+1] * weight)};
-						atomicAdd((__half2*)&grid_gradient[param_idx + feature], v);
+						atomicAdd((__half2*)&params_gradient[param_idx + feature], v);
 					}
 				} else
 #endif
 				{
-					#pragma unroll
-					for (uint32_t f = 0; f < N_FEATURES_PER_LEVEL; ++f) {
-						atomicAdd(&grid_gradient[param_idx], (T)((float)grad[f] * weight));
+					if (std::is_same<GRAD_T, __half>::value) {
+						// This branch should be unreachable: GRAD_T is only __half when the __half2 path above is compiled in.
+						//printf("Attempted to use atomicAdd(__half)\n");
+					} else {
+						#pragma unroll
+						for (uint32_t f = 0; f < N_FEATURES_PER_LEVEL; ++f) {
+							atomicAdd((float*)&params_gradient[param_idx + f], (float)grad[f] * weight);
+						}
 					}
 				}
 			}
@@ -263,6 +268,19 @@ __global__ void kernel_takikawa_backward(
 template <typename T, uint32_t N_FEATURES_PER_LEVEL=8>
 class TakikawaEncoding : public tcnn::Encoding<T> {
 public:
+#if TCNN_MIN_GPU_ARCH >= 60
+	// The GPUs that we tested this on do not have an efficient 1D fp16
+	// atomicAdd feature. Thus, we accumulate gradients at fp32 if we're
+	// forced to use 1D atomicAdds. As soon as 2D or higher is possible,
+	// we can make use of the efficient atomicAdd(__half2) function.
+	using grad_t = std::conditional_t<N_FEATURES_PER_LEVEL == 1, float, T>;
+#else
+	// atomicAdd(__half2) is only supported with compute capability 60 and above.
+	// Since atomicAdd(__half) is relatively slow / doesn't exist for low compute
+	// capabilities, accumulate in fp32 instead.
+	using grad_t = float;
+#endif
+
 	TakikawaEncoding(uint32_t starting_level, bool sum_instead_of_concat, std::shared_ptr<TriangleOctree> octree, tcnn::InterpolationType interpolation_type)
 		: m_starting_level{starting_level}, m_sum_instead_of_concat{sum_instead_of_concat}, m_octree{octree}, m_interpolation_type{interpolation_type} {
 
@@ -315,21 +333,37 @@
 		}
 
 		{
+			// We accumulate gradients at grad_t precision which, for performance reasons, is not always T.
+			// If grad_t differs from T, accumulate into a temporary buffer and cast back to T afterwards.
+			grad_t* params_gradient;
+			if (!std::is_same<grad_t, T>::value) {
+				params_gradient = (grad_t*)m_params_gradient_tmp.data();
+			} else {
+				params_gradient = (grad_t*)m_params_gradient;
+			}
+
 			if (!accumulate_param_gradients) {
-				CUDA_CHECK_THROW(cudaMemsetAsync(m_params_gradient, 0, n_params() * sizeof(T), stream));
+				CUDA_CHECK_THROW(cudaMemsetAsync(params_gradient, 0, n_params() * sizeof(grad_t), stream));
 			}
 
-			tcnn::linear_kernel(kernel_takikawa_backward<T, N_FEATURES_PER_LEVEL>, 0, stream,
+			tcnn::linear_kernel(kernel_takikawa_backward<T, grad_t, N_FEATURES_PER_LEVEL>, 0, stream,
 				num_elements,
 				n_levels(),
 				m_starting_level,
 				m_interpolation_type,
 				m_octree->nodes_gpu(),
 				m_octree->dual_nodes_gpu(),
-				m_params_gradient,
+				params_gradient,
 				inputs,
 				dL_dy
 			);
+
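+			// If gradients were accumulated at fp32, cast them back to T for the rest of the training pipeline.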
+			if (!std::is_same<grad_t, T>::value) {
+				parallel_for_gpu(stream, n_params(), [grad=m_params_gradient, grad_tmp=params_gradient] __device__ (size_t i) {
+					grad[i] = (T)grad_tmp[i];
+				});
+			}
 		}
 
 		// Gradient computation w.r.t. input
@@ -380,6 +413,11 @@ public:
 
 		// Initialize the encoding from the GPU, because the number of parameters can be quite large.
 		tcnn::generate_random_uniform<float>(rnd, n_params(), params_full_precision, -1e-4f, 1e-4f);
+
+		// Temporary storage is only needed if gradients are accumulated at a precision different from T.
+		if (!std::is_same<grad_t, T>::value) {
+			m_params_gradient_tmp.resize(n_params());
+		}
 	}
 
 	size_t n_params() const override {
@@ -410,6 +448,9 @@ private:
 	// Storage of params
 	T* m_params;
 	T* m_params_inference;
+
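+	// Temporary buffer for accumulating gradients at grad_t precision when grad_t differs from T.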
+	tcnn::GPUMemory<grad_t> m_params_gradient_tmp;
 	T* m_params_gradient;
 
 	std::shared_ptr<TriangleOctree> m_octree;
diff --git a/include/neural-graphics-primitives/testbed.h b/include/neural-graphics-primitives/testbed.h
index 5d224767101333a65fe033f83f9e6e85b0651326..a2294d10ef39fef69a03e4f1e2ae227c1f6d403c 100644
--- a/include/neural-graphics-primitives/testbed.h
+++ b/include/neural-graphics-primitives/testbed.h
@@ -85,7 +85,7 @@ public:
 			float floor_y,
 			float plane_z,
 			float dof,
-			const precision_t* envmap_data,
+			const float* envmap_data,
 			const Eigen::Vector2i& envmap_resolution,
 			Eigen::Array4f* frame_buffer,
 			const TriangleOctree* octree, cudaStream_t stream);
@@ -125,7 +125,7 @@ public:
 			float plane_z,
 			float dof,
 			const CameraDistortion& camera_distortion,
-			const precision_t* envmap_data,
+			const float* envmap_data,
 			const Eigen::Vector2i& envmap_resolution,
 			const float* distortion_data,
 			const Eigen::Vector2i& distortion_resolution,
@@ -718,14 +718,16 @@ public:
 	std::shared_ptr<tcnn::Encoding<precision_t>> m_encoding;
 	std::shared_ptr<tcnn::Network<float, precision_t>> m_network;
 	std::shared_ptr<tcnn::Trainer<float, precision_t, precision_t>> m_trainer;
+
 	struct TrainableEnvmap {
-		std::shared_ptr<tcnn::Optimizer<precision_t>> optimizer;
-		std::shared_ptr<TrainableBuffer<4, 2, precision_t>> envmap;
-		std::shared_ptr<tcnn::Trainer<float, precision_t, precision_t>> trainer;
+		std::shared_ptr<tcnn::Optimizer<float>> optimizer;
+		std::shared_ptr<TrainableBuffer<4, 2, float>> envmap;
+		std::shared_ptr<tcnn::Trainer<float, float, float>> trainer;
 
 		Eigen::Vector2i resolution;
 		ELossType loss_type;
 	} m_envmap;
+
 	struct TrainableDistortionMap {
 		std::shared_ptr<tcnn::Optimizer<float>> optimizer;
 		std::shared_ptr<TrainableBuffer<2, 2, float>> map;
diff --git a/src/testbed.cu b/src/testbed.cu
index 4a20e165de806b140d3864b78ec8dd349badfbbc..ab2bb281de64adc5ad2d7951d5e95f03d0cd1879 100644
--- a/src/testbed.cu
+++ b/src/testbed.cu
@@ -1693,9 +1693,9 @@ void Testbed::reset_network() {
 		m_envmap.loss_type = string_to_loss_type(envmap_loss_config.value("otype", "L2"));
 
 		m_envmap.resolution = m_nerf.training.dataset.envmap_resolution;
-		m_envmap.envmap = std::make_shared<TrainableBuffer<4, 2, precision_t>>(m_envmap.resolution);
-		m_envmap.optimizer.reset(create_optimizer<precision_t>(envmap_optimizer_config));
-		m_envmap.trainer = std::make_shared<Trainer<float, precision_t, precision_t>>(m_envmap.envmap, m_envmap.optimizer, m_loss, m_seed);
+		m_envmap.envmap = std::make_shared<TrainableBuffer<4, 2, float>>(m_envmap.resolution);
+		m_envmap.optimizer.reset(create_optimizer<float>(envmap_optimizer_config));
+		m_envmap.trainer = std::make_shared<Trainer<float, float, float>>(m_envmap.envmap, m_envmap.optimizer, std::shared_ptr<Loss<float>>{create_loss<float>(envmap_loss_config)}, m_seed);
 
 		if (m_nerf.training.dataset.envmap_data.data()) {
 			m_envmap.trainer->set_params_full_precision(m_nerf.training.dataset.envmap_data.data(), m_nerf.training.dataset.envmap_data.size());
diff --git a/src/testbed_nerf.cu b/src/testbed_nerf.cu
index ea41f988746cbd70da1ebf9aacfa6e7a01ff54b7..fd395a442d4734334ba4c45c542b2544e3f35e9c 100644
--- a/src/testbed_nerf.cu
+++ b/src/testbed_nerf.cu
@@ -1118,8 +1118,8 @@ __global__ void compute_loss_kernel_train_nerf(
 	const uint32_t* __restrict__ rays_counter,
 	float loss_scale,
 	int padded_output_width,
-	const tcnn::network_precision_t* __restrict__ envmap_data,
-	tcnn::network_precision_t* __restrict__ envmap_gradient,
+	const float* __restrict__ envmap_data,
+	float* __restrict__ envmap_gradient,
 	const Vector2i envmap_resolution,
 	ELossType envmap_loss_type,
 	Array3f background_color,
@@ -1569,7 +1569,7 @@ __global__ void init_rays_with_payload_kernel_nerf(
 	float plane_z,
 	float dof,
 	CameraDistortion camera_distortion,
-	const network_precision_t* __restrict__ envmap_data,
+	const float* __restrict__ envmap_data,
 	const Vector2i envmap_resolution,
 	Array4f* __restrict__ framebuffer,
 	const float* __restrict__ distortion_data,
@@ -1729,7 +1729,7 @@ void Testbed::NerfTracer::init_rays_from_camera(uint32_t spp,
 	float plane_z,
 	float dof,
 	const CameraDistortion& camera_distortion,
-	const network_precision_t* envmap_data,
+	const float* envmap_data,
 	const Vector2i& envmap_resolution,
 	const float* distortion_data,
 	const Vector2i& distortion_resolution,
@@ -2247,9 +2247,9 @@ void Testbed::train_nerf(uint32_t target_batch_size, uint32_t n_training_steps,
 		CUDA_CHECK_THROW(cudaMemsetAsync(m_nerf.training.error_map.data.data(), 0, m_nerf.training.error_map.data.get_bytes(), stream));
 	}
 
-	network_precision_t* envmap_gradient = m_nerf.training.train_envmap ? m_envmap.envmap->gradients() : nullptr;
+	float* envmap_gradient = m_nerf.training.train_envmap ? m_envmap.envmap->gradients() : nullptr;
 	if (envmap_gradient) {
-		CUDA_CHECK_THROW(cudaMemsetAsync(envmap_gradient, 0, sizeof(network_precision_t)*m_envmap.envmap->n_params(), stream));
+		CUDA_CHECK_THROW(cudaMemsetAsync(envmap_gradient, 0, sizeof(float)*m_envmap.envmap->n_params(), stream));
 	}
 
 	for (uint32_t i = 0; i < n_training_steps; ++i) {
@@ -2468,7 +2468,7 @@ void Testbed::train_nerf_step(uint32_t target_batch_size, uint32_t n_rays_per_ba
 	m_nerf.training.n_rays_since_error_map_update += n_rays_per_batch;
 
 	// If we have an envmap, prepare its gradient buffer
-	network_precision_t* envmap_gradient = m_nerf.training.train_envmap ? m_envmap.envmap->gradients() : nullptr;
+	float* envmap_gradient = m_nerf.training.train_envmap ? m_envmap.envmap->gradients() : nullptr;
 
 	bool sample_focal_plane_proportional_to_error = m_nerf.training.error_map.is_cdf_valid && m_nerf.training.sample_focal_plane_proportional_to_error;
 	bool sample_image_proportional_to_error = m_nerf.training.error_map.is_cdf_valid && m_nerf.training.sample_image_proportional_to_error;
diff --git a/src/testbed_sdf.cu b/src/testbed_sdf.cu
index 2a8d18ba5d08223a846791fcb4b24376f0748186..8fe55b13549b2315e20bf4d6f733c686a7abaaf0 100644
--- a/src/testbed_sdf.cu
+++ b/src/testbed_sdf.cu
@@ -502,7 +502,7 @@ __global__ void init_rays_with_payload_kernel_sdf(
 	float floor_y,
 	float plane_z,
 	float dof,
-	const network_precision_t* __restrict__ envmap_data,
+	const float* __restrict__ envmap_data,
 	const Vector2i envmap_resolution,
 	Array4f* __restrict__ framebuffer,
 	const TriangleOctreeNode* __restrict__ octree_nodes = nullptr,
@@ -594,7 +594,7 @@ void Testbed::SphereTracer::init_rays_from_camera(uint32_t spp,
 	float floor_y,
 	float plane_z,
 	float dof,
-	const network_precision_t* envmap_data,
+	const float* envmap_data,
 	const Vector2i& envmap_resolution,
 	Array4f* frame_buffer,
 	const TriangleOctree* octree, cudaStream_t stream
diff --git a/src/testbed_volume.cu b/src/testbed_volume.cu
index f39e7c369b16e2e20df122ee921d22afaba50938..36e3a84de25635fa5078bf6c2df6837a3bc6d6aa 100644
--- a/src/testbed_volume.cu
+++ b/src/testbed_volume.cu
@@ -224,7 +224,7 @@ __global__ void init_rays_volume(
 	BoundingBox aabb,
 	float plane_z,
 	float dof,
-	const network_precision_t* __restrict__ envmap_data,
+	const float* __restrict__ envmap_data,
 	const Vector2i envmap_resolution,
 	Array4f* __restrict__ framebuffer,
 	default_rng_t rng,