From 395e24374c996500d8a98b1cce2a6b57aba2f842 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thomas=20M=C3=BCller?= <tmueller@nvidia.com>
Date: Fri, 3 Feb 2023 11:00:43 +0100
Subject: [PATCH] DLSS: use non-premultiplied alpha (doesn't completely remove
 jitter, but fixes black halos)

---
 src/render_buffer.cu | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/src/render_buffer.cu b/src/render_buffer.cu
index aa0c10d..ab01d4d 100644
--- a/src/render_buffer.cu
+++ b/src/render_buffer.cu
@@ -534,7 +534,7 @@ __global__ void overlay_false_color_kernel(Vector2i resolution, Vector2i trainin
 	surf2Dwrite(to_float4(color), surface, x * sizeof(float4), y);
 }
 
-__global__ void tonemap_kernel(Vector2i resolution, float exposure, Array4f background_color, Array4f* accumulate_buffer, EColorSpace color_space, EColorSpace output_color_space, ETonemapCurve tonemap_curve, bool clamp_output_color, cudaSurfaceObject_t surface) {
+__global__ void tonemap_kernel(Vector2i resolution, float exposure, Array4f background_color, Array4f* accumulate_buffer, EColorSpace color_space, EColorSpace output_color_space, ETonemapCurve tonemap_curve, bool clamp_output_color, bool unmultiply_alpha, cudaSurfaceObject_t surface) {
 	uint32_t x = threadIdx.x + blockDim.x * blockIdx.x;
 	uint32_t y = threadIdx.y + blockDim.y * blockIdx.y;
 
@@ -556,6 +556,11 @@ __global__ void tonemap_kernel(Vector2i resolution, float exposure, Array4f back
 	color.w() += weight;
 
 	color.head<3>() = tonemap(color.head<3>(), Array3f::Constant(exposure), tonemap_curve, color_space, output_color_space);
+
+	if (unmultiply_alpha && color.w() > 0.0f) {
+		color.head<3>() /= color.w();
+	}
+
 	if (clamp_output_color) {
 		color = color.cwiseMax(0.0f).cwiseMin(1.0f);
 	}
@@ -577,6 +582,12 @@ __global__ void dlss_splat_kernel(
 
 	float4 color;
 	surf2Dread(&color, dlss_surface, x * sizeof(float4), y);
+
+	// DLSS operates on non-premultiplied alpha, so multiply it back in
+	color.x *= color.w;
+	color.y *= color.w;
+	color.z *= color.w;
+
 	surf2Dwrite(color, surface, x * sizeof(float4), y);
 }
 
@@ -663,6 +674,7 @@ void CudaRenderBuffer::tonemap(float exposure, const Array4f& background_color,
 		output_color_space,
 		m_tonemap_curve,
 		m_dlss && output_color_space == EColorSpace::SRGB,
+		(bool)m_dlss, // DLSS seems to perform best with non-premultiplied alpha (probably trained on such data)
 		m_dlss ? m_dlss->frame() : surface()
 	);
 
-- 
GitLab