Skip to content
Snippets Groups Projects
Commit efb45dea authored by Thomas Müller
Browse files

Add support for sm_37 architecture

parent 3cf5fbae
No related branches found
No related tags found
No related merge requests found
......@@ -26,7 +26,10 @@ jobs:
arch: 70
- os: ubuntu-18.04
cuda: "10.2"
arch: 60
arch: 61
- os: ubuntu-18.04
cuda: "10.2"
arch: 37
env:
build_dir: "build"
config: "Release"
......@@ -70,7 +73,11 @@ jobs:
- os: windows-2019
visual_studio: "Visual Studio 16 2019"
cuda: "11.5.1"
arch: 60
arch: 61
- os: windows-2019
visual_studio: "Visual Studio 16 2019"
cuda: "11.5.1"
arch: 37
env:
build_dir: "build"
config: "Release"
......
Subproject commit 6041eadf3fe5f2d7bb8e53b8328c7325a047ac42
Subproject commit 3eb4cd571f9a2e1276c556e9d8b8bc34d44b84d3
......@@ -122,16 +122,19 @@ __device__ void deposit_image_gradient(const Eigen::Matrix<float, N_DIMS, 1>& va
pos.x() = std::max(std::min(pos.x(), resolution.x()-1), 0);
pos.y() = std::max(std::min(pos.y(), resolution.y()-1), 0);
if (std::is_same<T, float>::value) {
for (uint32_t c = 0; c < N_DIMS; ++c) {
atomicAdd(&gradient[(pos.x() + pos.y() * resolution.x()) * N_DIMS + c], (T)value[c] * weight);
atomicAdd(&gradient_weight[(pos.x() + pos.y() * resolution.x()) * N_DIMS + c], weight);
}
} else if (std::is_same<T, __half>::value) {
#if TCNN_MIN_GPU_ARCH >= 60 // atomicAdd(__half2) is only supported with compute capability 60 and above
if (std::is_same<T, __half>::value) {
for (uint32_t c = 0; c < N_DIMS; c += 2) {
atomicAdd((__half2*)&gradient[(pos.x() + pos.y() * resolution.x()) * N_DIMS + c], {(T)value[c] * weight, (T)value[c+1] * weight});
atomicAdd((__half2*)&gradient_weight[(pos.x() + pos.y() * resolution.x()) * N_DIMS + c], {weight, weight});
}
} else
#endif
{
for (uint32_t c = 0; c < N_DIMS; ++c) {
atomicAdd(&gradient[(pos.x() + pos.y() * resolution.x()) * N_DIMS + c], (T)value[c] * weight);
atomicAdd(&gradient_weight[(pos.x() + pos.y() * resolution.x()) * N_DIMS + c], weight);
}
}
};
......
......@@ -82,14 +82,18 @@ __device__ void deposit_envmap_gradient(const tcnn::vector_t<T, 4>& value, T* __
pos.y() = std::max(std::min(pos.y(), envmap_resolution.y()-1), 0);
Eigen::Array4f result;
if (std::is_same<T, float>::value) {
for (uint32_t c = 0; c < 4; ++c) {
atomicAdd(&envmap_gradient[(pos.x() + pos.y() * envmap_resolution.x()) * 4 + c], value[c] * weight);
}
} else if (std::is_same<T, __half>::value) {
#if TCNN_MIN_GPU_ARCH >= 60 // atomicAdd(__half2) is only supported with compute capability 60 and above
if (std::is_same<T, __half>::value) {
for (uint32_t c = 0; c < 4; c += 2) {
atomicAdd((__half2*)&envmap_gradient[(pos.x() + pos.y() * envmap_resolution.x()) * 4 + c], {value[c] * weight, value[c+1] * weight});
}
} else
#endif
{
for (uint32_t c = 0; c < 4; ++c) {
atomicAdd(&envmap_gradient[(pos.x() + pos.y() * envmap_resolution.x()) * 4 + c], value[c] * weight);
}
}
};
......
......@@ -240,17 +240,20 @@ __global__ void kernel_takikawa_backward(
int param_idx = node.vertices[idx] * N_FEATURES_PER_LEVEL;
if (N_FEATURES_PER_LEVEL == 1 || !std::is_same<T, __half>::value) {
#pragma unroll
for (uint32_t f = 0; f < N_FEATURES_PER_LEVEL; ++f) {
atomicAdd(&grid_gradient[param_idx], (T)((float)grad[f] * weight));
}
} else {
#if TCNN_MIN_GPU_ARCH >= 60 // atomicAdd(__half2) is only supported with compute capability 60 and above
if (N_FEATURES_PER_LEVEL > 1 && std::is_same<T, __half>::value) {
#pragma unroll
for (uint32_t feature = 0; feature < N_FEATURES_PER_LEVEL; feature += 2) {
__half2 v = {(__half)((float)grad[feature] * weight), (__half)((float)grad[feature+1] * weight)};
atomicAdd((__half2*)&grid_gradient[param_idx + feature], v);
}
} else
#endif
{
#pragma unroll
for (uint32_t f = 0; f < N_FEATURES_PER_LEVEL; ++f) {
atomicAdd(&grid_gradient[param_idx], (T)((float)grad[f] * weight));
}
}
}
}
......
0% Loading — or try again.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment