Cycles: Improve denoising speed on GPUs with small tile sizes

Previously, the NLM kernels would be launched once per offset with one thread per pixel. However, with the smaller tile sizes that are now feasible, there wasn't enough work to fully occupy GPUs, which resulted in a significant slowdown. Therefore, the kernels are now launched in a single call that handles all offsets at once. This has two downsides: memory accesses to accumulating buffers are now atomic, and more importantly, the temporary memory now has to be allocated for every shift at once, increasing the required memory. On the other hand, of course, the smaller tiles significantly reduce the amount of memory needed. The main bottleneck right now is the construction of the transformation - there is nothing to be parallelized there, one thread per pixel is the maximum. I tried to parallelize the SVD implementation by storing the matrix in shared memory and launching one block per pixel, but that wasn't really going anywhere. To make the new code somewhat readable, the handling of rectangular regions was cleaned up a bit and commented; it should be easier to understand what's going on now. Also, some variables have been renamed to make the difference between buffer width and stride more apparent, in addition to some general style cleanup.
author: Lukas Stockner <lukas.stockner@freenet.de> 2017-11-10 06:34:14 +0300
committer: Lukas Stockner <lukas.stockner@freenet.de> 2017-11-30 09:37:08 +0300
commit: fa3d50af95fde76ef08590d2f86444f2f9fdca95 (patch)
tree: 516ea6cce9b6b3708389ad182a7dddf2974a1a10 /intern/cycles/util
parent: df7b9fa2eeb5908de4e1b3c2c6f7cf30329f1e3d (diff)
4 files changed, 81 insertions, 2 deletions
diff --git a/intern/cycles/util/CMakeLists.txt b/intern/cycles/util/CMakeLists.txt
index 7f3747a0f58..bc9def7ca41 100644
--- a/intern/cycles/util/CMakeLists.txt
+++ b/intern/cycles/util/CMakeLists.txt
@@ -68,6 +68,7 @@ set(SRC_HEADERS
 	util_path.h
 	util_progress.h
 	util_queue.h
+	util_rect.h
 	util_set.h
 	util_simd.h
 	util_sky_model.cpp
diff --git a/intern/cycles/util/util_math.h b/intern/cycles/util/util_math.h
index 39ce6a93982..d0e91a2a1c9 100644
--- a/intern/cycles/util/util_math.h
+++ b/intern/cycles/util/util_math.h
@@ -320,6 +320,8 @@ CCL_NAMESPACE_END
 #include "util/util_math_float3.h"
 #include "util/util_math_float4.h"
 
+#include "util/util_rect.h"
+
 CCL_NAMESPACE_BEGIN
 
 #ifndef __KERNEL_OPENCL__
diff --git a/intern/cycles/util/util_math_matrix.h b/intern/cycles/util/util_math_matrix.h
index b31dbe4fc67..382dad64ea5 100644
--- a/intern/cycles/util/util_math_matrix.h
+++ b/intern/cycles/util/util_math_matrix.h
@@ -98,7 +98,10 @@ ccl_device_inline void math_vec3_add(float3 *v, int n, float *x, float3 w)
 ccl_device_inline void math_vec3_add_strided(ccl_global float3 *v, int n, float *x, float3 w, int stride)
 {
 	for(int i = 0; i < n; i++) {
-		v[i*stride] += w*x[i];
+		ccl_global float *elem = (ccl_global float*) (v + i*stride);
+		atomic_add_and_fetch_float(elem+0, w.x*x[i]);
+		atomic_add_and_fetch_float(elem+1, w.y*x[i]);
+		atomic_add_and_fetch_float(elem+2, w.z*x[i]);
 	}
 }
 
@@ -136,7 +139,7 @@ ccl_device_inline void math_trimatrix_add_gramian_strided(ccl_global float *A,
 {
 	for(int row = 0; row < n; row++) {
 		for(int col = 0; col <= row; col++) {
-			MATHS(A, row, col, stride) += v[row]*v[col]*weight;
+			atomic_add_and_fetch_float(&MATHS(A, row, col, stride), v[row]*v[col]*weight);
 		}
 	}
 }
diff --git a/intern/cycles/util/util_rect.h b/intern/cycles/util/util_rect.h
new file mode 100644
index 00000000000..17a55a14d0b
--- /dev/null
+++ b/intern/cycles/util/util_rect.h
@@ -0,0 +1,73 @@
+/*
+ * Copyright 2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __UTIL_RECT_H__
+#define __UTIL_RECT_H__
+
+#include "util/util_types.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* Rectangles are represented as an int4 containing the coordinates of the lower-left and
+ * upper-right corners in the order (x0, y0, x1, y1); x1 and y1 are exclusive. */
+
+ccl_device_inline int4 rect_from_shape(int x0, int y0, int w, int h)
+{
+	return make_int4(x0, y0, x0 + w, y0 + h);
+}
+
+ccl_device_inline int4 rect_expand(int4 rect, int d)
+{
+	return make_int4(rect.x - d, rect.y - d, rect.z + d, rect.w + d);
+}
+
+/* Returns the intersection of two rects; if they don't overlap, the result fails rect_is_valid(). */
+ccl_device_inline int4 rect_clip(int4 a, int4 b)
+{
+	return make_int4(max(a.x, b.x), max(a.y, b.y), min(a.z, b.z), min(a.w, b.w));
+}
+
+ccl_device_inline bool rect_is_valid(int4 rect)
+{
+	return (rect.z > rect.x) && (rect.w > rect.y);
+}
+
+/* Returns the local row-major index of the pixel inside the rect. */
+ccl_device_inline int coord_to_local_index(int4 rect, int x, int y)
+{
+	int w = rect.z - rect.x;
+	return (y - rect.y) * w + (x - rect.x);
+}
+
+/* Finds the coordinates of a pixel given by its row-major index in the rect, and returns
+ * whether the pixel is inside it. Assumes idx >= 0 and a rect of non-zero width. */
+ccl_device_inline bool local_index_to_coord(int4 rect, int idx, int *x, int *y)
+{
+	int w = rect.z - rect.x;
+	*x = (idx % w) + rect.x;
+	*y = (idx / w) + rect.y;
+	return (*y < rect.w);
+}
+
+ccl_device_inline int rect_size(int4 rect)
+{
+	return (rect.z - rect.x) * (rect.w - rect.y);
+}
+
+CCL_NAMESPACE_END
+
+#endif /* __UTIL_RECT_H__ */
+
author	Lukas Stockner <lukas.stockner@freenet.de>	2017-11-10 06:34:14 +0300
committer	Lukas Stockner <lukas.stockner@freenet.de>	2017-11-30 09:37:08 +0300
commit	fa3d50af95fde76ef08590d2f86444f2f9fdca95 (patch)
tree	516ea6cce9b6b3708389ad182a7dddf2974a1a10 /intern/cycles/util
parent	df7b9fa2eeb5908de4e1b3c2c6f7cf30329f1e3d (diff)