1 files changed, 193 insertions, 0 deletions
diff --git a/intern/cycles/kernel/kernel_work_stealing.h b/intern/cycles/kernel/kernel_work_stealing.h
new file mode 100644
index 00000000000..9b83d972e97
--- /dev/null
+++ b/intern/cycles/kernel/kernel_work_stealing.h
@@ -0,0 +1,193 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __KERNEL_WORK_STEALING_H__
+#define __KERNEL_WORK_STEALING_H__
+
+/*
+ * Utility functions for work stealing
+ */
+
+#ifdef __WORK_STEALING__
+
+#ifdef __KERNEL_OPENCL__
+#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
+#endif
+
+/* Work-group id of ray_index along dimension dim (0 = x, otherwise y).
+ * The index is split by a row width of tile_dim_x * parallel_samples and the
+ * part for the axis is divided by the local size; tile_dim_y is not read. */
+uint get_group_id_with_ray_index(uint ray_index,
+                                 uint tile_dim_x,
+                                 uint tile_dim_y,
+                                 uint parallel_samples,
+                                 int dim)
+{
+	const uint row_width = tile_dim_x * parallel_samples;
+	if(dim != 0) {
+		kernel_assert(dim == 1);
+		return (ray_index / row_width) / get_local_size(1);
+	}
+	return (ray_index % row_width) / get_local_size(0);
+}
+
+/* Total number of work items of work-group (grp_idx, grp_idy): the count of
+ * the group's threads along x and y that lie within the tile dimensions,
+ * multiplied together and by num_samples. */
+uint get_total_work(uint tile_dim_x,
+                    uint tile_dim_y,
+                    uint grp_idx,
+                    uint grp_idy,
+                    uint num_samples)
+{
+	/* Every group but the last along an axis covers a full local size. */
+	uint span_x = get_local_size(0);
+	uint span_y = get_local_size(1);
+
+	/* The last group keeps only the remainder, unless that remainder is
+	 * zero, in which case the tile divides evenly and it stays full. */
+	if(grp_idx == (get_num_groups(0) - 1) && (tile_dim_x % span_x) != 0) {
+		span_x = tile_dim_x % span_x;
+	}
+	if(grp_idy == (get_num_groups(1) - 1) && (tile_dim_y % span_y) != 0) {
+		span_y = tile_dim_y % span_y;
+	}
+
+	return span_x * span_y * num_samples;
+}
+
+/* Claim the next work item from the pool counter of the work-group that
+ * ray_index belongs to. The value returned by atomic_inc goes to *my_work.
+ *
+ * Returns 1 when the claimed item is valid (below the group's total work),
+ * and 0 when no work is left. atomic_inc is invoked in both cases. */
+int get_next_work(ccl_global uint *work_pool,
+                  ccl_private uint *my_work,
+                  uint tile_dim_x,
+                  uint tile_dim_y,
+                  uint num_samples,
+                  uint parallel_samples,
+                  uint ray_index)
+{
+	const uint gx = get_group_id_with_ray_index(
+	        ray_index, tile_dim_x, tile_dim_y, parallel_samples, 0);
+	const uint gy = get_group_id_with_ray_index(
+	        ray_index, tile_dim_x, tile_dim_y, parallel_samples, 1);
+	const uint work_limit =
+	        get_total_work(tile_dim_x, tile_dim_y, gx, gy, num_samples);
+
+	/* One counter per work-group, indexed row by row over the x groups. */
+	const uint slot = gy * get_num_groups(0) + gx;
+	*my_work = atomic_inc(&work_pool[slot]);
+
+	if(*my_work < work_limit) {
+		return 1;
+	}
+	return 0;
+}
+
+/* Decode the sample number from an assigned my_work value.
+ *
+ * The passed my_work is assumed to be valid. The sample number is my_work
+ * divided by the number of threads of the work-group (the one ray_index
+ * maps to) that lie within the tile dimensions. */
+uint get_my_sample(uint my_work,
+                   uint tile_dim_x,
+                   uint tile_dim_y,
+                   uint parallel_samples,
+                   uint ray_index)
+{
+	const uint gx = get_group_id_with_ray_index(
+	        ray_index, tile_dim_x, tile_dim_y, parallel_samples, 0);
+	const uint gy = get_group_id_with_ray_index(
+	        ray_index, tile_dim_x, tile_dim_y, parallel_samples, 1);
+
+	/* Threads of this group that lie inside the tile, per axis. Only the
+	 * last group along an axis can be partial; a zero remainder means the
+	 * tile divides evenly, so that group is full as well. */
+	uint span_x = get_local_size(0);
+	uint span_y = get_local_size(1);
+
+	if(gx == (get_num_groups(0) - 1)) {
+		const uint rest_x = tile_dim_x % get_local_size(0);
+		span_x = (rest_x != 0) ? rest_x : span_x;
+	}
+	if(gy == (get_num_groups(1) - 1)) {
+		const uint rest_y = tile_dim_y % get_local_size(1);
+		span_y = (rest_y != 0) ? rest_y : span_y;
+	}
+
+	/* Each sample accounts for span_x * span_y consecutive work values. */
+	const uint pixels_per_sample = span_x * span_y;
+	return my_work / pixels_per_sample;
+}
+
+/* Decode pixel and tile coordinates from an assigned my_work value.
+ *
+ * pixel_x, pixel_y receive the position including the tile offset;
+ * tile_x, tile_y receive that position with the tile offset subtracted.
+ * tile_dim_* and parallel_samples are only used to find the work-group
+ * that ray_index maps to and how many of its threads lie in the tile. */
+void get_pixel_tile_position(ccl_private uint *pixel_x,
+                             ccl_private uint *pixel_y,
+                             ccl_private uint *tile_x,
+                             ccl_private uint *tile_y,
+                             uint my_work,
+                             uint tile_dim_x,
+                             uint tile_dim_y,
+                             uint tile_offset_x,
+                             uint tile_offset_y,
+                             uint parallel_samples,
+                             uint ray_index)
+{
+	const uint gx = get_group_id_with_ray_index(
+	        ray_index, tile_dim_x, tile_dim_y, parallel_samples, 0);
+	const uint gy = get_group_id_with_ray_index(
+	        ray_index, tile_dim_x, tile_dim_y, parallel_samples, 1);
+
+	/* Threads of this group that lie inside the tile, per axis. Only the
+	 * last group along an axis can be partial; a zero remainder means the
+	 * tile divides evenly, so that group is full as well. */
+	uint span_x = get_local_size(0);
+	uint span_y = get_local_size(1);
+	if(gx == (get_num_groups(0) - 1)) {
+		const uint rest_x = tile_dim_x % get_local_size(0);
+		span_x = (rest_x != 0) ? rest_x : span_x;
+	}
+	if(gy == (get_num_groups(1) - 1)) {
+		const uint rest_y = tile_dim_y % get_local_size(1);
+		span_y = (rest_y != 0) ? rest_y : span_y;
+	}
+
+	/* Position inside the group: drop the sample part of my_work, then
+	 * unfold the remainder row by row over span_x columns. */
+	const uint local_index = my_work % (span_x * span_y);
+	const uint local_x = local_index % span_x;
+	const uint local_y = local_index / span_x;
+
+	/* Offset-free position: group origin plus position in the group. */
+	const uint in_tile_x = (gx * get_local_size(0)) + local_x;
+	const uint in_tile_y = (gy * get_local_size(1)) + local_y;
+
+	*pixel_x = tile_offset_x + in_tile_x;
+	*pixel_y = tile_offset_y + in_tile_y;
+	*tile_x = *pixel_x - tile_offset_x;
+	*tile_y = *pixel_y - tile_offset_y;
+}
+
+#endif  /* __WORK_STEALING__ */
+
+#endif  /* __KERNEL_WORK_STEALING_H__ */