From bae2b3a688a2c2ee3eb8457c62af3a10bae76131 Mon Sep 17 00:00:00 2001
From: Martijn Berger <mberger@denc.com>
Date: Fri, 25 Jul 2014 13:33:19 +0200
Subject: Switch to Cuda 4.0 style api for kernel invocation. This is a small
 clean-up that has no functional changes but makes code a bit more readable.

Differential revision: https://developer.blender.org/D659

Reviewed by: Sergey Sharybin, Thomas Dinges
---
 intern/cycles/device/device_cuda.cpp | 135 +++++++++++------------------------
 1 file changed, 41 insertions(+), 94 deletions(-)

(limited to 'intern/cycles/device')

diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp
index d1d227b3761..022dcd0275c 100644
--- a/intern/cycles/device/device_cuda.cpp
+++ b/intern/cycles/device/device_cuda.cpp
@@ -615,40 +615,17 @@ public:
 
 		if(have_error())
 			return;
-	
-		/* pass in parameters */
-		int offset = 0;
-		
-		cuda_assert(cuParamSetv(cuPathTrace, offset, &d_buffer, sizeof(d_buffer)));
-		offset += sizeof(d_buffer);
-
-		cuda_assert(cuParamSetv(cuPathTrace, offset, &d_rng_state, sizeof(d_rng_state)));
-		offset += sizeof(d_rng_state);
-
-		offset = align_up(offset, __alignof(sample));
-
-		cuda_assert(cuParamSeti(cuPathTrace, offset, sample));
-		offset += sizeof(sample);
-
-		cuda_assert(cuParamSeti(cuPathTrace, offset, rtile.x));
-		offset += sizeof(rtile.x);
-
-		cuda_assert(cuParamSeti(cuPathTrace, offset, rtile.y));
-		offset += sizeof(rtile.y);
-
-		cuda_assert(cuParamSeti(cuPathTrace, offset, rtile.w));
-		offset += sizeof(rtile.w);
 
-		cuda_assert(cuParamSeti(cuPathTrace, offset, rtile.h));
-		offset += sizeof(rtile.h);
-
-		cuda_assert(cuParamSeti(cuPathTrace, offset, rtile.offset));
-		offset += sizeof(rtile.offset);
-
-		cuda_assert(cuParamSeti(cuPathTrace, offset, rtile.stride));
-		offset += sizeof(rtile.stride);
-
-		cuda_assert(cuParamSetSize(cuPathTrace, offset));
+		/* pass in parameters */
+		void *args[] = {&d_buffer,
+						 &d_rng_state,
+						 &sample,
+						 &rtile.x,
+						 &rtile.y,
+						 &rtile.w,
+						 &rtile.h,
+						 &rtile.offset,
+						 &rtile.stride};
 
 		/* launch kernel */
 		int threads_per_block;
@@ -666,8 +643,11 @@ public:
 		int yblocks = (rtile.h + ythreads - 1)/ythreads;
 
 		cuda_assert(cuFuncSetCacheConfig(cuPathTrace, CU_FUNC_CACHE_PREFER_L1));
-		cuda_assert(cuFuncSetBlockShape(cuPathTrace, xthreads, ythreads, 1));
-		cuda_assert(cuLaunchGrid(cuPathTrace, xblocks, yblocks));
+
+		cuda_assert(cuLaunchKernel(cuPathTrace,
+								   xblocks , yblocks, 1, /* blocks */
+								   xthreads, ythreads, 1, /* threads */
+								   0, 0, args, 0));
 
 		cuda_assert(cuCtxSynchronize());
 
@@ -693,40 +673,19 @@ public:
 			cuda_assert(cuModuleGetFunction(&cuFilmConvert, cuModule, "kernel_cuda_convert_to_byte"));
 		}
 
-		/* pass in parameters */
-		int offset = 0;
-
-		cuda_assert(cuParamSetv(cuFilmConvert, offset, &d_rgba, sizeof(d_rgba)));
-		offset += sizeof(d_rgba);
-		
-		cuda_assert(cuParamSetv(cuFilmConvert, offset, &d_buffer, sizeof(d_buffer)));
-		offset += sizeof(d_buffer);
 
 		float sample_scale = 1.0f/(task.sample + 1);
-		offset = align_up(offset, __alignof(sample_scale));
-
-		cuda_assert(cuParamSetf(cuFilmConvert, offset, sample_scale));
-		offset += sizeof(sample_scale);
 
-		cuda_assert(cuParamSeti(cuFilmConvert, offset, task.x));
-		offset += sizeof(task.x);
-
-		cuda_assert(cuParamSeti(cuFilmConvert, offset, task.y));
-		offset += sizeof(task.y);
-
-		cuda_assert(cuParamSeti(cuFilmConvert, offset, task.w));
-		offset += sizeof(task.w);
-
-		cuda_assert(cuParamSeti(cuFilmConvert, offset, task.h));
-		offset += sizeof(task.h);
-
-		cuda_assert(cuParamSeti(cuFilmConvert, offset, task.offset));
-		offset += sizeof(task.offset);
-
-		cuda_assert(cuParamSeti(cuFilmConvert, offset, task.stride));
-		offset += sizeof(task.stride);
-
-		cuda_assert(cuParamSetSize(cuFilmConvert, offset));
+		/* pass in parameters */
+		void *args[] = {&d_rgba,
+						 &d_buffer,
+						 &sample_scale,
+						 &task.x,
+						 &task.y,
+						 &task.w,
+						 &task.h,
+						 &task.offset,
+						 &task.stride};
 
 		/* launch kernel */
 		int threads_per_block;
@@ -738,8 +697,11 @@ public:
 		int yblocks = (task.h + ythreads - 1)/ythreads;
 
 		cuda_assert(cuFuncSetCacheConfig(cuFilmConvert, CU_FUNC_CACHE_PREFER_L1));
-		cuda_assert(cuFuncSetBlockShape(cuFilmConvert, xthreads, ythreads, 1));
-		cuda_assert(cuLaunchGrid(cuFilmConvert, xblocks, yblocks));
+
+		cuda_assert(cuLaunchKernel(cuFilmConvert,
+								   xblocks , yblocks, 1, /* blocks */
+								   xthreads, ythreads, 1, /* threads */
+								   0, 0, args, 0));
 
 		unmap_pixels((rgba_byte)? rgba_byte: rgba_half);
 
@@ -777,31 +739,14 @@ public:
 			int shader_w = min(shader_chunk_size, end - shader_x);
 
 			for(int sample = 0; sample < task.num_samples; sample++) {
-				/* pass in parameters */
-				int offset = 0;
 
-				cuda_assert(cuParamSetv(cuShader, offset, &d_input, sizeof(d_input)));
-				offset += sizeof(d_input);
-
-				cuda_assert(cuParamSetv(cuShader, offset, &d_output, sizeof(d_output)));
-				offset += sizeof(d_output);
-
-				int shader_eval_type = task.shader_eval_type;
-				offset = align_up(offset, __alignof(shader_eval_type));
-
-				cuda_assert(cuParamSeti(cuShader, offset, task.shader_eval_type));
-				offset += sizeof(task.shader_eval_type);
-
-				cuda_assert(cuParamSeti(cuShader, offset, shader_x));
-				offset += sizeof(shader_x);
-
-				cuda_assert(cuParamSeti(cuShader, offset, shader_w));
-				offset += sizeof(shader_w);
-
-				cuda_assert(cuParamSeti(cuShader, offset, sample));
-				offset += sizeof(sample);
-
-				cuda_assert(cuParamSetSize(cuShader, offset));
+				/* pass in parameters */
+				void *args[] = {&d_input,
+								 &d_output,
+								 &task.shader_eval_type,
+								 &shader_x,
+								 &shader_w,
+								 &sample};
 
 				/* launch kernel */
 				int threads_per_block;
@@ -810,8 +755,10 @@ public:
 				int xblocks = (shader_w + threads_per_block - 1)/threads_per_block;
 
 				cuda_assert(cuFuncSetCacheConfig(cuShader, CU_FUNC_CACHE_PREFER_L1));
-				cuda_assert(cuFuncSetBlockShape(cuShader, threads_per_block, 1, 1));
-				cuda_assert(cuLaunchGrid(cuShader, xblocks, 1));
+				cuda_assert(cuLaunchKernel(cuShader,
+										   xblocks , 1, 1, /* blocks */
+										   threads_per_block, 1, 1, /* threads */
+										   0, 0, args, 0));
 
 				cuda_assert(cuCtxSynchronize());
 			}
-- 
cgit v1.2.3


From fc55c41bba8121bf3db67280c26e840f8b3f4124 Mon Sep 17 00:00:00 2001
From: Dalai Felinto <dfelinto@gmail.com>
Date: Tue, 22 Jul 2014 18:41:01 -0300
Subject: Cycles Bake: show progress bar during bake

Baking progress preview is not possible, in part due to the way the API
was designed. But at least you get to see the progress bar while baking.

Reviewers: sergey

Differential Revision: https://developer.blender.org/D656
---
 intern/cycles/device/device.h          |  1 +
 intern/cycles/device/device_cpu.cpp    | 56 +++++++++++++++++++++++-----------
 intern/cycles/device/device_cuda.cpp   | 25 ++++++++++-----
 intern/cycles/device/device_multi.cpp  |  5 +++
 intern/cycles/device/device_opencl.cpp | 15 +++++++--
 intern/cycles/device/device_task.cpp   | 25 +++++++++++----
 intern/cycles/device/device_task.h     |  3 +-
 7 files changed, 95 insertions(+), 35 deletions(-)

(limited to 'intern/cycles/device')

diff --git a/intern/cycles/device/device.h b/intern/cycles/device/device.h
index bcddd4f73e2..20ebfd391d6 100644
--- a/intern/cycles/device/device.h
+++ b/intern/cycles/device/device.h
@@ -122,6 +122,7 @@ public:
 	virtual bool load_kernels(bool experimental) { return true; }
 
 	/* tasks */
+	virtual int get_split_task_count(DeviceTask& task) = 0;
 	virtual void task_add(DeviceTask& task) = 0;
 	virtual void task_wait() = 0;
 	virtual void task_cancel() = 0;
diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp
index 7308d036fe3..4fdeef6bdcb 100644
--- a/intern/cycles/device/device_cpu.cpp
+++ b/intern/cycles/device/device_cpu.cpp
@@ -185,7 +185,7 @@ public:
 
 					tile.sample = sample + 1;
 
-					task.update_progress(tile);
+					task.update_progress(&tile);
 				}
 			}
 			else
@@ -207,7 +207,7 @@ public:
 
 					tile.sample = sample + 1;
 
-					task.update_progress(tile);
+					task.update_progress(&tile);
 				}
 			}
 			else
@@ -229,7 +229,7 @@ public:
 
 					tile.sample = sample + 1;
 
-					task.update_progress(tile);
+					task.update_progress(&tile);
 				}
 			}
 			else
@@ -251,7 +251,7 @@ public:
 
 					tile.sample = sample + 1;
 
-					task.update_progress(tile);
+					task.update_progress(&tile);
 				}
 			}
 			else
@@ -273,7 +273,7 @@ public:
 
 					tile.sample = sample + 1;
 
-					task.update_progress(tile);
+					task.update_progress(&tile);
 				}
 			}
 			else
@@ -294,7 +294,7 @@ public:
 
 					tile.sample = sample + 1;
 
-					task.update_progress(tile);
+					task.update_progress(&tile);
 				}
 			}
 
@@ -433,71 +433,83 @@ public:
 
 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
 		if(system_cpu_support_avx2()) {
-			for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) {
-				for(int sample = 0; sample < task.num_samples; sample++)
+			for(int sample = 0; sample < task.num_samples; sample++) {
+				for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++)
 					kernel_cpu_avx2_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x, sample);
 
 				if(task.get_cancel() || task_pool.canceled())
 					break;
+
+				task.update_progress(NULL);
 			}
 		}
 		else
 #endif
 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
 		if(system_cpu_support_avx()) {
-			for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) {
-				for(int sample = 0; sample < task.num_samples; sample++)
+			for(int sample = 0; sample < task.num_samples; sample++) {
+				for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++)
 					kernel_cpu_avx_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x, sample);
 
 				if(task.get_cancel() || task_pool.canceled())
 					break;
+
+				task.update_progress(NULL);
 			}
 		}
 		else
 #endif
 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41			
 		if(system_cpu_support_sse41()) {
-			for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) {
-				for(int sample = 0; sample < task.num_samples; sample++)
+			for(int sample = 0; sample < task.num_samples; sample++) {
+				for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++)
 					kernel_cpu_sse41_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x, sample);
 
 				if(task.get_cancel() || task_pool.canceled())
 					break;
+
+				task.update_progress(NULL);
 			}
 		}
 		else
 #endif
 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
 		if(system_cpu_support_sse3()) {
-			for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) {
-				for(int sample = 0; sample < task.num_samples; sample++)
+			for(int sample = 0; sample < task.num_samples; sample++) {
+				for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++)
 					kernel_cpu_sse3_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x, sample);
 
 				if(task.get_cancel() || task_pool.canceled())
 					break;
+
+				task.update_progress(NULL);
 			}
 		}
 		else
 #endif
 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
 		if(system_cpu_support_sse2()) {
-			for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) {
-				for(int sample = 0; sample < task.num_samples; sample++)
+			for(int sample = 0; sample < task.num_samples; sample++) {
+				for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++)
 					kernel_cpu_sse2_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x, sample);
 
 				if(task.get_cancel() || task_pool.canceled())
 					break;
+
+				task.update_progress(NULL);
 			}
 		}
 		else
 #endif
 		{
-			for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) {
-				for(int sample = 0; sample < task.num_samples; sample++)
+			for(int sample = 0; sample < task.num_samples; sample++) {
+				for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++)
 					kernel_cpu_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x, sample);
 
 				if(task.get_cancel() || task_pool.canceled())
 					break;
+
+				task.update_progress(NULL);
 			}
 		}
 
@@ -506,6 +518,14 @@ public:
 #endif
 	}
 
+	int get_split_task_count(DeviceTask& task)
+	{
+		/* only shader tasks pass a max_size (256); 0 is the declared default */
+		const int num_threads = TaskScheduler::num_threads();
+		const int max_size = (task.type == DeviceTask::SHADER)? 256: 0;
+		return task.get_subtask_count(num_threads, max_size);
+	}
+
 	void task_add(DeviceTask& task)
 	{
 		/* split task into smaller ones */
diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp
index 022dcd0275c..1c7f3a05b0b 100644
--- a/intern/cycles/device/device_cuda.cpp
+++ b/intern/cycles/device/device_cuda.cpp
@@ -732,13 +732,10 @@ public:
 		const int start = task.shader_x;
 		const int end = task.shader_x + task.shader_w;
 
-		for(int shader_x = start; shader_x < end; shader_x += shader_chunk_size) {
-			if(task.get_cancel())
-				break;
-
-			int shader_w = min(shader_chunk_size, end - shader_x);
-
-			for(int sample = 0; sample < task.num_samples; sample++) {
+		bool cancelled = false;
+		for(int sample = 0; sample < task.num_samples && !cancelled; sample++) {
+			for(int shader_x = start; shader_x < end; shader_x += shader_chunk_size) {
+				int shader_w = min(shader_chunk_size, end - shader_x);
 
 				/* pass in parameters */
 				void *args[] = {&d_input,
@@ -761,7 +758,14 @@ public:
 										   0, 0, args, 0));
 
 				cuda_assert(cuCtxSynchronize());
+
+				if(task.get_cancel()) {
+					cancelled = false;
+					break;
+				}
 			}
+
+			task.update_progress(NULL);
 		}
 
 		cuda_pop_context();
@@ -991,7 +995,7 @@ public:
 
 					tile.sample = sample + 1;
 
-					task->update_progress(tile);
+					task->update_progress(&tile);
 				}
 
 				task->release_tile(tile);
@@ -1015,6 +1019,11 @@ public:
 		}
 	};
 
+	int get_split_task_count(DeviceTask& task)
+	{
+		return 1; /* constant for the CUDA device: the task argument is not inspected */
+	}
+
 	void task_add(DeviceTask& task)
 	{
 		if(task.type == DeviceTask::FILM_CONVERT) {
diff --git a/intern/cycles/device/device_multi.cpp b/intern/cycles/device/device_multi.cpp
index c866ebaaea2..564fbdbadf8 100644
--- a/intern/cycles/device/device_multi.cpp
+++ b/intern/cycles/device/device_multi.cpp
@@ -278,6 +278,11 @@ public:
 		return -1;
 	}
 
+	int get_split_task_count(DeviceTask& task)
+	{
+		return 1; /* constant for the multi device: the task argument is not inspected */
+	}
+
 	void task_add(DeviceTask& task)
 	{
 		list<DeviceTask> tasks;
diff --git a/intern/cycles/device/device_opencl.cpp b/intern/cycles/device/device_opencl.cpp
index abfe445414a..3abda6a54c1 100644
--- a/intern/cycles/device/device_opencl.cpp
+++ b/intern/cycles/device/device_opencl.cpp
@@ -1068,7 +1068,11 @@ public:
 			kernel = ckShaderKernel;
 
 		for(int sample = 0; sample < task.num_samples; sample++) {
-			cl_int d_sample = task.sample;
+
+			if(task.get_cancel())
+				break;
+
+			cl_int d_sample = sample;
 
 			opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_data), (void*)&d_data));
 			opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_input), (void*)&d_input));
@@ -1084,6 +1088,8 @@ public:
 			opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_sample), (void*)&d_sample));
 
 			enqueue_kernel(kernel, task.shader_w, 1);
+
+			task.update_progress(NULL);
 		}
 	}
 
@@ -1113,7 +1119,7 @@ public:
 
 					tile.sample = sample + 1;
 
-					task->update_progress(tile);
+					task->update_progress(&tile);
 				}
 
 				task->release_tile(tile);
@@ -1130,6 +1136,11 @@ public:
 		}
 	};
 
+	int get_split_task_count(DeviceTask& task)
+	{
+		return 1; /* constant for the OpenCL device: the task argument is not inspected */
+	}
+
 	void task_add(DeviceTask& task)
 	{
 		task_pool.push(new OpenCLDeviceTask(this, task));
diff --git a/intern/cycles/device/device_task.cpp b/intern/cycles/device/device_task.cpp
index f436b54df68..dc124f8cf37 100644
--- a/intern/cycles/device/device_task.cpp
+++ b/intern/cycles/device/device_task.cpp
@@ -35,7 +35,7 @@ DeviceTask::DeviceTask(Type type_)
 	last_update_time = time_dt();
 }
 
-void DeviceTask::split(list<DeviceTask>& tasks, int num, int max_size)
+int DeviceTask::get_subtask_count(int num, int max_size)
 {
 	if(max_size != 0) {
 		int max_size_num;
@@ -53,7 +53,21 @@ void DeviceTask::split(list<DeviceTask>& tasks, int num, int max_size)
 
 	if(type == SHADER) {
 		num = min(shader_w, num);
+	}
+	else if(type == PATH_TRACE) {
+	}
+	else {
+		num = min(h, num);
+	}
 
+	return num;
+}
+
+void DeviceTask::split(list<DeviceTask>& tasks, int num, int max_size)
+{
+	num = get_subtask_count(num, max_size);
+
+	if(type == SHADER) {
 		for(int i = 0; i < num; i++) {
 			int tx = shader_x + (shader_w/num)*i;
 			int tw = (i == num-1)? shader_w - i*(shader_w/num): shader_w/num;
@@ -71,8 +85,6 @@ void DeviceTask::split(list<DeviceTask>& tasks, int num, int max_size)
 			tasks.push_back(*this);
 	}
 	else {
-		num = min(h, num);
-
 		for(int i = 0; i < num; i++) {
 			int ty = y + (h/num)*i;
 			int th = (i == num-1)? h - i*(h/num): h/num;
@@ -87,9 +99,10 @@ void DeviceTask::split(list<DeviceTask>& tasks, int num, int max_size)
 	}
 }
 
-void DeviceTask::update_progress(RenderTile &rtile)
+void DeviceTask::update_progress(RenderTile *rtile)
 {
-	if (type != PATH_TRACE)
+	if((type != PATH_TRACE) &&
+	   (type != SHADER))
 		return;
 
 	if(update_progress_sample)
@@ -99,7 +112,7 @@ void DeviceTask::update_progress(RenderTile &rtile)
 		double current_time = time_dt();
 
 		if (current_time - last_update_time >= 1.0) {
-			update_tile_sample(rtile);
+			update_tile_sample(*rtile);
 
 			last_update_time = current_time;
 		}
diff --git a/intern/cycles/device/device_task.h b/intern/cycles/device/device_task.h
index 91390674286..50216adefe2 100644
--- a/intern/cycles/device/device_task.h
+++ b/intern/cycles/device/device_task.h
@@ -52,9 +52,10 @@ public:
 
 	DeviceTask(Type type = PATH_TRACE);
 
+	int get_subtask_count(int num, int max_size = 0);
 	void split(list<DeviceTask>& tasks, int num, int max_size = 0);
 
-	void update_progress(RenderTile &rtile);
+	void update_progress(RenderTile *rtile);
 
 	boost::function<bool(Device *device, RenderTile&)> acquire_tile;
 	boost::function<void(void)> update_progress_sample;
-- 
cgit v1.2.3


From 65bf694331e3b64e4fd08b7c59a735529f27b11f Mon Sep 17 00:00:00 2001
From: Martijn Berger <martijn.berger@gmail.com>
Date: Mon, 28 Jul 2014 22:44:33 +0200
Subject: Implement get_split_task_count to make device_network compile again.

---
 intern/cycles/device/device_network.cpp | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'intern/cycles/device')

diff --git a/intern/cycles/device/device_network.cpp b/intern/cycles/device/device_network.cpp
index af051076009..dca9bf29e70 100644
--- a/intern/cycles/device/device_network.cpp
+++ b/intern/cycles/device/device_network.cpp
@@ -299,6 +299,11 @@ public:
 		snd.write();
 	}
 
+	int get_split_task_count(DeviceTask& task)
+	{
+		return 1; /* constant for the network device: the task argument is not inspected */
+	}
+
 private:
 	NetworkError error_func;
 };
-- 
cgit v1.2.3


From 9c3025cd26d547667847c81ec53b503d876be906 Mon Sep 17 00:00:00 2001
From: Campbell Barton <ideasman42@gmail.com>
Date: Sat, 2 Aug 2014 16:53:52 +1000
Subject: Spelling

---
 intern/cycles/device/device.cpp      | 2 +-
 intern/cycles/device/device_cuda.cpp | 8 ++++----
 2 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'intern/cycles/device')

diff --git a/intern/cycles/device/device.cpp b/intern/cycles/device/device.cpp
index 7fd1b79f6bc..fa1f0acadde 100644
--- a/intern/cycles/device/device.cpp
+++ b/intern/cycles/device/device.cpp
@@ -66,7 +66,7 @@ void Device::draw_pixels(device_memory& rgba, int y, int w, int h, int dy, int w
 	glColor3f(1.0f, 1.0f, 1.0f);
 
 	if(rgba.data_type == TYPE_HALF) {
-		/* for multi devices, this assumes the ineffecient method that we allocate
+		/* for multi devices, this assumes the inefficient method that we allocate
 		 * all pixels on the device even though we only render to a subset */
 		GLhalf *data_pointer = (GLhalf*)rgba.data_pointer;
 		data_pointer += 4*y*w;
diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp
index 1c7f3a05b0b..f0f32f87eed 100644
--- a/intern/cycles/device/device_cuda.cpp
+++ b/intern/cycles/device/device_cuda.cpp
@@ -732,8 +732,8 @@ public:
 		const int start = task.shader_x;
 		const int end = task.shader_x + task.shader_w;
 
-		bool cancelled = false;
-		for(int sample = 0; sample < task.num_samples && !cancelled; sample++) {
+		bool canceled = false;
+		for(int sample = 0; sample < task.num_samples && !canceled; sample++) {
 			for(int shader_x = start; shader_x < end; shader_x += shader_chunk_size) {
 				int shader_w = min(shader_chunk_size, end - shader_x);
 
@@ -760,7 +760,7 @@ public:
 				cuda_assert(cuCtxSynchronize());
 
 				if(task.get_cancel()) {
-					cancelled = false;
+					canceled = true; /* must be true: the outer sample loop runs while !canceled */
 					break;
 				}
 			}
@@ -907,7 +907,7 @@ public:
 
 			cuda_push_context();
 
-			/* for multi devices, this assumes the ineffecient method that we allocate
+			/* for multi devices, this assumes the inefficient method that we allocate
 			 * all pixels on the device even though we only render to a subset */
 			size_t offset = 4*y*w;
 
-- 
cgit v1.2.3


From 77b7e1fe9abb882b7bd1d60f5273e03f079d8a54 Mon Sep 17 00:00:00 2001
From: Sergey Sharybin <sergey.vfx@gmail.com>
Date: Tue, 5 Aug 2014 13:57:50 +0600
Subject: Deduplicate CUDA and OpenCL wranglers

Until now this was mainly about the OpenCL wrangler being duplicated
between Cycles and the Compositor, but with the OpenSubdiv work those
wranglers were going to be duplicated yet again.

This commit makes Cycles and the Compositor use the wranglers
from these repositories:

  - https://github.com/CudaWrangler/cuew
  - https://github.com/OpenCLWrangler/clew

These repositories are based on the wranglers we used before,
and they will likely continue to be maintained by us plus some
more players in the market.

A pretty straightforward change, with some tricks in
CMake/SCons to make these libs get passed to the linker
after all other libraries, so that OpenSubdiv can be linked
against those wranglers in the future.

For those who are worried about Cycles becoming less standalone:
that is not true. It is rather more flexible now, and in the future
different wranglers might be used in Cycles. For now it just means
those libs would need to be put into the Cycles repository together
with some other libs from Blender, such as mikkspace.

This is mainly a platform maintenance commit; there should not be
any user-visible changes.

Reviewers: juicyfruit, dingto, campbellbarton

Reviewed By: juicyfruit, dingto, campbellbarton

Differential Revision: https://developer.blender.org/D707
---
 intern/cycles/device/CMakeLists.txt    |   2 +
 intern/cycles/device/device.cpp        |  17 +++---
 intern/cycles/device/device_cuda.cpp   | 103 +++++++++++----------------------
 intern/cycles/device/device_intern.h   |   2 +
 intern/cycles/device/device_opencl.cpp |  25 +++++++-
 5 files changed, 71 insertions(+), 78 deletions(-)

(limited to 'intern/cycles/device')

diff --git a/intern/cycles/device/CMakeLists.txt b/intern/cycles/device/CMakeLists.txt
index ae3309df3d9..a62ce29f722 100644
--- a/intern/cycles/device/CMakeLists.txt
+++ b/intern/cycles/device/CMakeLists.txt
@@ -11,6 +11,8 @@ set(INC
 set(INC_SYS
 	${OPENGL_INCLUDE_DIR}
 	${GLEW_INCLUDE_PATH}
+	../../../extern/cuew/include
+	../../../extern/clew/include
 )
 
 set(SRC
diff --git a/intern/cycles/device/device.cpp b/intern/cycles/device/device.cpp
index fa1f0acadde..efdfa98cfb5 100644
--- a/intern/cycles/device/device.cpp
+++ b/intern/cycles/device/device.cpp
@@ -20,12 +20,13 @@
 #include "device.h"
 #include "device_intern.h"
 
-#include "util_cuda.h"
+#include "cuew.h"
+#include "clew.h"
+
 #include "util_debug.h"
 #include "util_foreach.h"
 #include "util_half.h"
 #include "util_math.h"
-#include "util_opencl.h"
 #include "util_opengl.h"
 #include "util_time.h"
 #include "util_types.h"
@@ -141,7 +142,7 @@ Device *Device::create(DeviceInfo& info, Stats &stats, bool background)
 			break;
 #ifdef WITH_CUDA
 		case DEVICE_CUDA:
-			if(cuLibraryInit())
+			if(device_cuda_init())
 				device = device_cuda_create(info, stats, background);
 			else
 				device = NULL;
@@ -159,7 +160,7 @@ Device *Device::create(DeviceInfo& info, Stats &stats, bool background)
 #endif
 #ifdef WITH_OPENCL
 		case DEVICE_OPENCL:
-			if(clLibraryInit())
+			if(device_opencl_init())
 				device = device_opencl_create(info, stats, background);
 			else
 				device = NULL;
@@ -213,12 +214,12 @@ vector<DeviceType>& Device::available_types()
 		types.push_back(DEVICE_CPU);
 
 #ifdef WITH_CUDA
-		if(cuLibraryInit())
+		if(device_cuda_init())
 			types.push_back(DEVICE_CUDA);
 #endif
 
 #ifdef WITH_OPENCL
-		if(clLibraryInit())
+		if(device_opencl_init())
 			types.push_back(DEVICE_OPENCL);
 #endif
 
@@ -242,12 +243,12 @@ vector<DeviceInfo>& Device::available_devices()
 
 	if(!devices_init) {
 #ifdef WITH_CUDA
-		if(cuLibraryInit())
+		if(device_cuda_init())
 			device_cuda_info(devices);
 #endif
 
 #ifdef WITH_OPENCL
-		if(clLibraryInit())
+		if(device_opencl_init())
 			device_opencl_info(devices);
 #endif
 
diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp
index f0f32f87eed..6629069c6c6 100644
--- a/intern/cycles/device/device_cuda.cpp
+++ b/intern/cycles/device/device_cuda.cpp
@@ -23,7 +23,7 @@
 
 #include "buffers.h"
 
-#include "util_cuda.h"
+#include "cuew.h"
 #include "util_debug.h"
 #include "util_map.h"
 #include "util_opengl.h"
@@ -61,65 +61,10 @@ public:
 		return (CUdeviceptr)mem;
 	}
 
-	static const char *cuda_error_string(CUresult result)
+	static bool have_precompiled_kernels()
 	{
-		switch(result) {
-			case CUDA_SUCCESS: return "No errors";
-			case CUDA_ERROR_INVALID_VALUE: return "Invalid value";
-			case CUDA_ERROR_OUT_OF_MEMORY: return "Out of memory";
-			case CUDA_ERROR_NOT_INITIALIZED: return "Driver not initialized";
-			case CUDA_ERROR_DEINITIALIZED: return "Driver deinitialized";
-
-			case CUDA_ERROR_NO_DEVICE: return "No CUDA-capable device available";
-			case CUDA_ERROR_INVALID_DEVICE: return "Invalid device";
-
-			case CUDA_ERROR_INVALID_IMAGE: return "Invalid kernel image";
-			case CUDA_ERROR_INVALID_CONTEXT: return "Invalid context";
-			case CUDA_ERROR_MAP_FAILED: return "Map failed";
-			case CUDA_ERROR_UNMAP_FAILED: return "Unmap failed";
-			case CUDA_ERROR_ARRAY_IS_MAPPED: return "Array is mapped";
-			case CUDA_ERROR_ALREADY_MAPPED: return "Already mapped";
-			case CUDA_ERROR_NO_BINARY_FOR_GPU: return "No binary for GPU";
-			case CUDA_ERROR_ALREADY_ACQUIRED: return "Already acquired";
-			case CUDA_ERROR_NOT_MAPPED: return "Not mapped";
-			case CUDA_ERROR_NOT_MAPPED_AS_ARRAY: return "Mapped resource not available for access as an array";
-			case CUDA_ERROR_NOT_MAPPED_AS_POINTER: return "Mapped resource not available for access as a pointer";
-			case CUDA_ERROR_ECC_UNCORRECTABLE: return "Uncorrectable ECC error detected";
-			case CUDA_ERROR_UNSUPPORTED_LIMIT: return "CUlimit not supported by device";
-			case CUDA_ERROR_CONTEXT_ALREADY_IN_USE: return "Context already in use";
-			case CUDA_ERROR_PEER_ACCESS_UNSUPPORTED: return "Peer access unsupported";
-			case CUDA_ERROR_INVALID_PTX: return "Invalid PTX code";
-
-			case CUDA_ERROR_INVALID_SOURCE: return "Invalid source";
-			case CUDA_ERROR_FILE_NOT_FOUND: return "File not found";
-			case CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND: return "Link to a shared object failed to resolve";
-			case CUDA_ERROR_SHARED_OBJECT_INIT_FAILED: return "Shared object initialization failed";
-			case CUDA_ERROR_OPERATING_SYSTEM: return "OS call failed";
-
-			case CUDA_ERROR_INVALID_HANDLE: return "Invalid handle";
-
-			case CUDA_ERROR_NOT_FOUND: return "Not found";
-
-			case CUDA_ERROR_NOT_READY: return "CUDA not ready";
-
-			case CUDA_ERROR_ILLEGAL_ADDRESS: return "Illegal address";
-			case CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES: return "Launch exceeded resources";
-			case CUDA_ERROR_LAUNCH_TIMEOUT: return "Launch exceeded time out";
-			case CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING: return "Launch with incompatible texturing";
-			case CUDA_ERROR_HARDWARE_STACK_ERROR: return "Stack error";
-			case CUDA_ERROR_ILLEGAL_INSTRUCTION: return "Illegal instruction";
-			case CUDA_ERROR_MISALIGNED_ADDRESS: return "Misaligned address";
-			case CUDA_ERROR_INVALID_ADDRESS_SPACE: return "Invalid address space";
-			case CUDA_ERROR_INVALID_PC: return "Invalid program counter";
-			case CUDA_ERROR_LAUNCH_FAILED: return "Launch failed";
-
-			case CUDA_ERROR_NOT_PERMITTED: return "Operation not permitted";
-			case CUDA_ERROR_NOT_SUPPORTED: return "Operation not supported";
-
-			case CUDA_ERROR_UNKNOWN: return "Unknown error";
-
-			default: return "Unknown CUDA error value";
-		}
+		string cubins_path = path_get("lib");
+		return path_exists(cubins_path);
 	}
 
 /*#ifdef NDEBUG
@@ -141,7 +86,7 @@ public:
 		CUresult result = stmt; \
 		\
 		if(result != CUDA_SUCCESS) { \
-			string message = string_printf("CUDA error: %s in %s", cuda_error_string(result), #stmt); \
+			string message = string_printf("CUDA error: %s in %s", cuewErrorString(result), #stmt); \
 			if(error_msg == "") \
 				error_msg = message; \
 			fprintf(stderr, "%s\n", message.c_str()); \
@@ -155,7 +100,7 @@ public:
 		if(result == CUDA_SUCCESS)
 			return false;
 
-		string message = string_printf("CUDA error at %s: %s", stmt.c_str(), cuda_error_string(result));
+		string message = string_printf("CUDA error at %s: %s", stmt.c_str(), cuewErrorString(result));
 		if(error_msg == "")
 			error_msg = message;
 		fprintf(stderr, "%s\n", message.c_str());
@@ -275,7 +220,7 @@ public:
 			return cubin;
 
 #ifdef _WIN32
-		if(cuHavePrecompiledKernels()) {
+		if(have_precompiled_kernels()) {
 			if(major < 2)
 				cuda_error_message(string_printf("CUDA device requires compute capability 2.0 or up, found %d.%d. Your GPU is not supported.", major, minor));
 			else
@@ -285,14 +230,14 @@ public:
 #endif
 
 		/* if not, find CUDA compiler */
-		string nvcc = cuCompilerPath();
+		const char *nvcc = cuewCompilerPath();
 
-		if(nvcc == "") {
+		if(nvcc == NULL) {
 			cuda_error_message("CUDA nvcc compiler not found. Install CUDA toolkit in default location.");
 			return "";
 		}
 
-		int cuda_version = cuCompilerVersion();
+		int cuda_version = cuewCompilerVersion();
 
 		if(cuda_version == 0) {
 			cuda_error_message("CUDA nvcc compiler version could not be parsed.");
@@ -317,7 +262,7 @@ public:
 
 		string command = string_printf("\"%s\" -arch=sm_%d%d -m%d --cubin \"%s\" "
 			"-o \"%s\" --ptxas-options=\"-v\" -I\"%s\" -DNVCC -D__KERNEL_CUDA_VERSION__=%d",
-			nvcc.c_str(), major, minor, machine, kernel.c_str(), cubin.c_str(), include.c_str(), cuda_version);
+			nvcc, major, minor, machine, kernel.c_str(), cubin.c_str(), include.c_str(), cuda_version);
 
 		printf("%s\n", command.c_str());
 
@@ -1050,6 +995,28 @@ public:
 	}
 };
 
+bool device_cuda_init(void)
+{
+	/* cache the outcome so the probing below runs only on the first call */
+	static bool initialized = false;
+	static bool result = false;
+
+	if (!initialized) {
+		initialized = true;
+
+		if (cuewInit() == CUEW_SUCCESS) {
+			if(CUDADevice::have_precompiled_kernels())
+				result = true;
+#ifndef _WIN32
+			else if(cuewCompilerPath() != NULL)
+				result = true;
+#endif
+		}
+	}
+
+	return result;
+}
+
 Device *device_cuda_create(DeviceInfo& info, Stats &stats, bool background)
 {
 	return new CUDADevice(info, stats, background);
@@ -1063,13 +1030,13 @@ void device_cuda_info(vector<DeviceInfo>& devices)
 	result = cuInit(0);
 	if(result != CUDA_SUCCESS) {
 		if(result != CUDA_ERROR_NO_DEVICE)
-			fprintf(stderr, "CUDA cuInit: %s\n", CUDADevice::cuda_error_string(result));
+			fprintf(stderr, "CUDA cuInit: %s\n", cuewErrorString(result));
 		return;
 	}
 
 	result = cuDeviceGetCount(&count);
 	if(result != CUDA_SUCCESS) {
-		fprintf(stderr, "CUDA cuDeviceGetCount: %s\n", CUDADevice::cuda_error_string(result));
+		fprintf(stderr, "CUDA cuDeviceGetCount: %s\n", cuewErrorString(result));
 		return;
 	}
 	
diff --git a/intern/cycles/device/device_intern.h b/intern/cycles/device/device_intern.h
index 7eb66c25a81..80f1e2441a5 100644
--- a/intern/cycles/device/device_intern.h
+++ b/intern/cycles/device/device_intern.h
@@ -22,7 +22,9 @@ CCL_NAMESPACE_BEGIN
 class Device;
 
 Device *device_cpu_create(DeviceInfo& info, Stats &stats, bool background);
+bool device_opencl_init(void);
 Device *device_opencl_create(DeviceInfo& info, Stats &stats, bool background);
+bool device_cuda_init(void);
 Device *device_cuda_create(DeviceInfo& info, Stats &stats, bool background);
 Device *device_network_create(DeviceInfo& info, Stats &stats, const char *address);
 Device *device_multi_create(DeviceInfo& info, Stats &stats, bool background);
diff --git a/intern/cycles/device/device_opencl.cpp b/intern/cycles/device/device_opencl.cpp
index 3abda6a54c1..fb106989201 100644
--- a/intern/cycles/device/device_opencl.cpp
+++ b/intern/cycles/device/device_opencl.cpp
@@ -25,11 +25,12 @@
 
 #include "buffers.h"
 
+#include "clew.h"
+
 #include "util_foreach.h"
 #include "util_map.h"
 #include "util_math.h"
 #include "util_md5.h"
-#include "util_opencl.h"
 #include "util_opengl.h"
 #include "util_path.h"
 #include "util_time.h"
@@ -552,7 +553,7 @@ public:
 		device_initialized = true;
 	}
 
-	static void context_notify_callback(const char *err_info,
+	static void CL_CALLBACK context_notify_callback(const char *err_info,
 		const void *private_info, size_t cb, void *user_data)
 	{
 		char name[256];
@@ -1162,6 +1163,26 @@ Device *device_opencl_create(DeviceInfo& info, Stats &stats, bool background)
 	return new OpenCLDevice(info, stats, background);
 }
 
+bool device_opencl_init(void) {
+	static bool initialized = false;
+	static bool result = false;
+
+	if (initialized)
+		return result;
+
+	initialized = true;
+
+	// OpenCL disabled for now, only works with this environment variable set
+	if(!getenv("CYCLES_OPENCL_TEST")) {
+		result = false;
+	}
+	else {
+		result = clewInit() == CLEW_SUCCESS;
+	}
+
+	return result;
+}
+
 void device_opencl_info(vector<DeviceInfo>& devices)
 {
 	vector<cl_device_id> device_ids;
-- 
cgit v1.2.3


From a48b372b04421b00644a0660bfdf42229b5ffceb Mon Sep 17 00:00:00 2001
From: Dalai Felinto <dfelinto@gmail.com>
Date: Tue, 5 Aug 2014 13:50:50 -0300
Subject: Fix T41222 Blender gives weird output when baking (4096*4096)
 resolution on GPU

In collaboration with Sergey Sharybin.

Also thanks to Wolfgang Faehnle (mib2berlin) for help testing the
solutions.

Reviewers: sergey

Differential Revision: https://developer.blender.org/D690
---
 intern/cycles/device/device_cuda.cpp  | 15 ++++++++++++++-
 intern/cycles/device/device_multi.cpp | 13 ++++++++++++-
 2 files changed, 26 insertions(+), 2 deletions(-)

(limited to 'intern/cycles/device')

diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp
index 6629069c6c6..d76ffb10786 100644
--- a/intern/cycles/device/device_cuda.cpp
+++ b/intern/cycles/device/device_cuda.cpp
@@ -25,6 +25,7 @@
 
 #include "cuew.h"
 #include "util_debug.h"
+#include "util_foreach.h"
 #include "util_map.h"
 #include "util_opengl.h"
 #include "util_path.h"
@@ -966,7 +967,10 @@ public:
 
 	int get_split_task_count(DeviceTask& task)
 	{
-		return 1;
+		if (task.type == DeviceTask::SHADER)
+			return task.get_subtask_count(TaskScheduler::num_threads(), 1024 * 1024);
+		else
+			return 1;
 	}
 
 	void task_add(DeviceTask& task)
@@ -979,6 +983,15 @@ public:
 			cuda_assert(cuCtxSynchronize());
 			cuda_pop_context();
 		}
+		else if(task.type == DeviceTask::SHADER) {
+			/* split task into smaller ones */
+			list<DeviceTask> tasks;
+
+			task.split(tasks, TaskScheduler::num_threads(), 1024 * 1024);
+
+			foreach(DeviceTask& task, tasks)
+				task_pool.push(new CUDADeviceTask(this, task));
+		}
 		else {
 			task_pool.push(new CUDADeviceTask(this, task));
 		}
diff --git a/intern/cycles/device/device_multi.cpp b/intern/cycles/device/device_multi.cpp
index 564fbdbadf8..7f055c79491 100644
--- a/intern/cycles/device/device_multi.cpp
+++ b/intern/cycles/device/device_multi.cpp
@@ -280,7 +280,18 @@ public:
 
 	int get_split_task_count(DeviceTask& task)
 	{
-		return 1;
+		int total_tasks = 0;
+		list<DeviceTask> tasks;
+		task.split(tasks, devices.size());
+		foreach(SubDevice& sub, devices) {
+			if(!tasks.empty()) {
+				DeviceTask subtask = tasks.front();
+				tasks.pop_front();
+
+				total_tasks += sub.device->get_split_task_count(subtask);
+			}
+		}
+		return total_tasks;
 	}
 
 	void task_add(DeviceTask& task)
-- 
cgit v1.2.3


From c020bd2e73de1d20c0d5996d35a5a21821ecc9be Mon Sep 17 00:00:00 2001
From: Martijn Berger <martijn.berger@gmail.com>
Date: Sat, 9 Aug 2014 14:27:40 +0200
Subject: Cycles OpenCL error to string removed in favour of the same function
 in clew.

---
 intern/cycles/device/device_opencl.cpp | 59 ++--------------------------------
 1 file changed, 3 insertions(+), 56 deletions(-)

(limited to 'intern/cycles/device')

diff --git a/intern/cycles/device/device_opencl.cpp b/intern/cycles/device/device_opencl.cpp
index fb106989201..077ff9df51e 100644
--- a/intern/cycles/device/device_opencl.cpp
+++ b/intern/cycles/device/device_opencl.cpp
@@ -335,63 +335,10 @@ public:
 	bool device_initialized;
 	string platform_name;
 
-	const char *opencl_error_string(cl_int err)
-	{
-		switch (err) {
-			case CL_SUCCESS: return "Success!";
-			case CL_DEVICE_NOT_FOUND: return "Device not found.";
-			case CL_DEVICE_NOT_AVAILABLE: return "Device not available";
-			case CL_COMPILER_NOT_AVAILABLE: return "Compiler not available";
-			case CL_MEM_OBJECT_ALLOCATION_FAILURE: return "Memory object allocation failure";
-			case CL_OUT_OF_RESOURCES: return "Out of resources";
-			case CL_OUT_OF_HOST_MEMORY: return "Out of host memory";
-			case CL_PROFILING_INFO_NOT_AVAILABLE: return "Profiling information not available";
-			case CL_MEM_COPY_OVERLAP: return "Memory copy overlap";
-			case CL_IMAGE_FORMAT_MISMATCH: return "Image format mismatch";
-			case CL_IMAGE_FORMAT_NOT_SUPPORTED: return "Image format not supported";
-			case CL_BUILD_PROGRAM_FAILURE: return "Program build failure";
-			case CL_MAP_FAILURE: return "Map failure";
-			case CL_INVALID_VALUE: return "Invalid value";
-			case CL_INVALID_DEVICE_TYPE: return "Invalid device type";
-			case CL_INVALID_PLATFORM: return "Invalid platform";
-			case CL_INVALID_DEVICE: return "Invalid device";
-			case CL_INVALID_CONTEXT: return "Invalid context";
-			case CL_INVALID_QUEUE_PROPERTIES: return "Invalid queue properties";
-			case CL_INVALID_COMMAND_QUEUE: return "Invalid command queue";
-			case CL_INVALID_HOST_PTR: return "Invalid host pointer";
-			case CL_INVALID_MEM_OBJECT: return "Invalid memory object";
-			case CL_INVALID_IMAGE_FORMAT_DESCRIPTOR: return "Invalid image format descriptor";
-			case CL_INVALID_IMAGE_SIZE: return "Invalid image size";
-			case CL_INVALID_SAMPLER: return "Invalid sampler";
-			case CL_INVALID_BINARY: return "Invalid binary";
-			case CL_INVALID_BUILD_OPTIONS: return "Invalid build options";
-			case CL_INVALID_PROGRAM: return "Invalid program";
-			case CL_INVALID_PROGRAM_EXECUTABLE: return "Invalid program executable";
-			case CL_INVALID_KERNEL_NAME: return "Invalid kernel name";
-			case CL_INVALID_KERNEL_DEFINITION: return "Invalid kernel definition";
-			case CL_INVALID_KERNEL: return "Invalid kernel";
-			case CL_INVALID_ARG_INDEX: return "Invalid argument index";
-			case CL_INVALID_ARG_VALUE: return "Invalid argument value";
-			case CL_INVALID_ARG_SIZE: return "Invalid argument size";
-			case CL_INVALID_KERNEL_ARGS: return "Invalid kernel arguments";
-			case CL_INVALID_WORK_DIMENSION: return "Invalid work dimension";
-			case CL_INVALID_WORK_GROUP_SIZE: return "Invalid work group size";
-			case CL_INVALID_WORK_ITEM_SIZE: return "Invalid work item size";
-			case CL_INVALID_GLOBAL_OFFSET: return "Invalid global offset";
-			case CL_INVALID_EVENT_WAIT_LIST: return "Invalid event wait list";
-			case CL_INVALID_EVENT: return "Invalid event";
-			case CL_INVALID_OPERATION: return "Invalid operation";
-			case CL_INVALID_GL_OBJECT: return "Invalid OpenGL object";
-			case CL_INVALID_BUFFER_SIZE: return "Invalid buffer size";
-			case CL_INVALID_MIP_LEVEL: return "Invalid mip-map level";
-			default: return "Unknown";
-		}
-	}
-
 	bool opencl_error(cl_int err)
 	{
 		if(err != CL_SUCCESS) {
-			string message = string_printf("OpenCL error (%d): %s", err, opencl_error_string(err));
+			string message = string_printf("OpenCL error (%d): %s", err, clewErrorString(err));
 			if(error_msg == "")
 				error_msg = message;
 			fprintf(stderr, "%s\n", message.c_str());
@@ -413,7 +360,7 @@ public:
 		cl_int err = stmt; \
 		\
 		if(err != CL_SUCCESS) { \
-			string message = string_printf("OpenCL error: %s in %s", opencl_error_string(err), #stmt); \
+			string message = string_printf("OpenCL error: %s in %s", clewErrorString(err), #stmt); \
 			if(error_msg == "") \
 				error_msg = message; \
 			fprintf(stderr, "%s\n", message.c_str()); \
@@ -423,7 +370,7 @@ public:
 	void opencl_assert_err(cl_int err, const char* where)
 	{
 		if(err != CL_SUCCESS) {
-			string message = string_printf("OpenCL error (%d): %s in %s", err, opencl_error_string(err), where);
+			string message = string_printf("OpenCL error (%d): %s in %s", err, clewErrorString(err), where);
 			if(error_msg == "")
 				error_msg = message;
 			fprintf(stderr, "%s\n", message.c_str());
-- 
cgit v1.2.3


From 2c5b6859d91f086f3f7205f0cc15244882ca7ace Mon Sep 17 00:00:00 2001
From: Dalai Felinto <dfelinto@gmail.com>
Date: Wed, 6 Aug 2014 13:10:56 -0300
Subject: Revert "Fix T41222 Blender gives weird output when baking (4096*4096)
 resolution on GPU"

This reverts commit a48b372b04421b00644a0660bfdf42229b5ffceb.

Leaving only the part that fixes device_multi.cpp
---
 intern/cycles/device/device_cuda.cpp | 15 +--------------
 1 file changed, 1 insertion(+), 14 deletions(-)

(limited to 'intern/cycles/device')

diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp
index d76ffb10786..6629069c6c6 100644
--- a/intern/cycles/device/device_cuda.cpp
+++ b/intern/cycles/device/device_cuda.cpp
@@ -25,7 +25,6 @@
 
 #include "cuew.h"
 #include "util_debug.h"
-#include "util_foreach.h"
 #include "util_map.h"
 #include "util_opengl.h"
 #include "util_path.h"
@@ -967,10 +966,7 @@ public:
 
 	int get_split_task_count(DeviceTask& task)
 	{
-		if (task.type == DeviceTask::SHADER)
-			return task.get_subtask_count(TaskScheduler::num_threads(), 1024 * 1024);
-		else
-			return 1;
+		return 1;
 	}
 
 	void task_add(DeviceTask& task)
@@ -983,15 +979,6 @@ public:
 			cuda_assert(cuCtxSynchronize());
 			cuda_pop_context();
 		}
-		else if(task.type == DeviceTask::SHADER) {
-			/* split task into smaller ones */
-			list<DeviceTask> tasks;
-
-			task.split(tasks, TaskScheduler::num_threads(), 1024 * 1024);
-
-			foreach(DeviceTask& task, tasks)
-				task_pool.push(new CUDADeviceTask(this, task));
-		}
 		else {
 			task_pool.push(new CUDADeviceTask(this, task));
 		}
-- 
cgit v1.2.3


From 8d3cc431d7fdcc9f3243cc24dfdcb94124be0993 Mon Sep 17 00:00:00 2001
From: Dalai Felinto <dfelinto@gmail.com>
Date: Tue, 19 Aug 2014 11:39:40 +0200
Subject: Fix T41471 Cycles Bake: Setting small tile size results in wrong bake
 with stripes rather than the expected noise pattern

This problem was introduced in 983cbafd1877f8dbaae60b064a14e27b5b640f18.
Basically, the issue is that we were not getting a unique index in the
baking routine for the RNG (random number generator).

Reviewers: sergey

Differential Revision: https://developer.blender.org/D749
---
 intern/cycles/device/device_cpu.cpp    | 18 ++++++++++++------
 intern/cycles/device/device_cuda.cpp   |  2 ++
 intern/cycles/device/device_opencl.cpp |  2 ++
 3 files changed, 16 insertions(+), 6 deletions(-)

(limited to 'intern/cycles/device')

diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp
index 4fdeef6bdcb..fd5ae1d7828 100644
--- a/intern/cycles/device/device_cpu.cpp
+++ b/intern/cycles/device/device_cpu.cpp
@@ -435,7 +435,8 @@ public:
 		if(system_cpu_support_avx2()) {
 			for(int sample = 0; sample < task.num_samples; sample++) {
 				for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++)
-					kernel_cpu_avx2_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x, sample);
+					kernel_cpu_avx2_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output,
+					    task.shader_eval_type, x, task.offset, sample);
 
 				if(task.get_cancel() || task_pool.canceled())
 					break;
@@ -449,7 +450,8 @@ public:
 		if(system_cpu_support_avx()) {
 			for(int sample = 0; sample < task.num_samples; sample++) {
 				for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++)
-					kernel_cpu_avx_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x, sample);
+					kernel_cpu_avx_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output,
+					    task.shader_eval_type, x, task.offset, sample);
 
 				if(task.get_cancel() || task_pool.canceled())
 					break;
@@ -463,7 +465,8 @@ public:
 		if(system_cpu_support_sse41()) {
 			for(int sample = 0; sample < task.num_samples; sample++) {
 				for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++)
-					kernel_cpu_sse41_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x, sample);
+					kernel_cpu_sse41_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output,
+					    task.shader_eval_type, x, task.offset, sample);
 
 				if(task.get_cancel() || task_pool.canceled())
 					break;
@@ -477,7 +480,8 @@ public:
 		if(system_cpu_support_sse3()) {
 			for(int sample = 0; sample < task.num_samples; sample++) {
 				for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++)
-					kernel_cpu_sse3_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x, sample);
+					kernel_cpu_sse3_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output,
+					    task.shader_eval_type, x, task.offset, sample);
 
 				if(task.get_cancel() || task_pool.canceled())
 					break;
@@ -491,7 +495,8 @@ public:
 		if(system_cpu_support_sse2()) {
 			for(int sample = 0; sample < task.num_samples; sample++) {
 				for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++)
-					kernel_cpu_sse2_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x, sample);
+					kernel_cpu_sse2_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output,
+					    task.shader_eval_type, x, task.offset, sample);
 
 				if(task.get_cancel() || task_pool.canceled())
 					break;
@@ -504,7 +509,8 @@ public:
 		{
 			for(int sample = 0; sample < task.num_samples; sample++) {
 				for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++)
-					kernel_cpu_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x, sample);
+					kernel_cpu_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output,
+					    task.shader_eval_type, x, task.offset, sample);
 
 				if(task.get_cancel() || task_pool.canceled())
 					break;
diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp
index 6629069c6c6..9e3d703f5d9 100644
--- a/intern/cycles/device/device_cuda.cpp
+++ b/intern/cycles/device/device_cuda.cpp
@@ -676,6 +676,7 @@ public:
 		const int shader_chunk_size = 65536;
 		const int start = task.shader_x;
 		const int end = task.shader_x + task.shader_w;
+		int offset = task.offset;
 
 		bool canceled = false;
 		for(int sample = 0; sample < task.num_samples && !canceled; sample++) {
@@ -688,6 +689,7 @@ public:
 								 &task.shader_eval_type,
 								 &shader_x,
 								 &shader_w,
+								 &offset,
 								 &sample};
 
 				/* launch kernel */
diff --git a/intern/cycles/device/device_opencl.cpp b/intern/cycles/device/device_opencl.cpp
index 077ff9df51e..82419cd62b1 100644
--- a/intern/cycles/device/device_opencl.cpp
+++ b/intern/cycles/device/device_opencl.cpp
@@ -1004,6 +1004,7 @@ public:
 		cl_int d_shader_eval_type = task.shader_eval_type;
 		cl_int d_shader_x = task.shader_x;
 		cl_int d_shader_w = task.shader_w;
+		cl_int d_offset = task.offset;
 
 		/* sample arguments */
 		cl_uint narg = 0;
@@ -1033,6 +1034,7 @@ public:
 			opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_shader_eval_type), (void*)&d_shader_eval_type));
 			opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_shader_x), (void*)&d_shader_x));
 			opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_shader_w), (void*)&d_shader_w));
+			opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_offset), (void*)&d_offset));
 			opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_sample), (void*)&d_sample));
 
 			enqueue_kernel(kernel, task.shader_w, 1);
-- 
cgit v1.2.3


From 603348c56e5f86dbc10076daa01ebfab2b49f739 Mon Sep 17 00:00:00 2001
From: Thomas Dinges <blender@dingto.org>
Date: Thu, 21 Aug 2014 23:35:20 +0200
Subject: Cycles: Drop support for CUDA 5.0 Toolkit, only 6.0 and 6.5
 (recommended) are supported now.

---
 intern/cycles/device/device_cuda.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'intern/cycles/device')

diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp
index 9e3d703f5d9..f2e470c21d5 100644
--- a/intern/cycles/device/device_cuda.cpp
+++ b/intern/cycles/device/device_cuda.cpp
@@ -243,12 +243,12 @@ public:
 			cuda_error_message("CUDA nvcc compiler version could not be parsed.");
 			return "";
 		}
-		if(cuda_version < 50) {
-			printf("Unsupported CUDA version %d.%d detected, you need CUDA 6.0.\n", cuda_version/10, cuda_version%10);
+		if(cuda_version < 60) {
+			printf("Unsupported CUDA version %d.%d detected, you need CUDA 6.5.\n", cuda_version/10, cuda_version%10);
 			return "";
 		}
-		else if(cuda_version != 60)
-			printf("CUDA version %d.%d detected, build may succeed but only CUDA 6.0 is officially supported.\n", cuda_version/10, cuda_version%10);
+		else if(cuda_version != 65)
+			printf("CUDA version %d.%d detected, build may succeed but only CUDA 6.5 is officially supported.\n", cuda_version/10, cuda_version%10);
 
 		/* compile */
 		string kernel = path_join(kernel_path, "kernel.cu");
-- 
cgit v1.2.3


From fb3f32760d68134aadb7978922360857f0ecccb7 Mon Sep 17 00:00:00 2001
From: Thomas Dinges <blender@dingto.org>
Date: Tue, 26 Aug 2014 17:02:03 +0200
Subject: Cycles: Add an experimental CUDA kernel.

Now we build 2 .cubins per architecture (e.g. kernel_sm_21.cubin, kernel_experimental_sm_21.cubin).
The experimental kernel can be used by switching to the Experimental Feature Set: http://wiki.blender.org/index.php/Doc:2.6/Manual/Render/Cycles/Experimental_Features

This enables Subsurface Scattering and Correlated Multi Jitter Sampling on GPU, while keeping the stability and performance of the regular kernel.

Differential Revision: https://developer.blender.org/D762
Patch by Sergey and myself.

Developer / Builder Note:
CUDA Toolkit 6.5 is highly recommended for this, also note that building the experimental kernel requires a lot of system memory (~7-8GB).
---
 intern/cycles/device/device_cuda.cpp | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

(limited to 'intern/cycles/device')

diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp
index f2e470c21d5..1ed26717f4b 100644
--- a/intern/cycles/device/device_cuda.cpp
+++ b/intern/cycles/device/device_cuda.cpp
@@ -197,14 +197,18 @@ public:
 		return true;
 	}
 
-	string compile_kernel()
+	string compile_kernel(bool experimental)
 	{
 		/* compute cubin name */
 		int major, minor;
 		cuDeviceComputeCapability(&major, &minor, cuDevId);
 
 		/* attempt to use kernel provided with blender */
-		string cubin = path_get(string_printf("lib/kernel_sm_%d%d.cubin", major, minor));
+		string cubin;
+		if(experimental)
+			cubin = path_get(string_printf("lib/kernel_experimental_sm_%d%d.cubin", major, minor));
+		else
+			cubin = path_get(string_printf("lib/kernel_sm_%d%d.cubin", major, minor));
 		if(path_exists(cubin))
 			return cubin;
 
@@ -212,7 +216,10 @@ public:
 		string kernel_path = path_get("kernel");
 		string md5 = path_files_md5_hash(kernel_path);
 
-		cubin = string_printf("cycles_kernel_sm%d%d_%s.cubin", major, minor, md5.c_str());
+		if(experimental)
+			cubin = string_printf("cycles_kernel_experimental_sm%d%d_%s.cubin", major, minor, md5.c_str());
+		else
+			cubin = string_printf("cycles_kernel_sm%d%d_%s.cubin", major, minor, md5.c_str());
 		cubin = path_user_get(path_join("cache", cubin));
 
 		/* if exists already, use it */
@@ -263,6 +270,9 @@ public:
 		string command = string_printf("\"%s\" -arch=sm_%d%d -m%d --cubin \"%s\" "
 			"-o \"%s\" --ptxas-options=\"-v\" -I\"%s\" -DNVCC -D__KERNEL_CUDA_VERSION__=%d",
 			nvcc, major, minor, machine, kernel.c_str(), cubin.c_str(), include.c_str(), cuda_version);
+		
+		if(experimental)
+			command += " -D__KERNEL_CUDA_EXPERIMENTAL__";
 
 		printf("%s\n", command.c_str());
 
@@ -293,7 +303,7 @@ public:
 			return false;
 
 		/* get kernel */
-		string cubin = compile_kernel();
+		string cubin = compile_kernel(experimental);
 
 		if(cubin == "")
 			return false;
-- 
cgit v1.2.3


From fbed2047c8e84a535c32bf3f3fb3ea1791a08571 Mon Sep 17 00:00:00 2001
From: Sergey Sharybin <sergey.vfx@gmail.com>
Date: Thu, 4 Sep 2014 17:22:40 +0600
Subject: Fix wrong track of the memory when doing device vector resize before
 freeing it

This is a rather legitimate case which happens e.g. when persistent images are
enabled and the session is updating the lookup tables.

Now device_memory keeps track of the amount of memory allocated on the device,
so freeing uses the actual allocated size rather than the CPU-side buffer
size.
---
 intern/cycles/device/device_cpu.cpp    | 24 ++++++++++++++----------
 intern/cycles/device/device_cuda.cpp   | 14 ++++++++++----
 intern/cycles/device/device_memory.h   |  2 ++
 intern/cycles/device/device_opencl.cpp |  4 +++-
 4 files changed, 29 insertions(+), 15 deletions(-)

(limited to 'intern/cycles/device')

diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp
index fd5ae1d7828..4623764d210 100644
--- a/intern/cycles/device/device_cpu.cpp
+++ b/intern/cycles/device/device_cpu.cpp
@@ -73,8 +73,8 @@ public:
 	void mem_alloc(device_memory& mem, MemoryType type)
 	{
 		mem.device_pointer = mem.data_pointer;
-
-		stats.mem_alloc(mem.memory_size());
+		mem.device_size = mem.memory_size();
+		stats.mem_alloc(mem.device_size);
 	}
 
 	void mem_copy_to(device_memory& mem)
@@ -94,9 +94,11 @@ public:
 
 	void mem_free(device_memory& mem)
 	{
-		mem.device_pointer = 0;
-
-		stats.mem_free(mem.memory_size());
+		if(mem.device_pointer) {
+			mem.device_pointer = 0;
+			stats.mem_free(mem.device_size);
+			mem.device_size = 0;
+		}
 	}
 
 	void const_copy_to(const char *name, void *host, size_t size)
@@ -108,15 +110,17 @@ public:
 	{
 		kernel_tex_copy(&kernel_globals, name, mem.data_pointer, mem.data_width, mem.data_height, mem.data_depth, interpolation);
 		mem.device_pointer = mem.data_pointer;
-
-		stats.mem_alloc(mem.memory_size());
+		mem.device_size = mem.memory_size();
+		stats.mem_alloc(mem.device_size);
 	}
 
 	void tex_free(device_memory& mem)
 	{
-		mem.device_pointer = 0;
-
-		stats.mem_free(mem.memory_size());
+		if(mem.device_pointer) {
+			mem.device_pointer = 0;
+			stats.mem_free(mem.device_size);
+			mem.device_size = 0;
+		}
 	}
 
 	void *osl_memory()
diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp
index 1ed26717f4b..5de2efab8be 100644
--- a/intern/cycles/device/device_cuda.cpp
+++ b/intern/cycles/device/device_cuda.cpp
@@ -334,6 +334,7 @@ public:
 		size_t size = mem.memory_size();
 		cuda_assert(cuMemAlloc(&device_pointer, size));
 		mem.device_pointer = (device_ptr)device_pointer;
+		mem.device_size = size;
 		stats.mem_alloc(size);
 		cuda_pop_context();
 	}
@@ -381,7 +382,8 @@ public:
 
 			mem.device_pointer = 0;
 
-			stats.mem_free(mem.memory_size());
+			stats.mem_free(mem.device_size);
+			mem.device_size = 0;
 		}
 	}
 
@@ -473,6 +475,7 @@ public:
 				cuda_assert(cuTexRefSetFlags(texref, CU_TRSF_NORMALIZED_COORDINATES));
 
 				mem.device_pointer = (device_ptr)handle;
+				mem.device_size = size;
 
 				stats.mem_alloc(size);
 			}
@@ -540,7 +543,8 @@ public:
 				tex_interp_map.erase(tex_interp_map.find(mem.device_pointer));
 				mem.device_pointer = 0;
 
-				stats.mem_free(mem.memory_size());
+				stats.mem_free(mem.device_size);
+				mem.device_size = 0;
 			}
 			else {
 				tex_interp_map.erase(tex_interp_map.find(mem.device_pointer));
@@ -790,7 +794,8 @@ public:
 				mem.device_pointer = pmem.cuTexId;
 				pixel_mem_map[mem.device_pointer] = pmem;
 
-				stats.mem_alloc(mem.memory_size());
+				mem.device_size = mem.memory_size();
+				stats.mem_alloc(mem.device_size);
 
 				return;
 			}
@@ -847,7 +852,8 @@ public:
 				pixel_mem_map.erase(pixel_mem_map.find(mem.device_pointer));
 				mem.device_pointer = 0;
 
-				stats.mem_free(mem.memory_size());
+				stats.mem_free(mem.device_size);
+				mem.device_size = 0;
 
 				return;
 			}
diff --git a/intern/cycles/device/device_memory.h b/intern/cycles/device/device_memory.h
index 8d6f4a49a9c..8eee6a2c79e 100644
--- a/intern/cycles/device/device_memory.h
+++ b/intern/cycles/device/device_memory.h
@@ -167,6 +167,7 @@ public:
 	int data_elements;
 	device_ptr data_pointer;
 	size_t data_size;
+	size_t device_size;
 	size_t data_width;
 	size_t data_height;
 	size_t data_depth;
@@ -194,6 +195,7 @@ public:
 		data_elements = device_type_traits<T>::num_elements;
 		data_pointer = 0;
 		data_size = 0;
+		device_size = 0;
 		data_width = 0;
 		data_height = 0;
 		data_depth = 0;
diff --git a/intern/cycles/device/device_opencl.cpp b/intern/cycles/device/device_opencl.cpp
index 82419cd62b1..d950d084cd4 100644
--- a/intern/cycles/device/device_opencl.cpp
+++ b/intern/cycles/device/device_opencl.cpp
@@ -794,6 +794,7 @@ public:
 		opencl_assert_err(ciErr, "clCreateBuffer");
 
 		stats.mem_alloc(size);
+		mem.device_size = size;
 	}
 
 	void mem_copy_to(device_memory& mem)
@@ -825,7 +826,8 @@ public:
 			opencl_assert(clReleaseMemObject(CL_MEM_PTR(mem.device_pointer)));
 			mem.device_pointer = 0;
 
-			stats.mem_free(mem.memory_size());
+			stats.mem_free(mem.device_size);
+			mem.device_size = 0;
 		}
 	}
 
-- 
cgit v1.2.3