Fix T40370: cycles CUDA baking timeout with high number of AA samples.

Now baking does one AA sample at a time, just like final render. There is also some code for shader antialiasing that solves T40369 but it is disabled for now because there may be unpredictable side effects.
author: Brecht Van Lommel <brechtvanlommel@gmail.com> 2014-06-06 16:40:09 +0400
committer: Brecht Van Lommel <brechtvanlommel@gmail.com> 2014-06-06 17:39:04 +0400
commit: e4e58d46128dc7fe4fb9b881d73b38173f00f5c3 (patch)
tree: cc38ac39838bec84d28de396374ba022139a8aa2
parent: 553264ff8e20484d0b91bb468f56aa1b7144f7aa (diff)
17 files changed, 245 insertions, 165 deletions
diff --git a/intern/cycles/blender/blender_session.cpp b/intern/cycles/blender/blender_session.cpp
index 13d4041f6a7..0f31e55d60d 100644
--- a/intern/cycles/blender/blender_session.cpp
+++ b/intern/cycles/blender/blender_session.cpp
@@ -492,26 +492,6 @@ static void populate_bake_data(BakeData *data, BL::BakePixel pixel_array, const
 	}
 }
 
-static bool is_light_pass(ShaderEvalType type)
-{
-	switch (type) {
-		case SHADER_EVAL_AO:
-		case SHADER_EVAL_COMBINED:
-		case SHADER_EVAL_SHADOW:
-		case SHADER_EVAL_DIFFUSE_DIRECT:
-		case SHADER_EVAL_GLOSSY_DIRECT:
-		case SHADER_EVAL_TRANSMISSION_DIRECT:
-		case SHADER_EVAL_SUBSURFACE_DIRECT:
-		case SHADER_EVAL_DIFFUSE_INDIRECT:
-		case SHADER_EVAL_GLOSSY_INDIRECT:
-		case SHADER_EVAL_TRANSMISSION_INDIRECT:
-		case SHADER_EVAL_SUBSURFACE_INDIRECT:
-			return true;
-		default:
-			return false;
-	}
-}
-
 void BlenderSession::bake(BL::Object b_object, const string& pass_type, BL::BakePixel pixel_array, int num_pixels, int depth, float result[])
 {
 	ShaderEvalType shader_type = get_shader_type(pass_type);
@@ -529,7 +509,7 @@ void BlenderSession::bake(BL::Object b_object, const string& pass_type, BL::Bake
 		Pass::add(PASS_UV, scene->film->passes);
 	}
 
-	if(is_light_pass(shader_type)) {
+	if(BakeManager::is_light_pass(shader_type)) {
 		/* force use_light_pass to be true */
 		Pass::add(PASS_LIGHT, scene->film->passes);
 	}
diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp
index b0739dd20b4..71bf2d23d6e 100644
--- a/intern/cycles/device/device_cpu.cpp
+++ b/intern/cycles/device/device_cpu.cpp
@@ -393,7 +393,8 @@ public:
 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
 		if(system_cpu_support_avx()) {
 			for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) {
-				kernel_cpu_avx_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x);
+				for(int sample = 0; sample < task.num_samples; sample++)
+					kernel_cpu_avx_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x, sample);
 
 				if(task.get_cancel() || task_pool.canceled())
 					break;
@@ -404,7 +405,8 @@ public:
 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41			
 		if(system_cpu_support_sse41()) {
 			for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) {
-				kernel_cpu_sse41_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x);
+				for(int sample = 0; sample < task.num_samples; sample++)
+					kernel_cpu_sse41_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x, sample);
 
 				if(task.get_cancel() || task_pool.canceled())
 					break;
@@ -415,7 +417,8 @@ public:
 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
 		if(system_cpu_support_sse3()) {
 			for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) {
-				kernel_cpu_sse3_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x);
+				for(int sample = 0; sample < task.num_samples; sample++)
+					kernel_cpu_sse3_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x, sample);
 
 				if(task.get_cancel() || task_pool.canceled())
 					break;
@@ -426,7 +429,8 @@ public:
 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
 		if(system_cpu_support_sse2()) {
 			for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) {
-				kernel_cpu_sse2_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x);
+				for(int sample = 0; sample < task.num_samples; sample++)
+					kernel_cpu_sse2_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x, sample);
 
 				if(task.get_cancel() || task_pool.canceled())
 					break;
@@ -436,7 +440,8 @@ public:
 #endif
 		{
 			for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) {
-				kernel_cpu_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x);
+				for(int sample = 0; sample < task.num_samples; sample++)
+					kernel_cpu_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x, sample);
 
 				if(task.get_cancel() || task_pool.canceled())
 					break;
diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp
index 0429bfc6e97..0aa09ac5383 100644
--- a/intern/cycles/device/device_cuda.cpp
+++ b/intern/cycles/device/device_cuda.cpp
@@ -764,40 +764,45 @@ public:
 
 			int shader_w = min(shader_chunk_size, end - shader_x);
 
-			/* pass in parameters */
-			int offset = 0;
+			for(int sample = 0; sample < task.num_samples; sample++) {
+				/* pass in parameters */
+				int offset = 0;
 
-			cuda_assert(cuParamSetv(cuShader, offset, &d_input, sizeof(d_input)));
-			offset += sizeof(d_input);
+				cuda_assert(cuParamSetv(cuShader, offset, &d_input, sizeof(d_input)));
+				offset += sizeof(d_input);
 
-			cuda_assert(cuParamSetv(cuShader, offset, &d_output, sizeof(d_output)));
-			offset += sizeof(d_output);
+				cuda_assert(cuParamSetv(cuShader, offset, &d_output, sizeof(d_output)));
+				offset += sizeof(d_output);
 
-			int shader_eval_type = task.shader_eval_type;
-			offset = align_up(offset, __alignof(shader_eval_type));
+				int shader_eval_type = task.shader_eval_type;
+				offset = align_up(offset, __alignof(shader_eval_type));
 
-			cuda_assert(cuParamSeti(cuShader, offset, task.shader_eval_type));
-			offset += sizeof(task.shader_eval_type);
+				cuda_assert(cuParamSeti(cuShader, offset, task.shader_eval_type));
+				offset += sizeof(task.shader_eval_type);
 
-			cuda_assert(cuParamSeti(cuShader, offset, shader_x));
-			offset += sizeof(shader_x);
+				cuda_assert(cuParamSeti(cuShader, offset, shader_x));
+				offset += sizeof(shader_x);
 
-			cuda_assert(cuParamSeti(cuShader, offset, shader_w));
-			offset += sizeof(shader_w);
+				cuda_assert(cuParamSeti(cuShader, offset, shader_w));
+				offset += sizeof(shader_w);
 
-			cuda_assert(cuParamSetSize(cuShader, offset));
+				cuda_assert(cuParamSeti(cuShader, offset, sample));
+				offset += sizeof(sample);
 
-			/* launch kernel */
-			int threads_per_block;
-			cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cuShader));
+				cuda_assert(cuParamSetSize(cuShader, offset));
 
-			int xblocks = (shader_w + threads_per_block - 1)/threads_per_block;
+				/* launch kernel */
+				int threads_per_block;
+				cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cuShader));
 
-			cuda_assert(cuFuncSetCacheConfig(cuShader, CU_FUNC_CACHE_PREFER_L1));
-			cuda_assert(cuFuncSetBlockShape(cuShader, threads_per_block, 1, 1));
-			cuda_assert(cuLaunchGrid(cuShader, xblocks, 1));
+				int xblocks = (shader_w + threads_per_block - 1)/threads_per_block;
 
-			cuda_assert(cuCtxSynchronize());
+				cuda_assert(cuFuncSetCacheConfig(cuShader, CU_FUNC_CACHE_PREFER_L1));
+				cuda_assert(cuFuncSetBlockShape(cuShader, threads_per_block, 1, 1));
+				cuda_assert(cuLaunchGrid(cuShader, xblocks, 1));
+
+				cuda_assert(cuCtxSynchronize());
+			}
 		}
 
 		cuda_pop_context();
diff --git a/intern/cycles/device/device_opencl.cpp b/intern/cycles/device/device_opencl.cpp
index f841daba124..abfe445414a 100644
--- a/intern/cycles/device/device_opencl.cpp
+++ b/intern/cycles/device/device_opencl.cpp
@@ -1067,19 +1067,24 @@ public:
 		else
 			kernel = ckShaderKernel;
 
-		opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_data), (void*)&d_data));
-		opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_input), (void*)&d_input));
-		opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_output), (void*)&d_output));
+		for(int sample = 0; sample < task.num_samples; sample++) {
+			cl_int d_sample = task.sample;
+
+			opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_data), (void*)&d_data));
+			opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_input), (void*)&d_input));
+			opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_output), (void*)&d_output));
 
 #define KERNEL_TEX(type, ttype, name) \
-	set_kernel_arg_mem(kernel, &narg, #name);
+		set_kernel_arg_mem(kernel, &narg, #name);
 #include "kernel_textures.h"
 
-		opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_shader_eval_type), (void*)&d_shader_eval_type));
-		opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_shader_x), (void*)&d_shader_x));
-		opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_shader_w), (void*)&d_shader_w));
+			opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_shader_eval_type), (void*)&d_shader_eval_type));
+			opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_shader_x), (void*)&d_shader_x));
+			opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_shader_w), (void*)&d_shader_w));
+			opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_sample), (void*)&d_sample));
 
-		enqueue_kernel(kernel, task.shader_w, 1);
+			enqueue_kernel(kernel, task.shader_w, 1);
+		}
 	}
 
 	void thread_run(DeviceTask *task)
diff --git a/intern/cycles/kernel/kernel.cl b/intern/cycles/kernel/kernel.cl
index d7d3438036e..2e0a49435a8 100644
--- a/intern/cycles/kernel/kernel.cl
+++ b/intern/cycles/kernel/kernel.cl
@@ -115,7 +115,7 @@ __kernel void kernel_ocl_shader(
 	ccl_global type *name,
 #include "kernel_textures.h"
 
-	int type, int sx, int sw)
+	int type, int sx, int sw, int sample)
 {
 	KernelGlobals kglobals, *kg = &kglobals;
 
@@ -128,7 +128,7 @@ __kernel void kernel_ocl_shader(
 	int x = sx + get_global_id(0);
 
 	if(x < sx + sw)
-		kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, x);
+		kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, x, sample);
 }
 
 __kernel void kernel_ocl_bake(
@@ -140,7 +140,7 @@ __kernel void kernel_ocl_bake(
 	ccl_global type *name,
 #include "kernel_textures.h"
 
-	int type, int sx, int sw)
+	int type, int sx, int sw, int sample)
 {
 	KernelGlobals kglobals, *kg = &kglobals;
 
@@ -153,6 +153,6 @@ __kernel void kernel_ocl_bake(
 	int x = sx + get_global_id(0);
 
 	if(x < sx + sw)
-		kernel_bake_evaluate(kg, input, output, (ShaderEvalType)type, x);
+		kernel_bake_evaluate(kg, input, output, (ShaderEvalType)type, x, sample);
 }
 
diff --git a/intern/cycles/kernel/kernel.cpp b/intern/cycles/kernel/kernel.cpp
index a535659b3b1..42eb9a62518 100644
--- a/intern/cycles/kernel/kernel.cpp
+++ b/intern/cycles/kernel/kernel.cpp
@@ -120,12 +120,12 @@ void kernel_cpu_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, float *bu
 
 /* Shader Evaluation */
 
-void kernel_cpu_shader(KernelGlobals *kg, uint4 *input, float4 *output, int type, int i)
+void kernel_cpu_shader(KernelGlobals *kg, uint4 *input, float4 *output, int type, int i, int sample)
 {
 	if(type >= SHADER_EVAL_BAKE)
-		kernel_bake_evaluate(kg, input, output, (ShaderEvalType)type, i);
+		kernel_bake_evaluate(kg, input, output, (ShaderEvalType)type, i, sample);
 	else
-		kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, i);
+		kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, i, sample);
 }
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel.cu b/intern/cycles/kernel/kernel.cu
index 12273cc2853..83b1381a0f5 100644
--- a/intern/cycles/kernel/kernel.cu
+++ b/intern/cycles/kernel/kernel.cu
@@ -146,22 +146,22 @@ kernel_cuda_convert_to_half_float(uchar4 *rgba, float *buffer, float sample_scal
 
 extern "C" __global__ void
 CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_shader(uint4 *input, float4 *output, int type, int sx, int sw)
+kernel_cuda_shader(uint4 *input, float4 *output, int type, int sx, int sw, int sample)
 {
 	int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
 
 	if(x < sx + sw)
-		kernel_shader_evaluate(NULL, input, output, (ShaderEvalType)type, x);
+		kernel_shader_evaluate(NULL, input, output, (ShaderEvalType)type, x, sample);
 }
 
 extern "C" __global__ void
 CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_bake(uint4 *input, float4 *output, int type, int sx, int sw)
+kernel_cuda_bake(uint4 *input, float4 *output, int type, int sx, int sw, int sample)
 {
 	int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
 
 	if(x < sx + sw)
-		kernel_bake_evaluate(NULL, input, output, (ShaderEvalType)type, x);
+		kernel_bake_evaluate(NULL, input, output, (ShaderEvalType)type, x, sample);
 }
 
 #endif
diff --git a/intern/cycles/kernel/kernel.h b/intern/cycles/kernel/kernel.h
index c4a08646bab..b169b15b9b5 100644
--- a/intern/cycles/kernel/kernel.h
+++ b/intern/cycles/kernel/kernel.h
@@ -41,7 +41,7 @@ void kernel_cpu_convert_to_byte(KernelGlobals *kg, uchar4 *rgba, float *buffer,
 void kernel_cpu_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, float *buffer,
 	float sample_scale, int x, int y, int offset, int stride);
 void kernel_cpu_shader(KernelGlobals *kg, uint4 *input, float4 *output,
-	int type, int i);
+	int type, int i, int sample);
 
 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
 void kernel_cpu_sse2_path_trace(KernelGlobals *kg, float *buffer, unsigned int *rng_state,
@@ -51,7 +51,7 @@ void kernel_cpu_sse2_convert_to_byte(KernelGlobals *kg, uchar4 *rgba, float *buf
 void kernel_cpu_sse2_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, float *buffer,
 	float sample_scale, int x, int y, int offset, int stride);
 void kernel_cpu_sse2_shader(KernelGlobals *kg, uint4 *input, float4 *output,
-	int type, int i);
+	int type, int i, int sample);
 #endif
 
 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
@@ -62,7 +62,7 @@ void kernel_cpu_sse3_convert_to_byte(KernelGlobals *kg, uchar4 *rgba, float *buf
 void kernel_cpu_sse3_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, float *buffer,
 	float sample_scale, int x, int y, int offset, int stride);
 void kernel_cpu_sse3_shader(KernelGlobals *kg, uint4 *input, float4 *output,
-	int type, int i);
+	int type, int i, int sample);
 #endif
 
 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
@@ -73,7 +73,7 @@ void kernel_cpu_sse41_convert_to_byte(KernelGlobals *kg, uchar4 *rgba, float *bu
 void kernel_cpu_sse41_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, float *buffer,
 	float sample_scale, int x, int y, int offset, int stride);
 void kernel_cpu_sse41_shader(KernelGlobals *kg, uint4 *input, float4 *output,
-	int type, int i);
+	int type, int i, int sample);
 #endif
 
 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
@@ -84,7 +84,7 @@ void kernel_cpu_avx_convert_to_byte(KernelGlobals *kg, uchar4 *rgba, float *buff
 void kernel_cpu_avx_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, float *buffer,
 	float sample_scale, int x, int y, int offset, int stride);
 void kernel_cpu_avx_shader(KernelGlobals *kg, uint4 *input, float4 *output,
-	int type, int i);
+	int type, int i, int sample);
 #endif
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_avx.cpp b/intern/cycles/kernel/kernel_avx.cpp
index 7d354de16d2..f5e1b8a7bb7 100644
--- a/intern/cycles/kernel/kernel_avx.cpp
+++ b/intern/cycles/kernel/kernel_avx.cpp
@@ -67,12 +67,12 @@ void kernel_cpu_avx_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, float
 
 /* Shader Evaluate */
 
-void kernel_cpu_avx_shader(KernelGlobals *kg, uint4 *input, float4 *output, int type, int i)
+void kernel_cpu_avx_shader(KernelGlobals *kg, uint4 *input, float4 *output, int type, int i, int sample)
 {
 	if(type >= SHADER_EVAL_BAKE)
-		kernel_bake_evaluate(kg, input, output, (ShaderEvalType)type, i);
+		kernel_bake_evaluate(kg, input, output, (ShaderEvalType)type, i, sample);
 	else
-		kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, i);
+		kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, i, sample);
 }
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_bake.h b/intern/cycles/kernel/kernel_bake.h
index e31090fe9ba..e8845e03acb 100644
--- a/intern/cycles/kernel/kernel_bake.h
+++ b/intern/cycles/kernel/kernel_bake.h
@@ -17,108 +17,114 @@
 CCL_NAMESPACE_BEGIN
 
 ccl_device void compute_light_pass(KernelGlobals *kg, ShaderData *sd, PathRadiance *L, RNG rng,
-                                   const bool is_combined, const bool is_ao, const bool is_sss)
+                                   const bool is_combined, const bool is_ao, const bool is_sss, int sample)
 {
-	int samples = kernel_data.integrator.aa_samples;
-
 	/* initialize master radiance accumulator */
 	kernel_assert(kernel_data.film.use_light_pass);
 	path_radiance_init(L, kernel_data.film.use_light_pass);
 
-	/* take multiple samples */
-	for(int sample = 0; sample < samples; sample++) {
-		PathRadiance L_sample;
-		PathState state;
-		Ray ray;
-		float3 throughput = make_float3(1.0f, 1.0f, 1.0f);
-		bool is_sss_sample = is_sss;
+	PathRadiance L_sample;
+	PathState state;
+	Ray ray;
+	float3 throughput = make_float3(1.0f, 1.0f, 1.0f);
+	bool is_sss_sample = is_sss;
 
-		/* init radiance */
-		path_radiance_init(&L_sample, kernel_data.film.use_light_pass);
+	/* init radiance */
+	path_radiance_init(&L_sample, kernel_data.film.use_light_pass);
 
-		/* init path state */
-		path_state_init(kg, &state, &rng, sample);
-		state.num_samples = samples;
+	/* init path state */
+	path_state_init(kg, &state, &rng, sample);
+	state.num_samples = kernel_data.integrator.aa_samples;
 
-		/* evaluate surface shader */
-		float rbsdf = path_state_rng_1D(kg, &rng, &state, PRNG_BSDF);
-		shader_eval_surface(kg, sd, rbsdf, state.flag, SHADER_CONTEXT_MAIN);
+	/* evaluate surface shader */
+	float rbsdf = path_state_rng_1D(kg, &rng, &state, PRNG_BSDF);
+	shader_eval_surface(kg, sd, rbsdf, state.flag, SHADER_CONTEXT_MAIN);
 
-		/* TODO, disable the closures we won't need */
+	/* TODO, disable the closures we won't need */
 
 #ifdef __BRANCHED_PATH__
-		if(!kernel_data.integrator.branched) {
-			/* regular path tracer */
+	if(!kernel_data.integrator.branched) {
+		/* regular path tracer */
 #endif
 
-			/* sample ambient occlusion */
-			if(is_combined || is_ao) {
-				kernel_path_ao(kg, sd, &L_sample, &state, &rng, throughput);
-			}
+		/* sample ambient occlusion */
+		if(is_combined || is_ao) {
+			kernel_path_ao(kg, sd, &L_sample, &state, &rng, throughput);
+		}
 
 #ifdef __SUBSURFACE__
-			/* sample subsurface scattering */
-			if((is_combined || is_sss_sample) && (sd->flag & SD_BSSRDF)) {
-				/* when mixing BSSRDF and BSDF closures we should skip BSDF lighting if scattering was successful */
-				if (kernel_path_subsurface_scatter(kg, sd, &L_sample, &state, &rng, &ray, &throughput))
-					is_sss_sample = true;
-			}
+		/* sample subsurface scattering */
+		if((is_combined || is_sss_sample) && (sd->flag & SD_BSSRDF)) {
+			/* when mixing BSSRDF and BSDF closures we should skip BSDF lighting if scattering was successful */
+			if (kernel_path_subsurface_scatter(kg, sd, &L_sample, &state, &rng, &ray, &throughput))
+				is_sss_sample = true;
+		}
 #endif
 
-			/* sample light and BSDF */
-			if((!is_sss_sample) && (!is_ao)) {
+		/* sample light and BSDF */
+		if((!is_sss_sample) && (!is_ao)) {
 
-				if(sd->flag & SD_EMISSION) {
-					float3 emission = indirect_primitive_emission(kg, sd, 0.0f, state.flag, state.ray_pdf);
-					path_radiance_accum_emission(&L_sample, throughput, emission, state.bounce);
-				}
+			if(sd->flag & SD_EMISSION) {
+				float3 emission = indirect_primitive_emission(kg, sd, 0.0f, state.flag, state.ray_pdf);
+				path_radiance_accum_emission(&L_sample, throughput, emission, state.bounce);
+			}
 
-				if(kernel_path_integrate_lighting(kg, &rng, sd, &throughput, &state, &L_sample, &ray)) {
+			if(kernel_path_integrate_lighting(kg, &rng, sd, &throughput, &state, &L_sample, &ray)) {
 #ifdef __LAMP_MIS__
-					state.ray_t = 0.0f;
+				state.ray_t = 0.0f;
 #endif
-					/* compute indirect light */
-					kernel_path_indirect(kg, &rng, ray, throughput, state.num_samples, state, &L_sample);
+				/* compute indirect light */
+				kernel_path_indirect(kg, &rng, ray, throughput, 1, state, &L_sample);
 
-					/* sum and reset indirect light pass variables for the next samples */
-					path_radiance_sum_indirect(&L_sample);
-					path_radiance_reset_indirect(&L_sample);
-				}
+				/* sum and reset indirect light pass variables for the next samples */
+				path_radiance_sum_indirect(&L_sample);
+				path_radiance_reset_indirect(&L_sample);
 			}
-#ifdef __BRANCHED_PATH__
 		}
-		else {
-			/* branched path tracer */
+#ifdef __BRANCHED_PATH__
+	}
+	else {
+		/* branched path tracer */
 
-			/* sample ambient occlusion */
-			if(is_combined || is_ao) {
-				kernel_branched_path_ao(kg, sd, &L_sample, &state, &rng, throughput);
-			}
+		/* sample ambient occlusion */
+		if(is_combined || is_ao) {
+			kernel_branched_path_ao(kg, sd, &L_sample, &state, &rng, throughput);
+		}
 
 #ifdef __SUBSURFACE__
-			/* sample subsurface scattering */
-			if((is_combined || is_sss_sample) && (sd->flag & SD_BSSRDF)) {
-				/* when mixing BSSRDF and BSDF closures we should skip BSDF lighting if scattering was successful */
-				kernel_branched_path_subsurface_scatter(kg, sd, &L_sample, &state, &rng, throughput);
-			}
+		/* sample subsurface scattering */
+		if((is_combined || is_sss_sample) && (sd->flag & SD_BSSRDF)) {
+			/* when mixing BSSRDF and BSDF closures we should skip BSDF lighting if scattering was successful */
+			kernel_branched_path_subsurface_scatter(kg, sd, &L_sample, &state, &rng, throughput);
+		}
 #endif
 
-			/* sample light and BSDF */
-			if((!is_sss_sample) && (!is_ao)) {
-
-				if(sd->flag & SD_EMISSION) {
-					float3 emission = indirect_primitive_emission(kg, sd, 0.0f, state.flag, state.ray_pdf);
-					path_radiance_accum_emission(&L_sample, throughput, emission, state.bounce);
-				}
+		/* sample light and BSDF */
+		if((!is_sss_sample) && (!is_ao)) {
 
-				kernel_branched_path_integrate_lighting(kg, &rng,
-					sd, throughput, 1.0f, &state, &L_sample);
+			if(sd->flag & SD_EMISSION) {
+				float3 emission = indirect_primitive_emission(kg, sd, 0.0f, state.flag, state.ray_pdf);
+				path_radiance_accum_emission(&L_sample, throughput, emission, state.bounce);
 			}
+
+			kernel_branched_path_integrate_lighting(kg, &rng,
+				sd, throughput, 1.0f, &state, &L_sample);
 		}
+	}
 #endif
 
-		/* accumulate into master L */
-		path_radiance_accum_sample(L, &L_sample, samples);
+	/* accumulate into master L */
+	path_radiance_accum_sample(L, &L_sample, 1);
+}
+
+ccl_device bool is_aa_pass(ShaderEvalType type)
+{
+	switch(type) {
+		case SHADER_EVAL_UV:
+		case SHADER_EVAL_NORMAL:
+			return false;
+		default:
+			return true;
 	}
 }
 
@@ -142,7 +148,20 @@ ccl_device bool is_light_pass(ShaderEvalType type)
 	}
 }
 
-ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input, ccl_global float4 *output, ShaderEvalType type, int i)
+#if 0
+ccl_device_inline float bake_clamp_mirror_repeat(float u)
+{
+	/* use mirror repeat (like opengl texture) so that if the barycentric
+	 * coordinate goes past the end of the triangle it is not always clamped
+	 * to the same value, gives ugly patterns */
+	float fu = floorf(u);
+	u = u - fu;
+
+	return (((int)fu) & 1)? 1.0f - u: u;
+}
+#endif
+
+ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input, ccl_global float4 *output, ShaderEvalType type, int i, int sample)
 {
 	ShaderData sd;
 	uint4 in = input[i * 2];
@@ -164,6 +183,24 @@ ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input,
 	float dvdx = __uint_as_float(diff.z);
 	float dvdy = __uint_as_float(diff.w);
 
+	int num_samples = kernel_data.integrator.aa_samples;
+
+	/* random number generator */
+	RNG rng = cmj_hash(i, 0);
+
+#if 0
+	uint rng_state = cmj_hash(i, 0);
+	float filter_x, filter_y;
+	path_rng_init(kg, &rng_state, sample, num_samples, &rng, 0, 0, &filter_x, &filter_y);
+
+	/* subpixel u/v offset */
+	if(sample > 0) {
+		u = bake_clamp_mirror_repeat(u + dudx*(filter_x - 0.5f) + dudy*(filter_y - 0.5f));
+		v = bake_clamp_mirror_repeat(v + dvdx*(filter_x - 0.5f) + dvdy*(filter_y - 0.5f));
+	}
+#endif
+
+	/* triangle */
 	int shader;
 	float3 P, Ng;
 
@@ -190,12 +227,14 @@ ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input,
 	sd.dv.dx = dvdx;
 	sd.dv.dy = dvdy;
 
+	/* light passes */
 	if(is_light_pass(type)) {
-		RNG rng = cmj_hash(i, 0);
-		compute_light_pass(kg, &sd, &L, rng, (type == SHADER_EVAL_COMBINED),
-		                                     (type == SHADER_EVAL_AO),
-		                                     (type == SHADER_EVAL_SUBSURFACE_DIRECT ||
-		                                      type == SHADER_EVAL_SUBSURFACE_INDIRECT));
+		compute_light_pass(kg, &sd, &L, rng,
+		                   (type == SHADER_EVAL_COMBINED),
+		                   (type == SHADER_EVAL_AO),
+		                   (type == SHADER_EVAL_SUBSURFACE_DIRECT ||
+		                    type == SHADER_EVAL_SUBSURFACE_INDIRECT),
+		                   sample);
 	}
 
 	switch (type) {
@@ -350,11 +389,15 @@ ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input,
 	}
 
 	/* write output */
-	output[i] = make_float4(out.x, out.y, out.z, 1.0f);
-	return;
+	float output_fac = is_aa_pass(type)? 1.0f/num_samples: 1.0f;
+
+	if(sample == 0)
+		output[i] = make_float4(out.x, out.y, out.z, 1.0f) * output_fac;
+	else
+		output[i] += make_float4(out.x, out.y, out.z, 1.0f) * output_fac;
 }
 
-ccl_device void kernel_shader_evaluate(KernelGlobals *kg, ccl_global uint4 *input, ccl_global float4 *output, ShaderEvalType type, int i)
+ccl_device void kernel_shader_evaluate(KernelGlobals *kg, ccl_global uint4 *input, ccl_global float4 *output, ShaderEvalType type, int i, int sample)
 {
 	ShaderData sd;
 	uint4 in = input[i];
@@ -401,7 +444,10 @@ ccl_device void kernel_shader_evaluate(KernelGlobals *kg, ccl_global uint4 *inpu
 	}
 	
 	/* write output */
-	output[i] = make_float4(out.x, out.y, out.z, 0.0f);
+	if(sample == 0)
+		output[i] = make_float4(out.x, out.y, out.z, 0.0f);
+	else
+		output[i] += make_float4(out.x, out.y, out.z, 0.0f);
 }
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_sse2.cpp b/intern/cycles/kernel/kernel_sse2.cpp
index 3b5faea2994..67bd0943b1b 100644
--- a/intern/cycles/kernel/kernel_sse2.cpp
+++ b/intern/cycles/kernel/kernel_sse2.cpp
@@ -64,12 +64,12 @@ void kernel_cpu_sse2_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, floa
 
 /* Shader Evaluate */
 
-void kernel_cpu_sse2_shader(KernelGlobals *kg, uint4 *input, float4 *output, int type, int i)
+void kernel_cpu_sse2_shader(KernelGlobals *kg, uint4 *input, float4 *output, int type, int i, int sample)
 {
 	if(type >= SHADER_EVAL_BAKE)
-		kernel_bake_evaluate(kg, input, output, (ShaderEvalType)type, i);
+		kernel_bake_evaluate(kg, input, output, (ShaderEvalType)type, i, sample);
 	else
-		kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, i);
+		kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, i, sample);
 }
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_sse3.cpp b/intern/cycles/kernel/kernel_sse3.cpp
index 3b18b164ffd..40d621b66f6 100644
--- a/intern/cycles/kernel/kernel_sse3.cpp
+++ b/intern/cycles/kernel/kernel_sse3.cpp
@@ -66,12 +66,12 @@ void kernel_cpu_sse3_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, floa
 
 /* Shader Evaluate */
 
-void kernel_cpu_sse3_shader(KernelGlobals *kg, uint4 *input, float4 *output, int type, int i)
+void kernel_cpu_sse3_shader(KernelGlobals *kg, uint4 *input, float4 *output, int type, int i, int sample)
 {
 	if(type >= SHADER_EVAL_BAKE)
-		kernel_bake_evaluate(kg, input, output, (ShaderEvalType)type, i);
+		kernel_bake_evaluate(kg, input, output, (ShaderEvalType)type, i, sample);
 	else
-		kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, i);
+		kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, i, sample);
 }
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_sse41.cpp b/intern/cycles/kernel/kernel_sse41.cpp
index a3731d790f4..4b48d10b020 100644
--- a/intern/cycles/kernel/kernel_sse41.cpp
+++ b/intern/cycles/kernel/kernel_sse41.cpp
@@ -67,12 +67,12 @@ void kernel_cpu_sse41_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, flo
 
 /* Shader Evaluate */
 
-void kernel_cpu_sse41_shader(KernelGlobals *kg, uint4 *input, float4 *output, int type, int i)
+void kernel_cpu_sse41_shader(KernelGlobals *kg, uint4 *input, float4 *output, int type, int i, int sample)
 {
 	if(type >= SHADER_EVAL_BAKE)
-		kernel_bake_evaluate(kg, input, output, (ShaderEvalType)type, i);
+		kernel_bake_evaluate(kg, input, output, (ShaderEvalType)type, i, sample);
 	else
-		kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, i);
+		kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, i, sample);
 }
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/render/bake.cpp b/intern/cycles/render/bake.cpp
index aa317ab672f..c68f6e1f08e 100644
--- a/intern/cycles/render/bake.cpp
+++ b/intern/cycles/render/bake.cpp
@@ -15,6 +15,7 @@
  */
 
 #include "bake.h"
+#include "integrator.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -152,6 +153,7 @@ bool BakeManager::bake(Device *device, DeviceScene *dscene, Scene *scene, Progre
 	task.shader_eval_type = shader_type;
 	task.shader_x = 0;
 	task.shader_w = d_output.size();
+	task.num_samples = is_aa_pass(shader_type)? scene->integrator->aa_samples: 1;
 	task.get_cancel = function_bind(&Progress::get_cancel, &progress);
 
 	device->task_add(task);
@@ -203,4 +205,35 @@ void BakeManager::device_free(Device *device, DeviceScene *dscene)
 {
 }
 
+bool BakeManager::is_aa_pass(ShaderEvalType type)
+{
+	switch(type) {
+		case SHADER_EVAL_UV:
+		case SHADER_EVAL_NORMAL:
+			return false;
+		default:
+			return true;
+	}
+}
+
+bool BakeManager::is_light_pass(ShaderEvalType type)
+{
+	switch(type) {
+		case SHADER_EVAL_AO:
+		case SHADER_EVAL_COMBINED:
+		case SHADER_EVAL_SHADOW:
+		case SHADER_EVAL_DIFFUSE_DIRECT:
+		case SHADER_EVAL_GLOSSY_DIRECT:
+		case SHADER_EVAL_TRANSMISSION_DIRECT:
+		case SHADER_EVAL_SUBSURFACE_DIRECT:
+		case SHADER_EVAL_DIFFUSE_INDIRECT:
+		case SHADER_EVAL_GLOSSY_INDIRECT:
+		case SHADER_EVAL_TRANSMISSION_INDIRECT:
+		case SHADER_EVAL_SUBSURFACE_INDIRECT:
+			return true;
+		default:
+			return false;
+	}
+}
+
 CCL_NAMESPACE_END
diff --git a/intern/cycles/render/bake.h b/intern/cycles/render/bake.h
index ea403f7d39a..f91ba589b8b 100644
--- a/intern/cycles/render/bake.h
+++ b/intern/cycles/render/bake.h
@@ -17,10 +17,11 @@
 #ifndef __BAKE_H__
 #define __BAKE_H__
 
-#include "util_vector.h"
 #include "device.h"
 #include "scene.h"
-#include "session.h"
+
+#include "util_progress.h"
+#include "util_vector.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -64,6 +65,9 @@ public:
 	void device_update(Device *device, DeviceScene *dscene, Scene *scene, Progress& progress);
 	void device_free(Device *device, DeviceScene *dscene);
 
+	static bool is_light_pass(ShaderEvalType type);
+	static bool is_aa_pass(ShaderEvalType type);
+
 	bool need_update;
 
 private:
diff --git a/intern/cycles/render/light.cpp b/intern/cycles/render/light.cpp
index 1325627ef05..9a0a7ead696 100644
--- a/intern/cycles/render/light.cpp
+++ b/intern/cycles/render/light.cpp
@@ -66,6 +66,7 @@ static void shade_background_pixels(Device *device, DeviceScene *dscene, int res
 	main_task.shader_eval_type = SHADER_EVAL_BACKGROUND;
 	main_task.shader_x = 0;
 	main_task.shader_w = width*height;
+	main_task.num_samples = 1;
 	main_task.get_cancel = function_bind(&Progress::get_cancel, &progress);
 
 	/* disabled splitting for now, there's an issue with multi-GPU mem_copy_from */
diff --git a/intern/cycles/render/mesh_displace.cpp b/intern/cycles/render/mesh_displace.cpp
index 661fd9c66c1..4c0ee76299c 100644
--- a/intern/cycles/render/mesh_displace.cpp
+++ b/intern/cycles/render/mesh_displace.cpp
@@ -119,6 +119,7 @@ bool MeshManager::displace(Device *device, DeviceScene *dscene, Scene *scene, Me
 	task.shader_eval_type = SHADER_EVAL_DISPLACE;
 	task.shader_x = 0;
 	task.shader_w = d_output.size();
+	task.num_samples = 1;
 	task.get_cancel = function_bind(&Progress::get_cancel, &progress);
 
 	device->task_add(task);
author	Brecht Van Lommel <brechtvanlommel@gmail.com>	2014-06-06 16:40:09 +0400
committer	Brecht Van Lommel <brechtvanlommel@gmail.com>	2014-06-06 17:39:04 +0400
commit	e4e58d46128dc7fe4fb9b881d73b38173f00f5c3 (patch)
tree	cc38ac39838bec84d28de396374ba022139a8aa2
parent	553264ff8e20484d0b91bb468f56aa1b7144f7aa (diff)