13 files changed, 417 insertions, 110 deletions
diff --git a/intern/cycles/blender/addon/properties.py b/intern/cycles/blender/addon/properties.py
index a94178f1b42..7ea84659764 100644
--- a/intern/cycles/blender/addon/properties.py
+++ b/intern/cycles/blender/addon/properties.py
@@ -114,6 +114,11 @@ enum_use_layer_samples = (
     ('IGNORE', "Ignore", "Ignore per render layer number of samples"),
     )
 
+enum_sampling_pattern = (  # items are (identifier, UI name, tooltip); NOTE(review): order presumably maps to SamplingPattern in kernel_types.h - confirm
+    ('SOBOL', "Sobol", "Use Sobol random sampling pattern"),
+    ('CORRELATED_MULTI_JITTER', "Correlated Multi-Jitter", "Use Correlated Multi-Jitter random sampling pattern"),
+    )
+
 
 class CyclesRenderSettings(bpy.types.PropertyGroup):
     @classmethod
@@ -219,6 +224,13 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
                 default=1,
                 )
 
+        cls.sampling_pattern = EnumProperty(
+                name="Sampling Pattern",
+                description="Random sampling pattern used by the integrator",
+                items=enum_sampling_pattern,
+                default='SOBOL',
+                )
+
         cls.use_layer_samples = EnumProperty(
                 name="Layer Samples",
                 description="How to use per render layer sample settings",
diff --git a/intern/cycles/blender/addon/ui.py b/intern/cycles/blender/addon/ui.py
index f9c516d1963..b35bdf8f511 100644
--- a/intern/cycles/blender/addon/ui.py
+++ b/intern/cycles/blender/addon/ui.py
@@ -85,6 +85,9 @@ class CyclesRender_PT_sampling(CyclesButtonsPanel, Panel):
             sub.prop(cscene, "mesh_light_samples", text="Mesh Light")
             sub.prop(cscene, "subsurface_samples", text="Subsurface")
 
+        if cscene.feature_set == 'EXPERIMENTAL':
+            layout.row().prop(cscene, "sampling_pattern", text="Pattern")
+
         for rl in scene.render.layers:
             if rl.samples > 0:
                 layout.separator()
diff --git a/intern/cycles/blender/blender_sync.cpp b/intern/cycles/blender/blender_sync.cpp
index b324385134b..ef9ce85ddf8 100644
--- a/intern/cycles/blender/blender_sync.cpp
+++ b/intern/cycles/blender/blender_sync.cpp
@@ -199,6 +199,9 @@ void BlenderSync::sync_integrator()
 	integrator->subsurface_samples = get_int(cscene, "subsurface_samples");
 	integrator->progressive = get_boolean(cscene, "progressive");
 
+	if(experimental)
+		integrator->sampling_pattern = (SamplingPattern)RNA_enum_get(&cscene, "sampling_pattern");
+
 	if(integrator->modified(previntegrator))
 		integrator->tag_update(scene);
 }
diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt
index 41048c7b379..912a1321d67 100644
--- a/intern/cycles/kernel/CMakeLists.txt
+++ b/intern/cycles/kernel/CMakeLists.txt
@@ -33,6 +33,7 @@ set(SRC_HEADERS
 	kernel_emission.h
 	kernel_film.h
 	kernel_globals.h
+	kernel_jitter.h
 	kernel_light.h
 	kernel_math.h
 	kernel_montecarlo.h
diff --git a/intern/cycles/kernel/kernel_jitter.h b/intern/cycles/kernel/kernel_jitter.h
new file mode 100644
index 00000000000..5ea44cd0cad
--- /dev/null
+++ b/intern/cycles/kernel/kernel_jitter.h
@@ -0,0 +1,181 @@
+/*
+ * Copyright 2013, Blender Foundation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+/* "Correlated Multi-Jittered Sampling"
+ * Andrew Kensler, Pixar Technical Memo 13-01, 2013 */
+
+/* todo: find good value, suggested 64 gives pattern on cornell box ceiling */
+#define CMJ_RANDOM_OFFSET_LIMIT 4096
+
+__device_inline bool cmj_is_pow2(int i)
+{
+	return (i & (i - 1)) == 0; /* i & (i - 1) drops the lowest set bit; note that i == 0 also yields true */
+}
+
+__device_inline int cmj_fast_mod_pow2(int a, int b)
+{
+	return (a & (b - 1)); /* matches a % b only for non-negative a and power-of-two b */
+}
+
+/* a must be > 0 and b must be > 1; the SSE2 path also needs b to be a power of two (it shifts by ctz(b)) */
+__device_inline int cmj_fast_div_pow2(int a, int b)
+{
+#ifdef __KERNEL_SSE2__
+	return a >> __builtin_ctz(b);
+#else
+	return a/b;
+#endif
+}
+
+__device_inline uint cmj_w_mask(uint w) /* all-ones mask reaching up to the highest set bit of w */
+{
+#ifdef __KERNEL_SSE2__
+	return (0xFFFFFFFFu >> __builtin_clz(w)); /* w must be non-zero; unsigned shift avoids the undefined 1 << 31 / 1 << 32 */
+#else
+	w |= w >> 1;
+	w |= w >> 2;
+	w |= w >> 4;
+	w |= w >> 8;
+	w |= w >> 16;
+
+	return w;
+#endif
+}
+
+__device_inline uint cmj_permute(uint i, uint l, uint p) /* maps index i to a value in [0, l), keyed by seed p */
+{
+	uint w = l - 1;
+
+	if((l & w) == 0) {
+		/* l is a power of two (fast): l - 1 is already a usable bit mask */
+		i ^= p;
+		i *= 0xe170893d;
+		i ^= p >> 16;
+		i ^= (i & w) >> 4;
+		i ^= p >> 8;
+		i *= 0x0929eb3f;
+		i ^= p >> 23;
+		i ^= (i & w) >> 1;
+		i *= 1 | p >> 27;
+		i *= 0x6935fa69;
+		i ^= (i & w) >> 11;
+		i *= 0x74dcb303;
+		i ^= (i & w) >> 2;
+		i *= 0x9e501cc3;
+		i ^= (i & w) >> 2;
+		i *= 0xc860a3df;
+		i &= w;
+		i ^= i >> 5;
+
+		return (i + p) & w;
+	}
+	else {
+		/* l is not a power of two (slow): same mixing steps, repeated until the result fits */
+		w = cmj_w_mask(w);
+
+		do {
+			i ^= p;
+			i *= 0xe170893d;
+			i ^= p >> 16;
+			i ^= (i & w) >> 4;
+			i ^= p >> 8;
+			i *= 0x0929eb3f;
+			i ^= p >> 23;
+			i ^= (i & w) >> 1;
+			i *= 1 | p >> 27;
+			i *= 0x6935fa69;
+			i ^= (i & w) >> 11;
+			i *= 0x74dcb303;
+			i ^= (i & w) >> 2;
+			i *= 0x9e501cc3;
+			i ^= (i & w) >> 2;
+			i *= 0xc860a3df;
+			i &= w;
+			i ^= i >> 5;
+		} while (i >= l); /* masked value can exceed l - 1, so retry until it is below l */
+
+		return (i + p) % l;
+	}
+}
+
+__device_inline uint cmj_hash(uint i, uint p) /* integer hash of i mixed with seed p; uint arithmetic wraps modulo 2^32 */
+{
+	i ^= p;
+	i ^= i >> 17;
+	i ^= i >> 10;
+	i *= 0xb36534e5;
+	i ^= i >> 12;
+	i ^= i >> 21;
+	i *= 0x93fc4795;
+	i ^= 0xdf6e307f;
+	i ^= i >> 17;
+	i *= 1 | p >> 18; /* 1 | ... keeps the multiplier odd */
+
+	return i;
+}
+
+__device_inline float cmj_randfloat(uint i, uint p)
+{
+	return cmj_hash(i, p) * (1.0f / 4294967808.0f); /* divisor is 2^32 + 512, so even the largest hash should map just below 1.0f */
+}
+
+#ifdef __CMJ__
+__device_noinline float cmj_sample_1D(int s, int N, int p) /* sample s of N for pattern seed p; N must be >= 1 */
+{
+	uint x = cmj_permute(s, N, p * 0x68bc21eb); /* stratum index in [0, N) */
+	float jx = cmj_randfloat(s, p * 0x967a889b); /* jitter offset inside the stratum */
+
+	float invN = 1.0f/N;
+	return (x + jx)*invN;
+}
+
+__device_noinline float2 cmj_sample_2D(int s, int N, int p) /* sample s of N for pattern seed p; N must be >= 1 or m becomes 0 */
+{
+	int m = float_to_int(sqrtf(N)); /* grid width, about sqrt(N) */
+	int n = (N + m - 1)/m; /* grid height: N/m rounded up, so m*n >= N */
+	float invN = 1.0f/N;
+	float invm = 1.0f/m;
+	float invn = 1.0f/n;
+
+	s = cmj_permute(s, N, p * 0x51633e2d);
+
+	int sdivm, smodm; /* row and column of s in the m-wide grid */
+
+	if(cmj_is_pow2(m)) {
+		sdivm = cmj_fast_div_pow2(s, m);
+		smodm = cmj_fast_mod_pow2(s, m);
+	}
+	else {
+		sdivm = float_to_int(s * invm);
+		smodm = s - sdivm*m;
+	}
+
+	uint sx = cmj_permute(smodm, m, p * 0x68bc21eb);
+	uint sy = cmj_permute(sdivm, n, p * 0x02e5be93);
+
+	float jx = cmj_randfloat(s, p * 0x967a889b);
+	float jy = cmj_randfloat(s, p * 0x368cc8b7);
+
+	return make_float2((sx + (sy + jx)*invn)*invm, (s + jy)*invN);
+}
+#endif
+
+CCL_NAMESPACE_END
+
diff --git a/intern/cycles/kernel/kernel_montecarlo.h b/intern/cycles/kernel/kernel_montecarlo.h
index 2ae95084162..f608429da36 100644
--- a/intern/cycles/kernel/kernel_montecarlo.h
+++ b/intern/cycles/kernel/kernel_montecarlo.h
@@ -36,36 +36,12 @@
 CCL_NAMESPACE_BEGIN
 
 /// Given values x and y on [0,1], convert them in place to values on
-/// [-1,1] uniformly distributed over a unit sphere.  This code is
-/// derived from Peter Shirley, "Realistic Ray Tracing", p. 103.
+/// [-1,1] uniformly distributed over a unit disk.
 __device void to_unit_disk(float *x, float *y)
 {
-	float r, phi;
-	float a = 2.0f * (*x) - 1.0f;
-	float b = 2.0f * (*y) - 1.0f;
-	if(a > -b) {
-		if(a > b) {
-			r = a;
-			phi = M_PI_4_F *(b/a);
-		}
-		else {
-			r = b;
-			phi = M_PI_4_F *(2.0f - a/b);
-		}
-	}
-	else {
-		if(a < b) {
-			r = -a;
-			phi = M_PI_4_F *(4.0f + b/a);
-		}
-		else {
-			r = -b;
-			if(b != 0.0f)
-				phi = M_PI_4_F *(6.0f - a/b);
-			else
-				phi = 0.0f;
-		}
-	}
+	float phi = 2.0f * M_PI_F * (*x);
+	float r = sqrtf(*y);
+
 	*x = r * cosf(phi);
 	*y = r * sinf(phi);
 }
diff --git a/intern/cycles/kernel/kernel_path.h b/intern/cycles/kernel/kernel_path.h
index 5915dfed08b..866024ba303 100644
--- a/intern/cycles/kernel/kernel_path.h
+++ b/intern/cycles/kernel/kernel_path.h
@@ -233,7 +233,7 @@ __device_inline bool shadow_blocked(KernelGlobals *kg, PathState *state, Ray *ra
 	return result;
 }
 
-__device float4 kernel_path_progressive(KernelGlobals *kg, RNG *rng, int sample, Ray ray, __global float *buffer)
+__device float4 kernel_path_progressive(KernelGlobals *kg, RNG rng, int sample, Ray ray, __global float *buffer)
 {
 	/* initialize */
 	PathRadiance L;
@@ -249,6 +249,7 @@ __device float4 kernel_path_progressive(KernelGlobals *kg, RNG *rng, int sample,
 #endif
 	PathState state;
 	int rng_offset = PRNG_BASE_NUM;
+	int num_samples = kernel_data.integrator.aa_samples;
 
 	path_state_init(&state);
 
@@ -270,7 +271,7 @@ __device float4 kernel_path_progressive(KernelGlobals *kg, RNG *rng, int sample,
 			}
 
 			extmax = kernel_data.curve_kernel_data.maximum_width;
-			lcg_state = lcg_init(*rng + rng_offset + sample*0x51633e2d);
+			lcg_state = lcg_init(rng + rng_offset + sample*0x51633e2d);
 		}
 
 		bool hit = scene_intersect(kg, &ray, visibility, &isect, &lcg_state, difl, extmax);
@@ -292,7 +293,7 @@ __device float4 kernel_path_progressive(KernelGlobals *kg, RNG *rng, int sample,
 			light_ray.dP = ray.dP;
 
 			/* intersect with lamp */
-			float light_t = path_rng(kg, rng, sample, rng_offset + PRNG_LIGHT);
+			float light_t = path_rng_1D(kg, rng, sample, num_samples, rng_offset + PRNG_LIGHT);
 			float3 emission;
 
 			if(indirect_lamp_emission(kg, &light_ray, state.flag, ray_pdf, light_t, &emission))
@@ -323,7 +324,7 @@ __device float4 kernel_path_progressive(KernelGlobals *kg, RNG *rng, int sample,
 		/* setup shading */
 		ShaderData sd;
 		shader_setup_from_ray(kg, &sd, &isect, &ray);
-		float rbsdf = path_rng(kg, rng, sample, rng_offset + PRNG_BSDF);
+		float rbsdf = path_rng_1D(kg, rng, sample, num_samples, rng_offset + PRNG_BSDF);
 		shader_eval_surface(kg, &sd, rbsdf, state.flag, SHADER_CONTEXT_MAIN);
 
 		/* holdout */
@@ -373,12 +374,18 @@ __device float4 kernel_path_progressive(KernelGlobals *kg, RNG *rng, int sample,
 		 * mainly due to the mixed in MIS that we use. gives too many unneeded
 		 * shader evaluations, only need emission if we are going to terminate */
 		float probability = path_state_terminate_probability(kg, &state, throughput);
-		float terminate = path_rng(kg, rng, sample, rng_offset + PRNG_TERMINATE);
 
-		if(terminate >= probability)
+		if(probability == 0.0f) {
 			break;
+		}
+		else if(probability != 1.0f) {
+			float terminate = path_rng_1D(kg, rng, sample, num_samples, rng_offset + PRNG_TERMINATE);
+
+			if(terminate >= probability)
+				break;
 
-		throughput /= probability;
+			throughput /= probability;
+		}
 
 #ifdef __SUBSURFACE__
 		/* bssrdf scatter to a different location on the same object, replacing
@@ -392,7 +399,7 @@ __device float4 kernel_path_progressive(KernelGlobals *kg, RNG *rng, int sample,
 
 			/* do bssrdf scatter step if we picked a bssrdf closure */
 			if(sc) {
-				uint lcg_state = lcg_init(*rng + rng_offset + sample*0x68bc21eb);
+				uint lcg_state = lcg_init(rng + rng_offset + sample*0x68bc21eb);
 				subsurface_scatter_step(kg, &sd, state.flag, sc, &lcg_state, false);
 			}
 		}
@@ -402,8 +409,9 @@ __device float4 kernel_path_progressive(KernelGlobals *kg, RNG *rng, int sample,
 		/* ambient occlusion */
 		if(kernel_data.integrator.use_ambient_occlusion || (sd.flag & SD_AO)) {
 			/* todo: solve correlation */
-			float bsdf_u = path_rng(kg, rng, sample, rng_offset + PRNG_BSDF_U);
-			float bsdf_v = path_rng(kg, rng, sample, rng_offset + PRNG_BSDF_V);
+			float2 bsdf_uv = path_rng_2D(kg, rng, sample, num_samples, rng_offset + PRNG_BSDF_U);
+			float bsdf_u = bsdf_uv.x;
+			float bsdf_v = bsdf_uv.y;
 
 			float ao_factor = kernel_data.background.ao_factor;
 			float3 ao_N;
@@ -436,10 +444,15 @@ __device float4 kernel_path_progressive(KernelGlobals *kg, RNG *rng, int sample,
 		if(kernel_data.integrator.use_direct_light) {
 			/* sample illumination from lights to find path contribution */
 			if(sd.flag & SD_BSDF_HAS_EVAL) {
-				float light_t = path_rng(kg, rng, sample, rng_offset + PRNG_LIGHT);
-				float light_o = path_rng(kg, rng, sample, rng_offset + PRNG_LIGHT_F);
-				float light_u = path_rng(kg, rng, sample, rng_offset + PRNG_LIGHT_U);
-				float light_v = path_rng(kg, rng, sample, rng_offset + PRNG_LIGHT_V);
+				float light_t = path_rng_1D(kg, rng, sample, num_samples, rng_offset + PRNG_LIGHT);
+#ifdef __MULTI_CLOSURE__
+				float light_o = 0.0f;
+#else
+				float light_o = path_rng_1D(kg, rng, sample, num_samples, rng_offset + PRNG_LIGHT_F);
+#endif
+				float2 light_uv = path_rng_2D(kg, rng, sample, num_samples, rng_offset + PRNG_LIGHT_U);
+				float light_u = light_uv.x;
+				float light_v = light_uv.y;
 
 				Ray light_ray;
 				BsdfEval L_light;
@@ -471,8 +484,9 @@ __device float4 kernel_path_progressive(KernelGlobals *kg, RNG *rng, int sample,
 		BsdfEval bsdf_eval;
 		float3 bsdf_omega_in;
 		differential3 bsdf_domega_in;
-		float bsdf_u = path_rng(kg, rng, sample, rng_offset + PRNG_BSDF_U);
-		float bsdf_v = path_rng(kg, rng, sample, rng_offset + PRNG_BSDF_V);
+		float2 bsdf_uv = path_rng_2D(kg, rng, sample, num_samples, rng_offset + PRNG_BSDF_U);
+		float bsdf_u = bsdf_uv.x;
+		float bsdf_v = bsdf_uv.y;
 		int label;
 
 		label = shader_bsdf_sample(kg, &sd, bsdf_u, bsdf_v, &bsdf_eval,
@@ -524,8 +538,8 @@ __device float4 kernel_path_progressive(KernelGlobals *kg, RNG *rng, int sample,
 
 #ifdef __NON_PROGRESSIVE__
 
-__device void kernel_path_indirect(KernelGlobals *kg, RNG *rng, int sample, Ray ray, __global float *buffer,
-	float3 throughput, float num_samples_adjust,
+__device void kernel_path_indirect(KernelGlobals *kg, RNG rng, int sample, Ray ray, __global float *buffer,
+	float3 throughput, int num_samples, int num_total_samples,
 	float min_ray_pdf, float ray_pdf, PathState state, int rng_offset, PathRadiance *L)
 {
 #ifdef __LAMP_MIS__
@@ -557,7 +571,7 @@ __device void kernel_path_indirect(KernelGlobals *kg, RNG *rng, int sample, Ray
 			light_ray.dP = ray.dP;
 
 			/* intersect with lamp */
-			float light_t = path_rng(kg, rng, sample, rng_offset + PRNG_LIGHT);
+			float light_t = path_rng_1D(kg, rng, sample, num_total_samples, rng_offset + PRNG_LIGHT);
 			float3 emission;
 
 			if(indirect_lamp_emission(kg, &light_ray, state.flag, ray_pdf, light_t, &emission))
@@ -578,7 +592,7 @@ __device void kernel_path_indirect(KernelGlobals *kg, RNG *rng, int sample, Ray
 		/* setup shading */
 		ShaderData sd;
 		shader_setup_from_ray(kg, &sd, &isect, &ray);
-		float rbsdf = path_rng(kg, rng, sample, rng_offset + PRNG_BSDF);
+		float rbsdf = path_rng_1D(kg, rng, sample, num_total_samples, rng_offset + PRNG_BSDF);
 		shader_eval_surface(kg, &sd, rbsdf, state.flag, SHADER_CONTEXT_INDIRECT);
 		shader_merge_closures(kg, &sd);
 
@@ -604,13 +618,19 @@ __device void kernel_path_indirect(KernelGlobals *kg, RNG *rng, int sample, Ray
 		/* path termination. this is a strange place to put the termination, it's
 		 * mainly due to the mixed in MIS that we use. gives too many unneeded
 		 * shader evaluations, only need emission if we are going to terminate */
-		float probability = path_state_terminate_probability(kg, &state, throughput*num_samples_adjust);
-		float terminate = path_rng(kg, rng, sample, rng_offset + PRNG_TERMINATE);
+		float probability = path_state_terminate_probability(kg, &state, throughput*num_samples);
 
-		if(terminate >= probability)
+		if(probability == 0.0f) {
 			break;
+		}
+		else if(probability != 1.0f) {
+			float terminate = path_rng_1D(kg, rng, sample, num_total_samples, rng_offset + PRNG_TERMINATE);
+
+			if(terminate >= probability)
+				break;
 
-		throughput /= probability;
+			throughput /= probability;
+		}
 
 #ifdef __SUBSURFACE__
 		/* bssrdf scatter to a different location on the same object, replacing
@@ -624,7 +644,7 @@ __device void kernel_path_indirect(KernelGlobals *kg, RNG *rng, int sample, Ray
 
 			/* do bssrdf scatter step if we picked a bssrdf closure */
 			if(sc) {
-				uint lcg_state = lcg_init(*rng + rng_offset + sample*0x68bc21eb);
+				uint lcg_state = lcg_init(rng + rng_offset + sample*0x68bc21eb);
 				subsurface_scatter_step(kg, &sd, state.flag, sc, &lcg_state, false);
 			}
 		}
@@ -634,8 +654,9 @@ __device void kernel_path_indirect(KernelGlobals *kg, RNG *rng, int sample, Ray
 		/* ambient occlusion */
 		if(kernel_data.integrator.use_ambient_occlusion || (sd.flag & SD_AO)) {
 			/* todo: solve correlation */
-			float bsdf_u = path_rng(kg, rng, sample, rng_offset + PRNG_BSDF_U);
-			float bsdf_v = path_rng(kg, rng, sample, rng_offset + PRNG_BSDF_V);
+			float2 bsdf_uv = path_rng_2D(kg, rng, sample, num_total_samples, rng_offset + PRNG_BSDF_U);
+			float bsdf_u = bsdf_uv.x;
+			float bsdf_v = bsdf_uv.y;
 
 			float ao_factor = kernel_data.background.ao_factor;
 			float3 ao_N;
@@ -668,10 +689,15 @@ __device void kernel_path_indirect(KernelGlobals *kg, RNG *rng, int sample, Ray
 		if(kernel_data.integrator.use_direct_light) {
 			/* sample illumination from lights to find path contribution */
 			if(sd.flag & SD_BSDF_HAS_EVAL) {
-				float light_t = path_rng(kg, rng, sample, rng_offset + PRNG_LIGHT);
-				float light_o = path_rng(kg, rng, sample, rng_offset + PRNG_LIGHT_F);
-				float light_u = path_rng(kg, rng, sample, rng_offset + PRNG_LIGHT_U);
-				float light_v = path_rng(kg, rng, sample, rng_offset + PRNG_LIGHT_V);
+				float light_t = path_rng_1D(kg, rng, sample, num_total_samples, rng_offset + PRNG_LIGHT);
+#ifdef __MULTI_CLOSURE__
+				float light_o = 0.0f;
+#else
+				float light_o = path_rng_1D(kg, rng, sample, num_total_samples, rng_offset + PRNG_LIGHT_F);
+#endif
+				float2 light_uv = path_rng_2D(kg, rng, sample, num_total_samples, rng_offset + PRNG_LIGHT_U);
+				float light_u = light_uv.x;
+				float light_v = light_uv.y;
 
 				Ray light_ray;
 				BsdfEval L_light;
@@ -704,8 +730,9 @@ __device void kernel_path_indirect(KernelGlobals *kg, RNG *rng, int sample, Ray
 		BsdfEval bsdf_eval;
 		float3 bsdf_omega_in;
 		differential3 bsdf_domega_in;
-		float bsdf_u = path_rng(kg, rng, sample, rng_offset + PRNG_BSDF_U);
-		float bsdf_v = path_rng(kg, rng, sample, rng_offset + PRNG_BSDF_V);
+		float2 bsdf_uv = path_rng_2D(kg, rng, sample, num_total_samples, rng_offset + PRNG_BSDF_U);
+		float bsdf_u = bsdf_uv.x;
+		float bsdf_v = bsdf_uv.y;
 		int label;
 
 		label = shader_bsdf_sample(kg, &sd, bsdf_u, bsdf_v, &bsdf_eval,
@@ -740,15 +767,17 @@ __device void kernel_path_indirect(KernelGlobals *kg, RNG *rng, int sample, Ray
 	}
 }
 
-__device_noinline void kernel_path_non_progressive_lighting(KernelGlobals *kg, RNG *rng, int sample,
+__device_noinline void kernel_path_non_progressive_lighting(KernelGlobals *kg, RNG rng, int sample,
 	ShaderData *sd, float3 throughput, float num_samples_adjust,
 	float min_ray_pdf, float ray_pdf, PathState state,
 	int rng_offset, PathRadiance *L, __global float *buffer)
 {
+	int aa_samples = kernel_data.integrator.aa_samples;
+
 #ifdef __AO__
 	/* ambient occlusion */
 	if(kernel_data.integrator.use_ambient_occlusion || (sd->flag & SD_AO)) {
-		int num_samples = ceil(kernel_data.integrator.ao_samples*num_samples_adjust);
+		int num_samples = ceil_to_int(kernel_data.integrator.ao_samples*num_samples_adjust);
 		float num_samples_inv = num_samples_adjust/num_samples;
 		float ao_factor = kernel_data.background.ao_factor;
 		float3 ao_N;
@@ -756,8 +785,9 @@ __device_noinline void kernel_path_non_progressive_lighting(KernelGlobals *kg, R
 
 		for(int j = 0; j < num_samples; j++) {
 			/* todo: solve correlation */
-			float bsdf_u = path_rng(kg, rng, sample*num_samples + j, rng_offset + PRNG_BSDF_U);
-			float bsdf_v = path_rng(kg, rng, sample*num_samples + j, rng_offset + PRNG_BSDF_V);
+			float2 bsdf_uv = path_rng_2D(kg, rng, sample*num_samples + j, aa_samples*num_samples, rng_offset + PRNG_BSDF_U);
+			float bsdf_u = bsdf_uv.x;
+			float bsdf_v = bsdf_uv.y;
 
 			float3 ao_D;
 			float ao_pdf;
@@ -798,15 +828,17 @@ __device_noinline void kernel_path_non_progressive_lighting(KernelGlobals *kg, R
 
 		/* lamp sampling */
 		for(int i = 0; i < kernel_data.integrator.num_all_lights; i++) {
-			int num_samples = ceil(num_samples_adjust*light_select_num_samples(kg, i));
+			int num_samples = ceil_to_int(num_samples_adjust*light_select_num_samples(kg, i));
 			float num_samples_inv = num_samples_adjust/(num_samples*kernel_data.integrator.num_all_lights);
+			RNG lamp_rng = cmj_hash(rng, i);
 
 			if(kernel_data.integrator.pdf_triangles != 0.0f)
 				num_samples_inv *= 0.5f;
 
 			for(int j = 0; j < num_samples; j++) {
-				float light_u = path_rng(kg, rng, sample*num_samples + j, rng_offset + PRNG_LIGHT_U);
-				float light_v = path_rng(kg, rng, sample*num_samples + j, rng_offset + PRNG_LIGHT_V);
+				float2 light_uv = path_rng_2D(kg, lamp_rng, sample*num_samples + j, aa_samples*num_samples, rng_offset + PRNG_LIGHT_U);
+				float light_u = light_uv.x;
+				float light_v = light_uv.y;
 
 				if(direct_emission(kg, sd, i, 0.0f, 0.0f, light_u, light_v, &light_ray, &L_light, &is_lamp)) {
 					/* trace shadow ray */
@@ -822,16 +854,17 @@ __device_noinline void kernel_path_non_progressive_lighting(KernelGlobals *kg, R
 
 		/* mesh light sampling */
 		if(kernel_data.integrator.pdf_triangles != 0.0f) {
-			int num_samples = ceil(num_samples_adjust*kernel_data.integrator.mesh_light_samples);
+			int num_samples = ceil_to_int(num_samples_adjust*kernel_data.integrator.mesh_light_samples);
 			float num_samples_inv = num_samples_adjust/num_samples;
 
 			if(kernel_data.integrator.num_all_lights)
 				num_samples_inv *= 0.5f;
 
 			for(int j = 0; j < num_samples; j++) {
-				float light_t = path_rng(kg, rng, sample*num_samples + j, rng_offset + PRNG_LIGHT);
-				float light_u = path_rng(kg, rng, sample*num_samples + j, rng_offset + PRNG_LIGHT_U);
-				float light_v = path_rng(kg, rng, sample*num_samples + j, rng_offset + PRNG_LIGHT_V);
+				float light_t = path_rng_1D(kg, rng, sample*num_samples + j, aa_samples*num_samples, rng_offset + PRNG_LIGHT);
+				float2 light_uv = path_rng_2D(kg, rng, sample*num_samples + j, aa_samples*num_samples, rng_offset + PRNG_LIGHT_U);
+				float light_u = light_uv.x;
+				float light_v = light_uv.y;
 
 				/* only sample triangle lights */
 				if(kernel_data.integrator.num_all_lights)
@@ -869,9 +902,10 @@ __device_noinline void kernel_path_non_progressive_lighting(KernelGlobals *kg, R
 		else
 			num_samples = kernel_data.integrator.transmission_samples;
 
-		num_samples = ceil(num_samples_adjust*num_samples);
+		num_samples = ceil_to_int(num_samples_adjust*num_samples);
 
 		float num_samples_inv = num_samples_adjust/num_samples;
+		RNG bsdf_rng = cmj_hash(rng, i);
 
 		for(int j = 0; j < num_samples; j++) {
 			/* sample BSDF */
@@ -879,8 +913,9 @@ __device_noinline void kernel_path_non_progressive_lighting(KernelGlobals *kg, R
 			BsdfEval bsdf_eval;
 			float3 bsdf_omega_in;
 			differential3 bsdf_domega_in;
-			float bsdf_u = path_rng(kg, rng, sample*num_samples + j, rng_offset + PRNG_BSDF_U);
-			float bsdf_v = path_rng(kg, rng, sample*num_samples + j, rng_offset + PRNG_BSDF_V);
+			float2 bsdf_uv = path_rng_2D(kg, bsdf_rng, sample*num_samples + j, aa_samples*num_samples, rng_offset + PRNG_BSDF_U);
+			float bsdf_u = bsdf_uv.x;
+			float bsdf_v = bsdf_uv.y;
 			int label;
 
 			label = shader_bsdf_sample_closure(kg, sd, sc, bsdf_u, bsdf_v, &bsdf_eval,
@@ -918,7 +953,7 @@ __device_noinline void kernel_path_non_progressive_lighting(KernelGlobals *kg, R
 #endif
 
 			kernel_path_indirect(kg, rng, sample*num_samples + j, bsdf_ray, buffer,
-				tp*num_samples_inv, num_samples,
+				tp*num_samples_inv, num_samples, aa_samples*num_samples,
 				min_ray_pdf, bsdf_pdf, ps, rng_offset+PRNG_BOUNCE_NUM, L);
 
 			/* for render passes, sum and reset indirect light pass variables
@@ -929,7 +964,7 @@ __device_noinline void kernel_path_non_progressive_lighting(KernelGlobals *kg, R
 	}
 }
 
-__device float4 kernel_path_non_progressive(KernelGlobals *kg, RNG *rng, int sample, Ray ray, __global float *buffer)
+__device float4 kernel_path_non_progressive(KernelGlobals *kg, RNG rng, int sample, Ray ray, __global float *buffer)
 {
 	/* initialize */
 	PathRadiance L;
@@ -941,6 +976,7 @@ __device float4 kernel_path_non_progressive(KernelGlobals *kg, RNG *rng, int sam
 	float ray_pdf = 0.0f;
 	PathState state;
 	int rng_offset = PRNG_BASE_NUM;
+	int aa_samples = kernel_data.integrator.aa_samples;
 
 	path_state_init(&state);
 
@@ -961,7 +997,7 @@ __device float4 kernel_path_non_progressive(KernelGlobals *kg, RNG *rng, int sam
 			}
 
 			extmax = kernel_data.curve_kernel_data.maximum_width;
-			lcg_state = lcg_init(*rng + rng_offset + sample*0x51633e2d);
+			lcg_state = lcg_init(rng + rng_offset + sample*0x51633e2d);
 		}
 
 		if(!scene_intersect(kg, &ray, visibility, &isect, &lcg_state, difl, extmax)) {
@@ -990,8 +1026,7 @@ __device float4 kernel_path_non_progressive(KernelGlobals *kg, RNG *rng, int sam
 		/* setup shading */
 		ShaderData sd;
 		shader_setup_from_ray(kg, &sd, &isect, &ray);
-		float rbsdf = path_rng(kg, rng, sample, rng_offset + PRNG_BSDF);
-		shader_eval_surface(kg, &sd, rbsdf, state.flag, SHADER_CONTEXT_MAIN);
+		shader_eval_surface(kg, &sd, 0.0f, state.flag, SHADER_CONTEXT_MAIN);
 		shader_merge_closures(kg, &sd);
 
 		/* holdout */
@@ -1031,12 +1066,18 @@ __device float4 kernel_path_non_progressive(KernelGlobals *kg, RNG *rng, int sam
 			 * mainly due to the mixed in MIS that we use. gives too many unneeded
 			 * shader evaluations, only need emission if we are going to terminate */
 			float probability = path_state_terminate_probability(kg, &state, throughput);
-			float terminate = path_rng(kg, rng, sample, rng_offset + PRNG_TERMINATE);
 
-			if(terminate >= probability)
+			if(probability == 0.0f) {
 				break;
+			}
+			else if(probability != 1.0f) {
+				float terminate = path_rng_1D(kg, rng, sample, aa_samples, rng_offset + PRNG_TERMINATE);
 
-			throughput /= probability;
+				if(terminate >= probability)
+					break;
+
+				throughput /= probability;
+			}
 		}
 
 #ifdef __SUBSURFACE__
@@ -1049,7 +1090,7 @@ __device float4 kernel_path_non_progressive(KernelGlobals *kg, RNG *rng, int sam
 					continue;
 
 				/* set up random number generator */
-				uint lcg_state = lcg_init(*rng + rng_offset + sample*0x68bc21eb);
+				uint lcg_state = lcg_init(rng + rng_offset + sample*0x68bc21eb);
 				int num_samples = kernel_data.integrator.subsurface_samples;
 				float num_samples_inv = 1.0f/num_samples;
 
@@ -1112,22 +1153,26 @@ __device void kernel_path_trace(KernelGlobals *kg,
 
 	float filter_u;
 	float filter_v;
+	int num_samples = kernel_data.integrator.aa_samples;
 
-	path_rng_init(kg, rng_state, sample, &rng, x, y, &filter_u, &filter_v);
+	path_rng_init(kg, rng_state, sample, num_samples, &rng, x, y, &filter_u, &filter_v);
 
 	/* sample camera ray */
 	Ray ray;
-	
+
 	float lens_u = 0.0f, lens_v = 0.0f;
-	float time = 0.0f;
-	
+
 	if(kernel_data.cam.aperturesize > 0.0f) {
-		lens_u = path_rng(kg, &rng, sample, PRNG_LENS_U);
-		lens_v = path_rng(kg, &rng, sample, PRNG_LENS_V);
+		float2 lens_uv = path_rng_2D(kg, rng, sample, num_samples, PRNG_LENS_U);
+		lens_u = lens_uv.x;
+		lens_v = lens_uv.y;
 	}
+
+	float time = 0.0f;
+
 #ifdef __CAMERA_MOTION__
 	if(kernel_data.cam.shuttertime != -1.0f)
-		time = path_rng(kg, &rng, sample, PRNG_TIME);
+		time = path_rng_1D(kg, rng, sample, num_samples, PRNG_TIME);
 #endif
 
 	camera_sample(kg, x, y, filter_u, filter_v, lens_u, lens_v, time, &ray);
@@ -1139,10 +1184,10 @@ __device void kernel_path_trace(KernelGlobals *kg,
 #ifdef __NON_PROGRESSIVE__
 		if(kernel_data.integrator.progressive)
 #endif
-			L = kernel_path_progressive(kg, &rng, sample, ray, buffer);
+			L = kernel_path_progressive(kg, rng, sample, ray, buffer);
 #ifdef __NON_PROGRESSIVE__
 		else
-			L = kernel_path_non_progressive(kg, &rng, sample, ray, buffer);
+			L = kernel_path_non_progressive(kg, rng, sample, ray, buffer);
 #endif
 	}
 	else
diff --git a/intern/cycles/kernel/kernel_random.h b/intern/cycles/kernel/kernel_random.h
index fc33e226051..ecf80b817d4 100644
--- a/intern/cycles/kernel/kernel_random.h
+++ b/intern/cycles/kernel/kernel_random.h
@@ -16,6 +16,8 @@
  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  */
 
+#include "kernel_jitter.h"
+
 CCL_NAMESPACE_BEGIN
 
 typedef uint RNG;
@@ -100,10 +102,10 @@ __device uint sobol_lookup(const uint m, const uint frame, const uint ex, const
 	return index;
 }
 
-__device_inline float path_rng(KernelGlobals *kg, RNG *rng, int sample, int dimension)
+__device_inline float path_rng(KernelGlobals *kg, RNG rng, int sample, int dimension)
 {
 #ifdef __SOBOL_FULL_SCREEN__
-	uint result = sobol_dimension(kg, *rng, dimension);
+	uint result = sobol_dimension(kg, rng, dimension);
 	float r = (float)result * (1.0f/(float)0xFFFFFFFF);
 	return r;
 #else
@@ -115,15 +117,44 @@ __device_inline float path_rng(KernelGlobals *kg, RNG *rng, int sample, int dime
 	float shift;
 
 	if(dimension & 1)
-		shift = (*rng >> 16)/((float)0xFFFF);
+		shift = (rng >> 16)*(1.0f/(float)0xFFFF);
 	else
-		shift = (*rng & 0xFFFF)/((float)0xFFFF);
+		shift = (rng & 0xFFFF)*(1.0f/(float)0xFFFF);
 
 	return r + shift - floorf(r + shift);
 #endif
 }
 
-__device_inline void path_rng_init(KernelGlobals *kg, __global uint *rng_state, int sample, RNG *rng, int x, int y, float *fx, float *fy)
+__device_inline float path_rng_1D(KernelGlobals *kg, RNG rng, int sample, int num_samples, int dimension)
+{
+#ifdef __CMJ__
+	if(kernel_data.integrator.sampling_pattern == SAMPLING_PATTERN_CMJ) {
+		/* correlated multi-jittered: sample index out of num_samples, pattern seeded by rng + dimension */
+		int p = rng + dimension;
+		return cmj_sample_1D(sample, num_samples, p);
+	}
+#endif
+
+	/* sobol: num_samples is not used on this path */
+	return path_rng(kg, rng, sample, dimension);
+}
+
+__device_inline float2 path_rng_2D(KernelGlobals *kg, RNG rng, int sample, int num_samples, int dimension)
+{
+#ifdef __CMJ__
+	if(kernel_data.integrator.sampling_pattern == SAMPLING_PATTERN_CMJ) {
+		/* correlated multi-jittered: one 2D sample, pattern seeded by rng + dimension */
+		int p = rng + dimension;
+		return cmj_sample_2D(sample, num_samples, p);
+	}
+#endif
+
+	/* sobol: dimension and dimension + 1 form the pair; num_samples is not used */
+	return make_float2(path_rng(kg, rng, sample, dimension),
+	                   path_rng(kg, rng, sample, dimension + 1));
+}
+
+__device_inline void path_rng_init(KernelGlobals *kg, __global uint *rng_state, int sample, int num_samples, RNG *rng, int x, int y, float *fx, float *fy)
 {
 #ifdef __SOBOL_FULL_SCREEN__
 	uint px, py;
@@ -153,8 +184,10 @@ __device_inline void path_rng_init(KernelGlobals *kg, __global uint *rng_state,
 		*fy = 0.5f;
 	}
 	else {
-		*fx = path_rng(kg, rng, sample, PRNG_FILTER_U);
-		*fy = path_rng(kg, rng, sample, PRNG_FILTER_V);
+		float2 fxy = path_rng_2D(kg, *rng, sample, num_samples, PRNG_FILTER_U);
+
+		*fx = fxy.x;
+		*fy = fxy.y;
 	}
 #endif
 }
@@ -168,14 +201,25 @@ __device void path_rng_end(KernelGlobals *kg, __global uint *rng_state, RNG rng)
 
 /* Linear Congruential Generator */
 
-__device float path_rng(KernelGlobals *kg, RNG *rng, int sample, int dimension)
+__device float path_rng(KernelGlobals *kg, RNG& rng, int sample, int dimension)
 {
 	/* implicit mod 2^32 */
-	*rng = (1103515245*(*rng) + 12345);
-	return (float)*rng * (1.0f/(float)0xFFFFFFFF);
+	rng = (1103515245*(rng) + 12345);
+	return (float)rng * (1.0f/(float)0xFFFFFFFF);
 }
 
-__device void path_rng_init(KernelGlobals *kg, __global uint *rng_state, int sample, RNG *rng, int x, int y, float *fx, float *fy)
+__device_inline float path_rng_1D(KernelGlobals *kg, RNG& rng, int sample, int num_samples, int dimension)
+{
+	return path_rng(kg, rng, sample, dimension);
+}
+
+__device_inline float2 path_rng_2D(KernelGlobals *kg, RNG& rng, int sample, int num_samples, int dimension)
+{
+	float u = path_rng(kg, rng, sample, dimension); /* draw u first: each call advances rng and argument evaluation order is unspecified */
+	return make_float2(u, path_rng(kg, rng, sample, dimension + 1));
+}
+
+__device void path_rng_init(KernelGlobals *kg, __global uint *rng_state, int sample, int num_samples, RNG *rng, int x, int y, float *fx, float *fy)
 {
 	/* load state */
 	*rng = *rng_state;
@@ -187,8 +231,10 @@ __device void path_rng_init(KernelGlobals *kg, __global uint *rng_state, int sam
 		*fy = 0.5f;
 	}
 	else {
-		*fx = path_rng(kg, rng, sample, PRNG_FILTER_U);
-		*fy = path_rng(kg, rng, sample, PRNG_FILTER_V);
+		float2 fxy = path_rng_2D(kg, *rng, sample, num_samples, PRNG_FILTER_U);
+
+		*fx = fxy.x;
+		*fy = fxy.y;
 	}
 }
 
diff --git a/intern/cycles/kernel/kernel_types.h b/intern/cycles/kernel/kernel_types.h
index f165a1f3839..abdb609b55f 100644
--- a/intern/cycles/kernel/kernel_types.h
+++ b/intern/cycles/kernel/kernel_types.h
@@ -117,6 +117,7 @@ CCL_NAMESPACE_BEGIN
 #define __CAMERA_CLIPPING__
 #define __INTERSECTION_REFINE__
 #define __CLAMP_SAMPLE__
+#define __CMJ__
 
 #ifdef __KERNEL_SHADING__
 #define __SVM__
@@ -164,8 +165,10 @@ enum PathTraceDimension {
 	PRNG_LENS_V = 3,
 #ifdef __CAMERA_MOTION__
 	PRNG_TIME = 4,
-	PRNG_UNUSED = 5,
-	PRNG_BASE_NUM = 6,
+	PRNG_UNUSED_0 = 5,
+	PRNG_UNUSED_1 = 6,	/* for some reason (6, 7) is a bad sobol pattern */
+	PRNG_UNUSED_2 = 7,  /* with a low number of samples (< 64) */
+	PRNG_BASE_NUM = 8,
 #else
 	PRNG_BASE_NUM = 4,
 #endif
@@ -181,6 +184,11 @@ enum PathTraceDimension {
 	PRNG_BOUNCE_NUM = 8
 };
 
+enum SamplingPattern {
+	SAMPLING_PATTERN_SOBOL = 0,
+	SAMPLING_PATTERN_CMJ = 1
+};
+
 /* these flags values correspond to raytypes in osl.cpp, so keep them in sync!
  *
  * for ray visibility tests in BVH traversal, the upper 20 bits are used for
@@ -728,6 +736,7 @@ typedef struct KernelIntegrator {
 
 	/* non-progressive */
 	int progressive;
+	int aa_samples;
 	int diffuse_samples;
 	int glossy_samples;
 	int transmission_samples;
@@ -736,7 +745,11 @@ typedef struct KernelIntegrator {
 	int use_lamp_mis;
 	int subsurface_samples;
 
-	int pad1, pad2, pad3;
+	/* sampler */
+	int sampling_pattern;
+
+	/* padding */
+	int pad;
 } KernelIntegrator;
 
 typedef struct KernelBVH {
diff --git a/intern/cycles/render/integrator.cpp b/intern/cycles/render/integrator.cpp
index 731ffd9e271..cc369e7abc9 100644
--- a/intern/cycles/render/integrator.cpp
+++ b/intern/cycles/render/integrator.cpp
@@ -49,6 +49,7 @@ Integrator::Integrator()
 	sample_clamp = 0.0f;
 	motion_blur = false;
 
+	aa_samples = 0;
 	diffuse_samples = 1;
 	glossy_samples = 1;
 	transmission_samples = 1;
@@ -57,6 +58,8 @@ Integrator::Integrator()
 	subsurface_samples = 1;
 	progressive = true;
 
+	sampling_pattern = SAMPLING_PATTERN_SOBOL;
+
 	need_update = true;
 }
 
@@ -104,6 +107,7 @@ void Integrator::device_update(Device *device, DeviceScene *dscene, Scene *scene
 	kintegrator->sample_clamp = (sample_clamp == 0.0f)? FLT_MAX: sample_clamp*3.0f;
 
 	kintegrator->progressive = progressive;
+	kintegrator->aa_samples = aa_samples;
 	kintegrator->diffuse_samples = diffuse_samples;
 	kintegrator->glossy_samples = glossy_samples;
 	kintegrator->transmission_samples = transmission_samples;
@@ -111,6 +115,8 @@ void Integrator::device_update(Device *device, DeviceScene *dscene, Scene *scene
 	kintegrator->mesh_light_samples = mesh_light_samples;
 	kintegrator->subsurface_samples = subsurface_samples;
 
+	kintegrator->sampling_pattern = sampling_pattern;
+
 	/* sobol directions table */
 	int max_samples = 1;
 
@@ -160,6 +166,8 @@ bool Integrator::modified(const Integrator& integrator)
 		seed == integrator.seed &&
 		sample_clamp == integrator.sample_clamp &&
 		progressive == integrator.progressive &&
+		aa_samples == integrator.aa_samples &&
+		sampling_pattern == integrator.sampling_pattern &&
 		diffuse_samples == integrator.diffuse_samples &&
 		glossy_samples == integrator.glossy_samples &&
 		transmission_samples == integrator.transmission_samples &&
diff --git a/intern/cycles/render/integrator.h b/intern/cycles/render/integrator.h
index 9867e310d4d..fff24b506fb 100644
--- a/intern/cycles/render/integrator.h
+++ b/intern/cycles/render/integrator.h
@@ -19,6 +19,8 @@
 #ifndef __INTEGRATOR_H__
 #define __INTEGRATOR_H__
 
+#include "kernel_types.h"
+
 CCL_NAMESPACE_BEGIN
 
 class Device;
@@ -49,6 +51,7 @@ public:
 	float sample_clamp;
 	bool motion_blur;
 
+	int aa_samples;
 	int diffuse_samples;
 	int glossy_samples;
 	int transmission_samples;
@@ -58,6 +61,8 @@ public:
 
 	bool progressive;
 
+	SamplingPattern sampling_pattern;
+
 	bool need_update;
 
 	Integrator();
diff --git a/intern/cycles/render/session.cpp b/intern/cycles/render/session.cpp
index bc847d5719c..44364418dcf 100644
--- a/intern/cycles/render/session.cpp
+++ b/intern/cycles/render/session.cpp
@@ -22,6 +22,7 @@
 #include "buffers.h"
 #include "camera.h"
 #include "device.h"
+#include "integrator.h"
 #include "scene.h"
 #include "session.h"
 
@@ -728,6 +729,18 @@ void Session::update_scene()
 		cam->tag_update();
 	}
 
+	/* number of samples is needed by multi jittered sampling pattern */
+	Integrator *integrator = scene->integrator;
+
+	if(integrator->sampling_pattern == SAMPLING_PATTERN_CMJ) {
+		int aa_samples = tile_manager.num_samples;
+
+		if(aa_samples != integrator->aa_samples) {
+			integrator->aa_samples = aa_samples;
+			integrator->tag_update(scene);
+		}
+	}
+
 	/* update scene */
 	if(scene->need_update()) {
 		progress.set_status("Updating Scene");
diff --git a/intern/cycles/util/util_types.h b/intern/cycles/util/util_types.h
index fe1cb61ffa9..0bc67f0618a 100644
--- a/intern/cycles/util/util_types.h
+++ b/intern/cycles/util/util_types.h
@@ -59,6 +59,8 @@
 
 /* SIMD Types */
 
+#ifndef __KERNEL_GPU__
+
 /* not enabled, globally applying it just gives slowdown,
  * but useful for testing. */
 //#define __KERNEL_SSE__
@@ -88,11 +90,11 @@
 #endif
 
 #ifndef _WIN32
-#ifndef __KERNEL_GPU__
 
 #include <stdint.h>
 
 #endif
+
 #endif
 
 CCL_NAMESPACE_BEGIN