Merge branch 'soc-2014-viewport' into soc-2013-viewport_fx

Conflicts: intern/cycles/device/device_cuda.cpp intern/ghost/intern/GHOST_WindowCocoa.mm source/blender/blenfont/intern/blf_font.c source/blender/blenfont/intern/blf_translation.c source/blender/blenkernel/BKE_brush.h source/blender/blenkernel/BKE_pbvh.h source/blender/blenkernel/intern/cdderivedmesh.c source/blender/blenkernel/intern/editderivedmesh.c source/blender/blenkernel/intern/mesh.c source/blender/blenkernel/intern/subsurf_ccg.c source/blender/blenlib/BLI_fileops.h source/blender/blenlib/BLI_math_matrix.h source/blender/blenlib/intern/fileops.c source/blender/blenlib/intern/math_matrix.c source/blender/editors/animation/anim_channels_defines.c source/blender/editors/animation/anim_draw.c source/blender/editors/animation/keyframes_draw.c source/blender/editors/include/ED_armature.h source/blender/editors/interface/interface.c source/blender/editors/interface/interface_draw.c source/blender/editors/interface/interface_icons.c source/blender/editors/interface/interface_panel.c source/blender/editors/interface/interface_widgets.c source/blender/editors/interface/view2d.c source/blender/editors/mask/mask_draw.c source/blender/editors/mesh/editmesh_select.c source/blender/editors/render/render_opengl.c source/blender/editors/screen/area.c source/blender/editors/screen/glutil.c source/blender/editors/sculpt_paint/paint_cursor.c source/blender/editors/sculpt_paint/paint_image.c source/blender/editors/sculpt_paint/paint_image_proj.c source/blender/editors/sculpt_paint/paint_utils.c source/blender/editors/sculpt_paint/sculpt_intern.h source/blender/editors/space_buttons/space_buttons.c source/blender/editors/space_clip/clip_dopesheet_draw.c source/blender/editors/space_clip/clip_draw.c source/blender/editors/space_clip/clip_graph_draw.c source/blender/editors/space_clip/clip_utils.c source/blender/editors/space_console/console_draw.c source/blender/editors/space_file/file_draw.c source/blender/editors/space_file/file_ops.c 
source/blender/editors/space_graph/graph_draw.c source/blender/editors/space_info/info_draw.c source/blender/editors/space_info/textview.c source/blender/editors/space_logic/logic_window.c source/blender/editors/space_nla/nla_draw.c source/blender/editors/space_outliner/outliner_draw.c source/blender/editors/space_sequencer/sequencer_draw.c source/blender/editors/space_view3d/drawanimviz.c source/blender/editors/space_view3d/drawarmature.c source/blender/editors/space_view3d/drawmesh.c source/blender/editors/space_view3d/drawobject.c source/blender/editors/space_view3d/view3d_draw.c source/blender/editors/space_view3d/view3d_fly.c source/blender/editors/space_view3d/view3d_intern.h source/blender/editors/space_view3d/view3d_walk.c source/blender/editors/transform/transform.c source/blender/editors/transform/transform_manipulator.c source/blender/editors/util/ed_util.c source/blender/editors/uvedit/uvedit_draw.c source/blender/gpu/GPU_buffers.h source/blender/gpu/intern/gpu_buffers.c source/blender/gpu/intern/gpu_codegen.c source/blender/gpu/intern/gpu_codegen.h source/blender/gpu/intern/gpu_draw.c source/blender/render/intern/source/convertblender.c source/blender/windowmanager/intern/wm_operators.c source/blender/windowmanager/intern/wm_subwindow.c source/blender/windowmanager/intern/wm_window.c
author: Jason Wilkins <Jason.A.Wilkins@gmail.com> 2014-05-22 04:02:02 +0400
committer: Jason Wilkins <Jason.A.Wilkins@gmail.com> 2014-05-22 04:02:02 +0400
commit: 6eff1cbebcf0766d2fe69db9b0fb3f76ede2c06b (patch)
tree: 3af4122e291f53f88b63ec6ded2e0fa7790e04ac /intern
parent: 49de1ada8dcba35862759e0f7da5ca2209b4f588 (diff)
parent: 146a1c77eacb925eb7c86bb49495c0f09adc607c (diff)
214 files changed, 9512 insertions, 4502 deletions
diff --git a/intern/SConscript b/intern/SConscript
index 828c1adc20d..20803884a39 100644
--- a/intern/SConscript
+++ b/intern/SConscript
@@ -53,9 +53,6 @@ if env['WITH_BF_FLUID']:
 if env['WITH_BF_CYCLES']:
     SConscript(['cycles/SConscript'])
 
-if env['WITH_BF_BOOLEAN']:
-    SConscript(['bsp/SConscript'])
-
 if env['WITH_BF_INTERNATIONAL']:
     SConscript(['locale/SConscript'])
 
diff --git a/intern/audaspace/OpenAL/AUD_OpenALDevice.cpp b/intern/audaspace/OpenAL/AUD_OpenALDevice.cpp
index c3877c2c9f2..d055c131183 100644
--- a/intern/audaspace/OpenAL/AUD_OpenALDevice.cpp
+++ b/intern/audaspace/OpenAL/AUD_OpenALDevice.cpp
@@ -994,7 +994,7 @@ void AUD_OpenALDevice::updateStreams()
 				if(info != AL_PLAYING)
 				{
 					// if it really stopped
-					if(sound->m_eos)
+					if(sound->m_eos && info != AL_INITIAL)
 					{
 						if(sound->m_stop)
 							sound->m_stop(sound->m_stop_data);
diff --git a/intern/audaspace/ffmpeg/AUD_FFMPEGWriter.cpp b/intern/audaspace/ffmpeg/AUD_FFMPEGWriter.cpp
index d8f0d837fec..d30835da4e5 100644
--- a/intern/audaspace/ffmpeg/AUD_FFMPEGWriter.cpp
+++ b/intern/audaspace/ffmpeg/AUD_FFMPEGWriter.cpp
@@ -169,14 +169,29 @@ AUD_FFMPEGWriter::AUD_FFMPEGWriter(std::string filename, AUD_DeviceSpecs specs,
 			if(!codec)
 				AUD_THROW(AUD_ERROR_FFMPEG, codec_error);
 
+			if(codec->sample_fmts) {
+				// Check if the preferred sample format for this codec is supported.
+				const enum AVSampleFormat *p = codec->sample_fmts;
+				for(; *p != -1; p++) {
+					if(*p == m_stream->codec->sample_fmt)
+						break;
+				}
+				if(*p == -1) {
+					// Sample format incompatible with codec. Defaulting to a format known to work.
+					m_stream->codec->sample_fmt = codec->sample_fmts[0];
+				}
+			}
+
 			if(avcodec_open2(m_codecCtx, codec, NULL))
 				AUD_THROW(AUD_ERROR_FFMPEG, codec_error);
 
 			m_output_buffer.resize(FF_MIN_BUFFER_SIZE);
 			int samplesize = AUD_MAX(AUD_SAMPLE_SIZE(m_specs), AUD_DEVICE_SAMPLE_SIZE(m_specs));
 
-			if(m_codecCtx->frame_size <= 1)
-				m_input_size = 0;
+			if(m_codecCtx->frame_size <= 1) {
+				m_input_size = FF_MIN_BUFFER_SIZE * 8 / m_codecCtx->bits_per_coded_sample / m_codecCtx->channels;
+				m_input_buffer.resize(m_input_size * samplesize);
+			}
 			else
 			{
 				m_input_buffer.resize(m_codecCtx->frame_size * samplesize);
@@ -187,14 +202,21 @@ AUD_FFMPEGWriter::AUD_FFMPEGWriter(std::string filename, AUD_DeviceSpecs specs,
 			m_frame = av_frame_alloc();
 			if (!m_frame)
 				AUD_THROW(AUD_ERROR_FFMPEG, codec_error);
+			avcodec_get_frame_defaults(m_frame);
 			m_frame->linesize[0]    = m_input_size * samplesize;
 			m_frame->format         = m_codecCtx->sample_fmt;
+			m_frame->nb_samples     = m_input_size;
 #  ifdef FFMPEG_HAVE_AVFRAME_SAMPLE_RATE
 			m_frame->sample_rate    = m_codecCtx->sample_rate;
 #  endif
 #  ifdef FFMPEG_HAVE_FRAME_CHANNEL_LAYOUT
 			m_frame->channel_layout = m_codecCtx->channel_layout;
 #  endif
+			m_sample_size = av_get_bytes_per_sample(m_codecCtx->sample_fmt);
+			m_frame_pts = 0;
+			m_deinterleave = av_sample_fmt_is_planar(m_codecCtx->sample_fmt);
+			if(m_deinterleave)
+				m_deinterleave_buffer.resize(m_input_size * m_codecCtx->channels * m_sample_size);
 #endif
 
 			try
@@ -272,13 +294,31 @@ void AUD_FFMPEGWriter::encode(sample_t* data)
 
 #ifdef FFMPEG_HAVE_ENCODE_AUDIO2
 	int got_output, ret;
+	m_frame->pts = m_frame_pts / av_q2d(m_codecCtx->time_base);
+	m_frame_pts++;
+#ifdef FFMPEG_HAVE_FRAME_CHANNEL_LAYOUT
+	m_frame->channel_layout = m_codecCtx->channel_layout;
+#endif
+
+	if(m_deinterleave) {
+		for(int channel = 0; channel < m_codecCtx->channels; channel++) {
+			for(int i = 0; i < m_frame->nb_samples; i++) {
+				memcpy(reinterpret_cast<uint8_t*>(m_deinterleave_buffer.getBuffer()) + (i + channel * m_frame->nb_samples) * m_sample_size,
+					   reinterpret_cast<uint8_t*>(data) + (m_codecCtx->channels * i + channel) * m_sample_size, m_sample_size);
+			}
+		}
+
+		data = m_deinterleave_buffer.getBuffer();
+	}
+
+	avcodec_fill_audio_frame(m_frame, m_codecCtx->channels, m_codecCtx->sample_fmt, reinterpret_cast<uint8_t*>(data),
+	                         m_frame->nb_samples * av_get_bytes_per_sample(m_codecCtx->sample_fmt) * m_codecCtx->channels, 1);
 
-	m_frame->data[0] = reinterpret_cast<uint8_t*>(data);
 	ret = avcodec_encode_audio2(m_codecCtx, &packet, m_frame, &got_output);
-	if (ret < 0)
+	if(ret < 0)
 		AUD_THROW(AUD_ERROR_FFMPEG, codec_error);
 
-	if (!got_output)
+	if(!got_output)
 		return;
 #else
 	sample_t* outbuf = m_output_buffer.getBuffer();
@@ -290,10 +330,23 @@ void AUD_FFMPEGWriter::encode(sample_t* data)
 	packet.data = reinterpret_cast<uint8_t*>(outbuf);
 #endif
 
+	if(packet.pts != AV_NOPTS_VALUE)
+		packet.pts = av_rescale_q(packet.pts, m_codecCtx->time_base, m_stream->time_base);
+	if(packet.dts != AV_NOPTS_VALUE)
+		packet.dts = av_rescale_q(packet.dts, m_codecCtx->time_base, m_stream->time_base);
+	if(packet.duration > 0)
+		packet.duration = av_rescale_q(packet.duration, m_codecCtx->time_base, m_stream->time_base);
+
 	packet.stream_index = m_stream->index;
 
-	if(av_interleaved_write_frame(m_formatCtx, &packet))
+	packet.flags |= AV_PKT_FLAG_KEY;
+
+	if(av_interleaved_write_frame(m_formatCtx, &packet)) {
+		av_free_packet(&packet);
 		AUD_THROW(AUD_ERROR_FFMPEG, write_error);
+	}
+
+	av_free_packet(&packet);
 }
 
 void AUD_FFMPEGWriter::write(unsigned int length, sample_t* buffer)
diff --git a/intern/audaspace/ffmpeg/AUD_FFMPEGWriter.h b/intern/audaspace/ffmpeg/AUD_FFMPEGWriter.h
index 310f69258ea..492aa35ff12 100644
--- a/intern/audaspace/ffmpeg/AUD_FFMPEGWriter.h
+++ b/intern/audaspace/ffmpeg/AUD_FFMPEGWriter.h
@@ -83,6 +83,23 @@ private:
 	AVFrame *m_frame;
 
 	/**
+	 * PTS of next frame to write.
+	 */
+	int m_frame_pts;
+
+	/**
+	 * Number of bytes per sample.
+	 */
+	int m_sample_size;
+
+	/**
+	 * Need to de-interleave audio for planar sample formats.
+	 */
+	bool m_deinterleave;
+
+	AUD_Buffer m_deinterleave_buffer;
+
+	/**
 	 * The input buffer for the format converted data before encoding.
 	 */
 	AUD_Buffer m_input_buffer;
diff --git a/intern/audaspace/intern/AUD_AnimateableProperty.cpp b/intern/audaspace/intern/AUD_AnimateableProperty.cpp
index 61adae4b34b..9f399a0b99f 100644
--- a/intern/audaspace/intern/AUD_AnimateableProperty.cpp
+++ b/intern/audaspace/intern/AUD_AnimateableProperty.cpp
@@ -47,6 +47,23 @@ AUD_AnimateableProperty::AUD_AnimateableProperty(int count) :
 	pthread_mutexattr_destroy(&attr);
 }
 
+AUD_AnimateableProperty::AUD_AnimateableProperty(int count, float value) :
+	AUD_Buffer(count * sizeof(float)), m_count(count), m_isAnimated(false)
+{
+	sample_t* buf = getBuffer();
+
+	for(int i = 0; i < count; i++)
+		buf[i] = value;
+
+	pthread_mutexattr_t attr;
+	pthread_mutexattr_init(&attr);
+	pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_RECURSIVE);
+
+	pthread_mutex_init(&m_mutex, &attr);
+
+	pthread_mutexattr_destroy(&attr);
+}
+
 void AUD_AnimateableProperty::updateUnknownCache(int start, int end)
 {
 	float* buf = getBuffer();
@@ -104,7 +121,8 @@ void AUD_AnimateableProperty::write(const float* data, int position, int count)
 
 		if(pos == 0)
 		{
-			memset(buf, 0, position * m_count * sizeof(float));
+			for(int i = 0; i < position; i++)
+				memcpy(buf + i * m_count, data, m_count * sizeof(float));
 		}
 		else
 			updateUnknownCache(pos, position - 1);
diff --git a/intern/audaspace/intern/AUD_AnimateableProperty.h b/intern/audaspace/intern/AUD_AnimateableProperty.h
index 37eb8f84550..f07e5916b25 100644
--- a/intern/audaspace/intern/AUD_AnimateableProperty.h
+++ b/intern/audaspace/intern/AUD_AnimateableProperty.h
@@ -76,6 +76,13 @@ public:
 	AUD_AnimateableProperty(int count = 1);
 
 	/**
+	 * Creates a new animateable property.
+	 * \param count The count of floats for a single property.
+	 * \param value The value that the property should get initialized with. All count floats will be initialized to the same value.
+	 */
+	AUD_AnimateableProperty(int count, float value);
+
+	/**
 	 * Destroys the animateable property.
 	 */
 	~AUD_AnimateableProperty();
diff --git a/intern/audaspace/intern/AUD_ConverterFunctions.h b/intern/audaspace/intern/AUD_ConverterFunctions.h
index 1ffcf6c4ef0..7817ee88c07 100644
--- a/intern/audaspace/intern/AUD_ConverterFunctions.h
+++ b/intern/audaspace/intern/AUD_ConverterFunctions.h
@@ -34,12 +34,11 @@
 
 #include <cstring>
 #ifdef _MSC_VER
-#if (_MSC_VER < 1300)
+#if (_MSC_VER <= 1500)
    typedef short             int16_t;
    typedef int               int32_t;
 #else
-   typedef __int16           int16_t;
-   typedef __int32           int32_t;
+#	include <stdint.h>
 #endif
 #else
 #include <stdint.h>
diff --git a/intern/audaspace/intern/AUD_Sequencer.cpp b/intern/audaspace/intern/AUD_Sequencer.cpp
index c59c56a4479..6c5e48c73f0 100644
--- a/intern/audaspace/intern/AUD_Sequencer.cpp
+++ b/intern/audaspace/intern/AUD_Sequencer.cpp
@@ -42,6 +42,7 @@ AUD_Sequencer::AUD_Sequencer(AUD_Specs specs, float fps, bool muted) :
 	m_speed_of_sound(434),
 	m_doppler_factor(1),
 	m_distance_model(AUD_DISTANCE_MODEL_INVERSE_CLAMPED),
+	m_volume(1, 1.0f),
 	m_location(3),
 	m_orientation(4)
 {
diff --git a/intern/audaspace/intern/AUD_SequencerEntry.cpp b/intern/audaspace/intern/AUD_SequencerEntry.cpp
index 005557bbed1..6ef8479cdb8 100644
--- a/intern/audaspace/intern/AUD_SequencerEntry.cpp
+++ b/intern/audaspace/intern/AUD_SequencerEntry.cpp
@@ -53,6 +53,8 @@ AUD_SequencerEntry::AUD_SequencerEntry(boost::shared_ptr<AUD_IFactory> sound, fl
 	m_cone_angle_outer(360),
 	m_cone_angle_inner(360),
 	m_cone_volume_outer(0),
+	m_volume(1, 1.0f),
+	m_pitch(1, 1.0f),
 	m_location(3),
 	m_orientation(4)
 {
diff --git a/intern/cycles/CMakeLists.txt b/intern/cycles/CMakeLists.txt
index 5c8d68b07ee..a1b0030491e 100644
--- a/intern/cycles/CMakeLists.txt
+++ b/intern/cycles/CMakeLists.txt
@@ -1,4 +1,3 @@
-
 # Standalone or with Blender
 if(NOT WITH_BLENDER AND WITH_CYCLES_STANDALONE)
 	set(CYCLES_INSTALL_PATH "")
@@ -13,8 +12,11 @@ include(cmake/external_libs.cmake)
 
 # Build Flags
 # todo: refactor this code to match scons
+# note: CXX_HAS_SSE is needed in case passing SSE flags fails altogether (gcc-arm)
 
 if(WIN32 AND MSVC)
+	set(CXX_HAS_SSE TRUE)
+
 	# /arch:AVX for VC2012 and above
 	if(NOT MSVC_VERSION LESS 1700)
 		set(CYCLES_AVX_ARCH_FLAGS "/arch:AVX")
@@ -24,36 +26,49 @@ if(WIN32 AND MSVC)
 
 	# there is no /arch:SSE3, but intrinsics are available anyway
 	if(CMAKE_CL_64)
-		set(CYCLES_SSE2_KERNEL_FLAGS "/fp:fast -D_CRT_SECURE_NO_WARNINGS /Gs-")
-		set(CYCLES_SSE3_KERNEL_FLAGS "/fp:fast -D_CRT_SECURE_NO_WARNINGS /Gs-")
-		set(CYCLES_SSE41_KERNEL_FLAGS "/fp:fast -D_CRT_SECURE_NO_WARNINGS /Gs-")
-		set(CYCLES_AVX_KERNEL_FLAGS "${CYCLES_AVX_ARCH_FLAGS} /fp:fast -D_CRT_SECURE_NO_WARNINGS /Gs-")
+		set(CYCLES_SSE2_KERNEL_FLAGS "/fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-")
+		set(CYCLES_SSE3_KERNEL_FLAGS "/fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-")
+		set(CYCLES_SSE41_KERNEL_FLAGS "/fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-")
+		set(CYCLES_AVX_KERNEL_FLAGS "${CYCLES_AVX_ARCH_FLAGS} /fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-")
 	else()
-		set(CYCLES_SSE2_KERNEL_FLAGS "/arch:SSE2 /fp:fast -D_CRT_SECURE_NO_WARNINGS /Gs-")
-		set(CYCLES_SSE3_KERNEL_FLAGS "/arch:SSE2 /fp:fast -D_CRT_SECURE_NO_WARNINGS /Gs-")
-		set(CYCLES_SSE41_KERNEL_FLAGS "/arch:SSE2 /fp:fast -D_CRT_SECURE_NO_WARNINGS /Gs-")
-		set(CYCLES_AVX_KERNEL_FLAGS "${CYCLES_AVX_ARCH_FLAGS} /fp:fast -D_CRT_SECURE_NO_WARNINGS /Gs-") 
+		set(CYCLES_SSE2_KERNEL_FLAGS "/arch:SSE2 /fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-")
+		set(CYCLES_SSE3_KERNEL_FLAGS "/arch:SSE2 /fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-")
+		set(CYCLES_SSE41_KERNEL_FLAGS "/arch:SSE2 /fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-")
+		set(CYCLES_AVX_KERNEL_FLAGS "${CYCLES_AVX_ARCH_FLAGS} /fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-")
 	endif()
 
-	set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /fp:fast -D_CRT_SECURE_NO_WARNINGS /Gs-")
+	set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-")
 	set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /Ox")
 	set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} /Ox")
 	set(CMAKE_CXX_FLAGS_MINSIZEREL "${CMAKE_CXX_FLAGS_MINSIZEREL} /Ox")
 elseif(CMAKE_COMPILER_IS_GNUCC)
-	set(CYCLES_SSE2_KERNEL_FLAGS "-ffast-math -msse -msse2 -mfpmath=sse")
-	set(CYCLES_SSE3_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3 -mfpmath=sse")
-	set(CYCLES_SSE41_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3 -msse4.1 -mfpmath=sse")
-	set(CYCLES_AVX_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3 -msse4.1 -mavx -mfpmath=sse")
+	check_cxx_compiler_flag(-msse CXX_HAS_SSE)
+	if(CXX_HAS_SSE)
+		set(CYCLES_SSE2_KERNEL_FLAGS "-ffast-math -msse -msse2 -mfpmath=sse")
+		set(CYCLES_SSE3_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3 -mfpmath=sse")
+		set(CYCLES_SSE41_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3 -msse4.1 -mfpmath=sse")
+		set(CYCLES_AVX_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3 -msse4.1 -mavx -mfpmath=sse")
+	endif()
 	set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ffast-math")
 elseif(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
-	set(CYCLES_SSE2_KERNEL_FLAGS "-ffast-math -msse -msse2")
-	set(CYCLES_SSE3_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3")
-	set(CYCLES_SSE41_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3 -msse4.1")
-	set(CYCLES_AVX_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3 -msse4.1 -mavx")
+	check_cxx_compiler_flag(-msse CXX_HAS_SSE)
+	if(CXX_HAS_SSE)
+		set(CYCLES_SSE2_KERNEL_FLAGS "-ffast-math -msse -msse2")
+		set(CYCLES_SSE3_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3")
+		set(CYCLES_SSE41_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3 -msse4.1")
+		set(CYCLES_AVX_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3 -msse4.1 -mavx")
+	endif()
 	set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ffast-math")
 endif()
 
-add_definitions(-DWITH_KERNEL_SSE2 -DWITH_KERNEL_SSE3 -DWITH_KERNEL_SSE41 -DWITH_KERNEL_AVX)
+if(CXX_HAS_SSE)
+	add_definitions(
+		-DWITH_KERNEL_SSE2
+		-DWITH_KERNEL_SSE3
+		-DWITH_KERNEL_SSE41
+		-DWITH_KERNEL_AVX
+	)
+endif()
 
 # for OSL
 if(WIN32 AND MSVC)
@@ -64,10 +79,15 @@ endif()
 
 # Definitions and Includes
 
-add_definitions(${BOOST_DEFINITIONS} ${OPENIMAGEIO_DEFINITIONS})
+add_definitions(
+	${BOOST_DEFINITIONS}
+	${OPENIMAGEIO_DEFINITIONS}
+)
 
-add_definitions(-DCCL_NAMESPACE_BEGIN=namespace\ ccl\ {)
-add_definitions(-DCCL_NAMESPACE_END=})
+add_definitions(
+	-DCCL_NAMESPACE_BEGIN=namespace\ ccl\ {
+	-DCCL_NAMESPACE_END=}
+)
 
 if(WITH_CYCLES_NETWORK)
 	add_definitions(-DWITH_NETWORK)
@@ -91,9 +111,11 @@ if(WITH_CYCLES_OSL)
 	include_directories(${OSL_INCLUDES})
 endif()
 
-add_definitions(-DWITH_OPENCL)
-add_definitions(-DWITH_CUDA)
-add_definitions(-DWITH_MULTI)
+add_definitions(
+	-DWITH_OPENCL
+	-DWITH_CUDA
+	-DWITH_MULTI
+)
 
 include_directories(
 	SYSTEM
@@ -101,7 +123,16 @@ include_directories(
 	${OPENIMAGEIO_INCLUDE_DIRS}
 	${OPENIMAGEIO_INCLUDE_DIRS}/OpenImageIO
 	${OPENEXR_INCLUDE_DIR}
-	${OPENEXR_INCLUDE_DIRS})
+	${OPENEXR_INCLUDE_DIRS}
+)
+
+
+# Warnings
+if(CMAKE_COMPILER_IS_GNUCXX)
+	ADD_CHECK_CXX_COMPILER_FLAG(CMAKE_CXX_FLAGS _has_cxxflag_float_conversion "-Werror=float-conversion")
+	unset(_has_cxxflag_float_conversion)
+endif()
+
 
 # Subdirectories
 
diff --git a/intern/cycles/SConscript b/intern/cycles/SConscript
index b8c731e3315..532238b9d7e 100644
--- a/intern/cycles/SConscript
+++ b/intern/cycles/SConscript
@@ -72,6 +72,12 @@ if env['OURPLATFORM'] in ('win32-vc', 'win64-vc'):
 else:
     cxxflags.append('-ffast-math'.split())
 
+# Warnings
+# XXX Not supported by gcc < 4.9. Since we do not have any 'supported flags' test as in cmake,
+#     it is simpler to keep this commented out for now.
+#if env['C_COMPILER_ID'] == 'gcc':
+#    cxxflags.append(['-Werror=float-conversion'])
+
 if env['OURPLATFORM'] in ('win32-vc', 'win32-mingw', 'linuxcross', 'win64-vc', 'win64-mingw'):
     incs.append(env['BF_PTHREADS_INC'])
 
@@ -81,12 +87,12 @@ kernel_flags = {}
 
 if env['OURPLATFORM'] == 'win32-vc':
     # there is no /arch:SSE3, but intrinsics are available anyway
-    kernel_flags['sse2'] = '/arch:SSE /arch:SSE2 -D_CRT_SECURE_NO_WARNINGS /fp:fast /Ox /Gs-'
+    kernel_flags['sse2'] = '/arch:SSE /arch:SSE2 -D_CRT_SECURE_NO_WARNINGS /fp:fast /Ox /GS-'
     kernel_flags['sse3'] = kernel_flags['sse2']
 
 elif env['OURPLATFORM'] == 'win64-vc':
     # /arch:AVX only available from visual studio 2012
-    kernel_flags['sse2'] = '-D_CRT_SECURE_NO_WARNINGS /fp:fast /Ox /Gs-'
+    kernel_flags['sse2'] = '-D_CRT_SECURE_NO_WARNINGS /fp:fast /Ox /GS-'
     kernel_flags['sse3'] = kernel_flags['sse2']
 
     if env['MSVC_VERSION'] in ('11.0', '12.0'):
diff --git a/intern/cycles/app/cycles_standalone.cpp b/intern/cycles/app/cycles_standalone.cpp
index 230833802b0..7ea1ca2d8fb 100644
--- a/intern/cycles/app/cycles_standalone.cpp
+++ b/intern/cycles/app/cycles_standalone.cpp
@@ -46,7 +46,8 @@ struct Options {
 	int width, height;
 	SceneParams scene_params;
 	SessionParams session_params;
-	bool quiet, show_help, interactive;
+	bool quiet;
+	bool show_help, interactive, pause;
 } options;
 
 static void session_print(const string& str)
@@ -114,15 +115,25 @@ static void session_init()
 	options.scene = NULL;
 }
 
-static void scene_init(int width, int height)
+static void scene_init()
 {
 	options.scene = new Scene(options.scene_params, options.session_params.device);
+
+	/* Read XML */
 	xml_read_file(options.scene, options.filepath.c_str());
 
-	if (width == 0 || height == 0) {
+	/* Camera width/height override? */
+	if (!(options.width == 0 || options.height == 0)) {
+		options.scene->camera->width = options.width;
+		options.scene->camera->height = options.height;
+	}
+	else {
 		options.width = options.scene->camera->width;
 		options.height = options.scene->camera->height;
 	}
+
+	/* Calculate Viewplane */
+	options.scene->camera->compute_auto_viewplane();
 }
 
 static void session_exit()
@@ -166,8 +177,14 @@ static void display_info(Progress& progress)
 
 	interactive = options.interactive? "On":"Off";
 
-	str = string_printf("%s        Time: %.2f        Latency: %.4f        Sample: %d        Average: %.4f        Interactive: %s",
-						status.c_str(), total_time, latency, sample, sample_time, interactive.c_str());
+	str = string_printf(
+	        "%s"
+	        "        Time: %.2f"
+	        "        Latency: %.4f"
+	        "        Sample: %d"
+	        "        Average: %.4f"
+	        "        Interactive: %s",
+	        status.c_str(), total_time, latency, sample, sample_time, interactive.c_str());
 
 	view_display_info(str.c_str());
 
@@ -177,7 +194,9 @@ static void display_info(Progress& progress)
 
 static void display()
 {
-	options.session->draw(session_buffer_params());
+	static DeviceDrawParams draw_params = DeviceDrawParams();
+
+	options.session->draw(session_buffer_params(), draw_params);
 
 	display_info(options.session->progress);
 }
@@ -195,11 +214,11 @@ static void motion(int x, int y, int button)
 
 		/* Rotate */
 		else if(button == 2) {
-			float4 r1= make_float4(x * 0.1f, 0.0f, 1.0f, 0.0f);
-			matrix = matrix * transform_rotate(r1.x * M_PI/180.0f, make_float3(r1.y, r1.z, r1.w));
+			float4 r1 = make_float4((float)x * 0.1f, 0.0f, 1.0f, 0.0f);
+			matrix = matrix * transform_rotate(DEG2RADF(r1.x), make_float3(r1.y, r1.z, r1.w));
 
-			float4 r2 = make_float4(y * 0.1, 1.0f, 0.0f, 0.0f);
-			matrix = matrix * transform_rotate(r2.x * M_PI/180.0f, make_float3(r2.y, r2.z, r2.w));
+			float4 r2 = make_float4(y * 0.1f, 1.0f, 0.0f, 0.0f);
+			matrix = matrix * transform_rotate(DEG2RADF(r2.x), make_float3(r2.y, r2.z, r2.w));
 		}
 
 		/* Update and Reset */
@@ -216,20 +235,64 @@ static void resize(int width, int height)
 	options.width = width;
 	options.height = height;
 
-	if(options.session)
+	if(options.session) {
+		/* Update camera */
+		options.session->scene->camera->width = width;
+		options.session->scene->camera->height = height;
+		options.session->scene->camera->compute_auto_viewplane();
+		options.session->scene->camera->need_update = true;
+		options.session->scene->camera->need_device_update = true;
+
 		options.session->reset(session_buffer_params(), options.session_params.samples);
+	}
 }
 
 static void keyboard(unsigned char key)
 {
-	if(key == 'r')
-		options.session->reset(session_buffer_params(), options.session_params.samples);
-	else if(key == 'h')
+	/* Toggle help */
+	if(key == 'h')
 		options.show_help = !(options.show_help);
-	else if(key == 'i')
-		options.interactive = !(options.interactive);
+
+	/* Reset */
+	else if(key == 'r')
+		options.session->reset(session_buffer_params(), options.session_params.samples);
+
+	/* Cancel */
 	else if(key == 27) // escape
 		options.session->progress.set_cancel("Canceled");
+
+	/* Pause */
+	else if(key == 'p') {
+		options.pause = !options.pause;
+		options.session->set_pause(options.pause);
+	}
+
+	/* Interactive Mode */
+	else if(key == 'i')
+		options.interactive = !(options.interactive);
+
+	else if(options.interactive && (key == 'w' || key == 'a' || key == 's' || key == 'd')) {
+		Transform matrix = options.session->scene->camera->matrix;
+		float3 translate;
+
+		if(key == 'w')
+			translate = make_float3(0.0f, 0.0f, 0.1f);
+		else if(key == 's')
+			translate = make_float3(0.0f, 0.0f, -0.1f);
+		else if(key == 'a')
+			translate = make_float3(-0.1f, 0.0f, 0.0f);
+		else if(key == 'd')
+			translate = make_float3(0.1f, 0.0f, 0.0f);
+
+		matrix = matrix * transform_translate(translate);
+
+		/* Update and Reset */
+		options.session->scene->camera->matrix = matrix;
+		options.session->scene->camera->need_update = true;
+		options.session->scene->camera->need_device_update = true;
+
+		options.session->reset(session_buffer_params(), options.session_params.samples);
+	}
 }
 #endif
 
@@ -314,15 +377,13 @@ static void options_parse(int argc, const char **argv)
 	else if(ssname == "svm")
 		options.scene_params.shadingsystem = SceneParams::SVM;
 
-#ifdef WITH_CYCLES_STANDALONE_GUI
-	/* Progressive rendering for GUI */
-	if(!options.session_params.background)
-		options.session_params.progressive = true;
-#else
-	/* When building without GUI, set background */
+#ifndef WITH_CYCLES_STANDALONE_GUI
 	options.session_params.background = true;
 #endif
 
+	/* Use progressive rendering */
+	options.session_params.progressive = true;
+
 	/* find matching device */
 	DeviceType device_type = Device::type_from_string(devicename.c_str());
 	vector<DeviceInfo>& devices = Device::available_devices();
@@ -360,12 +421,12 @@ static void options_parse(int argc, const char **argv)
 		fprintf(stderr, "No file path specified\n");
 		exit(EXIT_FAILURE);
 	}
-	
+
 	/* For smoother Viewport */
 	options.session_params.start_resolution = 64;
 
 	/* load scene */
-	scene_init(options.width, options.height);
+	scene_init();
 }
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/app/cycles_xml.cpp b/intern/cycles/app/cycles_xml.cpp
index 14fe43115d5..d5ef30e5c6f 100644
--- a/intern/cycles/app/cycles_xml.cpp
+++ b/intern/cycles/app/cycles_xml.cpp
@@ -105,7 +105,7 @@ static bool xml_read_float(float *value, pugi::xml_node node, const char *name)
 	pugi::xml_attribute attr = node.attribute(name);
 
 	if(attr) {
-		*value = atof(attr.value());
+		*value = (float)atof(attr.value());
 		return true;
 	}
 
@@ -121,7 +121,7 @@ static bool xml_read_float_array(vector<float>& value, pugi::xml_node node, cons
 		string_split(tokens, attr.value());
 
 		foreach(const string& token, tokens)
-			value.push_back(atof(token.c_str()));
+			value.push_back((float)atof(token.c_str()));
 
 		return true;
 	}
@@ -219,6 +219,35 @@ static bool xml_read_enum(ustring *str, ShaderEnum& enm, pugi::xml_node node, co
 	return false;
 }
 
+static ShaderSocketType xml_read_socket_type(pugi::xml_node node, const char *name)
+{
+	pugi::xml_attribute attr = node.attribute(name);
+
+	if(attr) {
+		string value = attr.value();
+		if (string_iequals(value, "float"))
+			return SHADER_SOCKET_FLOAT;
+		else if (string_iequals(value, "int"))
+			return SHADER_SOCKET_INT;
+		else if (string_iequals(value, "color"))
+			return SHADER_SOCKET_COLOR;
+		else if (string_iequals(value, "vector"))
+			return SHADER_SOCKET_VECTOR;
+		else if (string_iequals(value, "point"))
+			return SHADER_SOCKET_POINT;
+		else if (string_iequals(value, "normal"))
+			return SHADER_SOCKET_NORMAL;
+		else if (string_iequals(value, "closure color"))
+			return SHADER_SOCKET_CLOSURE;
+		else if (string_iequals(value, "string"))
+			return SHADER_SOCKET_STRING;
+		else
+			fprintf(stderr, "Unknown shader socket type \"%s\" for attribute \"%s\".\n", value.c_str(), name);
+	}
+	
+	return SHADER_SOCKET_UNDEFINED;
+}
+
 /* Film */
 
 static void xml_read_film(const XMLReadState& state, pugi::xml_node node)
@@ -251,6 +280,8 @@ static void xml_read_integrator(const XMLReadState& state, pugi::xml_node node)
 		xml_read_int(&integrator->mesh_light_samples, node, "mesh_light_samples");
 		xml_read_int(&integrator->subsurface_samples, node, "subsurface_samples");
 		xml_read_int(&integrator->volume_samples, node, "volume_samples");
+		xml_read_bool(&integrator->sample_all_lights_direct, node, "sample_all_lights_direct");
+		xml_read_bool(&integrator->sample_all_lights_indirect, node, "sample_all_lights_indirect");
 	}
 	
 	/* Bounces */
@@ -268,6 +299,7 @@ static void xml_read_integrator(const XMLReadState& state, pugi::xml_node node)
 	xml_read_bool(&integrator->transparent_shadows, node, "transparent_shadows");
 	
 	/* Volume */
+	xml_read_int(&integrator->volume_homogeneous_sampling, node, "volume_homogeneous_sampling");
 	xml_read_float(&integrator->volume_step_size, node, "volume_step_size");
 	xml_read_int(&integrator->volume_max_steps, node, "volume_max_steps");
 	
@@ -289,23 +321,8 @@ static void xml_read_camera(const XMLReadState& state, pugi::xml_node node)
 	xml_read_int(&cam->width, node, "width");
 	xml_read_int(&cam->height, node, "height");
 
-	float aspect = (float)cam->width/(float)cam->height;
-
-	if(cam->width >= cam->height) {
-		cam->viewplane.left = -aspect;
-		cam->viewplane.right = aspect;
-		cam->viewplane.bottom = -1.0f;
-		cam->viewplane.top = 1.0f;
-	}
-	else {
-		cam->viewplane.left = -1.0f;
-		cam->viewplane.right = 1.0f;
-		cam->viewplane.bottom = -1.0f/aspect;
-		cam->viewplane.top = 1.0f/aspect;
-	}
-
 	if(xml_read_float(&cam->fov, node, "fov"))
-		cam->fov *= M_PI/180.0f;
+		cam->fov = DEG2RADF(cam->fov);
 
 	xml_read_float(&cam->nearclip, node, "nearclip");
 	xml_read_float(&cam->farclip, node, "farclip");
@@ -333,7 +350,6 @@ static void xml_read_camera(const XMLReadState& state, pugi::xml_node node)
 	xml_read_float(&cam->sensorwidth, node, "sensorwidth");
 	xml_read_float(&cam->sensorheight, node, "sensorheight");
 
-
 	cam->matrix = state.tfm;
 
 	cam->need_update = true;
@@ -392,24 +408,41 @@ static void xml_read_shader_graph(const XMLReadState& state, Shader *shader, pug
 
 			/* Source */
 			xml_read_string(&osl->filepath, node, "src");
-			osl->filepath = path_join(state.base, osl->filepath);
-
-			/* Outputs */
-			string output = "", output_type = "";
-			ShaderSocketType type = SHADER_SOCKET_FLOAT;
+			if(path_is_relative(osl->filepath)) {
+				osl->filepath = path_join(state.base, osl->filepath);
+			}
 
-			xml_read_string(&output, node, "output");
-			xml_read_string(&output_type, node, "output_type");
-			
-			if(output_type == "float")
-				type = SHADER_SOCKET_FLOAT;
-			else if(output_type == "closure color")
-				type = SHADER_SOCKET_CLOSURE;
-			else if(output_type == "color")
-				type = SHADER_SOCKET_COLOR;
-
-			osl->output_names.push_back(ustring(output));
-			osl->add_output(osl->output_names.back().c_str(), type);
+			/* Generate inputs/outputs from node sockets
+			 *
+			 * Note: ShaderInput/ShaderOutput store shallow string copies only!
+			 * Socket names must be stored in the extra lists instead. */
+			/* read input and output socket declarations */
+			for(pugi::xml_node param = node.first_child(); param; param = param.next_sibling()) {
+				if (string_iequals(param.name(), "input")) {
+					string name;
+					if (!xml_read_string(&name, param, "name"))
+						continue;
+					
+					ShaderSocketType type = xml_read_socket_type(param, "type");
+					if (type == SHADER_SOCKET_UNDEFINED)
+						continue;
+					
+					osl->input_names.push_back(ustring(name));
+					osl->add_input(osl->input_names.back().c_str(), type);
+				}
+				else if (string_iequals(param.name(), "output")) {
+					string name;
+					if (!xml_read_string(&name, param, "name"))
+						continue;
+					
+					ShaderSocketType type = xml_read_socket_type(param, "type");
+					if (type == SHADER_SOCKET_UNDEFINED)
+						continue;
+					
+					osl->output_names.push_back(ustring(name));
+					osl->add_output(osl->output_names.back().c_str(), type);
+				}
+			}
 			
 			snode = osl;
 		}
@@ -616,6 +649,11 @@ static void xml_read_shader_graph(const XMLReadState& state, Shader *shader, pug
 			xml_read_ustring(&attr->attribute, node, "attribute");
 			snode = attr;
 		}
+		else if(string_iequals(node.name(), "uv_map")) {
+			UVMapNode *uvm = new UVMapNode();
+			xml_read_ustring(&uvm->attribute, node, "uv_map");
+			snode = uvm;
+		}
 		else if(string_iequals(node.name(), "camera")) {
 			snode = new CameraNode();
 		}
@@ -734,6 +772,9 @@ static void xml_read_shader_graph(const XMLReadState& state, Shader *shader, pug
 							case SHADER_SOCKET_NORMAL:
 								xml_read_float3(&in->value, node, attr.name());
 								break;
+							case SHADER_SOCKET_STRING:
+								xml_read_ustring( &in->value_string, node, attr.name() );
+								break;
 							default:
 								break;
 						}
@@ -765,6 +806,8 @@ static void xml_read_shader(const XMLReadState& state, pugi::xml_node node)
 static void xml_read_background(const XMLReadState& state, pugi::xml_node node)
 {
 	Shader *shader = state.scene->shaders[state.scene->default_background];
+	/* read the "heterogeneous_volume" attribute into the default background shader */
+	xml_read_bool(&shader->heterogeneous_volume, node, "heterogeneous_volume");
 
 	xml_read_shader_graph(state, shader, node);
 }
@@ -846,7 +889,7 @@ static void xml_read_mesh(const XMLReadState& state, pugi::xml_node node)
 		SubdParams sdparams(mesh, shader, smooth);
 		xml_read_float(&sdparams.dicing_rate, node, "dicing_rate");
 
-		DiagSplit dsplit(sdparams);;
+		DiagSplit dsplit(sdparams);
 		sdmesh.tessellate(&dsplit);
 	}
 	else {
@@ -944,6 +987,26 @@ static void xml_read_light(const XMLReadState& state, pugi::xml_node node)
 {
 	Light *light = new Light();
 	light->shader = state.shader;
+
+	/* Light Type
+	 * 0: Point, 1: Sun, 3: Area, 5: Spot */
+	int type = 0;
+	xml_read_int(&type, node, "type");
+	light->type = (LightType)type;
+
+	/* Spot Light */
+	xml_read_float(&light->spot_angle, node, "spot_angle");
+	xml_read_float(&light->spot_smooth, node, "spot_smooth");
+
+	/* Area Light */
+	xml_read_float(&light->sizeu, node, "sizeu");
+	xml_read_float(&light->sizev, node, "sizev");
+	xml_read_float3(&light->axisu, node, "axisu");
+	xml_read_float3(&light->axisv, node, "axisv");
+	
+	/* Generic */
+	xml_read_float(&light->size, node, "size");
+	xml_read_float3(&light->dir, node, "dir");
 	xml_read_float3(&light->co, node, "P");
 	light->co = transform_point(&state.tfm, light->co);
 
@@ -969,7 +1032,7 @@ static void xml_read_transform(pugi::xml_node node, Transform& tfm)
 	if(node.attribute("rotate")) {
 		float4 rotate = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
 		xml_read_float4(&rotate, node, "rotate");
-		tfm = tfm * transform_rotate(rotate.x*M_PI/180.0f, make_float3(rotate.y, rotate.z, rotate.w));
+		tfm = tfm * transform_rotate(DEG2RADF(rotate.x), make_float3(rotate.y, rotate.z, rotate.w));
 	}
 
 	if(node.attribute("scale")) {
diff --git a/intern/cycles/app/cycles_xml.h b/intern/cycles/app/cycles_xml.h
index 1e3ed411312..96bc79c35d8 100644
--- a/intern/cycles/app/cycles_xml.h
+++ b/intern/cycles/app/cycles_xml.h
@@ -14,8 +14,8 @@
  * limitations under the License
  */
 
-#ifndef __CYCLES_XML__
-#define __CYCLES_XML__
+#ifndef __CYCLES_XML_H__
+#define __CYCLES_XML_H__
 
 CCL_NAMESPACE_BEGIN
 
@@ -23,7 +23,10 @@ class Scene;
 
 void xml_read_file(Scene *scene, const char *filepath);
 
-CCL_NAMESPACE_END
+/* macros for importing */
+#define RAD2DEGF(_rad) ((_rad) * (float)(180.0 / M_PI))
+#define DEG2RADF(_deg) ((_deg) * (float)(M_PI / 180.0))
 
-#endif /* __CYCLES_XML__ */
+CCL_NAMESPACE_END
 
+#endif /* __CYCLES_XML_H__ */
diff --git a/intern/cycles/blender/CCL_api.h b/intern/cycles/blender/CCL_api.h
index 6532315cf39..2772b9ac8a7 100644
--- a/intern/cycles/blender/CCL_api.h
+++ b/intern/cycles/blender/CCL_api.h
@@ -14,8 +14,8 @@
  * limitations under the License
  */
 
-#ifndef CCL_API_H
-#define CCL_API_H
+#ifndef __CCL_API_H__
+#define __CCL_API_H__
 
 #ifdef __cplusplus
 extern "C" {
@@ -40,5 +40,4 @@ void *CCL_python_module_init(void);
 }
 #endif
 
-#endif /* CCL_API_H */
-
+#endif /* __CCL_API_H__ */
diff --git a/intern/cycles/blender/CMakeLists.txt b/intern/cycles/blender/CMakeLists.txt
index 25f91a0caea..9a60152841e 100644
--- a/intern/cycles/blender/CMakeLists.txt
+++ b/intern/cycles/blender/CMakeLists.txt
@@ -49,6 +49,11 @@ add_definitions(-DGLEW_STATIC)
 
 blender_add_lib(bf_intern_cycles "${SRC}" "${INC}" "${INC_SYS}")
 
+# avoid link failure with clang 3.4+ debug builds (CMake has no single-quoted strings, so quote with "")
+if(CMAKE_C_COMPILER_ID MATCHES "Clang" AND NOT "${CMAKE_C_COMPILER_VERSION}" VERSION_LESS "3.4")
+	set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -gline-tables-only")
+endif()
+
 add_dependencies(bf_intern_cycles bf_rna)
 
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${ADDON_FILES}" ${CYCLES_INSTALL_PATH})
diff --git a/intern/cycles/blender/addon/__init__.py b/intern/cycles/blender/addon/__init__.py
index afd26945d6c..27d986900c8 100644
--- a/intern/cycles/blender/addon/__init__.py
+++ b/intern/cycles/blender/addon/__init__.py
@@ -19,7 +19,7 @@
 bl_info = {
     "name": "Cycles Render Engine",
     "author": "",
-    "blender": (2, 67, 0),
+    "blender": (2, 70, 0),
     "location": "Info header, render engine menu",
     "description": "Cycles Render Engine integration",
     "warning": "",
@@ -67,6 +67,9 @@ class CyclesRender(bpy.types.RenderEngine):
     def render(self, scene):
         engine.render(self)
 
+    def bake(self, scene, obj, pass_type, pixel_array, num_pixels, depth, result):
+        engine.bake(self, obj, pass_type, pixel_array, num_pixels, depth, result)  # `scene` is not forwarded
+
     # viewport render
     def view_update(self, context):
         if not self.session:
diff --git a/intern/cycles/blender/addon/engine.py b/intern/cycles/blender/addon/engine.py
index b9ce65588df..25a9e97a99b 100644
--- a/intern/cycles/blender/addon/engine.py
+++ b/intern/cycles/blender/addon/engine.py
@@ -59,6 +59,12 @@ def render(engine):
         _cycles.render(engine.session)
 
 
+def bake(engine, obj, pass_type, pixel_array, num_pixels, depth, result):
+    import _cycles
+    session = getattr(engine, "session", None)  # nothing is baked when the engine has no session
+    if session is not None:
+        _cycles.bake(session, obj.as_pointer(), pass_type, pixel_array.as_pointer(), num_pixels, depth, result.as_pointer())
+
 def reset(engine, data, scene):
     import _cycles
     data = data.as_pointer()
diff --git a/intern/cycles/blender/addon/properties.py b/intern/cycles/blender/addon/properties.py
index c80e8a3250c..7205a272395 100644
--- a/intern/cycles/blender/addon/properties.py
+++ b/intern/cycles/blender/addon/properties.py
@@ -108,6 +108,11 @@ enum_integrator = (
     ('PATH', "Path Tracing", "Pure path tracing integrator"),
     )
 
+enum_volume_homogeneous_sampling = (
+    ('DISTANCE', "Distance", "Use Distance Sampling"),
+    ('EQUI_ANGULAR', "Equi-angular", "Use Equi-angular Sampling"),
+    )
+
 
 class CyclesRenderSettings(bpy.types.PropertyGroup):
     @classmethod
@@ -141,6 +146,13 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
                 default='PATH',
                 )
 
+        cls.volume_homogeneous_sampling = EnumProperty(
+                name="Homogeneous Sampling",
+                description="Sampling method to use for homogeneous volumes",
+                items=enum_volume_homogeneous_sampling,
+                default='DISTANCE',
+                )
+
         cls.use_square_samples = BoolProperty(
                 name="Square Samples",
                 description="Square sampling values for easier artist control",
@@ -241,6 +253,18 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
                 default='USE',
                 )
 
+        cls.sample_all_lights_direct = BoolProperty(
+                name="Sample All Direct Lights",
+                description="Sample all lights (for direct samples), rather than randomly picking one",
+                default=True,
+                )
+
+        cls.sample_all_lights_indirect = BoolProperty(
+                name="Sample All Indirect Lights",
+                description="Sample all lights (for indirect samples), rather than randomly picking one",
+                default=True,
+                )
+
         cls.no_caustics = BoolProperty(
                 name="No Caustics",
                 description="Leave out caustics, resulting in a darker image with less noise",
@@ -447,6 +471,33 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
                 default=False,
                 )
 
+        cls.bake_type = EnumProperty(
+            name="Bake Type",
+            default='COMBINED',
+            description="Type of pass to bake",
+            items = (
+                ('COMBINED', "Combined", ""),
+                ('AO', "Ambient Occlusion", ""),
+                ('SHADOW', "Shadow", ""),
+                ('NORMAL', "Normal", ""),
+                ('UV', "UV", ""),
+                ('EMIT', "Emit", ""),
+                ('ENVIRONMENT', "Environment", ""),
+                ('DIFFUSE_DIRECT', "Diffuse Direct", ""),
+                ('DIFFUSE_INDIRECT', "Diffuse Indirect", ""),
+                ('DIFFUSE_COLOR', "Diffuse Color", ""),
+                ('GLOSSY_DIRECT', "Glossy Direct", ""),
+                ('GLOSSY_INDIRECT', "Glossy Indirect", ""),
+                ('GLOSSY_COLOR', "Glossy Color", ""),
+                ('TRANSMISSION_DIRECT', "Transmission Direct", ""),
+                ('TRANSMISSION_INDIRECT', "Transmission Indirect", ""),
+                ('TRANSMISSION_COLOR', "Transmission Color", ""),
+                ('SUBSURFACE_DIRECT', "Subsurface Direct", ""),
+                ('SUBSURFACE_INDIRECT', "Subsurface Indirect", ""),
+                ('SUBSURFACE_COLOR', "Subsurface Color", ""),
+                ),
+            )
+
     @classmethod
     def unregister(cls):
         del bpy.types.Scene.cycles
@@ -718,6 +769,41 @@ class CyclesMeshSettings(bpy.types.PropertyGroup):
         del bpy.types.MetaBall.cycles
 
 
+class CyclesObjectBlurSettings(bpy.types.PropertyGroup):
+    # Per-object Cycles motion blur settings, exposed on objects as `Object.cycles`.
+    @classmethod
+    def register(cls):
+        # attach this group to bpy.types.Object as `cycles`, then declare its properties
+        bpy.types.Object.cycles = PointerProperty(
+                name="Cycles Object Settings",
+                description="Cycles object settings",
+                type=cls,
+                )
+
+        cls.use_motion_blur = BoolProperty(
+                name="Use Motion Blur",
+                description="Use motion blur for this object",
+                default=True,
+                )
+
+        cls.use_deform_motion = BoolProperty(
+                name="Use Deformation Motion",
+                description="Use deformation motion blur for this object",
+                default=True,
+                )
+
+        cls.motion_steps = IntProperty(
+                name="Motion Steps",
+                description="Control accuracy of deformation motion blur, more steps gives more memory usage (actual number of steps is 2^(steps - 1))",
+                min=1, soft_max=8,
+                default=1,
+                )
+
+    @classmethod
+    def unregister(cls):
+        del bpy.types.Object.cycles
+
+
 class CyclesCurveRenderSettings(bpy.types.PropertyGroup):
     @classmethod
     def register(cls):
diff --git a/intern/cycles/blender/addon/ui.py b/intern/cycles/blender/addon/ui.py
index c0ce80426c0..5c8115b6612 100644
--- a/intern/cycles/blender/addon/ui.py
+++ b/intern/cycles/blender/addon/ui.py
@@ -49,6 +49,13 @@ class CyclesButtonsPanel():
         return rd.engine in cls.COMPAT_ENGINES
 
 
+def use_cpu(context):
+    cycles_settings = context.scene.cycles
+    compute_type = context.user_preferences.system.compute_device_type
+    # True when the compute device type is 'NONE' or the scene's Cycles device is 'CPU'
+    return compute_type == 'NONE' or cycles_settings.device == 'CPU'
+
+
 def draw_samples_info(layout, cscene):
     integrator = cscene.progressive
 
@@ -103,7 +110,6 @@ class CyclesRender_PT_sampling(CyclesButtonsPanel, Panel):
 
         scene = context.scene
         cscene = scene.cycles
-        device_type = context.user_preferences.system.compute_device_type
 
         row = layout.row(align=True)
         row.menu("CYCLES_MT_sampling_presets", text=bpy.types.CYCLES_MT_sampling_presets.bl_label)
@@ -133,6 +139,9 @@ class CyclesRender_PT_sampling(CyclesButtonsPanel, Panel):
             sub.label(text="AA Samples:")
             sub.prop(cscene, "aa_samples", text="Render")
             sub.prop(cscene, "preview_aa_samples", text="Preview")
+            sub.separator()
+            sub.prop(cscene, "sample_all_lights_direct")
+            sub.prop(cscene, "sample_all_lights_indirect")
 
             col = split.column()
             sub = col.column(align=True)
@@ -145,7 +154,7 @@ class CyclesRender_PT_sampling(CyclesButtonsPanel, Panel):
             sub.prop(cscene, "subsurface_samples", text="Subsurface")
             sub.prop(cscene, "volume_samples", text="Volume")
 
-        if cscene.feature_set == 'EXPERIMENTAL' and (device_type == 'NONE' or cscene.device == 'CPU'):
+        if cscene.feature_set == 'EXPERIMENTAL' and use_cpu(context):
             layout.row().prop(cscene, "sampling_pattern", text="Pattern")
 
         for rl in scene.render.layers:
@@ -167,9 +176,16 @@ class CyclesRender_PT_volume_sampling(CyclesButtonsPanel, Panel):
         scene = context.scene
         cscene = scene.cycles
 
-        split = layout.split()
-        split.prop(cscene, "volume_step_size")
-        split.prop(cscene, "volume_max_steps")
+        split = layout.split(align=True)
+
+        sub = split.column(align=True)
+        sub.label("Heterogeneous:")
+        sub.prop(cscene, "volume_step_size")
+        sub.prop(cscene, "volume_max_steps")
+
+        sub = split.column(align=True)
+        sub.label("Homogeneous:")
+        sub.prop(cscene, "volume_homogeneous_sampling", text="")
 
 
 class CyclesRender_PT_light_paths(CyclesButtonsPanel, Panel):
@@ -310,28 +326,6 @@ class CyclesRender_PT_performance(CyclesButtonsPanel, Panel):
         col.prop(cscene, "debug_use_spatial_splits")
 
 
-class CyclesRender_PT_opengl(CyclesButtonsPanel, Panel):
-    bl_label = "OpenGL Render"
-    bl_options = {'DEFAULT_CLOSED'}
-
-    def draw(self, context):
-        layout = self.layout
-
-        rd = context.scene.render
-
-        split = layout.split()
-
-        col = split.column()
-        col.prop(rd, "use_antialiasing")
-        sub = col.row()
-        sub.active = rd.use_antialiasing
-        sub.prop(rd, "antialiasing_samples", expand=True)
-
-        col = split.column()
-        col.label(text="Alpha:")
-        col.prop(rd, "alpha_mode", text="")
-
-
 class CyclesRender_PT_layer_options(CyclesButtonsPanel, Panel):
     bl_label = "Layer"
     bl_context = "render_layer"
@@ -562,26 +556,48 @@ class Cycles_PT_mesh_displacement(CyclesButtonsPanel, Panel):
         layout.prop(cdata, "dicing_rate")
 
 
-class Cycles_PT_mesh_normals(CyclesButtonsPanel, Panel):
-    bl_label = "Normals"
-    bl_context = "data"
+class CyclesObject_PT_motion_blur(CyclesButtonsPanel, Panel):
+    bl_label = "Motion Blur"
+    bl_context = "object"
+    bl_options = {'DEFAULT_CLOSED'}
 
     @classmethod
     def poll(cls, context):
-        return CyclesButtonsPanel.poll(context) and context.mesh
+        ob = context.object  # tested for truthiness before ob.type is read
+        return CyclesButtonsPanel.poll(context) and ob and ob.type in {'MESH', 'CURVE', 'SURFACE', 'FONT', 'META'}
+
+    def draw_header(self, context):
+        layout = self.layout
+
+        # the header is only active while motion blur is enabled in the render settings
+        rd = context.scene.render
+        layout.active = rd.use_motion_blur
+
+        # per-object settings live on the object's `cycles` property group
+        ob = context.object
+        cob = ob.cycles
+
+        # per-object motion blur toggle, drawn without a label
+        layout.prop(cob, "use_motion_blur", text="")
 
     def draw(self, context):
         layout = self.layout
 
-        mesh = context.mesh
+        rd = context.scene.render
+        ob = context.object
+        cob = ob.cycles
 
-        split = layout.split()
+        # panel body is inactive unless motion blur is on in the render settings and for this object
+        layout.active = (rd.use_motion_blur and cob.use_motion_blur)
 
-        col = split.column()
-        col.prop(mesh, "show_double_sided")
+        row = layout.row()
 
-        col = split.column()
-        col.label()
+        row.prop(cob, "use_deform_motion", text="Deformation")
+
+        # the step count is inactive while deformation motion blur is disabled
+        sub = row.row()
+        sub.active = cob.use_deform_motion
+        sub.prop(cob, "motion_steps", text="Steps")
 
 
 class CyclesObject_PT_ray_visibility(CyclesButtonsPanel, Panel):
@@ -593,7 +609,8 @@ class CyclesObject_PT_ray_visibility(CyclesButtonsPanel, Panel):
     def poll(cls, context):
         ob = context.object
         return (CyclesButtonsPanel.poll(context) and
-                ob and ob.type in {'MESH', 'CURVE', 'CURVE', 'SURFACE', 'FONT', 'META', 'LAMP'})
+                ob and ((ob.type in {'MESH', 'CURVE', 'SURFACE', 'FONT', 'META', 'LAMP'}) or  # engine check gates both cases
+                        (ob.dupli_type == 'GROUP' and ob.dupli_group)))
 
     def draw(self, context):
         layout = self.layout
@@ -847,9 +864,10 @@ class CyclesWorld_PT_mist(CyclesButtonsPanel, Panel):
     @classmethod
     def poll(cls, context):
         if CyclesButtonsPanel.poll(context):
-            for rl in context.scene.render.layers:
-                if rl.use_pass_mist:
-                    return True
+            if context.world:
+                for rl in context.scene.render.layers:
+                    if rl.use_pass_mist:
+                        return True
 
         return False
 
@@ -997,8 +1015,9 @@ class CyclesMaterial_PT_settings(CyclesButtonsPanel, Panel):
 
         split = layout.split()
 
-        col = split.column()
+        col = split.column(align=True)
         col.prop(mat, "diffuse_color", text="Viewport Color")
+        col.prop(mat, "alpha")
 
         col = split.column(align=True)
         col.label()
@@ -1108,7 +1127,7 @@ class CyclesTexture_PT_colors(CyclesButtonsPanel, Panel):
     def poll(cls, context):
         # node = context.texture_node
         return False
-        #return node and CyclesButtonsPanel.poll(context)
+        # return node and CyclesButtonsPanel.poll(context)
 
     def draw(self, context):
         layout = self.layout
@@ -1176,7 +1195,7 @@ class CyclesRender_PT_CurveRendering(CyclesButtonsPanel, Panel):
     @classmethod
     def poll(cls, context):
         scene = context.scene
-        cscene = scene.cycles
+        # cscene = scene.cycles
         psys = context.particle_system
         return CyclesButtonsPanel.poll(context) and psys and psys.settings.type == 'HAIR'
 
@@ -1208,6 +1227,54 @@ class CyclesRender_PT_CurveRendering(CyclesButtonsPanel, Panel):
         row.prop(ccscene, "maximum_width", text="Max Ext.")
 
 
+class CyclesRender_PT_bake(CyclesButtonsPanel, Panel):
+    bl_label = "Bake"
+    bl_context = "render"
+    bl_options = {'DEFAULT_CLOSED'}
+    COMPAT_ENGINES = {'CYCLES'}
+
+    def draw(self, context):
+        layout = self.layout
+
+        scene = context.scene
+        cscene = scene.cycles
+
+        cbk = scene.render.bake
+
+        # the bake operator's `type` is set from the bake type chosen in the Cycles scene settings
+        layout.operator("object.bake", icon='RENDER_STILL').type = cscene.bake_type
+
+        col = layout.column()
+        col.prop(cscene, "bake_type")
+
+        col.separator()
+        split = layout.split()
+
+        sub = split.column()
+        sub.prop(cbk, "use_clear")
+        sub.prop(cbk, "margin")
+
+        sub = split.column()
+        sub.prop(cbk, "use_selected_to_active")
+        sub = sub.column()
+        # cage settings are inactive unless "use_selected_to_active" is enabled
+        sub.active = cbk.use_selected_to_active
+        sub.prop(cbk, "cage_extrusion", text="Distance")
+        sub.prop_search(cbk, "cage", scene, "objects")
+
+        if cscene.bake_type == 'NORMAL':
+            col.separator()
+            box = col.box()
+            box.label(text="Normal Settings:")
+            box.prop(cbk, "normal_space", text="Space")
+
+            row = box.row(align=True)
+            row.label(text="Swizzle:")
+            row.prop(cbk, "normal_r", text="")
+            row.prop(cbk, "normal_g", text="")
+            row.prop(cbk, "normal_b", text="")
+
+
 class CyclesParticle_PT_CurveSettings(CyclesButtonsPanel, Panel):
     bl_label = "Cycles Hair Settings"
     bl_context = "particle"
@@ -1215,7 +1282,7 @@ class CyclesParticle_PT_CurveSettings(CyclesButtonsPanel, Panel):
     @classmethod
     def poll(cls, context):
         scene = context.scene
-        cscene = scene.cycles
+        # cscene = scene.cycles
         ccscene = scene.cycles_curves
         psys = context.particle_system
         use_curves = ccscene.use_curves and psys
@@ -1275,7 +1342,7 @@ def draw_device(self, context):
         if device_type in {'CUDA', 'OPENCL', 'NETWORK'}:
             layout.prop(cscene, "device")
 
-        if engine.with_osl() and (cscene.device == 'CPU' or device_type == 'NONE'):
+        if engine.with_osl() and use_cpu(context):
             layout.prop(cscene, "shading_system")
 
 
@@ -1316,6 +1383,7 @@ def get_panels():
         "DATA_PT_context_camera",
         "DATA_PT_context_lamp",
         "DATA_PT_context_speaker",
+        "DATA_PT_normals",
         "DATA_PT_texture_space",
         "DATA_PT_curve_texture_space",
         "DATA_PT_mball_texture_space",
diff --git a/intern/cycles/blender/blender_camera.cpp b/intern/cycles/blender/blender_camera.cpp
index 4c6b42a9cbc..1a85561c6d5 100644
--- a/intern/cycles/blender/blender_camera.cpp
+++ b/intern/cycles/blender/blender_camera.cpp
@@ -212,8 +212,8 @@ static void blender_camera_viewplane(BlenderCamera *bcam, int width, int height,
 	BoundBox2D *viewplane, float *aspectratio, float *sensor_size)
 {
 	/* dimensions */
-	float xratio = width*bcam->pixelaspect.x;
-	float yratio = height*bcam->pixelaspect.y;
+	float xratio = (float)width*bcam->pixelaspect.x;
+	float yratio = (float)height*bcam->pixelaspect.y;
 
 	/* compute x/y aspect and ratio */
 	float xaspect, yaspect;
@@ -288,8 +288,8 @@ static void blender_camera_sync(Camera *cam, BlenderCamera *bcam, int width, int
 
 	/* panorama sensor */
 	if (bcam->type == CAMERA_PANORAMA && bcam->panorama_type == PANORAMA_FISHEYE_EQUISOLID) {
-		float fit_xratio = bcam->full_width*bcam->pixelaspect.x;
-		float fit_yratio = bcam->full_height*bcam->pixelaspect.y;
+		float fit_xratio = (float)bcam->full_width*bcam->pixelaspect.x;
+		float fit_yratio = (float)bcam->full_height*bcam->pixelaspect.y;
 		bool horizontal_fit;
 		float sensor_size;
 
@@ -386,7 +386,7 @@ void BlenderSync::sync_camera(BL::RenderSettings b_render, BL::Object b_override
 	blender_camera_sync(cam, &bcam, width, height);
 }
 
-void BlenderSync::sync_camera_motion(BL::Object b_ob, int motion)
+void BlenderSync::sync_camera_motion(BL::Object b_ob, float motion_time)
 {
 	Camera *cam = scene->camera;
 
@@ -394,12 +394,14 @@ void BlenderSync::sync_camera_motion(BL::Object b_ob, int motion)
 	tfm = blender_camera_matrix(tfm, cam->type);
 
 	if(tfm != cam->matrix) {
-		if(motion == -1)
+		if(motion_time == -1.0f) {
 			cam->motion.pre = tfm;
-		else
+			cam->use_motion = true;
+		}
+		else if(motion_time == 1.0f) {
 			cam->motion.post = tfm;
-
-		cam->use_motion = true;
+			cam->use_motion = true;
+		}
 	}
 }
 
@@ -563,10 +565,10 @@ BufferParams BlenderSync::get_buffer_params(BL::RenderSettings b_render, BL::Sce
 
 	if(use_border) {
 		/* border render */
-		params.full_x = cam->border.left*width;
-		params.full_y = cam->border.bottom*height;
-		params.width = (int)(cam->border.right*width) - params.full_x;
-		params.height = (int)(cam->border.top*height) - params.full_y;
+		params.full_x = (int)(cam->border.left * (float)width);
+		params.full_y = (int)(cam->border.bottom * (float)height);
+		params.width = (int)(cam->border.right * (float)width) - params.full_x;
+		params.height = (int)(cam->border.top * (float)height) - params.full_y;
 
 		/* survive in case border goes out of view or becomes too small */
 		params.width = max(params.width, 1);
diff --git a/intern/cycles/blender/blender_curves.cpp b/intern/cycles/blender/blender_curves.cpp
index 92c51b0aad3..22de7b64273 100644
--- a/intern/cycles/blender/blender_curves.cpp
+++ b/intern/cycles/blender/blender_curves.cpp
@@ -588,7 +588,7 @@ void ExportCurveSegments(Scene *scene, Mesh *mesh, ParticleCurveData *CData)
 				float radius = shaperadius(CData->psys_shape[sys], CData->psys_rootradius[sys], CData->psys_tipradius[sys], time);
 
 				if(CData->psys_closetip[sys] && (curvekey == CData->curve_firstkey[curve] + CData->curve_keynum[curve] - 1))
-					radius =0.0f;
+					radius = 0.0f;
 
 				mesh->add_curve_key(ickey_loc, radius);
 				if(attr_intercept)
@@ -612,16 +612,23 @@ void ExportCurveSegments(Scene *scene, Mesh *mesh, ParticleCurveData *CData)
 	}
 }
 
-static void ExportCurveSegmentsMotion(Scene *scene, Mesh *mesh, ParticleCurveData *CData, int motion)
+static void ExportCurveSegmentsMotion(Scene *scene, Mesh *mesh, ParticleCurveData *CData, int time_index)
 {
+	/* find attribute */
+	Attribute *attr_mP = mesh->curve_attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
+	bool new_attribute = false;
+
+	/* add new attribute if it doesn't exist already */
+	if(!attr_mP) {
+		attr_mP = mesh->curve_attributes.add(ATTR_STD_MOTION_VERTEX_POSITION);
+		new_attribute = true;
+	}
+
 	/* export motion vectors for curve keys */
-	AttributeStandard std = (motion == -1)? ATTR_STD_MOTION_PRE: ATTR_STD_MOTION_POST;
-	Attribute *attr_motion = mesh->curve_attributes.add(std);
-	float3 *data_motion = attr_motion->data_float3();
-	float3 *current_motion = data_motion;
-	size_t size = mesh->curve_keys.size();
-	size_t i = 0;
+	size_t numkeys = mesh->curve_keys.size();
+	float4 *mP = attr_mP->data_float4() + time_index*numkeys;
 	bool have_motion = false;
+	int i = 0;
 
 	for(int sys = 0; sys < CData->psys_firstcurve.size(); sys++) {
 		if(CData->psys_curvenum[sys] == 0)
@@ -633,15 +640,21 @@ static void ExportCurveSegmentsMotion(Scene *scene, Mesh *mesh, ParticleCurveDat
 
 			for(int curvekey = CData->curve_firstkey[curve]; curvekey < CData->curve_firstkey[curve] + CData->curve_keynum[curve]; curvekey++) {
 				if(i < mesh->curve_keys.size()) {
-					*current_motion = CData->curvekey_co[curvekey];
+					float3 ickey_loc = CData->curvekey_co[curvekey];
+					float time = CData->curvekey_time[curvekey]/CData->curve_length[curve];
+					float radius = shaperadius(CData->psys_shape[sys], CData->psys_rootradius[sys], CData->psys_tipradius[sys], time);
+
+					if(CData->psys_closetip[sys] && (curvekey == CData->curve_firstkey[curve] + CData->curve_keynum[curve] - 1))
+						radius = 0.0f;
+
+					mP[i] = float3_to_float4(ickey_loc);
+					mP[i].w = radius;
 
 					/* unlike mesh coordinates, these tend to be slightly different
 					 * between frames due to particle transforms into/out of object
 					 * space, so we use an epsilon to detect actual changes */
-					if(len_squared(*current_motion - mesh->curve_keys[i].co) > 1e-5f*1e-5f)
+					if(len_squared(mP[i] - mesh->curve_keys[i]) > 1e-5f*1e-5f)
 						have_motion = true;
-
-					current_motion++;
 				}
 
 				i++;
@@ -649,8 +662,23 @@ static void ExportCurveSegmentsMotion(Scene *scene, Mesh *mesh, ParticleCurveDat
 		}
 	}
 
-	if(i != size || !have_motion)
-		mesh->curve_attributes.remove(std);
+	/* in case of new attribute, we verify if there really was any motion */
+	if(new_attribute) {
+		if(i != numkeys || !have_motion) {
+			/* no motion, remove attributes again */
+			mesh->curve_attributes.remove(ATTR_STD_MOTION_VERTEX_POSITION);
+		}
+		else if(time_index > 0) {
+			/* motion, fill up previous steps that we might have skipped because
+			 * they had no motion, but we need them anyway now */
+			for(int step = 0; step < time_index; step++) {
+				float4 *mP = attr_mP->data_float4() + step*numkeys;
+
+				for(int key = 0; key < numkeys; key++)
+					mP[key] = mesh->curve_keys[key];
+			}
+		}
+	}
 }
 
 void ExportCurveTriangleUV(Mesh *mesh, ParticleCurveData *CData, int vert_offset, int resol, float3 *uvdata)
@@ -796,7 +824,7 @@ void BlenderSync::sync_curve_settings()
 		curve_system_manager->tag_update(scene);
 }
 
-void BlenderSync::sync_curves(Mesh *mesh, BL::Mesh b_mesh, BL::Object b_ob, int motion)
+void BlenderSync::sync_curves(Mesh *mesh, BL::Mesh b_mesh, BL::Object b_ob, bool motion, int time_index)
 {
 	if(!motion) {
 		/* Clear stored curve data */
@@ -851,7 +879,7 @@ void BlenderSync::sync_curves(Mesh *mesh, BL::Mesh b_mesh, BL::Object b_ob, int
 	}
 	else {
 		if(motion)
-			ExportCurveSegmentsMotion(scene, mesh, &CData, motion);
+			ExportCurveSegmentsMotion(scene, mesh, &CData, time_index);
 		else
 			ExportCurveSegments(scene, mesh, &CData);
 	}
@@ -876,7 +904,7 @@ void BlenderSync::sync_curves(Mesh *mesh, BL::Mesh b_mesh, BL::Object b_ob, int
 				size_t i = 0;
 
 				foreach(Mesh::Curve& curve, mesh->curves) {
-					float3 co = mesh->curve_keys[curve.first_key].co;
+					float3 co = float4_to_float3(mesh->curve_keys[curve.first_key]);
 					generated[i++] = co*size - loc;
 				}
 			}
diff --git a/intern/cycles/blender/blender_mesh.cpp b/intern/cycles/blender/blender_mesh.cpp
index 61c6ef6af1b..83514879477 100644
--- a/intern/cycles/blender/blender_mesh.cpp
+++ b/intern/cycles/blender/blender_mesh.cpp
@@ -206,6 +206,40 @@ static void mikk_compute_tangents(BL::Mesh b_mesh, BL::MeshTextureFaceLayer b_la
 	}
 }
 
+/* Create Volume Attribute */
+
+static void create_mesh_volume_attribute(BL::Object b_ob, Mesh *mesh, ImageManager *image_manager, AttributeStandard std)
+{
+	BL::SmokeDomainSettings b_domain = object_smoke_domain_find(b_ob);
+	/* objects without a smoke domain get no voxel attribute */
+	if(!b_domain)
+		return;
+	
+	Attribute *attr = mesh->attributes.add(std);
+	VoxelAttribute *volume_data = attr->data_voxel();
+	bool is_float, is_linear; /* NOTE(review): uninitialized here, presumably filled in by add_image - confirm */
+	bool animated = false;
+	/* store the image manager and the slot add_image returns for this attribute's standard name */
+	volume_data->manager = image_manager;
+	volume_data->slot = image_manager->add_image(Attribute::standard_name(std),
+		b_ob.ptr.data, animated, is_float, is_linear, INTERPOLATION_LINEAR, true);
+}
+
+static void create_mesh_volume_attributes(Scene *scene, BL::Object b_ob, Mesh *mesh)
+{
+	/* for smoke volume rendering: create each standard volume attribute the mesh needs */
+	static const AttributeStandard volume_stds[] = {
+		ATTR_STD_VOLUME_DENSITY,
+		ATTR_STD_VOLUME_COLOR,
+		ATTR_STD_VOLUME_FLAME,
+		ATTR_STD_VOLUME_HEAT,
+		ATTR_STD_VOLUME_VELOCITY,
+	};
+	for(size_t i = 0; i < sizeof(volume_stds)/sizeof(volume_stds[0]); i++)
+		if(mesh->need_attribute(scene, volume_stds[i]))
+			create_mesh_volume_attribute(b_ob, mesh, scene->image_manager, volume_stds[i]);
+}
+
 /* Create Mesh */
 
 static void create_mesh(Scene *scene, Mesh *mesh, BL::Mesh b_mesh, const vector<uint>& used_shaders)
@@ -214,6 +248,7 @@ static void create_mesh(Scene *scene, Mesh *mesh, BL::Mesh b_mesh, const vector<
 	int numverts = b_mesh.vertices.length();
 	int numfaces = b_mesh.tessfaces.length();
 	int numtris = 0;
+	bool use_loop_normals = b_mesh.use_auto_smooth();
 
 	BL::Mesh::vertices_iterator v;
 	BL::Mesh::tessfaces_iterator f;
@@ -236,6 +271,21 @@ static void create_mesh(Scene *scene, Mesh *mesh, BL::Mesh b_mesh, const vector<
 
 	for(b_mesh.vertices.begin(v); v != b_mesh.vertices.end(); ++v, ++N)
 		*N = get_float3(v->normal());
+	N = attr_N->data_float3();
+
+	/* create generated coordinates from undeformed coordinates */
+	if(mesh->need_attribute(scene, ATTR_STD_GENERATED)) {
+		Attribute *attr = mesh->attributes.add(ATTR_STD_GENERATED);
+
+		float3 loc, size;
+		mesh_texture_space(b_mesh, loc, size);
+
+		float3 *generated = attr->data_float3();
+		size_t i = 0;
+
+		for(b_mesh.vertices.begin(v); v != b_mesh.vertices.end(); ++v)
+			generated[i++] = get_float3(v->undeformed_co())*size - loc;
+	}
 
 	/* create faces */
 	vector<int> nverts(numfaces);
@@ -248,9 +298,32 @@ static void create_mesh(Scene *scene, Mesh *mesh, BL::Mesh b_mesh, const vector<
 		int shader = used_shaders[mi];
 		bool smooth = f->use_smooth();
 
+		/* split vertices if normal is different
+		 *
+		 * note all vertex attributes must have been set here so we can split
+		 * and copy attributes in split_vertex without remapping later */
+		if(use_loop_normals) {
+			BL::Array<float, 12> loop_normals = f->split_normals();
+
+			for(int i = 0; i < n; i++) {
+				float3 loop_N = make_float3(loop_normals[i * 3], loop_normals[i * 3 + 1], loop_normals[i * 3 + 2]);
+
+				if(N[vi[i]] != loop_N) {
+					int new_vi = mesh->split_vertex(vi[i]);
+
+					/* set new normal and vertex index */
+					N = attr_N->data_float3();
+					N[new_vi] = loop_N;
+					vi[i] = new_vi;
+				}
+			}
+		}
+
+		/* create triangles */
 		if(n == 4) {
 			if(is_zero(cross(mesh->verts[vi[1]] - mesh->verts[vi[0]], mesh->verts[vi[2]] - mesh->verts[vi[0]])) ||
-				is_zero(cross(mesh->verts[vi[2]] - mesh->verts[vi[0]], mesh->verts[vi[3]] - mesh->verts[vi[0]]))) {
+			   is_zero(cross(mesh->verts[vi[2]] - mesh->verts[vi[0]], mesh->verts[vi[3]] - mesh->verts[vi[0]])))
+			{
 				mesh->set_triangle(ti++, vi[0], vi[1], vi[3], shader, smooth);
 				mesh->set_triangle(ti++, vi[2], vi[3], vi[1], shader, smooth);
 			}
@@ -348,20 +421,6 @@ static void create_mesh(Scene *scene, Mesh *mesh, BL::Mesh b_mesh, const vector<
 		}
 	}
 
-	/* create generated coordinates from undeformed coordinates */
-	if(mesh->need_attribute(scene, ATTR_STD_GENERATED)) {
-		Attribute *attr = mesh->attributes.add(ATTR_STD_GENERATED);
-
-		float3 loc, size;
-		mesh_texture_space(b_mesh, loc, size);
-
-		float3 *generated = attr->data_float3();
-		size_t i = 0;
-
-		for(b_mesh.vertices.begin(v); v != b_mesh.vertices.end(); ++v)
-			generated[i++] = get_float3(v->undeformed_co())*size - loc;
-	}
-
 	/* for volume objects, create a matrix to transform from object space to
 	 * mesh texture space. this does not work with deformations but that can
 	 * probably only be done well with a volume grid mapping of coordinates */
@@ -414,7 +473,7 @@ static void create_subd_mesh(Scene *scene, Mesh *mesh, BL::Mesh b_mesh, PointerR
 	//sdparams.camera = scene->camera;
 
 	/* tesselate */
-	DiagSplit dsplit(sdparams);;
+	DiagSplit dsplit(sdparams);
 	sdmesh.tessellate(&dsplit);
 }
 
@@ -449,6 +508,7 @@ Mesh *BlenderSync::sync_mesh(BL::Object b_ob, bool object_updated, bool hide_tri
 	Mesh *mesh;
 
 	if(!mesh_map.sync(&mesh, key)) {
+		
 		/* if transform was applied to mesh, need full update */
 		if(object_updated && mesh->transform_applied);
 		/* test if shaders changed, these can be object level so mesh
@@ -481,7 +541,7 @@ Mesh *BlenderSync::sync_mesh(BL::Object b_ob, bool object_updated, bool hide_tri
 	
 	/* compares curve_keys rather than strands in order to handle quick hair
 	 * adjustsments in dynamic BVH - other methods could probably do this better*/
-	vector<Mesh::CurveKey> oldcurve_keys = mesh->curve_keys;
+	vector<float4> oldcurve_keys = mesh->curve_keys;
 
 	mesh->clear();
 	mesh->used_shaders = used_shaders;
@@ -500,10 +560,12 @@ Mesh *BlenderSync::sync_mesh(BL::Object b_ob, bool object_updated, bool hide_tri
 					create_subd_mesh(scene, mesh, b_mesh, &cmesh, used_shaders);
 				else
 					create_mesh(scene, mesh, b_mesh, used_shaders);
+
+				create_mesh_volume_attributes(scene, b_ob, mesh);
 			}
 
 			if(render_layer.use_hair)
-				sync_curves(mesh, b_mesh, b_ob, 0);
+				sync_curves(mesh, b_mesh, b_ob, false);
 
 			/* free derived mesh */
 			b_data.meshes.remove(b_mesh);
@@ -535,7 +597,7 @@ Mesh *BlenderSync::sync_mesh(BL::Object b_ob, bool object_updated, bool hide_tri
 	if(oldcurve_keys.size() != mesh->curve_keys.size())
 		rebuild = true;
 	else if(oldcurve_keys.size()) {
-		if(memcmp(&oldcurve_keys[0], &mesh->curve_keys[0], sizeof(Mesh::CurveKey)*oldcurve_keys.size()) != 0)
+		if(memcmp(&oldcurve_keys[0], &mesh->curve_keys[0], sizeof(float4)*oldcurve_keys.size()) != 0)
 			rebuild = true;
 	}
 	
@@ -544,46 +606,153 @@ Mesh *BlenderSync::sync_mesh(BL::Object b_ob, bool object_updated, bool hide_tri
 	return mesh;
 }
 
-void BlenderSync::sync_mesh_motion(BL::Object b_ob, Mesh *mesh, int motion)
+void BlenderSync::sync_mesh_motion(BL::Object b_ob, Object *object, float motion_time)
 {
-	/* todo: displacement, subdivision */
-	size_t size = mesh->verts.size();
-
-	/* skip objects without deforming modifiers. this is not a totally reliable,
-	 * would need a more extensive check to see which objects are animated */
-	if(!size || !ccl::BKE_object_is_deform_modified(b_ob, b_scene, preview))
-		return;
-
 	/* ensure we only sync instanced meshes once */
+	Mesh *mesh = object->mesh;
+
 	if(mesh_motion_synced.find(mesh) != mesh_motion_synced.end())
 		return;
 
 	mesh_motion_synced.insert(mesh);
 
-	/* get derived mesh */
-	BL::Mesh b_mesh = object_to_mesh(b_data, b_ob, b_scene, true, !preview, false);
+	/* for motion pass always compute, for motion blur it can be disabled */
+	int time_index = 0;
+
+	if(scene->need_motion() == Scene::MOTION_BLUR) {
+		if(!mesh->use_motion_blur)
+			return;
+		
+		/* see if this mesh needs motion data at this time */
+		vector<float> object_times = object->motion_times();
+		bool found = false;
+
+		foreach(float object_time, object_times) {
+			if(motion_time == object_time) {
+				found = true;
+				break;
+			}
+			else
+				time_index++;
+		}
 
-	if(b_mesh) {
-		BL::Mesh::vertices_iterator v;
-		AttributeStandard std = (motion == -1)? ATTR_STD_MOTION_PRE: ATTR_STD_MOTION_POST;
-		Attribute *attr_M = mesh->attributes.add(std);
-		float3 *M = attr_M->data_float3(), *cur_M;
-		size_t i = 0;
+		if(!found)
+			return;
+	}
+	else {
+		if(motion_time == -1.0f)
+			time_index = 0;
+		else if(motion_time == 1.0f)
+			time_index = 1;
+		else
+			return;
+	}
+
+	/* skip empty meshes */
+	size_t numverts = mesh->verts.size();
+	size_t numkeys = mesh->curve_keys.size();
+
+	if(!numverts && !numkeys)
+		return;
+	
+	/* skip objects without deforming modifiers. this is not totally reliable,
+	 * would need a more extensive check to see which objects are animated */
+	BL::Mesh b_mesh(PointerRNA_NULL);
+
+	if(ccl::BKE_object_is_deform_modified(b_ob, b_scene, preview)) {
+		/* get derived mesh */
+		b_mesh = object_to_mesh(b_data, b_ob, b_scene, true, !preview, false);
+	}
 
-		for(b_mesh.vertices.begin(v), cur_M = M; v != b_mesh.vertices.end() && i < size; ++v, cur_M++, i++)
-			*cur_M = get_float3(v->co());
+	if(!b_mesh) {
+		/* if we have no motion blur on this frame, but on other frames, copy */
+		if(numverts) {
+			/* triangles */
+			Attribute *attr_mP = mesh->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
+
+			if(attr_mP) {
+				Attribute *attr_mN = mesh->attributes.find(ATTR_STD_MOTION_VERTEX_NORMAL);
+				Attribute *attr_N = mesh->attributes.find(ATTR_STD_VERTEX_NORMAL);
+				float3 *P = &mesh->verts[0];
+				float3 *N = (attr_N)? attr_N->data_float3(): NULL;
+
+				memcpy(attr_mP->data_float3() + time_index*numverts, P, sizeof(float3)*numverts);
+				if(attr_mN)
+					memcpy(attr_mN->data_float3() + time_index*numverts, N, sizeof(float3)*numverts);
+			}
+		}
+
+		if(numkeys) {
+			/* curves */
+			Attribute *attr_mP = mesh->curve_attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
+
+			if(attr_mP) {
+				float4 *keys = &mesh->curve_keys[0];
+				memcpy(attr_mP->data_float4() + time_index*numkeys, keys, sizeof(float4)*numkeys);
+			}
+		}
+
+		return;
+	}
+
+	if(numverts) {
+		/* find attributes */
+		Attribute *attr_mP = mesh->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
+		Attribute *attr_mN = mesh->attributes.find(ATTR_STD_MOTION_VERTEX_NORMAL);
+		Attribute *attr_N = mesh->attributes.find(ATTR_STD_VERTEX_NORMAL);
+		bool new_attribute = false;
+
+		/* add new attributes if they don't exist already */
+		if(!attr_mP) {
+			attr_mP = mesh->attributes.add(ATTR_STD_MOTION_VERTEX_POSITION);
+			if(attr_N)
+				attr_mN = mesh->attributes.add(ATTR_STD_MOTION_VERTEX_NORMAL);
 
-		/* if number of vertices changed, or if coordinates stayed the same, drop it */
-		if(i != size || memcmp(M, &mesh->verts[0], sizeof(float3)*size) == 0)
-			mesh->attributes.remove(std);
+			new_attribute = true;
+		}
+
+		/* load vertex data from mesh */
+		float3 *mP = attr_mP->data_float3() + time_index*numverts;
+		float3 *mN = (attr_mN)? attr_mN->data_float3() + time_index*numverts: NULL;
+
+		BL::Mesh::vertices_iterator v;
+		int i = 0;
 
-		/* hair motion */
-		if(render_layer.use_hair)
-			sync_curves(mesh, b_mesh, b_ob, motion);
+		for(b_mesh.vertices.begin(v); v != b_mesh.vertices.end() && i < numverts; ++v, ++i) {
+			mP[i] = get_float3(v->co());
+			if(mN)
+				mN[i] = get_float3(v->normal());
+		}
 
-		/* free derived mesh */
-		b_data.meshes.remove(b_mesh);
+		/* in case of new attribute, we verify if there really was any motion */
+		if(new_attribute) {
+			if(i != numverts || memcmp(mP, &mesh->verts[0], sizeof(float3)*numverts) == 0) {
+				/* no motion, remove attributes again */
+				mesh->attributes.remove(ATTR_STD_MOTION_VERTEX_POSITION);
+				if(attr_mN)
+					mesh->attributes.remove(ATTR_STD_MOTION_VERTEX_NORMAL);
+			}
+			else if(time_index > 0) {
+				/* motion, fill up previous steps that we might have skipped because
+				 * they had no motion, but we need them anyway now */
+				float3 *P = &mesh->verts[0];
+				float3 *N = (attr_N)? attr_N->data_float3(): NULL;
+
+				for(int step = 0; step < time_index; step++) {
+					memcpy(attr_mP->data_float3() + step*numverts, P, sizeof(float3)*numverts);
+					if(attr_mN)
+						memcpy(attr_mN->data_float3() + step*numverts, N, sizeof(float3)*numverts);
+				}
+			}
+		}
 	}
+
+	/* hair motion */
+	if(numkeys)
+		sync_curves(mesh, b_mesh, b_ob, true, time_index);
+
+	/* free derived mesh */
+	b_data.meshes.remove(b_mesh);
 }
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/blender/blender_object.cpp b/intern/cycles/blender/blender_object.cpp
index cc52717fdb6..167647608a5 100644
--- a/intern/cycles/blender/blender_object.cpp
+++ b/intern/cycles/blender/blender_object.cpp
@@ -38,7 +38,11 @@ CCL_NAMESPACE_BEGIN
 bool BlenderSync::BKE_object_is_modified(BL::Object b_ob)
 {
 	/* test if we can instance or if the object is modified */
-	if(ccl::BKE_object_is_modified(b_ob, b_scene, preview)) {
+	if(b_ob.type() == BL::Object::type_META) {
+		/* multi-user and dupli metaballs are fused, can't instance */
+		return true;
+	}
+	else if(ccl::BKE_object_is_modified(b_ob, b_scene, preview)) {
 		/* modifiers */
 		return true;
 	}
@@ -213,9 +217,11 @@ void BlenderSync::sync_background_light()
 
 /* Object */
 
-Object *BlenderSync::sync_object(BL::Object b_parent, int persistent_id[OBJECT_PERSISTENT_ID_SIZE], BL::DupliObject b_dupli_ob, Transform& tfm, uint layer_flag, int motion, bool hide_tris)
+Object *BlenderSync::sync_object(BL::Object b_parent, int persistent_id[OBJECT_PERSISTENT_ID_SIZE], BL::DupliObject b_dupli_ob,
+                                 Transform& tfm, uint layer_flag, float motion_time, bool hide_tris)
 {
 	BL::Object b_ob = (b_dupli_ob ? b_dupli_ob.object() : b_parent);
+	bool motion = motion_time != 0.0f;
 	
 	/* light is handled separately */
 	if(object_is_light(b_ob)) {
@@ -238,19 +244,22 @@ Object *BlenderSync::sync_object(BL::Object b_parent, int persistent_id[OBJECT_P
 	if(motion) {
 		object = object_map.find(key);
 
-		if(object) {
+		if(object && (scene->need_motion() == Scene::MOTION_PASS || object_use_motion(b_ob))) {
+			/* object transformation */
 			if(tfm != object->tfm) {
-				if(motion == -1)
+				if(motion_time == -1.0f) {
 					object->motion.pre = tfm;
-				else
+					object->use_motion = true;
+				}
+				else if(motion_time == 1.0f) {
 					object->motion.post = tfm;
-
-				object->use_motion = true;
+					object->use_motion = true;
+				}
 			}
 
-			/* mesh deformation blur not supported yet */
-			if(!scene->integrator->motion_blur)
-				sync_mesh_motion(b_ob, object->mesh, motion);
+			/* mesh deformation */
+			if(object->mesh)
+				sync_mesh_motion(b_ob, object, motion_time);
 		}
 
 		return object;
@@ -310,6 +319,24 @@ Object *BlenderSync::sync_object(BL::Object b_parent, int persistent_id[OBJECT_P
 		object->motion.post = tfm;
 		object->use_motion = false;
 
+		/* motion blur */
+		if(scene->need_motion() == Scene::MOTION_BLUR && object->mesh) {
+			Mesh *mesh = object->mesh;
+
+			mesh->use_motion_blur = false;
+
+			if(object_use_motion(b_ob)) {
+				if(object_use_deform_motion(b_ob)) {
+					mesh->motion_steps = object_motion_steps(b_ob);
+					mesh->use_motion_blur = true;
+				}
+
+				vector<float> times = object->motion_times();
+				foreach(float time, times)
+					motion_times.insert(time);
+			}
+		}
+
 		/* random number */
 		object->random_id = hash_string(object->name.c_str());
 
@@ -408,10 +435,11 @@ static bool object_render_hide_duplis(BL::Object b_ob)
 
 /* Object Loop */
 
-void BlenderSync::sync_objects(BL::SpaceView3D b_v3d, int motion)
+void BlenderSync::sync_objects(BL::SpaceView3D b_v3d, float motion_time)
 {
 	/* layer data */
 	uint scene_layer = render_layer.scene_layer;
+	bool motion = motion_time != 0.0f;
 	
 	if(!motion) {
 		/* prepare for sync */
@@ -420,36 +448,40 @@ void BlenderSync::sync_objects(BL::SpaceView3D b_v3d, int motion)
 		object_map.pre_sync();
 		mesh_synced.clear();
 		particle_system_map.pre_sync();
+		motion_times.clear();
 	}
 	else {
 		mesh_motion_synced.clear();
 	}
 
 	/* object loop */
-	BL::Scene::objects_iterator b_ob;
+	BL::Scene::object_bases_iterator b_base;
 	BL::Scene b_sce = b_scene;
-
-	/* global particle index counter */
-	int particle_id = 1;
+	/* modifier result type (not exposed as enum in C++ API)
+	 * 1 : eModifierMode_Realtime
+	 * 2 : eModifierMode_Render
+	 */
+	int dupli_settings = preview ? 1 : 2;
 
 	bool cancel = false;
 
 	for(; b_sce && !cancel; b_sce = b_sce.background_set()) {
-		for(b_sce.objects.begin(b_ob); b_ob != b_sce.objects.end() && !cancel; ++b_ob) {
-			bool hide = (render_layer.use_viewport_visibility)? b_ob->hide(): b_ob->hide_render();
-			uint ob_layer = get_layer(b_ob->layers(), b_ob->layers_local_view(), render_layer.use_localview, object_is_light(*b_ob));
+		for(b_sce.object_bases.begin(b_base); b_base != b_sce.object_bases.end() && !cancel; ++b_base) {
+			BL::Object b_ob = b_base->object();
+			bool hide = (render_layer.use_viewport_visibility)? b_ob.hide(): b_ob.hide_render();
+			uint ob_layer = get_layer(b_base->layers(), b_base->layers_local_view(), render_layer.use_localview, object_is_light(b_ob));
 			hide = hide || !(ob_layer & scene_layer);
 
 			if(!hide) {
-				progress.set_sync_status("Synchronizing object", (*b_ob).name());
+				progress.set_sync_status("Synchronizing object", b_ob.name());
 
-				if(b_ob->is_duplicator() && !object_render_hide_duplis(*b_ob)) {
+				if(b_ob.is_duplicator() && !object_render_hide_duplis(b_ob)) {
 					/* dupli objects */
-					b_ob->dupli_list_create(b_scene, 2);
+					b_ob.dupli_list_create(b_scene, dupli_settings);
 
 					BL::Object::dupli_list_iterator b_dup;
 
-					for(b_ob->dupli_list.begin(b_dup); b_dup != b_ob->dupli_list.end(); ++b_dup) {
+					for(b_ob.dupli_list.begin(b_dup); b_dup != b_ob.dupli_list.end(); ++b_dup) {
 						Transform tfm = get_transform(b_dup->matrix());
 						BL::Object b_dup_ob = b_dup->object();
 						bool dup_hide = (b_v3d)? b_dup_ob.hide(): b_dup_ob.hide_render();
@@ -462,32 +494,27 @@ void BlenderSync::sync_objects(BL::SpaceView3D b_v3d, int motion)
 							BL::Array<int, OBJECT_PERSISTENT_ID_SIZE> persistent_id = b_dup->persistent_id();
 
 							/* sync object and mesh or light data */
-							Object *object = sync_object(*b_ob, persistent_id.data, *b_dup, tfm, ob_layer, motion, hide_tris);
+							Object *object = sync_object(b_ob, persistent_id.data, *b_dup, tfm, ob_layer, motion_time, hide_tris);
 
 							/* sync possible particle data, note particle_id
 							 * starts counting at 1, first is dummy particle */
-							if(!motion && object && sync_dupli_particle(*b_ob, *b_dup, object)) {
-								if(particle_id != object->particle_id) {
-									object->particle_id = particle_id;
-									scene->object_manager->tag_update(scene);
-								}
-
-								particle_id++;
+							if(!motion && object) {
+								sync_dupli_particle(b_ob, *b_dup, object);
 							}
 
 						}
 					}
 
-					b_ob->dupli_list_clear();
+					b_ob.dupli_list_clear();
 				}
 
 				/* test if object needs to be hidden */
 				bool hide_tris;
 
-				if(!object_render_hide(*b_ob, true, true, hide_tris)) {
+				if(!object_render_hide(b_ob, true, true, hide_tris)) {
 					/* object itself */
-					Transform tfm = get_transform(b_ob->matrix_world());
-					sync_object(*b_ob, NULL, PointerRNA_NULL, tfm, ob_layer, motion, hide_tris);
+					Transform tfm = get_transform(b_ob.matrix_world());
+					sync_object(b_ob, NULL, PointerRNA_NULL, tfm, ob_layer, motion_time, hide_tris);
 				}
 			}
 
@@ -527,31 +554,46 @@ void BlenderSync::sync_motion(BL::SpaceView3D b_v3d, BL::Object b_override, void
 		b_cam = b_override;
 
 	Camera prevcam = *(scene->camera);
-	
-	/* go back and forth one frame */
-	int frame = b_scene.frame_current();
 
-	for(int motion = -1; motion <= 1; motion += 2) {
-		/* we need to set the python thread state again because this
-		 * function assumes it is being executed from python and will
-		 * try to save the thread state */
+	int frame_center = b_scene.frame_current();
+
+	/* always sample these times for camera motion */
+	motion_times.insert(-1.0f);
+	motion_times.insert(1.0f);
+
+	/* note iteration over motion_times set happens in sorted order */
+	foreach(float relative_time, motion_times) {
+		/* fixed shutter time to get previous and next frame for motion pass */
+		float shuttertime;
+
+		if(scene->need_motion() == Scene::MOTION_PASS)
+			shuttertime = 2.0f;
+		else
+			shuttertime = scene->camera->shuttertime;
+
+		/* compute frame and subframe time */
+		float time = frame_center + relative_time * shuttertime * 0.5f;
+		int frame = (int)floorf(time);
+		float subframe = time - frame;
+
+		/* change frame */
 		python_thread_state_restore(python_thread_state);
-		b_scene.frame_set(frame + motion, 0.0f);
+		b_scene.frame_set(frame, subframe);
 		python_thread_state_save(python_thread_state);
 
-		/* camera object */
-		if(b_cam)
-			sync_camera_motion(b_cam, motion);
+		/* sync camera, only supports two times at the moment */
+		if(relative_time == -1.0f || relative_time == 1.0f)
+			sync_camera_motion(b_cam, relative_time);
 
-		/* mesh objects */
-		sync_objects(b_v3d, motion);
+		/* sync object */
+		sync_objects(b_v3d, relative_time);
 	}
 
 	/* we need to set the python thread state again because this
 	 * function assumes it is being executed from python and will
 	 * try to save the thread state */
 	python_thread_state_restore(python_thread_state);
-	b_scene.frame_set(frame, 0.0f);
+	b_scene.frame_set(frame_center, 0.0f);
 	python_thread_state_save(python_thread_state);
 
 	/* tag camera for motion update */
diff --git a/intern/cycles/blender/blender_particles.cpp b/intern/cycles/blender/blender_particles.cpp
index ef832ed39c0..5b2782ec2ac 100644
--- a/intern/cycles/blender/blender_particles.cpp
+++ b/intern/cycles/blender/blender_particles.cpp
@@ -76,6 +76,11 @@ bool BlenderSync::sync_dupli_particle(BL::Object b_ob, BL::DupliObject b_dup, Ob
 
 	psys->particles.push_back(pa);
 
+	if (object->particle_index != psys->particles.size() - 1)
+		scene->object_manager->tag_update(scene);
+	object->particle_system = psys;
+	object->particle_index = psys->particles.size() - 1;
+
 	/* return that this object has particle data */
 	return true;
 }
diff --git a/intern/cycles/blender/blender_python.cpp b/intern/cycles/blender/blender_python.cpp
index e08b7980e78..872f891cc2a 100644
--- a/intern/cycles/blender/blender_python.cpp
+++ b/intern/cycles/blender/blender_python.cpp
@@ -147,6 +147,38 @@ static PyObject *render_func(PyObject *self, PyObject *value)
 	Py_RETURN_NONE;
 }
 
+/* python entry point for baking, pixel_array and result passed as pointers */
+static PyObject *bake_func(PyObject *self, PyObject *args)
+{
+	PyObject *pysession, *pyobject;
+	PyObject *pypixel_array, *pyresult;
+	const char *pass_type;
+	int num_pixels, depth;
+
+	if(!PyArg_ParseTuple(args, "OOsOiiO", &pysession, &pyobject, &pass_type, &pypixel_array,  &num_pixels, &depth, &pyresult))
+		return NULL;
+
+	/* decode all python objects while the GIL is still held, the python
+	 * C API must not be used between Py_BEGIN/END_ALLOW_THREADS */
+	BlenderSession *session = (BlenderSession*)PyLong_AsVoidPtr(pysession);
+	void *b_result = PyLong_AsVoidPtr(pyresult);
+
+	PointerRNA objectptr;
+	RNA_id_pointer_create((ID*)PyLong_AsVoidPtr(pyobject), &objectptr);
+	BL::Object b_object(objectptr);
+
+	PointerRNA bakepixelptr;
+	RNA_id_pointer_create((ID*)PyLong_AsVoidPtr(pypixel_array), &bakepixelptr);
+	BL::BakePixel b_bake_pixel(bakepixelptr);
+
+	/* only the bake itself runs with the GIL released */
+	Py_BEGIN_ALLOW_THREADS
+	session->bake(b_object, pass_type, b_bake_pixel, num_pixels, depth, (float *)b_result);
+	Py_END_ALLOW_THREADS
+
+	Py_RETURN_NONE;
+}
+
 static PyObject *draw_func(PyObject *self, PyObject *args)
 {
 	PyObject *pysession, *pyv3d, *pyrv3d;
@@ -285,7 +317,8 @@ static PyObject *osl_update_node_func(PyObject *self, PyObject *args)
 		}
 		else if(param->type.vecsemantics == TypeDesc::POINT ||
 		        param->type.vecsemantics == TypeDesc::VECTOR ||
-		        param->type.vecsemantics == TypeDesc::NORMAL) {
+		        param->type.vecsemantics == TypeDesc::NORMAL)
+		{
 			socket_type = "NodeSocketVector";
 			data_type = BL::NodeSocket::type_VECTOR;
 
@@ -418,6 +451,7 @@ static PyMethodDef methods[] = {
 	{"create", create_func, METH_VARARGS, ""},
 	{"free", free_func, METH_O, ""},
 	{"render", render_func, METH_O, ""},
+	{"bake", bake_func, METH_VARARGS, ""},
 	{"draw", draw_func, METH_VARARGS, ""},
 	{"sync", sync_func, METH_O, ""},
 	{"reset", reset_func, METH_VARARGS, ""},
@@ -493,7 +527,7 @@ void *CCL_python_module_init()
 	/* TODO(sergey): This gives us library we've been linking against.
 	 *               In theory with dynamic OSL library it might not be
 	 *               accurate, but there's nothing in OSL API which we
-	 *               might use th get version in runtime.
+	 *               might use to get version in runtime.
 	 */
 	int curversion = OSL_LIBRARY_VERSION_CODE;
 	PyModule_AddObject(mod, "with_osl", Py_True);
diff --git a/intern/cycles/blender/blender_session.cpp b/intern/cycles/blender/blender_session.cpp
index ef578493901..01a5acd8982 100644
--- a/intern/cycles/blender/blender_session.cpp
+++ b/intern/cycles/blender/blender_session.cpp
@@ -14,6 +14,8 @@
  * limitations under the License
  */
 
+#include <stdlib.h>
+
 #include "background.h"
 #include "buffers.h"
 #include "camera.h"
@@ -21,6 +23,8 @@
 #include "integrator.h"
 #include "film.h"
 #include "light.h"
+#include "mesh.h"
+#include "object.h"
 #include "scene.h"
 #include "session.h"
 #include "shader.h"
@@ -93,6 +97,11 @@ void BlenderSession::create_session()
 	/* create scene */
 	scene = new Scene(scene_params, session_params.device);
 
+	/* setup callbacks for builtin image support */
+	scene->image_manager->builtin_image_info_cb = function_bind(&BlenderSession::builtin_image_info, this, _1, _2, _3, _4, _5, _6, _7);
+	scene->image_manager->builtin_image_pixels_cb = function_bind(&BlenderSession::builtin_image_pixels, this, _1, _2, _3);
+	scene->image_manager->builtin_image_float_pixels_cb = function_bind(&BlenderSession::builtin_image_float_pixels, this, _1, _2, _3);
+
 	/* create session */
 	session = new Session(session_params);
 	session->scene = scene;
@@ -121,11 +130,6 @@ void BlenderSession::create_session()
 	session->reset(buffer_params, session_params.samples);
 
 	b_engine.use_highlight_tiles(session_params.progressive_refine == false);
-
-	/* setup callbacks for builtin image support */
-	scene->image_manager->builtin_image_info_cb = function_bind(&BlenderSession::builtin_image_info, this, _1, _2, _3, _4, _5, _6);
-	scene->image_manager->builtin_image_pixels_cb = function_bind(&BlenderSession::builtin_image_pixels, this, _1, _2, _3);
-	scene->image_manager->builtin_image_float_pixels_cb = function_bind(&BlenderSession::builtin_image_float_pixels, this, _1, _2, _3);
 }
 
 void BlenderSession::reset_session(BL::BlendData b_data_, BL::Scene b_scene_)
@@ -259,6 +263,58 @@ static PassType get_pass_type(BL::RenderPass b_pass)
 	return PASS_NONE;
 }
 
+static ShaderEvalType get_shader_type(const string& pass_type)
+{
+	/* map the name of a bake pass to its shader evaluation type.
+	 *
+	 * names are compared exactly with strcmp() against the table below,
+	 * a name that is not in the table gives SHADER_EVAL_BAKE */
+	static const struct {
+		const char *name;
+		ShaderEvalType type;
+	} pass_table[] = {
+		/* data passes */
+		{"NORMAL", SHADER_EVAL_NORMAL},
+		{"UV", SHADER_EVAL_UV},
+		{"DIFFUSE_COLOR", SHADER_EVAL_DIFFUSE_COLOR},
+		{"GLOSSY_COLOR", SHADER_EVAL_GLOSSY_COLOR},
+		{"TRANSMISSION_COLOR", SHADER_EVAL_TRANSMISSION_COLOR},
+		{"SUBSURFACE_COLOR", SHADER_EVAL_SUBSURFACE_COLOR},
+		{"EMIT", SHADER_EVAL_EMISSION},
+
+		/* light passes */
+		{"AO", SHADER_EVAL_AO},
+		{"COMBINED", SHADER_EVAL_COMBINED},
+		{"SHADOW", SHADER_EVAL_SHADOW},
+		{"DIFFUSE_DIRECT", SHADER_EVAL_DIFFUSE_DIRECT},
+		{"GLOSSY_DIRECT", SHADER_EVAL_GLOSSY_DIRECT},
+		{"TRANSMISSION_DIRECT", SHADER_EVAL_TRANSMISSION_DIRECT},
+		{"SUBSURFACE_DIRECT", SHADER_EVAL_SUBSURFACE_DIRECT},
+		{"DIFFUSE_INDIRECT", SHADER_EVAL_DIFFUSE_INDIRECT},
+		{"GLOSSY_INDIRECT", SHADER_EVAL_GLOSSY_INDIRECT},
+		{"TRANSMISSION_INDIRECT", SHADER_EVAL_TRANSMISSION_INDIRECT},
+		{"SUBSURFACE_INDIRECT", SHADER_EVAL_SUBSURFACE_INDIRECT},
+
+		/* extra */
+		{"ENVIRONMENT", SHADER_EVAL_ENVIRONMENT}
+	};
+
+	const size_t num_passes = sizeof(pass_table)/sizeof(pass_table[0]);
+
+	const char *requested = pass_type.c_str();
+
+	/* table names are all different, so the first match is the only one */
+	for(size_t i = 0; i < num_passes; i++) {
+		if(strcmp(requested, pass_table[i].name) == 0)
+			return pass_table[i].type;
+	}
+
+	/* no entry with this name */
+	return SHADER_EVAL_BAKE;
+}
+
 static BL::RenderResult begin_render_result(BL::RenderEngine b_engine, int x, int y, int w, int h, const char *layername)
 {
 	return b_engine.begin_result(x, y, w, h, layername);
@@ -425,6 +481,105 @@ void BlenderSession::render()
 	sync = NULL;
 }
 
+static void populate_bake_data(BakeData *data, BL::BakePixel pixel_array, const int num_pixels)
+{
+	/* pixel_array is the first pixel, the following ones are reached
+	 * through next(). copy num_pixels of them into consecutive entries */
+	BL::BakePixel pixel = pixel_array;
+
+	for(int entry = 0; entry < num_pixels; entry++, pixel = pixel.next()) {
+		data->set(entry, pixel.primitive_id(), pixel.uv(), pixel.du_dx(), pixel.du_dy(), pixel.dv_dx(), pixel.dv_dy());
+	}
+}
+
+static bool is_light_pass(ShaderEvalType type)
+{
+	/* test if type is one of the light evaluation types, these are the
+	 * direct and indirect components plus AO, combined and shadow */
+	const bool direct = (type == SHADER_EVAL_DIFFUSE_DIRECT ||
+	                     type == SHADER_EVAL_GLOSSY_DIRECT ||
+	                     type == SHADER_EVAL_TRANSMISSION_DIRECT ||
+	                     type == SHADER_EVAL_SUBSURFACE_DIRECT);
+
+	const bool indirect = (type == SHADER_EVAL_DIFFUSE_INDIRECT ||
+	                       type == SHADER_EVAL_GLOSSY_INDIRECT ||
+	                       type == SHADER_EVAL_TRANSMISSION_INDIRECT ||
+	                       type == SHADER_EVAL_SUBSURFACE_INDIRECT);
+
+	/* every evaluation type not named in this function gives false */
+	return (direct || indirect ||
+	        type == SHADER_EVAL_AO || type == SHADER_EVAL_COMBINED ||
+	        type == SHADER_EVAL_SHADOW);
+}
+
+void BlenderSession::bake(BL::Object b_object, const string& pass_type, BL::BakePixel pixel_array, int num_pixels, int depth, float result[])
+{
+	/* bake pass_type for b_object: num_pixels entries of pixel_array are evaluated into result, depth is not read here */
+	ShaderEvalType shader_type = get_shader_type(pass_type);
+	size_t object_index = OBJECT_NONE;
+	int tri_offset = 0;
+
+	if(shader_type == SHADER_EVAL_UV) {
+		/* force UV to be available */
+		Pass::add(PASS_UV, scene->film->passes);
+	}
+
+	if(is_light_pass(shader_type)) {
+		/* force use_light_pass to be true */
+		Pass::add(PASS_LIGHT, scene->film->passes);
+	}
+
+	/* tag film and integrator for update */
+	scene->film->tag_update(scene);
+	scene->integrator->tag_update(scene);
+
+	/* update scene */
+	sync->sync_camera(b_render, b_engine.camera_override(), width, height);
+	sync->sync_data(b_v3d, b_engine.camera_override(), &python_thread_state);
+
+	/* get buffer parameters */
+	SessionParams session_params = BlenderSync::get_session_params(b_engine, b_userpref, b_scene, background);
+	BufferParams buffer_params = BlenderSync::get_buffer_params(b_render, b_scene, b_v3d, b_rv3d, scene->camera, width, height);
+
+	scene->bake_manager->set_baking(true);
+
+	/* set number of samples */
+	session->tile_manager.set_samples(session_params.samples);
+	session->reset(buffer_params, session_params.samples);
+	session->update_scene();
+
+	/* find object index. todo: is arbitrary - copied from mesh_displace.cpp */
+	for(size_t i = 0; i < scene->objects.size(); i++) {
+		if(strcmp(scene->objects[i]->name.c_str(), b_object.name().c_str()) == 0) {
+			object_index = i;
+			tri_offset = scene->objects[i]->mesh->tri_offset;
+			break;
+		}
+	}
+
+	/* only bake when an object with this name was found, otherwise
+	 * ~OBJECT_NONE would be handed on as an object index. result is left untouched then */
+	if(object_index != OBJECT_NONE) {
+		/* when used, non-instanced convention: object = ~object */
+		int object = ~object_index;
+		BakeData *bake_data = scene->bake_manager->init(object, tri_offset, num_pixels);
+		populate_bake_data(bake_data, pixel_array, num_pixels);
+
+		/* set number of samples */
+		session->tile_manager.set_samples(session_params.samples);
+		session->reset(buffer_params, session_params.samples);
+		session->update_scene();
+
+		scene->bake_manager->bake(scene->device, &scene->dscene, scene, session->progress, shader_type, bake_data, result);
+	}
+
+	/* free all memory used (host and device), so we wouldn't leave render
+	 * engine with extra memory allocated */
+	session->device_free();
+	delete sync;
+	sync = NULL;
+}
+
 void BlenderSession::do_write_update_render_result(BL::RenderResult b_rr, BL::RenderLayer b_rlay, RenderTile& rtile, bool do_update_only)
 {
 	RenderBuffers *buffers = rtile.buffers;
@@ -592,16 +747,14 @@ bool BlenderSession::draw(int w, int h)
 
 	/* draw */
 	BufferParams buffer_params = BlenderSync::get_buffer_params(b_render, b_scene, b_v3d, b_rv3d, scene->camera, width, height);
+	DeviceDrawParams draw_params;
 
-	if(session->params.display_buffer_linear)
-		b_engine.bind_display_space_shader(b_scene);
-
-	bool draw_ok = !session->draw(buffer_params);
+	if(session->params.display_buffer_linear) {
+		draw_params.bind_display_space_shader_cb = function_bind(&BL::RenderEngine::bind_display_space_shader, &b_engine, b_scene);
+		draw_params.unbind_display_space_shader_cb = function_bind(&BL::RenderEngine::unbind_display_space_shader, &b_engine);
+	}
 
-	if(session->params.display_buffer_linear)
-		b_engine.unbind_display_space_shader();
-	
-	return draw_ok;
+	return !session->draw(buffer_params, draw_params);
 }
 
 void BlenderSession::get_status(string& status, string& substatus)
@@ -726,85 +879,123 @@ int BlenderSession::builtin_image_frame(const string &builtin_name)
 	return atoi(builtin_name.substr(last + 1, builtin_name.size() - last - 1).c_str());
 }
 
-void BlenderSession::builtin_image_info(const string &builtin_name, void *builtin_data, bool &is_float, int &width, int &height, int &channels)
+void BlenderSession::builtin_image_info(const string &builtin_name, void *builtin_data, bool &is_float, int &width, int &height, int &depth, int &channels)
 {
+	/* empty image */
+	is_float = false;
+	width = 0;
+	height = 0;
+	depth = 0;
+	channels = 0;
+
+	if(!builtin_data)
+		return;
+
+	/* recover ID pointer */
 	PointerRNA ptr;
 	RNA_id_pointer_create((ID*)builtin_data, &ptr);
-	BL::Image b_image(ptr);
+	BL::ID b_id(ptr);
+
+	if(b_id.is_a(&RNA_Image)) {
+		/* image data */
+		BL::Image b_image(b_id);
 
-	if(b_image) {
 		is_float = b_image.is_float();
 		width = b_image.size()[0];
 		height = b_image.size()[1];
+		depth = 1;
 		channels = b_image.channels();
 	}
-	else {
-		is_float = false;
-		width = 0;
-		height = 0;
-		channels = 0;
+	else if(b_id.is_a(&RNA_Object)) {
+		/* smoke volume data */
+		BL::Object b_ob(b_id);
+		BL::SmokeDomainSettings b_domain = object_smoke_domain_find(b_ob);
+
+		if(!b_domain)
+			return;
+
+		if(builtin_name == Attribute::standard_name(ATTR_STD_VOLUME_DENSITY) ||
+		   builtin_name == Attribute::standard_name(ATTR_STD_VOLUME_FLAME))
+			channels = 1;
+		else if(builtin_name == Attribute::standard_name(ATTR_STD_VOLUME_COLOR))
+			channels = 4;
+		else
+			return;
+
+		int3 resolution = get_int3(b_domain.domain_resolution());
+		int amplify = (b_domain.use_high_resolution())? b_domain.amplify() + 1: 1;
+
+		width = resolution.x * amplify;
+		height = resolution.y * amplify;
+		depth = resolution.z * amplify;
+
+		is_float = true;
 	}
 }
 
 bool BlenderSession::builtin_image_pixels(const string &builtin_name, void *builtin_data, unsigned char *pixels)
 {
+	if(!builtin_data)
+		return false;
+
 	int frame = builtin_image_frame(builtin_name);
 
 	PointerRNA ptr;
 	RNA_id_pointer_create((ID*)builtin_data, &ptr);
 	BL::Image b_image(ptr);
 
-	if(b_image) {
-		int width = b_image.size()[0];
-		int height = b_image.size()[1];
-		int channels = b_image.channels();
+	int width = b_image.size()[0];
+	int height = b_image.size()[1];
+	int channels = b_image.channels();
 
-		unsigned char *image_pixels;
-		image_pixels = image_get_pixels_for_frame(b_image, frame);
+	unsigned char *image_pixels;
+	image_pixels = image_get_pixels_for_frame(b_image, frame);
 
-		if(image_pixels) {
-			memcpy(pixels, image_pixels, width * height * channels * sizeof(unsigned char));
-			MEM_freeN(image_pixels);
+	if(image_pixels) {
+		memcpy(pixels, image_pixels, width * height * channels * sizeof(unsigned char));
+		MEM_freeN(image_pixels);
+	}
+	else {
+		if(channels == 1) {
+			memset(pixels, 0, width * height * sizeof(unsigned char));
 		}
 		else {
-			if(channels == 1) {
-				memset(pixels, 0, width * height * sizeof(unsigned char));
-			}
-			else {
-				unsigned char *cp = pixels;
-				for(int i = 0; i < width * height; i++, cp += channels) {
-					cp[0] = 255;
-					cp[1] = 0;
-					cp[2] = 255;
-					if(channels == 4)
-						cp[3] = 255;
-				}
+			unsigned char *cp = pixels;
+			for(int i = 0; i < width * height; i++, cp += channels) {
+				cp[0] = 255;
+				cp[1] = 0;
+				cp[2] = 255;
+				if(channels == 4)
+					cp[3] = 255;
 			}
 		}
+	}
 
-		/* premultiply, byte images are always straight for blender */
-		unsigned char *cp = pixels;
-		for(int i = 0; i < width * height; i++, cp += channels) {
-			cp[0] = (cp[0] * cp[3]) >> 8;
-			cp[1] = (cp[1] * cp[3]) >> 8;
-			cp[2] = (cp[2] * cp[3]) >> 8;
-		}
-
-		return true;
+	/* premultiply, byte images are always straight for blender; only RGBA has alpha in cp[3] */
+	unsigned char *cp = pixels;
+	for(int i = 0, n = (channels == 4)? width * height: 0; i < n; i++, cp += channels) {
+		cp[0] = (cp[0] * cp[3]) >> 8;
+		cp[1] = (cp[1] * cp[3]) >> 8;
+		cp[2] = (cp[2] * cp[3]) >> 8;
+	}
 
-	return false;
+	return true;
 }
 
 bool BlenderSession::builtin_image_float_pixels(const string &builtin_name, void *builtin_data, float *pixels)
 {
-	int frame = builtin_image_frame(builtin_name);
+	if(!builtin_data)
+		return false;
 
 	PointerRNA ptr;
 	RNA_id_pointer_create((ID*)builtin_data, &ptr);
-	BL::Image b_image(ptr);
+	BL::ID b_id(ptr);
+
+	if(b_id.is_a(&RNA_Image)) {
+		/* image data */
+		BL::Image b_image(b_id);
+		int frame = builtin_image_frame(builtin_name);
 
-	if(b_image) {
 		int width = b_image.size()[0];
 		int height = b_image.size()[1];
 		int channels = b_image.channels();
@@ -834,6 +1025,51 @@ bool BlenderSession::builtin_image_float_pixels(const string &builtin_name, void
 
 		return true;
 	}
+	else if(b_id.is_a(&RNA_Object)) {
+		/* smoke volume data */
+		BL::Object b_ob(b_id);
+		BL::SmokeDomainSettings b_domain = object_smoke_domain_find(b_ob);
+
+		if(!b_domain)
+			return false;
+
+		int3 resolution = get_int3(b_domain.domain_resolution());
+		int length, amplify = (b_domain.use_high_resolution())? b_domain.amplify() + 1: 1;
+
+		int width = resolution.x * amplify;
+		int height = resolution.y * amplify;
+		int depth = resolution.z * amplify;
+
+		if(builtin_name == Attribute::standard_name(ATTR_STD_VOLUME_DENSITY)) {
+			SmokeDomainSettings_density_grid_get_length(&b_domain.ptr, &length);
+
+			if(length == width*height*depth) {
+				SmokeDomainSettings_density_grid_get(&b_domain.ptr, pixels);
+				return true;
+			}
+		}
+		else if(builtin_name == Attribute::standard_name(ATTR_STD_VOLUME_FLAME)) {
+			/* this is in range 0..1, and interpreted by the OpenGL smoke viewer
+			 * as 1500..3000 K with the first part faded to zero density */
+			SmokeDomainSettings_flame_grid_get_length(&b_domain.ptr, &length);
+
+			if(length == width*height*depth) {
+				SmokeDomainSettings_flame_grid_get(&b_domain.ptr, pixels);
+				return true;
+			}
+		}
+		else if(builtin_name == Attribute::standard_name(ATTR_STD_VOLUME_COLOR)) {
+			/* the RGB is "premultiplied" by density for better interpolation results */
+			SmokeDomainSettings_color_grid_get_length(&b_domain.ptr, &length);
+
+			if(length == width*height*depth*4) {
+				SmokeDomainSettings_color_grid_get(&b_domain.ptr, pixels);
+				return true;
+			}
+		}
+
+		fprintf(stderr, "Cycles error: unexpected smoke volume resolution, skipping\n");
+	}
 
 	return false;
 }
diff --git a/intern/cycles/blender/blender_session.h b/intern/cycles/blender/blender_session.h
index 0568fb291d0..0e44493d674 100644
--- a/intern/cycles/blender/blender_session.h
+++ b/intern/cycles/blender/blender_session.h
@@ -20,6 +20,7 @@
 #include "device.h"
 #include "scene.h"
 #include "session.h"
+#include "bake.h"
 
 #include "util_vector.h"
 
@@ -51,6 +52,8 @@ public:
 	/* offline render */
 	void render();
 
+	void bake(BL::Object b_object, const string& pass_type, BL::BakePixel pixel_array, int num_pixels, int depth, float pixels[]);
+
 	void write_render_result(BL::RenderResult b_rr, BL::RenderLayer b_rlay, RenderTile& rtile);
 	void write_render_tile(RenderTile& rtile);
 
@@ -99,7 +102,7 @@ protected:
 	void do_write_update_render_tile(RenderTile& rtile, bool do_update_only);
 
 	int builtin_image_frame(const string &builtin_name);
-	void builtin_image_info(const string &builtin_name, void *builtin_data, bool &is_float, int &width, int &height, int &channels);
+	void builtin_image_info(const string &builtin_name, void *builtin_data, bool &is_float, int &width, int &height, int &depth, int &channels);
 	bool builtin_image_pixels(const string &builtin_name, void *builtin_data, unsigned char *pixels);
 	bool builtin_image_float_pixels(const string &builtin_name, void *builtin_data, float *pixels);
 };
diff --git a/intern/cycles/blender/blender_shader.cpp b/intern/cycles/blender/blender_shader.cpp
index 6175c8ea399..ddbb40da7db 100644
--- a/intern/cycles/blender/blender_shader.cpp
+++ b/intern/cycles/blender/blender_shader.cpp
@@ -546,9 +546,11 @@ static ShaderNode *add_node(Scene *scene, BL::BlendData b_data, BL::Scene b_scen
 			}
 
 			image->animated = b_image_node.image_user().use_auto_refresh();
+			image->use_alpha = b_image.use_alpha();
 		}
 		image->color_space = ImageTextureNode::color_space_enum[(int)b_image_node.color_space()];
 		image->projection = ImageTextureNode::projection_enum[(int)b_image_node.projection()];
+		image->interpolation = (InterpolationType)b_image_node.interpolation();
 		image->projection_blend = b_image_node.projection_blend();
 		get_tex_mapping(&image->tex_mapping, b_image_node.texture_mapping());
 		node = image;
@@ -573,6 +575,8 @@ static ShaderNode *add_node(Scene *scene, BL::BlendData b_data, BL::Scene b_scen
 				env->animated = b_env_node.image_user().use_auto_refresh();
 				env->builtin_data = NULL;
 			}
+
+			env->use_alpha = b_image.use_alpha();
 		}
 		env->color_space = EnvironmentTextureNode::color_space_enum[(int)b_env_node.color_space()];
 		env->projection = EnvironmentTextureNode::projection_enum[(int)b_env_node.projection()];
@@ -667,6 +671,13 @@ static ShaderNode *add_node(Scene *scene, BL::BlendData b_data, BL::Scene b_scen
 		tangent->attribute = b_tangent_node.uv_map();
 		node = tangent;
 	}
+	else if (b_node.is_a(&RNA_ShaderNodeUVMap)) {
+		BL::ShaderNodeUVMap b_uvmap_node(b_node);
+		UVMapNode *uvm = new UVMapNode();
+		uvm->attribute = b_uvmap_node.uv_map();
+		uvm->from_dupli = b_uvmap_node.from_dupli();
+		node = uvm;
+	}
 
 	if(node)
 		graph->add(node);
diff --git a/intern/cycles/blender/blender_sync.cpp b/intern/cycles/blender/blender_sync.cpp
index 8e2197a2aa6..1f5e32a1123 100644
--- a/intern/cycles/blender/blender_sync.cpp
+++ b/intern/cycles/blender/blender_sync.cpp
@@ -172,6 +172,7 @@ void BlenderSync::sync_integrator()
 	integrator->transparent_min_bounce = get_int(cscene, "transparent_min_bounces");
 	integrator->transparent_shadows = get_boolean(cscene, "use_transparent_shadows");
 
+	integrator->volume_homogeneous_sampling = RNA_enum_get(&cscene, "volume_homogeneous_sampling");
 	integrator->volume_max_steps = get_int(cscene, "volume_max_steps");
 	integrator->volume_step_size = get_float(cscene, "volume_step_size");
 
@@ -197,6 +198,9 @@ void BlenderSync::sync_integrator()
 
 	integrator->method = (Integrator::Method)get_enum(cscene, "progressive");
 
+	integrator->sample_all_lights_direct = get_boolean(cscene, "sample_all_lights_direct");
+	integrator->sample_all_lights_indirect = get_boolean(cscene, "sample_all_lights_indirect");
+
 	int diffuse_samples = get_int(cscene, "diffuse_samples");
 	int glossy_samples = get_int(cscene, "glossy_samples");
 	int transmission_samples = get_int(cscene, "transmission_samples");
diff --git a/intern/cycles/blender/blender_sync.h b/intern/cycles/blender/blender_sync.h
index 205761ad302..9c4175ef690 100644
--- a/intern/cycles/blender/blender_sync.h
+++ b/intern/cycles/blender/blender_sync.h
@@ -71,7 +71,7 @@ private:
 	/* sync */
 	void sync_lamps(bool update_all);
 	void sync_materials(bool update_all);
-	void sync_objects(BL::SpaceView3D b_v3d, int motion = 0);
+	void sync_objects(BL::SpaceView3D b_v3d, float motion_time = 0.0f);
 	void sync_motion(BL::SpaceView3D b_v3d, BL::Object b_override, void **python_thread_state);
 	void sync_film();
 	void sync_view();
@@ -81,12 +81,13 @@ private:
 
 	void sync_nodes(Shader *shader, BL::ShaderNodeTree b_ntree);
 	Mesh *sync_mesh(BL::Object b_ob, bool object_updated, bool hide_tris);
-	void sync_curves(Mesh *mesh, BL::Mesh b_mesh, BL::Object b_ob, int motion);
-	Object *sync_object(BL::Object b_parent, int persistent_id[OBJECT_PERSISTENT_ID_SIZE], BL::DupliObject b_dupli_object, Transform& tfm, uint layer_flag, int motion, bool hide_tris);
+	void sync_curves(Mesh *mesh, BL::Mesh b_mesh, BL::Object b_ob, bool motion, int time_index = 0);
+	Object *sync_object(BL::Object b_parent, int persistent_id[OBJECT_PERSISTENT_ID_SIZE], BL::DupliObject b_dupli_ob,
+	                                 Transform& tfm, uint layer_flag, float motion_time, bool hide_tris);
 	void sync_light(BL::Object b_parent, int persistent_id[OBJECT_PERSISTENT_ID_SIZE], BL::Object b_ob, Transform& tfm);
 	void sync_background_light();
-	void sync_mesh_motion(BL::Object b_ob, Mesh *mesh, int motion);
-	void sync_camera_motion(BL::Object b_ob, int motion);
+	void sync_mesh_motion(BL::Object b_ob, Object *object, float motion_time);
+	void sync_camera_motion(BL::Object b_ob, float motion_time);
 
 	/* particles */
 	bool sync_dupli_particle(BL::Object b_ob, BL::DupliObject b_dup, Object *object);
@@ -109,6 +110,7 @@ private:
 	id_map<ParticleSystemKey, ParticleSystem> particle_system_map;
 	set<Mesh*> mesh_synced;
 	set<Mesh*> mesh_motion_synced;
+	std::set<float> motion_times;
 	void *world_map;
 	bool world_recalc;
 
diff --git a/intern/cycles/blender/blender_util.h b/intern/cycles/blender/blender_util.h
index 58e523d7fc2..35e417d8069 100644
--- a/intern/cycles/blender/blender_util.h
+++ b/intern/cycles/blender/blender_util.h
@@ -42,7 +42,14 @@ void python_thread_state_restore(void **python_thread_state);
 
 static inline BL::Mesh object_to_mesh(BL::BlendData data, BL::Object object, BL::Scene scene, bool apply_modifiers, bool render, bool calc_undeformed)
 {
-	return data.meshes.new_from_object(scene, object, apply_modifiers, (render)? 2: 1, true, calc_undeformed);
+	BL::Mesh me = data.meshes.new_from_object(scene, object, apply_modifiers, (render)? 2: 1, false, calc_undeformed);
+	/* mesh wrapper evaluates to false: return it untouched */
+	if (!(bool)me)
+		return me;
+	if (me.use_auto_smooth())
+		me.calc_normals_split(me.auto_smooth_angle());
+	me.calc_tessface();
+	return me;
 }
 
 static inline void colorramp_to_array(BL::ColorRamp ramp, float4 *data, int size)
@@ -50,7 +57,7 @@ static inline void colorramp_to_array(BL::ColorRamp ramp, float4 *data, int size
 	for(int i = 0; i < size; i++) {
 		float color[4];
 
-		ramp.evaluate(i/(float)(size-1), color);
+		ramp.evaluate((float)i/(float)(size-1), color);
 		data[i] = make_float4(color[0], color[1], color[2], color[3]);
 	}
 }
@@ -67,7 +74,7 @@ static inline void curvemapping_color_to_array(BL::CurveMapping cumap, float4 *d
 		BL::CurveMap mapI = cumap.curves[3];
 
 		for(int i = 0; i < size; i++) {
-			float t = i/(float)(size-1);
+			float t = (float)i/(float)(size-1);
 
 			data[i][0] = mapR.evaluate(mapI.evaluate(t));
 			data[i][1] = mapG.evaluate(mapI.evaluate(t));
@@ -76,7 +83,7 @@ static inline void curvemapping_color_to_array(BL::CurveMapping cumap, float4 *d
 	}
 	else {
 		for(int i = 0; i < size; i++) {
-			float t = i/(float)(size-1);
+			float t = (float)i/(float)(size-1);
 
 			data[i][0] = mapR.evaluate(t);
 			data[i][1] = mapG.evaluate(t);
@@ -168,6 +175,11 @@ static inline float4 get_float4(BL::Array<float, 4> array)
 	return make_float4(array[0], array[1], array[2], array[3]);
 }
 
+static inline int3 get_int3(BL::Array<int, 3> array)
+{
+	return make_int3(array[0], array[1], array[2]);  /* pack the three array elements into an int3 */
+}
+
 static inline int4 get_int4(BL::Array<int, 4> array)
 {
 	return make_int4(array[0], array[1], array[2], array[3]);
@@ -341,6 +353,52 @@ static inline void mesh_texture_space(BL::Mesh b_mesh, float3& loc, float3& size
 	loc = loc*size - make_float3(0.5f, 0.5f, 0.5f);
 }
 
+/* read the "use_motion_blur" flag from the object's "cycles" RNA settings */
+static inline bool object_use_motion(BL::Object b_ob)
+{
+	/* the flag is looked up on the "cycles" pointer property of the object */
+	PointerRNA cycles_settings = RNA_pointer_get(&b_ob.ptr, "cycles");
+
+	return get_boolean(cycles_settings, "use_motion_blur");
+}
+
+/* number of motion steps for the object, derived from its "motion_steps" setting */
+static inline uint object_motion_steps(BL::Object b_ob)
+{
+	PointerRNA cobject = RNA_pointer_get(&b_ob.ptr, "cycles");
+	uint steps = get_int(cobject, "motion_steps");
+
+	/* returns 2^steps + 1: an uneven count gives one keyframe at the current
+	 * frame, and objects with more/fewer steps still share sample times.
+	 * NOTE(review): assumes steps >= 1, else the shift amount wraps - confirm */
+	return (2 << (steps - 1)) + 1;
+}
+
+/* read the "use_deform_motion" flag from the object's "cycles" RNA settings */
+static inline bool object_use_deform_motion(BL::Object b_ob)
+{
+	/* the flag is looked up on the "cycles" pointer property of the object */
+	PointerRNA cycles_settings = RNA_pointer_get(&b_ob.ptr, "cycles");
+
+	return get_boolean(cycles_settings, "use_deform_motion");
+}
+
+/* return the domain settings of the first smoke modifier of domain type on
+ * b_ob, or settings wrapping PointerRNA_NULL when no such modifier exists */
+static inline BL::SmokeDomainSettings object_smoke_domain_find(BL::Object b_ob)
+{
+	BL::Object::modifiers_iterator b_mod;
+	for(b_ob.modifiers.begin(b_mod); b_mod != b_ob.modifiers.end(); ++b_mod) {
+		if(!b_mod->is_a(&RNA_SmokeModifier))
+			continue;
+
+		BL::SmokeModifier b_smoke(*b_mod);
+		if(b_smoke.smoke_type() == BL::SmokeModifier::smoke_type_DOMAIN)
+			return b_smoke.domain_settings();
+	}
+	return BL::SmokeDomainSettings(PointerRNA_NULL);
+}
+
 /* ID Map
  *
  * Utility class to keep in sync with blender data.
diff --git a/intern/cycles/bvh/bvh.cpp b/intern/cycles/bvh/bvh.cpp
index 6c636ac5c8d..3c0c5c021c8 100644
--- a/intern/cycles/bvh/bvh.cpp
+++ b/intern/cycles/bvh/bvh.cpp
@@ -77,13 +77,25 @@ bool BVH::cache_read(CacheData& key)
 	key.add(&params, sizeof(params));
 
 	foreach(Object *ob, objects) {
-		key.add(ob->mesh->verts);
-		key.add(ob->mesh->triangles);
-		key.add(ob->mesh->curve_keys);
-		key.add(ob->mesh->curves);
+		Mesh *mesh = ob->mesh;
+
+		key.add(mesh->verts);
+		key.add(mesh->triangles);
+		key.add(mesh->curve_keys);
+		key.add(mesh->curves);
 		key.add(&ob->bounds, sizeof(ob->bounds));
 		key.add(&ob->visibility, sizeof(ob->visibility));
-		key.add(&ob->mesh->transform_applied, sizeof(bool));
+		key.add(&mesh->transform_applied, sizeof(bool));
+
+		if(mesh->use_motion_blur) {
+			Attribute *attr = mesh->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
+			if(attr)
+				key.add(attr->buffer);
+
+			attr = mesh->curve_attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
+			if(attr)
+				key.add(attr->buffer);
+		}
 	}
 
 	CacheData value;
@@ -97,7 +109,7 @@ bool BVH::cache_read(CacheData& key)
 		value.read(pack.nodes);
 		value.read(pack.object_node);
 		value.read(pack.tri_woop);
-		value.read(pack.prim_segment);
+		value.read(pack.prim_type);
 		value.read(pack.prim_visibility);
 		value.read(pack.prim_index);
 		value.read(pack.prim_object);
@@ -119,7 +131,7 @@ void BVH::cache_write(CacheData& key)
 	value.add(pack.nodes);
 	value.add(pack.object_node);
 	value.add(pack.tri_woop);
-	value.add(pack.prim_segment);
+	value.add(pack.prim_type);
 	value.add(pack.prim_visibility);
 	value.add(pack.prim_index);
 	value.add(pack.prim_object);
@@ -165,11 +177,11 @@ void BVH::build(Progress& progress)
 	}
 
 	/* build nodes */
-	vector<int> prim_segment;
+	vector<int> prim_type;
 	vector<int> prim_index;
 	vector<int> prim_object;
 
-	BVHBuild bvh_build(objects, prim_segment, prim_index, prim_object, params, progress);
+	BVHBuild bvh_build(objects, prim_type, prim_index, prim_object, params, progress);
 	BVHNode *root = bvh_build.run();
 
 	if(progress.get_cancel()) {
@@ -178,7 +190,7 @@ void BVH::build(Progress& progress)
 	}
 
 	/* todo: get rid of this copy */
-	pack.prim_segment = prim_segment;
+	pack.prim_type = prim_type;
 	pack.prim_index = prim_index;
 	pack.prim_object = prim_object;
 
@@ -238,9 +250,12 @@ void BVH::refit(Progress& progress)
 
 void BVH::pack_triangle(int idx, float4 woop[3])
 {
-	/* create Woop triangle */
 	int tob = pack.prim_object[idx];
 	const Mesh *mesh = objects[tob]->mesh;
+
+	if(mesh->has_motion_blur())
+		return;
+
 	int tidx = pack.prim_index[idx];
 	const int *vidx = mesh->triangles[tidx].v;
 	const float3* vpos = &mesh->verts[0];
@@ -280,11 +295,11 @@ void BVH::pack_curve_segment(int idx, float4 woop[3])
 	int tob = pack.prim_object[idx];
 	const Mesh *mesh = objects[tob]->mesh;
 	int tidx = pack.prim_index[idx];
-	int segment = pack.prim_segment[idx];
+	int segment = PRIMITIVE_UNPACK_SEGMENT(pack.prim_type[idx]);
 	int k0 = mesh->curves[tidx].first_key + segment;
 	int k1 = mesh->curves[tidx].first_key + segment + 1;
-	float3 v0 = mesh->curve_keys[k0].co;
-	float3 v1 = mesh->curve_keys[k1].co;
+	float3 v0 = float4_to_float3(mesh->curve_keys[k0]);
+	float3 v1 = float4_to_float3(mesh->curve_keys[k1]);
 
 	float3 d0 = v1 - v0;
 	float l =  len(d0);
@@ -324,7 +339,7 @@ void BVH::pack_primitives()
 		if(pack.prim_index[i] != -1) {
 			float4 woop[3];
 
-			if(pack.prim_segment[i] != ~0)
+			if(pack.prim_type[i] & PRIMITIVE_ALL_CURVE)
 				pack_curve_segment(i, woop);
 			else
 				pack_triangle(i, woop);
@@ -335,7 +350,7 @@ void BVH::pack_primitives()
 			Object *ob = objects[tob];
 			pack.prim_visibility[i] = ob->visibility;
 
-			if(pack.prim_segment[i] != ~0)
+			if(pack.prim_type[i] & PRIMITIVE_ALL_CURVE)
 				pack.prim_visibility[i] |= PATH_RAY_CURVE;
 		}
 		else {
@@ -359,7 +374,7 @@ void BVH::pack_instances(size_t nodes_size)
 	 * meshes with transform applied and already in the top level BVH */
 	for(size_t i = 0; i < pack.prim_index.size(); i++)
 		if(pack.prim_index[i] != -1) {
-			if(pack.prim_segment[i] != ~0)
+			if(pack.prim_type[i] & PRIMITIVE_ALL_CURVE)
 				pack.prim_index[i] += objects[pack.prim_object[i]]->mesh->curve_offset;
 			else
 				pack.prim_index[i] += objects[pack.prim_object[i]]->mesh->tri_offset;
@@ -401,7 +416,7 @@ void BVH::pack_instances(size_t nodes_size)
 	mesh_map.clear();
 
 	pack.prim_index.resize(prim_index_size);
-	pack.prim_segment.resize(prim_index_size);
+	pack.prim_type.resize(prim_index_size);
 	pack.prim_object.resize(prim_index_size);
 	pack.prim_visibility.resize(prim_index_size);
 	pack.tri_woop.resize(tri_woop_size);
@@ -409,7 +424,7 @@ void BVH::pack_instances(size_t nodes_size)
 	pack.object_node.resize(objects.size());
 
 	int *pack_prim_index = (pack.prim_index.size())? &pack.prim_index[0]: NULL;
-	int *pack_prim_segment = (pack.prim_segment.size())? &pack.prim_segment[0]: NULL;
+	int *pack_prim_type = (pack.prim_type.size())? &pack.prim_type[0]: NULL;
 	int *pack_prim_object = (pack.prim_object.size())? &pack.prim_object[0]: NULL;
 	uint *pack_prim_visibility = (pack.prim_visibility.size())? &pack.prim_visibility[0]: NULL;
 	float4 *pack_tri_woop = (pack.tri_woop.size())? &pack.tri_woop[0]: NULL;
@@ -454,16 +469,16 @@ void BVH::pack_instances(size_t nodes_size)
 		if(bvh->pack.prim_index.size()) {
 			size_t bvh_prim_index_size = bvh->pack.prim_index.size();
 			int *bvh_prim_index = &bvh->pack.prim_index[0];
-			int *bvh_prim_segment = &bvh->pack.prim_segment[0];
+			int *bvh_prim_type = &bvh->pack.prim_type[0];
 			uint *bvh_prim_visibility = &bvh->pack.prim_visibility[0];
 
 			for(size_t i = 0; i < bvh_prim_index_size; i++) {
-				if(bvh->pack.prim_segment[i] != ~0)
+				if(bvh->pack.prim_type[i] & PRIMITIVE_ALL_CURVE)
 					pack_prim_index[pack_prim_index_offset] = bvh_prim_index[i] + mesh_curve_offset;
 				else
 					pack_prim_index[pack_prim_index_offset] = bvh_prim_index[i] + mesh_tri_offset;
 
-				pack_prim_segment[pack_prim_index_offset] = bvh_prim_segment[i];
+				pack_prim_type[pack_prim_index_offset] = bvh_prim_type[i];
 				pack_prim_visibility[pack_prim_index_offset] = bvh_prim_visibility[i];
 				pack_prim_object[pack_prim_index_offset] = 0;  // unused for instances
 				pack_prim_index_offset++;
@@ -629,37 +644,51 @@ void RegularBVH::refit_node(int idx, bool leaf, BoundBox& bbox, uint& visibility
 				/* primitives */
 				const Mesh *mesh = ob->mesh;
 
-				if(pack.prim_segment[prim] != ~0) {
+				if(pack.prim_type[prim] & PRIMITIVE_ALL_CURVE) {
 					/* curves */
 					int str_offset = (params.top_level)? mesh->curve_offset: 0;
-					int k0 = mesh->curves[pidx - str_offset].first_key + pack.prim_segment[prim]; // XXX!
-					int k1 = k0 + 1;
-
-					float3 p[4];
-					p[0] = mesh->curve_keys[max(k0 - 1,mesh->curves[pidx - str_offset].first_key)].co;
-					p[1] = mesh->curve_keys[k0].co;
-					p[2] = mesh->curve_keys[k1].co;
-					p[3] = mesh->curve_keys[min(k1 + 1,mesh->curves[pidx - str_offset].first_key + mesh->curves[pidx - str_offset].num_keys - 1)].co;
-					float3 lower;
-					float3 upper;
-					curvebounds(&lower.x, &upper.x, p, 0);
-					curvebounds(&lower.y, &upper.y, p, 1);
-					curvebounds(&lower.z, &upper.z, p, 2);
-					float mr = max(mesh->curve_keys[k0].radius,mesh->curve_keys[k1].radius);
-					bbox.grow(lower, mr);
-					bbox.grow(upper, mr);
+					const Mesh::Curve& curve = mesh->curves[pidx - str_offset];
+					int k = PRIMITIVE_UNPACK_SEGMENT(pack.prim_type[prim]);
+
+					curve.bounds_grow(k, &mesh->curve_keys[0], bbox);
 
 					visibility |= PATH_RAY_CURVE;
+
+					/* motion curves */
+					if(mesh->use_motion_blur) {
+						Attribute *attr = mesh->curve_attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
+
+						if(attr) {
+							size_t mesh_size = mesh->curve_keys.size();
+							size_t steps = mesh->motion_steps - 1;
+							float4 *key_steps = attr->data_float4();
+
+							for (size_t i = 0; i < steps; i++)
+								curve.bounds_grow(k, key_steps + i*mesh_size, bbox);
+						}
+					}
 				}
 				else {
 					/* triangles */
 					int tri_offset = (params.top_level)? mesh->tri_offset: 0;
-					const int *vidx = mesh->triangles[pidx - tri_offset].v;
+					const Mesh::Triangle& triangle = mesh->triangles[pidx - tri_offset];
 					const float3 *vpos = &mesh->verts[0];
 
-					bbox.grow(vpos[vidx[0]]);
-					bbox.grow(vpos[vidx[1]]);
-					bbox.grow(vpos[vidx[2]]);
+					triangle.bounds_grow(vpos, bbox);
+
+					/* motion triangles */
+					if(mesh->use_motion_blur) {
+						Attribute *attr = mesh->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
+
+						if(attr) {
+							size_t mesh_size = mesh->verts.size();
+							size_t steps = mesh->motion_steps - 1;
+							float3 *vert_steps = attr->data_float3();
+
+							for (size_t i = 0; i < steps; i++)
+								triangle.bounds_grow(vert_steps + i*mesh_size, bbox);
+						}
+					}
 				}
 			}
 
diff --git a/intern/cycles/bvh/bvh.h b/intern/cycles/bvh/bvh.h
index f2c96638b84..5fcaaaa988c 100644
--- a/intern/cycles/bvh/bvh.h
+++ b/intern/cycles/bvh/bvh.h
@@ -52,8 +52,8 @@ struct PackedBVH {
 	array<int> object_node; 
 	/* precomputed triangle intersection data, one triangle is 4x float4 */
 	array<float4> tri_woop;
-	/* primitive type - triangle or strand (should be moved to flag?) */
-	array<int> prim_segment;
+	/* primitive type - triangle or strand */
+	array<int> prim_type;
 	/* visibility visibilitys for primitives */
 	array<uint> prim_visibility;
 	/* mapping from BVH primitive index to true primitive index, as primitives
diff --git a/intern/cycles/bvh/bvh_binning.cpp b/intern/cycles/bvh/bvh_binning.cpp
index 05a674a47a7..bd37ffbcf38 100644
--- a/intern/cycles/bvh/bvh_binning.cpp
+++ b/intern/cycles/bvh/bvh_binning.cpp
@@ -83,14 +83,14 @@ BVHObjectBinning::BVHObjectBinning(const BVHRange& job, BVHReference *prims)
 			int4 bin1 = get_bin(prim1.bounds());
 
 			/* increase bounds for bins for even primitive */
-			int b00 = extract<0>(bin0); bin_count[b00][0]++; bin_bounds[b00][0].grow(prim0.bounds());
-			int b01 = extract<1>(bin0); bin_count[b01][1]++; bin_bounds[b01][1].grow(prim0.bounds());
-			int b02 = extract<2>(bin0); bin_count[b02][2]++; bin_bounds[b02][2].grow(prim0.bounds());
+			int b00 = (int)extract<0>(bin0); bin_count[b00][0]++; bin_bounds[b00][0].grow(prim0.bounds());
+			int b01 = (int)extract<1>(bin0); bin_count[b01][1]++; bin_bounds[b01][1].grow(prim0.bounds());
+			int b02 = (int)extract<2>(bin0); bin_count[b02][2]++; bin_bounds[b02][2].grow(prim0.bounds());
 
 			/* increase bounds of bins for odd primitive */
-			int b10 = extract<0>(bin1); bin_count[b10][0]++; bin_bounds[b10][0].grow(prim1.bounds());
-			int b11 = extract<1>(bin1); bin_count[b11][1]++; bin_bounds[b11][1].grow(prim1.bounds());
-			int b12 = extract<2>(bin1); bin_count[b12][2]++; bin_bounds[b12][2].grow(prim1.bounds());
+			int b10 = (int)extract<0>(bin1); bin_count[b10][0]++; bin_bounds[b10][0].grow(prim1.bounds());
+			int b11 = (int)extract<1>(bin1); bin_count[b11][1]++; bin_bounds[b11][1].grow(prim1.bounds());
+			int b12 = (int)extract<2>(bin1); bin_count[b12][2]++; bin_bounds[b12][2].grow(prim1.bounds());
 		}
 
 		/* for uneven number of primitives */
@@ -100,9 +100,9 @@ BVHObjectBinning::BVHObjectBinning(const BVHRange& job, BVHReference *prims)
 			int4 bin0 = get_bin(prim0.bounds());
 
 			/* increase bounds of bins */
-			int b00 = extract<0>(bin0); bin_count[b00][0]++; bin_bounds[b00][0].grow(prim0.bounds());
-			int b01 = extract<1>(bin0); bin_count[b01][1]++; bin_bounds[b01][1].grow(prim0.bounds());
-			int b02 = extract<2>(bin0); bin_count[b02][2]++; bin_bounds[b02][2].grow(prim0.bounds());
+			int b00 = (int)extract<0>(bin0); bin_count[b00][0]++; bin_bounds[b00][0].grow(prim0.bounds());
+			int b01 = (int)extract<1>(bin0); bin_count[b01][1]++; bin_bounds[b01][1].grow(prim0.bounds());
+			int b02 = (int)extract<2>(bin0); bin_count[b02][2]++; bin_bounds[b02][2].grow(prim0.bounds());
 		}
 	}
 
diff --git a/intern/cycles/bvh/bvh_build.cpp b/intern/cycles/bvh/bvh_build.cpp
index b21b20a87e5..eb4cca92b6b 100644
--- a/intern/cycles/bvh/bvh_build.cpp
+++ b/intern/cycles/bvh/bvh_build.cpp
@@ -49,10 +49,10 @@ public:
 /* Constructor / Destructor */
 
 BVHBuild::BVHBuild(const vector<Object*>& objects_,
-	vector<int>& prim_segment_, vector<int>& prim_index_, vector<int>& prim_object_,
+	vector<int>& prim_type_, vector<int>& prim_index_, vector<int>& prim_object_,
 	const BVHParams& params_, Progress& progress_)
 : objects(objects_),
-  prim_segment(prim_segment_),
+  prim_type(prim_type_),
   prim_index(prim_index_),
   prim_object(prim_object_),
   params(params_),
@@ -70,45 +70,66 @@ BVHBuild::~BVHBuild()
 
 void BVHBuild::add_reference_mesh(BoundBox& root, BoundBox& center, Mesh *mesh, int i)
 {
+	Attribute *attr_mP = NULL;
+	
+	if(mesh->has_motion_blur())
+		attr_mP = mesh->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
+
 	for(uint j = 0; j < mesh->triangles.size(); j++) {
 		Mesh::Triangle t = mesh->triangles[j];
 		BoundBox bounds = BoundBox::empty;
+		PrimitiveType type = PRIMITIVE_TRIANGLE;
+
+		t.bounds_grow(&mesh->verts[0], bounds);
 
-		for(int k = 0; k < 3; k++) {
-			float3 co = mesh->verts[t.v[k]];
-			bounds.grow(co);
+		/* motion triangles */
+		if(attr_mP) {
+			size_t mesh_size = mesh->verts.size();
+			size_t steps = mesh->motion_steps - 1;
+			float3 *vert_steps = attr_mP->data_float3();
+
+			for(size_t i = 0; i < steps; i++)
+				t.bounds_grow(vert_steps + i*mesh_size, bounds);
+
+			type = PRIMITIVE_MOTION_TRIANGLE;
 		}
 
 		if(bounds.valid()) {
-			references.push_back(BVHReference(bounds, j, i, ~0));
+			references.push_back(BVHReference(bounds, j, i, type));
 			root.grow(bounds);
 			center.grow(bounds.center2());
 		}
 	}
 
+	Attribute *curve_attr_mP = NULL;
+
+	if(mesh->has_motion_blur())
+		curve_attr_mP = mesh->curve_attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
+
 	for(uint j = 0; j < mesh->curves.size(); j++) {
 		Mesh::Curve curve = mesh->curves[j];
+		PrimitiveType type = PRIMITIVE_CURVE;
 
 		for(int k = 0; k < curve.num_keys - 1; k++) {
 			BoundBox bounds = BoundBox::empty;
+			curve.bounds_grow(k, &mesh->curve_keys[0], bounds);
+
+			/* motion curve */
+			if(curve_attr_mP) {
+				size_t mesh_size = mesh->curve_keys.size();
+				size_t steps = mesh->motion_steps - 1;
+				float4 *key_steps = curve_attr_mP->data_float4();
 
-			float3 co[4];
-			co[0] = mesh->curve_keys[max(curve.first_key + k - 1,curve.first_key)].co;
-			co[1] = mesh->curve_keys[curve.first_key + k].co;
-			co[2] = mesh->curve_keys[curve.first_key + k + 1].co;
-			co[3] = mesh->curve_keys[min(curve.first_key + k + 2, curve.first_key + curve.num_keys - 1)].co;
-
-			float3 lower;
-			float3 upper;
-			curvebounds(&lower.x, &upper.x, co, 0);
-			curvebounds(&lower.y, &upper.y, co, 1);
-			curvebounds(&lower.z, &upper.z, co, 2);
-			float mr = max(mesh->curve_keys[curve.first_key + k].radius, mesh->curve_keys[curve.first_key + k + 1].radius);
-			bounds.grow(lower, mr);
-			bounds.grow(upper, mr);
+				for (size_t i = 0; i < steps; i++)
+					curve.bounds_grow(k, key_steps + i*mesh_size, bounds);
+
+				type = PRIMITIVE_MOTION_CURVE;
+			}
 
 			if(bounds.valid()) {
-				references.push_back(BVHReference(bounds, j, i, k));
+				int packed_type = PRIMITIVE_PACK_SEGMENT(type, k);
+				
+				references.push_back(BVHReference(bounds, j, i, packed_type));
 				root.grow(bounds);
 				center.grow(bounds.center2());
 			}
@@ -118,7 +139,7 @@ void BVHBuild::add_reference_mesh(BoundBox& root, BoundBox& center, Mesh *mesh,
 
 void BVHBuild::add_reference_object(BoundBox& root, BoundBox& center, Object *ob, int i)
 {
-	references.push_back(BVHReference(ob->bounds, -1, i, false));
+	references.push_back(BVHReference(ob->bounds, -1, i, 0));
 	root.grow(ob->bounds);
 	center.grow(ob->bounds.center2());
 }
@@ -207,7 +228,7 @@ BVHNode* BVHBuild::run()
 	progress_total = references.size();
 	progress_original_total = progress_total;
 
-	prim_segment.resize(references.size());
+	prim_type.resize(references.size());
 	prim_index.resize(references.size());
 	prim_object.resize(references.size());
 
@@ -277,18 +298,41 @@ void BVHBuild::thread_build_node(InnerNode *inner, int child, BVHObjectBinning *
 	}
 }
 
+/* check if all references in range fit in one leaf, respecting the separate triangle and curve limits */
+bool BVHBuild::range_within_max_leaf_size(const BVHRange& range)
+{
+	size_t size = range.size();
+	size_t max_leaf_size = max(params.max_triangle_leaf_size, params.max_curve_leaf_size);
+
+	if(size > max_leaf_size)
+		return false;
+	
+	size_t num_triangles = 0;
+	size_t num_curves = 0;
+	/* count references per primitive class, the curve test comes first as in the original check order */
+	for(size_t i = 0; i < size; i++) {
+		BVHReference& ref = references[range.start() + i];
+		if(ref.prim_type() & PRIMITIVE_ALL_CURVE)
+			num_curves++;
+		else if(ref.prim_type() & PRIMITIVE_ALL_TRIANGLE)
+			num_triangles++;
+	}
+	/* limits are inclusive maxima, matching the size > max_leaf_size early out above */
+	return (num_triangles <= (size_t)params.max_triangle_leaf_size) && (num_curves <= (size_t)params.max_curve_leaf_size);
+}
+
 /* multithreaded binning builder */
 BVHNode* BVHBuild::build_node(const BVHObjectBinning& range, int level)
 {
 	size_t size = range.size();
-	float leafSAH = params.sah_triangle_cost * range.leafSAH;
-	float splitSAH = params.sah_node_cost * range.bounds().half_area() + params.sah_triangle_cost * range.splitSAH;
+	float leafSAH = params.sah_primitive_cost * range.leafSAH;
+	float splitSAH = params.sah_node_cost * range.bounds().half_area() + params.sah_primitive_cost * range.splitSAH;
 
 	/* have at least one inner node on top level, for performance and correct
 	 * visibility tests, since object instances do not check visibility flag */
 	if(!(range.size() > 0 && params.top_level && level == 0)) {
 		/* make leaf node when threshold reached or SAH tells us */
-		if(params.small_enough_for_leaf(size, level) || (size <= params.max_leaf_size && leafSAH < splitSAH))
+		if(params.small_enough_for_leaf(size, level) || (range_within_max_leaf_size(range) && leafSAH < splitSAH))
 			return create_leaf_node(range);
 	}
 
@@ -373,12 +417,12 @@ BVHNode *BVHBuild::create_object_leaf_nodes(const BVHReference *ref, int start,
 		if(start == prim_index.size()) {
 			assert(params.use_spatial_split);
 
-			prim_segment.push_back(ref->prim_segment());
+			prim_type.push_back(ref->prim_type());
 			prim_index.push_back(ref->prim_index());
 			prim_object.push_back(ref->prim_object());
 		}
 		else {
-			prim_segment[start] = ref->prim_segment();
+			prim_type[start] = ref->prim_type();
 			prim_index[start] = ref->prim_index();
 			prim_object[start] = ref->prim_object();
 		}
@@ -401,7 +445,7 @@ BVHNode *BVHBuild::create_object_leaf_nodes(const BVHReference *ref, int start,
 
 BVHNode* BVHBuild::create_leaf_node(const BVHRange& range)
 {
-	vector<int>& p_segment = prim_segment;
+	vector<int>& p_type = prim_type;
 	vector<int>& p_index = prim_index;
 	vector<int>& p_object = prim_object;
 	BoundBox bounds = BoundBox::empty;
@@ -415,12 +459,12 @@ BVHNode* BVHBuild::create_leaf_node(const BVHRange& range)
 			if(range.start() + num == prim_index.size()) {
 				assert(params.use_spatial_split);
 
-				p_segment.push_back(ref.prim_segment());
+				p_type.push_back(ref.prim_type());
 				p_index.push_back(ref.prim_index());
 				p_object.push_back(ref.prim_object());
 			}
 			else {
-				p_segment[range.start() + num] = ref.prim_segment();
+				p_type[range.start() + num] = ref.prim_type();
 				p_index[range.start() + num] = ref.prim_index();
 				p_object[range.start() + num] = ref.prim_object();
 			}
@@ -490,7 +534,7 @@ void BVHBuild::rotate(BVHNode *node, int max_depth)
 	/* find best rotation. we pick a target child of a first child, and swap
 	 * this with an other child. we perform the best such swap. */
 	float best_cost = FLT_MAX;
-	int best_child = -1, bets_target = -1, best_other = -1;
+	int best_child = -1, best_target = -1, best_other = -1;
 
 	for(size_t c = 0; c < 2; c++) {
 		/* ignore leaf nodes as we cannot descent into */
@@ -514,11 +558,11 @@ void BVHBuild::rotate(BVHNode *node, int max_depth)
 
 			if(cost0 < cost1) {
 				best_cost = cost0;
-				bets_target = 0;
+				best_target = 0;
 			}
 			else {
 				best_cost = cost0;
-				bets_target = 1;
+				best_target = 1;
 			}
 		}
 	}
@@ -527,10 +571,13 @@ void BVHBuild::rotate(BVHNode *node, int max_depth)
 	if(best_cost >= 0)
 		return;
 
+	assert(best_child == 0 || best_child == 1);
+	assert(best_target != -1);
+
 	/* perform the best found tree rotation */
 	InnerNode *child = (InnerNode*)parent->children[best_child];
 
-	swap(parent->children[best_other], child->children[bets_target]);
+	swap(parent->children[best_other], child->children[best_target]);
 	child->m_bounds = merge(child->children[0]->m_bounds, child->children[1]->m_bounds);
 }
 
diff --git a/intern/cycles/bvh/bvh_build.h b/intern/cycles/bvh/bvh_build.h
index 3df4da1739a..a6b9916de9b 100644
--- a/intern/cycles/bvh/bvh_build.h
+++ b/intern/cycles/bvh/bvh_build.h
@@ -44,7 +44,7 @@ public:
 	/* Constructor/Destructor */
 	BVHBuild(
 		const vector<Object*>& objects,
-		vector<int>& prim_segment,
+		vector<int>& prim_type,
 		vector<int>& prim_index,
 		vector<int>& prim_object,
 		const BVHParams& params,
@@ -70,6 +70,8 @@ protected:
 	BVHNode *create_leaf_node(const BVHRange& range);
 	BVHNode *create_object_leaf_nodes(const BVHReference *ref, int start, int num);
 
+	bool range_within_max_leaf_size(const BVHRange& range);
+
 	/* threads */
 	enum { THREAD_TASK_SIZE = 4096 };
 	void thread_build_node(InnerNode *node, int child, BVHObjectBinning *range, int level);
@@ -88,7 +90,7 @@ protected:
 	int num_original_references;
 
 	/* output primitive indexes and objects */
-	vector<int>& prim_segment;
+	vector<int>& prim_type;
 	vector<int>& prim_index;
 	vector<int>& prim_object;
 
diff --git a/intern/cycles/bvh/bvh_params.h b/intern/cycles/bvh/bvh_params.h
index ad36bdfa326..ed67690a07f 100644
--- a/intern/cycles/bvh/bvh_params.h
+++ b/intern/cycles/bvh/bvh_params.h
@@ -33,11 +33,12 @@ public:
 
 	/* SAH costs */
 	float sah_node_cost;
-	float sah_triangle_cost;
+	float sah_primitive_cost;
 
-	/* number of triangles in leaf */
+	/* number of primitives in leaf */
 	int min_leaf_size;
-	int max_leaf_size;
+	int max_triangle_leaf_size;
+	int max_curve_leaf_size;
 
 	/* object or mesh level bvh */
 	int top_level;
@@ -62,11 +63,14 @@ public:
 		use_spatial_split = true;
 		spatial_split_alpha = 1e-5f;
 
+		/* todo: see if splitting up primitive cost to be separate for triangles
+		 * and curves can help. so far in tests it doesn't help, but why? */
 		sah_node_cost = 1.0f;
-		sah_triangle_cost = 1.0f;
+		sah_primitive_cost = 1.0f;
 
 		min_leaf_size = 1;
-		max_leaf_size = 8;
+		max_triangle_leaf_size = 8;
+		max_curve_leaf_size = 2;
 
 		top_level = false;
 		use_cache = false;
@@ -75,11 +79,11 @@ public:
 	}
 
 	/* SAH costs */
-	__forceinline float cost(int num_nodes, int num_tris) const
-	{ return node_cost(num_nodes) + triangle_cost(num_tris); }
+	__forceinline float cost(int num_nodes, int num_primitives) const
+	{ return node_cost(num_nodes) + primitive_cost(num_primitives); }
 
-	__forceinline float triangle_cost(int n) const
-	{ return n*sah_triangle_cost; }
+	__forceinline float primitive_cost(int n) const
+	{ return n*sah_primitive_cost; }
 
 	__forceinline float node_cost(int n) const
 	{ return n*sah_node_cost; }
@@ -98,22 +102,22 @@ class BVHReference
 public:
 	__forceinline BVHReference() {}
 
-	__forceinline BVHReference(const BoundBox& bounds_, int prim_index_, int prim_object_, int prim_segment)
+	__forceinline BVHReference(const BoundBox& bounds_, int prim_index_, int prim_object_, int prim_type)
 	: rbounds(bounds_)
 	{
 		rbounds.min.w = __int_as_float(prim_index_);
 		rbounds.max.w = __int_as_float(prim_object_);
-		segment = prim_segment;
+		type = prim_type;
 	}
 
 	__forceinline const BoundBox& bounds() const { return rbounds; }
 	__forceinline int prim_index() const { return __float_as_int(rbounds.min.w); }
 	__forceinline int prim_object() const { return __float_as_int(rbounds.max.w); }
-	__forceinline int prim_segment() const { return segment; }
+	__forceinline int prim_type() const { return type; }
 
 protected:
 	BoundBox rbounds;
-	uint segment;
+	uint type;
 };
 
 /* BVH Range
diff --git a/intern/cycles/bvh/bvh_sort.cpp b/intern/cycles/bvh/bvh_sort.cpp
index d7dbae36336..3140bf23376 100644
--- a/intern/cycles/bvh/bvh_sort.cpp
+++ b/intern/cycles/bvh/bvh_sort.cpp
@@ -52,8 +52,8 @@ public:
 		else if(ra.prim_object() > rb.prim_object()) return false;
 		else if(ra.prim_index() < rb.prim_index()) return true;
 		else if(ra.prim_index() > rb.prim_index()) return false;
-		else if(ra.prim_segment() < rb.prim_segment()) return true;
-		else if(ra.prim_segment() > rb.prim_segment()) return false;
+		else if(ra.prim_type() < rb.prim_type()) return true;
+		else if(ra.prim_type() > rb.prim_type()) return false;
 
 		return false;
 	}
diff --git a/intern/cycles/bvh/bvh_split.cpp b/intern/cycles/bvh/bvh_split.cpp
index 03ff69d7b6d..07c35c08c18 100644
--- a/intern/cycles/bvh/bvh_split.cpp
+++ b/intern/cycles/bvh/bvh_split.cpp
@@ -54,8 +54,8 @@ BVHObjectSplit::BVHObjectSplit(BVHBuild *builder, const BVHRange& range, float n
 			right_bounds = builder->spatial_right_bounds[i - 1];
 
 			float sah = nodeSAH +
-				left_bounds.safe_area() * builder->params.triangle_cost(i) +
-				right_bounds.safe_area() * builder->params.triangle_cost(range.size() - i);
+				left_bounds.safe_area() * builder->params.primitive_cost(i) +
+				right_bounds.safe_area() * builder->params.primitive_cost(range.size() - i);
 
 			if(sah < min_sah) {
 				min_sah = sah;
@@ -150,8 +150,8 @@ BVHSpatialSplit::BVHSpatialSplit(BVHBuild *builder, const BVHRange& range, float
 			rightNum -= builder->spatial_bins[dim][i - 1].exit;
 
 			float sah = nodeSAH +
-				left_bounds.safe_area() * builder->params.triangle_cost(leftNum) +
-				builder->spatial_right_bounds[i - 1].safe_area() * builder->params.triangle_cost(rightNum);
+				left_bounds.safe_area() * builder->params.primitive_cost(leftNum) +
+				builder->spatial_right_bounds[i - 1].safe_area() * builder->params.primitive_cost(rightNum);
 
 			if(sah < this->sah) {
 				this->sah = sah;
@@ -209,10 +209,10 @@ void BVHSpatialSplit::split(BVHBuild *builder, BVHRange& left, BVHRange& right,
 		ldb.grow(lref.bounds());
 		rdb.grow(rref.bounds());
 
-		float lac = builder->params.triangle_cost(left_end - left_start);
-		float rac = builder->params.triangle_cost(right_end - right_start);
-		float lbc = builder->params.triangle_cost(left_end - left_start + 1);
-		float rbc = builder->params.triangle_cost(right_end - right_start + 1);
+		float lac = builder->params.primitive_cost(left_end - left_start);
+		float rac = builder->params.primitive_cost(right_end - right_start);
+		float lbc = builder->params.primitive_cost(left_end - left_start + 1);
+		float rbc = builder->params.primitive_cost(right_end - right_start + 1);
 
 		float unsplitLeftSAH = lub.safe_area() * lbc + right_bounds.safe_area() * rac;
 		float unsplitRightSAH = left_bounds.safe_area() * lac + rub.safe_area() * rbc;
@@ -253,7 +253,7 @@ void BVHSpatialSplit::split_reference(BVHBuild *builder, BVHReference& left, BVH
 	Object *ob = builder->objects[ref.prim_object()];
 	const Mesh *mesh = ob->mesh;
 
-	if (ref.prim_segment() == ~0) {
+	if (ref.prim_type() & PRIMITIVE_ALL_TRIANGLE) {
 		const int *inds = mesh->triangles[ref.prim_index()].v;
 		const float3 *verts = &mesh->verts[0];
 		const float3* v1 = &verts[inds[2]];
@@ -282,30 +282,32 @@ void BVHSpatialSplit::split_reference(BVHBuild *builder, BVHReference& left, BVH
 	}
 	else {
 		/* curve split: NOTE - Currently ignores curve width and needs to be fixed.*/
-		const int k0 = mesh->curves[ref.prim_index()].first_key + ref.prim_segment();
+		const int k0 = mesh->curves[ref.prim_index()].first_key + PRIMITIVE_UNPACK_SEGMENT(ref.prim_type());
 		const int k1 = k0 + 1;
-		const float3* v0 = &mesh->curve_keys[k0].co;
-		const float3* v1 = &mesh->curve_keys[k1].co;
+		const float4 key0 = mesh->curve_keys[k0];
+		const float4 key1 = mesh->curve_keys[k1];
+		const float3 v0 = float4_to_float3(key0);
+		const float3 v1 = float4_to_float3(key1);
 
-		float v0p = (*v0)[dim];
-		float v1p = (*v1)[dim];
+		float v0p = v0[dim];
+		float v1p = v1[dim];
 
 		/* insert vertex to the boxes it belongs to. */
 		if(v0p <= pos)
-			left_bounds.grow(*v0);
+			left_bounds.grow(v0);
 
 		if(v0p >= pos)
-			right_bounds.grow(*v0);
+			right_bounds.grow(v0);
 
 		if(v1p <= pos)
-			left_bounds.grow(*v1);
+			left_bounds.grow(v1);
 
 		if(v1p >= pos)
-			right_bounds.grow(*v1);
+			right_bounds.grow(v1);
 
 		/* edge intersects the plane => insert intersection to both boxes. */
 		if((v0p < pos && v1p > pos) || (v0p > pos && v1p < pos)) {
-			float3 t = lerp(*v0, *v1, clamp((pos - v0p) / (v1p - v0p), 0.0f, 1.0f));
+			float3 t = lerp(v0, v1, clamp((pos - v0p) / (v1p - v0p), 0.0f, 1.0f));
 			left_bounds.grow(t);
 			right_bounds.grow(t);
 		}
@@ -318,8 +320,8 @@ void BVHSpatialSplit::split_reference(BVHBuild *builder, BVHReference& left, BVH
 	right_bounds.intersect(ref.bounds());
 
 	/* set references */
-	left = BVHReference(left_bounds, ref.prim_index(), ref.prim_object(), ref.prim_segment());
-	right = BVHReference(right_bounds, ref.prim_index(), ref.prim_object(), ref.prim_segment());
+	left = BVHReference(left_bounds, ref.prim_index(), ref.prim_object(), ref.prim_type());
+	right = BVHReference(right_bounds, ref.prim_index(), ref.prim_object(), ref.prim_type());
 }
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/bvh/bvh_split.h b/intern/cycles/bvh/bvh_split.h
index 1f4befbe8e2..5b739311e5f 100644
--- a/intern/cycles/bvh/bvh_split.h
+++ b/intern/cycles/bvh/bvh_split.h
@@ -77,7 +77,7 @@ public:
 		/* find split candidates. */
 		float area = range.bounds().safe_area();
 
-		leafSAH = area * builder->params.triangle_cost(range.size());
+		leafSAH = area * builder->params.primitive_cost(range.size());
 		nodeSAH = area * builder->params.node_cost(2);
 
 		object = BVHObjectSplit(builder, range, nodeSAH);
@@ -92,7 +92,7 @@ public:
 
 		/* leaf SAH is the lowest => create leaf. */
 		minSAH = min(min(leafSAH, object.sah), spatial.sah);
-		no_split = (minSAH == leafSAH && range.size() <= builder->params.max_leaf_size);
+		no_split = (minSAH == leafSAH && builder->range_within_max_leaf_size(range));
 	}
 
 	__forceinline void split(BVHBuild *builder, BVHRange& left, BVHRange& right, const BVHRange& range)
diff --git a/intern/cycles/device/device.cpp b/intern/cycles/device/device.cpp
index 9d60d062b8e..d9e68742c53 100644
--- a/intern/cycles/device/device.cpp
+++ b/intern/cycles/device/device.cpp
@@ -53,7 +53,8 @@ void Device::pixels_free(device_memory& mem)
 	mem_free(mem);
 }
 
-void Device::draw_pixels(device_memory& rgba, int y, int w, int h, int dy, int width, int height, bool transparent)
+void Device::draw_pixels(device_memory& rgba, int y, int w, int h, int dy, int width, int height, bool transparent,
+	const DeviceDrawParams &draw_params)
 {
 	pixels_copy_from(rgba, y, w, h);
 
@@ -80,6 +81,10 @@ void Device::draw_pixels(device_memory& rgba, int y, int w, int h, int dy, int w
 
 		glEnable(GL_TEXTURE_2D);
 
+		if(draw_params.bind_display_space_shader_cb) {
+			draw_params.bind_display_space_shader_cb();
+		}
+
 		glPushMatrix();
 		glTranslatef(0.0f, (float)dy, 0.0f);
 
@@ -98,6 +103,10 @@ void Device::draw_pixels(device_memory& rgba, int y, int w, int h, int dy, int w
 
 		glPopMatrix();
 
+		if(draw_params.unbind_display_space_shader_cb) {
+			draw_params.unbind_display_space_shader_cb();
+		}
+
 		glBindTexture(GL_TEXTURE_2D, 0);
 		glDisable(GL_TEXTURE_2D);
 		glDeleteTextures(1, &texid);
diff --git a/intern/cycles/device/device.h b/intern/cycles/device/device.h
index bd309e35788..bcddd4f73e2 100644
--- a/intern/cycles/device/device.h
+++ b/intern/cycles/device/device.h
@@ -54,6 +54,7 @@ public:
 	bool display_device;
 	bool advanced_shading;
 	bool pack_images;
+	bool extended_images; /* flag for GPU and Multi device */
 	vector<DeviceInfo> multi_devices;
 
 	DeviceInfo()
@@ -64,11 +65,17 @@ public:
 		display_device = false;
 		advanced_shading = true;
 		pack_images = false;
+		extended_images = false;
 	}
 };
 
 /* Device */
 
+struct DeviceDrawParams { /* optional callbacks handed to Device::draw_pixels() */
+	boost::function<void(void)> bind_display_space_shader_cb; /* if set, called by draw_pixels() before the pixels are drawn */
+	boost::function<void(void)> unbind_display_space_shader_cb; /* if set, called by draw_pixels() after the pixels are drawn */
+};
+
 class Device {
 protected:
 	Device(DeviceInfo& info_, Stats &stats_, bool background) : background(background), info(info_), stats(stats_) {}
@@ -100,7 +107,7 @@ public:
 
 	/* texture memory */
 	virtual void tex_alloc(const char *name, device_memory& mem,
-		bool interpolation = false, bool periodic = false) {};
+		InterpolationType interpolation = INTERPOLATION_NONE, bool periodic = false) {};
 	virtual void tex_free(device_memory& mem) {};
 
 	/* pixel memory */
@@ -121,7 +128,8 @@ public:
 	
 	/* opengl drawing */
 	virtual void draw_pixels(device_memory& mem, int y, int w, int h,
-		int dy, int width, int height, bool transparent);
+		int dy, int width, int height, bool transparent,
+		const DeviceDrawParams &draw_params);
 
 #ifdef WITH_NETWORK
 	/* networking */
diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp
index 76123fe44d2..c9cc7592028 100644
--- a/intern/cycles/device/device_cpu.cpp
+++ b/intern/cycles/device/device_cpu.cpp
@@ -103,9 +103,9 @@ public:
 		kernel_const_copy(&kernel_globals, name, host, size);
 	}
 
-	void tex_alloc(const char *name, device_memory& mem, bool interpolation, bool periodic)
+	void tex_alloc(const char *name, device_memory& mem, InterpolationType interpolation, bool periodic)
 	{
-		kernel_tex_copy(&kernel_globals, name, mem.data_pointer, mem.data_width, mem.data_height);
+		kernel_tex_copy(&kernel_globals, name, mem.data_pointer, mem.data_width, mem.data_height, mem.data_depth, interpolation);
 		mem.device_pointer = mem.data_pointer;
 
 		stats.mem_alloc(mem.memory_size());
@@ -395,7 +395,7 @@ public:
 			for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) {
 				kernel_cpu_avx_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x);
 
-				if(task_pool.canceled())
+				if(task.get_cancel() || task_pool.canceled())
 					break;
 			}
 		}
@@ -406,7 +406,7 @@ public:
 			for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) {
 				kernel_cpu_sse41_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x);
 
-				if(task_pool.canceled())
+				if(task.get_cancel() || task_pool.canceled())
 					break;
 			}
 		}
@@ -417,7 +417,7 @@ public:
 			for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) {
 				kernel_cpu_sse3_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x);
 
-				if(task_pool.canceled())
+				if(task.get_cancel() || task_pool.canceled())
 					break;
 			}
 		}
@@ -428,7 +428,7 @@ public:
 			for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) {
 				kernel_cpu_sse2_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x);
 
-				if(task_pool.canceled())
+				if(task.get_cancel() || task_pool.canceled())
 					break;
 			}
 		}
@@ -438,7 +438,7 @@ public:
 			for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) {
 				kernel_cpu_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x);
 
-				if(task_pool.canceled())
+				if(task.get_cancel() || task_pool.canceled())
 					break;
 			}
 		}
diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp
index 107ca16c4d2..93b89dc38d9 100644
--- a/intern/cycles/device/device_cuda.cpp
+++ b/intern/cycles/device/device_cuda.cpp
@@ -48,6 +48,7 @@ public:
 	int cuDevArchitecture;
 	bool first_error;
 	bool use_texture_storage;
+	unsigned int target_update_frequency;
 
 	struct PixelMem {
 		GLuint cuPBO;
@@ -138,7 +139,7 @@ public:
 			/*cuda_abort();*/ \
 			cuda_error_documentation(); \
 		} \
-	}
+	} (void)0
 
 	bool cuda_error_(CUresult result, const string& stmt)
 	{
@@ -165,7 +166,7 @@ public:
 
 	void cuda_push_context()
 	{
-		cuda_assert(cuCtxSetCurrent(cuContext))
+		cuda_assert(cuCtxSetCurrent(cuContext));
 	}
 
 	void cuda_pop_context()
@@ -173,12 +174,14 @@ public:
 		cuda_assert(cuCtxSetCurrent(NULL));
 	}
 
-    CUDADevice(DeviceInfo& info, Stats &stats, bool background_)
+	CUDADevice(DeviceInfo& info, Stats &stats, bool background_)
 	: Device(info, stats, background_)
 	{
 		first_error = true;
 		background = background_;
 		use_texture_storage = true;
+		/* we try an update / sync every 1000 ms */
+		target_update_frequency = 1000;
 
 		cuDevId = info.num;
 		cuDevice = 0;
@@ -209,8 +212,8 @@ public:
 		if(cuda_error_(result, "cuCtxCreate"))
 			return;
 
-		cuda_assert(cuStreamCreate(&cuStream, 0))
-		cuda_assert(cuEventCreate(&tileDone, 0x1))
+		cuda_assert(cuStreamCreate(&cuStream, 0));
+		cuda_assert(cuEventCreate(&tileDone, 0x1));
 
 		int major, minor;
 		cuDeviceComputeCapability(&major, &minor, cuDevId);
@@ -219,7 +222,7 @@ public:
 		/* In order to use full 6GB of memory on Titan cards, use arrays instead
 		 * of textures. On earlier cards this seems slower, but on Titan it is
 		 * actually slightly faster in tests. */
-		use_texture_storage = (cuDevArchitecture < 350);
+		use_texture_storage = (cuDevArchitecture < 300);
 
 		cuda_pop_context();
 	}
@@ -228,21 +231,22 @@ public:
 	{
 		task_pool.stop();
 
-		cuda_assert(cuEventDestroy(tileDone))
-		cuda_assert(cuStreamDestroy(cuStream))
-		cuda_assert(cuCtxDestroy(cuContext))
+		cuda_assert(cuEventDestroy(tileDone));
+		cuda_assert(cuStreamDestroy(cuStream));
+		cuda_assert(cuCtxDestroy(cuContext));
 	}
 
-	bool support_device(bool experimental)
+	bool support_device(bool experimental, bool branched)
 	{
 		int major, minor;
 		cuDeviceComputeCapability(&major, &minor, cuDevId);
-
+		
+		/* We only support sm_20 and above */
 		if(major < 2) {
 			cuda_error_message(string_printf("CUDA device supported only with compute capability 2.0 or up, found %d.%d.", major, minor));
 			return false;
 		}
-
+		
 		return true;
 	}
 
@@ -293,28 +297,16 @@ public:
 			return "";
 		}
 		if(cuda_version < 50) {
-			printf("Unsupported CUDA version %d.%d detected, you need CUDA 5.0.\n", cuda_version/10, cuda_version%10);
+			printf("Unsupported CUDA version %d.%d detected, you need CUDA 6.0.\n", cuda_version/10, cuda_version%10);
 			return "";
 		}
-
-		else if(cuda_version > 50)
-			printf("CUDA version %d.%d detected, build may succeed but only CUDA 5.0 is officially supported.\n", cuda_version/10, cuda_version%10);
+		else if(cuda_version != 60)
+			printf("CUDA version %d.%d detected, build may succeed but only CUDA 6.0 is officially supported.\n", cuda_version/10, cuda_version%10);
 
 		/* compile */
 		string kernel = path_join(kernel_path, "kernel.cu");
 		string include = kernel_path;
 		const int machine = system_cpu_bits();
-		string arch_flags;
-
-		/* CUDA 5.x build flags for different archs */
-		if(major == 2) {
-			/* sm_2x */
-			arch_flags = "--maxrregcount=32 --use_fast_math";
-		}
-		else if(major == 3) {
-			/* sm_3x */
-			arch_flags = "--maxrregcount=32 --use_fast_math";
-		}
 
 		double starttime = time_dt();
 		printf("Compiling CUDA kernel ...\n");
@@ -322,8 +314,8 @@ public:
 		path_create_directories(cubin);
 
 		string command = string_printf("\"%s\" -arch=sm_%d%d -m%d --cubin \"%s\" "
-			"-o \"%s\" --ptxas-options=\"-v\" %s -I\"%s\" -DNVCC -D__KERNEL_CUDA_VERSION__=%d",
-			nvcc.c_str(), major, minor, machine, kernel.c_str(), cubin.c_str(), arch_flags.c_str(), include.c_str(), cuda_version);
+			"-o \"%s\" --ptxas-options=\"-v\" -I\"%s\" -DNVCC -D__KERNEL_CUDA_VERSION__=%d",
+			nvcc.c_str(), major, minor, machine, kernel.c_str(), cubin.c_str(), include.c_str(), cuda_version);
 
 		printf("%s\n", command.c_str());
 
@@ -349,8 +341,8 @@ public:
 		if(cuContext == 0)
 			return false;
 		
-		/* check if GPU is supported with current feature set */
-		if(!support_device(experimental))
+		/* check if GPU is supported */
+		if(!support_device(experimental, false))
 			return false;
 
 		/* get kernel */
@@ -383,7 +375,7 @@ public:
 		cuda_push_context();
 		CUdeviceptr device_pointer;
 		size_t size = mem.memory_size();
-		cuda_assert(cuMemAlloc(&device_pointer, size))
+		cuda_assert(cuMemAlloc(&device_pointer, size));
 		mem.device_pointer = (device_ptr)device_pointer;
 		stats.mem_alloc(size);
 		cuda_pop_context();
@@ -393,7 +385,7 @@ public:
 	{
 		cuda_push_context();
 		if(mem.device_pointer)
-			cuda_assert(cuMemcpyHtoD(cuda_device_ptr(mem.device_pointer), (void*)mem.data_pointer, mem.memory_size()))
+			cuda_assert(cuMemcpyHtoD(cuda_device_ptr(mem.device_pointer), (void*)mem.data_pointer, mem.memory_size()));
 		cuda_pop_context();
 	}
 
@@ -405,7 +397,7 @@ public:
 		cuda_push_context();
 		if(mem.device_pointer) {
 			cuda_assert(cuMemcpyDtoH((uchar*)mem.data_pointer + offset,
-				(CUdeviceptr)((uchar*)mem.device_pointer + offset), size))
+			                         (CUdeviceptr)((uchar*)mem.device_pointer + offset), size));
 		}
 		else {
 			memset((char*)mem.data_pointer + offset, 0, size);
@@ -419,7 +411,7 @@ public:
 
 		cuda_push_context();
 		if(mem.device_pointer)
-			cuda_assert(cuMemsetD8(cuda_device_ptr(mem.device_pointer), 0, mem.memory_size()))
+			cuda_assert(cuMemsetD8(cuda_device_ptr(mem.device_pointer), 0, mem.memory_size()));
 		cuda_pop_context();
 	}
 
@@ -427,7 +419,7 @@ public:
 	{
 		if(mem.device_pointer) {
 			cuda_push_context();
-			cuda_assert(cuMemFree(cuda_device_ptr(mem.device_pointer)))
+			cuda_assert(cuMemFree(cuda_device_ptr(mem.device_pointer)));
 			cuda_pop_context();
 
 			mem.device_pointer = 0;
@@ -442,19 +434,21 @@ public:
 		size_t bytes;
 
 		cuda_push_context();
-		cuda_assert(cuModuleGetGlobal(&mem, &bytes, cuModule, name))
+		cuda_assert(cuModuleGetGlobal(&mem, &bytes, cuModule, name));
 		//assert(bytes == size);
-		cuda_assert(cuMemcpyHtoD(mem, host, size))
+		cuda_assert(cuMemcpyHtoD(mem, host, size));
 		cuda_pop_context();
 	}
 
-	void tex_alloc(const char *name, device_memory& mem, bool interpolation, bool periodic)
+	void tex_alloc(const char *name, device_memory& mem, InterpolationType interpolation, bool periodic)
 	{
+		/* todo: support 3D textures, only CPU for now */
+
 		/* determine format */
 		CUarray_format_enum format;
 		size_t dsize = datatype_size(mem.data_type);
 		size_t size = mem.memory_size();
-		bool use_texture = interpolation || use_texture_storage;
+		bool use_texture = (interpolation != INTERPOLATION_NONE) || use_texture_storage;
 
 		if(use_texture) {
 
@@ -469,14 +463,14 @@ public:
 			CUtexref texref = NULL;
 
 			cuda_push_context();
-			cuda_assert(cuModuleGetTexRef(&texref, cuModule, name))
+			cuda_assert(cuModuleGetTexRef(&texref, cuModule, name));
 
 			if(!texref) {
 				cuda_pop_context();
 				return;
 			}
 
-			if(interpolation) {
+			if(interpolation != INTERPOLATION_NONE) {
 				CUarray handle = NULL;
 				CUDA_ARRAY_DESCRIPTOR desc;
 
@@ -485,7 +479,7 @@ public:
 				desc.Format = format;
 				desc.NumChannels = mem.data_elements;
 
-				cuda_assert(cuArrayCreate(&handle, &desc))
+				cuda_assert(cuArrayCreate(&handle, &desc));
 
 				if(!handle) {
 					cuda_pop_context();
@@ -503,15 +497,23 @@ public:
 					param.WidthInBytes = param.srcPitch;
 					param.Height = mem.data_height;
 
-					cuda_assert(cuMemcpy2D(&param))
+					cuda_assert(cuMemcpy2D(&param));
 				}
 				else
-					cuda_assert(cuMemcpyHtoA(handle, 0, (void*)mem.data_pointer, size))
+					cuda_assert(cuMemcpyHtoA(handle, 0, (void*)mem.data_pointer, size));
 
-				cuda_assert(cuTexRefSetArray(texref, handle, CU_TRSA_OVERRIDE_FORMAT))
+				cuda_assert(cuTexRefSetArray(texref, handle, CU_TRSA_OVERRIDE_FORMAT));
 
-				cuda_assert(cuTexRefSetFilterMode(texref, CU_TR_FILTER_MODE_LINEAR))
-				cuda_assert(cuTexRefSetFlags(texref, CU_TRSF_NORMALIZED_COORDINATES))
+				if(interpolation == INTERPOLATION_CLOSEST) {
+					cuda_assert(cuTexRefSetFilterMode(texref, CU_TR_FILTER_MODE_POINT));
+				}
+				else if (interpolation == INTERPOLATION_LINEAR) {
+					cuda_assert(cuTexRefSetFilterMode(texref, CU_TR_FILTER_MODE_LINEAR));
+				}
+				else {/* CUBIC and SMART are unsupported for CUDA */
+					cuda_assert(cuTexRefSetFilterMode(texref, CU_TR_FILTER_MODE_LINEAR));
+				}
+				cuda_assert(cuTexRefSetFlags(texref, CU_TRSF_NORMALIZED_COORDINATES));
 
 				mem.device_pointer = (device_ptr)handle;
 
@@ -525,20 +527,20 @@ public:
 
 				cuda_push_context();
 
-				cuda_assert(cuTexRefSetAddress(NULL, texref, cuda_device_ptr(mem.device_pointer), size))
-				cuda_assert(cuTexRefSetFilterMode(texref, CU_TR_FILTER_MODE_POINT))
-				cuda_assert(cuTexRefSetFlags(texref, CU_TRSF_READ_AS_INTEGER))
+				cuda_assert(cuTexRefSetAddress(NULL, texref, cuda_device_ptr(mem.device_pointer), size));
+				cuda_assert(cuTexRefSetFilterMode(texref, CU_TR_FILTER_MODE_POINT));
+				cuda_assert(cuTexRefSetFlags(texref, CU_TRSF_READ_AS_INTEGER));
 			}
 
 			if(periodic) {
-				cuda_assert(cuTexRefSetAddressMode(texref, 0, CU_TR_ADDRESS_MODE_WRAP))
-				cuda_assert(cuTexRefSetAddressMode(texref, 1, CU_TR_ADDRESS_MODE_WRAP))
+				cuda_assert(cuTexRefSetAddressMode(texref, 0, CU_TR_ADDRESS_MODE_WRAP));
+				cuda_assert(cuTexRefSetAddressMode(texref, 1, CU_TR_ADDRESS_MODE_WRAP));
 			}
 			else {
-				cuda_assert(cuTexRefSetAddressMode(texref, 0, CU_TR_ADDRESS_MODE_CLAMP))
-				cuda_assert(cuTexRefSetAddressMode(texref, 1, CU_TR_ADDRESS_MODE_CLAMP))
+				cuda_assert(cuTexRefSetAddressMode(texref, 0, CU_TR_ADDRESS_MODE_CLAMP));
+				cuda_assert(cuTexRefSetAddressMode(texref, 1, CU_TR_ADDRESS_MODE_CLAMP));
 			}
-			cuda_assert(cuTexRefSetFormat(texref, format, mem.data_elements))
+			cuda_assert(cuTexRefSetFormat(texref, format, mem.data_elements));
 
 			cuda_pop_context();
 		}
@@ -551,23 +553,23 @@ public:
 			CUdeviceptr cumem;
 			size_t cubytes;
 
-			cuda_assert(cuModuleGetGlobal(&cumem, &cubytes, cuModule, name))
+			cuda_assert(cuModuleGetGlobal(&cumem, &cubytes, cuModule, name));
 
 			if(cubytes == 8) {
 				/* 64 bit device pointer */
 				uint64_t ptr = mem.device_pointer;
-				cuda_assert(cuMemcpyHtoD(cumem, (void*)&ptr, cubytes))
+				cuda_assert(cuMemcpyHtoD(cumem, (void*)&ptr, cubytes));
 			}
 			else {
 				/* 32 bit device pointer */
 				uint32_t ptr = (uint32_t)mem.device_pointer;
-				cuda_assert(cuMemcpyHtoD(cumem, (void*)&ptr, cubytes))
+				cuda_assert(cuMemcpyHtoD(cumem, (void*)&ptr, cubytes));
 			}
 
 			cuda_pop_context();
 		}
 
-		tex_interp_map[mem.device_pointer] = interpolation;
+		tex_interp_map[mem.device_pointer] = (interpolation != INTERPOLATION_NONE);
 	}
 
 	void tex_free(device_memory& mem)
@@ -602,10 +604,12 @@ public:
 		CUdeviceptr d_rng_state = cuda_device_ptr(rtile.rng_state);
 
 		/* get kernel function */
-		if(branched)
-			cuda_assert(cuModuleGetFunction(&cuPathTrace, cuModule, "kernel_cuda_branched_path_trace"))
-		else
-			cuda_assert(cuModuleGetFunction(&cuPathTrace, cuModule, "kernel_cuda_path_trace"))
+		if(branched && support_device(true, branched)) {
+			cuda_assert(cuModuleGetFunction(&cuPathTrace, cuModule, "kernel_cuda_branched_path_trace"));
+		}
+		else {
+			cuda_assert(cuModuleGetFunction(&cuPathTrace, cuModule, "kernel_cuda_path_trace"));
+		}
 
 		if(have_error())
 			return;
@@ -613,49 +617,63 @@ public:
 		/* pass in parameters */
 		int offset = 0;
 		
-		cuda_assert(cuParamSetv(cuPathTrace, offset, &d_buffer, sizeof(d_buffer)))
+		cuda_assert(cuParamSetv(cuPathTrace, offset, &d_buffer, sizeof(d_buffer)));
 		offset += sizeof(d_buffer);
 
-		cuda_assert(cuParamSetv(cuPathTrace, offset, &d_rng_state, sizeof(d_rng_state)))
+		cuda_assert(cuParamSetv(cuPathTrace, offset, &d_rng_state, sizeof(d_rng_state)));
 		offset += sizeof(d_rng_state);
 
 		offset = align_up(offset, __alignof(sample));
 
-		cuda_assert(cuParamSeti(cuPathTrace, offset, sample))
+		cuda_assert(cuParamSeti(cuPathTrace, offset, sample));
 		offset += sizeof(sample);
 
-		cuda_assert(cuParamSeti(cuPathTrace, offset, rtile.x))
+		cuda_assert(cuParamSeti(cuPathTrace, offset, rtile.x));
 		offset += sizeof(rtile.x);
 
-		cuda_assert(cuParamSeti(cuPathTrace, offset, rtile.y))
+		cuda_assert(cuParamSeti(cuPathTrace, offset, rtile.y));
 		offset += sizeof(rtile.y);
 
-		cuda_assert(cuParamSeti(cuPathTrace, offset, rtile.w))
+		cuda_assert(cuParamSeti(cuPathTrace, offset, rtile.w));
 		offset += sizeof(rtile.w);
 
-		cuda_assert(cuParamSeti(cuPathTrace, offset, rtile.h))
+		cuda_assert(cuParamSeti(cuPathTrace, offset, rtile.h));
 		offset += sizeof(rtile.h);
 
-		cuda_assert(cuParamSeti(cuPathTrace, offset, rtile.offset))
+		cuda_assert(cuParamSeti(cuPathTrace, offset, rtile.offset));
 		offset += sizeof(rtile.offset);
 
-		cuda_assert(cuParamSeti(cuPathTrace, offset, rtile.stride))
+		cuda_assert(cuParamSeti(cuPathTrace, offset, rtile.stride));
 		offset += sizeof(rtile.stride);
 
-		cuda_assert(cuParamSetSize(cuPathTrace, offset))
+		cuda_assert(cuParamSetSize(cuPathTrace, offset));
+
+		/* launch kernel */
+		int threads_per_block;
+		cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cuPathTrace));
+
+		/*int num_registers;
+		cuda_assert(cuFuncGetAttribute(&num_registers, CU_FUNC_ATTRIBUTE_NUM_REGS, cuPathTrace));
+
+		printf("threads_per_block %d\n", threads_per_block);
+		printf("num_registers %d\n", num_registers);*/
 
-		/* launch kernel: todo find optimal size, cache config for fermi */
-		int xthreads = 16;
-		int ythreads = 16;
+		int xthreads = (int)sqrt((float)threads_per_block);
+		int ythreads = (int)sqrt((float)threads_per_block);
 		int xblocks = (rtile.w + xthreads - 1)/xthreads;
 		int yblocks = (rtile.h + ythreads - 1)/ythreads;
 
-		cuda_assert(cuFuncSetCacheConfig(cuPathTrace, CU_FUNC_CACHE_PREFER_L1))
-		cuda_assert(cuFuncSetBlockShape(cuPathTrace, xthreads, ythreads, 1))
-		cuda_assert(cuLaunchGridAsync(cuPathTrace, xblocks, yblocks, cuStream))
+		cuda_assert(cuFuncSetCacheConfig(cuPathTrace, CU_FUNC_CACHE_PREFER_L1));
+		cuda_assert(cuFuncSetBlockShape(cuPathTrace, xthreads, ythreads, 1));
 
-		cuda_assert(cuEventRecord(tileDone, cuStream ))
-		cuda_assert(cuEventSynchronize(tileDone))
+		if(info.display_device) {
+			/* don't use async for device used for display, locks up UI too much */
+			cuda_assert(cuLaunchGrid(cuPathTrace, xblocks, yblocks));
+			cuda_assert(cuCtxSynchronize());
+		}
+		else {
+			cuda_assert(cuLaunchGridAsync(cuPathTrace, xblocks, yblocks, cuStream));
+		}
 
 		cuda_pop_context();
 	}
@@ -672,55 +690,60 @@ public:
 		CUdeviceptr d_buffer = cuda_device_ptr(buffer);
 
 		/* get kernel function */
-		if(rgba_half)
-			cuda_assert(cuModuleGetFunction(&cuFilmConvert, cuModule, "kernel_cuda_convert_to_half_float"))
-		else
-			cuda_assert(cuModuleGetFunction(&cuFilmConvert, cuModule, "kernel_cuda_convert_to_byte"))
+		if(rgba_half) {
+			cuda_assert(cuModuleGetFunction(&cuFilmConvert, cuModule, "kernel_cuda_convert_to_half_float"));
+		}
+		else {
+			cuda_assert(cuModuleGetFunction(&cuFilmConvert, cuModule, "kernel_cuda_convert_to_byte"));
+		}
 
 		/* pass in parameters */
 		int offset = 0;
 
-		cuda_assert(cuParamSetv(cuFilmConvert, offset, &d_rgba, sizeof(d_rgba)))
+		cuda_assert(cuParamSetv(cuFilmConvert, offset, &d_rgba, sizeof(d_rgba)));
 		offset += sizeof(d_rgba);
 		
-		cuda_assert(cuParamSetv(cuFilmConvert, offset, &d_buffer, sizeof(d_buffer)))
+		cuda_assert(cuParamSetv(cuFilmConvert, offset, &d_buffer, sizeof(d_buffer)));
 		offset += sizeof(d_buffer);
 
 		float sample_scale = 1.0f/(task.sample + 1);
 		offset = align_up(offset, __alignof(sample_scale));
 
-		cuda_assert(cuParamSetf(cuFilmConvert, offset, sample_scale))
+		cuda_assert(cuParamSetf(cuFilmConvert, offset, sample_scale));
 		offset += sizeof(sample_scale);
 
-		cuda_assert(cuParamSeti(cuFilmConvert, offset, task.x))
+		cuda_assert(cuParamSeti(cuFilmConvert, offset, task.x));
 		offset += sizeof(task.x);
 
-		cuda_assert(cuParamSeti(cuFilmConvert, offset, task.y))
+		cuda_assert(cuParamSeti(cuFilmConvert, offset, task.y));
 		offset += sizeof(task.y);
 
-		cuda_assert(cuParamSeti(cuFilmConvert, offset, task.w))
+		cuda_assert(cuParamSeti(cuFilmConvert, offset, task.w));
 		offset += sizeof(task.w);
 
-		cuda_assert(cuParamSeti(cuFilmConvert, offset, task.h))
+		cuda_assert(cuParamSeti(cuFilmConvert, offset, task.h));
 		offset += sizeof(task.h);
 
-		cuda_assert(cuParamSeti(cuFilmConvert, offset, task.offset))
+		cuda_assert(cuParamSeti(cuFilmConvert, offset, task.offset));
 		offset += sizeof(task.offset);
 
-		cuda_assert(cuParamSeti(cuFilmConvert, offset, task.stride))
+		cuda_assert(cuParamSeti(cuFilmConvert, offset, task.stride));
 		offset += sizeof(task.stride);
 
-		cuda_assert(cuParamSetSize(cuFilmConvert, offset))
+		cuda_assert(cuParamSetSize(cuFilmConvert, offset));
+
+		/* launch kernel */
+		int threads_per_block;
+		cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cuFilmConvert));
 
-		/* launch kernel: todo find optimal size, cache config for fermi */
-		int xthreads = 16;
-		int ythreads = 16;
+		int xthreads = (int)sqrt((float)threads_per_block);
+		int ythreads = (int)sqrt((float)threads_per_block);
 		int xblocks = (task.w + xthreads - 1)/xthreads;
 		int yblocks = (task.h + ythreads - 1)/ythreads;
 
-		cuda_assert(cuFuncSetCacheConfig(cuFilmConvert, CU_FUNC_CACHE_PREFER_L1))
-		cuda_assert(cuFuncSetBlockShape(cuFilmConvert, xthreads, ythreads, 1))
-		cuda_assert(cuLaunchGrid(cuFilmConvert, xblocks, yblocks))
+		cuda_assert(cuFuncSetCacheConfig(cuFilmConvert, CU_FUNC_CACHE_PREFER_L1));
+		cuda_assert(cuFuncSetBlockShape(cuFilmConvert, xthreads, ythreads, 1));
+		cuda_assert(cuLaunchGrid(cuFilmConvert, xblocks, yblocks));
 
 		unmap_pixels((rgba_byte)? rgba_byte: rgba_half);
 
@@ -734,40 +757,55 @@ public:
 
 		cuda_push_context();
 
-		CUfunction cuDisplace;
+		CUfunction cuShader;
 		CUdeviceptr d_input = cuda_device_ptr(task.shader_input);
 		CUdeviceptr d_output = cuda_device_ptr(task.shader_output);
 
 		/* get kernel function */
-		cuda_assert(cuModuleGetFunction(&cuDisplace, cuModule, "kernel_cuda_shader"))
-		
-		/* pass in parameters */
-		int offset = 0;
-		
-		cuda_assert(cuParamSetv(cuDisplace, offset, &d_input, sizeof(d_input)))
-		offset += sizeof(d_input);
+		cuda_assert(cuModuleGetFunction(&cuShader, cuModule, "kernel_cuda_shader"));
+
+		/* do tasks in smaller chunks, so we can cancel it */
+		const int shader_chunk_size = 65536;
+		const int start = task.shader_x;
+		const int end = task.shader_x + task.shader_w;
 
-		cuda_assert(cuParamSetv(cuDisplace, offset, &d_output, sizeof(d_output)))
-		offset += sizeof(d_output);
+		for(int shader_x = start; shader_x < end; shader_x += shader_chunk_size) {
+			if(task.get_cancel())
+				break;
 
-		int shader_eval_type = task.shader_eval_type;
-		offset = align_up(offset, __alignof(shader_eval_type));
+			/* pass in parameters */
+			int offset = 0;
 
-		cuda_assert(cuParamSeti(cuDisplace, offset, task.shader_eval_type))
-		offset += sizeof(task.shader_eval_type);
+			cuda_assert(cuParamSetv(cuShader, offset, &d_input, sizeof(d_input)));
+			offset += sizeof(d_input);
 
-		cuda_assert(cuParamSeti(cuDisplace, offset, task.shader_x))
-		offset += sizeof(task.shader_x);
+			cuda_assert(cuParamSetv(cuShader, offset, &d_output, sizeof(d_output)));
+			offset += sizeof(d_output);
 
-		cuda_assert(cuParamSetSize(cuDisplace, offset))
+			int shader_eval_type = task.shader_eval_type;
+			offset = align_up(offset, __alignof(shader_eval_type));
 
-		/* launch kernel: todo find optimal size, cache config for fermi */
-		int xthreads = 16;
-		int xblocks = (task.shader_w + xthreads - 1)/xthreads;
+			cuda_assert(cuParamSeti(cuShader, offset, task.shader_eval_type));
+			offset += sizeof(task.shader_eval_type);
 
-		cuda_assert(cuFuncSetCacheConfig(cuDisplace, CU_FUNC_CACHE_PREFER_L1))
-		cuda_assert(cuFuncSetBlockShape(cuDisplace, xthreads, 1, 1))
-		cuda_assert(cuLaunchGrid(cuDisplace, xblocks, 1))
+			cuda_assert(cuParamSeti(cuShader, offset, shader_x));
+			offset += sizeof(shader_x);
+
+			cuda_assert(cuParamSetSize(cuShader, offset));
+
+			/* launch kernel */
+			int threads_per_block;
+			cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cuShader));
+
+			int shader_w = min(shader_chunk_size, end - shader_x);
+			int xblocks = (shader_w + threads_per_block - 1)/threads_per_block;
+
+			cuda_assert(cuFuncSetCacheConfig(cuShader, CU_FUNC_CACHE_PREFER_L1));
+			cuda_assert(cuFuncSetBlockShape(cuShader, threads_per_block, 1, 1));
+			cuda_assert(cuLaunchGrid(cuShader, xblocks, 1));
+
+			cuda_assert(cuCtxSynchronize());
+		}
 
 		cuda_pop_context();
 	}
@@ -779,8 +817,8 @@ public:
 			CUdeviceptr buffer;
 			
 			size_t bytes;
-			cuda_assert(cuGraphicsMapResources(1, &pmem.cuPBOresource, 0))
-			cuda_assert(cuGraphicsResourceGetMappedPointer(&buffer, &bytes, pmem.cuPBOresource))
+			cuda_assert(cuGraphicsMapResources(1, &pmem.cuPBOresource, 0));
+			cuda_assert(cuGraphicsResourceGetMappedPointer(&buffer, &bytes, pmem.cuPBOresource));
 			
 			return buffer;
 		}
@@ -793,7 +831,7 @@ public:
 		if(!background) {
 			PixelMem pmem = pixel_mem_map[mem];
 
-			cuda_assert(cuGraphicsUnmapResources(1, &pmem.cuPBOresource, 0))
+			cuda_assert(cuGraphicsUnmapResources(1, &pmem.cuPBOresource, 0));
 		}
 	}
 
@@ -882,7 +920,7 @@ public:
 
 				cuda_push_context();
 
-				cuda_assert(cuGraphicsUnregisterResource(pmem.cuPBOresource))
+				cuda_assert(cuGraphicsUnregisterResource(pmem.cuPBOresource));
 				glDeleteBuffers(1, &pmem.cuPBO);
 				glDeleteTextures(1, &pmem.cuTexId);
 
@@ -900,7 +938,8 @@ public:
 		}
 	}
 
-	void draw_pixels(device_memory& mem, int y, int w, int h, int dy, int width, int height, bool transparent)
+	void draw_pixels(device_memory& mem, int y, int w, int h, int dy, int width, int height, bool transparent,
+		const DeviceDrawParams &draw_params)
 	{
 		if(!background) {
 			PixelMem pmem = pixel_mem_map[mem.device_pointer];
@@ -933,6 +972,10 @@ public:
 
 			glColor3f(1.0f, 1.0f, 1.0f);
 
+			if(draw_params.bind_display_space_shader_cb) {
+				draw_params.bind_display_space_shader_cb();
+			}
+
 			glPushMatrix();
 			glTranslatef(0.0f, (float)dy, 0.0f);
 				
@@ -951,6 +994,10 @@ public:
 
 			glPopMatrix();
 
+			if(draw_params.unbind_display_space_shader_cb) {
+				draw_params.unbind_display_space_shader_cb();
+			}
+
 			if(transparent) {
 				glDisable(GL_BLEND);
 				glBlendFunc(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA); /* reset blender default */
@@ -964,7 +1011,7 @@ public:
 			return;
 		}
 
-		Device::draw_pixels(mem, y, w, h, dy, width, height, transparent);
+		Device::draw_pixels(mem, y, w, h, dy, width, height, transparent, draw_params);
 	}
 
 	void thread_run(DeviceTask *task)
@@ -979,6 +1026,10 @@ public:
 				int start_sample = tile.start_sample;
 				int end_sample = tile.start_sample + tile.num_samples;
 
+				boost::posix_time::ptime start_time(boost::posix_time::microsec_clock::local_time());
+				boost::posix_time::ptime last_time = start_time;
+				int sync_sample = 10;
+
 				for(int sample = start_sample; sample < end_sample; sample++) {
 					if (task->get_cancel()) {
 						if(task->need_finish_queue == false)
@@ -988,8 +1039,28 @@ public:
 					path_trace(tile, sample, branched);
 
 					tile.sample = sample + 1;
-
 					task->update_progress(tile);
+
+					if(!info.display_device && sample == sync_sample) {
+						cuda_push_context();
+						cuda_assert(cuEventRecord(tileDone, cuStream));
+						cuda_assert(cuEventSynchronize(tileDone));
+
+						/* Do some time keeping to find out if we need to sync less */
+						boost::posix_time::ptime current_time(boost::posix_time::microsec_clock::local_time());
+						boost::posix_time::time_duration sample_duration = current_time - last_time;
+
+						long msec = sample_duration.total_milliseconds();
+						float scaling_factor = (float)target_update_frequency / (float)msec;
+
+						/* sync at earliest next sample and probably later */
+						sync_sample = (sample + 1) + sync_sample * (int)ceil(scaling_factor);
+
+						sync_sample = min(end_sample - 1, sync_sample); // make sure we sync the last sample always
+
+						last_time = current_time;
+						cuda_pop_context();
+					}
 				}
 
 				task->release_tile(tile);
@@ -999,7 +1070,7 @@ public:
 			shader(*task);
 
 			cuda_push_context();
-			cuda_assert(cuCtxSynchronize())
+			cuda_assert(cuCtxSynchronize());
 			cuda_pop_context();
 		}
 	}
@@ -1020,7 +1091,7 @@ public:
 			film_convert(task, task.buffer, task.rgba_byte, task.rgba_half);
 
 			cuda_push_context();
-			cuda_assert(cuCtxSynchronize())
+			cuda_assert(cuCtxSynchronize());
 			cuda_pop_context();
 		}
 		else {
@@ -1081,6 +1152,7 @@ void device_cuda_info(vector<DeviceInfo>& devices)
 		int major, minor;
 		cuDeviceComputeCapability(&major, &minor, num);
 		info.advanced_shading = (major >= 2);
+		info.extended_images = (major >= 3);
 		info.pack_images = false;
 
 		/* if device has a kernel timeout, assume it is used for display */
diff --git a/intern/cycles/device/device_memory.h b/intern/cycles/device/device_memory.h
index 1427d12cba2..8d6f4a49a9c 100644
--- a/intern/cycles/device/device_memory.h
+++ b/intern/cycles/device/device_memory.h
@@ -169,6 +169,7 @@ public:
 	size_t data_size;
 	size_t data_width;
 	size_t data_height;
+	size_t data_depth;
 
 	/* device pointer */
 	device_ptr device_pointer;
@@ -195,6 +196,7 @@ public:
 		data_size = 0;
 		data_width = 0;
 		data_height = 0;
+		data_depth = 0;
 
 		assert(data_elements > 0);
 
@@ -204,20 +206,21 @@ public:
 	virtual ~device_vector() {}
 
 	/* vector functions */
-	T *resize(size_t width, size_t height = 0)
+	T *resize(size_t width, size_t height = 0, size_t depth = 0)
 	{
-		data_size = (height == 0)? width: width*height;
+		data_size = width * ((height == 0)? 1: height) * ((depth == 0)? 1: depth);
 		data.resize(data_size);
 		data_pointer = (device_ptr)&data[0];
 		data_width = width;
 		data_height = height;
+		data_depth = depth;
 
 		return &data[0];
 	}
 
-	T *copy(T *ptr, size_t width, size_t height = 0)
+	T *copy(T *ptr, size_t width, size_t height = 0, size_t depth = 0)
 	{
-		T *mem = resize(width, height);
+		T *mem = resize(width, height, depth);
 		memcpy(mem, ptr, memory_size());
 		return mem;
 	}
@@ -230,13 +233,14 @@ public:
 		}
 	}
 
-	void reference(T *ptr, size_t width, size_t height = 0)
+	void reference(T *ptr, size_t width, size_t height = 0, size_t depth = 0)
 	{
 		data.clear();
-		data_size = (height == 0)? width: width*height;
+		data_size = width * ((height == 0)? 1: height) * ((depth == 0)? 1: depth);
 		data_pointer = (device_ptr)ptr;
 		data_width = width;
 		data_height = height;
+		data_depth = depth;
 	}
 
 	void clear()
@@ -245,6 +249,7 @@ public:
 		data_pointer = 0;
 		data_width = 0;
 		data_height = 0;
+		data_depth = 0;
 		data_size = 0;
 	}
 
diff --git a/intern/cycles/device/device_multi.cpp b/intern/cycles/device/device_multi.cpp
index 27b9de0769e..c866ebaaea2 100644
--- a/intern/cycles/device/device_multi.cpp
+++ b/intern/cycles/device/device_multi.cpp
@@ -168,7 +168,7 @@ public:
 			sub.device->const_copy_to(name, host, size);
 	}
 
-	void tex_alloc(const char *name, device_memory& mem, bool interpolation, bool periodic)
+	void tex_alloc(const char *name, device_memory& mem, InterpolationType interpolation, bool periodic)
 	{
 		foreach(SubDevice& sub, devices) {
 			mem.device_pointer = 0;
@@ -233,7 +233,8 @@ public:
 		mem.device_pointer = tmp;
 	}
 
-	void draw_pixels(device_memory& rgba, int y, int w, int h, int dy, int width, int height, bool transparent)
+	void draw_pixels(device_memory& rgba, int y, int w, int h, int dy, int width, int height, bool transparent,
+		const DeviceDrawParams &draw_params)
 	{
 		device_ptr tmp = rgba.device_pointer;
 		int i = 0, sub_h = h/devices.size();
@@ -247,7 +248,7 @@ public:
 			/* adjust math for w/width */
 
 			rgba.device_pointer = sub.ptr_map[tmp];
-			sub.device->draw_pixels(rgba, sy, w, sh, sdy, width, sheight, transparent);
+			sub.device->draw_pixels(rgba, sy, w, sh, sdy, width, sheight, transparent, draw_params);
 			i++;
 		}
 
@@ -327,6 +328,7 @@ static bool device_multi_add(vector<DeviceInfo>& devices, DeviceType type, bool
 
 	info.advanced_shading = with_advanced_shading;
 	info.pack_images = false;
+	info.extended_images = true;
 
 	foreach(DeviceInfo& subinfo, devices) {
 		if(subinfo.type == type) {
@@ -350,6 +352,7 @@ static bool device_multi_add(vector<DeviceInfo>& devices, DeviceType type, bool
 			if(subinfo.display_device)
 				info.display_device = true;
 			info.pack_images = info.pack_images || subinfo.pack_images;
+			info.extended_images = info.extended_images && subinfo.extended_images;
 			num_added++;
 		}
 	}
diff --git a/intern/cycles/device/device_network.cpp b/intern/cycles/device/device_network.cpp
index bffd993818f..af051076009 100644
--- a/intern/cycles/device/device_network.cpp
+++ b/intern/cycles/device/device_network.cpp
@@ -162,7 +162,7 @@ public:
 		snd.write_buffer(host, size);
 	}
 
-	void tex_alloc(const char *name, device_memory& mem, bool interpolation, bool periodic)
+	void tex_alloc(const char *name, device_memory& mem, InterpolationType interpolation, bool periodic)
 	{
 		thread_scoped_lock lock(rpc_lock);
 
@@ -326,7 +326,7 @@ class DeviceServer {
 public:
 	thread_mutex rpc_lock;
 
-	void network_error(const string &message){
+	void network_error(const string &message) {
 		error_func.network_error(message);
 	}
 
@@ -366,7 +366,7 @@ protected:
 	{
 		/* create a new DataVector and insert it into mem_data */
 		pair<DataMap::iterator,bool> data_ins = mem_data.insert(
-				DataMap::value_type(client_pointer, DataVector()));
+		        DataMap::value_type(client_pointer, DataVector()));
 
 		/* make sure it was a unique insertion */
 		assert(data_ins.second);
@@ -559,7 +559,7 @@ protected:
 		else if(rcv.name == "tex_alloc") {
 			network_device_memory mem;
 			string name;
-			bool interpolation;
+			InterpolationType interpolation;
 			bool periodic;
 			device_ptr client_pointer;
 
diff --git a/intern/cycles/device/device_network.h b/intern/cycles/device/device_network.h
index bf8f3c70c49..893841d1da7 100644
--- a/intern/cycles/device/device_network.h
+++ b/intern/cycles/device/device_network.h
@@ -118,7 +118,7 @@ public:
 	void add(const device_memory& mem)
 	{
 		archive & mem.data_type & mem.data_elements & mem.data_size;
-		archive & mem.data_width & mem.data_height & mem.device_pointer;
+		archive & mem.data_width & mem.data_height & mem.data_depth & mem.device_pointer;
 	}
 
 	template<typename T> void add(const T& data)
@@ -209,7 +209,7 @@ public:
 		boost::system::error_code error;
 		size_t len = boost::asio::read(socket, boost::asio::buffer(header), error);
 
-		if(error.value()){
+		if(error.value()) {
 			error_func->network_error(error.message());
 		}
 
@@ -261,7 +261,7 @@ public:
 	void read(network_device_memory& mem)
 	{
 		*archive & mem.data_type & mem.data_elements & mem.data_size;
-		*archive & mem.data_width & mem.data_height & mem.device_pointer;
+		*archive & mem.data_width & mem.data_height & mem.data_depth & mem.device_pointer;
 
 		mem.data_pointer = 0;
 	}
@@ -276,7 +276,7 @@ public:
 		boost::system::error_code error;
 		size_t len = boost::asio::read(socket, boost::asio::buffer(buffer, size), error);
 
-		if(error.value()){
+		if(error.value()) {
 			error_func->network_error(error.message());
 		}
 
@@ -391,7 +391,7 @@ private:
 
 					/* add address if it's not already in the list */
 					bool found = std::find(servers.begin(), servers.end(),
-							address) != servers.end();
+					                       address) != servers.end();
 
 					if(!found)
 						servers.push_back(address);
diff --git a/intern/cycles/device/device_opencl.cpp b/intern/cycles/device/device_opencl.cpp
index 9117b70d749..694ec9db036 100644
--- a/intern/cycles/device/device_opencl.cpp
+++ b/intern/cycles/device/device_opencl.cpp
@@ -101,9 +101,6 @@ static string opencl_kernel_build_options(const string& platform, const string *
 
 	if(opencl_kernel_use_debug())
 		build_options += "-D__KERNEL_OPENCL_DEBUG__ ";
-
-	if(opencl_kernel_use_advanced_shading(platform))
-		build_options += "-D__KERNEL_OPENCL_NEED_ADVANCED_SHADING__ ";
 	
 	return build_options;
 }
@@ -409,10 +406,22 @@ public:
 		fprintf(stderr, "%s\n", message.c_str());
 	}
 
-	void opencl_assert(cl_int err)
+#define opencl_assert(stmt) \
+	{ \
+		cl_int err = stmt; \
+		\
+		if(err != CL_SUCCESS) { \
+			string message = string_printf("OpenCL error: %s in %s", opencl_error_string(err), #stmt); \
+			if(error_msg == "") \
+				error_msg = message; \
+			fprintf(stderr, "%s\n", message.c_str()); \
+		} \
+	} (void)0
+
+	void opencl_assert_err(cl_int err, const char* where)
 	{
 		if(err != CL_SUCCESS) {
-			string message = string_printf("OpenCL error (%d): %s", err, opencl_error_string(err));
+			string message = string_printf("OpenCL error (%d): %s in %s", err, opencl_error_string(err), where);
 			if(error_msg == "")
 				error_msg = message;
 			fprintf(stderr, "%s\n", message.c_str());
@@ -452,8 +461,10 @@ public:
 		vector<cl_platform_id> platforms(num_platforms, NULL);
 
 		ciErr = clGetPlatformIDs(num_platforms, &platforms[0], NULL);
-		if(opencl_error(ciErr))
+		if(opencl_error(ciErr)) {
+			fprintf(stderr, "clGetPlatformIDs failed \n");
 			return;
+		}
 
 		int num_base = 0;
 		int total_devices = 0;
@@ -478,8 +489,10 @@ public:
 			/* get devices */
 			vector<cl_device_id> device_ids(num_devices, NULL);
 
-			if(opencl_error(clGetDeviceIDs(cpPlatform, opencl_device_type(), num_devices, &device_ids[0], NULL)))
+			if(opencl_error(clGetDeviceIDs(cpPlatform, opencl_device_type(), num_devices, &device_ids[0], NULL))) {
+				fprintf(stderr, "clGetDeviceIDs failed \n");
 				return;
+			}
 
 			cdDevice = device_ids[info.num - num_base];
 
@@ -515,8 +528,10 @@ public:
 				cxContext = clCreateContext(context_props, 1, &cdDevice,
 					context_notify_callback, cdDevice, &ciErr);
 
-				if(opencl_error(ciErr))
+				if(opencl_error(ciErr)) {
+					opencl_error("OpenCL: clCreateContext failed");
 					return;
+				}
 
 				/* cache it */
 				OpenCLCache::store_context(cpPlatform, cdDevice, cxContext, cache_locker);
@@ -531,6 +546,7 @@ public:
 		if(opencl_error(ciErr))
 			return;
 
+		fprintf(stderr,"Device init succes\n");
 		device_initialized = true;
 	}
 
@@ -821,7 +837,7 @@ public:
 
 		mem.device_pointer = (device_ptr)clCreateBuffer(cxContext, mem_flag, size, mem_ptr, &ciErr);
 
-		opencl_assert(ciErr);
+		opencl_assert_err(ciErr, "clCreateBuffer");
 
 		stats.mem_alloc(size);
 	}
@@ -830,8 +846,7 @@ public:
 	{
 		/* this is blocking */
 		size_t size = mem.memory_size();
-		ciErr = clEnqueueWriteBuffer(cqCommandQueue, CL_MEM_PTR(mem.device_pointer), CL_TRUE, 0, size, (void*)mem.data_pointer, 0, NULL, NULL);
-		opencl_assert(ciErr);
+		opencl_assert(clEnqueueWriteBuffer(cqCommandQueue, CL_MEM_PTR(mem.device_pointer), CL_TRUE, 0, size, (void*)mem.data_pointer, 0, NULL, NULL));
 	}
 
 	void mem_copy_from(device_memory& mem, int y, int w, int h, int elem)
@@ -839,8 +854,7 @@ public:
 		size_t offset = elem*y*w;
 		size_t size = elem*w*h;
 
-		ciErr = clEnqueueReadBuffer(cqCommandQueue, CL_MEM_PTR(mem.device_pointer), CL_TRUE, offset, size, (uchar*)mem.data_pointer + offset, 0, NULL, NULL);
-		opencl_assert(ciErr);
+		opencl_assert(clEnqueueReadBuffer(cqCommandQueue, CL_MEM_PTR(mem.device_pointer), CL_TRUE, offset, size, (uchar*)mem.data_pointer + offset, 0, NULL, NULL));
 	}
 
 	void mem_zero(device_memory& mem)
@@ -854,9 +868,8 @@ public:
 	void mem_free(device_memory& mem)
 	{
 		if(mem.device_pointer) {
-			ciErr = clReleaseMemObject(CL_MEM_PTR(mem.device_pointer));
+			opencl_assert(clReleaseMemObject(CL_MEM_PTR(mem.device_pointer)));
 			mem.device_pointer = 0;
-			opencl_assert(ciErr);
 
 			stats.mem_free(mem.memory_size());
 		}
@@ -881,7 +894,7 @@ public:
 		mem_copy_to(*i->second);
 	}
 
-	void tex_alloc(const char *name, device_memory& mem, bool interpolation, bool periodic)
+	void tex_alloc(const char *name, device_memory& mem, InterpolationType interpolation, bool periodic)
 	{
 		mem_alloc(mem, MEM_READ_ONLY);
 		mem_copy_to(mem);
@@ -919,7 +932,7 @@ public:
 			CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t)*3, max_work_items, NULL);
 	
 		/* try to divide evenly over 2 dimensions */
-		size_t sqrt_workgroup_size = max(sqrt((double)workgroup_size), 1.0);
+		size_t sqrt_workgroup_size = max((size_t)sqrt((double)workgroup_size), 1);
 		size_t local_size[2] = {sqrt_workgroup_size, sqrt_workgroup_size};
 
 		/* some implementations have max size 1 on 2nd dimension */
@@ -931,8 +944,7 @@ public:
 		size_t global_size[2] = {global_size_round_up(local_size[0], w), global_size_round_up(local_size[1], h)};
 
 		/* run kernel */
-		ciErr = clEnqueueNDRangeKernel(cqCommandQueue, kernel, 2, NULL, global_size, NULL, 0, NULL, NULL);
-		opencl_assert(ciErr);
+		opencl_assert(clEnqueueNDRangeKernel(cqCommandQueue, kernel, 2, NULL, global_size, NULL, 0, NULL, NULL));
 		opencl_assert(clFlush(cqCommandQueue));
 	}
 
@@ -952,33 +964,29 @@ public:
 
 		/* sample arguments */
 		cl_uint narg = 0;
-		ciErr = 0;
 
-		ciErr |= clSetKernelArg(ckPathTraceKernel, narg++, sizeof(d_data), (void*)&d_data);
-		ciErr |= clSetKernelArg(ckPathTraceKernel, narg++, sizeof(d_buffer), (void*)&d_buffer);
-		ciErr |= clSetKernelArg(ckPathTraceKernel, narg++, sizeof(d_rng_state), (void*)&d_rng_state);
+		opencl_assert(clSetKernelArg(ckPathTraceKernel, narg++, sizeof(d_data), (void*)&d_data));
+		opencl_assert(clSetKernelArg(ckPathTraceKernel, narg++, sizeof(d_buffer), (void*)&d_buffer));
+		opencl_assert(clSetKernelArg(ckPathTraceKernel, narg++, sizeof(d_rng_state), (void*)&d_rng_state));
 
 #define KERNEL_TEX(type, ttype, name) \
-	ciErr |= set_kernel_arg_mem(ckPathTraceKernel, &narg, #name);
+	set_kernel_arg_mem(ckPathTraceKernel, &narg, #name);
 #include "kernel_textures.h"
 
-		ciErr |= clSetKernelArg(ckPathTraceKernel, narg++, sizeof(d_sample), (void*)&d_sample);
-		ciErr |= clSetKernelArg(ckPathTraceKernel, narg++, sizeof(d_x), (void*)&d_x);
-		ciErr |= clSetKernelArg(ckPathTraceKernel, narg++, sizeof(d_y), (void*)&d_y);
-		ciErr |= clSetKernelArg(ckPathTraceKernel, narg++, sizeof(d_w), (void*)&d_w);
-		ciErr |= clSetKernelArg(ckPathTraceKernel, narg++, sizeof(d_h), (void*)&d_h);
-		ciErr |= clSetKernelArg(ckPathTraceKernel, narg++, sizeof(d_offset), (void*)&d_offset);
-		ciErr |= clSetKernelArg(ckPathTraceKernel, narg++, sizeof(d_stride), (void*)&d_stride);
-
-		opencl_assert(ciErr);
+		opencl_assert(clSetKernelArg(ckPathTraceKernel, narg++, sizeof(d_sample), (void*)&d_sample));
+		opencl_assert(clSetKernelArg(ckPathTraceKernel, narg++, sizeof(d_x), (void*)&d_x));
+		opencl_assert(clSetKernelArg(ckPathTraceKernel, narg++, sizeof(d_y), (void*)&d_y));
+		opencl_assert(clSetKernelArg(ckPathTraceKernel, narg++, sizeof(d_w), (void*)&d_w));
+		opencl_assert(clSetKernelArg(ckPathTraceKernel, narg++, sizeof(d_h), (void*)&d_h));
+		opencl_assert(clSetKernelArg(ckPathTraceKernel, narg++, sizeof(d_offset), (void*)&d_offset));
+		opencl_assert(clSetKernelArg(ckPathTraceKernel, narg++, sizeof(d_stride), (void*)&d_stride));
 
 		enqueue_kernel(ckPathTraceKernel, d_w, d_h);
 	}
 
-	cl_int set_kernel_arg_mem(cl_kernel kernel, cl_uint *narg, const char *name)
+	void set_kernel_arg_mem(cl_kernel kernel, cl_uint *narg, const char *name)
 	{
 		cl_mem ptr;
-		cl_int err = 0;
 
 		MemMap::iterator i = mem_map.find(name);
 		if(i != mem_map.end()) {
@@ -989,10 +997,7 @@ public:
 			ptr = CL_MEM_PTR(null_mem);
 		}
 		
-		err |= clSetKernelArg(kernel, (*narg)++, sizeof(ptr), (void*)&ptr);
-		opencl_assert(err);
-
-		return err;
+		opencl_assert(clSetKernelArg(kernel, (*narg)++, sizeof(ptr), (void*)&ptr));
 	}
 
 	void film_convert(DeviceTask& task, device_ptr buffer, device_ptr rgba_byte, device_ptr rgba_half)
@@ -1011,27 +1016,27 @@ public:
 
 		/* sample arguments */
 		cl_uint narg = 0;
-		ciErr = 0;
+
 
 		cl_kernel ckFilmConvertKernel = (rgba_byte)? ckFilmConvertByteKernel: ckFilmConvertHalfFloatKernel;
 
-		ciErr |= clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_data), (void*)&d_data);
-		ciErr |= clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_rgba), (void*)&d_rgba);
-		ciErr |= clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_buffer), (void*)&d_buffer);
+		opencl_assert(clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_data), (void*)&d_data));
+		opencl_assert(clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_rgba), (void*)&d_rgba));
+		opencl_assert(clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_buffer), (void*)&d_buffer));
 
 #define KERNEL_TEX(type, ttype, name) \
-	ciErr |= set_kernel_arg_mem(ckFilmConvertKernel, &narg, #name);
+	set_kernel_arg_mem(ckFilmConvertKernel, &narg, #name);
 #include "kernel_textures.h"
 
-		ciErr |= clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_sample_scale), (void*)&d_sample_scale);
-		ciErr |= clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_x), (void*)&d_x);
-		ciErr |= clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_y), (void*)&d_y);
-		ciErr |= clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_w), (void*)&d_w);
-		ciErr |= clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_h), (void*)&d_h);
-		ciErr |= clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_offset), (void*)&d_offset);
-		ciErr |= clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_stride), (void*)&d_stride);
+		opencl_assert(clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_sample_scale), (void*)&d_sample_scale));
+		opencl_assert(clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_x), (void*)&d_x));
+		opencl_assert(clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_y), (void*)&d_y));
+		opencl_assert(clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_w), (void*)&d_w));
+		opencl_assert(clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_h), (void*)&d_h));
+		opencl_assert(clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_offset), (void*)&d_offset));
+		opencl_assert(clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_stride), (void*)&d_stride));
+
 
-		opencl_assert(ciErr);
 
 		enqueue_kernel(ckFilmConvertKernel, d_w, d_h);
 	}
@@ -1048,21 +1053,18 @@ public:
 
 		/* sample arguments */
 		cl_uint narg = 0;
-		ciErr = 0;
 
-		ciErr |= clSetKernelArg(ckShaderKernel, narg++, sizeof(d_data), (void*)&d_data);
-		ciErr |= clSetKernelArg(ckShaderKernel, narg++, sizeof(d_input), (void*)&d_input);
-		ciErr |= clSetKernelArg(ckShaderKernel, narg++, sizeof(d_output), (void*)&d_output);
+		opencl_assert(clSetKernelArg(ckShaderKernel, narg++, sizeof(d_data), (void*)&d_data));
+		opencl_assert(clSetKernelArg(ckShaderKernel, narg++, sizeof(d_input), (void*)&d_input));
+		opencl_assert(clSetKernelArg(ckShaderKernel, narg++, sizeof(d_output), (void*)&d_output));
 
 #define KERNEL_TEX(type, ttype, name) \
-	ciErr |= set_kernel_arg_mem(ckShaderKernel, &narg, #name);
+	set_kernel_arg_mem(ckShaderKernel, &narg, #name);
 #include "kernel_textures.h"
 
-		ciErr |= clSetKernelArg(ckShaderKernel, narg++, sizeof(d_shader_eval_type), (void*)&d_shader_eval_type);
-		ciErr |= clSetKernelArg(ckShaderKernel, narg++, sizeof(d_shader_x), (void*)&d_shader_x);
-		ciErr |= clSetKernelArg(ckShaderKernel, narg++, sizeof(d_shader_w), (void*)&d_shader_w);
-
-		opencl_assert(ciErr);
+		opencl_assert(clSetKernelArg(ckShaderKernel, narg++, sizeof(d_shader_eval_type), (void*)&d_shader_eval_type));
+		opencl_assert(clSetKernelArg(ckShaderKernel, narg++, sizeof(d_shader_x), (void*)&d_shader_x));
+		opencl_assert(clSetKernelArg(ckShaderKernel, narg++, sizeof(d_shader_w), (void*)&d_shader_w));
 
 		enqueue_kernel(ckShaderKernel, task.shader_w, 1);
 	}
diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt
index cbe0d4b5d10..d18f4fa2998 100644
--- a/intern/cycles/kernel/CMakeLists.txt
+++ b/intern/cycles/kernel/CMakeLists.txt
@@ -12,10 +12,6 @@ set(INC_SYS
 
 set(SRC
 	kernel.cpp
-	kernel_sse2.cpp
-	kernel_sse3.cpp
-	kernel_sse41.cpp
-	kernel_avx.cpp
 	kernel.cl
 	kernel.cu
 )
@@ -23,14 +19,10 @@ set(SRC
 set(SRC_HEADERS
 	kernel.h
 	kernel_accumulate.h
-	kernel_bvh.h
-	kernel_bvh_subsurface.h
-	kernel_bvh_traversal.h
 	kernel_camera.h
 	kernel_compat_cpu.h
 	kernel_compat_cuda.h
 	kernel_compat_opencl.h
-	kernel_curve.h
 	kernel_differential.h
 	kernel_displace.h
 	kernel_emission.h
@@ -40,18 +32,15 @@ set(SRC_HEADERS
 	kernel_light.h
 	kernel_math.h
 	kernel_montecarlo.h
-	kernel_object.h
 	kernel_passes.h
 	kernel_path.h
 	kernel_path_state.h
-	kernel_primitive.h
 	kernel_projection.h
 	kernel_random.h
 	kernel_shader.h
 	kernel_shadow.h
 	kernel_subsurface.h
 	kernel_textures.h
-	kernel_triangle.h
 	kernel_types.h
 	kernel_volume.h
 )
@@ -118,6 +107,21 @@ set(SRC_SVM_HEADERS
 	svm/svm_wave.h
 )
 
+set(SRC_GEOM_HEADERS
+	geom/geom.h
+	geom/geom_attribute.h
+	geom/geom_bvh.h
+	geom/geom_bvh_subsurface.h
+	geom/geom_bvh_traversal.h
+	geom/geom_curve.h
+	geom/geom_motion_curve.h
+	geom/geom_motion_triangle.h
+	geom/geom_object.h
+	geom/geom_primitive.h
+	geom/geom_triangle.h
+	geom/geom_volume.h
+)
+
 set(SRC_UTIL_HEADERS
 	../util/util_color.h
 	../util/util_half.h
@@ -142,37 +146,45 @@ if(WITH_CYCLES_CUDA_BINARIES)
 	set(CUDA_VERSION "${CUDA_VERSION_MAJOR}${CUDA_VERSION_MINOR}")
 
 	# warn for other versions
-	if(CUDA_VERSION MATCHES "50")
+	if(CUDA_VERSION MATCHES "60")
 	else()
-		message(WARNING "CUDA version ${CUDA_VERSION_MAJOR}.${CUDA_VERSION_MINOR} detected, build may succeed but only CUDA 5.0 is officially supported")
+		message(WARNING
+			"CUDA version ${CUDA_VERSION_MAJOR}.${CUDA_VERSION_MINOR} detected, "
+			"build may succeed but only CUDA 6.0 is officially supported")
 	endif()
 
 	# build for each arch
-	set(cuda_sources kernel.cu ${SRC_HEADERS} ${SRC_SVM_HEADERS} ${SRC_CLOSURE_HEADERS} ${SRC_UTIL_HEADERS})
+	set(cuda_sources kernel.cu ${SRC_HEADERS} ${SRC_SVM_HEADERS} ${SRC_GEOM_HEADERS} ${SRC_CLOSURE_HEADERS} ${SRC_UTIL_HEADERS})
 	set(cuda_cubins)
 
 	foreach(arch ${CYCLES_CUDA_BINARIES_ARCH})
 		set(cuda_cubin kernel_${arch}.cubin)
 
 		set(cuda_version_flags "-D__KERNEL_CUDA_VERSION__=${CUDA_VERSION}")
-
-		# CUDA 5.x build flags for different archs
-		if(${arch} MATCHES "sm_2[0-9]")
-			# sm_2x
-			set(cuda_arch_flags "--maxrregcount=32")
-		elseif(${arch} MATCHES "sm_3[0-9]")
-			# sm_3x
-			set(cuda_arch_flags "--maxrregcount=32")
-		endif()
-
 		set(cuda_math_flags "--use_fast_math")
-		
-		if(CUDA_VERSION LESS 50 AND ${arch} MATCHES "sm_35")
+
+		if(CUDA_VERSION LESS 60 AND ${arch} MATCHES "sm_50")
+			message(WARNING "Can't build kernel for CUDA sm_50 architecture, skipping")
+		elseif(CUDA_VERSION LESS 50 AND ${arch} MATCHES "sm_35")
 			message(WARNING "Can't build kernel for CUDA sm_35 architecture, skipping")
 		else()
 			add_custom_command(
 				OUTPUT ${cuda_cubin}
-				COMMAND ${CUDA_NVCC_EXECUTABLE} -arch=${arch} -m${CUDA_BITS} --cubin ${CMAKE_CURRENT_SOURCE_DIR}/kernel.cu -o ${CMAKE_CURRENT_BINARY_DIR}/${cuda_cubin} --ptxas-options="-v" ${cuda_arch_flags} ${cuda_version_flags} ${cuda_math_flags} -I${CMAKE_CURRENT_SOURCE_DIR}/../util -I${CMAKE_CURRENT_SOURCE_DIR}/svm -DCCL_NAMESPACE_BEGIN= -DCCL_NAMESPACE_END= -DNVCC
+				COMMAND ${CUDA_NVCC_EXECUTABLE}
+				        -arch=${arch}
+				        -m${CUDA_BITS}
+				        --cubin ${CMAKE_CURRENT_SOURCE_DIR}/kernel.cu
+				        -o ${CMAKE_CURRENT_BINARY_DIR}/${cuda_cubin}
+				        --ptxas-options="-v"
+				        ${cuda_arch_flags}
+				        ${cuda_version_flags}
+				        ${cuda_math_flags}
+				        -I${CMAKE_CURRENT_SOURCE_DIR}/../util
+				        -I${CMAKE_CURRENT_SOURCE_DIR}/svm
+				        -DCCL_NAMESPACE_BEGIN=
+				        -DCCL_NAMESPACE_END=
+				        -DNVCC
+
 				DEPENDS ${cuda_sources})
 
 			delayed_install("${CMAKE_CURRENT_BINARY_DIR}" "${cuda_cubin}" ${CYCLES_INSTALL_PATH}/lib)
@@ -195,12 +207,22 @@ endif()
 include_directories(${INC})
 include_directories(SYSTEM ${INC_SYS})
 
-set_source_files_properties(kernel_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}")
-set_source_files_properties(kernel_sse3.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS}")
-set_source_files_properties(kernel_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}")
-set_source_files_properties(kernel_avx.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX_KERNEL_FLAGS}")
+if(CXX_HAS_SSE)
+	list(APPEND SRC
+		kernel_sse2.cpp
+		kernel_sse3.cpp
+		kernel_sse41.cpp
+		kernel_avx.cpp
+	)
+
+	set_source_files_properties(kernel_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}")
+	set_source_files_properties(kernel_sse3.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS}")
+	set_source_files_properties(kernel_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}")
+	set_source_files_properties(kernel_avx.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX_KERNEL_FLAGS}")
+endif()
+
 
-add_library(cycles_kernel ${SRC} ${SRC_HEADERS} ${SRC_CLOSURE_HEADERS} ${SRC_SVM_HEADERS})
+add_library(cycles_kernel ${SRC} ${SRC_HEADERS} ${SRC_CLOSURE_HEADERS} ${SRC_SVM_HEADERS} ${SRC_GEOM_HEADERS})
 
 if(WITH_CYCLES_CUDA)
 	add_dependencies(cycles_kernel cycles_kernel_cuda)
@@ -221,5 +243,6 @@ delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernel.cu" ${CYCLES_INSTALL_PATH}/k
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_CLOSURE_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel/closure)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_SVM_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel/svm)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_GEOM_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel/geom)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_UTIL_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel)
 
diff --git a/intern/cycles/kernel/SConscript b/intern/cycles/kernel/SConscript
index 5077d8c96b0..04e1bad7538 100644
--- a/intern/cycles/kernel/SConscript
+++ b/intern/cycles/kernel/SConscript
@@ -60,6 +60,7 @@ if env['WITH_BF_CYCLES_CUDA_BINARIES']:
     kernel_file = os.path.join(source_dir, "kernel.cu")
     util_dir = os.path.join(source_dir, "../util")
     svm_dir = os.path.join(source_dir, "../svm")
+    geom_dir = os.path.join(source_dir, "../geom")
     closure_dir = os.path.join(source_dir, "../closure")
 
     # get CUDA version
@@ -68,37 +69,33 @@ if env['WITH_BF_CYCLES_CUDA_BINARIES']:
     cuda_major_minor = re.findall(r'release (\d+).(\d+)', output)[0]
     cuda_version = int(cuda_major_minor[0])*10 + int(cuda_major_minor[1])
 
-    if cuda_version != 50:
-        print("CUDA version %d.%d detected, build may succeed but only CUDA 5.0 is officially supported." % (cuda_version/10, cuda_version%10))
+    if cuda_version != 60:
+        print("CUDA version %d.%d detected, build may succeed but only CUDA 6.0 is officially supported." % (cuda_version/10, cuda_version%10))
 
     # nvcc flags
     nvcc_flags = "-m%s" % (bits)
     nvcc_flags += " --cubin --ptxas-options=\"-v\""
     nvcc_flags += " -D__KERNEL_CUDA_VERSION__=%d" % (cuda_version)
     nvcc_flags += " -DCCL_NAMESPACE_BEGIN= -DCCL_NAMESPACE_END= -DNVCC"
-    nvcc_flags += " -I \"%s\" -I \"%s\" -I \"%s\"" % (util_dir, svm_dir, closure_dir)
+    nvcc_flags += " -I \"%s\" -I \"%s\" -I \"%s\" -I \"%s\"" % (util_dir, svm_dir, geom_dir, closure_dir)
 
     # dependencies
-    dependencies = ['kernel.cu'] + kernel.Glob('*.h') + kernel.Glob('../util/*.h') + kernel.Glob('svm/*.h') + kernel.Glob('closure/*.h')
+    dependencies = ['kernel.cu'] + kernel.Glob('*.h') + kernel.Glob('../util/*.h') + kernel.Glob('svm/*.h') + kernel.Glob('geom/*.h') + kernel.Glob('closure/*.h')
     last_cubin_file = None
 
     # add command for each cuda architecture
     for arch in cuda_archs:
-        cubin_file = os.path.join(build_dir, "kernel_%s.cubin" % arch)
+        if cuda_version < 60 and arch == "sm_50":
+            print("Can't build kernel for CUDA sm_50 architecture, skipping")
+            continue
 
-        # CUDA 5.x build flags for different archs
-        if arch.startswith("sm_2"):
-            # sm_2x
-            cuda_arch_flags = "--maxrregcount=32 --use_fast_math"
-        elif arch.startswith("sm_3"):
-            # sm_3x
-            cuda_arch_flags = "--maxrregcount=32 --use_fast_math"
+        cubin_file = os.path.join(build_dir, "kernel_%s.cubin" % arch)
 
         if env['BF_CYCLES_CUDA_ENV']:
             MS_SDK = "C:\\Program Files\\Microsoft SDKs\\Windows\\v7.1\\Bin\\SetEnv.cmd"
-            command = "\"%s\" & \"%s\" -arch=%s %s %s \"%s\" -o \"%s\"" % (MS_SDK, nvcc, arch, nvcc_flags, cuda_arch_flags, kernel_file, cubin_file)
+            command = "\"%s\" & \"%s\" -arch=%s %s \"%s\" -o \"%s\"" % (MS_SDK, nvcc, arch, nvcc_flags, kernel_file, cubin_file)
         else:
-            command = "\"%s\" -arch=%s %s %s \"%s\" -o \"%s\"" % (nvcc, arch, nvcc_flags, cuda_arch_flags, kernel_file, cubin_file)
+            command = "\"%s\" -arch=%s %s \"%s\" -o \"%s\"" % (nvcc, arch, nvcc_flags, kernel_file, cubin_file)
 
         kernel.Command(cubin_file, 'kernel.cu', command)
         kernel.Depends(cubin_file, dependencies)
diff --git a/intern/cycles/kernel/closure/bsdf_hair.h b/intern/cycles/kernel/closure/bsdf_hair.h
index 163e7cc5ee2..19cdb773255 100644
--- a/intern/cycles/kernel/closure/bsdf_hair.h
+++ b/intern/cycles/kernel/closure/bsdf_hair.h
@@ -84,7 +84,7 @@ ccl_device float3 bsdf_hair_reflection_eval_reflect(const ShaderClosure *sc, con
 	float theta_i = M_PI_2_F - safe_acosf(omega_in_z);
 	float cosphi_i = dot(omega_in_y, locy);
 
-	if(M_PI_2_F - fabsf(theta_i) < 0.001f || cosphi_i < 0.0f){
+	if(M_PI_2_F - fabsf(theta_i) < 0.001f || cosphi_i < 0.0f) {
 		*pdf = 0.0f;
 		return make_float3(*pdf, *pdf, *pdf);
 	}
@@ -99,7 +99,7 @@ ccl_device float3 bsdf_hair_reflection_eval_reflect(const ShaderClosure *sc, con
 	float theta_h = (theta_i + theta_r) * 0.5f;
 	float t = theta_h - offset;
 
-	float phi_pdf = cos(phi_i * 0.5f) * 0.25f / roughness2;
+	float phi_pdf = cosf(phi_i * 0.5f) * 0.25f / roughness2;
 	float theta_pdf = roughness1 / (2 * (t*t + roughness1*roughness1) * (a_R - b_R)* costheta_i);
 	*pdf = phi_pdf * theta_pdf;
 
@@ -140,7 +140,7 @@ ccl_device float3 bsdf_hair_transmission_eval_transmit(const ShaderClosure *sc,
 	float theta_i = M_PI_2_F - safe_acosf(omega_in_z);
 	float phi_i = safe_acosf(dot(omega_in_y, locy));
 
-	if(M_PI_2_F - fabsf(theta_i) < 0.001f){
+	if(M_PI_2_F - fabsf(theta_i) < 0.001f) {
 		*pdf = 0.0f;
 		return make_float3(*pdf, *pdf, *pdf);
 	}
@@ -191,7 +191,7 @@ ccl_device int bsdf_hair_reflection_sample(const ShaderClosure *sc, float3 Ng, f
 
 	float phi = 2 * safe_asinf(1 - 2 * randv) * roughness2;
 
-	float phi_pdf = cos(phi * 0.5f) * 0.25f / roughness2;
+	float phi_pdf = cosf(phi * 0.5f) * 0.25f / roughness2;
 
 	float theta_pdf = roughness1 / (2 * (t*t + roughness1*roughness1) * (a_R - b_R)*costheta_i);
 
@@ -251,8 +251,8 @@ ccl_device int bsdf_hair_transmission_sample(const ShaderClosure *sc, float3 Ng,
 	float phi_pdf = roughness2 / (c_TT * (p * p + roughness2 * roughness2));
 
 	*omega_in =(cosf(phi) * costheta_i) * locy -
-			   (sinf(phi) * costheta_i) * locx +
-			   (            sintheta_i) * Tg;
+	           (sinf(phi) * costheta_i) * locx +
+	           (            sintheta_i) * Tg;
 
 	//differentials - TODO: find a better approximation for the transmission bounce
 #ifdef __RAY_DIFFERENTIALS__
@@ -261,7 +261,7 @@ ccl_device int bsdf_hair_transmission_sample(const ShaderClosure *sc, float3 Ng,
 #endif
 
 	*pdf = fabsf(phi_pdf * theta_pdf);
-	if(M_PI_2_F - fabsf(theta_i) < 0.001f){
+	if(M_PI_2_F - fabsf(theta_i) < 0.001f) {
 		*pdf = 0.0f;
 	}
 
diff --git a/intern/cycles/kernel/closure/bsdf_microfacet.h b/intern/cycles/kernel/closure/bsdf_microfacet.h
index dfa8886c113..1ec35e444fe 100644
--- a/intern/cycles/kernel/closure/bsdf_microfacet.h
+++ b/intern/cycles/kernel/closure/bsdf_microfacet.h
@@ -154,8 +154,8 @@ ccl_device int bsdf_microfacet_ggx_sample(const ShaderClosure *sc, float3 Ng, fl
 		float sinThetaM  = cosThetaM * safe_sqrtf(tanThetaM2);
 		float phiM = M_2PI_F * randv;
 		float3 m = (cosf(phiM) * sinThetaM) * X +
-				 (sinf(phiM) * sinThetaM) * Y +
-							   cosThetaM  * Z;
+		           (sinf(phiM) * sinThetaM) * Y +
+		           (             cosThetaM) * Z;
 		if(!m_refractive) {
 			float cosMO = dot(m, I);
 			if(cosMO > 0) {
@@ -383,8 +383,8 @@ ccl_device int bsdf_microfacet_beckmann_sample(const ShaderClosure *sc, float3 N
 		float sinThetaM = cosThetaM * tanThetaM;
 		float phiM = M_2PI_F * randv;
 		float3 m = (cosf(phiM) * sinThetaM) * X +
-				 (sinf(phiM) * sinThetaM) * Y +
-							   cosThetaM  * Z;
+		           (sinf(phiM) * sinThetaM) * Y +
+		           (             cosThetaM) * Z;
 
 		if(!m_refractive) {
 			float cosMO = dot(m, I);
diff --git a/intern/cycles/kernel/closure/bsdf_phong_ramp.h b/intern/cycles/kernel/closure/bsdf_phong_ramp.h
index 219c5aea159..2b4e1c68640 100644
--- a/intern/cycles/kernel/closure/bsdf_phong_ramp.h
+++ b/intern/cycles/kernel/closure/bsdf_phong_ramp.h
@@ -109,8 +109,8 @@ ccl_device int bsdf_phong_ramp_sample(const ShaderClosure *sc, const float3 colo
 		float sinTheta2 = 1 - cosTheta * cosTheta;
 		float sinTheta = sinTheta2 > 0 ? sqrtf(sinTheta2) : 0;
 		*omega_in = (cosf(phi) * sinTheta) * T +
-				   (sinf(phi) * sinTheta) * B +
-				   (            cosTheta) * R;
+		            (sinf(phi) * sinTheta) * B +
+		            (            cosTheta) * R;
 		if (dot(Ng, *omega_in) > 0.0f)
 		{
 			// common terms for pdf and eval
diff --git a/intern/cycles/kernel/closure/bsdf_util.h b/intern/cycles/kernel/closure/bsdf_util.h
index f6dceb3ca82..b3dcb9dcc38 100644
--- a/intern/cycles/kernel/closure/bsdf_util.h
+++ b/intern/cycles/kernel/closure/bsdf_util.h
@@ -35,14 +35,15 @@
 
 CCL_NAMESPACE_BEGIN
 
-ccl_device float fresnel_dielectric(float eta, const float3 N,
-		const float3 I, float3 *R, float3 *T,
+ccl_device float fresnel_dielectric(
+        float eta, const float3 N,
+        const float3 I, float3 *R, float3 *T,
 #ifdef __RAY_DIFFERENTIALS__
-		const float3 dIdx, const float3 dIdy,
-		float3 *dRdx, float3 *dRdy,
-		float3 *dTdx, float3 *dTdy, 
+        const float3 dIdx, const float3 dIdy,
+        float3 *dRdx, float3 *dRdy,
+        float3 *dTdx, float3 *dTdy,
 #endif
-		bool *is_inside)
+        bool *is_inside)
 {
 	float cos = dot(N, I), neta;
 	float3 Nn;
diff --git a/intern/cycles/kernel/closure/bsdf_westin.h b/intern/cycles/kernel/closure/bsdf_westin.h
index ca4c05e91fe..9dc1c00bb3d 100644
--- a/intern/cycles/kernel/closure/bsdf_westin.h
+++ b/intern/cycles/kernel/closure/bsdf_westin.h
@@ -96,10 +96,9 @@ ccl_device int bsdf_westin_backscatter_sample(const ShaderClosure *sc, float3 Ng
 		float sinTheta2 = 1 - cosTheta * cosTheta;
 		float sinTheta = sinTheta2 > 0 ? sqrtf(sinTheta2) : 0;
 		*omega_in = (cosf(phi) * sinTheta) * T +
-				   (sinf(phi) * sinTheta) * B +
-				   (cosTheta) * I;
-		if(dot(Ng, *omega_in) > 0)
-		{
+		            (sinf(phi) * sinTheta) * B +
+		            (cosTheta) * I;
+		if(dot(Ng, *omega_in) > 0) {
 			// common terms for pdf and eval
 			float cosNI = dot(N, *omega_in);
 			// make sure the direction we chose is still in the right hemisphere
diff --git a/intern/cycles/kernel/geom/geom.h b/intern/cycles/kernel/geom/geom.h
new file mode 100644
index 00000000000..9495a2541f9
--- /dev/null
+++ b/intern/cycles/kernel/geom/geom.h
@@ -0,0 +1,44 @@
+/*
+ * Adapted from code Copyright 2009-2010 NVIDIA Corporation
+ * Modifications Copyright 2011, Blender Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* bottom-most stack entry, indicating the end of traversal */
+#define ENTRYPOINT_SENTINEL 0x76543210
+
+/* 64 object BVH + 64 mesh BVH + 64 object node splitting */
+#define BVH_STACK_SIZE 192
+#define BVH_NODE_SIZE 4
+#define TRI_NODE_SIZE 3
+
+/* silly workaround for float extended precision that happens when compiling
+ * without sse support on x86, it results in different results for float ops
+ * that you would otherwise expect to compare correctly */
+#if !defined(__i386__) || defined(__SSE__)
+#define NO_EXTENDED_PRECISION
+#else
+#define NO_EXTENDED_PRECISION volatile
+#endif
+
+#include "geom_attribute.h"
+#include "geom_object.h"
+#include "geom_triangle.h"
+#include "geom_motion_triangle.h"
+#include "geom_motion_curve.h"
+#include "geom_curve.h"
+#include "geom_volume.h"
+#include "geom_primitive.h"
+#include "geom_bvh.h"
+
diff --git a/intern/cycles/kernel/geom/geom_attribute.h b/intern/cycles/kernel/geom/geom_attribute.h
new file mode 100644
index 00000000000..63ce31c492f
--- /dev/null
+++ b/intern/cycles/kernel/geom/geom_attribute.h
@@ -0,0 +1,71 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License
+ */
+
+CCL_NAMESPACE_BEGIN
+
+/* Attributes
+ *
+ * We support an arbitrary number of attributes on various mesh elements.
+ * On vertices, triangles, curve keys, curves, meshes and volume grids.
+ * Most of the code for attribute reading is in the primitive files.
+ *
+ * Lookup of attributes is different between OSL and SVM, as OSL is ustring
+ * based while for SVM we use integer ids. */
+
+/* Find attribute based on ID */
+
+ccl_device_inline int find_attribute(KernelGlobals *kg, const ShaderData *sd, uint id, AttributeElement *elem)
+{
+	if(sd->object == OBJECT_NONE) /* no object on the shader data: nothing to look up, *elem is not written */
+		return (int)ATTR_STD_NOT_FOUND;
+
+	/* for SVM, find attribute by unique id: start at this object's slice of the attribute map */
+	uint attr_offset = sd->object*kernel_data.bvh.attributes_map_stride;
+#ifdef __HAIR__
+	attr_offset = (sd->type & PRIMITIVE_ALL_CURVE)? attr_offset + ATTR_PRIM_CURVE: attr_offset;
+#endif
+	uint4 attr_map = kernel_tex_fetch(__attributes_map, attr_offset);
+	
+	while(attr_map.x != id) { /* NOTE(review): no explicit bound; presumably every requested id has a map entry - confirm */
+		attr_offset += ATTR_PRIM_TYPES;
+		attr_map = kernel_tex_fetch(__attributes_map, attr_offset);
+	}
+
+	*elem = (AttributeElement)attr_map.y; /* element type is reported even when the lookup fails below */
+	
+	if(sd->prim == PRIM_NONE && (AttributeElement)attr_map.y != ATTR_ELEMENT_MESH)
+		return (int)ATTR_STD_NOT_FOUND;
+
+	/* return result: attr_map.z for a real element, not-found for ATTR_ELEMENT_NONE */
+	return (attr_map.y == ATTR_ELEMENT_NONE) ? (int)ATTR_STD_NOT_FOUND : (int)attr_map.z;
+}
+
+/* Transform matrix attribute on meshes */
+
+ccl_device Transform primitive_attribute_matrix(KernelGlobals *kg, const ShaderData *sd, int offset)
+{
+	Transform tfm;
+
+	tfm.x = kernel_tex_fetch(__attributes_float3, offset + 0);
+	tfm.y = kernel_tex_fetch(__attributes_float3, offset + 1);
+	tfm.z = kernel_tex_fetch(__attributes_float3, offset + 2);
+	tfm.w = kernel_tex_fetch(__attributes_float3, offset + 3);
+
+	return tfm;
+}
+
+CCL_NAMESPACE_END
+
diff --git a/intern/cycles/kernel/geom/geom_bvh.h b/intern/cycles/kernel/geom/geom_bvh.h
new file mode 100644
index 00000000000..dd7c25d581d
--- /dev/null
+++ b/intern/cycles/kernel/geom/geom_bvh.h
@@ -0,0 +1,318 @@
+/*
+ * Adapted from code Copyright 2009-2010 NVIDIA Corporation
+ * Modifications Copyright 2011, Blender Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* BVH
+ *
+ * Bounding volume hierarchy for ray tracing. We compile different variations
+ * of the same BVH traversal function for faster rendering when some types of
+ * primitives are not needed, using #includes to work around the lack of
+ * C++ templates in OpenCL.
+ *
+ * Originally based on "Understanding the Efficiency of Ray Traversal on GPUs",
+ * the code has been extended and modified to support more primitives and work
+ * with CPU/CUDA/OpenCL. */
+
+CCL_NAMESPACE_BEGIN
+
+/* BVH intersection function variations */
+
+#define BVH_INSTANCING			1
+#define BVH_MOTION				2
+#define BVH_HAIR				4
+#define BVH_HAIR_MINIMUM_WIDTH	8
+
+#define BVH_FUNCTION_NAME bvh_intersect
+#define BVH_FUNCTION_FEATURES 0
+#include "geom_bvh_traversal.h"
+
+#if defined(__INSTANCING__)
+#define BVH_FUNCTION_NAME bvh_intersect_instancing
+#define BVH_FUNCTION_FEATURES BVH_INSTANCING
+#include "geom_bvh_traversal.h"
+#endif
+
+#if defined(__HAIR__)
+#define BVH_FUNCTION_NAME bvh_intersect_hair
+#define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR|BVH_HAIR_MINIMUM_WIDTH
+#include "geom_bvh_traversal.h"
+#endif
+
+#if defined(__OBJECT_MOTION__)
+#define BVH_FUNCTION_NAME bvh_intersect_motion
+#define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_MOTION
+#include "geom_bvh_traversal.h"
+#endif
+
+#if defined(__HAIR__) && defined(__OBJECT_MOTION__)
+#define BVH_FUNCTION_NAME bvh_intersect_hair_motion
+#define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR|BVH_HAIR_MINIMUM_WIDTH|BVH_MOTION
+#include "geom_bvh_traversal.h"
+#endif
+
+#if defined(__SUBSURFACE__)
+#define BVH_FUNCTION_NAME bvh_intersect_subsurface
+#define BVH_FUNCTION_FEATURES 0
+#include "geom_bvh_subsurface.h"
+#endif
+
+#if defined(__SUBSURFACE__) && defined(__INSTANCING__)
+#define BVH_FUNCTION_NAME bvh_intersect_subsurface_instancing
+#define BVH_FUNCTION_FEATURES BVH_INSTANCING
+#include "geom_bvh_subsurface.h"
+#endif
+
+#if defined(__SUBSURFACE__) && defined(__HAIR__)
+#define BVH_FUNCTION_NAME bvh_intersect_subsurface_hair
+#define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR
+#include "geom_bvh_subsurface.h"
+#endif
+
+#if defined(__SUBSURFACE__) && defined(__OBJECT_MOTION__)
+#define BVH_FUNCTION_NAME bvh_intersect_subsurface_motion
+#define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_MOTION
+#include "geom_bvh_subsurface.h"
+#endif
+
+#if defined(__SUBSURFACE__) && defined(__HAIR__) && defined(__OBJECT_MOTION__)
+#define BVH_FUNCTION_NAME bvh_intersect_subsurface_hair_motion
+#define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR|BVH_MOTION
+#include "geom_bvh_subsurface.h"
+#endif
+
+#if defined(__SHADOW_RECORD_ALL__)
+#define BVH_FUNCTION_NAME bvh_intersect_shadow_all
+#define BVH_FUNCTION_FEATURES 0
+#include "geom_bvh_shadow.h"
+#endif
+
+#if defined(__SHADOW_RECORD_ALL__) && defined(__INSTANCING__)
+#define BVH_FUNCTION_NAME bvh_intersect_shadow_all_instancing
+#define BVH_FUNCTION_FEATURES BVH_INSTANCING
+#include "geom_bvh_shadow.h"
+#endif
+
+#if defined(__SHADOW_RECORD_ALL__) && defined(__HAIR__)
+#define BVH_FUNCTION_NAME bvh_intersect_shadow_all_hair
+#define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR
+#include "geom_bvh_shadow.h"
+#endif
+
+#if defined(__SHADOW_RECORD_ALL__) && defined(__OBJECT_MOTION__)
+#define BVH_FUNCTION_NAME bvh_intersect_shadow_all_motion
+#define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_MOTION
+#include "geom_bvh_shadow.h"
+#endif
+
+#if defined(__SHADOW_RECORD_ALL__) && defined(__HAIR__) && defined(__OBJECT_MOTION__)
+#define BVH_FUNCTION_NAME bvh_intersect_shadow_all_hair_motion
+#define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR|BVH_MOTION
+#include "geom_bvh_shadow.h"
+#endif
+
+/* to work around titan bug when using arrays instead of textures */
+#if !defined(__KERNEL_CUDA__) || defined(__KERNEL_CUDA_TEX_STORAGE__)
+ccl_device_inline
+#else
+ccl_device_noinline
+#endif
+#ifdef __HAIR__ 
+bool scene_intersect(KernelGlobals *kg, const Ray *ray, const uint visibility, Intersection *isect, uint *lcg_state, float difl, float extmax)
+#else
+bool scene_intersect(KernelGlobals *kg, const Ray *ray, const uint visibility, Intersection *isect)
+#endif
+{
+#ifdef __OBJECT_MOTION__
+	if(kernel_data.bvh.have_motion) {
+#ifdef __HAIR__
+		if(kernel_data.bvh.have_curves)
+			return bvh_intersect_hair_motion(kg, ray, isect, visibility, lcg_state, difl, extmax);
+#endif /* __HAIR__ */
+
+		return bvh_intersect_motion(kg, ray, isect, visibility);
+	}
+#endif /* __OBJECT_MOTION__ */
+
+#ifdef __HAIR__ 
+	if(kernel_data.bvh.have_curves)
+		return bvh_intersect_hair(kg, ray, isect, visibility, lcg_state, difl, extmax);
+#endif /* __HAIR__ */
+
+#ifdef __KERNEL_CPU__
+
+#ifdef __INSTANCING__
+	if(kernel_data.bvh.have_instancing)
+		return bvh_intersect_instancing(kg, ray, isect, visibility);
+#endif /* __INSTANCING__ */
+
+	return bvh_intersect(kg, ray, isect, visibility);
+#else /* __KERNEL_CPU__ */
+
+#ifdef __INSTANCING__
+	return bvh_intersect_instancing(kg, ray, isect, visibility);
+#else
+	return bvh_intersect(kg, ray, isect, visibility);
+#endif /* __INSTANCING__ */
+
+#endif /* __KERNEL_CPU__ */
+}
+
+/* to work around titan bug when using arrays instead of textures */
+#ifdef __SUBSURFACE__
+#if !defined(__KERNEL_CUDA__) || defined(__KERNEL_CUDA_TEX_STORAGE__)
+ccl_device_inline
+#else
+ccl_device_noinline
+#endif
+uint scene_intersect_subsurface(KernelGlobals *kg, const Ray *ray, Intersection *isect, int subsurface_object, uint *lcg_state, int max_hits)
+{
+#ifdef __OBJECT_MOTION__
+	if(kernel_data.bvh.have_motion) {
+#ifdef __HAIR__
+		if(kernel_data.bvh.have_curves)
+			return bvh_intersect_subsurface_hair_motion(kg, ray, isect, subsurface_object, lcg_state, max_hits);
+#endif /* __HAIR__ */
+
+		return bvh_intersect_subsurface_motion(kg, ray, isect, subsurface_object, lcg_state, max_hits);
+	}
+#endif /* __OBJECT_MOTION__ */
+
+#ifdef __HAIR__ 
+	if(kernel_data.bvh.have_curves)
+		return bvh_intersect_subsurface_hair(kg, ray, isect, subsurface_object, lcg_state, max_hits);
+#endif /* __HAIR__ */
+
+#ifdef __KERNEL_CPU__
+
+#ifdef __INSTANCING__
+	if(kernel_data.bvh.have_instancing)
+		return bvh_intersect_subsurface_instancing(kg, ray, isect, subsurface_object, lcg_state, max_hits);
+#endif /* __INSTANCING__ */
+
+	return bvh_intersect_subsurface(kg, ray, isect, subsurface_object, lcg_state, max_hits);
+#else /* __KERNEL_CPU__ */
+
+#ifdef __INSTANCING__
+	return bvh_intersect_subsurface_instancing(kg, ray, isect, subsurface_object, lcg_state, max_hits);
+#else
+	return bvh_intersect_subsurface(kg, ray, isect, subsurface_object, lcg_state, max_hits);
+#endif /* __INSTANCING__ */
+
+#endif /* __KERNEL_CPU__ */
+}
+#endif
+
+/* to work around titan bug when using arrays instead of textures */
+#ifdef __SHADOW_RECORD_ALL__
+#if !defined(__KERNEL_CUDA__) || defined(__KERNEL_CUDA_TEX_STORAGE__)
+ccl_device_inline
+#else
+ccl_device_noinline
+#endif
+uint scene_intersect_shadow_all(KernelGlobals *kg, const Ray *ray, Intersection *isect, uint max_hits, uint *num_hits)
+{
+#ifdef __OBJECT_MOTION__
+	if(kernel_data.bvh.have_motion) {
+#ifdef __HAIR__
+		if(kernel_data.bvh.have_curves)
+			return bvh_intersect_shadow_all_hair_motion(kg, ray, isect, max_hits, num_hits);
+#endif /* __HAIR__ */
+
+		return bvh_intersect_shadow_all_motion(kg, ray, isect, max_hits, num_hits);
+	}
+#endif /* __OBJECT_MOTION__ */
+
+#ifdef __HAIR__ 
+	if(kernel_data.bvh.have_curves)
+		return bvh_intersect_shadow_all_hair(kg, ray, isect, max_hits, num_hits);
+#endif /* __HAIR__ */
+
+#ifdef __KERNEL_CPU__
+
+#ifdef __INSTANCING__
+	if(kernel_data.bvh.have_instancing)
+		return bvh_intersect_shadow_all_instancing(kg, ray, isect, max_hits, num_hits);
+#endif /* __INSTANCING__ */
+
+	return bvh_intersect_shadow_all(kg, ray, isect, max_hits, num_hits);
+#else /* __KERNEL_CPU__ */
+
+#ifdef __INSTANCING__
+	return bvh_intersect_shadow_all_instancing(kg, ray, isect, max_hits, num_hits);
+#else
+	return bvh_intersect_shadow_all(kg, ray, isect, max_hits, num_hits);
+#endif /* __INSTANCING__ */
+
+#endif /* __KERNEL_CPU__ */
+}
+#endif
+
+
+/* Ray offset to avoid self intersection.
+ *
+ * This function should be used to compute a modified ray start position for
+ * rays leaving from a surface. */
+
+ccl_device_inline float3 ray_offset(float3 P, float3 Ng)
+{
+#ifdef __INTERSECTION_REFINE__
+	const float epsilon_f = 1e-5f;
+	/* ideally this should match epsilon_f, but instancing and motion blur
+	 * precision makes it problematic */
+	const float epsilon_test = 1.0f;
+	const int epsilon_i = 32;
+
+	float3 res;
+
+	/* x component */
+	if(fabsf(P.x) < epsilon_test) {
+		res.x = P.x + Ng.x*epsilon_f;
+	}
+	else {
+		uint ix = __float_as_uint(P.x);
+		ix += ((ix ^ __float_as_uint(Ng.x)) >> 31)? -epsilon_i: epsilon_i;
+		res.x = __uint_as_float(ix);
+	}
+
+	/* y component */
+	if(fabsf(P.y) < epsilon_test) {
+		res.y = P.y + Ng.y*epsilon_f;
+	}
+	else {
+		uint iy = __float_as_uint(P.y);
+		iy += ((iy ^ __float_as_uint(Ng.y)) >> 31)? -epsilon_i: epsilon_i;
+		res.y = __uint_as_float(iy);
+	}
+
+	/* z component */
+	if(fabsf(P.z) < epsilon_test) {
+		res.z = P.z + Ng.z*epsilon_f;
+	}
+	else {
+		uint iz = __float_as_uint(P.z);
+		iz += ((iz ^ __float_as_uint(Ng.z)) >> 31)? -epsilon_i: epsilon_i;
+		res.z = __uint_as_float(iz);
+	}
+
+	return res;
+#else
+	const float epsilon_f = 1e-4f;
+	return P + epsilon_f*Ng;
+#endif
+}
+
+CCL_NAMESPACE_END
+
diff --git a/intern/cycles/kernel/geom/geom_bvh_shadow.h b/intern/cycles/kernel/geom/geom_bvh_shadow.h
new file mode 100644
index 00000000000..98bf82b3b2d
--- /dev/null
+++ b/intern/cycles/kernel/geom/geom_bvh_shadow.h
@@ -0,0 +1,375 @@
+/*
+ * Adapted from code Copyright 2009-2010 NVIDIA Corporation,
+ * and code copyright 2009-2012 Intel Corporation
+ *
+ * Modifications Copyright 2011-2013, Blender Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* This is a template BVH traversal function, where various features can be
+ * enabled/disabled. This way we can compile optimized versions for each case
+ * without new features slowing things down.
+ *
+ * BVH_INSTANCING: object instancing
+ * BVH_HAIR: hair curve rendering
+ * BVH_MOTION: motion blur rendering
+ *
+ */
+
+#define FEATURE(f) (((BVH_FUNCTION_FEATURES) & (f)) != 0)
+
+ccl_device bool BVH_FUNCTION_NAME
+(KernelGlobals *kg, const Ray *ray, Intersection *isect_array, const uint max_hits, uint *num_hits)
+{
+	/* Records the transparent-shadow hits along the ray in isect_array and counts
+	 * them in *num_hits. Returns true as soon as light is fully blocked: on a hit
+	 * without transparent shadow, or on a hit after max_hits were already recorded.
+	 * todo: likely and unlikely for if() statements, test restrict attribute for pointers */
+	
+	/* traversal stack in CUDA thread-local memory */
+	int traversalStack[BVH_STACK_SIZE];
+	traversalStack[0] = ENTRYPOINT_SENTINEL;
+
+	/* traversal variables in registers */
+	int stackPtr = 0;
+	int nodeAddr = kernel_data.bvh.root;
+
+	/* ray parameters in registers */
+	const float tmax = ray->t;
+	float3 P = ray->P;
+	float3 dir = bvh_clamp_direction(ray->D);
+	float3 idir = bvh_inverse_direction(dir);
+	int object = OBJECT_NONE;
+	float isect_t = tmax;
+
+#if FEATURE(BVH_MOTION)
+	Transform ob_tfm;
+#endif
+
+#if FEATURE(BVH_INSTANCING)
+	int num_hits_in_instance = 0;
+#endif
+
+	*num_hits = 0;
+	isect_array->t = tmax;
+
+#if defined(__KERNEL_SSE2__)
+	const shuffle_swap_t shuf_identity = shuffle_swap_identity();
+	const shuffle_swap_t shuf_swap = shuffle_swap_swap();
+	
+	const __m128 pn = _mm_castsi128_ps(_mm_set_epi32(0x80000000, 0x80000000, 0, 0));
+	__m128 Psplat[3], idirsplat[3];
+	shuffle_swap_t shufflexyz[3];
+
+	Psplat[0] = _mm_set_ps1(P.x);
+	Psplat[1] = _mm_set_ps1(P.y);
+	Psplat[2] = _mm_set_ps1(P.z);
+
+	__m128 tsplat = _mm_set_ps(-isect_t, -isect_t, 0.0f, 0.0f);
+
+	gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
+#endif
+
+	/* traversal loop */
+	do {
+		do {
+			/* traverse internal nodes */
+			while(nodeAddr >= 0 && nodeAddr != ENTRYPOINT_SENTINEL) {
+				bool traverseChild0, traverseChild1;
+				int nodeAddrChild1;
+
+#if !defined(__KERNEL_SSE2__)
+				/* Intersect two child bounding boxes, non-SSE version */
+				float t = isect_t;
+
+				/* fetch node data */
+				float4 node0 = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+0);
+				float4 node1 = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+1);
+				float4 node2 = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+2);
+				float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+3);
+
+				/* intersect ray against child nodes */
+				NO_EXTENDED_PRECISION float c0lox = (node0.x - P.x) * idir.x;
+				NO_EXTENDED_PRECISION float c0hix = (node0.z - P.x) * idir.x;
+				NO_EXTENDED_PRECISION float c0loy = (node1.x - P.y) * idir.y;
+				NO_EXTENDED_PRECISION float c0hiy = (node1.z - P.y) * idir.y;
+				NO_EXTENDED_PRECISION float c0loz = (node2.x - P.z) * idir.z;
+				NO_EXTENDED_PRECISION float c0hiz = (node2.z - P.z) * idir.z;
+				NO_EXTENDED_PRECISION float c0min = max4(min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz), 0.0f);
+				NO_EXTENDED_PRECISION float c0max = min4(max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz), t);
+
+				NO_EXTENDED_PRECISION float c1lox = (node0.y - P.x) * idir.x;
+				NO_EXTENDED_PRECISION float c1hix = (node0.w - P.x) * idir.x;
+				NO_EXTENDED_PRECISION float c1loy = (node1.y - P.y) * idir.y;
+				NO_EXTENDED_PRECISION float c1hiy = (node1.w - P.y) * idir.y;
+				NO_EXTENDED_PRECISION float c1loz = (node2.y - P.z) * idir.z;
+				NO_EXTENDED_PRECISION float c1hiz = (node2.w - P.z) * idir.z;
+				NO_EXTENDED_PRECISION float c1min = max4(min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz), 0.0f);
+				NO_EXTENDED_PRECISION float c1max = min4(max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz), t);
+
+				/* decide which nodes to traverse next */
+#ifdef __VISIBILITY_FLAG__
+				/* this visibility test gives a 5% performance hit, how to solve? */
+				traverseChild0 = (c0max >= c0min) && (__float_as_uint(cnodes.z) & PATH_RAY_SHADOW);
+				traverseChild1 = (c1max >= c1min) && (__float_as_uint(cnodes.w) & PATH_RAY_SHADOW);
+#else
+				traverseChild0 = (c0max >= c0min);
+				traverseChild1 = (c1max >= c1min);
+#endif
+
+#else // __KERNEL_SSE2__
+				/* Intersect two child bounding boxes, SSE3 version adapted from Embree */
+
+				/* fetch node data */
+				const __m128 *bvh_nodes = (__m128*)kg->__bvh_nodes.data + nodeAddr*BVH_NODE_SIZE;
+				const float4 cnodes = ((float4*)bvh_nodes)[3];
+
+				/* intersect ray against child nodes */
+				const __m128 tminmaxx = _mm_mul_ps(_mm_sub_ps(shuffle_swap(bvh_nodes[0], shufflexyz[0]), Psplat[0]), idirsplat[0]);
+				const __m128 tminmaxy = _mm_mul_ps(_mm_sub_ps(shuffle_swap(bvh_nodes[1], shufflexyz[1]), Psplat[1]), idirsplat[1]);
+				const __m128 tminmaxz = _mm_mul_ps(_mm_sub_ps(shuffle_swap(bvh_nodes[2], shufflexyz[2]), Psplat[2]), idirsplat[2]);
+
+				/* calculate { c0min, c1min, -c0max, -c1max} */
+				__m128 minmax = _mm_max_ps(_mm_max_ps(tminmaxx, tminmaxy), _mm_max_ps(tminmaxz, tsplat));
+				const __m128 tminmax = _mm_xor_ps(minmax, pn);
+				const __m128 lrhit = _mm_cmple_ps(tminmax, shuffle<2, 3, 0, 1>(tminmax));
+
+				/* decide which nodes to traverse next */
+#ifdef __VISIBILITY_FLAG__
+				/* this visibility test gives a 5% performance hit, how to solve? */
+				traverseChild0 = (_mm_movemask_ps(lrhit) & 1) && (__float_as_uint(cnodes.z) & PATH_RAY_SHADOW);
+				traverseChild1 = (_mm_movemask_ps(lrhit) & 2) && (__float_as_uint(cnodes.w) & PATH_RAY_SHADOW);
+#else
+				traverseChild0 = (_mm_movemask_ps(lrhit) & 1);
+				traverseChild1 = (_mm_movemask_ps(lrhit) & 2);
+#endif
+#endif // __KERNEL_SSE2__
+
+				nodeAddr = __float_as_int(cnodes.x);
+				nodeAddrChild1 = __float_as_int(cnodes.y);
+
+				if(traverseChild0 && traverseChild1) {
+					/* both children were intersected, push the farther one */
+#if !defined(__KERNEL_SSE2__)
+					bool closestChild1 = (c1min < c0min);
+#else
+					union { __m128 m128; float v[4]; } uminmax;
+					uminmax.m128 = tminmax;
+					bool closestChild1 = uminmax.v[1] < uminmax.v[0];
+#endif
+
+					if(closestChild1) {
+						int tmp = nodeAddr;
+						nodeAddr = nodeAddrChild1;
+						nodeAddrChild1 = tmp;
+					}
+
+					++stackPtr;
+					traversalStack[stackPtr] = nodeAddrChild1;
+				}
+				else {
+					/* one child was intersected */
+					if(traverseChild1) {
+						nodeAddr = nodeAddrChild1;
+					}
+					else if(!traverseChild0) {
+						/* neither child was intersected */
+						nodeAddr = traversalStack[stackPtr];
+						--stackPtr;
+					}
+				}
+			}
+
+			/* if node is leaf, fetch triangle list */
+			if(nodeAddr < 0) {
+				float4 leaf = kernel_tex_fetch(__bvh_nodes, (-nodeAddr-1)*BVH_NODE_SIZE+(BVH_NODE_SIZE-1));
+				int primAddr = __float_as_int(leaf.x);
+
+#if FEATURE(BVH_INSTANCING)
+				if(primAddr >= 0) {
+#endif
+					int primAddr2 = __float_as_int(leaf.y);
+
+					/* pop */
+					nodeAddr = traversalStack[stackPtr];
+					--stackPtr;
+
+					/* primitive intersection */
+					while(primAddr < primAddr2) {
+						bool hit;
+						uint type = kernel_tex_fetch(__prim_type, primAddr);
+
+						/* todo: specialized intersect functions which don't fill in
+						 * isect unless needed and check SD_HAS_TRANSPARENT_SHADOW?
+						 * might give a few % performance improvement */
+
+						switch(type & PRIMITIVE_ALL) {
+							case PRIMITIVE_TRIANGLE: {
+								hit = triangle_intersect(kg, isect_array, P, dir, PATH_RAY_SHADOW, object, primAddr);
+								break;
+							}
+#if FEATURE(BVH_MOTION)
+							case PRIMITIVE_MOTION_TRIANGLE: {
+								hit = motion_triangle_intersect(kg, isect_array, P, dir, ray->time, PATH_RAY_SHADOW, object, primAddr);
+								break;
+							}
+#endif
+#if FEATURE(BVH_HAIR)
+							case PRIMITIVE_CURVE:
+							case PRIMITIVE_MOTION_CURVE: {
+								if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) 
+									hit = bvh_cardinal_curve_intersect(kg, isect_array, P, dir, PATH_RAY_SHADOW, object, primAddr, ray->time, type, NULL, 0, 0);
+								else
+									hit = bvh_curve_intersect(kg, isect_array, P, dir, PATH_RAY_SHADOW, object, primAddr, ray->time, type, NULL, 0, 0);
+								break;
+							}
+#endif
+							default: {
+								hit = false;
+								break;
+							}
+						}
+
+						/* shadow ray early termination */
+						if(hit) {
+							/* detect if this surface has a shader with transparent shadows */
+
+							/* todo: optimize so primitive visibility flag indicates if
+							 * the primitive has a transparent shadow shader? */
+							int prim = kernel_tex_fetch(__prim_index, isect_array->prim);
+							int shader = 0;
+
+#ifdef __HAIR__
+							if(kernel_tex_fetch(__prim_type, isect_array->prim) & PRIMITIVE_ALL_TRIANGLE)
+#endif
+							{
+								float4 Ns = kernel_tex_fetch(__tri_normal, prim);
+								shader = __float_as_int(Ns.w);
+							}
+#ifdef __HAIR__
+							else {
+								float4 str = kernel_tex_fetch(__curves, prim);
+								shader = __float_as_int(str.z);
+							}
+#endif
+							int flag = kernel_tex_fetch(__shader_flag, (shader & SHADER_MASK)*2);
+
+							/* if no transparent shadows, all light is blocked */
+							if(!(flag & SD_HAS_TRANSPARENT_SHADOW)) {
+								return true;
+							}
+							/* if maximum number of hits reached, block all light */
+							else if(*num_hits == max_hits) {
+								return true;
+							}
+
+							/* move on to next entry in intersections array */
+							isect_array++;
+							(*num_hits)++;
+#if FEATURE(BVH_INSTANCING)
+							num_hits_in_instance++;
+#endif
+
+							isect_array->t = isect_t;
+						}
+
+						primAddr++;
+					}
+				}
+#if FEATURE(BVH_INSTANCING)
+				else {
+					/* instance push */
+					object = kernel_tex_fetch(__prim_object, -primAddr-1);
+
+#if FEATURE(BVH_MOTION)
+					bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect_t, &ob_tfm);
+#else
+					bvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect_t);
+#endif
+
+					num_hits_in_instance = 0;
+					/* keep the current entry's limit in sync with isect_t on every build, not only SSE2 */
+					isect_array->t = isect_t;
+
+#if defined(__KERNEL_SSE2__)
+					Psplat[0] = _mm_set_ps1(P.x);
+					Psplat[1] = _mm_set_ps1(P.y);
+					Psplat[2] = _mm_set_ps1(P.z);
+
+					tsplat = _mm_set_ps(-isect_t, -isect_t, 0.0f, 0.0f);
+					gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
+#endif
+
+					++stackPtr;
+					traversalStack[stackPtr] = ENTRYPOINT_SENTINEL;
+
+					nodeAddr = kernel_tex_fetch(__object_node, object);
+				}
+			}
+#endif
+		} while(nodeAddr != ENTRYPOINT_SENTINEL);
+
+#if FEATURE(BVH_INSTANCING)
+		if(stackPtr >= 0) {
+			kernel_assert(object != OBJECT_NONE);
+
+			if(num_hits_in_instance) {
+				float t_fac;
+
+#if FEATURE(BVH_MOTION)
+				bvh_instance_motion_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac, &ob_tfm);
+#else
+				bvh_instance_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac);
+#endif
+
+				/* scale isect->t to adjust for instancing */
+				for(int i = 0; i < num_hits_in_instance; i++)
+					(isect_array-i-1)->t *= t_fac;
+			}
+			else {
+				float ignore_t = FLT_MAX;
+
+#if FEATURE(BVH_MOTION)
+				bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, &ignore_t, &ob_tfm);
+#else
+				bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &ignore_t);
+#endif
+			}
+
+			/* the instance was left: restore the original ray limit on every build, not only SSE2 */
+			isect_t = tmax;
+			isect_array->t = isect_t;
+
+#if defined(__KERNEL_SSE2__)
+			Psplat[0] = _mm_set_ps1(P.x);
+			Psplat[1] = _mm_set_ps1(P.y);
+			Psplat[2] = _mm_set_ps1(P.z);
+			tsplat = _mm_set_ps(-isect_t, -isect_t, 0.0f, 0.0f);
+			gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
+#endif
+
+			object = OBJECT_NONE;
+			nodeAddr = traversalStack[stackPtr];
+			--stackPtr;
+		}
+#endif
+	} while(nodeAddr != ENTRYPOINT_SENTINEL);
+
+	return false;
+}
+
+#undef FEATURE
+#undef BVH_FUNCTION_NAME
+#undef BVH_FUNCTION_FEATURES
+
diff --git a/intern/cycles/kernel/kernel_bvh_subsurface.h b/intern/cycles/kernel/geom/geom_bvh_subsurface.h
index df82dda2435..a19f05dd371 100644
--- a/intern/cycles/kernel/kernel_bvh_subsurface.h
+++ b/intern/cycles/kernel/geom/geom_bvh_subsurface.h
@@ -48,12 +48,13 @@ ccl_device uint BVH_FUNCTION_NAME(KernelGlobals *kg, const Ray *ray, Intersectio
 	int nodeAddr = kernel_data.bvh.root;
 
 	/* ray parameters in registers */
-	const float tmax = ray->t;
 	float3 P = ray->P;
-	float3 idir = bvh_inverse_direction(ray->D);
-	int object = ~0;
+	float3 dir = bvh_clamp_direction(ray->D);
+	float3 idir = bvh_inverse_direction(dir);
+	int object = OBJECT_NONE;
+	float isect_t = ray->t;
 
-	const uint visibility = ~0;
+	const uint visibility = PATH_RAY_ALL_VISIBILITY;
 	uint num_hits = 0;
 
 #if FEATURE(BVH_MOTION)
@@ -72,7 +73,7 @@ ccl_device uint BVH_FUNCTION_NAME(KernelGlobals *kg, const Ray *ray, Intersectio
 	Psplat[1] = _mm_set_ps1(P.y);
 	Psplat[2] = _mm_set_ps1(P.z);
 
-	__m128 tsplat = _mm_set_ps(-tmax, -tmax, 0.0f, 0.0f);
+	__m128 tsplat = _mm_set_ps(-isect_t, -isect_t, 0.0f, 0.0f);
 
 	gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
 #endif
@@ -89,7 +90,7 @@ ccl_device uint BVH_FUNCTION_NAME(KernelGlobals *kg, const Ray *ray, Intersectio
 
 #if !defined(__KERNEL_SSE2__)
 				/* Intersect two child bounding boxes, non-SSE version */
-				float t = tmax;
+				float t = isect_t;
 
 				/* fetch node data */
 				float4 node0 = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+0);
@@ -130,8 +131,8 @@ ccl_device uint BVH_FUNCTION_NAME(KernelGlobals *kg, const Ray *ray, Intersectio
 				/* Intersect two child bounding boxes, SSE3 version adapted from Embree */
 
 				/* fetch node data */
-				__m128 *bvh_nodes = (__m128*)kg->__bvh_nodes.data + nodeAddr*BVH_NODE_SIZE;
-				float4 cnodes = ((float4*)bvh_nodes)[3];
+				const __m128 *bvh_nodes = (__m128*)kg->__bvh_nodes.data + nodeAddr*BVH_NODE_SIZE;
+				const float4 cnodes = ((float4*)bvh_nodes)[3];
 
 				/* intersect ray against child nodes */
 				const __m128 tminmaxx = _mm_mul_ps(_mm_sub_ps(shuffle_swap(bvh_nodes[0], shufflexyz[0]), Psplat[0]), idirsplat[0]);
@@ -203,19 +204,29 @@ ccl_device uint BVH_FUNCTION_NAME(KernelGlobals *kg, const Ray *ray, Intersectio
 
 					/* primitive intersection */
 					for(; primAddr < primAddr2; primAddr++) {
-#if FEATURE(BVH_HAIR)
-						uint segment = kernel_tex_fetch(__prim_segment, primAddr);
-						if(segment != ~0)
-							continue;
-#endif
-
 						/* only primitives from the same object */
-						uint tri_object = (object == ~0)? kernel_tex_fetch(__prim_object, primAddr): object;
+						uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, primAddr): object;
 
-						if(tri_object == subsurface_object) {
+						if(tri_object != subsurface_object)
+							continue;
 
-							/* intersect ray against primitive */
-							bvh_triangle_intersect_subsurface(kg, isect_array, P, idir, object, primAddr, tmax, &num_hits, lcg_state, max_hits);
+						/* intersect ray against primitive */
+						uint type = kernel_tex_fetch(__prim_type, primAddr);
+
+						switch(type & PRIMITIVE_ALL) {
+							case PRIMITIVE_TRIANGLE: {
+								triangle_intersect_subsurface(kg, isect_array, P, dir, object, primAddr, isect_t, &num_hits, lcg_state, max_hits);
+								break;
+							}
+#if FEATURE(BVH_MOTION)
+							case PRIMITIVE_MOTION_TRIANGLE: {
+								motion_triangle_intersect_subsurface(kg, isect_array, P, dir, ray->time, object, primAddr, isect_t, &num_hits, lcg_state, max_hits);
+								break;
+							}
+#endif
+							default: {
+								break;
+							}
 						}
 					}
 				}
@@ -225,11 +236,10 @@ ccl_device uint BVH_FUNCTION_NAME(KernelGlobals *kg, const Ray *ray, Intersectio
 					if(subsurface_object == kernel_tex_fetch(__prim_object, -primAddr-1)) {
 						object = subsurface_object;
 
-						float t_ignore = FLT_MAX;
 #if FEATURE(BVH_MOTION)
-						bvh_instance_motion_push(kg, object, ray, &P, &idir, &t_ignore, &ob_tfm, tmax);
+						bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect_t, &ob_tfm);
 #else
-						bvh_instance_push(kg, object, ray, &P, &idir, &t_ignore, tmax);
+						bvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect_t);
 #endif
 
 #if defined(__KERNEL_SSE2__)
@@ -237,7 +247,7 @@ ccl_device uint BVH_FUNCTION_NAME(KernelGlobals *kg, const Ray *ray, Intersectio
 						Psplat[1] = _mm_set_ps1(P.y);
 						Psplat[2] = _mm_set_ps1(P.z);
 
-						tsplat = _mm_set_ps(-tmax, -tmax, 0.0f, 0.0f);
+						tsplat = _mm_set_ps(-isect_t, -isect_t, 0.0f, 0.0f);
 
 						gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
 #endif
@@ -259,14 +269,13 @@ ccl_device uint BVH_FUNCTION_NAME(KernelGlobals *kg, const Ray *ray, Intersectio
 
 #if FEATURE(BVH_INSTANCING)
 		if(stackPtr >= 0) {
-			kernel_assert(object != ~0);
+			kernel_assert(object != OBJECT_NONE);
 
 			/* instance pop */
-			float t_ignore = FLT_MAX;
 #if FEATURE(BVH_MOTION)
-			bvh_instance_motion_pop(kg, object, ray, &P, &idir, &t_ignore, &ob_tfm, tmax);
+			bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, &isect_t, &ob_tfm);
 #else
-			bvh_instance_pop(kg, object, ray, &P, &idir, &t_ignore, tmax);
+			bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &isect_t);
 #endif
 
 #if defined(__KERNEL_SSE2__)
@@ -274,12 +283,12 @@ ccl_device uint BVH_FUNCTION_NAME(KernelGlobals *kg, const Ray *ray, Intersectio
 			Psplat[1] = _mm_set_ps1(P.y);
 			Psplat[2] = _mm_set_ps1(P.z);
 
-			tsplat = _mm_set_ps(-tmax, -tmax, 0.0f, 0.0f);
+			tsplat = _mm_set_ps(-isect_t, -isect_t, 0.0f, 0.0f);
 
 			gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
 #endif
 
-			object = ~0;
+			object = OBJECT_NONE;
 			nodeAddr = traversalStack[stackPtr];
 			--stackPtr;
 		}
diff --git a/intern/cycles/kernel/kernel_bvh_traversal.h b/intern/cycles/kernel/geom/geom_bvh_traversal.h
index bfd72b0aa16..9fd40f91471 100644
--- a/intern/cycles/kernel/kernel_bvh_traversal.h
+++ b/intern/cycles/kernel/geom/geom_bvh_traversal.h
@@ -41,7 +41,6 @@ ccl_device bool BVH_FUNCTION_NAME
 	 * - test if pushing distance on the stack helps (for non shadow rays)
 	 * - separate version for shadow rays
 	 * - likely and unlikely for if() statements
-	 * - SSE for hair
 	 * - test restrict attribute for pointers
 	 */
 	
@@ -54,18 +53,18 @@ ccl_device bool BVH_FUNCTION_NAME
 	int nodeAddr = kernel_data.bvh.root;
 
 	/* ray parameters in registers */
-	const float tmax = ray->t;
-	ccl_align(16) float3 P = ray->P;
-	ccl_align(16) float3 idir = bvh_inverse_direction(ray->D);
-	int object = ~0;
+	float3 P = ray->P;
+	float3 dir = bvh_clamp_direction(ray->D);
+	float3 idir = bvh_inverse_direction(dir);
+	int object = OBJECT_NONE;
 
 #if FEATURE(BVH_MOTION)
 	Transform ob_tfm;
 #endif
 
-	isect->t = tmax;
-	isect->object = ~0;
-	isect->prim = ~0;
+	isect->t = ray->t;
+	isect->object = OBJECT_NONE;
+	isect->prim = PRIM_NONE;
 	isect->u = 0.0f;
 	isect->v = 0.0f;
 
@@ -88,11 +87,9 @@ ccl_device bool BVH_FUNCTION_NAME
 
 	/* traversal loop */
 	do {
-		do
-		{
+		do {
 			/* traverse internal nodes */
-			while(nodeAddr >= 0 && nodeAddr != ENTRYPOINT_SENTINEL)
-			{
+			while(nodeAddr >= 0 && nodeAddr != ENTRYPOINT_SENTINEL) {
 				bool traverseChild0, traverseChild1;
 				int nodeAddrChild1;
 
@@ -250,26 +247,34 @@ ccl_device bool BVH_FUNCTION_NAME
 					/* primitive intersection */
 					while(primAddr < primAddr2) {
 						bool hit;
+						uint type = kernel_tex_fetch(__prim_type, primAddr);
 
-						/* intersect ray against primitive */
+						switch(type & PRIMITIVE_ALL) {
+							case PRIMITIVE_TRIANGLE: {
+								hit = triangle_intersect(kg, isect, P, dir, visibility, object, primAddr);
+								break;
+							}
+#if FEATURE(BVH_MOTION)
+							case PRIMITIVE_MOTION_TRIANGLE: {
+								hit = motion_triangle_intersect(kg, isect, P, dir, ray->time, visibility, object, primAddr);
+								break;
+							}
+#endif
 #if FEATURE(BVH_HAIR)
-						uint segment = kernel_tex_fetch(__prim_segment, primAddr);
-						if(segment != ~0) {
-
-							if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) 
-#if FEATURE(BVH_HAIR_MINIMUM_WIDTH)
-								hit = bvh_cardinal_curve_intersect(kg, isect, P, idir, visibility, object, primAddr, segment, lcg_state, difl, extmax);
-							else
-								hit = bvh_curve_intersect(kg, isect, P, idir, visibility, object, primAddr, segment, lcg_state, difl, extmax);
-#else
-								hit = bvh_cardinal_curve_intersect(kg, isect, P, idir, visibility, object, primAddr, segment);
-							else
-								hit = bvh_curve_intersect(kg, isect, P, idir, visibility, object, primAddr, segment);
+							case PRIMITIVE_CURVE:
+							case PRIMITIVE_MOTION_CURVE: {
+								if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) 
+									hit = bvh_cardinal_curve_intersect(kg, isect, P, dir, visibility, object, primAddr, ray->time, type, lcg_state, difl, extmax);
+								else
+									hit = bvh_curve_intersect(kg, isect, P, dir, visibility, object, primAddr, ray->time, type, lcg_state, difl, extmax);
+								break;
+							}
 #endif
+							default: {
+								hit = false;
+								break;
+							}
 						}
-						else
-#endif
-							hit = bvh_triangle_intersect(kg, isect, P, idir, visibility, object, primAddr);
 
 						/* shadow ray early termination */
 #if defined(__KERNEL_SSE2__)
@@ -293,9 +298,9 @@ ccl_device bool BVH_FUNCTION_NAME
 					object = kernel_tex_fetch(__prim_object, -primAddr-1);
 
 #if FEATURE(BVH_MOTION)
-					bvh_instance_motion_push(kg, object, ray, &P, &idir, &isect->t, &ob_tfm, tmax);
+					bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect->t, &ob_tfm);
 #else
-					bvh_instance_push(kg, object, ray, &P, &idir, &isect->t, tmax);
+					bvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect->t);
 #endif
 
 #if defined(__KERNEL_SSE2__)
@@ -319,13 +324,13 @@ ccl_device bool BVH_FUNCTION_NAME
 
 #if FEATURE(BVH_INSTANCING)
 		if(stackPtr >= 0) {
-			kernel_assert(object != ~0);
+			kernel_assert(object != OBJECT_NONE);
 
 			/* instance pop */
 #if FEATURE(BVH_MOTION)
-			bvh_instance_motion_pop(kg, object, ray, &P, &idir, &isect->t, &ob_tfm, tmax);
+			bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, &isect->t, &ob_tfm);
 #else
-			bvh_instance_pop(kg, object, ray, &P, &idir, &isect->t, tmax);
+			bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &isect->t);
 #endif
 
 #if defined(__KERNEL_SSE2__)
@@ -338,14 +343,14 @@ ccl_device bool BVH_FUNCTION_NAME
 			gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
 #endif
 
-			object = ~0;
+			object = OBJECT_NONE;
 			nodeAddr = traversalStack[stackPtr];
 			--stackPtr;
 		}
 #endif
 	} while(nodeAddr != ENTRYPOINT_SENTINEL);
 
-	return (isect->prim != ~0);
+	return (isect->prim != PRIM_NONE);
 }
 
 #undef FEATURE
diff --git a/intern/cycles/kernel/geom/geom_curve.h b/intern/cycles/kernel/geom/geom_curve.h
new file mode 100644
index 00000000000..e1d225436a6
--- /dev/null
+++ b/intern/cycles/kernel/geom/geom_curve.h
@@ -0,0 +1,1035 @@
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+/* Curve Primitive
+ *
+ * Curve primitive for rendering hair and fur. These can be rendered as flat ribbons
+ * or curves with actual thickness. The curve can also be rendered as line segments
+ * rather than curves for better performance */
+
+#ifdef __HAIR__
+
+/* Reading attributes on various curve elements */
+
+ccl_device float curve_attribute_float(KernelGlobals *kg, const ShaderData *sd, AttributeElement elem, int offset, float *dx, float *dy)
+{
+	/* Float attribute lookup for the curve point described by sd. dx and dy may
+	 * be NULL; they are only written when ray differentials are compiled in. */
+	const bool per_key = (elem == ATTR_ELEMENT_CURVE_KEY || elem == ATTR_ELEMENT_CURVE_KEY_MOTION);
+
+	if(per_key) {
+		/* one value per curve key: blend the two keys of the current segment */
+		const float4 curve_info = kernel_tex_fetch(__curves, sd->prim);
+		const int first_key = __float_as_int(curve_info.x) + PRIMITIVE_UNPACK_SEGMENT(sd->type);
+		const int second_key = first_key + 1;
+
+		const float value0 = kernel_tex_fetch(__attributes_float, offset + first_key);
+		const float value1 = kernel_tex_fetch(__attributes_float, offset + second_key);
+
+#ifdef __RAY_DIFFERENTIALS__
+		if(dx) *dx = sd->du.dx*(value1 - value0);
+		if(dy) *dy = 0.0f;
+#endif
+
+		return (1.0f - sd->u)*value0 + sd->u*value1;
+	}
+
+	/* one value per curve, or an unhandled element type: differentials are zero */
+#ifdef __RAY_DIFFERENTIALS__
+	if(dx) *dx = 0.0f;
+	if(dy) *dy = 0.0f;
+#endif
+
+	if(elem == ATTR_ELEMENT_CURVE)
+		return kernel_tex_fetch(__attributes_float, offset + sd->prim);
+
+	return 0.0f;
+}
+
+ccl_device float3 curve_attribute_float3(KernelGlobals *kg, const ShaderData *sd, AttributeElement elem, int offset, float3 *dx, float3 *dy)
+{
+	/* float3 attribute lookup for the curve point described by sd. dx and dy may
+	 * be NULL; they are only written when ray differentials are compiled in. */
+	const bool per_key = (elem == ATTR_ELEMENT_CURVE_KEY || elem == ATTR_ELEMENT_CURVE_KEY_MOTION);
+
+	if(per_key) {
+		/* one value per curve key: blend the two keys of the current segment */
+		const float4 curve_info = kernel_tex_fetch(__curves, sd->prim);
+		const int first_key = __float_as_int(curve_info.x) + PRIMITIVE_UNPACK_SEGMENT(sd->type);
+		const int second_key = first_key + 1;
+
+		float3 value0 = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + first_key));
+		float3 value1 = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + second_key));
+
+#ifdef __RAY_DIFFERENTIALS__
+		if(dx) *dx = sd->du.dx*(value1 - value0);
+		if(dy) *dy = make_float3(0.0f, 0.0f, 0.0f);
+#endif
+
+		return (1.0f - sd->u)*value0 + sd->u*value1;
+	}
+
+	/* idea: we can't derive any useful differentials for per-curve values, but
+	 * for tiled mipmap image caching it would be useful to avoid reading the
+	 * highest detail level always. maybe a derivative based on the hair density
+	 * could be computed somehow? */
+#ifdef __RAY_DIFFERENTIALS__
+	if(dx) *dx = make_float3(0.0f, 0.0f, 0.0f);
+	if(dy) *dy = make_float3(0.0f, 0.0f, 0.0f);
+#endif
+
+	if(elem == ATTR_ELEMENT_CURVE)
+		return float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + sd->prim));
+
+	/* any other element type reads as zero */
+	return make_float3(0.0f, 0.0f, 0.0f);
+}
+
+/* Curve thickness */
+
+ccl_device float curve_thickness(KernelGlobals *kg, ShaderData *sd)
+{
+	/* only curve primitives have a thickness, everything else reports zero */
+	if(!(sd->type & PRIMITIVE_ALL_CURVE))
+		return 0.0f;
+
+	float4 curve_info = kernel_tex_fetch(__curves, sd->prim);
+	int first_key = __float_as_int(curve_info.x) + PRIMITIVE_UNPACK_SEGMENT(sd->type);
+	int second_key = first_key + 1;
+
+	/* the two keys of the segment, read directly or through the motion key lookup */
+	float4 keys[2];
+
+	if(sd->type & PRIMITIVE_CURVE) {
+		keys[0] = kernel_tex_fetch(__curve_keys, first_key);
+		keys[1] = kernel_tex_fetch(__curve_keys, second_key);
+	}
+	else {
+		motion_curve_keys(kg, sd->object, sd->prim, sd->time, first_key, second_key, keys);
+	}
+
+	float radius = (keys[1].w - keys[0].w) * sd->u + keys[0].w; /* .w interpolated by sd->u */
+	return radius*2.0f;
+}
+
+/* Curve location for motion pass, linear interpolation between keys and
+ * ignoring radius because we do the same for the motion keys */
+
+ccl_device float3 curve_motion_center_location(KernelGlobals *kg, ShaderData *sd)
+{
+	/* index of the first key of the segment that sd refers to */
+	const float4 curve_info = kernel_tex_fetch(__curves, sd->prim);
+	const int first_key = __float_as_int(curve_info.x) + PRIMITIVE_UNPACK_SEGMENT(sd->type);
+
+	/* only the float3 part of the two segment keys is used */
+	float3 key_start = float4_to_float3(kernel_tex_fetch(__curve_keys, first_key));
+	float3 key_end = float4_to_float3(kernel_tex_fetch(__curve_keys, first_key + 1));
+
+	/* linear blend along the segment by sd->u */
+	return key_end * sd->u + key_start * (1.0f - sd->u);
+}
+
+/* Curve tangent normal */
+
+ccl_device float3 curve_tangent_normal(KernelGlobals *kg, ShaderData *sd)
+{
+	/* non-curve primitives get a zero vector */
+	if(!(sd->type & PRIMITIVE_ALL_CURVE))
+		return make_float3(0.0f,0.0f,0.0f);
+
+	/* take -I, subtract its part along dPdu (dot product over squared length),
+	 * negate the result and normalize it */
+	float3 tgN = normalize(-(-sd->I - sd->dPdu * (dot(sd->dPdu,-sd->I) / len_squared(sd->dPdu))));
+
+	/* need to find suitable scaled gd for corrected normal */
+#if 0
+	tgN = normalize(tgN - gd * sd->dPdu);
+#endif
+
+	return tgN;
+}
+
+/* Curve bounds utility function: lower/upper bound of p0 + p1*t + p2*t^2 + p3*t^3 for t in [0, 1] */
+
+ccl_device_inline void curvebounds(float *lower, float *upper, float *extremta, float *extrema, float *extremtb, float *extremb, float p0, float p1, float p2, float p3)
+{
+	float halfdiscroot = (p2 * p2 - 3 * p3 * p1); /* a quarter of the discriminant of the derivative p1 + 2*p2*t + 3*p3*t^2 */
+	float ta = -1.0f;
+	float tb = -1.0f;
+	/* defaults: no interior extremum (t = -1), bounds and extrema taken from the end points t = 0 and t = 1 */
+	*extremta = -1.0f;
+	*extremtb = -1.0f;
+	*upper = p0;
+	*lower = (p0 + p1) + (p2 + p3);
+	*extrema = *upper;
+	*extremb = *lower;
+
+	if(*lower >= *upper) {
+		*upper = *lower;
+		*lower = p0;
+	}
+
+	if(halfdiscroot >= 0) {
+		float inv3p3 = (1.0f/3.0f)/p3; /* NOTE(review): divides by zero when p3 == 0 - confirm callers never pass such a curve */
+		halfdiscroot = sqrtf(halfdiscroot);
+		ta = (-p2 - halfdiscroot) * inv3p3;
+		tb = (-p2 + halfdiscroot) * inv3p3;
+	}
+
+	float t2;
+	float t3;
+	/* a root of the derivative strictly inside (0, 1) is reported and may widen the bounds */
+	if(ta > 0.0f && ta < 1.0f) {
+		t2 = ta * ta;
+		t3 = t2 * ta;
+		*extremta = ta;
+		*extrema = p3 * t3 + p2 * t2 + p1 * ta + p0;
+
+		*upper = fmaxf(*extrema, *upper);
+		*lower = fminf(*extrema, *lower);
+	}
+
+	if(tb > 0.0f && tb < 1.0f) {
+		t2 = tb * tb;
+		t3 = t2 * tb;
+		*extremtb = tb;
+		*extremb = p3 * t3 + p2 * t2 + p1 * tb + p0;
+
+		*upper = fmaxf(*extremb, *upper);
+		*lower = fminf(*extremb, *lower);
+	}
+}
+
+#ifdef __KERNEL_SSE2__ /* SSE2-only helper, called by bvh_cardinal_curve_intersect with its ray-space transform */
+ccl_device_inline __m128 transform_point_T3(const __m128 t[3], const __m128 &a)
+{
+	return fma(broadcast<0>(a), t[0], fma(broadcast<1>(a), t[1], _mm_mul_ps(broadcast<2>(a), t[2]))); /* presumably a.x*t[0] + a.y*t[1] + a.z*t[2] - confirm against the fma/broadcast helpers */
+}
+#endif
+
+#ifdef __KERNEL_SSE2__
+/* Intersect a ray with one cardinal (cubic) curve segment by recursive subdivision; returns true when isect was updated. SSE2 build: pass P and dir by reference to aligned vector */
+ccl_device_inline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersection *isect,
+	const float3 &P, const float3 &dir, uint visibility, int object, int curveAddr, float time, int type, uint *lcg_state, float difl, float extmax)
+#else
+ccl_device_inline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersection *isect,
+	float3 P, float3 dir, uint visibility, int object, int curveAddr, float time,int type, uint *lcg_state, float difl, float extmax)
+#endif
+{
+	int segment = PRIMITIVE_UNPACK_SEGMENT(type);
+	float epsilon = 0.0f;
+	float r_st, r_en;
+
+	int depth = kernel_data.curve.subdivisions;
+	int flags = kernel_data.curve.curveflags;
+	int prim = kernel_tex_fetch(__prim_index, curveAddr);
+
+#ifdef __KERNEL_SSE2__
+	__m128 vdir = load_m128(dir);
+	__m128 vcurve_coef[4];
+	const float3 *curve_coef = (float3 *)vcurve_coef;
+	/* SSE path: build the ray frame from vdir, load the four keys and derive the cubic coefficients */
+	{
+		__m128 dtmp = _mm_mul_ps(vdir, vdir);
+		__m128 d_ss = _mm_sqrt_ss(_mm_add_ss(dtmp, broadcast<2>(dtmp)));
+		__m128 rd_ss = _mm_div_ss(_mm_set_ss(1.0f), d_ss);
+
+		__m128i v00vec = _mm_load_si128((__m128i *)&kg->__curves.data[prim]);
+		int2 &v00 = (int2 &)v00vec;
+
+		int k0 = v00.x + segment;
+		int k1 = k0 + 1;
+		int ka = max(k0 - 1, v00.x);
+		int kb = min(k1 + 1, v00.x + v00.y - 1);
+
+		__m128 P_curve[4];
+
+		if(type & PRIMITIVE_CURVE) {
+			P_curve[0] = _mm_load_ps(&kg->__curve_keys.data[ka].x);
+			P_curve[1] = _mm_load_ps(&kg->__curve_keys.data[k0].x);
+			P_curve[2] = _mm_load_ps(&kg->__curve_keys.data[k1].x);
+			P_curve[3] = _mm_load_ps(&kg->__curve_keys.data[kb].x);
+		}
+		else {
+			int fobject = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, curveAddr): object;
+			motion_cardinal_curve_keys(kg, fobject, prim, time, ka, k0, k1, kb, (float4*)&P_curve);
+		}
+
+		__m128 rd_sgn = set_sign_bit<0, 1, 1, 1>(broadcast<0>(rd_ss));
+		__m128 mul_zxxy = _mm_mul_ps(shuffle<2, 0, 0, 1>(vdir), rd_sgn);
+		__m128 mul_yz = _mm_mul_ps(shuffle<1, 2, 1, 2>(vdir), mul_zxxy);
+		__m128 mul_shuf = shuffle<0, 1, 2, 3>(mul_zxxy, mul_yz);
+		__m128 vdir0 = _mm_and_ps(vdir, _mm_castsi128_ps(_mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0)));
+
+		__m128 htfm0 = shuffle<0, 2, 0, 3>(mul_shuf, vdir0);
+		__m128 htfm1 = shuffle<1, 0, 1, 3>(_mm_set_ss(_mm_cvtss_f32(d_ss)), vdir0);
+		__m128 htfm2 = shuffle<1, 3, 2, 3>(mul_shuf, vdir0);
+
+		__m128 htfm[] = { htfm0, htfm1, htfm2 };
+		__m128 vP = load_m128(P);
+		__m128 p0 = transform_point_T3(htfm, _mm_sub_ps(P_curve[0], vP));
+		__m128 p1 = transform_point_T3(htfm, _mm_sub_ps(P_curve[1], vP));
+		__m128 p2 = transform_point_T3(htfm, _mm_sub_ps(P_curve[2], vP));
+		__m128 p3 = transform_point_T3(htfm, _mm_sub_ps(P_curve[3], vP));
+
+		float fc = 0.71f;
+		__m128 vfc = _mm_set1_ps(fc);
+		__m128 vfcxp3 = _mm_mul_ps(vfc, p3);
+
+		vcurve_coef[0] = p1;
+		vcurve_coef[1] = _mm_mul_ps(vfc, _mm_sub_ps(p2, p0));
+		vcurve_coef[2] = fma(_mm_set1_ps(fc * 2.0f), p0, fma(_mm_set1_ps(fc - 3.0f), p1, fms(_mm_set1_ps(3.0f - 2.0f * fc), p2, vfcxp3)));
+		vcurve_coef[3] = fms(_mm_set1_ps(fc - 2.0f), _mm_sub_ps(p2, p1), fms(vfc, p0, vfcxp3));
+
+		r_st = ((float4 &)P_curve[1]).w;
+		r_en = ((float4 &)P_curve[2]).w;
+	}
+#else
+	float3 curve_coef[4];
+
+	/* scalar counterpart of the SSE branch above */
+	/* obtain curve parameters: the four keys, their ray-space positions and the cubic coefficients */
+	{
+		/* ray transform created - this should be created at beginning of intersection loop */
+		Transform htfm;
+		float d = sqrtf(dir.x * dir.x + dir.z * dir.z);
+		htfm = make_transform(
+			dir.z / d, 0, -dir.x /d, 0,
+			-dir.x * dir.y /d, d, -dir.y * dir.z /d, 0,
+			dir.x, dir.y, dir.z, 0,
+			0, 0, 0, 1);
+
+		float4 v00 = kernel_tex_fetch(__curves, prim);
+
+		int k0 = __float_as_int(v00.x) + segment;
+		int k1 = k0 + 1;
+
+		int ka = max(k0 - 1,__float_as_int(v00.x));
+		int kb = min(k1 + 1,__float_as_int(v00.x) + __float_as_int(v00.y) - 1);
+
+		float4 P_curve[4];
+
+		if(type & PRIMITIVE_CURVE) {
+			P_curve[0] = kernel_tex_fetch(__curve_keys, ka);
+			P_curve[1] = kernel_tex_fetch(__curve_keys, k0);
+			P_curve[2] = kernel_tex_fetch(__curve_keys, k1);
+			P_curve[3] = kernel_tex_fetch(__curve_keys, kb);
+		}
+		else {
+			int fobject = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, curveAddr): object;
+			motion_cardinal_curve_keys(kg, fobject, prim, time, ka, k0, k1, kb, P_curve);
+		}
+
+		float3 p0 = transform_point(&htfm, float4_to_float3(P_curve[0]) - P);
+		float3 p1 = transform_point(&htfm, float4_to_float3(P_curve[1]) - P);
+		float3 p2 = transform_point(&htfm, float4_to_float3(P_curve[2]) - P);
+		float3 p3 = transform_point(&htfm, float4_to_float3(P_curve[3]) - P);
+
+		float fc = 0.71f;
+		curve_coef[0] = p1;
+		curve_coef[1] = -fc*p0 + fc*p2;
+		curve_coef[2] = 2.0f * fc * p0 + (fc - 3.0f) * p1 + (3.0f - 2.0f * fc) * p2 - fc * p3;
+		curve_coef[3] = -fc * p0 + (2.0f - fc) * p1 + (fc - 2.0f) * p2 + fc * p3;
+		r_st = P_curve[1].w;
+		r_en = P_curve[2].w;
+	}
+#endif
+
+	float r_curr = max(r_st, r_en);
+	/* epsilon is the near depth limit used by the bounds tests and the ribbon hit test; it stays 0 unless ribbons are on or backfacing is off */
+	if((flags & CURVE_KN_RIBBONS) || !(flags & CURVE_KN_BACKFACING))
+		epsilon = 2 * r_curr;
+
+	/* find bounds - this is slow for cubic curves; the *extrem arrays are used below as (parameter, value) pairs */
+	float upper, lower;
+
+	float zextrem[4];
+	curvebounds(&lower, &upper, &zextrem[0], &zextrem[1], &zextrem[2], &zextrem[3], curve_coef[0].z, curve_coef[1].z, curve_coef[2].z, curve_coef[3].z);
+	if(lower - r_curr > isect->t || upper + r_curr < epsilon)
+		return false;
+
+	/* minimum width extension */
+	float mw_extension = min(difl * fabsf(upper), extmax);
+	float r_ext = mw_extension + r_curr;
+
+	float xextrem[4];
+	curvebounds(&lower, &upper, &xextrem[0], &xextrem[1], &xextrem[2], &xextrem[3], curve_coef[0].x, curve_coef[1].x, curve_coef[2].x, curve_coef[3].x);
+	if(lower > r_ext || upper < -r_ext)
+		return false;
+
+	float yextrem[4];
+	curvebounds(&lower, &upper, &yextrem[0], &yextrem[1], &yextrem[2], &yextrem[3], curve_coef[0].y, curve_coef[1].y, curve_coef[2].y, curve_coef[3].y);
+	if(lower > r_ext || upper < -r_ext)
+		return false;
+
+	/* setup recurrent loop: the segment is cut into 2^depth steps of size resol; tree is the first step and level the step count of the current sub-interval */
+	int level = 1 << depth;
+	int tree = 0;
+	float resol = 1.0f / (float)level;
+	bool hit = false;
+
+	/* begin loop: runs until tree reaches 2^depth */
+	while(!(tree >> (depth))) {
+		float i_st = tree * resol;
+		float i_en = i_st + (level * resol);
+#ifdef __KERNEL_SSE2__
+		__m128 vi_st = _mm_set1_ps(i_st), vi_en = _mm_set1_ps(i_en);
+		__m128 vp_st = fma(fma(fma(vcurve_coef[3], vi_st, vcurve_coef[2]), vi_st, vcurve_coef[1]), vi_st, vcurve_coef[0]);
+		__m128 vp_en = fma(fma(fma(vcurve_coef[3], vi_en, vcurve_coef[2]), vi_en, vcurve_coef[1]), vi_en, vcurve_coef[0]);
+
+		__m128 vbmin = _mm_min_ps(vp_st, vp_en);
+		__m128 vbmax = _mm_max_ps(vp_st, vp_en);
+
+		float3 &bmin = (float3 &)vbmin, &bmax = (float3 &)vbmax;
+		float &bminx = bmin.x, &bminy = bmin.y, &bminz = bmin.z;
+		float &bmaxx = bmax.x, &bmaxy = bmax.y, &bmaxz = bmax.z;
+		float3 &p_st = (float3 &)vp_st, &p_en = (float3 &)vp_en;
+#else
+		float3 p_st = ((curve_coef[3] * i_st + curve_coef[2]) * i_st + curve_coef[1]) * i_st + curve_coef[0];
+		float3 p_en = ((curve_coef[3] * i_en + curve_coef[2]) * i_en + curve_coef[1]) * i_en + curve_coef[0];
+		
+		float bminx = min(p_st.x, p_en.x);
+		float bmaxx = max(p_st.x, p_en.x);
+		float bminy = min(p_st.y, p_en.y);
+		float bmaxy = max(p_st.y, p_en.y);
+		float bminz = min(p_st.z, p_en.z);
+		float bmaxz = max(p_st.z, p_en.z);
+#endif
+		/* grow the end-point box by any axis extremum whose parameter lies inside [i_st, i_en] */
+		if(xextrem[0] >= i_st && xextrem[0] <= i_en) {
+			bminx = min(bminx,xextrem[1]);
+			bmaxx = max(bmaxx,xextrem[1]);
+		}
+		if(xextrem[2] >= i_st && xextrem[2] <= i_en) {
+			bminx = min(bminx,xextrem[3]);
+			bmaxx = max(bmaxx,xextrem[3]);
+		}
+		if(yextrem[0] >= i_st && yextrem[0] <= i_en) {
+			bminy = min(bminy,yextrem[1]);
+			bmaxy = max(bmaxy,yextrem[1]);
+		}
+		if(yextrem[2] >= i_st && yextrem[2] <= i_en) {
+			bminy = min(bminy,yextrem[3]);
+			bmaxy = max(bmaxy,yextrem[3]);
+		}
+		if(zextrem[0] >= i_st && zextrem[0] <= i_en) {
+			bminz = min(bminz,zextrem[1]);
+			bmaxz = max(bmaxz,zextrem[1]);
+		}
+		if(zextrem[2] >= i_st && zextrem[2] <= i_en) {
+			bminz = min(bminz,zextrem[3]);
+			bmaxz = max(bmaxz,zextrem[3]);
+		}
+		/* radius is interpolated linearly along the segment; use the larger end of this sub-interval */
+		float r1 = r_st + (r_en - r_st) * i_st;
+		float r2 = r_st + (r_en - r_st) * i_en;
+		r_curr = max(r1, r2);
+
+		mw_extension = min(difl * fabsf(bmaxz), extmax);
+		float r_ext = mw_extension + r_curr;
+		float coverage = 1.0f;
+
+		if (bminz - r_curr > isect->t || bmaxz + r_curr < epsilon || bminx > r_ext|| bmaxx < -r_ext|| bminy > r_ext|| bmaxy < -r_ext) {
+			/* the bounding box does not overlap the square centered at O */
+			tree += level; /* skip the whole sub-interval */
+			level = tree & -tree; /* lowest set bit of tree becomes the next sub-interval size */
+		}
+		else if (level == 1) {
+
+			/* the maximum recursion depth is reached.
+			* check if dP0.(Q-P0)>=0 and dPn.(Pn-Q)>=0.
+			* dP* is reversed if necessary.*/
+			float t = isect->t;
+			float u = 0.0f;
+			float gd = 0.0f;
+
+			if(flags & CURVE_KN_RIBBONS) {
+				float3 tg = (p_en - p_st);
+				float w = tg.x * tg.x + tg.y * tg.y;
+				if (w == 0) {
+					tree++;
+					level = tree & -tree;
+					continue;
+				}
+				w = -(p_st.x * tg.x + p_st.y * tg.y) / w;
+				w = clamp((float)w, 0.0f, 1.0f);
+
+				/* compute u on the curve segment */
+				u = i_st * (1 - w) + i_en * w;
+				r_curr = r_st + (r_en - r_st) * u;
+				/* compare x-y distances */
+				float3 p_curr = ((curve_coef[3] * u + curve_coef[2]) * u + curve_coef[1]) * u + curve_coef[0];
+
+				float3 dp_st = (3 * curve_coef[3] * i_st + 2 * curve_coef[2]) * i_st + curve_coef[1];
+				if (dot(tg, dp_st)< 0)
+					dp_st *= -1;
+				if (dot(dp_st, -p_st) + p_curr.z * dp_st.z < 0) {
+					tree++;
+					level = tree & -tree;
+					continue;
+				}
+				float3 dp_en = (3 * curve_coef[3] * i_en + 2 * curve_coef[2]) * i_en + curve_coef[1];
+				if (dot(tg, dp_en) < 0)
+					dp_en *= -1;
+				if (dot(dp_en, p_en) - p_curr.z * dp_en.z < 0) {
+					tree++;
+					level = tree & -tree;
+					continue;
+				}
+
+				/* compute coverage */
+				float r_ext = r_curr; /* shadows the loop-level r_ext */
+				coverage = 1.0f;
+				if(difl != 0.0f) {
+					mw_extension = min(difl * fabsf(bmaxz), extmax);
+					r_ext = mw_extension + r_curr;
+					float d = sqrtf(p_curr.x * p_curr.x + p_curr.y * p_curr.y);
+					float d0 = d - r_curr;
+					float d1 = d + r_curr;
+					float inv_mw_extension = 1.0f/mw_extension;
+					if (d0 >= 0)
+						coverage = (min(d1 * inv_mw_extension, 1.0f) - min(d0 * inv_mw_extension, 1.0f)) * 0.5f;
+					else // inside
+						coverage = (min(d1 * inv_mw_extension, 1.0f) + min(-d0 * inv_mw_extension, 1.0f)) * 0.5f;
+				}
+				
+				if (p_curr.x * p_curr.x + p_curr.y * p_curr.y >= r_ext * r_ext || p_curr.z <= epsilon || isect->t < p_curr.z) {
+					tree++;
+					level = tree & -tree;
+					continue;
+				}
+
+				t = p_curr.z;
+
+				/* stochastic fade from minimum width */
+				if(difl != 0.0f && lcg_state) {
+					if(coverage != 1.0f && (lcg_step_float(lcg_state) > coverage))
+						return hit; /* NOTE(review): ends the whole traversal, not only this sub-interval - confirm intended */
+				}
+			}
+			else {
+				float l = len(p_en - p_st);
+				/* minimum width extension */
+				float or1 = r1;
+				float or2 = r2;
+
+				if(difl != 0.0f) {
+					mw_extension = min(len(p_st - P) * difl, extmax);
+					or1 = r1 < mw_extension ? mw_extension : r1;
+					mw_extension = min(len(p_en - P) * difl, extmax);
+					or2 = r2 < mw_extension ? mw_extension : r2;
+				}
+				/* --- */
+				float invl = 1.0f/l;
+				float3 tg = (p_en - p_st) * invl;
+				gd = (or2 - or1) * invl;
+				float difz = -dot(p_st,tg);
+				float cyla = 1.0f - (tg.z * tg.z * (1 + gd*gd));
+				float invcyla = 1.0f/cyla;
+				float halfb = (-p_st.z - tg.z*(difz + gd*(difz*gd + or1)));
+				float tcentre = -halfb*invcyla;
+				float zcentre = difz + (tg.z * tcentre);
+				float3 tdif = - p_st;
+				tdif.z += tcentre;
+				float tdifz = dot(tdif,tg);
+				float tb = 2*(tdif.z - tg.z*(tdifz + gd*(tdifz*gd + or1)));
+				float tc = dot(tdif,tdif) - tdifz * tdifz * (1 + gd*gd) - or1*or1 - 2*or1*tdifz*gd;
+				float td = tb*tb - 4*cyla*tc;
+				if (td < 0.0f) {
+					tree++;
+					level = tree & -tree;
+					continue;
+				}
+				
+				float rootd = sqrtf(td);
+				float correction = (-tb - rootd) * 0.5f * invcyla;
+				t = tcentre + correction;
+
+				float3 dp_st = (3 * curve_coef[3] * i_st + 2 * curve_coef[2]) * i_st + curve_coef[1];
+				if (dot(tg, dp_st)< 0)
+					dp_st *= -1;
+				float3 dp_en = (3 * curve_coef[3] * i_en + 2 * curve_coef[2]) * i_en + curve_coef[1];
+				if (dot(tg, dp_en) < 0)
+					dp_en *= -1;
+
+				if(flags & CURVE_KN_BACKFACING && (dot(dp_st, -p_st) + t * dp_st.z < 0 || dot(dp_en, p_en) - t * dp_en.z < 0 || isect->t < t || t <= 0.0f)) {
+					correction = (-tb + rootd) * 0.5f * invcyla; /* first root rejected: retry with the other root */
+					t = tcentre + correction;
+				}			
+
+				if (dot(dp_st, -p_st) + t * dp_st.z < 0 || dot(dp_en, p_en) - t * dp_en.z < 0 || isect->t < t || t <= 0.0f) {
+					tree++;
+					level = tree & -tree;
+					continue;
+				}
+
+				float w = (zcentre + (tg.z * correction)) * invl;
+				w = clamp((float)w, 0.0f, 1.0f);
+				/* compute u on the curve segment */
+				u = i_st * (1 - w) + i_en * w;
+
+				/* stochastic fade from minimum width */
+				if(difl != 0.0f && lcg_state) {
+					r_curr = r1 + (r2 - r1) * w;
+					r_ext = or1 + (or2 - or1) * w;
+					coverage = r_curr/r_ext;
+
+					if(coverage != 1.0f && (lcg_step_float(lcg_state) > coverage))
+						return hit; /* NOTE(review): ends the whole traversal, not only this sub-interval - confirm intended */
+				}
+			}
+			/* we found a new intersection */
+
+#ifdef __VISIBILITY_FLAG__
+			/* visibility flag test. we do it here under the assumption
+			 * that most triangles are culled by node flags */
+			if(kernel_tex_fetch(__prim_visibility, curveAddr) & visibility)
+#endif
+			{
+				/* record intersection; isect->t shrinks, so later sub-intervals must be closer to be accepted */
+				isect->prim = curveAddr;
+				isect->object = object;
+				isect->type = type;
+				isect->u = u;
+				isect->v = gd;
+				/*isect->transparency = 1.0f - coverage; */
+				isect->t = t;
+				hit = true;
+			}
+			
+			tree++;
+			level = tree & -tree;
+		}
+		else {
+			/* split the curve into two curves and process */
+			level = level >> 1;
+		}
+	}
+
+	return hit;
+}
+
+ccl_device_inline bool bvh_curve_intersect(KernelGlobals *kg, Intersection *isect,
+	float3 P, float3 direction, uint visibility, int object, int curveAddr, float time, int type, uint *lcg_state, float difl, float extmax)
+{
+	/* Intersect a ray with one straight curve segment (two keys, radius varying linearly); returns true when isect was updated. Define few macros to minimize code duplication for SSE */
+#ifndef __KERNEL_SSE2__
+#define len3_squared(x) len_squared(x)
+#define len3(x) len(x)
+#define dot3(x, y) dot(x, y)
+#endif
+
+	int segment = PRIMITIVE_UNPACK_SEGMENT(type);
+	/* curve Intersection check */
+	int flags = kernel_data.curve.curveflags;
+
+	int prim = kernel_tex_fetch(__prim_index, curveAddr);
+	float4 v00 = kernel_tex_fetch(__curves, prim);
+
+	int cnum = __float_as_int(v00.x);
+	int k0 = cnum + segment;
+	int k1 = k0 + 1;
+
+#ifndef __KERNEL_SSE2__
+	float4 P_curve[2];
+
+	if(type & PRIMITIVE_CURVE) {
+		P_curve[0]= kernel_tex_fetch(__curve_keys, k0);
+		P_curve[1]= kernel_tex_fetch(__curve_keys, k1);
+	}
+	else {
+		int fobject = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, curveAddr): object;
+		motion_curve_keys(kg, fobject, prim, time, k0, k1, P_curve);
+	}
+
+	float or1 = P_curve[0].w;
+	float or2 = P_curve[1].w;
+	float3 p1 = float4_to_float3(P_curve[0]);
+	float3 p2 = float4_to_float3(P_curve[1]);
+
+	/* minimum width extension: each end radius is raised to at least difl * distance from P, capped at extmax */
+	float r1 = or1;
+	float r2 = or2;
+	float3 dif = P - p1;
+	float3 dif_second = P - p2;
+	if(difl != 0.0f) {
+		float pixelsize = min(len3(dif) * difl, extmax);
+		r1 = or1 < pixelsize ? pixelsize : or1;
+		pixelsize = min(len3(dif_second) * difl, extmax);
+		r2 = or2 < pixelsize ? pixelsize : or2;
+	}
+	/* --- */
+
+	float3 p21_diff = p2 - p1;
+	float3 sphere_dif1 = (dif + dif_second) * 0.5f;
+	float3 dir = direction;
+	float sphere_b_tmp = dot3(dir, sphere_dif1);
+	float3 sphere_dif2 = sphere_dif1 - sphere_b_tmp * dir;
+#else
+	__m128 P_curve[2];
+	
+	if(type & PRIMITIVE_CURVE) {
+		P_curve[0] = _mm_load_ps(&kg->__curve_keys.data[k0].x);
+		P_curve[1] = _mm_load_ps(&kg->__curve_keys.data[k1].x);
+	}
+	else {
+		int fobject = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, curveAddr): object;
+		motion_curve_keys(kg, fobject, prim, time, k0, k1, (float4*)&P_curve);
+	}
+
+	const __m128 or12 = shuffle<3, 3, 3, 3>(P_curve[0], P_curve[1]);
+
+	__m128 r12 = or12;
+	const __m128 vP = load_m128(P);
+	const __m128 dif = _mm_sub_ps(vP, P_curve[0]);
+	const __m128 dif_second = _mm_sub_ps(vP, P_curve[1]);
+	if(difl != 0.0f) {
+		const __m128 len1_sq = len3_squared_splat(dif);
+		const __m128 len2_sq = len3_squared_splat(dif_second);
+		const __m128 len12 = _mm_sqrt_ps(shuffle<0, 0, 0, 0>(len1_sq, len2_sq));
+		const __m128 pixelsize12 = _mm_min_ps(_mm_mul_ps(len12, _mm_set1_ps(difl)), _mm_set1_ps(extmax));
+		r12 = _mm_max_ps(or12, pixelsize12);
+	}
+	float or1 = _mm_cvtss_f32(or12), or2 = _mm_cvtss_f32(broadcast<2>(or12));
+	float r1 = _mm_cvtss_f32(r12), r2 = _mm_cvtss_f32(broadcast<2>(r12));
+
+	const __m128 p21_diff = _mm_sub_ps(P_curve[1], P_curve[0]);
+	const __m128 sphere_dif1 = _mm_mul_ps(_mm_add_ps(dif, dif_second), _mm_set1_ps(0.5f));
+	const __m128 dir = load_m128(direction);
+	const __m128 sphere_b_tmp = dot3_splat(dir, sphere_dif1);
+	const __m128 sphere_dif2 = fnma(sphere_b_tmp, dir, sphere_dif1);
+#endif
+	/* early out against a sphere around the segment midpoint with radius mr + l/2 */
+	float mr = max(r1, r2);
+	float l = len3(p21_diff);
+	float invl = 1.0f / l;
+	float sp_r = mr + 0.5f * l;
+
+	float sphere_b = dot3(dir, sphere_dif2);
+	float sdisc = sphere_b * sphere_b - len3_squared(sphere_dif2) + sp_r * sp_r;
+
+	if(sdisc < 0.0f)
+		return false;
+
+	/* obtain parameters and test midpoint distance for suitable modes */
+#ifndef __KERNEL_SSE2__
+	float3 tg = p21_diff * invl;
+#else
+	const __m128 tg = _mm_mul_ps(p21_diff, _mm_set1_ps(invl));
+#endif
+	float gd = (r2 - r1) * invl;
+
+	float dirz = dot3(dir, tg);
+	float difz = dot3(dif, tg);
+
+	float a = 1.0f - (dirz*dirz*(1 + gd*gd));
+
+	float halfb = dot3(dir, dif) - dirz*(difz + gd*(difz*gd + r1));
+
+	float tcentre = -halfb/a;
+	float zcentre = difz + (dirz * tcentre);
+
+	if((tcentre > isect->t) && !(flags & CURVE_KN_ACCURATE))
+		return false;
+	if((zcentre < 0 || zcentre > l) && !(flags & CURVE_KN_ACCURATE) && !(flags & CURVE_KN_INTERSECTCORRECTION))
+		return false;
+
+	/* test minimum separation: reject when the squared ray-to-axis distance exceeds mr squared */
+#ifndef __KERNEL_SSE2__
+	float3 cprod = cross(tg, dir);
+	float cprod2sq = len3_squared(cross(tg, dif));
+#else
+	const __m128 cprod = cross(tg, dir);
+	float cprod2sq = len3_squared(cross_zxy(tg, dif));
+#endif
+	float cprodsq = len3_squared(cprod);
+	float distscaled = dot3(cprod, dif);
+
+	if(cprodsq == 0)
+		distscaled = cprod2sq;
+	else
+		distscaled = (distscaled*distscaled)/cprodsq;
+
+	if(distscaled > mr*mr)
+		return false;
+
+	/* calculate true intersection: quadratic in the offset from tcentre, discriminant td */
+#ifndef __KERNEL_SSE2__
+	float3 tdif = dif + tcentre * dir;
+#else
+	const __m128 tdif = fma(_mm_set1_ps(tcentre), dir, dif);
+#endif
+	float tdifz = dot3(tdif, tg);
+	float tdifma = tdifz*gd + r1;
+	float tb = 2*(dot3(dir, tdif) - dirz*(tdifz + gd*tdifma));
+	float tc = dot3(tdif, tdif) - tdifz*tdifz - tdifma*tdifma;
+	float td = tb*tb - 4*a*tc;
+
+	if (td < 0.0f)
+		return false;
+	/* the root is only solved up front in ACCURATE mode; otherwise t starts at tcentre */
+	float rootd = 0.0f;
+	float correction = 0.0f;
+	if(flags & CURVE_KN_ACCURATE) {
+		rootd = sqrtf(td);
+		correction = ((-tb - rootd)/(2*a));
+	}
+
+	float t = tcentre + correction;
+
+	if(t < isect->t) {
+
+		if(flags & CURVE_KN_INTERSECTCORRECTION) {
+			rootd = sqrtf(td);
+			correction = ((-tb - rootd)/(2*a));
+			t = tcentre + correction;
+		}
+
+		float z = zcentre + (dirz * correction);
+		// bool backface = false;
+
+		if(flags & CURVE_KN_BACKFACING && (t < 0.0f || z < 0 || z > l)) {
+			// backface = true;
+			correction = ((-tb + rootd)/(2*a)); /* other root; rootd is still 0 here unless ACCURATE or INTERSECTCORRECTION computed it */
+			t = tcentre + correction;
+			z = zcentre + (dirz * correction);
+		}
+
+		/* stochastic fade from minimum width: adjradius is the original radius over the widened radius at z */
+		float adjradius = or1 + z * (or2 - or1) * invl;
+		adjradius = adjradius / (r1 + z * gd);
+		if(lcg_state && adjradius != 1.0f) {
+			if(lcg_step_float(lcg_state) > adjradius)
+				return false;
+		}
+		/* --- */
+
+		if(t > 0.0f && t < isect->t && z >= 0 && z <= l) {
+
+			if (flags & CURVE_KN_ENCLOSEFILTER) {
+				float enc_ratio = 1.01f;
+				if((difz > -r1 * enc_ratio) && (dot3(dif_second, tg) < r2 * enc_ratio)) {
+					float a2 = 1.0f - (dirz*dirz*(1 + gd*gd*enc_ratio*enc_ratio));
+					float c2 = dot3(dif, dif) - difz * difz * (1 + gd*gd*enc_ratio*enc_ratio) - r1*r1*enc_ratio*enc_ratio - 2*r1*difz*gd*enc_ratio;
+					if(a2*c2 < 0.0f)
+						return false;
+				}
+			}
+
+#ifdef __VISIBILITY_FLAG__
+			/* visibility flag test. we do it here under the assumption
+			 * that most triangles are culled by node flags */
+			if(kernel_tex_fetch(__prim_visibility, curveAddr) & visibility)
+#endif
+			{
+				/* record intersection: u is the position along the segment in [0, 1], v carries the radius gradient */
+				isect->prim = curveAddr;
+				isect->object = object;
+				isect->type = type;
+				isect->u = z*invl;
+				isect->v = gd;
+				/*isect->transparency = 1.0f - adjradius;*/
+				isect->t = t;
+
+				return true;
+			}
+		}
+	}
+
+	return false;
+	/* preprocessor cleanup only: the lines below follow the final return and generate no code */
+#ifndef __KERNEL_SSE2__
+#undef len3_squared
+#undef len3
+#undef dot3
+#endif
+}
+
+ccl_device_inline float3 curvetangent(float t, float3 p0, float3 p1, float3 p2, float3 p3)
+{
+	/* weights are the t-derivatives of the blend weights in curvepoint() */
+	const float tension = 0.71f;
+	const float tt = t * t;
+	const float w0 = -3.0f * tension * tt + 4.0f * tension * t - tension;
+	const float w1 = 3.0f * (2.0f - tension) * tt + 2.0f * (tension - 3.0f) * t;
+	const float w2 = 3.0f * (tension - 2.0f) * tt + 2.0f * (3.0f - 2.0f * tension) * t + tension;
+	const float w3 = 3.0f * tension * tt - 2.0f * tension * t;
+	return w0 * p0 + w1 * p1 + w2 * p2 + w3 * p3;
+}
+
+ccl_device_inline float3 curvepoint(float t, float3 p0, float3 p1, float3 p2, float3 p3)
+{
+	/* cubic blend of four control points; the weights yield p1 at t = 0 and p2 at t = 1 */
+	const float tension = 0.71f;
+	const float tt = t * t;
+	const float ttt = tt * t;
+	const float w0 = -tension * ttt + 2.0f * tension * tt - tension * t;
+	const float w1 = (2.0f - tension) * ttt + (tension - 3.0f) * tt + 1.0f;
+	const float w2 = (tension - 2.0f) * ttt + (3.0f - 2.0f * tension) * tt + tension * t;
+	const float w3 = tension * ttt - tension * tt;
+	return w0 * p0 + w1 * p1 + w2 * p2 + w3 * p3;
+}
+
+/* Refine a curve hit: writes sd->Ng and sd->N (plus u/v and dPdu/dPdv when those features are compiled in) and returns the hit position. */
+ccl_device_inline float3 bvh_curve_refine(KernelGlobals *kg, ShaderData *sd, const Intersection *isect, const Ray *ray)
+{
+	int flag = kernel_data.curve.curveflags;
+	float t = isect->t;
+	float3 P = ray->P;
+	float3 D = ray->D;
+	/* instanced hit: move the ray into object space; presumably normalize_len() stores the rescaled distance in t - confirm */
+	if(isect->object != OBJECT_NONE) {
+#ifdef __OBJECT_MOTION__
+		Transform tfm = sd->ob_itfm;
+#else
+		Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_INVERSE_TRANSFORM);
+#endif
+
+		P = transform_point(&tfm, P);
+		D = transform_direction(&tfm, D*t);
+		D = normalize_len(D, &t);
+	}
+
+	int prim = kernel_tex_fetch(__prim_index, isect->prim);
+	float4 v00 = kernel_tex_fetch(__curves, prim);
+
+	int k0 = __float_as_int(v00.x) + PRIMITIVE_UNPACK_SEGMENT(sd->type);
+	int k1 = k0 + 1;
+
+	float3 tg;
+
+	if(flag & CURVE_KN_INTERPOLATE) {
+		int ka = max(k0 - 1,__float_as_int(v00.x));
+		int kb = min(k1 + 1,__float_as_int(v00.x) + __float_as_int(v00.y) - 1);
+
+		float4 P_curve[4];
+
+		if(sd->type & PRIMITIVE_CURVE) {
+			P_curve[0] = kernel_tex_fetch(__curve_keys, ka);
+			P_curve[1] = kernel_tex_fetch(__curve_keys, k0);
+			P_curve[2] = kernel_tex_fetch(__curve_keys, k1);
+			P_curve[3] = kernel_tex_fetch(__curve_keys, kb);
+		}
+		else {
+			motion_cardinal_curve_keys(kg, sd->object, sd->prim, sd->time, ka, k0, k1, kb, P_curve);
+		}
+
+		float3 p[4];
+		p[0] = float4_to_float3(P_curve[0]);
+		p[1] = float4_to_float3(P_curve[1]);
+		p[2] = float4_to_float3(P_curve[2]);
+		p[3] = float4_to_float3(P_curve[3]);
+
+		P = P + D*t;
+
+#ifdef __UV__
+		sd->u = isect->u;
+		sd->v = 0.0f;
+#endif
+		/* the tangent is required on every path: dPdu below reads tg even when gd == 0 */
+		tg = normalize(curvetangent(isect->u, p[0], p[1], p[2], p[3]));
+
+		if(kernel_data.curve.curveflags & CURVE_KN_RIBBONS) {
+			sd->Ng = normalize(-(D - tg * (dot(tg, D))));
+		}
+		else {
+			/* direction from inside to surface of curve */
+			float3 p_curr = curvepoint(isect->u, p[0], p[1], p[2], p[3]);	
+			sd->Ng = normalize(P - p_curr);
+
+			/* adjustment for changing radius */
+			float gd = isect->v;
+			if(gd != 0.0f) {
+				sd->Ng = sd->Ng - gd * tg;
+				sd->Ng = normalize(sd->Ng);
+			}
+		}
+
+		/* todo: sometimes the normal is still so that this is detected as
+		 * backfacing even if cull backfaces is enabled */
+
+		sd->N = sd->Ng;
+	}
+	else {
+		float4 P_curve[2];
+
+		if(sd->type & PRIMITIVE_CURVE) {
+			P_curve[0]= kernel_tex_fetch(__curve_keys, k0);
+			P_curve[1]= kernel_tex_fetch(__curve_keys, k1);
+		}
+		else {
+			motion_curve_keys(kg, sd->object, sd->prim, sd->time, k0, k1, P_curve);
+		}
+
+		float l = 1.0f;
+		tg = normalize_len(float4_to_float3(P_curve[1] - P_curve[0]), &l);
+		
+		P = P + D*t;
+
+		float3 dif = P - float4_to_float3(P_curve[0]);
+
+#ifdef __UV__
+		sd->u = dot(dif,tg)/l;
+		sd->v = 0.0f;
+#endif
+
+		if (flag & CURVE_KN_TRUETANGENTGNORMAL) {
+			sd->Ng = -(D - tg * dot(tg, D));
+			sd->Ng = normalize(sd->Ng);
+		}
+		else {
+			float gd = isect->v;
+
+			/* direction from inside to surface of curve; NOTE(review): reads sd->u, which is only assigned above when __UV__ is defined */
+			sd->Ng = (dif - tg * sd->u * l) / (P_curve[0].w + sd->u * l * gd);
+
+			/* adjustment for changing radius */
+			if (gd != 0.0f) {
+				sd->Ng = sd->Ng - gd * tg;
+				sd->Ng = normalize(sd->Ng);
+			}
+		}
+
+		sd->N = sd->Ng;
+	}
+
+#ifdef __DPDU__
+	/* dPdu/dPdv: tg has been assigned by both branches above */
+	sd->dPdu = tg;
+	sd->dPdv = cross(tg, sd->Ng);
+#endif
+
+	/*add fading parameter for minimum pixel width with transparency bsdf*/
+	/*sd->curve_transparency = isect->transparency;*/
+	/*sd->curve_radius = sd->u * gd * l + r1;*/
+
+	if(isect->object != OBJECT_NONE) {
+#ifdef __OBJECT_MOTION__
+		Transform tfm = sd->ob_tfm;
+#else
+		Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_TRANSFORM);
+#endif
+
+		P = transform_point(&tfm, P);
+	}
+
+	return P;
+}
+
+#endif
+
+CCL_NAMESPACE_END
+
diff --git a/intern/cycles/kernel/geom/geom_motion_curve.h b/intern/cycles/kernel/geom/geom_motion_curve.h
new file mode 100644
index 00000000000..1022a957b05
--- /dev/null
+++ b/intern/cycles/kernel/geom/geom_motion_curve.h
@@ -0,0 +1,148 @@
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+/* Motion Curve Primitive
+ *
+ * These are stored as regular curves, plus extra positions and radii at times
+ * other than the frame center. Computing the curve keys at a given ray time is
+ * a matter of interpolation of the two steps between which the ray time lies.
+ *
+ * The extra curve keys are stored as ATTR_STD_MOTION_VERTEX_POSITION.
+ */
+
+#ifdef __HAIR__
+
+ccl_device_inline int find_attribute_curve_motion(KernelGlobals *kg, int object, uint id, AttributeElement *elem)
+{
+	/* todo: find a better (faster) solution for this, maybe store offset per object */
+	uint slot = object*kernel_data.bvh.attributes_map_stride + ATTR_PRIM_CURVE;
+	uint4 entry = kernel_tex_fetch(__attributes_map, slot);
+	/* advance ATTR_PRIM_TYPES entries at a time until the id matches; the loop has no other exit */
+	while(entry.x != id) {
+		slot += ATTR_PRIM_TYPES;
+		entry = kernel_tex_fetch(__attributes_map, slot);
+	}
+
+	*elem = (AttributeElement)entry.y;
+	if(entry.y == ATTR_ELEMENT_NONE)
+		return (int)ATTR_STD_NOT_FOUND;
+	return (int)entry.z;
+}
+
+ccl_device_inline void motion_curve_keys_for_step(KernelGlobals *kg, int offset, int numkeys, int numsteps, int step, int k0, int k1, float4 keys[2])
+{
+	/* center step: keys come from the regular curve key array */
+	if(step == numsteps) {
+		keys[0] = kernel_tex_fetch(__curve_keys, k0);
+		keys[1] = kernel_tex_fetch(__curve_keys, k1);
+		return;
+	}
+
+	/* the motion attribute does not store the center step, so steps
+	 * past it sit one slot earlier */
+	const int attr_step = (step > numsteps)? step - 1: step;
+	const int base = offset + attr_step*numkeys;
+
+	/* each stored step spans numkeys entries */
+	keys[0] = kernel_tex_fetch(__attributes_float3, base + k0);
+	keys[1] = kernel_tex_fetch(__attributes_float3, base + k1);
+}
+
+/* fetch the two keys k0/k1 blended linearly between the two motion steps selected by time; prim is not used here */
+ccl_device_inline void motion_curve_keys(KernelGlobals *kg, int object, int prim, float time, int k0, int k1, float4 keys[2])
+{
+	/* step and key counts reported for this object */
+	int numsteps, numkeys;
+	object_motion_info(kg, object, &numsteps, NULL, &numkeys);
+
+	/* locate the motion key attribute; it is asserted to exist */
+	AttributeElement elem;
+	int offset = find_attribute_curve_motion(kg, object, ATTR_STD_MOTION_VERTEX_POSITION, &elem);
+	kernel_assert(offset != ATTR_STD_NOT_FOUND);
+
+	/* scale time by numsteps*2: the integer part (clamped to the last
+	 * interval) picks the lower step, the remainder is the blend factor */
+	int last = numsteps*2;
+	int lower = min((int)(time*last), last-1);
+	float blend = time*last - lower;
+
+	/* lower step goes straight into the output, the following step
+	 * into a scratch pair */
+	float4 upper_keys[2];
+	motion_curve_keys_for_step(kg, offset, numkeys, numsteps, lower, k0, k1, keys);
+	motion_curve_keys_for_step(kg, offset, numkeys, numsteps, lower+1, k0, k1, upper_keys);
+
+	for(int i = 0; i < 2; i++)
+		keys[i] = (1.0f - blend)*keys[i] + blend*upper_keys[i];
+}
+
+ccl_device_inline void motion_cardinal_curve_keys_for_step(KernelGlobals *kg, int offset, int numkeys, int numsteps, int step, int k0, int k1, int k2, int k3, float4 keys[4])
+{
+	/* center step: keys come from the regular curve key array */
+	if(step == numsteps) {
+		keys[0] = kernel_tex_fetch(__curve_keys, k0);
+		keys[1] = kernel_tex_fetch(__curve_keys, k1);
+		keys[2] = kernel_tex_fetch(__curve_keys, k2);
+		keys[3] = kernel_tex_fetch(__curve_keys, k3);
+		return;
+	}
+
+	/* the motion attribute does not store the center step, so steps
+	 * past it sit one slot earlier */
+	const int attr_step = (step > numsteps)? step - 1: step;
+	const int base = offset + attr_step*numkeys;
+
+	/* each stored step spans numkeys entries */
+	keys[0] = kernel_tex_fetch(__attributes_float3, base + k0);
+	keys[1] = kernel_tex_fetch(__attributes_float3, base + k1);
+	keys[2] = kernel_tex_fetch(__attributes_float3, base + k2);
+	keys[3] = kernel_tex_fetch(__attributes_float3, base + k3);
+}
+
+/* fetch the four keys k0..k3 blended linearly between the two motion steps selected by time; prim is not used here */
+ccl_device_inline void motion_cardinal_curve_keys(KernelGlobals *kg, int object, int prim, float time, int k0, int k1, int k2, int k3, float4 keys[4])
+{
+	/* step and key counts reported for this object */
+	int numsteps, numkeys;
+	object_motion_info(kg, object, &numsteps, NULL, &numkeys);
+
+	/* locate the motion key attribute; it is asserted to exist */
+	AttributeElement elem;
+	int offset = find_attribute_curve_motion(kg, object, ATTR_STD_MOTION_VERTEX_POSITION, &elem);
+	kernel_assert(offset != ATTR_STD_NOT_FOUND);
+
+	/* scale time by numsteps*2: the integer part (clamped to the last
+	 * interval) picks the lower step, the remainder is the blend factor */
+	int last = numsteps*2;
+	int lower = min((int)(time*last), last-1);
+	float blend = time*last - lower;
+
+	/* lower step goes straight into the output, the following step into a scratch array */
+	float4 upper_keys[4];
+
+	motion_cardinal_curve_keys_for_step(kg, offset, numkeys, numsteps, lower, k0, k1, k2, k3, keys);
+	motion_cardinal_curve_keys_for_step(kg, offset, numkeys, numsteps, lower+1, k0, k1, k2, k3, upper_keys);
+
+	/* linear blend of each key between the two steps */
+	for(int i = 0; i < 4; i++) {
+		keys[i] = (1.0f - blend)*keys[i] + blend*upper_keys[i];
+	}
+}
+
+#endif
+
+CCL_NAMESPACE_END
+
diff --git a/intern/cycles/kernel/geom/geom_motion_triangle.h b/intern/cycles/kernel/geom/geom_motion_triangle.h
new file mode 100644
index 00000000000..73338bb6b3b
--- /dev/null
+++ b/intern/cycles/kernel/geom/geom_motion_triangle.h
@@ -0,0 +1,392 @@
+/*
+ * Adapted from code Copyright 2009-2010 NVIDIA Corporation
+ * Modifications Copyright 2011, Blender Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Motion Triangle Primitive
+ *
+ * These are stored as regular triangles, plus extra positions and normals at
+ * times other than the frame center. Computing the triangle vertex positions
+ * or normals at a given ray time is a matter of interpolation of the two steps
+ * between which the ray time lies.
+ *
+ * The extra positions and normals are stored as ATTR_STD_MOTION_VERTEX_POSITION
+ * and ATTR_STD_MOTION_VERTEX_NORMAL mesh attributes.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+/* Time interpolation of vertex positions and normals */
+
+ccl_device_inline int find_attribute_motion(KernelGlobals *kg, int object, uint id, AttributeElement *elem)
+{
+	/* todo: find a better (faster) solution for this, maybe store offset per object */
+	uint attr_offset = object*kernel_data.bvh.attributes_map_stride;
+	uint4 attr_map = kernel_tex_fetch(__attributes_map, attr_offset);
+	
+	while(attr_map.x != id) {
+		attr_offset += ATTR_PRIM_TYPES;
+		attr_map = kernel_tex_fetch(__attributes_map, attr_offset);
+	}
+
+	*elem = (AttributeElement)attr_map.y;
+	
+	/* return result */
+	return (attr_map.y == ATTR_ELEMENT_NONE) ? (int)ATTR_STD_NOT_FOUND : (int)attr_map.z;
+}
+
+ccl_device_inline void motion_triangle_verts_for_step(KernelGlobals *kg, float3 tri_vindex, int offset, int numverts, int numsteps, int step, float3 verts[3])
+{
+	if(step == numsteps) {
+		/* center step: regular vertex location */
+		verts[0] = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.x)));
+		verts[1] = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.y)));
+		verts[2] = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.z)));
+	}
+	else {
+		/* center step is not stored in the attribute array, so steps after it shift down by one */
+		if(step > numsteps)
+			step--;
+
+		offset += step*numverts;
+
+		verts[0] = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + __float_as_int(tri_vindex.x)));
+		verts[1] = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + __float_as_int(tri_vindex.y)));
+		verts[2] = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + __float_as_int(tri_vindex.z)));
+	}
+}
+
+ccl_device_inline void motion_triangle_normals_for_step(KernelGlobals *kg, float3 tri_vindex, int offset, int numverts, int numsteps, int step, float3 normals[3])
+{
+	if(step == numsteps) {
+		/* center step: regular vertex normal */
+		normals[0] = float4_to_float3(kernel_tex_fetch(__tri_vnormal, __float_as_int(tri_vindex.x)));
+		normals[1] = float4_to_float3(kernel_tex_fetch(__tri_vnormal, __float_as_int(tri_vindex.y)));
+		normals[2] = float4_to_float3(kernel_tex_fetch(__tri_vnormal, __float_as_int(tri_vindex.z)));
+	}
+	else {
+		/* center step is not stored in the attribute array, so steps after it shift down by one */
+		if(step > numsteps)
+			step--;
+
+		offset += step*numverts;
+
+		normals[0] = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + __float_as_int(tri_vindex.x)));
+		normals[1] = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + __float_as_int(tri_vindex.y)));
+		normals[2] = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + __float_as_int(tri_vindex.z)));
+	}
+}
+
+ccl_device_inline void motion_triangle_vertices(KernelGlobals *kg, int object, int prim, float time, float3 verts[3])
+{
+	/* get motion info */
+	int numsteps, numverts;
+	object_motion_info(kg, object, &numsteps, &numverts, NULL);
+
+	/* figure out which steps we need to fetch and their interpolation factor */
+	int maxstep = numsteps*2;
+	int step = min((int)(time*maxstep), maxstep-1);
+	float t = time*maxstep - step;
+
+	/* find attribute */
+	AttributeElement elem;
+	int offset = find_attribute_motion(kg, object, ATTR_STD_MOTION_VERTEX_POSITION, &elem);
+	kernel_assert(offset != ATTR_STD_NOT_FOUND);
+
+	/* fetch vertex coordinates */
+	float3 next_verts[3];
+	float3 tri_vindex = float4_to_float3(kernel_tex_fetch(__tri_vindex, prim));
+
+	motion_triangle_verts_for_step(kg, tri_vindex, offset, numverts, numsteps, step, verts);
+	motion_triangle_verts_for_step(kg, tri_vindex, offset, numverts, numsteps, step+1, next_verts);
+
+	/* interpolate between steps */
+	verts[0] = (1.0f - t)*verts[0] + t*next_verts[0];
+	verts[1] = (1.0f - t)*verts[1] + t*next_verts[1];
+	verts[2] = (1.0f - t)*verts[2] + t*next_verts[2];
+}
+
+/* Refine triangle intersection to more precise hit point. For rays that travel
+ * far the precision is often not so good, this reintersects the primitive from
+ * a closer distance. */
+
+ccl_device_inline float3 motion_triangle_refine(KernelGlobals *kg, ShaderData *sd, const Intersection *isect, const Ray *ray, float3 verts[3])
+{
+	float3 P = ray->P;
+	float3 D = ray->D;
+	float t = isect->t;
+
+#ifdef __INTERSECTION_REFINE__
+	if(isect->object != OBJECT_NONE) {
+#ifdef __OBJECT_MOTION__
+		Transform tfm = sd->ob_itfm;
+#else
+		Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_INVERSE_TRANSFORM);
+#endif
+
+		P = transform_point(&tfm, P);
+		D = transform_direction(&tfm, D*t);
+		D = normalize_len(D, &t);
+	}
+
+	P = P + D*t;
+
+	/* compute refined intersection distance */
+	const float3 e1 = verts[0] - verts[2];
+	const float3 e2 = verts[1] - verts[2];
+	const float3 s1 = cross(D, e2);
+
+	const float invdivisor = 1.0f/dot(s1, e1);
+	const float3 d = P - verts[2];
+	const float3 s2 = cross(d, e1);
+	float rt = dot(e2, s2)*invdivisor;
+
+	/* compute refined position */
+	P = P + D*rt;
+
+	if(isect->object != OBJECT_NONE) {
+#ifdef __OBJECT_MOTION__
+		Transform tfm = sd->ob_tfm;
+#else
+		Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_TRANSFORM);
+#endif
+
+		P = transform_point(&tfm, P);
+	}
+
+	return P;
+#else
+	return P + D*t;
+#endif
+}
+
+/* Same as above, except that isect->t is assumed to be in object space for instancing */
+
+#ifdef __SUBSURFACE__
+ccl_device_inline float3 motion_triangle_refine_subsurface(KernelGlobals *kg, ShaderData *sd, const Intersection *isect, const Ray *ray, float3 verts[3])
+{
+	float3 P = ray->P;
+	float3 D = ray->D;
+	float t = isect->t;
+
+#ifdef __INTERSECTION_REFINE__
+	if(isect->object != OBJECT_NONE) {
+#ifdef __OBJECT_MOTION__
+		Transform tfm = sd->ob_itfm;
+#else
+		Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_INVERSE_TRANSFORM);
+#endif
+
+		P = transform_point(&tfm, P);
+		D = transform_direction(&tfm, D);
+		D = normalize(D);
+	}
+
+	P = P + D*t;
+
+	/* compute refined intersection distance */
+	const float3 e1 = verts[0] - verts[2];
+	const float3 e2 = verts[1] - verts[2];
+	const float3 s1 = cross(D, e2);
+
+	const float invdivisor = 1.0f/dot(s1, e1);
+	const float3 d = P - verts[2];
+	const float3 s2 = cross(d, e1);
+	float rt = dot(e2, s2)*invdivisor;
+
+	P = P + D*rt;
+
+	if(isect->object != OBJECT_NONE) {
+#ifdef __OBJECT_MOTION__
+		Transform tfm = sd->ob_tfm;
+#else
+		Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_TRANSFORM);
+#endif
+
+		P = transform_point(&tfm, P);
+	}
+
+	return P;
+#else
+	return P + D*t;
+#endif
+}
+#endif
+
+/* Setup of motion triangle specific parts of ShaderData, moved into this one
+ * function to more easily share computation of interpolated positions and
+ * normals */
+
+/* fills sd->shader, sd->P, sd->Ng, sd->N (and sd->dPdu/dPdv with __DPDU__) for a motion triangle hit */
+ccl_device_noinline void motion_triangle_shader_setup(KernelGlobals *kg, ShaderData *sd, const Intersection *isect, const Ray *ray, bool subsurface)
+{
+	/* get shader */
+	float4 Ns = kernel_tex_fetch(__tri_normal, sd->prim);
+	sd->shader = __float_as_int(Ns.w);
+
+	/* get motion info */
+	int numsteps, numverts;
+	object_motion_info(kg, sd->object, &numsteps, &numverts, NULL);
+
+	/* figure out which steps we need to fetch and their interpolation factor */
+	int maxstep = numsteps*2;
+	int step = min((int)(sd->time*maxstep), maxstep-1);
+	float t = sd->time*maxstep - step;
+
+	/* find attribute */
+	AttributeElement elem;
+	int offset = find_attribute_motion(kg, sd->object, ATTR_STD_MOTION_VERTEX_POSITION, &elem);
+	kernel_assert(offset != ATTR_STD_NOT_FOUND);
+
+	/* fetch vertex coordinates */
+	float3 verts[3], next_verts[3];
+	float3 tri_vindex = float4_to_float3(kernel_tex_fetch(__tri_vindex, sd->prim));
+
+	motion_triangle_verts_for_step(kg, tri_vindex, offset, numverts, numsteps, step, verts);
+	motion_triangle_verts_for_step(kg, tri_vindex, offset, numverts, numsteps, step+1, next_verts);
+
+	/* interpolate between steps */
+	verts[0] = (1.0f - t)*verts[0] + t*next_verts[0];
+	verts[1] = (1.0f - t)*verts[1] + t*next_verts[1];
+	verts[2] = (1.0f - t)*verts[2] + t*next_verts[2];
+
+	/* compute refined position */
+#ifdef __SUBSURFACE__
+	if(!subsurface)
+#endif
+		sd->P = motion_triangle_refine(kg, sd, isect, ray, verts);
+#ifdef __SUBSURFACE__
+	else
+		sd->P = motion_triangle_refine_subsurface(kg, sd, isect, ray, verts);
+#endif
+
+	/* compute face normal */
+	float3 Ng = normalize(cross(verts[1] - verts[0], verts[2] - verts[0]));
+
+	sd->Ng = Ng;
+	sd->N = Ng;
+
+	/* compute derivatives of P w.r.t. uv */
+#ifdef __DPDU__
+	sd->dPdu = (verts[0] - verts[2]);
+	sd->dPdv = (verts[1] - verts[2]);
+#endif
+
+	/* compute smooth normal */
+	if(sd->shader & SHADER_SMOOTH_NORMAL) {
+		/* find attribute */
+		AttributeElement elem;
+		int offset = find_attribute_motion(kg, sd->object, ATTR_STD_MOTION_VERTEX_NORMAL, &elem);
+		kernel_assert(offset != ATTR_STD_NOT_FOUND);
+
+		/* fetch vertex normals */
+		float3 normals[3], next_normals[3];
+		motion_triangle_normals_for_step(kg, tri_vindex, offset, numverts, numsteps, step, normals);
+		motion_triangle_normals_for_step(kg, tri_vindex, offset, numverts, numsteps, step+1, next_normals);
+
+		/* interpolate between steps */
+		normals[0] = (1.0f - t)*normals[0] + t*next_normals[0];
+		normals[1] = (1.0f - t)*normals[1] + t*next_normals[1];
+		normals[2] = (1.0f - t)*normals[2] + t*next_normals[2];
+
+		/* interpolate between vertices */
+		float u = sd->u;
+		float v = sd->v;
+		float w = 1.0f - u - v;
+		sd->N = (u*normals[0] + v*normals[1] + w*normals[2]); /* NOTE(review): not renormalized after interpolation - confirm this is intended */
+	}
+}
+
+/* Ray intersection. We simply compute the vertex positions at the given ray
+ * time and do a ray intersection with the resulting triangle */
+
+ccl_device_inline bool motion_triangle_intersect(KernelGlobals *kg, Intersection *isect,
+	float3 P, float3 dir, float time, uint visibility, int object, int triAddr)
+{
+	/* primitive index for vertex location lookup */
+	int prim = kernel_tex_fetch(__prim_index, triAddr);
+	int fobject = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, triAddr): object;
+
+	/* get vertex locations for intersection */
+	float3 verts[3];
+	motion_triangle_vertices(kg, fobject, prim, time, verts);
+
+	/* ray-triangle intersection, unoptimized */
+	float t, u, v;
+
+	if(ray_triangle_intersect_uv(P, dir, isect->t, verts[2], verts[0], verts[1], &u, &v, &t)) {
+		isect->prim = triAddr;
+		isect->object = object;
+		isect->type = PRIMITIVE_MOTION_TRIANGLE;
+		isect->u = u;
+		isect->v = v;
+		isect->t = t;
+		
+		return true;
+	}
+
+	return false;
+}
+
+/* Special ray intersection routines for subsurface scattering. In that case we
+ * only want to intersect with primitives in the same object, and in case of
+ * multiple hits we pick a single random primitive as the intersection point. */
+
+#ifdef __SUBSURFACE__
+ccl_device_inline void motion_triangle_intersect_subsurface(KernelGlobals *kg, Intersection *isect_array,
+	float3 P, float3 dir, float time, int object, int triAddr, float tmax, uint *num_hits, uint *lcg_state, int max_hits)
+{
+	/* primitive index for vertex location lookup */
+	int prim = kernel_tex_fetch(__prim_index, triAddr);
+	int fobject = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, triAddr): object;
+
+	/* get vertex locations for intersection */
+	float3 verts[3];
+	motion_triangle_vertices(kg, fobject, prim, time, verts);
+
+	/* ray-triangle intersection, unoptimized */
+	float t, u, v;
+
+	if(ray_triangle_intersect_uv(P, dir, tmax, verts[2], verts[0], verts[1], &u, &v, &t)) {
+		(*num_hits)++;
+
+		int hit;
+
+		if(*num_hits <= max_hits) {
+			hit = *num_hits - 1;
+		}
+		else {
+			/* reservoir sampling: if we are at the maximum number of
+			 * hits, randomly replace element or skip it */
+			hit = lcg_step_uint(lcg_state) % *num_hits;
+
+			if(hit >= max_hits)
+				return;
+		}
+
+		/* record intersection */
+		Intersection *isect = &isect_array[hit];
+		isect->prim = triAddr;
+		isect->object = object;
+		isect->type = PRIMITIVE_MOTION_TRIANGLE;
+		isect->u = u;
+		isect->v = v;
+		isect->t = t;
+	}
+}
+#endif
+
+CCL_NAMESPACE_END
+
diff --git a/intern/cycles/kernel/kernel_object.h b/intern/cycles/kernel/geom/geom_object.h
index a66277e10cd..91edd5863ac 100644
--- a/intern/cycles/kernel/kernel_object.h
+++ b/intern/cycles/kernel/geom/geom_object.h
@@ -1,6 +1,4 @@
 /*
- * Copyright 2011-2013 Blender Foundation
- *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
  * You may obtain a copy of the License at
@@ -11,11 +9,23 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
+/* Object Primitive
+ *
+ * All mesh and curve primitives are part of an object. The same mesh and curves
+ * may be instanced multiple times by different objects.
+ *
+ * If the mesh is not instanced multiple times, the object will not be explicitly
+ * stored as a primitive in the BVH, rather the bare triangles and curves are
+ * directly primitives in the BVH with world space locations applied, and the object
+ * ID is looked up afterwards. */
+
 CCL_NAMESPACE_BEGIN
 
+/* Object attributes, for now a fixed size and contents */
+
 enum ObjectTransform {
 	OBJECT_TRANSFORM = 0,
 	OBJECT_TRANSFORM_MOTION_PRE = 0,
@@ -30,6 +40,8 @@ enum ObjectVectorTransform {
 	OBJECT_VECTOR_MOTION_POST = 3
 };
 
+/* Object to world space transformation */
+
 ccl_device_inline Transform object_fetch_transform(KernelGlobals *kg, int object, enum ObjectTransform type)
 {
 	int offset = object*OBJECT_SIZE + (int)type;
@@ -43,6 +55,8 @@ ccl_device_inline Transform object_fetch_transform(KernelGlobals *kg, int object
 	return tfm;
 }
 
+/* Object to world space transformation for motion vectors */
+
 ccl_device_inline Transform object_fetch_vector_transform(KernelGlobals *kg, int object, enum ObjectVectorTransform type)
 {
 	int offset = object*OBJECT_VECTOR_SIZE + (int)type;
@@ -56,6 +70,8 @@ ccl_device_inline Transform object_fetch_vector_transform(KernelGlobals *kg, int
 	return tfm;
 }
 
+/* Motion blurred object transformations */
+
 #ifdef __OBJECT_MOTION__
 ccl_device_inline Transform object_fetch_transform_motion(KernelGlobals *kg, int object, float time)
 {
@@ -102,7 +118,9 @@ ccl_device_inline Transform object_fetch_transform_motion_test(KernelGlobals *kg
 }
 #endif
 
-ccl_device_inline void object_position_transform(KernelGlobals *kg, ShaderData *sd, float3 *P)
+/* Transform position from object to world space */
+
+ccl_device_inline void object_position_transform(KernelGlobals *kg, const ShaderData *sd, float3 *P)
 {
 #ifdef __OBJECT_MOTION__
 	*P = transform_point(&sd->ob_tfm, *P);
@@ -112,7 +130,9 @@ ccl_device_inline void object_position_transform(KernelGlobals *kg, ShaderData *
 #endif
 }
 
-ccl_device_inline void object_inverse_position_transform(KernelGlobals *kg, ShaderData *sd, float3 *P)
+/* Transform position from world to object space */
+
+ccl_device_inline void object_inverse_position_transform(KernelGlobals *kg, const ShaderData *sd, float3 *P)
 {
 #ifdef __OBJECT_MOTION__
 	*P = transform_point(&sd->ob_itfm, *P);
@@ -122,7 +142,9 @@ ccl_device_inline void object_inverse_position_transform(KernelGlobals *kg, Shad
 #endif
 }
 
-ccl_device_inline void object_inverse_normal_transform(KernelGlobals *kg, ShaderData *sd, float3 *N)
+/* Transform normal from world to object space */
+
+ccl_device_inline void object_inverse_normal_transform(KernelGlobals *kg, const ShaderData *sd, float3 *N)
 {
 #ifdef __OBJECT_MOTION__
 	*N = normalize(transform_direction_transposed(&sd->ob_tfm, *N));
@@ -132,7 +154,9 @@ ccl_device_inline void object_inverse_normal_transform(KernelGlobals *kg, Shader
 #endif
 }
 
-ccl_device_inline void object_normal_transform(KernelGlobals *kg, ShaderData *sd, float3 *N)
+/* Transform normal from object to world space */
+
+ccl_device_inline void object_normal_transform(KernelGlobals *kg, const ShaderData *sd, float3 *N)
 {
 #ifdef __OBJECT_MOTION__
 	*N = normalize(transform_direction_transposed(&sd->ob_itfm, *N));
@@ -142,7 +166,9 @@ ccl_device_inline void object_normal_transform(KernelGlobals *kg, ShaderData *sd
 #endif
 }
 
-ccl_device_inline void object_dir_transform(KernelGlobals *kg, ShaderData *sd, float3 *D)
+/* Transform direction vector from object to world space */
+
+ccl_device_inline void object_dir_transform(KernelGlobals *kg, const ShaderData *sd, float3 *D)
 {
 #ifdef __OBJECT_MOTION__
 	*D = transform_direction(&sd->ob_tfm, *D);
@@ -152,7 +178,9 @@ ccl_device_inline void object_dir_transform(KernelGlobals *kg, ShaderData *sd, f
 #endif
 }
 
-ccl_device_inline void object_inverse_dir_transform(KernelGlobals *kg, ShaderData *sd, float3 *D)
+/* Transform direction vector from world to object space */
+
+ccl_device_inline void object_inverse_dir_transform(KernelGlobals *kg, const ShaderData *sd, float3 *D)
 {
 #ifdef __OBJECT_MOTION__
 	*D = transform_direction(&sd->ob_itfm, *D);
@@ -162,9 +190,11 @@ ccl_device_inline void object_inverse_dir_transform(KernelGlobals *kg, ShaderDat
 #endif
 }
 
-ccl_device_inline float3 object_location(KernelGlobals *kg, ShaderData *sd)
+/* Object center position */
+
+ccl_device_inline float3 object_location(KernelGlobals *kg, const ShaderData *sd)
 {
-	if(sd->object == ~0)
+	if(sd->object == OBJECT_NONE)
 		return make_float3(0.0f, 0.0f, 0.0f);
 
 #ifdef __OBJECT_MOTION__
@@ -175,6 +205,8 @@ ccl_device_inline float3 object_location(KernelGlobals *kg, ShaderData *sd)
 #endif
 }
 
+/* Total surface area of object */
+
 ccl_device_inline float object_surface_area(KernelGlobals *kg, int object)
 {
 	int offset = object*OBJECT_SIZE + OBJECT_PROPERTIES;
@@ -182,9 +214,11 @@ ccl_device_inline float object_surface_area(KernelGlobals *kg, int object)
 	return f.x;
 }
 
+/* Pass ID number of object */
+
 ccl_device_inline float object_pass_id(KernelGlobals *kg, int object)
 {
-	if(object == ~0)
+	if(object == OBJECT_NONE)
 		return 0.0f;
 
 	int offset = object*OBJECT_SIZE + OBJECT_PROPERTIES;
@@ -192,9 +226,11 @@ ccl_device_inline float object_pass_id(KernelGlobals *kg, int object)
 	return f.y;
 }
 
+/* Per object random number for shader variation */
+
 ccl_device_inline float object_random_number(KernelGlobals *kg, int object)
 {
-	if(object == ~0)
+	if(object == OBJECT_NONE)
 		return 0.0f;
 
 	int offset = object*OBJECT_SIZE + OBJECT_PROPERTIES;
@@ -202,9 +238,11 @@ ccl_device_inline float object_random_number(KernelGlobals *kg, int object)
 	return f.z;
 }
 
-ccl_device_inline uint object_particle_id(KernelGlobals *kg, int object)
+/* Particle ID from which this object was generated */
+
+ccl_device_inline int object_particle_id(KernelGlobals *kg, int object)
 {
-	if(object == ~0)
+	if(object == OBJECT_NONE)
 		return 0.0f;
 
 	int offset = object*OBJECT_SIZE + OBJECT_PROPERTIES;
@@ -212,9 +250,11 @@ ccl_device_inline uint object_particle_id(KernelGlobals *kg, int object)
 	return __float_as_uint(f.w);
 }
 
+/* Generated texture coordinate on surface from where object was instanced */
+
 ccl_device_inline float3 object_dupli_generated(KernelGlobals *kg, int object)
 {
-	if(object == ~0)
+	if(object == OBJECT_NONE)
 		return make_float3(0.0f, 0.0f, 0.0f);
 
 	int offset = object*OBJECT_SIZE + OBJECT_DUPLI;
@@ -222,9 +262,11 @@ ccl_device_inline float3 object_dupli_generated(KernelGlobals *kg, int object)
 	return make_float3(f.x, f.y, f.z);
 }
 
+/* UV texture coordinate on surface from where object was instanced */
+
 ccl_device_inline float3 object_dupli_uv(KernelGlobals *kg, int object)
 {
-	if(object == ~0)
+	if(object == OBJECT_NONE)
 		return make_float3(0.0f, 0.0f, 0.0f);
 
 	int offset = object*OBJECT_SIZE + OBJECT_DUPLI;
@@ -232,12 +274,33 @@ ccl_device_inline float3 object_dupli_uv(KernelGlobals *kg, int object)
 	return make_float3(f.x, f.y, 0.0f);
 }
 
+/* Information about mesh for motion blurred triangles and curves */
+
+ccl_device_inline void object_motion_info(KernelGlobals *kg, int object, int *numsteps, int *numverts, int *numkeys)
+{
+	int offset = object*OBJECT_SIZE + OBJECT_DUPLI;
+
+	if(numkeys) {
+		float4 f = kernel_tex_fetch(__objects, offset);
+		*numkeys = __float_as_int(f.w);
+	}
+
+	float4 f = kernel_tex_fetch(__objects, offset + 1);
+	if(numsteps)
+		*numsteps = __float_as_int(f.z);
+	if(numverts)
+		*numverts = __float_as_int(f.w);
+}
+
+/* Pass ID for shader */
 
-ccl_device int shader_pass_id(KernelGlobals *kg, ShaderData *sd)
+ccl_device int shader_pass_id(KernelGlobals *kg, const ShaderData *sd)
 {
 	return kernel_tex_fetch(__shader_flag, (sd->shader & SHADER_MASK)*2 + 1);
 }
 
+/* Particle data from which object was instanced */
+
 ccl_device_inline float particle_index(KernelGlobals *kg, int particle)
 {
 	int offset = particle*PARTICLE_SIZE;
@@ -296,5 +359,107 @@ ccl_device float3 particle_angular_velocity(KernelGlobals *kg, int particle)
 	return make_float3(f3.z, f3.w, f4.x);
 }
 
+/* Object intersection in BVH */
+
+ccl_device_inline float3 bvh_clamp_direction(float3 dir)
+{
+	/* clamp absolute values by exp2f(-80.0f) to avoid division by zero when calculating inverse direction */
+	float ooeps = 8.271806E-25f;
+	return make_float3((fabsf(dir.x) > ooeps)? dir.x: copysignf(ooeps, dir.x),
+	                   (fabsf(dir.y) > ooeps)? dir.y: copysignf(ooeps, dir.y),
+	                   (fabsf(dir.z) > ooeps)? dir.z: copysignf(ooeps, dir.z));
+}
+
+ccl_device_inline float3 bvh_inverse_direction(float3 dir)
+{
+	return 1.0f / dir;
+}
+
+/* Transform ray into object space to enter static object in BVH */
+
+ccl_device_inline void bvh_instance_push(KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *dir, float3 *idir, float *t)
+{
+	Transform tfm = object_fetch_transform(kg, object, OBJECT_INVERSE_TRANSFORM);
+
+	*P = transform_point(&tfm, ray->P);
+
+	float len;
+	*dir = bvh_clamp_direction(normalize_len(transform_direction(&tfm, ray->D), &len));
+	*idir = bvh_inverse_direction(*dir);
+
+	if(*t != FLT_MAX)
+		*t *= len;
+}
+
+/* Transform ray to exit static object in BVH */
+
+ccl_device_inline void bvh_instance_pop(KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *dir, float3 *idir, float *t)
+{
+	if(*t != FLT_MAX) {
+		Transform tfm = object_fetch_transform(kg, object, OBJECT_TRANSFORM);
+		*t *= len(transform_direction(&tfm, 1.0f/(*idir)));
+	}
+
+	*P = ray->P;
+	*dir = bvh_clamp_direction(ray->D);
+	*idir = bvh_inverse_direction(*dir);
+}
+
+/* Same as above, but returns scale factor to apply to multiple intersection distances */
+
+ccl_device_inline void bvh_instance_pop_factor(KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *dir, float3 *idir, float *t_fac)
+{
+	Transform tfm = object_fetch_transform(kg, object, OBJECT_TRANSFORM);
+	*t_fac = len(transform_direction(&tfm, 1.0f/(*idir)));
+
+	*P = ray->P;
+	*dir = bvh_clamp_direction(ray->D);
+	*idir = bvh_inverse_direction(*dir);
+}
+
+
+#ifdef __OBJECT_MOTION__
+/* Transform ray into object space to enter motion blurred object in BVH */
+
+ccl_device_inline void bvh_instance_motion_push(KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *dir, float3 *idir, float *t, Transform *tfm)
+{
+	Transform itfm;
+	*tfm = object_fetch_transform_motion_test(kg, object, ray->time, &itfm);
+
+	*P = transform_point(&itfm, ray->P);
+
+	float len;
+	*dir = bvh_clamp_direction(normalize_len(transform_direction(&itfm, ray->D), &len));
+	*idir = bvh_inverse_direction(*dir);
+
+	if(*t != FLT_MAX)
+		*t *= len;
+}
+
+/* Transform ray to exit motion blurred object in BVH */
+
+ccl_device_inline void bvh_instance_motion_pop(KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *dir, float3 *idir, float *t, Transform *tfm)
+{
+	if(*t != FLT_MAX)
+		*t *= len(transform_direction(tfm, 1.0f/(*idir)));
+
+	*P = ray->P;
+	*dir = bvh_clamp_direction(ray->D);
+	*idir = bvh_inverse_direction(*dir);
+}
+
+/* Same as above, but returns scale factor to apply to multiple intersection distances */
+
+ccl_device_inline void bvh_instance_motion_pop_factor(KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *dir, float3 *idir, float *t_fac, Transform *tfm)
+{
+	*t_fac = len(transform_direction(tfm, 1.0f/(*idir)));
+
+	*P = ray->P;
+	*dir = bvh_clamp_direction(ray->D);
+	*idir = bvh_inverse_direction(*dir);
+}
+
+#endif
+
 CCL_NAMESPACE_END
 
diff --git a/intern/cycles/kernel/kernel_primitive.h b/intern/cycles/kernel/geom/geom_primitive.h
index fa450c97cbf..533973621d7 100644
--- a/intern/cycles/kernel/kernel_primitive.h
+++ b/intern/cycles/kernel/geom/geom_primitive.h
@@ -14,82 +14,60 @@
  * limitations under the License
  */
 
-#ifndef __KERNEL_ATTRIBUTE_CL__
-#define __KERNEL_ATTRIBUTE_CL__
+/* Primitive Utilities
+ *
+ * Generic functions to look up mesh, curve and volume primitive attributes for
+ * shading and render passes. */
 
 CCL_NAMESPACE_BEGIN
 
-/* attribute lookup */
-
-ccl_device_inline int find_attribute(KernelGlobals *kg, ShaderData *sd, uint id, AttributeElement *elem)
-{
-	if(sd->object == ~0)
-		return (int)ATTR_STD_NOT_FOUND;
-
-#ifdef __OSL__
-	if (kg->osl) {
-		return OSLShader::find_attribute(kg, sd, id, elem);
-	}
-	else
-#endif
-	{
-		/* for SVM, find attribute by unique id */
-		uint attr_offset = sd->object*kernel_data.bvh.attributes_map_stride;
-#ifdef __HAIR__
-		attr_offset = (sd->segment == ~0)? attr_offset: attr_offset + ATTR_PRIM_CURVE;
-#endif
-		uint4 attr_map = kernel_tex_fetch(__attributes_map, attr_offset);
-		
-		while(attr_map.x != id) {
-			attr_offset += ATTR_PRIM_TYPES;
-			attr_map = kernel_tex_fetch(__attributes_map, attr_offset);
-		}
-
-		*elem = (AttributeElement)attr_map.y;
-		
-		if(sd->prim == ~0 && (AttributeElement)attr_map.y != ATTR_ELEMENT_MESH)
-			return ATTR_STD_NOT_FOUND;
-
-		/* return result */
-		return (attr_map.y == ATTR_ELEMENT_NONE) ? (int)ATTR_STD_NOT_FOUND : (int)attr_map.z;
-	}
-}
+/* Generic primitive attribute reading functions */
 
 ccl_device float primitive_attribute_float(KernelGlobals *kg, const ShaderData *sd, AttributeElement elem, int offset, float *dx, float *dy)
 {
-#ifdef __HAIR__
-	if(sd->segment == ~0)
-#endif
+	if(sd->type & PRIMITIVE_ALL_TRIANGLE) {
 		return triangle_attribute_float(kg, sd, elem, offset, dx, dy);
+	}
 #ifdef __HAIR__
-	else
+	else if(sd->type & PRIMITIVE_ALL_CURVE) {
 		return curve_attribute_float(kg, sd, elem, offset, dx, dy);
+	}
+#endif
+#ifdef __VOLUME__
+	else if(sd->object != OBJECT_NONE && elem == ATTR_ELEMENT_VOXEL) {
+		return volume_attribute_float(kg, sd, elem, offset, dx, dy);
+	}
 #endif
+	else {
+		if(dx) *dx = 0.0f;
+		if(dy) *dy = 0.0f;
+		return 0.0f;
+	}
 }
 
 ccl_device float3 primitive_attribute_float3(KernelGlobals *kg, const ShaderData *sd, AttributeElement elem, int offset, float3 *dx, float3 *dy)
 {
-#ifdef __HAIR__
-	if(sd->segment == ~0)
-#endif
+	if(sd->type & PRIMITIVE_ALL_TRIANGLE) {
 		return triangle_attribute_float3(kg, sd, elem, offset, dx, dy);
+	}
 #ifdef __HAIR__
-	else
+	else if(sd->type & PRIMITIVE_ALL_CURVE) {
 		return curve_attribute_float3(kg, sd, elem, offset, dx, dy);
+	}
+#endif
+#ifdef __VOLUME__
+	else if(sd->object != OBJECT_NONE && elem == ATTR_ELEMENT_VOXEL) {
+		return volume_attribute_float3(kg, sd, elem, offset, dx, dy);
+	}
 #endif
+	else {
+		if(dx) *dx = make_float3(0.0f, 0.0f, 0.0f);
+		if(dy) *dy = make_float3(0.0f, 0.0f, 0.0f);
+		return make_float3(0.0f, 0.0f, 0.0f);
+	}
 }
 
-ccl_device Transform primitive_attribute_matrix(KernelGlobals *kg, const ShaderData *sd, int offset)
-{
-	Transform tfm;
-
-	tfm.x = kernel_tex_fetch(__attributes_float3, offset + 0);
-	tfm.y = kernel_tex_fetch(__attributes_float3, offset + 1);
-	tfm.z = kernel_tex_fetch(__attributes_float3, offset + 2);
-	tfm.w = kernel_tex_fetch(__attributes_float3, offset + 3);
-
-	return tfm;
-}
+/* Default UV coordinate */
 
 ccl_device float3 primitive_uv(KernelGlobals *kg, ShaderData *sd)
 {
@@ -104,6 +82,8 @@ ccl_device float3 primitive_uv(KernelGlobals *kg, ShaderData *sd)
 	return uv;
 }
 
+/* Ptex coordinates */
+
 ccl_device bool primitive_ptex(KernelGlobals *kg, ShaderData *sd, float2 *uv, int *face_id)
 {
 	/* storing ptex data as attributes is not memory efficient but simple for tests */
@@ -123,10 +103,12 @@ ccl_device bool primitive_ptex(KernelGlobals *kg, ShaderData *sd, float2 *uv, in
 	return true;
 }
 
+/* Surface tangent */
+
 ccl_device float3 primitive_tangent(KernelGlobals *kg, ShaderData *sd)
 {
 #ifdef __HAIR__
-	if(sd->segment != ~0)
+	if(sd->type & PRIMITIVE_ALL_CURVE)
 #ifdef __DPDU__
 		return normalize(sd->dPdu);
 #else
@@ -154,21 +136,39 @@ ccl_device float3 primitive_tangent(KernelGlobals *kg, ShaderData *sd)
 	}
 }
 
-/* motion */
+/* Motion vector for motion pass */
 
 ccl_device float4 primitive_motion_vector(KernelGlobals *kg, ShaderData *sd)
 {
-	float3 motion_pre = sd->P, motion_post = sd->P;
+	/* center position */
+	float3 center;
+
+	if(sd->type & PRIMITIVE_ALL_CURVE) {
+		center = curve_motion_center_location(kg, sd);
+
+		if(!(sd->flag & SD_TRANSFORM_APPLIED))
+			object_position_transform(kg, sd, &center);
+	}
+	else
+		center = sd->P;
+
+	float3 motion_pre = center, motion_post = center;
 
 	/* deformation motion */
-	AttributeElement elem_pre, elem_post;
-	int offset_pre = find_attribute(kg, sd, ATTR_STD_MOTION_PRE, &elem_pre);
-	int offset_post = find_attribute(kg, sd, ATTR_STD_MOTION_POST, &elem_post);
+	AttributeElement elem;
+	int offset = find_attribute(kg, sd, ATTR_STD_MOTION_VERTEX_POSITION, &elem);
+
+	if(offset != ATTR_STD_NOT_FOUND) {
+		/* get motion info */
+		int numverts, numkeys;
+		object_motion_info(kg, sd->object, NULL, &numverts, &numkeys);
 
-	if(offset_pre != ATTR_STD_NOT_FOUND)
-		motion_pre = primitive_attribute_float3(kg, sd, elem_pre, offset_pre, NULL, NULL);
-	if(offset_post != ATTR_STD_NOT_FOUND)
-		motion_post = primitive_attribute_float3(kg, sd, elem_post, offset_post, NULL, NULL);
+		/* lookup attributes */
+		int offset_next = (sd->type & PRIMITIVE_ALL_TRIANGLE)? offset + numverts: offset + numkeys;
+
+		motion_pre = primitive_attribute_float3(kg, sd, elem, offset, NULL, NULL);
+		motion_post = primitive_attribute_float3(kg, sd, elem, offset_next, NULL, NULL);
+	}
 
 	/* object motion. note that depending on the mesh having motion vectors, this
 	 * transformation was set match the world/object space of motion_pre/post */
@@ -180,13 +180,13 @@ ccl_device float4 primitive_motion_vector(KernelGlobals *kg, ShaderData *sd)
 	tfm = object_fetch_vector_transform(kg, sd->object, OBJECT_VECTOR_MOTION_POST);
 	motion_post = transform_point(&tfm, motion_post);
 
-	float3 P;
+	float3 motion_center;
 
 	/* camera motion, for perspective/orthographic motion.pre/post will be a
 	 * world-to-raster matrix, for panorama it's world-to-camera */
 	if (kernel_data.cam.type != CAMERA_PANORAMA) {
 		tfm = kernel_data.cam.worldtoraster;
-		P = transform_perspective(&tfm, sd->P);
+		motion_center = transform_perspective(&tfm, center);
 
 		tfm = kernel_data.cam.motion.pre;
 		motion_pre = transform_perspective(&tfm, motion_pre);
@@ -196,10 +196,10 @@ ccl_device float4 primitive_motion_vector(KernelGlobals *kg, ShaderData *sd)
 	}
 	else {
 		tfm = kernel_data.cam.worldtocamera;
-		P = normalize(transform_point(&tfm, sd->P));
-		P = float2_to_float3(direction_to_panorama(kg, P));
-		P.x *= kernel_data.cam.width;
-		P.y *= kernel_data.cam.height;
+		motion_center = normalize(transform_point(&tfm, center));
+		motion_center = float2_to_float3(direction_to_panorama(kg, motion_center));
+		motion_center.x *= kernel_data.cam.width;
+		motion_center.y *= kernel_data.cam.height;
 
 		tfm = kernel_data.cam.motion.pre;
 		motion_pre = normalize(transform_point(&tfm, motion_pre));
@@ -214,12 +214,11 @@ ccl_device float4 primitive_motion_vector(KernelGlobals *kg, ShaderData *sd)
 		motion_post.y *= kernel_data.cam.height;
 	}
 
-	motion_pre = motion_pre - P;
-	motion_post = P - motion_post;
+	motion_pre = motion_pre - motion_center;
+	motion_post = motion_center - motion_post;
 
 	return make_float4(motion_pre.x, motion_pre.y, motion_post.x, motion_post.y);
 }
 
 CCL_NAMESPACE_END
 
-#endif /* __KERNEL_ATTRIBUTE_CL__ */
diff --git a/intern/cycles/kernel/geom/geom_triangle.h b/intern/cycles/kernel/geom/geom_triangle.h
new file mode 100644
index 00000000000..355e36fef0c
--- /dev/null
+++ b/intern/cycles/kernel/geom/geom_triangle.h
@@ -0,0 +1,379 @@
+/*
+ * Adapted from code Copyright 2009-2010 NVIDIA Corporation
+ * Modifications Copyright 2011, Blender Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Triangle Primitive
+ *
+ * Basic triangle with 3 vertices is used to represent mesh surfaces. For BVH
+ * ray intersection we use a precomputed triangle storage to accelerate
+ * intersection at the cost of more memory usage */
+
+CCL_NAMESPACE_BEGIN
+
+/* Refine triangle intersection to more precise hit point. For rays that travel
+ * far the precision is often not so good, this reintersects the primitive from
+ * a closer distance. */
+
+ccl_device_inline float3 triangle_refine(KernelGlobals *kg, ShaderData *sd, const Intersection *isect, const Ray *ray)
+{
+	float3 P = ray->P;
+	float3 D = ray->D;
+	float t = isect->t;
+
+#ifdef __INTERSECTION_REFINE__
+	if(isect->object != OBJECT_NONE) {
+#ifdef __OBJECT_MOTION__
+		Transform tfm = sd->ob_itfm;
+#else
+		Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_INVERSE_TRANSFORM);
+#endif
+
+		P = transform_point(&tfm, P);
+		D = transform_direction(&tfm, D*t);
+		D = normalize_len(D, &t);
+	}
+
+	P = P + D*t;
+
+	float4 v00 = kernel_tex_fetch(__tri_woop, isect->prim*TRI_NODE_SIZE+0);
+	float Oz = v00.w - P.x*v00.x - P.y*v00.y - P.z*v00.z;
+	float invDz = 1.0f/(D.x*v00.x + D.y*v00.y + D.z*v00.z);
+	float rt = Oz * invDz;
+
+	P = P + D*rt;
+
+	if(isect->object != OBJECT_NONE) {
+#ifdef __OBJECT_MOTION__
+		Transform tfm = sd->ob_tfm;
+#else
+		Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_TRANSFORM);
+#endif
+
+		P = transform_point(&tfm, P);
+	}
+
+	return P;
+#else
+	return P + D*t;
+#endif
+}
+
+/* same as above, except that isect->t is assumed to be in object space for instancing */
+ccl_device_inline float3 triangle_refine_subsurface(KernelGlobals *kg, ShaderData *sd, const Intersection *isect, const Ray *ray)
+{
+	float3 P = ray->P;
+	float3 D = ray->D;
+	float t = isect->t;
+
+#ifdef __INTERSECTION_REFINE__
+	if(isect->object != OBJECT_NONE) {
+#ifdef __OBJECT_MOTION__
+		Transform tfm = sd->ob_itfm;
+#else
+		Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_INVERSE_TRANSFORM);
+#endif
+
+		P = transform_point(&tfm, P);
+		D = transform_direction(&tfm, D);
+		D = normalize(D);
+	}
+
+	P = P + D*t;
+
+	float4 v00 = kernel_tex_fetch(__tri_woop, isect->prim*TRI_NODE_SIZE+0);
+	float Oz = v00.w - P.x*v00.x - P.y*v00.y - P.z*v00.z;
+	float invDz = 1.0f/(D.x*v00.x + D.y*v00.y + D.z*v00.z);
+	float rt = Oz * invDz;
+
+	P = P + D*rt;
+
+	if(isect->object != OBJECT_NONE) {
+#ifdef __OBJECT_MOTION__
+		Transform tfm = sd->ob_tfm;
+#else
+		Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_TRANSFORM);
+#endif
+
+		P = transform_point(&tfm, P);
+	}
+
+	return P;
+#else
+	return P + D*t;
+#endif
+}
+
+/* point and normal on triangle  */
+ccl_device_inline void triangle_point_normal(KernelGlobals *kg, int prim, float u, float v, float3 *P, float3 *Ng, int *shader)
+{
+	/* load triangle vertices */
+	float3 tri_vindex = float4_to_float3(kernel_tex_fetch(__tri_vindex, prim));
+
+	float3 v0 = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.x)));
+	float3 v1 = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.y)));
+	float3 v2 = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.z)));
+
+	/* compute point */
+	float t = 1.0f - u - v;
+	*P = (u*v0 + v*v1 + t*v2);
+
+	float4 Nm = kernel_tex_fetch(__tri_normal, prim);
+	*Ng = make_float3(Nm.x, Nm.y, Nm.z);
+	*shader = __float_as_int(Nm.w);
+}
+
+/* Triangle vertex locations */
+
+ccl_device_inline void triangle_vertices(KernelGlobals *kg, int prim, float3 P[3])
+{
+	float3 tri_vindex = float4_to_float3(kernel_tex_fetch(__tri_vindex, prim));
+
+	P[0] = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.x)));
+	P[1] = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.y)));
+	P[2] = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.z)));
+}
+
+/* Interpolate smooth vertex normal from vertices */
+
+ccl_device_inline float3 triangle_smooth_normal(KernelGlobals *kg, int prim, float u, float v)
+{
+	/* load triangle vertices */
+	float3 tri_vindex = float4_to_float3(kernel_tex_fetch(__tri_vindex, prim));
+
+	float3 n0 = float4_to_float3(kernel_tex_fetch(__tri_vnormal, __float_as_int(tri_vindex.x)));
+	float3 n1 = float4_to_float3(kernel_tex_fetch(__tri_vnormal, __float_as_int(tri_vindex.y)));
+	float3 n2 = float4_to_float3(kernel_tex_fetch(__tri_vnormal, __float_as_int(tri_vindex.z)));
+
+	return normalize((1.0f - u - v)*n2 + u*n0 + v*n1);
+}
+
+/* Ray differentials on triangle */
+
+ccl_device_inline void triangle_dPdudv(KernelGlobals *kg, int prim, float3 *dPdu, float3 *dPdv)
+{
+	/* fetch triangle vertex coordinates */
+	float3 tri_vindex = float4_to_float3(kernel_tex_fetch(__tri_vindex, prim));
+
+	float3 p0 = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.x)));
+	float3 p1 = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.y)));
+	float3 p2 = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.z)));
+
+	/* compute derivatives of P w.r.t. uv */
+	*dPdu = (p0 - p2);
+	*dPdv = (p1 - p2);
+}
+
+/* Reading attributes on various triangle elements */
+
+ccl_device float triangle_attribute_float(KernelGlobals *kg, const ShaderData *sd, AttributeElement elem, int offset, float *dx, float *dy)
+{
+	if(elem == ATTR_ELEMENT_FACE) {
+		if(dx) *dx = 0.0f;
+		if(dy) *dy = 0.0f;
+
+		return kernel_tex_fetch(__attributes_float, offset + sd->prim);
+	}
+	else if(elem == ATTR_ELEMENT_VERTEX || elem == ATTR_ELEMENT_VERTEX_MOTION) {
+		float3 tri_vindex = float4_to_float3(kernel_tex_fetch(__tri_vindex, sd->prim));
+
+		float f0 = kernel_tex_fetch(__attributes_float, offset + __float_as_int(tri_vindex.x));
+		float f1 = kernel_tex_fetch(__attributes_float, offset + __float_as_int(tri_vindex.y));
+		float f2 = kernel_tex_fetch(__attributes_float, offset + __float_as_int(tri_vindex.z));
+
+#ifdef __RAY_DIFFERENTIALS__
+		if(dx) *dx = sd->du.dx*f0 + sd->dv.dx*f1 - (sd->du.dx + sd->dv.dx)*f2;
+		if(dy) *dy = sd->du.dy*f0 + sd->dv.dy*f1 - (sd->du.dy + sd->dv.dy)*f2;
+#endif
+
+		return sd->u*f0 + sd->v*f1 + (1.0f - sd->u - sd->v)*f2;
+	}
+	else if(elem == ATTR_ELEMENT_CORNER) {
+		int tri = offset + sd->prim*3;
+		float f0 = kernel_tex_fetch(__attributes_float, tri + 0);
+		float f1 = kernel_tex_fetch(__attributes_float, tri + 1);
+		float f2 = kernel_tex_fetch(__attributes_float, tri + 2);
+
+#ifdef __RAY_DIFFERENTIALS__
+		if(dx) *dx = sd->du.dx*f0 + sd->dv.dx*f1 - (sd->du.dx + sd->dv.dx)*f2;
+		if(dy) *dy = sd->du.dy*f0 + sd->dv.dy*f1 - (sd->du.dy + sd->dv.dy)*f2;
+#endif
+
+		return sd->u*f0 + sd->v*f1 + (1.0f - sd->u - sd->v)*f2;
+	}
+	else {
+		if(dx) *dx = 0.0f;
+		if(dy) *dy = 0.0f;
+
+		return 0.0f;
+	}
+}
+
+ccl_device float3 triangle_attribute_float3(KernelGlobals *kg, const ShaderData *sd, AttributeElement elem, int offset, float3 *dx, float3 *dy)
+{
+	if(elem == ATTR_ELEMENT_FACE) {
+		if(dx) *dx = make_float3(0.0f, 0.0f, 0.0f);
+		if(dy) *dy = make_float3(0.0f, 0.0f, 0.0f);
+
+		return float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + sd->prim));
+	}
+	else if(elem == ATTR_ELEMENT_VERTEX || elem == ATTR_ELEMENT_VERTEX_MOTION) {
+		float3 tri_vindex = float4_to_float3(kernel_tex_fetch(__tri_vindex, sd->prim));
+
+		float3 f0 = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + __float_as_int(tri_vindex.x)));
+		float3 f1 = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + __float_as_int(tri_vindex.y)));
+		float3 f2 = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + __float_as_int(tri_vindex.z)));
+
+#ifdef __RAY_DIFFERENTIALS__
+		if(dx) *dx = sd->du.dx*f0 + sd->dv.dx*f1 - (sd->du.dx + sd->dv.dx)*f2;
+		if(dy) *dy = sd->du.dy*f0 + sd->dv.dy*f1 - (sd->du.dy + sd->dv.dy)*f2;
+#endif
+
+		return sd->u*f0 + sd->v*f1 + (1.0f - sd->u - sd->v)*f2;
+	}
+	else if(elem == ATTR_ELEMENT_CORNER) {
+		int tri = offset + sd->prim*3;
+		float3 f0 = float4_to_float3(kernel_tex_fetch(__attributes_float3, tri + 0));
+		float3 f1 = float4_to_float3(kernel_tex_fetch(__attributes_float3, tri + 1));
+		float3 f2 = float4_to_float3(kernel_tex_fetch(__attributes_float3, tri + 2));
+
+#ifdef __RAY_DIFFERENTIALS__
+		if(dx) *dx = sd->du.dx*f0 + sd->dv.dx*f1 - (sd->du.dx + sd->dv.dx)*f2;
+		if(dy) *dy = sd->du.dy*f0 + sd->dv.dy*f1 - (sd->du.dy + sd->dv.dy)*f2;
+#endif
+
+		return sd->u*f0 + sd->v*f1 + (1.0f - sd->u - sd->v)*f2;
+	}
+	else {
+		if(dx) *dx = make_float3(0.0f, 0.0f, 0.0f);
+		if(dy) *dy = make_float3(0.0f, 0.0f, 0.0f);
+
+		return make_float3(0.0f, 0.0f, 0.0f);
+	}
+}
+
+/* Ray-Triangle intersection for BVH traversal
+ *
+ * Based on Sven Woop's algorithm with precomputed triangle storage */
+
+ccl_device_inline bool triangle_intersect(KernelGlobals *kg, Intersection *isect,
+	float3 P, float3 dir, uint visibility, int object, int triAddr)
+{
+	/* compute and check intersection t-value */
+	float4 v00 = kernel_tex_fetch(__tri_woop, triAddr*TRI_NODE_SIZE+0);
+	float4 v11 = kernel_tex_fetch(__tri_woop, triAddr*TRI_NODE_SIZE+1);
+
+	float Oz = v00.w - P.x*v00.x - P.y*v00.y - P.z*v00.z;
+	float invDz = 1.0f/(dir.x*v00.x + dir.y*v00.y + dir.z*v00.z);
+	float t = Oz * invDz;
+
+	if(t > 0.0f && t < isect->t) {
+		/* compute and check barycentric u */
+		float Ox = v11.w + P.x*v11.x + P.y*v11.y + P.z*v11.z;
+		float Dx = dir.x*v11.x + dir.y*v11.y + dir.z*v11.z;
+		float u = Ox + t*Dx;
+
+		if(u >= 0.0f) {
+			/* compute and check barycentric v */
+			float4 v22 = kernel_tex_fetch(__tri_woop, triAddr*TRI_NODE_SIZE+2);
+			float Oy = v22.w + P.x*v22.x + P.y*v22.y + P.z*v22.z;
+			float Dy = dir.x*v22.x + dir.y*v22.y + dir.z*v22.z;
+			float v = Oy + t*Dy;
+
+			if(v >= 0.0f && u + v <= 1.0f) {
+#ifdef __VISIBILITY_FLAG__
+				/* visibility flag test. we do it here under the assumption
+				 * that most triangles are culled by node flags */
+				if(kernel_tex_fetch(__prim_visibility, triAddr) & visibility)
+#endif
+				{
+					/* record intersection */
+					isect->prim = triAddr;
+					isect->object = object;
+					isect->type = PRIMITIVE_TRIANGLE;
+					isect->u = u;
+					isect->v = v;
+					isect->t = t;
+					return true;
+				}
+			}
+		}
+	}
+
+	return false;
+}
+
+/* Special ray intersection routines for subsurface scattering. In that case we
+ * only want to intersect with primitives in the same object, and in case of
+ * multiple hits we pick a single random primitive as the intersection point. */
+
+#ifdef __SUBSURFACE__
+ccl_device_inline void triangle_intersect_subsurface(KernelGlobals *kg, Intersection *isect_array,
+	float3 P, float3 dir, int object, int triAddr, float tmax, uint *num_hits, uint *lcg_state, int max_hits)
+{
+	/* compute and check intersection t-value */
+	float4 v00 = kernel_tex_fetch(__tri_woop, triAddr*TRI_NODE_SIZE+0);
+	float4 v11 = kernel_tex_fetch(__tri_woop, triAddr*TRI_NODE_SIZE+1);
+
+	float Oz = v00.w - P.x*v00.x - P.y*v00.y - P.z*v00.z;
+	float invDz = 1.0f/(dir.x*v00.x + dir.y*v00.y + dir.z*v00.z);
+	float t = Oz * invDz;
+
+	if(t > 0.0f && t < tmax) {
+		/* compute and check barycentric u */
+		float Ox = v11.w + P.x*v11.x + P.y*v11.y + P.z*v11.z;
+		float Dx = dir.x*v11.x + dir.y*v11.y + dir.z*v11.z;
+		float u = Ox + t*Dx;
+
+		if(u >= 0.0f) {
+			/* compute and check barycentric v */
+			float4 v22 = kernel_tex_fetch(__tri_woop, triAddr*TRI_NODE_SIZE+2);
+			float Oy = v22.w + P.x*v22.x + P.y*v22.y + P.z*v22.z;
+			float Dy = dir.x*v22.x + dir.y*v22.y + dir.z*v22.z;
+			float v = Oy + t*Dy;
+
+			if(v >= 0.0f && u + v <= 1.0f) {
+				(*num_hits)++;
+
+				int hit;
+
+				if(*num_hits <= max_hits) {
+					hit = *num_hits - 1;
+				}
+				else {
+					/* reservoir sampling: if we are at the maximum number of
+					 * hits, randomly replace element or skip it */
+					hit = lcg_step_uint(lcg_state) % *num_hits;
+
+					if(hit >= max_hits)
+						return;
+				}
+
+				/* record intersection */
+				Intersection *isect = &isect_array[hit];
+				isect->prim = triAddr;
+				isect->object = object;
+				isect->type = PRIMITIVE_TRIANGLE;
+				isect->u = u;
+				isect->v = v;
+				isect->t = t;
+			}
+		}
+	}
+}
+#endif
+
+CCL_NAMESPACE_END
+
diff --git a/intern/cycles/kernel/geom/geom_volume.h b/intern/cycles/kernel/geom/geom_volume.h
new file mode 100644
index 00000000000..963d6cbee9c
--- /dev/null
+++ b/intern/cycles/kernel/geom/geom_volume.h
@@ -0,0 +1,75 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License
+ */
+
+/* Volume Primitive
+ *
+ * Volumes are just regions inside meshes with the mesh surface as boundaries.
+ * There isn't as much data to access as for surfaces, there is only a position
+ * to do lookups in 3D voxel or procedural textures.
+ *
+ * 3D voxel textures can be assigned as attributes per mesh, which means the
+ * same shader can be used for volume objects with different densities, etc. */
+
+CCL_NAMESPACE_BEGIN
+
+#ifdef __VOLUME__
+
+/* Return position normalized to 0..1 in mesh bounds */
+
+ccl_device float3 volume_normalized_position(KernelGlobals *kg, const ShaderData *sd, float3 P)
+{
+	/* todo: optimize this so it's just a single matrix multiplication when
+	 * possible (not motion blur), or perhaps even just translation + scale */
+	AttributeElement attr_elem;
+	int attr_offset = find_attribute(kg, sd, ATTR_STD_GENERATED_TRANSFORM, &attr_elem);
+
+	object_inverse_position_transform(kg, sd, &P);
+
+	if(attr_offset != ATTR_STD_NOT_FOUND) {
+		Transform tfm = primitive_attribute_matrix(kg, sd, attr_offset);
+		P = transform_point(&tfm, P);
+	}
+
+	return P;
+}
+
+ccl_device float volume_attribute_float(KernelGlobals *kg, const ShaderData *sd, AttributeElement elem, int id, float *dx, float *dy)
+{
+	float3 P = volume_normalized_position(kg, sd, sd->P);
+	float4 r = kernel_tex_image_interp_3d(id, P.x, P.y, P.z);
+	/* derivatives are not computed here; zero each output pointer that was passed */
+	if(dx) *dx = 0.0f;
+	if(dy) *dy = 0.0f;
+
+	/* todo: support float textures to lower memory usage for single floats */
+	return average(float4_to_float3(r));
+}
+
+ccl_device float3 volume_attribute_float3(KernelGlobals *kg, const ShaderData *sd, AttributeElement elem, int id, float3 *dx, float3 *dy)
+{
+	float3 P = volume_normalized_position(kg, sd, sd->P);
+	float4 r = kernel_tex_image_interp_3d(id, P.x, P.y, P.z);
+
+	if(dx) *dx = make_float3(0.0f, 0.0f, 0.0f);
+	if(dy) *dy = make_float3(0.0f, 0.0f, 0.0f);
+
+	return float4_to_float3(r);
+}
+
+#endif
+
+CCL_NAMESPACE_END
+
diff --git a/intern/cycles/kernel/kernel.cpp b/intern/cycles/kernel/kernel.cpp
index 6cd14d3c51c..173028d50c8 100644
--- a/intern/cycles/kernel/kernel.cpp
+++ b/intern/cycles/kernel/kernel.cpp
@@ -37,7 +37,7 @@ void kernel_const_copy(KernelGlobals *kg, const char *name, void *host, size_t s
 		assert(0);
 }
 
-void kernel_tex_copy(KernelGlobals *kg, const char *name, device_ptr mem, size_t width, size_t height)
+void kernel_tex_copy(KernelGlobals *kg, const char *name, device_ptr mem, size_t width, size_t height, size_t depth, InterpolationType interpolation)
 {
 	if(0) {
 	}
@@ -61,8 +61,8 @@ void kernel_tex_copy(KernelGlobals *kg, const char *name, device_ptr mem, size_t
 
 		if(tex) {
 			tex->data = (float4*)mem;
-			tex->width = width;
-			tex->height = height;
+			tex->dimensions_set(width, height, depth);
+			tex->interpolation = interpolation;
 		}
 	}
 	else if(strstr(name, "__tex_image")) {
@@ -76,8 +76,8 @@ void kernel_tex_copy(KernelGlobals *kg, const char *name, device_ptr mem, size_t
 
 		if(tex) {
 			tex->data = (uchar4*)mem;
-			tex->width = width;
-			tex->height = height;
+			tex->dimensions_set(width, height, depth);
+			tex->interpolation = interpolation;
 		}
 	}
 	else
diff --git a/intern/cycles/kernel/kernel.cu b/intern/cycles/kernel/kernel.cu
index 5e6748c66fc..636e48b5456 100644
--- a/intern/cycles/kernel/kernel.cu
+++ b/intern/cycles/kernel/kernel.cu
@@ -24,7 +24,83 @@
 #include "kernel_path.h"
 #include "kernel_displace.h"
 
-extern "C" __global__ void kernel_cuda_path_trace(float *buffer, uint *rng_state, int sample, int sx, int sy, int sw, int sh, int offset, int stride)
+/* device data taken from CUDA occupancy calculator */
+
+#ifdef __CUDA_ARCH__
+
+/* 2.0 and 2.1 */
+#if __CUDA_ARCH__ == 200 || __CUDA_ARCH__ == 210
+#define CUDA_MULTIPRESSOR_MAX_REGISTERS 32768
+#define CUDA_MULTIPROCESSOR_MAX_BLOCKS 8
+#define CUDA_BLOCK_MAX_THREADS 1024
+#define CUDA_THREAD_MAX_REGISTERS 63
+
+/* tunable parameters */
+#define CUDA_THREADS_BLOCK_WIDTH 16
+#define CUDA_KERNEL_MAX_REGISTERS 32
+#define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 40
+
+/* 3.0 and 3.5 */
+#elif __CUDA_ARCH__ == 300 || __CUDA_ARCH__ == 350
+#define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536
+#define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16
+#define CUDA_BLOCK_MAX_THREADS 1024
+#define CUDA_THREAD_MAX_REGISTERS 63
+
+/* tunable parameters */
+#define CUDA_THREADS_BLOCK_WIDTH 16
+#define CUDA_KERNEL_MAX_REGISTERS 63
+#define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63
+
+/* 5.0 */
+#elif __CUDA_ARCH__ == 500
+#define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536
+#define CUDA_MULTIPROCESSOR_MAX_BLOCKS 32
+#define CUDA_BLOCK_MAX_THREADS 1024
+#define CUDA_THREAD_MAX_REGISTERS 255
+
+/* tunable parameters */
+#define CUDA_THREADS_BLOCK_WIDTH 16
+#define CUDA_KERNEL_MAX_REGISTERS 63
+#define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63
+
+/* unknown architecture */
+#else
+#error "Unknown or unuspported CUDA architecture, can't determine launch bounds"
+#endif
+
+/* compute number of threads per block and minimum blocks per multiprocessor
+ * given the maximum number of registers per thread */
+
+#define CUDA_LAUNCH_BOUNDS(threads_block_width, thread_num_registers) \
+	__launch_bounds__( \
+		threads_block_width*threads_block_width, \
+		CUDA_MULTIPRESSOR_MAX_REGISTERS/(threads_block_width*threads_block_width*thread_num_registers) \
+		)
+
+/* sanity checks */
+
+#if CUDA_THREADS_BLOCK_WIDTH*CUDA_THREADS_BLOCK_WIDTH > CUDA_BLOCK_MAX_THREADS
+#error "Maximum number of threads per block exceeded"
+#endif
+
+#if CUDA_MULTIPRESSOR_MAX_REGISTERS/(CUDA_THREADS_BLOCK_WIDTH*CUDA_THREADS_BLOCK_WIDTH*CUDA_KERNEL_MAX_REGISTERS) > CUDA_MULTIPROCESSOR_MAX_BLOCKS
+#error "Maximum number of blocks per multiprocessor exceeded"
+#endif
+
+#if CUDA_KERNEL_MAX_REGISTERS > CUDA_THREAD_MAX_REGISTERS
+#error "Maximum number of registers per thread exceeded"
+#endif
+
+#if CUDA_KERNEL_BRANCHED_MAX_REGISTERS > CUDA_THREAD_MAX_REGISTERS
+#error "Maximum number of registers per thread exceeded"
+#endif
+
+/* kernels */
+
+extern "C" __global__ void
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_path_trace(float *buffer, uint *rng_state, int sample, int sx, int sy, int sw, int sh, int offset, int stride)
 {
 	int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
 	int y = sy + blockDim.y*blockIdx.y + threadIdx.y;
@@ -34,7 +110,9 @@ extern "C" __global__ void kernel_cuda_path_trace(float *buffer, uint *rng_state
 }
 
 #ifdef __BRANCHED_PATH__
-extern "C" __global__ void kernel_cuda_branched_path_trace(float *buffer, uint *rng_state, int sample, int sx, int sy, int sw, int sh, int offset, int stride)
+extern "C" __global__ void
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_BRANCHED_MAX_REGISTERS)
+kernel_cuda_branched_path_trace(float *buffer, uint *rng_state, int sample, int sx, int sy, int sw, int sh, int offset, int stride)
 {
 	int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
 	int y = sy + blockDim.y*blockIdx.y + threadIdx.y;
@@ -44,7 +122,9 @@ extern "C" __global__ void kernel_cuda_branched_path_trace(float *buffer, uint *
 }
 #endif
 
-extern "C" __global__ void kernel_cuda_convert_to_byte(uchar4 *rgba, float *buffer, float sample_scale, int sx, int sy, int sw, int sh, int offset, int stride)
+extern "C" __global__ void
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_convert_to_byte(uchar4 *rgba, float *buffer, float sample_scale, int sx, int sy, int sw, int sh, int offset, int stride)
 {
 	int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
 	int y = sy + blockDim.y*blockIdx.y + threadIdx.y;
@@ -53,7 +133,9 @@ extern "C" __global__ void kernel_cuda_convert_to_byte(uchar4 *rgba, float *buff
 		kernel_film_convert_to_byte(NULL, rgba, buffer, sample_scale, x, y, offset, stride);
 }
 
-extern "C" __global__ void kernel_cuda_convert_to_half_float(uchar4 *rgba, float *buffer, float sample_scale, int sx, int sy, int sw, int sh, int offset, int stride)
+extern "C" __global__ void
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_convert_to_half_float(uchar4 *rgba, float *buffer, float sample_scale, int sx, int sy, int sw, int sh, int offset, int stride)
 {
 	int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
 	int y = sy + blockDim.y*blockIdx.y + threadIdx.y;
@@ -62,10 +144,14 @@ extern "C" __global__ void kernel_cuda_convert_to_half_float(uchar4 *rgba, float
 		kernel_film_convert_to_half_float(NULL, rgba, buffer, sample_scale, x, y, offset, stride);
 }
 
-extern "C" __global__ void kernel_cuda_shader(uint4 *input, float4 *output, int type, int sx)
+extern "C" __global__ void
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_shader(uint4 *input, float4 *output, int type, int sx)
 {
 	int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
 
 	kernel_shader_evaluate(NULL, input, output, (ShaderEvalType)type, x);
 }
 
+#endif
+
diff --git a/intern/cycles/kernel/kernel.h b/intern/cycles/kernel/kernel.h
index 039dc791b08..c4a08646bab 100644
--- a/intern/cycles/kernel/kernel.h
+++ b/intern/cycles/kernel/kernel.h
@@ -32,7 +32,7 @@ void *kernel_osl_memory(KernelGlobals *kg);
 bool kernel_osl_use(KernelGlobals *kg);
 
 void kernel_const_copy(KernelGlobals *kg, const char *name, void *host, size_t size);
-void kernel_tex_copy(KernelGlobals *kg, const char *name, device_ptr mem, size_t width, size_t height);
+void kernel_tex_copy(KernelGlobals *kg, const char *name, device_ptr mem, size_t width, size_t height, size_t depth, InterpolationType interpolation=INTERPOLATION_LINEAR);
 
 void kernel_cpu_path_trace(KernelGlobals *kg, float *buffer, unsigned int *rng_state,
 	int sample, int x, int y, int offset, int stride);
diff --git a/intern/cycles/kernel/kernel_accumulate.h b/intern/cycles/kernel/kernel_accumulate.h
index 582a220ab3c..b4f6dcdace9 100644
--- a/intern/cycles/kernel/kernel_accumulate.h
+++ b/intern/cycles/kernel/kernel_accumulate.h
@@ -407,5 +407,30 @@ ccl_device_inline float3 path_radiance_clamp_and_sum(KernelGlobals *kg, PathRadi
 	return L_sum;
 }
 
+ccl_device_inline void path_radiance_accum_sample(PathRadiance *L, PathRadiance *L_sample, int num_samples)
+{
+	float fac = 1.0f/num_samples;
+
+#ifdef __PASSES__
+	L->direct_diffuse += L_sample->direct_diffuse*fac;
+	L->direct_glossy += L_sample->direct_glossy*fac;
+	L->direct_transmission += L_sample->direct_transmission*fac;
+	L->direct_subsurface += L_sample->direct_subsurface*fac;
+
+	L->indirect_diffuse += L_sample->indirect_diffuse*fac;
+	L->indirect_glossy += L_sample->indirect_glossy*fac;
+	L->indirect_transmission += L_sample->indirect_transmission*fac;
+	L->indirect_subsurface += L_sample->indirect_subsurface*fac;
+
+	L->emission += L_sample->emission*fac;
+	L->background += L_sample->background*fac;
+	L->ao += L_sample->ao*fac;
+	L->shadow += L_sample->shadow*fac;
+	L->mist += L_sample->mist*fac;
+#else
+	*L += *L_sample * fac;
+#endif
+}
+
 CCL_NAMESPACE_END
 
diff --git a/intern/cycles/kernel/kernel_avx.cpp b/intern/cycles/kernel/kernel_avx.cpp
index d2a7142c551..354214c406e 100644
--- a/intern/cycles/kernel/kernel_avx.cpp
+++ b/intern/cycles/kernel/kernel_avx.cpp
@@ -77,6 +77,6 @@ CCL_NAMESPACE_END
 
 /* needed for some linkers in combination with scons making empty compilation unit in a library */
 void __dummy_function_cycles_avx(void);
-void __dummy_function_cycles_avx(void){}
+void __dummy_function_cycles_avx(void) {}
 
 #endif
diff --git a/intern/cycles/kernel/kernel_bvh.h b/intern/cycles/kernel/kernel_bvh.h
deleted file mode 100644
index 93e546eaece..00000000000
--- a/intern/cycles/kernel/kernel_bvh.h
+++ /dev/null
@@ -1,1258 +0,0 @@
-/*
- * Adapted from code Copyright 2009-2010 NVIDIA Corporation
- * Modifications Copyright 2011, Blender Foundation.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-/*
- * "Persistent while-while kernel" used in:
- *
- * "Understanding the Efficiency of Ray Traversal on GPUs",
- * Timo Aila and Samuli Laine,
- * Proc. High-Performance Graphics 2009
- */
-
-/* bottom-most stack entry, indicating the end of traversal */
-#define ENTRYPOINT_SENTINEL 0x76543210
-
-/* 64 object BVH + 64 mesh BVH + 64 object node splitting */
-#define BVH_STACK_SIZE 192
-#define BVH_NODE_SIZE 4
-#define TRI_NODE_SIZE 3
-
-/* silly workaround for float extended precision that happens when compiling
- * without sse support on x86, it results in different results for float ops
- * that you would otherwise expect to compare correctly */
-#if !defined(__i386__) || defined(__SSE__)
-#define NO_EXTENDED_PRECISION
-#else
-#define NO_EXTENDED_PRECISION volatile
-#endif
-
-ccl_device_inline float3 bvh_inverse_direction(float3 dir)
-{
-	/* avoid divide by zero (ooeps = exp2f(-80.0f)) */
-	float ooeps = 0.00000000000000000000000082718061255302767487140869206996285356581211090087890625f;
-	float3 idir;
-
-	idir.x = 1.0f/((fabsf(dir.x) > ooeps)? dir.x: copysignf(ooeps, dir.x));
-	idir.y = 1.0f/((fabsf(dir.y) > ooeps)? dir.y: copysignf(ooeps, dir.y));
-	idir.z = 1.0f/((fabsf(dir.z) > ooeps)? dir.z: copysignf(ooeps, dir.z));
-
-	return idir;
-}
-
-ccl_device_inline void bvh_instance_push(KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *idir, float *t, const float tmax)
-{
-	Transform tfm = object_fetch_transform(kg, object, OBJECT_INVERSE_TRANSFORM);
-
-	*P = transform_point(&tfm, ray->P);
-
-	float3 dir = transform_direction(&tfm, ray->D);
-
-	float len;
-	dir = normalize_len(dir, &len);
-
-	*idir = bvh_inverse_direction(dir);
-
-	if(*t != FLT_MAX)
-		*t *= len;
-}
-
-ccl_device_inline void bvh_instance_pop(KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *idir, float *t, const float tmax)
-{
-	if(*t != FLT_MAX) {
-		Transform tfm = object_fetch_transform(kg, object, OBJECT_TRANSFORM);
-		*t *= len(transform_direction(&tfm, 1.0f/(*idir)));
-	}
-
-	*P = ray->P;
-	*idir = bvh_inverse_direction(ray->D);
-}
-
-#ifdef __OBJECT_MOTION__
-ccl_device_inline void bvh_instance_motion_push(KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *idir, float *t, Transform *tfm, const float tmax)
-{
-	Transform itfm;
-	*tfm = object_fetch_transform_motion_test(kg, object, ray->time, &itfm);
-
-	*P = transform_point(&itfm, ray->P);
-
-	float3 dir = transform_direction(&itfm, ray->D);
-
-	float len;
-	dir = normalize_len(dir, &len);
-
-	*idir = bvh_inverse_direction(dir);
-
-	if(*t != FLT_MAX)
-		*t *= len;
-}
-
-ccl_device_inline void bvh_instance_motion_pop(KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *idir, float *t, Transform *tfm, const float tmax)
-{
-	if(*t != FLT_MAX)
-		*t *= len(transform_direction(tfm, 1.0f/(*idir)));
-
-	*P = ray->P;
-	*idir = bvh_inverse_direction(ray->D);
-}
-#endif
-
-/* Sven Woop's algorithm */
-ccl_device_inline bool bvh_triangle_intersect(KernelGlobals *kg, Intersection *isect,
-	float3 P, float3 idir, uint visibility, int object, int triAddr)
-{
-	/* compute and check intersection t-value */
-	float4 v00 = kernel_tex_fetch(__tri_woop, triAddr*TRI_NODE_SIZE+0);
-	float4 v11 = kernel_tex_fetch(__tri_woop, triAddr*TRI_NODE_SIZE+1);
-	float3 dir = 1.0f/idir;
-
-	float Oz = v00.w - P.x*v00.x - P.y*v00.y - P.z*v00.z;
-	float invDz = 1.0f/(dir.x*v00.x + dir.y*v00.y + dir.z*v00.z);
-	float t = Oz * invDz;
-
-	if(t > 0.0f && t < isect->t) {
-		/* compute and check barycentric u */
-		float Ox = v11.w + P.x*v11.x + P.y*v11.y + P.z*v11.z;
-		float Dx = dir.x*v11.x + dir.y*v11.y + dir.z*v11.z;
-		float u = Ox + t*Dx;
-
-		if(u >= 0.0f) {
-			/* compute and check barycentric v */
-			float4 v22 = kernel_tex_fetch(__tri_woop, triAddr*TRI_NODE_SIZE+2);
-			float Oy = v22.w + P.x*v22.x + P.y*v22.y + P.z*v22.z;
-			float Dy = dir.x*v22.x + dir.y*v22.y + dir.z*v22.z;
-			float v = Oy + t*Dy;
-
-			if(v >= 0.0f && u + v <= 1.0f) {
-#ifdef __VISIBILITY_FLAG__
-				/* visibility flag test. we do it here under the assumption
-				 * that most triangles are culled by node flags */
-				if(kernel_tex_fetch(__prim_visibility, triAddr) & visibility)
-#endif
-				{
-					/* record intersection */
-					isect->prim = triAddr;
-					isect->object = object;
-					isect->u = u;
-					isect->v = v;
-					isect->t = t;
-					return true;
-				}
-			}
-		}
-	}
-
-	return false;
-}
-
-#ifdef __HAIR__
-ccl_device_inline void curvebounds(float *lower, float *upper, float *extremta, float *extrema, float *extremtb, float *extremb, float p0, float p1, float p2, float p3)
-{
-	float halfdiscroot = (p2 * p2 - 3 * p3 * p1);
-	float ta = -1.0f;
-	float tb = -1.0f;
-	*extremta = -1.0f;
-	*extremtb = -1.0f;
-	*upper = p0;
-	*lower = p0 + p1 + p2 + p3;
-	*extrema = *upper;
-	*extremb = *lower;
-	if(*lower >= *upper) {
-		*upper = *lower;
-		*lower = p0;
-	}
-
-	if(halfdiscroot >= 0) {
-		halfdiscroot = sqrt(halfdiscroot);
-		ta = (-p2 - halfdiscroot) / (3 * p3);
-		tb = (-p2 + halfdiscroot) / (3 * p3);
-	}
-
-	float t2;
-	float t3;
-	if(ta > 0.0f && ta < 1.0f) {
-		t2 = ta * ta;
-		t3 = t2 * ta;
-		*extremta = ta;
-		*extrema = p3 * t3 + p2 * t2 + p1 * ta + p0;
-		if(*extrema > *upper) {
-			*upper = *extrema;
-		}
-		if(*extrema < *lower) {
-			*lower = *extrema;
-		}
-	}
-	if(tb > 0.0f && tb < 1.0f) {
-		t2 = tb * tb;
-		t3 = t2 * tb;
-		*extremtb = tb;
-		*extremb = p3 * t3 + p2 * t2 + p1 * tb + p0;
-		if(*extremb >= *upper) {
-			*upper = *extremb;
-		}
-		if(*extremb <= *lower) {
-			*lower = *extremb;
-		}
-	}
-}
-
-#ifdef __KERNEL_SSE2__
-ccl_device_inline __m128 transform_point_T3(const __m128 t[3], const __m128 &a)
-{
-	return fma(broadcast<0>(a), t[0], fma(broadcast<1>(a), t[1], _mm_mul_ps(broadcast<2>(a), t[2])));
-}
-#endif
-
-#ifdef __KERNEL_SSE2__
-/* Pass P and idir by reference to aligned vector */
-ccl_device_inline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersection *isect,
-	const float3 &P, const float3 &idir, uint visibility, int object, int curveAddr, int segment, uint *lcg_state, float difl, float extmax)
-#else
-ccl_device_inline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersection *isect,
-	float3 P, float3 idir, uint visibility, int object, int curveAddr, int segment, uint *lcg_state, float difl, float extmax)
-#endif
-{
-	float epsilon = 0.0f;
-	float r_st, r_en;
-
-	int depth = kernel_data.curve.subdivisions;
-	int flags = kernel_data.curve.curveflags;
-	int prim = kernel_tex_fetch(__prim_index, curveAddr);
-
-#ifdef __KERNEL_SSE2__
-	__m128 vdir = _mm_div_ps(_mm_set1_ps(1.0f), (__m128 &)idir);
-	__m128 vcurve_coef[4];
-	const float3 *curve_coef = (float3 *)vcurve_coef;
-	
-	{
-		__m128 dtmp = _mm_mul_ps(vdir, vdir);
-		__m128 d_ss = _mm_sqrt_ss(_mm_add_ss(dtmp, broadcast<2>(dtmp)));
-		__m128 rd_ss = _mm_div_ss(_mm_set_ss(1.0f), d_ss);
-
-		__m128i v00vec = _mm_load_si128((__m128i *)&kg->__curves.data[prim]);
-		int2 &v00 = (int2 &)v00vec;
-
-		int k0 = v00.x + segment;
-		int k1 = k0 + 1;
-		int ka = max(k0 - 1, v00.x);
-		int kb = min(k1 + 1, v00.x + v00.y - 1);
-
-		__m128 P0 = _mm_load_ps(&kg->__curve_keys.data[ka].x);
-		__m128 P1 = _mm_load_ps(&kg->__curve_keys.data[k0].x);
-		__m128 P2 = _mm_load_ps(&kg->__curve_keys.data[k1].x);
-		__m128 P3 = _mm_load_ps(&kg->__curve_keys.data[kb].x);
-
-		__m128 rd_sgn = set_sign_bit<0, 1, 1, 1>(broadcast<0>(rd_ss));
-		__m128 mul_zxxy = _mm_mul_ps(shuffle<2, 0, 0, 1>(vdir), rd_sgn);
-		__m128 mul_yz = _mm_mul_ps(shuffle<1, 2, 1, 2>(vdir), mul_zxxy);
-		__m128 mul_shuf = shuffle<0, 1, 2, 3>(mul_zxxy, mul_yz);
-		__m128 vdir0 = _mm_and_ps(vdir, _mm_castsi128_ps(_mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0)));
-
-		__m128 htfm0 = shuffle<0, 2, 0, 3>(mul_shuf, vdir0);
-		__m128 htfm1 = shuffle<1, 0, 1, 3>(_mm_set_ss(_mm_cvtss_f32(d_ss)), vdir0);
-		__m128 htfm2 = shuffle<1, 3, 2, 3>(mul_shuf, vdir0);
-
-		__m128 htfm[] = { htfm0, htfm1, htfm2 };
-		__m128 p0 = transform_point_T3(htfm, _mm_sub_ps(P0, (__m128 &)P));
-		__m128 p1 = transform_point_T3(htfm, _mm_sub_ps(P1, (__m128 &)P));
-		__m128 p2 = transform_point_T3(htfm, _mm_sub_ps(P2, (__m128 &)P));
-		__m128 p3 = transform_point_T3(htfm, _mm_sub_ps(P3, (__m128 &)P));
-
-		float fc = 0.71f;
-		__m128 vfc = _mm_set1_ps(fc);
-		__m128 vfcxp3 = _mm_mul_ps(vfc, p3);
-
-		vcurve_coef[0] = p1;
-		vcurve_coef[1] = _mm_mul_ps(vfc, _mm_sub_ps(p2, p0));
-		vcurve_coef[2] = fma(_mm_set1_ps(fc * 2.0f), p0, fma(_mm_set1_ps(fc - 3.0f), p1, fms(_mm_set1_ps(3.0f - 2.0f * fc), p2, vfcxp3)));
-		vcurve_coef[3] = fms(_mm_set1_ps(fc - 2.0f), _mm_sub_ps(p2, p1), fms(vfc, p0, vfcxp3));
-
-		r_st = ((float4 &)P1).w;
-		r_en = ((float4 &)P2).w;
-	}
-#else
-	float3 curve_coef[4];
-
-	/* curve Intersection check */
-	float3 dir = 1.0f/idir;
-
-	/* obtain curve parameters */
-	{
-		/* ray transform created - this should be created at beginning of intersection loop */
-		Transform htfm;
-		float d = sqrtf(dir.x * dir.x + dir.z * dir.z);
-		htfm = make_transform(
-			dir.z / d, 0, -dir.x /d, 0,
-			-dir.x * dir.y /d, d, -dir.y * dir.z /d, 0,
-			dir.x, dir.y, dir.z, 0,
-			0, 0, 0, 1);
-
-		float4 v00 = kernel_tex_fetch(__curves, prim);
-
-		int k0 = __float_as_int(v00.x) + segment;
-		int k1 = k0 + 1;
-
-		int ka = max(k0 - 1,__float_as_int(v00.x));
-		int kb = min(k1 + 1,__float_as_int(v00.x) + __float_as_int(v00.y) - 1);
-
-		float4 P0 = kernel_tex_fetch(__curve_keys, ka);
-		float4 P1 = kernel_tex_fetch(__curve_keys, k0);
-		float4 P2 = kernel_tex_fetch(__curve_keys, k1);
-		float4 P3 = kernel_tex_fetch(__curve_keys, kb);
-
-		float3 p0 = transform_point(&htfm, float4_to_float3(P0) - P);
-		float3 p1 = transform_point(&htfm, float4_to_float3(P1) - P);
-		float3 p2 = transform_point(&htfm, float4_to_float3(P2) - P);
-		float3 p3 = transform_point(&htfm, float4_to_float3(P3) - P);
-
-		float fc = 0.71f;
-		curve_coef[0] = p1;
-		curve_coef[1] = -fc*p0 + fc*p2;
-		curve_coef[2] = 2.0f * fc * p0 + (fc - 3.0f) * p1 + (3.0f - 2.0f * fc) * p2 - fc * p3;
-		curve_coef[3] = -fc * p0 + (2.0f - fc) * p1 + (fc - 2.0f) * p2 + fc * p3;
-		r_st = P1.w;
-		r_en = P2.w;
-	}
-#endif
-
-	float r_curr = max(r_st, r_en);
-
-	if((flags & CURVE_KN_RIBBONS) || !(flags & CURVE_KN_BACKFACING))
-		epsilon = 2 * r_curr;
-
-	/* find bounds - this is slow for cubic curves */
-	float upper, lower;
-
-	float zextrem[4];
-	curvebounds(&lower, &upper, &zextrem[0], &zextrem[1], &zextrem[2], &zextrem[3], curve_coef[0].z, curve_coef[1].z, curve_coef[2].z, curve_coef[3].z);
-	if(lower - r_curr > isect->t || upper + r_curr < epsilon)
-		return false;
-
-	/* minimum width extension */
-	float mw_extension = min(difl * fabsf(upper), extmax);
-	float r_ext = mw_extension + r_curr;
-
-	float xextrem[4];
-	curvebounds(&lower, &upper, &xextrem[0], &xextrem[1], &xextrem[2], &xextrem[3], curve_coef[0].x, curve_coef[1].x, curve_coef[2].x, curve_coef[3].x);
-	if(lower > r_ext || upper < -r_ext)
-		return false;
-
-	float yextrem[4];
-	curvebounds(&lower, &upper, &yextrem[0], &yextrem[1], &yextrem[2], &yextrem[3], curve_coef[0].y, curve_coef[1].y, curve_coef[2].y, curve_coef[3].y);
-	if(lower > r_ext || upper < -r_ext)
-		return false;
-
-	/* setup recurrent loop */
-	int level = 1 << depth;
-	int tree = 0;
-	float resol = 1.0f / (float)level;
-	bool hit = false;
-
-	/* begin loop */
-	while(!(tree >> (depth))) {
-		float i_st = tree * resol;
-		float i_en = i_st + (level * resol);
-#ifdef __KERNEL_SSE2__
-		__m128 vi_st = _mm_set1_ps(i_st), vi_en = _mm_set1_ps(i_en);
-		__m128 vp_st = fma(fma(fma(vcurve_coef[3], vi_st, vcurve_coef[2]), vi_st, vcurve_coef[1]), vi_st, vcurve_coef[0]);
-		__m128 vp_en = fma(fma(fma(vcurve_coef[3], vi_en, vcurve_coef[2]), vi_en, vcurve_coef[1]), vi_en, vcurve_coef[0]);
-
-		__m128 vbmin = _mm_min_ps(vp_st, vp_en);
-		__m128 vbmax = _mm_max_ps(vp_st, vp_en);
-
-		float3 &bmin = (float3 &)vbmin, &bmax = (float3 &)vbmax;
-		float &bminx = bmin.x, &bminy = bmin.y, &bminz = bmin.z;
-		float &bmaxx = bmax.x, &bmaxy = bmax.y, &bmaxz = bmax.z;
-		float3 &p_st = (float3 &)vp_st, &p_en = (float3 &)vp_en;
-#else
-		float3 p_st = ((curve_coef[3] * i_st + curve_coef[2]) * i_st + curve_coef[1]) * i_st + curve_coef[0];
-		float3 p_en = ((curve_coef[3] * i_en + curve_coef[2]) * i_en + curve_coef[1]) * i_en + curve_coef[0];
-		
-		float bminx = min(p_st.x, p_en.x);
-		float bmaxx = max(p_st.x, p_en.x);
-		float bminy = min(p_st.y, p_en.y);
-		float bmaxy = max(p_st.y, p_en.y);
-		float bminz = min(p_st.z, p_en.z);
-		float bmaxz = max(p_st.z, p_en.z);
-#endif
-
-		if(xextrem[0] >= i_st && xextrem[0] <= i_en) {
-			bminx = min(bminx,xextrem[1]);
-			bmaxx = max(bmaxx,xextrem[1]);
-		}
-		if(xextrem[2] >= i_st && xextrem[2] <= i_en) {
-			bminx = min(bminx,xextrem[3]);
-			bmaxx = max(bmaxx,xextrem[3]);
-		}
-		if(yextrem[0] >= i_st && yextrem[0] <= i_en) {
-			bminy = min(bminy,yextrem[1]);
-			bmaxy = max(bmaxy,yextrem[1]);
-		}
-		if(yextrem[2] >= i_st && yextrem[2] <= i_en) {
-			bminy = min(bminy,yextrem[3]);
-			bmaxy = max(bmaxy,yextrem[3]);
-		}
-		if(zextrem[0] >= i_st && zextrem[0] <= i_en) {
-			bminz = min(bminz,zextrem[1]);
-			bmaxz = max(bmaxz,zextrem[1]);
-		}
-		if(zextrem[2] >= i_st && zextrem[2] <= i_en) {
-			bminz = min(bminz,zextrem[3]);
-			bmaxz = max(bmaxz,zextrem[3]);
-		}
-
-		float r1 = r_st + (r_en - r_st) * i_st;
-		float r2 = r_st + (r_en - r_st) * i_en;
-		r_curr = max(r1, r2);
-
-		mw_extension = min(difl * fabsf(bmaxz), extmax);
-		float r_ext = mw_extension + r_curr;
-		float coverage = 1.0f;
-
-		if (bminz - r_curr > isect->t || bmaxz + r_curr < epsilon || bminx > r_ext|| bmaxx < -r_ext|| bminy > r_ext|| bmaxy < -r_ext) {
-			/* the bounding box does not overlap the square centered at O */
-			tree += level;
-			level = tree & -tree;
-		}
-		else if (level == 1) {
-
-			/* the maximum recursion depth is reached.
-			* check if dP0.(Q-P0)>=0 and dPn.(Pn-Q)>=0.
-			* dP* is reversed if necessary.*/
-			float t = isect->t;
-			float u = 0.0f;
-			if(flags & CURVE_KN_RIBBONS) {
-				float3 tg = (p_en - p_st);
-				float w = tg.x * tg.x + tg.y * tg.y;
-				if (w == 0) {
-					tree++;
-					level = tree & -tree;
-					continue;
-				}
-				w = -(p_st.x * tg.x + p_st.y * tg.y) / w;
-				w = clamp((float)w, 0.0f, 1.0f);
-
-				/* compute u on the curve segment */
-				u = i_st * (1 - w) + i_en * w;
-				r_curr = r_st + (r_en - r_st) * u;
-				/* compare x-y distances */
-				float3 p_curr = ((curve_coef[3] * u + curve_coef[2]) * u + curve_coef[1]) * u + curve_coef[0];
-
-				float3 dp_st = (3 * curve_coef[3] * i_st + 2 * curve_coef[2]) * i_st + curve_coef[1];
-				if (dot(tg, dp_st)< 0)
-					dp_st *= -1;
-				if (dot(dp_st, -p_st) + p_curr.z * dp_st.z < 0) {
-					tree++;
-					level = tree & -tree;
-					continue;
-				}
-				float3 dp_en = (3 * curve_coef[3] * i_en + 2 * curve_coef[2]) * i_en + curve_coef[1];
-				if (dot(tg, dp_en) < 0)
-					dp_en *= -1;
-				if (dot(dp_en, p_en) - p_curr.z * dp_en.z < 0) {
-					tree++;
-					level = tree & -tree;
-					continue;
-				}
-
-				/* compute coverage */
-				float r_ext = r_curr;
-				coverage = 1.0f;
-				if(difl != 0.0f) {
-					mw_extension = min(difl * fabsf(bmaxz), extmax);
-					r_ext = mw_extension + r_curr;
-					float d = sqrtf(p_curr.x * p_curr.x + p_curr.y * p_curr.y);
-					float d0 = d - r_curr;
-					float d1 = d + r_curr;
-					if (d0 >= 0)
-						coverage = (min(d1 / mw_extension, 1.0f) - min(d0 / mw_extension, 1.0f)) * 0.5f;
-					else // inside
-						coverage = (min(d1 / mw_extension, 1.0f) + min(-d0 / mw_extension, 1.0f)) * 0.5f;
-				}
-				
-				if (p_curr.x * p_curr.x + p_curr.y * p_curr.y >= r_ext * r_ext || p_curr.z <= epsilon || isect->t < p_curr.z) {
-					tree++;
-					level = tree & -tree;
-					continue;
-				}
-
-				t = p_curr.z;
-			}
-			else {
-				float l = len(p_en - p_st);
-				/* minimum width extension */
-				float or1 = r1;
-				float or2 = r2;
-				if(difl != 0.0f) {
-					mw_extension = min(len(p_st - P) * difl, extmax);
-					or1 = r1 < mw_extension ? mw_extension : r1;
-					mw_extension = min(len(p_en - P) * difl, extmax);
-					or2 = r2 < mw_extension ? mw_extension : r2;
-				}
-				/* --- */
-				float3 tg = (p_en - p_st) / l;
-				float gd = (or2 - or1) / l;
-				float difz = -dot(p_st,tg);
-				float cyla = 1.0f - (tg.z * tg.z * (1 + gd*gd));
-				float halfb = (-p_st.z - tg.z*(difz + gd*(difz*gd + or1)));
-				float tcentre = -halfb/cyla;
-				float zcentre = difz + (tg.z * tcentre);
-				float3 tdif = - p_st;
-				tdif.z += tcentre;
-				float tdifz = dot(tdif,tg);
-				float tb = 2*(tdif.z - tg.z*(tdifz + gd*(tdifz*gd + or1)));
-				float tc = dot(tdif,tdif) - tdifz * tdifz * (1 + gd*gd) - or1*or1 - 2*or1*tdifz*gd;
-				float td = tb*tb - 4*cyla*tc;
-				if (td < 0.0f) {
-					tree++;
-					level = tree & -tree;
-					continue;
-				}
-				
-				float rootd = sqrtf(td);
-				float correction = ((-tb - rootd)/(2*cyla));
-				t = tcentre + correction;
-
-				float3 dp_st = (3 * curve_coef[3] * i_st + 2 * curve_coef[2]) * i_st + curve_coef[1];
-				if (dot(tg, dp_st)< 0)
-					dp_st *= -1;
-				float3 dp_en = (3 * curve_coef[3] * i_en + 2 * curve_coef[2]) * i_en + curve_coef[1];
-				if (dot(tg, dp_en) < 0)
-					dp_en *= -1;
-
-				if(flags & CURVE_KN_BACKFACING && (dot(dp_st, -p_st) + t * dp_st.z < 0 || dot(dp_en, p_en) - t * dp_en.z < 0 || isect->t < t || t <= 0.0f)) {
-					correction = ((-tb + rootd)/(2*cyla));
-					t = tcentre + correction;
-				}			
-
-				if (dot(dp_st, -p_st) + t * dp_st.z < 0 || dot(dp_en, p_en) - t * dp_en.z < 0 || isect->t < t || t <= 0.0f) {
-					tree++;
-					level = tree & -tree;
-					continue;
-				}
-
-				float w = (zcentre + (tg.z * correction))/l;
-				w = clamp((float)w, 0.0f, 1.0f);
-				/* compute u on the curve segment */
-				u = i_st * (1 - w) + i_en * w;
-				r_curr = r1 + (r2 - r1) * w;
-				r_ext = or1 + (or2 - or1) * w;
-				coverage = r_curr/r_ext;
-
-			}
-			/* we found a new intersection */
-
-			/* stochastic fade from minimum width */
-			if(lcg_state && coverage != 1.0f) {
-				if(lcg_step_float(lcg_state) > coverage)
-					return hit;
-			}
-
-#ifdef __VISIBILITY_FLAG__
-			/* visibility flag test. we do it here under the assumption
-			 * that most triangles are culled by node flags */
-			if(kernel_tex_fetch(__prim_visibility, curveAddr) & visibility)
-#endif
-			{
-				/* record intersection */
-				isect->prim = curveAddr;
-				isect->segment = segment;
-				isect->object = object;
-				isect->u = u;
-				isect->v = 0.0f;
-				/*isect->v = 1.0f - coverage; */
-				isect->t = t;
-				hit = true;
-			}
-			
-			tree++;
-			level = tree & -tree;
-		}
-		else {
-			/* split the curve into two curves and process */
-			level = level >> 1;
-		}
-	}
-
-	return hit;
-}
-
-ccl_device_inline bool bvh_curve_intersect(KernelGlobals *kg, Intersection *isect,
-	float3 P, float3 idir, uint visibility, int object, int curveAddr, int segment, uint *lcg_state, float difl, float extmax)
-{
-	/* curve Intersection check */
-	int flags = kernel_data.curve.curveflags;
-
-	int prim = kernel_tex_fetch(__prim_index, curveAddr);
-	float4 v00 = kernel_tex_fetch(__curves, prim);
-
-	int cnum = __float_as_int(v00.x);
-	int k0 = cnum + segment;
-	int k1 = k0 + 1;
-
-	float4 P1 = kernel_tex_fetch(__curve_keys, k0);
-	float4 P2 = kernel_tex_fetch(__curve_keys, k1);
-
-	float or1 = P1.w;
-	float or2 = P2.w;
-	float3 p1 = float4_to_float3(P1);
-	float3 p2 = float4_to_float3(P2);
-
-	/* minimum width extension */
-	float r1 = or1;
-	float r2 = or2;
-	if(difl != 0.0f) {
-		float pixelsize = min(len(p1 - P) * difl, extmax);
-		r1 = or1 < pixelsize ? pixelsize : or1;
-		pixelsize = min(len(p2 - P) * difl, extmax);
-		r2 = or2 < pixelsize ? pixelsize : or2;
-	}
-	/* --- */
-
-	float mr = max(r1,r2);
-	float3 dif = P - p1;
-	float3 dir = 1.0f/idir;
-	float l = len(p2 - p1);
-
-	float sp_r = mr + 0.5f * l;
-	float3 sphere_dif = P - ((p1 + p2) * 0.5f);
-	float sphere_b = dot(dir,sphere_dif);
-	sphere_dif = sphere_dif - sphere_b * dir;
-	sphere_b = dot(dir,sphere_dif);
-	float sdisc = sphere_b * sphere_b - len_squared(sphere_dif) + sp_r * sp_r;
-	if(sdisc < 0.0f)
-		return false;
-
-	/* obtain parameters and test midpoint distance for suitable modes */
-	float3 tg = (p2 - p1) / l;
-	float gd = (r2 - r1) / l;
-	float dirz = dot(dir,tg);
-	float difz = dot(dif,tg);
-
-	float a = 1.0f - (dirz*dirz*(1 + gd*gd));
-	float halfb = dot(dir,dif) - dirz*(difz + gd*(difz*gd + r1));
-
-	float tcentre = -halfb/a;
-	float zcentre = difz + (dirz * tcentre);
-
-	if((tcentre > isect->t) && !(flags & CURVE_KN_ACCURATE))
-		return false;
-	if((zcentre < 0 || zcentre > l) && !(flags & CURVE_KN_ACCURATE) && !(flags & CURVE_KN_INTERSECTCORRECTION))
-		return false;
-
-	/* test minimum separation */
-	float3 cprod = cross(tg, dir);
-	float3 cprod2 = cross(tg, dif);
-	float cprodsq = len_squared(cprod);
-	float cprod2sq = len_squared(cprod2);
-	float distscaled = dot(cprod,dif);
-
-	if(cprodsq == 0)
-		distscaled = cprod2sq;
-	else
-		distscaled = (distscaled*distscaled)/cprodsq;
-
-	if(distscaled > mr*mr)
-		return false;
-
-	/* calculate true intersection */
-	float3 tdif = P - p1 + tcentre * dir;
-	float tdifz = dot(tdif,tg);
-	float tb = 2*(dot(dir,tdif) - dirz*(tdifz + gd*(tdifz*gd + r1)));
-	float tc = dot(tdif,tdif) - tdifz * tdifz * (1 + gd*gd) - r1*r1 - 2*r1*tdifz*gd;
-	float td = tb*tb - 4*a*tc;
-
-	if (td < 0.0f)
-		return false;
-
-	float rootd = 0.0f;
-	float correction = 0.0f;
-	if(flags & CURVE_KN_ACCURATE) {
-		rootd = sqrtf(td);
-		correction = ((-tb - rootd)/(2*a));
-	}
-
-	float t = tcentre + correction;
-
-	if(t < isect->t) {
-
-		if(flags & CURVE_KN_INTERSECTCORRECTION) {
-			rootd = sqrtf(td);
-			correction = ((-tb - rootd)/(2*a));
-			t = tcentre + correction;
-		}
-
-		float z = zcentre + (dirz * correction);
-		bool backface = false;
-
-		if(flags & CURVE_KN_BACKFACING && (t < 0.0f || z < 0 || z > l)) {
-			backface = true;
-			correction = ((-tb + rootd)/(2*a));
-			t = tcentre + correction;
-			z = zcentre + (dirz * correction);
-		}
-
-		/* stochastic fade from minimum width */
-		float adjradius = or1 + z * (or2 - or1) / l;
-		adjradius = adjradius / (r1 + z * gd);
-		if(lcg_state && adjradius != 1.0f) {
-			if(lcg_step_float(lcg_state) > adjradius)
-				return false;
-		}
-		/* --- */
-
-		if(t > 0.0f && t < isect->t && z >= 0 && z <= l) {
-
-			if (flags & CURVE_KN_ENCLOSEFILTER) {
-				float enc_ratio = 1.01f;
-				if((dot(P - p1, tg) > -r1 * enc_ratio) && (dot(P - p2, tg) < r2 * enc_ratio)) {
-					float a2 = 1.0f - (dirz*dirz*(1 + gd*gd*enc_ratio*enc_ratio));
-					float c2 = dot(dif,dif) - difz * difz * (1 + gd*gd*enc_ratio*enc_ratio) - r1*r1*enc_ratio*enc_ratio - 2*r1*difz*gd*enc_ratio;
-					if(a2*c2 < 0.0f)
-						return false;
-				}
-			}
-
-#ifdef __VISIBILITY_FLAG__
-			/* visibility flag test. we do it here under the assumption
-			 * that most triangles are culled by node flags */
-			if(kernel_tex_fetch(__prim_visibility, curveAddr) & visibility)
-#endif
-			{
-				/* record intersection */
-				isect->prim = curveAddr;
-				isect->segment = segment;
-				isect->object = object;
-				isect->u = z/l;
-				isect->v = td/(4*a*a);
-				/*isect->v = 1.0f - adjradius;*/
-				isect->t = t;
-
-				if(backface) 
-					isect->u = -isect->u;
-				
-				return true;
-			}
-		}
-	}
-
-	return false;
-}
-#endif
-
-#ifdef __SUBSURFACE__
-/* Special ray intersection routines for subsurface scattering. In that case we
- * only want to intersect with primitives in the same object, and if case of
- * multiple hits we pick a single random primitive as the intersection point. */
-
-ccl_device_inline void bvh_triangle_intersect_subsurface(KernelGlobals *kg, Intersection *isect_array,
-	float3 P, float3 idir, int object, int triAddr, float tmax, uint *num_hits, uint *lcg_state, int max_hits)
-{
-	/* compute and check intersection t-value */
-	float4 v00 = kernel_tex_fetch(__tri_woop, triAddr*TRI_NODE_SIZE+0);
-	float4 v11 = kernel_tex_fetch(__tri_woop, triAddr*TRI_NODE_SIZE+1);
-	float3 dir = 1.0f/idir;
-
-	float Oz = v00.w - P.x*v00.x - P.y*v00.y - P.z*v00.z;
-	float invDz = 1.0f/(dir.x*v00.x + dir.y*v00.y + dir.z*v00.z);
-	float t = Oz * invDz;
-
-	if(t > 0.0f && t < tmax) {
-		/* compute and check barycentric u */
-		float Ox = v11.w + P.x*v11.x + P.y*v11.y + P.z*v11.z;
-		float Dx = dir.x*v11.x + dir.y*v11.y + dir.z*v11.z;
-		float u = Ox + t*Dx;
-
-		if(u >= 0.0f) {
-			/* compute and check barycentric v */
-			float4 v22 = kernel_tex_fetch(__tri_woop, triAddr*TRI_NODE_SIZE+2);
-			float Oy = v22.w + P.x*v22.x + P.y*v22.y + P.z*v22.z;
-			float Dy = dir.x*v22.x + dir.y*v22.y + dir.z*v22.z;
-			float v = Oy + t*Dy;
-
-			if(v >= 0.0f && u + v <= 1.0f) {
-				(*num_hits)++;
-
-				int hit;
-
-				if(*num_hits <= max_hits) {
-					hit = *num_hits - 1;
-				}
-				else {
-					/* reservoir sampling: if we are at the maximum number of
-					 * hits, randomly replace element or skip it */
-					hit = lcg_step_uint(lcg_state) % *num_hits;
-
-					if(hit >= max_hits)
-						return;
-				}
-
-				/* record intersection */
-				Intersection *isect = &isect_array[hit];
-				isect->prim = triAddr;
-				isect->object = object;
-				isect->u = u;
-				isect->v = v;
-				isect->t = t;
-			}
-		}
-	}
-}
-#endif
-
-/* BVH intersection function variations */
-
-#define BVH_INSTANCING			1
-#define BVH_MOTION				2
-#define BVH_HAIR				4
-#define BVH_HAIR_MINIMUM_WIDTH	8
-
-#define BVH_FUNCTION_NAME bvh_intersect
-#define BVH_FUNCTION_FEATURES 0
-#include "kernel_bvh_traversal.h"
-
-#if defined(__INSTANCING__)
-#define BVH_FUNCTION_NAME bvh_intersect_instancing
-#define BVH_FUNCTION_FEATURES BVH_INSTANCING
-#include "kernel_bvh_traversal.h"
-#endif
-
-#if defined(__HAIR__)
-#define BVH_FUNCTION_NAME bvh_intersect_hair
-#define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR|BVH_HAIR_MINIMUM_WIDTH
-#include "kernel_bvh_traversal.h"
-#endif
-
-#if defined(__OBJECT_MOTION__)
-#define BVH_FUNCTION_NAME bvh_intersect_motion
-#define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_MOTION
-#include "kernel_bvh_traversal.h"
-#endif
-
-#if defined(__HAIR__) && defined(__OBJECT_MOTION__)
-#define BVH_FUNCTION_NAME bvh_intersect_hair_motion
-#define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR|BVH_HAIR_MINIMUM_WIDTH|BVH_MOTION
-#include "kernel_bvh_traversal.h"
-#endif
-
-#if defined(__SUBSURFACE__)
-#define BVH_FUNCTION_NAME bvh_intersect_subsurface
-#define BVH_FUNCTION_FEATURES 0
-#include "kernel_bvh_subsurface.h"
-#endif
-
-#if defined(__SUBSURFACE__) && defined(__INSTANCING__)
-#define BVH_FUNCTION_NAME bvh_intersect_subsurface_instancing
-#define BVH_FUNCTION_FEATURES BVH_INSTANCING
-#include "kernel_bvh_subsurface.h"
-#endif
-
-#if defined(__SUBSURFACE__) && defined(__HAIR__)
-#define BVH_FUNCTION_NAME bvh_intersect_subsurface_hair
-#define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR
-#include "kernel_bvh_subsurface.h"
-#endif
-
-#if defined(__SUBSURFACE__) && defined(__OBJECT_MOTION__)
-#define BVH_FUNCTION_NAME bvh_intersect_subsurface_motion
-#define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_MOTION
-#include "kernel_bvh_subsurface.h"
-#endif
-
-#if defined(__SUBSURFACE__) && defined(__HAIR__) && defined(__OBJECT_MOTION__)
-#define BVH_FUNCTION_NAME bvh_intersect_subsurface_hair_motion
-#define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR|BVH_MOTION
-#include "kernel_bvh_subsurface.h"
-#endif
-
-/* to work around titan bug when using arrays instead of textures */
-#if !defined(__KERNEL_CUDA__) || defined(__KERNEL_CUDA_TEX_STORAGE__)
-ccl_device_inline
-#else
-ccl_device_noinline
-#endif
-#ifdef __HAIR__ 
-bool scene_intersect(KernelGlobals *kg, const Ray *ray, const uint visibility, Intersection *isect, uint *lcg_state, float difl, float extmax)
-#else
-bool scene_intersect(KernelGlobals *kg, const Ray *ray, const uint visibility, Intersection *isect)
-#endif
-{
-#ifdef __OBJECT_MOTION__
-	if(kernel_data.bvh.have_motion) {
-#ifdef __HAIR__
-		if(kernel_data.bvh.have_curves)
-			return bvh_intersect_hair_motion(kg, ray, isect, visibility, lcg_state, difl, extmax);
-#endif /* __HAIR__ */
-
-		return bvh_intersect_motion(kg, ray, isect, visibility);
-	}
-#endif /* __OBJECT_MOTION__ */
-
-#ifdef __HAIR__ 
-	if(kernel_data.bvh.have_curves)
-		return bvh_intersect_hair(kg, ray, isect, visibility, lcg_state, difl, extmax);
-#endif /* __HAIR__ */
-
-#ifdef __KERNEL_CPU__
-
-#ifdef __INSTANCING__
-	if(kernel_data.bvh.have_instancing)
-		return bvh_intersect_instancing(kg, ray, isect, visibility);
-#endif /* __INSTANCING__ */
-
-	return bvh_intersect(kg, ray, isect, visibility);
-#else /* __KERNEL_CPU__ */
-
-#ifdef __INSTANCING__
-	return bvh_intersect_instancing(kg, ray, isect, visibility);
-#else
-	return bvh_intersect(kg, ray, isect, visibility);
-#endif /* __INSTANCING__ */
-
-#endif /* __KERNEL_CPU__ */
-}
-
-/* to work around titan bug when using arrays instead of textures */
-#ifdef __SUBSURFACE__
-#if !defined(__KERNEL_CUDA__) || defined(__KERNEL_CUDA_TEX_STORAGE__)
-ccl_device_inline
-#else
-ccl_device_noinline
-#endif
-uint scene_intersect_subsurface(KernelGlobals *kg, const Ray *ray, Intersection *isect, int subsurface_object, uint *lcg_state, int max_hits)
-{
-#ifdef __OBJECT_MOTION__
-	if(kernel_data.bvh.have_motion) {
-#ifdef __HAIR__
-		if(kernel_data.bvh.have_curves)
-			return bvh_intersect_subsurface_hair_motion(kg, ray, isect, subsurface_object, lcg_state, max_hits);
-#endif /* __HAIR__ */
-
-		return bvh_intersect_subsurface_motion(kg, ray, isect, subsurface_object, lcg_state, max_hits);
-	}
-#endif /* __OBJECT_MOTION__ */
-
-#ifdef __HAIR__ 
-	if(kernel_data.bvh.have_curves)
-		return bvh_intersect_subsurface_hair(kg, ray, isect, subsurface_object, lcg_state, max_hits);
-#endif /* __HAIR__ */
-
-#ifdef __KERNEL_CPU__
-
-#ifdef __INSTANCING__
-	if(kernel_data.bvh.have_instancing)
-		return bvh_intersect_subsurface_instancing(kg, ray, isect, subsurface_object, lcg_state, max_hits);
-#endif /* __INSTANCING__ */
-
-	return bvh_intersect_subsurface(kg, ray, isect, subsurface_object, lcg_state, max_hits);
-#else /* __KERNEL_CPU__ */
-
-#ifdef __INSTANCING__
-	return bvh_intersect_subsurface_instancing(kg, ray, isect, subsurface_object, lcg_state, max_hits);
-#else
-	return bvh_intersect_subsurface(kg, ray, isect, subsurface_object, lcg_state, max_hits);
-#endif /* __INSTANCING__ */
-
-#endif /* __KERNEL_CPU__ */
-}
-#endif
-
-/* Ray offset to avoid self intersection */
-
-ccl_device_inline float3 ray_offset(float3 P, float3 Ng)
-{
-#ifdef __INTERSECTION_REFINE__
-	const float epsilon_f = 1e-5f;
-	/* ideally this should match epsilon_f, but instancing/mblur
-	 * precision makes it problematic */
-	const float epsilon_test = 1.0f;
-	const int epsilon_i = 32;
-
-	float3 res;
-
-	/* x component */
-	if(fabsf(P.x) < epsilon_test) {
-		res.x = P.x + Ng.x*epsilon_f;
-	}
-	else {
-		uint ix = __float_as_uint(P.x);
-		ix += ((ix ^ __float_as_uint(Ng.x)) >> 31)? -epsilon_i: epsilon_i;
-		res.x = __uint_as_float(ix);
-	}
-
-	/* y component */
-	if(fabsf(P.y) < epsilon_test) {
-		res.y = P.y + Ng.y*epsilon_f;
-	}
-	else {
-		uint iy = __float_as_uint(P.y);
-		iy += ((iy ^ __float_as_uint(Ng.y)) >> 31)? -epsilon_i: epsilon_i;
-		res.y = __uint_as_float(iy);
-	}
-
-	/* z component */
-	if(fabsf(P.z) < epsilon_test) {
-		res.z = P.z + Ng.z*epsilon_f;
-	}
-	else {
-		uint iz = __float_as_uint(P.z);
-		iz += ((iz ^ __float_as_uint(Ng.z)) >> 31)? -epsilon_i: epsilon_i;
-		res.z = __uint_as_float(iz);
-	}
-
-	return res;
-#else
-	const float epsilon_f = 1e-4f;
-	return P + epsilon_f*Ng;
-#endif
-}
-
-/* Refine triangle intersection to more precise hit point. For rays that travel
- * far the precision is often not so good, this reintersects the primitive from
- * a closer distance. */
-
-ccl_device_inline float3 bvh_triangle_refine(KernelGlobals *kg, ShaderData *sd, const Intersection *isect, const Ray *ray)
-{
-	float3 P = ray->P;
-	float3 D = ray->D;
-	float t = isect->t;
-
-#ifdef __INTERSECTION_REFINE__
-	if(isect->object != ~0) {
-#ifdef __OBJECT_MOTION__
-		Transform tfm = sd->ob_itfm;
-#else
-		Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_INVERSE_TRANSFORM);
-#endif
-
-		P = transform_point(&tfm, P);
-		D = transform_direction(&tfm, D*t);
-		D = normalize_len(D, &t);
-	}
-
-	P = P + D*t;
-
-	float4 v00 = kernel_tex_fetch(__tri_woop, isect->prim*TRI_NODE_SIZE+0);
-	float Oz = v00.w - P.x*v00.x - P.y*v00.y - P.z*v00.z;
-	float invDz = 1.0f/(D.x*v00.x + D.y*v00.y + D.z*v00.z);
-	float rt = Oz * invDz;
-
-	P = P + D*rt;
-
-	if(isect->object != ~0) {
-#ifdef __OBJECT_MOTION__
-		Transform tfm = sd->ob_tfm;
-#else
-		Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_TRANSFORM);
-#endif
-
-		P = transform_point(&tfm, P);
-	}
-
-	return P;
-#else
-	return P + D*t;
-#endif
-}
-
-/* same as above, except that isect->t is assumed to be in object space for instancing */
-ccl_device_inline float3 bvh_triangle_refine_subsurface(KernelGlobals *kg, ShaderData *sd, const Intersection *isect, const Ray *ray)
-{
-	float3 P = ray->P;
-	float3 D = ray->D;
-	float t = isect->t;
-
-#ifdef __INTERSECTION_REFINE__
-	if(isect->object != ~0) {
-#ifdef __OBJECT_MOTION__
-		Transform tfm = sd->ob_itfm;
-#else
-		Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_INVERSE_TRANSFORM);
-#endif
-
-		P = transform_point(&tfm, P);
-		D = transform_direction(&tfm, D);
-		D = normalize(D);
-	}
-
-	P = P + D*t;
-
-	float4 v00 = kernel_tex_fetch(__tri_woop, isect->prim*TRI_NODE_SIZE+0);
-	float Oz = v00.w - P.x*v00.x - P.y*v00.y - P.z*v00.z;
-	float invDz = 1.0f/(D.x*v00.x + D.y*v00.y + D.z*v00.z);
-	float rt = Oz * invDz;
-
-	P = P + D*rt;
-
-	if(isect->object != ~0) {
-#ifdef __OBJECT_MOTION__
-		Transform tfm = sd->ob_tfm;
-#else
-		Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_TRANSFORM);
-#endif
-
-		P = transform_point(&tfm, P);
-	}
-
-	return P;
-#else
-	return P + D*t;
-#endif
-}
-
-#ifdef __HAIR__
-
-ccl_device_inline float3 curvetangent(float t, float3 p0, float3 p1, float3 p2, float3 p3)
-{
-	float fc = 0.71f;
-	float data[4];
-	float t2 = t * t;
-	data[0] = -3.0f * fc          * t2  + 4.0f * fc * t                  - fc;
-	data[1] =  3.0f * (2.0f - fc) * t2  + 2.0f * (fc - 3.0f) * t;
-	data[2] =  3.0f * (fc - 2.0f) * t2  + 2.0f * (3.0f - 2.0f * fc) * t  + fc;
-	data[3] =  3.0f * fc          * t2  - 2.0f * fc * t;
-	return data[0] * p0 + data[1] * p1 + data[2] * p2 + data[3] * p3;
-}
-
-ccl_device_inline float3 curvepoint(float t, float3 p0, float3 p1, float3 p2, float3 p3)
-{
-	float data[4];
-	float fc = 0.71f;
-	float t2 = t * t;
-	float t3 = t2 * t;
-	data[0] = -fc          * t3  + 2.0f * fc          * t2 - fc * t;
-	data[1] =  (2.0f - fc) * t3  + (fc - 3.0f)        * t2 + 1.0f;
-	data[2] =  (fc - 2.0f) * t3  + (3.0f - 2.0f * fc) * t2 + fc * t;
-	data[3] =  fc          * t3  - fc * t2;
-	return data[0] * p0 + data[1] * p1 + data[2] * p2 + data[3] * p3;
-}
-
-ccl_device_inline float3 bvh_curve_refine(KernelGlobals *kg, ShaderData *sd, const Intersection *isect, const Ray *ray)
-{
-	int flag = kernel_data.curve.curveflags;
-	float t = isect->t;
-	float3 P = ray->P;
-	float3 D = ray->D;
-
-	if(isect->object != ~0) {
-#ifdef __OBJECT_MOTION__
-		Transform tfm = sd->ob_itfm;
-#else
-		Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_INVERSE_TRANSFORM);
-#endif
-
-		P = transform_point(&tfm, P);
-		D = transform_direction(&tfm, D*t);
-		D = normalize_len(D, &t);
-	}
-
-	int prim = kernel_tex_fetch(__prim_index, isect->prim);
-	float4 v00 = kernel_tex_fetch(__curves, prim);
-
-	int k0 = __float_as_int(v00.x) + isect->segment;
-	int k1 = k0 + 1;
-
-	float4 P1 = kernel_tex_fetch(__curve_keys, k0);
-	float4 P2 = kernel_tex_fetch(__curve_keys, k1);
-	float l = 1.0f;
-	float3 tg = normalize_len(float4_to_float3(P2 - P1), &l);
-	float r1 = P1.w;
-	float r2 = P2.w;
-	float gd = ((r2 - r1)/l);
-	
-	P = P + D*t;
-
-	if(flag & CURVE_KN_INTERPOLATE) {
-		int ka = max(k0 - 1,__float_as_int(v00.x));
-		int kb = min(k1 + 1,__float_as_int(v00.x) + __float_as_int(v00.y) - 1);
-
-		float4 P0 = kernel_tex_fetch(__curve_keys, ka);
-		float4 P3 = kernel_tex_fetch(__curve_keys, kb);
-
-		float3 p[4];
-		p[0] = float4_to_float3(P0);
-		p[1] = float4_to_float3(P1);
-		p[2] = float4_to_float3(P2);
-		p[3] = float4_to_float3(P3);
-
-#ifdef __UV__
-		sd->u = isect->u;
-		sd->v = 0.0f;
-#endif
-	
-		tg = normalize(curvetangent(isect->u, p[0], p[1], p[2], p[3]));
-
-		if(kernel_data.curve.curveflags & CURVE_KN_RIBBONS)
-			sd->Ng = normalize(-(D - tg * (dot(tg, D))));
-		else {
-			float3 p_curr = curvepoint(isect->u, p[0], p[1], p[2], p[3]);	
-			sd->Ng = normalize(P - p_curr);
-			sd->Ng = sd->Ng - gd * tg;
-			sd->Ng = normalize(sd->Ng);
-		}
-		sd->N = sd->Ng;
-	}
-	else {
-		float3 dif = P - float4_to_float3(P1);
-
-#ifdef __UV__
-		sd->u = dot(dif,tg)/l;
-		sd->v = 0.0f;
-#endif
-
-		if (flag & CURVE_KN_TRUETANGENTGNORMAL) {
-			sd->Ng = -(D - tg * dot(tg, D));
-			sd->Ng = normalize(sd->Ng);
-		}
-		else {
-			sd->Ng = (dif - tg * sd->u * l) / (P1.w + sd->u * l * gd);
-			if (gd != 0.0f) {
-				sd->Ng = sd->Ng - gd * tg ;
-				sd->Ng = normalize(sd->Ng);
-			}
-		}
-
-		sd->N = sd->Ng;
-	}
-
-#ifdef __DPDU__
-	/* dPdu/dPdv */
-	sd->dPdu = tg;
-	sd->dPdv = cross(tg, sd->Ng);
-#endif
-
-	/*add fading parameter for minimum pixel width with transparency bsdf*/
-	/*sd->curve_transparency = isect->v;*/
-	/*sd->curve_radius = sd->u * gd * l + r1;*/
-
-	if(isect->object != ~0) {
-#ifdef __OBJECT_MOTION__
-		Transform tfm = sd->ob_tfm;
-#else
-		Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_TRANSFORM);
-#endif
-
-		P = transform_point(&tfm, P);
-	}
-
-	return P;
-}
-#endif
-
-CCL_NAMESPACE_END
-
diff --git a/intern/cycles/kernel/kernel_camera.h b/intern/cycles/kernel/kernel_camera.h
index 887b1afddd4..7fc66a9fdee 100644
--- a/intern/cycles/kernel/kernel_camera.h
+++ b/intern/cycles/kernel/kernel_camera.h
@@ -229,7 +229,7 @@ ccl_device void camera_sample(KernelGlobals *kg, int x, int y, float filter_u, f
 	if(kernel_data.cam.shuttertime == -1.0f)
 		ray->time = TIME_INVALID;
 	else
-		ray->time = 0.5f + 0.5f*(time - 0.5f)*kernel_data.cam.shuttertime;
+		ray->time = time;
 #endif
 
 	/* sample */
@@ -266,7 +266,7 @@ ccl_device_inline float3 camera_world_to_ndc(KernelGlobals *kg, ShaderData *sd,
 {
 	if(kernel_data.cam.type != CAMERA_PANORAMA) {
 		/* perspective / ortho */
-		if(sd->object == ~0 && kernel_data.cam.type == CAMERA_PERSPECTIVE)
+		if(sd->object == OBJECT_NONE && kernel_data.cam.type == CAMERA_PERSPECTIVE)
 			P += camera_position(kg);
 
 		Transform tfm = kernel_data.cam.worldtondc;
@@ -276,7 +276,7 @@ ccl_device_inline float3 camera_world_to_ndc(KernelGlobals *kg, ShaderData *sd,
 		/* panorama */
 		Transform tfm = kernel_data.cam.worldtocamera;
 
-		if(sd->object != ~0)
+		if(sd->object != OBJECT_NONE)
 			P = normalize(transform_point(&tfm, P));
 		else
 			P = normalize(transform_direction(&tfm, P));
diff --git a/intern/cycles/kernel/kernel_compat_cpu.h b/intern/cycles/kernel/kernel_compat_cpu.h
index b213e91274d..d027bb62ebe 100644
--- a/intern/cycles/kernel/kernel_compat_cpu.h
+++ b/intern/cycles/kernel/kernel_compat_cpu.h
@@ -20,9 +20,9 @@
 #define __KERNEL_CPU__
 
 #include "util_debug.h"
-#include "util_half.h"
 #include "util_math.h"
 #include "util_simd.h"
+#include "util_half.h"
 #include "util_types.h"
 
 CCL_NAMESPACE_BEGIN
@@ -95,38 +95,128 @@ template<typename T> struct texture_image  {
 
 	ccl_always_inline float4 interp(float x, float y, bool periodic = true)
 	{
-		if(!data)
+		if(UNLIKELY(!data))
 			return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
 
 		int ix, iy, nix, niy;
-		float tx = frac(x*width - 0.5f, &ix);
-		float ty = frac(y*height - 0.5f, &iy);
 
-		if(periodic) {
-			ix = wrap_periodic(ix, width);
-			iy = wrap_periodic(iy, height);
-
-			nix = wrap_periodic(ix+1, width);
-			niy = wrap_periodic(iy+1, height);
+		if(interpolation == INTERPOLATION_CLOSEST) {
+			frac(x*(float)width, &ix);
+			frac(y*(float)height, &iy);
+			if(periodic) {
+				ix = wrap_periodic(ix, width);
+				iy = wrap_periodic(iy, height);
+
+			}
+			else {
+				ix = wrap_clamp(ix, width);
+				iy = wrap_clamp(iy, height);
+			}
+			return read(data[ix + iy*width]);
 		}
 		else {
-			ix = wrap_clamp(ix, width);
-			iy = wrap_clamp(iy, height);
-
-			nix = wrap_clamp(ix+1, width);
-			niy = wrap_clamp(iy+1, height);
+			float tx = frac(x*(float)width - 0.5f, &ix);
+			float ty = frac(y*(float)height - 0.5f, &iy);
+
+			if(periodic) {
+				ix = wrap_periodic(ix, width);
+				iy = wrap_periodic(iy, height);
+
+				nix = wrap_periodic(ix+1, width);
+				niy = wrap_periodic(iy+1, height);
+			}
+			else {
+				ix = wrap_clamp(ix, width);
+				iy = wrap_clamp(iy, height);
+
+				nix = wrap_clamp(ix+1, width);
+				niy = wrap_clamp(iy+1, height);
+			}
+
+			float4 r = (1.0f - ty)*(1.0f - tx)*read(data[ix + iy*width]);
+			r += (1.0f - ty)*tx*read(data[nix + iy*width]);
+			r += ty*(1.0f - tx)*read(data[ix + niy*width]);
+			r += ty*tx*read(data[nix + niy*width]);
+
+			return r;
 		}
+	}
+
+	ccl_always_inline float4 interp_3d(float x, float y, float z, bool periodic = false)
+	{
+		if(UNLIKELY(!data))
+			return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
 
-		float4 r = (1.0f - ty)*(1.0f - tx)*read(data[ix + iy*width]);
-		r += (1.0f - ty)*tx*read(data[nix + iy*width]);
-		r += ty*(1.0f - tx)*read(data[ix + niy*width]);
-		r += ty*tx*read(data[nix + niy*width]);
+		int ix, iy, iz, nix, niy, niz;
+
+		if(interpolation == INTERPOLATION_CLOSEST) {
+			frac(x*(float)width, &ix);
+			frac(y*(float)height, &iy);
+			frac(z*(float)depth, &iz);
+
+			if(periodic) {
+				ix = wrap_periodic(ix, width);
+				iy = wrap_periodic(iy, height);
+				iz = wrap_periodic(iz, depth);
+			}
+			else {
+				ix = wrap_clamp(ix, width);
+				iy = wrap_clamp(iy, height);
+				iz = wrap_clamp(iz, depth);
+			}
+
+			return read(data[ix + iy*width + iz*width*height]);
+		}
+		else {
+			float tx = frac(x*(float)width - 0.5f, &ix);
+			float ty = frac(y*(float)height - 0.5f, &iy);
+			float tz = frac(z*(float)depth - 0.5f, &iz);
+
+			if(periodic) {
+				ix = wrap_periodic(ix, width);
+				iy = wrap_periodic(iy, height);
+				iz = wrap_periodic(iz, depth);
+
+				nix = wrap_periodic(ix+1, width);
+				niy = wrap_periodic(iy+1, height);
+				niz = wrap_periodic(iz+1, depth);
+			}
+			else {
+				ix = wrap_clamp(ix, width);
+				iy = wrap_clamp(iy, height);
+				iz = wrap_clamp(iz, depth);
+
+				nix = wrap_clamp(ix+1, width);
+				niy = wrap_clamp(iy+1, height);
+				niz = wrap_clamp(iz+1, depth);
+			}
+
+			float4 r;
+
+			r  = (1.0f - tz)*(1.0f - ty)*(1.0f - tx)*read(data[ix + iy*width + iz*width*height]);
+			r += (1.0f - tz)*(1.0f - ty)*tx*read(data[nix + iy*width + iz*width*height]);
+			r += (1.0f - tz)*ty*(1.0f - tx)*read(data[ix + niy*width + iz*width*height]);
+			r += (1.0f - tz)*ty*tx*read(data[nix + niy*width + iz*width*height]);
+
+			r += tz*(1.0f - ty)*(1.0f - tx)*read(data[ix + iy*width + niz*width*height]);
+			r += tz*(1.0f - ty)*tx*read(data[nix + iy*width + niz*width*height]);
+			r += tz*ty*(1.0f - tx)*read(data[ix + niy*width + niz*width*height]);
+			r += tz*ty*tx*read(data[nix + niy*width + niz*width*height]);
+
+			return r;
+		}
+	}
 
-		return r;
+	ccl_always_inline void dimensions_set(int width_, int height_, int depth_)
+	{
+		width = width_;
+		height = height_;
+		depth = depth_;
 	}
 
 	T *data;
-	int width, height;
+	int interpolation;
+	int width, height, depth;
 };
 
 typedef texture<float4> texture_float4;
@@ -146,6 +236,7 @@ typedef texture_image<uchar4> texture_image_uchar4;
 #define kernel_tex_fetch_m128i(tex, index) (kg->tex.fetch_m128i(index))
 #define kernel_tex_lookup(tex, t, offset, size) (kg->tex.lookup(t, offset, size))
 #define kernel_tex_image_interp(tex, x, y) ((tex < MAX_FLOAT_IMAGES) ? kg->texture_float_images[tex].interp(x, y) : kg->texture_byte_images[tex - MAX_FLOAT_IMAGES].interp(x, y))
+#define kernel_tex_image_interp_3d(tex, x, y, z) ((tex < MAX_FLOAT_IMAGES) ? kg->texture_float_images[tex].interp_3d(x, y, z) : kg->texture_byte_images[tex - MAX_FLOAT_IMAGES].interp_3d(x, y, z))
 
 #define kernel_data (kg->__data)
 
diff --git a/intern/cycles/kernel/kernel_compat_cuda.h b/intern/cycles/kernel/kernel_compat_cuda.h
index 15e7353ec38..e4c20d26ff1 100644
--- a/intern/cycles/kernel/kernel_compat_cuda.h
+++ b/intern/cycles/kernel/kernel_compat_cuda.h
@@ -60,7 +60,7 @@ typedef texture<uchar4, 2, cudaReadModeNormalizedFloat> texture_image_uchar4;
 /* In order to use full 6GB of memory on Titan cards, use arrays instead
  * of textures. On earlier cards this seems slower, but on Titan it is
  * actually slightly faster in tests. */
-#if __CUDA_ARCH__ < 350
+#if __CUDA_ARCH__ < 300
 #define __KERNEL_CUDA_TEX_STORAGE__
 #endif
 
diff --git a/intern/cycles/kernel/kernel_compat_opencl.h b/intern/cycles/kernel/kernel_compat_opencl.h
index 4f4414cc298..8346b09619e 100644
--- a/intern/cycles/kernel/kernel_compat_opencl.h
+++ b/intern/cycles/kernel/kernel_compat_opencl.h
@@ -85,27 +85,36 @@
 #define __float_as_uint(x) as_uint(x)
 #define __int_as_float(x) as_float(x)
 #define __float_as_int(x) as_int(x)
-#define sqrtf(x) sqrt(((float)x))
-#define cosf(x) cos(((float)x))
-#define sinf(x) sin(((float)x))
 #define powf(x, y) pow(((float)x), ((float)y))
 #define fabsf(x) fabs(((float)x))
 #define copysignf(x, y) copysign(((float)x), ((float)y))
-#define cosf(x) cos(((float)x))
 #define asinf(x) asin(((float)x))
 #define acosf(x) acos(((float)x))
 #define atanf(x) atan(((float)x))
-#define tanf(x) tan(((float)x))
-#define logf(x) log(((float)x))
 #define floorf(x) floor(((float)x))
 #define ceilf(x) ceil(((float)x))
-#define expf(x) exp(((float)x))
 #define hypotf(x, y) hypot(((float)x), ((float)y))
 #define atan2f(x, y) atan2(((float)x), ((float)y))
 #define fmaxf(x, y) fmax(((float)x), ((float)y))
 #define fminf(x, y) fmin(((float)x), ((float)y))
 #define fmodf(x, y) fmod((float)x, (float)y)
 
+#ifdef __CL_USE_NATIVE__ /* opt-in: native_* built-ins, accuracy is implementation-defined */
+#define sinf(x) native_sin(((float)x))
+#define cosf(x) native_cos(((float)x))
+#define tanf(x) native_tan(((float)x))
+#define expf(x) native_exp(((float)x))
+#define sqrtf(x) native_sqrt(((float)x))
+#define logf(x) native_log(((float)x))
+#else /* default: standard OpenCL math built-ins */
+#define sinf(x) sin(((float)x))
+#define cosf(x) cos(((float)x))
+#define tanf(x) tan(((float)x))
+#define expf(x) exp(((float)x))
+#define sqrtf(x) sqrt(((float)x))
+#define logf(x) log(((float)x))
+#endif
+
 /* data lookup defines */
 #define kernel_data (*kg->data)
 #define kernel_tex_fetch(t, index) kg->t[index]
diff --git a/intern/cycles/kernel/kernel_curve.h b/intern/cycles/kernel/kernel_curve.h
deleted file mode 100644
index 821ac50eaa9..00000000000
--- a/intern/cycles/kernel/kernel_curve.h
+++ /dev/null
@@ -1,137 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License
- */
-
-CCL_NAMESPACE_BEGIN
-
-#ifdef __HAIR__
-
-/* curve attributes */
-
-ccl_device float curve_attribute_float(KernelGlobals *kg, const ShaderData *sd, AttributeElement elem, int offset, float *dx, float *dy)
-{
-	if(elem == ATTR_ELEMENT_CURVE) {
-#ifdef __RAY_DIFFERENTIALS__
-		if(dx) *dx = 0.0f;
-		if(dy) *dy = 0.0f;
-#endif
-
-		return kernel_tex_fetch(__attributes_float, offset + sd->prim);
-	}
-	else if(elem == ATTR_ELEMENT_CURVE_KEY) {
-		float4 curvedata = kernel_tex_fetch(__curves, sd->prim);
-		int k0 = __float_as_int(curvedata.x) + sd->segment;
-		int k1 = k0 + 1;
-
-		float f0 = kernel_tex_fetch(__attributes_float, offset + k0);
-		float f1 = kernel_tex_fetch(__attributes_float, offset + k1);
-
-#ifdef __RAY_DIFFERENTIALS__
-		if(dx) *dx = sd->du.dx*(f1 - f0);
-		if(dy) *dy = 0.0f;
-#endif
-
-		return (1.0f - sd->u)*f0 + sd->u*f1;
-	}
-	else {
-#ifdef __RAY_DIFFERENTIALS__
-		if(dx) *dx = 0.0f;
-		if(dy) *dy = 0.0f;
-#endif
-
-		return 0.0f;
-	}
-}
-
-ccl_device float3 curve_attribute_float3(KernelGlobals *kg, const ShaderData *sd, AttributeElement elem, int offset, float3 *dx, float3 *dy)
-{
-	if(elem == ATTR_ELEMENT_CURVE) {
-		/* idea: we can't derive any useful differentials here, but for tiled
-		 * mipmap image caching it would be useful to avoid reading the highest
-		 * detail level always. maybe a derivative based on the hair density
-		 * could be computed somehow? */
-#ifdef __RAY_DIFFERENTIALS__
-		if(dx) *dx = make_float3(0.0f, 0.0f, 0.0f);
-		if(dy) *dy = make_float3(0.0f, 0.0f, 0.0f);
-#endif
-
-		return float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + sd->prim));
-	}
-	else if(elem == ATTR_ELEMENT_CURVE_KEY) {
-		float4 curvedata = kernel_tex_fetch(__curves, sd->prim);
-		int k0 = __float_as_int(curvedata.x) + sd->segment;
-		int k1 = k0 + 1;
-
-		float3 f0 = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + k0));
-		float3 f1 = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + k1));
-
-#ifdef __RAY_DIFFERENTIALS__
-		if(dx) *dx = sd->du.dx*(f1 - f0);
-		if(dy) *dy = make_float3(0.0f, 0.0f, 0.0f);
-#endif
-
-		return (1.0f - sd->u)*f0 + sd->u*f1;
-	}
-	else {
-#ifdef __RAY_DIFFERENTIALS__
-		if(dx) *dx = make_float3(0.0f, 0.0f, 0.0f);
-		if(dy) *dy = make_float3(0.0f, 0.0f, 0.0f);
-#endif
-
-		return make_float3(0.0f, 0.0f, 0.0f);
-	}
-}
-
-/* hair info node functions */
-
-ccl_device float curve_thickness(KernelGlobals *kg, ShaderData *sd)
-{
-	float r = 0.0f;
-
-	if(sd->segment != ~0) {
-		float4 curvedata = kernel_tex_fetch(__curves, sd->prim);
-		int k0 = __float_as_int(curvedata.x) + sd->segment;
-		int k1 = k0 + 1;
-
-		float4 P1 = kernel_tex_fetch(__curve_keys, k0);
-		float4 P2 = kernel_tex_fetch(__curve_keys, k1);
-		r = (P2.w - P1.w) * sd->u + P1.w;
-	}
-
-	return r*2.0f;
-}
-
-ccl_device float3 curve_tangent_normal(KernelGlobals *kg, ShaderData *sd)
-{	
-	float3 tgN = make_float3(0.0f,0.0f,0.0f);
-
-	if(sd->segment != ~0) {
-
-		tgN = -(-sd->I - sd->dPdu * (dot(sd->dPdu,-sd->I) / len_squared(sd->dPdu)));
-		tgN = normalize(tgN);
-
-		/* need to find suitable scaled gd for corrected normal */
-#if 0
-		tgN = normalize(tgN - gd * sd->dPdu);
-#endif
-	}
-
-	return tgN;
-}
-
-#endif
-
-CCL_NAMESPACE_END
-
diff --git a/intern/cycles/kernel/kernel_displace.h b/intern/cycles/kernel/kernel_displace.h
index c50e2166660..b8c64af658f 100644
--- a/intern/cycles/kernel/kernel_displace.h
+++ b/intern/cycles/kernel/kernel_displace.h
@@ -16,8 +16,308 @@
 
 CCL_NAMESPACE_BEGIN
 
+ccl_device void compute_light_pass(KernelGlobals *kg, ShaderData *sd, PathRadiance *L, RNG rng,
+                                   bool is_combined, bool is_ao, bool is_sss)
+{
+	int samples = kernel_data.integrator.aa_samples;
+
+	/* initialize master radiance accumulator */
+	kernel_assert(kernel_data.film.use_light_pass);
+	path_radiance_init(L, kernel_data.film.use_light_pass);
+
+	/* take multiple samples */
+	for(int sample = 0; sample < samples; sample++) {
+		PathRadiance L_sample;
+		PathState state;
+		Ray ray;
+		float3 throughput = make_float3(1.0f, 1.0f, 1.0f);
+
+		/* init radiance */
+		path_radiance_init(&L_sample, kernel_data.film.use_light_pass);
+
+		/* init path state */
+		path_state_init(kg, &state, &rng, sample);
+		state.num_samples = samples;
+
+		/* evaluate surface shader */
+		float rbsdf = path_state_rng_1D(kg, &rng, &state, PRNG_BSDF);
+		shader_eval_surface(kg, sd, rbsdf, state.flag, SHADER_CONTEXT_MAIN);
+
+		/* TODO, disable the closures we won't need */
+
+		/* sample ambient occlusion */
+		if(is_combined || is_ao) {
+			kernel_path_ao(kg, sd, &L_sample, &state, &rng, throughput);
+		}
+
+		/* sample subsurface scattering */
+		if((is_combined || is_sss) && (sd->flag & SD_BSSRDF)) {
+#ifdef __SUBSURFACE__
+			/* when mixing BSSRDF and BSDF closures we should skip BSDF lighting if scattering was successful */
+			if (kernel_path_subsurface_scatter(kg, sd, &L_sample, &state, &rng, &ray, &throughput))
+				is_sss = true;
+#endif
+		}
+
+		/* sample light and BSDF */
+		if((!is_sss) && (!is_ao)) {
+			if(kernel_path_integrate_lighting(kg, &rng, sd, &throughput, &state, &L_sample, &ray)) {
+#ifdef __LAMP_MIS__
+				state.ray_t = 0.0f;
+#endif
+				/* compute indirect light */
+				kernel_path_indirect(kg, &rng, ray, throughput, state.num_samples, state, &L_sample);
+
+				/* sum and reset indirect light pass variables for the next samples */
+				path_radiance_sum_indirect(&L_sample);
+				path_radiance_reset_indirect(&L_sample);
+			}
+		}
+
+		/* accumulate into master L */
+		path_radiance_accum_sample(L, &L_sample, samples);
+	}
+}
+
+ccl_device bool is_light_pass(ShaderEvalType type)
+{
+	switch (type) {
+		case SHADER_EVAL_AO:
+		case SHADER_EVAL_COMBINED:
+		case SHADER_EVAL_SHADOW:
+		case SHADER_EVAL_DIFFUSE_DIRECT:
+		case SHADER_EVAL_GLOSSY_DIRECT:
+		case SHADER_EVAL_TRANSMISSION_DIRECT:
+		case SHADER_EVAL_SUBSURFACE_DIRECT:
+		case SHADER_EVAL_DIFFUSE_INDIRECT:
+		case SHADER_EVAL_GLOSSY_INDIRECT:
+		case SHADER_EVAL_TRANSMISSION_INDIRECT:
+		case SHADER_EVAL_SUBSURFACE_INDIRECT:
+			return true;
+		default:
+			return false;
+	}
+}
+
+/* Bake element i: reads (object, prim, u, v) and the UV differentials from input[2*i] and input[2*i+1], writes RGB (alpha 1) to output[i]; writes nothing when prim == -1. */
+ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input, ccl_global float4 *output, ShaderEvalType type, int i)
+{
+	ShaderData sd;
+	uint4 in = input[i * 2];
+	uint4 diff = input[i * 2 + 1];
+
+	float3 out = make_float3(0.0f, 0.0f, 0.0f); /* defined result when a case body is compiled out (e.g. no __SUBSURFACE__) */
+
+	int object = in.x;
+	int prim = in.y;
+
+	if(prim == -1)
+		return;
+
+	float u = __uint_as_float(in.z);
+	float v = __uint_as_float(in.w);
+
+	float dudx = __uint_as_float(diff.x);
+	float dudy = __uint_as_float(diff.y);
+	float dvdx = __uint_as_float(diff.z);
+	float dvdy = __uint_as_float(diff.w);
+
+	int shader;
+	float3 P, Ng;
+
+	triangle_point_normal(kg, prim, u, v, &P, &Ng, &shader);
+
+	/* dummy initializations copied from SHADER_EVAL_DISPLACE */
+	float3 I = Ng;
+	float t = 0.0f;
+	float time = TIME_INVALID;
+	int bounce = 0;
+	int transparent_bounce = 0;
+
+	/* light passes */
+	PathRadiance L;
+
+	shader_setup_from_sample(kg, &sd, P, Ng, I, shader, object, prim, u, v, t, time, bounce, transparent_bounce);
+	sd.I = sd.N;
+
+	/* update differentials */
+	sd.dP.dx = sd.dPdu * dudx + sd.dPdv * dvdx;
+	sd.dP.dy = sd.dPdu * dudy + sd.dPdv * dvdy;
+	sd.du.dx = dudx;
+	sd.du.dy = dudy;
+	sd.dv.dx = dvdx;
+	sd.dv.dy = dvdy;
+
+	if(is_light_pass(type)) {
+		RNG rng = cmj_hash(i, 0);
+		compute_light_pass(kg, &sd, &L, rng, (type == SHADER_EVAL_COMBINED),
+		                                     (type == SHADER_EVAL_AO),
+		                                     (type == SHADER_EVAL_SUBSURFACE_DIRECT ||
+		                                      type == SHADER_EVAL_SUBSURFACE_INDIRECT));
+	}
+
+	switch (type) {
+		/* data passes */
+		case SHADER_EVAL_NORMAL:
+		{
+			/* compression: normal = (2 * color) - 1 */
+			out = sd.N * 0.5f + make_float3(0.5f, 0.5f, 0.5f);
+			break;
+		}
+		case SHADER_EVAL_UV:
+		{
+			out = primitive_uv(kg, &sd);
+			break;
+		}
+		case SHADER_EVAL_DIFFUSE_COLOR:
+		{
+			shader_eval_surface(kg, &sd, 0.f, 0, SHADER_CONTEXT_MAIN);
+			out = shader_bsdf_diffuse(kg, &sd);
+			break;
+		}
+		case SHADER_EVAL_GLOSSY_COLOR:
+		{
+			shader_eval_surface(kg, &sd, 0.f, 0, SHADER_CONTEXT_MAIN);
+			out = shader_bsdf_glossy(kg, &sd);
+			break;
+		}
+		case SHADER_EVAL_TRANSMISSION_COLOR:
+		{
+			shader_eval_surface(kg, &sd, 0.f, 0, SHADER_CONTEXT_MAIN);
+			out = shader_bsdf_transmission(kg, &sd);
+			break;
+		}
+		case SHADER_EVAL_SUBSURFACE_COLOR:
+		{
+#ifdef __SUBSURFACE__
+			shader_eval_surface(kg, &sd, 0.f, 0, SHADER_CONTEXT_MAIN);
+			out = shader_bsdf_subsurface(kg, &sd);
+#endif
+			break;
+		}
+		case SHADER_EVAL_EMISSION:
+		{
+			shader_eval_surface(kg, &sd, 0.f, 0, SHADER_CONTEXT_EMISSION);
+			out = shader_emissive_eval(kg, &sd);
+			break;
+		}
+
+#ifdef __PASSES__
+		/* light passes */
+		case SHADER_EVAL_AO:
+		{
+			out = L.ao;
+			break;
+		}
+		case SHADER_EVAL_COMBINED:
+		{
+			out = path_radiance_clamp_and_sum(kg, &L);
+			break;
+		}
+		case SHADER_EVAL_SHADOW:
+		{
+			out = make_float3(L.shadow.x, L.shadow.y, L.shadow.z);
+			break;
+		}
+		case SHADER_EVAL_DIFFUSE_DIRECT:
+		{
+			shader_eval_surface(kg, &sd, 0.f, 0, SHADER_CONTEXT_MAIN);
+			out = safe_divide_color(L.direct_diffuse, shader_bsdf_diffuse(kg, &sd));
+			break;
+		}
+		case SHADER_EVAL_GLOSSY_DIRECT:
+		{
+			shader_eval_surface(kg, &sd, 0.f, 0, SHADER_CONTEXT_MAIN);
+			out = safe_divide_color(L.direct_glossy, shader_bsdf_glossy(kg, &sd));
+			break;
+		}
+		case SHADER_EVAL_TRANSMISSION_DIRECT:
+		{
+			shader_eval_surface(kg, &sd, 0.f, 0, SHADER_CONTEXT_MAIN);
+			out = safe_divide_color(L.direct_transmission, shader_bsdf_transmission(kg, &sd));
+			break;
+		}
+		case SHADER_EVAL_SUBSURFACE_DIRECT:
+		{
+#ifdef __SUBSURFACE__
+			shader_eval_surface(kg, &sd, 0.f, 0, SHADER_CONTEXT_MAIN);
+			out = safe_divide_color(L.direct_subsurface, shader_bsdf_subsurface(kg, &sd));
+#endif
+			break;
+		}
+		case SHADER_EVAL_DIFFUSE_INDIRECT:
+		{
+			shader_eval_surface(kg, &sd, 0.f, 0, SHADER_CONTEXT_MAIN);
+			out = safe_divide_color(L.indirect_diffuse, shader_bsdf_diffuse(kg, &sd));
+			break;
+		}
+		case SHADER_EVAL_GLOSSY_INDIRECT:
+		{
+			shader_eval_surface(kg, &sd, 0.f, 0, SHADER_CONTEXT_MAIN);
+			out = safe_divide_color(L.indirect_glossy, shader_bsdf_glossy(kg, &sd));
+			break;
+		}
+		case SHADER_EVAL_TRANSMISSION_INDIRECT:
+		{
+			shader_eval_surface(kg, &sd, 0.f, 0, SHADER_CONTEXT_MAIN);
+			out = safe_divide_color(L.indirect_transmission, shader_bsdf_transmission(kg, &sd));
+			break;
+		}
+		case SHADER_EVAL_SUBSURFACE_INDIRECT:
+		{
+#ifdef __SUBSURFACE__
+			shader_eval_surface(kg, &sd, 0.f, 0, SHADER_CONTEXT_MAIN);
+			out = safe_divide_color(L.indirect_subsurface, shader_bsdf_subsurface(kg, &sd));
+#endif
+			break;
+		}
+#endif
+
+		/* extra */
+		case SHADER_EVAL_ENVIRONMENT:
+		{
+			/* setup ray */
+			Ray ray;
+
+			ray.P = make_float3(0.0f, 0.0f, 0.0f);
+			ray.D = normalize(P);
+			ray.t = 0.0f;
+#ifdef __CAMERA_MOTION__
+			ray.time = 0.5f;
+#endif
+
+#ifdef __RAY_DIFFERENTIALS__
+			ray.dD = differential3_zero();
+			ray.dP = differential3_zero();
+#endif
+
+			/* setup shader data */
+			shader_setup_from_background(kg, &sd, &ray, 0, 0);
+
+			/* evaluate */
+			int flag = 0; /* we can't know which type of BSDF this is for */
+			out = shader_eval_background(kg, &sd, flag, SHADER_CONTEXT_MAIN);
+			break;
+		}
+		default:
+		{
+			/* no real shader, returning the position of the verts for debugging */
+			out = normalize(P);
+			break;
+		}
+	}
+
+	/* write output */
+	output[i] = make_float4(out.x, out.y, out.z, 1.0f);
+}
+
 ccl_device void kernel_shader_evaluate(KernelGlobals *kg, ccl_global uint4 *input, ccl_global float4 *output, ShaderEvalType type, int i)
 {
+	if(type >= SHADER_EVAL_BAKE) {
+		kernel_bake_evaluate(kg, input, output, type, i);
+		return;
+	}
+
 	ShaderData sd;
 	uint4 in = input[i];
 	float3 out;
@@ -55,7 +355,7 @@ ccl_device void kernel_shader_evaluate(KernelGlobals *kg, ccl_global uint4 *inpu
 #endif
 
 		/* setup shader data */
-		shader_setup_from_background(kg, &sd, &ray, 0);
+		shader_setup_from_background(kg, &sd, &ray, 0, 0);
 
 		/* evaluate */
 		int flag = 0; /* we can't know which type of BSDF this is for */
diff --git a/intern/cycles/kernel/kernel_emission.h b/intern/cycles/kernel/kernel_emission.h
index 58bdc2b70ca..deffa7f2ba2 100644
--- a/intern/cycles/kernel/kernel_emission.h
+++ b/intern/cycles/kernel/kernel_emission.h
@@ -18,8 +18,8 @@ CCL_NAMESPACE_BEGIN
 
 /* Direction Emission */
 
-ccl_device_noinline float3 direct_emissive_eval(KernelGlobals *kg, float rando,
-	LightSample *ls, float3 I, differential3 dI, float t, float time, int bounce)
+ccl_device_noinline float3 direct_emissive_eval(KernelGlobals *kg,
+	LightSample *ls, float3 I, differential3 dI, float t, float time, int bounce, int transparent_bounce)
 {
 	/* setup shading at emitter */
 	ShaderData sd;
@@ -36,27 +36,20 @@ ccl_device_noinline float3 direct_emissive_eval(KernelGlobals *kg, float rando,
 #endif
 		ray.dP = differential3_zero();
 		ray.dD = dI;
-#ifdef __CAMERA_MOTION__
-		ray.time = time;
-#endif
-		shader_setup_from_background(kg, &sd, &ray, bounce+1);
+
+		shader_setup_from_background(kg, &sd, &ray, bounce+1, transparent_bounce);
 		eval = shader_eval_background(kg, &sd, 0, SHADER_CONTEXT_EMISSION);
 	}
 	else
 #endif
 	{
-#ifdef __HAIR__
-		if(ls->type == LIGHT_STRAND)
-			shader_setup_from_sample(kg, &sd, ls->P, ls->Ng, I, ls->shader, ls->object, ls->prim, ls->u, ls->v, t, time, bounce+1, ls->prim);
-		else
-#endif
-			shader_setup_from_sample(kg, &sd, ls->P, ls->Ng, I, ls->shader, ls->object, ls->prim, ls->u, ls->v, t, time, bounce+1, ~0);
+		shader_setup_from_sample(kg, &sd, ls->P, ls->Ng, I, ls->shader, ls->object, ls->prim, ls->u, ls->v, t, time, bounce+1, transparent_bounce);
 
 		ls->Ng = sd.Ng;
 
 		/* no path flag, we're evaluating this for all closures. that's weak but
 		 * we'd have to do multiple evaluations otherwise */
-		shader_eval_surface(kg, &sd, rando, 0, SHADER_CONTEXT_EMISSION);
+		shader_eval_surface(kg, &sd, 0.0f, 0, SHADER_CONTEXT_EMISSION);
 
 		/* evaluate emissive closure */
 		if(sd.flag & SD_EMISSION)
@@ -71,13 +64,13 @@ ccl_device_noinline float3 direct_emissive_eval(KernelGlobals *kg, float rando,
 }
 
 ccl_device_noinline bool direct_emission(KernelGlobals *kg, ShaderData *sd, int lindex,
-	float randt, float rando, float randu, float randv, Ray *ray, BsdfEval *eval,
-	bool *is_lamp, int bounce)
+	float randt, float randu, float randv, Ray *ray, BsdfEval *eval,
+	bool *is_lamp, int bounce, int transparent_bounce)
 {
 	LightSample ls;
 
 #ifdef __BRANCHED_PATH__
-	if(lindex != -1) {
+	if(lindex != LAMP_NONE) {
 		/* sample position on a specified light */
 		light_select(kg, lindex, randu, randv, sd->P, &ls);
 	}
@@ -95,7 +88,7 @@ ccl_device_noinline bool direct_emission(KernelGlobals *kg, ShaderData *sd, int
 	differential3 dD = differential3_zero();
 
 	/* evaluate closure */
-	float3 light_eval = direct_emissive_eval(kg, rando, &ls, -ls.D, dD, ls.t, sd->time, bounce);
+	float3 light_eval = direct_emissive_eval(kg, &ls, -ls.D, dD, ls.t, sd->time, bounce, transparent_bounce);
 
 	if(is_zero(light_eval))
 		return false;
@@ -104,7 +97,7 @@ ccl_device_noinline bool direct_emission(KernelGlobals *kg, ShaderData *sd, int
 	float bsdf_pdf;
 
 #ifdef __VOLUME__
-	if(sd->prim != ~0)
+	if(sd->prim != PRIM_NONE)
 		shader_bsdf_eval(kg, sd, ls.D, eval, &bsdf_pdf);
 	else
 		shader_volume_phase_eval(kg, sd, ls.D, eval, &bsdf_pdf);
@@ -160,7 +153,7 @@ ccl_device_noinline bool direct_emission(KernelGlobals *kg, ShaderData *sd, int
 	}
 
 	/* return if it's a lamp for shadow pass */
-	*is_lamp = (ls.prim == ~0 && ls.type != LIGHT_BACKGROUND);
+	*is_lamp = (ls.prim == PRIM_NONE && ls.type != LIGHT_BACKGROUND);
 
 	return true;
 }
@@ -173,10 +166,11 @@ ccl_device_noinline float3 indirect_primitive_emission(KernelGlobals *kg, Shader
 	float3 L = shader_emissive_eval(kg, sd);
 
 #ifdef __HAIR__
-	if(!(path_flag & PATH_RAY_MIS_SKIP) && (sd->flag & SD_USE_MIS) && (sd->segment == ~0)) {
+	if(!(path_flag & PATH_RAY_MIS_SKIP) && (sd->flag & SD_USE_MIS) && (sd->type & PRIMITIVE_ALL_TRIANGLE))
 #else
-	if(!(path_flag & PATH_RAY_MIS_SKIP) && (sd->flag & SD_USE_MIS)) {
+	if(!(path_flag & PATH_RAY_MIS_SKIP) && (sd->flag & SD_USE_MIS))
 #endif
+	{
 		/* multiple importance sampling, get triangle light pdf,
 		 * and compute weight with respect to BSDF pdf */
 		float pdf = triangle_light_pdf(kg, sd->Ng, sd->I, t);
@@ -190,71 +184,75 @@ ccl_device_noinline float3 indirect_primitive_emission(KernelGlobals *kg, Shader
 
 /* Indirect Lamp Emission */
 
-ccl_device_noinline bool indirect_lamp_emission(KernelGlobals *kg, Ray *ray, int path_flag, float bsdf_pdf, float randt, float3 *emission, int bounce)
+ccl_device_noinline bool indirect_lamp_emission(KernelGlobals *kg, PathState *state, Ray *ray, float3 *emission)
 {
-	LightSample ls;
-	int lamp = lamp_light_eval_sample(kg, randt);
+	bool hit_lamp = false;
 
-	if(lamp == ~0)
-		return false;
+	*emission = make_float3(0.0f, 0.0f, 0.0f);
 
-	if(!lamp_light_eval(kg, lamp, ray->P, ray->D, ray->t, &ls))
-		return false;
+	for(int lamp = 0; lamp < kernel_data.integrator.num_all_lights; lamp++) {
+		LightSample ls;
+
+		if(!lamp_light_eval(kg, lamp, ray->P, ray->D, ray->t, &ls))
+			continue;
 
 #ifdef __PASSES__
-	/* use visibility flag to skip lights */
-	if(ls.shader & SHADER_EXCLUDE_ANY) {
-		if(((ls.shader & SHADER_EXCLUDE_DIFFUSE) && (path_flag & PATH_RAY_DIFFUSE)) ||
-		   ((ls.shader & SHADER_EXCLUDE_GLOSSY) && (path_flag & PATH_RAY_GLOSSY)) ||
-		   ((ls.shader & SHADER_EXCLUDE_TRANSMIT) && (path_flag & PATH_RAY_TRANSMIT)))
-			return false;
-	}
+		/* use visibility flag to skip lights */
+		if(ls.shader & SHADER_EXCLUDE_ANY) {
+			if(((ls.shader & SHADER_EXCLUDE_DIFFUSE) && (state->flag & PATH_RAY_DIFFUSE)) ||
+			   ((ls.shader & SHADER_EXCLUDE_GLOSSY) && (state->flag & PATH_RAY_GLOSSY)) ||
+			   ((ls.shader & SHADER_EXCLUDE_TRANSMIT) && (state->flag & PATH_RAY_TRANSMIT)))
+				continue;
+		}
 #endif
 
-	float3 L = direct_emissive_eval(kg, 0.0f, &ls, -ray->D, ray->dD, ls.t, ray->time, bounce);
+		float3 L = direct_emissive_eval(kg, &ls, -ray->D, ray->dD, ls.t, ray->time, state->bounce, state->transparent_bounce);
 
-	if(!(path_flag & PATH_RAY_MIS_SKIP)) {
-		/* multiple importance sampling, get regular light pdf,
-		 * and compute weight with respect to BSDF pdf */
-		float mis_weight = power_heuristic(bsdf_pdf, ls.pdf);
-		L *= mis_weight;
+		if(!(state->flag & PATH_RAY_MIS_SKIP)) {
+			/* multiple importance sampling, get regular light pdf,
+			 * and compute weight with respect to BSDF pdf */
+			float mis_weight = power_heuristic(state->ray_pdf, ls.pdf);
+			L *= mis_weight;
+		}
+
+		*emission += L;
+		hit_lamp = true;
 	}
 
-	*emission = L;
-	return true;
+	return hit_lamp;
 }
 
 /* Indirect Background */
 
-ccl_device_noinline float3 indirect_background(KernelGlobals *kg, Ray *ray, int path_flag, float bsdf_pdf, int bounce)
+ccl_device_noinline float3 indirect_background(KernelGlobals *kg, PathState *state, Ray *ray)
 {
 #ifdef __BACKGROUND__
 	int shader = kernel_data.background.surface_shader;
 
 	/* use visibility flag to skip lights */
 	if(shader & SHADER_EXCLUDE_ANY) {
-		if(((shader & SHADER_EXCLUDE_DIFFUSE) && (path_flag & PATH_RAY_DIFFUSE)) ||
-		   ((shader & SHADER_EXCLUDE_GLOSSY) && (path_flag & PATH_RAY_GLOSSY)) ||
-		   ((shader & SHADER_EXCLUDE_TRANSMIT) && (path_flag & PATH_RAY_TRANSMIT)) ||
-		   ((shader & SHADER_EXCLUDE_CAMERA) && (path_flag & PATH_RAY_CAMERA)))
+		if(((shader & SHADER_EXCLUDE_DIFFUSE) && (state->flag & PATH_RAY_DIFFUSE)) ||
+		   ((shader & SHADER_EXCLUDE_GLOSSY) && (state->flag & PATH_RAY_GLOSSY)) ||
+		   ((shader & SHADER_EXCLUDE_TRANSMIT) && (state->flag & PATH_RAY_TRANSMIT)) ||
+		   ((shader & SHADER_EXCLUDE_CAMERA) && (state->flag & PATH_RAY_CAMERA)))
 			return make_float3(0.0f, 0.0f, 0.0f);
 	}
 
 	/* evaluate background closure */
 	ShaderData sd;
-	shader_setup_from_background(kg, &sd, ray, bounce+1);
+	shader_setup_from_background(kg, &sd, ray, state->bounce+1, state->transparent_bounce);
 
-	float3 L = shader_eval_background(kg, &sd, path_flag, SHADER_CONTEXT_EMISSION);
+	float3 L = shader_eval_background(kg, &sd, state->flag, SHADER_CONTEXT_EMISSION);
 
 #ifdef __BACKGROUND_MIS__
 	/* check if background light exists or if we should skip pdf */
 	int res = kernel_data.integrator.pdf_background_res;
 
-	if(!(path_flag & PATH_RAY_MIS_SKIP) && res) {
+	if(!(state->flag & PATH_RAY_MIS_SKIP) && res) {
 		/* multiple importance sampling, get background light pdf for ray
 		 * direction, and compute weight with respect to BSDF pdf */
 		float pdf = background_light_pdf(kg, ray->D);
-		float mis_weight = power_heuristic(bsdf_pdf, pdf);
+		float mis_weight = power_heuristic(state->ray_pdf, pdf);
 
 		return L*mis_weight;
 	}
diff --git a/intern/cycles/kernel/kernel_film.h b/intern/cycles/kernel/kernel_film.h
index cbd875e994c..dc5f6e7ce38 100644
--- a/intern/cycles/kernel/kernel_film.h
+++ b/intern/cycles/kernel/kernel_film.h
@@ -75,7 +75,7 @@ ccl_device void kernel_film_convert_to_half_float(KernelGlobals *kg,
 
 	float exposure = kernel_data.film.exposure;
 
-	ccl_align(16) float4 rgba_in = *in;
+	float4 rgba_in = *in;
 
 	if(exposure != 1.0f) {
 		rgba_in.x *= exposure;
@@ -83,7 +83,7 @@ ccl_device void kernel_film_convert_to_half_float(KernelGlobals *kg,
 		rgba_in.z *= exposure;
 	}
 
-	float4_store_half(out, &rgba_in, sample_scale);
+	float4_store_half(out, rgba_in, sample_scale);
 }
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_light.h b/intern/cycles/kernel/kernel_light.h
index c32f0395744..ac432d3fe04 100644
--- a/intern/cycles/kernel/kernel_light.h
+++ b/intern/cycles/kernel/kernel_light.h
@@ -217,8 +217,8 @@ ccl_device void lamp_light_sample(KernelGlobals *kg, int lamp,
 	LightType type = (LightType)__float_as_int(data0.x);
 	ls->type = type;
 	ls->shader = __float_as_int(data1.x);
-	ls->object = ~0;
-	ls->prim = ~0;
+	ls->object = PRIM_NONE;
+	ls->prim = PRIM_NONE;
 	ls->lamp = lamp;
 	ls->u = randu;
 	ls->v = randv;
@@ -309,8 +309,8 @@ ccl_device bool lamp_light_eval(KernelGlobals *kg, int lamp, float3 P, float3 D,
 	LightType type = (LightType)__float_as_int(data0.x);
 	ls->type = type;
 	ls->shader = __float_as_int(data1.x);
-	ls->object = ~0;
-	ls->prim = ~0;
+	ls->object = PRIM_NONE;
+	ls->prim = PRIM_NONE;
 	ls->lamp = lamp;
 	/* todo: missing texture coordinates */
 	ls->u = 0.0f;
@@ -421,7 +421,6 @@ ccl_device bool lamp_light_eval(KernelGlobals *kg, int lamp, float3 P, float3 D,
 	/* compute pdf */
 	if(ls->t != FLT_MAX)
 		ls->pdf *= lamp_light_pdf(kg, ls->Ng, -ls->D, ls->t);
-	ls->eval_fac *= kernel_data.integrator.inv_pdf_lights;
 
 	return true;
 }
@@ -458,11 +457,10 @@ ccl_device void triangle_light_sample(KernelGlobals *kg, int prim, int object,
 	v = randv*randu;
 
 	/* triangle, so get position, normal, shader */
-	ls->P = triangle_point_MT(kg, prim, u, v);
-	ls->Ng = triangle_normal_MT(kg, prim, &ls->shader);
+	triangle_point_normal(kg, prim, u, v, &ls->P, &ls->Ng, &ls->shader);
 	ls->object = object;
 	ls->prim = prim;
-	ls->lamp = ~0;
+	ls->lamp = LAMP_NONE;
 	ls->shader |= SHADER_USE_MIS;
 	ls->t = 0.0f;
 	ls->u = u;
@@ -485,52 +483,6 @@ ccl_device float triangle_light_pdf(KernelGlobals *kg,
 	return t*t*pdf/cos_pi;
 }
 
-/* Curve Light */
-
-#ifdef __HAIR__
-
-ccl_device void curve_segment_light_sample(KernelGlobals *kg, int prim, int object,
-	int segment, float randu, float randv, float time, LightSample *ls)
-{
-	/* this strand code needs completion */
-	float4 v00 = kernel_tex_fetch(__curves, prim);
-
-	int k0 = __float_as_int(v00.x) + segment;
-	int k1 = k0 + 1;
-
-	float4 P1 = kernel_tex_fetch(__curve_keys, k0);
-	float4 P2 = kernel_tex_fetch(__curve_keys, k1);
-
-	float l = len(float4_to_float3(P2) - float4_to_float3(P1));
-
-	float r1 = P1.w;
-	float r2 = P2.w;
-	float3 tg = (float4_to_float3(P2) - float4_to_float3(P1)) / l;
-	float3 xc = make_float3(tg.x * tg.z, tg.y * tg.z, -(tg.x * tg.x + tg.y * tg.y));
-	if (is_zero(xc))
-		xc = make_float3(tg.x * tg.y, -(tg.x * tg.x + tg.z * tg.z), tg.z * tg.y);
-	xc = normalize(xc);
-	float3 yc = cross(tg, xc);
-	float gd = ((r2 - r1)/l);
-
-	/* normal currently ignores gradient */
-	ls->Ng = sinf(M_2PI_F * randv) * xc + cosf(M_2PI_F * randv) * yc;
-	ls->P = randu * l * tg + (gd * l + r1) * ls->Ng;
-	ls->object = object;
-	ls->prim = prim;
-	ls->lamp = ~0;
-	ls->t = 0.0f;
-	ls->u = randu;
-	ls->v = randv;
-	ls->type = LIGHT_STRAND;
-	ls->eval_fac = 1.0f;
-	ls->shader = __float_as_int(v00.z) | SHADER_USE_MIS;
-
-	object_transform_light_sample(kg, ls, object, time);
-}
-
-#endif
-
 /* Light Distribution */
 
 ccl_device int light_distribution_sample(KernelGlobals *kg, float randt)
@@ -573,21 +525,14 @@ ccl_device void light_sample(KernelGlobals *kg, float randt, float randu, float
 
 	if(prim >= 0) {
 		int object = __float_as_int(l.w);
-#ifdef __HAIR__
-		int segment = __float_as_int(l.z) & SHADER_MASK;
-#endif
+		int shader_flag = __float_as_int(l.z);
 
-#ifdef __HAIR__
-		if (segment != SHADER_MASK)
-			curve_segment_light_sample(kg, prim, object, segment, randu, randv, time, ls);
-		else
-#endif
-			triangle_light_sample(kg, prim, object, randu, randv, time, ls);
+		triangle_light_sample(kg, prim, object, randu, randv, time, ls);
 
 		/* compute incoming direction, distance and pdf */
 		ls->D = normalize_len(ls->P - P, &ls->t);
 		ls->pdf = triangle_light_pdf(kg, ls->Ng, -ls->D, ls->t);
-		ls->shader |= __float_as_int(l.z) & (~SHADER_MASK);
+		ls->shader |= shader_flag;
 	}
 	else {
 		int lamp = -prim-1;
@@ -620,7 +565,7 @@ ccl_device int lamp_light_eval_sample(KernelGlobals *kg, float randt)
 		return lamp;
 	}
 	else
-		return ~0;
+		return LAMP_NONE;
 }
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_montecarlo.h b/intern/cycles/kernel/kernel_montecarlo.h
index 92f3420a218..af7b727c1ba 100644
--- a/intern/cycles/kernel/kernel_montecarlo.h
+++ b/intern/cycles/kernel/kernel_montecarlo.h
@@ -131,6 +131,11 @@ ccl_device float power_heuristic_3(float a, float b, float c)
 	return (a*a)/(a*a + b*b + c*c);
 }
 
+ccl_device float max_heuristic(float a, float b)
+{
+	return (a > b)? 1.0f: 0.0f;
+}
+
 /* distribute uniform xy on [0,1] over unit disk [-1,1], with concentric mapping
  * to better preserve stratification for some RNG sequences */
 ccl_device float2 concentric_sample_disk(float u1, float u2)
diff --git a/intern/cycles/kernel/kernel_passes.h b/intern/cycles/kernel/kernel_passes.h
index 9cdcb8c5229..b3b6fc02894 100644
--- a/intern/cycles/kernel/kernel_passes.h
+++ b/intern/cycles/kernel/kernel_passes.h
@@ -51,7 +51,8 @@ ccl_device_inline void kernel_write_data_passes(KernelGlobals *kg, ccl_global fl
 	if(!(path_flag & PATH_RAY_SINGLE_PASS_DONE)) {
 		if(!(sd->flag & SD_TRANSPARENT) ||
 		   kernel_data.film.pass_alpha_threshold == 0.0f ||
-		   average(shader_bsdf_alpha(kg, sd)) >= kernel_data.film.pass_alpha_threshold) {
+		   average(shader_bsdf_alpha(kg, sd)) >= kernel_data.film.pass_alpha_threshold)
+		{
 
 			if(sample == 0) {
 				if(flag & PASS_DEPTH) {
diff --git a/intern/cycles/kernel/kernel_path.h b/intern/cycles/kernel/kernel_path.h
index 635201471e1..a80a0033712 100644
--- a/intern/cycles/kernel/kernel_path.h
+++ b/intern/cycles/kernel/kernel_path.h
@@ -18,18 +18,15 @@
 #include "osl_shader.h"
 #endif
 
-#include "kernel_differential.h"
-#include "kernel_montecarlo.h"
-#include "kernel_projection.h"
-#include "kernel_object.h"
-#include "kernel_triangle.h"
-#include "kernel_curve.h"
-#include "kernel_primitive.h"
-#include "kernel_projection.h"
 #include "kernel_random.h"
-#include "kernel_bvh.h"
-#include "kernel_accumulate.h"
+#include "kernel_projection.h"
+#include "kernel_montecarlo.h"
+#include "kernel_differential.h"
 #include "kernel_camera.h"
+
+#include "geom/geom.h"
+
+#include "kernel_accumulate.h"
 #include "kernel_shader.h"
 #include "kernel_light.h"
 #include "kernel_emission.h"
@@ -59,11 +56,6 @@ ccl_device_inline bool kernel_path_integrate_scatter_lighting(KernelGlobals *kg,
 		/* sample illumination from lights to find path contribution */
 		if(sd->flag & SD_BSDF_HAS_EVAL) {
 			float light_t = path_state_rng_1D(kg, rng, state, PRNG_LIGHT);
-#ifdef __MULTI_CLOSURE__
-			float light_o = 0.0f;
-#else
-			float light_o = path_state_rng_1D(kg, rng, state, PRNG_LIGHT_F);
-#endif
 			float light_u, light_v;
 			path_state_rng_2D(kg, rng, state, PRNG_LIGHT_U, &light_u, &light_v);
 
@@ -75,7 +67,7 @@ ccl_device_inline bool kernel_path_integrate_scatter_lighting(KernelGlobals *kg,
 			light_ray.time = sd->time;
 #endif
 
-			if(direct_emission(kg, sd, -1, light_t, light_o, light_u, light_v, &light_ray, &L_light, &is_lamp, state->bounce)) {
+			if(direct_emission(kg, sd, LAMP_NONE, light_t, light_u, light_v, &light_ray, &L_light, &is_lamp, state->bounce, state->transparent_bounce)) {
 				/* trace shadow ray */
 				float3 shadow;
 
@@ -133,7 +125,96 @@ ccl_device_inline bool kernel_path_integrate_scatter_lighting(KernelGlobals *kg,
 
 #if defined(__BRANCHED_PATH__) || defined(__SUBSURFACE__)
 
-ccl_device void kernel_path_indirect(KernelGlobals *kg, RNG *rng, Ray ray, ccl_global float *buffer,
+ccl_device void kernel_branched_path_integrate_direct_lighting(KernelGlobals *kg, RNG *rng,
+	ShaderData *sd, PathState *state, float3 throughput, float num_samples_adjust, PathRadiance *L, bool sample_all_lights)
+{
+	/* sample illumination from lights to find path contribution */
+	if(sd->flag & SD_BSDF_HAS_EVAL) {
+		Ray light_ray;
+		BsdfEval L_light;
+		bool is_lamp;
+
+#ifdef __OBJECT_MOTION__
+		light_ray.time = sd->time;
+#endif
+
+		if(sample_all_lights) {
+			/* lamp sampling */
+			for(int i = 0; i < kernel_data.integrator.num_all_lights; i++) {
+				int num_samples = ceil_to_int(num_samples_adjust*light_select_num_samples(kg, i));
+				float num_samples_inv = num_samples_adjust/(num_samples*kernel_data.integrator.num_all_lights);
+				RNG lamp_rng = cmj_hash(*rng, i);
+
+				if(kernel_data.integrator.pdf_triangles != 0.0f)
+					num_samples_inv *= 0.5f;
+
+				for(int j = 0; j < num_samples; j++) {
+					float light_u, light_v;
+					path_branched_rng_2D(kg, &lamp_rng, state, j, num_samples, PRNG_LIGHT_U, &light_u, &light_v);
+
+					if(direct_emission(kg, sd, i, 0.0f, light_u, light_v, &light_ray, &L_light, &is_lamp, state->bounce, state->transparent_bounce)) {
+						/* trace shadow ray */
+						float3 shadow;
+
+						if(!shadow_blocked(kg, state, &light_ray, &shadow)) {
+							/* accumulate */
+							path_radiance_accum_light(L, throughput*num_samples_inv, &L_light, shadow, num_samples_inv, state->bounce, is_lamp);
+						}
+					}
+				}
+			}
+
+			/* mesh light sampling */
+			if(kernel_data.integrator.pdf_triangles != 0.0f) {
+				int num_samples = ceil_to_int(num_samples_adjust*kernel_data.integrator.mesh_light_samples);
+				float num_samples_inv = num_samples_adjust/num_samples;
+
+				if(kernel_data.integrator.num_all_lights)
+					num_samples_inv *= 0.5f;
+
+				for(int j = 0; j < num_samples; j++) {
+					float light_t = path_branched_rng_1D(kg, rng, state, j, num_samples, PRNG_LIGHT);
+					float light_u, light_v;
+					path_branched_rng_2D(kg, rng, state, j, num_samples, PRNG_LIGHT_U, &light_u, &light_v);
+
+					/* only sample triangle lights */
+					if(kernel_data.integrator.num_all_lights)
+						light_t = 0.5f*light_t;
+
+					if(direct_emission(kg, sd, LAMP_NONE, light_t, light_u, light_v, &light_ray, &L_light, &is_lamp, state->bounce, state->transparent_bounce)) {
+						/* trace shadow ray */
+						float3 shadow;
+
+						if(!shadow_blocked(kg, state, &light_ray, &shadow)) {
+							/* accumulate */
+							path_radiance_accum_light(L, throughput*num_samples_inv, &L_light, shadow, num_samples_inv, state->bounce, is_lamp);
+						}
+					}
+				}
+			}
+		}
+		else {
+			float light_t = path_state_rng_1D(kg, rng, state, PRNG_LIGHT);
+			float light_u, light_v;
+			path_state_rng_2D(kg, rng, state, PRNG_LIGHT_U, &light_u, &light_v);
+
+			/* sample random light */
+			if(direct_emission(kg, sd, LAMP_NONE, light_t, light_u, light_v, &light_ray, &L_light, &is_lamp, state->bounce, state->transparent_bounce)) {
+				/* trace shadow ray */
+				float3 shadow;
+
+				if(!shadow_blocked(kg, state, &light_ray, &shadow)) {
+					/* accumulate */
+					path_radiance_accum_light(L, throughput, &L_light, shadow, 1.0f, state->bounce, is_lamp);
+				}
+			}
+		}
+	}
+}
+
+#endif
+
+ccl_device void kernel_path_indirect(KernelGlobals *kg, RNG *rng, Ray ray,
 	float3 throughput, int num_samples, PathState state, PathRadiance *L)
 {
 	/* path iteration */
@@ -161,17 +242,16 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, RNG *rng, Ray ray, ccl_g
 			light_ray.dP = ray.dP;
 
 			/* intersect with lamp */
-			float light_t = path_state_rng_1D(kg, rng, &state, PRNG_LIGHT);
 			float3 emission;
 
-			if(indirect_lamp_emission(kg, &light_ray, state.flag, state.ray_pdf, light_t, &emission, state.bounce))
+			if(indirect_lamp_emission(kg, &state, &light_ray, &emission))
 				path_radiance_accum_emission(L, throughput, emission, state.bounce);
 		}
 #endif
 
 #ifdef __VOLUME__
 		/* volume attenuation, emission, scatter */
-		if(state.volume_stack[0].shader != SHADER_NO_ID) {
+		if(state.volume_stack[0].shader != SHADER_NONE) {
 			Ray volume_ray = ray;
 			volume_ray.t = (hit)? isect.t: FLT_MAX;
 
@@ -191,7 +271,7 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, RNG *rng, Ray ray, ccl_g
 		if(!hit) {
 #ifdef __BACKGROUND__
 			/* sample background shader */
-			float3 L_background = indirect_background(kg, &ray, state.flag, state.ray_pdf, state.bounce);
+			float3 L_background = indirect_background(kg, &state, &ray);
 			path_radiance_accum_background(L, throughput, L_background, state.bounce);
 #endif
 
@@ -200,7 +280,7 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, RNG *rng, Ray ray, ccl_g
 
 		/* setup shading */
 		ShaderData sd;
-		shader_setup_from_ray(kg, &sd, &isect, &ray, state.bounce);
+		shader_setup_from_ray(kg, &sd, &isect, &ray, state.bounce, state.transparent_bounce);
 		float rbsdf = path_state_rng_1D(kg, rng, &state, PRNG_BSDF);
 		shader_eval_surface(kg, &sd, rbsdf, state.flag, SHADER_CONTEXT_INDIRECT);
 #ifdef __BRANCHED_PATH__
@@ -300,38 +380,10 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, RNG *rng, Ray ray, ccl_g
 		}
 #endif
 
-#ifdef __EMISSION__
+#if defined(__EMISSION__) && defined(__BRANCHED_PATH__)
 		if(kernel_data.integrator.use_direct_light) {
-			/* sample illumination from lights to find path contribution */
-			if(sd.flag & SD_BSDF_HAS_EVAL) {
-				float light_t = path_state_rng_1D(kg, rng, &state, PRNG_LIGHT);
-#ifdef __MULTI_CLOSURE__
-				float light_o = 0.0f;
-#else
-				float light_o = path_state_rng_1D(kg, rng, &state, PRNG_LIGHT_F);
-#endif
-				float light_u, light_v;
-				path_state_rng_2D(kg, rng, &state, PRNG_LIGHT_U, &light_u, &light_v);
-
-				Ray light_ray;
-				BsdfEval L_light;
-				bool is_lamp;
-
-#ifdef __OBJECT_MOTION__
-				light_ray.time = sd.time;
-#endif
-
-				/* sample random light */
-				if(direct_emission(kg, &sd, -1, light_t, light_o, light_u, light_v, &light_ray, &L_light, &is_lamp, state.bounce)) {
-					/* trace shadow ray */
-					float3 shadow;
-
-					if(!shadow_blocked(kg, &state, &light_ray, &shadow)) {
-						/* accumulate */
-						path_radiance_accum_light(L, throughput, &L_light, shadow, 1.0f, state.bounce, is_lamp);
-					}
-				}
-			}
+			bool all = kernel_data.integrator.sample_all_lights_indirect;
+			kernel_branched_path_integrate_direct_lighting(kg, rng, &sd, &state, throughput, 1.0f, L, all);
 		}
 #endif
 
@@ -406,10 +458,6 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, RNG *rng, Ray ray, ccl_g
 	}
 }
 
-#endif
-
-#ifdef __SUBSURFACE__
-
 ccl_device_inline bool kernel_path_integrate_lighting(KernelGlobals *kg, RNG *rng,
 	ShaderData *sd, float3 *throughput, PathState *state, PathRadiance *L, Ray *ray)
 {
@@ -418,11 +466,6 @@ ccl_device_inline bool kernel_path_integrate_lighting(KernelGlobals *kg, RNG *rn
 		/* sample illumination from lights to find path contribution */
 		if(sd->flag & SD_BSDF_HAS_EVAL) {
 			float light_t = path_state_rng_1D(kg, rng, state, PRNG_LIGHT);
-#ifdef __MULTI_CLOSURE__
-			float light_o = 0.0f;
-#else
-			float light_o = path_state_rng_1D(kg, rng, state, PRNG_LIGHT_F);
-#endif
 			float light_u, light_v;
 			path_state_rng_2D(kg, rng, state, PRNG_LIGHT_U, &light_u, &light_v);
 
@@ -434,7 +477,7 @@ ccl_device_inline bool kernel_path_integrate_lighting(KernelGlobals *kg, RNG *rn
 			light_ray.time = sd->time;
 #endif
 
-			if(direct_emission(kg, sd, -1, light_t, light_o, light_u, light_v, &light_ray, &L_light, &is_lamp, state->bounce)) {
+			if(direct_emission(kg, sd, LAMP_NONE, light_t, light_u, light_v, &light_ray, &L_light, &is_lamp, state->bounce, state->transparent_bounce)) {
 				/* trace shadow ray */
 				float3 shadow;
 
@@ -524,6 +567,84 @@ ccl_device_inline bool kernel_path_integrate_lighting(KernelGlobals *kg, RNG *rn
 	}
 }
 
+ccl_device void kernel_path_ao(KernelGlobals *kg, ShaderData *sd, PathRadiance *L, PathState *state, RNG *rng, float3 throughput)
+{
+	/* todo: solve correlation */
+	float bsdf_u, bsdf_v;
+
+	path_state_rng_2D(kg, rng, state, PRNG_BSDF_U, &bsdf_u, &bsdf_v);
+
+	float ao_factor = kernel_data.background.ao_factor;
+	float3 ao_N;
+	float3 ao_bsdf = shader_bsdf_ao(kg, sd, ao_factor, &ao_N);
+	float3 ao_D;
+	float ao_pdf;
+	float3 ao_alpha = shader_bsdf_alpha(kg, sd);
+
+	sample_cos_hemisphere(ao_N, bsdf_u, bsdf_v, &ao_D, &ao_pdf);
+
+	if(dot(sd->Ng, ao_D) > 0.0f && ao_pdf != 0.0f) {
+		Ray light_ray;
+		float3 ao_shadow;
+
+		light_ray.P = ray_offset(sd->P, sd->Ng);
+		light_ray.D = ao_D;
+		light_ray.t = kernel_data.background.ao_distance;
+#ifdef __OBJECT_MOTION__
+		light_ray.time = sd->time;
+#endif
+		light_ray.dP = sd->dP;
+		light_ray.dD = differential3_zero();
+
+		if(!shadow_blocked(kg, state, &light_ray, &ao_shadow))
+			path_radiance_accum_ao(L, throughput, ao_alpha, ao_bsdf, ao_shadow, state->bounce);
+	}
+}
+
+#ifdef __SUBSURFACE__
+ccl_device bool kernel_path_subsurface_scatter(KernelGlobals *kg, ShaderData *sd, PathRadiance *L, PathState *state, RNG *rng, Ray *ray, float3 *throughput)
+{
+	float bssrdf_probability;
+	ShaderClosure *sc = subsurface_scatter_pick_closure(kg, sd, &bssrdf_probability);
+
+	/* modify throughput for picking bssrdf or bsdf */
+	*throughput *= bssrdf_probability;
+
+	/* do bssrdf scatter step if we picked a bssrdf closure */
+	if(sc) {
+		uint lcg_state = lcg_state_init(rng, state, 0x68bc21eb);
+
+		ShaderData bssrdf_sd[BSSRDF_MAX_HITS];
+		float bssrdf_u, bssrdf_v;
+		path_state_rng_2D(kg, rng, state, PRNG_BSDF_U, &bssrdf_u, &bssrdf_v);
+		int num_hits = subsurface_scatter_multi_step(kg, sd, bssrdf_sd, state->flag, sc, &lcg_state, bssrdf_u, bssrdf_v, false);
+
+		/* compute lighting with the BSDF closure */
+		for(int hit = 0; hit < num_hits; hit++) {
+			float3 tp = *throughput;
+			PathState hit_state = *state;
+			Ray hit_ray = *ray;
+
+			hit_state.flag |= PATH_RAY_BSSRDF_ANCESTOR;
+			hit_state.rng_offset += PRNG_BOUNCE_NUM;
+
+			if(kernel_path_integrate_lighting(kg, rng, &bssrdf_sd[hit], &tp, &hit_state, L, &hit_ray)) {
+#ifdef __LAMP_MIS__
+				hit_state.ray_t = 0.0f;
+#endif
+
+				kernel_path_indirect(kg, rng, hit_ray, tp, state->num_samples, hit_state, L);
+
+				/* for render passes, sum and reset indirect light pass variables
+				 * for the next samples */
+				path_radiance_sum_indirect(L);
+				path_radiance_reset_indirect(L);
+			}
+		}
+		return true;
+	}
+	return false;
+}
 #endif
 
 ccl_device float4 kernel_path_integrate(KernelGlobals *kg, RNG *rng, int sample, Ray ray, ccl_global float *buffer)
@@ -578,17 +699,16 @@ ccl_device float4 kernel_path_integrate(KernelGlobals *kg, RNG *rng, int sample,
 			light_ray.dP = ray.dP;
 
 			/* intersect with lamp */
-			float light_t = path_state_rng_1D(kg, rng, &state, PRNG_LIGHT);
 			float3 emission;
 
-			if(indirect_lamp_emission(kg, &light_ray, state.flag, state.ray_pdf, light_t, &emission, state.bounce))
+			if(indirect_lamp_emission(kg, &state, &light_ray, &emission))
 				path_radiance_accum_emission(&L, throughput, emission, state.bounce);
 		}
 #endif
 
 #ifdef __VOLUME__
 		/* volume attenuation, emission, scatter */
-		if(state.volume_stack[0].shader != SHADER_NO_ID) {
+		if(state.volume_stack[0].shader != SHADER_NONE) {
 			Ray volume_ray = ray;
 			volume_ray.t = (hit)? isect.t: FLT_MAX;
 
@@ -618,7 +738,7 @@ ccl_device float4 kernel_path_integrate(KernelGlobals *kg, RNG *rng, int sample,
 
 #ifdef __BACKGROUND__
 			/* sample background shader */
-			float3 L_background = indirect_background(kg, &ray, state.flag, state.ray_pdf, state.bounce);
+			float3 L_background = indirect_background(kg, &state, &ray);
 			path_radiance_accum_background(&L, throughput, L_background, state.bounce);
 #endif
 
@@ -627,7 +747,7 @@ ccl_device float4 kernel_path_integrate(KernelGlobals *kg, RNG *rng, int sample,
 
 		/* setup shading */
 		ShaderData sd;
-		shader_setup_from_ray(kg, &sd, &isect, &ray, state.bounce);
+		shader_setup_from_ray(kg, &sd, &isect, &ray, state.bounce, state.transparent_bounce);
 		float rbsdf = path_state_rng_1D(kg, rng, &state, PRNG_BSDF);
 		shader_eval_surface(kg, &sd, rbsdf, state.flag, SHADER_CONTEXT_MAIN);
 
@@ -694,35 +814,7 @@ ccl_device float4 kernel_path_integrate(KernelGlobals *kg, RNG *rng, int sample,
 #ifdef __AO__
 		/* ambient occlusion */
 		if(kernel_data.integrator.use_ambient_occlusion || (sd.flag & SD_AO)) {
-			/* todo: solve correlation */
-			float bsdf_u, bsdf_v;
-			path_state_rng_2D(kg, rng, &state, PRNG_BSDF_U, &bsdf_u, &bsdf_v);
-
-			float ao_factor = kernel_data.background.ao_factor;
-			float3 ao_N;
-			float3 ao_bsdf = shader_bsdf_ao(kg, &sd, ao_factor, &ao_N);
-			float3 ao_D;
-			float ao_pdf;
-			float3 ao_alpha = shader_bsdf_alpha(kg, &sd);
-
-			sample_cos_hemisphere(ao_N, bsdf_u, bsdf_v, &ao_D, &ao_pdf);
-
-			if(dot(sd.Ng, ao_D) > 0.0f && ao_pdf != 0.0f) {
-				Ray light_ray;
-				float3 ao_shadow;
-
-				light_ray.P = ray_offset(sd.P, sd.Ng);
-				light_ray.D = ao_D;
-				light_ray.t = kernel_data.background.ao_distance;
-#ifdef __OBJECT_MOTION__
-				light_ray.time = sd.time;
-#endif
-				light_ray.dP = sd.dP;
-				light_ray.dD = differential3_zero();
-
-				if(!shadow_blocked(kg, &state, &light_ray, &ao_shadow))
-					path_radiance_accum_ao(&L, throughput, ao_alpha, ao_bsdf, ao_shadow, state.bounce);
-			}
+			kernel_path_ao(kg, &sd, &L, &state, rng, throughput);
 		}
 #endif
 
@@ -730,60 +822,18 @@ ccl_device float4 kernel_path_integrate(KernelGlobals *kg, RNG *rng, int sample,
 		/* bssrdf scatter to a different location on the same object, replacing
 		 * the closures with a diffuse BSDF */
 		if(sd.flag & SD_BSSRDF) {
-			float bssrdf_probability;
-			ShaderClosure *sc = subsurface_scatter_pick_closure(kg, &sd, &bssrdf_probability);
-
-			/* modify throughput for picking bssrdf or bsdf */
-			throughput *= bssrdf_probability;
-
-			/* do bssrdf scatter step if we picked a bssrdf closure */
-			if(sc) {
-				uint lcg_state = lcg_state_init(rng, &state, 0x68bc21eb);
-
-				ShaderData bssrdf_sd[BSSRDF_MAX_HITS];
-				float bssrdf_u, bssrdf_v;
-				path_state_rng_2D(kg, rng, &state, PRNG_BSDF_U, &bssrdf_u, &bssrdf_v);
-				int num_hits = subsurface_scatter_multi_step(kg, &sd, bssrdf_sd, state.flag, sc, &lcg_state, bssrdf_u, bssrdf_v, false);
-
-				/* compute lighting with the BSDF closure */
-				for(int hit = 0; hit < num_hits; hit++) {
-					float3 tp = throughput;
-					PathState hit_state = state;
-					Ray hit_ray = ray;
-
-					hit_state.flag |= PATH_RAY_BSSRDF_ANCESTOR;
-					hit_state.rng_offset += PRNG_BOUNCE_NUM;
-					
-					if(kernel_path_integrate_lighting(kg, rng, &bssrdf_sd[hit], &tp, &hit_state, &L, &hit_ray)) {
-#ifdef __LAMP_MIS__
-						hit_state.ray_t = 0.0f;
-#endif
-
-						kernel_path_indirect(kg, rng, hit_ray, buffer, tp, state.num_samples, hit_state, &L);
-
-						/* for render passes, sum and reset indirect light pass variables
-						 * for the next samples */
-						path_radiance_sum_indirect(&L);
-						path_radiance_reset_indirect(&L);
-					}
-				}
+			if(kernel_path_subsurface_scatter(kg, &sd, &L, &state, rng, &ray, &throughput))
 				break;
-			}
 		}
 #endif
 		
-		/* The following code is the same as in kernel_path_integrate_lighting(),
+		/* Same as kernel_path_integrate_lighting(kg, rng, &sd, &throughput, &state, &L, &ray),
 		   but for CUDA the function call is slower. */
 #ifdef __EMISSION__
 		if(kernel_data.integrator.use_direct_light) {
 			/* sample illumination from lights to find path contribution */
 			if(sd.flag & SD_BSDF_HAS_EVAL) {
 				float light_t = path_state_rng_1D(kg, rng, &state, PRNG_LIGHT);
-#ifdef __MULTI_CLOSURE__
-				float light_o = 0.0f;
-#else
-				float light_o = path_state_rng_1D(kg, rng, &state, PRNG_LIGHT_F);
-#endif
 				float light_u, light_v;
 				path_state_rng_2D(kg, rng, &state, PRNG_LIGHT_U, &light_u, &light_v);
 
@@ -795,7 +845,7 @@ ccl_device float4 kernel_path_integrate(KernelGlobals *kg, RNG *rng, int sample,
 				light_ray.time = sd.time;
 #endif
 
-				if(direct_emission(kg, &sd, -1, light_t, light_o, light_u, light_v, &light_ray, &L_light, &is_lamp, state.bounce)) {
+				if(direct_emission(kg, &sd, LAMP_NONE, light_t, light_u, light_v, &light_ray, &L_light, &is_lamp, state.bounce, state.transparent_bounce)) {
 					/* trace shadow ray */
 					float3 shadow;
 
@@ -898,69 +948,9 @@ ccl_device_noinline void kernel_branched_path_integrate_lighting(KernelGlobals *
 	PathState *state, PathRadiance *L, ccl_global float *buffer)
 {
 #ifdef __EMISSION__
-	/* sample illumination from lights to find path contribution */
-	if(sd->flag & SD_BSDF_HAS_EVAL) {
-		Ray light_ray;
-		BsdfEval L_light;
-		bool is_lamp;
-
-#ifdef __OBJECT_MOTION__
-		light_ray.time = sd->time;
-#endif
-
-		/* lamp sampling */
-		for(int i = 0; i < kernel_data.integrator.num_all_lights; i++) {
-			int num_samples = ceil_to_int(num_samples_adjust*light_select_num_samples(kg, i));
-			float num_samples_inv = num_samples_adjust/(num_samples*kernel_data.integrator.num_all_lights);
-			RNG lamp_rng = cmj_hash(*rng, i);
-
-			if(kernel_data.integrator.pdf_triangles != 0.0f)
-				num_samples_inv *= 0.5f;
-
-			for(int j = 0; j < num_samples; j++) {
-				float light_u, light_v;
-				path_branched_rng_2D(kg, &lamp_rng, state, j, num_samples, PRNG_LIGHT_U, &light_u, &light_v);
-
-				if(direct_emission(kg, sd, i, 0.0f, 0.0f, light_u, light_v, &light_ray, &L_light, &is_lamp, state->bounce)) {
-					/* trace shadow ray */
-					float3 shadow;
-
-					if(!shadow_blocked(kg, state, &light_ray, &shadow)) {
-						/* accumulate */
-						path_radiance_accum_light(L, throughput*num_samples_inv, &L_light, shadow, num_samples_inv, state->bounce, is_lamp);
-					}
-				}
-			}
-		}
-
-		/* mesh light sampling */
-		if(kernel_data.integrator.pdf_triangles != 0.0f) {
-			int num_samples = ceil_to_int(num_samples_adjust*kernel_data.integrator.mesh_light_samples);
-			float num_samples_inv = num_samples_adjust/num_samples;
-
-			if(kernel_data.integrator.num_all_lights)
-				num_samples_inv *= 0.5f;
-
-			for(int j = 0; j < num_samples; j++) {
-				float light_t = path_branched_rng_1D(kg, rng, state, j, num_samples, PRNG_LIGHT);
-				float light_u, light_v;
-				path_branched_rng_2D(kg, rng, state, j, num_samples, PRNG_LIGHT_U, &light_u, &light_v);
-
-				/* only sample triangle lights */
-				if(kernel_data.integrator.num_all_lights)
-					light_t = 0.5f*light_t;
-
-				if(direct_emission(kg, sd, -1, light_t, 0.0f, light_u, light_v, &light_ray, &L_light, &is_lamp, state->bounce)) {
-					/* trace shadow ray */
-					float3 shadow;
-
-					if(!shadow_blocked(kg, state, &light_ray, &shadow)) {
-						/* accumulate */
-						path_radiance_accum_light(L, throughput*num_samples_inv, &L_light, shadow, num_samples_inv, state->bounce, is_lamp);
-					}
-				}
-			}
-		}
+	if(kernel_data.integrator.use_direct_light) {
+		bool all = kernel_data.integrator.sample_all_lights_direct;
+		kernel_branched_path_integrate_direct_lighting(kg, rng, sd, state, throughput, num_samples_adjust, L, all);
 	}
 #endif
 
@@ -1043,7 +1033,7 @@ ccl_device_noinline void kernel_branched_path_integrate_lighting(KernelGlobals *
 			ps.ray_t = 0.0f;
 #endif
 
-			kernel_path_indirect(kg, rng, bsdf_ray, buffer, tp*num_samples_inv, num_samples, ps, L);
+			kernel_path_indirect(kg, rng, bsdf_ray, tp*num_samples_inv, num_samples, ps, L);
 
 			/* for render passes, sum and reset indirect light pass variables
 			 * for the next samples */
@@ -1092,13 +1082,66 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
 
 #ifdef __VOLUME__
 		/* volume attenuation, emission, scatter */
-		if(state.volume_stack[0].shader != SHADER_NO_ID) {
+		if(state.volume_stack[0].shader != SHADER_NONE) {
 			Ray volume_ray = ray;
 			volume_ray.t = (hit)? isect.t: FLT_MAX;
 
+#ifdef __KERNEL_CPU__
+			/* decoupled ray marching only supported on CPU */
+			bool heterogeneous = volume_stack_is_heterogeneous(kg, state.volume_stack);
+
+			/* cache steps along volume for repeated sampling */
+			VolumeSegment volume_segment;
+			ShaderData volume_sd;
+
+			shader_setup_from_volume(kg, &volume_sd, &volume_ray, state.bounce, state.transparent_bounce);
+			kernel_volume_decoupled_record(kg, &state,
+				&volume_ray, &volume_sd, &volume_segment, heterogeneous);
+
+			/* sample scattering */
+			int num_samples = kernel_data.integrator.volume_samples;
+			float num_samples_inv = 1.0f/num_samples;
+
+			for(int j = 0; j < num_samples; j++) {
+				/* workaround to fix correlation bug in T38710, can find better solution
+				 * in random number generator later, for now this is done here to not impact
+				 * performance of rendering without volumes */
+				RNG tmp_rng = cmj_hash(*rng, state.rng_offset);
+
+				PathState ps = state;
+				Ray pray = ray;
+				float3 tp = throughput;
+
+				/* branch RNG state */
+				path_state_branch(&ps, j, num_samples);
+
+				VolumeIntegrateResult result = kernel_volume_decoupled_scatter(kg,
+					&ps, &volume_ray, &volume_sd, &tp, &tmp_rng, &volume_segment);
+				
+				if(result == VOLUME_PATH_SCATTERED) {
+					/* todo: use all-light sampling */
+					if(kernel_path_integrate_scatter_lighting(kg, rng, &volume_sd, &tp, &ps, &L, &pray, num_samples_inv)) {
+						kernel_path_indirect(kg, rng, pray, tp*num_samples_inv, num_samples, ps, &L);
+
+						/* for render passes, sum and reset indirect light pass variables
+						 * for the next samples */
+						path_radiance_sum_indirect(&L);
+						path_radiance_reset_indirect(&L);
+					}
+				}
+			}
+
+			/* emission and transmittance */
+			if(volume_segment.closure_flag & SD_EMISSION)
+				path_radiance_accum_emission(&L, throughput, volume_segment.accum_emission, state.bounce);
+			throughput *= volume_segment.accum_transmittance;
+
+			/* free cached steps */
+			kernel_volume_decoupled_free(kg, &volume_segment);
+#else
+			/* GPU: no decoupled ray marching, scatter probabilistically */
 			int num_samples = kernel_data.integrator.volume_samples;
 			float num_samples_inv = 1.0f/num_samples;
-			float3 avg_tp = make_float3(0.0f, 0.0f, 0.0f);
 
 			/* todo: we should cache the shader evaluations from stepping
 			 * through the volume, for now we redo them multiple times */
@@ -1118,7 +1161,7 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
 				if(result == VOLUME_PATH_SCATTERED) {
 					/* todo: use all-light sampling */
 					if(kernel_path_integrate_scatter_lighting(kg, rng, &volume_sd, &tp, &ps, &L, &pray, num_samples_inv)) {
-						kernel_path_indirect(kg, rng, pray, buffer, tp*num_samples_inv, num_samples, ps, &L);
+						kernel_path_indirect(kg, rng, pray, tp*num_samples_inv, num_samples, ps, &L);
 
 						/* for render passes, sum and reset indirect light pass variables
 						 * for the next samples */
@@ -1126,11 +1169,11 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
 						path_radiance_reset_indirect(&L);
 					}
 				}
-				else
-					avg_tp += tp;
 			}
 
-			throughput = avg_tp * num_samples_inv;
+			/* todo: avoid this calculation using decoupled ray marching */
+			kernel_volume_shadow(kg, &state, &volume_ray, &throughput);
+#endif
 		}
 #endif
 
@@ -1147,7 +1190,7 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
 
 #ifdef __BACKGROUND__
 			/* sample background shader */
-			float3 L_background = indirect_background(kg, &ray, state.flag, state.ray_pdf, state.bounce);
+			float3 L_background = indirect_background(kg, &state, &ray);
 			path_radiance_accum_background(&L, throughput, L_background, state.bounce);
 #endif
 
@@ -1156,7 +1199,7 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
 
 		/* setup shading */
 		ShaderData sd;
-		shader_setup_from_ray(kg, &sd, &isect, &ray, state.bounce);
+		shader_setup_from_ray(kg, &sd, &isect, &ray, state.bounce, state.transparent_bounce);
 		shader_eval_surface(kg, &sd, 0.0f, state.flag, SHADER_CONTEXT_MAIN);
 		shader_merge_closures(&sd);
 
@@ -1270,21 +1313,21 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
 				/* do subsurface scatter step with copy of shader data, this will
 				 * replace the BSSRDF with a diffuse BSDF closure */
 				for(int j = 0; j < num_samples; j++) {
-						ShaderData bssrdf_sd[BSSRDF_MAX_HITS];
-						float bssrdf_u, bssrdf_v;
-						path_branched_rng_2D(kg, &bssrdf_rng, &state, j, num_samples, PRNG_BSDF_U, &bssrdf_u, &bssrdf_v);
-						int num_hits = subsurface_scatter_multi_step(kg, &sd, bssrdf_sd, state.flag, sc, &lcg_state, bssrdf_u, bssrdf_v, true);
+					ShaderData bssrdf_sd[BSSRDF_MAX_HITS];
+					float bssrdf_u, bssrdf_v;
+					path_branched_rng_2D(kg, &bssrdf_rng, &state, j, num_samples, PRNG_BSDF_U, &bssrdf_u, &bssrdf_v);
+					int num_hits = subsurface_scatter_multi_step(kg, &sd, bssrdf_sd, state.flag, sc, &lcg_state, bssrdf_u, bssrdf_v, true);
 
-						/* compute lighting with the BSDF closure */
-						for(int hit = 0; hit < num_hits; hit++) {
-							PathState hit_state = state;
+					/* compute lighting with the BSDF closure */
+					for(int hit = 0; hit < num_hits; hit++) {
+						PathState hit_state = state;
 
-							path_state_branch(&hit_state, j, num_samples);
+						path_state_branch(&hit_state, j, num_samples);
 
-							kernel_branched_path_integrate_lighting(kg, rng,
-								&bssrdf_sd[hit], throughput, num_samples_inv,
-								&hit_state, &L, buffer);
-						}
+						kernel_branched_path_integrate_lighting(kg, rng,
+						                                        &bssrdf_sd[hit], throughput, num_samples_inv,
+						                                        &hit_state, &L, buffer);
+					}
 				}
 
 				state.flag &= ~PATH_RAY_BSSRDF_ANCESTOR;
diff --git a/intern/cycles/kernel/kernel_path_state.h b/intern/cycles/kernel/kernel_path_state.h
index c3f617542a6..406654c1741 100644
--- a/intern/cycles/kernel/kernel_path_state.h
+++ b/intern/cycles/kernel/kernel_path_state.h
@@ -50,7 +50,7 @@ ccl_device_inline void path_state_init(KernelGlobals *kg, PathState *state, RNG
 		state->rng_congruential = lcg_init(*rng + sample*0x51633e2d);
 	}
 	else {
-		state->volume_stack[0].shader = SHADER_NO_ID;
+		state->volume_stack[0].shader = SHADER_NONE;
 	}
 #endif
 }
@@ -132,6 +132,9 @@ ccl_device_inline uint path_state_ray_visibility(KernelGlobals *kg, PathState *s
 	/* for visibility, diffuse/glossy are for reflection only */
 	if(flag & PATH_RAY_TRANSMIT)
 		flag &= ~(PATH_RAY_DIFFUSE|PATH_RAY_GLOSSY);
+	/* todo: this is not supported as its own ray visibility yet */
+	if(state->flag & PATH_RAY_VOLUME_SCATTER)
+		flag |= PATH_RAY_DIFFUSE;
 	/* for camera visibility, use render layer flags */
 	if(flag & PATH_RAY_CAMERA)
 		flag |= kernel_data.integrator.layer_flag;
diff --git a/intern/cycles/kernel/kernel_projection.h b/intern/cycles/kernel/kernel_projection.h
index e2108604bc8..6744471d659 100644
--- a/intern/cycles/kernel/kernel_projection.h
+++ b/intern/cycles/kernel/kernel_projection.h
@@ -39,7 +39,7 @@ CCL_NAMESPACE_BEGIN
 
 ccl_device float2 direction_to_spherical(float3 dir)
 {
-	float theta = acosf(dir.z);
+	float theta = safe_acosf(dir.z);
 	float phi = atan2f(dir.x, dir.y);
 
 	return make_float2(theta, phi);
@@ -97,7 +97,7 @@ ccl_device float3 fisheye_to_direction(float u, float v, float fov)
 	if(r > 1.0f)
 		return make_float3(0.0f, 0.0f, 0.0f);
 
-	float phi = acosf((r != 0.0f)? u/r: 0.0f);
+	float phi = safe_acosf((r != 0.0f)? u/r: 0.0f);
 	float theta = r * fov * 0.5f;
 
 	if(v < 0.0f) phi = -phi;
@@ -111,7 +111,7 @@ ccl_device float3 fisheye_to_direction(float u, float v, float fov)
 
 ccl_device float2 direction_to_fisheye_equisolid(float3 dir, float lens, float width, float height)
 {
-	float theta = acosf(dir.x);
+	float theta = safe_acosf(dir.x);
 	float r = 2.0f * lens * sinf(theta * 0.5f);
 	float phi = atan2f(dir.z, dir.y);
 
@@ -132,7 +132,7 @@ ccl_device float3 fisheye_equisolid_to_direction(float u, float v, float lens, f
 	if(r > rmax)
 		return make_float3(0.0f, 0.0f, 0.0f);
 
-	float phi = acosf((r != 0.0f)? u/r: 0.0f);
+	float phi = safe_acosf((r != 0.0f)? u/r: 0.0f);
 	float theta = 2.0f * asinf(r/(2.0f * lens));
 
 	if(v < 0.0f) phi = -phi;
diff --git a/intern/cycles/kernel/kernel_random.h b/intern/cycles/kernel/kernel_random.h
index ef397269ec2..31cb6ff6abd 100644
--- a/intern/cycles/kernel/kernel_random.h
+++ b/intern/cycles/kernel/kernel_random.h
@@ -120,6 +120,9 @@ ccl_device_inline float path_rng_1D(KernelGlobals *kg, RNG *rng, int sample, int
 	/* Cranly-Patterson rotation using rng seed */
 	float shift;
 
+	/* using the same *rng value to offset seems to give correlation issues,
+	 * we could hash it with the dimension but this has a performance impact,
+	 * we need to find a solution for this */
 	if(dimension & 1)
 		shift = (*rng >> 16) * (1.0f/(float)0xFFFF);
 	else
diff --git a/intern/cycles/kernel/kernel_shader.h b/intern/cycles/kernel/kernel_shader.h
index b113e906e9d..58cec090410 100644
--- a/intern/cycles/kernel/kernel_shader.h
+++ b/intern/cycles/kernel/kernel_shader.h
@@ -39,7 +39,7 @@ ccl_device void shader_setup_object_transforms(KernelGlobals *kg, ShaderData *sd
 {
 	if(sd->flag & SD_OBJECT_MOTION) {
 		sd->ob_tfm = object_fetch_transform_motion(kg, sd->object, time);
-		sd->ob_itfm= transform_quick_inverse(sd->ob_tfm);
+		sd->ob_itfm = transform_quick_inverse(sd->ob_tfm);
 	}
 	else {
 		sd->ob_tfm = object_fetch_transform(kg, sd->object, OBJECT_TRANSFORM);
@@ -49,12 +49,13 @@ ccl_device void shader_setup_object_transforms(KernelGlobals *kg, ShaderData *sd
 #endif
 
 ccl_device void shader_setup_from_ray(KernelGlobals *kg, ShaderData *sd,
-	const Intersection *isect, const Ray *ray, int bounce)
+	const Intersection *isect, const Ray *ray, int bounce, int transparent_bounce)
 {
 #ifdef __INSTANCING__
-	sd->object = (isect->object == ~0)? kernel_tex_fetch(__prim_object, isect->prim): isect->object;
+	sd->object = (isect->object == PRIM_NONE)? kernel_tex_fetch(__prim_object, isect->prim): isect->object;
 #endif
 
+	sd->type = isect->type;
 	sd->flag = kernel_tex_fetch(__object_flag, sd->object);
 
 	/* matrices and time */
@@ -66,37 +67,31 @@ ccl_device void shader_setup_from_ray(KernelGlobals *kg, ShaderData *sd,
 	sd->prim = kernel_tex_fetch(__prim_index, isect->prim);
 	sd->ray_length = isect->t;
 	sd->ray_depth = bounce;
+	sd->transparent_depth = transparent_bounce;
+
+#ifdef __UV__
+	sd->u = isect->u;
+	sd->v = isect->v;
+#endif
 
 #ifdef __HAIR__
-	if(kernel_tex_fetch(__prim_segment, isect->prim) != ~0) {
-		/* Strand Shader setting*/
+	if(sd->type & PRIMITIVE_ALL_CURVE) {
+		/* curve */
 		float4 curvedata = kernel_tex_fetch(__curves, sd->prim);
 
 		sd->shader = __float_as_int(curvedata.z);
-		sd->segment = isect->segment;
 		sd->P = bvh_curve_refine(kg, sd, isect, ray);
 	}
-	else {
+	else
 #endif
-		/* fetch triangle data */
+	if(sd->type & PRIMITIVE_TRIANGLE) {
+		/* static triangle */
 		float4 Ns = kernel_tex_fetch(__tri_normal, sd->prim);
 		float3 Ng = make_float3(Ns.x, Ns.y, Ns.z);
 		sd->shader = __float_as_int(Ns.w);
 
-#ifdef __HAIR__
-		sd->segment = ~0;
-		/*elements for minimum hair width using transparency bsdf*/
-		/*sd->curve_transparency = 0.0f;*/
-		/*sd->curve_radius = 0.0f;*/
-#endif
-
-#ifdef __UV__
-		sd->u = isect->u;
-		sd->v = isect->v;
-#endif
-
 		/* vectors */
-		sd->P = bvh_triangle_refine(kg, sd, isect, ray);
+		sd->P = triangle_refine(kg, sd, isect, ray);
 		sd->Ng = Ng;
 		sd->N = Ng;
 		
@@ -106,19 +101,20 @@ ccl_device void shader_setup_from_ray(KernelGlobals *kg, ShaderData *sd,
 
 #ifdef __DPDU__
 		/* dPdu/dPdv */
-		triangle_dPdudv(kg, &sd->dPdu, &sd->dPdv, sd->prim);
+		triangle_dPdudv(kg, sd->prim, &sd->dPdu, &sd->dPdv);
 #endif
-
-#ifdef __HAIR__
 	}
-#endif
+	else {
+		/* motion triangle */
+		motion_triangle_shader_setup(kg, sd, isect, ray, false);
+	}
 
 	sd->I = -ray->D;
 
 	sd->flag |= kernel_tex_fetch(__shader_flag, (sd->shader & SHADER_MASK)*2);
 
 #ifdef __INSTANCING__
-	if(isect->object != ~0) {
+	if(isect->object != OBJECT_NONE) {
 		/* instance transform */
 		object_normal_transform(kg, sd, &sd->N);
 		object_normal_transform(kg, sd, &sd->Ng);
@@ -161,39 +157,41 @@ ccl_device_inline void shader_setup_from_subsurface(KernelGlobals *kg, ShaderDat
 	/* object, matrices, time, ray_length stay the same */
 	sd->flag = kernel_tex_fetch(__object_flag, sd->object);
 	sd->prim = kernel_tex_fetch(__prim_index, isect->prim);
-
-	/* fetch triangle data */
-	float4 Ns = kernel_tex_fetch(__tri_normal, sd->prim);
-	float3 Ng = make_float3(Ns.x, Ns.y, Ns.z);
-	sd->shader = __float_as_int(Ns.w);
-
-#ifdef __HAIR__
-	sd->segment = ~0;
-#endif
+	sd->type = isect->type;
 
 #ifdef __UV__
 	sd->u = isect->u;
 	sd->v = isect->v;
 #endif
 
-	/* vectors */
-	sd->P = bvh_triangle_refine_subsurface(kg, sd, isect, ray);
-	sd->Ng = Ng;
-	sd->N = Ng;
-	
-	/* smooth normal */
-	if(sd->shader & SHADER_SMOOTH_NORMAL)
-		sd->N = triangle_smooth_normal(kg, sd->prim, sd->u, sd->v);
+	/* fetch triangle data */
+	if(sd->type == PRIMITIVE_TRIANGLE) {
+		float4 Ns = kernel_tex_fetch(__tri_normal, sd->prim);
+		float3 Ng = make_float3(Ns.x, Ns.y, Ns.z);
+		sd->shader = __float_as_int(Ns.w);
+
+		/* static triangle */
+		sd->P = triangle_refine_subsurface(kg, sd, isect, ray);
+		sd->Ng = Ng;
+		sd->N = Ng;
+
+		if(sd->shader & SHADER_SMOOTH_NORMAL)
+			sd->N = triangle_smooth_normal(kg, sd->prim, sd->u, sd->v);
 
 #ifdef __DPDU__
-	/* dPdu/dPdv */
-	triangle_dPdudv(kg, &sd->dPdu, &sd->dPdv, sd->prim);
+		/* dPdu/dPdv */
+		triangle_dPdudv(kg, sd->prim, &sd->dPdu, &sd->dPdv);
 #endif
+	}
+	else {
+		/* motion triangle */
+		motion_triangle_shader_setup(kg, sd, isect, ray, true);
+	}
 
 	sd->flag |= kernel_tex_fetch(__shader_flag, (sd->shader & SHADER_MASK)*2);
 
 #ifdef __INSTANCING__
-	if(isect->object != ~0) {
+	if(isect->object != OBJECT_NONE) {
 		/* instance transform */
 		object_normal_transform(kg, sd, &sd->N);
 		object_normal_transform(kg, sd, &sd->Ng);
@@ -231,7 +229,7 @@ ccl_device_inline void shader_setup_from_subsurface(KernelGlobals *kg, ShaderDat
 
 ccl_device void shader_setup_from_sample(KernelGlobals *kg, ShaderData *sd,
 	const float3 P, const float3 Ng, const float3 I,
-	int shader, int object, int prim, float u, float v, float t, float time, int bounce, int segment)
+	int shader, int object, int prim, float u, float v, float t, float time, int bounce, int transparent_bounce)
 {
 	/* vectors */
 	sd->P = P;
@@ -239,9 +237,7 @@ ccl_device void shader_setup_from_sample(KernelGlobals *kg, ShaderData *sd,
 	sd->Ng = Ng;
 	sd->I = I;
 	sd->shader = shader;
-#ifdef __HAIR__
-	sd->segment = segment;
-#endif
+	sd->type = (prim == PRIM_NONE)? PRIMITIVE_NONE: PRIMITIVE_TRIANGLE;
 
 	/* primitive */
 #ifdef __INSTANCING__
@@ -255,12 +251,13 @@ ccl_device void shader_setup_from_sample(KernelGlobals *kg, ShaderData *sd,
 #endif
 	sd->ray_length = t;
 	sd->ray_depth = bounce;
+	sd->transparent_depth = transparent_bounce;
 
 	/* detect instancing, for non-instanced the object index is -object-1 */
 #ifdef __INSTANCING__
 	bool instanced = false;
 
-	if(sd->prim != ~0) {
+	if(sd->prim != PRIM_NONE) {
 		if(sd->object >= 0)
 			instanced = true;
 		else
@@ -271,7 +268,7 @@ ccl_device void shader_setup_from_sample(KernelGlobals *kg, ShaderData *sd,
 #endif
 
 	sd->flag = kernel_tex_fetch(__shader_flag, (sd->shader & SHADER_MASK)*2);
-	if(sd->object != -1) {
+	if(sd->object != OBJECT_NONE) {
 		sd->flag |= kernel_tex_fetch(__object_flag, sd->object);
 
 #ifdef __OBJECT_MOTION__
@@ -283,36 +280,20 @@ ccl_device void shader_setup_from_sample(KernelGlobals *kg, ShaderData *sd,
 	}
 #endif
 
-	/* smooth normal */
-#ifdef __HAIR__
-	if(sd->shader & SHADER_SMOOTH_NORMAL && sd->segment == ~0) {
-		sd->N = triangle_smooth_normal(kg, sd->prim, sd->u, sd->v);
-#else
-	if(sd->shader & SHADER_SMOOTH_NORMAL) {
-		sd->N = triangle_smooth_normal(kg, sd->prim, sd->u, sd->v);
-#endif
+	if(sd->type & PRIMITIVE_TRIANGLE) {
+		/* smooth normal */
+		if(sd->shader & SHADER_SMOOTH_NORMAL) {
+			sd->N = triangle_smooth_normal(kg, sd->prim, sd->u, sd->v);
 
 #ifdef __INSTANCING__
-		if(instanced)
-			object_normal_transform(kg, sd, &sd->N);
+			if(instanced)
+				object_normal_transform(kg, sd, &sd->N);
 #endif
-	}
+		}
 
+		/* dPdu/dPdv */
 #ifdef __DPDU__
-	/* dPdu/dPdv */
-#ifdef __HAIR__
-	if(sd->prim == ~0 || sd->segment != ~0) {
-		sd->dPdu = make_float3(0.0f, 0.0f, 0.0f);
-		sd->dPdv = make_float3(0.0f, 0.0f, 0.0f);
-	}
-#else
-	if(sd->prim == ~0) {
-		sd->dPdu = make_float3(0.0f, 0.0f, 0.0f);
-		sd->dPdv = make_float3(0.0f, 0.0f, 0.0f);
-	}
-#endif
-	else {
-		triangle_dPdudv(kg, &sd->dPdu, &sd->dPdv, sd->prim);
+		triangle_dPdudv(kg, sd->prim, &sd->dPdu, &sd->dPdv);
 
 #ifdef __INSTANCING__
 		if(instanced) {
@@ -320,11 +301,17 @@ ccl_device void shader_setup_from_sample(KernelGlobals *kg, ShaderData *sd,
 			object_dir_transform(kg, sd, &sd->dPdv);
 		}
 #endif
+#endif
 	}
+	else {
+#ifdef __DPDU__
+		sd->dPdu = make_float3(0.0f, 0.0f, 0.0f);
+		sd->dPdv = make_float3(0.0f, 0.0f, 0.0f);
 #endif
+	}
 
 	/* backfacing test */
-	if(sd->prim != ~0) {
+	if(sd->prim != PRIM_NONE) {
 		bool backfacing = (dot(sd->Ng, sd->I) < 0.0f);
 
 		if(backfacing) {
@@ -355,20 +342,19 @@ ccl_device void shader_setup_from_displace(KernelGlobals *kg, ShaderData *sd,
 	float3 P, Ng, I = make_float3(0.0f, 0.0f, 0.0f);
 	int shader;
 
-	P = triangle_point_MT(kg, prim, u, v);
-	Ng = triangle_normal_MT(kg, prim, &shader);
+	triangle_point_normal(kg, prim, u, v, &P, &Ng, &shader);
 
 	/* force smooth shading for displacement */
 	shader |= SHADER_SMOOTH_NORMAL;
 
 	/* watch out: no instance transform currently */
 
-	shader_setup_from_sample(kg, sd, P, Ng, I, shader, object, prim, u, v, 0.0f, TIME_INVALID, 0, ~0);
+	shader_setup_from_sample(kg, sd, P, Ng, I, shader, object, prim, u, v, 0.0f, TIME_INVALID, 0, 0);
 }
 
 /* ShaderData setup from ray into background */
 
-ccl_device_inline void shader_setup_from_background(KernelGlobals *kg, ShaderData *sd, const Ray *ray, int bounce)
+ccl_device_inline void shader_setup_from_background(KernelGlobals *kg, ShaderData *sd, const Ray *ray, int bounce, int transparent_bounce)
 {
 	/* vectors */
 	sd->P = ray->D;
@@ -382,11 +368,12 @@ ccl_device_inline void shader_setup_from_background(KernelGlobals *kg, ShaderDat
 #endif
 	sd->ray_length = 0.0f;
 	sd->ray_depth = bounce;
+	sd->transparent_depth = transparent_bounce;
 
 #ifdef __INSTANCING__
-	sd->object = ~0;
+	sd->object = PRIM_NONE;
 #endif
-	sd->prim = ~0;
+	sd->prim = PRIM_NONE;
 #ifdef __UV__
 	sd->u = 0.0f;
 	sd->v = 0.0f;
@@ -411,28 +398,27 @@ ccl_device_inline void shader_setup_from_background(KernelGlobals *kg, ShaderDat
 
 /* ShaderData setup from point inside volume */
 
-ccl_device_inline void shader_setup_from_volume(KernelGlobals *kg, ShaderData *sd, const Ray *ray, int bounce)
+ccl_device_inline void shader_setup_from_volume(KernelGlobals *kg, ShaderData *sd, const Ray *ray, int bounce, int transparent_bounce)
 {
 	/* vectors */
 	sd->P = ray->P;
 	sd->N = -ray->D;  
 	sd->Ng = -ray->D;
 	sd->I = -ray->D;
-	sd->shader = SHADER_NO_ID;
+	sd->shader = SHADER_NONE;
 	sd->flag = 0;
 #ifdef __OBJECT_MOTION__
 	sd->time = ray->time;
 #endif
 	sd->ray_length = 0.0f; /* todo: can we set this to some useful value? */
 	sd->ray_depth = bounce;
+	sd->transparent_depth = transparent_bounce;
 
 #ifdef __INSTANCING__
-	sd->object = ~0; /* todo: fill this for texture coordinates */
-#endif
-	sd->prim = ~0;
-#ifdef __HAIR__
-	sd->segment = ~0;
+	sd->object = PRIM_NONE; /* todo: fill this for texture coordinates */
 #endif
+	sd->prim = PRIM_NONE;
+	sd->type = PRIMITIVE_NONE;
 
 #ifdef __UV__
 	sd->u = 0.0f;
@@ -471,23 +457,32 @@ ccl_device void shader_merge_closures(ShaderData *sd)
 			ShaderClosure *scj = &sd->closure[j];
 
 #ifdef __OSL__
-			if(!sci->prim && !scj->prim && sci->type == scj->type && sci->data0 == scj->data0 && sci->data1 == scj->data1) {
-#else
-			if(sci->type == scj->type && sci->data0 == scj->data0 && sci->data1 == scj->data1) {
+			if(sci->prim || scj->prim)
+				continue;
 #endif
-				sci->weight += scj->weight;
-				sci->sample_weight += scj->sample_weight;
-
-				int size = sd->num_closure - (j+1);
-				if(size > 0) {
-					for(int k = 0; k < size; k++) {
-						scj[k] = scj[k+1];
-					}
-				}
 
-				sd->num_closure--;
-				j--;
+			if(!(sci->type == scj->type && sci->data0 == scj->data0 && sci->data1 == scj->data1))
+				continue;
+
+			if(CLOSURE_IS_BSDF_OR_BSSRDF(sci->type)) {
+				if(sci->N != scj->N)
+					continue;
+				else if(CLOSURE_IS_BSDF_ANISOTROPIC(sci->type) && sci->T != scj->T)
+					continue;
 			}
+
+			sci->weight += scj->weight;
+			sci->sample_weight += scj->sample_weight;
+
+			int size = sd->num_closure - (j+1);
+			if(size > 0) {
+				for(int k = 0; k < size; k++) {
+					scj[k] = scj[k+1];
+				}
+			}
+
+			sd->num_closure--;
+			j--;
 		}
 	}
 }
@@ -495,8 +490,6 @@ ccl_device void shader_merge_closures(ShaderData *sd)
 
 /* BSDF */
 
-#ifdef __MULTI_CLOSURE__
-
 ccl_device_inline void _shader_bsdf_multi_eval(KernelGlobals *kg, const ShaderData *sd, const float3 omega_in, float *pdf,
 	int skip_bsdf, BsdfEval *result_eval, float sum_pdf, float sum_sample_weight)
 {
@@ -524,28 +517,18 @@ ccl_device_inline void _shader_bsdf_multi_eval(KernelGlobals *kg, const ShaderDa
 	*pdf = (sum_sample_weight > 0.0f)? sum_pdf/sum_sample_weight: 0.0f;
 }
 
-#endif
-
 ccl_device void shader_bsdf_eval(KernelGlobals *kg, const ShaderData *sd,
 	const float3 omega_in, BsdfEval *eval, float *pdf)
 {
-#ifdef __MULTI_CLOSURE__
 	bsdf_eval_init(eval, NBUILTIN_CLOSURES, make_float3(0.0f, 0.0f, 0.0f), kernel_data.film.use_light_pass);
 
 	_shader_bsdf_multi_eval(kg, sd, omega_in, pdf, -1, eval, 0.0f, 0.0f);
-#else
-	const ShaderClosure *sc = &sd->closure;
-
-	*pdf = 0.0f;
-	*eval = bsdf_eval(kg, sd, sc, omega_in, pdf)*sc->weight;
-#endif
 }
 
 ccl_device int shader_bsdf_sample(KernelGlobals *kg, const ShaderData *sd,
 	float randu, float randv, BsdfEval *bsdf_eval,
 	float3 *omega_in, differential3 *domega_in, float *pdf)
 {
-#ifdef __MULTI_CLOSURE__
 	int sampled = 0;
 
 	if(sd->num_closure > 1) {
@@ -596,13 +579,6 @@ ccl_device int shader_bsdf_sample(KernelGlobals *kg, const ShaderData *sd,
 	}
 
 	return label;
-#else
-	/* sample the single closure that we picked */
-	*pdf = 0.0f;
-	int label = bsdf_sample(kg, sd, &sd->closure, randu, randv, bsdf_eval, omega_in, domega_in, pdf);
-	*bsdf_eval *= sd->closure.weight;
-	return label;
-#endif
 }
 
 ccl_device int shader_bsdf_sample_closure(KernelGlobals *kg, const ShaderData *sd,
@@ -623,21 +599,16 @@ ccl_device int shader_bsdf_sample_closure(KernelGlobals *kg, const ShaderData *s
 
 ccl_device void shader_bsdf_blur(KernelGlobals *kg, ShaderData *sd, float roughness)
 {
-#ifdef __MULTI_CLOSURE__
 	for(int i = 0; i< sd->num_closure; i++) {
 		ShaderClosure *sc = &sd->closure[i];
 
 		if(CLOSURE_IS_BSDF(sc->type))
 			bsdf_blur(kg, sc, roughness);
 	}
-#else
-	bsdf_blur(kg, &sd->closure, roughness);
-#endif
 }
 
 ccl_device float3 shader_bsdf_transparency(KernelGlobals *kg, ShaderData *sd)
 {
-#ifdef __MULTI_CLOSURE__
 	float3 eval = make_float3(0.0f, 0.0f, 0.0f);
 
 	for(int i = 0; i< sd->num_closure; i++) {
@@ -648,12 +619,6 @@ ccl_device float3 shader_bsdf_transparency(KernelGlobals *kg, ShaderData *sd)
 	}
 
 	return eval;
-#else
-	if(sd->closure.type == CLOSURE_BSDF_TRANSPARENT_ID)
-		return sd->closure.weight;
-	else
-		return make_float3(0.0f, 0.0f, 0.0f);
-#endif
 }
 
 ccl_device float3 shader_bsdf_alpha(KernelGlobals *kg, ShaderData *sd)
@@ -668,7 +633,6 @@ ccl_device float3 shader_bsdf_alpha(KernelGlobals *kg, ShaderData *sd)
 
 ccl_device float3 shader_bsdf_diffuse(KernelGlobals *kg, ShaderData *sd)
 {
-#ifdef __MULTI_CLOSURE__
 	float3 eval = make_float3(0.0f, 0.0f, 0.0f);
 
 	for(int i = 0; i< sd->num_closure; i++) {
@@ -679,17 +643,10 @@ ccl_device float3 shader_bsdf_diffuse(KernelGlobals *kg, ShaderData *sd)
 	}
 
 	return eval;
-#else
-	if(CLOSURE_IS_BSDF_DIFFUSE(sd->closure.type))
-		return sd->closure.weight;
-	else
-		return make_float3(0.0f, 0.0f, 0.0f);
-#endif
 }
 
 ccl_device float3 shader_bsdf_glossy(KernelGlobals *kg, ShaderData *sd)
 {
-#ifdef __MULTI_CLOSURE__
 	float3 eval = make_float3(0.0f, 0.0f, 0.0f);
 
 	for(int i = 0; i< sd->num_closure; i++) {
@@ -700,17 +657,10 @@ ccl_device float3 shader_bsdf_glossy(KernelGlobals *kg, ShaderData *sd)
 	}
 
 	return eval;
-#else
-	if(CLOSURE_IS_BSDF_GLOSSY(sd->closure.type))
-		return sd->closure.weight;
-	else
-		return make_float3(0.0f, 0.0f, 0.0f);
-#endif
 }
 
 ccl_device float3 shader_bsdf_transmission(KernelGlobals *kg, ShaderData *sd)
 {
-#ifdef __MULTI_CLOSURE__
 	float3 eval = make_float3(0.0f, 0.0f, 0.0f);
 
 	for(int i = 0; i< sd->num_closure; i++) {
@@ -721,17 +671,10 @@ ccl_device float3 shader_bsdf_transmission(KernelGlobals *kg, ShaderData *sd)
 	}
 
 	return eval;
-#else
-	if(CLOSURE_IS_BSDF_TRANSMISSION(sd->closure.type))
-		return sd->closure.weight;
-	else
-		return make_float3(0.0f, 0.0f, 0.0f);
-#endif
 }
 
 ccl_device float3 shader_bsdf_subsurface(KernelGlobals *kg, ShaderData *sd)
 {
-#ifdef __MULTI_CLOSURE__
 	float3 eval = make_float3(0.0f, 0.0f, 0.0f);
 
 	for(int i = 0; i< sd->num_closure; i++) {
@@ -742,17 +685,10 @@ ccl_device float3 shader_bsdf_subsurface(KernelGlobals *kg, ShaderData *sd)
 	}
 
 	return eval;
-#else
-	if(CLOSURE_IS_BSSRDF(sd->closure.type))
-		return sd->closure.weight;
-	else
-		return make_float3(0.0f, 0.0f, 0.0f);
-#endif
 }
 
 ccl_device float3 shader_bsdf_ao(KernelGlobals *kg, ShaderData *sd, float ao_factor, float3 *N_)
 {
-#ifdef __MULTI_CLOSURE__
 	float3 eval = make_float3(0.0f, 0.0f, 0.0f);
 	float3 N = make_float3(0.0f, 0.0f, 0.0f);
 
@@ -776,21 +712,10 @@ ccl_device float3 shader_bsdf_ao(KernelGlobals *kg, ShaderData *sd, float ao_fac
 
 	*N_ = N;
 	return eval;
-#else
-	*N_ = sd->N;
-
-	if(CLOSURE_IS_BSDF_DIFFUSE(sd->closure.type))
-		return sd->closure.weight*ao_factor;
-	else if(CLOSURE_IS_AMBIENT_OCCLUSION(sd->closure.type))
-		return sd->closure.weight;
-	else
-		return make_float3(0.0f, 0.0f, 0.0f);
-#endif
 }
 
 ccl_device float3 shader_bssrdf_sum(ShaderData *sd, float3 *N_, float *texture_blur_)
 {
-#ifdef __MULTI_CLOSURE__
 	float3 eval = make_float3(0.0f, 0.0f, 0.0f);
 	float3 N = make_float3(0.0f, 0.0f, 0.0f);
 	float texture_blur = 0.0f, weight_sum = 0.0f;
@@ -815,20 +740,6 @@ ccl_device float3 shader_bssrdf_sum(ShaderData *sd, float3 *N_, float *texture_b
 		*texture_blur_ = texture_blur/weight_sum;
 	
 	return eval;
-#else
-	if(CLOSURE_IS_BSSRDF(sd->closure.type)) {
-		if(N_) *N_ = sd->closure.N;
-		if(texture_blur_) *texture_blur_ = sd->closure.data1;
-
-		return sd->closure.weight;
-	}
-	else {
-		if(N_) *N_ = sd->N;
-		if(texture_blur_) *texture_blur_ = 0.0f;
-
-		return make_float3(0.0f, 0.0f, 0.0f);
-	}
-#endif
 }
 
 /* Emission */
@@ -841,7 +752,6 @@ ccl_device float3 emissive_eval(KernelGlobals *kg, ShaderData *sd, ShaderClosure
 ccl_device float3 shader_emissive_eval(KernelGlobals *kg, ShaderData *sd)
 {
 	float3 eval;
-#ifdef __MULTI_CLOSURE__
 	eval = make_float3(0.0f, 0.0f, 0.0f);
 
 	for(int i = 0; i < sd->num_closure; i++) {
@@ -850,9 +760,6 @@ ccl_device float3 shader_emissive_eval(KernelGlobals *kg, ShaderData *sd)
 		if(CLOSURE_IS_EMISSION(sc->type))
 			eval += emissive_eval(kg, sd, sc)*sc->weight;
 	}
-#else
-	eval = emissive_eval(kg, sd, &sd->closure)*sd->closure.weight;
-#endif
 
 	return eval;
 }
@@ -861,7 +768,6 @@ ccl_device float3 shader_emissive_eval(KernelGlobals *kg, ShaderData *sd)
 
 ccl_device float3 shader_holdout_eval(KernelGlobals *kg, ShaderData *sd)
 {
-#ifdef __MULTI_CLOSURE__
 	float3 weight = make_float3(0.0f, 0.0f, 0.0f);
 
 	for(int i = 0; i < sd->num_closure; i++) {
@@ -872,12 +778,6 @@ ccl_device float3 shader_holdout_eval(KernelGlobals *kg, ShaderData *sd)
 	}
 
 	return weight;
-#else
-	if(sd->closure.type == CLOSURE_HOLDOUT_ID)
-		return make_float3(1.0f, 1.0f, 1.0f);
-
-	return make_float3(0.0f, 0.0f, 0.0f);
-#endif
 }
 
 /* Surface Evaluation */
@@ -885,12 +785,8 @@ ccl_device float3 shader_holdout_eval(KernelGlobals *kg, ShaderData *sd)
 ccl_device void shader_eval_surface(KernelGlobals *kg, ShaderData *sd,
 	float randb, int path_flag, ShaderContext ctx)
 {
-#ifdef __MULTI_CLOSURE__
 	sd->num_closure = 0;
 	sd->randb_closure = randb;
-#else
-	sd->closure.type = NBUILTIN_CLOSURES;
-#endif
 
 #ifdef __OSL__
 	if(kg->osl)
@@ -899,7 +795,7 @@ ccl_device void shader_eval_surface(KernelGlobals *kg, ShaderData *sd,
 #endif
 	{
 #ifdef __SVM__
-		svm_eval_nodes(kg, sd, SHADER_TYPE_SURFACE, randb, path_flag);
+		svm_eval_nodes(kg, sd, SHADER_TYPE_SURFACE, path_flag);
 #else
 		sd->closure.weight = make_float3(0.8f, 0.8f, 0.8f);
 		sd->closure.N = sd->N;
@@ -912,12 +808,8 @@ ccl_device void shader_eval_surface(KernelGlobals *kg, ShaderData *sd,
 
 ccl_device float3 shader_eval_background(KernelGlobals *kg, ShaderData *sd, int path_flag, ShaderContext ctx)
 {
-#ifdef __MULTI_CLOSURE__
 	sd->num_closure = 0;
 	sd->randb_closure = 0.0f;
-#else
-	sd->closure.type = NBUILTIN_CLOSURES;
-#endif
 
 #ifdef __OSL__
 	if(kg->osl) {
@@ -928,9 +820,8 @@ ccl_device float3 shader_eval_background(KernelGlobals *kg, ShaderData *sd, int
 
 	{
 #ifdef __SVM__
-		svm_eval_nodes(kg, sd, SHADER_TYPE_SURFACE, 0.0f, path_flag);
+		svm_eval_nodes(kg, sd, SHADER_TYPE_SURFACE, path_flag);
 
-#ifdef __MULTI_CLOSURE__
 		float3 eval = make_float3(0.0f, 0.0f, 0.0f);
 
 		for(int i = 0; i< sd->num_closure; i++) {
@@ -942,13 +833,6 @@ ccl_device float3 shader_eval_background(KernelGlobals *kg, ShaderData *sd, int
 
 		return eval;
 #else
-		if(sd->closure.type == CLOSURE_BACKGROUND_ID)
-			return sd->closure.weight;
-		else
-			return make_float3(0.0f, 0.0f, 0.0f);
-#endif
-
-#else
 		return make_float3(0.8f, 0.8f, 0.8f);
 #endif
 	}
@@ -1067,14 +951,10 @@ ccl_device void shader_eval_volume(KernelGlobals *kg, ShaderData *sd,
 {
 	/* reset closures once at the start, we will be accumulating the closures
 	 * for all volumes in the stack into a single array of closures */
-#ifdef __MULTI_CLOSURE__
 	sd->num_closure = 0;
-#else
-	sd->closure.type = NBUILTIN_CLOSURES;
-#endif
 	sd->flag = 0;
 
-	for(int i = 0; stack[i].shader != SHADER_NO_ID; i++) {
+	for(int i = 0; stack[i].shader != SHADER_NONE; i++) {
 		/* setup shaderdata from stack. it's mostly setup already in
 		 * shader_setup_from_volume, this switching should be quick */
 		sd->object = stack[i].object;
@@ -1083,7 +963,7 @@ ccl_device void shader_eval_volume(KernelGlobals *kg, ShaderData *sd,
 		sd->flag &= ~(SD_SHADER_FLAGS|SD_OBJECT_FLAGS);
 		sd->flag |= kernel_tex_fetch(__shader_flag, (sd->shader & SHADER_MASK)*2);
 
-		if(sd->object != ~0) {
+		if(sd->object != OBJECT_NONE) {
 			sd->flag |= kernel_tex_fetch(__object_flag, sd->object);
 
 #ifdef __OBJECT_MOTION__
@@ -1102,7 +982,7 @@ ccl_device void shader_eval_volume(KernelGlobals *kg, ShaderData *sd,
 		else
 #endif
 		{
-			svm_eval_nodes(kg, sd, SHADER_TYPE_VOLUME, 0.0f, path_flag);
+			svm_eval_nodes(kg, sd, SHADER_TYPE_VOLUME, path_flag);
 		}
 #endif
 
@@ -1118,12 +998,8 @@ ccl_device void shader_eval_volume(KernelGlobals *kg, ShaderData *sd,
 
 ccl_device void shader_eval_displacement(KernelGlobals *kg, ShaderData *sd, ShaderContext ctx)
 {
-#ifdef __MULTI_CLOSURE__
 	sd->num_closure = 0;
 	sd->randb_closure = 0.0f;
-#else
-	sd->closure.type = NBUILTIN_CLOSURES;
-#endif
 
 	/* this will modify sd->P */
 #ifdef __SVM__
@@ -1133,7 +1009,7 @@ ccl_device void shader_eval_displacement(KernelGlobals *kg, ShaderData *sd, Shad
 	else
 #endif
 	{
-		svm_eval_nodes(kg, sd, SHADER_TYPE_DISPLACEMENT, 0.0f, 0);
+		svm_eval_nodes(kg, sd, SHADER_TYPE_DISPLACEMENT, 0);
 	}
 #endif
 }
@@ -1147,7 +1023,7 @@ ccl_device bool shader_transparent_shadow(KernelGlobals *kg, Intersection *isect
 	int shader = 0;
 
 #ifdef __HAIR__
-	if(kernel_tex_fetch(__prim_segment, isect->prim) == ~0) {
+	if(kernel_tex_fetch(__prim_type, isect->prim) & PRIMITIVE_ALL_TRIANGLE) {
 #endif
 		float4 Ns = kernel_tex_fetch(__tri_normal, prim);
 		shader = __float_as_int(Ns.w);
diff --git a/intern/cycles/kernel/kernel_shadow.h b/intern/cycles/kernel/kernel_shadow.h
index 9b015c98c40..ab7524c411a 100644
--- a/intern/cycles/kernel/kernel_shadow.h
+++ b/intern/cycles/kernel/kernel_shadow.h
@@ -16,6 +16,178 @@
 
 CCL_NAMESPACE_BEGIN
 
+#ifdef __SHADOW_RECORD_ALL__
+
+/* Shadow function to compute how much light is blocked, CPU variation.
+ *
+ * We trace a single ray. If it hits any opaque surface, or more than a given
+ * number of transparent surfaces is hit, then we consider the geometry to be
+ * entirely blocked. If not, all transparent surfaces will be recorded and we
+ * will shade them one by one to determine how much light is blocked. This all
+ * happens in one scene intersection function.
+ *
+ * Recording all hits works well in some cases but may be slower in others. If
+ * we have many semi-transparent hairs, one intersection may be faster because
+ * you'd be reintersecting the same hairs a lot with each step otherwise. If
+ * however there is mostly binary transparency then we may be recording many
+ * unnecessary intersections when one of the first surfaces blocks all light.
+ *
+ * From tests in real scenes it seems the performance loss is either minimal,
+ * or there is a performance increase anyway due to avoiding the need to send
+ * two rays with transparent shadows.
+ *
+ * This is CPU only because of qsort, and malloc or high stack space usage to
+ * record all these intersections. */
+
+ccl_device_noinline int shadow_intersections_compare(const void *a, const void *b)
+{
+	const Intersection *isect_a = (const Intersection*)a;
+	const Intersection *isect_b = (const Intersection*)b;
+
+	if(isect_a->t < isect_b->t)
+		return -1;
+	else if(isect_a->t > isect_b->t)
+		return 1;
+	else
+		return 0;
+}
+
+#define STACK_MAX_HITS 64
+
+ccl_device_inline bool shadow_blocked(KernelGlobals *kg, PathState *state, Ray *ray, float3 *shadow)
+{
+	*shadow = make_float3(1.0f, 1.0f, 1.0f);
+
+	if(ray->t == 0.0f)
+		return false;
+	
+	bool blocked;
+
+	if(kernel_data.integrator.transparent_shadows) {
+		/* intersect to find an opaque surface, or record all transparent surface hits */
+		Intersection hits_stack[STACK_MAX_HITS];
+		Intersection *hits;
+		uint max_hits = kernel_data.integrator.transparent_max_bounce - state->transparent_bounce - 1;
+
+		/* prefer to use stack but use dynamic allocation if max hits is too
+		 * deep; we need max_hits + 1 storage space due to the logic in
+		 * scene_intersect_shadow_all which will first store and then check if
+		 * the limit is exceeded */
+		if(max_hits + 1 <= STACK_MAX_HITS)
+			hits = hits_stack;
+		else
+			hits = (Intersection*)malloc(sizeof(Intersection)*(max_hits + 1));
+
+		uint num_hits;
+		blocked = scene_intersect_shadow_all(kg, ray, hits, max_hits, &num_hits);
+
+		/* if no opaque surface found but we did find transparent hits, shade them */
+		if(!blocked && num_hits > 0) {
+			float3 throughput = make_float3(1.0f, 1.0f, 1.0f);
+			float3 Pend = ray->P + ray->D*ray->t;
+			float last_t = 0.0f;
+			int bounce = state->transparent_bounce;
+			Intersection *isect = hits;
+#ifdef __VOLUME__
+			PathState ps = *state;
+#endif
+
+			qsort(hits, num_hits, sizeof(Intersection), shadow_intersections_compare);
+
+			for(int hit = 0; hit < num_hits; hit++, isect++) {
+				/* adjust intersection distance for moving ray forward */
+				float new_t = isect->t;
+				isect->t -= last_t;
+
+				/* skip hit if we did not move forward, step by step raytracing
+				 * would have skipped it as well then */
+				if(last_t == new_t)
+					continue;
+
+				last_t = new_t;
+
+#ifdef __VOLUME__
+				/* attenuation between last surface and next surface */
+				if(ps.volume_stack[0].shader != SHADER_NONE) {
+					Ray segment_ray = *ray;
+					segment_ray.t = isect->t;
+					kernel_volume_shadow(kg, &ps, &segment_ray, &throughput);
+				}
+#endif
+
+				/* setup shader data at surface */
+				ShaderData sd;
+				shader_setup_from_ray(kg, &sd, isect, ray, state->bounce+1, bounce);
+
+				/* attenuation from transparent surface */
+				if(!(sd.flag & SD_HAS_ONLY_VOLUME)) {
+					shader_eval_surface(kg, &sd, 0.0f, PATH_RAY_SHADOW, SHADER_CONTEXT_SHADOW);
+					throughput *= shader_bsdf_transparency(kg, &sd);
+				}
+
+				/* stop if all light is blocked */
+				if(is_zero(throughput)) {
+					/* free dynamic storage */
+					if(hits != hits_stack)
+						free(hits);
+					return true;
+				}
+
+				/* move ray forward */
+				ray->P = sd.P;
+				if(ray->t != FLT_MAX)
+					ray->D = normalize_len(Pend - ray->P, &ray->t);
+
+#ifdef __VOLUME__
+				/* exit/enter volume */
+				kernel_volume_stack_enter_exit(kg, &sd, ps.volume_stack);
+#endif
+
+				bounce++;
+			}
+
+#ifdef __VOLUME__
+			/* attenuation for last line segment towards light */
+			if(ps.volume_stack[0].shader != SHADER_NONE)
+				kernel_volume_shadow(kg, &ps, ray, &throughput);
+#endif
+
+			*shadow *= throughput;
+		}
+
+		/* free dynamic storage */
+		if(hits != hits_stack)
+			free(hits);
+	}
+	else {
+		Intersection isect;
+#ifdef __HAIR__
+		blocked = scene_intersect(kg, ray, PATH_RAY_SHADOW_OPAQUE, &isect, NULL, 0.0f, 0.0f);
+#else
+		blocked = scene_intersect(kg, ray, PATH_RAY_SHADOW_OPAQUE, &isect);
+#endif
+	}
+
+#ifdef __VOLUME__
+	if(!blocked && state->volume_stack[0].shader != SHADER_NONE) {
+		/* apply attenuation from current volume shader */
+		kernel_volume_shadow(kg, state, ray, shadow);
+	}
+#endif
+
+	return blocked;
+}
+
+#else
+
+/* Shadow function to compute how much light is blocked, GPU variation.
+ *
+ * Here we raytrace from one transparent surface to the next step by step.
+ * To minimize overhead in cases where we don't need transparent shadows, we
+ * first trace a regular shadow ray. We check if the hit primitive was
+ * potentially transparent, and only in that case start marching. This gives
+ * one extra ray cast for the cases where we do want transparency. */
+
 ccl_device_inline bool shadow_blocked(KernelGlobals *kg, PathState *state, Ray *ray, float3 *shadow)
 {
 	*shadow = make_float3(1.0f, 1.0f, 1.0f);
@@ -25,21 +197,13 @@ ccl_device_inline bool shadow_blocked(KernelGlobals *kg, PathState *state, Ray *
 
 	Intersection isect;
 #ifdef __HAIR__
-	bool result = scene_intersect(kg, ray, PATH_RAY_SHADOW_OPAQUE, &isect, NULL, 0.0f, 0.0f);
+	bool blocked = scene_intersect(kg, ray, PATH_RAY_SHADOW_OPAQUE, &isect, NULL, 0.0f, 0.0f);
 #else
-	bool result = scene_intersect(kg, ray, PATH_RAY_SHADOW_OPAQUE, &isect);
+	bool blocked = scene_intersect(kg, ray, PATH_RAY_SHADOW_OPAQUE, &isect);
 #endif
 
 #ifdef __TRANSPARENT_SHADOWS__
-	if(result && kernel_data.integrator.transparent_shadows) {
-		/* transparent shadows work in such a way to try to minimize overhead
-		 * in cases where we don't need them. after a regular shadow ray is
-		 * cast we check if the hit primitive was potentially transparent, and
-		 * only in that case start marching. this gives on extra ray cast for
-		 * the cases were we do want transparency.
-		 *
-		 * also note that for this to work correct, multi close sampling must
-		 * be used, since we don't pass a random number to shader_eval_surface */
+	if(blocked && kernel_data.integrator.transparent_shadows) {
 		if(shader_transparent_shadow(kg, &isect)) {
 			float3 throughput = make_float3(1.0f, 1.0f, 1.0f);
 			float3 Pend = ray->P + ray->D*ray->t;
@@ -49,35 +213,24 @@ ccl_device_inline bool shadow_blocked(KernelGlobals *kg, PathState *state, Ray *
 #endif
 
 			for(;;) {
-				if(bounce >= kernel_data.integrator.transparent_max_bounce) {
+				if(bounce >= kernel_data.integrator.transparent_max_bounce)
 					return true;
-				}
-				else if(bounce >= kernel_data.integrator.transparent_min_bounce) {
-					/* todo: get random number somewhere for probabilistic terminate */
-#if 0
-					float probability = average(throughput);
-					float terminate = 0.0f;
-
-					if(terminate >= probability)
-						return true;
-
-					throughput /= probability;
-#endif
-				}
 
 #ifdef __HAIR__
-				if(!scene_intersect(kg, ray, PATH_RAY_SHADOW_TRANSPARENT, &isect, NULL, 0.0f, 0.0f)) {
+				if(!scene_intersect(kg, ray, PATH_RAY_SHADOW_TRANSPARENT, &isect, NULL, 0.0f, 0.0f))
 #else
-				if(!scene_intersect(kg, ray, PATH_RAY_SHADOW_TRANSPARENT, &isect)) {
+				if(!scene_intersect(kg, ray, PATH_RAY_SHADOW_TRANSPARENT, &isect))
 #endif
+				{
 
 #ifdef __VOLUME__
 					/* attenuation for last line segment towards light */
-					if(ps.volume_stack[0].shader != SHADER_NO_ID)
+					if(ps.volume_stack[0].shader != SHADER_NONE)
 						kernel_volume_shadow(kg, &ps, ray, &throughput);
 #endif
 
 					*shadow *= throughput;
+
 					return false;
 				}
 
@@ -86,7 +239,7 @@ ccl_device_inline bool shadow_blocked(KernelGlobals *kg, PathState *state, Ray *
 
 #ifdef __VOLUME__
 				/* attenuation between last surface and next surface */
-				if(ps.volume_stack[0].shader != SHADER_NO_ID) {
+				if(ps.volume_stack[0].shader != SHADER_NONE) {
 					Ray segment_ray = *ray;
 					segment_ray.t = isect.t;
 					kernel_volume_shadow(kg, &ps, &segment_ray, &throughput);
@@ -95,7 +248,7 @@ ccl_device_inline bool shadow_blocked(KernelGlobals *kg, PathState *state, Ray *
 
 				/* setup shader data at surface */
 				ShaderData sd;
-				shader_setup_from_ray(kg, &sd, &isect, ray, state->bounce+1);
+				shader_setup_from_ray(kg, &sd, &isect, ray, state->bounce+1, bounce);
 
 				/* attenuation from transparent surface */
 				if(!(sd.flag & SD_HAS_ONLY_VOLUME)) {
@@ -103,6 +256,9 @@ ccl_device_inline bool shadow_blocked(KernelGlobals *kg, PathState *state, Ray *
 					throughput *= shader_bsdf_transparency(kg, &sd);
 				}
 
+				if(is_zero(throughput))
+					return true;
+
 				/* move ray forward */
 				ray->P = ray_offset(sd.P, -sd.Ng);
 				if(ray->t != FLT_MAX)
@@ -118,15 +274,17 @@ ccl_device_inline bool shadow_blocked(KernelGlobals *kg, PathState *state, Ray *
 		}
 	}
 #ifdef __VOLUME__
-	else if(!result && state->volume_stack[0].shader != SHADER_NO_ID) {
+	else if(!blocked && state->volume_stack[0].shader != SHADER_NONE) {
 		/* apply attenuation from current volume shader */
 		kernel_volume_shadow(kg, state, ray, shadow);
 	}
 #endif
 #endif
 
-	return result;
+	return blocked;
 }
 
+#endif
+
 CCL_NAMESPACE_END
 
diff --git a/intern/cycles/kernel/kernel_sse2.cpp b/intern/cycles/kernel/kernel_sse2.cpp
index 6a2a7804146..2d5f6091908 100644
--- a/intern/cycles/kernel/kernel_sse2.cpp
+++ b/intern/cycles/kernel/kernel_sse2.cpp
@@ -75,6 +75,6 @@ CCL_NAMESPACE_END
 
 /* needed for some linkers in combination with scons making empty compilation unit in a library */
 void __dummy_function_cycles_sse2(void);
-void __dummy_function_cycles_sse2(void){}
+void __dummy_function_cycles_sse2(void) {}
 
 #endif
diff --git a/intern/cycles/kernel/kernel_sse3.cpp b/intern/cycles/kernel/kernel_sse3.cpp
index 9d0abb93cc6..1062fd0c990 100644
--- a/intern/cycles/kernel/kernel_sse3.cpp
+++ b/intern/cycles/kernel/kernel_sse3.cpp
@@ -76,6 +76,6 @@ CCL_NAMESPACE_END
 
 /* needed for some linkers in combination with scons making empty compilation unit in a library */
 void __dummy_function_cycles_sse3(void);
-void __dummy_function_cycles_sse3(void){}
+void __dummy_function_cycles_sse3(void) {}
 
 #endif
diff --git a/intern/cycles/kernel/kernel_sse41.cpp b/intern/cycles/kernel/kernel_sse41.cpp
index bc20de0ec20..ba3b4887650 100644
--- a/intern/cycles/kernel/kernel_sse41.cpp
+++ b/intern/cycles/kernel/kernel_sse41.cpp
@@ -77,6 +77,6 @@ CCL_NAMESPACE_END
 
 /* needed for some linkers in combination with scons making empty compilation unit in a library */
 void __dummy_function_cycles_sse41(void);
-void __dummy_function_cycles_sse41(void){}
+void __dummy_function_cycles_sse41(void) {}
 
 #endif
diff --git a/intern/cycles/kernel/kernel_textures.h b/intern/cycles/kernel/kernel_textures.h
index f06fa119cfc..b07075c6c95 100644
--- a/intern/cycles/kernel/kernel_textures.h
+++ b/intern/cycles/kernel/kernel_textures.h
@@ -25,7 +25,7 @@
 /* bvh */
 KERNEL_TEX(float4, texture_float4, __bvh_nodes)
 KERNEL_TEX(float4, texture_float4, __tri_woop)
-KERNEL_TEX(uint, texture_uint, __prim_segment)
+KERNEL_TEX(uint, texture_uint, __prim_type)
 KERNEL_TEX(uint, texture_uint, __prim_visibility)
 KERNEL_TEX(uint, texture_uint, __prim_index)
 KERNEL_TEX(uint, texture_uint, __prim_object)
@@ -174,6 +174,61 @@ KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_097)
 KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_098)
 KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_099)
 
+/* Kepler and above */
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 300)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_100)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_101)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_102)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_103)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_104)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_105)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_106)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_107)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_108)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_109)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_110)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_111)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_112)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_113)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_114)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_115)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_116)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_117)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_118)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_119)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_120)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_121)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_122)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_123)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_124)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_125)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_126)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_127)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_128)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_129)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_130)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_131)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_132)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_133)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_134)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_135)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_136)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_137)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_138)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_139)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_140)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_141)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_142)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_143)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_144)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_145)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_146)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_147)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_148)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_149)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_150)
+#endif
+
 /* packed image (opencl) */
 KERNEL_TEX(uchar4, texture_uchar4, __tex_image_packed)
 KERNEL_TEX(uint4, texture_uint4, __tex_image_packed_info)
diff --git a/intern/cycles/kernel/kernel_triangle.h b/intern/cycles/kernel/kernel_triangle.h
deleted file mode 100644
index 0455df85961..00000000000
--- a/intern/cycles/kernel/kernel_triangle.h
+++ /dev/null
@@ -1,180 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License
- */
-
-CCL_NAMESPACE_BEGIN
-
-/* Point on triangle for Moller-Trumbore triangles */
-ccl_device_inline float3 triangle_point_MT(KernelGlobals *kg, int tri_index, float u, float v)
-{
-	/* load triangle vertices */
-	float3 tri_vindex = float4_to_float3(kernel_tex_fetch(__tri_vindex, tri_index));
-
-	float3 v0 = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.x)));
-	float3 v1 = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.y)));
-	float3 v2 = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.z)));
-
-	/* compute point */
-	float t = 1.0f - u - v;
-	return (u*v0 + v*v1 + t*v2);
-}
-
-/* Normal for Moller-Trumbore triangles */
-ccl_device_inline float3 triangle_normal_MT(KernelGlobals *kg, int tri_index, int *shader)
-{
-#if 0
-	/* load triangle vertices */
-	float3 tri_vindex = float4_to_float3(kernel_tex_fetch(__tri_vindex, tri_index));
-
-	float3 v0 = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.x)));
-	float3 v1 = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.y)));
-	float3 v2 = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.z)));
-
-	/* compute normal */
-	return normalize(cross(v2 - v0, v1 - v0));
-#else
-	float4 Nm = kernel_tex_fetch(__tri_normal, tri_index);
-	*shader = __float_as_int(Nm.w);
-	return make_float3(Nm.x, Nm.y, Nm.z);
-#endif
-}
-
-/* Return 3 triangle vertex locations */
-ccl_device_inline void triangle_vertices(KernelGlobals *kg, int tri_index, float3 P[3])
-{
-	/* load triangle vertices */
-	float3 tri_vindex = float4_to_float3(kernel_tex_fetch(__tri_vindex, tri_index));
-
-	P[0] = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.x)));
-	P[1] = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.y)));
-	P[2] = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.z)));
-}
-
-ccl_device_inline float3 triangle_smooth_normal(KernelGlobals *kg, int tri_index, float u, float v)
-{
-	/* load triangle vertices */
-	float3 tri_vindex = float4_to_float3(kernel_tex_fetch(__tri_vindex, tri_index));
-
-	float3 n0 = float4_to_float3(kernel_tex_fetch(__tri_vnormal, __float_as_int(tri_vindex.x)));
-	float3 n1 = float4_to_float3(kernel_tex_fetch(__tri_vnormal, __float_as_int(tri_vindex.y)));
-	float3 n2 = float4_to_float3(kernel_tex_fetch(__tri_vnormal, __float_as_int(tri_vindex.z)));
-
-	return normalize((1.0f - u - v)*n2 + u*n0 + v*n1);
-}
-
-ccl_device_inline void triangle_dPdudv(KernelGlobals *kg, float3 *dPdu, float3 *dPdv, int tri)
-{
-	/* fetch triangle vertex coordinates */
-	float3 tri_vindex = float4_to_float3(kernel_tex_fetch(__tri_vindex, tri));
-
-	float3 p0 = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.x)));
-	float3 p1 = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.y)));
-	float3 p2 = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.z)));
-
-	/* compute derivatives of P w.r.t. uv */
-	*dPdu = (p0 - p2);
-	*dPdv = (p1 - p2);
-}
-
-/* attributes */
-
-ccl_device float triangle_attribute_float(KernelGlobals *kg, const ShaderData *sd, AttributeElement elem, int offset, float *dx, float *dy)
-{
-	if(elem == ATTR_ELEMENT_FACE) {
-		if(dx) *dx = 0.0f;
-		if(dy) *dy = 0.0f;
-
-		return kernel_tex_fetch(__attributes_float, offset + sd->prim);
-	}
-	else if(elem == ATTR_ELEMENT_VERTEX) {
-		float3 tri_vindex = float4_to_float3(kernel_tex_fetch(__tri_vindex, sd->prim));
-
-		float f0 = kernel_tex_fetch(__attributes_float, offset + __float_as_int(tri_vindex.x));
-		float f1 = kernel_tex_fetch(__attributes_float, offset + __float_as_int(tri_vindex.y));
-		float f2 = kernel_tex_fetch(__attributes_float, offset + __float_as_int(tri_vindex.z));
-
-#ifdef __RAY_DIFFERENTIALS__
-		if(dx) *dx = sd->du.dx*f0 + sd->dv.dx*f1 - (sd->du.dx + sd->dv.dx)*f2;
-		if(dy) *dy = sd->du.dy*f0 + sd->dv.dy*f1 - (sd->du.dy + sd->dv.dy)*f2;
-#endif
-
-		return sd->u*f0 + sd->v*f1 + (1.0f - sd->u - sd->v)*f2;
-	}
-	else if(elem == ATTR_ELEMENT_CORNER) {
-		int tri = offset + sd->prim*3;
-		float f0 = kernel_tex_fetch(__attributes_float, tri + 0);
-		float f1 = kernel_tex_fetch(__attributes_float, tri + 1);
-		float f2 = kernel_tex_fetch(__attributes_float, tri + 2);
-
-#ifdef __RAY_DIFFERENTIALS__
-		if(dx) *dx = sd->du.dx*f0 + sd->dv.dx*f1 - (sd->du.dx + sd->dv.dx)*f2;
-		if(dy) *dy = sd->du.dy*f0 + sd->dv.dy*f1 - (sd->du.dy + sd->dv.dy)*f2;
-#endif
-
-		return sd->u*f0 + sd->v*f1 + (1.0f - sd->u - sd->v)*f2;
-	}
-	else {
-		if(dx) *dx = 0.0f;
-		if(dy) *dy = 0.0f;
-
-		return 0.0f;
-	}
-}
-
-ccl_device float3 triangle_attribute_float3(KernelGlobals *kg, const ShaderData *sd, AttributeElement elem, int offset, float3 *dx, float3 *dy)
-{
-	if(elem == ATTR_ELEMENT_FACE) {
-		if(dx) *dx = make_float3(0.0f, 0.0f, 0.0f);
-		if(dy) *dy = make_float3(0.0f, 0.0f, 0.0f);
-
-		return float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + sd->prim));
-	}
-	else if(elem == ATTR_ELEMENT_VERTEX) {
-		float3 tri_vindex = float4_to_float3(kernel_tex_fetch(__tri_vindex, sd->prim));
-
-		float3 f0 = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + __float_as_int(tri_vindex.x)));
-		float3 f1 = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + __float_as_int(tri_vindex.y)));
-		float3 f2 = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + __float_as_int(tri_vindex.z)));
-
-#ifdef __RAY_DIFFERENTIALS__
-		if(dx) *dx = sd->du.dx*f0 + sd->dv.dx*f1 - (sd->du.dx + sd->dv.dx)*f2;
-		if(dy) *dy = sd->du.dy*f0 + sd->dv.dy*f1 - (sd->du.dy + sd->dv.dy)*f2;
-#endif
-
-		return sd->u*f0 + sd->v*f1 + (1.0f - sd->u - sd->v)*f2;
-	}
-	else if(elem == ATTR_ELEMENT_CORNER) {
-		int tri = offset + sd->prim*3;
-		float3 f0 = float4_to_float3(kernel_tex_fetch(__attributes_float3, tri + 0));
-		float3 f1 = float4_to_float3(kernel_tex_fetch(__attributes_float3, tri + 1));
-		float3 f2 = float4_to_float3(kernel_tex_fetch(__attributes_float3, tri + 2));
-
-#ifdef __RAY_DIFFERENTIALS__
-		if(dx) *dx = sd->du.dx*f0 + sd->dv.dx*f1 - (sd->du.dx + sd->dv.dx)*f2;
-		if(dy) *dy = sd->du.dy*f0 + sd->dv.dy*f1 - (sd->du.dy + sd->dv.dy)*f2;
-#endif
-
-		return sd->u*f0 + sd->v*f1 + (1.0f - sd->u - sd->v)*f2;
-	}
-	else {
-		if(dx) *dx = make_float3(0.0f, 0.0f, 0.0f);
-		if(dy) *dy = make_float3(0.0f, 0.0f, 0.0f);
-
-		return make_float3(0.0f, 0.0f, 0.0f);
-	}
-}
-
-CCL_NAMESPACE_END
-
diff --git a/intern/cycles/kernel/kernel_types.h b/intern/cycles/kernel/kernel_types.h
index 5ee25a6cb98..11445aa1c93 100644
--- a/intern/cycles/kernel/kernel_types.h
+++ b/intern/cycles/kernel/kernel_types.h
@@ -46,7 +46,10 @@ CCL_NAMESPACE_BEGIN
 
 #define TEX_NUM_FLOAT_IMAGES	5
 
-#define SHADER_NO_ID			-1
+#define SHADER_NONE				(~0)
+#define OBJECT_NONE				(~0)
+#define PRIM_NONE				(~0)
+#define LAMP_NONE				(~0)
 
 #define VOLUME_STACK_SIZE		16
 
@@ -61,13 +64,17 @@ CCL_NAMESPACE_BEGIN
 #define __SUBSURFACE__
 #define __CMJ__
 #define __VOLUME__
+#define __SHADOW_RECORD_ALL__
 #endif
 
 #ifdef __KERNEL_CUDA__
 #define __KERNEL_SHADING__
 #define __KERNEL_ADV_SHADING__
 #define __BRANCHED_PATH__
+
+/* Experimental on GPU */
 //#define __VOLUME__
+//#define __SUBSURFACE__
 #endif
 
 #ifdef __KERNEL_OPENCL__
@@ -85,26 +92,24 @@ CCL_NAMESPACE_BEGIN
 #endif
 
 #ifdef __KERNEL_OPENCL_AMD__
-#define __SVM__
-#define __EMISSION__
-#define __IMAGE_TEXTURES__
-#define __PROCEDURAL_TEXTURES__
-#define __EXTRA_NODES__
-#define __HOLDOUT__
-#define __NORMAL_MAP__
-//#define __BACKGROUND_MIS__
-//#define __LAMP_MIS__
-//#define __AO__
-//#define __ANISOTROPIC__
+#define __CL_USE_NATIVE__
+#define __KERNEL_SHADING__
+//__KERNEL_ADV_SHADING__
+#define __MULTI_CLOSURE__
+#define __TRANSPARENT_SHADOWS__
+#define __PASSES__
+#define __BACKGROUND_MIS__
+#define __LAMP_MIS__
+#define __AO__
+#define __ANISOTROPIC__
 //#define __CAMERA_MOTION__
 //#define __OBJECT_MOTION__
 //#define __HAIR__
-//#define __MULTI_CLOSURE__
-//#define __TRANSPARENT_SHADOWS__
-//#define __PASSES__
+//end __KERNEL_ADV_SHADING__
 #endif
 
 #ifdef __KERNEL_OPENCL_INTEL_CPU__
+#define __CL_USE_NATIVE__
 #define __KERNEL_SHADING__
 #define __KERNEL_ADV_SHADING__
 #endif
@@ -147,12 +152,6 @@ CCL_NAMESPACE_BEGIN
 #define __HAIR__
 #endif
 
-/* Sanity check */
-
-#if defined(__KERNEL_OPENCL_NEED_ADVANCED_SHADING__) && !defined(__MULTI_CLOSURE__)
-#error "OpenCL: mismatch between advanced shading flags in device_opencl.cpp and kernel_types.h"
-#endif
-
 /* Random Numbers */
 
 typedef uint RNG;
@@ -161,7 +160,35 @@ typedef uint RNG;
 
 typedef enum ShaderEvalType {
 	SHADER_EVAL_DISPLACE,
-	SHADER_EVAL_BACKGROUND
+	SHADER_EVAL_BACKGROUND,
+	/* bake types */
+	SHADER_EVAL_BAKE, /* no real shade, it's used in the code to
+	                   * differentiate the type of shader eval from the above
+	                   */
+	/* data passes */
+	SHADER_EVAL_NORMAL,
+	SHADER_EVAL_UV,
+	SHADER_EVAL_DIFFUSE_COLOR,
+	SHADER_EVAL_GLOSSY_COLOR,
+	SHADER_EVAL_TRANSMISSION_COLOR,
+	SHADER_EVAL_SUBSURFACE_COLOR,
+	SHADER_EVAL_EMISSION,
+
+	/* light passes */
+	SHADER_EVAL_AO,
+	SHADER_EVAL_COMBINED,
+	SHADER_EVAL_SHADOW,
+	SHADER_EVAL_DIFFUSE_DIRECT,
+	SHADER_EVAL_GLOSSY_DIRECT,
+	SHADER_EVAL_TRANSMISSION_DIRECT,
+	SHADER_EVAL_SUBSURFACE_DIRECT,
+	SHADER_EVAL_DIFFUSE_INDIRECT,
+	SHADER_EVAL_GLOSSY_INDIRECT,
+	SHADER_EVAL_TRANSMISSION_INDIRECT,
+	SHADER_EVAL_SUBSURFACE_INDIRECT,
+
+	/* extra */
+	SHADER_EVAL_ENVIRONMENT,
 } ShaderEvalType;
 
 /* Path Tracing
@@ -177,10 +204,8 @@ enum PathTraceDimension {
 	PRNG_UNUSED_0 = 5,
 	PRNG_UNUSED_1 = 6,	/* for some reason (6, 7) is a bad sobol pattern */
 	PRNG_UNUSED_2 = 7,  /* with a low number of samples (< 64) */
-	PRNG_BASE_NUM = 8,
-#else
-	PRNG_BASE_NUM = 4,
 #endif
+	PRNG_BASE_NUM = 8,
 
 	PRNG_BSDF_U = 0,
 	PRNG_BSDF_V = 1,
@@ -188,7 +213,7 @@ enum PathTraceDimension {
 	PRNG_LIGHT = 3,
 	PRNG_LIGHT_U = 4,
 	PRNG_LIGHT_V = 5,
-	PRNG_LIGHT_F = 6,
+	PRNG_UNUSED_3 = 6,
 	PRNG_TERMINATE = 7,
 
 #ifdef __VOLUME__
@@ -220,7 +245,6 @@ enum PathRayFlag {
 	PATH_RAY_GLOSSY = 16,
 	PATH_RAY_SINGULAR = 32,
 	PATH_RAY_TRANSPARENT = 64,
-	PATH_RAY_VOLUME_SCATTER = 128,
 
 	PATH_RAY_SHADOW_OPAQUE = 128,
 	PATH_RAY_SHADOW_TRANSPARENT = 256,
@@ -228,16 +252,17 @@ enum PathRayFlag {
 
 	PATH_RAY_CURVE = 512, /* visibility flag to define curve segments*/
 
+	/* note that these can use maximum 12 bits, the others are for layers */
 	PATH_RAY_ALL_VISIBILITY = (1|2|4|8|16|32|64|128|256|512),
 
 	PATH_RAY_MIS_SKIP = 1024,
 	PATH_RAY_DIFFUSE_ANCESTOR = 2048,
 	PATH_RAY_GLOSSY_ANCESTOR = 4096,
 	PATH_RAY_BSSRDF_ANCESTOR = 8192,
-	PATH_RAY_SINGLE_PASS_DONE = 8192,
+	PATH_RAY_SINGLE_PASS_DONE = 16384,
+	PATH_RAY_VOLUME_SCATTER = 32768,
 
-	/* this gives collisions with localview bits
-	 * see: blender_util.h, grr - Campbell */
+	/* we need layer member flags to be the 20 upper bits */
 	PATH_RAY_LAYER_SHIFT = (32-20)
 };
 
@@ -282,7 +307,8 @@ typedef enum PassType {
 	PASS_MIST = 2097152,
 	PASS_SUBSURFACE_DIRECT = 4194304,
 	PASS_SUBSURFACE_INDIRECT = 8388608,
-	PASS_SUBSURFACE_COLOR = 16777216
+	PASS_SUBSURFACE_COLOR = 16777216,
+	PASS_LIGHT = 33554432, /* no real pass, used to force use_light_pass */
 } PassType;
 
 #define PASS_ALL (~0)
@@ -418,9 +444,27 @@ typedef struct Intersection {
 	float t, u, v;
 	int prim;
 	int object;
-	int segment;
+	int type;
 } Intersection;
 
+/* Primitives */
+
+typedef enum PrimitiveType {
+	PRIMITIVE_NONE = 0,
+	PRIMITIVE_TRIANGLE = 1,
+	PRIMITIVE_MOTION_TRIANGLE = 2,
+	PRIMITIVE_CURVE = 4,
+	PRIMITIVE_MOTION_CURVE = 8,
+
+	PRIMITIVE_ALL_TRIANGLE = (PRIMITIVE_TRIANGLE|PRIMITIVE_MOTION_TRIANGLE),
+	PRIMITIVE_ALL_CURVE = (PRIMITIVE_CURVE|PRIMITIVE_MOTION_CURVE),
+	PRIMITIVE_ALL_MOTION = (PRIMITIVE_MOTION_TRIANGLE|PRIMITIVE_MOTION_CURVE),
+	PRIMITIVE_ALL = (PRIMITIVE_ALL_TRIANGLE|PRIMITIVE_ALL_CURVE)
+} PrimitiveType;
+
+#define PRIMITIVE_PACK_SEGMENT(type, segment) ((segment << 16) | type)
+#define PRIMITIVE_UNPACK_SEGMENT(type) (type >> 16)
+
 /* Attributes */
 
 #define ATTR_PRIM_TYPES		2
@@ -432,9 +476,12 @@ typedef enum AttributeElement {
 	ATTR_ELEMENT_MESH,
 	ATTR_ELEMENT_FACE,
 	ATTR_ELEMENT_VERTEX,
+	ATTR_ELEMENT_VERTEX_MOTION,
 	ATTR_ELEMENT_CORNER,
 	ATTR_ELEMENT_CURVE,
-	ATTR_ELEMENT_CURVE_KEY
+	ATTR_ELEMENT_CURVE_KEY,
+	ATTR_ELEMENT_CURVE_KEY_MOTION,
+	ATTR_ELEMENT_VOXEL
 } AttributeElement;
 
 typedef enum AttributeStandard {
@@ -448,12 +495,17 @@ typedef enum AttributeStandard {
 	ATTR_STD_GENERATED_TRANSFORM,
 	ATTR_STD_POSITION_UNDEFORMED,
 	ATTR_STD_POSITION_UNDISPLACED,
-	ATTR_STD_MOTION_PRE,
-	ATTR_STD_MOTION_POST,
+	ATTR_STD_MOTION_VERTEX_POSITION,
+	ATTR_STD_MOTION_VERTEX_NORMAL,
 	ATTR_STD_PARTICLE,
 	ATTR_STD_CURVE_INTERCEPT,
 	ATTR_STD_PTEX_FACE_ID,
 	ATTR_STD_PTEX_UV,
+	ATTR_STD_VOLUME_DENSITY,
+	ATTR_STD_VOLUME_COLOR,
+	ATTR_STD_VOLUME_FLAME,
+	ATTR_STD_VOLUME_HEAT,
+	ATTR_STD_VOLUME_VELOCITY,
 	ATTR_STD_NUM,
 
 	ATTR_STD_NOT_FOUND = ~0
@@ -461,15 +513,17 @@ typedef enum AttributeStandard {
 
 /* Closure data */
 
+#ifdef __MULTI_CLOSURE__
 #define MAX_CLOSURE 64
+#else
+#define MAX_CLOSURE 1
+#endif
 
 typedef struct ShaderClosure {
 	ClosureType type;
 	float3 weight;
 
-#ifdef __MULTI_CLOSURE__
 	float sample_weight;
-#endif
 
 	float data0;
 	float data1;
@@ -561,13 +615,9 @@ typedef struct ShaderData {
 	/* primitive id if there is one, ~0 otherwise */
 	int prim;
 
-#ifdef __HAIR__
-	/* for curves, segment number in curve, ~0 for triangles */
-	int segment;
-	/* variables for minimum hair width using transparency bsdf */
-	/*float curve_transparency; */
-	/*float curve_radius; */
-#endif
+	/* combined type and curve segment for hair */
+	int type;
+
 	/* parametric coordinates
 	 * - barycentric weights for triangles */
 	float u, v;
@@ -583,6 +633,9 @@ typedef struct ShaderData {
 	/* ray bounce depth */
 	int ray_depth;
 
+	/* ray transparent depth */
+	int transparent_depth;
+
 #ifdef __RAY_DIFFERENTIALS__
 	/* differential of P. these are orthogonal to Ng, not N */
 	differential3 dP;
@@ -605,15 +658,10 @@ typedef struct ShaderData {
 	Transform ob_itfm;
 #endif
 
-#ifdef __MULTI_CLOSURE__
 	/* Closure data, we store a fixed array of closures */
 	ShaderClosure closure[MAX_CLOSURE];
 	int num_closure;
 	float randb_closure;
-#else
-	/* Closure data, with a single sampled closure for low memory usage */
-	ShaderClosure closure;
-#endif
 
 	/* ray start position, only set for backgrounds */
 	float3 ray_P;
@@ -824,25 +872,27 @@ typedef struct KernelIntegrator {
 	/* clamp */
 	float sample_clamp_direct;
 	float sample_clamp_indirect;
-	float pad1, pad2, pad3;
 
 	/* branched path */
 	int branched;
-	int aa_samples;
 	int diffuse_samples;
 	int glossy_samples;
 	int transmission_samples;
 	int ao_samples;
 	int mesh_light_samples;
 	int subsurface_samples;
-	
+	int sample_all_lights_direct;
+	int sample_all_lights_indirect;
+
 	/* mis */
 	int use_lamp_mis;
 
 	/* sampler */
 	int sampling_pattern;
+	int aa_samples;
 
 	/* volume render */
+	int volume_homogeneous_sampling;
 	int use_volumes;
 	int volume_max_steps;
 	float volume_step_size;
diff --git a/intern/cycles/kernel/kernel_volume.h b/intern/cycles/kernel/kernel_volume.h
index dc2ddf1098e..faaa68e3309 100644
--- a/intern/cycles/kernel/kernel_volume.h
+++ b/intern/cycles/kernel/kernel_volume.h
@@ -16,6 +16,8 @@
 
 CCL_NAMESPACE_BEGIN
 
+/* Events for probabilistic scattering */
+
 typedef enum VolumeIntegrateResult {
 	VOLUME_PATH_SCATTERED = 0,
 	VOLUME_PATH_ATTENUATED = 1,
@@ -92,14 +94,19 @@ ccl_device bool volume_shader_sample(KernelGlobals *kg, ShaderData *sd, PathStat
 	return true;
 }
 
-ccl_device float3 volume_color_attenuation(float3 sigma, float t)
+ccl_device float3 volume_color_transmittance(float3 sigma, float t)
 {
 	return make_float3(expf(-sigma.x * t), expf(-sigma.y * t), expf(-sigma.z * t));
 }
 
+ccl_device float kernel_volume_channel_get(float3 value, int channel)
+{
+	return (channel == 0)? value.x: ((channel == 1)? value.y: value.z);
+}
+
 ccl_device bool volume_stack_is_heterogeneous(KernelGlobals *kg, VolumeStack *stack)
 {
-	for(int i = 0; stack[i].shader != SHADER_NO_ID; i++) {
+	for(int i = 0; stack[i].shader != SHADER_NONE; i++) {
 		int shader_flag = kernel_tex_fetch(__shader_flag, (stack[i].shader & SHADER_MASK)*2);
 
 		if(shader_flag & SD_HETEROGENEOUS_VOLUME)
@@ -114,14 +121,14 @@ ccl_device bool volume_stack_is_heterogeneous(KernelGlobals *kg, VolumeStack *st
  * These functions are used to attenuate shadow rays to lights. Both absorption
  * and scattering will block light, represented by the extinction coefficient. */
 
-/* homogenous volume: assume shader evaluation at the starts gives
+/* homogeneous volume: assume shader evaluation at the start gives
  * the extinction coefficient for the entire line segment */
 ccl_device void kernel_volume_shadow_homogeneous(KernelGlobals *kg, PathState *state, Ray *ray, ShaderData *sd, float3 *throughput)
 {
 	float3 sigma_t;
 
 	if(volume_shader_extinction_sample(kg, sd, state, ray->P, &sigma_t))
-		*throughput *= volume_color_attenuation(sigma_t, ray->t);
+		*throughput *= volume_color_transmittance(sigma_t, ray->t);
 }
 
 /* heterogeneous volume: integrate stepping through the volume until we
@@ -138,34 +145,29 @@ ccl_device void kernel_volume_shadow_heterogeneous(KernelGlobals *kg, PathState
 
 	/* compute extinction at the start */
 	float t = 0.0f;
-	float3 P = ray->P;
-	float3 sigma_t;
-
-	if(!volume_shader_extinction_sample(kg, sd, state, P, &sigma_t))
-		sigma_t = make_float3(0.0f, 0.0f, 0.0f);
 
 	for(int i = 0; i < max_steps; i++) {
 		/* advance to new position */
-		float new_t = min(ray->t, t + random_jitter_offset + i * step);
-		float3 new_P = ray->P + ray->D * new_t;
-		float3 new_sigma_t;
+		float new_t = min(ray->t, (i+1) * step);
+		float dt = new_t - t;
+
+		/* use random position inside this segment to sample shader */
+		if(new_t == ray->t)
+			random_jitter_offset = lcg_step_float(&state->rng_congruential) * dt;
+
+		float3 new_P = ray->P + ray->D * (t + random_jitter_offset);
+		float3 sigma_t;
 
 		/* compute attenuation over segment */
-		if(volume_shader_extinction_sample(kg, sd, state, new_P, &new_sigma_t)) {
+		if(volume_shader_extinction_sample(kg, sd, state, new_P, &sigma_t)) {
 			/* todo: we could avoid computing expf() for each step by summing,
 			 * because exp(a)*exp(b) = exp(a+b), but we still want a quick
 			 * tp_eps check too */
-			tp *= volume_color_attenuation(0.5f*(sigma_t + new_sigma_t), new_t - t);
+			tp *= volume_color_transmittance(sigma_t, new_t - t);
 
 			/* stop if nearly all light blocked */
 			if(tp.x < tp_eps && tp.y < tp_eps && tp.z < tp_eps)
 				break;
-
-			sigma_t = new_sigma_t;
-		}
-		else {
-			/* skip empty space */
-			sigma_t = make_float3(0.0f, 0.0f, 0.0f);
 		}
 
 		/* stop if at the end of the volume */
@@ -182,7 +184,7 @@ ccl_device void kernel_volume_shadow_heterogeneous(KernelGlobals *kg, PathState
 ccl_device_noinline void kernel_volume_shadow(KernelGlobals *kg, PathState *state, Ray *ray, float3 *throughput)
 {
 	ShaderData sd;
-	shader_setup_from_volume(kg, &sd, ray, state->bounce);
+	shader_setup_from_volume(kg, &sd, ray, state->bounce, state->transparent_bounce);
 
 	if(volume_stack_is_heterogeneous(kg, state->volume_stack))
 		kernel_volume_shadow_heterogeneous(kg, state, ray, &sd, throughput);
@@ -190,9 +192,123 @@ ccl_device_noinline void kernel_volume_shadow(KernelGlobals *kg, PathState *stat
 		kernel_volume_shadow_homogeneous(kg, state, ray, &sd, throughput);
 }
 
+/* Equi-angular sampling as in:
+ * "Importance Sampling Techniques for Path Tracing in Participating Media" */
+
+ccl_device float kernel_volume_equiangular_sample(Ray *ray, float3 light_P, float xi, float *pdf)
+{
+	float t = ray->t;
+
+	float delta = dot((light_P - ray->P) , ray->D);
+	float D = sqrtf(len_squared(light_P - ray->P) - delta * delta);
+	float theta_a = -atan2f(delta, D);
+	float theta_b = atan2f(t - delta, D);
+	float t_ = D * tanf((xi * theta_b) + (1 - xi) * theta_a);
+
+	*pdf = D / ((theta_b - theta_a) * (D * D + t_ * t_));
+
+	return min(t, delta + t_); /* min is only for float precision errors */
+}
+
+ccl_device float kernel_volume_equiangular_pdf(Ray *ray, float3 light_P, float sample_t)
+{
+	float delta = dot((light_P - ray->P) , ray->D);
+	float D = sqrtf(len_squared(light_P - ray->P) - delta * delta);
+
+	float t = ray->t;
+	float t_ = sample_t - delta;
+
+	float theta_a = -atan2f(delta, D);
+	float theta_b = atan2f(t - delta, D);
+
+	float pdf = D / ((theta_b - theta_a) * (D * D + t_ * t_));
+
+	return pdf;
+}
+
+ccl_device bool kernel_volume_equiangular_light_position(KernelGlobals *kg, PathState *state, Ray *ray, RNG *rng, float3 *light_P)
+{
+	/* light RNGs */
+	float light_t = path_state_rng_1D(kg, rng, state, PRNG_LIGHT);
+	float light_u, light_v;
+	path_state_rng_2D(kg, rng, state, PRNG_LIGHT_U, &light_u, &light_v);
+
+	/* light sample */
+	LightSample ls;
+	light_sample(kg, light_t, light_u, light_v, ray->time, ray->P, &ls);
+	if(ls.pdf == 0.0f)
+		return false;
+	
+	*light_P = ls.P;
+	return true;
+}
+
+ccl_device float kernel_volume_decoupled_equiangular_pdf(KernelGlobals *kg, PathState *state, Ray *ray, RNG *rng, float sample_t)
+{
+	float3 light_P;
+
+	if(!kernel_volume_equiangular_light_position(kg, state, ray, rng, &light_P))
+		return 0.0f;
+
+	return kernel_volume_equiangular_pdf(ray, light_P, sample_t);
+}
+
+/* Distance sampling */
+
+ccl_device float kernel_volume_distance_sample(float max_t, float3 sigma_t, int channel, float xi, float3 *transmittance, float3 *pdf)
+{
+	/* xi is [0, 1[ so log(0) should never happen, division by zero is
+	 * avoided because sample_sigma_t > 0 when SD_SCATTER is set */
+	float sample_sigma_t = kernel_volume_channel_get(sigma_t, channel);
+	float3 full_transmittance = volume_color_transmittance(sigma_t, max_t);
+	float sample_transmittance = kernel_volume_channel_get(full_transmittance, channel);
+
+	float sample_t = min(max_t, -logf(1.0f - xi*(1.0f - sample_transmittance))/sample_sigma_t);
+
+	*transmittance = volume_color_transmittance(sigma_t, sample_t);
+	*pdf = (sigma_t * *transmittance)/(make_float3(1.0f, 1.0f, 1.0f) - full_transmittance);
+
+	/* todo: optimization: when taken together with hit/miss decision,
+	 * the full_transmittance cancels out and xi does not
+	 * need to be remapped */
+
+	return sample_t;
+}
+
+ccl_device float3 kernel_volume_distance_pdf(float max_t, float3 sigma_t, float sample_t)
+{
+	float3 full_transmittance = volume_color_transmittance(sigma_t, max_t);
+	float3 transmittance = volume_color_transmittance(sigma_t, sample_t);
+
+	return (sigma_t * transmittance)/(make_float3(1.0f, 1.0f, 1.0f) - full_transmittance);
+}
+
+/* Emission */
+
+ccl_device float3 kernel_volume_emission_integrate(VolumeShaderCoefficients *coeff, int closure_flag, float3 transmittance, float t)
+{
+	/* integral E * exp(-sigma_t * t) from 0 to t = E * (1 - exp(-sigma_t * t))/sigma_t
+	 * this goes to E * t as sigma_t goes to zero
+	 *
+	 * todo: we should use an epsilon to avoid precision issues near zero sigma_t */
+	float3 emission = coeff->emission;
+
+	if(closure_flag & SD_ABSORPTION) {
+		float3 sigma_t = coeff->sigma_a + coeff->sigma_s;
+
+		emission.x *= (sigma_t.x > 0.0f)? (1.0f - transmittance.x)/sigma_t.x: t;
+		emission.y *= (sigma_t.y > 0.0f)? (1.0f - transmittance.y)/sigma_t.y: t;
+		emission.z *= (sigma_t.z > 0.0f)? (1.0f - transmittance.z)/sigma_t.z: t;
+	}
+	else
+		emission *= t;
+	
+	return emission;
+}
+
 /* Volume Path */
 
-/* homogenous volume: assume shader evaluation at the starts gives
+/* homogeneous volume: assume shader evaluation at the start gives
  * the volume shading coefficient for the entire line segment */
 ccl_device VolumeIntegrateResult kernel_volume_integrate_homogeneous(KernelGlobals *kg,
 	PathState *state, Ray *ray, ShaderData *sd, PathRadiance *L, float3 *throughput,
@@ -206,69 +322,73 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_homogeneous(KernelGloba
 	int closure_flag = sd->flag;
 	float t = ray->t;
 	float3 new_tp;
-	float3 transmittance;
 
 	/* randomly scatter, and if we do t is shortened */
 	if(closure_flag & SD_SCATTER) {
+		/* extinction coefficient */
 		float3 sigma_t = coeff.sigma_a + coeff.sigma_s;
 
-		/* set up variables for sampling */
+		/* pick random color channel, we use the Veach one-sample
+		 * model with balance heuristic for the channels */
 		float rphase = path_state_rng_1D(kg, rng, state, PRNG_PHASE);
 		int channel = (int)(rphase*3.0f);
 		sd->randb_closure = rphase*3.0f - channel;
 
-		/* pick random color channel, we use the Veach one-sample
-		 * model with balance heuristic for the channels */
-		float sample_sigma_t;
+		float xi = path_state_rng_1D(kg, rng, state, PRNG_SCATTER_DISTANCE);
 
-		if(channel == 0)
-			sample_sigma_t = sigma_t.x;
-		else if(channel == 1)
-			sample_sigma_t = sigma_t.y;
-		else
-			sample_sigma_t = sigma_t.z;
+		/* decide if we will hit or miss */
+		float sample_sigma_t = kernel_volume_channel_get(sigma_t, channel);
+		float sample_transmittance = expf(-sample_sigma_t * t);
 
-		/* xi is [0, 1[ so log(0) should never happen, division by zero is
-		 * avoided because sample_sigma_t > 0 when SD_SCATTER is set */
-		float xi = path_state_rng_1D(kg, rng, state, PRNG_SCATTER_DISTANCE);
-		float sample_t = min(t, -logf(1.0f - xi)/sample_sigma_t);
+		if(xi >= sample_transmittance) {
+			/* scattering */
+			float3 pdf;
+			float3 transmittance;
+			float sample_t;
 
-		transmittance = volume_color_attenuation(sigma_t, sample_t);
+			/* rescale random number so we can reuse it */
+			xi = (xi - sample_transmittance)/(1.0f - sample_transmittance);
 
-		if(sample_t < t) {
-			float pdf = dot(sigma_t, transmittance);
-			new_tp = *throughput * coeff.sigma_s * transmittance * (3.0f / pdf);
+			if(kernel_data.integrator.volume_homogeneous_sampling == 0 || !kernel_data.integrator.num_all_lights) { 
+				/* distance sampling */
+				sample_t = kernel_volume_distance_sample(ray->t, sigma_t, channel, xi, &transmittance, &pdf);
+			}
+			else {
+				/* equiangular sampling */
+				float3 light_P;
+				float equi_pdf;
+				if(!kernel_volume_equiangular_light_position(kg, state, ray, rng, &light_P))
+					return VOLUME_PATH_MISSED;
+
+				sample_t = kernel_volume_equiangular_sample(ray, light_P, xi, &equi_pdf);
+				transmittance = volume_color_transmittance(sigma_t, sample_t);
+				pdf = make_float3(equi_pdf, equi_pdf, equi_pdf);
+			}
+
+			/* modify pdf for hit/miss decision */
+			pdf *= make_float3(1.0f, 1.0f, 1.0f) - volume_color_transmittance(sigma_t, t);
+
+			new_tp = *throughput * coeff.sigma_s * transmittance / average(pdf);
 			t = sample_t;
 		}
 		else {
-			float pdf = (transmittance.x + transmittance.y + transmittance.z);
-			new_tp = *throughput * transmittance * (3.0f / pdf);
+			/* no scattering */
+			float3 transmittance = volume_color_transmittance(sigma_t, t);
+			float pdf = average(transmittance);
+			new_tp = *throughput * transmittance / pdf;
 		}
 	}
 	else if(closure_flag & SD_ABSORPTION) {
 		/* absorption only, no sampling needed */
-		transmittance = volume_color_attenuation(coeff.sigma_a, t);
+		float3 transmittance = volume_color_transmittance(coeff.sigma_a, t);
 		new_tp = *throughput * transmittance;
 	}
 
-	/* integrate emission attenuated by extinction
-	 * integral E * exp(-sigma_t * t) from 0 to t = E * (1 - exp(-sigma_t * t))/sigma_t
-	 * this goes to E * t as sigma_t goes to zero
-	 *
-	 * todo: we should use an epsilon to avoid precision issues near zero sigma_t */
+	/* integrate emission attenuated by extinction */
 	if(closure_flag & SD_EMISSION) {
-		float3 emission = coeff.emission;
-
-		if(closure_flag & SD_ABSORPTION) {
-			float3 sigma_t = coeff.sigma_a + coeff.sigma_s;
-
-			emission.x *= (sigma_t.x > 0.0f)? (1.0f - transmittance.x)/sigma_t.x: t;
-			emission.y *= (sigma_t.y > 0.0f)? (1.0f - transmittance.y)/sigma_t.y: t;
-			emission.z *= (sigma_t.z > 0.0f)? (1.0f - transmittance.z)/sigma_t.z: t;
-		}
-		else
-			emission *= t;
-
+		float3 sigma_t = coeff.sigma_a + coeff.sigma_s;
+		float3 transmittance = volume_color_transmittance(sigma_t, ray->t);
+		float3 emission = kernel_volume_emission_integrate(&coeff, closure_flag, transmittance, ray->t);
 		path_radiance_accum_emission(L, *throughput, emission, state->bounce);
 	}
 
@@ -293,45 +413,38 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_homogeneous(KernelGloba
 ccl_device VolumeIntegrateResult kernel_volume_integrate_heterogeneous(KernelGlobals *kg,
 	PathState *state, Ray *ray, ShaderData *sd, PathRadiance *L, float3 *throughput, RNG *rng)
 {
-	VolumeShaderCoefficients coeff;
 	float3 tp = *throughput;
 	const float tp_eps = 1e-10f; /* todo: this is likely not the right value */
 
 	/* prepare for stepping */
 	int max_steps = kernel_data.integrator.volume_max_steps;
-	float step = kernel_data.integrator.volume_step_size;
-	float random_jitter_offset = lcg_step_float(&state->rng_congruential) * step;
+	float step_size = kernel_data.integrator.volume_step_size;
+	float random_jitter_offset = lcg_step_float(&state->rng_congruential) * step_size;
 
 	/* compute coefficients at the start */
 	float t = 0.0f;
-	float3 P = ray->P;
-
-	if(!volume_shader_sample(kg, sd, state, P, &coeff)) {
-		coeff.sigma_a = make_float3(0.0f, 0.0f, 0.0f);
-		coeff.sigma_s = make_float3(0.0f, 0.0f, 0.0f);
-		coeff.emission = make_float3(0.0f, 0.0f, 0.0f);
-	}
-
-	/* accumulate these values so we can use a single stratified number to sample */
 	float3 accum_transmittance = make_float3(1.0f, 1.0f, 1.0f);
-	float3 accum_sigma_t = make_float3(0.0f, 0.0f, 0.0f);
-	float3 accum_sigma_s = make_float3(0.0f, 0.0f, 0.0f);
 
 	/* cache some constant variables */
-	float nlogxi;
+	float xi;
 	int channel = -1;
 	bool has_scatter = false;
 
 	for(int i = 0; i < max_steps; i++) {
 		/* advance to new position */
-		float new_t = min(ray->t, t + random_jitter_offset + i * step);
-		float3 new_P = ray->P + ray->D * new_t;
-		VolumeShaderCoefficients new_coeff;
+		float new_t = min(ray->t, (i+1) * step_size);
+		float dt = new_t - t;
+
+		/* use random position inside this segment to sample shader */
+		if(new_t == ray->t)
+			random_jitter_offset = lcg_step_float(&state->rng_congruential) * dt;
+
+		float3 new_P = ray->P + ray->D * (t + random_jitter_offset);
+		VolumeShaderCoefficients coeff;
 
 		/* compute segment */
-		if(volume_shader_sample(kg, sd, state, new_P, &new_coeff)) {
+		if(volume_shader_sample(kg, sd, state, new_P, &coeff)) {
 			int closure_flag = sd->flag;
-			float dt = new_t - t;
 			float3 new_tp;
 			float3 transmittance;
 			bool scatter = false;
@@ -341,94 +454,58 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_heterogeneous(KernelGlo
 				has_scatter = true;
 
 				/* average sigma_t and sigma_s over segment */
-				float3 last_sigma_t = coeff.sigma_a + coeff.sigma_s;
-				float3 new_sigma_t = new_coeff.sigma_a + new_coeff.sigma_s;
-				float3 sigma_t = 0.5f*(last_sigma_t + new_sigma_t);
-				float3 sigma_s = 0.5f*(coeff.sigma_s + new_coeff.sigma_s);
+				float3 sigma_t = coeff.sigma_a + coeff.sigma_s;
+				float3 sigma_s = coeff.sigma_s;
 
 				/* lazily set up variables for sampling */
 				if(channel == -1) {
-					float xi = path_state_rng_1D(kg, rng, state, PRNG_SCATTER_DISTANCE);
-					nlogxi = -logf(1.0f - xi);
+					/* pick random color channel, we use the Veach one-sample
+					 * model with balance heuristic for the channels */
+					xi = path_state_rng_1D(kg, rng, state, PRNG_SCATTER_DISTANCE);
 
 					float rphase = path_state_rng_1D(kg, rng, state, PRNG_PHASE);
 					channel = (int)(rphase*3.0f);
 					sd->randb_closure = rphase*3.0f - channel;
 				}
 
-				/* pick random color channel, we use the Veach one-sample
-				 * model with balance heuristic for the channels */
-				float sample_sigma_t;
+				/* compute transmittance over full step */
+				transmittance = volume_color_transmittance(sigma_t, dt);
 
-				if(channel == 0)
-					sample_sigma_t = accum_sigma_t.x + dt*sigma_t.x;
-				else if(channel == 1)
-					sample_sigma_t = accum_sigma_t.y + dt*sigma_t.y;
-				else
-					sample_sigma_t = accum_sigma_t.z + dt*sigma_t.z;
+				/* decide if we will scatter or continue */
+				float sample_transmittance = kernel_volume_channel_get(transmittance, channel);
 
-				if(nlogxi < sample_sigma_t) {
+				if(1.0f - xi >= sample_transmittance) {
 					/* compute sampling distance */
-					sample_sigma_t /= new_t;
-					new_t = nlogxi/sample_sigma_t;
-					dt = new_t - t;
-
-					transmittance = volume_color_attenuation(sigma_t, dt);
-
-					accum_transmittance *= transmittance;
-					accum_sigma_t = (accum_sigma_t + dt*sigma_t)/new_t;
-					accum_sigma_s = (accum_sigma_s + dt*sigma_s)/new_t;
-
-					/* todo: it's not clear to me that this is correct if we move
-					 * through a color volumed, needs verification */
-					float pdf = dot(accum_sigma_t, accum_transmittance);
-					new_tp = tp * accum_sigma_s * transmittance * (3.0f / pdf);
-
+					float sample_sigma_t = kernel_volume_channel_get(sigma_t, channel);
+					float new_dt = -logf(1.0f - xi)/sample_sigma_t;
+					new_t = t + new_dt;
+
+					/* transmittance, throughput */
+					float3 new_transmittance = volume_color_transmittance(sigma_t, new_dt);
+					float pdf = average(sigma_t * new_transmittance);
+					new_tp = tp * sigma_s * new_transmittance / pdf;
 					scatter = true;
 				}
 				else {
-					transmittance = volume_color_attenuation(sigma_t, dt);
-
-					accum_transmittance *= transmittance;
-					accum_sigma_t += dt*sigma_t;
-					accum_sigma_s += dt*sigma_s;
+					/* throughput */
+					float pdf = average(transmittance);
+					new_tp = tp * transmittance / pdf;
 
-					new_tp = tp * transmittance;
+					/* remap xi so we can reuse it and keep things stratified */
+					xi = 1.0f - (1.0f - xi)/sample_transmittance;
 				}
 			}
 			else if(closure_flag & SD_ABSORPTION) {
 				/* absorption only, no sampling needed */
-				float3 sigma_a = 0.5f*(coeff.sigma_a + new_coeff.sigma_a);
-				transmittance = volume_color_attenuation(sigma_a, dt);
-
-				accum_transmittance *= transmittance;
-				accum_sigma_t += dt*sigma_a;
+				float3 sigma_a = coeff.sigma_a;
 
+				transmittance = volume_color_transmittance(sigma_a, dt);
 				new_tp = tp * transmittance;
-
-				/* todo: we could avoid computing expf() for each step by summing,
-				 * because exp(a)*exp(b) = exp(a+b), but we still want a quick
-				 * tp_eps check too */
 			}
 
-			/* integrate emission attenuated by absorption 
-			 * integral E * exp(-sigma_t * t) from 0 to t = E * (1 - exp(-sigma_t * t))/sigma_t
-			 * this goes to E * t as sigma_t goes to zero
-			 *
-			 * todo: we should use an epsilon to avoid precision issues near zero sigma_t */
+			/* integrate emission attenuated by absorption */
 			if(closure_flag & SD_EMISSION) {
-				float3 emission = 0.5f*(coeff.emission + new_coeff.emission);
-
-				if(closure_flag & SD_ABSORPTION) {
-					float3 sigma_t = 0.5f*(coeff.sigma_a + coeff.sigma_s + new_coeff.sigma_a + new_coeff.sigma_s);
-
-					emission.x *= (sigma_t.x > 0.0f)? (1.0f - transmittance.x)/sigma_t.x: dt;
-					emission.y *= (sigma_t.y > 0.0f)? (1.0f - transmittance.y)/sigma_t.y: dt;
-					emission.z *= (sigma_t.z > 0.0f)? (1.0f - transmittance.z)/sigma_t.z: dt;
-				}
-				else
-					emission *= dt;
-
+				float3 emission = kernel_volume_emission_integrate(&coeff, closure_flag, transmittance, dt);
 				path_radiance_accum_emission(L, tp, emission, state->bounce);
 			}
 
@@ -450,47 +527,323 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_heterogeneous(KernelGlo
 
 					return VOLUME_PATH_SCATTERED;
 				}
+				else {
+					/* accumulate transmittance */
+					accum_transmittance *= transmittance;
+				}
 			}
+		}
+
+		/* stop if at the end of the volume */
+		t = new_t;
+		if(t == ray->t)
+			break;
+	}
+
+	*throughput = tp;
 
-			coeff = new_coeff;
+	return VOLUME_PATH_ATTENUATED;
+}
+
+/* Decoupled Volume Sampling
+ *
+ * VolumeSegment is a list of coefficients and transmittance stored at all steps
+ * through a volume. This can then later be used for decoupled sampling as in:
+ * "Importance Sampling Techniques for Path Tracing in Participating Media" */
+
+/* CPU only because of malloc/free */
+#ifdef __KERNEL_CPU__
+
+typedef struct VolumeStep {
+	float3 sigma_s;				/* scatter coefficient */
+	float3 sigma_t;				/* extinction coefficient */
+	float3 accum_transmittance;	/* accumulated transmittance including this step */
+	float3 cdf_distance;		/* cumulative density function for distance sampling */
+	float t;					/* distance at end of this step */
+	float shade_t;				/* jittered distance where shading was done in step */
+	int closure_flag;			/* shader evaluation closure flags */
+} VolumeStep;
+
+typedef struct VolumeSegment {
+	VolumeStep *steps;			/* recorded steps */
+	int numsteps;				/* number of steps */
+	int closure_flag;			/* accumulated closure flags from all steps */
+
+	float3 accum_emission;		/* accumulated emission at end of segment */
+	float3 accum_transmittance;	/* accumulated transmittance at end of segment */
+} VolumeSegment;
+
+/* record volume steps to the end of the volume.
+ *
+ * it would be nice if we could only record up to the point that we need to scatter,
+ * but the entire segment is needed to always scatter, rather than probabilistically
+ * hitting or missing the volume. if we don't know the transmittance at the end of the
+ * volume we can't generate stratified distance samples up to that transmittance */
+ccl_device void kernel_volume_decoupled_record(KernelGlobals *kg, PathState *state,
+	Ray *ray, ShaderData *sd, VolumeSegment *segment, bool heterogeneous)
+{
+	/* prepare for volume stepping */
+	int max_steps;
+	float step_size, random_jitter_offset;
+
+	if(heterogeneous) {
+		max_steps = kernel_data.integrator.volume_max_steps;
+		step_size = kernel_data.integrator.volume_step_size;
+		random_jitter_offset = lcg_step_float(&state->rng_congruential) * step_size;
+
+		/* compute exact steps in advance for malloc */
+		max_steps = max((int)ceilf(ray->t/step_size), 1);
+	}
+	else {
+		max_steps = 1;
+		step_size = ray->t;
+		random_jitter_offset = 0.0f;
+	}
+	
+	/* init accumulation variables */
+	float3 accum_emission = make_float3(0.0f, 0.0f, 0.0f);
+	float3 accum_transmittance = make_float3(1.0f, 1.0f, 1.0f);
+	float3 cdf_distance = make_float3(0.0f, 0.0f, 0.0f);
+	float t = 0.0f;
+
+	segment->closure_flag = 0;
+	segment->numsteps = 0;
+	segment->steps = (VolumeStep*)malloc(sizeof(VolumeStep)*max_steps);
+
+	VolumeStep *step = segment->steps;
+
+	for(int i = 0; i < max_steps; i++, step++) {
+		/* advance to new position */
+		float new_t = min(ray->t, (i+1) * step_size);
+		float dt = new_t - t;
+
+		/* use random position inside this segment to sample shader */
+		if(heterogeneous && new_t == ray->t)
+			random_jitter_offset = lcg_step_float(&state->rng_congruential) * dt;
+
+		float3 new_P = ray->P + ray->D * (t + random_jitter_offset);
+		VolumeShaderCoefficients coeff;
+
+		/* compute segment */
+		if(volume_shader_sample(kg, sd, state, new_P, &coeff)) {
+			int closure_flag = sd->flag;
+			float3 sigma_t = coeff.sigma_a + coeff.sigma_s;
+
+			/* compute accumulated transmittance */
+			float3 transmittance = volume_color_transmittance(sigma_t, dt);
+
+			/* compute emission attenuated by absorption */
+			if(closure_flag & SD_EMISSION) {
+				float3 emission = kernel_volume_emission_integrate(&coeff, closure_flag, transmittance, dt);
+				accum_emission += accum_transmittance * emission;
+			}
+
+			accum_transmittance *= transmittance;
+
+			/* compute pdf for distance sampling */
+			float3 pdf_distance = dt * accum_transmittance * coeff.sigma_s;
+			cdf_distance = cdf_distance + pdf_distance;
+
+			/* write step data */
+			step->sigma_t = sigma_t;
+			step->sigma_s = coeff.sigma_s;
+			step->closure_flag = closure_flag;
+
+			segment->closure_flag |= closure_flag;
 		}
 		else {
-			/* skip empty space */
-			coeff.sigma_a = make_float3(0.0f, 0.0f, 0.0f);
-			coeff.sigma_s = make_float3(0.0f, 0.0f, 0.0f);
-			coeff.emission = make_float3(0.0f, 0.0f, 0.0f);
+			/* store empty step (todo: skip consecutive empty steps) */
+			step->sigma_t = make_float3(0.0f, 0.0f, 0.0f);
+			step->sigma_s = make_float3(0.0f, 0.0f, 0.0f);
+			step->closure_flag = 0;
 		}
 
+		step->accum_transmittance = accum_transmittance;
+		step->cdf_distance = cdf_distance;
+		step->t = new_t;
+		step->shade_t = t + random_jitter_offset;
+
+		segment->numsteps++;
+
 		/* stop if at the end of the volume */
 		t = new_t;
 		if(t == ray->t)
 			break;
 	}
 
-	/* include pdf for volumes with scattering */
-	if(has_scatter) {
-		float pdf = (accum_transmittance.x + accum_transmittance.y + accum_transmittance.z);
-		if(pdf > 0.0f)
-			tp *= (3.0f/pdf);
+	/* store total emission and transmittance */
+	segment->accum_emission = accum_emission;
+	segment->accum_transmittance = accum_transmittance;
+
+	/* normalize cumulative density function for distance sampling */
+	VolumeStep *last_step = segment->steps + segment->numsteps - 1;
+
+	if(!is_zero(last_step->cdf_distance)) {
+		VolumeStep *step = &segment->steps[0];
+		int numsteps = segment->numsteps;
+		float3 inv_cdf_distance_sum = safe_invert_color(last_step->cdf_distance);
+
+		for(int i = 0; i < numsteps; i++, step++)
+			step->cdf_distance *= inv_cdf_distance_sum;
+	}
+}
+
+ccl_device void kernel_volume_decoupled_free(KernelGlobals *kg, VolumeSegment *segment)
+{
+	free(segment->steps);
+}
+
+/* scattering for homogeneous and heterogeneous volumes, using decoupled ray
+ * marching. unlike the non-decoupled functions, these do not do probabilistic
+ * scattering, they always scatter if there is any non-zero scattering
+ * coefficient.
+ *
+ * these also do not do emission or modify throughput. */
+ccl_device VolumeIntegrateResult kernel_volume_decoupled_scatter(
+	KernelGlobals *kg, PathState *state, Ray *ray, ShaderData *sd,
+	float3 *throughput, RNG *rng, VolumeSegment *segment)
+{
+	int closure_flag = segment->closure_flag;
+
+	if(!(closure_flag & SD_SCATTER))
+		return VOLUME_PATH_MISSED;
+
+	/* pick random color channel, we use the Veach one-sample
+	 * model with balance heuristic for the channels */
+	float rphase = path_state_rng_1D(kg, rng, state, PRNG_PHASE);
+	int channel = (int)(rphase*3.0f);
+	sd->randb_closure = rphase*3.0f - channel;
+
+	float xi = path_state_rng_1D(kg, rng, state, PRNG_SCATTER_DISTANCE);
+
+	VolumeStep *step;
+	float3 transmittance;
+	float pdf, sample_t;
+
+	/* distance sampling */
+	if(kernel_data.integrator.volume_homogeneous_sampling == 0 || !kernel_data.integrator.num_all_lights) { 
+		/* find step in cdf */
+		step = segment->steps;
+
+		float prev_t = 0.0f;
+		float3 step_pdf = make_float3(1.0f, 1.0f, 1.0f);
+
+		if(segment->numsteps > 1) {
+			float prev_cdf = 0.0f;
+			float step_cdf = 1.0f;
+			float3 prev_cdf_distance = make_float3(0.0f, 0.0f, 0.0f);
+
+			for(int i = 0; ; i++, step++) {
+				/* todo: optimize using binary search */
+				step_cdf = kernel_volume_channel_get(step->cdf_distance, channel);
+
+				if(xi < step_cdf || i == segment->numsteps-1)
+					break;
+
+				prev_cdf = step_cdf;
+				prev_t = step->t;
+				prev_cdf_distance = step->cdf_distance;
+			}
+
+			/* remap xi so we can reuse it */
+			xi = (xi - prev_cdf)/(step_cdf - prev_cdf);
+
+			/* pdf for picking step */
+			step_pdf = step->cdf_distance - prev_cdf_distance;
+		}
+
+		/* determine range in which we will sample */
+		float step_t = step->t - prev_t;
+
+		/* sample distance and compute transmittance */
+		float3 distance_pdf;
+		sample_t = prev_t + kernel_volume_distance_sample(step_t, step->sigma_t, channel, xi, &transmittance, &distance_pdf);
+		pdf = average(distance_pdf * step_pdf);
 	}
+	/* equi-angular sampling */
+	else {
+		/* pick position on light */
+		float3 light_P;
+		if(!kernel_volume_equiangular_light_position(kg, state, ray, rng, &light_P))
+			return VOLUME_PATH_MISSED;
 
-	*throughput = tp;
+		/* sample distance */
+		sample_t = kernel_volume_equiangular_sample(ray, light_P, xi, &pdf);
 
-	return VOLUME_PATH_ATTENUATED;
+		/* find step in which sampled distance is located */
+		step = segment->steps;
+
+		float prev_t = 0.0f;
+
+		if(segment->numsteps > 1) {
+			/* todo: optimize using binary search */
+			for(int i = 0; i < segment->numsteps-1; i++, step++) {
+				if(sample_t < step->t)
+					break;
+
+				prev_t = step->t;
+			}
+		}
+		
+		/* compute transmittance */
+		transmittance = volume_color_transmittance(step->sigma_t, sample_t - prev_t);
+	}
+
+	/* compute transmittance up to this step */
+	if(step != segment->steps)
+		transmittance *= (step-1)->accum_transmittance;
+
+	/* modify throughput */
+	*throughput *= step->sigma_s * transmittance / pdf;
+
+	/* evaluate shader to create closures at shading point */
+	if(segment->numsteps > 1) {
+		sd->P = ray->P + step->shade_t*ray->D;
+
+		VolumeShaderCoefficients coeff;
+		volume_shader_sample(kg, sd, state, sd->P, &coeff);
+	}
+
+	/* move to new position */
+	sd->P = ray->P + sample_t*ray->D;
+
+	return VOLUME_PATH_SCATTERED;
 }
 
+#endif
+
 /* get the volume attenuation and emission over line segment defined by
  * ray, with the assumption that there are no surfaces blocking light
  * between the endpoints */
 ccl_device_noinline VolumeIntegrateResult kernel_volume_integrate(KernelGlobals *kg,
 	PathState *state, ShaderData *sd, Ray *ray, PathRadiance *L, float3 *throughput, RNG *rng)
 {
-	shader_setup_from_volume(kg, sd, ray, state->bounce);
+	/* workaround to fix correlation bug in T38710, can find better solution
+	 * in random number generator later, for now this is done here to not impact
+	 * performance of rendering without volumes */
+	RNG tmp_rng = cmj_hash(*rng, state->rng_offset);
+	bool heterogeneous = volume_stack_is_heterogeneous(kg, state->volume_stack);
 
-	if(volume_stack_is_heterogeneous(kg, state->volume_stack))
-		return kernel_volume_integrate_heterogeneous(kg, state, ray, sd, L, throughput, rng);
+#if 0
+	/* debugging code to compare decoupled ray marching */
+	VolumeSegment segment;
+
+	shader_setup_from_volume(kg, sd, ray, state->bounce, state->transparent_bounce);
+	kernel_volume_decoupled_record(kg, state, ray, sd, &segment, heterogeneous);
+
+	VolumeIntegrateResult result = kernel_volume_decoupled_scatter(kg, state, ray, sd, throughput, &tmp_rng, &segment);
+
+	kernel_volume_decoupled_free(kg, &segment);
+
+	return result;
+#else
+	shader_setup_from_volume(kg, sd, ray, state->bounce, state->transparent_bounce);
+
+	if(heterogeneous)
+		return kernel_volume_integrate_heterogeneous(kg, state, ray, sd, L, throughput, &tmp_rng);
 	else
-		return kernel_volume_integrate_homogeneous(kg, state, ray, sd, L, throughput, rng);
+		return kernel_volume_integrate_homogeneous(kg, state, ray, sd, L, throughput, &tmp_rng);
+#endif
 }
 
 /* Volume Stack
@@ -501,13 +854,13 @@ ccl_device_noinline VolumeIntegrateResult kernel_volume_integrate(KernelGlobals
 ccl_device void kernel_volume_stack_init(KernelGlobals *kg, VolumeStack *stack)
 {
 	/* todo: this assumes camera is always in air, need to detect when it isn't */
-	if(kernel_data.background.volume_shader == SHADER_NO_ID) {
-		stack[0].shader = SHADER_NO_ID;
+	if(kernel_data.background.volume_shader == SHADER_NONE) {
+		stack[0].shader = SHADER_NONE;
 	}
 	else {
 		stack[0].shader = kernel_data.background.volume_shader;
-		stack[0].object = ~0;
-		stack[1].shader = SHADER_NO_ID;
+		stack[0].object = PRIM_NONE;
+		stack[1].shader = SHADER_NONE;
 	}
 }
 
@@ -522,14 +875,14 @@ ccl_device void kernel_volume_stack_enter_exit(KernelGlobals *kg, ShaderData *sd
 	
 	if(sd->flag & SD_BACKFACING) {
 		/* exit volume object: remove from stack */
-		for(int i = 0; stack[i].shader != SHADER_NO_ID; i++) {
+		for(int i = 0; stack[i].shader != SHADER_NONE; i++) {
 			if(stack[i].object == sd->object) {
 				/* shift back next stack entries */
 				do {
 					stack[i] = stack[i+1];
 					i++;
 				}
-				while(stack[i].shader != SHADER_NO_ID);
+				while(stack[i].shader != SHADER_NONE);
 
 				return;
 			}
@@ -539,7 +892,7 @@ ccl_device void kernel_volume_stack_enter_exit(KernelGlobals *kg, ShaderData *sd
 		/* enter volume object: add to stack */
 		int i;
 
-		for(i = 0; stack[i].shader != SHADER_NO_ID; i++) {
+		for(i = 0; stack[i].shader != SHADER_NONE; i++) {
 			/* already in the stack? then we have nothing to do */
 			if(stack[i].object == sd->object)
 				return;
@@ -552,7 +905,7 @@ ccl_device void kernel_volume_stack_enter_exit(KernelGlobals *kg, ShaderData *sd
 		/* add to the end of the stack */
 		stack[i].shader = sd->shader;
 		stack[i].object = sd->object;
-		stack[i+1].shader = SHADER_NO_ID;
+		stack[i+1].shader = SHADER_NONE;
 	}
 }
 
diff --git a/intern/cycles/kernel/osl/osl_services.cpp b/intern/cycles/kernel/osl/osl_services.cpp
index 4fad66be6e1..54894ea19eb 100644
--- a/intern/cycles/kernel/osl/osl_services.cpp
+++ b/intern/cycles/kernel/osl/osl_services.cpp
@@ -30,18 +30,16 @@
 
 #include "kernel_compat_cpu.h"
 #include "kernel_globals.h"
-#include "kernel_montecarlo.h"
+#include "kernel_random.h"
 #include "kernel_projection.h"
 #include "kernel_differential.h"
-#include "kernel_object.h"
-#include "kernel_random.h"
-#include "kernel_bvh.h"
-#include "kernel_triangle.h"
-#include "kernel_curve.h"
-#include "kernel_primitive.h"
+#include "kernel_montecarlo.h"
+#include "kernel_camera.h"
+
+#include "geom/geom.h"
+
 #include "kernel_projection.h"
 #include "kernel_accumulate.h"
-#include "kernel_camera.h"
 #include "kernel_shader.h"
 
 #ifdef WITH_PTEX
@@ -52,11 +50,16 @@ CCL_NAMESPACE_BEGIN
 
 /* RenderServices implementation */
 
-#define COPY_MATRIX44(m1, m2) memcpy(m1, m2, sizeof(*m2))
+#define COPY_MATRIX44(m1, m2)  { \
+	CHECK_TYPE(m1, OSL::Matrix44*); \
+	CHECK_TYPE(m2, Transform*); \
+	memcpy(m1, m2, sizeof(*m2)); \
+} (void)0
 
 /* static ustrings */
 ustring OSLRenderServices::u_distance("distance");
 ustring OSLRenderServices::u_index("index");
+ustring OSLRenderServices::u_world("world");
 ustring OSLRenderServices::u_camera("camera");
 ustring OSLRenderServices::u_screen("screen");
 ustring OSLRenderServices::u_raster("raster");
@@ -87,6 +90,7 @@ ustring OSLRenderServices::u_curve_tangent_normal("geom:curve_tangent_normal");
 #endif
 ustring OSLRenderServices::u_path_ray_length("path:ray_length");
 ustring OSLRenderServices::u_path_ray_depth("path:ray_depth");
+ustring OSLRenderServices::u_path_transparent_depth("path:transparent_depth");
 ustring OSLRenderServices::u_trace("trace");
 ustring OSLRenderServices::u_hit("hit");
 ustring OSLRenderServices::u_hitdist("hitdist");
@@ -131,7 +135,7 @@ bool OSLRenderServices::get_matrix(OSL::Matrix44 &result, OSL::TransformationPtr
 		KernelGlobals *kg = sd->osl_globals;
 		int object = sd->object;
 
-		if (object != ~0) {
+		if (object != OBJECT_NONE) {
 #ifdef __OBJECT_MOTION__
 			Transform tfm;
 
@@ -161,7 +165,7 @@ bool OSLRenderServices::get_inverse_matrix(OSL::Matrix44 &result, OSL::Transform
 		KernelGlobals *kg = sd->osl_globals;
 		int object = sd->object;
 
-		if (object != ~0) {
+		if (object != OBJECT_NONE) {
 #ifdef __OBJECT_MOTION__
 			Transform itfm;
 
@@ -206,6 +210,10 @@ bool OSLRenderServices::get_matrix(OSL::Matrix44 &result, ustring from, float ti
 		COPY_MATRIX44(&result, &tfm);
 		return true;
 	}
+	else if (from == u_world) {
+		result.makeIdentity();
+		return true;
+	}
 
 	return false;
 }
@@ -234,6 +242,10 @@ bool OSLRenderServices::get_inverse_matrix(OSL::Matrix44 &result, ustring to, fl
 		COPY_MATRIX44(&result, &tfm);
 		return true;
 	}
+	else if (to == u_world) {
+		result.makeIdentity();
+		return true;
+	}
 
 	return false;
 }
@@ -246,7 +258,7 @@ bool OSLRenderServices::get_matrix(OSL::Matrix44 &result, OSL::TransformationPtr
 		const ShaderData *sd = (const ShaderData *)xform;
 		int object = sd->object;
 
-		if (object != ~0) {
+		if (object != OBJECT_NONE) {
 #ifdef __OBJECT_MOTION__
 			Transform tfm = sd->ob_tfm;
 #else
@@ -271,7 +283,7 @@ bool OSLRenderServices::get_inverse_matrix(OSL::Matrix44 &result, OSL::Transform
 		const ShaderData *sd = (const ShaderData *)xform;
 		int object = sd->object;
 
-		if (object != ~0) {
+		if (object != OBJECT_NONE) {
 #ifdef __OBJECT_MOTION__
 			Transform tfm = sd->ob_itfm;
 #else
@@ -525,7 +537,8 @@ static bool get_mesh_element_attribute(KernelGlobals *kg, const ShaderData *sd,
                                const TypeDesc& type, bool derivatives, void *val)
 {
 	if (attr.type == TypeDesc::TypePoint || attr.type == TypeDesc::TypeVector ||
-	    attr.type == TypeDesc::TypeNormal || attr.type == TypeDesc::TypeColor) {
+	    attr.type == TypeDesc::TypeNormal || attr.type == TypeDesc::TypeColor)
+	{
 		float3 fval[3];
 		fval[0] = primitive_attribute_float3(kg, sd, attr.elem, attr.offset,
 		                                     (derivatives) ? &fval[1] : NULL, (derivatives) ? &fval[2] : NULL);
@@ -596,44 +609,44 @@ bool OSLRenderServices::get_object_standard_attribute(KernelGlobals *kg, ShaderD
 
 	/* Particle Attributes */
 	else if (name == u_particle_index) {
-		uint particle_id = object_particle_id(kg, sd->object);
+		int particle_id = object_particle_id(kg, sd->object);
 		float f = particle_index(kg, particle_id);
 		return set_attribute_float(f, type, derivatives, val);
 	}
 	else if (name == u_particle_age) {
-		uint particle_id = object_particle_id(kg, sd->object);
+		int particle_id = object_particle_id(kg, sd->object);
 		float f = particle_age(kg, particle_id);
 		return set_attribute_float(f, type, derivatives, val);
 	}
 	else if (name == u_particle_lifetime) {
-		uint particle_id = object_particle_id(kg, sd->object);
-		float f= particle_lifetime(kg, particle_id);
+		int particle_id = object_particle_id(kg, sd->object);
+		float f = particle_lifetime(kg, particle_id);
 		return set_attribute_float(f, type, derivatives, val);
 	}
 	else if (name == u_particle_location) {
-		uint particle_id = object_particle_id(kg, sd->object);
+		int particle_id = object_particle_id(kg, sd->object);
 		float3 f = particle_location(kg, particle_id);
 		return set_attribute_float3(f, type, derivatives, val);
 	}
 #if 0	/* unsupported */
 	else if (name == u_particle_rotation) {
-		uint particle_id = object_particle_id(kg, sd->object);
+		int particle_id = object_particle_id(kg, sd->object);
 		float4 f = particle_rotation(kg, particle_id);
 		return set_attribute_float4(f, type, derivatives, val);
 	}
 #endif
 	else if (name == u_particle_size) {
-		uint particle_id = object_particle_id(kg, sd->object);
+		int particle_id = object_particle_id(kg, sd->object);
 		float f = particle_size(kg, particle_id);
 		return set_attribute_float(f, type, derivatives, val);
 	}
 	else if (name == u_particle_velocity) {
-		uint particle_id = object_particle_id(kg, sd->object);
+		int particle_id = object_particle_id(kg, sd->object);
 		float3 f = particle_velocity(kg, particle_id);
 		return set_attribute_float3(f, type, derivatives, val);
 	}
 	else if (name == u_particle_angular_velocity) {
-		uint particle_id = object_particle_id(kg, sd->object);
+		int particle_id = object_particle_id(kg, sd->object);
 		float3 f = particle_angular_velocity(kg, particle_id);
 		return set_attribute_float3(f, type, derivatives, val);
 	}
@@ -644,12 +657,17 @@ bool OSLRenderServices::get_object_standard_attribute(KernelGlobals *kg, ShaderD
 	}
 	else if ((name == u_geom_trianglevertices || name == u_geom_polyvertices)
 #ifdef __HAIR__
-		     && sd->segment == ~0) {
+		     && sd->type & PRIMITIVE_ALL_TRIANGLE)
 #else
-		) {
+		)
 #endif
+	{
 		float3 P[3];
-		triangle_vertices(kg, sd->prim, P);
+
+		if(sd->type & PRIMITIVE_TRIANGLE)
+			triangle_vertices(kg, sd->prim, P);
+		else
+			motion_triangle_vertices(kg, sd->object, sd->prim, sd->time, P);
 
 		if(!(sd->flag & SD_TRANSFORM_APPLIED)) {
 			object_position_transform(kg, sd, &P[0]);
@@ -670,7 +688,7 @@ bool OSLRenderServices::get_object_standard_attribute(KernelGlobals *kg, ShaderD
 #ifdef __HAIR__
 	/* Hair Attributes */
 	else if (name == u_is_curve) {
-		float f = (sd->segment != ~0);
+		float f = (sd->type & PRIMITIVE_ALL_CURVE) != 0;
 		return set_attribute_float(f, type, derivatives, val);
 	}
 	else if (name == u_curve_thickness) {
@@ -699,13 +717,18 @@ bool OSLRenderServices::get_background_attribute(KernelGlobals *kg, ShaderData *
 		int f = sd->ray_depth;
 		return set_attribute_int(f, type, derivatives, val);
 	}
+	else if (name == u_path_transparent_depth) {
+		/* Transparent Depth */
+		int f = sd->transparent_depth;
+		return set_attribute_int(f, type, derivatives, val);
+	}
 	else if (name == u_ndc) {
 		/* NDC coordinates with special exception for otho */
 		OSLThreadData *tdata = kg->osl_tdata;
 		OSL::ShaderGlobals *globals = &tdata->globals;
 		float3 ndc[3];
 
-		if((globals->raytype & PATH_RAY_CAMERA) && sd->object == ~0 && kernel_data.cam.type == CAMERA_ORTHOGRAPHIC) {
+		if((globals->raytype & PATH_RAY_CAMERA) && sd->object == OBJECT_NONE && kernel_data.cam.type == CAMERA_ORTHOGRAPHIC) {
 			ndc[0] = camera_world_to_ndc(kg, sd, sd->ray_P);
 
 			if(derivatives) {
@@ -733,7 +756,9 @@ bool OSLRenderServices::get_attribute(void *renderstate, bool derivatives, ustri
 {
 	ShaderData *sd = (ShaderData *)renderstate;
 	KernelGlobals *kg = sd->osl_globals;
-	int object, prim, segment;
+	bool is_curve;
+	int object;
+	// int prim;
 
 	/* lookup of attribute on another object */
 	if (object_name != u_empty) {
@@ -743,24 +768,20 @@ bool OSLRenderServices::get_attribute(void *renderstate, bool derivatives, ustri
 			return false;
 
 		object = it->second;
-		prim = ~0;
-		segment = ~0;
+		// prim = PRIM_NONE;
+		is_curve = false;
 	}
 	else {
 		object = sd->object;
-		prim = sd->prim;
-#ifdef __HAIR__
-		segment = sd->segment;
-#else
-		segment = ~0;
-#endif
+		// prim = sd->prim;
+		is_curve = (sd->type & PRIMITIVE_ALL_CURVE) != 0;
 
-		if (object == ~0)
+		if (object == OBJECT_NONE)
 			return get_background_attribute(kg, sd, name, type, derivatives, val);
 	}
 
 	/* find attribute on object */
-	object = object*ATTR_PRIM_TYPES + (segment != ~0);
+	object = object*ATTR_PRIM_TYPES + (is_curve == true);
 	OSLGlobals::AttributeMap& attribute_map = kg->osl->attribute_map[object];
 	OSLGlobals::AttributeMap::iterator it = attribute_map.find(name);
 
@@ -769,8 +790,8 @@ bool OSLRenderServices::get_attribute(void *renderstate, bool derivatives, ustri
 
 		if (attr.elem != ATTR_ELEMENT_OBJECT) {
 			/* triangle and vertex attributes */
-			if (prim != ~0)
-				return get_mesh_element_attribute(kg, sd, attr, type, derivatives, val);
+			if(get_mesh_element_attribute(kg, sd, attr, type, derivatives, val))
+				return true;
 			else
 				return get_mesh_attribute(kg, sd, attr, type, derivatives, val);
 		}
@@ -1001,12 +1022,13 @@ bool OSLRenderServices::trace(TraceOpt &options, OSL::ShaderGlobals *sg,
 	tracedata->ray = ray;
 	tracedata->setup = false;
 	tracedata->init = true;
+	tracedata->sd.osl_globals = sd->osl_globals;
 
 	/* raytrace */
 #ifdef __HAIR__
-	return scene_intersect(sd->osl_globals, &ray, ~0, &tracedata->isect, NULL, 0.0f, 0.0f);
+	return scene_intersect(sd->osl_globals, &ray, PATH_RAY_ALL_VISIBILITY, &tracedata->isect, NULL, 0.0f, 0.0f);
 #else
-	return scene_intersect(sd->osl_globals, &ray, ~0, &tracedata->isect);
+	return scene_intersect(sd->osl_globals, &ray, PATH_RAY_ALL_VISIBILITY, &tracedata->isect);
 #endif
 }
 
@@ -1018,9 +1040,9 @@ bool OSLRenderServices::getmessage(OSL::ShaderGlobals *sg, ustring source, ustri
 
 	if(source == u_trace && tracedata->init) {
 		if(name == u_hit) {
-			return set_attribute_int((tracedata->isect.prim != ~0), type, derivatives, val);
+			return set_attribute_int((tracedata->isect.prim != PRIM_NONE), type, derivatives, val);
 		}
-		else if(tracedata->isect.prim != ~0) {
+		else if(tracedata->isect.prim != PRIM_NONE) {
 			if(name == u_hitdist) {
 				float f[3] = {tracedata->isect.t, 0.0f, 0.0f};
 				return set_attribute_float(f, type, derivatives, val);
@@ -1033,8 +1055,9 @@ bool OSLRenderServices::getmessage(OSL::ShaderGlobals *sg, ustring source, ustri
 					/* lazy shader data setup */
 					ShaderData *original_sd = (ShaderData *)(sg->renderstate);
 					int bounce = original_sd->ray_depth + 1;
+					int transparent_bounce = original_sd->transparent_depth;
 
-					shader_setup_from_ray(kg, sd, &tracedata->isect, &tracedata->ray, bounce);
+					shader_setup_from_ray(kg, sd, &tracedata->isect, &tracedata->ray, bounce, transparent_bounce);
 					tracedata->setup = true;
 				}
 
diff --git a/intern/cycles/kernel/osl/osl_services.h b/intern/cycles/kernel/osl/osl_services.h
index 479b6da1afb..069722d81b6 100644
--- a/intern/cycles/kernel/osl/osl_services.h
+++ b/intern/cycles/kernel/osl/osl_services.h
@@ -110,12 +110,13 @@ public:
 	                      ustring dataname, TypeDesc datatype, void *data);
 
 	static bool get_background_attribute(KernelGlobals *kg, ShaderData *sd, ustring name,
-			TypeDesc type, bool derivatives, void *val);
+	                                     TypeDesc type, bool derivatives, void *val);
 	static bool get_object_standard_attribute(KernelGlobals *kg, ShaderData *sd, ustring name,
-			TypeDesc type, bool derivatives, void *val);
+	                                          TypeDesc type, bool derivatives, void *val);
 
 	static ustring u_distance;
 	static ustring u_index;
+	static ustring u_world;
 	static ustring u_camera;
 	static ustring u_screen;
 	static ustring u_raster;
@@ -144,6 +145,7 @@ public:
 	static ustring u_curve_tangent_normal;
 	static ustring u_path_ray_length;
 	static ustring u_path_ray_depth;
+	static ustring u_path_transparent_depth;
 	static ustring u_trace;
 	static ustring u_hit;
 	static ustring u_hitdist;
diff --git a/intern/cycles/kernel/osl/osl_shader.cpp b/intern/cycles/kernel/osl/osl_shader.cpp
index 554f647df7c..843dcdd0985 100644
--- a/intern/cycles/kernel/osl/osl_shader.cpp
+++ b/intern/cycles/kernel/osl/osl_shader.cpp
@@ -18,7 +18,8 @@
 #include "kernel_montecarlo.h"
 #include "kernel_types.h"
 #include "kernel_globals.h"
-#include "kernel_object.h"
+
+#include "geom/geom_object.h"
 
 #include "closure/bsdf_diffuse.h"
 #include "closure/bssrdf.h"
@@ -112,7 +113,7 @@ static void shaderdata_to_shaderglobals(KernelGlobals *kg, ShaderData *sd,
 	globals->dvdy = sd->dv.dy;
 	globals->dPdu = TO_VEC3(sd->dPdu);
 	globals->dPdv = TO_VEC3(sd->dPdv);
-	globals->surfacearea = (sd->object == ~0) ? 1.0f : object_surface_area(kg, sd->object);
+	globals->surfacearea = (sd->object == OBJECT_NONE) ? 1.0f : object_surface_area(kg, sd->object);
 	globals->time = sd->time;
 
 	/* booleans */
@@ -408,8 +409,9 @@ static void flatten_volume_closure_tree(ShaderData *sd,
 					sc.data1 = volume->sc.data1;
 
 					/* add */
-					if(sc.sample_weight > CLOSURE_WEIGHT_CUTOFF &&
-					   sd->num_closure < MAX_CLOSURE) {
+					if((sc.sample_weight > CLOSURE_WEIGHT_CUTOFF) &&
+					   (sd->num_closure < MAX_CLOSURE))
+					{
 						sd->closure[sd->num_closure++] = sc;
 						sd->flag |= volume->shaderdata_flag();
 					}
@@ -535,7 +537,7 @@ int OSLShader::find_attribute(KernelGlobals *kg, const ShaderData *sd, uint id,
 	/* for OSL, a hash map is used to lookup the attribute by name. */
 	int object = sd->object*ATTR_PRIM_TYPES;
 #ifdef __HAIR__
-	if(sd->segment != ~0) object += ATTR_PRIM_CURVE;
+	if(sd->type & PRIMITIVE_ALL_CURVE) object += ATTR_PRIM_CURVE;
 #endif
 
 	OSLGlobals::AttributeMap &attr_map = kg->osl->attribute_map[object];
@@ -546,7 +548,7 @@ int OSLShader::find_attribute(KernelGlobals *kg, const ShaderData *sd, uint id,
 		const OSLGlobals::Attribute &osl_attr = it->second;
 		*elem = osl_attr.elem;
 
-		if(sd->prim == ~0 && (AttributeElement)osl_attr.elem != ATTR_ELEMENT_MESH)
+		if(sd->prim == PRIM_NONE && (AttributeElement)osl_attr.elem != ATTR_ELEMENT_MESH)
 			return ATTR_STD_NOT_FOUND;
 
 		/* return result */
diff --git a/intern/cycles/kernel/shaders/CMakeLists.txt b/intern/cycles/kernel/shaders/CMakeLists.txt
index 045abdb80af..5518d652bf9 100644
--- a/intern/cycles/kernel/shaders/CMakeLists.txt
+++ b/intern/cycles/kernel/shaders/CMakeLists.txt
@@ -77,6 +77,7 @@ set(SRC_OSL
 	node_wave_texture.osl
 	node_wireframe.osl
 	node_hair_bsdf.osl
+	node_uv_map.osl
 )
 
 set(SRC_OSL_HEADERS
diff --git a/intern/cycles/kernel/shaders/node_absorption_volume.osl b/intern/cycles/kernel/shaders/node_absorption_volume.osl
index 69c4c0ef7af..6bac83ba4f5 100644
--- a/intern/cycles/kernel/shaders/node_absorption_volume.osl
+++ b/intern/cycles/kernel/shaders/node_absorption_volume.osl
@@ -21,6 +21,6 @@ shader node_absorption_volume(
 	float Density = 1.0,
 	output closure color Volume = 0)
 {
-	Volume = ((color(1.0, 1.0, 1.0) - Color) * Density) * absorption();
+	Volume = ((color(1.0, 1.0, 1.0) - Color) * max(Density, 0.0)) * absorption();
 }
 
diff --git a/intern/cycles/kernel/shaders/node_fresnel.osl b/intern/cycles/kernel/shaders/node_fresnel.osl
index 8c59d5bb512..7ef553c0f39 100644
--- a/intern/cycles/kernel/shaders/node_fresnel.osl
+++ b/intern/cycles/kernel/shaders/node_fresnel.osl
@@ -23,7 +23,7 @@ shader node_fresnel(
 	output float Fac = 0.0)
 {
 	float f = max(IOR, 1e-5);
-	float eta = backfacing() ? 1.0 / f: f;
+	float eta = backfacing() ? 1.0 / f : f;
 	float cosi = dot(I, Normal);
 	Fac = fresnel_dielectric_cos(cosi, eta);
 }
diff --git a/intern/cycles/kernel/shaders/node_glass_bsdf.osl b/intern/cycles/kernel/shaders/node_glass_bsdf.osl
index 96934199621..b3d6133553b 100644
--- a/intern/cycles/kernel/shaders/node_glass_bsdf.osl
+++ b/intern/cycles/kernel/shaders/node_glass_bsdf.osl
@@ -26,7 +26,7 @@ shader node_glass_bsdf(
 	output closure color BSDF = 0)
 {
 	float f = max(IOR, 1e-5);
-	float eta = backfacing() ? 1.0 / f: f;
+	float eta = backfacing() ? 1.0 / f : f;
 	float cosi = dot(I, Normal);
 	float Fr = fresnel_dielectric_cos(cosi, eta);
 
diff --git a/intern/cycles/kernel/shaders/node_image_texture.osl b/intern/cycles/kernel/shaders/node_image_texture.osl
index caa755636b9..7238a1e8862 100644
--- a/intern/cycles/kernel/shaders/node_image_texture.osl
+++ b/intern/cycles/kernel/shaders/node_image_texture.osl
@@ -17,9 +17,9 @@
 #include "stdosl.h"
 #include "node_color.h"
 
-color image_texture_lookup(string filename, string color_space, float u, float v, output float Alpha, int use_alpha, int is_float)
+color image_texture_lookup(string filename, string color_space, float u, float v, output float Alpha, int use_alpha, int is_float, string interpolation)
 {
-	color rgb = (color)texture(filename, u, 1.0 - v, "wrap", "periodic", "alpha", Alpha);
+	color rgb = (color)texture(filename, u, 1.0 - v, "wrap", "periodic", "interp", interpolation, "alpha", Alpha);
 
 	if (use_alpha) {
 		rgb = color_unpremultiply(rgb, Alpha);
@@ -42,6 +42,7 @@ shader node_image_texture(
 	string filename = "",
 	string color_space = "sRGB",
 	string projection = "Flat",
+	string interpolation = "smartcubic",
 	float projection_blend = 0.0,
 	int is_float = 1,
 	int use_alpha = 1,
@@ -54,7 +55,7 @@ shader node_image_texture(
 		p = transform(mapping, p);
 	
 	if (projection == "Flat") {
-		Color = image_texture_lookup(filename, color_space, p[0], p[1], Alpha, use_alpha, is_float);
+		Color = image_texture_lookup(filename, color_space, p[0], p[1], Alpha, use_alpha, is_float, interpolation);
 	}
 	else if (projection == "Box") {
 		/* object space normal */
@@ -119,15 +120,15 @@ shader node_image_texture(
 		float tmp_alpha;
 
 		if (weight[0] > 0.0) {
-			Color += weight[0] * image_texture_lookup(filename, color_space, p[1], p[2], tmp_alpha, use_alpha, is_float);
+			Color += weight[0] * image_texture_lookup(filename, color_space, p[1], p[2], tmp_alpha, use_alpha, is_float, interpolation);
 			Alpha += weight[0] * tmp_alpha;
 		}
 		if (weight[1] > 0.0) {
-			Color += weight[1] * image_texture_lookup(filename, color_space, p[0], p[2], tmp_alpha, use_alpha, is_float);
+			Color += weight[1] * image_texture_lookup(filename, color_space, p[0], p[2], tmp_alpha, use_alpha, is_float, interpolation);
 			Alpha += weight[1] * tmp_alpha;
 		}
 		if (weight[2] > 0.0) {
-			Color += weight[2] * image_texture_lookup(filename, color_space, p[1], p[0], tmp_alpha, use_alpha, is_float);
+			Color += weight[2] * image_texture_lookup(filename, color_space, p[1], p[0], tmp_alpha, use_alpha, is_float, interpolation);
 			Alpha += weight[2] * tmp_alpha;
 		}
 	}
diff --git a/intern/cycles/kernel/shaders/node_light_path.osl b/intern/cycles/kernel/shaders/node_light_path.osl
index 599c7f5a262..95fbcabf917 100644
--- a/intern/cycles/kernel/shaders/node_light_path.osl
+++ b/intern/cycles/kernel/shaders/node_light_path.osl
@@ -26,7 +26,8 @@ shader node_light_path(
 	output float IsTransmissionRay = 0.0,
 	output float IsVolumeScatterRay = 0.0,
 	output float RayLength = 0.0,
-	output float RayDepth = 0.0)
+	output float RayDepth = 0.0,
+	output float TransparentDepth = 0.0)
 {
 	IsCameraRay = raytype("camera");
 	IsShadowRay = raytype("shadow");
@@ -42,5 +43,9 @@ shader node_light_path(
 	int ray_depth;
 	getattribute("path:ray_depth", ray_depth);
 	RayDepth = (float)ray_depth;
+
+	int transparent_depth;
+	getattribute("path:transparent_depth", transparent_depth);
+	TransparentDepth = (float)transparent_depth;
 }
 
diff --git a/intern/cycles/kernel/shaders/node_math.osl b/intern/cycles/kernel/shaders/node_math.osl
index 066e5f8dbe1..abb6a359e75 100644
--- a/intern/cycles/kernel/shaders/node_math.osl
+++ b/intern/cycles/kernel/shaders/node_math.osl
@@ -93,6 +93,8 @@ shader node_math(
 		Value = Value1 > Value2;
 	else if (type == "Modulo")
 		Value = safe_modulo(Value1, Value2);
+    else if (type == "Absolute")
+        Value = fabs(Value1);
 
 	if (Clamp)
 		Value = clamp(Value, 0.0, 1.0);
diff --git a/intern/cycles/kernel/shaders/node_mix.osl b/intern/cycles/kernel/shaders/node_mix.osl
index c2c397c6446..dd54fd814de 100644
--- a/intern/cycles/kernel/shaders/node_mix.osl
+++ b/intern/cycles/kernel/shaders/node_mix.osl
@@ -88,7 +88,7 @@ color node_mix_diff(float t, color col1, color col2)
 
 color node_mix_dark(float t, color col1, color col2)
 {
-	return min(col1, col2 * t);
+	return min(col1, col2) * t + col1 * (1.0 - t);
 }
 
 color node_mix_light(float t, color col1, color col2)
diff --git a/intern/cycles/kernel/shaders/node_refraction_bsdf.osl b/intern/cycles/kernel/shaders/node_refraction_bsdf.osl
index f87b3a5dd86..4a32415b482 100644
--- a/intern/cycles/kernel/shaders/node_refraction_bsdf.osl
+++ b/intern/cycles/kernel/shaders/node_refraction_bsdf.osl
@@ -25,7 +25,7 @@ shader node_refraction_bsdf(
 	output closure color BSDF = 0)
 {
 	float f = max(IOR, 1e-5);
-	float eta = backfacing() ? 1.0 / f: f;
+	float eta = backfacing() ? 1.0 / f : f;
 
 	if (distribution == "Sharp")
 		BSDF = Color * refraction(Normal, eta);
diff --git a/intern/cycles/kernel/shaders/node_scatter_volume.osl b/intern/cycles/kernel/shaders/node_scatter_volume.osl
index bf23abbf933..77c157bd92b 100644
--- a/intern/cycles/kernel/shaders/node_scatter_volume.osl
+++ b/intern/cycles/kernel/shaders/node_scatter_volume.osl
@@ -22,6 +22,6 @@ shader node_scatter_volume(
 	float Anisotropy = 0.0,
 	output closure color Volume = 0)
 {
-	Volume = (Color * Density) * henyey_greenstein(Anisotropy);
+	Volume = (Color * max(Density, 0.0)) * henyey_greenstein(Anisotropy);
 }
 
diff --git a/intern/cycles/kernel/shaders/node_uv_map.osl b/intern/cycles/kernel/shaders/node_uv_map.osl
new file mode 100644
index 00000000000..01c984aff4c
--- /dev/null
+++ b/intern/cycles/kernel/shaders/node_uv_map.osl
@@ -0,0 +1,45 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License
+ */
+
+#include "stdosl.h"
+
+shader node_uv_map(
+	int from_dupli = 0,
+	string name = "",
+	string bump_offset = "center",
+	output point UV = point(0.0, 0.0, 0.0))
+{
+	if (from_dupli) {
+		getattribute("geom:dupli_uv", UV);
+	}
+	else {
+		if (name == "")
+			getattribute("geom:uv", UV);
+		else
+			getattribute(name, UV);
+	}
+
+	if (bump_offset == "dx") {
+		if (!from_dupli) {
+			UV += Dx(UV);
+		}
+	}
+	else if (bump_offset == "dy") {
+		if (!from_dupli) {
+			UV += Dy(UV);
+		}
+	}
+}
diff --git a/intern/cycles/kernel/svm/svm.h b/intern/cycles/kernel/svm/svm.h
index 96c7cefbcb2..dbf59c60cb0 100644
--- a/intern/cycles/kernel/svm/svm.h
+++ b/intern/cycles/kernel/svm/svm.h
@@ -182,10 +182,9 @@ CCL_NAMESPACE_BEGIN
 
 /* Main Interpreter Loop */
 
-ccl_device_noinline void svm_eval_nodes(KernelGlobals *kg, ShaderData *sd, ShaderType type, float randb, int path_flag)
+ccl_device_noinline void svm_eval_nodes(KernelGlobals *kg, ShaderData *sd, ShaderType type, int path_flag)
 {
 	float stack[SVM_STACK_SIZE];
-	float closure_weight = 1.0f;
 	int offset = sd->shader & SHADER_MASK;
 
 	while(1) {
@@ -200,7 +199,7 @@ ccl_device_noinline void svm_eval_nodes(KernelGlobals *kg, ShaderData *sd, Shade
 				break;
 			}
 			case NODE_CLOSURE_BSDF:
-				svm_node_closure_bsdf(kg, sd, stack, node, randb, path_flag, &offset);
+				svm_node_closure_bsdf(kg, sd, stack, node, path_flag, &offset);
 				break;
 			case NODE_CLOSURE_EMISSION:
 				svm_node_closure_emission(sd, stack, node);
@@ -227,13 +226,15 @@ ccl_device_noinline void svm_eval_nodes(KernelGlobals *kg, ShaderData *sd, Shade
 				svm_node_emission_weight(kg, sd, stack, node);
 				break;
 			case NODE_MIX_CLOSURE:
-				svm_node_mix_closure(sd, stack, node, &offset, &randb);
+				svm_node_mix_closure(sd, stack, node);
 				break;
-			case NODE_ADD_CLOSURE:
-				svm_node_add_closure(sd, stack, node.y, node.z, &offset, &randb, &closure_weight);
+			case NODE_JUMP_IF_ZERO:
+				if(stack_load_float(stack, node.z) == 0.0f)
+					offset += node.y;
 				break;
-			case NODE_JUMP:
-				offset = node.y;
+			case NODE_JUMP_IF_ONE:
+				if(stack_load_float(stack, node.z) == 1.0f)
+					offset += node.y;
 				break;
 #ifdef __IMAGE_TEXTURES__
 			case NODE_TEX_IMAGE:
@@ -437,9 +438,6 @@ ccl_device_noinline void svm_eval_nodes(KernelGlobals *kg, ShaderData *sd, Shade
 #endif			
 			case NODE_END:
 			default:
-#ifndef __MULTI_CLOSURE__
-				sd->closure.weight *= closure_weight;
-#endif
 				return;
 		}
 	}
diff --git a/intern/cycles/kernel/svm/svm_attribute.h b/intern/cycles/kernel/svm/svm_attribute.h
index 4c53bfd74fa..fd0ea7fef31 100644
--- a/intern/cycles/kernel/svm/svm_attribute.h
+++ b/intern/cycles/kernel/svm/svm_attribute.h
@@ -22,12 +22,12 @@ ccl_device void svm_node_attr_init(KernelGlobals *kg, ShaderData *sd,
 	uint4 node, NodeAttributeType *type,
 	NodeAttributeType *mesh_type, AttributeElement *elem, int *offset, uint *out_offset)
 {
-	if(sd->object != ~0 && sd->prim != ~0) {
+	if(sd->object != OBJECT_NONE) {
 		/* find attribute by unique id */
 		uint id = node.y;
 		uint attr_offset = sd->object*kernel_data.bvh.attributes_map_stride;
 #ifdef __HAIR__
-		attr_offset = (sd->segment == ~0)? attr_offset: attr_offset + ATTR_PRIM_CURVE;
+		attr_offset = (sd->type & PRIMITIVE_ALL_CURVE)? attr_offset + ATTR_PRIM_CURVE: attr_offset;
 #endif
 		uint4 attr_map = kernel_tex_fetch(__attributes_map, attr_offset);
 		
diff --git a/intern/cycles/kernel/svm/svm_closure.h b/intern/cycles/kernel/svm/svm_closure.h
index 2813e38d8f7..a3770877544 100644
--- a/intern/cycles/kernel/svm/svm_closure.h
+++ b/intern/cycles/kernel/svm/svm_closure.h
@@ -51,7 +51,6 @@ ccl_device void svm_node_glass_setup(ShaderData *sd, ShaderClosure *sc, int type
 
 ccl_device_inline ShaderClosure *svm_node_closure_get_non_bsdf(ShaderData *sd, ClosureType type, float mix_weight)
 {
-#ifdef __MULTI_CLOSURE__
 	ShaderClosure *sc = &sd->closure[sd->num_closure];
 
 	if(sd->num_closure < MAX_CLOSURE) {
@@ -65,14 +64,10 @@ ccl_device_inline ShaderClosure *svm_node_closure_get_non_bsdf(ShaderData *sd, C
 	}
 
 	return NULL;
-#else
-	return &sd->closure;
-#endif
 }
 
 ccl_device_inline ShaderClosure *svm_node_closure_get_bsdf(ShaderData *sd, float mix_weight)
 {
-#ifdef __MULTI_CLOSURE__
 	ShaderClosure *sc = &sd->closure[sd->num_closure];
 	float3 weight = sc->weight * mix_weight;
 	float sample_weight = fabsf(average(weight));
@@ -88,14 +83,10 @@ ccl_device_inline ShaderClosure *svm_node_closure_get_bsdf(ShaderData *sd, float
 	}
 
 	return NULL;
-#else
-	return &sd->closure;
-#endif
 }
 
 ccl_device_inline ShaderClosure *svm_node_closure_get_absorption(ShaderData *sd, float mix_weight)
 {
-#ifdef __MULTI_CLOSURE__
 	ShaderClosure *sc = &sd->closure[sd->num_closure];
 	float3 weight = (make_float3(1.0f, 1.0f, 1.0f) - sc->weight) * mix_weight;
 	float sample_weight = fabsf(average(weight));
@@ -111,16 +102,12 @@ ccl_device_inline ShaderClosure *svm_node_closure_get_absorption(ShaderData *sd,
 	}
 
 	return NULL;
-#else
-	return &sd->closure;
-#endif
 }
 
-ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, float randb, int path_flag, int *offset)
+ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int path_flag, int *offset)
 {
 	uint type, param1_offset, param2_offset;
 
-#ifdef __MULTI_CLOSURE__
 	uint mix_weight_offset;
 	decode_node_uchar4(node.y, &type, &param1_offset, &param2_offset, &mix_weight_offset);
 	float mix_weight = (stack_valid(mix_weight_offset)? stack_load_float(stack, mix_weight_offset): 1.0f);
@@ -132,13 +119,6 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 		return;
 
 	float3 N = stack_valid(data_node.x)? stack_load_float3(stack, data_node.x): sd->N; 
-#else
-	decode_node_uchar4(node.y, &type, &param1_offset, &param2_offset, NULL);
-	float mix_weight = 1.0f;
-
-	uint4 data_node = read_node(kg, offset);
-	float3 N = stack_valid(data_node.x)? stack_load_float3(stack, data_node.x): sd->N; 
-#endif
 
 	float param1 = (stack_valid(param1_offset))? stack_load_float(stack, param1_offset): __uint_as_float(node.z);
 	float param2 = (stack_valid(param2_offset))? stack_load_float(stack, param2_offset): __uint_as_float(node.w);
@@ -255,7 +235,6 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 			float fresnel = fresnel_dielectric_cos(cosNO, eta);
 			float roughness = param1;
 
-#ifdef __MULTI_CLOSURE__
 			/* reflection */
 			ShaderClosure *sc = &sd->closure[sd->num_closure];
 			float3 weight = sc->weight;
@@ -279,15 +258,6 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 				sc->N = N;
 				svm_node_glass_setup(sd, sc, type, eta, roughness, true);
 			}
-#else
-			ShaderClosure *sc = svm_node_closure_get_bsdf(sd, mix_weight);
-
-			if(sc) {
-				sc->N = N;
-				bool refract = (randb > fresnel);
-				svm_node_glass_setup(sd, sc, type, eta, roughness, refract);
-			}
-#endif
 
 			break;
 		}
@@ -364,10 +334,16 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 		case CLOSURE_BSDF_HAIR_REFLECTION_ID:
 		case CLOSURE_BSDF_HAIR_TRANSMISSION_ID: {
 			
-			if(sd->flag & SD_BACKFACING && sd->segment != ~0) {
+			if(sd->flag & SD_BACKFACING && sd->type & PRIMITIVE_ALL_CURVE) {
 				ShaderClosure *sc = svm_node_closure_get_bsdf(sd, mix_weight);
+
 				if(sc) {
-					sc->weight = make_float3(1.0f,1.0f,1.0f);
+					/* todo: giving a fixed weight here will cause issues when
+					 * mixing multiple BSDFs. energy will not be conserved and
+					 * the throughput can blow up after multiple bounces. we
+					 * better figure out a way to skip backfaces from rays
+					 * spawned by transmission from the front */
+					sc->weight = make_float3(1.0f, 1.0f, 1.0f);
 					sc->N = N;
 					sd->flag |= bsdf_transparent_setup(sc);
 				}
@@ -381,12 +357,14 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 					sc->data0 = param1;
 					sc->data1 = param2;
 					sc->offset = -stack_load_float(stack, data_node.z);
-					if(sd->segment == ~0) {
+
+					if(!(sd->type & PRIMITIVE_ALL_CURVE)) {
 						sc->T = normalize(sd->dPdv);
 						sc->offset = 0.0f;
 					}
 					else
 						sc->T = sd->dPdu;
+
 					if(type == CLOSURE_BSDF_HAIR_REFLECTION_ID) {
 						sd->flag |= bsdf_hair_reflection_setup(sc);
 					}
@@ -484,21 +462,16 @@ ccl_device void svm_node_closure_volume(KernelGlobals *kg, ShaderData *sd, float
 #ifdef __VOLUME__
 	uint type, param1_offset, param2_offset;
 
-#ifdef __MULTI_CLOSURE__
 	uint mix_weight_offset;
 	decode_node_uchar4(node.y, &type, &param1_offset, &param2_offset, &mix_weight_offset);
 	float mix_weight = (stack_valid(mix_weight_offset)? stack_load_float(stack, mix_weight_offset): 1.0f);
 
 	if(mix_weight == 0.0f)
 		return;
-#else
-	decode_node_uchar4(node.y, &type, &param1_offset, &param2_offset, NULL);
-	float mix_weight = 1.0f;
-#endif
 
 	float param1 = (stack_valid(param1_offset))? stack_load_float(stack, param1_offset): __uint_as_float(node.z);
 	float param2 = (stack_valid(param2_offset))? stack_load_float(stack, param2_offset): __uint_as_float(node.w);
-	float density = param1;
+	float density = fmaxf(param1, 0.0f);
 
 	switch(type) {
 		case CLOSURE_VOLUME_ABSORPTION_ID: {
@@ -527,7 +500,6 @@ ccl_device void svm_node_closure_volume(KernelGlobals *kg, ShaderData *sd, float
 
 ccl_device void svm_node_closure_emission(ShaderData *sd, float *stack, uint4 node)
 {
-#ifdef __MULTI_CLOSURE__
 	uint mix_weight_offset = node.y;
 
 	if(stack_valid(mix_weight_offset)) {
@@ -540,17 +512,12 @@ ccl_device void svm_node_closure_emission(ShaderData *sd, float *stack, uint4 no
 	}
 	else
 		svm_node_closure_get_non_bsdf(sd, CLOSURE_EMISSION_ID, 1.0f);
-#else
-	ShaderClosure *sc = &sd->closure;
-	sc->type = CLOSURE_EMISSION_ID;
-#endif
 
 	sd->flag |= SD_EMISSION;
 }
 
 ccl_device void svm_node_closure_background(ShaderData *sd, float *stack, uint4 node)
 {
-#ifdef __MULTI_CLOSURE__
 	uint mix_weight_offset = node.y;
 
 	if(stack_valid(mix_weight_offset)) {
@@ -563,15 +530,10 @@ ccl_device void svm_node_closure_background(ShaderData *sd, float *stack, uint4
 	}
 	else
 		svm_node_closure_get_non_bsdf(sd, CLOSURE_BACKGROUND_ID, 1.0f);
-#else
-	ShaderClosure *sc = &sd->closure;
-	sc->type = CLOSURE_BACKGROUND_ID;
-#endif
 }
 
 ccl_device void svm_node_closure_holdout(ShaderData *sd, float *stack, uint4 node)
 {
-#ifdef __MULTI_CLOSURE__
 	uint mix_weight_offset = node.y;
 
 	if(stack_valid(mix_weight_offset)) {
@@ -584,17 +546,12 @@ ccl_device void svm_node_closure_holdout(ShaderData *sd, float *stack, uint4 nod
 	}
 	else
 		svm_node_closure_get_non_bsdf(sd, CLOSURE_HOLDOUT_ID, 1.0f);
-#else
-	ShaderClosure *sc = &sd->closure;
-	sc->type = CLOSURE_HOLDOUT_ID;
-#endif
 
 	sd->flag |= SD_HOLDOUT;
 }
 
 ccl_device void svm_node_closure_ambient_occlusion(ShaderData *sd, float *stack, uint4 node)
 {
-#ifdef __MULTI_CLOSURE__
 	uint mix_weight_offset = node.y;
 
 	if(stack_valid(mix_weight_offset)) {
@@ -607,10 +564,6 @@ ccl_device void svm_node_closure_ambient_occlusion(ShaderData *sd, float *stack,
 	}
 	else
 		svm_node_closure_get_non_bsdf(sd, CLOSURE_AMBIENT_OCCLUSION_ID, 1.0f);
-#else
-	ShaderClosure *sc = &sd->closure;
-	sc->type = CLOSURE_AMBIENT_OCCLUSION_ID;
-#endif
 
 	sd->flag |= SD_AO;
 }
@@ -619,12 +572,8 @@ ccl_device void svm_node_closure_ambient_occlusion(ShaderData *sd, float *stack,
 
 ccl_device_inline void svm_node_closure_store_weight(ShaderData *sd, float3 weight)
 {
-#ifdef __MULTI_CLOSURE__
 	if(sd->num_closure < MAX_CLOSURE)
 		sd->closure[sd->num_closure].weight = weight;
-#else
-	sd->closure.weight = weight;
-#endif
 }
 
 ccl_device void svm_node_closure_set_weight(ShaderData *sd, uint r, uint g, uint b)
@@ -637,7 +586,7 @@ ccl_device void svm_node_emission_set_weight_total(KernelGlobals *kg, ShaderData
 {
 	float3 weight = make_float3(__uint_as_float(r), __uint_as_float(g), __uint_as_float(b));
 
-	if(sd->object != ~0)
+	if(sd->object != OBJECT_NONE)
 		weight /= object_surface_area(kg, sd->object);
 
 	svm_node_closure_store_weight(sd, weight);
@@ -659,16 +608,14 @@ ccl_device void svm_node_emission_weight(KernelGlobals *kg, ShaderData *sd, floa
 	float strength = stack_load_float(stack, strength_offset);
 	float3 weight = stack_load_float3(stack, color_offset)*strength;
 
-	if(total_power && sd->object != ~0)
+	if(total_power && sd->object != OBJECT_NONE)
 		weight /= object_surface_area(kg, sd->object);
 
 	svm_node_closure_store_weight(sd, weight);
 }
 
-ccl_device void svm_node_mix_closure(ShaderData *sd, float *stack,
-	uint4 node, int *offset, float *randb)
+ccl_device void svm_node_mix_closure(ShaderData *sd, float *stack, uint4 node)
 {
-#ifdef __MULTI_CLOSURE__
 	/* fetch weight from blend input, previous mix closures,
 	 * and write to stack to be used by closure nodes later */
 	uint weight_offset, in_weight_offset, weight1_offset, weight2_offset;
@@ -683,44 +630,6 @@ ccl_device void svm_node_mix_closure(ShaderData *sd, float *stack,
 		stack_store_float(stack, weight1_offset, in_weight*(1.0f - weight));
 	if(stack_valid(weight2_offset))
 		stack_store_float(stack, weight2_offset, in_weight*weight);
-#else
-	/* pick a closure and make the random number uniform over 0..1 again.
-	 * closure 1 starts on the next node, for closure 2 the start is at an
-	 * offset from the current node, so we jump */
-	uint weight_offset = node.y;
-	uint node_jump = node.z;
-	float weight = stack_load_float(stack, weight_offset);
-	weight = clamp(weight, 0.0f, 1.0f);
-
-	if(*randb < weight) {
-		*offset += node_jump;
-		*randb = *randb/weight;
-	}
-	else
-		*randb = (*randb - weight)/(1.0f - weight);
-#endif
-}
-
-ccl_device void svm_node_add_closure(ShaderData *sd, float *stack, uint unused,
-	uint node_jump, int *offset, float *randb, float *closure_weight)
-{
-#ifdef __MULTI_CLOSURE__
-	/* nothing to do, handled in compiler */
-#else
-	/* pick one of the two closures with probability 0.5. sampling quality
-	 * is not going to be great, for that we'd need to evaluate the weights
-	 * of the two closures being added */
-	float weight = 0.5f;
-
-	if(*randb < weight) {
-		*offset += node_jump;
-		*randb = *randb/weight;
-	}
-	else
-		*randb = (*randb - weight)/(1.0f - weight);
-	
-	*closure_weight *= 2.0f;
-#endif
 }
 
 /* (Bump) normal */
diff --git a/intern/cycles/kernel/svm/svm_geometry.h b/intern/cycles/kernel/svm/svm_geometry.h
index ad0cacb027a..fe681ec92af 100644
--- a/intern/cycles/kernel/svm/svm_geometry.h
+++ b/intern/cycles/kernel/svm/svm_geometry.h
@@ -98,44 +98,44 @@ ccl_device void svm_node_particle_info(KernelGlobals *kg, ShaderData *sd, float
 {
 	switch(type) {
 		case NODE_INFO_PAR_INDEX: {
-			uint particle_id = object_particle_id(kg, sd->object);
+			int particle_id = object_particle_id(kg, sd->object);
 			stack_store_float(stack, out_offset, particle_index(kg, particle_id));
 			break;
 		}
 		case NODE_INFO_PAR_AGE: {
-			uint particle_id = object_particle_id(kg, sd->object);
+			int particle_id = object_particle_id(kg, sd->object);
 			stack_store_float(stack, out_offset, particle_age(kg, particle_id));
 			break;
 		}
 		case NODE_INFO_PAR_LIFETIME: {
-			uint particle_id = object_particle_id(kg, sd->object);
+			int particle_id = object_particle_id(kg, sd->object);
 			stack_store_float(stack, out_offset, particle_lifetime(kg, particle_id));
 			break;
 		}
 		case NODE_INFO_PAR_LOCATION: {
-			uint particle_id = object_particle_id(kg, sd->object);
+			int particle_id = object_particle_id(kg, sd->object);
 			stack_store_float3(stack, out_offset, particle_location(kg, particle_id));
 			break;
 		}
-		#if 0	/* XXX float4 currently not supported in SVM stack */
+#if 0	/* XXX float4 currently not supported in SVM stack */
 		case NODE_INFO_PAR_ROTATION: {
-			uint particle_id = object_particle_id(kg, sd->object);
+			int particle_id = object_particle_id(kg, sd->object);
 			stack_store_float4(stack, out_offset, particle_rotation(kg, particle_id));
 			break;
 		}
-		#endif
+#endif
 		case NODE_INFO_PAR_SIZE: {
-			uint particle_id = object_particle_id(kg, sd->object);
+			int particle_id = object_particle_id(kg, sd->object);
 			stack_store_float(stack, out_offset, particle_size(kg, particle_id));
 			break;
 		}
 		case NODE_INFO_PAR_VELOCITY: {
-			uint particle_id = object_particle_id(kg, sd->object);
+			int particle_id = object_particle_id(kg, sd->object);
 			stack_store_float3(stack, out_offset, particle_velocity(kg, particle_id));
 			break;
 		}
 		case NODE_INFO_PAR_ANGULAR_VELOCITY: {
-			uint particle_id = object_particle_id(kg, sd->object);
+			int particle_id = object_particle_id(kg, sd->object);
 			stack_store_float3(stack, out_offset, particle_angular_velocity(kg, particle_id));
 			break;
 		}
@@ -153,7 +153,7 @@ ccl_device void svm_node_hair_info(KernelGlobals *kg, ShaderData *sd, float *sta
 
 	switch(type) {
 		case NODE_INFO_CURVE_IS_STRAND: {
-			data = (sd->segment != ~0);
+			data = (sd->type & PRIMITIVE_ALL_CURVE) != 0;
 			stack_store_float(stack, out_offset, data);
 			break;
 		}
diff --git a/intern/cycles/kernel/svm/svm_image.h b/intern/cycles/kernel/svm/svm_image.h
index bc76ea1e662..daf7c6652d2 100644
--- a/intern/cycles/kernel/svm/svm_image.h
+++ b/intern/cycles/kernel/svm/svm_image.h
@@ -60,31 +60,51 @@ ccl_device float4 svm_image_texture(KernelGlobals *kg, int id, float x, float y,
 	uint width = info.x;
 	uint height = info.y;
 	uint offset = info.z;
-	uint periodic = info.w;
+	uint periodic = (info.w & 0x1);
+	uint interpolation = info.w >> 1;
 
+	float4 r;
 	int ix, iy, nix, niy;
-	float tx = svm_image_texture_frac(x*width, &ix);
-	float ty = svm_image_texture_frac(y*height, &iy);
+	if (interpolation == INTERPOLATION_CLOSEST) {
+		svm_image_texture_frac(x*width, &ix);
+		svm_image_texture_frac(y*height, &iy);
 
-	if(periodic) {
-		ix = svm_image_texture_wrap_periodic(ix, width);
-		iy = svm_image_texture_wrap_periodic(iy, height);
+		if(periodic) {
+			ix = svm_image_texture_wrap_periodic(ix, width);
+			iy = svm_image_texture_wrap_periodic(iy, height);
+		}
+		else {
+			ix = svm_image_texture_wrap_clamp(ix, width);
+			iy = svm_image_texture_wrap_clamp(iy, height);
 
-		nix = svm_image_texture_wrap_periodic(ix+1, width);
-		niy = svm_image_texture_wrap_periodic(iy+1, height);
+		}
+		r = svm_image_texture_read(kg, offset + ix + iy*width);
 	}
-	else {
-		ix = svm_image_texture_wrap_clamp(ix, width);
-		iy = svm_image_texture_wrap_clamp(iy, height);
+	else { /* We default to linear interpolation if it is not closest */
+		float tx = svm_image_texture_frac(x*width, &ix);
+		float ty = svm_image_texture_frac(y*height, &iy);
 
-		nix = svm_image_texture_wrap_clamp(ix+1, width);
-		niy = svm_image_texture_wrap_clamp(iy+1, height);
-	}
+		if(periodic) {
+			ix = svm_image_texture_wrap_periodic(ix, width);
+			iy = svm_image_texture_wrap_periodic(iy, height);
 
-	float4 r = (1.0f - ty)*(1.0f - tx)*svm_image_texture_read(kg, offset + ix + iy*width);
-	r += (1.0f - ty)*tx*svm_image_texture_read(kg, offset + nix + iy*width);
-	r += ty*(1.0f - tx)*svm_image_texture_read(kg, offset + ix + niy*width);
-	r += ty*tx*svm_image_texture_read(kg, offset + nix + niy*width);
+			nix = svm_image_texture_wrap_periodic(ix+1, width);
+			niy = svm_image_texture_wrap_periodic(iy+1, height);
+		}
+		else {
+			ix = svm_image_texture_wrap_clamp(ix, width);
+			iy = svm_image_texture_wrap_clamp(iy, height);
+
+			nix = svm_image_texture_wrap_clamp(ix+1, width);
+			niy = svm_image_texture_wrap_clamp(iy+1, height);
+		}
+
+
+		r = (1.0f - ty)*(1.0f - tx)*svm_image_texture_read(kg, offset + ix + iy*width);
+		r += (1.0f - ty)*tx*svm_image_texture_read(kg, offset + nix + iy*width);
+		r += ty*(1.0f - tx)*svm_image_texture_read(kg, offset + ix + niy*width);
+		r += ty*tx*svm_image_texture_read(kg, offset + nix + niy*width);
+	}
 
 	if(use_alpha && r.w != 1.0f && r.w != 0.0f) {
 		float invw = 1.0f/r.w;
@@ -129,8 +149,8 @@ ccl_device float4 svm_image_texture(KernelGlobals *kg, int id, float x, float y,
 	 * - group by size and use a 3d texture, performance impact
 	 * - group into larger texture with some padding for correct lerp
 	 *
-	 * also note that cuda has 128 textures limit, we use 100 now, since
-	 * we still need some for other storage */
+	 * also note that cuda has a texture limit (128 for Fermi, 256 for Kepler),
+	 * and we cannot use all since we still need some for other storage */
 
 	switch(id) {
 		case 0: r = kernel_tex_image_interp(__tex_image_float_000, x, y); break;
@@ -233,7 +253,62 @@ ccl_device float4 svm_image_texture(KernelGlobals *kg, int id, float x, float y,
 		case 97: r = kernel_tex_image_interp(__tex_image_097, x, y); break;
 		case 98: r = kernel_tex_image_interp(__tex_image_098, x, y); break;
 		case 99: r = kernel_tex_image_interp(__tex_image_099, x, y); break;
-		default: 
+
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 300)
+		case 100: r = kernel_tex_image_interp(__tex_image_100, x, y); break;
+		case 101: r = kernel_tex_image_interp(__tex_image_101, x, y); break;
+		case 102: r = kernel_tex_image_interp(__tex_image_102, x, y); break;
+		case 103: r = kernel_tex_image_interp(__tex_image_103, x, y); break;
+		case 104: r = kernel_tex_image_interp(__tex_image_104, x, y); break;
+		case 105: r = kernel_tex_image_interp(__tex_image_105, x, y); break;
+		case 106: r = kernel_tex_image_interp(__tex_image_106, x, y); break;
+		case 107: r = kernel_tex_image_interp(__tex_image_107, x, y); break;
+		case 108: r = kernel_tex_image_interp(__tex_image_108, x, y); break;
+		case 109: r = kernel_tex_image_interp(__tex_image_109, x, y); break;
+		case 110: r = kernel_tex_image_interp(__tex_image_110, x, y); break;
+		case 111: r = kernel_tex_image_interp(__tex_image_111, x, y); break;
+		case 112: r = kernel_tex_image_interp(__tex_image_112, x, y); break;
+		case 113: r = kernel_tex_image_interp(__tex_image_113, x, y); break;
+		case 114: r = kernel_tex_image_interp(__tex_image_114, x, y); break;
+		case 115: r = kernel_tex_image_interp(__tex_image_115, x, y); break;
+		case 116: r = kernel_tex_image_interp(__tex_image_116, x, y); break;
+		case 117: r = kernel_tex_image_interp(__tex_image_117, x, y); break;
+		case 118: r = kernel_tex_image_interp(__tex_image_118, x, y); break;
+		case 119: r = kernel_tex_image_interp(__tex_image_119, x, y); break;
+		case 120: r = kernel_tex_image_interp(__tex_image_120, x, y); break;
+		case 121: r = kernel_tex_image_interp(__tex_image_121, x, y); break;
+		case 122: r = kernel_tex_image_interp(__tex_image_122, x, y); break;
+		case 123: r = kernel_tex_image_interp(__tex_image_123, x, y); break;
+		case 124: r = kernel_tex_image_interp(__tex_image_124, x, y); break;
+		case 125: r = kernel_tex_image_interp(__tex_image_125, x, y); break;
+		case 126: r = kernel_tex_image_interp(__tex_image_126, x, y); break;
+		case 127: r = kernel_tex_image_interp(__tex_image_127, x, y); break;
+		case 128: r = kernel_tex_image_interp(__tex_image_128, x, y); break;
+		case 129: r = kernel_tex_image_interp(__tex_image_129, x, y); break;
+		case 130: r = kernel_tex_image_interp(__tex_image_130, x, y); break;
+		case 131: r = kernel_tex_image_interp(__tex_image_131, x, y); break;
+		case 132: r = kernel_tex_image_interp(__tex_image_132, x, y); break;
+		case 133: r = kernel_tex_image_interp(__tex_image_133, x, y); break;
+		case 134: r = kernel_tex_image_interp(__tex_image_134, x, y); break;
+		case 135: r = kernel_tex_image_interp(__tex_image_135, x, y); break;
+		case 136: r = kernel_tex_image_interp(__tex_image_136, x, y); break;
+		case 137: r = kernel_tex_image_interp(__tex_image_137, x, y); break;
+		case 138: r = kernel_tex_image_interp(__tex_image_138, x, y); break;
+		case 139: r = kernel_tex_image_interp(__tex_image_139, x, y); break;
+		case 140: r = kernel_tex_image_interp(__tex_image_140, x, y); break;
+		case 141: r = kernel_tex_image_interp(__tex_image_141, x, y); break;
+		case 142: r = kernel_tex_image_interp(__tex_image_142, x, y); break;
+		case 143: r = kernel_tex_image_interp(__tex_image_143, x, y); break;
+		case 144: r = kernel_tex_image_interp(__tex_image_144, x, y); break;
+		case 145: r = kernel_tex_image_interp(__tex_image_145, x, y); break;
+		case 146: r = kernel_tex_image_interp(__tex_image_146, x, y); break;
+		case 147: r = kernel_tex_image_interp(__tex_image_147, x, y); break;
+		case 148: r = kernel_tex_image_interp(__tex_image_148, x, y); break;
+		case 149: r = kernel_tex_image_interp(__tex_image_149, x, y); break;
+		case 150: r = kernel_tex_image_interp(__tex_image_150, x, y); break;
+#endif
+
+		default:
 			kernel_assert(0);
 			return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
 	}
@@ -302,7 +377,7 @@ ccl_device void svm_node_tex_image_box(KernelGlobals *kg, ShaderData *sd, float
 	float3 N = sd->N;
 
 	N = sd->N;
-	if(sd->object != ~0)
+	if(sd->object != OBJECT_NONE)
 		object_inverse_normal_transform(kg, sd, &N);
 
 	/* project from direction vector to barycentric coordinates in triangles */
diff --git a/intern/cycles/kernel/svm/svm_light_path.h b/intern/cycles/kernel/svm/svm_light_path.h
index 8968146c5e2..da544c63ae0 100644
--- a/intern/cycles/kernel/svm/svm_light_path.h
+++ b/intern/cycles/kernel/svm/svm_light_path.h
@@ -34,6 +34,7 @@ ccl_device void svm_node_light_path(ShaderData *sd, float *stack, uint type, uin
 		case NODE_LP_backfacing: info = (sd->flag & SD_BACKFACING)? 1.0f: 0.0f; break;
 		case NODE_LP_ray_length: info = sd->ray_length; break;
 		case NODE_LP_ray_depth: info = (float)sd->ray_depth; break;
+		case NODE_LP_ray_transparent: info = sd->transparent_depth; break;
 	}
 
 	stack_store_float(stack, out_offset, info);
diff --git a/intern/cycles/kernel/svm/svm_math.h b/intern/cycles/kernel/svm/svm_math.h
index bb46d443a6b..1ce9386e40e 100644
--- a/intern/cycles/kernel/svm/svm_math.h
+++ b/intern/cycles/kernel/svm/svm_math.h
@@ -56,6 +56,8 @@ ccl_device float svm_math(NodeMath type, float Fac1, float Fac2)
 		Fac = Fac1 > Fac2;
 	else if(type == NODE_MATH_MODULO)
 		Fac = safe_modulo(Fac1, Fac2);
+    else if(type == NODE_MATH_ABSOLUTE)
+        Fac = fabsf(Fac1);
 	else if(type == NODE_MATH_CLAMP)
 		Fac = clamp(Fac1, 0.0f, 1.0f);
 	else
diff --git a/intern/cycles/kernel/svm/svm_mix.h b/intern/cycles/kernel/svm/svm_mix.h
index 4e834b7c500..edc3903865e 100644
--- a/intern/cycles/kernel/svm/svm_mix.h
+++ b/intern/cycles/kernel/svm/svm_mix.h
@@ -89,7 +89,7 @@ ccl_device float3 svm_mix_diff(float t, float3 col1, float3 col2)
 
 ccl_device float3 svm_mix_dark(float t, float3 col1, float3 col2)
 {
-	return min(col1, col2*t);
+	return min(col1, col2)*t + col1*(1.0f - t);
 }
 
 ccl_device float3 svm_mix_light(float t, float3 col1, float3 col2)
diff --git a/intern/cycles/kernel/svm/svm_noise.h b/intern/cycles/kernel/svm/svm_noise.h
index 282ad191470..91dda8972f9 100644
--- a/intern/cycles/kernel/svm/svm_noise.h
+++ b/intern/cycles/kernel/svm/svm_noise.h
@@ -357,15 +357,13 @@ ccl_device float3 cellnoise_color(float3 p)
 	return make_float3(r, g, b);
 }
 #else
-ccl_device float3 cellnoise_color(const float3& p)
+ccl_device __m128 cellnoise_color(const __m128& p)
 {
-	__m128i v_yxz = quick_floor_sse(_mm_setr_ps(p.y, p.x, p.z, 0.0f));
-	__m128i v_xyy = shuffle<1, 0, 0, 3>(v_yxz);
-	__m128i v_zzx = shuffle<2, 2, 1, 3>(v_yxz);
-	__m128 rgb = bits_to_01_sse(hash_sse(v_xyy, v_yxz, v_zzx));
-
-	float3 result = *(float3*)&rgb;
-	return result;
+	__m128i ip = quick_floor_sse(p);
+	__m128i ip_yxz = shuffle<1, 0, 2, 3>(ip);
+	__m128i ip_xyy = shuffle<0, 1, 1, 3>(ip);
+	__m128i ip_zzx = shuffle<2, 2, 0, 3>(ip);
+	return bits_to_01_sse(hash_sse(ip_xyy, ip_yxz, ip_zzx));
 }
 #endif
 
diff --git a/intern/cycles/kernel/svm/svm_sepcomb_hsv.h b/intern/cycles/kernel/svm/svm_sepcomb_hsv.h
index 0f68ecbea03..111d5d47988 100644
--- a/intern/cycles/kernel/svm/svm_sepcomb_hsv.h
+++ b/intern/cycles/kernel/svm/svm_sepcomb_hsv.h
@@ -42,12 +42,12 @@ ccl_device void svm_node_separate_hsv(KernelGlobals *kg, ShaderData *sd, float *
 	/* Convert to HSV */
 	color = rgb_to_hsv(color);
 
-	if (stack_valid(hue_out)) 
-			stack_store_float(stack, hue_out, color.x);
-	if (stack_valid(saturation_out)) 
-			stack_store_float(stack, saturation_out, color.y);
-	if (stack_valid(value_out)) 
-			stack_store_float(stack, value_out, color.z);
+	if (stack_valid(hue_out))
+		stack_store_float(stack, hue_out, color.x);
+	if (stack_valid(saturation_out))
+		stack_store_float(stack, saturation_out, color.y);
+	if (stack_valid(value_out))
+		stack_store_float(stack, value_out, color.z);
 }
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/svm/svm_sky.h b/intern/cycles/kernel/svm/svm_sky.h
index 1e3552647bd..500b5146931 100644
--- a/intern/cycles/kernel/svm/svm_sky.h
+++ b/intern/cycles/kernel/svm/svm_sky.h
@@ -74,7 +74,7 @@ ccl_device float sky_radiance_internal(float *configuration, float theta, float
 	float expM = expf(configuration[4] * gamma);
 	float rayM = cgamma * cgamma;
 	float mieM = (1.0f + rayM) / powf((1.0f + configuration[8]*configuration[8] - 2.0f*configuration[8]*cgamma), 1.5f);
-	float zenith = sqrt(ctheta);
+	float zenith = sqrtf(ctheta);
 
 	return (1.0f + configuration[0] * expf(configuration[1] / (ctheta + 0.01f))) *
 		(configuration[2] + configuration[3] * expM + configuration[5] * rayM + configuration[6] * mieM + configuration[7] * zenith);
diff --git a/intern/cycles/kernel/svm/svm_tex_coord.h b/intern/cycles/kernel/svm/svm_tex_coord.h
index 4b1f30e55bb..a17e4a25efe 100644
--- a/intern/cycles/kernel/svm/svm_tex_coord.h
+++ b/intern/cycles/kernel/svm/svm_tex_coord.h
@@ -25,27 +25,27 @@ ccl_device void svm_node_tex_coord(KernelGlobals *kg, ShaderData *sd, int path_f
 	switch(type) {
 		case NODE_TEXCO_OBJECT: {
 			data = sd->P;
-			if(sd->object != ~0)
+			if(sd->object != OBJECT_NONE)
 				object_inverse_position_transform(kg, sd, &data);
 			break;
 		}
 		case NODE_TEXCO_NORMAL: {
 			data = sd->N;
-			if(sd->object != ~0)
+			if(sd->object != OBJECT_NONE)
 				object_inverse_normal_transform(kg, sd, &data);
 			break;
 		}
 		case NODE_TEXCO_CAMERA: {
 			Transform tfm = kernel_data.cam.worldtocamera;
 
-			if(sd->object != ~0)
+			if(sd->object != OBJECT_NONE)
 				data = transform_point(&tfm, sd->P);
 			else
 				data = transform_point(&tfm, sd->P + camera_position(kg));
 			break;
 		}
 		case NODE_TEXCO_WINDOW: {
-			if((path_flag & PATH_RAY_CAMERA) && sd->object == ~0 && kernel_data.cam.type == CAMERA_ORTHOGRAPHIC)
+			if((path_flag & PATH_RAY_CAMERA) && sd->object == OBJECT_NONE && kernel_data.cam.type == CAMERA_ORTHOGRAPHIC)
 				data = camera_world_to_ndc(kg, sd, sd->ray_P);
 			else
 				data = camera_world_to_ndc(kg, sd, sd->P);
@@ -53,7 +53,7 @@ ccl_device void svm_node_tex_coord(KernelGlobals *kg, ShaderData *sd, int path_f
 			break;
 		}
 		case NODE_TEXCO_REFLECTION: {
-			if(sd->object != ~0)
+			if(sd->object != OBJECT_NONE)
 				data = 2.0f*dot(sd->N, sd->I)*sd->N - sd->I;
 			else
 				data = sd->I;
@@ -70,17 +70,10 @@ ccl_device void svm_node_tex_coord(KernelGlobals *kg, ShaderData *sd, int path_f
 		case NODE_TEXCO_VOLUME_GENERATED: {
 			data = sd->P;
 
-			if(sd->object != ~0) {
-				AttributeElement attr_elem;
-				int attr_offset = find_attribute(kg, sd, ATTR_STD_GENERATED_TRANSFORM, &attr_elem);
-
-				object_inverse_position_transform(kg, sd, &data);
-
-				if(attr_offset != ATTR_STD_NOT_FOUND) {
-					Transform tfm = primitive_attribute_matrix(kg, sd, attr_offset);
-					data = transform_point(&tfm, data);
-				}
-			}
+#ifdef __VOLUME__
+			if(sd->object != OBJECT_NONE)
+				data = volume_normalized_position(kg, sd, data);
+#endif
 			break;
 		}
 	}
@@ -96,27 +89,27 @@ ccl_device void svm_node_tex_coord_bump_dx(KernelGlobals *kg, ShaderData *sd, in
 	switch(type) {
 		case NODE_TEXCO_OBJECT: {
 			data = sd->P + sd->dP.dx;
-			if(sd->object != ~0)
+			if(sd->object != OBJECT_NONE)
 				object_inverse_position_transform(kg, sd, &data);
 			break;
 		}
 		case NODE_TEXCO_NORMAL: {
 			data = sd->N;
-			if(sd->object != ~0)
+			if(sd->object != OBJECT_NONE)
 				object_inverse_normal_transform(kg, sd, &data);
 			break;
 		}
 		case NODE_TEXCO_CAMERA: {
 			Transform tfm = kernel_data.cam.worldtocamera;
 
-			if(sd->object != ~0)
+			if(sd->object != OBJECT_NONE)
 				data = transform_point(&tfm, sd->P + sd->dP.dx);
 			else
 				data = transform_point(&tfm, sd->P + sd->dP.dx + camera_position(kg));
 			break;
 		}
 		case NODE_TEXCO_WINDOW: {
-			if((path_flag & PATH_RAY_CAMERA) && sd->object == ~0 && kernel_data.cam.type == CAMERA_ORTHOGRAPHIC)
+			if((path_flag & PATH_RAY_CAMERA) && sd->object == OBJECT_NONE && kernel_data.cam.type == CAMERA_ORTHOGRAPHIC)
 				data = camera_world_to_ndc(kg, sd, sd->ray_P + sd->ray_dP.dx);
 			else
 				data = camera_world_to_ndc(kg, sd, sd->P + sd->dP.dx);
@@ -124,7 +117,7 @@ ccl_device void svm_node_tex_coord_bump_dx(KernelGlobals *kg, ShaderData *sd, in
 			break;
 		}
 		case NODE_TEXCO_REFLECTION: {
-			if(sd->object != ~0)
+			if(sd->object != OBJECT_NONE)
 				data = 2.0f*dot(sd->N, sd->I)*sd->N - sd->I;
 			else
 				data = sd->I;
@@ -141,17 +134,10 @@ ccl_device void svm_node_tex_coord_bump_dx(KernelGlobals *kg, ShaderData *sd, in
 		case NODE_TEXCO_VOLUME_GENERATED: {
 			data = sd->P + sd->dP.dx;
 
-			if(sd->object != ~0) {
-				AttributeElement attr_elem;
-				int attr_offset = find_attribute(kg, sd, ATTR_STD_GENERATED_TRANSFORM, &attr_elem);
-
-				object_inverse_position_transform(kg, sd, &data);
-
-				if(attr_offset != ATTR_STD_NOT_FOUND) {
-					Transform tfm = primitive_attribute_matrix(kg, sd, attr_offset);
-					data = transform_point(&tfm, data);
-				}
-			}
+#ifdef __VOLUME__
+			if(sd->object != OBJECT_NONE)
+				data = volume_normalized_position(kg, sd, data);
+#endif
 			break;
 		}
 	}
@@ -170,27 +156,27 @@ ccl_device void svm_node_tex_coord_bump_dy(KernelGlobals *kg, ShaderData *sd, in
 	switch(type) {
 		case NODE_TEXCO_OBJECT: {
 			data = sd->P + sd->dP.dy;
-			if(sd->object != ~0)
+			if(sd->object != OBJECT_NONE)
 				object_inverse_position_transform(kg, sd, &data);
 			break;
 		}
 		case NODE_TEXCO_NORMAL: {
 			data = sd->N;
-			if(sd->object != ~0)
+			if(sd->object != OBJECT_NONE)
 				object_inverse_normal_transform(kg, sd, &data);
 			break;
 		}
 		case NODE_TEXCO_CAMERA: {
 			Transform tfm = kernel_data.cam.worldtocamera;
 
-			if(sd->object != ~0)
+			if(sd->object != OBJECT_NONE)
 				data = transform_point(&tfm, sd->P + sd->dP.dy);
 			else
 				data = transform_point(&tfm, sd->P + sd->dP.dy + camera_position(kg));
 			break;
 		}
 		case NODE_TEXCO_WINDOW: {
-			if((path_flag & PATH_RAY_CAMERA) && sd->object == ~0 && kernel_data.cam.type == CAMERA_ORTHOGRAPHIC)
+			if((path_flag & PATH_RAY_CAMERA) && sd->object == OBJECT_NONE && kernel_data.cam.type == CAMERA_ORTHOGRAPHIC)
 				data = camera_world_to_ndc(kg, sd, sd->ray_P + sd->ray_dP.dy);
 			else
 				data = camera_world_to_ndc(kg, sd, sd->P + sd->dP.dy);
@@ -198,7 +184,7 @@ ccl_device void svm_node_tex_coord_bump_dy(KernelGlobals *kg, ShaderData *sd, in
 			break;
 		}
 		case NODE_TEXCO_REFLECTION: {
-			if(sd->object != ~0)
+			if(sd->object != OBJECT_NONE)
 				data = 2.0f*dot(sd->N, sd->I)*sd->N - sd->I;
 			else
 				data = sd->I;
@@ -215,17 +201,10 @@ ccl_device void svm_node_tex_coord_bump_dy(KernelGlobals *kg, ShaderData *sd, in
 		case NODE_TEXCO_VOLUME_GENERATED: {
 			data = sd->P + sd->dP.dy;
 
-			if(sd->object != ~0) {
-				AttributeElement attr_elem;
-				int attr_offset = find_attribute(kg, sd, ATTR_STD_GENERATED_TRANSFORM, &attr_elem);
-
-				object_inverse_position_transform(kg, sd, &data);
-
-				if(attr_offset != ATTR_STD_NOT_FOUND) {
-					Transform tfm = primitive_attribute_matrix(kg, sd, attr_offset);
-					data = transform_point(&tfm, data);
-				}
-			}
+#ifdef __VOLUME__
+			if(sd->object != OBJECT_NONE)
+				data = volume_normalized_position(kg, sd, data);
+#endif
 			break;
 		}
 	}
@@ -248,7 +227,7 @@ ccl_device void svm_node_normal_map(KernelGlobals *kg, ShaderData *sd, float *st
 
 	if(space == NODE_NORMAL_MAP_TANGENT) {
 		/* tangent space */
-		if(sd->object == ~0) {
+		if(sd->object == OBJECT_NONE) {
 			stack_store_float3(stack, normal_offset, make_float3(0.0f, 0.0f, 0.0f));
 			return;
 		}
diff --git a/intern/cycles/kernel/svm/svm_texture.h b/intern/cycles/kernel/svm/svm_texture.h
index 8ced8390b0b..5fd9204cbf6 100644
--- a/intern/cycles/kernel/svm/svm_texture.h
+++ b/intern/cycles/kernel/svm/svm_texture.h
@@ -18,6 +18,7 @@ CCL_NAMESPACE_BEGIN
 
 /* Voronoi Distances */
 
+#if 0
 ccl_device float voronoi_distance(NodeDistanceMetric distance_metric, float3 d, float e)
 {
 #if 0
@@ -43,8 +44,7 @@ ccl_device float voronoi_distance(NodeDistanceMetric distance_metric, float3 d,
 }
 
 /* Voronoi / Worley like */
-
-ccl_device_noinline float4 voronoi_Fn(float3 p, float e, int n1, int n2)
+ccl_device_inline float4 voronoi_Fn(float3 p, float e, int n1, int n2)
 {
 	float da[4];
 	float3 pa[4];
@@ -119,7 +119,95 @@ ccl_device_noinline float4 voronoi_Fn(float3 p, float e, int n1, int n2)
 
 	return result;
 }
+#endif
+
+ccl_device float voronoi_F1_distance(float3 p)
+{
+	/* returns squared distance in da */
+	float da = 1e10f;
+
+#ifndef __KERNEL_SSE2__
+	int ix = floor_to_int(p.x), iy = floor_to_int(p.y), iz = floor_to_int(p.z);
+
+	for (int xx = -1; xx <= 1; xx++) {
+		for (int yy = -1; yy <= 1; yy++) {
+			for (int zz = -1; zz <= 1; zz++) {
+				float3 ip = make_float3(ix + xx, iy + yy, iz + zz);
+				float3 vp = ip + cellnoise_color(ip);
+				float d = len_squared(p - vp);
+				da = min(d, da);
+			}
+		}
+	}
+#else
+	__m128 vec_p = load_m128(p);
+	__m128i xyzi = quick_floor_sse(vec_p);
+
+	for (int xx = -1; xx <= 1; xx++) {
+		for (int yy = -1; yy <= 1; yy++) {
+			for (int zz = -1; zz <= 1; zz++) {
+				__m128 ip = _mm_cvtepi32_ps(_mm_add_epi32(xyzi, _mm_setr_epi32(xx, yy, zz, 0)));
+				__m128 vp = _mm_add_ps(ip, cellnoise_color(ip));
+				float d = len_squared<1, 1, 1, 0>(_mm_sub_ps(vec_p, vp));
+				da = min(d, da);
+			}
+		}
+	}
+#endif
+
+	return da;
+}
+
+ccl_device float3 voronoi_F1_color(float3 p)
+{
+	/* returns color of the nearest point */
+	float da = 1e10f;
+
+#ifndef __KERNEL_SSE2__
+	float3 pa;
+	int ix = floor_to_int(p.x), iy = floor_to_int(p.y), iz = floor_to_int(p.z);
+
+	for (int xx = -1; xx <= 1; xx++) {
+		for (int yy = -1; yy <= 1; yy++) {
+			for (int zz = -1; zz <= 1; zz++) {
+				float3 ip = make_float3(ix + xx, iy + yy, iz + zz);
+				float3 vp = ip + cellnoise_color(ip);
+				float d = len_squared(p - vp);
+
+				if(d < da) {
+					da = d;
+					pa = vp;
+				}
+			}
+		}
+	}
+
+	return cellnoise_color(pa);
+#else
+	__m128 pa, vec_p = load_m128(p);
+	__m128i xyzi = quick_floor_sse(vec_p);
+
+	for (int xx = -1; xx <= 1; xx++) {
+		for (int yy = -1; yy <= 1; yy++) {
+			for (int zz = -1; zz <= 1; zz++) {
+				__m128 ip = _mm_cvtepi32_ps(_mm_add_epi32(xyzi, _mm_setr_epi32(xx, yy, zz, 0)));
+				__m128 vp = _mm_add_ps(ip, cellnoise_color(ip));
+				float d = len_squared<1, 1, 1, 0>(_mm_sub_ps(vec_p, vp));
+
+				if(d < da) {
+					da = d;
+					pa = vp;
+				}
+			}
+		}
+	}
+
+	__m128 color = cellnoise_color(pa);
+	return (float3 &)color;
+#endif
+}
 
+#if 0
 ccl_device float voronoi_F1(float3 p) { return voronoi_Fn(p, 0.0f, 0, -1).w; }
 ccl_device float voronoi_F2(float3 p) { return voronoi_Fn(p, 0.0f, 1, -1).w; }
 ccl_device float voronoi_F3(float3 p) { return voronoi_Fn(p, 0.0f, 2, -1).w; }
@@ -139,6 +227,7 @@ ccl_device float voronoi_F3S(float3 p) { return 2.0f*voronoi_F3(p) - 1.0f; }
 ccl_device float voronoi_F4S(float3 p) { return 2.0f*voronoi_F4(p) - 1.0f; }
 ccl_device float voronoi_F1F2S(float3 p) { return 2.0f*voronoi_F1F2(p) - 1.0f; }
 ccl_device float voronoi_CrS(float3 p) { return 2.0f*voronoi_Cr(p) - 1.0f; }
+#endif
 
 /* Noise Bases */
 
diff --git a/intern/cycles/kernel/svm/svm_types.h b/intern/cycles/kernel/svm/svm_types.h
index ad5e1ea6d2e..80972ec82bc 100644
--- a/intern/cycles/kernel/svm/svm_types.h
+++ b/intern/cycles/kernel/svm/svm_types.h
@@ -36,7 +36,8 @@ typedef enum NodeType {
 	NODE_CLOSURE_SET_WEIGHT,
 	NODE_CLOSURE_WEIGHT,
 	NODE_MIX_CLOSURE,
-	NODE_JUMP,
+	NODE_JUMP_IF_ZERO,
+	NODE_JUMP_IF_ONE,
 	NODE_TEX_IMAGE,
 	NODE_TEX_IMAGE_BOX,
 	NODE_TEX_SKY,
@@ -71,7 +72,6 @@ typedef enum NodeType {
 	NODE_TEX_COORD,
 	NODE_TEX_COORD_BUMP_DX,
 	NODE_TEX_COORD_BUMP_DY,
-	NODE_ADD_CLOSURE,
 	NODE_EMISSION_SET_WEIGHT_TOTAL,
 	NODE_ATTR_BUMP_DX,
 	NODE_ATTR_BUMP_DY,
@@ -102,7 +102,8 @@ typedef enum NodeType {
 	NODE_CLOSURE_AMBIENT_OCCLUSION,
 	NODE_TANGENT,
 	NODE_NORMAL_MAP,
-	NODE_HAIR_INFO
+	NODE_HAIR_INFO,
+	NODE_UVMAP
 } NodeType;
 
 typedef enum NodeAttributeType {
@@ -158,7 +159,8 @@ typedef enum NodeLightPath {
 	NODE_LP_volume_scatter,
 	NODE_LP_backfacing,
 	NODE_LP_ray_length,
-	NODE_LP_ray_depth
+	NODE_LP_ray_depth,
+	NODE_LP_ray_transparent
 } NodeLightPath;
 
 typedef enum NodeLightFalloff {
@@ -219,6 +221,7 @@ typedef enum NodeMath {
 	NODE_MATH_LESS_THAN,
 	NODE_MATH_GREATER_THAN,
 	NODE_MATH_MODULO,
+    NODE_MATH_ABSOLUTE,
 	NODE_MATH_CLAMP /* used for the clamp UI option */
 } NodeMath;
 
@@ -401,6 +404,8 @@ typedef enum ClosureType {
 #define CLOSURE_IS_BSDF_GLOSSY(type) (type >= CLOSURE_BSDF_GLOSSY_ID && type <= CLOSURE_BSDF_HAIR_REFLECTION_ID)
 #define CLOSURE_IS_BSDF_TRANSMISSION(type) (type >= CLOSURE_BSDF_TRANSMISSION_ID && type <= CLOSURE_BSDF_HAIR_TRANSMISSION_ID)
 #define CLOSURE_IS_BSDF_BSSRDF(type) (type == CLOSURE_BSDF_BSSRDF_ID)
+#define CLOSURE_IS_BSDF_ANISOTROPIC(type) (type == CLOSURE_BSDF_WARD_ID)
+#define CLOSURE_IS_BSDF_OR_BSSRDF(type) (type <= CLOSURE_BSSRDF_GAUSSIAN_ID)
 #define CLOSURE_IS_BSSRDF(type) (type >= CLOSURE_BSSRDF_CUBIC_ID && type <= CLOSURE_BSSRDF_GAUSSIAN_ID)
 #define CLOSURE_IS_VOLUME(type) (type >= CLOSURE_VOLUME_ID && type <= CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID)
 #define CLOSURE_IS_EMISSION(type) (type == CLOSURE_EMISSION_ID)
diff --git a/intern/cycles/kernel/svm/svm_vector_transform.h b/intern/cycles/kernel/svm/svm_vector_transform.h
index 1e3fc2fa03b..61d33aeb8cf 100644
--- a/intern/cycles/kernel/svm/svm_vector_transform.h
+++ b/intern/cycles/kernel/svm/svm_vector_transform.h
@@ -33,7 +33,7 @@ ccl_device void svm_node_vector_transform(KernelGlobals *kg, ShaderData *sd, flo
 	NodeVectorTransformConvertSpace to = (NodeVectorTransformConvertSpace)ito;
 	
 	Transform tfm;
-	bool is_object = (sd->object != ~0);
+	bool is_object = (sd->object != OBJECT_NONE);
 	bool is_direction = (type == NODE_VECTOR_TRANSFORM_TYPE_VECTOR || type == NODE_VECTOR_TRANSFORM_TYPE_NORMAL);
 	
 	/* From world */
@@ -91,9 +91,9 @@ ccl_device void svm_node_vector_transform(KernelGlobals *kg, ShaderData *sd, flo
 	if(type == NODE_VECTOR_TRANSFORM_TYPE_NORMAL)
 		in = normalize(in);
 	
-	/* Output */	
+	/* Output */
 	if(stack_valid(vector_out)) {
-			stack_store_float3(stack, vector_out, in);
+		stack_store_float3(stack, vector_out, in);
 	}
 }
 
diff --git a/intern/cycles/kernel/svm/svm_voronoi.h b/intern/cycles/kernel/svm/svm_voronoi.h
index 7f597dc8bff..083a2f30e06 100644
--- a/intern/cycles/kernel/svm/svm_voronoi.h
+++ b/intern/cycles/kernel/svm/svm_voronoi.h
@@ -20,23 +20,16 @@ CCL_NAMESPACE_BEGIN
 
 ccl_device_noinline float4 svm_voronoi(NodeVoronoiColoring coloring, float3 p)
 {
-	/* compute distance and point coordinate of 4 nearest neighbours */
-	float4 dpa0 = voronoi_Fn(p, 1.0f, 0, -1);
-
-	/* output */
-	float fac;
-	float3 color;
-
 	if(coloring == NODE_VORONOI_INTENSITY) {
-		fac = fabsf(dpa0.w);
-		color = make_float3(fac, fac, fac);
+		/* intensity: squared distance to the nearest neighbour, copied into all four components */
+		float fac = voronoi_F1_distance(p);
+		return make_float4(fac, fac, fac, fac);
 	}
 	else {
-		color = cellnoise_color(float4_to_float3(dpa0));
-		fac = average(color);
+		/* otherwise: color of the nearest neighbour in xyz, average of that color in w */
+		float3 color = voronoi_F1_color(p);
+		return make_float4(color.x, color.y, color.z, average(color));
 	}
-
-	return make_float4(color.x, color.y, color.z, fac);
 }
 
 ccl_device void svm_node_tex_voronoi(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int *offset)
diff --git a/intern/cycles/kernel/svm/svm_wavelength.h b/intern/cycles/kernel/svm/svm_wavelength.h
index dca4003b89a..9e57c470c0f 100644
--- a/intern/cycles/kernel/svm/svm_wavelength.h
+++ b/intern/cycles/kernel/svm/svm_wavelength.h
@@ -43,33 +43,33 @@ ccl_device void svm_node_wavelength(ShaderData *sd, float *stack, uint wavelengt
 	//		  cie_colour_match[(lambda - 380) / 5][1] = yBar
 	//		  cie_colour_match[(lambda - 380) / 5][2] = zBar
 	const float cie_colour_match[81][3] = {
-		{0.0014,0.0000,0.0065}, {0.0022,0.0001,0.0105}, {0.0042,0.0001,0.0201},
-		{0.0076,0.0002,0.0362}, {0.0143,0.0004,0.0679}, {0.0232,0.0006,0.1102},
-		{0.0435,0.0012,0.2074}, {0.0776,0.0022,0.3713}, {0.1344,0.0040,0.6456},
-		{0.2148,0.0073,1.0391}, {0.2839,0.0116,1.3856}, {0.3285,0.0168,1.6230},
-		{0.3483,0.0230,1.7471}, {0.3481,0.0298,1.7826}, {0.3362,0.0380,1.7721},
-		{0.3187,0.0480,1.7441}, {0.2908,0.0600,1.6692}, {0.2511,0.0739,1.5281},
-		{0.1954,0.0910,1.2876}, {0.1421,0.1126,1.0419}, {0.0956,0.1390,0.8130},
-		{0.0580,0.1693,0.6162}, {0.0320,0.2080,0.4652}, {0.0147,0.2586,0.3533},
-		{0.0049,0.3230,0.2720}, {0.0024,0.4073,0.2123}, {0.0093,0.5030,0.1582},
-		{0.0291,0.6082,0.1117}, {0.0633,0.7100,0.0782}, {0.1096,0.7932,0.0573},
-		{0.1655,0.8620,0.0422}, {0.2257,0.9149,0.0298}, {0.2904,0.9540,0.0203},
-		{0.3597,0.9803,0.0134}, {0.4334,0.9950,0.0087}, {0.5121,1.0000,0.0057},
-		{0.5945,0.9950,0.0039}, {0.6784,0.9786,0.0027}, {0.7621,0.9520,0.0021},
-		{0.8425,0.9154,0.0018}, {0.9163,0.8700,0.0017}, {0.9786,0.8163,0.0014},
-		{1.0263,0.7570,0.0011}, {1.0567,0.6949,0.0010}, {1.0622,0.6310,0.0008},
-		{1.0456,0.5668,0.0006}, {1.0026,0.5030,0.0003}, {0.9384,0.4412,0.0002},
-		{0.8544,0.3810,0.0002}, {0.7514,0.3210,0.0001}, {0.6424,0.2650,0.0000},
-		{0.5419,0.2170,0.0000}, {0.4479,0.1750,0.0000}, {0.3608,0.1382,0.0000},
-		{0.2835,0.1070,0.0000}, {0.2187,0.0816,0.0000}, {0.1649,0.0610,0.0000},
-		{0.1212,0.0446,0.0000}, {0.0874,0.0320,0.0000}, {0.0636,0.0232,0.0000},
-		{0.0468,0.0170,0.0000}, {0.0329,0.0119,0.0000}, {0.0227,0.0082,0.0000},
-		{0.0158,0.0057,0.0000}, {0.0114,0.0041,0.0000}, {0.0081,0.0029,0.0000},
-		{0.0058,0.0021,0.0000}, {0.0041,0.0015,0.0000}, {0.0029,0.0010,0.0000},
-		{0.0020,0.0007,0.0000}, {0.0014,0.0005,0.0000}, {0.0010,0.0004,0.0000},
-		{0.0007,0.0002,0.0000}, {0.0005,0.0002,0.0000}, {0.0003,0.0001,0.0000},
-		{0.0002,0.0001,0.0000}, {0.0002,0.0001,0.0000}, {0.0001,0.0000,0.0000},
-		{0.0001,0.0000,0.0000}, {0.0001,0.0000,0.0000}, {0.0000,0.0000,0.0000}
+		{0.0014f,0.0000f,0.0065f}, {0.0022f,0.0001f,0.0105f}, {0.0042f,0.0001f,0.0201f},
+		{0.0076f,0.0002f,0.0362f}, {0.0143f,0.0004f,0.0679f}, {0.0232f,0.0006f,0.1102f},
+		{0.0435f,0.0012f,0.2074f}, {0.0776f,0.0022f,0.3713f}, {0.1344f,0.0040f,0.6456f},
+		{0.2148f,0.0073f,1.0391f}, {0.2839f,0.0116f,1.3856f}, {0.3285f,0.0168f,1.6230f},
+		{0.3483f,0.0230f,1.7471f}, {0.3481f,0.0298f,1.7826f}, {0.3362f,0.0380f,1.7721f},
+		{0.3187f,0.0480f,1.7441f}, {0.2908f,0.0600f,1.6692f}, {0.2511f,0.0739f,1.5281f},
+		{0.1954f,0.0910f,1.2876f}, {0.1421f,0.1126f,1.0419f}, {0.0956f,0.1390f,0.8130f},
+		{0.0580f,0.1693f,0.6162f}, {0.0320f,0.2080f,0.4652f}, {0.0147f,0.2586f,0.3533f},
+		{0.0049f,0.3230f,0.2720f}, {0.0024f,0.4073f,0.2123f}, {0.0093f,0.5030f,0.1582f},
+		{0.0291f,0.6082f,0.1117f}, {0.0633f,0.7100f,0.0782f}, {0.1096f,0.7932f,0.0573f},
+		{0.1655f,0.8620f,0.0422f}, {0.2257f,0.9149f,0.0298f}, {0.2904f,0.9540f,0.0203f},
+		{0.3597f,0.9803f,0.0134f}, {0.4334f,0.9950f,0.0087f}, {0.5121f,1.0000f,0.0057f},
+		{0.5945f,0.9950f,0.0039f}, {0.6784f,0.9786f,0.0027f}, {0.7621f,0.9520f,0.0021f},
+		{0.8425f,0.9154f,0.0018f}, {0.9163f,0.8700f,0.0017f}, {0.9786f,0.8163f,0.0014f},
+		{1.0263f,0.7570f,0.0011f}, {1.0567f,0.6949f,0.0010f}, {1.0622f,0.6310f,0.0008f},
+		{1.0456f,0.5668f,0.0006f}, {1.0026f,0.5030f,0.0003f}, {0.9384f,0.4412f,0.0002f},
+		{0.8544f,0.3810f,0.0002f}, {0.7514f,0.3210f,0.0001f}, {0.6424f,0.2650f,0.0000f},
+		{0.5419f,0.2170f,0.0000f}, {0.4479f,0.1750f,0.0000f}, {0.3608f,0.1382f,0.0000f},
+		{0.2835f,0.1070f,0.0000f}, {0.2187f,0.0816f,0.0000f}, {0.1649f,0.0610f,0.0000f},
+		{0.1212f,0.0446f,0.0000f}, {0.0874f,0.0320f,0.0000f}, {0.0636f,0.0232f,0.0000f},
+		{0.0468f,0.0170f,0.0000f}, {0.0329f,0.0119f,0.0000f}, {0.0227f,0.0082f,0.0000f},
+		{0.0158f,0.0057f,0.0000f}, {0.0114f,0.0041f,0.0000f}, {0.0081f,0.0029f,0.0000f},
+		{0.0058f,0.0021f,0.0000f}, {0.0041f,0.0015f,0.0000f}, {0.0029f,0.0010f,0.0000f},
+		{0.0020f,0.0007f,0.0000f}, {0.0014f,0.0005f,0.0000f}, {0.0010f,0.0004f,0.0000f},
+		{0.0007f,0.0002f,0.0000f}, {0.0005f,0.0002f,0.0000f}, {0.0003f,0.0001f,0.0000f},
+		{0.0002f,0.0001f,0.0000f}, {0.0002f,0.0001f,0.0000f}, {0.0001f,0.0000f,0.0000f},
+		{0.0001f,0.0000f,0.0000f}, {0.0001f,0.0000f,0.0000f}, {0.0000f,0.0000f,0.0000f}
 	};
 
 	float lambda_nm = stack_load_float(stack, wavelength);
diff --git a/intern/cycles/kernel/svm/svm_wireframe.h b/intern/cycles/kernel/svm/svm_wireframe.h
index e560e6303cc..660e6e2ca47 100644
--- a/intern/cycles/kernel/svm/svm_wireframe.h
+++ b/intern/cycles/kernel/svm/svm_wireframe.h
@@ -45,17 +45,21 @@ ccl_device void svm_node_wireframe(KernelGlobals *kg, ShaderData *sd, float *sta
 
 	/* Calculate wireframe */
 #ifdef __HAIR__
-	if (sd->prim != ~0 && sd->segment == ~0) {
+	if (sd->prim != PRIM_NONE && sd->type & PRIMITIVE_ALL_TRIANGLE)
 #else
-	if (sd->prim != ~0) {
+	if (sd->prim != PRIM_NONE)
 #endif
+	{
 		float3 Co[3];
 		float pixelwidth = 1.0f;
 
 		/* Triangles */
-		float np = 3;
+		int np = 3;
 		
-		triangle_vertices(kg, sd->prim, Co);
+		if(sd->type & PRIMITIVE_TRIANGLE)
+			triangle_vertices(kg, sd->prim, Co);
+		else
+			motion_triangle_vertices(kg, sd->object, sd->prim, sd->time, Co);
 
 		if(!(sd->flag & SD_TRANSFORM_APPLIED)) {
 			object_position_transform(kg, sd, &Co[0]);
diff --git a/intern/cycles/render/CMakeLists.txt b/intern/cycles/render/CMakeLists.txt
index 7d00ed92164..449c1391980 100644
--- a/intern/cycles/render/CMakeLists.txt
+++ b/intern/cycles/render/CMakeLists.txt
@@ -16,6 +16,7 @@ set(INC_SYS
 set(SRC
 	attribute.cpp
 	background.cpp
+	bake.cpp
 	blackbody.cpp
 	buffers.cpp
 	camera.cpp
@@ -43,6 +44,7 @@ set(SRC
 
 set(SRC_HEADERS
 	attribute.h
+	bake.h
 	background.h
 	blackbody.h
 	buffers.h
diff --git a/intern/cycles/render/attribute.cpp b/intern/cycles/render/attribute.cpp
index 61b9cf2f3bc..14805b6f11a 100644
--- a/intern/cycles/render/attribute.cpp
+++ b/intern/cycles/render/attribute.cpp
@@ -14,6 +14,7 @@
  * limitations under the License
  */
 
+#include "image.h"
 #include "mesh.h"
 #include "attribute.h"
 
@@ -25,6 +26,17 @@ CCL_NAMESPACE_BEGIN
 
 /* Attribute */
 
+Attribute::~Attribute()
+{
+	/* only voxel attributes hold an image that has to be removed from its image manager */
+	if(element != ATTR_ELEMENT_VOXEL)
+		return;
+
+	VoxelAttribute *voxel = data_voxel();
+	if(voxel != NULL)
+		voxel->manager->remove_image(voxel->slot);
+}
+
 void Attribute::set(ustring name_, TypeDesc type_, AttributeElement element_)
 {
 	name = name_;
@@ -38,9 +50,14 @@ void Attribute::set(ustring name_, TypeDesc type_, AttributeElement element_)
 		type == TypeDesc::TypeNormal || type == TypeDesc::TypeMatrix);
 }
 
-void Attribute::reserve(int numverts, int numtris, int numcurves, int numkeys)
+void Attribute::reserve(int numverts, int numtris, int numsteps, int numcurves, int numkeys, bool resize)
 {
-	buffer.resize(buffer_size(numverts, numtris, numcurves, numkeys), 0);
+	/* resize: zero-filled buffer of the full size; otherwise only capacity is set aside */
+	const size_t num_bytes = buffer_size(numverts, numtris, numsteps, numcurves, numkeys);
+	if(resize)
+		buffer.resize(num_bytes, 0);
+	else
+		buffer.reserve(num_bytes);
 }
 
 void Attribute::add(const float& f)
@@ -70,9 +87,28 @@ void Attribute::add(const Transform& f)
 		buffer.push_back(data[i]);
 }
 
+void Attribute::add(const VoxelAttribute& f)
+{
+	/* append the raw bytes of the record to the buffer */
+	const char *bytes = (const char*)&f;
+
+	for(size_t offset = 0; offset < sizeof(f); offset++)
+		buffer.push_back(bytes[offset]);
+}
+
+void Attribute::add(const char *data)
+{
+	/* appends data_sizeof() bytes, i.e. the storage of one element, read from data */
+	const char *end = data + data_sizeof();
+	while(data != end)
+		buffer.push_back(*data++);
+}
+
 size_t Attribute::data_sizeof() const
 {
-	if(type == TypeDesc::TypeFloat)
+	if(element == ATTR_ELEMENT_VOXEL)
+		return sizeof(VoxelAttribute);
+	else if(type == TypeDesc::TypeFloat)
 		return sizeof(float);
 	else if(type == TypeDesc::TypeMatrix)
 		return sizeof(Transform);
@@ -80,18 +116,22 @@ size_t Attribute::data_sizeof() const
 		return sizeof(float3);
 }
 
-size_t Attribute::element_size(int numverts, int numtris, int numcurves, int numkeys) const
+size_t Attribute::element_size(int numverts, int numtris, int numsteps, int numcurves, int numkeys) const
 {
 	size_t size;
 	
 	switch(element) {
 		case ATTR_ELEMENT_OBJECT:
 		case ATTR_ELEMENT_MESH:
+		case ATTR_ELEMENT_VOXEL:
 			size = 1;
 			break;
 		case ATTR_ELEMENT_VERTEX:
 			size = numverts;
 			break;
+		case ATTR_ELEMENT_VERTEX_MOTION:
+			size = numverts * (numsteps - 1);
+			break;
 		case ATTR_ELEMENT_FACE:
 			size = numtris;
 			break;
@@ -104,6 +144,9 @@ size_t Attribute::element_size(int numverts, int numtris, int numcurves, int num
 		case ATTR_ELEMENT_CURVE_KEY:
 			size = numkeys;
 			break;
+		case ATTR_ELEMENT_CURVE_KEY_MOTION:
+			size = numkeys * (numsteps - 1);
+			break;
 		default:
 			size = 0;
 			break;
@@ -112,9 +155,9 @@ size_t Attribute::element_size(int numverts, int numtris, int numcurves, int num
 	return size;
 }
 
-size_t Attribute::buffer_size(int numverts, int numtris, int numcurves, int numkeys) const
+size_t Attribute::buffer_size(int numverts, int numtris, int numsteps, int numcurves, int numkeys) const
 {
-	return element_size(numverts, numtris, numcurves, numkeys)*data_sizeof();
+	return element_size(numverts, numtris, numsteps, numcurves, numkeys)*data_sizeof(); /* number of elements times bytes per element */
 }
 
 bool Attribute::same_storage(TypeDesc a, TypeDesc b)
@@ -136,40 +179,65 @@ bool Attribute::same_storage(TypeDesc a, TypeDesc b)
 
 const char *Attribute::standard_name(AttributeStandard std)
 {
-	if(std == ATTR_STD_VERTEX_NORMAL)
-		return "N";
-	else if(std == ATTR_STD_FACE_NORMAL)
-		return "Ng";
-	else if(std == ATTR_STD_UV)
-		return "uv";
-	else if(std == ATTR_STD_GENERATED)
-		return "generated";
-	else if(std == ATTR_STD_UV_TANGENT)
-		return "tangent";
-	else if(std == ATTR_STD_UV_TANGENT_SIGN)
-		return "tangent_sign";
-	else if(std == ATTR_STD_POSITION_UNDEFORMED)
-		return "undeformed";
-	else if(std == ATTR_STD_POSITION_UNDISPLACED)
-		return "undisplaced";
-	else if(std == ATTR_STD_MOTION_PRE)
-		return "motion_pre";
-	else if(std == ATTR_STD_MOTION_POST)
-		return "motion_post";
-	else if(std == ATTR_STD_PARTICLE)
-		return "particle";
-	else if(std == ATTR_STD_CURVE_INTERCEPT)
-		return "curve_intercept";
-	else if(std == ATTR_STD_PTEX_FACE_ID)
-		return "ptex_face_id";
-	else if(std == ATTR_STD_PTEX_UV)
-		return "ptex_uv";
-	else if(std == ATTR_STD_GENERATED_TRANSFORM)
-		return "generated_transform";
+	switch(std) { /* maps each standard to the name string returned to callers */
+		case ATTR_STD_VERTEX_NORMAL:
+			return "N";
+		case ATTR_STD_FACE_NORMAL:
+			return "Ng";
+		case ATTR_STD_UV:
+			return "uv";
+		case ATTR_STD_GENERATED:
+			return "generated";
+		case ATTR_STD_GENERATED_TRANSFORM:
+			return "generated_transform";
+		case ATTR_STD_UV_TANGENT:
+			return "tangent";
+		case ATTR_STD_UV_TANGENT_SIGN:
+			return "tangent_sign";
+		case ATTR_STD_POSITION_UNDEFORMED:
+			return "undeformed";
+		case ATTR_STD_POSITION_UNDISPLACED:
+			return "undisplaced";
+		case ATTR_STD_MOTION_VERTEX_POSITION:
+			return "motion_P";
+		case ATTR_STD_MOTION_VERTEX_NORMAL:
+			return "motion_N";
+		case ATTR_STD_PARTICLE:
+			return "particle";
+		case ATTR_STD_CURVE_INTERCEPT:
+			return "curve_intercept";
+		case ATTR_STD_PTEX_FACE_ID:
+			return "ptex_face_id";
+		case ATTR_STD_PTEX_UV:
+			return "ptex_uv";
+		case ATTR_STD_VOLUME_DENSITY:
+			return "density";
+		case ATTR_STD_VOLUME_COLOR:
+			return "color";
+		case ATTR_STD_VOLUME_FLAME:
+			return "flame";
+		case ATTR_STD_VOLUME_HEAT:
+			return "heat";
+		case ATTR_STD_VOLUME_VELOCITY:
+			return "velocity";
+		case ATTR_STD_NOT_FOUND:
+		case ATTR_STD_NONE:
+		case ATTR_STD_NUM:
+			return ""; /* these three values map to the empty name */
+	}
 	
 	return "";
 }
 
+AttributeStandard Attribute::name_standard(const char *name)
+{
+	/* linear search for the first standard whose name equals name, ATTR_STD_NONE if there is none */
+	int index = ATTR_STD_NONE;
+	while(index < ATTR_STD_NUM && strcmp(name, standard_name((AttributeStandard)index)) != 0)
+		index++;
+	return (index < ATTR_STD_NUM)? (AttributeStandard)index: ATTR_STD_NONE;
+}
+
 /* Attribute Set */
 
 AttributeSet::AttributeSet()
@@ -182,7 +250,7 @@ AttributeSet::~AttributeSet()
 {
 }
 
-Attribute *AttributeSet::add(ustring name, TypeDesc type, AttributeElement element)
+Attribute *AttributeSet::add(ustring name, TypeDesc type, AttributeElement element, bool resize)
 {
 	Attribute *attr = find(name);
 
@@ -202,9 +270,9 @@ Attribute *AttributeSet::add(ustring name, TypeDesc type, AttributeElement eleme
 	
 	/* this is weak .. */
 	if(triangle_mesh)
-		attr->reserve(triangle_mesh->verts.size(), triangle_mesh->triangles.size(), 0, 0);
+		attr->reserve(triangle_mesh->verts.size(), triangle_mesh->triangles.size(), triangle_mesh->motion_steps, 0, 0, resize);
 	if(curve_mesh)
-		attr->reserve(0, 0, curve_mesh->curves.size(), curve_mesh->curve_keys.size());
+		attr->reserve(0, 0, curve_mesh->motion_steps, curve_mesh->curves.size(), curve_mesh->curve_keys.size(), resize);
 	
 	return attr;
 }
@@ -261,10 +329,14 @@ Attribute *AttributeSet::add(AttributeStandard std, ustring name)
 			case ATTR_STD_GENERATED:
 			case ATTR_STD_POSITION_UNDEFORMED:
 			case ATTR_STD_POSITION_UNDISPLACED:
-			case ATTR_STD_MOTION_PRE:
-			case ATTR_STD_MOTION_POST:
 				attr = add(name, TypeDesc::TypePoint, ATTR_ELEMENT_VERTEX);
 				break;
+			case ATTR_STD_MOTION_VERTEX_POSITION:
+				attr = add(name, TypeDesc::TypePoint, ATTR_ELEMENT_VERTEX_MOTION);
+				break;
+			case ATTR_STD_MOTION_VERTEX_NORMAL:
+				attr = add(name, TypeDesc::TypeNormal, ATTR_ELEMENT_VERTEX_MOTION);
+				break;
 			case ATTR_STD_PTEX_FACE_ID:
 				attr = add(name, TypeDesc::TypeFloat, ATTR_ELEMENT_FACE);
 				break;
@@ -274,6 +346,17 @@ Attribute *AttributeSet::add(AttributeStandard std, ustring name)
 			case ATTR_STD_GENERATED_TRANSFORM:
 				attr = add(name, TypeDesc::TypeMatrix, ATTR_ELEMENT_MESH);
 				break;
+			case ATTR_STD_VOLUME_DENSITY:
+			case ATTR_STD_VOLUME_FLAME:
+			case ATTR_STD_VOLUME_HEAT:
+				attr = add(name, TypeDesc::TypeFloat, ATTR_ELEMENT_VOXEL);
+				break;
+			case ATTR_STD_VOLUME_COLOR:
+				attr = add(name, TypeDesc::TypeColor, ATTR_ELEMENT_VOXEL);
+				break;
+			case ATTR_STD_VOLUME_VELOCITY:
+				attr = add(name, TypeDesc::TypeVector, ATTR_ELEMENT_VOXEL);
+				break;
 			default:
 				assert(0);
 				break;
@@ -285,9 +368,8 @@ Attribute *AttributeSet::add(AttributeStandard std, ustring name)
 			case ATTR_STD_GENERATED:
 				attr = add(name, TypeDesc::TypePoint, ATTR_ELEMENT_CURVE);
 				break;
-			case ATTR_STD_MOTION_PRE:
-			case ATTR_STD_MOTION_POST:
-				attr = add(name, TypeDesc::TypePoint, ATTR_ELEMENT_CURVE_KEY);
+			case ATTR_STD_MOTION_VERTEX_POSITION:
+				attr = add(name, TypeDesc::TypePoint, ATTR_ELEMENT_CURVE_KEY_MOTION);
 				break;
 			case ATTR_STD_CURVE_INTERCEPT:
 				attr = add(name, TypeDesc::TypeFloat, ATTR_ELEMENT_CURVE_KEY);
@@ -343,9 +425,9 @@ void AttributeSet::reserve()
 {
 	foreach(Attribute& attr, attributes) {
 		if(triangle_mesh)
-			attr.reserve(triangle_mesh->verts.size(), triangle_mesh->triangles.size(), 0, 0);
+			attr.reserve(triangle_mesh->verts.size(), triangle_mesh->triangles.size(), triangle_mesh->motion_steps, 0, 0, true);
 		if(curve_mesh)
-			attr.reserve(0, 0, curve_mesh->curves.size(), curve_mesh->curve_keys.size());
+			attr.reserve(0, 0, curve_mesh->motion_steps, curve_mesh->curves.size(), curve_mesh->curve_keys.size(), true); /* real step count, as in add(): *_MOTION elements are sized with (numsteps - 1) */
 	}
 }
 
diff --git a/intern/cycles/render/attribute.h b/intern/cycles/render/attribute.h
index 0b8905ae5a3..9fc32db8444 100644
--- a/intern/cycles/render/attribute.h
+++ b/intern/cycles/render/attribute.h
@@ -27,12 +27,20 @@
 CCL_NAMESPACE_BEGIN
 
 class Attribute;
-class AttributeSet;
 class AttributeRequest;
 class AttributeRequestSet;
+class AttributeSet;
+class ImageManager;
 class Mesh;
 struct Transform;
 
+/* Attributes for voxels are images: an ATTR_ELEMENT_VOXEL attribute stores one of these records in its buffer */
+
+struct VoxelAttribute {
+	ImageManager *manager; /* manager that Attribute's destructor calls remove_image() on */
+	int slot; /* slot handed to ImageManager::remove_image() */
+};
+
 /* Attribute
  *
  * Arbitrary data layers on meshes.
@@ -48,29 +56,37 @@ public:
 	AttributeElement element;
 
 	Attribute() {}
+	~Attribute();
 	void set(ustring name, TypeDesc type, AttributeElement element);
-	void reserve(int numverts, int numfaces, int numcurves, int numkeys);
+	void reserve(int numverts, int numfaces, int numsteps, int numcurves, int numkeys, bool resize);
 
 	size_t data_sizeof() const;
-	size_t element_size(int numverts, int numfaces, int numcurves, int numkeys) const;
-	size_t buffer_size(int numverts, int numfaces, int numcurves, int numkeys) const;
+	size_t element_size(int numverts, int numfaces, int numsteps, int numcurves, int numkeys) const;
+	size_t buffer_size(int numverts, int numfaces, int numsteps, int numcurves, int numkeys) const;
 
 	char *data() { return (buffer.size())? &buffer[0]: NULL; };
 	float3 *data_float3() { return (float3*)data(); }
+	float4 *data_float4() { return (float4*)data(); }
 	float *data_float() { return (float*)data(); }
 	Transform *data_transform() { return (Transform*)data(); }
+	VoxelAttribute *data_voxel()  { return ( VoxelAttribute*)data(); }
 
 	const char *data() const { return (buffer.size())? &buffer[0]: NULL; }
 	const float3 *data_float3() const { return (const float3*)data(); }
+	const float4 *data_float4() const { return (const float4*)data(); }
 	const float *data_float() const { return (const float*)data(); }
 	const Transform *data_transform() const { return (const Transform*)data(); }
+	const VoxelAttribute *data_voxel() const { return (const VoxelAttribute*)data(); }
 
 	void add(const float& f);
 	void add(const float3& f);
 	void add(const Transform& f);
+	void add(const VoxelAttribute& f);
+	void add(const char *data);
 
 	static bool same_storage(TypeDesc a, TypeDesc b);
 	static const char *standard_name(AttributeStandard std);
+	static AttributeStandard name_standard(const char *name);
 };
 
 /* Attribute Set
@@ -86,7 +102,7 @@ public:
 	AttributeSet();
 	~AttributeSet();
 
-	Attribute *add(ustring name, TypeDesc type, AttributeElement element);
+	Attribute *add(ustring name, TypeDesc type, AttributeElement element, bool resize = true);
 	Attribute *find(ustring name) const;
 	void remove(ustring name);
 
diff --git a/intern/cycles/render/background.cpp b/intern/cycles/render/background.cpp
index c9c66dad3fe..a877c52fbed 100644
--- a/intern/cycles/render/background.cpp
+++ b/intern/cycles/render/background.cpp
@@ -35,7 +35,7 @@ Background::Background()
 
 	use = true;
 
-	visibility = ~0;
+	visibility = PATH_RAY_ALL_VISIBILITY;
 	shader = 0;
 
 	transparent = false;
@@ -70,7 +70,7 @@ void Background::device_update(Device *device, DeviceScene *dscene, Scene *scene
 	if(scene->shaders[shader]->has_volume)
 		kbackground->volume_shader = kbackground->surface_shader;
 	else
-		kbackground->volume_shader = SHADER_NO_ID;
+		kbackground->volume_shader = SHADER_NONE;
 
 	if(!(visibility & PATH_RAY_DIFFUSE))
 		kbackground->surface_shader |= SHADER_EXCLUDE_DIFFUSE;
diff --git a/intern/cycles/render/bake.cpp b/intern/cycles/render/bake.cpp
new file mode 100644
index 00000000000..aa317ab672f
--- /dev/null
+++ b/intern/cycles/render/bake.cpp
@@ -0,0 +1,206 @@
+/*
+ * Copyright 2011-2014 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License
+ */
+
+#include "bake.h"
+
+CCL_NAMESPACE_BEGIN
+
+BakeData::BakeData(const int object, const int tri_offset, const int num_pixels)
+: m_object(object), m_tri_offset(tri_offset), m_num_pixels(num_pixels)
+{
+	/* every per-pixel array gets one entry per pixel; set() fills them in afterwards */
+	const size_t count = num_pixels;
+	m_primitive.resize(count);
+	m_u.resize(count);
+	m_v.resize(count);
+	m_dudx.resize(count);
+	m_dudy.resize(count);
+	m_dvdx.resize(count);
+	m_dvdy.resize(count);
+}
+
+BakeData::~BakeData()
+{
+	/* Intentionally empty.
+	 *
+	 * All per-pixel storage lives in vector members, which release their
+	 * memory in their own destructors right after this body runs, so
+	 * clearing them by hand here would only repeat that work. The
+	 * out-of-line definition is kept because bake.h declares the
+	 * destructor. */
+}
+
+void BakeData::set(int i, int prim, float uv[2], float dudx, float dudy, float dvdx, float dvdy)
+{
+	m_primitive[i] = (prim != -1)? m_tri_offset + prim: -1; /* -1 is stored as is, see is_valid() */
+	m_dudx[i] = dudx;
+	m_dudy[i] = dudy;
+	m_dvdx[i] = dvdx;
+	m_dvdy[i] = dvdy;
+	m_u[i] = uv[0];
+	m_v[i] = uv[1];
+}
+
+int BakeData::object()
+{
+	return m_object; /* object value passed to the constructor */
+}
+
+int BakeData::size()
+{
+	return m_num_pixels; /* pixel count passed to the constructor, which sized every per-pixel array with it */
+}
+
+bool BakeData::is_valid(int i)
+{
+	return m_primitive[i] != -1; /* set() stores -1 when it is given prim == -1 */
+}
+
+uint4 BakeData::data(int i)
+{
+	/* object, primitive of pixel i, then its u and v passed through __float_as_int() */
+	const int u_bits = __float_as_int(m_u[i]);
+	const int v_bits = __float_as_int(m_v[i]);
+	const int prim = m_primitive[i];
+
+	return make_uint4(m_object, prim, u_bits, v_bits);
+}
+
+uint4 BakeData::differentials(int i)
+{
+	/* the four uv derivatives stored for pixel i, each passed through __float_as_int() */
+	const int dudx = __float_as_int(m_dudx[i]);
+	const int dudy = __float_as_int(m_dudy[i]);
+	const int dvdx = __float_as_int(m_dvdx[i]);
+	const int dvdy = __float_as_int(m_dvdy[i]);
+	return make_uint4(dudx, dudy, dvdx, dvdy);
+}
+
+BakeManager::BakeManager()
+: need_update(true), m_bake_data(NULL), m_is_baking(false)
+{
+	/* initial state: not baking, no bake data allocated yet, and flagged
+	 * as needing an update */
+}
+
+BakeManager::~BakeManager()
+{
+	/* deleting a NULL pointer is a no-op, so no check is needed */
+	delete m_bake_data;
+}
+
+bool BakeManager::get_baking()
+{
+	return m_is_baking; /* value last given to set_baking(); bake() clears it when it finishes or is cancelled */
+}
+
+void BakeManager::set_baking(const bool value)
+{
+	m_is_baking = value; /* bake() sets this back to false when it finishes or is cancelled */
+}
+
+BakeData *BakeManager::init(const int object, const int tri_offset, const int num_pixels)
+{
+	delete m_bake_data; /* free the data of an earlier init() instead of leaking it; no-op for NULL */
+	return m_bake_data = new BakeData(object, tri_offset, num_pixels); /* owned by the manager */
+}
+
+bool BakeManager::bake(Device *device, DeviceScene *dscene, Scene *scene, Progress& progress, ShaderEvalType shader_type, BakeData *bake_data, float result[])
+{
+	/* Runs a shader task of type shader_type over every pixel of bake_data and stores four floats per
+	 * valid pixel in result. Returns false, with result untouched, if there is no pixel or on cancel. */
+	size_t limit = bake_data->size();
+
+	/* setup input for device task */
+	device_vector<uint4> d_input;
+	uint4 *d_input_data = d_input.resize(limit * 2);
+	size_t d_input_size = 0;
+
+	for(size_t i = 0; i < limit; i++) {
+		d_input_data[d_input_size++] = bake_data->data(i);
+		d_input_data[d_input_size++] = bake_data->differentials(i);
+	}
+
+	/* nothing to bake: still leave the baking state, like the other exits below */
+	if(d_input_size == 0) {
+		m_is_baking = false;
+		return false;
+	}
+
+	/* run device task */
+	device_vector<float4> d_output;
+	d_output.resize(limit);
+
+	/* needs to be up to date for attribute access */
+	device->const_copy_to("__data", &dscene->data, sizeof(dscene->data));
+
+	device->mem_alloc(d_input, MEM_READ_ONLY);
+	device->mem_copy_to(d_input);
+	device->mem_alloc(d_output, MEM_WRITE_ONLY);
+
+	DeviceTask task(DeviceTask::SHADER);
+	task.shader_input = d_input.device_pointer;
+	task.shader_output = d_output.device_pointer;
+	task.shader_eval_type = shader_type;
+	task.shader_x = 0;
+	task.shader_w = d_output.size();
+	task.get_cancel = function_bind(&Progress::get_cancel, &progress);
+
+	device->task_add(task);
+	device->task_wait();
+
+	if(progress.get_cancel()) {
+		device->mem_free(d_input);
+		device->mem_free(d_output);
+		m_is_baking = false;
+		return false;
+	}
+
+	device->mem_copy_from(d_output, 0, 1, d_output.size(), sizeof(float4));
+	device->mem_free(d_input);
+	device->mem_free(d_output);
+
+	/* read result: entries of pixels that are not valid are left as they were */
+	float4 *offset = (float4*)d_output.data_pointer;
+	size_t depth = 4;
+
+	for(size_t i = 0; i < limit; i++) {
+		if(!bake_data->is_valid(i))
+			continue;
+		float4 out = offset[i];
+		for(size_t j = 0; j < depth; j++)
+			result[i * depth + j] = out[j];
+	}
+
+	m_is_baking = false;
+	return true;
+}
+
+void BakeManager::device_update(Device *device, DeviceScene *dscene, Scene *scene, Progress& progress)
+{
+	/* Only bookkeeping happens here: the update flag is cleared, unless
+	 * the update was cancelled, in which case it stays set so a later
+	 * call sees it again. Cancellation is only queried while an update
+	 * is pending. */
+	if(need_update && !progress.get_cancel())
+		need_update = false;
+}
+
+void BakeManager::device_free(Device *device, DeviceScene *dscene)
+{ /* empty: bake() frees the device memory it allocates before it returns */
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/render/bake.h b/intern/cycles/render/bake.h
new file mode 100644
index 00000000000..ea403f7d39a
--- /dev/null
+++ b/intern/cycles/render/bake.h
@@ -0,0 +1,77 @@
+/*
+ * Copyright 2011-2014 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License
+ */
+
+#ifndef __BAKE_H__
+#define __BAKE_H__
+
+#include "util_vector.h"
+#include "device.h"
+#include "scene.h"
+#include "session.h"
+
+CCL_NAMESPACE_BEGIN
+
+class BakeData { /* per-pixel input of a bake: primitive, uv and uv derivatives, for one object */
+public:
+	BakeData(const int object, const int tri_offset, const int num_pixels);
+	~BakeData();
+
+	void set(int i, int prim, float uv[2], float dudx, float dudy, float dvdx, float dvdy); /* fills pixel i; prim is offset by tri_offset unless it is -1 */
+	int object();
+	int size(); /* num_pixels given to the constructor */
+	uint4 data(int i); /* (object, primitive, u, v) of pixel i, u and v via __float_as_int() */
+	uint4 differentials(int i); /* (dudx, dudy, dvdx, dvdy) of pixel i via __float_as_int() */
+	bool is_valid(int i); /* false when the stored primitive of pixel i is -1 */
+
+private:
+	int m_object;
+	int m_tri_offset; /* added to prim in set() */
+	int m_num_pixels;
+	vector<int>m_primitive; /* one entry per pixel, -1 when set() was given prim == -1 */
+	vector<float>m_u;
+	vector<float>m_v;
+	vector<float>m_dudx;
+	vector<float>m_dudy;
+	vector<float>m_dvdx;
+	vector<float>m_dvdy;
+};
+
+class BakeManager { /* creates the BakeData for a bake and runs it as a device shader task */
+public:
+	BakeManager();
+	~BakeManager(); /* deletes the BakeData created by init() */
+
+	bool get_baking();
+	void set_baking(const bool value);
+
+	BakeData *init(const int object, const int tri_offset, const int num_pixels); /* returned object stays owned by the manager */
+
+	bool bake(Device *device, DeviceScene *dscene, Scene *scene, Progress& progress, ShaderEvalType shader_type, BakeData *bake_data, float result[]); /* writes 4 floats per valid pixel into result; false if nothing was baked or on cancel */
+
+	void device_update(Device *device, DeviceScene *dscene, Scene *scene, Progress& progress); /* clears need_update unless cancelled */
+	void device_free(Device *device, DeviceScene *dscene);
+
+	bool need_update; /* true after construction, cleared by device_update() */
+
+private:
+	BakeData *m_bake_data; /* NULL until init() is called */
+	bool m_is_baking; /* see get_baking() / set_baking(); bake() clears it when it finishes or is cancelled */
+};
+
+CCL_NAMESPACE_END
+
+#endif /* __BAKE_H__ */
+
diff --git a/intern/cycles/render/blackbody.cpp b/intern/cycles/render/blackbody.cpp
index ab61886e262..89af714e8ec 100644
--- a/intern/cycles/render/blackbody.cpp
+++ b/intern/cycles/render/blackbody.cpp
@@ -59,33 +59,33 @@ vector<float> blackbody_table()
 	*/
 
 	const float cie_colour_match[81][3] = {
-		{0.0014,0.0000,0.0065}, {0.0022,0.0001,0.0105}, {0.0042,0.0001,0.0201},
-		{0.0076,0.0002,0.0362}, {0.0143,0.0004,0.0679}, {0.0232,0.0006,0.1102},
-		{0.0435,0.0012,0.2074}, {0.0776,0.0022,0.3713}, {0.1344,0.0040,0.6456},
-		{0.2148,0.0073,1.0391}, {0.2839,0.0116,1.3856}, {0.3285,0.0168,1.6230},
-		{0.3483,0.0230,1.7471}, {0.3481,0.0298,1.7826}, {0.3362,0.0380,1.7721},
-		{0.3187,0.0480,1.7441}, {0.2908,0.0600,1.6692}, {0.2511,0.0739,1.5281},
-		{0.1954,0.0910,1.2876}, {0.1421,0.1126,1.0419}, {0.0956,0.1390,0.8130},
-		{0.0580,0.1693,0.6162}, {0.0320,0.2080,0.4652}, {0.0147,0.2586,0.3533},
-		{0.0049,0.3230,0.2720}, {0.0024,0.4073,0.2123}, {0.0093,0.5030,0.1582},
-		{0.0291,0.6082,0.1117}, {0.0633,0.7100,0.0782}, {0.1096,0.7932,0.0573},
-		{0.1655,0.8620,0.0422}, {0.2257,0.9149,0.0298}, {0.2904,0.9540,0.0203},
-		{0.3597,0.9803,0.0134}, {0.4334,0.9950,0.0087}, {0.5121,1.0000,0.0057},
-		{0.5945,0.9950,0.0039}, {0.6784,0.9786,0.0027}, {0.7621,0.9520,0.0021},
-		{0.8425,0.9154,0.0018}, {0.9163,0.8700,0.0017}, {0.9786,0.8163,0.0014},
-		{1.0263,0.7570,0.0011}, {1.0567,0.6949,0.0010}, {1.0622,0.6310,0.0008},
-		{1.0456,0.5668,0.0006}, {1.0026,0.5030,0.0003}, {0.9384,0.4412,0.0002},
-		{0.8544,0.3810,0.0002}, {0.7514,0.3210,0.0001}, {0.6424,0.2650,0.0000},
-		{0.5419,0.2170,0.0000}, {0.4479,0.1750,0.0000}, {0.3608,0.1382,0.0000},
-		{0.2835,0.1070,0.0000}, {0.2187,0.0816,0.0000}, {0.1649,0.0610,0.0000},
-		{0.1212,0.0446,0.0000}, {0.0874,0.0320,0.0000}, {0.0636,0.0232,0.0000},
-		{0.0468,0.0170,0.0000}, {0.0329,0.0119,0.0000}, {0.0227,0.0082,0.0000},
-		{0.0158,0.0057,0.0000}, {0.0114,0.0041,0.0000}, {0.0081,0.0029,0.0000},
-		{0.0058,0.0021,0.0000}, {0.0041,0.0015,0.0000}, {0.0029,0.0010,0.0000},
-		{0.0020,0.0007,0.0000}, {0.0014,0.0005,0.0000}, {0.0010,0.0004,0.0000},
-		{0.0007,0.0002,0.0000}, {0.0005,0.0002,0.0000}, {0.0003,0.0001,0.0000},
-		{0.0002,0.0001,0.0000}, {0.0002,0.0001,0.0000}, {0.0001,0.0000,0.0000},
-		{0.0001,0.0000,0.0000}, {0.0001,0.0000,0.0000}, {0.0000,0.0000,0.0000}
+		{0.0014f,0.0000f,0.0065f}, {0.0022f,0.0001f,0.0105f}, {0.0042f,0.0001f,0.0201f},
+		{0.0076f,0.0002f,0.0362f}, {0.0143f,0.0004f,0.0679f}, {0.0232f,0.0006f,0.1102f},
+		{0.0435f,0.0012f,0.2074f}, {0.0776f,0.0022f,0.3713f}, {0.1344f,0.0040f,0.6456f},
+		{0.2148f,0.0073f,1.0391f}, {0.2839f,0.0116f,1.3856f}, {0.3285f,0.0168f,1.6230f},
+		{0.3483f,0.0230f,1.7471f}, {0.3481f,0.0298f,1.7826f}, {0.3362f,0.0380f,1.7721f},
+		{0.3187f,0.0480f,1.7441f}, {0.2908f,0.0600f,1.6692f}, {0.2511f,0.0739f,1.5281f},
+		{0.1954f,0.0910f,1.2876f}, {0.1421f,0.1126f,1.0419f}, {0.0956f,0.1390f,0.8130f},
+		{0.0580f,0.1693f,0.6162f}, {0.0320f,0.2080f,0.4652f}, {0.0147f,0.2586f,0.3533f},
+		{0.0049f,0.3230f,0.2720f}, {0.0024f,0.4073f,0.2123f}, {0.0093f,0.5030f,0.1582f},
+		{0.0291f,0.6082f,0.1117f}, {0.0633f,0.7100f,0.0782f}, {0.1096f,0.7932f,0.0573f},
+		{0.1655f,0.8620f,0.0422f}, {0.2257f,0.9149f,0.0298f}, {0.2904f,0.9540f,0.0203f},
+		{0.3597f,0.9803f,0.0134f}, {0.4334f,0.9950f,0.0087f}, {0.5121f,1.0000f,0.0057f},
+		{0.5945f,0.9950f,0.0039f}, {0.6784f,0.9786f,0.0027f}, {0.7621f,0.9520f,0.0021f},
+		{0.8425f,0.9154f,0.0018f}, {0.9163f,0.8700f,0.0017f}, {0.9786f,0.8163f,0.0014f},
+		{1.0263f,0.7570f,0.0011f}, {1.0567f,0.6949f,0.0010f}, {1.0622f,0.6310f,0.0008f},
+		{1.0456f,0.5668f,0.0006f}, {1.0026f,0.5030f,0.0003f}, {0.9384f,0.4412f,0.0002f},
+		{0.8544f,0.3810f,0.0002f}, {0.7514f,0.3210f,0.0001f}, {0.6424f,0.2650f,0.0000f},
+		{0.5419f,0.2170f,0.0000f}, {0.4479f,0.1750f,0.0000f}, {0.3608f,0.1382f,0.0000f},
+		{0.2835f,0.1070f,0.0000f}, {0.2187f,0.0816f,0.0000f}, {0.1649f,0.0610f,0.0000f},
+		{0.1212f,0.0446f,0.0000f}, {0.0874f,0.0320f,0.0000f}, {0.0636f,0.0232f,0.0000f},
+		{0.0468f,0.0170f,0.0000f}, {0.0329f,0.0119f,0.0000f}, {0.0227f,0.0082f,0.0000f},
+		{0.0158f,0.0057f,0.0000f}, {0.0114f,0.0041f,0.0000f}, {0.0081f,0.0029f,0.0000f},
+		{0.0058f,0.0021f,0.0000f}, {0.0041f,0.0015f,0.0000f}, {0.0029f,0.0010f,0.0000f},
+		{0.0020f,0.0007f,0.0000f}, {0.0014f,0.0005f,0.0000f}, {0.0010f,0.0004f,0.0000f},
+		{0.0007f,0.0002f,0.0000f}, {0.0005f,0.0002f,0.0000f}, {0.0003f,0.0001f,0.0000f},
+		{0.0002f,0.0001f,0.0000f}, {0.0002f,0.0001f,0.0000f}, {0.0001f,0.0000f,0.0000f},
+		{0.0001f,0.0000f,0.0000f}, {0.0001f,0.0000f,0.0000f}, {0.0000f,0.0000f,0.0000f}
 	};
 
 	const double c1 = 3.74183e-16; // 2*pi*h*c^2, W*m^2
diff --git a/intern/cycles/render/buffers.cpp b/intern/cycles/render/buffers.cpp
index da1b7484b77..fc65922fc87 100644
--- a/intern/cycles/render/buffers.cpp
+++ b/intern/cycles/render/buffers.cpp
@@ -358,14 +358,14 @@ void DisplayBuffer::draw_set(int width, int height)
 	draw_height = height;
 }
 
-void DisplayBuffer::draw(Device *device)
+void DisplayBuffer::draw(Device *device, const DeviceDrawParams& draw_params)
 {
 	if(draw_width != 0 && draw_height != 0) {
 		glPushMatrix();
 		glTranslatef(params.full_x, params.full_y, 0.0f);
 		device_memory& rgba = rgba_data();
 
-		device->draw_pixels(rgba, 0, draw_width, draw_height, 0, params.width, params.height, transparent);
+		device->draw_pixels(rgba, 0, draw_width, draw_height, 0, params.width, params.height, transparent, draw_params);
 
 		glPopMatrix();
 	}
diff --git a/intern/cycles/render/buffers.h b/intern/cycles/render/buffers.h
index 81eaf41077f..27ab20bbafd 100644
--- a/intern/cycles/render/buffers.h
+++ b/intern/cycles/render/buffers.h
@@ -31,6 +31,7 @@
 CCL_NAMESPACE_BEGIN
 
 class Device;
+struct DeviceDrawParams;
 struct float4;
 
 /* Buffer Parameters
@@ -114,7 +115,7 @@ public:
 	void write(Device *device, const string& filename);
 
 	void draw_set(int width, int height);
-	void draw(Device *device);
+	void draw(Device *device, const DeviceDrawParams& draw_params);
 	bool draw_ready();
 
 	device_memory& rgba_data();
diff --git a/intern/cycles/render/camera.cpp b/intern/cycles/render/camera.cpp
index edf7f7fb09d..8659fe4f7a3 100644
--- a/intern/cycles/render/camera.cpp
+++ b/intern/cycles/render/camera.cpp
@@ -44,8 +44,8 @@ Camera::Camera()
 	fisheye_lens = 10.5f;
 	fov = M_PI_4_F;
 
-	sensorwidth = 0.036;
-	sensorheight = 0.024;
+	sensorwidth = 0.036f;
+	sensorheight = 0.024f;
 
 	nearclip = 1e-5f;
 	farclip = 1e5f;
@@ -78,6 +78,24 @@ Camera::~Camera()
 {
 }
 
+void Camera::compute_auto_viewplane()
+{
+	/* fit the viewplane to the image: the shorter side spans [-1, 1] and the
+	 * longer side is stretched by the aspect ratio. width == height counts
+	 * as landscape and gives [-1, 1] on both axes */
+	const float aspect = (float)width/(float)height;
+	const bool landscape = (width >= height);
+
+	/* half extents of the plane along x and y */
+	const float half_x = landscape? aspect: 1.0f;
+	const float half_y = landscape? 1.0f: 1.0f/aspect;
+
+	viewplane.left = -half_x;
+	viewplane.right = half_x;
+	viewplane.bottom = -half_y;
+	viewplane.top = half_y;
+}
+
 void Camera::update()
 {
 	if(!need_update)
diff --git a/intern/cycles/render/camera.h b/intern/cycles/render/camera.h
index 4e8f3d72111..c28670bc55f 100644
--- a/intern/cycles/render/camera.h
+++ b/intern/cycles/render/camera.h
@@ -102,6 +102,8 @@ public:
 	/* functions */
 	Camera();
 	~Camera();
+	
+	void compute_auto_viewplane();
 
 	void update();
 
diff --git a/intern/cycles/render/curves.cpp b/intern/cycles/render/curves.cpp
index 6e6b11ca92f..2c96ffa655e 100644
--- a/intern/cycles/render/curves.cpp
+++ b/intern/cycles/render/curves.cpp
@@ -110,7 +110,7 @@ void CurveSystemManager::device_update(Device *device, DeviceScene *dscene, Scen
 
 	progress.set_status("Updating Hair settings", "Copying Hair settings to device");
 
-	KernelCurves *kcurve= &dscene->data.curve;
+	KernelCurves *kcurve = &dscene->data.curve;
 
 	kcurve->curveflags = 0;
 
diff --git a/intern/cycles/render/film.cpp b/intern/cycles/render/film.cpp
index 30ad86a8d4c..c1aefbcfbbc 100644
--- a/intern/cycles/render/film.cpp
+++ b/intern/cycles/render/film.cpp
@@ -155,6 +155,9 @@ void Pass::add(PassType type, vector<Pass>& passes)
 			pass.components = 4;
 			pass.exposure = false;
 			break;
+		case PASS_LIGHT:
+			/* ignored: no pass settings are changed in this case */
+			break;
 	}
 
 	passes.push_back(pass);
@@ -393,6 +396,10 @@ void Film::device_update(Device *device, DeviceScene *dscene, Scene *scene)
 				kfilm->pass_shadow = kfilm->pass_stride;
 				kfilm->use_light_pass = 1;
 				break;
+
+			case PASS_LIGHT:
+				kfilm->use_light_pass = 1;
+				break;
 			case PASS_NONE:
 				break;
 		}
diff --git a/intern/cycles/render/graph.cpp b/intern/cycles/render/graph.cpp
index 9142eb5308c..0ff904d06e7 100644
--- a/intern/cycles/render/graph.cpp
+++ b/intern/cycles/render/graph.cpp
@@ -227,7 +227,7 @@ void ShaderGraph::disconnect(ShaderInput *to)
 	from->links.erase(remove(from->links.begin(), from->links.end(), to), from->links.end());
 }
 
-void ShaderGraph::finalize(bool do_bump, bool do_osl, bool do_multi_transform)
+void ShaderGraph::finalize(bool do_bump, bool do_osl)
 {
 	/* before compiling, the shader graph may undergo a number of modifications.
 	 * currently we set default geometry shader inputs, and create automatic bump
@@ -242,17 +242,15 @@ void ShaderGraph::finalize(bool do_bump, bool do_osl, bool do_multi_transform)
 		if(do_bump)
 			bump_from_displacement();
 
-		if(do_multi_transform) {
-			ShaderInput *surface_in = output()->input("Surface");
-			ShaderInput *volume_in = output()->input("Volume");
+		ShaderInput *surface_in = output()->input("Surface");
+		ShaderInput *volume_in = output()->input("Volume");
 
-			/* todo: make this work when surface and volume closures are tangled up */
+		/* todo: make this work when surface and volume closures are tangled up */
 
-			if(surface_in->link)
-				transform_multi_closure(surface_in->link->parent, NULL, false);
-			if(volume_in->link)
-				transform_multi_closure(volume_in->link->parent, NULL, true);
-		}
+		if(surface_in->link)
+			transform_multi_closure(surface_in->link->parent, NULL, false);
+		if(volume_in->link)
+			transform_multi_closure(volume_in->link->parent, NULL, true);
 
 		finalized = true;
 	}
diff --git a/intern/cycles/render/graph.h b/intern/cycles/render/graph.h
index f31e2103229..89a066195d6 100644
--- a/intern/cycles/render/graph.h
+++ b/intern/cycles/render/graph.h
@@ -193,6 +193,7 @@ public:
 	virtual bool has_surface_bssrdf() { return false; }
 	virtual bool has_converter_blackbody() { return false; }
 	virtual bool has_bssrdf_bump() { return false; }
+	virtual bool has_spatial_varying() { return false; }
 
 	vector<ShaderInput*> inputs;
 	vector<ShaderOutput*> outputs;
@@ -246,7 +247,7 @@ public:
 	void disconnect(ShaderInput *to);
 
 	void remove_unneeded_nodes();
-	void finalize(bool do_bump = false, bool do_osl = false, bool do_multi_closure = false);
+	void finalize(bool do_bump = false, bool do_osl = false);
 
 protected:
 	typedef pair<ShaderNode* const, ShaderNode*> NodePair;
diff --git a/intern/cycles/render/image.cpp b/intern/cycles/render/image.cpp
index 91aae6f3ec3..86755badc42 100644
--- a/intern/cycles/render/image.cpp
+++ b/intern/cycles/render/image.cpp
@@ -59,11 +59,16 @@ void ImageManager::set_osl_texture_system(void *texture_system)
 	osl_texture_system = texture_system;
 }
 
-void ImageManager::set_extended_image_limits(void)
+void ImageManager::set_extended_image_limits(const DeviceInfo& info) /* switch the image slot limits to the extended values for the given device */
 {
-	tex_num_images = TEX_EXTENDED_NUM_IMAGES;
-	tex_num_float_images = TEX_EXTENDED_NUM_FLOAT_IMAGES;
-	tex_image_byte_start = TEX_EXTENDED_IMAGE_BYTE_START;
+	if(info.type == DEVICE_CPU) { /* CPU: extended byte and float limits, byte slots start after the float slots */
+		tex_num_images = TEX_EXTENDED_NUM_IMAGES_CPU;
+		tex_num_float_images = TEX_EXTENDED_NUM_FLOAT_IMAGES;
+		tex_image_byte_start = TEX_EXTENDED_IMAGE_BYTE_START;
+	}
+	else if((info.type == DEVICE_CUDA || info.type == DEVICE_MULTI) && info.extended_images) {
+		tex_num_images = TEX_EXTENDED_NUM_IMAGES_GPU; /* only the byte image limit changes; float limit and byte start keep their current values */
+	} /* any other device: all limits are left untouched */
 }
 
 bool ImageManager::set_animation_frame_update(int frame)
@@ -90,8 +95,8 @@ bool ImageManager::is_float_image(const string& filename, void *builtin_data, bo
 
 	if(builtin_data) {
 		if(builtin_image_info_cb) {
-			int width, height, channels;
-			builtin_image_info_cb(filename, builtin_data, is_float, width, height, channels);
+			int width, height, depth, channels;
+			builtin_image_info_cb(filename, builtin_data, is_float, width, height, depth, channels);
 		}
 
 		if(is_float)
@@ -145,7 +150,14 @@ bool ImageManager::is_float_image(const string& filename, void *builtin_data, bo
 	return is_float;
 }
 
-int ImageManager::add_image(const string& filename, void *builtin_data, bool animated, bool& is_float, bool& is_linear)
+static bool image_equals(ImageManager::Image *image, const string& filename, void *builtin_data, InterpolationType interpolation)
+{
+	/* an existing entry only matches when file name, builtin data and interpolation are all the same */
+	const bool same_settings = (image->builtin_data == builtin_data && image->interpolation == interpolation);
+	return same_settings && image->filename == filename;
+}
+
+int ImageManager::add_image(const string& filename, void *builtin_data, bool animated, bool& is_float, bool& is_linear, InterpolationType interpolation, bool use_alpha)
 {
 	Image *img;
 	size_t slot;
@@ -156,7 +168,7 @@ int ImageManager::add_image(const string& filename, void *builtin_data, bool ani
 	if(is_float) {
 		/* find existing image */
 		for(slot = 0; slot < float_images.size(); slot++) {
-			if(float_images[slot] && float_images[slot]->filename == filename) {
+			if(float_images[slot] && image_equals(float_images[slot], filename, builtin_data, interpolation)) {
 				float_images[slot]->users++;
 				return slot;
 			}
@@ -185,13 +197,15 @@ int ImageManager::add_image(const string& filename, void *builtin_data, bool ani
 		img->builtin_data = builtin_data;
 		img->need_load = true;
 		img->animated = animated;
+		img->interpolation = interpolation;
 		img->users = 1;
+		img->use_alpha = use_alpha;
 
 		float_images[slot] = img;
 	}
 	else {
 		for(slot = 0; slot < images.size(); slot++) {
-			if(images[slot] && images[slot]->filename == filename) {
+			if(images[slot] && image_equals(images[slot], filename, builtin_data, interpolation)) {
 				images[slot]->users++;
 				return slot+tex_image_byte_start;
 			}
@@ -220,7 +234,9 @@ int ImageManager::add_image(const string& filename, void *builtin_data, bool ani
 		img->builtin_data = builtin_data;
 		img->need_load = true;
 		img->animated = animated;
+		img->interpolation = interpolation;
 		img->users = 1;
+		img->use_alpha = use_alpha;
 
 		images[slot] = img;
 
@@ -231,22 +247,43 @@ int ImageManager::add_image(const string& filename, void *builtin_data, bool ani
 	return slot;
 }
 
-void ImageManager::remove_image(const string& filename, void *builtin_data)
+void ImageManager::remove_image(int slot)
 {
-	size_t slot;
+	/* release one user of the image in the given slot. slots are the values
+	 * returned by add_image(): byte images are offset by tex_image_byte_start */
 
-	for(slot = 0; slot < images.size(); slot++) {
-		if(images[slot] && images[slot]->filename == filename && images[slot]->builtin_data == builtin_data) {
-			/* decrement user count */
-			images[slot]->users--;
-			assert(images[slot]->users >= 0);
+	/* slots below tex_image_byte_start are treated as float images, the
+	 * slots from there on as byte images; pick the list and make the slot relative to it */
+	const bool is_byte = (slot >= tex_image_byte_start);
+	vector<Image*>& slots = is_byte ? images : float_images;
+	if(is_byte)
+		slot -= tex_image_byte_start;
+
+	/* apply the same sanity checks to both image kinds: the slot has to be
+	 * inside the list and still hold an image, otherwise the caller passed
+	 * a stale or invalid slot */
+	assert(slot >= 0 && (size_t)slot < slots.size());
+	Image *img = slots[slot];
+	assert(img != NULL);
+
+	/* decrement user count */
+	img->users--;
+	assert(img->users >= 0);
+
+	/* don't remove immediately, rather do it all together later on. one of
+	 * the reasons for this is that on shader changes we add and remove nodes
+	 * that use them, but we do not want to reload the image all the time. */
+	if(img->users == 0)
+		need_update = true;
+}
 
-			/* don't remove immediately, rather do it all together later on. one of
-			 * the reasons for this is that on shader changes we add and remove nodes
-			 * that use them, but we do not want to reload the image all the time. */
-			if(images[slot]->users == 0)
-				need_update = true;
+void ImageManager::remove_image(const string& filename, void *builtin_data, InterpolationType interpolation)
+{
+	size_t slot;
 
+	for(slot = 0; slot < images.size(); slot++) {
+		if(images[slot] && image_equals(images[slot], filename, builtin_data, interpolation)) {
+			remove_image(slot+tex_image_byte_start);
 			break;
 		}
 	}
@@ -254,17 +291,8 @@ void ImageManager::remove_image(const string& filename, void *builtin_data)
 	if(slot == images.size()) {
 		/* see if it's in a float texture slot */
 		for(slot = 0; slot < float_images.size(); slot++) {
-			if(float_images[slot] && float_images[slot]->filename == filename && float_images[slot]->builtin_data == builtin_data) {
-				/* decrement user count */
-				float_images[slot]->users--;
-				assert(float_images[slot]->users >= 0);
-
-				/* don't remove immediately, rather do it all together later on. one of
-				 * the reasons for this is that on shader changes we add and remove nodes
-				 * that use them, but we do not want to reload the image all the time. */
-				if(float_images[slot]->users == 0)
-					need_update = true;
-
+			if(float_images[slot] && image_equals(float_images[slot], filename, builtin_data, interpolation)) {
+				remove_image(slot);
 				break;
 			}
 		}
@@ -277,7 +305,7 @@ bool ImageManager::file_load_image(Image *img, device_vector<uchar4>& tex_img)
 		return false;
 
 	ImageInput *in = NULL;
-	int width, height, components;
+	int width, height, depth, components;
 
 	if(!img->builtin_data) {
 		/* load image from file through OIIO */
@@ -286,15 +314,20 @@ bool ImageManager::file_load_image(Image *img, device_vector<uchar4>& tex_img)
 		if(!in)
 			return false;
 
-		ImageSpec spec;
+		ImageSpec spec = ImageSpec();
+		ImageSpec config = ImageSpec();
+
+		if(img->use_alpha == false)
+			config.attribute("oiio:UnassociatedAlpha", 1);
 
-		if(!in->open(img->filename, spec)) {
+		if(!in->open(img->filename, spec, config)) {
 			delete in;
 			return false;
 		}
 
 		width = spec.width;
 		height = spec.height;
+		depth = spec.depth;
 		components = spec.nchannels;
 	}
 	else {
@@ -303,7 +336,7 @@ bool ImageManager::file_load_image(Image *img, device_vector<uchar4>& tex_img)
 			return false;
 
 		bool is_float;
-		builtin_image_info_cb(img->filename, img->builtin_data, is_float, width, height, components);
+		builtin_image_info_cb(img->filename, img->builtin_data, is_float, width, height, depth, components);
 	}
 
 	/* we only handle certain number of components */
@@ -317,15 +350,21 @@ bool ImageManager::file_load_image(Image *img, device_vector<uchar4>& tex_img)
 	}
 
 	/* read RGBA pixels */
-	uchar *pixels = (uchar*)tex_img.resize(width, height);
-	int scanlinesize = width*components*sizeof(uchar);
+	uchar *pixels = (uchar*)tex_img.resize(width, height, depth);
 
 	if(in) {
-		in->read_image(TypeDesc::UINT8,
-			(uchar*)pixels + (height-1)*scanlinesize,
-			AutoStride,
-			-scanlinesize,
-			AutoStride);
+		if(depth <= 1) {
+			int scanlinesize = width*components*sizeof(uchar);
+
+			in->read_image(TypeDesc::UINT8,
+				(uchar*)pixels + (height-1)*scanlinesize,
+				AutoStride,
+				-scanlinesize,
+				AutoStride);
+		}
+		else {
+			in->read_image(TypeDesc::UINT8, (uchar*)pixels);
+		}
 
 		in->close();
 		delete in;
@@ -335,7 +374,7 @@ bool ImageManager::file_load_image(Image *img, device_vector<uchar4>& tex_img)
 	}
 
 	if(components == 2) {
-		for(int i = width*height-1; i >= 0; i--) {
+		for(int i = width*height*depth-1; i >= 0; i--) {
 			pixels[i*4+3] = pixels[i*2+1];
 			pixels[i*4+2] = pixels[i*2+0];
 			pixels[i*4+1] = pixels[i*2+0];
@@ -343,7 +382,7 @@ bool ImageManager::file_load_image(Image *img, device_vector<uchar4>& tex_img)
 		}
 	}
 	else if(components == 3) {
-		for(int i = width*height-1; i >= 0; i--) {
+		for(int i = width*height*depth-1; i >= 0; i--) {
 			pixels[i*4+3] = 255;
 			pixels[i*4+2] = pixels[i*3+2];
 			pixels[i*4+1] = pixels[i*3+1];
@@ -351,7 +390,7 @@ bool ImageManager::file_load_image(Image *img, device_vector<uchar4>& tex_img)
 		}
 	}
 	else if(components == 1) {
-		for(int i = width*height-1; i >= 0; i--) {
+		for(int i = width*height*depth-1; i >= 0; i--) {
 			pixels[i*4+3] = 255;
 			pixels[i*4+2] = pixels[i];
 			pixels[i*4+1] = pixels[i];
@@ -359,6 +398,12 @@ bool ImageManager::file_load_image(Image *img, device_vector<uchar4>& tex_img)
 		}
 	}
 
+	if(img->use_alpha == false) {
+		for(int i = width*height*depth-1; i >= 0; i--) {
+			pixels[i*4+3] = 255;
+		}
+	}
+
 	return true;
 }
 
@@ -368,7 +413,7 @@ bool ImageManager::file_load_float_image(Image *img, device_vector<float4>& tex_
 		return false;
 
 	ImageInput *in = NULL;
-	int width, height, components;
+	int width, height, depth, components;
 
 	if(!img->builtin_data) {
 		/* load image from file through OIIO */
@@ -377,9 +422,13 @@ bool ImageManager::file_load_float_image(Image *img, device_vector<float4>& tex_
 		if(!in)
 			return false;
 
-		ImageSpec spec;
+		ImageSpec spec = ImageSpec();
+		ImageSpec config = ImageSpec();
+
+		if(img->use_alpha == false)
+			config.attribute("oiio:UnassociatedAlpha",1);
 
-		if(!in->open(img->filename, spec)) {
+		if(!in->open(img->filename, spec, config)) {
 			delete in;
 			return false;
 		}
@@ -387,6 +436,7 @@ bool ImageManager::file_load_float_image(Image *img, device_vector<float4>& tex_
 		/* we only handle certain number of components */
 		width = spec.width;
 		height = spec.height;
+		depth = spec.depth;
 		components = spec.nchannels;
 	}
 	else {
@@ -395,7 +445,7 @@ bool ImageManager::file_load_float_image(Image *img, device_vector<float4>& tex_
 			return false;
 
 		bool is_float;
-		builtin_image_info_cb(img->filename, img->builtin_data, is_float, width, height, components);
+		builtin_image_info_cb(img->filename, img->builtin_data, is_float, width, height, depth, components);
 	}
 
 	if(!(components >= 1 && components <= 4)) {
@@ -407,15 +457,21 @@ bool ImageManager::file_load_float_image(Image *img, device_vector<float4>& tex_
 	}
 
 	/* read RGBA pixels */
-	float *pixels = (float*)tex_img.resize(width, height);
-	int scanlinesize = width*components*sizeof(float);
+	float *pixels = (float*)tex_img.resize(width, height, depth);
 
 	if(in) {
-		in->read_image(TypeDesc::FLOAT,
-			(uchar*)pixels + (height-1)*scanlinesize,
-			AutoStride,
-			-scanlinesize,
-			AutoStride);
+		if(depth <= 1) {
+			int scanlinesize = width*components*sizeof(float);
+
+			in->read_image(TypeDesc::FLOAT,
+				(uchar*)pixels + (height-1)*scanlinesize,
+				AutoStride,
+				-scanlinesize,
+				AutoStride);
+		}
+		else {
+			in->read_image(TypeDesc::FLOAT, (uchar*)pixels);
+		}
 
 		in->close();
 		delete in;
@@ -425,7 +481,7 @@ bool ImageManager::file_load_float_image(Image *img, device_vector<float4>& tex_
 	}
 
 	if(components == 2) {
-		for(int i = width*height-1; i >= 0; i--) {
+		for(int i = width*height*depth-1; i >= 0; i--) {
 			pixels[i*4+3] = pixels[i*2+1];
 			pixels[i*4+2] = pixels[i*2+0];
 			pixels[i*4+1] = pixels[i*2+0];
@@ -433,7 +489,7 @@ bool ImageManager::file_load_float_image(Image *img, device_vector<float4>& tex_
 		}
 	}
 	else if(components == 3) {
-		for(int i = width*height-1; i >= 0; i--) {
+		for(int i = width*height*depth-1; i >= 0; i--) {
 			pixels[i*4+3] = 1.0f;
 			pixels[i*4+2] = pixels[i*3+2];
 			pixels[i*4+1] = pixels[i*3+1];
@@ -441,7 +497,7 @@ bool ImageManager::file_load_float_image(Image *img, device_vector<float4>& tex_
 		}
 	}
 	else if(components == 1) {
-		for(int i = width*height-1; i >= 0; i--) {
+		for(int i = width*height*depth-1; i >= 0; i--) {
 			pixels[i*4+3] = 1.0f;
 			pixels[i*4+2] = pixels[i];
 			pixels[i*4+1] = pixels[i];
@@ -449,6 +505,12 @@ bool ImageManager::file_load_float_image(Image *img, device_vector<float4>& tex_
 		}
 	}
 
+	if(img->use_alpha == false) {
+		for(int i = width*height*depth-1; i >= 0; i--) {
+			pixels[i*4+3] = 1.0f;
+		}
+	}
+
 	return true;
 }
 
@@ -456,9 +518,7 @@ void ImageManager::device_load_image(Device *device, DeviceScene *dscene, int sl
 {
 	if(progress->get_cancel())
 		return;
-	if(osl_texture_system)
-		return;
-
+	
 	Image *img;
 	bool is_float;
 
@@ -471,6 +531,9 @@ void ImageManager::device_load_image(Device *device, DeviceScene *dscene, int sl
 		is_float = true;
 	}
 
+	if(osl_texture_system && !img->builtin_data)
+		return;
+
 	if(is_float) {
 		string filename = path_filename(float_images[slot]->filename);
 		progress->set_status("Updating Images", "Loading " + filename);
@@ -499,7 +562,7 @@ void ImageManager::device_load_image(Device *device, DeviceScene *dscene, int sl
 
 		if(!pack_images) {
 			thread_scoped_lock device_lock(device_mutex);
-			device->tex_alloc(name.c_str(), tex_img, true, true);
+			device->tex_alloc(name.c_str(), tex_img, img->interpolation, true);
 		}
 	}
 	else {
@@ -530,7 +593,7 @@ void ImageManager::device_load_image(Device *device, DeviceScene *dscene, int sl
 
 		if(!pack_images) {
 			thread_scoped_lock device_lock(device_mutex);
-			device->tex_alloc(name.c_str(), tex_img, true, true);
+			device->tex_alloc(name.c_str(), tex_img, img->interpolation, true);
 		}
 	}
 
@@ -552,7 +615,7 @@ void ImageManager::device_free_image(Device *device, DeviceScene *dscene, int sl
 	}
 
 	if(img) {
-		if(osl_texture_system) {
+		if(osl_texture_system && !img->builtin_data) {
 #ifdef WITH_OSL
 			ustring filename(images[slot]->filename);
 			((OSL::TextureSystem*)osl_texture_system)->invalidate(filename);
@@ -602,7 +665,7 @@ void ImageManager::device_update(Device *device, DeviceScene *dscene, Progress&
 			device_free_image(device, dscene, slot + tex_image_byte_start);
 		}
 		else if(images[slot]->need_load) {
-			if(!osl_texture_system) 
+			if(!osl_texture_system || images[slot]->builtin_data) 
 				pool.push(function_bind(&ImageManager::device_load_image, this, device, dscene, slot + tex_image_byte_start, &progress));
 		}
 	}
@@ -615,7 +678,7 @@ void ImageManager::device_update(Device *device, DeviceScene *dscene, Progress&
 			device_free_image(device, dscene, slot);
 		}
 		else if(float_images[slot]->need_load) {
-			if(!osl_texture_system) 
+			if(!osl_texture_system || float_images[slot]->builtin_data) 
 				pool.push(function_bind(&ImageManager::device_load_image, this, device, dscene, slot, &progress));
 		}
 	}
@@ -653,16 +716,32 @@ void ImageManager::device_pack_images(Device *device, DeviceScene *dscene, Progr
 
 		device_vector<uchar4>& tex_img = dscene->tex_image[slot];
 
-		info[slot] = make_uint4(tex_img.data_width, tex_img.data_height, offset, 1);
+		/* todo: support 3D textures, only CPU for now */
+
+		/* The image options are packed
+		   bit 0 -> periodic
+		   bit 1 + 2 -> interpolation type */
+		uint8_t interpolation = (images[slot]->interpolation << 1) + 1;
+		info[slot] = make_uint4(tex_img.data_width, tex_img.data_height, offset, interpolation);
 
 		memcpy(pixels+offset, (void*)tex_img.data_pointer, tex_img.memory_size());
 		offset += tex_img.size();
 	}
 
-	if(dscene->tex_image_packed.size())
+	if(dscene->tex_image_packed.size()) {
+		if(dscene->tex_image_packed.device_pointer) {
+			thread_scoped_lock device_lock(device_mutex);
+			device->tex_free(dscene->tex_image_packed);
+		}
 		device->tex_alloc("__tex_image_packed", dscene->tex_image_packed);
-	if(dscene->tex_image_packed_info.size())
+	}
+	if(dscene->tex_image_packed_info.size()) {
+		if(dscene->tex_image_packed_info.device_pointer) {
+			thread_scoped_lock device_lock(device_mutex);
+			device->tex_free(dscene->tex_image_packed_info);
+		}
 		device->tex_alloc("__tex_image_packed_info", dscene->tex_image_packed_info);
+	}
 }
 
 void ImageManager::device_free(Device *device, DeviceScene *dscene)
diff --git a/intern/cycles/render/image.h b/intern/cycles/render/image.h
index 187c5fd0f02..561550fe0d2 100644
--- a/intern/cycles/render/image.h
+++ b/intern/cycles/render/image.h
@@ -17,6 +17,7 @@
 #ifndef __IMAGE_H__
 #define __IMAGE_H__
 
+#include "device.h"
 #include "device_memory.h"
 
 #include "util_string.h"
@@ -27,11 +28,16 @@
 
 CCL_NAMESPACE_BEGIN
 
+/* generic */
 #define TEX_NUM_IMAGES			95
 #define TEX_IMAGE_BYTE_START	TEX_NUM_FLOAT_IMAGES
 
+/* extended gpu */
+#define TEX_EXTENDED_NUM_IMAGES_GPU		145
+
+/* extended cpu */
 #define TEX_EXTENDED_NUM_FLOAT_IMAGES	1024
-#define TEX_EXTENDED_NUM_IMAGES			1024
+#define TEX_EXTENDED_NUM_IMAGES_CPU		1024
 #define TEX_EXTENDED_IMAGE_BYTE_START	TEX_EXTENDED_NUM_FLOAT_IMAGES
 
 /* color to use when textures are not found */
@@ -49,8 +55,9 @@ public:
 	ImageManager();
 	~ImageManager();
 
-	int add_image(const string& filename, void *builtin_data, bool animated, bool& is_float, bool& is_linear);
-	void remove_image(const string& filename, void *builtin_data);
+	int add_image(const string& filename, void *builtin_data, bool animated, bool& is_float, bool& is_linear, InterpolationType interpolation, bool use_alpha);
+	void remove_image(int slot);
+	void remove_image(const string& filename, void *builtin_data, InterpolationType interpolation);
 	bool is_float_image(const string& filename, void *builtin_data, bool& is_linear);
 
 	void device_update(Device *device, DeviceScene *dscene, Progress& progress);
@@ -58,30 +65,34 @@ public:
 
 	void set_osl_texture_system(void *texture_system);
 	void set_pack_images(bool pack_images_);
-	void set_extended_image_limits(void);
+	void set_extended_image_limits(const DeviceInfo& info);
 	bool set_animation_frame_update(int frame);
 
 	bool need_update;
 
-	boost::function<void(const string &filename, void *data, bool &is_float, int &width, int &height, int &channels)> builtin_image_info_cb;
+	boost::function<void(const string &filename, void *data, bool &is_float, int &width, int &height, int &depth, int &channels)> builtin_image_info_cb;
 	boost::function<bool(const string &filename, void *data, unsigned char *pixels)> builtin_image_pixels_cb;
 	boost::function<bool(const string &filename, void *data, float *pixels)> builtin_image_float_pixels_cb;
-private:
-	int tex_num_images;
-	int tex_num_float_images;
-	int tex_image_byte_start;
-	thread_mutex device_mutex;
-	int animation_frame;
 
 	struct Image {
 		string filename;
 		void *builtin_data;
 
+		bool use_alpha; /* when false the file loaders ask OIIO for unassociated alpha and force the alpha channel to opaque */
 		bool need_load;
 		bool animated;
+		InterpolationType interpolation; /* forwarded to device->tex_alloc() and compared by image_equals() when looking up existing images */
+
 		int users;
 	};
 
+private:
+	int tex_num_images;
+	int tex_num_float_images;
+	int tex_image_byte_start;
+	thread_mutex device_mutex;
+	int animation_frame;
+
 	vector<Image*> images;
 	vector<Image*> float_images;
 	void *osl_texture_system;
diff --git a/intern/cycles/render/integrator.cpp b/intern/cycles/render/integrator.cpp
index f48e04f31e1..59a0de07e5a 100644
--- a/intern/cycles/render/integrator.cpp
+++ b/intern/cycles/render/integrator.cpp
@@ -34,15 +34,14 @@ Integrator::Integrator()
 	max_glossy_bounce = max_bounce;
 	max_transmission_bounce = max_bounce;
 	max_volume_bounce = max_bounce;
-	probalistic_termination = true;
 
 	transparent_min_bounce = min_bounce;
 	transparent_max_bounce = max_bounce;
-	transparent_probalistic = true;
 	transparent_shadows = false;
 
+	volume_homogeneous_sampling = 0;
 	volume_max_steps = 1024;
-	volume_step_size = 0.1;
+	volume_step_size = 0.1f;
 
 	no_caustics = false;
 	filter_glossy = 0.0f;
@@ -82,10 +81,7 @@ void Integrator::device_update(Device *device, DeviceScene *dscene, Scene *scene
 
 	/* integrator parameters */
 	kintegrator->max_bounce = max_bounce + 1;
-	if(probalistic_termination)
-		kintegrator->min_bounce = min_bounce + 1;
-	else
-		kintegrator->min_bounce = kintegrator->max_bounce;
+	kintegrator->min_bounce = min_bounce + 1;
 
 	kintegrator->max_diffuse_bounce = max_diffuse_bounce + 1;
 	kintegrator->max_glossy_bounce = max_glossy_bounce + 1;
@@ -97,13 +93,11 @@ void Integrator::device_update(Device *device, DeviceScene *dscene, Scene *scene
 		kintegrator->max_volume_bounce = 1;
 
 	kintegrator->transparent_max_bounce = transparent_max_bounce + 1;
-	if(transparent_probalistic)
-		kintegrator->transparent_min_bounce = transparent_min_bounce + 1;
-	else
-		kintegrator->transparent_min_bounce = kintegrator->transparent_max_bounce;
+	kintegrator->transparent_min_bounce = transparent_min_bounce + 1;
 
 	kintegrator->transparent_shadows = transparent_shadows;
 
+	kintegrator->volume_homogeneous_sampling = volume_homogeneous_sampling;
 	kintegrator->volume_max_steps = volume_max_steps;
 	kintegrator->volume_step_size = volume_step_size;
 
@@ -120,7 +114,6 @@ void Integrator::device_update(Device *device, DeviceScene *dscene, Scene *scene
 	kintegrator->sample_clamp_indirect = (sample_clamp_indirect == 0.0f)? FLT_MAX: sample_clamp_indirect*3.0f;
 
 	kintegrator->branched = (method == BRANCHED_PATH);
-	kintegrator->aa_samples = aa_samples;
 	kintegrator->diffuse_samples = diffuse_samples;
 	kintegrator->glossy_samples = glossy_samples;
 	kintegrator->transmission_samples = transmission_samples;
@@ -128,8 +121,11 @@ void Integrator::device_update(Device *device, DeviceScene *dscene, Scene *scene
 	kintegrator->mesh_light_samples = mesh_light_samples;
 	kintegrator->subsurface_samples = subsurface_samples;
 	kintegrator->volume_samples = volume_samples;
+	kintegrator->sample_all_lights_direct = sample_all_lights_direct;
+	kintegrator->sample_all_lights_indirect = sample_all_lights_indirect;
 
 	kintegrator->sampling_pattern = sampling_pattern;
+	kintegrator->aa_samples = aa_samples;
 
 	/* sobol directions table */
 	int max_samples = 1;
@@ -171,11 +167,10 @@ bool Integrator::modified(const Integrator& integrator)
 		max_glossy_bounce == integrator.max_glossy_bounce &&
 		max_transmission_bounce == integrator.max_transmission_bounce &&
 		max_volume_bounce == integrator.max_volume_bounce &&
-		probalistic_termination == integrator.probalistic_termination &&
 		transparent_min_bounce == integrator.transparent_min_bounce &&
 		transparent_max_bounce == integrator.transparent_max_bounce &&
-		transparent_probalistic == integrator.transparent_probalistic &&
 		transparent_shadows == integrator.transparent_shadows &&
+		volume_homogeneous_sampling == integrator.volume_homogeneous_sampling &&
 		volume_max_steps == integrator.volume_max_steps &&
 		volume_step_size == integrator.volume_step_size &&
 		no_caustics == integrator.no_caustics &&
@@ -194,7 +189,9 @@ bool Integrator::modified(const Integrator& integrator)
 		subsurface_samples == integrator.subsurface_samples &&
 		volume_samples == integrator.volume_samples &&
 		motion_blur == integrator.motion_blur &&
-		sampling_pattern == integrator.sampling_pattern);
+		sampling_pattern == integrator.sampling_pattern &&
+		sample_all_lights_direct == integrator.sample_all_lights_direct &&
+		sample_all_lights_indirect == integrator.sample_all_lights_indirect);
 }
 
 void Integrator::tag_update(Scene *scene)
diff --git a/intern/cycles/render/integrator.h b/intern/cycles/render/integrator.h
index 573b258af60..380c1a65722 100644
--- a/intern/cycles/render/integrator.h
+++ b/intern/cycles/render/integrator.h
@@ -34,13 +34,12 @@ public:
 	int max_glossy_bounce;
 	int max_transmission_bounce;
 	int max_volume_bounce;
-	bool probalistic_termination;
 
 	int transparent_min_bounce;
 	int transparent_max_bounce;
-	bool transparent_probalistic;
 	bool transparent_shadows;
 
+	int volume_homogeneous_sampling;
 	int volume_max_steps;
 	float volume_step_size;
 
@@ -62,6 +61,8 @@ public:
 	int mesh_light_samples;
 	int subsurface_samples;
 	int volume_samples;
+	bool sample_all_lights_direct;
+	bool sample_all_lights_indirect;
 
 	enum Method {
 		BRANCHED_PATH = 0,
diff --git a/intern/cycles/render/light.cpp b/intern/cycles/render/light.cpp
index bab4218aae9..7bdb1fbf8af 100644
--- a/intern/cycles/render/light.cpp
+++ b/intern/cycles/render/light.cpp
@@ -29,7 +29,7 @@
 
 CCL_NAMESPACE_BEGIN
 
-static void shade_background_pixels(Device *device, DeviceScene *dscene, int res, vector<float3>& pixels)
+static void shade_background_pixels(Device *device, DeviceScene *dscene, int res, vector<float3>& pixels, Progress& progress)
 {
 	/* create input */
 	int width = res;
@@ -66,6 +66,7 @@ static void shade_background_pixels(Device *device, DeviceScene *dscene, int res
 	main_task.shader_eval_type = SHADER_EVAL_BACKGROUND;
 	main_task.shader_x = 0;
 	main_task.shader_w = width*height;
+	main_task.get_cancel = function_bind(&Progress::get_cancel, &progress);
 
 	/* disabled splitting for now, there's an issue with multi-GPU mem_copy_from */
 	list<DeviceTask> split_tasks;
@@ -149,7 +150,6 @@ void LightManager::device_update_distribution(Device *device, DeviceScene *dscen
 	size_t num_lights = scene->lights.size();
 	size_t num_background_lights = 0;
 	size_t num_triangles = 0;
-	size_t num_curve_segments = 0;
 
 	foreach(Object *object, scene->objects) {
 		Mesh *mesh = object->mesh;
@@ -159,6 +159,10 @@ void LightManager::device_update_distribution(Device *device, DeviceScene *dscen
 		if(!(object->visibility & (PATH_RAY_DIFFUSE|PATH_RAY_GLOSSY|PATH_RAY_TRANSMIT)))
 			continue;
 
+		/* skip motion blurred deforming meshes, not supported yet */
+		if(mesh->has_motion_blur())
+			continue;
+
 		/* skip if we have no emission shaders */
 		foreach(uint sindex, mesh->used_shaders) {
 			Shader *shader = scene->shaders[sindex];
@@ -177,20 +181,10 @@ void LightManager::device_update_distribution(Device *device, DeviceScene *dscen
 				if(shader->use_mis && shader->has_surface_emission)
 					num_triangles++;
 			}
-
-			/* disabled for curves */
-#if 0
-			foreach(Mesh::Curve& curve, mesh->curves) {
-				Shader *shader = scene->shaders[curve.shader];
-
-				if(shader->use_mis && shader->has_surface_emission)
-					num_curve_segments += curve.num_segments();
-#endif
 		}
 	}
 
-	size_t num_distribution = num_triangles + num_curve_segments;
-	num_distribution += num_lights;
+	size_t num_distribution = num_triangles + num_lights;
 
 	/* emission area */
 	float4 *distribution = dscene->light_distribution.resize(num_distribution + 1);
@@ -210,6 +204,10 @@ void LightManager::device_update_distribution(Device *device, DeviceScene *dscen
 			continue;
 		}
 
+		/* skip motion blurred deforming meshes, not supported yet */
+		if(mesh->has_motion_blur())
+			continue;
+
 		/* skip if we have no emission shaders */
 		foreach(uint sindex, mesh->used_shaders) {
 			Shader *shader = scene->shaders[sindex];
@@ -225,21 +223,21 @@ void LightManager::device_update_distribution(Device *device, DeviceScene *dscen
 			bool transform_applied = mesh->transform_applied;
 			Transform tfm = object->tfm;
 			int object_id = j;
-			int shader_id = SHADER_MASK;
+			int shader_flag = 0;
 
 			if(transform_applied)
 				object_id = ~object_id;
 
 			if(!(object->visibility & PATH_RAY_DIFFUSE)) {
-				shader_id |= SHADER_EXCLUDE_DIFFUSE;
+				shader_flag |= SHADER_EXCLUDE_DIFFUSE;
 				use_light_visibility = true;
 			}
 			if(!(object->visibility & PATH_RAY_GLOSSY)) {
-				shader_id |= SHADER_EXCLUDE_GLOSSY;
+				shader_flag |= SHADER_EXCLUDE_GLOSSY;
 				use_light_visibility = true;
 			}
 			if(!(object->visibility & PATH_RAY_TRANSMIT)) {
-				shader_id |= SHADER_EXCLUDE_TRANSMIT;
+				shader_flag |= SHADER_EXCLUDE_TRANSMIT;
 				use_light_visibility = true;
 			}
 
@@ -249,7 +247,7 @@ void LightManager::device_update_distribution(Device *device, DeviceScene *dscen
 				if(shader->use_mis && shader->has_surface_emission) {
 					distribution[offset].x = totarea;
 					distribution[offset].y = __int_as_float(i + mesh->tri_offset);
-					distribution[offset].z = __int_as_float(shader_id);
+					distribution[offset].z = __int_as_float(shader_flag);
 					distribution[offset].w = __int_as_float(object_id);
 					offset++;
 
@@ -267,40 +265,6 @@ void LightManager::device_update_distribution(Device *device, DeviceScene *dscen
 					totarea += triangle_area(p1, p2, p3);
 				}
 			}
-
-			/* sample as light disabled for strands */
-#if 0
-			size_t i = 0;
-
-			foreach(Mesh::Curve& curve, mesh->curves) {
-				Shader *shader = scene->shaders[curve.shader];
-				int first_key = curve.first_key;
-
-				if(shader->use_mis && shader->has_surface_emission) {
-					for(int j = 0; j < curve.num_segments(); j++) {
-						distribution[offset].x = totarea;
-						distribution[offset].y = __int_as_float(i + mesh->curve_offset); // XXX fix kernel code
-						distribution[offset].z = __int_as_float(j) & SHADER_MASK;
-						distribution[offset].w = __int_as_float(object_id);
-						offset++;
-				
-						float3 p1 = mesh->curve_keys[first_key + j].loc;
-						float r1 = mesh->curve_keys[first_key + j].radius;
-						float3 p2 = mesh->curve_keys[first_key + j + 1].loc;
-						float r2 = mesh->curve_keys[first_key + j + 1].radius;
-				
-						if(!transform_applied) {
-							p1 = transform_point(&tfm, p1);
-							p2 = transform_point(&tfm, p2);
-						}
-				
-						totarea += M_PI_F * (r1 + r2) * len(p1 - p2);
-					}
-				}
-
-				i++;
-			}
-#endif
 		}
 
 		if(progress.get_cancel()) return;
@@ -432,7 +396,7 @@ void LightManager::device_update_background(Device *device, DeviceScene *dscene,
 	assert(res > 0);
 
 	vector<float3> pixels;
-	shade_background_pixels(device, dscene, res, pixels);
+	shade_background_pixels(device, dscene, res, pixels, progress);
 
 	if(progress.get_cancel())
 		return;
diff --git a/intern/cycles/render/mesh.cpp b/intern/cycles/render/mesh.cpp
index 93f24886dc9..9c5ddd55010 100644
--- a/intern/cycles/render/mesh.cpp
+++ b/intern/cycles/render/mesh.cpp
@@ -18,6 +18,7 @@
 #include "bvh_build.h"
 
 #include "camera.h"
+#include "curves.h"
 #include "device.h"
 #include "shader.h"
 #include "light.h"
@@ -34,6 +35,39 @@
 
 CCL_NAMESPACE_BEGIN
 
+/* Triangle */
+
+void Mesh::Triangle::bounds_grow(const float3 *verts, BoundBox& bounds) const
+{
+	bounds.grow(verts[v[0]]);
+	bounds.grow(verts[v[1]]);
+	bounds.grow(verts[v[2]]);
+}
+
+/* Curve */
+
+void Mesh::Curve::bounds_grow(const int k, const float4 *curve_keys, BoundBox& bounds) const
+{
+	float3 P[4];
+
+	P[0] = float4_to_float3(curve_keys[max(first_key + k - 1,first_key)]);
+	P[1] = float4_to_float3(curve_keys[first_key + k]);
+	P[2] = float4_to_float3(curve_keys[first_key + k + 1]);
+	P[3] = float4_to_float3(curve_keys[min(first_key + k + 2, first_key + num_keys - 1)]);
+
+	float3 lower;
+	float3 upper;
+
+	curvebounds(&lower.x, &upper.x, P, 0);
+	curvebounds(&lower.y, &upper.y, P, 1);
+	curvebounds(&lower.z, &upper.z, P, 2);
+
+	float mr = max(curve_keys[first_key + k].w, curve_keys[first_key + k + 1].w);
+
+	bounds.grow(lower, mr);
+	bounds.grow(upper, mr);
+}
+
 /* Mesh */
 
 Mesh::Mesh()
@@ -46,6 +80,9 @@ Mesh::Mesh()
 	displacement_method = DISPLACE_BUMP;
 	bounds = BoundBox::empty;
 
+	motion_steps = 3;
+	use_motion_blur = false;
+
 	bvh = NULL;
 
 	tri_offset = 0;
@@ -97,6 +134,22 @@ void Mesh::clear()
 	transform_normal = transform_identity();
 }
 
+int Mesh::split_vertex(int vertex)
+{
+	/* copy vertex location and vertex attributes */
+	verts.push_back(verts[vertex]);
+
+	foreach(Attribute& attr, attributes.attributes) {
+		if(attr.element == ATTR_ELEMENT_VERTEX) {
+			vector<char> tmp(attr.data_sizeof());
+			memcpy(&tmp[0], attr.data() + tmp.size()*vertex, tmp.size());
+			attr.add(&tmp[0]);
+		}
+	}
+
+	return verts.size() - 1;
+}
+
 void Mesh::set_triangle(int i, int v0, int v1, int v2, int shader_, bool smooth_)
 {
 	Triangle tri;
@@ -123,9 +176,8 @@ void Mesh::add_triangle(int v0, int v1, int v2, int shader_, bool smooth_)
 
 void Mesh::add_curve_key(float3 co, float radius)
 {
-	CurveKey key;
-	key.co = co;
-	key.radius = radius;
+	float4 key = float3_to_float4(co);
+	key.w = radius;
 
 	curve_keys.push_back(key);
 }
@@ -151,7 +203,25 @@ void Mesh::compute_bounds()
 			bnds.grow(verts[i]);
 
 		for(size_t i = 0; i < curve_keys_size; i++)
-			bnds.grow(curve_keys[i].co, curve_keys[i].radius);
+			bnds.grow(float4_to_float3(curve_keys[i]), curve_keys[i].w);
+
+		Attribute *attr = attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
+		if (use_motion_blur && attr) {
+			size_t steps_size = verts.size() * (motion_steps - 1);
+			float3 *vert_steps = attr->data_float3();
+	
+			for (size_t i = 0; i < steps_size; i++)
+				bnds.grow(vert_steps[i]);
+		}
+
+		Attribute *curve_attr = curve_attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
+		if(use_motion_blur && curve_attr) {
+			size_t steps_size = curve_keys.size() * (motion_steps - 1);
+			float3 *key_steps = curve_attr->data_float3();
+	
+			for (size_t i = 0; i < steps_size; i++)
+				bnds.grow(key_steps[i]);
+		}
 
 		if(!bnds.valid()) {
 			bnds = BoundBox::empty;
@@ -161,7 +231,23 @@ void Mesh::compute_bounds()
 				bnds.grow_safe(verts[i]);
 
 			for(size_t i = 0; i < curve_keys_size; i++)
-				bnds.grow_safe(curve_keys[i].co, curve_keys[i].radius);
+				bnds.grow_safe(float4_to_float3(curve_keys[i]), curve_keys[i].w);
+			
+			if (use_motion_blur && attr) {
+				size_t steps_size = verts.size() * (motion_steps - 1);
+				float3 *vert_steps = attr->data_float3();
+		
+				for (size_t i = 0; i < steps_size; i++)
+					bnds.grow_safe(vert_steps[i]);
+			}
+
+			if (use_motion_blur && curve_attr) {
+				size_t steps_size = curve_keys.size() * (motion_steps - 1);
+				float3 *key_steps = curve_attr->data_float3();
+		
+				for (size_t i = 0; i < steps_size; i++)
+					bnds.grow_safe(key_steps[i]);
+			}
 		}
 	}
 
@@ -173,6 +259,21 @@ void Mesh::compute_bounds()
 	bounds = bnds;
 }
 
+static float3 compute_face_normal(const Mesh::Triangle& t, float3 *verts)
+{
+	float3 v0 = verts[t.v[0]];
+	float3 v1 = verts[t.v[1]];
+	float3 v2 = verts[t.v[2]];
+
+	float3 norm = cross(v1 - v0, v2 - v0);
+	float normlen = len(norm);
+
+	if(normlen == 0.0f)
+		return make_float3(0.0f, 0.0f, 0.0f);
+
+	return norm / normlen;
+}
+
 void Mesh::add_face_normals()
 {
 	/* don't compute if already there */
@@ -192,17 +293,7 @@ void Mesh::add_face_normals()
 		Triangle *triangles_ptr = &triangles[0];
 
 		for(size_t i = 0; i < triangles_size; i++) {
-			Triangle t = triangles_ptr[i];
-			float3 v0 = verts_ptr[t.v[0]];
-			float3 v1 = verts_ptr[t.v[1]];
-			float3 v2 = verts_ptr[t.v[2]];
-
-			float3 norm = cross(v1 - v0, v2 - v0);
-			float normlen = len(norm);
-			if(normlen == 0.0f)
-				fN[i] = make_float3(0.0f, 0.0f, 0.0f);
-			else
-				fN[i] = norm / normlen;
+			fN[i] = compute_face_normal(triangles_ptr[i], verts_ptr);
 
 			if(flip)
 				fN[i] = -fN[i];
@@ -220,36 +311,69 @@ void Mesh::add_face_normals()
 
 void Mesh::add_vertex_normals()
 {
-	/* don't compute if already there */
-	if(attributes.find(ATTR_STD_VERTEX_NORMAL))
-		return;
-	
-	/* get attributes */
-	Attribute *attr_fN = attributes.find(ATTR_STD_FACE_NORMAL);
-	Attribute *attr_vN = attributes.add(ATTR_STD_VERTEX_NORMAL);
+	bool flip = transform_negative_scaled;
+	size_t verts_size = verts.size();
+	size_t triangles_size = triangles.size();
 
-	float3 *fN = attr_fN->data_float3();
-	float3 *vN = attr_vN->data_float3();
+	/* static vertex normals */
+	if(!attributes.find(ATTR_STD_VERTEX_NORMAL)) {
+		/* get attributes */
+		Attribute *attr_fN = attributes.find(ATTR_STD_FACE_NORMAL);
+		Attribute *attr_vN = attributes.add(ATTR_STD_VERTEX_NORMAL);
 
-	/* compute vertex normals */
-	memset(vN, 0, verts.size()*sizeof(float3));
+		float3 *fN = attr_fN->data_float3();
+		float3 *vN = attr_vN->data_float3();
 
-	size_t verts_size = verts.size();
-	size_t triangles_size = triangles.size();
-	bool flip = transform_negative_scaled;
+		/* compute vertex normals */
+		memset(vN, 0, verts.size()*sizeof(float3));
 
-	if(triangles_size) {
-		Triangle *triangles_ptr = &triangles[0];
+		if(triangles_size) {
+			Triangle *triangles_ptr = &triangles[0];
 
-		for(size_t i = 0; i < triangles_size; i++)
-			for(size_t j = 0; j < 3; j++)
-				vN[triangles_ptr[i].v[j]] += fN[i];
+			for(size_t i = 0; i < triangles_size; i++)
+				for(size_t j = 0; j < 3; j++)
+					vN[triangles_ptr[i].v[j]] += fN[i];
+		}
+
+		for(size_t i = 0; i < verts_size; i++) {
+			vN[i] = normalize(vN[i]);
+			if(flip)
+				vN[i] = -vN[i];
+		}
 	}
 
-	for(size_t i = 0; i < verts_size; i++) {
-		vN[i] = normalize(vN[i]);
-		if(flip)
-			vN[i] = -vN[i];
+	/* motion vertex normals */
+	Attribute *attr_mP = attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
+	Attribute *attr_mN = attributes.find(ATTR_STD_MOTION_VERTEX_NORMAL);
+
+	if(has_motion_blur() && attr_mP && !attr_mN) {
+		/* create attribute */
+		attr_mN = attributes.add(ATTR_STD_MOTION_VERTEX_NORMAL);
+
+		for(int step = 0; step < motion_steps - 1; step++) {
+			float3 *mP = attr_mP->data_float3() + step*verts.size();
+			float3 *mN = attr_mN->data_float3() + step*verts.size();
+
+			/* compute */
+			memset(mN, 0, verts.size()*sizeof(float3));
+
+			if(triangles_size) {
+				Triangle *triangles_ptr = &triangles[0];
+
+				for(size_t i = 0; i < triangles_size; i++) {
+					for(size_t j = 0; j < 3; j++) {
+						float3 fN = compute_face_normal(triangles_ptr[i], mP);
+						mN[triangles_ptr[i].v[j]] += fN;
+					}
+				}
+			}
+
+			for(size_t i = 0; i < verts_size; i++) {
+				mN[i] = normalize(mN[i]);
+				if(flip)
+					mN[i] = -mN[i];
+			}
+		}
 	}
 }
 
@@ -335,18 +459,14 @@ void Mesh::pack_verts(float4 *tri_verts, float4 *tri_vindex, size_t vert_offset)
 void Mesh::pack_curves(Scene *scene, float4 *curve_key_co, float4 *curve_data, size_t curvekey_offset)
 {
 	size_t curve_keys_size = curve_keys.size();
-	CurveKey *keys_ptr = NULL;
+	float4 *keys_ptr = NULL;
 
 	/* pack curve keys */
 	if(curve_keys_size) {
 		keys_ptr = &curve_keys[0];
 
-		for(size_t i = 0; i < curve_keys_size; i++) {
-			float3 p = keys_ptr[i].co;
-			float radius = keys_ptr[i].radius;
-
-			curve_key_co[i] = make_float4(p.x, p.y, p.z, radius);
-		}
+		for(size_t i = 0; i < curve_keys_size; i++)
+			curve_key_co[i] = keys_ptr[i];
 	}
 
 	/* pack curve segments */
@@ -430,6 +550,13 @@ void Mesh::tag_update(Scene *scene, bool rebuild)
 	scene->object_manager->need_update = true;
 }
 
+bool Mesh::has_motion_blur() const
+{
+	return (use_motion_blur &&
+	        (attributes.find(ATTR_STD_MOTION_VERTEX_POSITION) ||
+	         curve_attributes.find(ATTR_STD_MOTION_VERTEX_POSITION)));
+}
+
 /* Mesh Manager */
 
 MeshManager::MeshManager()
@@ -641,10 +768,16 @@ static void update_attribute_element_offset(Mesh *mesh, vector<float>& attr_floa
 		size_t size = mattr->element_size(
 			mesh->verts.size(),
 			mesh->triangles.size(),
+			mesh->motion_steps,
 			mesh->curves.size(),
 			mesh->curve_keys.size());
 
-		if(mattr->type == TypeDesc::TypeFloat) {
+		if(mattr->element == ATTR_ELEMENT_VOXEL) {
+			/* store slot in offset value */
+			VoxelAttribute *voxel_data = mattr->data_voxel();
+			offset = voxel_data->slot;
+		}
+		else if(mattr->type == TypeDesc::TypeFloat) {
 			float *data = mattr->data_float();
 			offset = attr_float.size();
 
@@ -663,19 +796,21 @@ static void update_attribute_element_offset(Mesh *mesh, vector<float>& attr_floa
 				attr_float3[offset+k] = (&tfm->x)[k];
 		}
 		else {
-			float3 *data = mattr->data_float3();
+			float4 *data = mattr->data_float4();
 			offset = attr_float3.size();
 
 			attr_float3.resize(attr_float3.size() + size);
 
 			for(size_t k = 0; k < size; k++)
-				attr_float3[offset+k] = float3_to_float4(data[k]);
+				attr_float3[offset+k] = data[k];
 		}
 
 		/* mesh vertex/curve index is global, not per object, so we sneak
 		 * a correction for that in here */
 		if(element == ATTR_ELEMENT_VERTEX)
 			offset -= mesh->vert_offset;
+		else if(element == ATTR_ELEMENT_VERTEX_MOTION)
+			offset -= mesh->vert_offset;
 		else if(element == ATTR_ELEMENT_FACE)
 			offset -= mesh->tri_offset;
 		else if(element == ATTR_ELEMENT_CORNER)
@@ -684,6 +819,8 @@ static void update_attribute_element_offset(Mesh *mesh, vector<float>& attr_floa
 			offset -= mesh->curve_offset;
 		else if(element == ATTR_ELEMENT_CURVE_KEY)
 			offset -= mesh->curvekey_offset;
+		else if(element == ATTR_ELEMENT_CURVE_KEY_MOTION)
+			offset -= mesh->curvekey_offset;
 	}
 	else {
 		/* attribute not found */
@@ -750,8 +887,8 @@ void MeshManager::device_update_attributes(Device *device, DeviceScene *dscene,
 	/* create attribute lookup maps */
 	if(scene->shader_manager->use_osl())
 		update_osl_attributes(device, scene, mesh_attributes);
-	else
-		update_svm_attributes(device, dscene, scene, mesh_attributes);
+
+	update_svm_attributes(device, dscene, scene, mesh_attributes);
 
 	if(progress.get_cancel()) return;
 
@@ -866,9 +1003,9 @@ void MeshManager::device_update_bvh(Device *device, DeviceScene *dscene, Scene *
 		dscene->tri_woop.reference(&pack.tri_woop[0], pack.tri_woop.size());
 		device->tex_alloc("__tri_woop", dscene->tri_woop);
 	}
-	if(pack.prim_segment.size()) {
-		dscene->prim_segment.reference((uint*)&pack.prim_segment[0], pack.prim_segment.size());
-		device->tex_alloc("__prim_segment", dscene->prim_segment);
+	if(pack.prim_type.size()) {
+		dscene->prim_type.reference((uint*)&pack.prim_type[0], pack.prim_type.size());
+		device->tex_alloc("__prim_type", dscene->prim_type);
 	}
 	if(pack.prim_visibility.size()) {
 		dscene->prim_visibility.reference((uint*)&pack.prim_visibility[0], pack.prim_visibility.size());
@@ -956,7 +1093,6 @@ void MeshManager::device_update(Device *device, DeviceScene *dscene, Scene *scen
 	foreach(Shader *shader, scene->shaders)
 		shader->need_update_attributes = false;
 
-	float shuttertime = scene->camera->shuttertime;
 #ifdef __OBJECT_MOTION__
 	Scene::MotionType need_motion = scene->need_motion(device->info.advanced_shading);
 	bool motion_blur = need_motion == Scene::MOTION_BLUR;
@@ -965,7 +1101,7 @@ void MeshManager::device_update(Device *device, DeviceScene *dscene, Scene *scen
 #endif
 
 	foreach(Object *object, scene->objects)
-		object->compute_bounds(motion_blur, shuttertime);
+		object->compute_bounds(motion_blur);
 
 	if(progress.get_cancel()) return;
 
@@ -979,7 +1115,7 @@ void MeshManager::device_free(Device *device, DeviceScene *dscene)
 	device->tex_free(dscene->bvh_nodes);
 	device->tex_free(dscene->object_node);
 	device->tex_free(dscene->tri_woop);
-	device->tex_free(dscene->prim_segment);
+	device->tex_free(dscene->prim_type);
 	device->tex_free(dscene->prim_visibility);
 	device->tex_free(dscene->prim_index);
 	device->tex_free(dscene->prim_object);
@@ -996,7 +1132,7 @@ void MeshManager::device_free(Device *device, DeviceScene *dscene)
 	dscene->bvh_nodes.clear();
 	dscene->object_node.clear();
 	dscene->tri_woop.clear();
-	dscene->prim_segment.clear();
+	dscene->prim_type.clear();
 	dscene->prim_visibility.clear();
 	dscene->prim_index.clear();
 	dscene->prim_object.clear();
diff --git a/intern/cycles/render/mesh.h b/intern/cycles/render/mesh.h
index 281a8f0645e..247e3dd555e 100644
--- a/intern/cycles/render/mesh.h
+++ b/intern/cycles/render/mesh.h
@@ -46,6 +46,8 @@ public:
 	/* Mesh Triangle */
 	struct Triangle {
 		int v[3];
+
+		void bounds_grow(const float3 *verts, BoundBox& bounds) const;
 	};
 
 	/* Mesh Curve */
@@ -55,11 +57,8 @@ public:
 		uint shader;
 
 		int num_segments() { return num_keys - 1; }
-	};
 
-	struct CurveKey {
-		float3 co;
-		float radius;
+		void bounds_grow(const int k, const float4 *curve_keys, BoundBox& bounds) const;
 	};
 
 	/* Displacement */
@@ -77,7 +76,7 @@ public:
 	vector<uint> shader;
 	vector<bool> smooth;
 
-	vector<CurveKey> curve_keys;
+	vector<float4> curve_keys; /* co + radius */
 	vector<Curve> curves;
 
 	vector<uint> used_shaders;
@@ -90,6 +89,9 @@ public:
 	Transform transform_normal;
 	DisplacementMethod displacement_method;
 
+	uint motion_steps;
+	bool use_motion_blur;
+
 	/* Update Flags */
 	bool need_update;
 	bool need_update_rebuild;
@@ -112,6 +114,7 @@ public:
 	void add_triangle(int v0, int v1, int v2, int shader, bool smooth);
 	void add_curve_key(float3 loc, float radius);
 	void add_curve(int first_key, int num_keys, int shader);
+	int split_vertex(int vertex);
 
 	void compute_bounds();
 	void add_face_normals();
@@ -126,6 +129,8 @@ public:
 	bool need_attribute(Scene *scene, ustring name);
 
 	void tag_update(Scene *scene, bool rebuild);
+
+	bool has_motion_blur() const;
 };
 
 /* Mesh Manager */
diff --git a/intern/cycles/render/mesh_displace.cpp b/intern/cycles/render/mesh_displace.cpp
index 2fd8a978511..661fd9c66c1 100644
--- a/intern/cycles/render/mesh_displace.cpp
+++ b/intern/cycles/render/mesh_displace.cpp
@@ -44,7 +44,7 @@ bool MeshManager::displace(Device *device, DeviceScene *dscene, Scene *scene, Me
 	progress.set_status("Updating Mesh", msg);
 
 	/* find object index. todo: is arbitrary */
-	size_t object_index = ~0;
+	size_t object_index = OBJECT_NONE;
 
 	for(size_t i = 0; i < scene->objects.size(); i++) {
 		if(scene->objects[i]->mesh == mesh) {
@@ -119,17 +119,21 @@ bool MeshManager::displace(Device *device, DeviceScene *dscene, Scene *scene, Me
 	task.shader_eval_type = SHADER_EVAL_DISPLACE;
 	task.shader_x = 0;
 	task.shader_w = d_output.size();
+	task.get_cancel = function_bind(&Progress::get_cancel, &progress);
 
 	device->task_add(task);
 	device->task_wait();
 
+	if(progress.get_cancel()) {
+		device->mem_free(d_input);
+		device->mem_free(d_output);
+		return false;
+	}
+
 	device->mem_copy_from(d_output, 0, 1, d_output.size(), sizeof(float4));
 	device->mem_free(d_input);
 	device->mem_free(d_output);
 
-	if(progress.get_cancel())
-		return false;
-
 	/* read result */
 	done.clear();
 	done.resize(mesh->verts.size(), false);
diff --git a/intern/cycles/render/nodes.cpp b/intern/cycles/render/nodes.cpp
index af6fca29ab0..a53e0b39435 100644
--- a/intern/cycles/render/nodes.cpp
+++ b/intern/cycles/render/nodes.cpp
@@ -189,10 +189,12 @@ ImageTextureNode::ImageTextureNode()
 	slot = -1;
 	is_float = -1;
 	is_linear = false;
+	use_alpha = true;
 	filename = "";
 	builtin_data = NULL;
 	color_space = ustring("Color");
 	projection = ustring("Flat");
+	interpolation = INTERPOLATION_LINEAR;
 	projection_blend = 0.0f;
 	animated = false;
 
@@ -204,7 +206,7 @@ ImageTextureNode::ImageTextureNode()
 ImageTextureNode::~ImageTextureNode()
 {
 	if(image_manager)
-		image_manager->remove_image(filename, builtin_data);
+		image_manager->remove_image(filename, builtin_data, interpolation);
 }
 
 ShaderNode *ImageTextureNode::clone() const
@@ -241,7 +243,7 @@ void ImageTextureNode::compile(SVMCompiler& compiler)
 	image_manager = compiler.image_manager;
 	if(is_float == -1) {
 		bool is_float_bool;
-		slot = image_manager->add_image(filename, builtin_data, animated, is_float_bool, is_linear);
+		slot = image_manager->add_image(filename, builtin_data, animated, is_float_bool, is_linear, interpolation, use_alpha);
 		is_float = (int)is_float_bool;
 	}
 
@@ -315,6 +317,22 @@ void ImageTextureNode::compile(OSLCompiler& compiler)
 	compiler.parameter("projection_blend", projection_blend);
 	compiler.parameter("is_float", is_float);
 	compiler.parameter("use_alpha", !alpha_out->links.empty());
+
+	switch (interpolation) {
+		case INTERPOLATION_CLOSEST:
+			compiler.parameter("interpolation", "closest");
+			break;
+		case INTERPOLATION_CUBIC:
+			compiler.parameter("interpolation", "cubic");
+			break;
+		case INTERPOLATION_SMART:
+			compiler.parameter("interpolation", "smart");
+			break;
+		case INTERPOLATION_LINEAR:
+		default:
+			compiler.parameter("interpolation", "linear");
+			break;
+	}
 	compiler.add(this, "node_image_texture");
 }
 
@@ -340,6 +358,7 @@ EnvironmentTextureNode::EnvironmentTextureNode()
 	slot = -1;
 	is_float = -1;
 	is_linear = false;
+	use_alpha = true;
 	filename = "";
 	builtin_data = NULL;
 	color_space = ustring("Color");
@@ -354,7 +373,7 @@ EnvironmentTextureNode::EnvironmentTextureNode()
 EnvironmentTextureNode::~EnvironmentTextureNode()
 {
 	if(image_manager)
-		image_manager->remove_image(filename, builtin_data);
+		image_manager->remove_image(filename, builtin_data, INTERPOLATION_LINEAR);
 }
 
 ShaderNode *EnvironmentTextureNode::clone() const
@@ -389,7 +408,7 @@ void EnvironmentTextureNode::compile(SVMCompiler& compiler)
 	image_manager = compiler.image_manager;
 	if(slot == -1) {
 		bool is_float_bool;
-		slot = image_manager->add_image(filename, builtin_data, animated, is_float_bool, is_linear);
+		slot = image_manager->add_image(filename, builtin_data, animated, is_float_bool, is_linear, INTERPOLATION_LINEAR, use_alpha);
 		is_float = (int)is_float_bool;
 	}
 
@@ -565,13 +584,13 @@ static void sky_texture_precompute_new(SunSky *sunsky, float3 dir, float turbidi
 
 	/* Copy values from sky_state to SunSky */
 	for (int i = 0; i < 9; ++i) {
-		sunsky->config_x[i] = sky_state->configs[0][i];
-		sunsky->config_y[i] = sky_state->configs[1][i];
-		sunsky->config_z[i] = sky_state->configs[2][i];
+		sunsky->config_x[i] = (float)sky_state->configs[0][i];
+		sunsky->config_y[i] = (float)sky_state->configs[1][i];
+		sunsky->config_z[i] = (float)sky_state->configs[2][i];
 	}
-	sunsky->radiance_x = sky_state->radiances[0];
-	sunsky->radiance_y = sky_state->radiances[1];
-	sunsky->radiance_z = sky_state->radiances[2];
+	sunsky->radiance_x = (float)sky_state->radiances[0];
+	sunsky->radiance_y = (float)sky_state->radiances[1];
+	sunsky->radiance_z = (float)sky_state->radiances[2];
 
 	/* Free sky_state */
 	arhosekskymodelstate_free(sky_state);
@@ -612,6 +631,8 @@ void SkyTextureNode::compile(SVMCompiler& compiler)
 		sky_texture_precompute_old(&sunsky, sun_direction, turbidity);
 	else if(type_enum[type] == NODE_SKY_NEW)
 		sky_texture_precompute_new(&sunsky, sun_direction, turbidity, ground_albedo);
+	else
+		assert(false);
 
 	if(vector_in->link)
 		compiler.stack_assign(vector_in);
@@ -649,6 +670,8 @@ void SkyTextureNode::compile(OSLCompiler& compiler)
 		sky_texture_precompute_old(&sunsky, sun_direction, turbidity);
 	else if(type_enum[type] == NODE_SKY_NEW)
 		sky_texture_precompute_new(&sunsky, sun_direction, turbidity, ground_albedo);
+	else
+		assert(false);
 		
 	compiler.parameter("sky_model", type);
 	compiler.parameter("theta", sunsky.theta);
@@ -2192,8 +2215,9 @@ void TextureCoordinateNode::attributes(Shader *shader, AttributeRequestSet *attr
 
 	if(shader->has_volume) {
 		if(!from_dupli) {
-			if(!output("Generated")->links.empty())
+			if(!output("Generated")->links.empty()) {
 				attributes->add(ATTR_STD_GENERATED_TRANSFORM);
+			}
 		}
 	}
 
@@ -2310,6 +2334,78 @@ void TextureCoordinateNode::compile(OSLCompiler& compiler)
 	compiler.add(this, "node_texture_coordinate");
 }
 
+UVMapNode::UVMapNode()
+: ShaderNode("uvmap")
+{
+	attribute = "";
+	from_dupli = false;
+
+	add_output("UV", SHADER_SOCKET_POINT);
+}
+
+void UVMapNode::attributes(Shader *shader, AttributeRequestSet *attributes)
+{
+	if(shader->has_surface) {
+		if(!from_dupli) {
+			if(!output("UV")->links.empty()) {
+				if (attribute != "")
+					attributes->add(attribute);
+				else
+					attributes->add(ATTR_STD_UV);
+			}
+		}
+	}
+
+	ShaderNode::attributes(shader, attributes);
+}
+
+void UVMapNode::compile(SVMCompiler& compiler)
+{
+	ShaderOutput *out = output("UV");
+	NodeType texco_node = NODE_TEX_COORD;
+	NodeType attr_node = NODE_ATTR;
+	int attr;
+
+	if(bump == SHADER_BUMP_DX) {
+		texco_node = NODE_TEX_COORD_BUMP_DX;
+		attr_node = NODE_ATTR_BUMP_DX;
+	}
+	else if(bump == SHADER_BUMP_DY) {
+		texco_node = NODE_TEX_COORD_BUMP_DY;
+		attr_node = NODE_ATTR_BUMP_DY;
+	}
+
+	if(!out->links.empty()) {
+		if(from_dupli) {
+			compiler.stack_assign(out);
+			compiler.add_node(texco_node, NODE_TEXCO_DUPLI_UV, out->stack_offset);
+		}
+		else {
+			if (attribute != "")
+				attr = compiler.attribute(attribute);
+			else
+				attr = compiler.attribute(ATTR_STD_UV);
+
+			compiler.stack_assign(out);
+			compiler.add_node(attr_node, attr, out->stack_offset, NODE_ATTR_FLOAT3);
+		}
+	}
+}
+
+void UVMapNode::compile(OSLCompiler& compiler)
+{
+	if(bump == SHADER_BUMP_DX)
+		compiler.parameter("bump_offset", "dx");
+	else if(bump == SHADER_BUMP_DY)
+		compiler.parameter("bump_offset", "dy");
+	else
+		compiler.parameter("bump_offset", "center");
+
+	compiler.parameter("from_dupli", from_dupli);
+	compiler.parameter("name", attribute.c_str());
+	compiler.add(this, "node_uv_map");
+}
+
 /* Light Path */
 
 LightPathNode::LightPathNode()
@@ -2325,6 +2421,7 @@ LightPathNode::LightPathNode()
 	add_output("Is Volume Scatter Ray", SHADER_SOCKET_FLOAT);
 	add_output("Ray Length", SHADER_SOCKET_FLOAT);
 	add_output("Ray Depth", SHADER_SOCKET_FLOAT);
+	add_output("Transparent Depth", SHADER_SOCKET_FLOAT);
 }
 
 void LightPathNode::compile(SVMCompiler& compiler)
@@ -2392,6 +2489,11 @@ void LightPathNode::compile(SVMCompiler& compiler)
 		compiler.add_node(NODE_LIGHT_PATH, NODE_LP_ray_depth, out->stack_offset);
 	}
 
+	out = output("Transparent Depth");
+	if(!out->links.empty()) {
+		compiler.stack_assign(out);
+		compiler.add_node(NODE_LIGHT_PATH, NODE_LP_ray_transparent, out->stack_offset);
+	}
 }
 
 void LightPathNode::compile(OSLCompiler& compiler)
@@ -2612,7 +2714,7 @@ void HairInfoNode::attributes(Shader *shader, AttributeRequestSet *attributes)
 		if(!intercept_out->links.empty())
 			attributes->add(ATTR_STD_CURVE_INTERCEPT);
 	}
-	
+
 	ShaderNode::attributes(shader, attributes);
 }
 
@@ -3126,15 +3228,22 @@ AttributeNode::AttributeNode()
 
 void AttributeNode::attributes(Shader *shader, AttributeRequestSet *attributes)
 {
-	if(shader->has_surface) {
-		ShaderOutput *color_out = output("Color");
-		ShaderOutput *vector_out = output("Vector");
-		ShaderOutput *fac_out = output("Fac");
+	ShaderOutput *color_out = output("Color");
+	ShaderOutput *vector_out = output("Vector");
+	ShaderOutput *fac_out = output("Fac");
 
-		if(!color_out->links.empty() || !vector_out->links.empty() || !fac_out->links.empty())
+	if(!color_out->links.empty() || !vector_out->links.empty() || !fac_out->links.empty()) {
+		AttributeStandard std = Attribute::name_standard(attribute.c_str());
+
+		if(std != ATTR_STD_NONE)
+			attributes->add(std);
+		else
 			attributes->add(attribute);
 	}
-	
+
+	if(shader->has_volume)
+		attributes->add(ATTR_STD_GENERATED_TRANSFORM);
+
 	ShaderNode::attributes(shader, attributes);
 }
 
@@ -3144,6 +3253,13 @@ void AttributeNode::compile(SVMCompiler& compiler)
 	ShaderOutput *vector_out = output("Vector");
 	ShaderOutput *fac_out = output("Fac");
 	NodeType attr_node = NODE_ATTR;
+	AttributeStandard std = Attribute::name_standard(attribute.c_str());
+	int attr;
+
+	if(std != ATTR_STD_NONE)
+		attr = compiler.attribute(std);
+	else
+		attr = compiler.attribute(attribute);
 
 	if(bump == SHADER_BUMP_DX)
 		attr_node = NODE_ATTR_BUMP_DX;
@@ -3151,8 +3267,6 @@ void AttributeNode::compile(SVMCompiler& compiler)
 		attr_node = NODE_ATTR_BUMP_DY;
 
 	if(!color_out->links.empty() || !vector_out->links.empty()) {
-		int attr = compiler.attribute(attribute);
-
 		if(!color_out->links.empty()) {
 			compiler.stack_assign(color_out);
 			compiler.add_node(attr_node, attr, color_out->stack_offset, NODE_ATTR_FLOAT3);
@@ -3164,8 +3278,6 @@ void AttributeNode::compile(SVMCompiler& compiler)
 	}
 
 	if(!fac_out->links.empty()) {
-		int attr = compiler.attribute(attribute);
-
 		compiler.stack_assign(fac_out);
 		compiler.add_node(attr_node, attr, fac_out->stack_offset, NODE_ATTR_FLOAT);
 	}
@@ -3179,8 +3291,12 @@ void AttributeNode::compile(OSLCompiler& compiler)
 		compiler.parameter("bump_offset", "dy");
 	else
 		compiler.parameter("bump_offset", "center");
+	
+	if(Attribute::name_standard(attribute.c_str()) != ATTR_STD_NONE)
+		compiler.parameter("name", (string("geom:") + attribute.c_str()).c_str());
+	else
+		compiler.parameter("name", attribute.c_str());
 
-	compiler.parameter("name", attribute.c_str());
 	compiler.add(this, "node_attribute");
 }
 
@@ -3428,6 +3544,7 @@ static ShaderEnum math_type_init()
 	enm.insert("Less Than", NODE_MATH_LESS_THAN);
 	enm.insert("Greater Than", NODE_MATH_GREATER_THAN);
 	enm.insert("Modulo", NODE_MATH_MODULO);
+    enm.insert("Absolute", NODE_MATH_ABSOLUTE);
 
 	return enm;
 }
diff --git a/intern/cycles/render/nodes.h b/intern/cycles/render/nodes.h
index 86c4f490875..d94d8ce6033 100644
--- a/intern/cycles/render/nodes.h
+++ b/intern/cycles/render/nodes.h
@@ -72,10 +72,12 @@ public:
 	int slot;
 	int is_float;
 	bool is_linear;
+	bool use_alpha;
 	string filename;
 	void *builtin_data;
 	ustring color_space;
 	ustring projection;
+	InterpolationType interpolation;
 	float projection_blend;
 	bool animated;
 
@@ -94,6 +96,7 @@ public:
 	int slot;
 	int is_float;
 	bool is_linear;
+	bool use_alpha;
 	string filename;
 	void *builtin_data;
 	ustring color_space;
@@ -208,6 +211,7 @@ public:
 	BsdfNode(bool scattering = false);
 	SHADER_NODE_BASE_CLASS(BsdfNode);
 
+	bool has_spatial_varying() { return true; }
 	void compile(SVMCompiler& compiler, ShaderInput *param1, ShaderInput *param2, ShaderInput *param3 = NULL, ShaderInput *param4 = NULL);
 
 	ClosureType closure;
@@ -279,6 +283,7 @@ public:
 	SHADER_NODE_CLASS(SubsurfaceScatteringNode)
 	bool has_surface_bssrdf() { return true; }
 	bool has_bssrdf_bump();
+	bool has_spatial_varying() { return true; }
 
 	static ShaderEnum falloff_enum;
 };
@@ -288,6 +293,7 @@ public:
 	SHADER_NODE_CLASS(EmissionNode)
 
 	bool has_surface_emission() { return true; }
+	bool has_spatial_varying() { return true; }
 
 	bool total_power;
 };
@@ -305,6 +311,8 @@ public:
 class AmbientOcclusionNode : public ShaderNode {
 public:
 	SHADER_NODE_CLASS(AmbientOcclusionNode)
+
+	bool has_spatial_varying() { return true; }
 };
 
 class VolumeNode : public ShaderNode {
@@ -339,16 +347,28 @@ class GeometryNode : public ShaderNode {
 public:
 	SHADER_NODE_CLASS(GeometryNode)
 	void attributes(Shader *shader, AttributeRequestSet *attributes);
+	bool has_spatial_varying() { return true; }
 };
 
 class TextureCoordinateNode : public ShaderNode {
 public:
 	SHADER_NODE_CLASS(TextureCoordinateNode)
 	void attributes(Shader *shader, AttributeRequestSet *attributes);
+	bool has_spatial_varying() { return true; }
 	
 	bool from_dupli;
 };
 
+class UVMapNode : public ShaderNode {
+public:
+	SHADER_NODE_CLASS(UVMapNode)
+	void attributes(Shader *shader, AttributeRequestSet *attributes);
+	bool has_spatial_varying() { return true; }
+
+	ustring attribute; /* UV map name; when empty, ATTR_STD_UV is used instead */
+	bool from_dupli; /* SVM: compiles to NODE_TEXCO_DUPLI_UV instead of an attribute node */
+};
+
 class LightPathNode : public ShaderNode {
 public:
 	SHADER_NODE_CLASS(LightPathNode)
@@ -357,6 +377,7 @@ public:
 class LightFalloffNode : public ShaderNode {
 public:
 	SHADER_NODE_CLASS(LightFalloffNode)
+	bool has_spatial_varying() { return true; }
 };
 
 class ObjectInfoNode : public ShaderNode {
@@ -375,6 +396,7 @@ public:
 	SHADER_NODE_CLASS(HairInfoNode)
 
 	void attributes(Shader *shader, AttributeRequestSet *attributes);
+	bool has_spatial_varying() { return true; }
 };
 
 class ValueNode : public ShaderNode {
@@ -460,6 +482,7 @@ class AttributeNode : public ShaderNode {
 public:
 	SHADER_NODE_CLASS(AttributeNode)
 	void attributes(Shader *shader, AttributeRequestSet *attributes);
+	bool has_spatial_varying() { return true; }
 
 	ustring attribute;
 };
@@ -467,21 +490,25 @@ public:
 class CameraNode : public ShaderNode {
 public:
 	SHADER_NODE_CLASS(CameraNode)
+	bool has_spatial_varying() { return true; }
 };
 
 class FresnelNode : public ShaderNode {
 public:
 	SHADER_NODE_CLASS(FresnelNode)
+	bool has_spatial_varying() { return true; }
 };
 
 class LayerWeightNode : public ShaderNode {
 public:
 	SHADER_NODE_CLASS(LayerWeightNode)
+	bool has_spatial_varying() { return true; }
 };
 
 class WireframeNode : public ShaderNode {
 public:
 	SHADER_NODE_CLASS(WireframeNode)
+	bool has_spatial_varying() { return true; }
 	
 	bool use_pixel_size;
 };
@@ -538,6 +565,8 @@ public:
 class BumpNode : public ShaderNode {
 public:
 	SHADER_NODE_CLASS(BumpNode)
+	bool has_spatial_varying() { return true; }
+
 	bool invert;
 };
 
@@ -568,6 +597,10 @@ public:
 class OSLScriptNode : public ShaderNode {
 public:
 	SHADER_NODE_CLASS(OSLScriptNode)
+
+	/* ideally we could better detect this, but we can't query this now */
+	bool has_spatial_varying() { return true; }
+
 	string filepath;
 	string bytecode_hash;
 	
@@ -581,6 +614,7 @@ class NormalMapNode : public ShaderNode {
 public:
 	SHADER_NODE_CLASS(NormalMapNode)
 	void attributes(Shader *shader, AttributeRequestSet *attributes);
+	bool has_spatial_varying() { return true; }
 
 	ustring space;
 	static ShaderEnum space_enum;
@@ -592,6 +626,7 @@ class TangentNode : public ShaderNode {
 public:
 	SHADER_NODE_CLASS(TangentNode)
 	void attributes(Shader *shader, AttributeRequestSet *attributes);
+	bool has_spatial_varying() { return true; }
 
 	ustring direction_type;
 	static ShaderEnum direction_type_enum;
diff --git a/intern/cycles/render/object.cpp b/intern/cycles/render/object.cpp
index 3edb934ef2c..027bfd71931 100644
--- a/intern/cycles/render/object.cpp
+++ b/intern/cycles/render/object.cpp
@@ -19,6 +19,7 @@
 #include "mesh.h"
 #include "curves.h"
 #include "object.h"
+#include "particles.h"
 #include "scene.h"
 
 #include "util_foreach.h"
@@ -38,7 +39,8 @@ Object::Object()
 	visibility = ~0;
 	random_id = 0;
 	pass_id = 0;
-	particle_id = 0;
+	particle_system = NULL;
+	particle_index = 0;
 	bounds = BoundBox::empty;
 	motion.pre = transform_identity();
 	motion.mid = transform_identity();
@@ -53,7 +55,7 @@ Object::~Object()
 {
 }
 
-void Object::compute_bounds(bool motion_blur, float shuttertime)
+void Object::compute_bounds(bool motion_blur)
 {
 	BoundBox mbounds = mesh->bounds;
 
@@ -66,10 +68,7 @@ void Object::compute_bounds(bool motion_blur, float shuttertime)
 		/* todo: this is really terrible. according to pbrt there is a better
 		 * way to find this iteratively, but did not find implementation yet
 		 * or try to implement myself */
-		float start_t = 0.5f - shuttertime*0.25f;
-		float end_t = 0.5f + shuttertime*0.25f;
-
-		for(float t = start_t; t < end_t; t += (1.0f/128.0f)*shuttertime) {
+		for(float t = 0.0f; t < 1.0f; t += (1.0f/128.0f)) {
 			Transform ttfm;
 
 			transform_motion_interpolate(&ttfm, &decomp, t);
@@ -80,29 +79,83 @@ void Object::compute_bounds(bool motion_blur, float shuttertime)
 		bounds = mbounds.transformed(&tfm);
 }
 
-void Object::apply_transform()
+void Object::apply_transform(bool apply_to_motion)
 {
 	if(!mesh || tfm == transform_identity())
 		return;
+	
+	/* triangles */
+	if(mesh->verts.size()) {
+		/* store matrix to transform later. when accessing these as attributes we
+		 * do not want the transform to be applied for consistency between static
+		 * and dynamic BVH, so we do it on packing. */
+		mesh->transform_normal = transform_transpose(transform_inverse(tfm));
+
+		/* apply to mesh vertices */
+		for(size_t i = 0; i < mesh->verts.size(); i++)
+			mesh->verts[i] = transform_point(&tfm, mesh->verts[i]);
+		
+		if(apply_to_motion) {
+			Attribute *attr = mesh->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
+
+			if (attr) {
+				size_t steps_size = mesh->verts.size() * (mesh->motion_steps - 1);
+				float3 *vert_steps = attr->data_float3();
+
+				for (size_t i = 0; i < steps_size; i++)
+					vert_steps[i] = transform_point(&tfm, vert_steps[i]);
+			}
 
-	float3 c0 = transform_get_column(&tfm, 0);
-	float3 c1 = transform_get_column(&tfm, 1);
-	float3 c2 = transform_get_column(&tfm, 2);
-	float scalar = pow(fabsf(dot(cross(c0, c1), c2)), 1.0f/3.0f);
+			Attribute *attr_N = mesh->attributes.find(ATTR_STD_MOTION_VERTEX_NORMAL);
 
-	for(size_t i = 0; i < mesh->verts.size(); i++)
-		mesh->verts[i] = transform_point(&tfm, mesh->verts[i]);
+			if(attr_N) {
+				Transform ntfm = mesh->transform_normal;
+				size_t steps_size = mesh->verts.size() * (mesh->motion_steps - 1);
+				float3 *normal_steps = attr_N->data_float3();
 
-	for(size_t i = 0; i < mesh->curve_keys.size(); i++) {
-		mesh->curve_keys[i].co = transform_point(&tfm, mesh->curve_keys[i].co);
-		/* scale for strand radius - only correct for uniform transforms*/
-		mesh->curve_keys[i].radius *= scalar;
+				for (size_t i = 0; i < steps_size; i++)
+					normal_steps[i] = normalize(transform_direction(&ntfm, normal_steps[i]));
+			}
+		}
 	}
 
-	/* store matrix to transform later. when accessing these as attributes we
-	 * do not want the transform to be applied for consistency between static
-	 * and dynamic BVH, so we do it on packing. */
-	mesh->transform_normal = transform_transpose(transform_inverse(tfm));
+	/* curves */
+	if(mesh->curve_keys.size()) {
+		/* compute uniform scale */
+		float3 c0 = transform_get_column(&tfm, 0);
+		float3 c1 = transform_get_column(&tfm, 1);
+		float3 c2 = transform_get_column(&tfm, 2);
+		float scalar = pow(fabsf(dot(cross(c0, c1), c2)), 1.0f/3.0f);
+
+		/* apply transform to curve keys */
+		for(size_t i = 0; i < mesh->curve_keys.size(); i++) {
+			float3 co = transform_point(&tfm, float4_to_float3(mesh->curve_keys[i]));
+			float radius = mesh->curve_keys[i].w * scalar;
+
+			/* scale for curve radius is only correct for uniform scale */
+			mesh->curve_keys[i] = float3_to_float4(co);
+			mesh->curve_keys[i].w = radius;
+		}
+
+		if(apply_to_motion) {
+			Attribute *curve_attr = mesh->curve_attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
+
+			if (curve_attr) {
+				/* apply transform to motion curve keys */
+				size_t steps_size = mesh->curve_keys.size() * (mesh->motion_steps - 1);
+				float4 *key_steps = curve_attr->data_float4();
+
+				for (size_t i = 0; i < steps_size; i++) {
+					float3 co = transform_point(&tfm, float4_to_float3(key_steps[i]));
+					float radius = key_steps[i].w * scalar;
+
+					/* scale for curve radius is only correct for uniform scale */
+					key_steps[i] = float3_to_float4(co);
+					key_steps[i].w = radius;
+				}
+			}
+		}
+	}
 
 	/* we keep normals pointing in same direction on negative scale, notify
 	 * mesh about this in it (re)calculates normals */
@@ -111,7 +164,7 @@ void Object::apply_transform()
 
 	if(bounds.valid()) {
 		mesh->compute_bounds();
-		compute_bounds(false, 0.0f);
+		compute_bounds(false);
 	}
 
 	/* tfm is not reset to identity, all code that uses it needs to check the
@@ -137,6 +190,26 @@ void Object::tag_update(Scene *scene)
 	scene->object_manager->need_update = true;
 }
 
+/* Times at which motion is sampled for this object, spread evenly over
+ * [-1, 1] with the center step left out. Empty without mesh or with one step. */
+vector<float> Object::motion_times()
+{
+	vector<float> sample_times;
+	const int total_steps = (mesh)? mesh->motion_steps: 1;
+
+	if(total_steps == 1)
+		return sample_times;
+
+	const int center_step = total_steps / 2;
+	for(int step = 0; step < total_steps; step++) {
+		if(step == center_step)
+			continue;
+		sample_times.push_back(2.0f * step / (total_steps - 1) - 1.0f);
+	}
+
+	return sample_times;
+}
+
 /* Object Manager */
 
 ObjectManager::ObjectManager()
@@ -154,6 +227,7 @@ void ObjectManager::device_update_transforms(Device *device, DeviceScene *dscene
 	float4 *objects_vector = NULL;
 	int i = 0;
 	map<Mesh*, float> surface_area_map;
+	map<ParticleSystem*, int> particle_offset;
 	Scene::MotionType need_motion = scene->need_motion(device->info.advanced_shading);
 	bool have_motion = false;
 	bool have_curves = false;
@@ -162,6 +236,15 @@ void ObjectManager::device_update_transforms(Device *device, DeviceScene *dscene
 	if(need_motion == Scene::MOTION_PASS)
 		objects_vector = dscene->objects_vector.resize(OBJECT_VECTOR_SIZE*scene->objects.size());
 
+	/* particle system device offsets
+	 * 0 is dummy particle, index starts at 1
+	 */
+	int numparticles = 1;
+	foreach(ParticleSystem *psys, scene->particle_systems) {
+		particle_offset[psys] = numparticles;
+		numparticles += psys->particles.size();
+	}
+
 	foreach(Object *ob, scene->objects) {
 		Mesh *mesh = ob->mesh;
 		uint flag = 0;
@@ -177,6 +260,7 @@ void ObjectManager::device_update_transforms(Device *device, DeviceScene *dscene
 		float surface_area = 0.0f;
 		float pass_id = ob->pass_id;
 		float random_number = (float)ob->random_id * (1.0f/(float)0xFFFFFFFF);
+		int particle_index = (ob->particle_system)? ob->particle_index + particle_offset[ob->particle_system]: 0;
 
 		if(transform_uniform_scale(tfm, uniform_scale)) {
 			map<Mesh*, float>::iterator it = surface_area_map.find(mesh);
@@ -190,20 +274,6 @@ void ObjectManager::device_update_transforms(Device *device, DeviceScene *dscene
 					surface_area += triangle_area(p1, p2, p3);
 				}
 
-				foreach(Mesh::Curve& curve, mesh->curves) {
-					int first_key = curve.first_key;
-
-					for(int i = 0; i < curve.num_segments(); i++) {
-						float3 p1 = mesh->curve_keys[first_key + i].co;
-						float r1 = mesh->curve_keys[first_key + i].radius;
-						float3 p2 = mesh->curve_keys[first_key + i + 1].co;
-						float r2 = mesh->curve_keys[first_key + i + 1].radius;
-
-						/* currently ignores segment overlaps*/
-						surface_area += M_PI_F *(r1 + r2) * len(p1 - p2);
-					}
-				}
-
 				surface_area_map[mesh] = surface_area;
 			}
 			else
@@ -219,31 +289,17 @@ void ObjectManager::device_update_transforms(Device *device, DeviceScene *dscene
 
 				surface_area += triangle_area(p1, p2, p3);
 			}
-
-			foreach(Mesh::Curve& curve, mesh->curves) {
-				int first_key = curve.first_key;
-
-				for(int i = 0; i < curve.num_segments(); i++) {
-					float3 p1 = mesh->curve_keys[first_key + i].co;
-					float r1 = mesh->curve_keys[first_key + i].radius;
-					float3 p2 = mesh->curve_keys[first_key + i + 1].co;
-					float r2 = mesh->curve_keys[first_key + i + 1].radius;
-
-					p1 = transform_point(&tfm, p1);
-					p2 = transform_point(&tfm, p2);
-
-					/* currently ignores segment overlaps*/
-					surface_area += M_PI_F *(r1 + r2) * len(p1 - p2);
-				}
-			}
 		}
 
 		/* pack in texture */
 		int offset = i*OBJECT_SIZE;
 
+		/* OBJECT_TRANSFORM */
 		memcpy(&objects[offset], &tfm, sizeof(float4)*3);
+		/* OBJECT_INVERSE_TRANSFORM */
 		memcpy(&objects[offset+4], &itfm, sizeof(float4)*3);
-		objects[offset+8] = make_float4(surface_area, pass_id, random_number, __int_as_float(ob->particle_id));
+		/* OBJECT_PROPERTIES */
+		objects[offset+8] = make_float4(surface_area, pass_id, random_number, __int_as_float(particle_index));
 
 		if(need_motion == Scene::MOTION_PASS) {
 			/* motion transformations, is world/object space depending if mesh
@@ -252,10 +308,10 @@ void ObjectManager::device_update_transforms(Device *device, DeviceScene *dscene
 			Transform mtfm_pre = ob->motion.pre;
 			Transform mtfm_post = ob->motion.post;
 
-			if(!(mesh->attributes.find(ATTR_STD_MOTION_PRE) || mesh->curve_attributes.find(ATTR_STD_MOTION_PRE)))
+			if(!mesh->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION)) {
 				mtfm_pre = mtfm_pre * itfm;
-			if(!(mesh->attributes.find(ATTR_STD_MOTION_POST) || mesh->curve_attributes.find(ATTR_STD_MOTION_POST)))
 				mtfm_post = mtfm_post * itfm;
+			}
 
 			memcpy(&objects_vector[i*OBJECT_VECTOR_SIZE+0], &mtfm_pre, sizeof(float4)*3);
 			memcpy(&objects_vector[i*OBJECT_VECTOR_SIZE+3], &mtfm_post, sizeof(float4)*3);
@@ -274,9 +330,17 @@ void ObjectManager::device_update_transforms(Device *device, DeviceScene *dscene
 		}
 #endif
 
-		/* dupli object coords */
-		objects[offset+9] = make_float4(ob->dupli_generated[0], ob->dupli_generated[1], ob->dupli_generated[2], 0.0f);
-		objects[offset+10] = make_float4(ob->dupli_uv[0], ob->dupli_uv[1], 0.0f, 0.0f);
+		if(mesh->use_motion_blur)
+			have_motion = true;
+
+		/* dupli object coords and motion info */
+		int totalsteps = mesh->motion_steps;
+		int numsteps = (totalsteps - 1)/2;
+		int numverts = mesh->verts.size();
+		int numkeys = mesh->curve_keys.size();
+
+		objects[offset+9] = make_float4(ob->dupli_generated[0], ob->dupli_generated[1], ob->dupli_generated[2], __int_as_float(numkeys));
+		objects[offset+10] = make_float4(ob->dupli_uv[0], ob->dupli_uv[1], __int_as_float(numsteps), __int_as_float(numverts));
 
 		/* object flag */
 		if(ob->use_holdout)
@@ -355,6 +419,7 @@ void ObjectManager::apply_static_transforms(DeviceScene *dscene, Scene *scene, u
 #ifdef __OBJECT_MOTION__
 	Scene::MotionType need_motion = scene->need_motion();
 	bool motion_blur = need_motion == Scene::MOTION_BLUR;
+	bool apply_to_motion = need_motion != Scene::MOTION_PASS;
 #else
 	bool motion_blur = false;
 #endif
@@ -377,7 +442,7 @@ void ObjectManager::apply_static_transforms(DeviceScene *dscene, Scene *scene, u
 		if(mesh_users[object->mesh] == 1) {
 			if(!(motion_blur && object->use_motion)) {
 				if(!object->mesh->transform_applied) {
-					object->apply_transform();
+					object->apply_transform(apply_to_motion);
 					object->mesh->transform_applied = true;
 
 					if(progress.get_cancel()) return;
diff --git a/intern/cycles/render/object.h b/intern/cycles/render/object.h
index 5da85be3873..677526b715f 100644
--- a/intern/cycles/render/object.h
+++ b/intern/cycles/render/object.h
@@ -27,6 +27,7 @@ CCL_NAMESPACE_BEGIN
 class Device;
 class DeviceScene;
 class Mesh;
+class ParticleSystem;
 class Progress;
 class Scene;
 struct Transform;
@@ -50,15 +51,18 @@ public:
 	float3 dupli_generated;
 	float2 dupli_uv;
 
-	int particle_id;
-
+	ParticleSystem *particle_system;
+	int particle_index;
+	
 	Object();
 	~Object();
 
 	void tag_update(Scene *scene);
 
-	void compute_bounds(bool motion_blur, float shuttertime);
-	void apply_transform();
+	void compute_bounds(bool motion_blur);
+	void apply_transform(bool apply_to_motion);
+
+	vector<float> motion_times();
 };
 
 /* Object Manager */
diff --git a/intern/cycles/render/osl.cpp b/intern/cycles/render/osl.cpp
index e2798f438e2..94866102f60 100644
--- a/intern/cycles/render/osl.cpp
+++ b/intern/cycles/render/osl.cpp
@@ -203,7 +203,6 @@ void OSLShaderManager::shading_system_init()
 			"glossy",			/* PATH_RAY_GLOSSY */
 			"singular",			/* PATH_RAY_SINGULAR */
 			"transparent",		/* PATH_RAY_TRANSPARENT */
-			"volume_scatter",	/* PATH_RAY_VOLUME_SCATTER */
 			"shadow",			/* PATH_RAY_SHADOW_OPAQUE */
 			"shadow",			/* PATH_RAY_SHADOW_TRANSPARENT */
 
@@ -212,6 +211,8 @@ void OSLShaderManager::shading_system_init()
 			"diffuse_ancestor", /* PATH_RAY_DIFFUSE_ANCESTOR */
 			"glossy_ancestor",  /* PATH_RAY_GLOSSY_ANCESTOR */
 			"bssrdf_ancestor",  /* PATH_RAY_BSSRDF_ANCESTOR */
+			"__unused__",		/* PATH_RAY_SINGLE_PASS_DONE */
+			"volume_scatter",	/* PATH_RAY_VOLUME_SCATTER */
 		};
 
 		const int nraytypes = sizeof(raytypes)/sizeof(raytypes[0]);
@@ -512,16 +513,14 @@ void OSLCompiler::add(ShaderNode *node, const char *name, bool isfilepath)
 		}
 	}
 
-	/* create shader of the appropriate type. we pass "surface" to all shaders,
-	 * because "volume" and "displacement" don't work yet in OSL. the shaders
-	 * work fine, but presumably these values would be used for more strict
-	 * checking, so when that is fixed, we should update the code here too. */
+	/* create shader of the appropriate type. OSL only distinguishes between "surface"
+	 * and "displacement" at the moment */
 	if(current_type == SHADER_TYPE_SURFACE)
 		ss->Shader("surface", name, id(node).c_str());
 	else if(current_type == SHADER_TYPE_VOLUME)
 		ss->Shader("surface", name, id(node).c_str());
 	else if(current_type == SHADER_TYPE_DISPLACEMENT)
-		ss->Shader("surface", name, id(node).c_str());
+		ss->Shader("displacement", name, id(node).c_str());
 	else
 		assert(0);
 	
@@ -544,7 +543,7 @@ void OSLCompiler::add(ShaderNode *node, const char *name, bool isfilepath)
 	/* test if we shader contains specific closures */
 	OSLShaderInfo *info = ((OSLShaderManager*)manager)->shader_loaded_info(name);
 
-	if(info) {
+	if(info && current_type == SHADER_TYPE_SURFACE) {
 		if(info->has_surface_emission)
 			current_shader->has_surface_emission = true;
 		if(info->has_surface_transparent)
@@ -554,6 +553,10 @@ void OSLCompiler::add(ShaderNode *node, const char *name, bool isfilepath)
 			current_shader->has_bssrdf_bump = true; /* can't detect yet */
 		}
 	}
+	else if(current_type == SHADER_TYPE_VOLUME) {
+		if(node->has_spatial_varying())
+			current_shader->has_heterogeneous_volume = true;
+	}
 }
 
 void OSLCompiler::parameter(const char *name, float f)
@@ -709,14 +712,20 @@ void OSLCompiler::generate_nodes(const set<ShaderNode*>& nodes)
 					node->compile(*this);
 					done.insert(node);
 
-					if(node->has_surface_emission())
-						current_shader->has_surface_emission = true;
-					if(node->has_surface_transparent())
-						current_shader->has_surface_transparent = true;
-					if(node->has_surface_bssrdf()) {
-						current_shader->has_surface_bssrdf = true;
-						if(node->has_bssrdf_bump())
-							current_shader->has_bssrdf_bump = true;
+					if(current_type == SHADER_TYPE_SURFACE) {
+						if(node->has_surface_emission())
+							current_shader->has_surface_emission = true;
+						if(node->has_surface_transparent())
+							current_shader->has_surface_transparent = true;
+						if(node->has_surface_bssrdf()) {
+							current_shader->has_surface_bssrdf = true;
+							if(node->has_bssrdf_bump())
+								current_shader->has_bssrdf_bump = true;
+						}
+					}
+					else if(current_type == SHADER_TYPE_VOLUME) {
+						if(node->has_spatial_varying())
+							current_shader->has_heterogeneous_volume = true;
 					}
 				}
 				else
@@ -798,6 +807,7 @@ void OSLCompiler::compile(OSLGlobals *og, Shader *shader)
 		shader->has_bssrdf_bump = false;
 		shader->has_volume = false;
 		shader->has_displacement = false;
+		shader->has_heterogeneous_volume = false;
 
 		/* generate surface shader */
 		if(shader->used && graph && output->input("Surface")->link) {
diff --git a/intern/cycles/render/scene.cpp b/intern/cycles/render/scene.cpp
index 71f5a9dafed..4f5ad439520 100644
--- a/intern/cycles/render/scene.cpp
+++ b/intern/cycles/render/scene.cpp
@@ -17,6 +17,7 @@
 #include <stdlib.h>
 
 #include "background.h"
+#include "bake.h"
 #include "camera.h"
 #include "curves.h"
 #include "device.h"
@@ -54,6 +55,7 @@ Scene::Scene(const SceneParams& params_, const DeviceInfo& device_info_)
 	image_manager = new ImageManager();
 	particle_system_manager = new ParticleSystemManager();
 	curve_system_manager = new CurveSystemManager();
+	bake_manager = new BakeManager();
 
 	/* OSL only works on the CPU */
 	if(device_info_.type == DEVICE_CPU)
@@ -61,8 +63,8 @@ Scene::Scene(const SceneParams& params_, const DeviceInfo& device_info_)
 	else
 		shader_manager = ShaderManager::create(this, SceneParams::SVM);
 
-	if (device_info_.type == DEVICE_CPU)
-		image_manager->set_extended_image_limits();
+	/* Extended image limits for CPU and GPUs */
+	image_manager->set_extended_image_limits(device_info_);
 }
 
 Scene::~Scene()
@@ -103,6 +105,8 @@ void Scene::free_memory(bool final)
 		particle_system_manager->device_free(device, &dscene);
 		curve_system_manager->device_free(device, &dscene);
 
+		bake_manager->device_free(device, &dscene);
+
 		if(!params.persistent_data || final)
 			image_manager->device_free(device, &dscene);
 
@@ -122,6 +126,7 @@ void Scene::free_memory(bool final)
 		delete particle_system_manager;
 		delete curve_system_manager;
 		delete image_manager;
+		delete bake_manager;
 	}
 }
 
@@ -137,6 +142,8 @@ void Scene::device_update(Device *device_, Progress& progress)
 	 * - Camera may be used for adapative subdivison.
 	 * - Displacement shader must have all shader data available.
 	 * - Light manager needs lookup tables and final mesh data to compute emission CDF.
+	 * - Film needs light manager to run for use_light_visibility
+	 * - Lookup tables are done a second time to handle film tables
 	 */
 	
 	image_manager->set_pack_images(device->info.pack_images);
@@ -171,11 +178,6 @@ void Scene::device_update(Device *device_, Progress& progress)
 
 	if(progress.get_cancel()) return;
 
-	progress.set_status("Updating Film");
-	film->device_update(device, &dscene, this);
-
-	if(progress.get_cancel()) return;
-
 	progress.set_status("Updating Lookup Tables");
 	lookup_tables->device_update(device, &dscene);
 
@@ -196,11 +198,26 @@ void Scene::device_update(Device *device_, Progress& progress)
 
 	if(progress.get_cancel()) return;
 
+	progress.set_status("Updating Film");
+	film->device_update(device, &dscene, this);
+
+	if(progress.get_cancel()) return;
+
 	progress.set_status("Updating Integrator");
 	integrator->device_update(device, &dscene, this);
 
 	if(progress.get_cancel()) return;
 
+	progress.set_status("Updating Lookup Tables");
+	lookup_tables->device_update(device, &dscene);
+
+	if(progress.get_cancel()) return;
+
+	progress.set_status("Updating Baking");
+	bake_manager->device_update(device, &dscene, this, progress);
+
+	if(progress.get_cancel()) return;
+
 	progress.set_status("Updating Device", "Writing constant memory");
 	device->const_copy_to("__data", &dscene.data, sizeof(dscene.data));
 }
@@ -219,8 +236,10 @@ bool Scene::need_global_attribute(AttributeStandard std)
 {
 	if(std == ATTR_STD_UV)
 		return Pass::contains(film->passes, PASS_UV);
-	if(std == ATTR_STD_MOTION_PRE || std == ATTR_STD_MOTION_POST)
-		return need_motion() == MOTION_PASS;
+	else if(std == ATTR_STD_MOTION_VERTEX_POSITION)
+		return need_motion() != MOTION_NONE;
+	else if(std == ATTR_STD_MOTION_VERTEX_NORMAL)
+		return need_motion() == MOTION_BLUR;
 	
 	return false;
 }
@@ -249,7 +268,8 @@ bool Scene::need_reset()
 		|| integrator->need_update
 		|| shader_manager->need_update
 		|| particle_system_manager->need_update
-		|| curve_system_manager->need_update);
+		|| curve_system_manager->need_update
+		|| bake_manager->need_update);
 }
 
 void Scene::reset()
diff --git a/intern/cycles/render/scene.h b/intern/cycles/render/scene.h
index 2c223192536..0f0bb725823 100644
--- a/intern/cycles/render/scene.h
+++ b/intern/cycles/render/scene.h
@@ -51,6 +51,8 @@ class CurveSystemManager;
 class Shader;
 class ShaderManager;
 class Progress;
+class BakeManager;
+class BakeData;
 
 /* Scene Device Data */
 
@@ -60,7 +62,7 @@ public:
 	device_vector<float4> bvh_nodes;
 	device_vector<uint> object_node;
 	device_vector<float4> tri_woop;
-	device_vector<uint> prim_segment;
+	device_vector<uint> prim_type;
 	device_vector<uint> prim_visibility;
 	device_vector<uint> prim_index;
 	device_vector<uint> prim_object;
@@ -103,8 +105,8 @@ public:
 	/* integrator */
 	device_vector<uint> sobol_directions;
 
-	/* images */
-	device_vector<uchar4> tex_image[TEX_EXTENDED_NUM_IMAGES];
+	/* cpu images */
+	device_vector<uchar4> tex_image[TEX_EXTENDED_NUM_IMAGES_CPU];
 	device_vector<float4> tex_float_image[TEX_EXTENDED_NUM_FLOAT_IMAGES];
 
 	/* opencl images */
@@ -174,6 +176,7 @@ public:
 	ObjectManager *object_manager;
 	ParticleSystemManager *particle_system_manager;
 	CurveSystemManager *curve_system_manager;
+	BakeManager *bake_manager;
 
 	/* default shaders */
 	int default_surface;
diff --git a/intern/cycles/render/session.cpp b/intern/cycles/render/session.cpp
index 0805a685467..28b44df6b36 100644
--- a/intern/cycles/render/session.cpp
+++ b/intern/cycles/render/session.cpp
@@ -23,6 +23,7 @@
 #include "integrator.h"
 #include "scene.h"
 #include "session.h"
+#include "bake.h"
 
 #include "util_foreach.h"
 #include "util_function.h"
@@ -50,7 +51,7 @@ Session::Session(const SessionParams& params_)
 
 	device = Device::create(params.device, stats, params.background);
 
-	if(params.background) {
+	if(params.background && params.output_path.empty()) {
 		buffers = NULL;
 		display = NULL;
 	}
@@ -81,6 +82,7 @@ Session::Session(const SessionParams& params_)
 Session::~Session()
 {
 	if(session_thread) {
+		/* wait for session thread to end */
 		progress.set_cancel("Exiting");
 
 		gpu_need_tonemap = false;
@@ -95,13 +97,19 @@ Session::~Session()
 		wait();
 	}
 
-	if(display && !params.output_path.empty()) {
-		tonemap();
+	if(!params.output_path.empty()) {
+		/* tonemap and write out image if requested */
+		delete display;
+
+		display = new DisplayBuffer(device, false);
+		display->reset(device, buffers->params);
+		tonemap(params.samples);
 
 		progress.set_status("Writing Image", params.output_path);
 		display->write(device, params.output_path);
 	}
 
+	/* clean up */
 	foreach(RenderBuffers *buffers, tile_buffers)
 		delete buffers;
 
@@ -151,7 +159,7 @@ void Session::reset_gpu(BufferParams& buffer_params, int samples)
 	pause_cond.notify_all();
 }
 
-bool Session::draw_gpu(BufferParams& buffer_params)
+bool Session::draw_gpu(BufferParams& buffer_params, DeviceDrawParams& draw_params)
 {
 	/* block for buffer access */
 	thread_scoped_lock display_lock(display_mutex);
@@ -165,12 +173,12 @@ bool Session::draw_gpu(BufferParams& buffer_params)
 			 * only access GL buffers from the main thread */
 			if(gpu_need_tonemap) {
 				thread_scoped_lock buffers_lock(buffers_mutex);
-				tonemap();
+				tonemap(tile_manager.state.sample);
 				gpu_need_tonemap = false;
 				gpu_need_tonemap_cond.notify_all();
 			}
 
-			display->draw(device);
+			display->draw(device, draw_params);
 
 			if(display_outdated && (time_dt() - reset_time) > params.text_timeout)
 				return false;
@@ -315,7 +323,7 @@ void Session::reset_cpu(BufferParams& buffer_params, int samples)
 	pause_cond.notify_all();
 }
 
-bool Session::draw_cpu(BufferParams& buffer_params)
+bool Session::draw_cpu(BufferParams& buffer_params, DeviceDrawParams& draw_params)
 {
 	thread_scoped_lock display_lock(display_mutex);
 
@@ -324,7 +332,7 @@ bool Session::draw_cpu(BufferParams& buffer_params)
 		/* then verify the buffers have the expected size, so we don't
 		 * draw previous results in a resized window */
 		if(!buffer_params.modified(display->params)) {
-			display->draw(device);
+			display->draw(device, draw_params);
 
 			if(display_outdated && (time_dt() - reset_time) > params.text_timeout)
 				return false;
@@ -367,7 +375,7 @@ bool Session::acquire_tile(Device *tile_device, RenderTile& rtile)
 
 	/* in case of a permanent buffer, return it, otherwise we will allocate
 	 * a new temporary buffer */
-	if(!params.background) {
+	if(!(params.background && params.output_path.empty())) {
 		tile_manager.state.buffer.get_offset_stride(rtile.offset, rtile.stride);
 
 		rtile.buffer = buffers->buffer.device_pointer;
@@ -567,8 +575,8 @@ void Session::run_cpu()
 			}
 			else if(need_tonemap) {
 				/* tonemap only if we do not reset, we don't we don't
-				 * want to show the result of an incomplete sample*/
-				tonemap();
+				 * want to show the result of an incomplete sample */
+				tonemap(tile_manager.state.sample);
 			}
 
 			if(!device->error_message().empty())
@@ -624,12 +632,12 @@ void Session::run()
 		progress.set_update();
 }
 
-bool Session::draw(BufferParams& buffer_params)
+bool Session::draw(BufferParams& buffer_params, DeviceDrawParams &draw_params)
 {
 	if(device_use_gl)
-		return draw_gpu(buffer_params);
+		return draw_gpu(buffer_params, draw_params);
 	else
-		return draw_cpu(buffer_params);
+		return draw_cpu(buffer_params, draw_params);
 }
 
 void Session::reset_(BufferParams& buffer_params, int samples)
@@ -726,10 +734,14 @@ void Session::update_scene()
 		cam->tag_update();
 	}
 
-	/* number of samples is needed by multi jittered sampling pattern */
+	/* number of samples is needed by multi jittered
+	 * sampling pattern and by baking */
 	Integrator *integrator = scene->integrator;
+	BakeManager *bake_manager = scene->bake_manager;
 
-	if(integrator->sampling_pattern == SAMPLING_PATTERN_CMJ) {
+	if(integrator->sampling_pattern == SAMPLING_PATTERN_CMJ ||
+	   bake_manager->get_baking())
+	{
 		int aa_samples = tile_manager.num_samples;
 
 		if(aa_samples != integrator->aa_samples) {
@@ -834,7 +846,7 @@ void Session::path_trace()
 	device->task_add(task);
 }
 
-void Session::tonemap()
+void Session::tonemap(int sample)
 {
 	/* add tonemap task */
 	DeviceTask task(DeviceTask::FILM_CONVERT);
@@ -846,7 +858,7 @@ void Session::tonemap()
 	task.rgba_byte = display->rgba_byte.device_pointer;
 	task.rgba_half = display->rgba_half.device_pointer;
 	task.buffer = buffers->buffer.device_pointer;
-	task.sample = tile_manager.state.sample;
+	task.sample = sample;
 	tile_manager.state.buffer.get_offset_stride(task.offset, task.stride);
 
 	if(task.w > 0 && task.h > 0) {
diff --git a/intern/cycles/render/session.h b/intern/cycles/render/session.h
index 1227edf81b6..1e625158652 100644
--- a/intern/cycles/render/session.h
+++ b/intern/cycles/render/session.h
@@ -128,7 +128,7 @@ public:
 	~Session();
 
 	void start();
-	bool draw(BufferParams& params);
+	bool draw(BufferParams& params, DeviceDrawParams& draw_params);
 	void wait();
 
 	bool ready_to_reset();
@@ -136,6 +136,7 @@ public:
 	void set_samples(int samples);
 	void set_pause(bool pause);
 
+	void update_scene();
 	void device_free();
 protected:
 	struct DelayedReset {
@@ -147,19 +148,18 @@ protected:
 
 	void run();
 
-	void update_scene();
 	void update_status_time(bool show_pause = false, bool show_done = false);
 
-	void tonemap();
+	void tonemap(int sample);
 	void path_trace();
 	void reset_(BufferParams& params, int samples);
 
 	void run_cpu();
-	bool draw_cpu(BufferParams& params);
+	bool draw_cpu(BufferParams& params, DeviceDrawParams& draw_params);
 	void reset_cpu(BufferParams& params, int samples);
 
 	void run_gpu();
-	bool draw_gpu(BufferParams& params);
+	bool draw_gpu(BufferParams& params, DeviceDrawParams& draw_params);
 	void reset_gpu(BufferParams& params, int samples);
 
 	bool acquire_tile(Device *tile_device, RenderTile& tile);
diff --git a/intern/cycles/render/shader.cpp b/intern/cycles/render/shader.cpp
index 20f0fd7ed1e..b25673b36c3 100644
--- a/intern/cycles/render/shader.cpp
+++ b/intern/cycles/render/shader.cpp
@@ -53,6 +53,7 @@ Shader::Shader()
 	has_volume = false;
 	has_displacement = false;
 	has_bssrdf_bump = false;
+	has_heterogeneous_volume = false;
 
 	used = false;
 
@@ -249,7 +250,7 @@ void ShaderManager::device_update_common(Device *device, DeviceScene *dscene, Sc
 			 * the case with camera inside volumes too */
 			flag |= SD_HAS_TRANSPARENT_SHADOW;
 		}
-		if(shader->heterogeneous_volume)
+		if(shader->heterogeneous_volume && shader->has_heterogeneous_volume)
 			flag |= SD_HETEROGENEOUS_VOLUME;
 		if(shader->has_bssrdf_bump)
 			flag |= SD_HAS_BSSRDF_BUMP;
diff --git a/intern/cycles/render/shader.h b/intern/cycles/render/shader.h
index 5f87050fe19..874e8face7a 100644
--- a/intern/cycles/render/shader.h
+++ b/intern/cycles/render/shader.h
@@ -77,6 +77,7 @@ public:
 	bool has_surface_bssrdf;
 	bool has_converter_blackbody;
 	bool has_bssrdf_bump;
+	bool has_heterogeneous_volume;
 
 	/* requested mesh attributes */
 	AttributeRequestSet attributes;
diff --git a/intern/cycles/render/sky_model.cpp b/intern/cycles/render/sky_model.cpp
index 6f250c06bc1..adb07d9e288 100644
--- a/intern/cycles/render/sky_model.cpp
+++ b/intern/cycles/render/sky_model.cpp
@@ -310,7 +310,7 @@ double arhosekskymodel_radiance(
         double                  wavelength
         )
 {
-    int low_wl = (wavelength - 320.0 ) / 40.0;
+    int low_wl = (int)((wavelength - 320.0) / 40.0);
 
     if ( low_wl < 0 || low_wl >= 11 )
         return 0.0f;
diff --git a/intern/cycles/render/svm.cpp b/intern/cycles/render/svm.cpp
index 538b1aae313..576c176759c 100644
--- a/intern/cycles/render/svm.cpp
+++ b/intern/cycles/render/svm.cpp
@@ -63,8 +63,6 @@ void SVMShaderManager::device_update(Device *device, DeviceScene *dscene, Scene
 		svm_nodes.push_back(make_int4(NODE_SHADER_JUMP, 0, 0, 0));
 	}
 	
-	bool use_multi_closure = device->info.advanced_shading;
-
 	for(i = 0; i < scene->shaders.size(); i++) {
 		Shader *shader = scene->shaders[i];
 
@@ -75,8 +73,7 @@ void SVMShaderManager::device_update(Device *device, DeviceScene *dscene, Scene
 		if(shader->use_mis && shader->has_surface_emission)
 			scene->light_manager->need_update = true;
 
-		SVMCompiler compiler(scene->shader_manager, scene->image_manager,
-			use_multi_closure);
+		SVMCompiler compiler(scene->shader_manager, scene->image_manager);
 		compiler.background = ((int)i == scene->default_background);
 		compiler.compile(shader, svm_nodes, i);
 	}
@@ -104,7 +101,7 @@ void SVMShaderManager::device_free(Device *device, DeviceScene *dscene, Scene *s
 
 /* Graph Compiler */
 
-SVMCompiler::SVMCompiler(ShaderManager *shader_manager_, ImageManager *image_manager_, bool use_multi_closure_)
+SVMCompiler::SVMCompiler(ShaderManager *shader_manager_, ImageManager *image_manager_)
 {
 	shader_manager = shader_manager_;
 	image_manager = image_manager_;
@@ -114,7 +111,6 @@ SVMCompiler::SVMCompiler(ShaderManager *shader_manager_, ImageManager *image_man
 	current_graph = NULL;
 	background = false;
 	mix_weight_offset = SVM_STACK_INVALID;
-	use_multi_closure = use_multi_closure_;
 	compile_failed = false;
 }
 
@@ -230,7 +226,8 @@ void SVMCompiler::stack_assign(ShaderInput *input)
 			else if(input->type == SHADER_SOCKET_VECTOR ||
 			        input->type == SHADER_SOCKET_NORMAL ||
 			        input->type == SHADER_SOCKET_POINT ||
-			        input->type == SHADER_SOCKET_COLOR) {
+			        input->type == SHADER_SOCKET_COLOR)
+			{
 
 				add_node(NODE_VALUE_V, input->stack_offset);
 				add_node(NODE_VALUE_V, input->value);
@@ -379,6 +376,22 @@ void SVMCompiler::find_dependencies(set<ShaderNode*>& dependencies, const set<Sh
 	}
 }
 
+void SVMCompiler::generate_node(ShaderNode *node, set<ShaderNode*>& done)
+{
+	node->compile(*this);
+	stack_clear_users(node, done);
+	stack_clear_temporary(node);
+
+	if(current_type == SHADER_TYPE_VOLUME) {
+		if(node->has_spatial_varying())
+			current_shader->has_heterogeneous_volume = true;
+	}
+
+	/* detect if we have a blackbody converter, to prepare lookup table */
+	if(node->has_converter_blackbody())
+		current_shader->has_converter_blackbody = true;
+}
+
 void SVMCompiler::generate_svm_nodes(const set<ShaderNode*>& nodes, set<ShaderNode*>& done)
 {
 	bool nodes_done;
@@ -396,13 +409,7 @@ void SVMCompiler::generate_svm_nodes(const set<ShaderNode*>& nodes, set<ShaderNo
 							inputs_done = false;
 
 				if(inputs_done) {
-					/* Detect if we have a blackbody converter, to prepare lookup table */
-					if(node->has_converter_blackbody())
-					current_shader->has_converter_blackbody = true;
-
-					node->compile(*this);
-					stack_clear_users(node, done);
-					stack_clear_temporary(node);
+					generate_node(node, done);
 					done.insert(node);
 				}
 				else
@@ -412,83 +419,34 @@ void SVMCompiler::generate_svm_nodes(const set<ShaderNode*>& nodes, set<ShaderNo
 	} while(!nodes_done);
 }
 
-void SVMCompiler::generate_closure(ShaderNode *node, set<ShaderNode*>& done)
+void SVMCompiler::generate_closure_node(ShaderNode *node, set<ShaderNode*>& done)
 {
-	if(node->name == ustring("mix_closure") || node->name == ustring("add_closure")) {
-		ShaderInput *fin = node->input("Fac");
-		ShaderInput *cl1in = node->input("Closure1");
-		ShaderInput *cl2in = node->input("Closure2");
-
-		/* execute dependencies for mix weight */
-		if(fin) {
+	/* execute dependencies for closure */
+	foreach(ShaderInput *in, node->inputs) {
+		if(!node_skip_input(node, in) && in->link) {
 			set<ShaderNode*> dependencies;
-			find_dependencies(dependencies, done, fin);
+			find_dependencies(dependencies, done, in);
 			generate_svm_nodes(dependencies, done);
-
-			/* add mix node */
-			stack_assign(fin);
-		}
-
-		int mix_offset = svm_nodes.size();
-
-		if(fin)
-			add_node(NODE_MIX_CLOSURE, fin->stack_offset, 0, 0);
-		else
-			add_node(NODE_ADD_CLOSURE, 0, 0, 0);
-
-		/* generate code for closure 1
-		 * note we backup all compiler state and restore it afterwards, so one
-		 * closure choice doesn't influence the other*/
-		if(cl1in->link) {
-			StackBackup backup;
-			stack_backup(backup, done);
-
-			generate_closure(cl1in->link->parent, done);
-			add_node(NODE_END, 0, 0, 0);
-
-			stack_restore(backup, done);
 		}
-		else
-			add_node(NODE_END, 0, 0, 0);
-
-		/* generate code for closure 2 */
-		int cl2_offset = svm_nodes.size();
-
-		if(cl2in->link) {
-			StackBackup backup;
-			stack_backup(backup, done);
-
-			generate_closure(cl2in->link->parent, done);
-			add_node(NODE_END, 0, 0, 0);
-
-			stack_restore(backup, done);
-		}
-		else
-			add_node(NODE_END, 0, 0, 0);
+	}
 
-		/* set jump for mix node, -1 because offset is already
-		 * incremented when this jump is added to it */
-		svm_nodes[mix_offset].z = cl2_offset - mix_offset - 1;
+	/* closure mix weight */
+	const char *weight_name = (current_type == SHADER_TYPE_VOLUME)? "VolumeMixWeight": "SurfaceMixWeight";
+	ShaderInput *weight_in = node->input(weight_name);
 
-		done.insert(node);
-		stack_clear_users(node, done);
-		stack_clear_temporary(node);
+	if(weight_in && (weight_in->link || weight_in->value.x != 1.0f)) {
+		stack_assign(weight_in);
+		mix_weight_offset = weight_in->stack_offset;
 	}
-	else {
-		/* execute dependencies for closure */
-		foreach(ShaderInput *in, node->inputs) {
-			if(!node_skip_input(node, in) && in->link) {
-				set<ShaderNode*> dependencies;
-				find_dependencies(dependencies, done, in);
-				generate_svm_nodes(dependencies, done);
-			}
-		}
+	else
+		mix_weight_offset = SVM_STACK_INVALID;
 
-		/* compile closure itself */
-		node->compile(*this);
-		stack_clear_users(node, done);
-		stack_clear_temporary(node);
+	/* compile closure itself */
+	generate_node(node, done);
 
+	mix_weight_offset = SVM_STACK_INVALID;
+
+	if(current_type == SHADER_TYPE_SURFACE) {
 		if(node->has_surface_emission())
 			current_shader->has_surface_emission = true;
 		if(node->has_surface_transparent())
@@ -498,18 +456,24 @@ void SVMCompiler::generate_closure(ShaderNode *node, set<ShaderNode*>& done)
 			if(node->has_bssrdf_bump())
 				current_shader->has_bssrdf_bump = true;
 		}
+	}
+}
 
-		/* end node is added outside of this */
+void SVMCompiler::generated_shared_closure_nodes(ShaderNode *node, set<ShaderNode*>& done, set<ShaderNode*>& closure_done, const set<ShaderNode*>& shared)
+{
+	if(shared.find(node) != shared.end()) {
+		generate_multi_closure(node, done, closure_done);
+	}
+	else {
+		foreach(ShaderInput *in, node->inputs) {
+			if(in->type == SHADER_SOCKET_CLOSURE && in->link)
+				generated_shared_closure_nodes(in->link->parent, done, closure_done, shared);
+		}
 	}
 }
 
 void SVMCompiler::generate_multi_closure(ShaderNode *node, set<ShaderNode*>& done, set<ShaderNode*>& closure_done)
 {
-	/* todo: the weak point here is that unlike the single closure sampling 
-	 * we will evaluate all nodes even if they are used as input for closures
-	 * that are unused. it's not clear what would be the best way to skip such
-	 * nodes at runtime, especially if they are tangled up  */
-	
 	/* only generate once */
 	if(closure_done.find(node) != closure_done.end())
 		return;
@@ -520,50 +484,81 @@ void SVMCompiler::generate_multi_closure(ShaderNode *node, set<ShaderNode*>& don
 		/* weighting is already taken care of in ShaderGraph::transform_multi_closure */
 		ShaderInput *cl1in = node->input("Closure1");
 		ShaderInput *cl2in = node->input("Closure2");
+		ShaderInput *facin = node->input("Fac");
 
-		if(cl1in->link)
-			generate_multi_closure(cl1in->link->parent, done, closure_done);
-		if(cl2in->link)
-			generate_multi_closure(cl2in->link->parent, done, closure_done);
-	}
-	else {
-		/* execute dependencies for closure */
-		foreach(ShaderInput *in, node->inputs) {
-			if(!node_skip_input(node, in) && in->link) {
-				set<ShaderNode*> dependencies;
-				find_dependencies(dependencies, done, in);
-				generate_svm_nodes(dependencies, done);
+		/* skip empty mix/add closure nodes */
+		if(!cl1in->link && !cl2in->link)
+			return;
+
+		if(facin && facin->link) {
+			/* mix closure: generate instructions to compute mix weight */
+			set<ShaderNode*> dependencies;
+			find_dependencies(dependencies, done, facin);
+			generate_svm_nodes(dependencies, done);
+
+			stack_assign(facin);
+
+			/* execute shared dependencies. this is needed to allow skipping
+			 * of zero weight closures and their dependencies later, so we
+			 * ensure that they only skip dependencies that are unique to them */
+			set<ShaderNode*> cl1deps, cl2deps, shareddeps;
+
+			find_dependencies(cl1deps, done, cl1in);
+			find_dependencies(cl2deps, done, cl2in);
+
+			set_intersection(cl1deps.begin(), cl1deps.end(),
+			                 cl2deps.begin(), cl2deps.end(),
+			                 std::inserter(shareddeps, shareddeps.begin()));
+			
+			if(!shareddeps.empty()) {
+				if(cl1in->link)
+					generated_shared_closure_nodes(cl1in->link->parent, done, closure_done, shareddeps);
+				if(cl2in->link)
+					generated_shared_closure_nodes(cl2in->link->parent, done, closure_done, shareddeps);
+
+				generate_svm_nodes(shareddeps, done);
 			}
-		}
 
-		/* closure mix weight */
-		const char *weight_name = (current_type == SHADER_TYPE_VOLUME)? "VolumeMixWeight": "SurfaceMixWeight";
-		ShaderInput *weight_in = node->input(weight_name);
+			/* generate instructions for input closure 1 */
+			if(cl1in->link) {
+				/* add instruction to skip closure 1 and its dependencies if the mix factor is one */
+				svm_nodes.push_back(make_int4(NODE_JUMP_IF_ONE, 0, facin->stack_offset, 0));
+				int node_jump_skip_index = svm_nodes.size() - 1;
 
-		if(weight_in && (weight_in->link || weight_in->value.x != 1.0f)) {
-			stack_assign(weight_in);
-			mix_weight_offset = weight_in->stack_offset;
-		}
-		else
-			mix_weight_offset = SVM_STACK_INVALID;
+				generate_multi_closure(cl1in->link->parent, done, closure_done);
 
-		/* compile closure itself */
-		node->compile(*this);
-		stack_clear_users(node, done);
-		stack_clear_temporary(node);
+				/* fill in jump instruction location to be after closure */
+				svm_nodes[node_jump_skip_index].y = svm_nodes.size() - node_jump_skip_index - 1;
+			}
 
-		mix_weight_offset = SVM_STACK_INVALID;
+			/* generate instructions for input closure 2 */
+			if(cl2in->link) {
+				/* add instruction to skip closure and its dependencies if mix weight is zero */
+				svm_nodes.push_back(make_int4(NODE_JUMP_IF_ZERO, 0, facin->stack_offset, 0));
+				int node_jump_skip_index = svm_nodes.size() - 1;
 
-		if(node->has_surface_emission())
-			current_shader->has_surface_emission = true;
-		if(node->has_surface_transparent())
-			current_shader->has_surface_transparent = true;
-		if(node->has_surface_bssrdf()) {
-			current_shader->has_surface_bssrdf = true;
-			if(node->has_bssrdf_bump())
-				current_shader->has_bssrdf_bump = true;
+				generate_multi_closure(cl2in->link->parent, done, closure_done);
+
+				/* fill in jump instruction location to be after closure */
+				svm_nodes[node_jump_skip_index].y = svm_nodes.size() - node_jump_skip_index - 1;
+			}
+
+			/* unassign */
+			facin->stack_offset = SVM_STACK_INVALID;
+		}
+		else {
+			/* execute closures and their dependencies, no runtime checks
+			 * to skip closures here because this was already optimized due to
+			 * fixed weight or add closure that always needs both */
+			if(cl1in->link)
+				generate_multi_closure(cl1in->link->parent, done, closure_done);
+			if(cl2in->link)
+				generate_multi_closure(cl2in->link->parent, done, closure_done);
 		}
 	}
+	else {
+		generate_closure_node(node, done);
+	}
 
 	done.insert(node);
 }
@@ -642,14 +637,8 @@ void SVMCompiler::compile_type(Shader *shader, ShaderGraph *graph, ShaderType ty
 			}
 
 			if(generate) {
-				set<ShaderNode*> done;
-
-				if(use_multi_closure) {
-					set<ShaderNode*> closure_done;
-					generate_multi_closure(clin->link->parent, done, closure_done);
-				}
-				else
-					generate_closure(clin->link->parent, done);
+				set<ShaderNode*> done, closure_done;
+				generate_multi_closure(clin->link->parent, done, closure_done);
 			}
 		}
 
@@ -676,9 +665,9 @@ void SVMCompiler::compile(Shader *shader, vector<int4>& global_svm_nodes, int in
 			shader->graph_bump = shader->graph->copy();
 
 	/* finalize */
-	shader->graph->finalize(false, false, use_multi_closure);
+	shader->graph->finalize(false, false);
 	if(shader->graph_bump)
-		shader->graph_bump->finalize(true, false, use_multi_closure);
+		shader->graph_bump->finalize(true, false);
 
 	current_shader = shader;
 
@@ -690,6 +679,7 @@ void SVMCompiler::compile(Shader *shader, vector<int4>& global_svm_nodes, int in
 	shader->has_converter_blackbody = false;
 	shader->has_volume = false;
 	shader->has_displacement = false;
+	shader->has_heterogeneous_volume = false;
 
 	/* generate surface shader */
 	compile_type(shader, shader->graph, SHADER_TYPE_SURFACE);
diff --git a/intern/cycles/render/svm.h b/intern/cycles/render/svm.h
index 3d84a67e173..45aa4d26926 100644
--- a/intern/cycles/render/svm.h
+++ b/intern/cycles/render/svm.h
@@ -52,8 +52,7 @@ public:
 
 class SVMCompiler {
 public:
-	SVMCompiler(ShaderManager *shader_manager, ImageManager *image_manager,
-		bool use_multi_closure_);
+	SVMCompiler(ShaderManager *shader_manager, ImageManager *image_manager);
 	void compile(Shader *shader, vector<int4>& svm_nodes, int index);
 
 	void stack_assign(ShaderOutput *output);
@@ -123,9 +122,13 @@ protected:
 	bool node_skip_input(ShaderNode *node, ShaderInput *input);
 
 	/* single closure */
-	void find_dependencies(set<ShaderNode*>& dependencies, const set<ShaderNode*>& done, ShaderInput *input);
+	void find_dependencies(set<ShaderNode*>& dependencies,
+		const set<ShaderNode*>& done, ShaderInput *input);
+	void generate_node(ShaderNode *node, set<ShaderNode*>& done);
+	void generate_closure_node(ShaderNode *node, set<ShaderNode*>& done);
+	void generated_shared_closure_nodes(ShaderNode *node, set<ShaderNode*>& done,
+		set<ShaderNode*>& closure_done, const set<ShaderNode*>& shared);
 	void generate_svm_nodes(const set<ShaderNode*>& nodes, set<ShaderNode*>& done);
-	void generate_closure(ShaderNode *node, set<ShaderNode*>& done);
 
 	/* multi closure */
 	void generate_multi_closure(ShaderNode *node, set<ShaderNode*>& done, set<ShaderNode*>& closure_done);
@@ -140,7 +143,6 @@ protected:
 	Stack active_stack;
 	int max_stack_use;
 	uint mix_weight_offset;
-	bool use_multi_closure;
 	bool compile_failed;
 };
 
diff --git a/intern/cycles/render/tables.cpp b/intern/cycles/render/tables.cpp
index be0d4afbe2c..a8d502c432d 100644
--- a/intern/cycles/render/tables.cpp
+++ b/intern/cycles/render/tables.cpp
@@ -39,7 +39,10 @@ void LookupTables::device_update(Device *device, DeviceScene *dscene)
 	if(!need_update)
 		return;
 
-	device->tex_alloc("__lookup_table", dscene->lookup_table);
+	device->tex_free(dscene->lookup_table);
+
+	if(lookup_tables.size() > 0)
+		device->tex_alloc("__lookup_table", dscene->lookup_table);
 
 	need_update = false;
 }
diff --git a/intern/cycles/subd/subd_split.cpp b/intern/cycles/subd/subd_split.cpp
index 417ecfffd49..6bbf4af3f85 100644
--- a/intern/cycles/subd/subd_split.cpp
+++ b/intern/cycles/subd/subd_split.cpp
@@ -94,7 +94,7 @@ void DiagSplit::partition_edge(Patch *patch, float2 *P, int *t0, int *t1, float2
 		*t1 = T(patch, *P, Pend);
 	}
 	else {
-		int I = floor(t*0.5f);
+		int I = (int)floor((float)t*0.5f);
 		*P = interp(Pstart, Pend, (t == 0)? 0: I/(float)t); /* XXX is t faces or verts */
 		*t0 = I;
 		*t1 = t - I;
diff --git a/intern/cycles/util/util_color.h b/intern/cycles/util/util_color.h
index 0cfa4049d3e..b72cc6bc873 100644
--- a/intern/cycles/util/util_color.h
+++ b/intern/cycles/util/util_color.h
@@ -61,22 +61,22 @@ ccl_device float3 rgb_to_hsv(float3 rgb)
 		h = 0.0f;
 	}
 
-	if(s == 0.0f) {
-		h = 0.0f;
-	}
-	else {
+	if(s != 0.0f) {
 		float3 cmax3 = make_float3(cmax, cmax, cmax);
 		c = (cmax3 - rgb)/cdelta;
 
-		if(rgb.x == cmax) h = c.z - c.y;
-		else if(rgb.y == cmax) h = 2.0f + c.x -  c.z;
-		else h = 4.0f + c.y - c.x;
+		if     (rgb.x == cmax) h =        c.z - c.y;
+		else if(rgb.y == cmax) h = 2.0f + c.x - c.z;
+		else                   h = 4.0f + c.y - c.x;
 
 		h /= 6.0f;
 
 		if(h < 0.0f)
 			h += 1.0f;
 	}
+	else {
+		h = 0.0f;
+	}
 
 	return make_float3(h, s, v);
 }
@@ -90,13 +90,10 @@ ccl_device float3 hsv_to_rgb(float3 hsv)
 	s = hsv.y;
 	v = hsv.z;
 
-	if(s == 0.0f) {
-		rgb = make_float3(v, v, v);
-	}
-	else {
+	if(s != 0.0f) {
 		if(h == 1.0f)
 			h = 0.0f;
-		
+
 		h *= 6.0f;
 		i = floorf(h);
 		f = h - i;
@@ -104,13 +101,16 @@ ccl_device float3 hsv_to_rgb(float3 hsv)
 		p = v*(1.0f-s);
 		q = v*(1.0f-(s*f));
 		t = v*(1.0f-(s*(1.0f-f)));
-		
-		if(i == 0.0f) rgb = make_float3(v, t, p);
+
+		if     (i == 0.0f) rgb = make_float3(v, t, p);
 		else if(i == 1.0f) rgb = make_float3(q, v, p);
 		else if(i == 2.0f) rgb = make_float3(p, v, t);
 		else if(i == 3.0f) rgb = make_float3(p, q, v);
 		else if(i == 4.0f) rgb = make_float3(t, p, v);
-		else rgb = make_float3(v, p, q);
+		else               rgb = make_float3(v, p, q);
+	}
+	else {
+		rgb = make_float3(v, v, v);
 	}
 
 	return rgb;
@@ -132,8 +132,8 @@ ccl_device float3 xyY_to_xyz(float x, float y, float Y)
 ccl_device float3 xyz_to_rgb(float x, float y, float z)
 {
 	return make_float3(3.240479f * x + -1.537150f * y + -0.498535f * z,
-					  -0.969256f * x +  1.875991f * y +  0.041556f * z,
-					   0.055648f * x + -0.204043f * y +  1.057311f * z);
+	                  -0.969256f * x +  1.875991f * y +  0.041556f * z,
+	                   0.055648f * x + -0.204043f * y +  1.057311f * z);
 }
 
 #ifndef __KERNEL_OPENCL__
diff --git a/intern/cycles/util/util_cuda.h b/intern/cycles/util/util_cuda.h
index deb2ff969d6..0c80303df9b 100644
--- a/intern/cycles/util/util_cuda.h
+++ b/intern/cycles/util/util_cuda.h
@@ -206,7 +206,8 @@ typedef enum CUjit_target_enum
 	CU_TARGET_COMPUTE_20,
 	CU_TARGET_COMPUTE_21,
 	CU_TARGET_COMPUTE_30,
-	CU_TARGET_COMPUTE_35
+	CU_TARGET_COMPUTE_35,
+	CU_TARGET_COMPUTE_50
 } CUjit_target;
 
 typedef enum CUjit_fallback_enum
diff --git a/intern/cycles/util/util_half.h b/intern/cycles/util/util_half.h
index 21192024f7f..da6fae79bb9 100644
--- a/intern/cycles/util/util_half.h
+++ b/intern/cycles/util/util_half.h
@@ -19,13 +19,17 @@
 
 #include "util_types.h"
 
+#ifdef __KERNEL_SSE2__
+#include "util_simd.h"
+#endif
+
 CCL_NAMESPACE_BEGIN
 
 /* Half Floats */
 
 #ifdef __KERNEL_OPENCL__
 
-#define float4_store_half(h, f, scale) vstore_half4(*(f) * (scale), 0, h);
+#define float4_store_half(h, f, scale) vstore_half4(f * (scale), 0, h);
 
 #else
 
@@ -34,24 +38,24 @@ struct half4 { half x, y, z, w; };
 
 #ifdef __KERNEL_CUDA__
 
-ccl_device_inline void float4_store_half(half *h, const float4 *f, float scale)
+ccl_device_inline void float4_store_half(half *h, float4 f, float scale)
 {
-	h[0] = __float2half_rn(f->x * scale);
-	h[1] = __float2half_rn(f->y * scale);
-	h[2] = __float2half_rn(f->z * scale);
-	h[3] = __float2half_rn(f->w * scale);
+	h[0] = __float2half_rn(f.x * scale);
+	h[1] = __float2half_rn(f.y * scale);
+	h[2] = __float2half_rn(f.z * scale);
+	h[3] = __float2half_rn(f.w * scale);
 }
 
 #else
 
-ccl_device_inline void float4_store_half(half *h, const float4 *f, float scale)
+ccl_device_inline void float4_store_half(half *h, float4 f, float scale)
 {
 #ifndef __KERNEL_SSE2__
 	for(int i = 0; i < 4; i++) {
 		/* optimized float to half for pixels:
 		 * assumes no negative, no nan, no inf, and sets denormal to 0 */
 		union { uint i; float f; } in;
-		float fscale = (*f)[i] * scale;
+		float fscale = f[i] * scale;
 		in.f = (fscale > 0.0f)? ((fscale < 65500.0f)? fscale: 65500.0f): 0.0f;
 		int x = in.i;
 
@@ -70,7 +74,7 @@ ccl_device_inline void float4_store_half(half *h, const float4 *f, float scale)
 	const __m128i mm_7FFFFFFF = _mm_set1_epi32(0x7FFFFFFF);
 	const __m128i mm_C8000000 = _mm_set1_epi32(0xC8000000);
 
-	__m128 mm_fscale = _mm_mul_ps(*(__m128*)f, mm_scale);
+	__m128 mm_fscale = _mm_mul_ps(load_m128(f), mm_scale);
 	__m128i x = _mm_castps_si128(_mm_min_ps(_mm_max_ps(mm_fscale, _mm_set_ps1(0.0f)), _mm_set_ps1(65500.0f)));
 	__m128i absolute = _mm_and_si128(x, mm_7FFFFFFF);
 	__m128i Z = _mm_add_epi32(absolute, mm_C8000000);
diff --git a/intern/cycles/util/util_hash.h b/intern/cycles/util/util_hash.h
index ded25c92b90..edd2448efa4 100644
--- a/intern/cycles/util/util_hash.h
+++ b/intern/cycles/util/util_hash.h
@@ -23,7 +23,7 @@ CCL_NAMESPACE_BEGIN
 
 static inline uint hash_int_2d(uint kx, uint ky)
 {
-	#define rot(x,k) (((x)<<(k)) | ((x)>>(32-(k))))
+#define rot(x,k) (((x)<<(k)) | ((x)>>(32-(k))))
 
 	uint a, b, c;
 
@@ -41,7 +41,7 @@ static inline uint hash_int_2d(uint kx, uint ky)
 
 	return c;
 
-	#undef rot
+#undef rot
 }
 
 static inline uint hash_int(uint k)
diff --git a/intern/cycles/util/util_math.h b/intern/cycles/util/util_math.h
index 2e73639d2bb..ded75762cd2 100644
--- a/intern/cycles/util/util_math.h
+++ b/intern/cycles/util/util_math.h
@@ -163,11 +163,7 @@ ccl_device_inline float clamp(float a, float mn, float mx)
 
 ccl_device_inline int float_to_int(float f)
 {
-#if defined(__KERNEL_SSE2__) && !defined(_MSC_VER)
-	return _mm_cvtt_ss2si(_mm_load_ss(&f));
-#else
 	return (int)f;
-#endif
 }
 
 ccl_device_inline int floor_to_int(float f)
@@ -469,6 +465,15 @@ ccl_device_inline float dot(const float3 a, const float3 b)
 #endif
 }
 
+ccl_device_inline float dot(const float4 a, const float4 b)
+{
+#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
+	return _mm_cvtss_f32(_mm_dp_ps(a, b, 0xFF));
+#else	
+	return (a.x*b.x + a.y*b.y) + (a.z*b.z + a.w*b.w);
+#endif
+}
+
 ccl_device_inline float3 cross(const float3 a, const float3 b)
 {
 	float3 r = make_float3(a.y*b.z - a.z*b.y, a.z*b.x - a.x*b.z, a.x*b.y - a.y*b.x);
@@ -493,6 +498,11 @@ ccl_device_inline float len_squared(const float3 a)
 
 #ifndef __KERNEL_OPENCL__
 
+ccl_device_inline float len_squared(const float4 a)
+{
+	return dot(a, a);
+}
+
 ccl_device_inline float3 normalize(const float3 a)
 {
 #if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
@@ -812,11 +822,6 @@ ccl_device_inline float average(const float4& a)
 	return reduce_add(a) * 0.25f;
 }
 
-ccl_device_inline float dot(const float4& a, const float4& b)
-{
-	return reduce_add(a * b);
-}
-
 ccl_device_inline float len(const float4 a)
 {
 	return sqrtf(dot(a, a));
@@ -1113,6 +1118,17 @@ ccl_device_inline void make_orthonormals(const float3 N, float3 *a, float3 *b)
 
 /* Color division */
 
+ccl_device_inline float3 safe_invert_color(float3 a)
+{
+	float x, y, z;
+
+	x = (a.x != 0.0f)? 1.0f/a.x: 0.0f;
+	y = (a.y != 0.0f)? 1.0f/a.y: 0.0f;
+	z = (a.z != 0.0f)? 1.0f/a.z: 0.0f;
+
+	return make_float3(x, y, z);
+}
+
 ccl_device_inline float3 safe_divide_color(float3 a, float3 b)
 {
 	float x, y, z;
@@ -1221,7 +1237,7 @@ ccl_device float compatible_powf(float x, float y)
 
 ccl_device float safe_powf(float a, float b)
 {
-	if(a < 0.0f && b != float_to_int(b))
+	if(UNLIKELY(a < 0.0f && b != float_to_int(b)))
 		return 0.0f;
 
 	return compatible_powf(a, b);
@@ -1229,7 +1245,7 @@ ccl_device float safe_powf(float a, float b)
 
 ccl_device float safe_logf(float a, float b)
 {
-	if(a < 0.0f || b < 0.0f)
+	if(UNLIKELY(a < 0.0f || b < 0.0f))
 		return 0.0f;
 
 	return logf(a)/logf(b);
@@ -1289,7 +1305,7 @@ ccl_device bool ray_aligned_disk_intersect(
 	float3 disk_N = normalize_len(ray_P - disk_P, &disk_t);
 	float div = dot(ray_D, disk_N);
 
-	if(div == 0.0f)
+	if(UNLIKELY(div == 0.0f))
 		return false;
 
 	/* compute t to intersection point */
@@ -1319,7 +1335,7 @@ ccl_device bool ray_triangle_intersect(
 	float3 s1 = cross(ray_D, e2);
 
 	const float divisor = dot(s1, e1);
-	if(divisor == 0.0f)
+	if(UNLIKELY(divisor == 0.0f))
 		return false;
 
 	const float invdivisor = 1.0f/divisor;
@@ -1351,6 +1367,50 @@ ccl_device bool ray_triangle_intersect(
 	return true;
 }
 
+ccl_device bool ray_triangle_intersect_uv(
+	float3 ray_P, float3 ray_D, float ray_t,
+	float3 v0, float3 v1, float3 v2,
+	float *isect_u, float *isect_v, float *isect_t)
+{
+	/* Calculate intersection */
+	float3 e1 = v1 - v0;
+	float3 e2 = v2 - v0;
+	float3 s1 = cross(ray_D, e2);
+
+	const float divisor = dot(s1, e1);
+	if(UNLIKELY(divisor == 0.0f))
+		return false;
+
+	const float invdivisor = 1.0f/divisor;
+
+	/* compute first barycentric coordinate */
+	const float3 d = ray_P - v0;
+	const float u = dot(d, s1)*invdivisor;
+	if(u < 0.0f)
+		return false;
+
+	/* Compute second barycentric coordinate */
+	const float3 s2 = cross(d, e1);
+	const float v = dot(ray_D, s2)*invdivisor;
+	if(v < 0.0f)
+		return false;
+
+	const float b0 = 1.0f - u - v;
+	if(b0 < 0.0f)
+		return false;
+
+	/* compute t to intersection point */
+	const float t = dot(e2, s2)*invdivisor;
+	if(t < 0.0f || t > ray_t)
+		return false;
+
+	*isect_u = u;
+	*isect_v = v;
+	*isect_t = t;
+
+	return true;
+}
+
 ccl_device bool ray_quad_intersect(
 	float3 ray_P, float3 ray_D, float ray_t,
 	float3 quad_P, float3 quad_u, float3 quad_v,
diff --git a/intern/cycles/util/util_md5.cpp b/intern/cycles/util/util_md5.cpp
index c53fbd90c67..add0d18c742 100644
--- a/intern/cycles/util/util_md5.cpp
+++ b/intern/cycles/util/util_md5.cpp
@@ -367,7 +367,7 @@ string MD5Hash::get_hex()
 	finish(digest);
 
 	for(int i = 0; i < 16; i++)
-		sprintf(buf + i*2, "%02X", digest[i]);
+		sprintf(buf + i*2, "%02X", (unsigned int)digest[i]);
 	buf[sizeof(buf)-1] = '\0';
 	
 	return string(buf);
diff --git a/intern/cycles/util/util_opencl.h b/intern/cycles/util/util_opencl.h
index 5f3f1667bcc..141c5e38273 100644
--- a/intern/cycles/util/util_opencl.h
+++ b/intern/cycles/util/util_opencl.h
@@ -304,7 +304,9 @@ typedef struct _cl_kernel *         cl_kernel;
 typedef struct _cl_event *          cl_event;
 typedef struct _cl_sampler *        cl_sampler;
 
-typedef cl_uint             cl_bool;                     /* WARNING!  Unlike cl_ types in cl_platform.h, cl_bool is not guaranteed to be the same size as the bool in kernels. */ 
+/* WARNING!  Unlike cl_ types in cl_platform.h,
+ * cl_bool is not guaranteed to be the same size as the bool in kernels. */
+typedef cl_uint             cl_bool;
 typedef cl_ulong            cl_bitfield;
 typedef cl_bitfield         cl_device_type;
 typedef cl_uint             cl_platform_info;
diff --git a/intern/cycles/util/util_path.cpp b/intern/cycles/util/util_path.cpp
index 4fd5df4316d..85d19b6a325 100644
--- a/intern/cycles/util/util_path.cpp
+++ b/intern/cycles/util/util_path.cpp
@@ -111,6 +111,11 @@ string path_escape(const string& path)
 	return result;
 }
 
+bool path_is_relative(const string& path)
+{
+	return to_boost(path).is_relative();
+}
+
 bool path_exists(const string& path)
 {
 	return boost::filesystem::exists(to_boost(path));
diff --git a/intern/cycles/util/util_path.h b/intern/cycles/util/util_path.h
index e9041e63dae..fd9ea11740d 100644
--- a/intern/cycles/util/util_path.h
+++ b/intern/cycles/util/util_path.h
@@ -41,6 +41,7 @@ string path_filename(const string& path);
 string path_dirname(const string& path);
 string path_join(const string& dir, const string& file);
 string path_escape(const string& path);
+bool path_is_relative(const string& path);
 
 /* file info */
 bool path_exists(const string& path);
diff --git a/intern/cycles/util/util_simd.h b/intern/cycles/util/util_simd.h
index fd5ba1de37b..f0f37fa57aa 100644
--- a/intern/cycles/util/util_simd.h
+++ b/intern/cycles/util/util_simd.h
@@ -71,7 +71,7 @@ ccl_device_inline const __m128 shuffle_swap(const __m128& a, shuffle_swap_t shuf
 
 #ifdef __KERNEL_SSE41__
 ccl_device_inline void gen_idirsplat_swap(const __m128 &pn, const shuffle_swap_t &shuf_identity, const shuffle_swap_t &shuf_swap,
-										  const float3& idir, __m128 idirsplat[3], shuffle_swap_t shufflexyz[3])
+                                          const float3& idir, __m128 idirsplat[3], shuffle_swap_t shufflexyz[3])
 {
 	const __m128 idirsplat_raw[] = { _mm_set_ps1(idir.x), _mm_set_ps1(idir.y), _mm_set_ps1(idir.z) };
 	idirsplat[0] = _mm_xor_ps(idirsplat_raw[0], pn);
@@ -87,7 +87,7 @@ ccl_device_inline void gen_idirsplat_swap(const __m128 &pn, const shuffle_swap_t
 }
 #else
 ccl_device_inline void gen_idirsplat_swap(const __m128 &pn, const shuffle_swap_t &shuf_identity, const shuffle_swap_t &shuf_swap,
-										  const float3& idir, __m128 idirsplat[3], shuffle_swap_t shufflexyz[3])
+                                          const float3& idir, __m128 idirsplat[3], shuffle_swap_t shufflexyz[3])
 {
 	idirsplat[0] = _mm_xor_ps(_mm_set_ps1(idir.x), pn);
 	idirsplat[1] = _mm_xor_ps(_mm_set_ps1(idir.y), pn);
@@ -154,6 +154,12 @@ ccl_device_inline const __m128 fms(const __m128& a, const __m128& b, const __m12
 	return _mm_sub_ps(_mm_mul_ps(a, b), c);
 }
 
+/* calculate -a*b+c (replacement for fused negated multiply-add on SSE CPUs) */
+ccl_device_inline const __m128 fnma(const __m128& a, const __m128& b, const __m128& c)
+{
+	return _mm_sub_ps(c, _mm_mul_ps(a, b));
+}
+
 template<size_t N> ccl_device_inline const __m128 broadcast(const __m128& a)
 {
 	return _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(a), _MM_SHUFFLE(N, N, N, N)));
@@ -180,6 +186,88 @@ ccl_device_inline const __m128 set_sign_bit(const __m128 &a)
 	return _mm_xor_ps(a, _mm_castsi128_ps(_mm_setr_epi32(S1 << 31, S2 << 31, S3 << 31, S4 << 31)));
 }
 
+#ifdef __KERNEL_WITH_SSE_ALIGN__
+ccl_device_inline const __m128 load_m128(const float4 &vec)
+{
+	return _mm_load_ps(&vec.x);
+}
+
+ccl_device_inline const __m128 load_m128(const float3 &vec)
+{
+	return _mm_load_ps(&vec.x);
+}
+
+#else
+
+ccl_device_inline const __m128 load_m128(const float4 &vec)
+{
+	return _mm_loadu_ps(&vec.x);
+}
+
+ccl_device_inline const __m128 load_m128(const float3 &vec)
+{
+	return _mm_loadu_ps(&vec.x);
+}
+#endif /* __KERNEL_WITH_SSE_ALIGN__ */
+
+ccl_device_inline const __m128 dot3_splat(const __m128& a, const __m128& b)
+{
+#ifdef __KERNEL_SSE41__
+	return _mm_dp_ps(a, b, 0x7f);
+#else
+	__m128 t = _mm_mul_ps(a, b);
+	return _mm_set1_ps(((float*)&t)[0] + ((float*)&t)[1] + ((float*)&t)[2]);
+#endif
+}
+
+/* squared length taking only specified axes into account */
+template<size_t X, size_t Y, size_t Z, size_t W>
+ccl_device_inline float len_squared(const __m128& a)
+{
+#ifndef __KERNEL_SSE41__
+	float4& t = (float4 &)a;
+	return (X ? t.x * t.x : 0.0f) + (Y ? t.y * t.y : 0.0f) + (Z ? t.z * t.z : 0.0f) + (W ? t.w * t.w : 0.0f);
+#else
+	return _mm_cvtss_f32(_mm_dp_ps(a, a, (X << 4) | (Y << 5) | (Z << 6) | (W << 7) | 0xf));
+#endif
+}
+
+ccl_device_inline float dot3(const __m128& a, const __m128& b)
+{
+#ifdef __KERNEL_SSE41__
+	return _mm_cvtss_f32(_mm_dp_ps(a, b, 0x7f));
+#else
+	__m128 t = _mm_mul_ps(a, b);
+	return ((float*)&t)[0] + ((float*)&t)[1] + ((float*)&t)[2];
+#endif
+}
+
+ccl_device_inline const __m128 len3_squared_splat(const __m128& a)
+{
+	return dot3_splat(a, a);
+}
+
+ccl_device_inline float len3_squared(const __m128& a)
+{
+	return dot3(a, a);
+}
+
+ccl_device_inline float len3(const __m128& a)
+{
+	return _mm_cvtss_f32(_mm_sqrt_ss(dot3_splat(a, a)));
+}
+
+/* calculate shuffled cross product, useful when order of components does not matter */
+ccl_device_inline const __m128 cross_zxy(const __m128& a, const __m128& b)
+{
+	return fms(a, shuffle<1, 2, 0, 3>(b), _mm_mul_ps(b, shuffle<1, 2, 0, 3>(a)));
+}
+
+ccl_device_inline const __m128 cross(const __m128& a, const __m128& b)
+{
+	return shuffle<1, 2, 0, 3>(cross_zxy(a, b));
+}
+
 #endif /* __KERNEL_SSE2__ */
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/util/util_system.cpp b/intern/cycles/util/util_system.cpp
index 3d7781f6146..0764f7d9345 100644
--- a/intern/cycles/util/util_system.cpp
+++ b/intern/cycles/util/util_system.cpp
@@ -161,8 +161,25 @@ static CPUCapabilities& system_cpu_capabilities()
 			caps.sse41 = (result[2] & ((int)1 << 19)) != 0;
 			caps.sse42 = (result[2] & ((int)1 << 20)) != 0;
 
-			caps.avx = (result[2] & ((int)1 << 28)) != 0;
 			caps.fma3 = (result[2] & ((int)1 << 12)) != 0;
+			caps.avx = false;
+			bool os_uses_xsave_xrestore = (result[2] & ((int)1 << 27)) != 0;
+			bool cpu_avx_support = (result[2] & ((int)1 << 28)) != 0;
+
+			if( os_uses_xsave_xrestore && cpu_avx_support) {
+				// Check if the OS will save the YMM registers
+				uint32_t xcr_feature_mask;
+#if defined(__GNUC__)
+				int edx; /* not used */
+				/* actual opcode for xgetbv */
+				__asm__ (".byte 0x0f, 0x01, 0xd0" : "=a" (xcr_feature_mask) , "=d" (edx) : "c" (0) );
+#elif defined(_MSC_VER) && defined(_XCR_XFEATURE_ENABLED_MASK)
+				xcr_feature_mask = (uint32_t)_xgetbv(_XCR_XFEATURE_ENABLED_MASK);  /* min VS2010 SP1 compiler is required */
+#else
+				xcr_feature_mask = 0;
+#endif
+				caps.avx = (xcr_feature_mask & 0x6) == 0x6;
+			}
 		}
 
 #if 0
diff --git a/intern/cycles/util/util_transform.cpp b/intern/cycles/util/util_transform.cpp
index 12c2270a8d4..14613558501 100644
--- a/intern/cycles/util/util_transform.cpp
+++ b/intern/cycles/util/util_transform.cpp
@@ -75,7 +75,7 @@ static bool transform_matrix4_gj_inverse(float R[][4], float M[][4])
 			}
 		}
 
-		if(pivotsize == 0)
+		if(UNLIKELY(pivotsize == 0.0f))
 			return false;
 
 		if(pivot != i) {
@@ -106,7 +106,7 @@ static bool transform_matrix4_gj_inverse(float R[][4], float M[][4])
 	for(int i = 3; i >= 0; --i) {
 		float f;
 
-		if((f = M[i][i]) == 0)
+		if(UNLIKELY((f = M[i][i]) == 0.0f))
 			return false;
 
 		for(int j = 0; j < 4; j++) {
@@ -135,15 +135,16 @@ Transform transform_inverse(const Transform& tfm)
 	memcpy(R, &tfmR, sizeof(R));
 	memcpy(M, &tfm, sizeof(M));
 
-	if(!transform_matrix4_gj_inverse(R, M)) {
+	if(UNLIKELY(!transform_matrix4_gj_inverse(R, M))) {
 		/* matrix is degenerate (e.g. 0 scale on some axis), ideally we should
 		 * never be in this situation, but try to invert it anyway with tweak */
 		M[0][0] += 1e-8f;
 		M[1][1] += 1e-8f;
 		M[2][2] += 1e-8f;
 
-		if(!transform_matrix4_gj_inverse(R, M))
+		if(UNLIKELY(!transform_matrix4_gj_inverse(R, M))) {
 			return transform_identity();
+		}
 	}
 
 	memcpy(&tfmR, R, sizeof(R));
diff --git a/intern/cycles/util/util_transform.h b/intern/cycles/util/util_transform.h
index 4c7ce12d1de..5b3dbe42f69 100644
--- a/intern/cycles/util/util_transform.h
+++ b/intern/cycles/util/util_transform.h
@@ -108,9 +108,9 @@ ccl_device_inline Transform transform_transpose(const Transform a)
 }
 
 ccl_device_inline Transform make_transform(float a, float b, float c, float d,
-									float e, float f, float g, float h,
-									float i, float j, float k, float l,
-									float m, float n, float o, float p)
+                                           float e, float f, float g, float h,
+                                           float i, float j, float k, float l,
+                                           float m, float n, float o, float p)
 {
 	Transform t;
 
diff --git a/intern/cycles/util/util_types.h b/intern/cycles/util/util_types.h
index c770931c69b..bfaab3dba3b 100644
--- a/intern/cycles/util/util_types.h
+++ b/intern/cycles/util/util_types.h
@@ -37,6 +37,7 @@
 #define ccl_device_noinline static
 #define ccl_global
 #define ccl_constant
+#define __KERNEL_WITH_SSE_ALIGN__
 
 #if defined(_WIN32) && !defined(FREE_WINDOWS)
 
@@ -45,6 +46,7 @@
 #ifdef __KERNEL_64_BIT__
 #define ccl_try_align(...) __declspec(align(__VA_ARGS__))
 #else
+#undef __KERNEL_WITH_SSE_ALIGN__
 #define ccl_try_align(...) /* not support for function arguments (error C2719) */
 #endif
 #define ccl_may_alias
@@ -63,8 +65,6 @@
 
 #endif
 
-#else
-#define ccl_align(...)
 #endif
 
 /* Standard Integer Types */
@@ -159,8 +159,8 @@ struct int2 {
 	__forceinline int& operator[](int i) { return *(&x + i); }
 };
 
-#ifdef __KERNEL_SSE__
 struct ccl_try_align(16) int3 {
+#ifdef __KERNEL_SSE__
 	union {
 		__m128i m128;
 		struct { int x, y, z, w; };
@@ -171,7 +171,6 @@ struct ccl_try_align(16) int3 {
 	__forceinline operator const __m128i&(void) const { return m128; }
 	__forceinline operator __m128i&(void) { return m128; }
 #else
-struct ccl_try_align(16) int3 {
 	int x, y, z, w;
 #endif
 
@@ -179,8 +178,8 @@ struct ccl_try_align(16) int3 {
 	__forceinline int& operator[](int i) { return *(&x + i); }
 };
 
-#ifdef __KERNEL_SSE__
 struct ccl_try_align(16) int4 {
+#ifdef __KERNEL_SSE__
 	union {
 		__m128i m128;
 		struct { int x, y, z, w; };
@@ -191,7 +190,6 @@ struct ccl_try_align(16) int4 {
 	__forceinline operator const __m128i&(void) const { return m128; }
 	__forceinline operator __m128i&(void) { return m128; }
 #else
-struct ccl_try_align(16) int4 {
 	int x, y, z, w;
 #endif
 
@@ -227,8 +225,8 @@ struct float2 {
 	__forceinline float& operator[](int i) { return *(&x + i); }
 };
 
-#ifdef __KERNEL_SSE__
 struct ccl_try_align(16) float3 {
+#ifdef __KERNEL_SSE__
 	union {
 		__m128 m128;
 		struct { float x, y, z, w; };
@@ -239,7 +237,6 @@ struct ccl_try_align(16) float3 {
 	__forceinline operator const __m128&(void) const { return m128; }
 	__forceinline operator __m128&(void) { return m128; }
 #else
-struct ccl_try_align(16) float3 {
 	float x, y, z, w;
 #endif
 
@@ -247,8 +244,8 @@ struct ccl_try_align(16) float3 {
 	__forceinline float& operator[](int i) { return *(&x + i); }
 };
 
-#ifdef __KERNEL_SSE__
 struct ccl_try_align(16) float4 {
+#ifdef __KERNEL_SSE__
 	union {
 		__m128 m128;
 		struct { float x, y, z, w; };
@@ -259,7 +256,6 @@ struct ccl_try_align(16) float4 {
 	__forceinline operator const __m128&(void) const { return m128; }
 	__forceinline operator __m128&(void) { return m128; }
 #else
-struct ccl_try_align(16) float4 {
 	float x, y, z, w;
 #endif
 
@@ -450,6 +446,53 @@ ccl_device_inline int4 make_int4(const float3& f)
 
 #endif
 
+/* Interpolation types for textures
+ * CUDA also uses texture space to store other objects */
+enum InterpolationType {
+	INTERPOLATION_NONE = -1,
+	INTERPOLATION_LINEAR = 0,
+	INTERPOLATION_CLOSEST = 1,
+	INTERPOLATION_CUBIC = 2,
+	INTERPOLATION_SMART = 3,
+};
+
+
+/* macros */
+
+/* hints for branch prediction, only use in code that runs a _lot_ */
+#if defined(__GNUC__) && defined(__KERNEL_CPU__)
+#  define LIKELY(x)       __builtin_expect(!!(x), 1)
+#  define UNLIKELY(x)     __builtin_expect(!!(x), 0)
+#else
+#  define LIKELY(x)       (x)
+#  define UNLIKELY(x)     (x)
+#endif
+
+/* Causes warning:
+ * incompatible types when assigning to type 'Foo' from type 'Bar'
+ * ... the compiler optimizes away the temp var */
+#ifdef __GNUC__
+#define CHECK_TYPE(var, type)  {  \
+	__typeof(var) *__tmp;         \
+	__tmp = (type *)NULL;         \
+	(void)__tmp;                  \
+} (void)0
+
+#define CHECK_TYPE_PAIR(var_a, var_b)  {  \
+	__typeof(var_a) *__tmp;               \
+	__tmp = (__typeof(var_b) *)NULL;      \
+	(void)__tmp;                          \
+} (void)0
+#else
+#  define CHECK_TYPE(var, type)
+#  define CHECK_TYPE_PAIR(var_a, var_b)
+#endif
+
+/* can be used in simple macros */
+#define CHECK_TYPE_INLINE(val, type) \
+	((void)(((type)0) != (val)))
+
+
 CCL_NAMESPACE_END
 
 #endif /* __UTIL_TYPES_H__ */
diff --git a/intern/cycles/util/util_view.cpp b/intern/cycles/util/util_view.cpp
index 361a7bc95f2..6bf9c9ed8c0 100644
--- a/intern/cycles/util/util_view.cpp
+++ b/intern/cycles/util/util_view.cpp
@@ -80,8 +80,8 @@ void view_display_info(const char *info)
 
 void view_display_help()
 {
-	const int w = V.width / 1.15;
-	const int h = V.height / 1.15;
+	const int w = (int)((float)V.width  / 1.15f);
+	const int h = (int)((float)V.height / 1.15f);
 
 	const int x1 = (V.width - w) / 2;
 	const int x2 = x1 + w;
@@ -100,14 +100,16 @@ void view_display_help()
 	view_display_text(x1+20, y2-20, "Cycles Renderer");
 	view_display_text(x1+20, y2-40, "(C) 2011-2014 Blender Foundation");
 	view_display_text(x1+20, y2-80, "Controls:");
-	view_display_text(x1+20, y2-100, "h:  Show/Hide this help message");
-	view_display_text(x1+20, y2-120, "r:  Restart the render");
-	view_display_text(x1+20, y2-140, "q:  Quit the program");
-	view_display_text(x1+20, y2-160, "esc:  Cancel the render");
+	view_display_text(x1+20, y2-100, "h:  Info/Help");
+	view_display_text(x1+20, y2-120, "r:  Reset");
+	view_display_text(x1+20, y2-140, "p:  Pause");
+	view_display_text(x1+20, y2-160, "esc:  Cancel");
+	view_display_text(x1+20, y2-180, "q:  Quit program");
 
-	view_display_text(x1+20, y2-190, "Interactive Mode (i-key):");
-	view_display_text(x1+20, y2-210, "LMB:  Move camera");
-	view_display_text(x1+20, y2-230, "RMB:  Rotate camera");
+	view_display_text(x1+20, y2-210, "i:  Interactive mode");
+	view_display_text(x1+20, y2-230, "Left mouse:  Move camera");
+	view_display_text(x1+20, y2-250, "Right mouse:  Rotate camera");
+	view_display_text(x1+20, y2-270, "W/A/S/D:  Move camera");
 
 	glColor3f(1.0f, 1.0f, 1.0f);
 }
@@ -246,9 +248,7 @@ void view_main_loop(const char *title, int width, int height,
 	glutInitDisplayMode(GLUT_RGB|GLUT_DOUBLE|GLUT_DEPTH);
 	glutCreateWindow(title);
 
-#ifndef __APPLE__
 	glewInit();
-#endif
 
 	view_reshape(width, height);
 
diff --git a/intern/elbeem/intern/mvmcoords.cpp b/intern/elbeem/intern/mvmcoords.cpp
index 281a9656fcf..838fc54491d 100644
--- a/intern/elbeem/intern/mvmcoords.cpp
+++ b/intern/elbeem/intern/mvmcoords.cpp
@@ -18,7 +18,7 @@
 #include <algorithm>
 
 #if defined(_MSC_VER) && _MSC_VER > 1600
-// sdt::greater
+// std::greater
 #include <functional>
 #endif
 
diff --git a/intern/ffmpeg/ffmpeg_compat.h b/intern/ffmpeg/ffmpeg_compat.h
index ff2cc405f4c..ac4da5b6133 100644
--- a/intern/ffmpeg/ffmpeg_compat.h
+++ b/intern/ffmpeg/ffmpeg_compat.h
@@ -103,6 +103,7 @@ FFMPEG_INLINE
 int av_sample_fmt_is_planar(enum AVSampleFormat sample_fmt)
 {
 	/* no planar formats in FFmpeg < 0.9 */
+	(void) sample_fmt;
 	return 0;
 }
 
@@ -172,6 +173,7 @@ FFMPEG_INLINE
 int av_opt_set(void *obj, const char *name, const char *val, int search_flags)
 {
 	const AVOption *rv = NULL;
+	(void) search_flags;
 	av_set_string3(obj, name, val, 1, &rv);
 	return rv != NULL;
 }
@@ -180,6 +182,7 @@ FFMPEG_INLINE
 int av_opt_set_int(void *obj, const char *name, int64_t val, int search_flags)
 {
 	const AVOption *rv = NULL;
+	(void) search_flags;
 	rv = av_set_int(obj, name, val);
 	return rv != NULL;
 }
@@ -188,6 +191,7 @@ FFMPEG_INLINE
 int av_opt_set_double(void *obj, const char *name, double val, int search_flags)
 {
 	const AVOption *rv = NULL;
+	(void) search_flags;
 	rv = av_set_double(obj, name, val);
 	return rv != NULL;
 }
@@ -210,15 +214,12 @@ enum AVSampleFormat av_get_packed_sample_fmt(enum AVSampleFormat sample_fmt)
 }
 #endif
 
-#if ((LIBAVFORMAT_VERSION_MAJOR < 53) || ((LIBAVFORMAT_VERSION_MAJOR == 53) && (LIBAVFORMAT_VERSION_MINOR < 24)) || ((LIBAVFORMAT_VERSION_MAJOR == 53) && (LIBAVFORMAT_VERSION_MINOR < 24) && (LIBAVFORMAT_VERSION_MICRO < 2)))
-#  define avformat_close_input(x) av_close_input_file(*(x))
-#endif
-
 #if ((LIBAVCODEC_VERSION_MAJOR < 53) || (LIBAVCODEC_VERSION_MAJOR == 53 && LIBAVCODEC_VERSION_MINOR < 35))
 FFMPEG_INLINE
 int avcodec_open2(AVCodecContext *avctx, AVCodec *codec, AVDictionary **options)
 {
 	/* TODO: no options are taking into account */
+	(void) options;
 	return avcodec_open(avctx, codec);
 }
 #endif
@@ -228,6 +229,7 @@ FFMPEG_INLINE
 AVStream *avformat_new_stream(AVFormatContext *s, AVCodec *c)
 {
 	/* TODO: no codec is taking into account */
+	(void) c;
 	return av_new_stream(s, 0);
 }
 
@@ -235,6 +237,7 @@ FFMPEG_INLINE
 int avformat_find_stream_info(AVFormatContext *ic, AVDictionary **options)
 {
 	/* TODO: no options are taking into account */
+	(void) options;
 	return av_find_stream_info(ic);
 }
 #endif
@@ -435,4 +438,12 @@ AVRational av_get_r_frame_rate_compat(const AVStream *stream)
 #endif
 }
 
+#if LIBAVUTIL_VERSION_INT < AV_VERSION_INT(51, 32, 0)
+#  define AV_OPT_SEARCH_FAKE_OBJ 0
+#endif
+
+#if LIBAVCODEC_VERSION_INT < AV_VERSION_INT(54, 59, 100)
+#  define FFMPEG_HAVE_DEPRECATED_FLAGS2
+#endif
+
 #endif
diff --git a/intern/ghost/intern/GHOST_NDOFManager.cpp b/intern/ghost/intern/GHOST_NDOFManager.cpp
index f8c707b668c..c99680641c3 100644
--- a/intern/ghost/intern/GHOST_NDOFManager.cpp
+++ b/intern/ghost/intern/GHOST_NDOFManager.cpp
@@ -295,14 +295,14 @@ bool GHOST_NDOFManager::setDevice(unsigned short vendor_id, unsigned short produ
 	return m_deviceType != NDOF_UnknownDevice;
 }
 
-void GHOST_NDOFManager::updateTranslation(short t[3], GHOST_TUns64 time)
+void GHOST_NDOFManager::updateTranslation(const short t[3], GHOST_TUns64 time)
 {
 	memcpy(m_translation, t, sizeof(m_translation));
 	m_motionTime = time;
 	m_motionEventPending = true;
 }
 
-void GHOST_NDOFManager::updateRotation(short r[3], GHOST_TUns64 time)
+void GHOST_NDOFManager::updateRotation(const short r[3], GHOST_TUns64 time)
 {
 	memcpy(m_rotation, r, sizeof(m_rotation));
 	m_motionTime = time;
@@ -506,7 +506,5 @@ bool GHOST_NDOFManager::sendMotionEvent()
 
 	m_system.pushEvent(event);
 
-	m_prevMotionTime = m_motionTime;
-
 	return true;
 }
diff --git a/intern/ghost/intern/GHOST_NDOFManager.h b/intern/ghost/intern/GHOST_NDOFManager.h
index 50f784d89c4..98aebfa4f30 100644
--- a/intern/ghost/intern/GHOST_NDOFManager.h
+++ b/intern/ghost/intern/GHOST_NDOFManager.h
@@ -128,8 +128,8 @@ public:
 	//       rotations are + when CCW, - when CW
 	// each platform is responsible for getting axis data into this form
 	// these values should not be scaled (just shuffled or flipped)
-	void updateTranslation(short t[3], GHOST_TUns64 time);
-	void updateRotation(short r[3], GHOST_TUns64 time);
+	void updateTranslation(const short t[3], GHOST_TUns64 time);
+	void updateRotation(const short r[3], GHOST_TUns64 time);
 
 	// the latest raw button data from the device
 	// use HID button encoding (not NDOF_ButtonT)
diff --git a/intern/ghost/intern/GHOST_NDOFManagerCocoa.mm b/intern/ghost/intern/GHOST_NDOFManagerCocoa.mm
index 4fc4f8016e5..1a029257f09 100644
--- a/intern/ghost/intern/GHOST_NDOFManagerCocoa.mm
+++ b/intern/ghost/intern/GHOST_NDOFManagerCocoa.mm
@@ -79,8 +79,8 @@ static void NDOF_DeviceEvent(io_connect_t connection, natural_t messageType, voi
 				case kConnexionCmdHandleAxis:
 				{
 					// convert to blender view coordinates
-					short t[3] = {s->axis[0], -(s->axis[2]), s->axis[1]};
-					short r[3] = {-(s->axis[3]), s->axis[5], -(s->axis[4])};
+					const short t[3] = {s->axis[0], -(s->axis[2]), s->axis[1]};
+					const short r[3] = {-(s->axis[3]), s->axis[5], -(s->axis[4])};
 
 					ndof_manager->updateTranslation(t, now);
 					ndof_manager->updateRotation(r, now);
@@ -162,7 +162,7 @@ GHOST_NDOFManagerCocoa::~GHOST_NDOFManagerCocoa()
 	if (GHOST_NDOFManager3Dconnexion_available())
 	{
 		GHOST_NDOFManager3Dconnexion_UnregisterConnexionClient(m_clientID);
-        GHOST_NDOFManager3Dconnexion_UnregisterConnexionClient(m_clientID);
+		GHOST_NDOFManager3Dconnexion_UnregisterConnexionClient(m_clientID);
 
 		GHOST_NDOFManager3Dconnexion_CleanupConnexionHandlers();
 		ghost_system = NULL;
diff --git a/intern/ghost/intern/GHOST_NDOFManagerX11.cpp b/intern/ghost/intern/GHOST_NDOFManagerX11.cpp
index 947d8d74461..77e09e7ef49 100644
--- a/intern/ghost/intern/GHOST_NDOFManagerX11.cpp
+++ b/intern/ghost/intern/GHOST_NDOFManagerX11.cpp
@@ -77,23 +77,46 @@ bool GHOST_NDOFManagerX11::available()
 	return m_available;
 }
 
+/*
+ * Workaround for a problem where we don't enter the 'GHOST_kFinished' state;
+ * this causes any subsequent event to have a very high 'dt' (time delta),
+ * many seconds for example, causing the view to jump.
+ *
+ * This workaround expects continuous events: if we miss a motion event,
+ * immediately send a dummy event with no motion to ensure the finished state is reached.
+ */
+#define USE_FINISH_GLITCH_WORKAROUND
+
+
+#ifdef USE_FINISH_GLITCH_WORKAROUND
+static bool motion_test_prev = false;
+#endif
+
 bool GHOST_NDOFManagerX11::processEvents()
 {
 	bool anyProcessed = false;
 
 	if (m_available) {
 		spnav_event e;
+
+#ifdef USE_FINISH_GLITCH_WORKAROUND
+		bool motion_test = false;
+#endif
+
 		while (spnav_poll_event(&e)) {
 			switch (e.type) {
 				case SPNAV_EVENT_MOTION:
 				{
 					/* convert to blender view coords */
 					GHOST_TUns64 now = m_system.getMilliSeconds();
-					short t[3] = {(short)e.motion.x, (short)e.motion.y, (short)-e.motion.z};
-					short r[3] = {(short)-e.motion.rx, (short)-e.motion.ry, (short)e.motion.rz};
+					const short t[3] = {(short)e.motion.x, (short)e.motion.y, (short)-e.motion.z};
+					const short r[3] = {(short)-e.motion.rx, (short)-e.motion.ry, (short)e.motion.rz};
 
 					updateTranslation(t, now);
 					updateRotation(r, now);
+#ifdef USE_FINISH_GLITCH_WORKAROUND
+					motion_test = true;
+#endif
 					break;
 				}
 				case SPNAV_EVENT_BUTTON:
@@ -103,6 +126,20 @@ bool GHOST_NDOFManagerX11::processEvents()
 			}
 			anyProcessed = true;
 		}
+
+#ifdef USE_FINISH_GLITCH_WORKAROUND
+		if (motion_test_prev == true && motion_test == false) {
+			GHOST_TUns64 now = m_system.getMilliSeconds();
+			const short v[3] = {0, 0, 0};
+
+			updateTranslation(v, now);
+			updateRotation(v, now);
+
+			anyProcessed = true;
+		}
+		motion_test_prev = motion_test;
+#endif
+
 	}
 
 	return anyProcessed;
diff --git a/intern/ghost/intern/GHOST_SystemWin32.cpp b/intern/ghost/intern/GHOST_SystemWin32.cpp
index 8280474437b..070dd86c0fb 100644
--- a/intern/ghost/intern/GHOST_SystemWin32.cpp
+++ b/intern/ghost/intern/GHOST_SystemWin32.cpp
@@ -843,14 +843,14 @@ bool GHOST_SystemWin32::processNDOF(RAWINPUT const& raw)
 	{
 		case 1: // translation
 		{
-			short *axis = (short *)(data + 1);
+			const short *axis = (short *)(data + 1);
 			// massage into blender view coords (same goes for rotation)
-			short t[3] = {axis[0], -axis[2], axis[1]};
+			const short t[3] = {axis[0], -axis[2], axis[1]};
 			m_ndofManager->updateTranslation(t, now);
 
 			if (raw.data.hid.dwSizeHid == 13)
 			{ // this report also includes rotation
-				short r[3] = {-axis[3], axis[5], -axis[4]};
+				const short r[3] = {-axis[3], axis[5], -axis[4]};
 				m_ndofManager->updateRotation(r, now);
 
 				// I've never gotten one of these, has anyone else?
@@ -860,8 +860,8 @@ bool GHOST_SystemWin32::processNDOF(RAWINPUT const& raw)
 		}
 		case 2: // rotation
 		{
-			short *axis = (short *)(data + 1);
-			short r[3] = {-axis[0], axis[2], -axis[1]};
+			const short *axis = (short *)(data + 1);
+			const short r[3] = {-axis[0], axis[2], -axis[1]};
 			m_ndofManager->updateRotation(r, now);
 			break;
 		}
diff --git a/intern/ghost/intern/GHOST_SystemX11.cpp b/intern/ghost/intern/GHOST_SystemX11.cpp
index 9900f7e153f..8f1f9867724 100644
--- a/intern/ghost/intern/GHOST_SystemX11.cpp
+++ b/intern/ghost/intern/GHOST_SystemX11.cpp
@@ -755,7 +755,7 @@ GHOST_SystemX11::processEvent(XEvent *xe)
 		case KeyRelease:
 		{
 			XKeyEvent *xke = &(xe->xkey);
-			KeySym key_sym = XLookupKeysym(xke, 0);
+			KeySym key_sym;
 			char ascii;
 #if defined(WITH_X11_XINPUT) && defined(X_HAVE_UTF8_STRING)
 			/* utf8_array[] is initial buffer used for Xutf8LookupString().
@@ -771,7 +771,29 @@ GHOST_SystemX11::processEvent(XEvent *xe)
 			char *utf8_buf = NULL;
 #endif
 			
-			GHOST_TKey gkey = convertXKey(key_sym);
+			GHOST_TKey gkey;
+
+			/* On keyboards like latin ones,
+			 * numbers need 'Shift' to be accessed but key_sym
+			 * is unmodified (the same goes for anyone swapping the keys with xmodmap).
+			 *
+			 * Here we look at the 'Shifted' version of the key.
+			 * If it is a number, then we take it instead of the normal key.
+			 *
+			 * The modified key is sent in the 'ascii' variable anyway.
+			 */
+			if ((xke->keycode >= 10 && xke->keycode < 20) &&
+			    ((key_sym = XLookupKeysym(xke, ShiftMask)) >= XK_0) && (key_sym <= XK_9))
+			{
+				/* pass (keep shift'ed key_sym) */
+			}
+			else {
+				/* regular case */
+				key_sym = XLookupKeysym(xke, 0);
+			}
+
+			gkey = convertXKey(key_sym);
+
 			GHOST_TEventType type = (xke->type == KeyPress) ? 
 			                        GHOST_kEventKeyDown : GHOST_kEventKeyUp;
 			
diff --git a/intern/ghost/intern/GHOST_WindowX11.cpp b/intern/ghost/intern/GHOST_WindowX11.cpp
index 4e3fcd4da3f..56e225e94a2 100644
--- a/intern/ghost/intern/GHOST_WindowX11.cpp
+++ b/intern/ghost/intern/GHOST_WindowX11.cpp
@@ -186,7 +186,8 @@ GHOST_WindowX11(
 	m_valid_setup(false),
 	m_invalid_window(false),
 	m_empty_cursor(None),
-	m_custom_cursor(None)
+	m_custom_cursor(None),
+	m_visible_cursor(None)
 {
 	
 	/* Set up the minimum atrributes that we require and see if
@@ -1454,7 +1455,10 @@ setWindowCursorVisibility(
 	Cursor xcursor;
 	
 	if (visible) {
-		xcursor = getStandardCursor(getCursorShape() );
+		if (m_visible_cursor)
+			xcursor = m_visible_cursor;
+		else
+			xcursor = getStandardCursor(getCursorShape() );
 	}
 	else {
 		xcursor = getEmptyCursor();
@@ -1517,6 +1521,8 @@ setWindowCursorShape(
 		GHOST_TStandardCursor shape)
 {
 	Cursor xcursor = getStandardCursor(shape);
+
+	m_visible_cursor = xcursor;
 	
 	XDefineCursor(m_display, m_window, xcursor);
 	XFlush(m_display);
@@ -1566,6 +1572,8 @@ setWindowCustomCursorShape(
 	m_custom_cursor = XCreatePixmapCursor(m_display, bitmap_pix, mask_pix, &fg, &bg, hotX, hotY);
 	XDefineCursor(m_display, m_window, m_custom_cursor);
 	XFlush(m_display);
+
+	m_visible_cursor = m_custom_cursor;
 	
 	XFreePixmap(m_display, bitmap_pix);
 	XFreePixmap(m_display, mask_pix);
diff --git a/intern/ghost/intern/GHOST_WindowX11.h b/intern/ghost/intern/GHOST_WindowX11.h
index ff7b7409627..93ee9edda0e 100644
--- a/intern/ghost/intern/GHOST_WindowX11.h
+++ b/intern/ghost/intern/GHOST_WindowX11.h
@@ -391,6 +391,9 @@ private:
 	
 	/** XCursor structure of the custom cursor */
 	Cursor m_custom_cursor;
+
+	/** XCursor to show when cursor is visible */
+	Cursor m_visible_cursor;
 	
 	/** Cache of XC_* ID's to XCursor structures */
 	std::map<unsigned int, Cursor> m_standard_cursors;
diff --git a/intern/ghost/test/multitest/MultiTest.c b/intern/ghost/test/multitest/MultiTest.c
index 8fb46ffc385..9a192c17180 100644
--- a/intern/ghost/test/multitest/MultiTest.c
+++ b/intern/ghost/test/multitest/MultiTest.c
@@ -74,7 +74,7 @@ void multitestapp_exit(MultiTestApp *app);
 
 /**/
 
-void rect_bevel_side(int rect[2][2], int side, float *lt, float *dk, float *col, int width)
+void rect_bevel_side(int rect[2][2], int side, float *lt, float *dk, const float col[3], int width)
 {
 	int ltidx = (side / 2) % 4;
 	int dkidx = (ltidx + 1 + (side & 1)) % 4;
diff --git a/intern/guardedalloc/intern/mallocn.c b/intern/guardedalloc/intern/mallocn.c
index 2ac01a6c7e4..e85fba7a6d0 100644
--- a/intern/guardedalloc/intern/mallocn.c
+++ b/intern/guardedalloc/intern/mallocn.c
@@ -15,11 +15,6 @@
  * along with this program; if not, write to the Free Software Foundation,
  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  *
- * The Original Code is Copyright (C) 2001-2002 by NaN Holding BV.
- * All rights reserved.
- *
- * The Original Code is: all of this file.
- *
  * Contributor(s): Brecht Van Lommel
  *                 Campbell Barton
  *
@@ -43,7 +38,7 @@ size_t (*MEM_allocN_len)(const void *vmemh) = MEM_lockfree_allocN_len;
 void (*MEM_freeN)(void *vmemh) = MEM_lockfree_freeN;
 void *(*MEM_dupallocN)(const void *vmemh) = MEM_lockfree_dupallocN;
 void *(*MEM_reallocN_id)(void *vmemh, size_t len, const char *str) = MEM_lockfree_reallocN_id;
-void *(*MEM_recallocN_id)(void *vmemh, size_t len, const char *str) = MEM_lockfree_recallocN_id;;
+void *(*MEM_recallocN_id)(void *vmemh, size_t len, const char *str) = MEM_lockfree_recallocN_id;
 void *(*MEM_callocN)(size_t len, const char *str) = MEM_lockfree_callocN;
 void *(*MEM_mallocN)(size_t len, const char *str) = MEM_lockfree_mallocN;
 void *(*MEM_mapallocN)(size_t len, const char *str) = MEM_lockfree_mapallocN;
@@ -71,7 +66,7 @@ void MEM_use_guarded_allocator(void)
 	MEM_freeN = MEM_guarded_freeN;
 	MEM_dupallocN = MEM_guarded_dupallocN;
 	MEM_reallocN_id = MEM_guarded_reallocN_id;
-	MEM_recallocN_id = MEM_guarded_recallocN_id;;
+	MEM_recallocN_id = MEM_guarded_recallocN_id;
 	MEM_callocN = MEM_guarded_callocN;
 	MEM_mallocN = MEM_guarded_mallocN;
 	MEM_mapallocN = MEM_guarded_mapallocN;
diff --git a/intern/guardedalloc/intern/mallocn_guarded_impl.c b/intern/guardedalloc/intern/mallocn_guarded_impl.c
index 352d18df732..172c79d50cd 100644
--- a/intern/guardedalloc/intern/mallocn_guarded_impl.c
+++ b/intern/guardedalloc/intern/mallocn_guarded_impl.c
@@ -497,9 +497,9 @@ void *MEM_guarded_mallocN(size_t len, const char *str)
 	
 	memh = (MemHead *)malloc(len + sizeof(MemHead) + sizeof(MemTail));
 
-	if (memh) {
+	if (LIKELY(memh)) {
 		make_memhead_header(memh, len, str);
-		if (malloc_debug_memset && len)
+		if (UNLIKELY(malloc_debug_memset && len))
 			memset(memh + 1, 255, len);
 
 #ifdef DEBUG_MEMCOUNTER
@@ -544,7 +544,7 @@ void *MEM_guarded_mapallocN(size_t len, const char *str)
 	/* on 64 bit, simply use calloc instead, as mmap does not support
 	 * allocating > 4 GB on Windows. the only reason mapalloc exists
 	 * is to get around address space limitations in 32 bit OSes. */
-	if(sizeof(void*) >= 8)
+	if (sizeof(void *) >= 8)
 		return MEM_guarded_callocN(len, str);
 
 	len = SIZET_ALIGN_4(len);
@@ -735,7 +735,7 @@ static void MEM_guarded_printmemlist_internal(int pydict)
 			            membl->_count);
 #else
 			print_error("%s len: " SIZET_FORMAT " %p\n",
-			            membl->name, SIZET_ARG(membl->len), membl + 1);
+			            membl->name, SIZET_ARG(membl->len), (void *)(membl + 1));
 #endif
 #ifdef DEBUG_BACKTRACE
 			print_memhead_backtrace(membl);
@@ -951,7 +951,7 @@ static void rem_memblock(MemHead *memh)
 #endif
 	}
 	else {
-		if (malloc_debug_memset && memh->len)
+		if (UNLIKELY(malloc_debug_memset && memh->len))
 			memset(memh + 1, 255, memh->len);
 		free(memh);
 	}
diff --git a/intern/guardedalloc/intern/mallocn_intern.h b/intern/guardedalloc/intern/mallocn_intern.h
index db45b59b884..b0fd52d2766 100644
--- a/intern/guardedalloc/intern/mallocn_intern.h
+++ b/intern/guardedalloc/intern/mallocn_intern.h
@@ -77,6 +77,14 @@
 
 #define SIZET_ALIGN_4(len) ((len + 3) & ~(size_t)3)
 
+#ifdef __GNUC__
+#  define LIKELY(x)       __builtin_expect(!!(x), 1)
+#  define UNLIKELY(x)     __builtin_expect(!!(x), 0)
+#else
+#  define LIKELY(x)       (x)
+#  define UNLIKELY(x)     (x)
+#endif
+
 /* Prototypes for counted allocator functions */
 size_t MEM_lockfree_allocN_len(const void *vmemh) ATTR_WARN_UNUSED_RESULT;
 void MEM_lockfree_freeN(void *vmemh);
diff --git a/intern/guardedalloc/intern/mallocn_lockfree_impl.c b/intern/guardedalloc/intern/mallocn_lockfree_impl.c
index 2c7c087966a..6fc01807af3 100644
--- a/intern/guardedalloc/intern/mallocn_lockfree_impl.c
+++ b/intern/guardedalloc/intern/mallocn_lockfree_impl.c
@@ -15,11 +15,6 @@
  * along with this program; if not, write to the Free Software Foundation,
  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  *
- * The Original Code is Copyright (C) 2001-2002 by NaN Holding BV.
- * All rights reserved.
- *
- * The Original Code is: all of this file.
- *
  * Contributor(s): Brecht Van Lommel
  *                 Campbell Barton
  *                 Sergey Sharybin
@@ -126,7 +121,7 @@ void MEM_lockfree_freeN(void *vmemh)
 #endif
 	}
 	else {
-		if (malloc_debug_memset && len) {
+		if (UNLIKELY(malloc_debug_memset && len)) {
 			memset(memh + 1, 255, len);
 		}
 		free(memh);
@@ -219,7 +214,7 @@ void *MEM_lockfree_callocN(size_t len, const char *str)
 
 	memh = (MemHead *)calloc(1, len + sizeof(MemHead));
 
-	if (memh) {
+	if (LIKELY(memh)) {
 		memh->len = len;
 		atomic_add_u(&totblock, 1);
 		atomic_add_z(&mem_in_use, len);
@@ -242,8 +237,8 @@ void *MEM_lockfree_mallocN(size_t len, const char *str)
 
 	memh = (MemHead *)malloc(len + sizeof(MemHead));
 
-	if (memh) {
-		if (malloc_debug_memset && len) {
+	if (LIKELY(memh)) {
+		if (UNLIKELY(malloc_debug_memset && len)) {
 			memset(memh + 1, 255, len);
 		}
 
@@ -268,7 +263,7 @@ void *MEM_lockfree_mapallocN(size_t len, const char *str)
 	/* on 64 bit, simply use calloc instead, as mmap does not support
 	 * allocating > 4 GB on Windows. the only reason mapalloc exists
 	 * is to get around address space limitations in 32 bit OSes. */
-	if(sizeof(void*) >= 8)
+	if (sizeof(void *) >= 8)
 		return MEM_lockfree_callocN(len, str);
 
 	len = SIZET_ALIGN_4(len);
diff --git a/intern/itasc/SConscript b/intern/itasc/SConscript
index 1b7709bb986..bd20368f001 100644
--- a/intern/itasc/SConscript
+++ b/intern/itasc/SConscript
@@ -35,7 +35,4 @@ incs = '. ../../extern/Eigen3'
 
 defs = []
 
-if env['OURPLATFORM']=='darwin' and env['C_COMPILER_ID'] == 'clang' and env['CCVERSION'] >= '3.4': # workaround for friend declaration specifies a default argument expression, not allowed anymore
-    env.BlenderLib ('bf_intern_itasc', sources, Split(incs), defs, libtype=['intern','player'], priority=[20,100], cc_compilerchange='/usr/bin/gcc', cxx_compilerchange='/usr/bin/g++' )
-else:
-    env.BlenderLib ('bf_intern_itasc', sources, Split(incs), defs, libtype=['intern','player'], priority=[20,100])
+env.BlenderLib ('bf_intern_itasc', sources, Split(incs), defs, libtype=['intern','player'], priority=[20,100])
diff --git a/intern/itasc/kdl/frameacc.hpp b/intern/itasc/kdl/frameacc.hpp
index 40dd5bfa712..bccd229804d 100644
--- a/intern/itasc/kdl/frameacc.hpp
+++ b/intern/itasc/kdl/frameacc.hpp
@@ -78,9 +78,9 @@ public:
     IMETHOD friend VectorAcc operator / (const VectorAcc& r2,const doubleAcc& r1);
 
 
-    IMETHOD friend bool Equal(const VectorAcc& r1,const VectorAcc& r2,double eps=epsilon);
-    IMETHOD friend bool Equal(const Vector& r1,const VectorAcc& r2,double eps=epsilon);
-    IMETHOD friend bool Equal(const VectorAcc& r1,const Vector& r2,double eps=epsilon);
+    IMETHOD friend bool Equal(const VectorAcc& r1,const VectorAcc& r2,double eps);
+    IMETHOD friend bool Equal(const Vector& r1,const VectorAcc& r2,double eps);
+    IMETHOD friend bool Equal(const VectorAcc& r1,const Vector& r2,double eps);
     IMETHOD friend VectorAcc operator - (const VectorAcc& r);
     IMETHOD friend doubleAcc dot(const VectorAcc& lhs,const VectorAcc& rhs);
     IMETHOD friend doubleAcc dot(const VectorAcc& lhs,const Vector& rhs);
@@ -132,9 +132,9 @@ public:
     IMETHOD friend RotationAcc operator* (const RotationAcc& r1,const RotationAcc& r2);
     IMETHOD friend RotationAcc operator* (const Rotation& r1,const RotationAcc& r2);
     IMETHOD friend RotationAcc operator* (const RotationAcc& r1,const Rotation& r2);
-    IMETHOD friend bool Equal(const RotationAcc& r1,const RotationAcc& r2,double eps=epsilon);
-    IMETHOD friend bool Equal(const Rotation& r1,const RotationAcc& r2,double eps=epsilon);
-    IMETHOD friend bool Equal(const RotationAcc& r1,const Rotation& r2,double eps=epsilon);
+    IMETHOD friend bool Equal(const RotationAcc& r1,const RotationAcc& r2,double eps);
+    IMETHOD friend bool Equal(const Rotation& r1,const RotationAcc& r2,double eps);
+    IMETHOD friend bool Equal(const RotationAcc& r1,const Rotation& r2,double eps);
     IMETHOD TwistAcc Inverse(const TwistAcc& arg) const;
     IMETHOD TwistAcc Inverse(const Twist& arg) const;
     IMETHOD TwistAcc operator * (const TwistAcc& arg) const;
@@ -170,9 +170,9 @@ public:
     IMETHOD friend FrameAcc operator * (const FrameAcc& f1,const FrameAcc& f2);
     IMETHOD friend FrameAcc operator * (const Frame& f1,const FrameAcc& f2);
     IMETHOD friend FrameAcc operator * (const FrameAcc& f1,const Frame& f2);
-    IMETHOD friend bool Equal(const FrameAcc& r1,const FrameAcc& r2,double eps=epsilon);
-    IMETHOD friend bool Equal(const Frame& r1,const FrameAcc& r2,double eps=epsilon);
-    IMETHOD friend bool Equal(const FrameAcc& r1,const Frame& r2,double eps=epsilon);
+    IMETHOD friend bool Equal(const FrameAcc& r1,const FrameAcc& r2,double eps);
+    IMETHOD friend bool Equal(const Frame& r1,const FrameAcc& r2,double eps);
+    IMETHOD friend bool Equal(const FrameAcc& r1,const Frame& r2,double eps);
 
     IMETHOD TwistAcc  Inverse(const TwistAcc& arg) const;
     IMETHOD TwistAcc  Inverse(const Twist& arg) const;
@@ -226,9 +226,9 @@ public:
      // the new point.
      // Complexity : 6M+6A
 
-     IMETHOD friend bool Equal(const TwistAcc& a,const TwistAcc& b,double eps=epsilon);
-     IMETHOD friend bool Equal(const Twist& a,const TwistAcc& b,double eps=epsilon);
-     IMETHOD friend bool Equal(const TwistAcc& a,const Twist& b,double eps=epsilon);
+     IMETHOD friend bool Equal(const TwistAcc& a,const TwistAcc& b,double eps);
+     IMETHOD friend bool Equal(const Twist& a,const TwistAcc& b,double eps);
+     IMETHOD friend bool Equal(const TwistAcc& a,const Twist& b,double eps);
 
 
      IMETHOD Twist GetTwist() const;
@@ -240,9 +240,18 @@ public:
 };
 
 
-
-
-
+IMETHOD bool Equal(const VectorAcc&,   const VectorAcc&,   double = epsilon);
+IMETHOD bool Equal(const Vector&,      const VectorAcc&,   double = epsilon);
+IMETHOD bool Equal(const VectorAcc&,   const Vector&,      double = epsilon);
+IMETHOD bool Equal(const RotationAcc&, const RotationAcc&, double = epsilon);
+IMETHOD bool Equal(const Rotation&,    const RotationAcc&, double = epsilon);
+IMETHOD bool Equal(const RotationAcc&, const Rotation&,    double = epsilon);
+IMETHOD bool Equal(const FrameAcc&,    const FrameAcc&,    double = epsilon);
+IMETHOD bool Equal(const Frame&,       const FrameAcc&,    double = epsilon);
+IMETHOD bool Equal(const FrameAcc&,    const Frame&,       double = epsilon);
+IMETHOD bool Equal(const TwistAcc&,    const TwistAcc&,    double = epsilon);
+IMETHOD bool Equal(const Twist&,       const TwistAcc&,    double = epsilon);
+IMETHOD bool Equal(const TwistAcc&,    const Twist&,       double = epsilon);
 
 
 #ifdef KDL_INLINE
diff --git a/intern/itasc/kdl/frames.hpp b/intern/itasc/kdl/frames.hpp
index 28a59898e20..87eedea29f7 100644
--- a/intern/itasc/kdl/frames.hpp
+++ b/intern/itasc/kdl/frames.hpp
@@ -248,10 +248,10 @@ public:
 
      //! do not use operator == because the definition of Equal(.,.) is slightly
      //! different.  It compares whether the 2 arguments are equal in an eps-interval
-     inline friend bool Equal(const Vector& a,const Vector& b,double eps=epsilon);
+     inline friend bool Equal(const Vector& a,const Vector& b,double eps);
 
 	 //! return a normalized vector
-	 inline friend Vector Normalize(const Vector& a, double eps=epsilon);
+	 inline friend Vector Normalize(const Vector& a, double eps);
 
 	 //! The literal equality operator==(), also identical.
      inline friend bool operator==(const Vector& a,const Vector& b);
@@ -261,7 +261,7 @@ public:
      friend class Rotation;
      friend class Frame;
 };
-
+	inline Vector Normalize(const Vector&, double eps=epsilon);
 
 /**
   \brief represents rotations in 3 dimensional space.
@@ -502,7 +502,7 @@ public:
 
      //! do not use operator == because the definition of Equal(.,.) is slightly
      //! different.  It compares whether the 2 arguments are equal in an eps-interval
-     friend bool Equal(const Rotation& a,const Rotation& b,double eps=epsilon);
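+     // Equal(const Rotation&, const Rotation&, double) is declared after the classes, where its default eps is given.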
 
 	 //! The literal equality operator==(), also identical.
      friend bool operator==(const Rotation& a,const Rotation& b);
@@ -663,7 +663,7 @@ public:
 
      //! do not use operator == because the definition of Equal(.,.) is slightly
      //! different.  It compares whether the 2 arguments are equal in an eps-interval
-     inline friend bool Equal(const Frame& a,const Frame& b,double eps=epsilon);
+     inline friend bool Equal(const Frame& a,const Frame& b,double eps);
 
 	 //! The literal equality operator==(), also identical.
      inline friend bool operator==(const Frame& a,const Frame& b);
@@ -735,7 +735,7 @@ public:
 
      //! do not use operator == because the definition of Equal(.,.) is slightly
      //! different.  It compares whether the 2 arguments are equal in an eps-interval
-     inline friend bool Equal(const Twist& a,const Twist& b,double eps=epsilon);
+     inline friend bool Equal(const Twist& a,const Twist& b,double eps);
 
 	 //! The literal equality operator==(), also identical.
      inline friend bool operator==(const Twist& a,const Twist& b);
@@ -898,7 +898,7 @@ public:
 
      //! do not use operator == because the definition of Equal(.,.) is slightly
      //! different.  It compares whether the 2 arguments are equal in an eps-interval
-     inline friend bool Equal(const Wrench& a,const Wrench& b,double eps=epsilon);
+     inline friend bool Equal(const Wrench& a,const Wrench& b,double eps);
 
 	 //! The literal equality operator==(), also identical.
      inline friend bool operator==(const Wrench& a,const Wrench& b);
@@ -979,7 +979,7 @@ public:
 
      //! do not use operator == because the definition of Equal(.,.) is slightly
      //! different.  It compares whether the 2 arguments are equal in an eps-interval
-     inline friend bool Equal(const Vector2& a,const Vector2& b,double eps=epsilon);
+     inline friend bool Equal(const Vector2& a,const Vector2& b,double eps);
 
     friend class Rotation2;
 };
@@ -1026,7 +1026,7 @@ public:
 
      //! do not use operator == because the definition of Equal(.,.) is slightly
      //! different.  It compares whether the 2 arguments are equal in an eps-interval
-     inline friend bool Equal(const Rotation2& a,const Rotation2& b,double eps=epsilon);
+     inline friend bool Equal(const Rotation2& a,const Rotation2& b,double eps);
 };
 
 //! A 2D frame class, for further documentation see the Frames class
@@ -1067,9 +1067,18 @@ public:
         tmp.SetIdentity();
         return tmp;
      }
-     inline friend bool Equal(const Frame2& a,const Frame2& b,double eps=epsilon);
+     inline friend bool Equal(const Frame2& a,const Frame2& b,double eps);
 };
 
+inline bool Equal(const Vector&,    const Vector&,    double = epsilon);
+       bool Equal(const Rotation&,  const Rotation&,  double = epsilon);
+inline bool Equal(const Frame&,     const Frame&,     double = epsilon);
+inline bool Equal(const Twist&,     const Twist&,     double = epsilon);
+inline bool Equal(const Wrench&,    const Wrench&,    double = epsilon);
+inline bool Equal(const Vector2&,   const Vector2&,   double = epsilon);
+inline bool Equal(const Rotation2&, const Rotation2&, double = epsilon);
+inline bool Equal(const Frame2&,    const Frame2&,    double = epsilon);
+       
 IMETHOD Vector diff(const Vector& a,const Vector& b,double dt=1);
 IMETHOD Vector diff(const Rotation& R_a_b1,const Rotation& R_a_b2,double dt=1);
 IMETHOD Twist diff(const Frame& F_a_b1,const Frame& F_a_b2,double dt=1);
diff --git a/intern/itasc/kdl/framevel.hpp b/intern/itasc/kdl/framevel.hpp
index e95c5ef7907..17e1f2adfa0 100644
--- a/intern/itasc/kdl/framevel.hpp
+++ b/intern/itasc/kdl/framevel.hpp
@@ -110,9 +110,9 @@ public:
     IMETHOD friend void SetToZero(VectorVel& v);
 
 
-    IMETHOD friend bool Equal(const VectorVel& r1,const VectorVel& r2,double eps=epsilon);
-    IMETHOD friend bool Equal(const Vector& r1,const VectorVel& r2,double eps=epsilon);
-    IMETHOD friend bool Equal(const VectorVel& r1,const Vector& r2,double eps=epsilon);
+    IMETHOD friend bool Equal(const VectorVel& r1,const VectorVel& r2,double eps);
+    IMETHOD friend bool Equal(const Vector& r1,const VectorVel& r2,double eps);
+    IMETHOD friend bool Equal(const VectorVel& r1,const Vector& r2,double eps);
     IMETHOD friend VectorVel operator - (const VectorVel& r);
     IMETHOD friend doubleVel dot(const VectorVel& lhs,const VectorVel& rhs);
     IMETHOD friend doubleVel dot(const VectorVel& lhs,const Vector& rhs);
@@ -166,9 +166,9 @@ public:
     IMETHOD friend RotationVel operator* (const RotationVel& r1,const RotationVel& r2);
     IMETHOD friend RotationVel operator* (const Rotation& r1,const RotationVel& r2);
     IMETHOD friend RotationVel operator* (const RotationVel& r1,const Rotation& r2);
-    IMETHOD friend bool Equal(const RotationVel& r1,const RotationVel& r2,double eps=epsilon);
-    IMETHOD friend bool Equal(const Rotation& r1,const RotationVel& r2,double eps=epsilon);
-    IMETHOD friend bool Equal(const RotationVel& r1,const Rotation& r2,double eps=epsilon);
+    IMETHOD friend bool Equal(const RotationVel& r1,const RotationVel& r2,double eps);
+    IMETHOD friend bool Equal(const Rotation& r1,const RotationVel& r2,double eps);
+    IMETHOD friend bool Equal(const RotationVel& r1,const Rotation& r2,double eps);
 
     IMETHOD TwistVel Inverse(const TwistVel& arg) const;
     IMETHOD TwistVel Inverse(const Twist& arg) const;
@@ -220,9 +220,9 @@ public:
     IMETHOD friend FrameVel operator * (const FrameVel& f1,const FrameVel& f2);
     IMETHOD friend FrameVel operator * (const Frame& f1,const FrameVel& f2);
     IMETHOD friend FrameVel operator * (const FrameVel& f1,const Frame& f2);
-    IMETHOD friend bool Equal(const FrameVel& r1,const FrameVel& r2,double eps=epsilon);
-    IMETHOD friend bool Equal(const Frame& r1,const FrameVel& r2,double eps=epsilon);
-    IMETHOD friend bool Equal(const FrameVel& r1,const Frame& r2,double eps=epsilon);
+    IMETHOD friend bool Equal(const FrameVel& r1,const FrameVel& r2,double eps);
+    IMETHOD friend bool Equal(const Frame& r1,const FrameVel& r2,double eps);
+    IMETHOD friend bool Equal(const FrameVel& r1,const Frame& r2,double eps);
 
     IMETHOD TwistVel  Inverse(const TwistVel& arg) const;
     IMETHOD TwistVel  Inverse(const Twist& arg) const;
@@ -292,9 +292,9 @@ public:
      // = Equality operators
      // do not use operator == because the definition of Equal(.,.) is slightly
      // different.  It compares whether the 2 arguments are equal in an eps-interval
-     IMETHOD friend bool Equal(const TwistVel& a,const TwistVel& b,double eps=epsilon);
-     IMETHOD friend bool Equal(const Twist& a,const TwistVel& b,double eps=epsilon);
-     IMETHOD friend bool Equal(const TwistVel& a,const Twist& b,double eps=epsilon);
+     IMETHOD friend bool Equal(const TwistVel& a,const TwistVel& b,double eps);
+     IMETHOD friend bool Equal(const Twist& a,const TwistVel& b,double eps);
+     IMETHOD friend bool Equal(const TwistVel& a,const Twist& b,double eps);
 
 // = Conversion to other entities
      IMETHOD Twist GetTwist() const;
@@ -305,6 +305,19 @@ public:
 
 };
 
+IMETHOD bool Equal(const VectorVel&,   const VectorVel&,   double = epsilon);
+IMETHOD bool Equal(const Vector&,      const VectorVel&,   double = epsilon);
+IMETHOD bool Equal(const VectorVel&,   const Vector&,      double = epsilon);
+IMETHOD bool Equal(const RotationVel&, const RotationVel&, double = epsilon);
+IMETHOD bool Equal(const Rotation&,    const RotationVel&, double = epsilon);
+IMETHOD bool Equal(const RotationVel&, const Rotation&,    double = epsilon);
+IMETHOD bool Equal(const FrameVel&,    const FrameVel&,    double = epsilon);
+IMETHOD bool Equal(const Frame&,       const FrameVel&,    double = epsilon);
+IMETHOD bool Equal(const FrameVel&,    const Frame&,       double = epsilon);
+IMETHOD bool Equal(const TwistVel&,    const TwistVel&,    double = epsilon);
+IMETHOD bool Equal(const Twist&,       const TwistVel&,    double = epsilon);
+IMETHOD bool Equal(const TwistVel&,    const Twist&,       double = epsilon);
+
 IMETHOD VectorVel diff(const VectorVel& a,const VectorVel& b,double dt=1.0) {
 	return VectorVel(diff(a.p,b.p,dt),diff(a.v,b.v,dt));
 }
diff --git a/intern/itasc/kdl/jacobian.hpp b/intern/itasc/kdl/jacobian.hpp
index e9057451c9f..9708ebd37be 100644
--- a/intern/itasc/kdl/jacobian.hpp
+++ b/intern/itasc/kdl/jacobian.hpp
@@ -45,7 +45,7 @@ namespace KDL
         bool operator ==(const Jacobian& arg);
         bool operator !=(const Jacobian& arg);
         
-        friend bool Equal(const Jacobian& a,const Jacobian& b,double eps=epsilon);
+        friend bool Equal(const Jacobian& a,const Jacobian& b,double eps);
         
 
         ~Jacobian();
@@ -63,6 +63,7 @@ namespace KDL
 
 
     };
+	bool Equal(const Jacobian&, const Jacobian&, double = epsilon);
 }
 
 #endif
diff --git a/intern/itasc/kdl/jntarray.hpp b/intern/itasc/kdl/jntarray.hpp
index ece6b0bdb6b..886171b11db 100644
--- a/intern/itasc/kdl/jntarray.hpp
+++ b/intern/itasc/kdl/jntarray.hpp
@@ -209,12 +209,12 @@ class MyTask : public RTT::TaskContext
          * @return true if each element of src1 is within eps of the same
 		 * element in src2, or if both src1 and src2 have no data (ie 0==rows())
          */
-        friend bool Equal(const JntArray& src1,const JntArray& src2,double eps=epsilon);
+        friend bool Equal(const JntArray& src1,const JntArray& src2,double eps);
 
         friend bool operator==(const JntArray& src1,const JntArray& src2);
         //friend bool operator!=(const JntArray& src1,const JntArray& src2);
         };
-
+	bool Equal(const JntArray&,const JntArray&, double = epsilon);
     bool operator==(const JntArray& src1,const JntArray& src2);
     //bool operator!=(const JntArray& src1,const JntArray& src2);
 
diff --git a/intern/itasc/kdl/jntarrayacc.hpp b/intern/itasc/kdl/jntarrayacc.hpp
index 275aa58f21e..fd1c26430e8 100644
--- a/intern/itasc/kdl/jntarrayacc.hpp
+++ b/intern/itasc/kdl/jntarrayacc.hpp
@@ -58,9 +58,10 @@ namespace KDL
         friend void Divide(const JntArrayAcc& src,const doubleVel& factor,JntArrayAcc& dest);
         friend void Divide(const JntArrayAcc& src,const doubleAcc& factor,JntArrayAcc& dest);
         friend void SetToZero(JntArrayAcc& array);
-        friend bool Equal(const JntArrayAcc& src1,const JntArrayAcc& src2,double eps=epsilon);
-
+        friend bool Equal(const JntArrayAcc& src1,const JntArrayAcc& src2,double eps);
     };
+
+    bool Equal(const JntArrayAcc&, const JntArrayAcc&, double = epsilon);
 }
 
 #endif
diff --git a/intern/itasc/kdl/jntarrayvel.hpp b/intern/itasc/kdl/jntarrayvel.hpp
index faa82076ebb..480f84f1708 100644
--- a/intern/itasc/kdl/jntarrayvel.hpp
+++ b/intern/itasc/kdl/jntarrayvel.hpp
@@ -51,9 +51,10 @@ namespace KDL
         friend void Divide(const JntArrayVel& src,const double& factor,JntArrayVel& dest);
         friend void Divide(const JntArrayVel& src,const doubleVel& factor,JntArrayVel& dest);
         friend void SetToZero(JntArrayVel& array);
-        friend bool Equal(const JntArrayVel& src1,const JntArrayVel& src2,double eps=epsilon);
-
+        friend bool Equal(const JntArrayVel& src1,const JntArrayVel& src2,double eps);
     };
+
+    bool Equal(const JntArrayVel&, const JntArrayVel&, double = epsilon);
 }
 
 #endif
diff --git a/intern/locale/CMakeLists.txt b/intern/locale/CMakeLists.txt
index 3599aa68545..217fe9a8c71 100644
--- a/intern/locale/CMakeLists.txt
+++ b/intern/locale/CMakeLists.txt
@@ -36,6 +36,14 @@ set(SRC
 	boost_locale_wrapper.h
 )
 
+if(WITH_HEADLESS)
+	add_definitions(-DWITH_HEADLESS)
+endif()
+
+if(WITH_GHOST_SDL)
+	add_definitions(-DWITH_GHOST_SDL)
+endif()
+
 if(WITH_INTERNATIONAL)
 	list(APPEND INC_SYS
 		${BOOST_INCLUDE_DIR}
@@ -51,5 +59,10 @@ blender_add_lib(bf_intern_locale "${SRC}" "${INC}" "${INC_SYS}")
 set(MSFFMT_SRC
 	msgfmt.cc
 )
-
 add_executable(msgfmt ${MSFFMT_SRC})
+
+if(CMAKE_C_COMPILER_ID MATCHES "Clang" AND (NOT (CMAKE_C_COMPILER_VERSION VERSION_LESS 3.4)))
+	# clang 3.4+ needs the platform libraries linked into msgfmt explicitly
+	target_link_libraries(msgfmt ${PLATFORM_LINKLIBS})
+endif()
+
diff --git a/intern/locale/SConscript b/intern/locale/SConscript
index 4136ac8237d..24828c120ec 100644
--- a/intern/locale/SConscript
+++ b/intern/locale/SConscript
@@ -66,10 +66,6 @@ if env['WITH_BF_INTERNATIONAL']:
 
         locale = env.Clone()
 
-        msgfmt_executable = targetpath
-        if env['OURPLATFORM'] in ('win32-vc', 'win64-vc', 'win32-mingw', 'win64-mingw'):
-            msgfmt_executable += ".exe"
-
         # dependencies
         dependencies = [msgfmt_target]
 
@@ -82,7 +78,7 @@ if env['WITH_BF_INTERNATIONAL']:
             po_file = os.path.join(po_dir, f)
             mo_file = os.path.join(build_dir, os.path.splitext(f)[0] + ".mo")
 
-            command = "\"%s\" \"%s\" \"%s\"" % (msgfmt_executable, po_file, mo_file)
+            command = "\"%s\" \"%s\" \"%s\"" % (targetpath, po_file, mo_file)
 
             locale.Command(mo_file, po_file, command)
             locale.Depends(mo_file, dependencies)
diff --git a/intern/locale/boost_locale_wrapper.cpp b/intern/locale/boost_locale_wrapper.cpp
index 945d0bbc5da..25843d60578 100644
--- a/intern/locale/boost_locale_wrapper.cpp
+++ b/intern/locale/boost_locale_wrapper.cpp
@@ -64,7 +64,7 @@ void bl_locale_set(const char *locale)
 			_locale = gen(locale);
 		}
 		else {
-#ifdef __APPLE__
+#if defined(__APPLE__) && !defined(WITH_HEADLESS) && !defined(WITH_GHOST_SDL)
 			extern char GHOST_user_locale[128]; // pulled from Ghost_SystemCocoa
 			std::string locale_osx = GHOST_user_locale + std::string(".UTF-8");
 			_locale = gen(locale_osx.c_str());
@@ -113,7 +113,11 @@ const char *bl_locale_pgettext(const char *msgctxt, const char *msgid)
 			return r;
 		return msgid;
 	}
-	catch(std::exception const &) {
+	catch(std::bad_cast const &e) { /* if std::has_facet<char_message_facet>(l) == false, LC_ALL = "C" case */
+//		std::cout << "bl_locale_pgettext(" << msgid << "): " << e.what() << " \n";
+		return msgid;
+	}
+	catch(std::exception const &e) {
 //		std::cout << "bl_locale_pgettext(" << msgctxt << ", " << msgid << "): " << e.what() << " \n";
 		return msgid;
 	}
diff --git a/intern/rigidbody/rb_bullet_api.cpp b/intern/rigidbody/rb_bullet_api.cpp
index ab7b851911a..6d39e328e82 100644
--- a/intern/rigidbody/rb_bullet_api.cpp
+++ b/intern/rigidbody/rb_bullet_api.cpp
@@ -726,8 +726,8 @@ rbMeshData *RB_trimesh_data_new(int num_tris, int num_verts)
 static void RB_trimesh_data_delete(rbMeshData *mesh)
 {
 	delete mesh->index_array;
-	delete mesh->vertices;
-	delete mesh->triangles;
+	delete[] mesh->vertices;
+	delete[] mesh->triangles;
 	delete mesh;
 }
  
diff --git a/intern/utfconv/utfconv.c b/intern/utfconv/utfconv.c
index 7f7a612528d..e5f8756917f 100644
--- a/intern/utfconv/utfconv.c
+++ b/intern/utfconv/utfconv.c
@@ -170,7 +170,7 @@ int conv_utf_8_to_16(const char *in8, wchar_t *out16, size_t size16)
 {
 	char u;
 	char type = 0;
-	wchar_t u32 = 0;
+	unsigned int u32 = 0;
 	wchar_t *out16end = out16 + size16;
 	int err = 0;
 	if (!size16 || !in8 || !out16) return UTF_ERROR_NULL_IN;
author	Jason Wilkins <Jason.A.Wilkins@gmail.com>	2014-05-22 04:02:02 +0400
committer	Jason Wilkins <Jason.A.Wilkins@gmail.com>	2014-05-22 04:02:02 +0400
commit	6eff1cbebcf0766d2fe69db9b0fb3f76ede2c06b (patch)
tree	3af4122e291f53f88b63ec6ded2e0fa7790e04ac /intern
parent	49de1ada8dcba35862759e0f7da5ca2209b4f588 (diff)
parent	146a1c77eacb925eb7c86bb49495c0f09adc607c (diff)