19 files changed, 387 insertions, 164 deletions
diff --git a/intern/cycles/blender/blender_mesh.cpp b/intern/cycles/blender/blender_mesh.cpp
index 1f0aa5eef34..55ef913408f 100644
--- a/intern/cycles/blender/blender_mesh.cpp
+++ b/intern/cycles/blender/blender_mesh.cpp
@@ -657,7 +657,7 @@ static void create_mesh(Scene *scene,
 
 static void create_subd_mesh(Scene *scene,
                              Mesh *mesh,
-                             BL::Object b_ob,
+                             BL::Object& b_ob,
                              BL::Mesh& b_mesh,
                              PointerRNA *cmesh,
                              const vector<uint>& used_shaders,
@@ -976,7 +976,12 @@ void BlenderSync::sync_mesh_motion(BL::Object& b_ob,
 			   memcmp(mP, &mesh->verts[0], sizeof(float3)*numverts) == 0)
 			{
 				/* no motion, remove attributes again */
-				VLOG(1) << "No actual deformation motion for object " << b_ob.name();
+				if(b_mesh.vertices.length() != numverts) {
+					VLOG(1) << "Topology differs, disabling motion blur.";
+				}
+				else {
+					VLOG(1) << "No actual deformation motion for object " << b_ob.name();
+				}
 				mesh->attributes.remove(ATTR_STD_MOTION_VERTEX_POSITION);
 				if(attr_mN)
 					mesh->attributes.remove(ATTR_STD_MOTION_VERTEX_NORMAL);
diff --git a/intern/cycles/bvh/bvh.cpp b/intern/cycles/bvh/bvh.cpp
index 9e63485c04e..5c474c8c3e9 100644
--- a/intern/cycles/bvh/bvh.cpp
+++ b/intern/cycles/bvh/bvh.cpp
@@ -327,6 +327,9 @@ void BVH::pack_instances(size_t nodes_size, size_t leaf_nodes_size)
 				data.x += prim_offset;
 				data.y += prim_offset;
 				pack_leaf_nodes[pack_leaf_nodes_offset] = data;
+				for(int j = 1; j < nsize_leaf; ++j) {
+					pack_leaf_nodes[pack_leaf_nodes_offset + j] = leaf_nodes_offset[i + j];
+				}
 				pack_leaf_nodes_offset += nsize_leaf;
 			}
 		}
diff --git a/intern/cycles/bvh/bvh_build.cpp b/intern/cycles/bvh/bvh_build.cpp
index a0b09c780ce..bba89a8f35c 100644
--- a/intern/cycles/bvh/bvh_build.cpp
+++ b/intern/cycles/bvh/bvh_build.cpp
@@ -617,7 +617,7 @@ BVHNode* BVHBuild::create_leaf_node(const BVHRange& range,
 	                                        BoundBox::empty,
 	                                        BoundBox::empty};
 	int ob_num = 0;
-
+	int num_new_prims = 0;
 	/* Fill in per-type type/index array. */
 	for(int i = 0; i < range.size(); i++) {
 		const BVHReference& ref = references[range.start() + i];
@@ -629,10 +629,11 @@ BVHNode* BVHBuild::create_leaf_node(const BVHRange& range,
 
 			bounds[type_index].grow(ref.bounds());
 			visibility[type_index] |= objects[ref.prim_object()]->visibility;
+			++num_new_prims;
 		}
 		else {
 			object_references.push_back(ref);
-			ob_num++;
+			++ob_num;
 		}
 	}
 
@@ -651,11 +652,11 @@ BVHNode* BVHBuild::create_leaf_node(const BVHRange& range,
 	vector<int, LeafStackAllocator> local_prim_type,
 	                                local_prim_index,
 	                                local_prim_object;
+	local_prim_type.resize(num_new_prims);
+	local_prim_index.resize(num_new_prims);
+	local_prim_object.resize(num_new_prims);
 	for(int i = 0; i < PRIMITIVE_NUM_TOTAL; ++i) {
 		int num = (int)p_type[i].size();
-		local_prim_type.resize(start_index + num);
-		local_prim_index.resize(start_index + num);
-		local_prim_object.resize(start_index + num);
 		if(num != 0) {
 			assert(p_type[i].size() == p_index[i].size());
 			assert(p_type[i].size() == p_object[i].size());
diff --git a/intern/cycles/bvh/bvh_split.cpp b/intern/cycles/bvh/bvh_split.cpp
index 9185bd99d10..8084975565e 100644
--- a/intern/cycles/bvh/bvh_split.cpp
+++ b/intern/cycles/bvh/bvh_split.cpp
@@ -44,6 +44,8 @@ BVHObjectSplit::BVHObjectSplit(BVHBuild *builder,
 	const BVHReference *ref_ptr = &references_->at(range.start());
 	float min_sah = FLT_MAX;
 
+	storage_->right_bounds.resize(range.size());
+
 	for(int dim = 0; dim < 3; dim++) {
 		/* Sort references. */
 		bvh_reference_sort(range.start(),
@@ -53,8 +55,6 @@ BVHObjectSplit::BVHObjectSplit(BVHBuild *builder,
 
 		/* sweep right to left and determine bounds. */
 		BoundBox right_bounds = BoundBox::empty;
-
-		storage_->right_bounds.resize(range.size());
 		for(int i = range.size() - 1; i > 0; i--) {
 			right_bounds.grow(ref_ptr[i].bounds());
 			storage_->right_bounds[i - 1] = right_bounds;
@@ -157,11 +157,10 @@ BVHSpatialSplit::BVHSpatialSplit(const BVHBuild& builder,
 	}
 
 	/* select best split plane. */
+	storage_->right_bounds.resize(BVHParams::NUM_SPATIAL_BINS);
 	for(int dim = 0; dim < 3; dim++) {
 		/* sweep right to left and determine bounds. */
 		BoundBox right_bounds = BoundBox::empty;
-
-		storage_->right_bounds.resize(BVHParams::NUM_SPATIAL_BINS);
 		for(int i = BVHParams::NUM_SPATIAL_BINS - 1; i > 0; i--) {
 			right_bounds.grow(storage_->bins[dim][i].bounds);
 			storage_->right_bounds[i - 1] = right_bounds;
diff --git a/intern/cycles/kernel/kernel_bake.h b/intern/cycles/kernel/kernel_bake.h
index 8e7a2c1b62b..d0ca256f323 100644
--- a/intern/cycles/kernel/kernel_bake.h
+++ b/intern/cycles/kernel/kernel_bake.h
@@ -303,7 +303,7 @@ ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input,
 
 	/* dummy initilizations copied from SHADER_EVAL_DISPLACE */
 	float3 I = Ng;
-	float t = 0.0f;
+	float t = 1.0f;
 	float time = TIME_INVALID;
 
 	/* light passes */
diff --git a/intern/cycles/render/light.cpp b/intern/cycles/render/light.cpp
index 1637045ce84..fef28b25f3e 100644
--- a/intern/cycles/render/light.cpp
+++ b/intern/cycles/render/light.cpp
@@ -771,11 +771,11 @@ void LightManager::device_update_points(Device *device,
 
 void LightManager::device_update(Device *device, DeviceScene *dscene, Scene *scene, Progress& progress)
 {
-	VLOG(1) << "Total " << scene->lights.size() << " lights.";
-
 	if(!need_update)
 		return;
 
+	VLOG(1) << "Total " << scene->lights.size() << " lights.";
+
 	device_free(device, dscene);
 
 	use_light_visibility = false;
diff --git a/intern/cycles/render/mesh.cpp b/intern/cycles/render/mesh.cpp
index 241a1c44ebf..cc8519219ed 100644
--- a/intern/cycles/render/mesh.cpp
+++ b/intern/cycles/render/mesh.cpp
@@ -528,7 +528,7 @@ void Mesh::compute_bvh(SceneParams *params, Progress *progress, int n, int total
 
 			delete bvh;
 			bvh = BVH::create(bparams, objects);
-			bvh->build(*progress);
+			MEM_GUARDED_CALL(progress, bvh->build, *progress);
 		}
 	}
 
@@ -1232,11 +1232,11 @@ void MeshManager::device_update_displacement_images(Device *device,
 
 void MeshManager::device_update(Device *device, DeviceScene *dscene, Scene *scene, Progress& progress)
 {
-	VLOG(1) << "Total " << scene->meshes.size() << " meshes.";
-
 	if(!need_update)
 		return;
 
+	VLOG(1) << "Total " << scene->meshes.size() << " meshes.";
+
 	/* update normals */
 	foreach(Mesh *mesh, scene->meshes) {
 		foreach(uint shader, mesh->used_shaders) {
diff --git a/intern/cycles/render/object.cpp b/intern/cycles/render/object.cpp
index 42bb665cb9f..a7ea75820ea 100644
--- a/intern/cycles/render/object.cpp
+++ b/intern/cycles/render/object.cpp
@@ -231,160 +231,260 @@ ObjectManager::~ObjectManager()
 {
 }
 
-void ObjectManager::device_update_transforms(Device *device, DeviceScene *dscene, Scene *scene, uint *object_flag, Progress& progress)
+void ObjectManager::device_update_object_transform(UpdateObejctTransformState *state,
+                                                   Object *ob,
+                                                   int object_index)
 {
-	float4 *objects;
-	float4 *objects_vector = NULL;
-	int i = 0;
-	map<Mesh*, float> surface_area_map;
-	map<ParticleSystem*, int> particle_offset;
-	Scene::MotionType need_motion = scene->need_motion(device->info.advanced_shading);
-	bool have_motion = false;
-	bool have_curves = false;
-
-	objects = dscene->objects.resize(OBJECT_SIZE*scene->objects.size());
-	if(need_motion == Scene::MOTION_PASS)
-		objects_vector = dscene->objects_vector.resize(OBJECT_VECTOR_SIZE*scene->objects.size());
-
-	/* particle system device offsets
-	 * 0 is dummy particle, index starts at 1
+	float4 *objects = state->objects;
+	float4 *objects_vector = state->objects_vector;
+
+	Mesh *mesh = ob->mesh;
+	uint flag = 0;
+
+	/* Pack one object into the state's arrays; start with transformations. */
+	Transform tfm = ob->tfm;
+	Transform itfm = transform_inverse(tfm);
+
+	/* Compute surface area. For uniform scale we can avoid the many
+	 * transform calls and share the computation between instances.
+	 *
+	 * TODO(brecht): Correct for displacement, and move to a better place.
 	 */
-	int numparticles = 1;
-	foreach(ParticleSystem *psys, scene->particle_systems) {
-		particle_offset[psys] = numparticles;
-		numparticles += psys->particles.size();
-	}
+	float uniform_scale;
+	float surface_area = 0.0f;
+	float pass_id = ob->pass_id;
+	float random_number = (float)ob->random_id * (1.0f/(float)0xFFFFFFFF);
+	int particle_index = (ob->particle_system)
+	        ? ob->particle_index + state->particle_offset[ob->particle_system]
+	        : 0;
+
+	if(transform_uniform_scale(tfm, uniform_scale)) {
+		/* NOTE: Several threads may end up computing the area of the same mesh
+		 * in parallel; this is accepted so no thread is suspended while an area
+		 * is not yet known. The lookup result is copied while the lock is held,
+		 * since other threads may modify the map as soon as it is released.
+		 */
+		state->surface_area_lock.lock();
+		map<Mesh*, float>::iterator it = state->surface_area_map.find(mesh);
+		const bool area_known = (it != state->surface_area_map.end());
+		const float cached_area = area_known ? it->second : 0.0f;
+		state->surface_area_lock.unlock();
 
-	foreach(Object *ob, scene->objects) {
-		Mesh *mesh = ob->mesh;
-		uint flag = 0;
-
-		/* compute transformations */
-		Transform tfm = ob->tfm;
-		Transform itfm = transform_inverse(tfm);
-
-		/* compute surface area. for uniform scale we can do avoid the many
-		 * transform calls and share computation for instances */
-		/* todo: correct for displacement, and move to a better place */
-		float uniform_scale;
-		float surface_area = 0.0f;
-		float pass_id = ob->pass_id;
-		float random_number = (float)ob->random_id * (1.0f/(float)0xFFFFFFFF);
-		int particle_index = (ob->particle_system)? ob->particle_index + particle_offset[ob->particle_system]: 0;
-
-		if(transform_uniform_scale(tfm, uniform_scale)) {
-			map<Mesh*, float>::iterator it = surface_area_map.find(mesh);
-
-			if(it == surface_area_map.end()) {
-				foreach(Mesh::Triangle& t, mesh->triangles) {
-					float3 p1 = mesh->verts[t.v[0]];
-					float3 p2 = mesh->verts[t.v[1]];
-					float3 p3 = mesh->verts[t.v[2]];
-
-					surface_area += triangle_area(p1, p2, p3);
-				}
+		if(!area_known) {
+			foreach(Mesh::Triangle& t, mesh->triangles) {
+				float3 p1 = mesh->verts[t.v[0]];
+				float3 p2 = mesh->verts[t.v[1]];
+				float3 p3 = mesh->verts[t.v[2]];
 
-				surface_area_map[mesh] = surface_area;
+				surface_area += triangle_area(p1, p2, p3);
 			}
-			else
-				surface_area = it->second;
 
-			surface_area *= uniform_scale;
+			state->surface_area_lock.lock();
+			state->surface_area_map[mesh] = surface_area;
+			state->surface_area_lock.unlock();
 		}
 		else {
-			foreach(Mesh::Triangle& t, mesh->triangles) {
-				float3 p1 = transform_point(&tfm, mesh->verts[t.v[0]]);
-				float3 p2 = transform_point(&tfm, mesh->verts[t.v[1]]);
-				float3 p3 = transform_point(&tfm, mesh->verts[t.v[2]]);
+			surface_area = cached_area;
+		}
 
-				surface_area += triangle_area(p1, p2, p3);
-			}
+		surface_area *= uniform_scale;
+	}
+	else {
+		foreach(Mesh::Triangle& t, mesh->triangles) {
+			float3 p1 = transform_point(&tfm, mesh->verts[t.v[0]]);
+			float3 p2 = transform_point(&tfm, mesh->verts[t.v[1]]);
+			float3 p3 = transform_point(&tfm, mesh->verts[t.v[2]]);
+
+			surface_area += triangle_area(p1, p2, p3);
 		}
+	}
 
-		/* pack in texture */
-		int offset = i*OBJECT_SIZE;
-
-		/* OBJECT_TRANSFORM */
-		memcpy(&objects[offset], &tfm, sizeof(float4)*3);
-		/* OBJECT_INVERSE_TRANSFORM */
-		memcpy(&objects[offset+4], &itfm, sizeof(float4)*3);
-		/* OBJECT_PROPERTIES */
-		objects[offset+8] = make_float4(surface_area, pass_id, random_number, __int_as_float(particle_index));
-
-		if(need_motion == Scene::MOTION_PASS) {
-			/* motion transformations, is world/object space depending if mesh
-			 * comes with deformed position in object space, or if we transform
-			 * the shading point in world space */
-			Transform mtfm_pre = ob->motion.pre;
-			Transform mtfm_post = ob->motion.post;
-
-			if(!mesh->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION)) {
-				mtfm_pre = mtfm_pre * itfm;
-				mtfm_post = mtfm_post * itfm;
-			}
-			else {
-				flag |= SD_OBJECT_HAS_VERTEX_MOTION;
-			}
+	/* Pack in texture. */
+	int offset = object_index*OBJECT_SIZE;
 
-			memcpy(&objects_vector[i*OBJECT_VECTOR_SIZE+0], &mtfm_pre, sizeof(float4)*3);
-			memcpy(&objects_vector[i*OBJECT_VECTOR_SIZE+3], &mtfm_post, sizeof(float4)*3);
+	/* OBJECT_TRANSFORM */
+	memcpy(&objects[offset], &tfm, sizeof(float4)*3);
+	/* OBJECT_INVERSE_TRANSFORM */
+	memcpy(&objects[offset+4], &itfm, sizeof(float4)*3);
+	/* OBJECT_PROPERTIES */
+	objects[offset+8] = make_float4(surface_area, pass_id, random_number, __int_as_float(particle_index));
+
+	if(state->need_motion == Scene::MOTION_PASS) {
+		/* Motion transformations, is world/object space depending if mesh
+		 * comes with deformed position in object space, or if we transform
+		 * the shading point in world space.
+		 */
+		Transform mtfm_pre = ob->motion.pre;
+		Transform mtfm_post = ob->motion.post;
+
+		if(!mesh->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION)) {
+			mtfm_pre = mtfm_pre * itfm;
+			mtfm_post = mtfm_post * itfm;
+		}
+		else {
+			flag |= SD_OBJECT_HAS_VERTEX_MOTION;
 		}
+
+		memcpy(&objects_vector[object_index*OBJECT_VECTOR_SIZE+0], &mtfm_pre, sizeof(float4)*3);
+		memcpy(&objects_vector[object_index*OBJECT_VECTOR_SIZE+3], &mtfm_post, sizeof(float4)*3);
+	}
 #ifdef __OBJECT_MOTION__
-		else if(need_motion == Scene::MOTION_BLUR) {
-			if(ob->use_motion) {
-				/* decompose transformations for interpolation */
-				DecompMotionTransform decomp;
-
-				transform_motion_decompose(&decomp, &ob->motion, &ob->tfm);
-				memcpy(&objects[offset], &decomp, sizeof(float4)*8);
-				flag |= SD_OBJECT_MOTION;
-				have_motion = true;
-			}
+	else if(state->need_motion == Scene::MOTION_BLUR) {
+		if(ob->use_motion) {
+			/* Decompose transformations for interpolation. */
+			DecompMotionTransform decomp;
+
+			transform_motion_decompose(&decomp, &ob->motion, &ob->tfm);
+			memcpy(&objects[offset], &decomp, sizeof(float4)*8);
+			flag |= SD_OBJECT_MOTION;
+			state->have_motion = true;
 		}
+	}
 #endif
 
-		if(mesh->use_motion_blur)
-			have_motion = true;
+	if(mesh->use_motion_blur) {
+		state->have_motion = true;
+	}
 
-		/* dupli object coords and motion info */
-		int totalsteps = mesh->motion_steps;
-		int numsteps = (totalsteps - 1)/2;
-		int numverts = mesh->verts.size();
-		int numkeys = mesh->curve_keys.size();
+	/* Dupli object coords and motion info. */
+	int totalsteps = mesh->motion_steps;
+	int numsteps = (totalsteps - 1)/2;
+	int numverts = mesh->verts.size();
+	int numkeys = mesh->curve_keys.size();
 
-		objects[offset+9] = make_float4(ob->dupli_generated[0], ob->dupli_generated[1], ob->dupli_generated[2], __int_as_float(numkeys));
-		objects[offset+10] = make_float4(ob->dupli_uv[0], ob->dupli_uv[1], __int_as_float(numsteps), __int_as_float(numverts));
+	objects[offset+9] = make_float4(ob->dupli_generated[0], ob->dupli_generated[1], ob->dupli_generated[2], __int_as_float(numkeys));
+	objects[offset+10] = make_float4(ob->dupli_uv[0], ob->dupli_uv[1], __int_as_float(numsteps), __int_as_float(numverts));
 
-		/* object flag */
-		if(ob->use_holdout)
-			flag |= SD_HOLDOUT_MASK;
-		object_flag[i] = flag;
+	/* Object flag. */
+	if(ob->use_holdout) {
+		flag |= SD_HOLDOUT_MASK;
+	}
+	state->object_flag[object_index] = flag;
 
-		/* have curves */
-		if(mesh->curves.size())
-			have_curves = true;
+	/* Have curves. */
+	if(mesh->curves.size()) {
+		state->have_curves = true;
+	}
+}
 
-		i++;
+bool ObjectManager::device_update_object_transform_pop_work(
+        UpdateObejctTransformState *state,
+        int *start_index,
+        int *num_objects)
+{
+	/* Pops the next chunk of objects from the queue; returns false when none
+	 * are left. OBJECTS_PER_TASK is tweakable: too small a value causes extra
+	 * spin lock overhead, too big a value might not use all threads nicely.
+	 */
+	static const int OBJECTS_PER_TASK = 32;
+	bool have_work = false;
+	state->queue_lock.lock();
+	int num_scene_objects = state->scene->objects.size();
+	if(state->queue_start_object < num_scene_objects) {
+		int count = min(OBJECTS_PER_TASK,
+		                num_scene_objects - state->queue_start_object);
+		*start_index = state->queue_start_object;
+		*num_objects = count;
+		state->queue_start_object += count;
+		have_work = true;
+	}
+	state->queue_lock.unlock();
+	return have_work;
+}
+
+void ObjectManager::device_update_object_transform_task(
+        UpdateObejctTransformState *state)
+{
+	int start_index, num_objects;
+	while(device_update_object_transform_pop_work(state,
+	                                              &start_index,
+	                                              &num_objects))
+	{
+		for(int i = 0; i < num_objects; ++i) {
+			const int object_index = start_index + i;
+			Object *ob = state->scene->objects[object_index];
+			device_update_object_transform(state, ob, object_index);
+		}
+	}
+}
 
-		if(progress.get_cancel()) return;
+void ObjectManager::device_update_transforms(Device *device,
+                                             DeviceScene *dscene,
+                                             Scene *scene,
+                                             uint *object_flag,
+                                             Progress& progress)
+{
+	UpdateObejctTransformState state;
+	state.need_motion = scene->need_motion(device->info.advanced_shading);
+	state.have_motion = false;
+	state.have_curves = false;
+	state.scene = scene;
+	state.queue_start_object = 0;
+
+	state.object_flag = object_flag;
+	state.objects = dscene->objects.resize(OBJECT_SIZE*scene->objects.size());
+	if(state.need_motion == Scene::MOTION_PASS) {
+		state.objects_vector = dscene->objects_vector.resize(OBJECT_VECTOR_SIZE*scene->objects.size());
+	}
+	else {
+		state.objects_vector = NULL;
+	}
+
+	/* Particle system device offsets:
+	 * 0 is the dummy particle, real indices start at 1.
+	 */
+	int numparticles = 1;
+	foreach(ParticleSystem *psys, scene->particle_systems) {
+		state.particle_offset[psys] = numparticles;
+		numparticles += psys->particles.size();
+	}
+
+	/* NOTE: If it's just a handful of objects (fewer than 64) we deal with
+	 * them in a single thread to avoid threading overhead. However, this
+	 * threshold might need some tweaks to make mid-complex scenes optimal.
+	 */
+	if(scene->objects.size() < 64) {
+		int object_index = 0;
+		foreach(Object *ob, scene->objects) {
+			device_update_object_transform(&state, ob, object_index);
+			object_index++;
+			if(progress.get_cancel()) {
+				return;
+			}
+		}
+	}
+	else {
+		const int num_threads = TaskScheduler::num_threads();
+		TaskPool pool;
+		for(int i = 0; i < num_threads; ++i) {
+			pool.push(function_bind(
+			        &ObjectManager::device_update_object_transform_task,
+			        this,
+			        &state));
+		}
+		pool.wait_work();
+		if(progress.get_cancel()) {
+			return;
+		}
 	}
 
 	device->tex_alloc("__objects", dscene->objects);
-	if(need_motion == Scene::MOTION_PASS)
+	if(state.need_motion == Scene::MOTION_PASS) {
 		device->tex_alloc("__objects_vector", dscene->objects_vector);
+	}
 
-	dscene->data.bvh.have_motion = have_motion;
-	dscene->data.bvh.have_curves = have_curves;
+	dscene->data.bvh.have_motion = state.have_motion;
+	dscene->data.bvh.have_curves = state.have_curves;
 	dscene->data.bvh.have_instancing = true;
 }
 
 void ObjectManager::device_update(Device *device, DeviceScene *dscene, Scene *scene, Progress& progress)
 {
-	VLOG(1) << "Total " << scene->objects.size() << " objects.";
-
 	if(!need_update)
 		return;
-	
+
+	VLOG(1) << "Total " << scene->objects.size() << " objects.";
+
 	device_free(device, dscene);
 
 	if(scene->objects.size() == 0)
diff --git a/intern/cycles/render/object.h b/intern/cycles/render/object.h
index 379d1748cdd..c2a79ca8dc4 100644
--- a/intern/cycles/render/object.h
+++ b/intern/cycles/render/object.h
@@ -17,9 +17,12 @@
 #ifndef __OBJECT_H__
 #define __OBJECT_H__
 
+#include "scene.h"
+
 #include "util_boundbox.h"
 #include "util_param.h"
 #include "util_transform.h"
+#include "util_thread.h"
 #include "util_types.h"
 
 CCL_NAMESPACE_BEGIN
@@ -76,7 +79,12 @@ public:
 	~ObjectManager();
 
 	void device_update(Device *device, DeviceScene *dscene, Scene *scene, Progress& progress);
-	void device_update_transforms(Device *device, DeviceScene *dscene, Scene *scene, uint *object_flag, Progress& progress);
+	void device_update_transforms(Device *device,
+	                              DeviceScene *dscene,
+	                              Scene *scene,
+	                              uint *object_flag,
+	                              Progress& progress);
+
 	void device_update_flags(Device *device,
 	                         DeviceScene *dscene,
 	                         Scene *scene,
@@ -87,6 +95,56 @@ public:
 	void tag_update(Scene *scene);
 
 	void apply_static_transforms(DeviceScene *dscene, Scene *scene, uint *object_flag, Progress& progress);
+
+protected:
+	/* Global state of one object transform update, shared by all workers. */
+	struct UpdateObejctTransformState {
+		/* Global state used by device_update_object_transform().
+		 * Common for both threaded and non-threaded update.
+		 */
+
+		/* Type of the motion required by the scene settings. */
+		Scene::MotionType need_motion;
+
+		/* Mapping from particle system to an index in packed particle array.
+		 * Filled in before objects are processed, meant to be read-only after.
+		 */
+		map<ParticleSystem*, int> particle_offset;
+
+		/* Mesh area, keyed by mesh.
+		 * Used to avoid calculating the area of a mesh multiple times. Both read
+		 * and written during the update: acquire surface_area_lock around access.
+		 */
+		map<Mesh*, float> surface_area_map;
+
+		/* Packed object arrays to be filled in; objects_vector may be NULL. */
+		uint *object_flag;
+		float4 *objects;
+		float4 *objects_vector;
+
+		/* Flags which are copied to dscene->data.bvh after the update. */
+		bool have_motion;
+		bool have_curves;
+
+		/* ** Scheduling queue. ** */
+
+		Scene *scene;
+
+		/* Spin locks guarding the queue and surface_area_map respectively. */
+		thread_spin_lock queue_lock;
+		thread_spin_lock surface_area_lock;
+
+		/* Index of the first object not yet handed out by the queue. */
+		int queue_start_object;
+	};
+	void device_update_object_transform(UpdateObejctTransformState *state,
+	                                    Object *ob,
+	                                    const int object_index);
+	void device_update_object_transform_task(UpdateObejctTransformState *state);
+	bool device_update_object_transform_pop_work(
+	        UpdateObejctTransformState *state,
+	        int *start_index,
+	        int *num_objects);
 };
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/render/osl.cpp b/intern/cycles/render/osl.cpp
index e1c5416b024..cb3cb8b9b1b 100644
--- a/intern/cycles/render/osl.cpp
+++ b/intern/cycles/render/osl.cpp
@@ -75,11 +75,11 @@ void OSLShaderManager::reset(Scene * /*scene*/)
 
 void OSLShaderManager::device_update(Device *device, DeviceScene *dscene, Scene *scene, Progress& progress)
 {
-	VLOG(1) << "Total " << scene->shaders.size() << " shaders.";
-
 	if(!need_update)
 		return;
 
+	VLOG(1) << "Total " << scene->shaders.size() << " shaders.";
+
 	device_free(device, dscene, scene);
 
 	/* determine which shaders are in use */
diff --git a/intern/cycles/render/particles.cpp b/intern/cycles/render/particles.cpp
index 8f9e8c6d639..50726bb4574 100644
--- a/intern/cycles/render/particles.cpp
+++ b/intern/cycles/render/particles.cpp
@@ -93,12 +93,12 @@ void ParticleSystemManager::device_update_particles(Device *device, DeviceScene
 
 void ParticleSystemManager::device_update(Device *device, DeviceScene *dscene, Scene *scene, Progress& progress)
 {
+	if(!need_update)
+		return;
+
 	VLOG(1) << "Total " << scene->particle_systems.size()
 	        << " particle systems.";
 
-	if(!need_update)
-		return;
-	
 	device_free(device, dscene);
 
 	progress.set_status("Updating Particle Systems", "Copying Particles to device");
diff --git a/intern/cycles/render/scene.cpp b/intern/cycles/render/scene.cpp
index 29163c53109..b0052c30af4 100644
--- a/intern/cycles/render/scene.cpp
+++ b/intern/cycles/render/scene.cpp
@@ -135,7 +135,9 @@ void Scene::device_update(Device *device_, Progress& progress)
 {
 	if(!device)
 		device = device_;
-	
+
+	bool print_stats = need_data_update();
+
 	/* The order of updates is important, because there's dependencies between
 	 * the different managers, using data computed by previous managers.
 	 *
@@ -239,9 +241,11 @@ void Scene::device_update(Device *device_, Progress& progress)
 		device->const_copy_to("__data", &dscene.data, sizeof(dscene.data));
 	}
 
-	VLOG(1) << "System memory statistics after full device sync:\n"
-	        << "  Usage: " << util_guarded_get_mem_used() << "\n"
-	        << "  Peak: " << util_guarded_get_mem_peak();
+	if(print_stats) {
+		VLOG(1) << "System memory statistics after full device sync:\n"
+		        << "  Usage: " << util_guarded_get_mem_used() << "\n"
+		        << "  Peak: " << util_guarded_get_mem_peak();
+	}
 }
 
 Scene::MotionType Scene::need_motion(bool advanced_shading)
@@ -278,11 +282,10 @@ bool Scene::need_update()
 	return (need_reset() || film->need_update);
 }
 
-bool Scene::need_reset()
+bool Scene::need_data_update()
 {
 	return (background->need_update
 		|| image_manager->need_update
-		|| camera->need_update
 		|| object_manager->need_update
 		|| mesh_manager->need_update
 		|| light_manager->need_update
@@ -295,6 +298,11 @@ bool Scene::need_reset()
 		|| film->need_update);
 }
 
+bool Scene::need_reset()
+{
+	return need_data_update() || camera->need_update;
+}
+
 void Scene::reset()
 {
 	shader_manager->reset(this);
diff --git a/intern/cycles/render/scene.h b/intern/cycles/render/scene.h
index d30a0cb45fe..b29aff88c01 100644
--- a/intern/cycles/render/scene.h
+++ b/intern/cycles/render/scene.h
@@ -213,6 +213,11 @@ public:
 	void device_free();
 
 protected:
+	/* Check if some heavy data worth logging was updated.
+	 * Mainly used to suppress extra annoying logging.
+	 */
+	bool need_data_update();
+
 	void free_memory(bool final);
 };
 
diff --git a/intern/cycles/render/session.cpp b/intern/cycles/render/session.cpp
index 24f48b61349..63037311889 100644
--- a/intern/cycles/render/session.cpp
+++ b/intern/cycles/render/session.cpp
@@ -816,7 +816,7 @@ void Session::update_scene()
 	/* update scene */
 	if(scene->need_update()) {
 		progress.set_status("Updating Scene");
-		scene->device_update(device, progress);
+		MEM_GUARDED_CALL(&progress, scene->device_update, device, progress);
 	}
 }
 
diff --git a/intern/cycles/render/svm.cpp b/intern/cycles/render/svm.cpp
index f3d39c1bd72..56fb57e9667 100644
--- a/intern/cycles/render/svm.cpp
+++ b/intern/cycles/render/svm.cpp
@@ -46,11 +46,11 @@ void SVMShaderManager::reset(Scene * /*scene*/)
 
 void SVMShaderManager::device_update(Device *device, DeviceScene *dscene, Scene *scene, Progress& progress)
 {
-	VLOG(1) << "Total " << scene->shaders.size() << " shaders.";
-
 	if(!need_update)
 		return;
 
+	VLOG(1) << "Total " << scene->shaders.size() << " shaders.";
+
 	/* test if we need to update */
 	device_free(device, dscene, scene);
 
diff --git a/intern/cycles/render/tables.cpp b/intern/cycles/render/tables.cpp
index ad3f4866072..cde024cc11c 100644
--- a/intern/cycles/render/tables.cpp
+++ b/intern/cycles/render/tables.cpp
@@ -37,11 +37,11 @@ LookupTables::~LookupTables()
 
 void LookupTables::device_update(Device *device, DeviceScene *dscene)
 {
-	VLOG(1) << "Total " << lookup_tables.size() << " lookup tables.";
-
 	if(!need_update)
 		return;
 
+	VLOG(1) << "Total " << lookup_tables.size() << " lookup tables.";
+
 	device->tex_free(dscene->lookup_table);
 
 	if(lookup_tables.size() > 0)
diff --git a/intern/cycles/util/util_guarded_allocator.h b/intern/cycles/util/util_guarded_allocator.h
index f6004749a13..78453d214be 100644
--- a/intern/cycles/util/util_guarded_allocator.h
+++ b/intern/cycles/util/util_guarded_allocator.h
@@ -53,19 +53,24 @@ public:
 		size_t size = n * sizeof(T);
 		util_guarded_mem_alloc(size);
 		(void)hint;
-#ifdef WITH_BLENDER_GUARDEDALLOC
 		if(n == 0) {
 			return NULL;
 		}
+		T *mem;
+#ifdef WITH_BLENDER_GUARDEDALLOC
 		/* C++ standard requires allocation functions to allocate memory suitably
 		 * aligned for any standard type. This is 16 bytes for 64 bit platform as
 		 * far as i concerned. We might over-align on 32bit here, but that should
 		 * be all safe actually.
 		 */
-		return (T*)MEM_mallocN_aligned(size, 16, "Cycles Alloc");
+		mem = (T*)MEM_mallocN_aligned(size, 16, "Cycles Alloc");
 #else
-		return (T*)malloc(size);
+		mem = (T*)malloc(size);
 #endif
+		if(mem == NULL) {
+			throw std::bad_alloc();
+		}
+		return mem;
 	}
 
 	void deallocate(T *p, size_t n)
@@ -97,7 +102,9 @@ public:
 
 	void construct(T *p, const T& val)
 	{
-		new ((T *)p) T(val);
+		if(p != NULL) {
+			new ((T *)p) T(val);
+		}
 	}
 
 	void destroy(T *p)
@@ -157,6 +164,26 @@ public:
 size_t util_guarded_get_mem_used(void);
 size_t util_guarded_get_mem_peak(void);
 
+/* Call given function and keep track of whether it runs out of memory.
+ *
+ * If it does run out of memory (std::bad_alloc is thrown), the call is
+ * abandoned, an error is printed to stderr and set_error() is called on
+ * the given progress.
+ * It's not fully robust, but good enough to catch obvious issues
+ * when running out of memory.
+ */
+#define MEM_GUARDED_CALL(progress, func, ...) \
+	do { \
+		try { \
+			(func)(__VA_ARGS__); \
+		} \
+		catch (std::bad_alloc&) { \
+			fprintf(stderr, "Error: run out of memory!\n"); \
+			fflush(stderr); \
+			(progress)->set_error("Out of memory"); \
+		} \
+	} while(false)
+
 CCL_NAMESPACE_END
 
 #endif  /* __UTIL_GUARDED_ALLOCATOR_H__ */
diff --git a/intern/cycles/util/util_stack_allocator.h b/intern/cycles/util/util_stack_allocator.h
index 29260888eef..d7aab5b250c 100644
--- a/intern/cycles/util/util_stack_allocator.h
+++ b/intern/cycles/util/util_stack_allocator.h
@@ -40,14 +40,17 @@ public:
 	/* Allocator construction/destruction. */
 
 	StackAllocator()
-	: pointer_(0) {}
+	: pointer_(0),
+	  use_stack_(true) {}
 
 	StackAllocator(const StackAllocator&)
-	: pointer_(0) {}
+	: pointer_(0),
+	  use_stack_(true) {}
 
 	template <class U>
 	StackAllocator(const StackAllocator<SIZE, U>&)
-	: pointer_(0) {}
+	: pointer_(0),
+	  use_stack_(false) {}
 
 	/* Memory allocation/deallocation. */
 
@@ -57,14 +60,19 @@ public:
 		if(n == 0) {
 			return NULL;
 		}
-		if(pointer_ + n >= SIZE) {
+		if(pointer_ + n >= SIZE || use_stack_ == false) {
 			size_t size = n * sizeof(T);
 			util_guarded_mem_alloc(size);
+			T *mem;
 #ifdef WITH_BLENDER_GUARDEDALLOC
-			return (T*)MEM_mallocN_aligned(size, 16, "Cycles Alloc");
+			mem = (T*)MEM_mallocN_aligned(size, 16, "Cycles Alloc");
 #else
-			return (T*)malloc(size);
+			mem = (T*)malloc(size);
 #endif
+			if(mem == NULL) {
+				throw std::bad_alloc();
+			}
+			return mem;
 		}
 		T *mem = &data_[pointer_];
 		pointer_ += n;
@@ -104,7 +112,9 @@ public:
 
 	void construct(T *p, const T& val)
 	{
-		new ((T *)p) T(val);
+		if(p != NULL) {
+			new ((T *)p) T(val);
+		}
 	}
 
 	void destroy(T *p)
@@ -151,6 +161,7 @@ public:
 
 private:
 	int pointer_;
+	bool use_stack_;
 	T data_[SIZE];
 };
 
diff --git a/intern/cycles/util/util_vector.h b/intern/cycles/util/util_vector.h
index 4eb0dde8308..ad579da2d2e 100644
--- a/intern/cycles/util/util_vector.h
+++ b/intern/cycles/util/util_vector.h
@@ -218,10 +218,16 @@ public:
 protected:
 	inline T* mem_allocate(size_t N)
 	{
+		if(N == 0) {
+			return NULL;
+		}
 		T *mem = (T*)util_aligned_malloc(sizeof(T)*N, alignment);
 		if(mem != NULL) {
 			util_guarded_mem_alloc(sizeof(T)*N);
 		}
+		else {
+			throw std::bad_alloc();
+		}
 		return mem;
 	}