2 files changed, 275 insertions, 117 deletions
diff --git a/intern/cycles/render/object.cpp b/intern/cycles/render/object.cpp
index 42bb665cb9f..c9cd6921a56 100644
--- a/intern/cycles/render/object.cpp
+++ b/intern/cycles/render/object.cpp
@@ -231,150 +231,250 @@ ObjectManager::~ObjectManager()
 {
 }
 
-void ObjectManager::device_update_transforms(Device *device, DeviceScene *dscene, Scene *scene, uint *object_flag, Progress& progress)
+void ObjectManager::device_update_object_transform(UpdateObejctTransformState *state,
+                                                   Object *ob,
+                                                   int object_index)
 {
-	float4 *objects;
-	float4 *objects_vector = NULL;
-	int i = 0;
-	map<Mesh*, float> surface_area_map;
-	map<ParticleSystem*, int> particle_offset;
-	Scene::MotionType need_motion = scene->need_motion(device->info.advanced_shading);
-	bool have_motion = false;
-	bool have_curves = false;
-
-	objects = dscene->objects.resize(OBJECT_SIZE*scene->objects.size());
-	if(need_motion == Scene::MOTION_PASS)
-		objects_vector = dscene->objects_vector.resize(OBJECT_VECTOR_SIZE*scene->objects.size());
-
-	/* particle system device offsets
-	 * 0 is dummy particle, index starts at 1
+	float4 *objects = state->objects;
+	float4 *objects_vector = state->objects_vector;
+
+	Mesh *mesh = ob->mesh;
+	uint flag = 0;
+
+	/* Compute transformations. */
+	Transform tfm = ob->tfm;
+	Transform itfm = transform_inverse(tfm);
+
+	/* Compute surface area. For uniform scale we can avoid the many
+	 * transform calls and share computation for instances.
+	 *
+	 * TODO(brecht): Correct for displacement, and move to a better place.
 	 */
-	int numparticles = 1;
-	foreach(ParticleSystem *psys, scene->particle_systems) {
-		particle_offset[psys] = numparticles;
-		numparticles += psys->particles.size();
-	}
+	float uniform_scale;
+	float surface_area = 0.0f;
+	float pass_id = ob->pass_id;
+	float random_number = (float)ob->random_id * (1.0f/(float)0xFFFFFFFF);
+	int particle_index = (ob->particle_system)
+	        ? ob->particle_index + state->particle_offset[ob->particle_system]
+	        : 0;
+
+	if(transform_uniform_scale(tfm, uniform_scale)) {
+		map<Mesh*, float>::iterator it;
+
+		/* NOTE: This isn't fully optimal and could in theory lead to multiple
+		 * threads calculating area of the same mesh in parallel. However, this
+		 * also prevents suspending all the threads when some mesh's area is
+		 * not yet known.
+		 */
+		state->surface_area_lock.lock();
+		it = state->surface_area_map.find(mesh);
+		state->surface_area_lock.unlock();
 
-	foreach(Object *ob, scene->objects) {
-		Mesh *mesh = ob->mesh;
-		uint flag = 0;
-
-		/* compute transformations */
-		Transform tfm = ob->tfm;
-		Transform itfm = transform_inverse(tfm);
-
-		/* compute surface area. for uniform scale we can do avoid the many
-		 * transform calls and share computation for instances */
-		/* todo: correct for displacement, and move to a better place */
-		float uniform_scale;
-		float surface_area = 0.0f;
-		float pass_id = ob->pass_id;
-		float random_number = (float)ob->random_id * (1.0f/(float)0xFFFFFFFF);
-		int particle_index = (ob->particle_system)? ob->particle_index + particle_offset[ob->particle_system]: 0;
-
-		if(transform_uniform_scale(tfm, uniform_scale)) {
-			map<Mesh*, float>::iterator it = surface_area_map.find(mesh);
-
-			if(it == surface_area_map.end()) {
-				foreach(Mesh::Triangle& t, mesh->triangles) {
-					float3 p1 = mesh->verts[t.v[0]];
-					float3 p2 = mesh->verts[t.v[1]];
-					float3 p3 = mesh->verts[t.v[2]];
-
-					surface_area += triangle_area(p1, p2, p3);
-				}
+		if(it == state->surface_area_map.end()) {
+			foreach(Mesh::Triangle& t, mesh->triangles) {
+				float3 p1 = mesh->verts[t.v[0]];
+				float3 p2 = mesh->verts[t.v[1]];
+				float3 p3 = mesh->verts[t.v[2]];
 
-				surface_area_map[mesh] = surface_area;
+				surface_area += triangle_area(p1, p2, p3);
 			}
-			else
-				surface_area = it->second;
 
-			surface_area *= uniform_scale;
+			state->surface_area_lock.lock();
+			state->surface_area_map[mesh] = surface_area;
+			state->surface_area_lock.unlock();
 		}
 		else {
-			foreach(Mesh::Triangle& t, mesh->triangles) {
-				float3 p1 = transform_point(&tfm, mesh->verts[t.v[0]]);
-				float3 p2 = transform_point(&tfm, mesh->verts[t.v[1]]);
-				float3 p3 = transform_point(&tfm, mesh->verts[t.v[2]]);
+			surface_area = it->second;
+		}
 
-				surface_area += triangle_area(p1, p2, p3);
-			}
+		surface_area *= uniform_scale;
+	}
+	else {
+		foreach(Mesh::Triangle& t, mesh->triangles) {
+			float3 p1 = transform_point(&tfm, mesh->verts[t.v[0]]);
+			float3 p2 = transform_point(&tfm, mesh->verts[t.v[1]]);
+			float3 p3 = transform_point(&tfm, mesh->verts[t.v[2]]);
+
+			surface_area += triangle_area(p1, p2, p3);
 		}
+	}
 
-		/* pack in texture */
-		int offset = i*OBJECT_SIZE;
-
-		/* OBJECT_TRANSFORM */
-		memcpy(&objects[offset], &tfm, sizeof(float4)*3);
-		/* OBJECT_INVERSE_TRANSFORM */
-		memcpy(&objects[offset+4], &itfm, sizeof(float4)*3);
-		/* OBJECT_PROPERTIES */
-		objects[offset+8] = make_float4(surface_area, pass_id, random_number, __int_as_float(particle_index));
-
-		if(need_motion == Scene::MOTION_PASS) {
-			/* motion transformations, is world/object space depending if mesh
-			 * comes with deformed position in object space, or if we transform
-			 * the shading point in world space */
-			Transform mtfm_pre = ob->motion.pre;
-			Transform mtfm_post = ob->motion.post;
-
-			if(!mesh->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION)) {
-				mtfm_pre = mtfm_pre * itfm;
-				mtfm_post = mtfm_post * itfm;
-			}
-			else {
-				flag |= SD_OBJECT_HAS_VERTEX_MOTION;
-			}
+	/* Pack in texture. */
+	int offset = object_index*OBJECT_SIZE;
+
+	/* OBJECT_TRANSFORM */
+	memcpy(&objects[offset], &tfm, sizeof(float4)*3);
+	/* OBJECT_INVERSE_TRANSFORM */
+	memcpy(&objects[offset+4], &itfm, sizeof(float4)*3);
+	/* OBJECT_PROPERTIES */
+	objects[offset+8] = make_float4(surface_area, pass_id, random_number, __int_as_float(particle_index));
+
+	if(state->need_motion == Scene::MOTION_PASS) {
+		/* Motion transformations, is world/object space depending if mesh
+		 * comes with deformed position in object space, or if we transform
+		 * the shading point in world space.
+		 */
+		Transform mtfm_pre = ob->motion.pre;
+		Transform mtfm_post = ob->motion.post;
 
-			memcpy(&objects_vector[i*OBJECT_VECTOR_SIZE+0], &mtfm_pre, sizeof(float4)*3);
-			memcpy(&objects_vector[i*OBJECT_VECTOR_SIZE+3], &mtfm_post, sizeof(float4)*3);
+		if(!mesh->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION)) {
+			mtfm_pre = mtfm_pre * itfm;
+			mtfm_post = mtfm_post * itfm;
+		}
+		else {
+			flag |= SD_OBJECT_HAS_VERTEX_MOTION;
 		}
+
+		memcpy(&objects_vector[object_index*OBJECT_VECTOR_SIZE+0], &mtfm_pre, sizeof(float4)*3);
+		memcpy(&objects_vector[object_index*OBJECT_VECTOR_SIZE+3], &mtfm_post, sizeof(float4)*3);
+	}
 #ifdef __OBJECT_MOTION__
-		else if(need_motion == Scene::MOTION_BLUR) {
-			if(ob->use_motion) {
-				/* decompose transformations for interpolation */
-				DecompMotionTransform decomp;
-
-				transform_motion_decompose(&decomp, &ob->motion, &ob->tfm);
-				memcpy(&objects[offset], &decomp, sizeof(float4)*8);
-				flag |= SD_OBJECT_MOTION;
-				have_motion = true;
-			}
+	else if(state->need_motion == Scene::MOTION_BLUR) {
+		if(ob->use_motion) {
+			/* Decompose transformations for interpolation. */
+			DecompMotionTransform decomp;
+
+			transform_motion_decompose(&decomp, &ob->motion, &ob->tfm);
+			memcpy(&objects[offset], &decomp, sizeof(float4)*8);
+			flag |= SD_OBJECT_MOTION;
+			state->have_motion = true;
 		}
+	}
 #endif
 
-		if(mesh->use_motion_blur)
-			have_motion = true;
+	if(mesh->use_motion_blur) {
+		state->have_motion = true;
+	}
 
-		/* dupli object coords and motion info */
-		int totalsteps = mesh->motion_steps;
-		int numsteps = (totalsteps - 1)/2;
-		int numverts = mesh->verts.size();
-		int numkeys = mesh->curve_keys.size();
+	/* Dupli object coords and motion info. */
+	int totalsteps = mesh->motion_steps;
+	int numsteps = (totalsteps - 1)/2;
+	int numverts = mesh->verts.size();
+	int numkeys = mesh->curve_keys.size();
 
-		objects[offset+9] = make_float4(ob->dupli_generated[0], ob->dupli_generated[1], ob->dupli_generated[2], __int_as_float(numkeys));
-		objects[offset+10] = make_float4(ob->dupli_uv[0], ob->dupli_uv[1], __int_as_float(numsteps), __int_as_float(numverts));
+	objects[offset+9] = make_float4(ob->dupli_generated[0], ob->dupli_generated[1], ob->dupli_generated[2], __int_as_float(numkeys));
+	objects[offset+10] = make_float4(ob->dupli_uv[0], ob->dupli_uv[1], __int_as_float(numsteps), __int_as_float(numverts));
 
-		/* object flag */
-		if(ob->use_holdout)
-			flag |= SD_HOLDOUT_MASK;
-		object_flag[i] = flag;
+	/* Object flag. */
+	if(ob->use_holdout) {
+		flag |= SD_HOLDOUT_MASK;
+	}
+	state->object_flag[object_index] = flag;
 
-		/* have curves */
-		if(mesh->curves.size())
-			have_curves = true;
+	/* Have curves. */
+	if(mesh->curves.size()) {
+		state->have_curves = true;
+	}
+}
 
-		i++;
+bool ObjectManager::device_update_object_transform_pop_work(
+        UpdateObejctTransformState *state,
+        int *start_index,
+        int *num_objects)
+{
+	/* Tweakable parameter, number of objects per chunk.
+	 * Too small a value causes extra overhead due to the spin lock,
+	 * too big a value might not use all threads nicely.
+	 */
+	static const int OBJECTS_PER_TASK = 32;
+	bool have_work = false;
+	state->queue_lock.lock();
+	int num_scene_objects = state->scene->objects.size();
+	if(state->queue_start_object < num_scene_objects) {
+		int count = min(OBJECTS_PER_TASK,
+		                num_scene_objects - state->queue_start_object);
+		*start_index = state->queue_start_object;
+		*num_objects = count;
+		state->queue_start_object += count;
+		have_work = true;
+	}
+	state->queue_lock.unlock();
+	return have_work;
+}
+
+void ObjectManager::device_update_object_transform_task(
+        UpdateObejctTransformState *state)
+{
+	int start_index, num_objects;
+	while(device_update_object_transform_pop_work(state,
+	                                              &start_index,
+	                                              &num_objects))
+	{
+		for(int i = 0; i < num_objects; ++i) {
+			const int object_index = start_index + i;
+			Object *ob = state->scene->objects[object_index];
+			device_update_object_transform(state, ob, object_index);
+		}
+	}
+}
+
+void ObjectManager::device_update_transforms(Device *device,
+                                             DeviceScene *dscene,
+                                             Scene *scene,
+                                             uint *object_flag,
+                                             Progress& progress)
+{
+	UpdateObejctTransformState state;
+	state.need_motion = scene->need_motion(device->info.advanced_shading);
+	state.have_motion = false;
+	state.have_curves = false;
+	state.scene = scene;
+	state.queue_start_object = 0;
+
+	state.object_flag = object_flag;
+	state.objects = dscene->objects.resize(OBJECT_SIZE*scene->objects.size());
+	if(state.need_motion == Scene::MOTION_PASS) {
+		state.objects_vector = dscene->objects_vector.resize(OBJECT_VECTOR_SIZE*scene->objects.size());
+	}
+	else {
+		state.objects_vector = NULL;
+	}
+
+	/* Particle system device offsets
+	 * 0 is dummy particle, index starts at 1.
+	 */
+	int numparticles = 1;
+	foreach(ParticleSystem *psys, scene->particle_systems) {
+		state.particle_offset[psys] = numparticles;
+		numparticles += psys->particles.size();
+	}
 
-		if(progress.get_cancel()) return;
+	/* NOTE: If it's just a handful of objects we deal with them in a single
+	 * thread to avoid threading overhead. However, this threshold might
+	 * need some tweaks to make mid-complex scenes optimal.
+	 */
+	if(scene->objects.size() < 64) {
+		int object_index = 0;
+		foreach(Object *ob, scene->objects) {
+			device_update_object_transform(&state, ob, object_index);
+			object_index++;
+			if(progress.get_cancel()) {
+				return;
+			}
+		}
+	}
+	else {
+		const int num_threads = TaskScheduler::num_threads();
+		TaskPool pool;
+		for(int i = 0; i < num_threads; ++i) {
+			pool.push(function_bind(
+			        &ObjectManager::device_update_object_transform_task,
+			        this,
+			        &state));
+		}
+		pool.wait_work();
+		if(progress.get_cancel()) {
+			return;
+		}
 	}
 
 	device->tex_alloc("__objects", dscene->objects);
-	if(need_motion == Scene::MOTION_PASS)
+	if(state.need_motion == Scene::MOTION_PASS) {
 		device->tex_alloc("__objects_vector", dscene->objects_vector);
+	}
 
-	dscene->data.bvh.have_motion = have_motion;
-	dscene->data.bvh.have_curves = have_curves;
+	dscene->data.bvh.have_motion = state.have_motion;
+	dscene->data.bvh.have_curves = state.have_curves;
 	dscene->data.bvh.have_instancing = true;
 }
 
diff --git a/intern/cycles/render/object.h b/intern/cycles/render/object.h
index 379d1748cdd..c2a79ca8dc4 100644
--- a/intern/cycles/render/object.h
+++ b/intern/cycles/render/object.h
@@ -17,9 +17,12 @@
 #ifndef __OBJECT_H__
 #define __OBJECT_H__
 
+#include "scene.h"
+
 #include "util_boundbox.h"
 #include "util_param.h"
 #include "util_transform.h"
+#include "util_thread.h"
 #include "util_types.h"
 
 CCL_NAMESPACE_BEGIN
@@ -76,7 +79,12 @@ public:
 	~ObjectManager();
 
 	void device_update(Device *device, DeviceScene *dscene, Scene *scene, Progress& progress);
-	void device_update_transforms(Device *device, DeviceScene *dscene, Scene *scene, uint *object_flag, Progress& progress);
+	void device_update_transforms(Device *device,
+	                              DeviceScene *dscene,
+	                              Scene *scene,
+	                              uint *object_flag,
+	                              Progress& progress);
+
 	void device_update_flags(Device *device,
 	                         DeviceScene *dscene,
 	                         Scene *scene,
@@ -87,6 +95,56 @@ public:
 	void tag_update(Scene *scene);
 
 	void apply_static_transforms(DeviceScene *dscene, Scene *scene, uint *object_flag, Progress& progress);
+
+protected:
+	/* Global state of object transform update. */
+	struct UpdateObejctTransformState {
+		/* Global state used by device_update_object_transform().
+		 * Common for both threaded and non-threaded update.
+		 */
+
+		/* Type of the motion required by the scene settings. */
+		Scene::MotionType need_motion;
+
+		/* Mapping from particle system to an index in packed particle array.
+		 * Only used for read.
+		 */
+		map<ParticleSystem*, int> particle_offset;
+
+		/* Mesh area.
+		 * Used to avoid calculation of mesh area multiple times. Used for both
+		 * read and write. Acquire surface_area_lock to keep it all thread safe.
+		 */
+		map<Mesh*, float> surface_area_map;
+
+		/* Packed object arrays. Those will be filled in. */
+		uint *object_flag;
+		float4 *objects;
+		float4 *objects_vector;
+
+		/* Flags which will be copied to dscene->data.bvh after the update. */
+		bool have_motion;
+		bool have_curves;
+
+		/* ** Scheduling queue. ** */
+
+		Scene *scene;
+
+		/* Some locks to keep everything thread-safe. */
+		thread_spin_lock queue_lock;
+		thread_spin_lock surface_area_lock;
+
+		/* First unused object index in the queue. */
+		int queue_start_object;
+	};
+	void device_update_object_transform(UpdateObejctTransformState *state,
+	                                    Object *ob,
+	                                    const int object_index);
+	void device_update_object_transform_task(UpdateObejctTransformState *state);
+	bool device_update_object_transform_pop_work(
+	        UpdateObejctTransformState *state,
+	        int *start_index,
+	        int *num_objects);
 };
 
 CCL_NAMESPACE_END