Merge remote-tracking branch 'origin/master' into blender2.8

author: Dalai Felinto <dfelinto@gmail.com> 2016-11-16 19:01:19 +0300
committer: Dalai Felinto <dfelinto@gmail.com> 2016-11-16 19:04:21 +0300
commit: 2bcb1b208a4193fb28f1e0c4408b733f5ee2c028 (patch)
tree: 8b9260ffd9fb7b371dc66955903c7b0c7f4e7bf9 /intern
parent: 930f999f6ea683d02ac490026a52817f1d965377 (diff)
parent: 2a2eb0c463bd96d42f7306eb17f88cad87f73aea (diff)
20 files changed, 265 insertions, 366 deletions
diff --git a/intern/atomic/atomic_ops.h b/intern/atomic/atomic_ops.h
index f78eab7951f..1107deddf94 100644
--- a/intern/atomic/atomic_ops.h
+++ b/intern/atomic/atomic_ops.h
@@ -77,13 +77,15 @@
 /* Function prototypes. */
 
 #if (LG_SIZEOF_PTR == 8 || LG_SIZEOF_INT == 8)
-ATOMIC_INLINE uint64_t atomic_add_uint64(uint64_t *p, uint64_t x);
-ATOMIC_INLINE uint64_t atomic_sub_uint64(uint64_t *p, uint64_t x);
+ATOMIC_INLINE uint64_t atomic_add_and_fetch_uint64(uint64_t *p, uint64_t x);
+ATOMIC_INLINE uint64_t atomic_sub_and_fetch_uint64(uint64_t *p, uint64_t x);
+ATOMIC_INLINE uint64_t atomic_fetch_and_add_uint64(uint64_t *p, uint64_t x);
+ATOMIC_INLINE uint64_t atomic_fetch_and_sub_uint64(uint64_t *p, uint64_t x);
 ATOMIC_INLINE uint64_t atomic_cas_uint64(uint64_t *v, uint64_t old, uint64_t _new);
 #endif
 
-ATOMIC_INLINE uint32_t atomic_add_uint32(uint32_t *p, uint32_t x);
-ATOMIC_INLINE uint32_t atomic_sub_uint32(uint32_t *p, uint32_t x);
+ATOMIC_INLINE uint32_t atomic_add_and_fetch_uint32(uint32_t *p, uint32_t x);
+ATOMIC_INLINE uint32_t atomic_sub_and_fetch_uint32(uint32_t *p, uint32_t x);
 ATOMIC_INLINE uint32_t atomic_cas_uint32(uint32_t *v, uint32_t old, uint32_t _new);
 
 ATOMIC_INLINE uint32_t atomic_fetch_and_add_uint32(uint32_t *p, uint32_t x);
@@ -93,18 +95,22 @@ ATOMIC_INLINE uint32_t atomic_fetch_and_and_uint32(uint32_t *p, uint32_t x);
 ATOMIC_INLINE uint8_t atomic_fetch_and_or_uint8(uint8_t *p, uint8_t b);
 ATOMIC_INLINE uint8_t atomic_fetch_and_and_uint8(uint8_t *p, uint8_t b);
 
-ATOMIC_INLINE size_t atomic_add_z(size_t *p, size_t x);
-ATOMIC_INLINE size_t atomic_sub_z(size_t *p, size_t x);
+ATOMIC_INLINE size_t atomic_add_and_fetch_z(size_t *p, size_t x);
+ATOMIC_INLINE size_t atomic_sub_and_fetch_z(size_t *p, size_t x);
+ATOMIC_INLINE size_t atomic_fetch_and_add_z(size_t *p, size_t x);
+ATOMIC_INLINE size_t atomic_fetch_and_sub_z(size_t *p, size_t x);
 ATOMIC_INLINE size_t atomic_cas_z(size_t *v, size_t old, size_t _new);
 
-ATOMIC_INLINE unsigned atomic_add_u(unsigned *p, unsigned x);
-ATOMIC_INLINE unsigned atomic_sub_u(unsigned *p, unsigned x);
+ATOMIC_INLINE unsigned atomic_add_and_fetch_u(unsigned *p, unsigned x);
+ATOMIC_INLINE unsigned atomic_sub_and_fetch_u(unsigned *p, unsigned x);
+ATOMIC_INLINE unsigned atomic_fetch_and_add_u(unsigned *p, unsigned x);
+ATOMIC_INLINE unsigned atomic_fetch_and_sub_u(unsigned *p, unsigned x);
 ATOMIC_INLINE unsigned atomic_cas_u(unsigned *v, unsigned old, unsigned _new);
 
 /* WARNING! Float 'atomics' are really faked ones, those are actually closer to some kind of spinlock-sync'ed operation,
  *          which means they are only efficient if collisions are highly unlikely (i.e. if probability of two threads
  *          working on the same pointer at the same time is very low). */
-ATOMIC_INLINE float atomic_add_fl(float *p, const float x);
+ATOMIC_INLINE float atomic_add_and_fetch_fl(float *p, const float x);
 
 /******************************************************************************/
 /* Include system-dependent implementations. */
diff --git a/intern/atomic/intern/atomic_ops_ext.h b/intern/atomic/intern/atomic_ops_ext.h
index 4065299d2ea..8421aa72192 100644
--- a/intern/atomic/intern/atomic_ops_ext.h
+++ b/intern/atomic/intern/atomic_ops_ext.h
@@ -56,25 +56,47 @@
 
 /******************************************************************************/
 /* size_t operations. */
-ATOMIC_INLINE size_t atomic_add_z(size_t *p, size_t x)
+ATOMIC_INLINE size_t atomic_add_and_fetch_z(size_t *p, size_t x)
 {
 	assert(sizeof(size_t) == LG_SIZEOF_PTR);
 
 #if (LG_SIZEOF_PTR == 8)
-	return (size_t)atomic_add_uint64((uint64_t *)p, (uint64_t)x);
+	return (size_t)atomic_add_and_fetch_uint64((uint64_t *)p, (uint64_t)x);
 #elif (LG_SIZEOF_PTR == 4)
-	return (size_t)atomic_add_uint32((uint32_t *)p, (uint32_t)x);
+	return (size_t)atomic_add_and_fetch_uint32((uint32_t *)p, (uint32_t)x);
 #endif
 }
 
-ATOMIC_INLINE size_t atomic_sub_z(size_t *p, size_t x)
+ATOMIC_INLINE size_t atomic_sub_and_fetch_z(size_t *p, size_t x)
 {
 	assert(sizeof(size_t) == LG_SIZEOF_PTR);
 
 #if (LG_SIZEOF_PTR == 8)
-	return (size_t)atomic_add_uint64((uint64_t *)p, (uint64_t)-((int64_t)x));
+	return (size_t)atomic_add_and_fetch_uint64((uint64_t *)p, (uint64_t)-((int64_t)x));
 #elif (LG_SIZEOF_PTR == 4)
-	return (size_t)atomic_add_uint32((uint32_t *)p, (uint32_t)-((int32_t)x));
+	return (size_t)atomic_add_and_fetch_uint32((uint32_t *)p, (uint32_t)-((int32_t)x));
+#endif
+}
+
+ATOMIC_INLINE size_t atomic_fetch_and_add_z(size_t *p, size_t x)
+{
+	assert(sizeof(size_t) == LG_SIZEOF_PTR);
+
+#if (LG_SIZEOF_PTR == 8)
+	return (size_t)atomic_fetch_and_add_uint64((uint64_t *)p, (uint64_t)x);
+#elif (LG_SIZEOF_PTR == 4)
+	return (size_t)atomic_fetch_and_add_uint32((uint32_t *)p, (uint32_t)x);
+#endif
+}
+
+ATOMIC_INLINE size_t atomic_fetch_and_sub_z(size_t *p, size_t x)
+{
+	assert(sizeof(size_t) == LG_SIZEOF_PTR);
+
+#if (LG_SIZEOF_PTR == 8)
+	return (size_t)atomic_fetch_and_add_uint64((uint64_t *)p, (uint64_t)-((int64_t)x));
+#elif (LG_SIZEOF_PTR == 4)
+	return (size_t)atomic_fetch_and_add_uint32((uint32_t *)p, (uint32_t)-((int32_t)x));
 #endif
 }
 
@@ -91,25 +113,47 @@ ATOMIC_INLINE size_t atomic_cas_z(size_t *v, size_t old, size_t _new)
 
 /******************************************************************************/
 /* unsigned operations. */
-ATOMIC_INLINE unsigned atomic_add_u(unsigned *p, unsigned x)
+ATOMIC_INLINE unsigned atomic_add_and_fetch_u(unsigned *p, unsigned x)
+{
+	assert(sizeof(unsigned) == LG_SIZEOF_INT);
+
+#if (LG_SIZEOF_INT == 8)
+	return (unsigned)atomic_add_and_fetch_uint64((uint64_t *)p, (uint64_t)x);
+#elif (LG_SIZEOF_INT == 4)
+	return (unsigned)atomic_add_and_fetch_uint32((uint32_t *)p, (uint32_t)x);
+#endif
+}
+
+ATOMIC_INLINE unsigned atomic_sub_and_fetch_u(unsigned *p, unsigned x)
+{
+	assert(sizeof(unsigned) == LG_SIZEOF_INT);
+
+#if (LG_SIZEOF_INT == 8)
+	return (unsigned)atomic_add_and_fetch_uint64((uint64_t *)p, (uint64_t)-((int64_t)x));
+#elif (LG_SIZEOF_INT == 4)
+	return (unsigned)atomic_add_and_fetch_uint32((uint32_t *)p, (uint32_t)-((int32_t)x));
+#endif
+}
+
+ATOMIC_INLINE unsigned atomic_fetch_and_add_u(unsigned *p, unsigned x)
 {
 	assert(sizeof(unsigned) == LG_SIZEOF_INT);
 
 #if (LG_SIZEOF_INT == 8)
-	return (unsigned)atomic_add_uint64((uint64_t *)p, (uint64_t)x);
+	return (unsigned)atomic_fetch_and_add_uint64((uint64_t *)p, (uint64_t)x);
 #elif (LG_SIZEOF_INT == 4)
-	return (unsigned)atomic_add_uint32((uint32_t *)p, (uint32_t)x);
+	return (unsigned)atomic_fetch_and_add_uint32((uint32_t *)p, (uint32_t)x);
 #endif
 }
 
-ATOMIC_INLINE unsigned atomic_sub_u(unsigned *p, unsigned x)
+ATOMIC_INLINE unsigned atomic_fetch_and_sub_u(unsigned *p, unsigned x)
 {
 	assert(sizeof(unsigned) == LG_SIZEOF_INT);
 
 #if (LG_SIZEOF_INT == 8)
-	return (unsigned)atomic_add_uint64((uint64_t *)p, (uint64_t)-((int64_t)x));
+	return (unsigned)atomic_fetch_and_add_uint64((uint64_t *)p, (uint64_t)-((int64_t)x));
 #elif (LG_SIZEOF_INT == 4)
-	return (unsigned)atomic_add_uint32((uint32_t *)p, (uint32_t)-((int32_t)x));
+	return (unsigned)atomic_fetch_and_add_uint32((uint32_t *)p, (uint32_t)-((int32_t)x));
 #endif
 }
 
@@ -127,7 +171,7 @@ ATOMIC_INLINE unsigned atomic_cas_u(unsigned *v, unsigned old, unsigned _new)
 /******************************************************************************/
 /* float operations. */
 
-ATOMIC_INLINE float atomic_add_fl(float *p, const float x)
+ATOMIC_INLINE float atomic_add_and_fetch_fl(float *p, const float x)
 {
 	assert(sizeof(float) == sizeof(uint32_t));
 
diff --git a/intern/atomic/intern/atomic_ops_msvc.h b/intern/atomic/intern/atomic_ops_msvc.h
index 3461719a4e7..034ac1e3e53 100644
--- a/intern/atomic/intern/atomic_ops_msvc.h
+++ b/intern/atomic/intern/atomic_ops_msvc.h
@@ -43,12 +43,12 @@
 /******************************************************************************/
 /* 64-bit operations. */
 #if (LG_SIZEOF_PTR == 8 || LG_SIZEOF_INT == 8)
-ATOMIC_INLINE uint64_t atomic_add_uint64(uint64_t *p, uint64_t x)
+ATOMIC_INLINE uint64_t atomic_add_and_fetch_uint64(uint64_t *p, uint64_t x)
 {
 	return InterlockedExchangeAdd64((int64_t *)p, (int64_t)x) + x;
 }
 
-ATOMIC_INLINE uint64_t atomic_sub_uint64(uint64_t *p, uint64_t x)
+ATOMIC_INLINE uint64_t atomic_sub_and_fetch_uint64(uint64_t *p, uint64_t x)
 {
 	return InterlockedExchangeAdd64((int64_t *)p, -((int64_t)x)) - x;
 }
@@ -57,16 +57,26 @@ ATOMIC_INLINE uint64_t atomic_cas_uint64(uint64_t *v, uint64_t old, uint64_t _ne
 {
 	return InterlockedCompareExchange64((int64_t *)v, _new, old);
 }
+
+ATOMIC_INLINE uint64_t atomic_fetch_and_add_uint64(uint64_t *p, uint64_t x)
+{
+	return InterlockedExchangeAdd64((int64_t *)p, (int64_t)x);
+}
+
+ATOMIC_INLINE uint64_t atomic_fetch_and_sub_uint64(uint64_t *p, uint64_t x)
+{
+	return InterlockedExchangeAdd64((int64_t *)p, -((int64_t)x));
+}
 #endif
 
 /******************************************************************************/
 /* 32-bit operations. */
-ATOMIC_INLINE uint32_t atomic_add_uint32(uint32_t *p, uint32_t x)
+ATOMIC_INLINE uint32_t atomic_add_and_fetch_uint32(uint32_t *p, uint32_t x)
 {
 	return InterlockedExchangeAdd(p, x) + x;
 }
 
-ATOMIC_INLINE uint32_t atomic_sub_uint32(uint32_t *p, uint32_t x)
+ATOMIC_INLINE uint32_t atomic_sub_and_fetch_uint32(uint32_t *p, uint32_t x)
 {
 	return InterlockedExchangeAdd(p, -((int32_t)x)) - x;
 }
diff --git a/intern/atomic/intern/atomic_ops_unix.h b/intern/atomic/intern/atomic_ops_unix.h
index e63f09c76c5..0a3322ad2b1 100644
--- a/intern/atomic/intern/atomic_ops_unix.h
+++ b/intern/atomic/intern/atomic_ops_unix.h
@@ -58,22 +58,32 @@
 /* 64-bit operations. */
 #if (LG_SIZEOF_PTR == 8 || LG_SIZEOF_INT == 8)
 #  if (defined(__GCC_HAVE_SYNC_COMPARE_AND_SWAP_8) || defined(JE_FORCE_SYNC_COMPARE_AND_SWAP_8))
-ATOMIC_INLINE uint64_t atomic_add_uint64(uint64_t *p, uint64_t x)
+ATOMIC_INLINE uint64_t atomic_add_and_fetch_uint64(uint64_t *p, uint64_t x)
 {
 	return __sync_add_and_fetch(p, x);
 }
 
-ATOMIC_INLINE uint64_t atomic_sub_uint64(uint64_t *p, uint64_t x)
+ATOMIC_INLINE uint64_t atomic_sub_and_fetch_uint64(uint64_t *p, uint64_t x)
 {
 	return __sync_sub_and_fetch(p, x);
 }
 
+ATOMIC_INLINE uint64_t atomic_fetch_and_add_uint64(uint64_t *p, uint64_t x)
+{
+	return __sync_fetch_and_add(p, x);
+}
+
+ATOMIC_INLINE uint64_t atomic_fetch_and_sub_uint64(uint64_t *p, uint64_t x)
+{
+	return __sync_fetch_and_sub(p, x);
+}
+
 ATOMIC_INLINE uint64_t atomic_cas_uint64(uint64_t *v, uint64_t old, uint64_t _new)
 {
 	return __sync_val_compare_and_swap(v, old, _new);
 }
 #  elif (defined(__amd64__) || defined(__x86_64__))
-ATOMIC_INLINE uint64_t atomic_add_uint64(uint64_t *p, uint64_t x)
+ATOMIC_INLINE uint64_t atomic_fetch_and_add_uint64(uint64_t *p, uint64_t x)
 {
 	asm volatile (
 	    "lock; xaddq %0, %1;"
@@ -83,7 +93,7 @@ ATOMIC_INLINE uint64_t atomic_add_uint64(uint64_t *p, uint64_t x)
 	return x;
 }
 
-ATOMIC_INLINE uint64_t atomic_sub_uint64(uint64_t *p, uint64_t x)
+ATOMIC_INLINE uint64_t atomic_fetch_and_sub_uint64(uint64_t *p, uint64_t x)
 {
 	x = (uint64_t)(-(int64_t)x);
 	asm volatile (
@@ -94,6 +104,16 @@ ATOMIC_INLINE uint64_t atomic_sub_uint64(uint64_t *p, uint64_t x)
 	return x;
 }
 
+ATOMIC_INLINE uint64_t atomic_add_and_fetch_uint64(uint64_t *p, uint64_t x)
+{
+	return atomic_fetch_and_add_uint64(p, x) + x;
+}
+
+ATOMIC_INLINE uint64_t atomic_sub_and_fetch_uint64(uint64_t *p, uint64_t x)
+{
+	return atomic_fetch_and_sub_uint64(p, x) - x;
+}
+
 ATOMIC_INLINE uint64_t atomic_cas_uint64(uint64_t *v, uint64_t old, uint64_t _new)
 {
 	uint64_t ret;
@@ -112,12 +132,12 @@ ATOMIC_INLINE uint64_t atomic_cas_uint64(uint64_t *v, uint64_t old, uint64_t _ne
 /******************************************************************************/
 /* 32-bit operations. */
 #if (defined(__GCC_HAVE_SYNC_COMPARE_AND_SWAP_4) || defined(JE_FORCE_SYNC_COMPARE_AND_SWAP_4))
-ATOMIC_INLINE uint32_t atomic_add_uint32(uint32_t *p, uint32_t x)
+ATOMIC_INLINE uint32_t atomic_add_and_fetch_uint32(uint32_t *p, uint32_t x)
 {
 	return __sync_add_and_fetch(p, x);
 }
 
-ATOMIC_INLINE uint32_t atomic_sub_uint32(uint32_t *p, uint32_t x)
+ATOMIC_INLINE uint32_t atomic_sub_and_fetch_uint32(uint32_t *p, uint32_t x)
 {
 	return __sync_sub_and_fetch(p, x);
 }
@@ -127,7 +147,7 @@ ATOMIC_INLINE uint32_t atomic_cas_uint32(uint32_t *v, uint32_t old, uint32_t _ne
    return __sync_val_compare_and_swap(v, old, _new);
 }
 #elif (defined(__i386__) || defined(__amd64__) || defined(__x86_64__))
-ATOMIC_INLINE uint32_t atomic_add_uint32(uint32_t *p, uint32_t x)
+ATOMIC_INLINE uint32_t atomic_add_and_fetch_uint32(uint32_t *p, uint32_t x)
 {
 	uint32_t ret = x;
 	asm volatile (
@@ -138,7 +158,7 @@ ATOMIC_INLINE uint32_t atomic_add_uint32(uint32_t *p, uint32_t x)
 	return ret+x;
 }
 
-ATOMIC_INLINE uint32_t atomic_sub_uint32(uint32_t *p, uint32_t x)
+ATOMIC_INLINE uint32_t atomic_sub_and_fetch_uint32(uint32_t *p, uint32_t x)
 {
 	uint32_t ret = (uint32_t)(-(int32_t)x);
 	asm volatile (
diff --git a/intern/cycles/app/cycles_standalone.cpp b/intern/cycles/app/cycles_standalone.cpp
index e8168bc15ff..b21e8630cdb 100644
--- a/intern/cycles/app/cycles_standalone.cpp
+++ b/intern/cycles/app/cycles_standalone.cpp
@@ -337,7 +337,7 @@ static void options_parse(int argc, const char **argv)
 
 	/* device names */
 	string device_names = "";
-	string devicename = "cpu";
+	string devicename = "CPU";
 	bool list = false;
 
 	vector<DeviceType>& types = Device::available_types();
diff --git a/intern/cycles/app/cycles_xml.cpp b/intern/cycles/app/cycles_xml.cpp
index 8a3eb98a5a0..29a68bf272e 100644
--- a/intern/cycles/app/cycles_xml.cpp
+++ b/intern/cycles/app/cycles_xml.cpp
@@ -210,17 +210,6 @@ static void xml_read_camera(XMLReadState& state, pugi::xml_node node)
 
 /* Shader */
 
-static string xml_socket_name(const char *name)
-{
-	string sname = name;
-	size_t i;
-
-	while((i = sname.find(" ")) != string::npos)
-		sname.replace(i, 1, "");
-	
-	return sname;
-}
-
 static void xml_read_shader_graph(XMLReadState& state, Shader *shader, pugi::xml_node graph_node)
 {
 	xml_read_node(state, shader, graph_node);
@@ -255,7 +244,7 @@ static void xml_read_shader_graph(XMLReadState& state, Shader *shader, pugi::xml
 					ShaderNode *fromnode = (ShaderNode*)graph_reader.node_map[from_node_name];
 
 					foreach(ShaderOutput *out, fromnode->outputs)
-						if(string_iequals(xml_socket_name(out->name().c_str()), from_socket_name.c_str()))
+						if(string_iequals(out->socket_type.name.string(), from_socket_name.string()))
 							output = out;
 
 					if(!output)
@@ -268,7 +257,7 @@ static void xml_read_shader_graph(XMLReadState& state, Shader *shader, pugi::xml
 					ShaderNode *tonode = (ShaderNode*)graph_reader.node_map[to_node_name];
 
 					foreach(ShaderInput *in, tonode->inputs)
-						if(string_iequals(xml_socket_name(in->name().c_str()), to_socket_name.c_str()))
+						if(string_iequals(in->socket_type.name.string(), to_socket_name.string()))
 							input = in;
 
 					if(!input)
@@ -406,7 +395,7 @@ static void xml_read_mesh(const XMLReadState& state, pugi::xml_node node)
 	int shader = 0;
 	bool smooth = state.smooth;
 
-	/* read vertices and polygons, RIB style */
+	/* read vertices and polygons */
 	vector<float3> P;
 	vector<float> UV;
 	vector<int> verts, nverts;
@@ -532,8 +521,12 @@ static void xml_read_mesh(const XMLReadState& state, pugi::xml_node node)
 		sdparams.objecttoworld = state.tfm;
 	}
 
-	/* temporary for test compatibility */
-	mesh->attributes.remove(ATTR_STD_VERTEX_NORMAL);
+	/* we don't yet support arbitrary attributes, for now add vertex
+	 * coordinates as generated coordinates if requested */
+	if (mesh->need_attribute(state.scene, ATTR_STD_GENERATED)) {
+		Attribute *attr = mesh->attributes.add(ATTR_STD_GENERATED);
+		memcpy(attr->data_float3(), mesh->verts.data(), sizeof(float3)*mesh->verts.size());
+	}
 }
 
 /* Light */
diff --git a/intern/cycles/blender/blender_sync.cpp b/intern/cycles/blender/blender_sync.cpp
index 62b9fa3c92b..49ddc8af9a8 100644
--- a/intern/cycles/blender/blender_sync.cpp
+++ b/intern/cycles/blender/blender_sync.cpp
@@ -249,9 +249,14 @@ void BlenderSync::sync_integrator()
 	integrator->seed = get_int(cscene, "seed");
 	if(get_boolean(cscene, "use_animated_seed")) {
 		integrator->seed = hash_int_2d(b_scene.frame_current(),
-		                               get_int(cscene, "seed")) +
-		                   hash_int_2d((int)(b_scene.frame_subframe() * (float)INT_MAX),
 		                               get_int(cscene, "seed"));
+		if(b_scene.frame_subframe() != 0.0f) {
+			/* TODO(sergey): Ideally should be some sort of hash_merge,
+			 * but this is good enough for now.
+			 */
+			integrator->seed += hash_int_2d((int)(b_scene.frame_subframe() * (float)INT_MAX),
+			                                get_int(cscene, "seed"));
+		}
 	}
 
 	integrator->sampling_pattern = (SamplingPattern)get_enum(
diff --git a/intern/cycles/kernel/kernel_passes.h b/intern/cycles/kernel/kernel_passes.h
index 20cf3fa931b..7aec47e4957 100644
--- a/intern/cycles/kernel/kernel_passes.h
+++ b/intern/cycles/kernel/kernel_passes.h
@@ -20,7 +20,7 @@ ccl_device_inline void kernel_write_pass_float(ccl_global float *buffer, int sam
 {
 	ccl_global float *buf = buffer;
 #if defined(__SPLIT_KERNEL__) && defined(__WORK_STEALING__)
-	atomic_add_float(buf, value);
+	atomic_add_and_fetch_float(buf, value);
 #else
 	*buf = (sample == 0)? value: *buf + value;
 #endif // __SPLIT_KERNEL__ && __WORK_STEALING__
@@ -33,9 +33,9 @@ ccl_device_inline void kernel_write_pass_float3(ccl_global float *buffer, int sa
 	ccl_global float *buf_y = buffer + 1;
 	ccl_global float *buf_z = buffer + 2;
 
-	atomic_add_float(buf_x, value.x);
-	atomic_add_float(buf_y, value.y);
-	atomic_add_float(buf_z, value.z);
+	atomic_add_and_fetch_float(buf_x, value.x);
+	atomic_add_and_fetch_float(buf_y, value.y);
+	atomic_add_and_fetch_float(buf_z, value.z);
 #else
 	ccl_global float3 *buf = (ccl_global float3*)buffer;
 	*buf = (sample == 0)? value: *buf + value;
@@ -50,10 +50,10 @@ ccl_device_inline void kernel_write_pass_float4(ccl_global float *buffer, int sa
 	ccl_global float *buf_z = buffer + 2;
 	ccl_global float *buf_w = buffer + 3;
 
-	atomic_add_float(buf_x, value.x);
-	atomic_add_float(buf_y, value.y);
-	atomic_add_float(buf_z, value.z);
-	atomic_add_float(buf_w, value.w);
+	atomic_add_and_fetch_float(buf_x, value.x);
+	atomic_add_and_fetch_float(buf_y, value.y);
+	atomic_add_and_fetch_float(buf_z, value.z);
+	atomic_add_and_fetch_float(buf_w, value.w);
 #else
 	ccl_global float4 *buf = (ccl_global float4*)buffer;
 	*buf = (sample == 0)? value: *buf + value;
diff --git a/intern/cycles/kernel/kernel_path.h b/intern/cycles/kernel/kernel_path.h
index 4237fdb32ff..6d89a89ed5b 100644
--- a/intern/cycles/kernel/kernel_path.h
+++ b/intern/cycles/kernel/kernel_path.h
@@ -84,7 +84,7 @@ ccl_device_noinline void kernel_path_ao(KernelGlobals *kg,
 		light_ray.t = kernel_data.background.ao_distance;
 #ifdef __OBJECT_MOTION__
 		light_ray.time = ccl_fetch(sd, time);
-#endif
+#endif  /* __OBJECT_MOTION__ */
 		light_ray.dP = ccl_fetch(sd, dP);
 		light_ray.dD = differential3_zero();
 
@@ -138,7 +138,7 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg,
 				                             state->bounce);
 			}
 		}
-#endif
+#endif  /* __LAMP_MIS__ */
 
 #ifdef __VOLUME__
 		/* volume attenuation, emission, scatter */
@@ -239,7 +239,7 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg,
 				}
 			}
 			else
-#  endif
+#  endif  /* __VOLUME_DECOUPLED__ */
 			{
 				/* integrate along volume segment with distance sampling */
 				VolumeIntegrateResult result = kernel_volume_integrate(
@@ -271,10 +271,10 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg,
 						break;
 					}
 				}
-#  endif
+#  endif  /* __VOLUME_SCATTER__ */
 			}
 		}
-#endif
+#endif  /* __VOLUME__ */
 
 		if(!hit) {
 #ifdef __BACKGROUND__
@@ -284,7 +284,7 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg,
 			                               throughput,
 			                               L_background,
 			                               state->bounce);
-#endif
+#endif  /* __BACKGROUND__ */
 
 			break;
 		}
@@ -298,7 +298,7 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg,
 		shader_eval_surface(kg, sd, rng, state, rbsdf, state->flag, SHADER_CONTEXT_INDIRECT);
 #ifdef __BRANCHED_PATH__
 		shader_merge_closures(sd);
-#endif
+#endif  /* __BRANCHED_PATH__ */
 
 		/* blurring of bsdf after bounces, for rays that have a small likelihood
 		 * of following this particular path (diffuse, rough glossy) */
@@ -321,7 +321,7 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg,
 			                                              state->ray_pdf);
 			path_radiance_accum_emission(L, throughput, emission, state->bounce);
 		}
-#endif
+#endif  /* __EMISSION__ */
 
 		/* path termination. this is a strange place to put the termination, it's
 		 * mainly due to the mixed in MIS that we use. gives too many unneeded
@@ -348,7 +348,7 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg,
 		if(kernel_data.integrator.use_ambient_occlusion || (sd->flag & SD_AO)) {
 			kernel_path_ao(kg, sd, emission_sd, L, state, rng, throughput, make_float3(0.0f, 0.0f, 0.0f));
 		}
-#endif
+#endif  /* __AO__ */
 
 #ifdef __SUBSURFACE__
 		/* bssrdf scatter to a different location on the same object, replacing
@@ -380,7 +380,7 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg,
 				                        false);
 			}
 		}
-#endif
+#endif  /* __SUBSURFACE__ */
 
 #if defined(__EMISSION__) && defined(__BRANCHED_PATH__)
 		if(kernel_data.integrator.use_direct_light) {
@@ -395,7 +395,7 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg,
 			                                           L,
 			                                           all);
 		}
-#endif
+#endif  /* defined(__EMISSION__) && defined(__BRANCHED_PATH__) */
 
 		if(!kernel_path_surface_bounce(kg, rng, sd, &throughput, state, L, ray))
 			break;
@@ -449,7 +449,7 @@ bool kernel_path_subsurface_scatter(
 		ss_indirect->need_update_volume_stack =
 		        kernel_data.integrator.use_volumes &&
 		        ccl_fetch(sd, flag) & SD_OBJECT_INTERSECTS_VOLUME;
-#  endif
+#  endif  /* __VOLUME__ */
 
 		/* compute lighting with the BSDF closure */
 		for(int hit = 0; hit < num_hits; hit++) {
@@ -492,7 +492,7 @@ bool kernel_path_subsurface_scatter(
 			{
 #  ifdef __LAMP_MIS__
 				hit_state->ray_t = 0.0f;
-#  endif
+#  endif  /* __LAMP_MIS__ */
 
 #  ifdef __VOLUME__
 				if(ss_indirect->need_update_volume_stack) {
@@ -507,7 +507,7 @@ bool kernel_path_subsurface_scatter(
 					    &volume_ray,
 					    hit_state->volume_stack);
 				}
-#  endif
+#  endif  /* __VOLUME__ */
 				path_radiance_reset_indirect(L);
 				ss_indirect->num_rays++;
 			}
@@ -593,14 +593,14 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
 #ifdef __KERNEL_DEBUG__
 	DebugData debug_data;
 	debug_data_init(&debug_data);
-#endif
+#endif  /* __KERNEL_DEBUG__ */
 
 #ifdef __SUBSURFACE__
 	SubsurfaceIndirectRays ss_indirect;
 	kernel_path_subsurface_init_indirect(&ss_indirect);
 
 	for(;;) {
-#endif
+#endif  /* __SUBSURFACE__ */
 
 	/* path iteration */
 	for(;;) {
@@ -626,7 +626,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
 		bool hit = scene_intersect(kg, ray, visibility, &isect, &lcg_state, difl, extmax);
 #else
 		bool hit = scene_intersect(kg, ray, visibility, &isect, NULL, 0.0f, 0.0f);
-#endif
+#endif  /* __HAIR__ */
 
 #ifdef __KERNEL_DEBUG__
 		if(state.flag & PATH_RAY_CAMERA) {
@@ -634,7 +634,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
 			debug_data.num_bvh_traversed_instances += isect.num_traversed_instances;
 		}
 		debug_data.num_ray_bounces++;
-#endif
+#endif  /* __KERNEL_DEBUG__ */
 
 #ifdef __LAMP_MIS__
 		if(kernel_data.integrator.use_lamp_mis && !(state.flag & PATH_RAY_CAMERA)) {
@@ -655,7 +655,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
 			if(indirect_lamp_emission(kg, &emission_sd, &state, &light_ray, &emission))
 				path_radiance_accum_emission(&L, throughput, emission, state.bounce);
 		}
-#endif
+#endif  /* __LAMP_MIS__ */
 
 #ifdef __VOLUME__
 		/* volume attenuation, emission, scatter */
@@ -719,7 +719,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
 				}
 			}
 			else
-#  endif
+#  endif  /* __VOLUME_DECOUPLED__ */
 			{
 				/* integrate along volume segment with distance sampling */
 				VolumeIntegrateResult result = kernel_volume_integrate(
@@ -736,10 +736,10 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
 					else
 						break;
 				}
-#  endif
+#  endif  /* __VOLUME_SCATTER__ */
 			}
 		}
-#endif
+#endif  /* __VOLUME__ */
 
 		if(!hit) {
 			/* eval background shader if nothing hit */
@@ -748,7 +748,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
 
 #ifdef __PASSES__
 				if(!(kernel_data.film.pass_flag & PASS_BACKGROUND))
-#endif
+#endif  /* __PASSES__ */
 					break;
 			}
 
@@ -756,7 +756,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
 			/* sample background shader */
 			float3 L_background = indirect_background(kg, &emission_sd, &state, &ray);
 			path_radiance_accum_background(&L, throughput, L_background, state.bounce);
-#endif
+#endif  /* __BACKGROUND__ */
 
 			break;
 		}
@@ -784,7 +784,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
 			if(sd.flag & SD_HOLDOUT_MASK)
 				break;
 		}
-#endif
+#endif  /* __HOLDOUT__ */
 
 		/* holdout mask objects do not write data passes */
 		kernel_write_data_passes(kg, buffer, &L, &sd, sample, &state, throughput);
@@ -807,7 +807,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
 			float3 emission = indirect_primitive_emission(kg, &sd, isect.t, state.flag, state.ray_pdf);
 			path_radiance_accum_emission(&L, throughput, emission, state.bounce);
 		}
-#endif
+#endif  /* __EMISSION__ */
 
 		/* path termination. this is a strange place to put the termination, it's
 		 * mainly due to the mixed in MIS that we use. gives too many unneeded
@@ -830,7 +830,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
 		if(kernel_data.integrator.use_ambient_occlusion || (sd.flag & SD_AO)) {
 			kernel_path_ao(kg, &sd, &emission_sd, &L, &state, rng, throughput, shader_bsdf_alpha(kg, &sd));
 		}
-#endif
+#endif  /* __AO__ */
 
 #ifdef __SUBSURFACE__
 		/* bssrdf scatter to a different location on the same object, replacing
@@ -885,7 +885,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
 
 #ifdef __KERNEL_DEBUG__
 	kernel_write_debug_passes(kg, buffer, &state, &debug_data, sample);
-#endif
+#endif  /* __KERNEL_DEBUG__ */
 
 	return make_float4(L_sum.x, L_sum.y, L_sum.z, 1.0f - L_transparent);
 }
diff --git a/intern/cycles/kernel/kernel_path_branched.h b/intern/cycles/kernel/kernel_path_branched.h
index cdb07db587a..c84727ace99 100644
--- a/intern/cycles/kernel/kernel_path_branched.h
+++ b/intern/cycles/kernel/kernel_path_branched.h
@@ -51,7 +51,7 @@ ccl_device_inline void kernel_branched_path_ao(KernelGlobals *kg,
 			light_ray.t = kernel_data.background.ao_distance;
 #ifdef __OBJECT_MOTION__
 			light_ray.time = ccl_fetch(sd, time);
-#endif
+#endif  /* __OBJECT_MOTION__ */
 			light_ray.dP = ccl_fetch(sd, dP);
 			light_ray.dD = differential3_zero();
 
@@ -169,7 +169,7 @@ ccl_device void kernel_branched_path_subsurface_scatter(KernelGlobals *kg,
 			Ray volume_ray = *ray;
 			bool need_update_volume_stack = kernel_data.integrator.use_volumes &&
 			                                ccl_fetch(sd, flag) & SD_OBJECT_INTERSECTS_VOLUME;
-#endif
+#endif  /* __VOLUME__ */
 
 			/* compute lighting with the BSDF closure */
 			for(int hit = 0; hit < num_hits; hit++) {
@@ -200,7 +200,7 @@ ccl_device void kernel_branched_path_subsurface_scatter(KernelGlobals *kg,
 					    &volume_ray,
 					    hit_state.volume_stack);
 				}
-#endif
+#endif  /* __VOLUME__ */
 
 #ifdef __EMISSION__
 				/* direct light */
@@ -217,7 +217,7 @@ ccl_device void kernel_branched_path_subsurface_scatter(KernelGlobals *kg,
 					        L,
 					        all);
 				}
-#endif
+#endif  /* __EMISSION__ */
 
 				/* indirect light */
 				kernel_branched_path_surface_indirect_light(
@@ -234,7 +234,7 @@ ccl_device void kernel_branched_path_subsurface_scatter(KernelGlobals *kg,
 		}
 	}
 }
-#endif
+#endif  /* __SUBSURFACE__ */
 
 ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, int sample, Ray ray, ccl_global float *buffer)
 {
@@ -256,7 +256,7 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
 #ifdef __KERNEL_DEBUG__
 	DebugData debug_data;
 	debug_data_init(&debug_data);
-#endif
+#endif  /* __KERNEL_DEBUG__ */
 
 	/* Main Loop
 	 * Here we only handle transparency intersections from the camera ray.
@@ -285,13 +285,13 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
 		bool hit = scene_intersect(kg, ray, visibility, &isect, &lcg_state, difl, extmax);
 #else
 		bool hit = scene_intersect(kg, ray, visibility, &isect, NULL, 0.0f, 0.0f);
-#endif
+#endif  /* __HAIR__ */
 
 #ifdef __KERNEL_DEBUG__
 		debug_data.num_bvh_traversal_steps += isect.num_traversal_steps;
 		debug_data.num_bvh_traversed_instances += isect.num_traversed_instances;
 		debug_data.num_ray_bounces++;
-#endif
+#endif  /* __KERNEL_DEBUG__ */
 
 #ifdef __VOLUME__
 		/* volume attenuation, emission, scatter */
@@ -432,14 +432,14 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
 						path_radiance_reset_indirect(&L);
 					}
 				}
-#endif
+#endif  /* __VOLUME_SCATTER__ */
 			}
 
 			/* todo: avoid this calculation using decoupled ray marching */
 			kernel_volume_shadow(kg, &emission_sd, &state, &volume_ray, &throughput);
-#endif
+#endif  /* __VOLUME_DECOUPLED__ */
 		}
-#endif
+#endif  /* __VOLUME__ */
 
 		if(!hit) {
 			/* eval background shader if nothing hit */
@@ -448,7 +448,7 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
 
 #ifdef __PASSES__
 				if(!(kernel_data.film.pass_flag & PASS_BACKGROUND))
-#endif
+#endif  /* __PASSES__ */
 					break;
 			}
 
@@ -456,7 +456,7 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
 			/* sample background shader */
 			float3 L_background = indirect_background(kg, &emission_sd, &state, &ray);
 			path_radiance_accum_background(&L, throughput, L_background, state.bounce);
-#endif
+#endif  /* __BACKGROUND__ */
 
 			break;
 		}
@@ -484,7 +484,7 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
 			if(sd.flag & SD_HOLDOUT_MASK)
 				break;
 		}
-#endif
+#endif  /* __HOLDOUT__ */
 
 		/* holdout mask objects do not write data passes */
 		kernel_write_data_passes(kg, buffer, &L, &sd, sample, &state, throughput);
@@ -495,7 +495,7 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
 			float3 emission = indirect_primitive_emission(kg, &sd, isect.t, state.flag, state.ray_pdf);
 			path_radiance_accum_emission(&L, throughput, emission, state.bounce);
 		}
-#endif
+#endif  /* __EMISSION__ */
 
 		/* transparency termination */
 		if(state.flag & PATH_RAY_TRANSPARENT) {
@@ -522,7 +522,7 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
 		if(kernel_data.integrator.use_ambient_occlusion || (sd.flag & SD_AO)) {
 			kernel_branched_path_ao(kg, &sd, &emission_sd, &L, &state, rng, throughput);
 		}
-#endif
+#endif  /* __AO__ */
 
 #ifdef __SUBSURFACE__
 		/* bssrdf scatter to a different location on the same object */
@@ -530,7 +530,7 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
 			kernel_branched_path_subsurface_scatter(kg, &sd, &indirect_sd, &emission_sd,
 			                                        &L, &state, rng, &ray, throughput);
 		}
-#endif
+#endif  /* __SUBSURFACE__ */
 
 		if(!(sd.flag & SD_HAS_ONLY_VOLUME)) {
 			PathState hit_state = state;
@@ -542,7 +542,7 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
 				kernel_branched_path_surface_connect_light(kg, rng,
 					&sd, &emission_sd, &hit_state, throughput, 1.0f, &L, all);
 			}
-#endif
+#endif  /* __EMISSION__ */
 
 			/* indirect light */
 			kernel_branched_path_surface_indirect_light(kg, rng,
@@ -567,12 +567,12 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
 		ray.dP = sd.dP;
 		ray.dD.dx = -sd.dI.dx;
 		ray.dD.dy = -sd.dI.dy;
-#endif
+#endif  /* __RAY_DIFFERENTIALS__ */
 
 #ifdef __VOLUME__
 		/* enter/exit volume */
 		kernel_volume_stack_enter_exit(kg, &sd, state.volume_stack);
-#endif
+#endif  /* __VOLUME__ */
 	}
 
 	float3 L_sum = path_radiance_clamp_and_sum(kg, &L);
@@ -581,7 +581,7 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
 
 #ifdef __KERNEL_DEBUG__
 	kernel_write_debug_passes(kg, buffer, &state, &debug_data, sample);
-#endif
+#endif  /* __KERNEL_DEBUG__ */
 
 	return make_float4(L_sum.x, L_sum.y, L_sum.z, 1.0f - L_transparent);
 }
diff --git a/intern/cycles/render/image.cpp b/intern/cycles/render/image.cpp
index 073a0aa2ac9..7465fbd43a7 100644
--- a/intern/cycles/render/image.cpp
+++ b/intern/cycles/render/image.cpp
@@ -471,133 +471,43 @@ bool ImageManager::file_load_image_generic(Image *img, ImageInput **in, int &wid
 	return true;
 }
 
-template<typename T>
-bool ImageManager::file_load_byte_image(Image *img, ImageDataType type, device_vector<T>& tex_img)
+template<TypeDesc::BASETYPE FileFormat,
+         typename StorageType,
+         typename DeviceType>
+bool ImageManager::file_load_image(Image *img,
+                                   ImageDataType type,
+                                   device_vector<DeviceType>& tex_img)
 {
+	const StorageType alpha_one = (FileFormat == TypeDesc::UINT8)? 255 : 1;
 	ImageInput *in = NULL;
 	int width, height, depth, components;
-
-	if(!file_load_image_generic(img, &in, width, height, depth, components))
-		return false;
-
-	/* read RGBA pixels */
-	uchar *pixels = (uchar*)tex_img.resize(width, height, depth);
-	if(pixels == NULL) {
+	if(!file_load_image_generic(img, &in, width, height, depth, components)) {
 		return false;
 	}
-	bool cmyk = false;
-
-	if(in) {
-		if(depth <= 1) {
-			int scanlinesize = width*components*sizeof(uchar);
-
-			in->read_image(TypeDesc::UINT8,
-			               (uchar*)pixels + (((size_t)height)-1)*scanlinesize,
-			               AutoStride,
-			               -scanlinesize,
-			               AutoStride);
-		}
-		else {
-			in->read_image(TypeDesc::UINT8, (uchar*)pixels);
-		}
-
-		cmyk = strcmp(in->format_name(), "jpeg") == 0 && components == 4;
-
-		in->close();
-		delete in;
-	}
-	else {
-		builtin_image_pixels_cb(img->filename, img->builtin_data, pixels);
-	}
-
-	/* Check if we actually have a byte4 slot, in case components == 1, but device
-	 * doesn't support single channel textures. */
-	if(type == IMAGE_DATA_TYPE_BYTE4) {
-		size_t num_pixels = ((size_t)width) * height * depth;
-		if(cmyk) {
-			/* CMYK */
-			for(size_t i = num_pixels-1, pixel = 0; pixel < num_pixels; pixel++, i--) {
-				pixels[i*4+2] = (pixels[i*4+2]*pixels[i*4+3])/255;
-				pixels[i*4+1] = (pixels[i*4+1]*pixels[i*4+3])/255;
-				pixels[i*4+0] = (pixels[i*4+0]*pixels[i*4+3])/255;
-				pixels[i*4+3] = 255;
-			}
-		}
-		else if(components == 2) {
-			/* grayscale + alpha */
-			for(size_t i = num_pixels-1, pixel = 0; pixel < num_pixels; pixel++, i--) {
-				pixels[i*4+3] = pixels[i*2+1];
-				pixels[i*4+2] = pixels[i*2+0];
-				pixels[i*4+1] = pixels[i*2+0];
-				pixels[i*4+0] = pixels[i*2+0];
-			}
-		}
-		else if(components == 3) {
-			/* RGB */
-			for(size_t i = num_pixels-1, pixel = 0; pixel < num_pixels; pixel++, i--) {
-				pixels[i*4+3] = 255;
-				pixels[i*4+2] = pixels[i*3+2];
-				pixels[i*4+1] = pixels[i*3+1];
-				pixels[i*4+0] = pixels[i*3+0];
-			}
-		}
-		else if(components == 1) {
-			/* grayscale */
-			for(size_t i = num_pixels-1, pixel = 0; pixel < num_pixels; pixel++, i--) {
-				pixels[i*4+3] = 255;
-				pixels[i*4+2] = pixels[i];
-				pixels[i*4+1] = pixels[i];
-				pixels[i*4+0] = pixels[i];
-			}
-		}
-
-		if(img->use_alpha == false) {
-			for(size_t i = num_pixels-1, pixel = 0; pixel < num_pixels; pixel++, i--) {
-				pixels[i*4+3] = 255;
-			}
-		}
-	}
-
-	return true;
-}
-
-template<typename T>
-bool ImageManager::file_load_float_image(Image *img, ImageDataType type, device_vector<T>& tex_img)
-{
-	ImageInput *in = NULL;
-	int width, height, depth, components;
-
-	if(!file_load_image_generic(img, &in, width, height, depth, components))
-		return false;
-
-	/* read RGBA pixels */
-	float *pixels = (float*)tex_img.resize(width, height, depth);
+	/* Read RGBA pixels. */
+	StorageType *pixels = (StorageType*)tex_img.resize(width, height, depth);
 	if(pixels == NULL) {
 		return false;
 	}
 	bool cmyk = false;
-
 	if(in) {
-		float *readpixels = pixels;
-		vector<float> tmppixels;
-
+		StorageType *readpixels = pixels;
+		vector<StorageType> tmppixels;
 		if(components > 4) {
 			tmppixels.resize(((size_t)width)*height*components);
 			readpixels = &tmppixels[0];
 		}
-
 		if(depth <= 1) {
-			size_t scanlinesize = ((size_t)width)*components*sizeof(float);
-			in->read_image(TypeDesc::FLOAT,
+			size_t scanlinesize = ((size_t)width)*components*sizeof(StorageType);
+			in->read_image(FileFormat,
 			               (uchar*)readpixels + (height-1)*scanlinesize,
 			               AutoStride,
 			               -scanlinesize,
 			               AutoStride);
 		}
 		else {
-			in->read_image(TypeDesc::FLOAT, (uchar*)readpixels);
+			in->read_image(FileFormat, (uchar*)readpixels);
 		}
-
 		if(components > 4) {
 			size_t dimensions = ((size_t)width)*height;
 			for(size_t i = dimensions-1, pixel = 0; pixel < dimensions; pixel++, i--) {
@@ -606,30 +516,42 @@ bool ImageManager::file_load_float_image(Image *img, ImageDataType type, device_
 				pixels[i*4+1] = tmppixels[i*components+1];
 				pixels[i*4+0] = tmppixels[i*components+0];
 			}
-
 			tmppixels.clear();
 		}
-
 		cmyk = strcmp(in->format_name(), "jpeg") == 0 && components == 4;
-
 		in->close();
 		delete in;
 	}
 	else {
-		builtin_image_float_pixels_cb(img->filename, img->builtin_data, pixels);
+		if(FileFormat == TypeDesc::FLOAT) {
+			builtin_image_float_pixels_cb(img->filename,
+			                              img->builtin_data,
+			                              (float*)pixels);
+		}
+		else if(FileFormat == TypeDesc::UINT8) {
+			builtin_image_pixels_cb(img->filename,
+			                        img->builtin_data,
+			                        (uchar*)pixels);
+		}
+		else {
+			/* TODO(dingto): Support half for ImBuf. */
+		}
 	}
-
-	/* Check if we actually have a float4 slot, in case components == 1, but device
-	 * doesn't support single channel textures. */
-	if(type == IMAGE_DATA_TYPE_FLOAT4) {
+	/* Check if we actually have a float4 slot, in case components == 1,
+	 * but device doesn't support single channel textures.
+	 */
+	if(type == IMAGE_DATA_TYPE_FLOAT4 ||
+	   type == IMAGE_DATA_TYPE_HALF4 ||
+	   type == IMAGE_DATA_TYPE_BYTE4)
+	{
 		size_t num_pixels = ((size_t)width) * height * depth;
 		if(cmyk) {
 			/* CMYK */
 			for(size_t i = num_pixels-1, pixel = 0; pixel < num_pixels; pixel++, i--) {
-				pixels[i*4+3] = 255;
 				pixels[i*4+2] = (pixels[i*4+2]*pixels[i*4+3])/255;
 				pixels[i*4+1] = (pixels[i*4+1]*pixels[i*4+3])/255;
 				pixels[i*4+0] = (pixels[i*4+0]*pixels[i*4+3])/255;
+				pixels[i*4+3] = alpha_one;
 			}
 		}
 		else if(components == 2) {
@@ -644,7 +566,7 @@ bool ImageManager::file_load_float_image(Image *img, ImageDataType type, device_
 		else if(components == 3) {
 			/* RGB */
 			for(size_t i = num_pixels-1, pixel = 0; pixel < num_pixels; pixel++, i--) {
-				pixels[i*4+3] = 1.0f;
+				pixels[i*4+3] = alpha_one;
 				pixels[i*4+2] = pixels[i*3+2];
 				pixels[i*4+1] = pixels[i*3+1];
 				pixels[i*4+0] = pixels[i*3+0];
@@ -653,120 +575,18 @@ bool ImageManager::file_load_float_image(Image *img, ImageDataType type, device_
 		else if(components == 1) {
 			/* grayscale */
 			for(size_t i = num_pixels-1, pixel = 0; pixel < num_pixels; pixel++, i--) {
-				pixels[i*4+3] = 1.0f;
+				pixels[i*4+3] = alpha_one;
 				pixels[i*4+2] = pixels[i];
 				pixels[i*4+1] = pixels[i];
 				pixels[i*4+0] = pixels[i];
 			}
 		}
-
 		if(img->use_alpha == false) {
 			for(size_t i = num_pixels-1, pixel = 0; pixel < num_pixels; pixel++, i--) {
-				pixels[i*4+3] = 1.0f;
+				pixels[i*4+3] = alpha_one;
 			}
 		}
 	}
-
-	return true;
-}
-
-template<typename T>
-bool ImageManager::file_load_half_image(Image *img, ImageDataType type, device_vector<T>& tex_img)
-{
-	ImageInput *in = NULL;
-	int width, height, depth, components;
-
-	if(!file_load_image_generic(img, &in, width, height, depth, components))
-		return false;
-
-	/* read RGBA pixels */
-	half *pixels = (half*)tex_img.resize(width, height, depth);
-	if(pixels == NULL) {
-		return false;
-	}
-
-	if(in) {
-		half *readpixels = pixels;
-		vector<half> tmppixels;
-
-		if(components > 4) {
-			tmppixels.resize(((size_t)width)*height*components);
-			readpixels = &tmppixels[0];
-		}
-
-		if(depth <= 1) {
-			size_t scanlinesize = ((size_t)width)*components*sizeof(half);
-			in->read_image(TypeDesc::HALF,
-			               (uchar*)readpixels + (height-1)*scanlinesize,
-			               AutoStride,
-			               -scanlinesize,
-			               AutoStride);
-		}
-		else {
-			in->read_image(TypeDesc::HALF, (uchar*)readpixels);
-		}
-
-		if(components > 4) {
-			size_t dimensions = ((size_t)width)*height;
-			for(size_t i = dimensions-1, pixel = 0; pixel < dimensions; pixel++, i--) {
-				pixels[i*4+3] = tmppixels[i*components+3];
-				pixels[i*4+2] = tmppixels[i*components+2];
-				pixels[i*4+1] = tmppixels[i*components+1];
-				pixels[i*4+0] = tmppixels[i*components+0];
-			}
-
-			tmppixels.clear();
-		}
-
-		in->close();
-		delete in;
-	}
-#if 0
-	/* TODO(dingto): Support half for ImBuf. */
-	else {
-		builtin_image_float_pixels_cb(img->filename, img->builtin_data, pixels);
-	}
-#endif
-
-	/* Check if we actually have a half4 slot, in case components == 1, but device
-	 * doesn't support single channel textures. */
-	if(type == IMAGE_DATA_TYPE_HALF4) {
-		size_t num_pixels = ((size_t)width) * height * depth;
-		if(components == 2) {
-			/* grayscale + alpha */
-			for(size_t i = num_pixels-1, pixel = 0; pixel < num_pixels; pixel++, i--) {
-				pixels[i*4+3] = pixels[i*2+1];
-				pixels[i*4+2] = pixels[i*2+0];
-				pixels[i*4+1] = pixels[i*2+0];
-				pixels[i*4+0] = pixels[i*2+0];
-			}
-		}
-		else if(components == 3) {
-			/* RGB */
-			for(size_t i = num_pixels-1, pixel = 0; pixel < num_pixels; pixel++, i--) {
-				pixels[i*4+3] = 1.0f;
-				pixels[i*4+2] = pixels[i*3+2];
-				pixels[i*4+1] = pixels[i*3+1];
-				pixels[i*4+0] = pixels[i*3+0];
-			}
-		}
-		else if(components == 1) {
-			/* grayscale */
-			for(size_t i = num_pixels-1, pixel = 0; pixel < num_pixels; pixel++, i--) {
-				pixels[i*4+3] = 1.0f;
-				pixels[i*4+2] = pixels[i];
-				pixels[i*4+1] = pixels[i];
-				pixels[i*4+0] = pixels[i];
-			}
-		}
-
-		if(img->use_alpha == false) {
-			for(size_t i = num_pixels-1, pixel = 0; pixel < num_pixels; pixel++, i--) {
-				pixels[i*4+3] = 1.0f;
-			}
-		}
-	}
-
 	return true;
 }
 
@@ -802,7 +622,7 @@ void ImageManager::device_load_image(Device *device, DeviceScene *dscene, ImageD
 			device->tex_free(tex_img);
 		}
 
-		if(!file_load_float_image(img, type, tex_img)) {
+		if(!file_load_image<TypeDesc::FLOAT, float>(img, type, tex_img)) {
 			/* on failure to load, we set a 1x1 pixels pink image */
 			float *pixels = (float*)tex_img.resize(1, 1);
 
@@ -828,7 +648,7 @@ void ImageManager::device_load_image(Device *device, DeviceScene *dscene, ImageD
 			device->tex_free(tex_img);
 		}
 
-		if(!file_load_float_image(img, type, tex_img)) {
+		if(!file_load_image<TypeDesc::FLOAT, float>(img, type, tex_img)) {
 			/* on failure to load, we set a 1x1 pixels pink image */
 			float *pixels = (float*)tex_img.resize(1, 1);
 
@@ -851,7 +671,7 @@ void ImageManager::device_load_image(Device *device, DeviceScene *dscene, ImageD
 			device->tex_free(tex_img);
 		}
 
-		if(!file_load_byte_image(img, type, tex_img)) {
+		if(!file_load_image<TypeDesc::UINT8, uchar>(img, type, tex_img)) {
 			/* on failure to load, we set a 1x1 pixels pink image */
 			uchar *pixels = (uchar*)tex_img.resize(1, 1);
 
@@ -877,7 +697,7 @@ void ImageManager::device_load_image(Device *device, DeviceScene *dscene, ImageD
 			device->tex_free(tex_img);
 		}
 
-		if(!file_load_byte_image(img, type, tex_img)) {
+		if(!file_load_image<TypeDesc::UINT8, uchar>(img, type, tex_img)) {
 			/* on failure to load, we set a 1x1 pixels pink image */
 			uchar *pixels = (uchar*)tex_img.resize(1, 1);
 
@@ -900,7 +720,7 @@ void ImageManager::device_load_image(Device *device, DeviceScene *dscene, ImageD
 			device->tex_free(tex_img);
 		}
 
-		if(!file_load_half_image(img, type, tex_img)) {
+		if(!file_load_image<TypeDesc::HALF, half>(img, type, tex_img)) {
 			/* on failure to load, we set a 1x1 pixels pink image */
 			half *pixels = (half*)tex_img.resize(1, 1);
 
@@ -926,7 +746,7 @@ void ImageManager::device_load_image(Device *device, DeviceScene *dscene, ImageD
 			device->tex_free(tex_img);
 		}
 
-		if(!file_load_half_image(img, type, tex_img)) {
+		if(!file_load_image<TypeDesc::HALF, half>(img, type, tex_img)) {
 			/* on failure to load, we set a 1x1 pixels pink image */
 			half *pixels = (half*)tex_img.resize(1, 1);
 
diff --git a/intern/cycles/render/image.h b/intern/cycles/render/image.h
index cca71a6bb93..1dc4bf180f8 100644
--- a/intern/cycles/render/image.h
+++ b/intern/cycles/render/image.h
@@ -109,14 +109,12 @@ private:
 
 	bool file_load_image_generic(Image *img, ImageInput **in, int &width, int &height, int &depth, int &components);
 
-	template<typename T>
-	bool file_load_byte_image(Image *img, ImageDataType type, device_vector<T>& tex_img);
-
-	template<typename T>
-	bool file_load_float_image(Image *img, ImageDataType type, device_vector<T>& tex_img);
-
-	template<typename T>
-	bool file_load_half_image(Image *img, ImageDataType type, device_vector<T>& tex_img);
+	template<TypeDesc::BASETYPE FileFormat,
+	         typename StorageType,
+	         typename DeviceType>
+	bool file_load_image(Image *img,
+	                     ImageDataType type,
+	                     device_vector<DeviceType>& tex_img);
 
 	int type_index_to_flattened_slot(int slot, ImageDataType type);
 	int flattened_slot_to_type_index(int flat_slot, ImageDataType *type);
diff --git a/intern/cycles/render/light.cpp b/intern/cycles/render/light.cpp
index c43d646f515..2245c861d5a 100644
--- a/intern/cycles/render/light.cpp
+++ b/intern/cycles/render/light.cpp
@@ -106,6 +106,7 @@ NODE_DEFINE(Light)
 
 	static NodeEnum type_enum;
 	type_enum.insert("point", LIGHT_POINT);
+	type_enum.insert("distant", LIGHT_DISTANT);
 	type_enum.insert("background", LIGHT_BACKGROUND);
 	type_enum.insert("area", LIGHT_AREA);
 	type_enum.insert("spot", LIGHT_SPOT);
diff --git a/intern/cycles/util/util_atomic.h b/intern/cycles/util/util_atomic.h
index 1d1e2963348..433e41fbbb6 100644
--- a/intern/cycles/util/util_atomic.h
+++ b/intern/cycles/util/util_atomic.h
@@ -39,7 +39,7 @@ ATOMIC_INLINE void atomic_update_max_z(size_t *maximum_value, size_t value)
 /* Float atomics implementation credits:
  *   http://suhorukov.blogspot.in/2011/12/opencl-11-atomic-operations-on-floating.html
  */
-ccl_device_inline void atomic_add_float(volatile ccl_global float *source,
+ccl_device_inline void atomic_add_and_fetch_float(volatile ccl_global float *source,
                                         const float operand)
 {
 	union {
diff --git a/intern/cycles/util/util_path.cpp b/intern/cycles/util/util_path.cpp
index 62ef8fc0b48..5df262fcbbb 100644
--- a/intern/cycles/util/util_path.cpp
+++ b/intern/cycles/util/util_path.cpp
@@ -757,9 +757,9 @@ uint64_t path_modified_time(const string& path)
 {
 	path_stat_t st;
 	if(path_stat(path, &st) != 0) {
-		return st.st_mtime;
+		return 0;
 	}
-	return 0;
+	return st.st_mtime;
 }
 
 bool path_remove(const string& path)
diff --git a/intern/cycles/util/util_stats.h b/intern/cycles/util/util_stats.h
index b970b017270..c21a8488c81 100644
--- a/intern/cycles/util/util_stats.h
+++ b/intern/cycles/util/util_stats.h
@@ -29,13 +29,13 @@ public:
 	explicit Stats(static_init_t) {}
 
 	void mem_alloc(size_t size) {
-		atomic_add_z(&mem_used, size);
+		atomic_add_and_fetch_z(&mem_used, size);
 		atomic_update_max_z(&mem_peak, mem_used);
 	}
 
 	void mem_free(size_t size) {
 		assert(mem_used >= size);
-		atomic_sub_z(&mem_used, size);
+		atomic_sub_and_fetch_z(&mem_used, size);
 	}
 
 	size_t mem_used;
diff --git a/intern/guardedalloc/intern/mallocn_guarded_impl.c b/intern/guardedalloc/intern/mallocn_guarded_impl.c
index 1933e9d3ee3..76b7e072321 100644
--- a/intern/guardedalloc/intern/mallocn_guarded_impl.c
+++ b/intern/guardedalloc/intern/mallocn_guarded_impl.c
@@ -505,8 +505,8 @@ static void make_memhead_header(MemHead *memh, size_t len, const char *str)
 	memt = (MemTail *)(((char *) memh) + sizeof(MemHead) + len);
 	memt->tag3 = MEMTAG3;
 
-	atomic_add_u(&totblock, 1);
-	atomic_add_z(&mem_in_use, len);
+	atomic_add_and_fetch_u(&totblock, 1);
+	atomic_add_and_fetch_z(&mem_in_use, len);
 
 	mem_lock_thread();
 	addtail(membase, &memh->next);
@@ -638,7 +638,7 @@ void *MEM_guarded_mapallocN(size_t len, const char *str)
 	if (memh != (MemHead *)-1) {
 		make_memhead_header(memh, len, str);
 		memh->mmap = 1;
-		atomic_add_z(&mmap_in_use, len);
+		atomic_add_and_fetch_z(&mmap_in_use, len);
 		mem_lock_thread();
 		peak_mem = mmap_in_use > peak_mem ? mmap_in_use : peak_mem;
 		mem_unlock_thread();
@@ -1007,8 +1007,8 @@ static void rem_memblock(MemHead *memh)
 	}
 	mem_unlock_thread();
 
-	atomic_sub_u(&totblock, 1);
-	atomic_sub_z(&mem_in_use, memh->len);
+	atomic_sub_and_fetch_u(&totblock, 1);
+	atomic_sub_and_fetch_z(&mem_in_use, memh->len);
 
 #ifdef DEBUG_MEMDUPLINAME
 	if (memh->need_free_name)
@@ -1016,7 +1016,7 @@ static void rem_memblock(MemHead *memh)
 #endif
 
 	if (memh->mmap) {
-		atomic_sub_z(&mmap_in_use, memh->len);
+		atomic_sub_and_fetch_z(&mmap_in_use, memh->len);
 #if defined(WIN32)
 		/* our windows mmap implementation is not thread safe */
 		mem_lock_thread();
diff --git a/intern/guardedalloc/intern/mallocn_lockfree_impl.c b/intern/guardedalloc/intern/mallocn_lockfree_impl.c
index a80d67c3e80..ce8a5b29ece 100644
--- a/intern/guardedalloc/intern/mallocn_lockfree_impl.c
+++ b/intern/guardedalloc/intern/mallocn_lockfree_impl.c
@@ -142,11 +142,11 @@ void MEM_lockfree_freeN(void *vmemh)
 		return;
 	}
 
-	atomic_sub_u(&totblock, 1);
-	atomic_sub_z(&mem_in_use, len);
+	atomic_sub_and_fetch_u(&totblock, 1);
+	atomic_sub_and_fetch_z(&mem_in_use, len);
 
 	if (MEMHEAD_IS_MMAP(memh)) {
-		atomic_sub_z(&mmap_in_use, len);
+		atomic_sub_and_fetch_z(&mmap_in_use, len);
 #if defined(WIN32)
 		/* our windows mmap implementation is not thread safe */
 		mem_lock_thread();
@@ -287,8 +287,8 @@ void *MEM_lockfree_callocN(size_t len, const char *str)
 
 	if (LIKELY(memh)) {
 		memh->len = len;
-		atomic_add_u(&totblock, 1);
-		atomic_add_z(&mem_in_use, len);
+		atomic_add_and_fetch_u(&totblock, 1);
+		atomic_add_and_fetch_z(&mem_in_use, len);
 		update_maximum(&peak_mem, mem_in_use);
 
 		return PTR_FROM_MEMHEAD(memh);
@@ -312,8 +312,8 @@ void *MEM_lockfree_mallocN(size_t len, const char *str)
 		}
 
 		memh->len = len;
-		atomic_add_u(&totblock, 1);
-		atomic_add_z(&mem_in_use, len);
+		atomic_add_and_fetch_u(&totblock, 1);
+		atomic_add_and_fetch_z(&mem_in_use, len);
 		update_maximum(&peak_mem, mem_in_use);
 
 		return PTR_FROM_MEMHEAD(memh);
@@ -361,8 +361,8 @@ void *MEM_lockfree_mallocN_aligned(size_t len, size_t alignment, const char *str
 
 		memh->len = len | (size_t) MEMHEAD_ALIGN_FLAG;
 		memh->alignment = (short) alignment;
-		atomic_add_u(&totblock, 1);
-		atomic_add_z(&mem_in_use, len);
+		atomic_add_and_fetch_u(&totblock, 1);
+		atomic_add_and_fetch_z(&mem_in_use, len);
 		update_maximum(&peak_mem, mem_in_use);
 
 		return PTR_FROM_MEMHEAD(memh);
@@ -396,9 +396,9 @@ void *MEM_lockfree_mapallocN(size_t len, const char *str)
 
 	if (memh != (MemHead *)-1) {
 		memh->len = len | (size_t) MEMHEAD_MMAP_FLAG;
-		atomic_add_u(&totblock, 1);
-		atomic_add_z(&mem_in_use, len);
-		atomic_add_z(&mmap_in_use, len);
+		atomic_add_and_fetch_u(&totblock, 1);
+		atomic_add_and_fetch_z(&mem_in_use, len);
+		atomic_add_and_fetch_z(&mmap_in_use, len);
 
 		update_maximum(&peak_mem, mem_in_use);
 		update_maximum(&peak_mem, mmap_in_use);
diff --git a/intern/iksolver/intern/IK_QSegment.h b/intern/iksolver/intern/IK_QSegment.h
index 74f157aa763..247807dc5e0 100644
--- a/intern/iksolver/intern/IK_QSegment.h
+++ b/intern/iksolver/intern/IK_QSegment.h
@@ -60,6 +60,7 @@
 class IK_QSegment
 {
 public:
+	EIGEN_MAKE_ALIGNED_OPERATOR_NEW
 	virtual ~IK_QSegment();
 
 	// start: a user defined translation
diff --git a/intern/iksolver/intern/IK_Solver.cpp b/intern/iksolver/intern/IK_Solver.cpp
index cefb8c7ed7b..a00db4fa2f5 100644
--- a/intern/iksolver/intern/IK_Solver.cpp
+++ b/intern/iksolver/intern/IK_Solver.cpp
@@ -42,6 +42,7 @@ using namespace std;
 
 class IK_QSolver {
 public:
+	EIGEN_MAKE_ALIGNED_OPERATOR_NEW
 	IK_QSolver() : root(NULL) {
 	}
author	Dalai Felinto <dfelinto@gmail.com>	2016-11-16 19:01:19 +0300
committer	Dalai Felinto <dfelinto@gmail.com>	2016-11-16 19:04:21 +0300
commit	2bcb1b208a4193fb28f1e0c4408b733f5ee2c028 (patch)
tree	8b9260ffd9fb7b371dc66955903c7b0c7f4e7bf9 /intern
parent	930f999f6ea683d02ac490026a52817f1d965377 (diff)
parent	2a2eb0c463bd96d42f7306eb17f88cad87f73aea (diff)