83 files changed, 571 insertions, 570 deletions
diff --git a/intern/cycles/kernel/closure/bsdf.h b/intern/cycles/kernel/closure/bsdf.h
index 86fea48760f..b3141d1154f 100644
--- a/intern/cycles/kernel/closure/bsdf.h
+++ b/intern/cycles/kernel/closure/bsdf.h
@@ -35,7 +35,7 @@
 
 CCL_NAMESPACE_BEGIN
 
-__device int bsdf_sample(KernelGlobals *kg, const ShaderData *sd, const ShaderClosure *sc, float randu, float randv, float3 *eval, float3 *omega_in, differential3 *domega_in, float *pdf)
+ccl_device int bsdf_sample(KernelGlobals *kg, const ShaderData *sd, const ShaderClosure *sc, float randu, float randv, float3 *eval, float3 *omega_in, differential3 *domega_in, float *pdf)
 {
 	int label;
 
@@ -132,7 +132,7 @@ __device int bsdf_sample(KernelGlobals *kg, const ShaderData *sd, const ShaderCl
 	return label;
 }
 
-__device float3 bsdf_eval(KernelGlobals *kg, const ShaderData *sd, const ShaderClosure *sc, const float3 omega_in, float *pdf)
+ccl_device float3 bsdf_eval(KernelGlobals *kg, const ShaderData *sd, const ShaderClosure *sc, const float3 omega_in, float *pdf)
 {
 	float3 eval;
 
@@ -275,7 +275,7 @@ __device float3 bsdf_eval(KernelGlobals *kg, const ShaderData *sd, const ShaderC
 	return eval;
 }
 
-__device void bsdf_blur(KernelGlobals *kg, ShaderClosure *sc, float roughness)
+ccl_device void bsdf_blur(KernelGlobals *kg, ShaderClosure *sc, float roughness)
 {
 #ifdef __OSL__
 	if(kg->osl && sc->prim) {
diff --git a/intern/cycles/kernel/closure/bsdf_ashikhmin_velvet.h b/intern/cycles/kernel/closure/bsdf_ashikhmin_velvet.h
index 94bc6eb0dc5..3631f90bf8c 100644
--- a/intern/cycles/kernel/closure/bsdf_ashikhmin_velvet.h
+++ b/intern/cycles/kernel/closure/bsdf_ashikhmin_velvet.h
@@ -35,7 +35,7 @@
 
 CCL_NAMESPACE_BEGIN
 
-__device int bsdf_ashikhmin_velvet_setup(ShaderClosure *sc)
+ccl_device int bsdf_ashikhmin_velvet_setup(ShaderClosure *sc)
 {
 	float sigma = fmaxf(sc->data0, 0.01f);
 	sc->data0 = 1.0f/(sigma * sigma); /* m_invsigma2 */
@@ -45,11 +45,11 @@ __device int bsdf_ashikhmin_velvet_setup(ShaderClosure *sc)
 	return SD_BSDF|SD_BSDF_HAS_EVAL;
 }
 
-__device void bsdf_ashikhmin_velvet_blur(ShaderClosure *sc, float roughness)
+ccl_device void bsdf_ashikhmin_velvet_blur(ShaderClosure *sc, float roughness)
 {
 }
 
-__device float3 bsdf_ashikhmin_velvet_eval_reflect(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf)
+ccl_device float3 bsdf_ashikhmin_velvet_eval_reflect(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf)
 {
 	float m_invsigma2 = sc->data0;
 	float3 N = sc->N;
@@ -87,12 +87,12 @@ __device float3 bsdf_ashikhmin_velvet_eval_reflect(const ShaderClosure *sc, cons
 	return make_float3(0, 0, 0);
 }
 
-__device float3 bsdf_ashikhmin_velvet_eval_transmit(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf)
+ccl_device float3 bsdf_ashikhmin_velvet_eval_transmit(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf)
 {
 	return make_float3(0.0f, 0.0f, 0.0f);
 }
 
-__device int bsdf_ashikhmin_velvet_sample(const ShaderClosure *sc, float3 Ng, float3 I, float3 dIdx, float3 dIdy, float randu, float randv, float3 *eval, float3 *omega_in, float3 *domega_in_dx, float3 *domega_in_dy, float *pdf)
+ccl_device int bsdf_ashikhmin_velvet_sample(const ShaderClosure *sc, float3 Ng, float3 I, float3 dIdx, float3 dIdy, float randu, float randv, float3 *eval, float3 *omega_in, float3 *domega_in_dx, float3 *domega_in_dy, float *pdf)
 {
 	float m_invsigma2 = sc->data0;
 	float3 N = sc->N;
diff --git a/intern/cycles/kernel/closure/bsdf_diffuse.h b/intern/cycles/kernel/closure/bsdf_diffuse.h
index 46318ecd138..949fe869549 100644
--- a/intern/cycles/kernel/closure/bsdf_diffuse.h
+++ b/intern/cycles/kernel/closure/bsdf_diffuse.h
@@ -37,17 +37,17 @@ CCL_NAMESPACE_BEGIN
 
 /* DIFFUSE */
 
-__device int bsdf_diffuse_setup(ShaderClosure *sc)
+ccl_device int bsdf_diffuse_setup(ShaderClosure *sc)
 {
 	sc->type = CLOSURE_BSDF_DIFFUSE_ID;
 	return SD_BSDF|SD_BSDF_HAS_EVAL;
 }
 
-__device void bsdf_diffuse_blur(ShaderClosure *sc, float roughness)
+ccl_device void bsdf_diffuse_blur(ShaderClosure *sc, float roughness)
 {
 }
 
-__device float3 bsdf_diffuse_eval_reflect(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf)
+ccl_device float3 bsdf_diffuse_eval_reflect(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf)
 {
 	float3 N = sc->N;
 
@@ -56,12 +56,12 @@ __device float3 bsdf_diffuse_eval_reflect(const ShaderClosure *sc, const float3
 	return make_float3(cos_pi, cos_pi, cos_pi);
 }
 
-__device float3 bsdf_diffuse_eval_transmit(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf)
+ccl_device float3 bsdf_diffuse_eval_transmit(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf)
 {
 	return make_float3(0.0f, 0.0f, 0.0f);
 }
 
-__device int bsdf_diffuse_sample(const ShaderClosure *sc, float3 Ng, float3 I, float3 dIdx, float3 dIdy, float randu, float randv, float3 *eval, float3 *omega_in, float3 *domega_in_dx, float3 *domega_in_dy, float *pdf)
+ccl_device int bsdf_diffuse_sample(const ShaderClosure *sc, float3 Ng, float3 I, float3 dIdx, float3 dIdy, float randu, float randv, float3 *eval, float3 *omega_in, float3 *domega_in_dx, float3 *domega_in_dy, float *pdf)
 {
 	float3 N = sc->N;
 
@@ -84,22 +84,22 @@ __device int bsdf_diffuse_sample(const ShaderClosure *sc, float3 Ng, float3 I, f
 
 /* TRANSLUCENT */
 
-__device int bsdf_translucent_setup(ShaderClosure *sc)
+ccl_device int bsdf_translucent_setup(ShaderClosure *sc)
 {
 	sc->type = CLOSURE_BSDF_TRANSLUCENT_ID;
 	return SD_BSDF|SD_BSDF_HAS_EVAL;
 }
 
-__device void bsdf_translucent_blur(ShaderClosure *sc, float roughness)
+ccl_device void bsdf_translucent_blur(ShaderClosure *sc, float roughness)
 {
 }
 
-__device float3 bsdf_translucent_eval_reflect(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf)
+ccl_device float3 bsdf_translucent_eval_reflect(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf)
 {
 	return make_float3(0.0f, 0.0f, 0.0f);
 }
 
-__device float3 bsdf_translucent_eval_transmit(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf)
+ccl_device float3 bsdf_translucent_eval_transmit(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf)
 {
 	float3 N = sc->N;
 
@@ -108,12 +108,12 @@ __device float3 bsdf_translucent_eval_transmit(const ShaderClosure *sc, const fl
 	return make_float3 (cos_pi, cos_pi, cos_pi);
 }
 
-__device float bsdf_translucent_albedo(const ShaderClosure *sc, const float3 I)
+ccl_device float bsdf_translucent_albedo(const ShaderClosure *sc, const float3 I)
 {
 	return 1.0f;
 }
 
-__device int bsdf_translucent_sample(const ShaderClosure *sc, float3 Ng, float3 I, float3 dIdx, float3 dIdy, float randu, float randv, float3 *eval, float3 *omega_in, float3 *domega_in_dx, float3 *domega_in_dy, float *pdf)
+ccl_device int bsdf_translucent_sample(const ShaderClosure *sc, float3 Ng, float3 I, float3 dIdx, float3 dIdy, float randu, float randv, float3 *eval, float3 *omega_in, float3 *domega_in_dx, float3 *domega_in_dy, float *pdf)
 {
 	float3 N = sc->N;
 
diff --git a/intern/cycles/kernel/closure/bsdf_diffuse_ramp.h b/intern/cycles/kernel/closure/bsdf_diffuse_ramp.h
index 2e43e16693f..b856774375f 100644
--- a/intern/cycles/kernel/closure/bsdf_diffuse_ramp.h
+++ b/intern/cycles/kernel/closure/bsdf_diffuse_ramp.h
@@ -35,7 +35,7 @@
 
 CCL_NAMESPACE_BEGIN
 
-__device float3 bsdf_diffuse_ramp_get_color(const ShaderClosure *sc, const float3 colors[8], float pos)
+ccl_device float3 bsdf_diffuse_ramp_get_color(const ShaderClosure *sc, const float3 colors[8], float pos)
 {
 	int MAXCOLORS = 8;
 	
@@ -49,17 +49,17 @@ __device float3 bsdf_diffuse_ramp_get_color(const ShaderClosure *sc, const float
 	return colors[ipos] * (1.0f - offset) + colors[ipos+1] * offset;
 }
 
-__device int bsdf_diffuse_ramp_setup(ShaderClosure *sc)
+ccl_device int bsdf_diffuse_ramp_setup(ShaderClosure *sc)
 {
 	sc->type = CLOSURE_BSDF_DIFFUSE_RAMP_ID;
 	return SD_BSDF | SD_BSDF_HAS_EVAL;
 }
 
-__device void bsdf_diffuse_ramp_blur(ShaderClosure *sc, float roughness)
+ccl_device void bsdf_diffuse_ramp_blur(ShaderClosure *sc, float roughness)
 {
 }
 
-__device float3 bsdf_diffuse_ramp_eval_reflect(const ShaderClosure *sc, const float3 colors[8], const float3 I, const float3 omega_in, float *pdf)
+ccl_device float3 bsdf_diffuse_ramp_eval_reflect(const ShaderClosure *sc, const float3 colors[8], const float3 I, const float3 omega_in, float *pdf)
 {
 	float3 N = sc->N;
 
@@ -68,12 +68,12 @@ __device float3 bsdf_diffuse_ramp_eval_reflect(const ShaderClosure *sc, const fl
 	return bsdf_diffuse_ramp_get_color(sc, colors, cos_pi) * M_1_PI_F;
 }
 
-__device float3 bsdf_diffuse_ramp_eval_transmit(const ShaderClosure *sc, const float3 colors[8], const float3 I, const float3 omega_in, float *pdf)
+ccl_device float3 bsdf_diffuse_ramp_eval_transmit(const ShaderClosure *sc, const float3 colors[8], const float3 I, const float3 omega_in, float *pdf)
 {
 	return make_float3(0.0f, 0.0f, 0.0f);
 }
 
-__device int bsdf_diffuse_ramp_sample(const ShaderClosure *sc, const float3 colors[8], float3 Ng, float3 I, float3 dIdx, float3 dIdy, float randu, float randv, float3 *eval, float3 *omega_in, float3 *domega_in_dx, float3 *domega_in_dy, float *pdf)
+ccl_device int bsdf_diffuse_ramp_sample(const ShaderClosure *sc, const float3 colors[8], float3 Ng, float3 I, float3 dIdx, float3 dIdy, float randu, float randv, float3 *eval, float3 *omega_in, float3 *domega_in_dx, float3 *domega_in_dy, float *pdf)
 {
 	float3 N = sc->N;
 
diff --git a/intern/cycles/kernel/closure/bsdf_hair.h b/intern/cycles/kernel/closure/bsdf_hair.h
index 5825d2637ba..163e7cc5ee2 100644
--- a/intern/cycles/kernel/closure/bsdf_hair.h
+++ b/intern/cycles/kernel/closure/bsdf_hair.h
@@ -36,15 +36,15 @@
 CCL_NAMESPACE_BEGIN
 
 
-__device void bsdf_hair_reflection_blur(ShaderClosure *sc, float roughness)
+ccl_device void bsdf_hair_reflection_blur(ShaderClosure *sc, float roughness)
 {
 }
 
-__device void bsdf_hair_transmission_blur(ShaderClosure *sc, float roughness)
+ccl_device void bsdf_hair_transmission_blur(ShaderClosure *sc, float roughness)
 {
 }
 
-__device int bsdf_hair_reflection_setup(ShaderClosure *sc)
+ccl_device int bsdf_hair_reflection_setup(ShaderClosure *sc)
 {
 	sc->type = CLOSURE_BSDF_HAIR_REFLECTION_ID;
 	sc->data0 = clamp(sc->data0, 0.001f, 1.0f);
@@ -52,7 +52,7 @@ __device int bsdf_hair_reflection_setup(ShaderClosure *sc)
 	return SD_BSDF|SD_BSDF_HAS_EVAL|SD_BSDF_GLOSSY;
 }
 
-__device int bsdf_hair_transmission_setup(ShaderClosure *sc)
+ccl_device int bsdf_hair_transmission_setup(ShaderClosure *sc)
 {
 	sc->type = CLOSURE_BSDF_HAIR_TRANSMISSION_ID;
 	sc->data0 = clamp(sc->data0, 0.001f, 1.0f);
@@ -60,7 +60,7 @@ __device int bsdf_hair_transmission_setup(ShaderClosure *sc)
 	return SD_BSDF|SD_BSDF_HAS_EVAL|SD_BSDF_GLOSSY;
 }
 
-__device float3 bsdf_hair_reflection_eval_reflect(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf)
+ccl_device float3 bsdf_hair_reflection_eval_reflect(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf)
 {
 #ifdef __HAIR__
 	float offset = sc->offset;
@@ -106,18 +106,18 @@ __device float3 bsdf_hair_reflection_eval_reflect(const ShaderClosure *sc, const
 	return make_float3(*pdf, *pdf, *pdf);
 }
 
-__device float3 bsdf_hair_transmission_eval_reflect(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf)
+ccl_device float3 bsdf_hair_transmission_eval_reflect(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf)
 {
 	return make_float3(0.0f, 0.0f, 0.0f);
 }
 
 
-__device float3 bsdf_hair_reflection_eval_transmit(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf)
+ccl_device float3 bsdf_hair_reflection_eval_transmit(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf)
 {
 	return make_float3(0.0f, 0.0f, 0.0f);
 }
 
-__device float3 bsdf_hair_transmission_eval_transmit(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf)
+ccl_device float3 bsdf_hair_transmission_eval_transmit(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf)
 {
 #ifdef __HAIR__
 	float offset = sc->offset;
@@ -163,7 +163,7 @@ __device float3 bsdf_hair_transmission_eval_transmit(const ShaderClosure *sc, co
 	return make_float3(*pdf, *pdf, *pdf);
 }
 
-__device int bsdf_hair_reflection_sample(const ShaderClosure *sc, float3 Ng, float3 I, float3 dIdx, float3 dIdy, float randu, float randv, float3 *eval, float3 *omega_in, float3 *domega_in_dx, float3 *domega_in_dy, float *pdf)
+ccl_device int bsdf_hair_reflection_sample(const ShaderClosure *sc, float3 Ng, float3 I, float3 dIdx, float3 dIdy, float randu, float randv, float3 *eval, float3 *omega_in, float3 *domega_in_dx, float3 *domega_in_dy, float *pdf)
 {
 #ifdef __HAIR__
 	float offset = sc->offset;
@@ -218,7 +218,7 @@ __device int bsdf_hair_reflection_sample(const ShaderClosure *sc, float3 Ng, flo
 	return LABEL_REFLECT|LABEL_GLOSSY;
 }
 
-__device int bsdf_hair_transmission_sample(const ShaderClosure *sc, float3 Ng, float3 I, float3 dIdx, float3 dIdy, float randu, float randv, float3 *eval, float3 *omega_in, float3 *domega_in_dx, float3 *domega_in_dy, float *pdf)
+ccl_device int bsdf_hair_transmission_sample(const ShaderClosure *sc, float3 Ng, float3 I, float3 dIdx, float3 dIdy, float randu, float randv, float3 *eval, float3 *omega_in, float3 *domega_in_dx, float3 *domega_in_dy, float *pdf)
 {
 #ifdef __HAIR__
 	float offset = sc->offset;
diff --git a/intern/cycles/kernel/closure/bsdf_microfacet.h b/intern/cycles/kernel/closure/bsdf_microfacet.h
index b159f585831..737cffb0f18 100644
--- a/intern/cycles/kernel/closure/bsdf_microfacet.h
+++ b/intern/cycles/kernel/closure/bsdf_microfacet.h
@@ -37,7 +37,7 @@ CCL_NAMESPACE_BEGIN
 
 /* GGX */
 
-__device int bsdf_microfacet_ggx_setup(ShaderClosure *sc)
+ccl_device int bsdf_microfacet_ggx_setup(ShaderClosure *sc)
 {
 	sc->data0 = clamp(sc->data0, 0.0f, 1.0f); /* m_ag */
 	
@@ -46,7 +46,7 @@ __device int bsdf_microfacet_ggx_setup(ShaderClosure *sc)
 	return SD_BSDF|SD_BSDF_HAS_EVAL|SD_BSDF_GLOSSY;
 }
 
-__device int bsdf_microfacet_ggx_refraction_setup(ShaderClosure *sc)
+ccl_device int bsdf_microfacet_ggx_refraction_setup(ShaderClosure *sc)
 {
 	sc->data0 = clamp(sc->data0, 0.0f, 1.0f); /* m_ag */
 
@@ -55,12 +55,12 @@ __device int bsdf_microfacet_ggx_refraction_setup(ShaderClosure *sc)
 	return SD_BSDF|SD_BSDF_HAS_EVAL|SD_BSDF_GLOSSY;
 }
 
-__device void bsdf_microfacet_ggx_blur(ShaderClosure *sc, float roughness)
+ccl_device void bsdf_microfacet_ggx_blur(ShaderClosure *sc, float roughness)
 {
 	sc->data0 = fmaxf(roughness, sc->data0); /* m_ag */
 }
 
-__device float3 bsdf_microfacet_ggx_eval_reflect(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf)
+ccl_device float3 bsdf_microfacet_ggx_eval_reflect(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf)
 {
 	float m_ag = max(sc->data0, 1e-4f);
 	int m_refractive = sc->type == CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID;
@@ -97,7 +97,7 @@ __device float3 bsdf_microfacet_ggx_eval_reflect(const ShaderClosure *sc, const
 	return make_float3 (0, 0, 0);
 }
 
-__device float3 bsdf_microfacet_ggx_eval_transmit(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf)
+ccl_device float3 bsdf_microfacet_ggx_eval_transmit(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf)
 {
 	float m_ag = max(sc->data0, 1e-4f);
 	float m_eta = sc->data1;
@@ -134,7 +134,7 @@ __device float3 bsdf_microfacet_ggx_eval_transmit(const ShaderClosure *sc, const
 	return make_float3 (out, out, out);
 }
 
-__device int bsdf_microfacet_ggx_sample(const ShaderClosure *sc, float3 Ng, float3 I, float3 dIdx, float3 dIdy, float randu, float randv, float3 *eval, float3 *omega_in, float3 *domega_in_dx, float3 *domega_in_dy, float *pdf)
+ccl_device int bsdf_microfacet_ggx_sample(const ShaderClosure *sc, float3 Ng, float3 I, float3 dIdx, float3 dIdy, float randu, float randv, float3 *eval, float3 *omega_in, float3 *domega_in_dx, float3 *domega_in_dy, float *pdf)
 {
 	float m_ag = sc->data0;
 	int m_refractive = sc->type == CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID;
@@ -255,7 +255,7 @@ __device int bsdf_microfacet_ggx_sample(const ShaderClosure *sc, float3 Ng, floa
 
 /* BECKMANN */
 
-__device int bsdf_microfacet_beckmann_setup(ShaderClosure *sc)
+ccl_device int bsdf_microfacet_beckmann_setup(ShaderClosure *sc)
 {
 	sc->data0 = clamp(sc->data0, 0.0f, 1.0f); /* m_ab */
 
@@ -263,7 +263,7 @@ __device int bsdf_microfacet_beckmann_setup(ShaderClosure *sc)
 	return SD_BSDF|SD_BSDF_HAS_EVAL|SD_BSDF_GLOSSY;
 }
 
-__device int bsdf_microfacet_beckmann_refraction_setup(ShaderClosure *sc)
+ccl_device int bsdf_microfacet_beckmann_refraction_setup(ShaderClosure *sc)
 {
 	sc->data0 = clamp(sc->data0, 0.0f, 1.0f); /* m_ab */
 
@@ -271,12 +271,12 @@ __device int bsdf_microfacet_beckmann_refraction_setup(ShaderClosure *sc)
 	return SD_BSDF|SD_BSDF_HAS_EVAL|SD_BSDF_GLOSSY;
 }
 
-__device void bsdf_microfacet_beckmann_blur(ShaderClosure *sc, float roughness)
+ccl_device void bsdf_microfacet_beckmann_blur(ShaderClosure *sc, float roughness)
 {
 	sc->data0 = fmaxf(roughness, sc->data0); /* m_ab */
 }
 
-__device float3 bsdf_microfacet_beckmann_eval_reflect(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf)
+ccl_device float3 bsdf_microfacet_beckmann_eval_reflect(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf)
 {
 	float m_ab = max(sc->data0, 1e-4f);
 	int m_refractive = sc->type == CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID;
@@ -315,7 +315,7 @@ __device float3 bsdf_microfacet_beckmann_eval_reflect(const ShaderClosure *sc, c
 	return make_float3 (0, 0, 0);
 }
 
-__device float3 bsdf_microfacet_beckmann_eval_transmit(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf)
+ccl_device float3 bsdf_microfacet_beckmann_eval_transmit(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf)
 {
 	float m_ab = max(sc->data0, 1e-4f);
 	float m_eta = sc->data1;
@@ -354,7 +354,7 @@ __device float3 bsdf_microfacet_beckmann_eval_transmit(const ShaderClosure *sc,
 	return make_float3 (out, out, out);
 }
 
-__device int bsdf_microfacet_beckmann_sample(const ShaderClosure *sc, float3 Ng, float3 I, float3 dIdx, float3 dIdy, float randu, float randv, float3 *eval, float3 *omega_in, float3 *domega_in_dx, float3 *domega_in_dy, float *pdf)
+ccl_device int bsdf_microfacet_beckmann_sample(const ShaderClosure *sc, float3 Ng, float3 I, float3 dIdx, float3 dIdy, float randu, float randv, float3 *eval, float3 *omega_in, float3 *domega_in_dx, float3 *domega_in_dy, float *pdf)
 {
 	float m_ab = sc->data0;
 	int m_refractive = sc->type == CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID;
diff --git a/intern/cycles/kernel/closure/bsdf_oren_nayar.h b/intern/cycles/kernel/closure/bsdf_oren_nayar.h
index c6c6811c007..6f685d5eeea 100644
--- a/intern/cycles/kernel/closure/bsdf_oren_nayar.h
+++ b/intern/cycles/kernel/closure/bsdf_oren_nayar.h
@@ -19,7 +19,7 @@
 
 CCL_NAMESPACE_BEGIN
 
-__device float3 bsdf_oren_nayar_get_intensity(const ShaderClosure *sc, float3 n, float3 v, float3 l)
+ccl_device float3 bsdf_oren_nayar_get_intensity(const ShaderClosure *sc, float3 n, float3 v, float3 l)
 {
 	float nl = max(dot(n, l), 0.0f);
 	float nv = max(dot(n, v), 0.0f);
@@ -31,7 +31,7 @@ __device float3 bsdf_oren_nayar_get_intensity(const ShaderClosure *sc, float3 n,
 	return make_float3(is, is, is);
 }
 
-__device int bsdf_oren_nayar_setup(ShaderClosure *sc)
+ccl_device int bsdf_oren_nayar_setup(ShaderClosure *sc)
 {
 	float sigma = sc->data0;
 
@@ -47,11 +47,11 @@ __device int bsdf_oren_nayar_setup(ShaderClosure *sc)
 	return SD_BSDF | SD_BSDF_HAS_EVAL;
 }
 
-__device void bsdf_oren_nayar_blur(ShaderClosure *sc, float roughness)
+ccl_device void bsdf_oren_nayar_blur(ShaderClosure *sc, float roughness)
 {
 }
 
-__device float3 bsdf_oren_nayar_eval_reflect(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf)
+ccl_device float3 bsdf_oren_nayar_eval_reflect(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf)
 {
 	if (dot(sc->N, omega_in) > 0.0f) {
 		*pdf = 0.5f * M_1_PI_F;
@@ -63,12 +63,12 @@ __device float3 bsdf_oren_nayar_eval_reflect(const ShaderClosure *sc, const floa
 	}
 }
 
-__device float3 bsdf_oren_nayar_eval_transmit(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf)
+ccl_device float3 bsdf_oren_nayar_eval_transmit(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf)
 {
 	return make_float3(0.0f, 0.0f, 0.0f);
 }
 
-__device int bsdf_oren_nayar_sample(const ShaderClosure *sc, float3 Ng, float3 I, float3 dIdx, float3 dIdy, float randu, float randv, float3 *eval, float3 *omega_in, float3 *domega_in_dx, float3 *domega_in_dy, float *pdf)
+ccl_device int bsdf_oren_nayar_sample(const ShaderClosure *sc, float3 Ng, float3 I, float3 dIdx, float3 dIdy, float randu, float randv, float3 *eval, float3 *omega_in, float3 *domega_in_dx, float3 *domega_in_dy, float *pdf)
 {
 	sample_uniform_hemisphere(sc->N, randu, randv, omega_in, pdf);
 
diff --git a/intern/cycles/kernel/closure/bsdf_phong_ramp.h b/intern/cycles/kernel/closure/bsdf_phong_ramp.h
index 1e332933287..219c5aea159 100644
--- a/intern/cycles/kernel/closure/bsdf_phong_ramp.h
+++ b/intern/cycles/kernel/closure/bsdf_phong_ramp.h
@@ -35,7 +35,7 @@
 
 CCL_NAMESPACE_BEGIN
 
-__device float3 bsdf_phong_ramp_get_color(const ShaderClosure *sc, const float3 colors[8], float pos)
+ccl_device float3 bsdf_phong_ramp_get_color(const ShaderClosure *sc, const float3 colors[8], float pos)
 {
 	int MAXCOLORS = 8;
 	
@@ -49,7 +49,7 @@ __device float3 bsdf_phong_ramp_get_color(const ShaderClosure *sc, const float3
 	return colors[ipos] * (1.0f - offset) + colors[ipos+1] * offset;
 }
 
-__device int bsdf_phong_ramp_setup(ShaderClosure *sc)
+ccl_device int bsdf_phong_ramp_setup(ShaderClosure *sc)
 {
 	sc->data0 = max(sc->data0, 0.0f);
 	
@@ -57,11 +57,11 @@ __device int bsdf_phong_ramp_setup(ShaderClosure *sc)
 	return SD_BSDF | SD_BSDF_HAS_EVAL | SD_BSDF_GLOSSY;
 }
 
-__device void bsdf_phong_ramp_blur(ShaderClosure *sc, float roughness)
+ccl_device void bsdf_phong_ramp_blur(ShaderClosure *sc, float roughness)
 {
 }
 
-__device float3 bsdf_phong_ramp_eval_reflect(const ShaderClosure *sc, const float3 colors[8], const float3 I, const float3 omega_in, float *pdf)
+ccl_device float3 bsdf_phong_ramp_eval_reflect(const ShaderClosure *sc, const float3 colors[8], const float3 I, const float3 omega_in, float *pdf)
 {
 	float m_exponent = sc->data0;
 	float cosNI = dot(sc->N, omega_in);
@@ -83,12 +83,12 @@ __device float3 bsdf_phong_ramp_eval_reflect(const ShaderClosure *sc, const floa
 	return make_float3(0.0f, 0.0f, 0.0f);
 }
 
-__device float3 bsdf_phong_ramp_eval_transmit(const ShaderClosure *sc, const float3 colors[8], const float3 I, const float3 omega_in, float *pdf)
+ccl_device float3 bsdf_phong_ramp_eval_transmit(const ShaderClosure *sc, const float3 colors[8], const float3 I, const float3 omega_in, float *pdf)
 {
 	return make_float3(0.0f, 0.0f, 0.0f);
 }
 
-__device int bsdf_phong_ramp_sample(const ShaderClosure *sc, const float3 colors[8], float3 Ng, float3 I, float3 dIdx, float3 dIdy, float randu, float randv, float3 *eval, float3 *omega_in, float3 *domega_in_dx, float3 *domega_in_dy, float *pdf)
+ccl_device int bsdf_phong_ramp_sample(const ShaderClosure *sc, const float3 colors[8], float3 Ng, float3 I, float3 dIdx, float3 dIdy, float randu, float randv, float3 *eval, float3 *omega_in, float3 *domega_in_dx, float3 *domega_in_dy, float *pdf)
 {
 	float cosNO = dot(sc->N, I);
 	float m_exponent = sc->data0;
diff --git a/intern/cycles/kernel/closure/bsdf_reflection.h b/intern/cycles/kernel/closure/bsdf_reflection.h
index 7715aac936f..0baccdf155c 100644
--- a/intern/cycles/kernel/closure/bsdf_reflection.h
+++ b/intern/cycles/kernel/closure/bsdf_reflection.h
@@ -37,27 +37,27 @@ CCL_NAMESPACE_BEGIN
 
 /* REFLECTION */
 
-__device int bsdf_reflection_setup(ShaderClosure *sc)
+ccl_device int bsdf_reflection_setup(ShaderClosure *sc)
 {
 	sc->type = CLOSURE_BSDF_REFLECTION_ID;
 	return SD_BSDF;
 }
 
-__device void bsdf_reflection_blur(ShaderClosure *sc, float roughness)
+ccl_device void bsdf_reflection_blur(ShaderClosure *sc, float roughness)
 {
 }
 
-__device float3 bsdf_reflection_eval_reflect(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf)
+ccl_device float3 bsdf_reflection_eval_reflect(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf)
 {
 	return make_float3(0.0f, 0.0f, 0.0f);
 }
 
-__device float3 bsdf_reflection_eval_transmit(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf)
+ccl_device float3 bsdf_reflection_eval_transmit(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf)
 {
 	return make_float3(0.0f, 0.0f, 0.0f);
 }
 
-__device int bsdf_reflection_sample(const ShaderClosure *sc, float3 Ng, float3 I, float3 dIdx, float3 dIdy, float randu, float randv, float3 *eval, float3 *omega_in, float3 *domega_in_dx, float3 *domega_in_dy, float *pdf)
+ccl_device int bsdf_reflection_sample(const ShaderClosure *sc, float3 Ng, float3 I, float3 dIdx, float3 dIdy, float randu, float randv, float3 *eval, float3 *omega_in, float3 *domega_in_dx, float3 *domega_in_dy, float *pdf)
 {
 	float3 N = sc->N;
 
diff --git a/intern/cycles/kernel/closure/bsdf_refraction.h b/intern/cycles/kernel/closure/bsdf_refraction.h
index 8565c99d04e..c4698b42060 100644
--- a/intern/cycles/kernel/closure/bsdf_refraction.h
+++ b/intern/cycles/kernel/closure/bsdf_refraction.h
@@ -37,27 +37,27 @@ CCL_NAMESPACE_BEGIN
 
 /* REFRACTION */
 
-__device int bsdf_refraction_setup(ShaderClosure *sc)
+ccl_device int bsdf_refraction_setup(ShaderClosure *sc)
 {
 	sc->type = CLOSURE_BSDF_REFRACTION_ID;
 	return SD_BSDF;
 }
 
-__device void bsdf_refraction_blur(ShaderClosure *sc, float roughness)
+ccl_device void bsdf_refraction_blur(ShaderClosure *sc, float roughness)
 {
 }
 
-__device float3 bsdf_refraction_eval_reflect(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf)
+ccl_device float3 bsdf_refraction_eval_reflect(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf)
 {
 	return make_float3(0.0f, 0.0f, 0.0f);
 }
 
-__device float3 bsdf_refraction_eval_transmit(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf)
+ccl_device float3 bsdf_refraction_eval_transmit(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf)
 {
 	return make_float3(0.0f, 0.0f, 0.0f);
 }
 
-__device int bsdf_refraction_sample(const ShaderClosure *sc, float3 Ng, float3 I, float3 dIdx, float3 dIdy, float randu, float randv, float3 *eval, float3 *omega_in, float3 *domega_in_dx, float3 *domega_in_dy, float *pdf)
+ccl_device int bsdf_refraction_sample(const ShaderClosure *sc, float3 Ng, float3 I, float3 dIdx, float3 dIdy, float randu, float randv, float3 *eval, float3 *omega_in, float3 *domega_in_dx, float3 *domega_in_dy, float *pdf)
 {
 	float m_eta = sc->data0;
 	float3 N = sc->N;
diff --git a/intern/cycles/kernel/closure/bsdf_toon.h b/intern/cycles/kernel/closure/bsdf_toon.h
index e69981dba77..797fa4227ae 100644
--- a/intern/cycles/kernel/closure/bsdf_toon.h
+++ b/intern/cycles/kernel/closure/bsdf_toon.h
@@ -37,7 +37,7 @@ CCL_NAMESPACE_BEGIN
 
 /* DIFFUSE TOON */
 
-__device int bsdf_diffuse_toon_setup(ShaderClosure *sc)
+ccl_device int bsdf_diffuse_toon_setup(ShaderClosure *sc)
 {
 	sc->type = CLOSURE_BSDF_DIFFUSE_TOON_ID;
 	sc->data0 = clamp(sc->data0, 0.0f, 1.0f);
@@ -46,11 +46,11 @@ __device int bsdf_diffuse_toon_setup(ShaderClosure *sc)
 	return SD_BSDF|SD_BSDF_HAS_EVAL;
 }
 
-__device void bsdf_diffuse_toon_blur(ShaderClosure *sc, float roughness)
+ccl_device void bsdf_diffuse_toon_blur(ShaderClosure *sc, float roughness)
 {
 }
 
-__device float3 bsdf_toon_get_intensity(float max_angle, float smooth, float angle)
+ccl_device float3 bsdf_toon_get_intensity(float max_angle, float smooth, float angle)
 {
 	float is;
 
@@ -64,12 +64,12 @@ __device float3 bsdf_toon_get_intensity(float max_angle, float smooth, float ang
 	return make_float3(is, is, is);
 }
 
-__device float bsdf_toon_get_sample_angle(float max_angle, float smooth)
+ccl_device float bsdf_toon_get_sample_angle(float max_angle, float smooth)
 {
 	return fminf(max_angle + smooth, M_PI_2_F);
 }
 
-__device float3 bsdf_diffuse_toon_eval_reflect(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf)
+ccl_device float3 bsdf_diffuse_toon_eval_reflect(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf)
 {
 	float max_angle = sc->data0*M_PI_2_F;
 	float smooth = sc->data1*M_PI_2_F;
@@ -87,12 +87,12 @@ __device float3 bsdf_diffuse_toon_eval_reflect(const ShaderClosure *sc, const fl
 	return make_float3(0.0f, 0.0f, 0.0f);
 }
 
-__device float3 bsdf_diffuse_toon_eval_transmit(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf)
+ccl_device float3 bsdf_diffuse_toon_eval_transmit(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf)
 {
 	return make_float3(0.0f, 0.0f, 0.0f);
 }
 
-__device int bsdf_diffuse_toon_sample(const ShaderClosure *sc, float3 Ng, float3 I, float3 dIdx, float3 dIdy, float randu, float randv, float3 *eval, float3 *omega_in, float3 *domega_in_dx, float3 *domega_in_dy, float *pdf)
+ccl_device int bsdf_diffuse_toon_sample(const ShaderClosure *sc, float3 Ng, float3 I, float3 dIdx, float3 dIdy, float randu, float randv, float3 *eval, float3 *omega_in, float3 *domega_in_dx, float3 *domega_in_dy, float *pdf)
 {
 	float max_angle = sc->data0*M_PI_2_F;
 	float smooth = sc->data1*M_PI_2_F;
@@ -121,7 +121,7 @@ __device int bsdf_diffuse_toon_sample(const ShaderClosure *sc, float3 Ng, float3
 
 /* GLOSSY TOON */
 
-__device int bsdf_glossy_toon_setup(ShaderClosure *sc)
+ccl_device int bsdf_glossy_toon_setup(ShaderClosure *sc)
 {
 	sc->type = CLOSURE_BSDF_GLOSSY_TOON_ID;
 	sc->data0 = clamp(sc->data0, 0.0f, 1.0f);
@@ -130,11 +130,11 @@ __device int bsdf_glossy_toon_setup(ShaderClosure *sc)
 	return SD_BSDF|SD_BSDF_HAS_EVAL;
 }
 
-__device void bsdf_glossy_toon_blur(ShaderClosure *sc, float roughness)
+ccl_device void bsdf_glossy_toon_blur(ShaderClosure *sc, float roughness)
 {
 }
 
-__device float3 bsdf_glossy_toon_eval_reflect(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf)
+ccl_device float3 bsdf_glossy_toon_eval_reflect(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf)
 {
 	float max_angle = sc->data0*M_PI_2_F;
 	float smooth = sc->data1*M_PI_2_F;
@@ -158,12 +158,12 @@ __device float3 bsdf_glossy_toon_eval_reflect(const ShaderClosure *sc, const flo
 	return make_float3(0.0f, 0.0f, 0.0f);
 }
 
-__device float3 bsdf_glossy_toon_eval_transmit(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf)
+ccl_device float3 bsdf_glossy_toon_eval_transmit(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf)
 {
 	return make_float3(0.0f, 0.0f, 0.0f);
 }
 
-__device int bsdf_glossy_toon_sample(const ShaderClosure *sc, float3 Ng, float3 I, float3 dIdx, float3 dIdy, float randu, float randv, float3 *eval, float3 *omega_in, float3 *domega_in_dx, float3 *domega_in_dy, float *pdf)
+ccl_device int bsdf_glossy_toon_sample(const ShaderClosure *sc, float3 Ng, float3 I, float3 dIdx, float3 dIdy, float randu, float randv, float3 *eval, float3 *omega_in, float3 *domega_in_dx, float3 *domega_in_dy, float *pdf)
 {
 	float max_angle = sc->data0*M_PI_2_F;
 	float smooth = sc->data1*M_PI_2_F;
diff --git a/intern/cycles/kernel/closure/bsdf_transparent.h b/intern/cycles/kernel/closure/bsdf_transparent.h
index 81bc7690b50..e62aecf3da6 100644
--- a/intern/cycles/kernel/closure/bsdf_transparent.h
+++ b/intern/cycles/kernel/closure/bsdf_transparent.h
@@ -35,27 +35,27 @@
 
 CCL_NAMESPACE_BEGIN
 
-__device int bsdf_transparent_setup(ShaderClosure *sc)
+ccl_device int bsdf_transparent_setup(ShaderClosure *sc)
 {
 	sc->type = CLOSURE_BSDF_TRANSPARENT_ID;
 	return SD_BSDF;
 }
 
-__device void bsdf_transparent_blur(ShaderClosure *sc, float roughness)
+ccl_device void bsdf_transparent_blur(ShaderClosure *sc, float roughness)
 {
 }
 
-__device float3 bsdf_transparent_eval_reflect(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf)
+ccl_device float3 bsdf_transparent_eval_reflect(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf)
 {
 	return make_float3(0.0f, 0.0f, 0.0f);
 }
 
-__device float3 bsdf_transparent_eval_transmit(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf)
+ccl_device float3 bsdf_transparent_eval_transmit(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf)
 {
 	return make_float3(0.0f, 0.0f, 0.0f);
 }
 
-__device int bsdf_transparent_sample(const ShaderClosure *sc, float3 Ng, float3 I, float3 dIdx, float3 dIdy, float randu, float randv, float3 *eval, float3 *omega_in, float3 *domega_in_dx, float3 *domega_in_dy, float *pdf)
+ccl_device int bsdf_transparent_sample(const ShaderClosure *sc, float3 Ng, float3 I, float3 dIdx, float3 dIdy, float randu, float randv, float3 *eval, float3 *omega_in, float3 *domega_in_dx, float3 *domega_in_dy, float *pdf)
 {
 	// only one direction is possible
 	*omega_in = -I;
diff --git a/intern/cycles/kernel/closure/bsdf_util.h b/intern/cycles/kernel/closure/bsdf_util.h
index 5ac26055e8d..f6dceb3ca82 100644
--- a/intern/cycles/kernel/closure/bsdf_util.h
+++ b/intern/cycles/kernel/closure/bsdf_util.h
@@ -35,7 +35,7 @@
 
 CCL_NAMESPACE_BEGIN
 
-__device float fresnel_dielectric(float eta, const float3 N,
+ccl_device float fresnel_dielectric(float eta, const float3 N,
 		const float3 I, float3 *R, float3 *T,
 #ifdef __RAY_DIFFERENTIALS__
 		const float3 dIdx, const float3 dIdy,
@@ -95,7 +95,7 @@ __device float fresnel_dielectric(float eta, const float3 N,
 	}
 }
 
-__device float fresnel_dielectric_cos(float cosi, float eta)
+ccl_device float fresnel_dielectric_cos(float cosi, float eta)
 {
 	// compute fresnel reflectance without explicitly computing
 	// the refracted direction
@@ -110,7 +110,7 @@ __device float fresnel_dielectric_cos(float cosi, float eta)
 	return 1.0f; // TIR(no refracted component)
 }
 
-__device float fresnel_conductor(float cosi, float eta, float k)
+ccl_device float fresnel_conductor(float cosi, float eta, float k)
 {
 	float tmp_f = eta * eta + k * k;
 	float tmp = tmp_f * cosi * cosi;
@@ -121,7 +121,7 @@ __device float fresnel_conductor(float cosi, float eta, float k)
 	return(Rparl2 + Rperp2) * 0.5f;
 }
 
-__device float smooth_step(float edge0, float edge1, float x)
+ccl_device float smooth_step(float edge0, float edge1, float x)
 {
 	float result;
 	if(x < edge0) result = 0.0f;
diff --git a/intern/cycles/kernel/closure/bsdf_ward.h b/intern/cycles/kernel/closure/bsdf_ward.h
index 0e5b0c544c7..c9de615a011 100644
--- a/intern/cycles/kernel/closure/bsdf_ward.h
+++ b/intern/cycles/kernel/closure/bsdf_ward.h
@@ -37,7 +37,7 @@ CCL_NAMESPACE_BEGIN
 
 /* WARD */
 
-__device int bsdf_ward_setup(ShaderClosure *sc)
+ccl_device int bsdf_ward_setup(ShaderClosure *sc)
 {
 	sc->data0 = clamp(sc->data0, 1e-4f, 1.0f); /* m_ax */
 	sc->data1 = clamp(sc->data1, 1e-4f, 1.0f); /* m_ay */
@@ -46,13 +46,13 @@ __device int bsdf_ward_setup(ShaderClosure *sc)
 	return SD_BSDF|SD_BSDF_HAS_EVAL|SD_BSDF_GLOSSY;
 }
 
-__device void bsdf_ward_blur(ShaderClosure *sc, float roughness)
+ccl_device void bsdf_ward_blur(ShaderClosure *sc, float roughness)
 {
 	sc->data0 = fmaxf(roughness, sc->data0); /* m_ax */
 	sc->data1 = fmaxf(roughness, sc->data1); /* m_ay */
 }
 
-__device float3 bsdf_ward_eval_reflect(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf)
+ccl_device float3 bsdf_ward_eval_reflect(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf)
 {
 	float m_ax = sc->data0;
 	float m_ay = sc->data1;
@@ -87,12 +87,12 @@ __device float3 bsdf_ward_eval_reflect(const ShaderClosure *sc, const float3 I,
 	return make_float3 (0, 0, 0);
 }
 
-__device float3 bsdf_ward_eval_transmit(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf)
+ccl_device float3 bsdf_ward_eval_transmit(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf)
 {
 	return make_float3(0.0f, 0.0f, 0.0f);
 }
 
-__device int bsdf_ward_sample(const ShaderClosure *sc, float3 Ng, float3 I, float3 dIdx, float3 dIdy, float randu, float randv, float3 *eval, float3 *omega_in, float3 *domega_in_dx, float3 *domega_in_dy, float *pdf)
+ccl_device int bsdf_ward_sample(const ShaderClosure *sc, float3 Ng, float3 I, float3 dIdx, float3 dIdy, float randu, float randv, float3 *eval, float3 *omega_in, float3 *domega_in_dx, float3 *domega_in_dy, float *pdf)
 {
 	float m_ax = sc->data0;
 	float m_ay = sc->data1;
diff --git a/intern/cycles/kernel/closure/bsdf_westin.h b/intern/cycles/kernel/closure/bsdf_westin.h
index e1a6b031d5e..ca4c05e91fe 100644
--- a/intern/cycles/kernel/closure/bsdf_westin.h
+++ b/intern/cycles/kernel/closure/bsdf_westin.h
@@ -37,7 +37,7 @@ CCL_NAMESPACE_BEGIN
 
 /* WESTIN BACKSCATTER */
 
-__device int bsdf_westin_backscatter_setup(ShaderClosure *sc)
+ccl_device int bsdf_westin_backscatter_setup(ShaderClosure *sc)
 {
 	float roughness = sc->data0;
 	roughness = clamp(roughness, 1e-5f, 1.0f);
@@ -49,14 +49,14 @@ __device int bsdf_westin_backscatter_setup(ShaderClosure *sc)
 	return SD_BSDF|SD_BSDF_HAS_EVAL|SD_BSDF_GLOSSY;
 }
 
-__device void bsdf_westin_backscatter_blur(ShaderClosure *sc, float roughness)
+ccl_device void bsdf_westin_backscatter_blur(ShaderClosure *sc, float roughness)
 {
 	float m_invroughness = sc->data0;
 	m_invroughness = min(1.0f/roughness, m_invroughness);
 	sc->data0 = m_invroughness;
 }
 
-__device float3 bsdf_westin_backscatter_eval_reflect(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf)
+ccl_device float3 bsdf_westin_backscatter_eval_reflect(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf)
 {
 	float m_invroughness = sc->data0;
 	float3 N = sc->N;
@@ -73,12 +73,12 @@ __device float3 bsdf_westin_backscatter_eval_reflect(const ShaderClosure *sc, co
 	return make_float3 (0, 0, 0);
 }
 
-__device float3 bsdf_westin_backscatter_eval_transmit(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf)
+ccl_device float3 bsdf_westin_backscatter_eval_transmit(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf)
 {
 	return make_float3(0.0f, 0.0f, 0.0f);
 }
 
-__device int bsdf_westin_backscatter_sample(const ShaderClosure *sc, float3 Ng, float3 I, float3 dIdx, float3 dIdy, float randu, float randv, float3 *eval, float3 *omega_in, float3 *domega_in_dx, float3 *domega_in_dy, float *pdf)
+ccl_device int bsdf_westin_backscatter_sample(const ShaderClosure *sc, float3 Ng, float3 I, float3 dIdx, float3 dIdy, float randu, float randv, float3 *eval, float3 *omega_in, float3 *domega_in_dx, float3 *domega_in_dy, float *pdf)
 {
 	float m_invroughness = sc->data0;
 	float3 N = sc->N;
@@ -116,18 +116,18 @@ __device int bsdf_westin_backscatter_sample(const ShaderClosure *sc, float3 Ng,
 
 /* WESTIN SHEEN */
 
-__device int bsdf_westin_sheen_setup(ShaderClosure *sc)
+ccl_device int bsdf_westin_sheen_setup(ShaderClosure *sc)
 {
 	/* float edginess = sc->data0; */
 	sc->type = CLOSURE_BSDF_WESTIN_SHEEN_ID;
 	return SD_BSDF|SD_BSDF_HAS_EVAL|SD_BSDF_GLOSSY;
 }
 
-__device void bsdf_westin_sheen_blur(ShaderClosure *sc, float roughness)
+ccl_device void bsdf_westin_sheen_blur(ShaderClosure *sc, float roughness)
 {
 }
 
-__device float3 bsdf_westin_sheen_eval_reflect(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf)
+ccl_device float3 bsdf_westin_sheen_eval_reflect(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf)
 {
 	float m_edginess = sc->data0;
 	float3 N = sc->N;
@@ -144,12 +144,12 @@ __device float3 bsdf_westin_sheen_eval_reflect(const ShaderClosure *sc, const fl
 	return make_float3 (0, 0, 0);
 }
 
-__device float3 bsdf_westin_sheen_eval_transmit(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf)
+ccl_device float3 bsdf_westin_sheen_eval_transmit(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf)
 {
 	return make_float3(0.0f, 0.0f, 0.0f);
 }
 
-__device int bsdf_westin_sheen_sample(const ShaderClosure *sc, float3 Ng, float3 I, float3 dIdx, float3 dIdy, float randu, float randv, float3 *eval, float3 *omega_in, float3 *domega_in_dx, float3 *domega_in_dy, float *pdf)
+ccl_device int bsdf_westin_sheen_sample(const ShaderClosure *sc, float3 Ng, float3 I, float3 dIdx, float3 dIdy, float randu, float randv, float3 *eval, float3 *omega_in, float3 *domega_in_dx, float3 *domega_in_dy, float *pdf)
 {
 	float m_edginess = sc->data0;
 	float3 N = sc->N;
diff --git a/intern/cycles/kernel/closure/bssrdf.h b/intern/cycles/kernel/closure/bssrdf.h
index 4ceff655dd5..3849dedc3b6 100644
--- a/intern/cycles/kernel/closure/bssrdf.h
+++ b/intern/cycles/kernel/closure/bssrdf.h
@@ -19,7 +19,7 @@
 
 CCL_NAMESPACE_BEGIN
 
-__device int bssrdf_setup(ShaderClosure *sc, ClosureType type)
+ccl_device int bssrdf_setup(ShaderClosure *sc, ClosureType type)
 {
 	if(sc->data0 < BSSRDF_MIN_RADIUS) {
 		/* revert to diffuse BSDF if radius too small */
@@ -47,7 +47,7 @@ __device int bssrdf_setup(ShaderClosure *sc, ClosureType type)
 /* paper suggests 1/12.46 which is much too small, suspect it's *12.46 */
 #define GAUSS_TRUNCATE 12.46f
 
-__device float bssrdf_gaussian_eval(ShaderClosure *sc, float r)
+ccl_device float bssrdf_gaussian_eval(ShaderClosure *sc, float r)
 {
 	/* integrate (2*pi*r * exp(-r*r/(2*v)))/(2*pi*v)) from 0 to Rm
 	 * = 1 - exp(-Rm*Rm/(2*v)) */
@@ -60,7 +60,7 @@ __device float bssrdf_gaussian_eval(ShaderClosure *sc, float r)
 	return expf(-r*r/(2.0f*v))/(2.0f*M_PI_F*v);
 }
 
-__device float bssrdf_gaussian_pdf(ShaderClosure *sc, float r)
+ccl_device float bssrdf_gaussian_pdf(ShaderClosure *sc, float r)
 {
 	/* 1.0 - expf(-Rm*Rm/(2*v)) simplified */
 	const float area_truncated = 1.0f - expf(-0.5f*GAUSS_TRUNCATE);
@@ -68,7 +68,7 @@ __device float bssrdf_gaussian_pdf(ShaderClosure *sc, float r)
 	return bssrdf_gaussian_eval(sc, r) * (1.0f/(area_truncated));
 }
 
-__device void bssrdf_gaussian_sample(ShaderClosure *sc, float xi, float *r, float *h)
+ccl_device void bssrdf_gaussian_sample(ShaderClosure *sc, float xi, float *r, float *h)
 {
 	/* xi = integrate (2*pi*r * exp(-r*r/(2*v)))/(2*pi*v)) = -exp(-r^2/(2*v))
 	 * r = sqrt(-2*v*logf(xi)) */
@@ -94,7 +94,7 @@ __device void bssrdf_gaussian_sample(ShaderClosure *sc, float xi, float *r, floa
  * far as I can tell has no closed form solution. So we get an iterative solution
  * instead with newton-raphson. */
 
-__device float bssrdf_cubic_eval(ShaderClosure *sc, float r)
+ccl_device float bssrdf_cubic_eval(ShaderClosure *sc, float r)
 {
 	const float sharpness = sc->T.x;
 
@@ -141,13 +141,13 @@ __device float bssrdf_cubic_eval(ShaderClosure *sc, float r)
 	}
 }
 
-__device float bssrdf_cubic_pdf(ShaderClosure *sc, float r)
+ccl_device float bssrdf_cubic_pdf(ShaderClosure *sc, float r)
 {
 	return bssrdf_cubic_eval(sc, r);
 }
 
 /* solve 10x^2 - 20x^3 + 15x^4 - 4x^5 - xi == 0 */
-__device float bssrdf_cubic_quintic_root_find(float xi)
+ccl_device float bssrdf_cubic_quintic_root_find(float xi)
 {
 	/* newton-raphson iteration, usually succeeds in 2-4 iterations, except
 	 * outside 0.02 ... 0.98 where it can go up to 10, so overall performance
@@ -174,7 +174,7 @@ __device float bssrdf_cubic_quintic_root_find(float xi)
 	return x;
 }
 
-__device void bssrdf_cubic_sample(ShaderClosure *sc, float xi, float *r, float *h)
+ccl_device void bssrdf_cubic_sample(ShaderClosure *sc, float xi, float *r, float *h)
 {
 	float Rm = sc->data0;
 	float r_ = bssrdf_cubic_quintic_root_find(xi);
@@ -196,13 +196,13 @@ __device void bssrdf_cubic_sample(ShaderClosure *sc, float xi, float *r, float *
  * 
  * Samples distributed over disk with no falloff, for reference. */
 
-__device float bssrdf_none_eval(ShaderClosure *sc, float r)
+ccl_device float bssrdf_none_eval(ShaderClosure *sc, float r)
 {
 	const float Rm = sc->data0;
 	return (r < Rm)? 1.0f: 0.0f;
 }
 
-__device float bssrdf_none_pdf(ShaderClosure *sc, float r)
+ccl_device float bssrdf_none_pdf(ShaderClosure *sc, float r)
 {
 	/* integrate (2*pi*r)/(pi*Rm*Rm) from 0 to Rm = 1 */
 	const float Rm = sc->data0;
@@ -211,7 +211,7 @@ __device float bssrdf_none_pdf(ShaderClosure *sc, float r)
 	return bssrdf_none_eval(sc, r) / area;
 }
 
-__device void bssrdf_none_sample(ShaderClosure *sc, float xi, float *r, float *h)
+ccl_device void bssrdf_none_sample(ShaderClosure *sc, float xi, float *r, float *h)
 {
 	/* xi = integrate (2*pi*r)/(pi*Rm*Rm) = r^2/Rm^2
 	 * r = sqrt(xi)*Rm */
@@ -226,7 +226,7 @@ __device void bssrdf_none_sample(ShaderClosure *sc, float xi, float *r, float *h
 
 /* Generic */
 
-__device void bssrdf_sample(ShaderClosure *sc, float xi, float *r, float *h)
+ccl_device void bssrdf_sample(ShaderClosure *sc, float xi, float *r, float *h)
 {
 	if(sc->type == CLOSURE_BSSRDF_CUBIC_ID)
 		bssrdf_cubic_sample(sc, xi, r, h);
@@ -234,7 +234,7 @@ __device void bssrdf_sample(ShaderClosure *sc, float xi, float *r, float *h)
 		bssrdf_gaussian_sample(sc, xi, r, h);
 }
 
-__device float bssrdf_pdf(ShaderClosure *sc, float r)
+ccl_device float bssrdf_pdf(ShaderClosure *sc, float r)
 {
 	if(sc->type == CLOSURE_BSSRDF_CUBIC_ID)
 		return bssrdf_cubic_pdf(sc, r);
diff --git a/intern/cycles/kernel/closure/emissive.h b/intern/cycles/kernel/closure/emissive.h
index 33b1b695a9a..c534df373bd 100644
--- a/intern/cycles/kernel/closure/emissive.h
+++ b/intern/cycles/kernel/closure/emissive.h
@@ -37,19 +37,19 @@ CCL_NAMESPACE_BEGIN
 /* return the probability distribution function in the direction I,
  * given the parameters and the light's surface normal.  This MUST match
  * the PDF computed by sample(). */
-__device float emissive_pdf(const float3 Ng, const float3 I)
+ccl_device float emissive_pdf(const float3 Ng, const float3 I)
 {
 	float cosNO = fabsf(dot(Ng, I));
 	return (cosNO > 0.0f)? 1.0f: 0.0f;
 }
 
-__device void emissive_sample(const float3 Ng, float randu, float randv,
+ccl_device void emissive_sample(const float3 Ng, float randu, float randv,
 	float3 *omega_out, float *pdf)
 {
 	/* todo: not implemented and used yet */
 }
 
-__device float3 emissive_simple_eval(const float3 Ng, const float3 I)
+ccl_device float3 emissive_simple_eval(const float3 Ng, const float3 I)
 {
 	float res = emissive_pdf(Ng, I);
 	
diff --git a/intern/cycles/kernel/closure/volume.h b/intern/cycles/kernel/closure/volume.h
index ddaf939984e..f30b30c8c76 100644
--- a/intern/cycles/kernel/closure/volume.h
+++ b/intern/cycles/kernel/closure/volume.h
@@ -21,7 +21,7 @@ CCL_NAMESPACE_BEGIN
 
 /* ISOTROPIC VOLUME CLOSURE */
 
-__device int volume_isotropic_setup(ShaderClosure *sc, float density)
+ccl_device int volume_isotropic_setup(ShaderClosure *sc, float density)
 {
 	sc->type = CLOSURE_VOLUME_ISOTROPIC_ID;
 	sc->data0 = density;
@@ -29,14 +29,14 @@ __device int volume_isotropic_setup(ShaderClosure *sc, float density)
 	return SD_VOLUME;
 }
 
-__device float3 volume_isotropic_eval_phase(const ShaderClosure *sc, const float3 omega_in, const float3 omega_out)
+ccl_device float3 volume_isotropic_eval_phase(const ShaderClosure *sc, const float3 omega_in, const float3 omega_out)
 {
 	return make_float3(1.0f, 1.0f, 1.0f);
 }
 
 /* TRANSPARENT VOLUME CLOSURE */
 
-__device int volume_transparent_setup(ShaderClosure *sc, float density)
+ccl_device int volume_transparent_setup(ShaderClosure *sc, float density)
 {
 	sc->type = CLOSURE_VOLUME_TRANSPARENT_ID;
 	sc->data0 = density;
@@ -44,14 +44,14 @@ __device int volume_transparent_setup(ShaderClosure *sc, float density)
 	return SD_VOLUME;
 }
 
-__device float3 volume_transparent_eval_phase(const ShaderClosure *sc, const float3 omega_in, const float3 omega_out)
+ccl_device float3 volume_transparent_eval_phase(const ShaderClosure *sc, const float3 omega_in, const float3 omega_out)
 {
 	return make_float3(1.0f, 1.0f, 1.0f);
 }
 
 /* VOLUME CLOSURE */
 
-__device float3 volume_eval_phase(KernelGlobals *kg, const ShaderClosure *sc, const float3 omega_in, const float3 omega_out)
+ccl_device float3 volume_eval_phase(KernelGlobals *kg, const ShaderClosure *sc, const float3 omega_in, const float3 omega_out)
 {
 #ifdef __OSL__
 	if(kg->osl && sc->prim)
diff --git a/intern/cycles/kernel/kernel.cl b/intern/cycles/kernel/kernel.cl
index 28e72d78731..6988ad6027f 100644
--- a/intern/cycles/kernel/kernel.cl
+++ b/intern/cycles/kernel/kernel.cl
@@ -26,12 +26,12 @@
 #include "kernel_displace.h"
 
 __kernel void kernel_ocl_path_trace(
-	__constant KernelData *data,
-	__global float *buffer,
-	__global uint *rng_state,
+	ccl_constant KernelData *data,
+	ccl_global float *buffer,
+	ccl_global uint *rng_state,
 
 #define KERNEL_TEX(type, ttype, name) \
-	__global type *name,
+	ccl_global type *name,
 #include "kernel_textures.h"
 
 	int sample,
@@ -53,12 +53,12 @@ __kernel void kernel_ocl_path_trace(
 }
 
 __kernel void kernel_ocl_convert_to_byte(
-	__constant KernelData *data,
-	__global uchar4 *rgba,
-	__global float *buffer,
+	ccl_constant KernelData *data,
+	ccl_global uchar4 *rgba,
+	ccl_global float *buffer,
 
 #define KERNEL_TEX(type, ttype, name) \
-	__global type *name,
+	ccl_global type *name,
 #include "kernel_textures.h"
 
 	float sample_scale,
@@ -80,12 +80,12 @@ __kernel void kernel_ocl_convert_to_byte(
 }
 
 __kernel void kernel_ocl_convert_to_half_float(
-	__constant KernelData *data,
-	__global uchar4 *rgba,
-	__global float *buffer,
+	ccl_constant KernelData *data,
+	ccl_global uchar4 *rgba,
+	ccl_global float *buffer,
 
 #define KERNEL_TEX(type, ttype, name) \
-	__global type *name,
+	ccl_global type *name,
 #include "kernel_textures.h"
 
 	float sample_scale,
@@ -107,12 +107,12 @@ __kernel void kernel_ocl_convert_to_half_float(
 }
 
 __kernel void kernel_ocl_shader(
-	__constant KernelData *data,
-	__global uint4 *input,
-	__global float4 *output,
+	ccl_constant KernelData *data,
+	ccl_global uint4 *input,
+	ccl_global float4 *output,
 
 #define KERNEL_TEX(type, ttype, name) \
-	__global type *name,
+	ccl_global type *name,
 #include "kernel_textures.h"
 
 	int type, int sx, int sw)
diff --git a/intern/cycles/kernel/kernel_accumulate.h b/intern/cycles/kernel/kernel_accumulate.h
index d7531be0d8a..f4febd7cf2c 100644
--- a/intern/cycles/kernel/kernel_accumulate.h
+++ b/intern/cycles/kernel/kernel_accumulate.h
@@ -21,7 +21,7 @@ CCL_NAMESPACE_BEGIN
  * BSDF evaluation result, split per BSDF type. This is used to accumulate
  * render passes separately. */
 
-__device_inline void bsdf_eval_init(BsdfEval *eval, ClosureType type, float3 value, int use_light_pass)
+ccl_device_inline void bsdf_eval_init(BsdfEval *eval, ClosureType type, float3 value, int use_light_pass)
 {
 #ifdef __PASSES__
 	eval->use_light_pass = use_light_pass;
@@ -51,7 +51,7 @@ __device_inline void bsdf_eval_init(BsdfEval *eval, ClosureType type, float3 val
 #endif
 }
 
-__device_inline void bsdf_eval_accum(BsdfEval *eval, ClosureType type, float3 value)
+ccl_device_inline void bsdf_eval_accum(BsdfEval *eval, ClosureType type, float3 value)
 {
 #ifdef __PASSES__
 	if(eval->use_light_pass) {
@@ -73,7 +73,7 @@ __device_inline void bsdf_eval_accum(BsdfEval *eval, ClosureType type, float3 va
 #endif
 }
 
-__device_inline bool bsdf_eval_is_zero(BsdfEval *eval)
+ccl_device_inline bool bsdf_eval_is_zero(BsdfEval *eval)
 {
 #ifdef __PASSES__
 	if(eval->use_light_pass) {
@@ -90,7 +90,7 @@ __device_inline bool bsdf_eval_is_zero(BsdfEval *eval)
 #endif
 }
 
-__device_inline void bsdf_eval_mul(BsdfEval *eval, float3 value)
+ccl_device_inline void bsdf_eval_mul(BsdfEval *eval, float3 value)
 {
 #ifdef __PASSES__
 	if(eval->use_light_pass) {
@@ -115,7 +115,7 @@ __device_inline void bsdf_eval_mul(BsdfEval *eval, float3 value)
  * visible as the first non-transparent hit, while indirectly visible are the
  * bounces after that. */
 
-__device_inline void path_radiance_init(PathRadiance *L, int use_light_pass)
+ccl_device_inline void path_radiance_init(PathRadiance *L, int use_light_pass)
 {
 	/* clear all */
 #ifdef __PASSES__
@@ -159,7 +159,7 @@ __device_inline void path_radiance_init(PathRadiance *L, int use_light_pass)
 #endif
 }
 
-__device_inline void path_radiance_bsdf_bounce(PathRadiance *L, float3 *throughput,
+ccl_device_inline void path_radiance_bsdf_bounce(PathRadiance *L, float3 *throughput,
 	BsdfEval *bsdf_eval, float bsdf_pdf, int bounce, int bsdf_label)
 {
 	float inverse_pdf = 1.0f/bsdf_pdf;
@@ -192,7 +192,7 @@ __device_inline void path_radiance_bsdf_bounce(PathRadiance *L, float3 *throughp
 #endif
 }
 
-__device_inline void path_radiance_accum_emission(PathRadiance *L, float3 throughput, float3 value, int bounce)
+ccl_device_inline void path_radiance_accum_emission(PathRadiance *L, float3 throughput, float3 value, int bounce)
 {
 #ifdef __PASSES__
 	if(L->use_light_pass) {
@@ -210,7 +210,7 @@ __device_inline void path_radiance_accum_emission(PathRadiance *L, float3 throug
 #endif
 }
 
-__device_inline void path_radiance_accum_ao(PathRadiance *L, float3 throughput, float3 alpha, float3 bsdf, float3 ao, int bounce)
+ccl_device_inline void path_radiance_accum_ao(PathRadiance *L, float3 throughput, float3 alpha, float3 bsdf, float3 ao, int bounce)
 {
 #ifdef __PASSES__
 	if(L->use_light_pass) {
@@ -231,7 +231,7 @@ __device_inline void path_radiance_accum_ao(PathRadiance *L, float3 throughput,
 #endif
 }
 
-__device_inline void path_radiance_accum_light(PathRadiance *L, float3 throughput, BsdfEval *bsdf_eval, float3 shadow, float shadow_fac, int bounce, bool is_lamp)
+ccl_device_inline void path_radiance_accum_light(PathRadiance *L, float3 throughput, BsdfEval *bsdf_eval, float3 shadow, float shadow_fac, int bounce, bool is_lamp)
 {
 #ifdef __PASSES__
 	if(L->use_light_pass) {
@@ -261,7 +261,7 @@ __device_inline void path_radiance_accum_light(PathRadiance *L, float3 throughpu
 #endif
 }
 
-__device_inline void path_radiance_accum_background(PathRadiance *L, float3 throughput, float3 value, int bounce)
+ccl_device_inline void path_radiance_accum_background(PathRadiance *L, float3 throughput, float3 value, int bounce)
 {
 #ifdef __PASSES__
 	if(L->use_light_pass) {
@@ -279,7 +279,7 @@ __device_inline void path_radiance_accum_background(PathRadiance *L, float3 thro
 #endif
 }
 
-__device_inline void path_radiance_sum_indirect(PathRadiance *L)
+ccl_device_inline void path_radiance_sum_indirect(PathRadiance *L)
 {
 #ifdef __PASSES__
 	/* this division is a bit ugly, but means we only have to keep track of
@@ -301,7 +301,7 @@ __device_inline void path_radiance_sum_indirect(PathRadiance *L)
 #endif
 }
 
-__device_inline void path_radiance_reset_indirect(PathRadiance *L)
+ccl_device_inline void path_radiance_reset_indirect(PathRadiance *L)
 {
 #ifdef __PASSES__
 	if(L->use_light_pass) {
@@ -316,7 +316,7 @@ __device_inline void path_radiance_reset_indirect(PathRadiance *L)
 #endif
 }
 
-__device_inline float3 path_radiance_sum(KernelGlobals *kg, PathRadiance *L)
+ccl_device_inline float3 path_radiance_sum(KernelGlobals *kg, PathRadiance *L)
 {
 #ifdef __PASSES__
 	if(L->use_light_pass) {
@@ -338,7 +338,7 @@ __device_inline float3 path_radiance_sum(KernelGlobals *kg, PathRadiance *L)
 #endif
 }
 
-__device_inline void path_radiance_clamp(PathRadiance *L, float3 *L_sum, float clamp)
+ccl_device_inline void path_radiance_clamp(PathRadiance *L, float3 *L_sum, float clamp)
 {
 	float sum = fabsf((*L_sum).x) + fabsf((*L_sum).y) + fabsf((*L_sum).z);
 
diff --git a/intern/cycles/kernel/kernel_bvh.h b/intern/cycles/kernel/kernel_bvh.h
index 44a9822c103..5aae4111fca 100644
--- a/intern/cycles/kernel/kernel_bvh.h
+++ b/intern/cycles/kernel/kernel_bvh.h
@@ -42,7 +42,7 @@ CCL_NAMESPACE_BEGIN
 #define NO_EXTENDED_PRECISION volatile
 #endif
 
-__device_inline float3 bvh_inverse_direction(float3 dir)
+ccl_device_inline float3 bvh_inverse_direction(float3 dir)
 {
 	/* avoid divide by zero (ooeps = exp2f(-80.0f)) */
 	float ooeps = 0.00000000000000000000000082718061255302767487140869206996285356581211090087890625f;
@@ -55,7 +55,7 @@ __device_inline float3 bvh_inverse_direction(float3 dir)
 	return idir;
 }
 
-__device_inline void bvh_instance_push(KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *idir, float *t, const float tmax)
+ccl_device_inline void bvh_instance_push(KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *idir, float *t, const float tmax)
 {
 	Transform tfm = object_fetch_transform(kg, object, OBJECT_INVERSE_TRANSFORM);
 
@@ -72,7 +72,7 @@ __device_inline void bvh_instance_push(KernelGlobals *kg, int object, const Ray
 		*t *= len;
 }
 
-__device_inline void bvh_instance_pop(KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *idir, float *t, const float tmax)
+ccl_device_inline void bvh_instance_pop(KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *idir, float *t, const float tmax)
 {
 	if(*t != FLT_MAX) {
 		Transform tfm = object_fetch_transform(kg, object, OBJECT_TRANSFORM);
@@ -84,7 +84,7 @@ __device_inline void bvh_instance_pop(KernelGlobals *kg, int object, const Ray *
 }
 
 #ifdef __OBJECT_MOTION__
-__device_inline void bvh_instance_motion_push(KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *idir, float *t, Transform *tfm, const float tmax)
+ccl_device_inline void bvh_instance_motion_push(KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *idir, float *t, Transform *tfm, const float tmax)
 {
 	Transform itfm;
 	*tfm = object_fetch_transform_motion_test(kg, object, ray->time, &itfm);
@@ -102,7 +102,7 @@ __device_inline void bvh_instance_motion_push(KernelGlobals *kg, int object, con
 		*t *= len;
 }
 
-__device_inline void bvh_instance_motion_pop(KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *idir, float *t, Transform *tfm, const float tmax)
+ccl_device_inline void bvh_instance_motion_pop(KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *idir, float *t, Transform *tfm, const float tmax)
 {
 	if(*t != FLT_MAX)
 		*t *= len(transform_direction(tfm, 1.0f/(*idir)));
@@ -113,7 +113,7 @@ __device_inline void bvh_instance_motion_pop(KernelGlobals *kg, int object, cons
 #endif
 
 /* Sven Woop's algorithm */
-__device_inline bool bvh_triangle_intersect(KernelGlobals *kg, Intersection *isect,
+ccl_device_inline bool bvh_triangle_intersect(KernelGlobals *kg, Intersection *isect,
 	float3 P, float3 idir, uint visibility, int object, int triAddr)
 {
 	/* compute and check intersection t-value */
@@ -161,7 +161,7 @@ __device_inline bool bvh_triangle_intersect(KernelGlobals *kg, Intersection *ise
 }
 
 #ifdef __HAIR__
-__device_inline void curvebounds(float *lower, float *upper, float *extremta, float *extrema, float *extremtb, float *extremb, float p0, float p1, float p2, float p3)
+ccl_device_inline void curvebounds(float *lower, float *upper, float *extremta, float *extrema, float *extremtb, float *extremb, float p0, float p1, float p2, float p3)
 {
 	float halfdiscroot = (p2 * p2 - 3 * p3 * p1);
 	float ta = -1.0f;
@@ -211,7 +211,7 @@ __device_inline void curvebounds(float *lower, float *upper, float *extremta, fl
 	}
 }
 
-__device_inline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersection *isect,
+ccl_device_inline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersection *isect,
 	float3 P, float3 idir, uint visibility, int object, int curveAddr, int segment, uint *lcg_state, float difl, float extmax)
 {
 	float epsilon = 0.0f;
@@ -520,7 +520,7 @@ __device_inline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersectio
 	return hit;
 }
 
-__device_inline bool bvh_curve_intersect(KernelGlobals *kg, Intersection *isect,
+ccl_device_inline bool bvh_curve_intersect(KernelGlobals *kg, Intersection *isect,
 	float3 P, float3 idir, uint visibility, int object, int curveAddr, int segment, uint *lcg_state, float difl, float extmax)
 {
 	/* curve Intersection check */
@@ -689,7 +689,7 @@ __device_inline bool bvh_curve_intersect(KernelGlobals *kg, Intersection *isect,
  * only want to intersect with primitives in the same object, and if case of
  * multiple hits we pick a single random primitive as the intersection point. */
 
-__device_inline void bvh_triangle_intersect_subsurface(KernelGlobals *kg, Intersection *isect_array,
+ccl_device_inline void bvh_triangle_intersect_subsurface(KernelGlobals *kg, Intersection *isect_array,
 	float3 P, float3 idir, int object, int triAddr, float tmax, uint *num_hits, uint *lcg_state, int max_hits)
 {
 	/* compute and check intersection t-value */
@@ -811,9 +811,9 @@ __device_inline void bvh_triangle_intersect_subsurface(KernelGlobals *kg, Inters
 
 /* to work around titan bug when using arrays instead of textures */
 #if !defined(__KERNEL_CUDA__) || defined(__KERNEL_CUDA_TEX_STORAGE__)
-__device_inline
+ccl_device_inline
 #else
-__device_noinline
+ccl_device_noinline
 #endif
 #ifdef __HAIR__ 
 bool scene_intersect(KernelGlobals *kg, const Ray *ray, const uint visibility, Intersection *isect, uint *lcg_state, float difl, float extmax)
@@ -859,9 +859,9 @@ bool scene_intersect(KernelGlobals *kg, const Ray *ray, const uint visibility, I
 /* to work around titan bug when using arrays instead of textures */
 #ifdef __SUBSURFACE__
 #if !defined(__KERNEL_CUDA__) || defined(__KERNEL_CUDA_TEX_STORAGE__)
-__device_inline
+ccl_device_inline
 #else
-__device_noinline
+ccl_device_noinline
 #endif
 uint scene_intersect_subsurface(KernelGlobals *kg, const Ray *ray, Intersection *isect, int subsurface_object, uint *lcg_state, int max_hits)
 {
@@ -903,7 +903,7 @@ uint scene_intersect_subsurface(KernelGlobals *kg, const Ray *ray, Intersection
 
 /* Ray offset to avoid self intersection */
 
-__device_inline float3 ray_offset(float3 P, float3 Ng)
+ccl_device_inline float3 ray_offset(float3 P, float3 Ng)
 {
 #ifdef __INTERSECTION_REFINE__
 	const float epsilon_f = 1e-5f;
@@ -955,7 +955,7 @@ __device_inline float3 ray_offset(float3 P, float3 Ng)
  * far the precision is often not so good, this reintersects the primitive from
  * a closer distance. */
 
-__device_inline float3 bvh_triangle_refine(KernelGlobals *kg, ShaderData *sd, const Intersection *isect, const Ray *ray)
+ccl_device_inline float3 bvh_triangle_refine(KernelGlobals *kg, ShaderData *sd, const Intersection *isect, const Ray *ray)
 {
 	float3 P = ray->P;
 	float3 D = ray->D;
@@ -1000,7 +1000,7 @@ __device_inline float3 bvh_triangle_refine(KernelGlobals *kg, ShaderData *sd, co
 }
 
 /* same as above, except that isect->t is assumed to be in object space for instancing */
-__device_inline float3 bvh_triangle_refine_subsurface(KernelGlobals *kg, ShaderData *sd, const Intersection *isect, const Ray *ray)
+ccl_device_inline float3 bvh_triangle_refine_subsurface(KernelGlobals *kg, ShaderData *sd, const Intersection *isect, const Ray *ray)
 {
 	float3 P = ray->P;
 	float3 D = ray->D;
@@ -1046,7 +1046,7 @@ __device_inline float3 bvh_triangle_refine_subsurface(KernelGlobals *kg, ShaderD
 
 #ifdef __HAIR__
 
-__device_inline float3 curvetangent(float t, float3 p0, float3 p1, float3 p2, float3 p3)
+ccl_device_inline float3 curvetangent(float t, float3 p0, float3 p1, float3 p2, float3 p3)
 {
 	float fc = 0.71f;
 	float data[4];
@@ -1058,7 +1058,7 @@ __device_inline float3 curvetangent(float t, float3 p0, float3 p1, float3 p2, fl
 	return data[0] * p0 + data[1] * p1 + data[2] * p2 + data[3] * p3;
 }
 
-__device_inline float3 curvepoint(float t, float3 p0, float3 p1, float3 p2, float3 p3)
+ccl_device_inline float3 curvepoint(float t, float3 p0, float3 p1, float3 p2, float3 p3)
 {
 	float data[4];
 	float fc = 0.71f;
@@ -1071,7 +1071,7 @@ __device_inline float3 curvepoint(float t, float3 p0, float3 p1, float3 p2, floa
 	return data[0] * p0 + data[1] * p1 + data[2] * p2 + data[3] * p3;
 }
 
-__device_inline float3 bvh_curve_refine(KernelGlobals *kg, ShaderData *sd, const Intersection *isect, const Ray *ray, float t)
+ccl_device_inline float3 bvh_curve_refine(KernelGlobals *kg, ShaderData *sd, const Intersection *isect, const Ray *ray, float t)
 {
 	int flag = kernel_data.curve.curveflags;
 	float3 P = ray->P;
diff --git a/intern/cycles/kernel/kernel_bvh_subsurface.h b/intern/cycles/kernel/kernel_bvh_subsurface.h
index 4446c1821d5..fb41bdcfa37 100644
--- a/intern/cycles/kernel/kernel_bvh_subsurface.h
+++ b/intern/cycles/kernel/kernel_bvh_subsurface.h
@@ -28,7 +28,7 @@
 
 #define FEATURE(f) (((BVH_FUNCTION_FEATURES) & (f)) != 0)
 
-__device uint BVH_FUNCTION_NAME(KernelGlobals *kg, const Ray *ray, Intersection *isect_array,
+ccl_device uint BVH_FUNCTION_NAME(KernelGlobals *kg, const Ray *ray, Intersection *isect_array,
 	int subsurface_object, uint *lcg_state, int max_hits)
 {
 	/* todo:
diff --git a/intern/cycles/kernel/kernel_bvh_traversal.h b/intern/cycles/kernel/kernel_bvh_traversal.h
index a9264f318eb..8f69083575b 100644
--- a/intern/cycles/kernel/kernel_bvh_traversal.h
+++ b/intern/cycles/kernel/kernel_bvh_traversal.h
@@ -30,7 +30,7 @@
 
 #define FEATURE(f) (((BVH_FUNCTION_FEATURES) & (f)) != 0)
 
-__device bool BVH_FUNCTION_NAME
+ccl_device bool BVH_FUNCTION_NAME
 (KernelGlobals *kg, const Ray *ray, Intersection *isect, const uint visibility
 #if FEATURE(BVH_HAIR_MINIMUM_WIDTH)
 , uint *lcg_state, float difl, float extmax
diff --git a/intern/cycles/kernel/kernel_camera.h b/intern/cycles/kernel/kernel_camera.h
index 966f28df05f..887b1afddd4 100644
--- a/intern/cycles/kernel/kernel_camera.h
+++ b/intern/cycles/kernel/kernel_camera.h
@@ -18,7 +18,7 @@ CCL_NAMESPACE_BEGIN
 
 /* Perspective Camera */
 
-__device float2 camera_sample_aperture(KernelGlobals *kg, float u, float v)
+ccl_device float2 camera_sample_aperture(KernelGlobals *kg, float u, float v)
 {
 	float blades = kernel_data.cam.blades;
 
@@ -33,7 +33,7 @@ __device float2 camera_sample_aperture(KernelGlobals *kg, float u, float v)
 	}
 }
 
-__device void camera_sample_perspective(KernelGlobals *kg, float raster_x, float raster_y, float lens_u, float lens_v, Ray *ray)
+ccl_device void camera_sample_perspective(KernelGlobals *kg, float raster_x, float raster_y, float lens_u, float lens_v, Ray *ray)
 {
 	/* create ray form raster position */
 	Transform rastertocamera = kernel_data.cam.rastertocamera;
@@ -91,7 +91,7 @@ __device void camera_sample_perspective(KernelGlobals *kg, float raster_x, float
 
 /* Orthographic Camera */
 
-__device void camera_sample_orthographic(KernelGlobals *kg, float raster_x, float raster_y, float lens_u, float lens_v, Ray *ray)
+ccl_device void camera_sample_orthographic(KernelGlobals *kg, float raster_x, float raster_y, float lens_u, float lens_v, Ray *ray)
 {
 	/* create ray form raster position */
 	Transform rastertocamera = kernel_data.cam.rastertocamera;
@@ -147,7 +147,7 @@ __device void camera_sample_orthographic(KernelGlobals *kg, float raster_x, floa
 
 /* Panorama Camera */
 
-__device void camera_sample_panorama(KernelGlobals *kg, float raster_x, float raster_y, float lens_u, float lens_v, Ray *ray)
+ccl_device void camera_sample_panorama(KernelGlobals *kg, float raster_x, float raster_y, float lens_u, float lens_v, Ray *ray)
 {
 	Transform rastertocamera = kernel_data.cam.rastertocamera;
 	float3 Pcamera = transform_perspective(&rastertocamera, make_float3(raster_x, raster_y, 0.0f));
@@ -216,7 +216,7 @@ __device void camera_sample_panorama(KernelGlobals *kg, float raster_x, float ra
 
 /* Common */
 
-__device void camera_sample(KernelGlobals *kg, int x, int y, float filter_u, float filter_v,
+ccl_device void camera_sample(KernelGlobals *kg, int x, int y, float filter_u, float filter_v,
 	float lens_u, float lens_v, float time, Ray *ray)
 {
 	/* pixel filter */
@@ -243,13 +243,13 @@ __device void camera_sample(KernelGlobals *kg, int x, int y, float filter_u, flo
 
 /* Utilities */
 
-__device_inline float3 camera_position(KernelGlobals *kg)
+ccl_device_inline float3 camera_position(KernelGlobals *kg)
 {
 	Transform cameratoworld = kernel_data.cam.cameratoworld;
 	return make_float3(cameratoworld.x.w, cameratoworld.y.w, cameratoworld.z.w);
 }
 
-__device_inline float camera_distance(KernelGlobals *kg, float3 P)
+ccl_device_inline float camera_distance(KernelGlobals *kg, float3 P)
 {
 	Transform cameratoworld = kernel_data.cam.cameratoworld;
 	float3 camP = make_float3(cameratoworld.x.w, cameratoworld.y.w, cameratoworld.z.w);
@@ -262,7 +262,7 @@ __device_inline float camera_distance(KernelGlobals *kg, float3 P)
 		return len(P - camP);
 }
 
-__device_inline float3 camera_world_to_ndc(KernelGlobals *kg, ShaderData *sd, float3 P)
+ccl_device_inline float3 camera_world_to_ndc(KernelGlobals *kg, ShaderData *sd, float3 P)
 {
 	if(kernel_data.cam.type != CAMERA_PANORAMA) {
 		/* perspective / ortho */
diff --git a/intern/cycles/kernel/kernel_compat_cuda.h b/intern/cycles/kernel/kernel_compat_cuda.h
index 44c2b9effe9..76f885aefe0 100644
--- a/intern/cycles/kernel/kernel_compat_cuda.h
+++ b/intern/cycles/kernel/kernel_compat_cuda.h
@@ -27,13 +27,12 @@
 
 /* Qualifier wrappers for different names on different devices */
 
-#define __device  __device__ __inline__
-#define __device_inline  __device__ __inline__
-#define __device_noinline  __device__ __noinline__
-#define __global
-#define __shared __shared__
-#define __constant
-#define __may_alias
+#define ccl_device  __device__ __inline__
+#define ccl_device_inline  __device__ __inline__
+#define ccl_device_noinline  __device__ __noinline__
+#define ccl_global
+#define ccl_constant
+#define ccl_may_alias
 
 /* No assert supported for CUDA */
 
diff --git a/intern/cycles/kernel/kernel_compat_opencl.h b/intern/cycles/kernel/kernel_compat_opencl.h
index e0102a01146..1ff3615e448 100644
--- a/intern/cycles/kernel/kernel_compat_opencl.h
+++ b/intern/cycles/kernel/kernel_compat_opencl.h
@@ -33,16 +33,18 @@
 #endif
 
 #ifdef __CL_NOINLINE__
-#define __noinline __attribute__((noinline))
+#define ccl_noinline __attribute__((noinline))
 #else
-#define __noinline
+#define ccl_noinline
 #endif
 
 /* in opencl all functions are device functions, so leave this empty */
-#define __device
-#define __device_inline __device
-#define __device_noinline  __device __noinline
-#define __may_alias
+#define ccl_device
+#define ccl_device_inline ccl_device
+#define ccl_device_noinline ccl_device ccl_noinline
+#define ccl_may_alias
+#define ccl_constant __constant
+#define ccl_global __global
 
 /* no assert in opencl */
 #define kernel_assert(cond)
diff --git a/intern/cycles/kernel/kernel_curve.h b/intern/cycles/kernel/kernel_curve.h
index 9f7a1388a2b..821ac50eaa9 100644
--- a/intern/cycles/kernel/kernel_curve.h
+++ b/intern/cycles/kernel/kernel_curve.h
@@ -20,7 +20,7 @@ CCL_NAMESPACE_BEGIN
 
 /* curve attributes */
 
-__device float curve_attribute_float(KernelGlobals *kg, const ShaderData *sd, AttributeElement elem, int offset, float *dx, float *dy)
+ccl_device float curve_attribute_float(KernelGlobals *kg, const ShaderData *sd, AttributeElement elem, int offset, float *dx, float *dy)
 {
 	if(elem == ATTR_ELEMENT_CURVE) {
 #ifdef __RAY_DIFFERENTIALS__
@@ -55,7 +55,7 @@ __device float curve_attribute_float(KernelGlobals *kg, const ShaderData *sd, At
 	}
 }
 
-__device float3 curve_attribute_float3(KernelGlobals *kg, const ShaderData *sd, AttributeElement elem, int offset, float3 *dx, float3 *dy)
+ccl_device float3 curve_attribute_float3(KernelGlobals *kg, const ShaderData *sd, AttributeElement elem, int offset, float3 *dx, float3 *dy)
 {
 	if(elem == ATTR_ELEMENT_CURVE) {
 		/* idea: we can't derive any useful differentials here, but for tiled
@@ -96,7 +96,7 @@ __device float3 curve_attribute_float3(KernelGlobals *kg, const ShaderData *sd,
 
 /* hair info node functions */
 
-__device float curve_thickness(KernelGlobals *kg, ShaderData *sd)
+ccl_device float curve_thickness(KernelGlobals *kg, ShaderData *sd)
 {
 	float r = 0.0f;
 
@@ -113,7 +113,7 @@ __device float curve_thickness(KernelGlobals *kg, ShaderData *sd)
 	return r*2.0f;
 }
 
-__device float3 curve_tangent_normal(KernelGlobals *kg, ShaderData *sd)
+ccl_device float3 curve_tangent_normal(KernelGlobals *kg, ShaderData *sd)
 {	
 	float3 tgN = make_float3(0.0f,0.0f,0.0f);
 
diff --git a/intern/cycles/kernel/kernel_differential.h b/intern/cycles/kernel/kernel_differential.h
index 71d6e87a4d9..daba2d927b7 100644
--- a/intern/cycles/kernel/kernel_differential.h
+++ b/intern/cycles/kernel/kernel_differential.h
@@ -18,7 +18,7 @@ CCL_NAMESPACE_BEGIN
 
 /* See "Tracing Ray Differentials", Homan Igehy, 1999. */
 
-__device void differential_transfer(differential3 *dP_, const differential3 dP, float3 D, const differential3 dD, float3 Ng, float t)
+ccl_device void differential_transfer(differential3 *dP_, const differential3 dP, float3 D, const differential3 dD, float3 Ng, float t)
 {
 	/* ray differential transfer through homogeneous medium, to
 	 * compute dPdx/dy at a shading point from the incoming ray */
@@ -31,7 +31,7 @@ __device void differential_transfer(differential3 *dP_, const differential3 dP,
 	dP_->dy = tmpy - dot(tmpy, Ng)*tmp;
 }
 
-__device void differential_incoming(differential3 *dI, const differential3 dD)
+ccl_device void differential_incoming(differential3 *dI, const differential3 dD)
 {
 	/* compute dIdx/dy at a shading point, we just need to negate the
 	 * differential of the ray direction */
@@ -40,7 +40,7 @@ __device void differential_incoming(differential3 *dI, const differential3 dD)
 	dI->dy = -dD.dy;
 }
 
-__device void differential_dudv(differential *du, differential *dv, float3 dPdu, float3 dPdv, differential3 dP, float3 Ng)
+ccl_device void differential_dudv(differential *du, differential *dv, float3 dPdu, float3 dPdv, differential3 dP, float3 Ng)
 {
 	/* now we have dPdx/dy from the ray differential transfer, and dPdu/dv
 	 * from the primitive, we can compute dudx/dy and dvdx/dy. these are
@@ -84,7 +84,7 @@ __device void differential_dudv(differential *du, differential *dv, float3 dPdu,
 	dv->dy = (dP.dy.y*dPdu.x - dP.dy.x*dPdu.y)*det;
 }
 
-__device differential differential_zero()
+ccl_device differential differential_zero()
 {
 	differential d;
 	d.dx = 0.0f;
@@ -93,7 +93,7 @@ __device differential differential_zero()
 	return d;
 }
 
-__device differential3 differential3_zero()
+ccl_device differential3 differential3_zero()
 {
 	differential3 d;
 	d.dx = make_float3(0.0f, 0.0f, 0.0f);
diff --git a/intern/cycles/kernel/kernel_displace.h b/intern/cycles/kernel/kernel_displace.h
index 38152b5571e..c50e2166660 100644
--- a/intern/cycles/kernel/kernel_displace.h
+++ b/intern/cycles/kernel/kernel_displace.h
@@ -16,7 +16,7 @@
 
 CCL_NAMESPACE_BEGIN
 
-__device void kernel_shader_evaluate(KernelGlobals *kg, __global uint4 *input, __global float4 *output, ShaderEvalType type, int i)
+ccl_device void kernel_shader_evaluate(KernelGlobals *kg, ccl_global uint4 *input, ccl_global float4 *output, ShaderEvalType type, int i)
 {
 	ShaderData sd;
 	uint4 in = input[i];
diff --git a/intern/cycles/kernel/kernel_emission.h b/intern/cycles/kernel/kernel_emission.h
index 0c8d69fb594..2ce0b758972 100644
--- a/intern/cycles/kernel/kernel_emission.h
+++ b/intern/cycles/kernel/kernel_emission.h
@@ -18,7 +18,7 @@ CCL_NAMESPACE_BEGIN
 
 /* Direction Emission */
 
-__device_noinline float3 direct_emissive_eval(KernelGlobals *kg, float rando,
+ccl_device_noinline float3 direct_emissive_eval(KernelGlobals *kg, float rando,
 	LightSample *ls, float u, float v, float3 I, differential3 dI, float t, float time, int bounce)
 {
 	/* setup shading at emitter */
@@ -70,7 +70,7 @@ __device_noinline float3 direct_emissive_eval(KernelGlobals *kg, float rando,
 	return eval;
 }
 
-__device_noinline bool direct_emission(KernelGlobals *kg, ShaderData *sd, int lindex,
+ccl_device_noinline bool direct_emission(KernelGlobals *kg, ShaderData *sd, int lindex,
 	float randt, float rando, float randu, float randv, Ray *ray, BsdfEval *eval,
 	bool *is_lamp, int bounce)
 {
@@ -160,7 +160,7 @@ __device_noinline bool direct_emission(KernelGlobals *kg, ShaderData *sd, int li
 
 /* Indirect Primitive Emission */
 
-__device_noinline float3 indirect_primitive_emission(KernelGlobals *kg, ShaderData *sd, float t, int path_flag, float bsdf_pdf)
+ccl_device_noinline float3 indirect_primitive_emission(KernelGlobals *kg, ShaderData *sd, float t, int path_flag, float bsdf_pdf)
 {
 	/* evaluate emissive closure */
 	float3 L = shader_emissive_eval(kg, sd);
@@ -183,7 +183,7 @@ __device_noinline float3 indirect_primitive_emission(KernelGlobals *kg, ShaderDa
 
 /* Indirect Lamp Emission */
 
-__device_noinline bool indirect_lamp_emission(KernelGlobals *kg, Ray *ray, int path_flag, float bsdf_pdf, float randt, float3 *emission, int bounce)
+ccl_device_noinline bool indirect_lamp_emission(KernelGlobals *kg, Ray *ray, int path_flag, float bsdf_pdf, float randt, float3 *emission, int bounce)
 {
 	LightSample ls;
 	int lamp = lamp_light_eval_sample(kg, randt);
@@ -222,7 +222,7 @@ __device_noinline bool indirect_lamp_emission(KernelGlobals *kg, Ray *ray, int p
 
 /* Indirect Background */
 
-__device_noinline float3 indirect_background(KernelGlobals *kg, Ray *ray, int path_flag, float bsdf_pdf, int bounce)
+ccl_device_noinline float3 indirect_background(KernelGlobals *kg, Ray *ray, int path_flag, float bsdf_pdf, int bounce)
 {
 #ifdef __BACKGROUND__
 	int shader = kernel_data.background.shader;
diff --git a/intern/cycles/kernel/kernel_film.h b/intern/cycles/kernel/kernel_film.h
index 370c550a515..b4118666491 100644
--- a/intern/cycles/kernel/kernel_film.h
+++ b/intern/cycles/kernel/kernel_film.h
@@ -16,7 +16,7 @@
 
 CCL_NAMESPACE_BEGIN
 
-__device float4 film_map(KernelGlobals *kg, float4 irradiance, float scale)
+ccl_device float4 film_map(KernelGlobals *kg, float4 irradiance, float scale)
 {
 	float exposure = kernel_data.film.exposure;
 	float4 result = irradiance*scale;
@@ -32,7 +32,7 @@ __device float4 film_map(KernelGlobals *kg, float4 irradiance, float scale)
 	return result;
 }
 
-__device uchar4 film_float_to_byte(float4 color)
+ccl_device uchar4 film_float_to_byte(float4 color)
 {
 	uchar4 result;
 
@@ -45,8 +45,8 @@ __device uchar4 film_float_to_byte(float4 color)
 	return result;
 }
 
-__device void kernel_film_convert_to_byte(KernelGlobals *kg,
-	__global uchar4 *rgba, __global float *buffer,
+ccl_device void kernel_film_convert_to_byte(KernelGlobals *kg,
+	ccl_global uchar4 *rgba, ccl_global float *buffer,
 	float sample_scale, int x, int y, int offset, int stride)
 {
 	/* buffer offset */
@@ -56,22 +56,22 @@ __device void kernel_film_convert_to_byte(KernelGlobals *kg,
 	buffer += index*kernel_data.film.pass_stride;
 
 	/* map colors */
-	float4 irradiance = *((__global float4*)buffer);
+	float4 irradiance = *((ccl_global float4*)buffer);
 	float4 float_result = film_map(kg, irradiance, sample_scale);
 	uchar4 byte_result = film_float_to_byte(float_result);
 
 	*rgba = byte_result;
 }
 
-__device void kernel_film_convert_to_half_float(KernelGlobals *kg,
-	__global uchar4 *rgba, __global float *buffer,
+ccl_device void kernel_film_convert_to_half_float(KernelGlobals *kg,
+	ccl_global uchar4 *rgba, ccl_global float *buffer,
 	float sample_scale, int x, int y, int offset, int stride)
 {
 	/* buffer offset */
 	int index = offset + x + y*stride;
 
-	__global float4 *in = (__global float4*)(buffer + index*kernel_data.film.pass_stride);
-	__global half *out = (__global half*)rgba + index*4;
+	ccl_global float4 *in = (ccl_global float4*)(buffer + index*kernel_data.film.pass_stride);
+	ccl_global half *out = (ccl_global half*)rgba + index*4;
 
 	float exposure = kernel_data.film.exposure;
 
diff --git a/intern/cycles/kernel/kernel_globals.h b/intern/cycles/kernel/kernel_globals.h
index b5e691eb615..e60bd6c0067 100644
--- a/intern/cycles/kernel/kernel_globals.h
+++ b/intern/cycles/kernel/kernel_globals.h
@@ -81,10 +81,10 @@ typedef struct KernelGlobals {} KernelGlobals;
 #ifdef __KERNEL_OPENCL__
 
 typedef struct KernelGlobals {
-	__constant KernelData *data;
+	ccl_constant KernelData *data;
 
 #define KERNEL_TEX(type, ttype, name) \
-	__global type *name;
+	ccl_global type *name;
 #include "kernel_textures.h"
 } KernelGlobals;
 
@@ -92,7 +92,7 @@ typedef struct KernelGlobals {
 
 /* Interpolated lookup table access */
 
-__device float lookup_table_read(KernelGlobals *kg, float x, int offset, int size)
+ccl_device float lookup_table_read(KernelGlobals *kg, float x, int offset, int size)
 {
 	x = clamp(x, 0.0f, 1.0f)*(size-1);
 
@@ -108,7 +108,7 @@ __device float lookup_table_read(KernelGlobals *kg, float x, int offset, int siz
 	return (1.0f - t)*data0 + t*data1;
 }
 
-__device float lookup_table_read_2D(KernelGlobals *kg, float x, float y, int offset, int xsize, int ysize)
+ccl_device float lookup_table_read_2D(KernelGlobals *kg, float x, float y, int offset, int xsize, int ysize)
 {
 	y = clamp(y, 0.0f, 1.0f)*(ysize-1);
 
diff --git a/intern/cycles/kernel/kernel_jitter.h b/intern/cycles/kernel/kernel_jitter.h
index 18666b51c0c..7a850844bf2 100644
--- a/intern/cycles/kernel/kernel_jitter.h
+++ b/intern/cycles/kernel/kernel_jitter.h
@@ -22,18 +22,18 @@ CCL_NAMESPACE_BEGIN
 /* todo: find good value, suggested 64 gives pattern on cornell box ceiling */
 #define CMJ_RANDOM_OFFSET_LIMIT 4096
 
-__device_inline bool cmj_is_pow2(int i)
+ccl_device_inline bool cmj_is_pow2(int i)
 {
 	return (i & (i - 1)) == 0;
 }
 
-__device_inline int cmj_fast_mod_pow2(int a, int b)
+ccl_device_inline int cmj_fast_mod_pow2(int a, int b)
 {
 	return (a & (b - 1));
 }
 
 /* a must be > 0 and b must be > 1 */
-__device_inline int cmj_fast_div_pow2(int a, int b)
+ccl_device_inline int cmj_fast_div_pow2(int a, int b)
 {
 #if defined(__KERNEL_SSE2__) && !defined(_MSC_VER)
 	return a >> __builtin_ctz(b);
@@ -42,7 +42,7 @@ __device_inline int cmj_fast_div_pow2(int a, int b)
 #endif
 }
 
-__device_inline uint cmj_w_mask(uint w)
+ccl_device_inline uint cmj_w_mask(uint w)
 {
 #if defined(__KERNEL_SSE2__) && !defined(_MSC_VER)
 	return ((1 << (32 - __builtin_clz(w))) - 1);
@@ -57,7 +57,7 @@ __device_inline uint cmj_w_mask(uint w)
 #endif
 }
 
-__device_inline uint cmj_permute(uint i, uint l, uint p)
+ccl_device_inline uint cmj_permute(uint i, uint l, uint p)
 {
 	uint w = l - 1;
 
@@ -113,7 +113,7 @@ __device_inline uint cmj_permute(uint i, uint l, uint p)
 	}
 }
 
-__device_inline uint cmj_hash(uint i, uint p)
+ccl_device_inline uint cmj_hash(uint i, uint p)
 {
 	i ^= p;
 	i ^= i >> 17;
@@ -129,13 +129,13 @@ __device_inline uint cmj_hash(uint i, uint p)
 	return i;
 }
 
-__device_inline float cmj_randfloat(uint i, uint p)
+ccl_device_inline float cmj_randfloat(uint i, uint p)
 {
 	return cmj_hash(i, p) * (1.0f / 4294967808.0f);
 }
 
 #ifdef __CMJ__
-__device float cmj_sample_1D(int s, int N, int p)
+ccl_device float cmj_sample_1D(int s, int N, int p)
 {
 	kernel_assert(s < N);
 
@@ -146,7 +146,7 @@ __device float cmj_sample_1D(int s, int N, int p)
 	return (x + jx)*invN;
 }
 
-__device void cmj_sample_2D(int s, int N, int p, float *fx, float *fy)
+ccl_device void cmj_sample_2D(int s, int N, int p, float *fx, float *fy)
 {
 	kernel_assert(s < N);
 
diff --git a/intern/cycles/kernel/kernel_light.h b/intern/cycles/kernel/kernel_light.h
index 6b00bd2ab01..9915cd2495f 100644
--- a/intern/cycles/kernel/kernel_light.h
+++ b/intern/cycles/kernel/kernel_light.h
@@ -36,7 +36,7 @@ typedef struct LightSample {
 
 #ifdef __BACKGROUND_MIS__
 
-__device float3 background_light_sample(KernelGlobals *kg, float randu, float randv, float *pdf)
+ccl_device float3 background_light_sample(KernelGlobals *kg, float randu, float randv, float *pdf)
 {
 	/* for the following, the CDF values are actually a pair of floats, with the
 	 * function value as X and the actual CDF as Y.  The last entry's function
@@ -112,7 +112,7 @@ __device float3 background_light_sample(KernelGlobals *kg, float randu, float ra
 	return -equirectangular_to_direction(u, v);
 }
 
-__device float background_light_pdf(KernelGlobals *kg, float3 direction)
+ccl_device float background_light_pdf(KernelGlobals *kg, float3 direction)
 {
 	float2 uv = direction_to_equirectangular(direction);
 	int res = kernel_data.integrator.pdf_background_res;
@@ -146,7 +146,7 @@ __device float background_light_pdf(KernelGlobals *kg, float3 direction)
 
 /* Regular Light */
 
-__device float3 disk_light_sample(float3 v, float randu, float randv)
+ccl_device float3 disk_light_sample(float3 v, float randu, float randv)
 {
 	float3 ru, rv;
 
@@ -156,17 +156,17 @@ __device float3 disk_light_sample(float3 v, float randu, float randv)
 	return ru*randu + rv*randv;
 }
 
-__device float3 distant_light_sample(float3 D, float radius, float randu, float randv)
+ccl_device float3 distant_light_sample(float3 D, float radius, float randu, float randv)
 {
 	return normalize(D + disk_light_sample(D, randu, randv)*radius);
 }
 
-__device float3 sphere_light_sample(float3 P, float3 center, float radius, float randu, float randv)
+ccl_device float3 sphere_light_sample(float3 P, float3 center, float radius, float randu, float randv)
 {
 	return disk_light_sample(normalize(P - center), randu, randv)*radius;
 }
 
-__device float3 area_light_sample(float3 axisu, float3 axisv, float randu, float randv)
+ccl_device float3 area_light_sample(float3 axisu, float3 axisv, float randu, float randv)
 {
 	randu = randu - 0.5f;
 	randv = randv - 0.5f;
@@ -174,7 +174,7 @@ __device float3 area_light_sample(float3 axisu, float3 axisv, float randu, float
 	return axisu*randu + axisv*randv;
 }
 
-__device float spot_light_attenuation(float4 data1, float4 data2, LightSample *ls)
+ccl_device float spot_light_attenuation(float4 data1, float4 data2, LightSample *ls)
 {
 	float3 dir = make_float3(data2.y, data2.z, data2.w);
 	float3 I = ls->Ng;
@@ -197,7 +197,7 @@ __device float spot_light_attenuation(float4 data1, float4 data2, LightSample *l
 	return attenuation;
 }
 
-__device float lamp_light_pdf(KernelGlobals *kg, const float3 Ng, const float3 I, float t)
+ccl_device float lamp_light_pdf(KernelGlobals *kg, const float3 Ng, const float3 I, float t)
 {
 	float cos_pi = dot(Ng, I);
 
@@ -207,7 +207,7 @@ __device float lamp_light_pdf(KernelGlobals *kg, const float3 Ng, const float3 I
 	return t*t/cos_pi;
 }
 
-__device void lamp_light_sample(KernelGlobals *kg, int lamp,
+ccl_device void lamp_light_sample(KernelGlobals *kg, int lamp,
 	float randu, float randv, float3 P, LightSample *ls)
 {
 	float4 data0 = kernel_tex_fetch(__light_data, lamp*LIGHT_SIZE + 0);
@@ -298,7 +298,7 @@ __device void lamp_light_sample(KernelGlobals *kg, int lamp,
 	}
 }
 
-__device bool lamp_light_eval(KernelGlobals *kg, int lamp, float3 P, float3 D, float t, LightSample *ls)
+ccl_device bool lamp_light_eval(KernelGlobals *kg, int lamp, float3 P, float3 D, float t, LightSample *ls)
 {
 	float4 data0 = kernel_tex_fetch(__light_data, lamp*LIGHT_SIZE + 0);
 	float4 data1 = kernel_tex_fetch(__light_data, lamp*LIGHT_SIZE + 1);
@@ -422,7 +422,7 @@ __device bool lamp_light_eval(KernelGlobals *kg, int lamp, float3 P, float3 D, f
 
 /* Triangle Light */
 
-__device void object_transform_light_sample(KernelGlobals *kg, LightSample *ls, int object, float time)
+ccl_device void object_transform_light_sample(KernelGlobals *kg, LightSample *ls, int object, float time)
 {
 #ifdef __INSTANCING__
 	/* instance transform */
@@ -440,7 +440,7 @@ __device void object_transform_light_sample(KernelGlobals *kg, LightSample *ls,
 #endif
 }
 
-__device void triangle_light_sample(KernelGlobals *kg, int prim, int object,
+ccl_device void triangle_light_sample(KernelGlobals *kg, int prim, int object,
 	float randu, float randv, float time, LightSample *ls)
 {
 	/* triangle, so get position, normal, shader */
@@ -457,7 +457,7 @@ __device void triangle_light_sample(KernelGlobals *kg, int prim, int object,
 	object_transform_light_sample(kg, ls, object, time);
 }
 
-__device float triangle_light_pdf(KernelGlobals *kg,
+ccl_device float triangle_light_pdf(KernelGlobals *kg,
 	const float3 Ng, const float3 I, float t)
 {
 	float pdf = kernel_data.integrator.pdf_triangles;
@@ -473,7 +473,7 @@ __device float triangle_light_pdf(KernelGlobals *kg,
 
 #ifdef __HAIR__
 
-__device void curve_segment_light_sample(KernelGlobals *kg, int prim, int object,
+ccl_device void curve_segment_light_sample(KernelGlobals *kg, int prim, int object,
 	int segment, float randu, float randv, float time, LightSample *ls)
 {
 	/* this strand code needs completion */
@@ -515,7 +515,7 @@ __device void curve_segment_light_sample(KernelGlobals *kg, int prim, int object
 
 /* Light Distribution */
 
-__device int light_distribution_sample(KernelGlobals *kg, float randt)
+ccl_device int light_distribution_sample(KernelGlobals *kg, float randt)
 {
 	/* this is basically std::upper_bound as used by pbrt, to find a point light or
 	 * triangle to emit from, proportional to area. a good improvement would be to
@@ -544,7 +544,7 @@ __device int light_distribution_sample(KernelGlobals *kg, float randt)
 
 /* Generic Light */
 
-__device void light_sample(KernelGlobals *kg, float randt, float randu, float randv, float time, float3 P, LightSample *ls)
+ccl_device void light_sample(KernelGlobals *kg, float randt, float randu, float randv, float time, float3 P, LightSample *ls)
 {
 	/* sample index */
 	int index = light_distribution_sample(kg, randt);
@@ -577,18 +577,18 @@ __device void light_sample(KernelGlobals *kg, float randt, float randu, float ra
 	}
 }
 
-__device int light_select_num_samples(KernelGlobals *kg, int index)
+ccl_device int light_select_num_samples(KernelGlobals *kg, int index)
 {
 	float4 data3 = kernel_tex_fetch(__light_data, index*LIGHT_SIZE + 3);
 	return __float_as_int(data3.x);
 }
 
-__device void light_select(KernelGlobals *kg, int index, float randu, float randv, float3 P, LightSample *ls)
+ccl_device void light_select(KernelGlobals *kg, int index, float randu, float randv, float3 P, LightSample *ls)
 {
 	lamp_light_sample(kg, index, randu, randv, P, ls);
 }
 
-__device int lamp_light_eval_sample(KernelGlobals *kg, float randt)
+ccl_device int lamp_light_eval_sample(KernelGlobals *kg, float randt)
 {
 	/* sample index */
 	int index = light_distribution_sample(kg, randt);
diff --git a/intern/cycles/kernel/kernel_montecarlo.h b/intern/cycles/kernel/kernel_montecarlo.h
index b3d53e00be7..92f3420a218 100644
--- a/intern/cycles/kernel/kernel_montecarlo.h
+++ b/intern/cycles/kernel/kernel_montecarlo.h
@@ -36,7 +36,7 @@
 CCL_NAMESPACE_BEGIN
 
 /* distribute uniform xy on [0,1] over unit disk [-1,1] */
-__device void to_unit_disk(float *x, float *y)
+ccl_device void to_unit_disk(float *x, float *y)
 {
 	float phi = M_2PI_F * (*x);
 	float r = sqrtf(*y);
@@ -47,14 +47,14 @@ __device void to_unit_disk(float *x, float *y)
 
 /* return an orthogonal tangent and bitangent given a normal and tangent that
  * may not be exactly orthogonal */
-__device void make_orthonormals_tangent(const float3 N, const float3 T, float3 *a, float3 *b)
+ccl_device void make_orthonormals_tangent(const float3 N, const float3 T, float3 *a, float3 *b)
 {
 	*b = normalize(cross(N, T));
 	*a = cross(*b, N);
 }
 
 /* sample direction with cosine weighted distributed in hemisphere */
-__device_inline void sample_cos_hemisphere(const float3 N,
+ccl_device_inline void sample_cos_hemisphere(const float3 N,
 	float randu, float randv, float3 *omega_in, float *pdf)
 {
 	to_unit_disk(&randu, &randv);
@@ -66,7 +66,7 @@ __device_inline void sample_cos_hemisphere(const float3 N,
 }
 
 /* sample direction uniformly distributed in hemisphere */
-__device_inline void sample_uniform_hemisphere(const float3 N,
+ccl_device_inline void sample_uniform_hemisphere(const float3 N,
                                                float randu, float randv,
                                                float3 *omega_in, float *pdf)
 {
@@ -83,7 +83,7 @@ __device_inline void sample_uniform_hemisphere(const float3 N,
 }
 
 /* sample direction uniformly distributed in cone */
-__device_inline void sample_uniform_cone(const float3 N, float angle,
+ccl_device_inline void sample_uniform_cone(const float3 N, float angle,
                                          float randu, float randv,
                                          float3 *omega_in, float *pdf)
 {
@@ -100,7 +100,7 @@ __device_inline void sample_uniform_cone(const float3 N, float angle,
 }
 
 /* sample uniform point on the surface of a sphere */
-__device float3 sample_uniform_sphere(float u1, float u2)
+ccl_device float3 sample_uniform_sphere(float u1, float u2)
 {
 	float z = 1.0f - 2.0f*u1;
 	float r = sqrtf(fmaxf(0.0f, 1.0f - z*z));
@@ -111,29 +111,29 @@ __device float3 sample_uniform_sphere(float u1, float u2)
 	return make_float3(x, y, z);
 }
 
-__device float balance_heuristic(float a, float b)
+ccl_device float balance_heuristic(float a, float b)
 {
 	return (a)/(a + b);
 }
 
-__device float balance_heuristic_3(float a, float b, float c)
+ccl_device float balance_heuristic_3(float a, float b, float c)
 {
 	return (a)/(a + b + c);
 }
 
-__device float power_heuristic(float a, float b)
+ccl_device float power_heuristic(float a, float b)
 {
 	return (a*a)/(a*a + b*b);
 }
 
-__device float power_heuristic_3(float a, float b, float c)
+ccl_device float power_heuristic_3(float a, float b, float c)
 {
 	return (a*a)/(a*a + b*b + c*c);
 }
 
 /* distribute uniform xy on [0,1] over unit disk [-1,1], with concentric mapping
  * to better preserve stratification for some RNG sequences */
-__device float2 concentric_sample_disk(float u1, float u2)
+ccl_device float2 concentric_sample_disk(float u1, float u2)
 {
 	float phi, r;
 	float a = 2.0f*u1 - 1.0f;
@@ -155,7 +155,7 @@ __device float2 concentric_sample_disk(float u1, float u2)
 }
 
 /* sample point in unit polygon with given number of corners and rotation */
-__device float2 regular_polygon_sample(float corners, float rotation, float u, float v)
+ccl_device float2 regular_polygon_sample(float corners, float rotation, float u, float v)
 {
 	/* sample corner number and reuse u */
 	float corner = floorf(u*corners);
diff --git a/intern/cycles/kernel/kernel_object.h b/intern/cycles/kernel/kernel_object.h
index d0aae119476..a66277e10cd 100644
--- a/intern/cycles/kernel/kernel_object.h
+++ b/intern/cycles/kernel/kernel_object.h
@@ -30,7 +30,7 @@ enum ObjectVectorTransform {
 	OBJECT_VECTOR_MOTION_POST = 3
 };
 
-__device_inline Transform object_fetch_transform(KernelGlobals *kg, int object, enum ObjectTransform type)
+ccl_device_inline Transform object_fetch_transform(KernelGlobals *kg, int object, enum ObjectTransform type)
 {
 	int offset = object*OBJECT_SIZE + (int)type;
 
@@ -43,7 +43,7 @@ __device_inline Transform object_fetch_transform(KernelGlobals *kg, int object,
 	return tfm;
 }
 
-__device_inline Transform object_fetch_vector_transform(KernelGlobals *kg, int object, enum ObjectVectorTransform type)
+ccl_device_inline Transform object_fetch_vector_transform(KernelGlobals *kg, int object, enum ObjectVectorTransform type)
 {
 	int offset = object*OBJECT_VECTOR_SIZE + (int)type;
 
@@ -57,7 +57,7 @@ __device_inline Transform object_fetch_vector_transform(KernelGlobals *kg, int o
 }
 
 #ifdef __OBJECT_MOTION__
-__device_inline Transform object_fetch_transform_motion(KernelGlobals *kg, int object, float time)
+ccl_device_inline Transform object_fetch_transform_motion(KernelGlobals *kg, int object, float time)
 {
 	DecompMotionTransform motion;
 
@@ -79,7 +79,7 @@ __device_inline Transform object_fetch_transform_motion(KernelGlobals *kg, int o
 	return tfm;
 }
 
-__device_inline Transform object_fetch_transform_motion_test(KernelGlobals *kg, int object, float time, Transform *itfm)
+ccl_device_inline Transform object_fetch_transform_motion_test(KernelGlobals *kg, int object, float time, Transform *itfm)
 {
 	int object_flag = kernel_tex_fetch(__object_flag, object);
 
@@ -102,7 +102,7 @@ __device_inline Transform object_fetch_transform_motion_test(KernelGlobals *kg,
 }
 #endif
 
-__device_inline void object_position_transform(KernelGlobals *kg, ShaderData *sd, float3 *P)
+ccl_device_inline void object_position_transform(KernelGlobals *kg, ShaderData *sd, float3 *P)
 {
 #ifdef __OBJECT_MOTION__
 	*P = transform_point(&sd->ob_tfm, *P);
@@ -112,7 +112,7 @@ __device_inline void object_position_transform(KernelGlobals *kg, ShaderData *sd
 #endif
 }
 
-__device_inline void object_inverse_position_transform(KernelGlobals *kg, ShaderData *sd, float3 *P)
+ccl_device_inline void object_inverse_position_transform(KernelGlobals *kg, ShaderData *sd, float3 *P)
 {
 #ifdef __OBJECT_MOTION__
 	*P = transform_point(&sd->ob_itfm, *P);
@@ -122,7 +122,7 @@ __device_inline void object_inverse_position_transform(KernelGlobals *kg, Shader
 #endif
 }
 
-__device_inline void object_inverse_normal_transform(KernelGlobals *kg, ShaderData *sd, float3 *N)
+ccl_device_inline void object_inverse_normal_transform(KernelGlobals *kg, ShaderData *sd, float3 *N)
 {
 #ifdef __OBJECT_MOTION__
 	*N = normalize(transform_direction_transposed(&sd->ob_tfm, *N));
@@ -132,7 +132,7 @@ __device_inline void object_inverse_normal_transform(KernelGlobals *kg, ShaderDa
 #endif
 }
 
-__device_inline void object_normal_transform(KernelGlobals *kg, ShaderData *sd, float3 *N)
+ccl_device_inline void object_normal_transform(KernelGlobals *kg, ShaderData *sd, float3 *N)
 {
 #ifdef __OBJECT_MOTION__
 	*N = normalize(transform_direction_transposed(&sd->ob_itfm, *N));
@@ -142,7 +142,7 @@ __device_inline void object_normal_transform(KernelGlobals *kg, ShaderData *sd,
 #endif
 }
 
-__device_inline void object_dir_transform(KernelGlobals *kg, ShaderData *sd, float3 *D)
+ccl_device_inline void object_dir_transform(KernelGlobals *kg, ShaderData *sd, float3 *D)
 {
 #ifdef __OBJECT_MOTION__
 	*D = transform_direction(&sd->ob_tfm, *D);
@@ -152,7 +152,7 @@ __device_inline void object_dir_transform(KernelGlobals *kg, ShaderData *sd, flo
 #endif
 }
 
-__device_inline void object_inverse_dir_transform(KernelGlobals *kg, ShaderData *sd, float3 *D)
+ccl_device_inline void object_inverse_dir_transform(KernelGlobals *kg, ShaderData *sd, float3 *D)
 {
 #ifdef __OBJECT_MOTION__
 	*D = transform_direction(&sd->ob_itfm, *D);
@@ -162,7 +162,7 @@ __device_inline void object_inverse_dir_transform(KernelGlobals *kg, ShaderData
 #endif
 }
 
-__device_inline float3 object_location(KernelGlobals *kg, ShaderData *sd)
+ccl_device_inline float3 object_location(KernelGlobals *kg, ShaderData *sd)
 {
 	if(sd->object == ~0)
 		return make_float3(0.0f, 0.0f, 0.0f);
@@ -175,14 +175,14 @@ __device_inline float3 object_location(KernelGlobals *kg, ShaderData *sd)
 #endif
 }
 
-__device_inline float object_surface_area(KernelGlobals *kg, int object)
+ccl_device_inline float object_surface_area(KernelGlobals *kg, int object)
 {
 	int offset = object*OBJECT_SIZE + OBJECT_PROPERTIES;
 	float4 f = kernel_tex_fetch(__objects, offset);
 	return f.x;
 }
 
-__device_inline float object_pass_id(KernelGlobals *kg, int object)
+ccl_device_inline float object_pass_id(KernelGlobals *kg, int object)
 {
 	if(object == ~0)
 		return 0.0f;
@@ -192,7 +192,7 @@ __device_inline float object_pass_id(KernelGlobals *kg, int object)
 	return f.y;
 }
 
-__device_inline float object_random_number(KernelGlobals *kg, int object)
+ccl_device_inline float object_random_number(KernelGlobals *kg, int object)
 {
 	if(object == ~0)
 		return 0.0f;
@@ -202,7 +202,7 @@ __device_inline float object_random_number(KernelGlobals *kg, int object)
 	return f.z;
 }
 
-__device_inline uint object_particle_id(KernelGlobals *kg, int object)
+ccl_device_inline uint object_particle_id(KernelGlobals *kg, int object)
 {
 	if(object == ~0)
 		return 0.0f;
@@ -212,7 +212,7 @@ __device_inline uint object_particle_id(KernelGlobals *kg, int object)
 	return __float_as_uint(f.w);
 }
 
-__device_inline float3 object_dupli_generated(KernelGlobals *kg, int object)
+ccl_device_inline float3 object_dupli_generated(KernelGlobals *kg, int object)
 {
 	if(object == ~0)
 		return make_float3(0.0f, 0.0f, 0.0f);
@@ -222,7 +222,7 @@ __device_inline float3 object_dupli_generated(KernelGlobals *kg, int object)
 	return make_float3(f.x, f.y, f.z);
 }
 
-__device_inline float3 object_dupli_uv(KernelGlobals *kg, int object)
+ccl_device_inline float3 object_dupli_uv(KernelGlobals *kg, int object)
 {
 	if(object == ~0)
 		return make_float3(0.0f, 0.0f, 0.0f);
@@ -233,54 +233,54 @@ __device_inline float3 object_dupli_uv(KernelGlobals *kg, int object)
 }
 
 
-__device int shader_pass_id(KernelGlobals *kg, ShaderData *sd)
+ccl_device int shader_pass_id(KernelGlobals *kg, ShaderData *sd)
 {
 	return kernel_tex_fetch(__shader_flag, (sd->shader & SHADER_MASK)*2 + 1);
 }
 
-__device_inline float particle_index(KernelGlobals *kg, int particle)
+ccl_device_inline float particle_index(KernelGlobals *kg, int particle)
 {
 	int offset = particle*PARTICLE_SIZE;
 	float4 f = kernel_tex_fetch(__particles, offset + 0);
 	return f.x;
 }
 
-__device float particle_age(KernelGlobals *kg, int particle)
+ccl_device float particle_age(KernelGlobals *kg, int particle)
 {
 	int offset = particle*PARTICLE_SIZE;
 	float4 f = kernel_tex_fetch(__particles, offset + 0);
 	return f.y;
 }
 
-__device float particle_lifetime(KernelGlobals *kg, int particle)
+ccl_device float particle_lifetime(KernelGlobals *kg, int particle)
 {
 	int offset = particle*PARTICLE_SIZE;
 	float4 f = kernel_tex_fetch(__particles, offset + 0);
 	return f.z;
 }
 
-__device float particle_size(KernelGlobals *kg, int particle)
+ccl_device float particle_size(KernelGlobals *kg, int particle)
 {
 	int offset = particle*PARTICLE_SIZE;
 	float4 f = kernel_tex_fetch(__particles, offset + 0);
 	return f.w;
 }
 
-__device float4 particle_rotation(KernelGlobals *kg, int particle)
+ccl_device float4 particle_rotation(KernelGlobals *kg, int particle)
 {
 	int offset = particle*PARTICLE_SIZE;
 	float4 f = kernel_tex_fetch(__particles, offset + 1);
 	return f;
 }
 
-__device float3 particle_location(KernelGlobals *kg, int particle)
+ccl_device float3 particle_location(KernelGlobals *kg, int particle)
 {
 	int offset = particle*PARTICLE_SIZE;
 	float4 f = kernel_tex_fetch(__particles, offset + 2);
 	return make_float3(f.x, f.y, f.z);
 }
 
-__device float3 particle_velocity(KernelGlobals *kg, int particle)
+ccl_device float3 particle_velocity(KernelGlobals *kg, int particle)
 {
 	int offset = particle*PARTICLE_SIZE;
 	float4 f2 = kernel_tex_fetch(__particles, offset + 2);
@@ -288,7 +288,7 @@ __device float3 particle_velocity(KernelGlobals *kg, int particle)
 	return make_float3(f2.w, f3.x, f3.y);
 }
 
-__device float3 particle_angular_velocity(KernelGlobals *kg, int particle)
+ccl_device float3 particle_angular_velocity(KernelGlobals *kg, int particle)
 {
 	int offset = particle*PARTICLE_SIZE;
 	float4 f3 = kernel_tex_fetch(__particles, offset + 3);
diff --git a/intern/cycles/kernel/kernel_passes.h b/intern/cycles/kernel/kernel_passes.h
index 5e91b13f90c..512db9ec392 100644
--- a/intern/cycles/kernel/kernel_passes.h
+++ b/intern/cycles/kernel/kernel_passes.h
@@ -16,25 +16,25 @@
 
 CCL_NAMESPACE_BEGIN
 
-__device_inline void kernel_write_pass_float(__global float *buffer, int sample, float value)
+ccl_device_inline void kernel_write_pass_float(ccl_global float *buffer, int sample, float value)
 {
-	__global float *buf = buffer;
+	ccl_global float *buf = buffer;
 	*buf = (sample == 0)? value: *buf + value;
 }
 
-__device_inline void kernel_write_pass_float3(__global float *buffer, int sample, float3 value)
+ccl_device_inline void kernel_write_pass_float3(ccl_global float *buffer, int sample, float3 value)
 {
-	__global float3 *buf = (__global float3*)buffer;
+	ccl_global float3 *buf = (ccl_global float3*)buffer;
 	*buf = (sample == 0)? value: *buf + value;
 }
 
-__device_inline void kernel_write_pass_float4(__global float *buffer, int sample, float4 value)
+ccl_device_inline void kernel_write_pass_float4(ccl_global float *buffer, int sample, float4 value)
 {
-	__global float4 *buf = (__global float4*)buffer;
+	ccl_global float4 *buf = (ccl_global float4*)buffer;
 	*buf = (sample == 0)? value: *buf + value;
 }
 
-__device_inline void kernel_write_data_passes(KernelGlobals *kg, __global float *buffer, PathRadiance *L,
+ccl_device_inline void kernel_write_data_passes(KernelGlobals *kg, ccl_global float *buffer, PathRadiance *L,
 	ShaderData *sd, int sample, int path_flag, float3 throughput)
 {
 #ifdef __PASSES__
@@ -114,7 +114,7 @@ __device_inline void kernel_write_data_passes(KernelGlobals *kg, __global float
 #endif
 }
 
-__device_inline void kernel_write_light_passes(KernelGlobals *kg, __global float *buffer, PathRadiance *L, int sample)
+ccl_device_inline void kernel_write_light_passes(KernelGlobals *kg, ccl_global float *buffer, PathRadiance *L, int sample)
 {
 #ifdef __PASSES__
 	int flag = kernel_data.film.pass_flag;
diff --git a/intern/cycles/kernel/kernel_path.h b/intern/cycles/kernel/kernel_path.h
index 5354738d378..4f3957b66ef 100644
--- a/intern/cycles/kernel/kernel_path.h
+++ b/intern/cycles/kernel/kernel_path.h
@@ -42,7 +42,7 @@
 
 CCL_NAMESPACE_BEGIN
 
-__device_inline bool shadow_blocked(KernelGlobals *kg, PathState *state, Ray *ray, float3 *shadow)
+ccl_device_inline bool shadow_blocked(KernelGlobals *kg, PathState *state, Ray *ray, float3 *shadow)
 {
 	*shadow = make_float3(1.0f, 1.0f, 1.0f);
 
@@ -122,7 +122,7 @@ __device_inline bool shadow_blocked(KernelGlobals *kg, PathState *state, Ray *ra
 
 #if defined(__BRANCHED_PATH__) || defined(__SUBSURFACE__)
 
-__device void kernel_path_indirect(KernelGlobals *kg, RNG *rng, int sample, Ray ray, __global float *buffer,
+ccl_device void kernel_path_indirect(KernelGlobals *kg, RNG *rng, int sample, Ray ray, ccl_global float *buffer,
 	float3 throughput, int num_samples, int num_total_samples,
 	float min_ray_pdf, float ray_pdf, PathState state, int rng_offset, PathRadiance *L)
 {
@@ -359,7 +359,7 @@ __device void kernel_path_indirect(KernelGlobals *kg, RNG *rng, int sample, Ray
 
 #ifdef __SUBSURFACE__
 
-__device_inline bool kernel_path_integrate_lighting(KernelGlobals *kg, RNG *rng,
+ccl_device_inline bool kernel_path_integrate_lighting(KernelGlobals *kg, RNG *rng,
 	int sample, int num_samples,
 	ShaderData *sd, float3 *throughput,
 	float *min_ray_pdf, float *ray_pdf, PathState *state,
@@ -452,7 +452,7 @@ __device_inline bool kernel_path_integrate_lighting(KernelGlobals *kg, RNG *rng,
 
 #endif
 
-__device float4 kernel_path_integrate(KernelGlobals *kg, RNG *rng, int sample, Ray ray, __global float *buffer)
+ccl_device float4 kernel_path_integrate(KernelGlobals *kg, RNG *rng, int sample, Ray ray, ccl_global float *buffer)
 {
 	/* initialize */
 	PathRadiance L;
@@ -790,11 +790,11 @@ __device float4 kernel_path_integrate(KernelGlobals *kg, RNG *rng, int sample, R
 
 #ifdef __BRANCHED_PATH__
 
-__device_noinline void kernel_branched_path_integrate_lighting(KernelGlobals *kg, RNG *rng,
+ccl_device_noinline void kernel_branched_path_integrate_lighting(KernelGlobals *kg, RNG *rng,
 	int sample, int aa_samples,
 	ShaderData *sd, float3 throughput, float num_samples_adjust,
 	float min_ray_pdf, float ray_pdf, PathState state,
-	int rng_offset, PathRadiance *L, __global float *buffer)
+	int rng_offset, PathRadiance *L, ccl_global float *buffer)
 {
 #ifdef __EMISSION__
 	/* sample illumination from lights to find path contribution */
@@ -941,7 +941,7 @@ __device_noinline void kernel_branched_path_integrate_lighting(KernelGlobals *kg
 	}
 }
 
-__device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, int sample, Ray ray, __global float *buffer)
+ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, int sample, Ray ray, ccl_global float *buffer)
 {
 	/* initialize */
 	PathRadiance L;
@@ -1166,7 +1166,7 @@ __device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, int
 
 #endif
 
-__device_inline void kernel_path_trace_setup(KernelGlobals *kg, __global uint *rng_state, int sample, int x, int y, RNG *rng, Ray *ray)
+ccl_device_inline void kernel_path_trace_setup(KernelGlobals *kg, ccl_global uint *rng_state, int sample, int x, int y, RNG *rng, Ray *ray)
 {
 	float filter_u;
 	float filter_v;
@@ -1195,8 +1195,8 @@ __device_inline void kernel_path_trace_setup(KernelGlobals *kg, __global uint *r
 	camera_sample(kg, x, y, filter_u, filter_v, lens_u, lens_v, time, ray);
 }
 
-__device void kernel_path_trace(KernelGlobals *kg,
-	__global float *buffer, __global uint *rng_state,
+ccl_device void kernel_path_trace(KernelGlobals *kg,
+	ccl_global float *buffer, ccl_global uint *rng_state,
 	int sample, int x, int y, int offset, int stride)
 {
 	/* buffer offset */
@@ -1227,8 +1227,8 @@ __device void kernel_path_trace(KernelGlobals *kg,
 }
 
 #ifdef __BRANCHED_PATH__
-__device void kernel_branched_path_trace(KernelGlobals *kg,
-	__global float *buffer, __global uint *rng_state,
+ccl_device void kernel_branched_path_trace(KernelGlobals *kg,
+	ccl_global float *buffer, ccl_global uint *rng_state,
 	int sample, int x, int y, int offset, int stride)
 {
 	/* buffer offset */
diff --git a/intern/cycles/kernel/kernel_path_state.h b/intern/cycles/kernel/kernel_path_state.h
index e0e0f43fd26..0ded332b3b9 100644
--- a/intern/cycles/kernel/kernel_path_state.h
+++ b/intern/cycles/kernel/kernel_path_state.h
@@ -26,7 +26,7 @@ typedef struct PathState {
 	int transparent_bounce;
 } PathState;
 
-__device_inline void path_state_init(PathState *state)
+ccl_device_inline void path_state_init(PathState *state)
 {
 	state->flag = PATH_RAY_CAMERA|PATH_RAY_SINGULAR|PATH_RAY_MIS_SKIP;
 	state->bounce = 0;
@@ -36,7 +36,7 @@ __device_inline void path_state_init(PathState *state)
 	state->transparent_bounce = 0;
 }
 
-__device_inline void path_state_next(KernelGlobals *kg, PathState *state, int label)
+ccl_device_inline void path_state_next(KernelGlobals *kg, PathState *state, int label)
 {
 	/* ray through transparent keeps same flags from previous ray and is
 	 * not counted as a regular bounce, transparent has separate max */
@@ -88,7 +88,7 @@ __device_inline void path_state_next(KernelGlobals *kg, PathState *state, int la
 	}
 }
 
-__device_inline uint path_state_ray_visibility(KernelGlobals *kg, PathState *state)
+ccl_device_inline uint path_state_ray_visibility(KernelGlobals *kg, PathState *state)
 {
 	uint flag = state->flag & PATH_RAY_ALL_VISIBILITY;
 
@@ -102,7 +102,7 @@ __device_inline uint path_state_ray_visibility(KernelGlobals *kg, PathState *sta
 	return flag;
 }
 
-__device_inline float path_state_terminate_probability(KernelGlobals *kg, PathState *state, const float3 throughput)
+ccl_device_inline float path_state_terminate_probability(KernelGlobals *kg, PathState *state, const float3 throughput)
 {
 	if(state->flag & PATH_RAY_TRANSPARENT) {
 		/* transparent rays treated separately */
diff --git a/intern/cycles/kernel/kernel_primitive.h b/intern/cycles/kernel/kernel_primitive.h
index 636cfd06532..ababad28f35 100644
--- a/intern/cycles/kernel/kernel_primitive.h
+++ b/intern/cycles/kernel/kernel_primitive.h
@@ -21,7 +21,7 @@ CCL_NAMESPACE_BEGIN
 
 /* attribute lookup */
 
-__device_inline int find_attribute(KernelGlobals *kg, ShaderData *sd, uint id, AttributeElement *elem)
+ccl_device_inline int find_attribute(KernelGlobals *kg, ShaderData *sd, uint id, AttributeElement *elem)
 {
 	if(sd->object == ~0)
 		return (int)ATTR_STD_NOT_FOUND;
@@ -52,7 +52,7 @@ __device_inline int find_attribute(KernelGlobals *kg, ShaderData *sd, uint id, A
 	}
 }
 
-__device float primitive_attribute_float(KernelGlobals *kg, const ShaderData *sd, AttributeElement elem, int offset, float *dx, float *dy)
+ccl_device float primitive_attribute_float(KernelGlobals *kg, const ShaderData *sd, AttributeElement elem, int offset, float *dx, float *dy)
 {
 #ifdef __HAIR__
 	if(sd->segment == ~0)
@@ -64,7 +64,7 @@ __device float primitive_attribute_float(KernelGlobals *kg, const ShaderData *sd
 #endif
 }
 
-__device float3 primitive_attribute_float3(KernelGlobals *kg, const ShaderData *sd, AttributeElement elem, int offset, float3 *dx, float3 *dy)
+ccl_device float3 primitive_attribute_float3(KernelGlobals *kg, const ShaderData *sd, AttributeElement elem, int offset, float3 *dx, float3 *dy)
 {
 #ifdef __HAIR__
 	if(sd->segment == ~0)
@@ -76,7 +76,7 @@ __device float3 primitive_attribute_float3(KernelGlobals *kg, const ShaderData *
 #endif
 }
 
-__device float3 primitive_uv(KernelGlobals *kg, ShaderData *sd)
+ccl_device float3 primitive_uv(KernelGlobals *kg, ShaderData *sd)
 {
 	AttributeElement elem_uv;
 	int offset_uv = find_attribute(kg, sd, ATTR_STD_UV, &elem_uv);
@@ -89,7 +89,7 @@ __device float3 primitive_uv(KernelGlobals *kg, ShaderData *sd)
 	return uv;
 }
 
-__device float3 primitive_tangent(KernelGlobals *kg, ShaderData *sd)
+ccl_device float3 primitive_tangent(KernelGlobals *kg, ShaderData *sd)
 {
 #ifdef __HAIR__
 	if(sd->segment != ~0)
@@ -122,7 +122,7 @@ __device float3 primitive_tangent(KernelGlobals *kg, ShaderData *sd)
 
 /* motion */
 
-__device float4 primitive_motion_vector(KernelGlobals *kg, ShaderData *sd)
+ccl_device float4 primitive_motion_vector(KernelGlobals *kg, ShaderData *sd)
 {
 	float3 motion_pre = sd->P, motion_post = sd->P;
 
diff --git a/intern/cycles/kernel/kernel_projection.h b/intern/cycles/kernel/kernel_projection.h
index d9520de7956..e2108604bc8 100644
--- a/intern/cycles/kernel/kernel_projection.h
+++ b/intern/cycles/kernel/kernel_projection.h
@@ -37,7 +37,7 @@ CCL_NAMESPACE_BEGIN
 
 /* Spherical coordinates <-> Cartesian direction  */
 
-__device float2 direction_to_spherical(float3 dir)
+ccl_device float2 direction_to_spherical(float3 dir)
 {
 	float theta = acosf(dir.z);
 	float phi = atan2f(dir.x, dir.y);
@@ -45,7 +45,7 @@ __device float2 direction_to_spherical(float3 dir)
 	return make_float2(theta, phi);
 }
 
-__device float3 spherical_to_direction(float theta, float phi)
+ccl_device float3 spherical_to_direction(float theta, float phi)
 {
 	return make_float3(
 		sinf(theta)*cosf(phi),
@@ -55,7 +55,7 @@ __device float3 spherical_to_direction(float theta, float phi)
 
 /* Equirectangular coordinates <-> Cartesian direction */
 
-__device float2 direction_to_equirectangular(float3 dir)
+ccl_device float2 direction_to_equirectangular(float3 dir)
 {
 	float u = -atan2f(dir.y, dir.x)/(M_2PI_F) + 0.5f;
 	float v = atan2f(dir.z, hypotf(dir.x, dir.y))/M_PI_F + 0.5f;
@@ -63,7 +63,7 @@ __device float2 direction_to_equirectangular(float3 dir)
 	return make_float2(u, v);
 }
 
-__device float3 equirectangular_to_direction(float u, float v)
+ccl_device float3 equirectangular_to_direction(float u, float v)
 {
 	float phi = M_PI_F*(1.0f - 2.0f*u);
 	float theta = M_PI_F*(1.0f - v);
@@ -76,7 +76,7 @@ __device float3 equirectangular_to_direction(float u, float v)
 
 /* Fisheye <-> Cartesian direction */
 
-__device float2 direction_to_fisheye(float3 dir, float fov)
+ccl_device float2 direction_to_fisheye(float3 dir, float fov)
 {
 	float r = atan2f(sqrtf(dir.y*dir.y +  dir.z*dir.z), dir.x) / fov;
 	float phi = atan2f(dir.z, dir.y);
@@ -87,7 +87,7 @@ __device float2 direction_to_fisheye(float3 dir, float fov)
 	return make_float2(u, v);
 }
 
-__device float3 fisheye_to_direction(float u, float v, float fov)
+ccl_device float3 fisheye_to_direction(float u, float v, float fov)
 {
 	u = (u - 0.5f) * 2.0f;
 	v = (v - 0.5f) * 2.0f;
@@ -109,7 +109,7 @@ __device float3 fisheye_to_direction(float u, float v, float fov)
 	);
 }
 
-__device float2 direction_to_fisheye_equisolid(float3 dir, float lens, float width, float height)
+ccl_device float2 direction_to_fisheye_equisolid(float3 dir, float lens, float width, float height)
 {
 	float theta = acosf(dir.x);
 	float r = 2.0f * lens * sinf(theta * 0.5f);
@@ -121,7 +121,7 @@ __device float2 direction_to_fisheye_equisolid(float3 dir, float lens, float wid
 	return make_float2(u, v);
 }
 
-__device float3 fisheye_equisolid_to_direction(float u, float v, float lens, float fov, float width, float height)
+ccl_device float3 fisheye_equisolid_to_direction(float u, float v, float lens, float fov, float width, float height)
 {
 	u = (u - 0.5f) * width;
 	v = (v - 0.5f) * height;
@@ -146,7 +146,7 @@ __device float3 fisheye_equisolid_to_direction(float u, float v, float lens, flo
 
 /* Mirror Ball <-> Cartesion direction */
 
-__device float3 mirrorball_to_direction(float u, float v)
+ccl_device float3 mirrorball_to_direction(float u, float v)
 {
 	/* point on sphere */
 	float3 dir;
@@ -161,7 +161,7 @@ __device float3 mirrorball_to_direction(float u, float v)
 	return 2.0f*dot(dir, I)*dir - I;
 }
 
-__device float2 direction_to_mirrorball(float3 dir)
+ccl_device float2 direction_to_mirrorball(float3 dir)
 {
 	/* inverse of mirrorball_to_direction */
 	dir.y -= 1.0f;
@@ -176,7 +176,7 @@ __device float2 direction_to_mirrorball(float3 dir)
 	return make_float2(u, v);
 }
 
-__device float3 panorama_to_direction(KernelGlobals *kg, float u, float v)
+ccl_device float3 panorama_to_direction(KernelGlobals *kg, float u, float v)
 {
 	switch(kernel_data.cam.panorama_type) {
 		case PANORAMA_EQUIRECTANGULAR:
@@ -190,7 +190,7 @@ __device float3 panorama_to_direction(KernelGlobals *kg, float u, float v)
 	}
 }
 
-__device float2 direction_to_panorama(KernelGlobals *kg, float3 dir)
+ccl_device float2 direction_to_panorama(KernelGlobals *kg, float3 dir)
 {
 	switch(kernel_data.cam.panorama_type) {
 		case PANORAMA_EQUIRECTANGULAR:
diff --git a/intern/cycles/kernel/kernel_random.h b/intern/cycles/kernel/kernel_random.h
index dc977a8780f..69e7b439e1c 100644
--- a/intern/cycles/kernel/kernel_random.h
+++ b/intern/cycles/kernel/kernel_random.h
@@ -30,7 +30,7 @@ typedef uint RNG;
 /* High Dimensional Sobol */
 
 /* van der corput radical inverse */
-__device uint van_der_corput(uint bits)
+ccl_device uint van_der_corput(uint bits)
 {
 	bits = (bits << 16) | (bits >> 16);
 	bits = ((bits & 0x00ff00ff) << 8) | ((bits & 0xff00ff00) >> 8);
@@ -41,7 +41,7 @@ __device uint van_der_corput(uint bits)
 }
 
 /* sobol radical inverse */
-__device uint sobol(uint i)
+ccl_device uint sobol(uint i)
 {
 	uint r = 0;
 
@@ -53,7 +53,7 @@ __device uint sobol(uint i)
 }
 
 /* inverse of sobol radical inverse */
-__device uint sobol_inverse(uint i)
+ccl_device uint sobol_inverse(uint i)
 {
 	const uint msb = 1U << 31;
 	uint r = 0;
@@ -67,7 +67,7 @@ __device uint sobol_inverse(uint i)
 
 /* multidimensional sobol with generator matrices
  * dimension 0 and 1 are equal to van_der_corput() and sobol() respectively */
-__device uint sobol_dimension(KernelGlobals *kg, int index, int dimension)
+ccl_device uint sobol_dimension(KernelGlobals *kg, int index, int dimension)
 {
 	uint result = 0;
 	uint i = index;
@@ -80,7 +80,7 @@ __device uint sobol_dimension(KernelGlobals *kg, int index, int dimension)
 }
 
 /* lookup index and x/y coordinate, assumes m is a power of two */
-__device uint sobol_lookup(const uint m, const uint frame, const uint ex, const uint ey, uint *x, uint *y)
+ccl_device uint sobol_lookup(const uint m, const uint frame, const uint ex, const uint ey, uint *x, uint *y)
 {
 	/* shift is constant per frame */
 	const uint shift = frame << (m << 1);
@@ -100,7 +100,7 @@ __device uint sobol_lookup(const uint m, const uint frame, const uint ex, const
 	return index;
 }
 
-__device_inline float path_rng_1D(KernelGlobals *kg, RNG *rng, int sample, int num_samples, int dimension)
+ccl_device_inline float path_rng_1D(KernelGlobals *kg, RNG *rng, int sample, int num_samples, int dimension)
 {
 #ifdef __CMJ__
 	if(kernel_data.integrator.sampling_pattern == SAMPLING_PATTERN_CMJ) {
@@ -131,7 +131,7 @@ __device_inline float path_rng_1D(KernelGlobals *kg, RNG *rng, int sample, int n
 #endif
 }
 
-__device_inline void path_rng_2D(KernelGlobals *kg, RNG *rng, int sample, int num_samples, int dimension, float *fx, float *fy)
+ccl_device_inline void path_rng_2D(KernelGlobals *kg, RNG *rng, int sample, int num_samples, int dimension, float *fx, float *fy)
 {
 #ifdef __CMJ__
 	if(kernel_data.integrator.sampling_pattern == SAMPLING_PATTERN_CMJ) {
@@ -148,7 +148,7 @@ __device_inline void path_rng_2D(KernelGlobals *kg, RNG *rng, int sample, int nu
 	}
 }
 
-__device_inline void path_rng_init(KernelGlobals *kg, __global uint *rng_state, int sample, int num_samples, RNG *rng, int x, int y, float *fx, float *fy)
+ccl_device_inline void path_rng_init(KernelGlobals *kg, ccl_global uint *rng_state, int sample, int num_samples, RNG *rng, int x, int y, float *fx, float *fy)
 {
 #ifdef __SOBOL_FULL_SCREEN__
 	uint px, py;
@@ -183,7 +183,7 @@ __device_inline void path_rng_init(KernelGlobals *kg, __global uint *rng_state,
 #endif
 }
 
-__device void path_rng_end(KernelGlobals *kg, __global uint *rng_state, RNG rng)
+ccl_device void path_rng_end(KernelGlobals *kg, ccl_global uint *rng_state, RNG rng)
 {
 	/* nothing to do */
 }
@@ -192,24 +192,24 @@ __device void path_rng_end(KernelGlobals *kg, __global uint *rng_state, RNG rng)
 
 /* Linear Congruential Generator */
 
-__device float path_rng(KernelGlobals *kg, RNG& rng, int sample, int dimension)
+ccl_device float path_rng(KernelGlobals *kg, RNG& rng, int sample, int dimension)
 {
 }
 
-__device_inline float path_rng_1D(KernelGlobals *kg, RNG& rng, int sample, int num_samples, int dimension)
+ccl_device_inline float path_rng_1D(KernelGlobals *kg, RNG& rng, int sample, int num_samples, int dimension)
 {
 	/* implicit mod 2^32 */
 	rng = (1103515245*(rng) + 12345);
 	return (float)rng * (1.0f/(float)0xFFFFFFFF);
 }
 
-__device_inline void path_rng_2D(KernelGlobals *kg, RNG& rng, int sample, int num_samples, int dimension, float *fx, float *fy)
+ccl_device_inline void path_rng_2D(KernelGlobals *kg, RNG& rng, int sample, int num_samples, int dimension, float *fx, float *fy)
 {
 	*fx = path_rng_1D(kg, rng, sample, num_samples, dimension);
 	*fy = path_rng_1D(kg, rng, sample, num_samples, dimension + 1);
 }
 
-__device void path_rng_init(KernelGlobals *kg, __global uint *rng_state, int sample, int num_samples, RNG *rng, int x, int y, float *fx, float *fy)
+ccl_device void path_rng_init(KernelGlobals *kg, ccl_global uint *rng_state, int sample, int num_samples, RNG *rng, int x, int y, float *fx, float *fy)
 {
 	/* load state */
 	*rng = *rng_state;
@@ -225,7 +225,7 @@ __device void path_rng_init(KernelGlobals *kg, __global uint *rng_state, int sam
 	}
 }
 
-__device void path_rng_end(KernelGlobals *kg, __global uint *rng_state, RNG rng)
+ccl_device void path_rng_end(KernelGlobals *kg, ccl_global uint *rng_state, RNG rng)
 {
 	/* store state for next sample */
 	*rng_state = rng;
@@ -233,21 +233,21 @@ __device void path_rng_end(KernelGlobals *kg, __global uint *rng_state, RNG rng)
 
 #endif
 
-__device uint lcg_step_uint(uint *rng)
+ccl_device uint lcg_step_uint(uint *rng)
 {
 	/* implicit mod 2^32 */
 	*rng = (1103515245*(*rng) + 12345);
 	return *rng;
 }
 
-__device float lcg_step_float(uint *rng)
+ccl_device float lcg_step_float(uint *rng)
 {
 	/* implicit mod 2^32 */
 	*rng = (1103515245*(*rng) + 12345);
 	return (float)*rng * (1.0f/(float)0xFFFFFFFF);
 }
 
-__device uint lcg_init(uint seed)
+ccl_device uint lcg_init(uint seed)
 {
 	uint rng = seed;
 	lcg_step_uint(&rng);
diff --git a/intern/cycles/kernel/kernel_shader.h b/intern/cycles/kernel/kernel_shader.h
index 81630caed9a..77154ce3aef 100644
--- a/intern/cycles/kernel/kernel_shader.h
+++ b/intern/cycles/kernel/kernel_shader.h
@@ -36,7 +36,7 @@ CCL_NAMESPACE_BEGIN
 /* ShaderData setup from incoming ray */
 
 #ifdef __OBJECT_MOTION__
-__device void shader_setup_object_transforms(KernelGlobals *kg, ShaderData *sd, float time)
+ccl_device void shader_setup_object_transforms(KernelGlobals *kg, ShaderData *sd, float time)
 {
 	if(sd->flag & SD_OBJECT_MOTION) {
 		sd->ob_tfm = object_fetch_transform_motion(kg, sd->object, time);
@@ -49,7 +49,7 @@ __device void shader_setup_object_transforms(KernelGlobals *kg, ShaderData *sd,
 }
 #endif
 
-__device void shader_setup_from_ray(KernelGlobals *kg, ShaderData *sd,
+ccl_device void shader_setup_from_ray(KernelGlobals *kg, ShaderData *sd,
 	const Intersection *isect, const Ray *ray, int bounce)
 {
 #ifdef __INSTANCING__
@@ -161,7 +161,7 @@ __device void shader_setup_from_ray(KernelGlobals *kg, ShaderData *sd,
 /* ShaderData setup from BSSRDF scatter */
 
 #ifdef __SUBSURFACE__
-__device_inline void shader_setup_from_subsurface(KernelGlobals *kg, ShaderData *sd,
+ccl_device_inline void shader_setup_from_subsurface(KernelGlobals *kg, ShaderData *sd,
 	const Intersection *isect, const Ray *ray)
 {
 	bool backfacing = sd->flag & SD_BACKFACING;
@@ -237,7 +237,7 @@ __device_inline void shader_setup_from_subsurface(KernelGlobals *kg, ShaderData
 
 /* ShaderData setup from position sampled on mesh */
 
-__device void shader_setup_from_sample(KernelGlobals *kg, ShaderData *sd,
+ccl_device void shader_setup_from_sample(KernelGlobals *kg, ShaderData *sd,
 	const float3 P, const float3 Ng, const float3 I,
 	int shader, int object, int prim, float u, float v, float t, float time, int bounce, int segment)
 {
@@ -357,7 +357,7 @@ __device void shader_setup_from_sample(KernelGlobals *kg, ShaderData *sd,
 
 /* ShaderData setup for displacement */
 
-__device void shader_setup_from_displace(KernelGlobals *kg, ShaderData *sd,
+ccl_device void shader_setup_from_displace(KernelGlobals *kg, ShaderData *sd,
 	int object, int prim, float u, float v)
 {
 	float3 P, Ng, I = make_float3(0.0f, 0.0f, 0.0f);
@@ -376,7 +376,7 @@ __device void shader_setup_from_displace(KernelGlobals *kg, ShaderData *sd,
 
 /* ShaderData setup from ray into background */
 
-__device_inline void shader_setup_from_background(KernelGlobals *kg, ShaderData *sd, const Ray *ray, int bounce)
+ccl_device_inline void shader_setup_from_background(KernelGlobals *kg, ShaderData *sd, const Ray *ray, int bounce)
 {
 	/* vectors */
 	sd->P = ray->D;
@@ -426,7 +426,7 @@ __device_inline void shader_setup_from_background(KernelGlobals *kg, ShaderData
 
 #ifdef __MULTI_CLOSURE__
 
-__device_inline void _shader_bsdf_multi_eval(KernelGlobals *kg, const ShaderData *sd, const float3 omega_in, float *pdf,
+ccl_device_inline void _shader_bsdf_multi_eval(KernelGlobals *kg, const ShaderData *sd, const float3 omega_in, float *pdf,
 	int skip_bsdf, BsdfEval *result_eval, float sum_pdf, float sum_sample_weight)
 {
 	/* this is the veach one-sample model with balance heuristic, some pdf
@@ -455,7 +455,7 @@ __device_inline void _shader_bsdf_multi_eval(KernelGlobals *kg, const ShaderData
 
 #endif
 
-__device void shader_bsdf_eval(KernelGlobals *kg, const ShaderData *sd,
+ccl_device void shader_bsdf_eval(KernelGlobals *kg, const ShaderData *sd,
 	const float3 omega_in, BsdfEval *eval, float *pdf)
 {
 #ifdef __MULTI_CLOSURE__
@@ -470,7 +470,7 @@ __device void shader_bsdf_eval(KernelGlobals *kg, const ShaderData *sd,
 #endif
 }
 
-__device int shader_bsdf_sample(KernelGlobals *kg, const ShaderData *sd,
+ccl_device int shader_bsdf_sample(KernelGlobals *kg, const ShaderData *sd,
 	float randu, float randv, BsdfEval *bsdf_eval,
 	float3 *omega_in, differential3 *domega_in, float *pdf)
 {
@@ -534,7 +534,7 @@ __device int shader_bsdf_sample(KernelGlobals *kg, const ShaderData *sd,
 #endif
 }
 
-__device int shader_bsdf_sample_closure(KernelGlobals *kg, const ShaderData *sd,
+ccl_device int shader_bsdf_sample_closure(KernelGlobals *kg, const ShaderData *sd,
 	const ShaderClosure *sc, float randu, float randv, BsdfEval *bsdf_eval,
 	float3 *omega_in, differential3 *domega_in, float *pdf)
 {
@@ -550,7 +550,7 @@ __device int shader_bsdf_sample_closure(KernelGlobals *kg, const ShaderData *sd,
 	return label;
 }
 
-__device void shader_bsdf_blur(KernelGlobals *kg, ShaderData *sd, float roughness)
+ccl_device void shader_bsdf_blur(KernelGlobals *kg, ShaderData *sd, float roughness)
 {
 #ifdef __MULTI_CLOSURE__
 	for(int i = 0; i< sd->num_closure; i++) {
@@ -564,7 +564,7 @@ __device void shader_bsdf_blur(KernelGlobals *kg, ShaderData *sd, float roughnes
 #endif
 }
 
-__device float3 shader_bsdf_transparency(KernelGlobals *kg, ShaderData *sd)
+ccl_device float3 shader_bsdf_transparency(KernelGlobals *kg, ShaderData *sd)
 {
 #ifdef __MULTI_CLOSURE__
 	float3 eval = make_float3(0.0f, 0.0f, 0.0f);
@@ -585,7 +585,7 @@ __device float3 shader_bsdf_transparency(KernelGlobals *kg, ShaderData *sd)
 #endif
 }
 
-__device float3 shader_bsdf_alpha(KernelGlobals *kg, ShaderData *sd)
+ccl_device float3 shader_bsdf_alpha(KernelGlobals *kg, ShaderData *sd)
 {
 	float3 alpha = make_float3(1.0f, 1.0f, 1.0f) - shader_bsdf_transparency(kg, sd);
 
@@ -595,7 +595,7 @@ __device float3 shader_bsdf_alpha(KernelGlobals *kg, ShaderData *sd)
 	return alpha;
 }
 
-__device float3 shader_bsdf_diffuse(KernelGlobals *kg, ShaderData *sd)
+ccl_device float3 shader_bsdf_diffuse(KernelGlobals *kg, ShaderData *sd)
 {
 #ifdef __MULTI_CLOSURE__
 	float3 eval = make_float3(0.0f, 0.0f, 0.0f);
@@ -616,7 +616,7 @@ __device float3 shader_bsdf_diffuse(KernelGlobals *kg, ShaderData *sd)
 #endif
 }
 
-__device float3 shader_bsdf_glossy(KernelGlobals *kg, ShaderData *sd)
+ccl_device float3 shader_bsdf_glossy(KernelGlobals *kg, ShaderData *sd)
 {
 #ifdef __MULTI_CLOSURE__
 	float3 eval = make_float3(0.0f, 0.0f, 0.0f);
@@ -637,7 +637,7 @@ __device float3 shader_bsdf_glossy(KernelGlobals *kg, ShaderData *sd)
 #endif
 }
 
-__device float3 shader_bsdf_transmission(KernelGlobals *kg, ShaderData *sd)
+ccl_device float3 shader_bsdf_transmission(KernelGlobals *kg, ShaderData *sd)
 {
 #ifdef __MULTI_CLOSURE__
 	float3 eval = make_float3(0.0f, 0.0f, 0.0f);
@@ -658,7 +658,7 @@ __device float3 shader_bsdf_transmission(KernelGlobals *kg, ShaderData *sd)
 #endif
 }
 
-__device float3 shader_bsdf_subsurface(KernelGlobals *kg, ShaderData *sd)
+ccl_device float3 shader_bsdf_subsurface(KernelGlobals *kg, ShaderData *sd)
 {
 #ifdef __MULTI_CLOSURE__
 	float3 eval = make_float3(0.0f, 0.0f, 0.0f);
@@ -679,7 +679,7 @@ __device float3 shader_bsdf_subsurface(KernelGlobals *kg, ShaderData *sd)
 #endif
 }
 
-__device float3 shader_bsdf_ao(KernelGlobals *kg, ShaderData *sd, float ao_factor, float3 *N_)
+ccl_device float3 shader_bsdf_ao(KernelGlobals *kg, ShaderData *sd, float ao_factor, float3 *N_)
 {
 #ifdef __MULTI_CLOSURE__
 	float3 eval = make_float3(0.0f, 0.0f, 0.0f);
@@ -717,7 +717,7 @@ __device float3 shader_bsdf_ao(KernelGlobals *kg, ShaderData *sd, float ao_facto
 #endif
 }
 
-__device float3 shader_bssrdf_sum(ShaderData *sd, float3 *N_, float *texture_blur_)
+ccl_device float3 shader_bssrdf_sum(ShaderData *sd, float3 *N_, float *texture_blur_)
 {
 #ifdef __MULTI_CLOSURE__
 	float3 eval = make_float3(0.0f, 0.0f, 0.0f);
@@ -762,7 +762,7 @@ __device float3 shader_bssrdf_sum(ShaderData *sd, float3 *N_, float *texture_blu
 
 /* Emission */
 
-__device float3 emissive_eval(KernelGlobals *kg, ShaderData *sd, ShaderClosure *sc)
+ccl_device float3 emissive_eval(KernelGlobals *kg, ShaderData *sd, ShaderClosure *sc)
 {
 #ifdef __OSL__
 	if(kg->osl && sc->prim)
@@ -772,7 +772,7 @@ __device float3 emissive_eval(KernelGlobals *kg, ShaderData *sd, ShaderClosure *
 	return emissive_simple_eval(sd->Ng, sd->I);
 }
 
-__device float3 shader_emissive_eval(KernelGlobals *kg, ShaderData *sd)
+ccl_device float3 shader_emissive_eval(KernelGlobals *kg, ShaderData *sd)
 {
 	float3 eval;
 #ifdef __MULTI_CLOSURE__
@@ -793,7 +793,7 @@ __device float3 shader_emissive_eval(KernelGlobals *kg, ShaderData *sd)
 
 /* Holdout */
 
-__device float3 shader_holdout_eval(KernelGlobals *kg, ShaderData *sd)
+ccl_device float3 shader_holdout_eval(KernelGlobals *kg, ShaderData *sd)
 {
 #ifdef __MULTI_CLOSURE__
 	float3 weight = make_float3(0.0f, 0.0f, 0.0f);
@@ -816,7 +816,7 @@ __device float3 shader_holdout_eval(KernelGlobals *kg, ShaderData *sd)
 
 /* Surface Evaluation */
 
-__device void shader_eval_surface(KernelGlobals *kg, ShaderData *sd,
+ccl_device void shader_eval_surface(KernelGlobals *kg, ShaderData *sd,
 	float randb, int path_flag, ShaderContext ctx)
 {
 #ifdef __OSL__
@@ -837,7 +837,7 @@ __device void shader_eval_surface(KernelGlobals *kg, ShaderData *sd,
 
 /* Background Evaluation */
 
-__device float3 shader_eval_background(KernelGlobals *kg, ShaderData *sd, int path_flag, ShaderContext ctx)
+ccl_device float3 shader_eval_background(KernelGlobals *kg, ShaderData *sd, int path_flag, ShaderContext ctx)
 {
 #ifdef __OSL__
 	if (kg->osl)
@@ -875,7 +875,7 @@ __device float3 shader_eval_background(KernelGlobals *kg, ShaderData *sd, int pa
 
 /* Volume */
 
-__device float3 shader_volume_eval_phase(KernelGlobals *kg, ShaderData *sd,
+ccl_device float3 shader_volume_eval_phase(KernelGlobals *kg, ShaderData *sd,
 	float3 omega_in, float3 omega_out)
 {
 #ifdef __MULTI_CLOSURE__
@@ -896,7 +896,7 @@ __device float3 shader_volume_eval_phase(KernelGlobals *kg, ShaderData *sd,
 
 /* Volume Evaluation */
 
-__device void shader_eval_volume(KernelGlobals *kg, ShaderData *sd,
+ccl_device void shader_eval_volume(KernelGlobals *kg, ShaderData *sd,
 	float randb, int path_flag, ShaderContext ctx)
 {
 #ifdef __SVM__
@@ -911,7 +911,7 @@ __device void shader_eval_volume(KernelGlobals *kg, ShaderData *sd,
 
 /* Displacement Evaluation */
 
-__device void shader_eval_displacement(KernelGlobals *kg, ShaderData *sd, ShaderContext ctx)
+ccl_device void shader_eval_displacement(KernelGlobals *kg, ShaderData *sd, ShaderContext ctx)
 {
 	/* this will modify sd->P */
 #ifdef __SVM__
@@ -927,7 +927,7 @@ __device void shader_eval_displacement(KernelGlobals *kg, ShaderData *sd, Shader
 /* Transparent Shadows */
 
 #ifdef __TRANSPARENT_SHADOWS__
-__device bool shader_transparent_shadow(KernelGlobals *kg, Intersection *isect)
+ccl_device bool shader_transparent_shadow(KernelGlobals *kg, Intersection *isect)
 {
 	int prim = kernel_tex_fetch(__prim_index, isect->prim);
 	int shader = 0;
@@ -953,7 +953,7 @@ __device bool shader_transparent_shadow(KernelGlobals *kg, Intersection *isect)
 /* Merging */
 
 #ifdef __BRANCHED_PATH__
-__device void shader_merge_closures(KernelGlobals *kg, ShaderData *sd)
+ccl_device void shader_merge_closures(KernelGlobals *kg, ShaderData *sd)
 {
 	/* merge identical closures, better when we sample a single closure at a time */
 	for(int i = 0; i < sd->num_closure; i++) {
diff --git a/intern/cycles/kernel/kernel_subsurface.h b/intern/cycles/kernel/kernel_subsurface.h
index d16b9328bf2..2326ca18b55 100644
--- a/intern/cycles/kernel/kernel_subsurface.h
+++ b/intern/cycles/kernel/kernel_subsurface.h
@@ -25,7 +25,7 @@ CCL_NAMESPACE_BEGIN
 
 #define BSSRDF_MULTI_EVAL
 
-__device ShaderClosure *subsurface_scatter_pick_closure(KernelGlobals *kg, ShaderData *sd, float *probability)
+ccl_device ShaderClosure *subsurface_scatter_pick_closure(KernelGlobals *kg, ShaderData *sd, float *probability)
 {
 	/* sum sample weights of bssrdf and bsdf */
 	float bsdf_sum = 0.0f;
@@ -80,7 +80,7 @@ __device ShaderClosure *subsurface_scatter_pick_closure(KernelGlobals *kg, Shade
 	return NULL;
 }
 
-__device float3 subsurface_scatter_eval(ShaderData *sd, ShaderClosure *sc, float disk_r, float r, bool all)
+ccl_device float3 subsurface_scatter_eval(ShaderData *sd, ShaderClosure *sc, float disk_r, float r, bool all)
 {
 #ifdef BSSRDF_MULTI_EVAL
 	/* this is the veach one-sample model with balance heuristic, some pdf
@@ -133,7 +133,7 @@ __device float3 subsurface_scatter_eval(ShaderData *sd, ShaderClosure *sc, float
 }
 
 /* replace closures with a single diffuse bsdf closure after scatter step */
-__device void subsurface_scatter_setup_diffuse_bsdf(ShaderData *sd, float3 weight, bool hit, float3 N)
+ccl_device void subsurface_scatter_setup_diffuse_bsdf(ShaderData *sd, float3 weight, bool hit, float3 N)
 {
 	sd->flag &= ~SD_CLOSURE_FLAGS;
 	sd->randb_closure = 0.0f;
@@ -158,7 +158,7 @@ __device void subsurface_scatter_setup_diffuse_bsdf(ShaderData *sd, float3 weigh
 }
 
 /* optionally do blurring of color and/or bump mapping, at the cost of a shader evaluation */
-__device float3 subsurface_color_pow(float3 color, float exponent)
+ccl_device float3 subsurface_color_pow(float3 color, float exponent)
 {
 	color = max(color, make_float3(0.0f, 0.0f, 0.0f));
 
@@ -179,7 +179,7 @@ __device float3 subsurface_color_pow(float3 color, float exponent)
 	return color;
 }
 
-__device void subsurface_color_bump_blur(KernelGlobals *kg, ShaderData *out_sd, ShaderData *in_sd, int state_flag, float3 *eval, float3 *N)
+ccl_device void subsurface_color_bump_blur(KernelGlobals *kg, ShaderData *out_sd, ShaderData *in_sd, int state_flag, float3 *eval, float3 *N)
 {
 	/* average color and texture blur at outgoing point */
 	float texture_blur;
@@ -207,7 +207,7 @@ __device void subsurface_color_bump_blur(KernelGlobals *kg, ShaderData *out_sd,
 }
 
 /* subsurface scattering step, from a point on the surface to other nearby points on the same object */
-__device int subsurface_scatter_multi_step(KernelGlobals *kg, ShaderData *sd, ShaderData bssrdf_sd[BSSRDF_MAX_HITS],
+ccl_device int subsurface_scatter_multi_step(KernelGlobals *kg, ShaderData *sd, ShaderData bssrdf_sd[BSSRDF_MAX_HITS],
 	int state_flag, ShaderClosure *sc, uint *lcg_state, float disk_u, float disk_v, bool all)
 {
 	/* pick random axis in local frame and point on disk */
@@ -313,7 +313,7 @@ __device int subsurface_scatter_multi_step(KernelGlobals *kg, ShaderData *sd, Sh
 }
 
 /* subsurface scattering step, from a point on the surface to another nearby point on the same object */
-__device void subsurface_scatter_step(KernelGlobals *kg, ShaderData *sd,
+ccl_device void subsurface_scatter_step(KernelGlobals *kg, ShaderData *sd,
 	int state_flag, ShaderClosure *sc, uint *lcg_state, float disk_u, float disk_v, bool all)
 {
 	float3 eval = make_float3(0.0f, 0.0f, 0.0f);
diff --git a/intern/cycles/kernel/kernel_triangle.h b/intern/cycles/kernel/kernel_triangle.h
index 71389e0ec32..d457b67e77e 100644
--- a/intern/cycles/kernel/kernel_triangle.h
+++ b/intern/cycles/kernel/kernel_triangle.h
@@ -17,7 +17,7 @@
 CCL_NAMESPACE_BEGIN
 
 /* Point on triangle for Moller-Trumbore triangles */
-__device_inline float3 triangle_point_MT(KernelGlobals *kg, int tri_index, float u, float v)
+ccl_device_inline float3 triangle_point_MT(KernelGlobals *kg, int tri_index, float u, float v)
 {
 	/* load triangle vertices */
 	float3 tri_vindex = float4_to_float3(kernel_tex_fetch(__tri_vindex, tri_index));
@@ -32,7 +32,7 @@ __device_inline float3 triangle_point_MT(KernelGlobals *kg, int tri_index, float
 }
 
 /* Sample point on triangle */
-__device_inline float3 triangle_sample_MT(KernelGlobals *kg, int tri_index, float randu, float randv)
+ccl_device_inline float3 triangle_sample_MT(KernelGlobals *kg, int tri_index, float randu, float randv)
 {
 	/* compute point */
 	randu = sqrtf(randu);
@@ -44,7 +44,7 @@ __device_inline float3 triangle_sample_MT(KernelGlobals *kg, int tri_index, floa
 }
 
 /* Normal for Moller-Trumbore triangles */
-__device_inline float3 triangle_normal_MT(KernelGlobals *kg, int tri_index, int *shader)
+ccl_device_inline float3 triangle_normal_MT(KernelGlobals *kg, int tri_index, int *shader)
 {
 #if 0
 	/* load triangle vertices */
@@ -64,7 +64,7 @@ __device_inline float3 triangle_normal_MT(KernelGlobals *kg, int tri_index, int
 }
 
 /* Return 3 triangle vertex locations */
-__device_inline void triangle_vertices(KernelGlobals *kg, int tri_index, float3 P[3])
+ccl_device_inline void triangle_vertices(KernelGlobals *kg, int tri_index, float3 P[3])
 {
 	/* load triangle vertices */
 	float3 tri_vindex = float4_to_float3(kernel_tex_fetch(__tri_vindex, tri_index));
@@ -74,7 +74,7 @@ __device_inline void triangle_vertices(KernelGlobals *kg, int tri_index, float3
 	P[2] = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.z)));
 }
 
-__device_inline float3 triangle_smooth_normal(KernelGlobals *kg, int tri_index, float u, float v)
+ccl_device_inline float3 triangle_smooth_normal(KernelGlobals *kg, int tri_index, float u, float v)
 {
 	/* load triangle vertices */
 	float3 tri_vindex = float4_to_float3(kernel_tex_fetch(__tri_vindex, tri_index));
@@ -86,7 +86,7 @@ __device_inline float3 triangle_smooth_normal(KernelGlobals *kg, int tri_index,
 	return normalize((1.0f - u - v)*n2 + u*n0 + v*n1);
 }
 
-__device_inline void triangle_dPdudv(KernelGlobals *kg, float3 *dPdu, float3 *dPdv, int tri)
+ccl_device_inline void triangle_dPdudv(KernelGlobals *kg, float3 *dPdu, float3 *dPdv, int tri)
 {
 	/* fetch triangle vertex coordinates */
 	float3 tri_vindex = float4_to_float3(kernel_tex_fetch(__tri_vindex, tri));
@@ -102,7 +102,7 @@ __device_inline void triangle_dPdudv(KernelGlobals *kg, float3 *dPdu, float3 *dP
 
 /* attributes */
 
-__device float triangle_attribute_float(KernelGlobals *kg, const ShaderData *sd, AttributeElement elem, int offset, float *dx, float *dy)
+ccl_device float triangle_attribute_float(KernelGlobals *kg, const ShaderData *sd, AttributeElement elem, int offset, float *dx, float *dy)
 {
 	if(elem == ATTR_ELEMENT_FACE) {
 		if(dx) *dx = 0.0f;
@@ -145,7 +145,7 @@ __device float triangle_attribute_float(KernelGlobals *kg, const ShaderData *sd,
 	}
 }
 
-__device float3 triangle_attribute_float3(KernelGlobals *kg, const ShaderData *sd, AttributeElement elem, int offset, float3 *dx, float3 *dy)
+ccl_device float3 triangle_attribute_float3(KernelGlobals *kg, const ShaderData *sd, AttributeElement elem, int offset, float3 *dx, float3 *dy)
 {
 	if(elem == ATTR_ELEMENT_FACE) {
 		if(dx) *dx = make_float3(0.0f, 0.0f, 0.0f);
diff --git a/intern/cycles/kernel/svm/svm.h b/intern/cycles/kernel/svm/svm.h
index 168127c620c..b2be9deb938 100644
--- a/intern/cycles/kernel/svm/svm.h
+++ b/intern/cycles/kernel/svm/svm.h
@@ -45,14 +45,14 @@ CCL_NAMESPACE_BEGIN
 
 /* Stack */
 
-__device_inline float3 stack_load_float3(float *stack, uint a)
+ccl_device_inline float3 stack_load_float3(float *stack, uint a)
 {
 	kernel_assert(a+2 < SVM_STACK_SIZE);
 
 	return make_float3(stack[a+0], stack[a+1], stack[a+2]);
 }
 
-__device_inline void stack_store_float3(float *stack, uint a, float3 f)
+ccl_device_inline void stack_store_float3(float *stack, uint a, float3 f)
 {
 	kernel_assert(a+2 < SVM_STACK_SIZE);
 
@@ -61,59 +61,59 @@ __device_inline void stack_store_float3(float *stack, uint a, float3 f)
 	stack[a+2] = f.z;
 }
 
-__device_inline float stack_load_float(float *stack, uint a)
+ccl_device_inline float stack_load_float(float *stack, uint a)
 {
 	kernel_assert(a < SVM_STACK_SIZE);
 
 	return stack[a];
 }
 
-__device_inline float stack_load_float_default(float *stack, uint a, uint value)
+ccl_device_inline float stack_load_float_default(float *stack, uint a, uint value)
 {
 	return (a == (uint)SVM_STACK_INVALID)? __uint_as_float(value): stack_load_float(stack, a);
 }
 
-__device_inline void stack_store_float(float *stack, uint a, float f)
+ccl_device_inline void stack_store_float(float *stack, uint a, float f)
 {
 	kernel_assert(a < SVM_STACK_SIZE);
 
 	stack[a] = f;
 }
 
-__device_inline int stack_load_int(float *stack, uint a)
+ccl_device_inline int stack_load_int(float *stack, uint a)
 {
 	kernel_assert(a < SVM_STACK_SIZE);
 
 	return __float_as_int(stack[a]);
 }
 
-__device_inline float stack_load_int_default(float *stack, uint a, uint value)
+ccl_device_inline float stack_load_int_default(float *stack, uint a, uint value)
 {
 	return (a == (uint)SVM_STACK_INVALID)? (int)value: stack_load_int(stack, a);
 }
 
-__device_inline void stack_store_int(float *stack, uint a, int i)
+ccl_device_inline void stack_store_int(float *stack, uint a, int i)
 {
 	kernel_assert(a < SVM_STACK_SIZE);
 
 	stack[a] = __int_as_float(i);
 }
 
-__device_inline bool stack_valid(uint a)
+ccl_device_inline bool stack_valid(uint a)
 {
 	return a != (uint)SVM_STACK_INVALID;
 }
 
 /* Reading Nodes */
 
-__device_inline uint4 read_node(KernelGlobals *kg, int *offset)
+ccl_device_inline uint4 read_node(KernelGlobals *kg, int *offset)
 {
 	uint4 node = kernel_tex_fetch(__svm_nodes, *offset);
 	(*offset)++;
 	return node;
 }
 
-__device_inline float4 read_node_float(KernelGlobals *kg, int *offset)
+ccl_device_inline float4 read_node_float(KernelGlobals *kg, int *offset)
 {
 	uint4 node = kernel_tex_fetch(__svm_nodes, *offset);
 	float4 f = make_float4(__uint_as_float(node.x), __uint_as_float(node.y), __uint_as_float(node.z), __uint_as_float(node.w));
@@ -121,13 +121,13 @@ __device_inline float4 read_node_float(KernelGlobals *kg, int *offset)
 	return f;
 }
 
-__device_inline float4 fetch_node_float(KernelGlobals *kg, int offset)
+ccl_device_inline float4 fetch_node_float(KernelGlobals *kg, int offset)
 {
 	uint4 node = kernel_tex_fetch(__svm_nodes, offset);
 	return make_float4(__uint_as_float(node.x), __uint_as_float(node.y), __uint_as_float(node.z), __uint_as_float(node.w));
 }
 
-__device_inline void decode_node_uchar4(uint i, uint *x, uint *y, uint *z, uint *w)
+ccl_device_inline void decode_node_uchar4(uint i, uint *x, uint *y, uint *z, uint *w)
 {
 	if(x) *x = (i & 0xFF);
 	if(y) *y = ((i >> 8) & 0xFF);
@@ -182,7 +182,7 @@ CCL_NAMESPACE_BEGIN
 
 /* Main Interpreter Loop */
 
-__device_noinline void svm_eval_nodes(KernelGlobals *kg, ShaderData *sd, ShaderType type, float randb, int path_flag)
+ccl_device_noinline void svm_eval_nodes(KernelGlobals *kg, ShaderData *sd, ShaderType type, float randb, int path_flag)
 {
 	float stack[SVM_STACK_SIZE];
 	float closure_weight = 1.0f;
diff --git a/intern/cycles/kernel/svm/svm_attribute.h b/intern/cycles/kernel/svm/svm_attribute.h
index 8e71e7cdd56..90409e16477 100644
--- a/intern/cycles/kernel/svm/svm_attribute.h
+++ b/intern/cycles/kernel/svm/svm_attribute.h
@@ -18,7 +18,7 @@ CCL_NAMESPACE_BEGIN
 
 /* Attribute Node */
 
-__device void svm_node_attr_init(KernelGlobals *kg, ShaderData *sd,
+ccl_device void svm_node_attr_init(KernelGlobals *kg, ShaderData *sd,
 	uint4 node, NodeAttributeType *type,
 	NodeAttributeType *mesh_type, AttributeElement *elem, int *offset, uint *out_offset)
 {
@@ -52,7 +52,7 @@ __device void svm_node_attr_init(KernelGlobals *kg, ShaderData *sd,
 	*type = (NodeAttributeType)node.w;
 }
 
-__device void svm_node_attr(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node)
+ccl_device void svm_node_attr(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node)
 {
 	NodeAttributeType type, mesh_type;
 	AttributeElement elem;
@@ -84,7 +84,7 @@ __device void svm_node_attr(KernelGlobals *kg, ShaderData *sd, float *stack, uin
 	}
 }
 
-__device void svm_node_attr_bump_dx(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node)
+ccl_device void svm_node_attr_bump_dx(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node)
 {
 	NodeAttributeType type, mesh_type;
 	AttributeElement elem;
@@ -120,7 +120,7 @@ __device void svm_node_attr_bump_dx(KernelGlobals *kg, ShaderData *sd, float *st
 	}
 }
 
-__device void svm_node_attr_bump_dy(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node)
+ccl_device void svm_node_attr_bump_dy(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node)
 {
 	NodeAttributeType type, mesh_type;
 	AttributeElement elem;
diff --git a/intern/cycles/kernel/svm/svm_blackbody.h b/intern/cycles/kernel/svm/svm_blackbody.h
index 2fc2c770a83..63dbf27d35e 100644
--- a/intern/cycles/kernel/svm/svm_blackbody.h
+++ b/intern/cycles/kernel/svm/svm_blackbody.h
@@ -34,7 +34,7 @@ CCL_NAMESPACE_BEGIN
 
 /* Blackbody Node */
 
-__device void svm_node_blackbody(KernelGlobals *kg, ShaderData *sd, float *stack, uint temperature_offset, uint col_offset)
+ccl_device void svm_node_blackbody(KernelGlobals *kg, ShaderData *sd, float *stack, uint temperature_offset, uint col_offset)
 {
 	/* Output */
 	float3 color_rgb = make_float3(0.0f, 0.0f, 0.0f);
diff --git a/intern/cycles/kernel/svm/svm_brick.h b/intern/cycles/kernel/svm/svm_brick.h
index 19b4b5e779f..7cac922d8a6 100644
--- a/intern/cycles/kernel/svm/svm_brick.h
+++ b/intern/cycles/kernel/svm/svm_brick.h
@@ -18,7 +18,7 @@ CCL_NAMESPACE_BEGIN
 
 /* Brick */
 
-__device_noinline float brick_noise(int n) /* fast integer noise */
+ccl_device_noinline float brick_noise(int n) /* fast integer noise */
 {
 	int nn;
 	n = (n >> 13) ^ n;
@@ -26,7 +26,7 @@ __device_noinline float brick_noise(int n) /* fast integer noise */
 	return 0.5f * ((float)nn / 1073741824.0f);
 }
 
-__device_noinline float2 svm_brick(float3 p, float scale, float mortar_size, float bias,
+ccl_device_noinline float2 svm_brick(float3 p, float scale, float mortar_size, float bias,
 	float brick_width, float row_height, float offset_amount, int offset_frequency,
 	float squash_amount, int squash_frequency)
 {	
@@ -56,7 +56,7 @@ __device_noinline float2 svm_brick(float3 p, float scale, float mortar_size, flo
 		y > (row_height - mortar_size)) ? 1.0f : 0.0f);
 }
 
-__device void svm_node_tex_brick(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int *offset)
+ccl_device void svm_node_tex_brick(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int *offset)
 {	
 	uint4 node2 = read_node(kg, offset);
 	uint4 node3 = read_node(kg, offset);
diff --git a/intern/cycles/kernel/svm/svm_brightness.h b/intern/cycles/kernel/svm/svm_brightness.h
index 3e977dcbe1b..9b330b3213f 100644
--- a/intern/cycles/kernel/svm/svm_brightness.h
+++ b/intern/cycles/kernel/svm/svm_brightness.h
@@ -16,7 +16,7 @@
 
 CCL_NAMESPACE_BEGIN
 
-__device void svm_node_brightness(ShaderData *sd, float *stack, uint in_color, uint out_color, uint node)
+ccl_device void svm_node_brightness(ShaderData *sd, float *stack, uint in_color, uint out_color, uint node)
 {
 	uint bright_offset, contrast_offset;
 	float3 color = stack_load_float3(stack, in_color);
diff --git a/intern/cycles/kernel/svm/svm_camera.h b/intern/cycles/kernel/svm/svm_camera.h
index 76f50e196eb..bfe9289fa02 100644
--- a/intern/cycles/kernel/svm/svm_camera.h
+++ b/intern/cycles/kernel/svm/svm_camera.h
@@ -16,7 +16,7 @@
 
 CCL_NAMESPACE_BEGIN
 
-__device void svm_node_camera(KernelGlobals *kg, ShaderData *sd, float *stack, uint out_vector, uint out_zdepth, uint out_distance)
+ccl_device void svm_node_camera(KernelGlobals *kg, ShaderData *sd, float *stack, uint out_vector, uint out_zdepth, uint out_distance)
 {
 	float distance;
 	float zdepth;
diff --git a/intern/cycles/kernel/svm/svm_checker.h b/intern/cycles/kernel/svm/svm_checker.h
index 70fe2ac5a92..ebc48e16d68 100644
--- a/intern/cycles/kernel/svm/svm_checker.h
+++ b/intern/cycles/kernel/svm/svm_checker.h
@@ -18,7 +18,7 @@ CCL_NAMESPACE_BEGIN
 
 /* Checker */
 
-__device_noinline float svm_checker(float3 p, float scale)
+ccl_device_noinline float svm_checker(float3 p, float scale)
 {	
 	p *= scale;
 
@@ -34,7 +34,7 @@ __device_noinline float svm_checker(float3 p, float scale)
 	return ((xi % 2 == yi % 2) == (zi % 2))? 1.0f: 0.0f;
 }
 
-__device void svm_node_tex_checker(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node)
+ccl_device void svm_node_tex_checker(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node)
 {	
 	uint co_offset, color1_offset, color2_offset, scale_offset;
 	uint color_offset, fac_offset;
diff --git a/intern/cycles/kernel/svm/svm_closure.h b/intern/cycles/kernel/svm/svm_closure.h
index 0d4716ab078..2c6fb5deca4 100644
--- a/intern/cycles/kernel/svm/svm_closure.h
+++ b/intern/cycles/kernel/svm/svm_closure.h
@@ -18,7 +18,7 @@ CCL_NAMESPACE_BEGIN
 
 /* Closure Nodes */
 
-__device void svm_node_glass_setup(ShaderData *sd, ShaderClosure *sc, int type, float eta, float roughness, bool refract)
+ccl_device void svm_node_glass_setup(ShaderData *sd, ShaderClosure *sc, int type, float eta, float roughness, bool refract)
 {
 	if(type == CLOSURE_BSDF_SHARP_GLASS_ID) {
 		if(refract) {
@@ -49,7 +49,7 @@ __device void svm_node_glass_setup(ShaderData *sd, ShaderClosure *sc, int type,
 	}
 }
 
-__device_inline ShaderClosure *svm_node_closure_get_non_bsdf(ShaderData *sd, ClosureType type, float mix_weight)
+ccl_device_inline ShaderClosure *svm_node_closure_get_non_bsdf(ShaderData *sd, ClosureType type, float mix_weight)
 {
 #ifdef __MULTI_CLOSURE__
 	ShaderClosure *sc = &sd->closure[sd->num_closure];
@@ -70,7 +70,7 @@ __device_inline ShaderClosure *svm_node_closure_get_non_bsdf(ShaderData *sd, Clo
 #endif
 }
 
-__device_inline ShaderClosure *svm_node_closure_get_bsdf(ShaderData *sd, float mix_weight)
+ccl_device_inline ShaderClosure *svm_node_closure_get_bsdf(ShaderData *sd, float mix_weight)
 {
 #ifdef __MULTI_CLOSURE__
 	ShaderClosure *sc = &sd->closure[sd->num_closure];
@@ -93,7 +93,7 @@ __device_inline ShaderClosure *svm_node_closure_get_bsdf(ShaderData *sd, float m
 #endif
 }
 
-__device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, float randb, int path_flag, int *offset)
+ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, float randb, int path_flag, int *offset)
 {
 	uint type, param1_offset, param2_offset;
 
@@ -456,7 +456,7 @@ __device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *st
 	}
 }
 
-__device void svm_node_closure_volume(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int path_flag)
+ccl_device void svm_node_closure_volume(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int path_flag)
 {
 	uint type, param1_offset, param2_offset;
 
@@ -499,7 +499,7 @@ __device void svm_node_closure_volume(KernelGlobals *kg, ShaderData *sd, float *
 	}
 }
 
-__device void svm_node_closure_emission(ShaderData *sd, float *stack, uint4 node)
+ccl_device void svm_node_closure_emission(ShaderData *sd, float *stack, uint4 node)
 {
 #ifdef __MULTI_CLOSURE__
 	uint mix_weight_offset = node.y;
@@ -522,7 +522,7 @@ __device void svm_node_closure_emission(ShaderData *sd, float *stack, uint4 node
 	sd->flag |= SD_EMISSION;
 }
 
-__device void svm_node_closure_background(ShaderData *sd, float *stack, uint4 node)
+ccl_device void svm_node_closure_background(ShaderData *sd, float *stack, uint4 node)
 {
 #ifdef __MULTI_CLOSURE__
 	uint mix_weight_offset = node.y;
@@ -543,7 +543,7 @@ __device void svm_node_closure_background(ShaderData *sd, float *stack, uint4 no
 #endif
 }
 
-__device void svm_node_closure_holdout(ShaderData *sd, float *stack, uint4 node)
+ccl_device void svm_node_closure_holdout(ShaderData *sd, float *stack, uint4 node)
 {
 #ifdef __MULTI_CLOSURE__
 	uint mix_weight_offset = node.y;
@@ -566,7 +566,7 @@ __device void svm_node_closure_holdout(ShaderData *sd, float *stack, uint4 node)
 	sd->flag |= SD_HOLDOUT;
 }
 
-__device void svm_node_closure_ambient_occlusion(ShaderData *sd, float *stack, uint4 node)
+ccl_device void svm_node_closure_ambient_occlusion(ShaderData *sd, float *stack, uint4 node)
 {
 #ifdef __MULTI_CLOSURE__
 	uint mix_weight_offset = node.y;
@@ -591,7 +591,7 @@ __device void svm_node_closure_ambient_occlusion(ShaderData *sd, float *stack, u
 
 /* Closure Nodes */
 
-__device_inline void svm_node_closure_store_weight(ShaderData *sd, float3 weight)
+ccl_device_inline void svm_node_closure_store_weight(ShaderData *sd, float3 weight)
 {
 #ifdef __MULTI_CLOSURE__
 	if(sd->num_closure < MAX_CLOSURE)
@@ -601,13 +601,13 @@ __device_inline void svm_node_closure_store_weight(ShaderData *sd, float3 weight
 #endif
 }
 
-__device void svm_node_closure_set_weight(ShaderData *sd, uint r, uint g, uint b)
+ccl_device void svm_node_closure_set_weight(ShaderData *sd, uint r, uint g, uint b)
 {
 	float3 weight = make_float3(__uint_as_float(r), __uint_as_float(g), __uint_as_float(b));
 	svm_node_closure_store_weight(sd, weight);
 }
 
-__device void svm_node_emission_set_weight_total(KernelGlobals *kg, ShaderData *sd, uint r, uint g, uint b)
+ccl_device void svm_node_emission_set_weight_total(KernelGlobals *kg, ShaderData *sd, uint r, uint g, uint b)
 {
 	float3 weight = make_float3(__uint_as_float(r), __uint_as_float(g), __uint_as_float(b));
 
@@ -617,14 +617,14 @@ __device void svm_node_emission_set_weight_total(KernelGlobals *kg, ShaderData *
 	svm_node_closure_store_weight(sd, weight);
 }
 
-__device void svm_node_closure_weight(ShaderData *sd, float *stack, uint weight_offset)
+ccl_device void svm_node_closure_weight(ShaderData *sd, float *stack, uint weight_offset)
 {
 	float3 weight = stack_load_float3(stack, weight_offset);
 
 	svm_node_closure_store_weight(sd, weight);
 }
 
-__device void svm_node_emission_weight(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node)
+ccl_device void svm_node_emission_weight(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node)
 {
 	uint color_offset = node.y;
 	uint strength_offset = node.z;
@@ -639,7 +639,7 @@ __device void svm_node_emission_weight(KernelGlobals *kg, ShaderData *sd, float
 	svm_node_closure_store_weight(sd, weight);
 }
 
-__device void svm_node_mix_closure(ShaderData *sd, float *stack,
+ccl_device void svm_node_mix_closure(ShaderData *sd, float *stack,
 	uint4 node, int *offset, float *randb)
 {
 #ifdef __MULTI_CLOSURE__
@@ -675,7 +675,7 @@ __device void svm_node_mix_closure(ShaderData *sd, float *stack,
 #endif
 }
 
-__device void svm_node_add_closure(ShaderData *sd, float *stack, uint unused,
+ccl_device void svm_node_add_closure(ShaderData *sd, float *stack, uint unused,
 	uint node_jump, int *offset, float *randb, float *closure_weight)
 {
 #ifdef __MULTI_CLOSURE__
@@ -699,7 +699,7 @@ __device void svm_node_add_closure(ShaderData *sd, float *stack, uint unused,
 
 /* (Bump) normal */
 
-__device void svm_node_set_normal(KernelGlobals *kg, ShaderData *sd, float *stack, uint in_direction, uint out_normal)
+ccl_device void svm_node_set_normal(KernelGlobals *kg, ShaderData *sd, float *stack, uint in_direction, uint out_normal)
 {
 	float3 normal = stack_load_float3(stack, in_direction);
 	sd->N = normal;
diff --git a/intern/cycles/kernel/svm/svm_convert.h b/intern/cycles/kernel/svm/svm_convert.h
index 22f4651689d..2503912c5c6 100644
--- a/intern/cycles/kernel/svm/svm_convert.h
+++ b/intern/cycles/kernel/svm/svm_convert.h
@@ -18,7 +18,7 @@ CCL_NAMESPACE_BEGIN
 
 /* Conversion Nodes */
 
-__device void svm_node_convert(ShaderData *sd, float *stack, uint type, uint from, uint to)
+ccl_device void svm_node_convert(ShaderData *sd, float *stack, uint type, uint from, uint to)
 {
 	switch(type) {
 		case NODE_CONVERT_FI: {
diff --git a/intern/cycles/kernel/svm/svm_displace.h b/intern/cycles/kernel/svm/svm_displace.h
index d0bac647a7c..6cd5ee4b375 100644
--- a/intern/cycles/kernel/svm/svm_displace.h
+++ b/intern/cycles/kernel/svm/svm_displace.h
@@ -18,7 +18,7 @@ CCL_NAMESPACE_BEGIN
 
 /* Bump Node */
 
-__device void svm_node_set_bump(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node)
+ccl_device void svm_node_set_bump(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node)
 {
 #ifdef __RAY_DIFFERENTIALS__
 	/* get normal input */
@@ -62,7 +62,7 @@ __device void svm_node_set_bump(KernelGlobals *kg, ShaderData *sd, float *stack,
 
 /* Displacement Node */
 
-__device void svm_node_set_displacement(ShaderData *sd, float *stack, uint fac_offset)
+ccl_device void svm_node_set_displacement(ShaderData *sd, float *stack, uint fac_offset)
 {
 	float d = stack_load_float(stack, fac_offset);
 	sd->P += sd->N*d*0.1f; /* todo: get rid of this factor */
diff --git a/intern/cycles/kernel/svm/svm_fresnel.h b/intern/cycles/kernel/svm/svm_fresnel.h
index d97d6a3738f..bb70a3faa2a 100644
--- a/intern/cycles/kernel/svm/svm_fresnel.h
+++ b/intern/cycles/kernel/svm/svm_fresnel.h
@@ -18,7 +18,7 @@ CCL_NAMESPACE_BEGIN
 
 /* Fresnel Node */
 
-__device void svm_node_fresnel(ShaderData *sd, float *stack, uint ior_offset, uint ior_value, uint node)
+ccl_device void svm_node_fresnel(ShaderData *sd, float *stack, uint ior_offset, uint ior_value, uint node)
 {
 	uint normal_offset, out_offset;
 	decode_node_uchar4(node, &normal_offset, &out_offset, NULL, NULL);
@@ -35,7 +35,7 @@ __device void svm_node_fresnel(ShaderData *sd, float *stack, uint ior_offset, ui
 
 /* Layer Weight Node */
 
-__device void svm_node_layer_weight(ShaderData *sd, float *stack, uint4 node)
+ccl_device void svm_node_layer_weight(ShaderData *sd, float *stack, uint4 node)
 {
 	uint blend_offset = node.y;
 	uint blend_value = node.z;
diff --git a/intern/cycles/kernel/svm/svm_gamma.h b/intern/cycles/kernel/svm/svm_gamma.h
index ef1581fba8d..c4749e7b936 100644
--- a/intern/cycles/kernel/svm/svm_gamma.h
+++ b/intern/cycles/kernel/svm/svm_gamma.h
@@ -16,7 +16,7 @@
 
 CCL_NAMESPACE_BEGIN
 
-__device void svm_node_gamma(ShaderData *sd, float *stack, uint in_gamma, uint in_color, uint out_color)
+ccl_device void svm_node_gamma(ShaderData *sd, float *stack, uint in_gamma, uint in_color, uint out_color)
 {
 	float3 color = stack_load_float3(stack, in_color);
 	float gamma = stack_load_float(stack, in_gamma);
diff --git a/intern/cycles/kernel/svm/svm_geometry.h b/intern/cycles/kernel/svm/svm_geometry.h
index 818d8694453..ad0cacb027a 100644
--- a/intern/cycles/kernel/svm/svm_geometry.h
+++ b/intern/cycles/kernel/svm/svm_geometry.h
@@ -18,7 +18,7 @@ CCL_NAMESPACE_BEGIN
 
 /* Geometry Node */
 
-__device void svm_node_geometry(KernelGlobals *kg, ShaderData *sd, float *stack, uint type, uint out_offset)
+ccl_device void svm_node_geometry(KernelGlobals *kg, ShaderData *sd, float *stack, uint type, uint out_offset)
 {
 	float3 data;
 
@@ -38,7 +38,7 @@ __device void svm_node_geometry(KernelGlobals *kg, ShaderData *sd, float *stack,
 	stack_store_float3(stack, out_offset, data);
 }
 
-__device void svm_node_geometry_bump_dx(KernelGlobals *kg, ShaderData *sd, float *stack, uint type, uint out_offset)
+ccl_device void svm_node_geometry_bump_dx(KernelGlobals *kg, ShaderData *sd, float *stack, uint type, uint out_offset)
 {
 #ifdef __RAY_DIFFERENTIALS__
 	float3 data;
@@ -55,7 +55,7 @@ __device void svm_node_geometry_bump_dx(KernelGlobals *kg, ShaderData *sd, float
 #endif
 }
 
-__device void svm_node_geometry_bump_dy(KernelGlobals *kg, ShaderData *sd, float *stack, uint type, uint out_offset)
+ccl_device void svm_node_geometry_bump_dy(KernelGlobals *kg, ShaderData *sd, float *stack, uint type, uint out_offset)
 {
 #ifdef __RAY_DIFFERENTIALS__
 	float3 data;
@@ -74,7 +74,7 @@ __device void svm_node_geometry_bump_dy(KernelGlobals *kg, ShaderData *sd, float
 
 /* Object Info */
 
-__device void svm_node_object_info(KernelGlobals *kg, ShaderData *sd, float *stack, uint type, uint out_offset)
+ccl_device void svm_node_object_info(KernelGlobals *kg, ShaderData *sd, float *stack, uint type, uint out_offset)
 {
 	float data;
 
@@ -94,7 +94,7 @@ __device void svm_node_object_info(KernelGlobals *kg, ShaderData *sd, float *sta
 
 /* Particle Info */
 
-__device void svm_node_particle_info(KernelGlobals *kg, ShaderData *sd, float *stack, uint type, uint out_offset)
+ccl_device void svm_node_particle_info(KernelGlobals *kg, ShaderData *sd, float *stack, uint type, uint out_offset)
 {
 	switch(type) {
 		case NODE_INFO_PAR_INDEX: {
@@ -146,7 +146,7 @@ __device void svm_node_particle_info(KernelGlobals *kg, ShaderData *sd, float *s
 
 /* Hair Info */
 
-__device void svm_node_hair_info(KernelGlobals *kg, ShaderData *sd, float *stack, uint type, uint out_offset)
+ccl_device void svm_node_hair_info(KernelGlobals *kg, ShaderData *sd, float *stack, uint type, uint out_offset)
 {
 	float data;
 	float3 data3;
diff --git a/intern/cycles/kernel/svm/svm_gradient.h b/intern/cycles/kernel/svm/svm_gradient.h
index 1c0fe511f9b..a4b3c0583f7 100644
--- a/intern/cycles/kernel/svm/svm_gradient.h
+++ b/intern/cycles/kernel/svm/svm_gradient.h
@@ -18,7 +18,7 @@ CCL_NAMESPACE_BEGIN
 
 /* Gradient */
 
-__device float svm_gradient(float3 p, NodeGradientType type)
+ccl_device float svm_gradient(float3 p, NodeGradientType type)
 {
 	float x, y, z;
 
@@ -57,7 +57,7 @@ __device float svm_gradient(float3 p, NodeGradientType type)
 	return 0.0f;
 }
 
-__device void svm_node_tex_gradient(ShaderData *sd, float *stack, uint4 node)
+ccl_device void svm_node_tex_gradient(ShaderData *sd, float *stack, uint4 node)
 {
 	uint type, co_offset, color_offset, fac_offset;
 
diff --git a/intern/cycles/kernel/svm/svm_hsv.h b/intern/cycles/kernel/svm/svm_hsv.h
index e16fb7582c1..11dfc4f096b 100644
--- a/intern/cycles/kernel/svm/svm_hsv.h
+++ b/intern/cycles/kernel/svm/svm_hsv.h
@@ -19,7 +19,7 @@
 
 CCL_NAMESPACE_BEGIN
 
-__device void svm_node_hsv(KernelGlobals *kg, ShaderData *sd, float *stack, uint in_color_offset, uint fac_offset, uint out_color_offset, int *offset)
+ccl_device void svm_node_hsv(KernelGlobals *kg, ShaderData *sd, float *stack, uint in_color_offset, uint fac_offset, uint out_color_offset, int *offset)
 {
 	/* read extra data */
 	uint4 node1 = read_node(kg, offset);
diff --git a/intern/cycles/kernel/svm/svm_image.h b/intern/cycles/kernel/svm/svm_image.h
index e18fe7c53a7..58e5775265a 100644
--- a/intern/cycles/kernel/svm/svm_image.h
+++ b/intern/cycles/kernel/svm/svm_image.h
@@ -21,14 +21,14 @@ CCL_NAMESPACE_BEGIN
 /* For OpenCL all images are packed in a single array, and we do manual lookup
  * and interpolation. */
 
-__device_inline float4 svm_image_texture_read(KernelGlobals *kg, int offset)
+ccl_device_inline float4 svm_image_texture_read(KernelGlobals *kg, int offset)
 {
 	uchar4 r = kernel_tex_fetch(__tex_image_packed, offset);
 	float f = 1.0f/255.0f;
 	return make_float4(r.x*f, r.y*f, r.z*f, r.w*f);
 }
 
-__device_inline int svm_image_texture_wrap_periodic(int x, int width)
+ccl_device_inline int svm_image_texture_wrap_periodic(int x, int width)
 {
 	x %= width;
 	if(x < 0)
@@ -36,19 +36,19 @@ __device_inline int svm_image_texture_wrap_periodic(int x, int width)
 	return x;
 }
 
-__device_inline int svm_image_texture_wrap_clamp(int x, int width)
+ccl_device_inline int svm_image_texture_wrap_clamp(int x, int width)
 {
 	return clamp(x, 0, width-1);
 }
 
-__device_inline float svm_image_texture_frac(float x, int *ix)
+ccl_device_inline float svm_image_texture_frac(float x, int *ix)
 {
 	int i = float_to_int(x) - ((x < 0.0f)? 1: 0);
 	*ix = i;
 	return x - (float)i;
 }
 
-__device float4 svm_image_texture(KernelGlobals *kg, int id, float x, float y, uint srgb, uint use_alpha)
+ccl_device float4 svm_image_texture(KernelGlobals *kg, int id, float x, float y, uint srgb, uint use_alpha)
 {
 	/* first slots are used by float textures, which are not supported here */
 	if(id < TEX_NUM_FLOAT_IMAGES)
@@ -110,7 +110,7 @@ __device float4 svm_image_texture(KernelGlobals *kg, int id, float x, float y, u
 
 #else
 
-__device float4 svm_image_texture(KernelGlobals *kg, int id, float x, float y, uint srgb, uint use_alpha)
+ccl_device float4 svm_image_texture(KernelGlobals *kg, int id, float x, float y, uint srgb, uint use_alpha)
 {
 	float4 r;
 
@@ -257,7 +257,7 @@ __device float4 svm_image_texture(KernelGlobals *kg, int id, float x, float y, u
 
 #endif
 
-__device void svm_node_tex_image(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node)
+ccl_device void svm_node_tex_image(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node)
 {
 	uint id = node.y;
 	uint co_offset, out_offset, alpha_offset, srgb;
@@ -274,7 +274,7 @@ __device void svm_node_tex_image(KernelGlobals *kg, ShaderData *sd, float *stack
 		stack_store_float(stack, alpha_offset, f.w);
 }
 
-__device void svm_node_tex_image_box(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node)
+ccl_device void svm_node_tex_image_box(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node)
 {
 	/* get object space normal */
 	float3 N = sd->N;
@@ -363,7 +363,7 @@ __device void svm_node_tex_image_box(KernelGlobals *kg, ShaderData *sd, float *s
 }
 
 
-__device void svm_node_tex_environment(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node)
+ccl_device void svm_node_tex_environment(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node)
 {
 	uint id = node.y;
 	uint co_offset, out_offset, alpha_offset, srgb;
diff --git a/intern/cycles/kernel/svm/svm_invert.h b/intern/cycles/kernel/svm/svm_invert.h
index 4c40afeadd9..eb47e9ad4ab 100644
--- a/intern/cycles/kernel/svm/svm_invert.h
+++ b/intern/cycles/kernel/svm/svm_invert.h
@@ -16,12 +16,12 @@
 
 CCL_NAMESPACE_BEGIN
 
-__device float invert(float color, float factor)
+ccl_device float invert(float color, float factor)
 {
 	return factor*(1.0f - color) + (1.0f - factor) * color;
 }
 
-__device void svm_node_invert(ShaderData *sd, float *stack, uint in_fac, uint in_color, uint out_color)
+ccl_device void svm_node_invert(ShaderData *sd, float *stack, uint in_fac, uint in_color, uint out_color)
 {
 	float factor = stack_load_float(stack, in_fac);
 	float3 color = stack_load_float3(stack, in_color);
diff --git a/intern/cycles/kernel/svm/svm_light_path.h b/intern/cycles/kernel/svm/svm_light_path.h
index ff6776d751e..e7afa2d2200 100644
--- a/intern/cycles/kernel/svm/svm_light_path.h
+++ b/intern/cycles/kernel/svm/svm_light_path.h
@@ -18,7 +18,7 @@ CCL_NAMESPACE_BEGIN
 
 /* Light Path Node */
 
-__device void svm_node_light_path(ShaderData *sd, float *stack, uint type, uint out_offset, int path_flag)
+ccl_device void svm_node_light_path(ShaderData *sd, float *stack, uint type, uint out_offset, int path_flag)
 {
 	float info = 0.0f;
 
@@ -40,7 +40,7 @@ __device void svm_node_light_path(ShaderData *sd, float *stack, uint type, uint
 
 /* Light Falloff Node */
 
-__device void svm_node_light_falloff(ShaderData *sd, float *stack, uint4 node)
+ccl_device void svm_node_light_falloff(ShaderData *sd, float *stack, uint4 node)
 {
 	uint strength_offset, out_offset, smooth_offset;
 
diff --git a/intern/cycles/kernel/svm/svm_magic.h b/intern/cycles/kernel/svm/svm_magic.h
index 7a5eba3f564..b661f5cacf8 100644
--- a/intern/cycles/kernel/svm/svm_magic.h
+++ b/intern/cycles/kernel/svm/svm_magic.h
@@ -18,7 +18,7 @@ CCL_NAMESPACE_BEGIN
 
 /* Magic */
 
-__device_noinline float3 svm_magic(float3 p, int n, float distortion)
+ccl_device_noinline float3 svm_magic(float3 p, int n, float distortion)
 {
 	float x = sinf((p.x + p.y + p.z)*5.0f);
 	float y = cosf((-p.x + p.y - p.z)*5.0f);
@@ -87,7 +87,7 @@ __device_noinline float3 svm_magic(float3 p, int n, float distortion)
 	return make_float3(0.5f - x, 0.5f - y, 0.5f - z);
 }
 
-__device void svm_node_tex_magic(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int *offset)
+ccl_device void svm_node_tex_magic(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int *offset)
 {
 	uint depth;
 	uint scale_offset, distortion_offset, co_offset, fac_offset, color_offset;
diff --git a/intern/cycles/kernel/svm/svm_mapping.h b/intern/cycles/kernel/svm/svm_mapping.h
index fcdd92dd575..c9fa8502dd1 100644
--- a/intern/cycles/kernel/svm/svm_mapping.h
+++ b/intern/cycles/kernel/svm/svm_mapping.h
@@ -18,7 +18,7 @@ CCL_NAMESPACE_BEGIN
 
 /* Mapping Node */
 
-__device void svm_node_mapping(KernelGlobals *kg, ShaderData *sd, float *stack, uint vec_offset, uint out_offset, int *offset)
+ccl_device void svm_node_mapping(KernelGlobals *kg, ShaderData *sd, float *stack, uint vec_offset, uint out_offset, int *offset)
 {
 	float3 v = stack_load_float3(stack, vec_offset);
 
@@ -32,7 +32,7 @@ __device void svm_node_mapping(KernelGlobals *kg, ShaderData *sd, float *stack,
 	stack_store_float3(stack, out_offset, r);
 }
 
-__device void svm_node_min_max(KernelGlobals *kg, ShaderData *sd, float *stack, uint vec_offset, uint out_offset, int *offset)
+ccl_device void svm_node_min_max(KernelGlobals *kg, ShaderData *sd, float *stack, uint vec_offset, uint out_offset, int *offset)
 {
 	float3 v = stack_load_float3(stack, vec_offset);
 
diff --git a/intern/cycles/kernel/svm/svm_math.h b/intern/cycles/kernel/svm/svm_math.h
index d4863dd6216..bb46d443a6b 100644
--- a/intern/cycles/kernel/svm/svm_math.h
+++ b/intern/cycles/kernel/svm/svm_math.h
@@ -16,7 +16,7 @@
 
 CCL_NAMESPACE_BEGIN
 
-__device float svm_math(NodeMath type, float Fac1, float Fac2)
+ccl_device float svm_math(NodeMath type, float Fac1, float Fac2)
 {
 	float Fac;
 
@@ -64,12 +64,12 @@ __device float svm_math(NodeMath type, float Fac1, float Fac2)
 	return Fac;
 }
 
-__device float average_fac(float3 v)
+ccl_device float average_fac(float3 v)
 {
 	return (fabsf(v.x) + fabsf(v.y) + fabsf(v.z))/3.0f;
 }
 
-__device void svm_vector_math(float *Fac, float3 *Vector, NodeVectorMath type, float3 Vector1, float3 Vector2)
+ccl_device void svm_vector_math(float *Fac, float3 *Vector, NodeVectorMath type, float3 Vector1, float3 Vector2)
 {
 	if(type == NODE_VECTOR_MATH_ADD) {
 		*Vector = Vector1 + Vector2;
@@ -104,7 +104,7 @@ __device void svm_vector_math(float *Fac, float3 *Vector, NodeVectorMath type, f
 
 /* Nodes */
 
-__device void svm_node_math(KernelGlobals *kg, ShaderData *sd, float *stack, uint itype, uint f1_offset, uint f2_offset, int *offset)
+ccl_device void svm_node_math(KernelGlobals *kg, ShaderData *sd, float *stack, uint itype, uint f1_offset, uint f2_offset, int *offset)
 {
 	NodeMath type = (NodeMath)itype;
 	float f1 = stack_load_float(stack, f1_offset);
@@ -116,7 +116,7 @@ __device void svm_node_math(KernelGlobals *kg, ShaderData *sd, float *stack, uin
 	stack_store_float(stack, node1.y, f);
 }
 
-__device void svm_node_vector_math(KernelGlobals *kg, ShaderData *sd, float *stack, uint itype, uint v1_offset, uint v2_offset, int *offset)
+ccl_device void svm_node_vector_math(KernelGlobals *kg, ShaderData *sd, float *stack, uint itype, uint v1_offset, uint v2_offset, int *offset)
 {
 	NodeVectorMath type = (NodeVectorMath)itype;
 	float3 v1 = stack_load_float3(stack, v1_offset);
diff --git a/intern/cycles/kernel/svm/svm_mix.h b/intern/cycles/kernel/svm/svm_mix.h
index 506f772dba5..0eeb4cf9b05 100644
--- a/intern/cycles/kernel/svm/svm_mix.h
+++ b/intern/cycles/kernel/svm/svm_mix.h
@@ -16,22 +16,22 @@
 
 CCL_NAMESPACE_BEGIN
 
-__device float3 svm_mix_blend(float t, float3 col1, float3 col2)
+ccl_device float3 svm_mix_blend(float t, float3 col1, float3 col2)
 {
 	return interp(col1, col2, t);
 }
 
-__device float3 svm_mix_add(float t, float3 col1, float3 col2)
+ccl_device float3 svm_mix_add(float t, float3 col1, float3 col2)
 {
 	return interp(col1, col1 + col2, t);
 }
 
-__device float3 svm_mix_mul(float t, float3 col1, float3 col2)
+ccl_device float3 svm_mix_mul(float t, float3 col1, float3 col2)
 {
 	return interp(col1, col1 * col2, t);
 }
 
-__device float3 svm_mix_screen(float t, float3 col1, float3 col2)
+ccl_device float3 svm_mix_screen(float t, float3 col1, float3 col2)
 {
 	float tm = 1.0f - t;
 	float3 one = make_float3(1.0f, 1.0f, 1.0f);
@@ -40,7 +40,7 @@ __device float3 svm_mix_screen(float t, float3 col1, float3 col2)
 	return one - (tm3 + t*(one - col2))*(one - col1);
 }
 
-__device float3 svm_mix_overlay(float t, float3 col1, float3 col2)
+ccl_device float3 svm_mix_overlay(float t, float3 col1, float3 col2)
 {
 	float tm = 1.0f - t;
 
@@ -64,12 +64,12 @@ __device float3 svm_mix_overlay(float t, float3 col1, float3 col2)
 	return outcol;
 }
 
-__device float3 svm_mix_sub(float t, float3 col1, float3 col2)
+ccl_device float3 svm_mix_sub(float t, float3 col1, float3 col2)
 {
 	return interp(col1, col1 - col2, t);
 }
 
-__device float3 svm_mix_div(float t, float3 col1, float3 col2)
+ccl_device float3 svm_mix_div(float t, float3 col1, float3 col2)
 {
 	float tm = 1.0f - t;
 
@@ -82,22 +82,22 @@ __device float3 svm_mix_div(float t, float3 col1, float3 col2)
 	return outcol;
 }
 
-__device float3 svm_mix_diff(float t, float3 col1, float3 col2)
+ccl_device float3 svm_mix_diff(float t, float3 col1, float3 col2)
 {
 	return interp(col1, fabs(col1 - col2), t);
 }
 
-__device float3 svm_mix_dark(float t, float3 col1, float3 col2)
+ccl_device float3 svm_mix_dark(float t, float3 col1, float3 col2)
 {
 	return min(col1, col2*t);
 }
 
-__device float3 svm_mix_light(float t, float3 col1, float3 col2)
+ccl_device float3 svm_mix_light(float t, float3 col1, float3 col2)
 {
 	return max(col1, col2*t);
 }
 
-__device float3 svm_mix_dodge(float t, float3 col1, float3 col2)
+ccl_device float3 svm_mix_dodge(float t, float3 col1, float3 col2)
 {
 	float3 outcol = col1;
 
@@ -132,7 +132,7 @@ __device float3 svm_mix_dodge(float t, float3 col1, float3 col2)
 	return outcol;
 }
 
-__device float3 svm_mix_burn(float t, float3 col1, float3 col2)
+ccl_device float3 svm_mix_burn(float t, float3 col1, float3 col2)
 {
 	float tmp, tm = 1.0f - t;
 
@@ -171,7 +171,7 @@ __device float3 svm_mix_burn(float t, float3 col1, float3 col2)
 	return outcol;
 }
 
-__device float3 svm_mix_hue(float t, float3 col1, float3 col2)
+ccl_device float3 svm_mix_hue(float t, float3 col1, float3 col2)
 {
 	float3 outcol = col1;
 
@@ -188,7 +188,7 @@ __device float3 svm_mix_hue(float t, float3 col1, float3 col2)
 	return outcol;
 }
 
-__device float3 svm_mix_sat(float t, float3 col1, float3 col2)
+ccl_device float3 svm_mix_sat(float t, float3 col1, float3 col2)
 {
 	float tm = 1.0f - t;
 
@@ -206,7 +206,7 @@ __device float3 svm_mix_sat(float t, float3 col1, float3 col2)
 	return outcol;
 }
 
-__device float3 svm_mix_val(float t, float3 col1, float3 col2)
+ccl_device float3 svm_mix_val(float t, float3 col1, float3 col2)
 {
 	float tm = 1.0f - t;
 
@@ -218,7 +218,7 @@ __device float3 svm_mix_val(float t, float3 col1, float3 col2)
 	return hsv_to_rgb(hsv);
 }
 
-__device float3 svm_mix_color(float t, float3 col1, float3 col2)
+ccl_device float3 svm_mix_color(float t, float3 col1, float3 col2)
 {
 	float3 outcol = col1;
 	float3 hsv2 = rgb_to_hsv(col2);
@@ -235,7 +235,7 @@ __device float3 svm_mix_color(float t, float3 col1, float3 col2)
 	return outcol;
 }
 
-__device float3 svm_mix_soft(float t, float3 col1, float3 col2)
+ccl_device float3 svm_mix_soft(float t, float3 col1, float3 col2)
 {
 	float tm = 1.0f - t;
 
@@ -245,7 +245,7 @@ __device float3 svm_mix_soft(float t, float3 col1, float3 col2)
 	return tm*col1 + t*((one - col1)*col2*col1 + col1*scr);
 }
 
-__device float3 svm_mix_linear(float t, float3 col1, float3 col2)
+ccl_device float3 svm_mix_linear(float t, float3 col1, float3 col2)
 {
 	float3 outcol = col1;
 
@@ -267,7 +267,7 @@ __device float3 svm_mix_linear(float t, float3 col1, float3 col2)
 	return outcol;
 }
 
-__device float3 svm_mix_clamp(float3 col)
+ccl_device float3 svm_mix_clamp(float3 col)
 {
 	float3 outcol = col;
 
@@ -278,7 +278,7 @@ __device float3 svm_mix_clamp(float3 col)
 	return outcol;
 }
 
-__device float3 svm_mix(NodeMix type, float fac, float3 c1, float3 c2)
+ccl_device float3 svm_mix(NodeMix type, float fac, float3 c1, float3 c2)
 {
 	float t = clamp(fac, 0.0f, 1.0f);
 
@@ -309,7 +309,7 @@ __device float3 svm_mix(NodeMix type, float fac, float3 c1, float3 c2)
 
 /* Node */
 
-__device void svm_node_mix(KernelGlobals *kg, ShaderData *sd, float *stack, uint fac_offset, uint c1_offset, uint c2_offset, int *offset)
+ccl_device void svm_node_mix(KernelGlobals *kg, ShaderData *sd, float *stack, uint fac_offset, uint c1_offset, uint c2_offset, int *offset)
 {
 	/* read extra data */
 	uint4 node1 = read_node(kg, offset);
diff --git a/intern/cycles/kernel/svm/svm_musgrave.h b/intern/cycles/kernel/svm/svm_musgrave.h
index 65dcf1a9f83..c67dc8297e4 100644
--- a/intern/cycles/kernel/svm/svm_musgrave.h
+++ b/intern/cycles/kernel/svm/svm_musgrave.h
@@ -25,7 +25,7 @@ CCL_NAMESPACE_BEGIN
  * from "Texturing and Modelling: A procedural approach"
  */
 
-__device_noinline float noise_musgrave_fBm(float3 p, NodeNoiseBasis basis, float H, float lacunarity, float octaves)
+ccl_device_noinline float noise_musgrave_fBm(float3 p, NodeNoiseBasis basis, float H, float lacunarity, float octaves)
 {
 	float rmd;
 	float value = 0.0f;
@@ -53,7 +53,7 @@ __device_noinline float noise_musgrave_fBm(float3 p, NodeNoiseBasis basis, float
  * octaves: number of frequencies in the fBm
  */
 
-__device_noinline float noise_musgrave_multi_fractal(float3 p, NodeNoiseBasis basis, float H, float lacunarity, float octaves)
+ccl_device_noinline float noise_musgrave_multi_fractal(float3 p, NodeNoiseBasis basis, float H, float lacunarity, float octaves)
 {
 	float rmd;
 	float value = 1.0f;
@@ -82,7 +82,7 @@ __device_noinline float noise_musgrave_multi_fractal(float3 p, NodeNoiseBasis ba
  * offset: raises the terrain from `sea level'
  */
 
-__device_noinline float noise_musgrave_hetero_terrain(float3 p, NodeNoiseBasis basis, float H, float lacunarity, float octaves, float offset)
+ccl_device_noinline float noise_musgrave_hetero_terrain(float3 p, NodeNoiseBasis basis, float H, float lacunarity, float octaves, float offset)
 {
 	float value, increment, rmd;
 	float pwHL = powf(lacunarity, -H);
@@ -117,7 +117,7 @@ __device_noinline float noise_musgrave_hetero_terrain(float3 p, NodeNoiseBasis b
  * offset: raises the terrain from `sea level'
  */
 
-__device_noinline float noise_musgrave_hybrid_multi_fractal(float3 p, NodeNoiseBasis basis, float H, float lacunarity, float octaves, float offset, float gain)
+ccl_device_noinline float noise_musgrave_hybrid_multi_fractal(float3 p, NodeNoiseBasis basis, float H, float lacunarity, float octaves, float offset, float gain)
 {
 	float result, signal, weight, rmd;
 	float pwHL = powf(lacunarity, -H);
@@ -154,7 +154,7 @@ __device_noinline float noise_musgrave_hybrid_multi_fractal(float3 p, NodeNoiseB
  * offset: raises the terrain from `sea level'
  */
 
-__device_noinline float noise_musgrave_ridged_multi_fractal(float3 p, NodeNoiseBasis basis, float H, float lacunarity, float octaves, float offset, float gain)
+ccl_device_noinline float noise_musgrave_ridged_multi_fractal(float3 p, NodeNoiseBasis basis, float H, float lacunarity, float octaves, float offset, float gain)
 {
 	float result, signal, weight;
 	float pwHL = powf(lacunarity, -H);
@@ -181,7 +181,7 @@ __device_noinline float noise_musgrave_ridged_multi_fractal(float3 p, NodeNoiseB
 
 /* Shader */
 
-__device float svm_musgrave(NodeMusgraveType type, float dimension, float lacunarity, float octaves, float offset, float intensity, float gain, float scale, float3 p)
+ccl_device float svm_musgrave(NodeMusgraveType type, float dimension, float lacunarity, float octaves, float offset, float intensity, float gain, float scale, float3 p)
 {
 	NodeNoiseBasis basis = NODE_NOISE_PERLIN;
 	p *= scale;
@@ -200,7 +200,7 @@ __device float svm_musgrave(NodeMusgraveType type, float dimension, float lacuna
 	return 0.0f;
 }
 
-__device void svm_node_tex_musgrave(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int *offset)
+ccl_device void svm_node_tex_musgrave(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int *offset)
 {
 	uint4 node2 = read_node(kg, offset);
 	uint4 node3 = read_node(kg, offset);
diff --git a/intern/cycles/kernel/svm/svm_noise.h b/intern/cycles/kernel/svm/svm_noise.h
index a55c635b679..2055b0e3ec7 100644
--- a/intern/cycles/kernel/svm/svm_noise.h
+++ b/intern/cycles/kernel/svm/svm_noise.h
@@ -32,17 +32,17 @@
 
 CCL_NAMESPACE_BEGIN
 
-__device int quick_floor(float x)
+ccl_device int quick_floor(float x)
 {
 	return float_to_int(x) - ((x < 0) ? 1 : 0);
 }
 
-__device float bits_to_01(uint bits)
+ccl_device float bits_to_01(uint bits)
 {
 	return bits * (1.0f/(float)0xFFFFFFFF);
 }
 
-__device uint hash(uint kx, uint ky, uint kz)
+ccl_device uint hash(uint kx, uint ky, uint kz)
 {
 	// define some handy macros
 #define rot(x,k) (((x)<<(k)) | ((x)>>(32-(k))))
@@ -71,34 +71,34 @@ __device uint hash(uint kx, uint ky, uint kz)
 #undef final
 }
 
-__device int imod(int a, int b)
+ccl_device int imod(int a, int b)
 {
 	a %= b;
 	return a < 0 ? a + b : a;
 }
 
-__device uint phash(int kx, int ky, int kz, int3 p) 
+ccl_device uint phash(int kx, int ky, int kz, int3 p) 
 {
 	return hash(imod(kx, p.x), imod(ky, p.y), imod(kz, p.z));
 }
 
-__device float floorfrac(float x, int* i)
+ccl_device float floorfrac(float x, int* i)
 {
 	*i = quick_floor(x);
 	return x - *i;
 }
 
-__device float fade(float t)
+ccl_device float fade(float t)
 {
 	return t * t * t * (t * (t * 6.0f - 15.0f) + 10.0f);
 }
 
-__device float nerp(float t, float a, float b)
+ccl_device float nerp(float t, float a, float b)
 {
 	return (1.0f - t) * a + t * b;
 }
 
-__device float grad(int hash, float x, float y, float z)
+ccl_device float grad(int hash, float x, float y, float z)
 {
 	// use vectors pointing to the edges of the cube
 	int h = hash & 15;
@@ -107,12 +107,12 @@ __device float grad(int hash, float x, float y, float z)
 	return ((h&1) ? -u : u) + ((h&2) ? -v : v);
 }
 
-__device float scale3(float result)
+ccl_device float scale3(float result)
 {
 	return 0.9820f * result;
 }
 
-__device_noinline float perlin(float x, float y, float z)
+ccl_device_noinline float perlin(float x, float y, float z)
 {
 	int X; float fx = floorfrac(x, &X);
 	int Y; float fy = floorfrac(y, &Y);
@@ -138,7 +138,7 @@ __device_noinline float perlin(float x, float y, float z)
 	return (isfinite(r))? r: 0.0f;
 }
 
-__device_noinline float perlin_periodic(float x, float y, float z, float3 pperiod)
+ccl_device_noinline float perlin_periodic(float x, float y, float z, float3 pperiod)
 {
 	int X; float fx = floorfrac(x, &X);
 	int Y; float fy = floorfrac(y, &Y);
@@ -171,20 +171,20 @@ __device_noinline float perlin_periodic(float x, float y, float z, float3 pperio
 }
 
 /* perlin noise in range 0..1 */
-__device float noise(float3 p)
+ccl_device float noise(float3 p)
 {
 	float r = perlin(p.x, p.y, p.z);
 	return 0.5f*r + 0.5f;
 }
 
 /* perlin noise in range -1..1 */
-__device float snoise(float3 p)
+ccl_device float snoise(float3 p)
 {
 	return perlin(p.x, p.y, p.z);
 }
 
 /* cell noise */
-__device_noinline float cellnoise(float3 p)
+ccl_device_noinline float cellnoise(float3 p)
 {
 	uint ix = quick_floor(p.x);
 	uint iy = quick_floor(p.y);
@@ -193,7 +193,7 @@ __device_noinline float cellnoise(float3 p)
 	return bits_to_01(hash(ix, iy, iz));
 }
 
-__device float3 cellnoise_color(float3 p)
+ccl_device float3 cellnoise_color(float3 p)
 {
 	float r = cellnoise(p);
 	float g = cellnoise(make_float3(p.y, p.x, p.z));
@@ -203,14 +203,14 @@ __device float3 cellnoise_color(float3 p)
 }
 
 /* periodic perlin noise in range 0..1 */
-__device float pnoise(float3 p, float3 pperiod)
+ccl_device float pnoise(float3 p, float3 pperiod)
 {
 	float r = perlin_periodic(p.x, p.y, p.z, pperiod);
 	return 0.5f*r + 0.5f;
 }
 
 /* periodic perlin noise in range -1..1 */
-__device float psnoise(float3 p, float3 pperiod)
+ccl_device float psnoise(float3 p, float3 pperiod)
 {
 	return perlin_periodic(p.x, p.y, p.z, pperiod);
 }
diff --git a/intern/cycles/kernel/svm/svm_noisetex.h b/intern/cycles/kernel/svm/svm_noisetex.h
index acb3f20246e..02583131704 100644
--- a/intern/cycles/kernel/svm/svm_noisetex.h
+++ b/intern/cycles/kernel/svm/svm_noisetex.h
@@ -18,7 +18,7 @@ CCL_NAMESPACE_BEGIN
 
 /* Noise */
 
-__device_inline void svm_noise(float3 p, float scale, float detail, float distortion, float *fac, float3 *color)
+ccl_device_inline void svm_noise(float3 p, float scale, float detail, float distortion, float *fac, float3 *color)
 {
 	NodeNoiseBasis basis = NODE_NOISE_PERLIN;
 	int hard = 0;
@@ -41,7 +41,7 @@ __device_inline void svm_noise(float3 p, float scale, float detail, float distor
 		noise_turbulence(make_float3(p.y, p.z, p.x), basis, detail, hard));
 }
 
-__device void svm_node_tex_noise(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int *offset)
+ccl_device void svm_node_tex_noise(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int *offset)
 {
 	uint co_offset, scale_offset, detail_offset, distortion_offset, fac_offset, color_offset;
 
diff --git a/intern/cycles/kernel/svm/svm_normal.h b/intern/cycles/kernel/svm/svm_normal.h
index dd7506bb5fc..8695031b8b9 100644
--- a/intern/cycles/kernel/svm/svm_normal.h
+++ b/intern/cycles/kernel/svm/svm_normal.h
@@ -16,7 +16,7 @@
 
 CCL_NAMESPACE_BEGIN
 
-__device void svm_node_normal(KernelGlobals *kg, ShaderData *sd, float *stack, uint in_normal_offset, uint out_normal_offset, uint out_dot_offset, int *offset)
+ccl_device void svm_node_normal(KernelGlobals *kg, ShaderData *sd, float *stack, uint in_normal_offset, uint out_normal_offset, uint out_dot_offset, int *offset)
 {
 	/* read extra data */
 	uint4 node1 = read_node(kg, offset);
diff --git a/intern/cycles/kernel/svm/svm_ramp.h b/intern/cycles/kernel/svm/svm_ramp.h
index 3cb23a2b2dd..55eee3d24c3 100644
--- a/intern/cycles/kernel/svm/svm_ramp.h
+++ b/intern/cycles/kernel/svm/svm_ramp.h
@@ -19,7 +19,7 @@
 
 CCL_NAMESPACE_BEGIN
 
-__device float4 rgb_ramp_lookup(KernelGlobals *kg, int offset, float f, bool interpolate)
+ccl_device float4 rgb_ramp_lookup(KernelGlobals *kg, int offset, float f, bool interpolate)
 {
 	f = clamp(f, 0.0f, 1.0f)*(RAMP_TABLE_SIZE-1);
 
@@ -35,7 +35,7 @@ __device float4 rgb_ramp_lookup(KernelGlobals *kg, int offset, float f, bool int
 	return a;
 }
 
-__device void svm_node_rgb_ramp(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int *offset)
+ccl_device void svm_node_rgb_ramp(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int *offset)
 {
 	uint fac_offset, color_offset, alpha_offset;
 	uint interpolate = node.z;
@@ -53,7 +53,7 @@ __device void svm_node_rgb_ramp(KernelGlobals *kg, ShaderData *sd, float *stack,
 	*offset += RAMP_TABLE_SIZE;
 }
 
-__device void svm_node_rgb_curves(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int *offset)
+ccl_device void svm_node_rgb_curves(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int *offset)
 {
 	uint fac_offset = node.y;
 	uint color_offset = node.z;
@@ -72,7 +72,7 @@ __device void svm_node_rgb_curves(KernelGlobals *kg, ShaderData *sd, float *stac
 	*offset += RAMP_TABLE_SIZE;
 }
 
-__device void svm_node_vector_curves(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int *offset)
+ccl_device void svm_node_vector_curves(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int *offset)
 {
 	uint fac_offset = node.y;
 	uint color_offset = node.z;
diff --git a/intern/cycles/kernel/svm/svm_sepcomb_hsv.h b/intern/cycles/kernel/svm/svm_sepcomb_hsv.h
index 130890fdc8e..0f68ecbea03 100644
--- a/intern/cycles/kernel/svm/svm_sepcomb_hsv.h
+++ b/intern/cycles/kernel/svm/svm_sepcomb_hsv.h
@@ -16,7 +16,7 @@
 
 CCL_NAMESPACE_BEGIN
 
-__device void svm_node_combine_hsv(KernelGlobals *kg, ShaderData *sd, float *stack, uint hue_in, uint saturation_in, uint value_in, int *offset)
+ccl_device void svm_node_combine_hsv(KernelGlobals *kg, ShaderData *sd, float *stack, uint hue_in, uint saturation_in, uint value_in, int *offset)
 {
 	uint4 node1 = read_node(kg, offset);
 	uint color_out = node1.y;
@@ -32,7 +32,7 @@ __device void svm_node_combine_hsv(KernelGlobals *kg, ShaderData *sd, float *sta
 		stack_store_float3(stack, color_out, color);
 }
 
-__device void svm_node_separate_hsv(KernelGlobals *kg, ShaderData *sd, float *stack, uint color_in, uint hue_out, uint saturation_out, int *offset)
+ccl_device void svm_node_separate_hsv(KernelGlobals *kg, ShaderData *sd, float *stack, uint color_in, uint hue_out, uint saturation_out, int *offset)
 {
 	uint4 node1 = read_node(kg, offset);
 	uint value_out = node1.y;
diff --git a/intern/cycles/kernel/svm/svm_sepcomb_rgb.h b/intern/cycles/kernel/svm/svm_sepcomb_rgb.h
index 5c3d95435f2..34c4449ecdb 100644
--- a/intern/cycles/kernel/svm/svm_sepcomb_rgb.h
+++ b/intern/cycles/kernel/svm/svm_sepcomb_rgb.h
@@ -16,7 +16,7 @@
 
 CCL_NAMESPACE_BEGIN
 
-__device void svm_node_combine_rgb(ShaderData *sd, float *stack, uint in_offset, uint color_index, uint out_offset)
+ccl_device void svm_node_combine_rgb(ShaderData *sd, float *stack, uint in_offset, uint color_index, uint out_offset)
 {
 	float color = stack_load_float(stack, in_offset);
 
@@ -24,7 +24,7 @@ __device void svm_node_combine_rgb(ShaderData *sd, float *stack, uint in_offset,
 		stack_store_float(stack, out_offset+color_index, color);
 }
 
-__device void svm_node_separate_rgb(ShaderData *sd, float *stack, uint icolor_offset, uint color_index, uint out_offset)
+ccl_device void svm_node_separate_rgb(ShaderData *sd, float *stack, uint icolor_offset, uint color_index, uint out_offset)
 {
 	float3 color = stack_load_float3(stack, icolor_offset);
 
diff --git a/intern/cycles/kernel/svm/svm_sky.h b/intern/cycles/kernel/svm/svm_sky.h
index 81b5f1a201f..1e3552647bd 100644
--- a/intern/cycles/kernel/svm/svm_sky.h
+++ b/intern/cycles/kernel/svm/svm_sky.h
@@ -18,7 +18,7 @@ CCL_NAMESPACE_BEGIN
 
 /* Sky texture */
 
-__device float sky_angle_between(float thetav, float phiv, float theta, float phi)
+ccl_device float sky_angle_between(float thetav, float phiv, float theta, float phi)
 {
 	float cospsi = sinf(thetav)*sinf(theta)*cosf(phi - phiv) + cosf(thetav)*cosf(theta);
 	return safe_acosf(cospsi);
@@ -28,7 +28,7 @@ __device float sky_angle_between(float thetav, float phiv, float theta, float ph
  * "A Practical Analytic Model for Daylight"
  * A. J. Preetham, Peter Shirley, Brian Smits
  */
-__device float sky_perez_function(float *lam, float theta, float gamma)
+ccl_device float sky_perez_function(float *lam, float theta, float gamma)
 {
 	float ctheta = cosf(theta);
 	float cgamma = cosf(gamma);
@@ -36,7 +36,7 @@ __device float sky_perez_function(float *lam, float theta, float gamma)
 	return (1.0f + lam[0]*expf(lam[1]/ctheta)) * (1.0f + lam[2]*expf(lam[3]*gamma)  + lam[4]*cgamma*cgamma);
 }
 
-__device float3 sky_radiance_old(KernelGlobals *kg, float3 dir,
+ccl_device float3 sky_radiance_old(KernelGlobals *kg, float3 dir,
                                  float sunphi, float suntheta,
                                  float radiance_x, float radiance_y, float radiance_z,
                                  float *config_x, float *config_y, float *config_z)
@@ -66,7 +66,7 @@ __device float3 sky_radiance_old(KernelGlobals *kg, float3 dir,
  * "An Analytic Model for Full Spectral Sky-Dome Radiance"
  * Lukas Hosek, Alexander Wilkie
  */
-__device float sky_radiance_internal(float *configuration, float theta, float gamma)
+ccl_device float sky_radiance_internal(float *configuration, float theta, float gamma)
 {
 	float ctheta = cosf(theta);
 	float cgamma = cosf(gamma);
@@ -80,7 +80,7 @@ __device float sky_radiance_internal(float *configuration, float theta, float ga
 		(configuration[2] + configuration[3] * expM + configuration[5] * rayM + configuration[6] * mieM + configuration[7] * zenith);
 }
 
-__device float3 sky_radiance_new(KernelGlobals *kg, float3 dir,
+ccl_device float3 sky_radiance_new(KernelGlobals *kg, float3 dir,
                                  float sunphi, float suntheta,
                                  float radiance_x, float radiance_y, float radiance_z,
                                  float *config_x, float *config_y, float *config_z)
@@ -105,7 +105,7 @@ __device float3 sky_radiance_new(KernelGlobals *kg, float3 dir,
 	return xyz_to_rgb(x, y, z) * (M_2PI_F/683);
 }
 
-__device void svm_node_tex_sky(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int *offset)
+ccl_device void svm_node_tex_sky(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int *offset)
 {
 	/* Define variables */
 	float sunphi, suntheta, radiance_x, radiance_y, radiance_z;
diff --git a/intern/cycles/kernel/svm/svm_tex_coord.h b/intern/cycles/kernel/svm/svm_tex_coord.h
index 9f88389fcb1..3044cbf81e0 100644
--- a/intern/cycles/kernel/svm/svm_tex_coord.h
+++ b/intern/cycles/kernel/svm/svm_tex_coord.h
@@ -18,7 +18,7 @@ CCL_NAMESPACE_BEGIN
 
 /* Texture Coordinate Node */
 
-__device void svm_node_tex_coord(KernelGlobals *kg, ShaderData *sd, int path_flag, float *stack, uint type, uint out_offset)
+ccl_device void svm_node_tex_coord(KernelGlobals *kg, ShaderData *sd, int path_flag, float *stack, uint type, uint out_offset)
 {
 	float3 data;
 
@@ -78,7 +78,7 @@ __device void svm_node_tex_coord(KernelGlobals *kg, ShaderData *sd, int path_fla
 	stack_store_float3(stack, out_offset, data);
 }
 
-__device void svm_node_tex_coord_bump_dx(KernelGlobals *kg, ShaderData *sd, int path_flag, float *stack, uint type, uint out_offset)
+ccl_device void svm_node_tex_coord_bump_dx(KernelGlobals *kg, ShaderData *sd, int path_flag, float *stack, uint type, uint out_offset)
 {
 #ifdef __RAY_DIFFERENTIALS__
 	float3 data;
@@ -142,7 +142,7 @@ __device void svm_node_tex_coord_bump_dx(KernelGlobals *kg, ShaderData *sd, int
 #endif
 }
 
-__device void svm_node_tex_coord_bump_dy(KernelGlobals *kg, ShaderData *sd, int path_flag, float *stack, uint type, uint out_offset)
+ccl_device void svm_node_tex_coord_bump_dy(KernelGlobals *kg, ShaderData *sd, int path_flag, float *stack, uint type, uint out_offset)
 {
 #ifdef __RAY_DIFFERENTIALS__
 	float3 data;
@@ -206,7 +206,7 @@ __device void svm_node_tex_coord_bump_dy(KernelGlobals *kg, ShaderData *sd, int
 #endif
 }
 
-__device void svm_node_normal_map(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node)
+ccl_device void svm_node_normal_map(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node)
 {
 	uint color_offset, strength_offset, normal_offset, space;
 	decode_node_uchar4(node.y, &color_offset, &strength_offset, &normal_offset, &space);
@@ -280,7 +280,7 @@ __device void svm_node_normal_map(KernelGlobals *kg, ShaderData *sd, float *stac
 	stack_store_float3(stack, normal_offset, N);
 }
 
-__device void svm_node_tangent(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node)
+ccl_device void svm_node_tangent(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node)
 {
 	uint tangent_offset, direction_type, axis;
 	decode_node_uchar4(node.y, &tangent_offset, &direction_type, &axis, NULL);
diff --git a/intern/cycles/kernel/svm/svm_texture.h b/intern/cycles/kernel/svm/svm_texture.h
index 7f3e09a481d..8ced8390b0b 100644
--- a/intern/cycles/kernel/svm/svm_texture.h
+++ b/intern/cycles/kernel/svm/svm_texture.h
@@ -18,7 +18,7 @@ CCL_NAMESPACE_BEGIN
 
 /* Voronoi Distances */
 
-__device float voronoi_distance(NodeDistanceMetric distance_metric, float3 d, float e)
+ccl_device float voronoi_distance(NodeDistanceMetric distance_metric, float3 d, float e)
 {
 #if 0
 	if(distance_metric == NODE_VORONOI_DISTANCE_SQUARED)
@@ -44,7 +44,7 @@ __device float voronoi_distance(NodeDistanceMetric distance_metric, float3 d, fl
 
 /* Voronoi / Worley like */
 
-__device_noinline float4 voronoi_Fn(float3 p, float e, int n1, int n2)
+ccl_device_noinline float4 voronoi_Fn(float3 p, float e, int n1, int n2)
 {
 	float da[4];
 	float3 pa[4];
@@ -120,29 +120,29 @@ __device_noinline float4 voronoi_Fn(float3 p, float e, int n1, int n2)
 	return result;
 }
 
-__device float voronoi_F1(float3 p) { return voronoi_Fn(p, 0.0f, 0, -1).w; }
-__device float voronoi_F2(float3 p) { return voronoi_Fn(p, 0.0f, 1, -1).w; }
-__device float voronoi_F3(float3 p) { return voronoi_Fn(p, 0.0f, 2, -1).w; }
-__device float voronoi_F4(float3 p) { return voronoi_Fn(p, 0.0f, 3, -1).w; }
-__device float voronoi_F1F2(float3 p) { return voronoi_Fn(p, 0.0f, 0, 1).w; }
+ccl_device float voronoi_F1(float3 p) { return voronoi_Fn(p, 0.0f, 0, -1).w; }
+ccl_device float voronoi_F2(float3 p) { return voronoi_Fn(p, 0.0f, 1, -1).w; }
+ccl_device float voronoi_F3(float3 p) { return voronoi_Fn(p, 0.0f, 2, -1).w; }
+ccl_device float voronoi_F4(float3 p) { return voronoi_Fn(p, 0.0f, 3, -1).w; }
+ccl_device float voronoi_F1F2(float3 p) { return voronoi_Fn(p, 0.0f, 0, 1).w; }
 
-__device float voronoi_Cr(float3 p)
+ccl_device float voronoi_Cr(float3 p)
 {
 	/* crackle type pattern, just a scale/clamp of F2-F1 */
 	float t = 10.0f*voronoi_F1F2(p);
 	return (t > 1.0f)? 1.0f: t;
 }
 
-__device float voronoi_F1S(float3 p) { return 2.0f*voronoi_F1(p) - 1.0f; }
-__device float voronoi_F2S(float3 p) { return 2.0f*voronoi_F2(p) - 1.0f; }
-__device float voronoi_F3S(float3 p) { return 2.0f*voronoi_F3(p) - 1.0f; }
-__device float voronoi_F4S(float3 p) { return 2.0f*voronoi_F4(p) - 1.0f; }
-__device float voronoi_F1F2S(float3 p) { return 2.0f*voronoi_F1F2(p) - 1.0f; }
-__device float voronoi_CrS(float3 p) { return 2.0f*voronoi_Cr(p) - 1.0f; }
+ccl_device float voronoi_F1S(float3 p) { return 2.0f*voronoi_F1(p) - 1.0f; }
+ccl_device float voronoi_F2S(float3 p) { return 2.0f*voronoi_F2(p) - 1.0f; }
+ccl_device float voronoi_F3S(float3 p) { return 2.0f*voronoi_F3(p) - 1.0f; }
+ccl_device float voronoi_F4S(float3 p) { return 2.0f*voronoi_F4(p) - 1.0f; }
+ccl_device float voronoi_F1F2S(float3 p) { return 2.0f*voronoi_F1F2(p) - 1.0f; }
+ccl_device float voronoi_CrS(float3 p) { return 2.0f*voronoi_Cr(p) - 1.0f; }
 
 /* Noise Bases */
 
-__device float noise_basis(float3 p, NodeNoiseBasis basis)
+ccl_device float noise_basis(float3 p, NodeNoiseBasis basis)
 {
 	/* Only Perlin enabled for now, others break CUDA compile by making kernel
 	 * too big, with compile using > 4GB, due to everything being inlined. */
@@ -173,7 +173,7 @@ __device float noise_basis(float3 p, NodeNoiseBasis basis)
 
 /* Soft/Hard Noise */
 
-__device float noise_basis_hard(float3 p, NodeNoiseBasis basis, int hard)
+ccl_device float noise_basis_hard(float3 p, NodeNoiseBasis basis, int hard)
 {
 	float t = noise_basis(p, basis);
 	return (hard)? fabsf(2.0f*t - 1.0f): t;
@@ -181,7 +181,7 @@ __device float noise_basis_hard(float3 p, NodeNoiseBasis basis, int hard)
 
 /* Turbulence */
 
-__device_noinline float noise_turbulence(float3 p, NodeNoiseBasis basis, float octaves, int hard)
+ccl_device_noinline float noise_turbulence(float3 p, NodeNoiseBasis basis, float octaves, int hard)
 {
 	float fscale = 1.0f;
 	float amp = 1.0f;
diff --git a/intern/cycles/kernel/svm/svm_value.h b/intern/cycles/kernel/svm/svm_value.h
index cd5a2e0d871..7beed065288 100644
--- a/intern/cycles/kernel/svm/svm_value.h
+++ b/intern/cycles/kernel/svm/svm_value.h
@@ -18,12 +18,12 @@ CCL_NAMESPACE_BEGIN
 
 /* Value Nodes */
 
-__device void svm_node_value_f(KernelGlobals *kg, ShaderData *sd, float *stack, uint ivalue, uint out_offset)
+ccl_device void svm_node_value_f(KernelGlobals *kg, ShaderData *sd, float *stack, uint ivalue, uint out_offset)
 {
 	stack_store_float(stack, out_offset, __uint_as_float(ivalue));
 }
 
-__device void svm_node_value_v(KernelGlobals *kg, ShaderData *sd, float *stack, uint out_offset, int *offset)
+ccl_device void svm_node_value_v(KernelGlobals *kg, ShaderData *sd, float *stack, uint out_offset, int *offset)
 {
 	/* read extra data */
 	uint4 node1 = read_node(kg, offset);
diff --git a/intern/cycles/kernel/svm/svm_vector_transform.h b/intern/cycles/kernel/svm/svm_vector_transform.h
index 95ef8e3e558..1e3fc2fa03b 100644
--- a/intern/cycles/kernel/svm/svm_vector_transform.h
+++ b/intern/cycles/kernel/svm/svm_vector_transform.h
@@ -18,7 +18,7 @@ CCL_NAMESPACE_BEGIN
 
 /* Vector Transform */
 
-__device void svm_node_vector_transform(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node)
+ccl_device void svm_node_vector_transform(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node)
 {
 	uint itype, ifrom, ito;
 	uint vector_in, vector_out;
diff --git a/intern/cycles/kernel/svm/svm_voronoi.h b/intern/cycles/kernel/svm/svm_voronoi.h
index c9ebea2bceb..604fd3404c5 100644
--- a/intern/cycles/kernel/svm/svm_voronoi.h
+++ b/intern/cycles/kernel/svm/svm_voronoi.h
@@ -18,7 +18,7 @@ CCL_NAMESPACE_BEGIN
 
 /* Voronoi */
 
-__device_noinline float4 svm_voronoi(NodeVoronoiColoring coloring, float scale, float3 p)
+ccl_device_noinline float4 svm_voronoi(NodeVoronoiColoring coloring, float scale, float3 p)
 {
 	/* compute distance and point coordinate of 4 nearest neighbours */
 	float4 dpa0 = voronoi_Fn(p*scale, 1.0f, 0, -1);
@@ -39,7 +39,7 @@ __device_noinline float4 svm_voronoi(NodeVoronoiColoring coloring, float scale,
 	return make_float4(color.x, color.y, color.z, fac);
 }
 
-__device void svm_node_tex_voronoi(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int *offset)
+ccl_device void svm_node_tex_voronoi(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int *offset)
 {
 	uint coloring = node.y;
 	uint scale_offset, co_offset, fac_offset, color_offset;
diff --git a/intern/cycles/kernel/svm/svm_wave.h b/intern/cycles/kernel/svm/svm_wave.h
index d906266bcf9..3749135f8c7 100644
--- a/intern/cycles/kernel/svm/svm_wave.h
+++ b/intern/cycles/kernel/svm/svm_wave.h
@@ -18,7 +18,7 @@ CCL_NAMESPACE_BEGIN
 
 /* Wave */
 
-__device_noinline float svm_wave(NodeWaveType type, float3 p, float scale, float detail, float distortion, float dscale)
+ccl_device_noinline float svm_wave(NodeWaveType type, float3 p, float scale, float detail, float distortion, float dscale)
 {
 	float n;
 
@@ -35,7 +35,7 @@ __device_noinline float svm_wave(NodeWaveType type, float3 p, float scale, float
 	return 0.5f + 0.5f * sinf(n);
 }
 
-__device void svm_node_tex_wave(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int *offset)
+ccl_device void svm_node_tex_wave(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int *offset)
 {
 	uint4 node2 = read_node(kg, offset);
 
diff --git a/intern/cycles/kernel/svm/svm_wavelength.h b/intern/cycles/kernel/svm/svm_wavelength.h
index f9dd24dacef..dca4003b89a 100644
--- a/intern/cycles/kernel/svm/svm_wavelength.h
+++ b/intern/cycles/kernel/svm/svm_wavelength.h
@@ -34,7 +34,7 @@ CCL_NAMESPACE_BEGIN
 
 /* Wavelength to RGB */
 
-__device void svm_node_wavelength(ShaderData *sd, float *stack, uint wavelength, uint color_out)
+ccl_device void svm_node_wavelength(ShaderData *sd, float *stack, uint wavelength, uint color_out)
 {	
 	// CIE colour matching functions xBar, yBar, and zBar for
 	//	 wavelengths from 380 through 780 nanometers, every 5
diff --git a/intern/cycles/kernel/svm/svm_wireframe.h b/intern/cycles/kernel/svm/svm_wireframe.h
index 9ecb81847d7..e560e6303cc 100644
--- a/intern/cycles/kernel/svm/svm_wireframe.h
+++ b/intern/cycles/kernel/svm/svm_wireframe.h
@@ -34,7 +34,7 @@ CCL_NAMESPACE_BEGIN
 
 /* Wireframe Node */
 
-__device void svm_node_wireframe(KernelGlobals *kg, ShaderData *sd, float *stack, uint in_size, uint out_fac, uint use_pixel_size)
+ccl_device void svm_node_wireframe(KernelGlobals *kg, ShaderData *sd, float *stack, uint in_size, uint out_fac, uint use_pixel_size)
 {
 	/* Input Data */
 	float size = stack_load_float(stack, in_size);