Welcome to mirror list, hosted at ThFree Co, Russian Federation.

git.blender.org/blender.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Daniel Genrich <daniel.genrich@gmx.net> 2014-10-23 17:12:28 +0400
committer: Daniel Genrich <daniel.genrich@gmx.net> 2014-10-23 17:12:28 +0400
commit: 9ff1ebed52e0f858a395eeea4caf89304e068b2d (patch)
tree: b05d0f4b229de61b088a128ad412dd7bba347928 /intern/cycles
parent: a2ed11c6eeab5fab8cb81e32e1c68fdafdd5dbbc (diff)
parent: eaaeae469968c5c78a5d7e6d202f1af00b382a79 (diff)
Merge remote-tracking branch 'origin/master' into soc-2014-fluid
Conflicts: .gitignore intern/cycles/CMakeLists.txt source/blender/blenkernel/intern/smoke.c source/blender/python/intern/bpy_interface.c source/creator/CMakeLists.txt
Diffstat (limited to 'intern/cycles')
-rw-r--r--intern/cycles/CMakeLists.txt57
-rw-r--r--intern/cycles/SConscript17
-rw-r--r--intern/cycles/app/CMakeLists.txt9
-rw-r--r--intern/cycles/app/cycles_standalone.cpp6
-rw-r--r--intern/cycles/app/cycles_xml.cpp20
-rw-r--r--intern/cycles/app/io_export_cycles_xml.py21
-rw-r--r--intern/cycles/blender/CCL_api.h4
-rw-r--r--intern/cycles/blender/CMakeLists.txt5
-rw-r--r--intern/cycles/blender/addon/__init__.py5
-rw-r--r--intern/cycles/blender/addon/engine.py3
-rw-r--r--intern/cycles/blender/addon/presets.py17
-rw-r--r--intern/cycles/blender/addon/properties.py78
-rw-r--r--intern/cycles/blender/addon/ui.py114
-rw-r--r--intern/cycles/blender/addon/version_update.py59
-rw-r--r--intern/cycles/blender/blender_camera.cpp6
-rw-r--r--intern/cycles/blender/blender_curves.cpp74
-rw-r--r--intern/cycles/blender/blender_logging.cpp65
-rw-r--r--intern/cycles/blender/blender_mesh.cpp131
-rw-r--r--intern/cycles/blender/blender_object.cpp17
-rw-r--r--intern/cycles/blender/blender_python.cpp60
-rw-r--r--intern/cycles/blender/blender_session.cpp96
-rw-r--r--intern/cycles/blender/blender_session.h3
-rw-r--r--intern/cycles/blender/blender_shader.cpp150
-rw-r--r--intern/cycles/blender/blender_sync.cpp28
-rw-r--r--intern/cycles/bvh/bvh.cpp36
-rw-r--r--intern/cycles/bvh/bvh_params.h2
-rw-r--r--intern/cycles/cmake/external_libs.cmake16
-rw-r--r--intern/cycles/device/CMakeLists.txt6
-rw-r--r--intern/cycles/device/device.cpp19
-rw-r--r--intern/cycles/device/device.h1
-rw-r--r--intern/cycles/device/device_cpu.cpp149
-rw-r--r--intern/cycles/device/device_cuda.cpp375
-rw-r--r--intern/cycles/device/device_intern.h2
-rw-r--r--intern/cycles/device/device_memory.h7
-rw-r--r--intern/cycles/device/device_multi.cpp16
-rw-r--r--intern/cycles/device/device_network.cpp5
-rw-r--r--intern/cycles/device/device_opencl.cpp143
-rw-r--r--intern/cycles/device/device_task.cpp40
-rw-r--r--intern/cycles/device/device_task.h6
-rw-r--r--intern/cycles/kernel/CMakeLists.txt95
-rw-r--r--intern/cycles/kernel/SConscript62
-rw-r--r--intern/cycles/kernel/closure/bsdf.h71
-rw-r--r--intern/cycles/kernel/closure/bsdf_ashikhmin_shirley.h210
-rw-r--r--intern/cycles/kernel/closure/bsdf_hair.h8
-rw-r--r--intern/cycles/kernel/closure/bsdf_microfacet.h943
-rw-r--r--intern/cycles/kernel/closure/bsdf_util.h18
-rw-r--r--intern/cycles/kernel/closure/bsdf_ward.h189
-rw-r--r--intern/cycles/kernel/closure/bsdf_westin.h180
-rw-r--r--intern/cycles/kernel/geom/geom_bvh.h116
-rw-r--r--intern/cycles/kernel/geom/geom_bvh_shadow.h57
-rw-r--r--intern/cycles/kernel/geom/geom_bvh_subsurface.h54
-rw-r--r--intern/cycles/kernel/geom/geom_bvh_traversal.h72
-rw-r--r--intern/cycles/kernel/geom/geom_bvh_volume.h322
-rw-r--r--intern/cycles/kernel/geom/geom_curve.h146
-rw-r--r--intern/cycles/kernel/geom/geom_motion_triangle.h36
-rw-r--r--intern/cycles/kernel/geom/geom_primitive.h2
-rw-r--r--intern/cycles/kernel/geom/geom_triangle.h76
-rw-r--r--intern/cycles/kernel/geom/geom_volume.h20
-rw-r--r--intern/cycles/kernel/kernel.cl31
-rw-r--r--intern/cycles/kernel/kernel.cpp9
-rw-r--r--intern/cycles/kernel/kernel.cu37
-rw-r--r--intern/cycles/kernel/kernel.h21
-rw-r--r--intern/cycles/kernel/kernel_accumulate.h49
-rw-r--r--intern/cycles/kernel/kernel_avx.cpp10
-rw-r--r--intern/cycles/kernel/kernel_avx2.cpp87
-rw-r--r--intern/cycles/kernel/kernel_bake.h (renamed from intern/cycles/kernel/kernel_displace.h)182
-rw-r--r--intern/cycles/kernel/kernel_camera.h27
-rw-r--r--intern/cycles/kernel/kernel_compat_cpu.h116
-rw-r--r--intern/cycles/kernel/kernel_compat_cuda.h13
-rw-r--r--intern/cycles/kernel/kernel_compat_opencl.h66
-rw-r--r--intern/cycles/kernel/kernel_debug.h38
-rw-r--r--intern/cycles/kernel/kernel_emission.h77
-rw-r--r--intern/cycles/kernel/kernel_jitter.h24
-rw-r--r--intern/cycles/kernel/kernel_light.h171
-rw-r--r--intern/cycles/kernel/kernel_path.h1076
-rw-r--r--intern/cycles/kernel/kernel_path_state.h14
-rw-r--r--intern/cycles/kernel/kernel_path_surface.h299
-rw-r--r--intern/cycles/kernel/kernel_path_volume.h267
-rw-r--r--intern/cycles/kernel/kernel_random.h29
-rw-r--r--intern/cycles/kernel/kernel_shader.h24
-rw-r--r--intern/cycles/kernel/kernel_shadow.h31
-rw-r--r--intern/cycles/kernel/kernel_sse2.cpp9
-rw-r--r--intern/cycles/kernel/kernel_sse3.cpp9
-rw-r--r--intern/cycles/kernel/kernel_sse41.cpp9
-rw-r--r--intern/cycles/kernel/kernel_textures.h7
-rw-r--r--intern/cycles/kernel/kernel_types.h252
-rw-r--r--intern/cycles/kernel/kernel_volume.h507
-rw-r--r--intern/cycles/kernel/osl/SConscript3
-rw-r--r--intern/cycles/kernel/osl/osl_bssrdf.cpp24
-rw-r--r--intern/cycles/kernel/osl/osl_closures.cpp75
-rw-r--r--intern/cycles/kernel/osl/osl_closures.h17
-rw-r--r--intern/cycles/kernel/osl/osl_globals.h1
-rw-r--r--intern/cycles/kernel/osl/osl_services.cpp76
-rw-r--r--intern/cycles/kernel/osl/osl_services.h92
-rw-r--r--intern/cycles/kernel/osl/osl_shader.cpp20
-rw-r--r--intern/cycles/kernel/shaders/CMakeLists.txt4
-rw-r--r--intern/cycles/kernel/shaders/node_anisotropic_bsdf.osl (renamed from intern/cycles/kernel/shaders/node_ward_bsdf.osl)12
-rw-r--r--intern/cycles/kernel/shaders/node_brick_texture.osl2
-rw-r--r--intern/cycles/kernel/shaders/node_checker_texture.osl6
-rw-r--r--intern/cycles/kernel/shaders/node_combine_xyz.osl27
-rw-r--r--intern/cycles/kernel/shaders/node_emission.osl6
-rw-r--r--intern/cycles/kernel/shaders/node_fresnel.h13
-rw-r--r--intern/cycles/kernel/shaders/node_geometry.osl8
-rw-r--r--intern/cycles/kernel/shaders/node_glossy_bsdf.osl4
-rw-r--r--intern/cycles/kernel/shaders/node_image_texture.osl4
-rw-r--r--intern/cycles/kernel/shaders/node_musgrave_texture.osl24
-rw-r--r--intern/cycles/kernel/shaders/node_separate_xyz.osl28
-rw-r--r--intern/cycles/kernel/shaders/node_texture.h6
-rw-r--r--intern/cycles/kernel/shaders/stdosl.h12
-rw-r--r--intern/cycles/kernel/svm/svm.h27
-rw-r--r--intern/cycles/kernel/svm/svm_blackbody.h6
-rw-r--r--intern/cycles/kernel/svm/svm_checker.h6
-rw-r--r--intern/cycles/kernel/svm/svm_closure.h114
-rw-r--r--intern/cycles/kernel/svm/svm_convert.h4
-rw-r--r--intern/cycles/kernel/svm/svm_image.h16
-rw-r--r--intern/cycles/kernel/svm/svm_noise.h193
-rw-r--r--intern/cycles/kernel/svm/svm_sepcomb_rgb.h42
-rw-r--r--intern/cycles/kernel/svm/svm_sepcomb_vector.h44
-rw-r--r--intern/cycles/kernel/svm/svm_texture.h22
-rw-r--r--intern/cycles/kernel/svm/svm_types.h14
-rw-r--r--intern/cycles/render/CMakeLists.txt4
-rw-r--r--intern/cycles/render/attribute.cpp24
-rw-r--r--intern/cycles/render/attribute.h2
-rw-r--r--intern/cycles/render/background.cpp2
-rw-r--r--intern/cycles/render/bake.cpp159
-rw-r--r--intern/cycles/render/bake.h24
-rw-r--r--intern/cycles/render/blackbody.cpp2
-rw-r--r--intern/cycles/render/buffers.cpp8
-rw-r--r--intern/cycles/render/camera.cpp80
-rw-r--r--intern/cycles/render/camera.h6
-rw-r--r--intern/cycles/render/curves.cpp16
-rw-r--r--intern/cycles/render/film.cpp25
-rw-r--r--intern/cycles/render/graph.cpp100
-rw-r--r--intern/cycles/render/graph.h3
-rw-r--r--intern/cycles/render/image.cpp135
-rw-r--r--intern/cycles/render/image.h8
-rw-r--r--intern/cycles/render/integrator.cpp44
-rw-r--r--intern/cycles/render/integrator.h3
-rw-r--r--intern/cycles/render/light.cpp20
-rw-r--r--intern/cycles/render/light.h1
-rw-r--r--intern/cycles/render/mesh.cpp67
-rw-r--r--intern/cycles/render/mesh.h7
-rw-r--r--intern/cycles/render/mesh_displace.cpp1
-rw-r--r--intern/cycles/render/nodes.cpp182
-rw-r--r--intern/cycles/render/nodes.h20
-rw-r--r--intern/cycles/render/object.cpp54
-rw-r--r--intern/cycles/render/object.h1
-rw-r--r--intern/cycles/render/osl.cpp13
-rw-r--r--intern/cycles/render/scene.cpp32
-rw-r--r--intern/cycles/render/scene.h8
-rw-r--r--intern/cycles/render/session.cpp12
-rw-r--r--intern/cycles/render/session.h8
-rw-r--r--intern/cycles/render/shader.cpp125
-rw-r--r--intern/cycles/render/shader.h28
-rw-r--r--intern/cycles/render/svm.cpp67
-rw-r--r--intern/cycles/render/svm.h12
-rw-r--r--intern/cycles/render/tile.cpp6
-rw-r--r--intern/cycles/util/CMakeLists.txt14
-rw-r--r--intern/cycles/util/util_boundbox.h9
-rw-r--r--intern/cycles/util/util_cache.h44
-rw-r--r--intern/cycles/util/util_color.h69
-rw-r--r--intern/cycles/util/util_cuda.cpp495
-rw-r--r--intern/cycles/util/util_cuda.h624
-rw-r--r--intern/cycles/util/util_half.h24
-rw-r--r--intern/cycles/util/util_logging.cpp33
-rw-r--r--intern/cycles/util/util_logging.h53
-rw-r--r--intern/cycles/util/util_math.h37
-rw-r--r--intern/cycles/util/util_opencl.cpp337
-rw-r--r--intern/cycles/util/util_opencl.h1313
-rw-r--r--intern/cycles/util/util_opengl.h3
-rw-r--r--intern/cycles/util/util_optimization.h16
-rw-r--r--intern/cycles/util/util_path.cpp16
-rw-r--r--intern/cycles/util/util_progress.h6
-rw-r--r--intern/cycles/util/util_simd.cpp47
-rw-r--r--intern/cycles/util/util_simd.h479
-rw-r--r--intern/cycles/util/util_sseb.h161
-rw-r--r--intern/cycles/util/util_ssef.h588
-rw-r--r--intern/cycles/util/util_ssei.h294
-rw-r--r--intern/cycles/util/util_stats.h1
-rw-r--r--intern/cycles/util/util_system.cpp17
-rw-r--r--intern/cycles/util/util_system.h1
-rw-r--r--intern/cycles/util/util_types.h22
-rw-r--r--intern/cycles/util/util_vector.h6
-rw-r--r--intern/cycles/util/util_view.cpp2
184 files changed, 8593 insertions, 6415 deletions
diff --git a/intern/cycles/CMakeLists.txt b/intern/cycles/CMakeLists.txt
index 504d5a7b831..7de1182282d 100644
--- a/intern/cycles/CMakeLists.txt
+++ b/intern/cycles/CMakeLists.txt
@@ -14,14 +14,18 @@ include(cmake/external_libs.cmake)
# todo: refactor this code to match scons
# note: CXX_HAS_SSE is needed in case passing SSE flags fails altogether (gcc-arm)
-if(WIN32 AND MSVC)
+if(NOT WITH_CPU_SSE)
+ set(CXX_HAS_SSE FALSE)
+elseif(WIN32 AND MSVC)
set(CXX_HAS_SSE TRUE)
# /arch:AVX for VC2012 and above
if(NOT MSVC_VERSION LESS 1700)
set(CYCLES_AVX_ARCH_FLAGS "/arch:AVX")
+ set(CYCLES_AVX2_ARCH_FLAGS "/arch:AVX /arch:AVX2")
elseif(NOT CMAKE_CL_64)
set(CYCLES_AVX_ARCH_FLAGS "/arch:SSE2")
+ set(CYCLES_AVX2_ARCH_FLAGS "/arch:SSE2")
endif()
# there is no /arch:SSE3, but intrinsics are available anyway
@@ -30,11 +34,13 @@ if(WIN32 AND MSVC)
set(CYCLES_SSE3_KERNEL_FLAGS "/fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-")
set(CYCLES_SSE41_KERNEL_FLAGS "/fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-")
set(CYCLES_AVX_KERNEL_FLAGS "${CYCLES_AVX_ARCH_FLAGS} /fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-")
+ set(CYCLES_AVX2_KERNEL_FLAGS "${CYCLES_AVX2_ARCH_FLAGS} /fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-")
else()
set(CYCLES_SSE2_KERNEL_FLAGS "/arch:SSE2 /fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-")
set(CYCLES_SSE3_KERNEL_FLAGS "/arch:SSE2 /fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-")
set(CYCLES_SSE41_KERNEL_FLAGS "/arch:SSE2 /fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-")
set(CYCLES_AVX_KERNEL_FLAGS "${CYCLES_AVX_ARCH_FLAGS} /fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-")
+ set(CYCLES_AVX2_KERNEL_FLAGS "${CYCLES_AVX2_ARCH_FLAGS} /fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-")
endif()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-")
@@ -47,7 +53,8 @@ elseif(CMAKE_COMPILER_IS_GNUCC)
set(CYCLES_SSE2_KERNEL_FLAGS "-ffast-math -msse -msse2 -mfpmath=sse")
set(CYCLES_SSE3_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3 -mfpmath=sse")
set(CYCLES_SSE41_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3 -msse4.1 -mfpmath=sse")
- set(CYCLES_AVX_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3 -msse4.1 -mfpmath=sse")
+ set(CYCLES_AVX_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3 -msse4.1 -mavx -mfpmath=sse")
+ set(CYCLES_AVX2_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mfma -mlzcnt -mbmi -mbmi2 -mfpmath=sse")
endif()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ffast-math")
elseif(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
@@ -56,7 +63,8 @@ elseif(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
set(CYCLES_SSE2_KERNEL_FLAGS "-ffast-math -msse -msse2")
set(CYCLES_SSE3_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3")
set(CYCLES_SSE41_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3 -msse4.1")
- set(CYCLES_AVX_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3 -msse4.1")
+ set(CYCLES_AVX_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3 -msse4.1 -mavx")
+ set(CYCLES_AVX2_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mfma -mlzcnt -mbmi -mbmi2")
endif()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ffast-math")
endif()
@@ -67,14 +75,16 @@ if(CXX_HAS_SSE)
-DWITH_KERNEL_SSE3
-DWITH_KERNEL_SSE41
-DWITH_KERNEL_AVX
+ -DWITH_KERNEL_AVX2
)
endif()
-# for OSL
-if(WIN32 AND MSVC)
- set(RTTI_DISABLE_FLAGS "/GR- -DBOOST_NO_RTTI -DBOOST_NO_TYPEID")
-elseif(CMAKE_COMPILER_IS_GNUCC OR (CMAKE_C_COMPILER_ID MATCHES "Clang"))
- set(RTTI_DISABLE_FLAGS "-fno-rtti -DBOOST_NO_RTTI -DBOOST_NO_TYPEID")
+if(WITH_CYCLES_OSL)
+ if(WIN32 AND MSVC)
+ set(RTTI_DISABLE_FLAGS "/GR- -DBOOST_NO_RTTI -DBOOST_NO_TYPEID")
+ elseif(CMAKE_COMPILER_IS_GNUCC OR (CMAKE_C_COMPILER_ID MATCHES "Clang"))
+ set(RTTI_DISABLE_FLAGS "-fno-rtti -DBOOST_NO_RTTI -DBOOST_NO_TYPEID")
+ endif()
endif()
# Definitions and Includes
@@ -108,7 +118,10 @@ endif()
if(WITH_CYCLES_OSL)
add_definitions(-DWITH_OSL)
add_definitions(-DOSL_STATIC_LIBRARY)
- include_directories(${OSL_INCLUDES})
+ include_directories(
+ SYSTEM
+ ${OSL_INCLUDES}
+ )
endif()
add_definitions(
@@ -117,6 +130,30 @@ add_definitions(
-DWITH_MULTI
)
+# Logging capabilities using GLog library.
+if(WITH_CYCLES_LOGGING)
+ add_definitions(-DWITH_CYCLES_LOGGING)
+ add_definitions(-DGOOGLE_GLOG_DLL_DECL=)
+ if(WIN32)
+ include_directories(
+ SYSTEM
+ ../../extern/libmv/third_party/glog/src/windows
+ ../../extern/libmv/third_party/gflags
+ )
+ else()
+ include_directories(
+ SYSTEM
+ ../../extern/libmv/third_party/glog/src
+ ../../extern/libmv/third_party/gflags
+ )
+ endif()
+endif()
+
+# Debugging capabilities (debug passes etc).
+if(WITH_CYCLES_DEBUG)
+ add_definitions(-DWITH_CYCLES_DEBUG)
+endif()
+
include_directories(
SYSTEM
${BOOST_INCLUDE_DIR}
@@ -130,7 +167,9 @@ include_directories(
# Warnings
if(CMAKE_COMPILER_IS_GNUCXX)
ADD_CHECK_CXX_COMPILER_FLAG(CMAKE_CXX_FLAGS _has_cxxflag_float_conversion "-Werror=float-conversion")
+ ADD_CHECK_CXX_COMPILER_FLAG(CMAKE_CXX_FLAGS _has_cxxflag_double_promotion "-Werror=double-promotion")
unset(_has_cxxflag_float_conversion)
+ unset(_has_cxxflag_double_promotion)
endif()
diff --git a/intern/cycles/SConscript b/intern/cycles/SConscript
index 532238b9d7e..b399844534d 100644
--- a/intern/cycles/SConscript
+++ b/intern/cycles/SConscript
@@ -39,12 +39,13 @@ sources.remove(path.join('kernel', 'kernel_sse2.cpp'))
sources.remove(path.join('kernel', 'kernel_sse3.cpp'))
sources.remove(path.join('kernel', 'kernel_sse41.cpp'))
sources.remove(path.join('kernel', 'kernel_avx.cpp'))
+sources.remove(path.join('kernel', 'kernel_avx2.cpp'))
incs = []
defs = []
cxxflags = Split(env['CXXFLAGS'])
-defs.append('GLEW_STATIC')
+defs += env['BF_GL_DEFINITIONS']
defs.append('CCL_NAMESPACE_BEGIN=namespace ccl {')
defs.append('CCL_NAMESPACE_END=}')
@@ -58,10 +59,18 @@ if env['WITH_BF_CYCLES_OSL']:
defs.append('OSL_STATIC_LIBRARY')
incs.append(cycles['BF_OSL_INC'])
+if env['WITH_BF_CYCLES_DEBUG']:
+ defs.append('WITH_CYCLES_DEBUG')
+
incs.extend('. bvh render device kernel kernel/osl kernel/svm util subd'.split())
incs.extend('#intern/guardedalloc #source/blender/makesrna #source/blender/makesdna #source/blender/blenlib'.split())
incs.extend('#source/blender/blenloader ../../source/blender/makesrna/intern'.split())
-incs.extend('#extern/glew/include #intern/mikktspace'.split())
+
+incs.append(env['BF_GLEW_INC'])
+incs.append('#/intern/glew-mx')
+incs.append('#intern/mikktspace')
+incs.extend('#extern/glew/include #extern/clew/include #extern/cuew/include #intern/mikktspace'.split())
+
incs.append(cycles['BF_OIIO_INC'])
incs.append(cycles['BF_BOOST_INC'])
incs.append(cycles['BF_OPENEXR_INC'].split())
@@ -95,9 +104,10 @@ elif env['OURPLATFORM'] == 'win64-vc':
kernel_flags['sse2'] = '-D_CRT_SECURE_NO_WARNINGS /fp:fast /Ox /GS-'
kernel_flags['sse3'] = kernel_flags['sse2']
- if env['MSVC_VERSION'] in ('11.0', '12.0'):
+ if env['MSVC_VERSION'] >= '12.0':
kernel_flags['sse41'] = kernel_flags['sse3']
kernel_flags['avx'] = kernel_flags['sse41'] + ' /arch:AVX'
+ kernel_flags['avx2'] = kernel_flags['sse41'] + ' /arch:AVX /arch:AVX2'
else:
# -mavx only available with relatively new gcc/clang
kernel_flags['sse2'] = '-ffast-math -msse -msse2 -mfpmath=sse'
@@ -106,6 +116,7 @@ else:
if (env['C_COMPILER_ID'] == 'gcc' and env['CCVERSION'] >= '4.6') or (env['C_COMPILER_ID'] == 'clang' and env['CCVERSION'] >= '3.1'):
kernel_flags['avx'] = kernel_flags['sse41'] + ' -mavx'
+ kernel_flags['avx2'] = kernel_flags['avx'] + ' -mavx2 -mfma -mlzcnt -mbmi -mbmi2'
for kernel_type in kernel_flags.keys():
defs.append('WITH_KERNEL_' + kernel_type.upper())
diff --git a/intern/cycles/app/CMakeLists.txt b/intern/cycles/app/CMakeLists.txt
index 52806b0804b..c8464899725 100644
--- a/intern/cycles/app/CMakeLists.txt
+++ b/intern/cycles/app/CMakeLists.txt
@@ -21,15 +21,20 @@ set(LIBRARIES
cycles_util
${BOOST_LIBRARIES}
${OPENEXR_LIBRARIES}
- ${OPENGL_LIBRARIES}
- ${CYCLES_GLEW_LIBRARY}
+ ${BLENDER_GL_LIBRARIES}
+ bf_intern_glew_mx
+ ${CYCLES_APP_GLEW_LIBRARY}
${OPENIMAGEIO_LIBRARIES}
${PNG_LIBRARIES}
${JPEG_LIBRARIES}
${ZLIB_LIBRARIES}
${TIFF_LIBRARY}
+ extern_clew
+ extern_cuew
)
+add_definitions(${GL_DEFINITIONS})
+
if(WIN32)
list(APPEND LIBRARIES ${PTHREADS_LIBRARIES})
endif()
diff --git a/intern/cycles/app/cycles_standalone.cpp b/intern/cycles/app/cycles_standalone.cpp
index 7ea1ca2d8fb..90333eb3fc5 100644
--- a/intern/cycles/app/cycles_standalone.cpp
+++ b/intern/cycles/app/cycles_standalone.cpp
@@ -373,9 +373,9 @@ static void options_parse(int argc, const char **argv)
}
if(ssname == "osl")
- options.scene_params.shadingsystem = SceneParams::OSL;
+ options.scene_params.shadingsystem = SHADINGSYSTEM_OSL;
else if(ssname == "svm")
- options.scene_params.shadingsystem = SceneParams::SVM;
+ options.scene_params.shadingsystem = SHADINGSYSTEM_SVM;
#ifndef WITH_CYCLES_STANDALONE_GUI
options.session_params.background = true;
@@ -408,7 +408,7 @@ static void options_parse(int argc, const char **argv)
fprintf(stderr, "Unknown shading system: %s\n", ssname.c_str());
exit(EXIT_FAILURE);
}
- else if(options.scene_params.shadingsystem == SceneParams::OSL && options.session_params.device.type != DEVICE_CPU) {
+ else if(options.scene_params.shadingsystem == SHADINGSYSTEM_OSL && options.session_params.device.type != DEVICE_CPU) {
fprintf(stderr, "OSL shading system only works with CPU device\n");
exit(EXIT_FAILURE);
}
diff --git a/intern/cycles/app/cycles_xml.cpp b/intern/cycles/app/cycles_xml.cpp
index d5ef30e5c6f..431796e106b 100644
--- a/intern/cycles/app/cycles_xml.cpp
+++ b/intern/cycles/app/cycles_xml.cpp
@@ -304,7 +304,8 @@ static void xml_read_integrator(const XMLReadState& state, pugi::xml_node node)
xml_read_int(&integrator->volume_max_steps, node, "volume_max_steps");
/* Various Settings */
- xml_read_bool(&integrator->no_caustics, node, "no_caustics");
+ xml_read_bool(&integrator->caustics_reflective, node, "caustics_reflective");
+ xml_read_bool(&integrator->caustics_refractive, node, "caustics_refractive");
xml_read_float(&integrator->filter_glossy, node, "filter_glossy");
xml_read_int(&integrator->seed, node, "seed");
@@ -329,6 +330,7 @@ static void xml_read_camera(const XMLReadState& state, pugi::xml_node node)
xml_read_float(&cam->aperturesize, node, "aperturesize"); // 0.5*focallength/fstop
xml_read_float(&cam->focaldistance, node, "focaldistance");
xml_read_float(&cam->shuttertime, node, "shuttertime");
+ xml_read_float(&cam->aperture_ratio, node, "aperture_ratio");
if(xml_equal_string(node, "type", "orthographic"))
cam->type = CAMERA_ORTHOGRAPHIC;
@@ -509,8 +511,10 @@ static void xml_read_shader_graph(const XMLReadState& state, Shader *shader, pug
else if(string_iequals(node.name(), "mapping")) {
snode = new MappingNode();
}
- else if(string_iequals(node.name(), "ward_bsdf")) {
- snode = new WardBsdfNode();
+ else if(string_iequals(node.name(), "anisotropic_bsdf")) {
+ AnisotropicBsdfNode *aniso = new AnisotropicBsdfNode();
+ xml_read_enum(&aniso->distribution, AnisotropicBsdfNode::distribution_enum, node, "distribution");
+ snode = aniso;
}
else if(string_iequals(node.name(), "diffuse_bsdf")) {
snode = new DiffuseBsdfNode();
@@ -550,9 +554,7 @@ static void xml_read_shader_graph(const XMLReadState& state, Shader *shader, pug
snode = hair;
}
else if(string_iequals(node.name(), "emission")) {
- EmissionNode *emission = new EmissionNode();
- xml_read_bool(&emission->total_power, node, "total_power");
- snode = emission;
+ snode = new EmissionNode();
}
else if(string_iequals(node.name(), "ambient_occlusion")) {
snode = new AmbientOcclusionNode();
@@ -635,6 +637,12 @@ static void xml_read_shader_graph(const XMLReadState& state, Shader *shader, pug
else if(string_iequals(node.name(), "separate_hsv")) {
snode = new SeparateHSVNode();
}
+ else if(string_iequals(node.name(), "combine_xyz")) { /* <combine_xyz> XML tag */
+ snode = new CombineXYZNode(); /* XYZ variant; the HSV node belongs to the combine_hsv tag only */
+ }
+ else if(string_iequals(node.name(), "separate_xyz")) { /* <separate_xyz> XML tag */
+ snode = new SeparateXYZNode(); /* XYZ variant; the HSV node belongs to the separate_hsv tag only */
+ }
else if(string_iequals(node.name(), "hsv")) {
snode = new HSVNode();
}
diff --git a/intern/cycles/app/io_export_cycles_xml.py b/intern/cycles/app/io_export_cycles_xml.py
index e310d928b26..ad8fb9d3dd3 100644
--- a/intern/cycles/app/io_export_cycles_xml.py
+++ b/intern/cycles/app/io_export_cycles_xml.py
@@ -111,19 +111,29 @@ class ExportCyclesXML(bpy.types.Operator, ExportHelper):
# generate mesh node
nverts = ""
verts = ""
+ uvs = ""
P = ""
for v in mesh.vertices:
P += "%f %f %f " % (v.co[0], v.co[1], v.co[2])
- for i, f in enumerate(mesh.tessfaces):
- nverts += str(len(f.vertices)) + " "
+ verts_and_uvs = zip(mesh.tessfaces, mesh.tessface_uv_textures.active.data)
+
+ for f, uvf in verts_and_uvs:
+ vcount = len(f.vertices)
+ nverts += str(vcount) + " "
for v in f.vertices:
verts += str(v) + " "
- verts += " "
-
- node = etree.Element('mesh', attrib={'nverts': nverts, 'verts': verts, 'P': P})
+
+ uvs += str(uvf.uv1[0]) + " " + str(uvf.uv1[1]) + " "
+ uvs += str(uvf.uv2[0]) + " " + str(uvf.uv2[1]) + " "
+ uvs += str(uvf.uv3[0]) + " " + str(uvf.uv3[1]) + " "
+ if vcount==4:
+ uvs += " " + str(uvf.uv4[0]) + " " + str(uvf.uv4[1]) + " "
+
+
+ node = etree.Element('mesh', attrib={'nverts': nverts.strip(), 'verts': verts.strip(), 'P': P, 'UV' : uvs.strip()})
# write to file
write(node, filepath)
@@ -139,3 +149,4 @@ def unregister():
if __name__ == "__main__":
register()
+
diff --git a/intern/cycles/blender/CCL_api.h b/intern/cycles/blender/CCL_api.h
index 2772b9ac8a7..cfd0c3ef264 100644
--- a/intern/cycles/blender/CCL_api.h
+++ b/intern/cycles/blender/CCL_api.h
@@ -36,6 +36,10 @@ CCLDeviceInfo *CCL_compute_device_list(int device_type);
void *CCL_python_module_init(void);
+void CCL_init_logging(const char *argv0);
+void CCL_start_debug_logging(void);
+void CCL_logging_verbosity_set(int verbosity);
+
#ifdef __cplusplus
}
#endif
diff --git a/intern/cycles/blender/CMakeLists.txt b/intern/cycles/blender/CMakeLists.txt
index 9a60152841e..e1d592d32b4 100644
--- a/intern/cycles/blender/CMakeLists.txt
+++ b/intern/cycles/blender/CMakeLists.txt
@@ -6,6 +6,7 @@ set(INC
../kernel/svm
../util
../subd
+ ../../glew-mx
../../guardedalloc
../../mikktspace
../../../source/blender/makesdna
@@ -25,6 +26,7 @@ set(SRC
blender_object.cpp
blender_particles.cpp
blender_curves.cpp
+ blender_logging.cpp
blender_python.cpp
blender_session.cpp
blender_shader.cpp
@@ -43,9 +45,10 @@ set(ADDON_FILES
addon/presets.py
addon/properties.py
addon/ui.py
+ addon/version_update.py
)
-add_definitions(-DGLEW_STATIC)
+add_definitions(${GL_DEFINITIONS})
blender_add_lib(bf_intern_cycles "${SRC}" "${INC}" "${INC_SYS}")
diff --git a/intern/cycles/blender/addon/__init__.py b/intern/cycles/blender/addon/__init__.py
index 27d986900c8..8c60ea31053 100644
--- a/intern/cycles/blender/addon/__init__.py
+++ b/intern/cycles/blender/addon/__init__.py
@@ -31,6 +31,7 @@ bl_info = {
import bpy
from . import engine
+from . import version_update
class CyclesRender(bpy.types.RenderEngine):
@@ -100,12 +101,16 @@ def register():
presets.register()
bpy.utils.register_module(__name__)
+ bpy.app.handlers.version_update.append(version_update.do_versions)
+
def unregister():
from . import ui
from . import properties
from . import presets
+ bpy.app.handlers.version_update.remove(version_update.do_versions)
+
ui.unregister()
properties.unregister()
presets.unregister()
diff --git a/intern/cycles/blender/addon/engine.py b/intern/cycles/blender/addon/engine.py
index 25a9e97a99b..18235eca790 100644
--- a/intern/cycles/blender/addon/engine.py
+++ b/intern/cycles/blender/addon/engine.py
@@ -28,7 +28,7 @@ def init():
_cycles.init(path, user_path)
-def create(engine, data, scene, region=0, v3d=0, rv3d=0, preview_osl=False):
+def create(engine, data, scene, region=None, v3d=None, rv3d=None, preview_osl=False):
import bpy
import _cycles
@@ -65,6 +65,7 @@ def bake(engine, obj, pass_type, pixel_array, num_pixels, depth, result):
if session is not None:
_cycles.bake(engine.session, obj.as_pointer(), pass_type, pixel_array.as_pointer(), num_pixels, depth, result.as_pointer())
+
def reset(engine, data, scene):
import _cycles
data = data.as_pointer()
diff --git a/intern/cycles/blender/addon/presets.py b/intern/cycles/blender/addon/presets.py
index 9991fdb8e3b..2ec65d7183a 100644
--- a/intern/cycles/blender/addon/presets.py
+++ b/intern/cycles/blender/addon/presets.py
@@ -33,12 +33,16 @@ class AddPresetIntegrator(AddPresetBase, Operator):
preset_values = [
"cycles.max_bounces",
"cycles.min_bounces",
- "cycles.no_caustics",
"cycles.diffuse_bounces",
"cycles.glossy_bounces",
"cycles.transmission_bounces",
+ "cycles.volume_bounces",
"cycles.transparent_min_bounces",
- "cycles.transparent_max_bounces"
+ "cycles.transparent_max_bounces",
+ "cycles.use_transparent_shadows",
+ "cycles.caustics_reflective",
+ "cycles.caustics_refractive",
+ "cycles.blur_glossy"
]
preset_subdir = "cycles/integrator"
@@ -66,10 +70,13 @@ class AddPresetSampling(AddPresetBase, Operator):
"cycles.mesh_light_samples",
"cycles.subsurface_samples",
"cycles.volume_samples",
- "cycles.no_caustics",
- "cycles.blur_glossy",
"cycles.use_square_samples",
- "cycles.progressive"
+ "cycles.progressive",
+ "cycles.seed",
+ "cycles.sample_clamp_direct",
+ "cycles.sample_clamp_indirect",
+ "cycles.sample_all_lights_direct",
+ "cycles.sample_all_lights_indirect",
]
preset_subdir = "cycles/sampling"
diff --git a/intern/cycles/blender/addon/properties.py b/intern/cycles/blender/addon/properties.py
index 7205a272395..05a6f70d423 100644
--- a/intern/cycles/blender/addon/properties.py
+++ b/intern/cycles/blender/addon/properties.py
@@ -37,7 +37,7 @@ if _cycles.with_network:
enum_feature_set = (
('SUPPORTED', "Supported", "Only use finished and supported features"),
- ('EXPERIMENTAL', "Experimental", "Use experimental and incomplete features that might be broken or change in the future"),
+ ('EXPERIMENTAL', "Experimental", "Use experimental and incomplete features that might be broken or change in the future", 'ERROR', 1),
)
enum_displacement_methods = (
@@ -108,9 +108,15 @@ enum_integrator = (
('PATH', "Path Tracing", "Pure path tracing integrator"),
)
-enum_volume_homogeneous_sampling = (
- ('DISTANCE', "Distance", "Use Distance Sampling"),
- ('EQUI_ANGULAR', "Equi-angular", "Use Equi-angular Sampling"),
+enum_volume_sampling = (
+ ('DISTANCE', "Distance", "Use distance sampling, best for dense volumes with lights far away"),
+ ('EQUIANGULAR', "Equiangular", "Use equiangular sampling, best for volumes with low density with light inside or near the volume"),
+ ('MULTIPLE_IMPORTANCE', "Multiple Importance", "Combine distance and equi-angular sampling for volumes where neither method is ideal"),
+ )
+
+enum_volume_interpolation = (
+ ('LINEAR', "Linear", "Good smoothness and speed"),
+ ('CUBIC', 'Cubic', 'Smoothed high quality interpolation, but slower')
)
@@ -146,13 +152,6 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
default='PATH',
)
- cls.volume_homogeneous_sampling = EnumProperty(
- name="Homogeneous Sampling",
- description="Sampling method to use for homogeneous volumes",
- items=enum_volume_homogeneous_sampling,
- default='DISTANCE',
- )
-
cls.use_square_samples = BoolProperty(
name="Square Samples",
description="Square sampling values for easier artist control",
@@ -236,7 +235,7 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
name="Volume Samples",
description="Number of volume scattering samples to render for each AA sample",
min=1, max=10000,
- default=1,
+ default=0,
)
cls.sampling_pattern = EnumProperty(
@@ -265,11 +264,18 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
default=True,
)
- cls.no_caustics = BoolProperty(
- name="No Caustics",
- description="Leave out caustics, resulting in a darker image with less noise",
- default=False,
+ cls.caustics_reflective = BoolProperty(
+ name="Reflective Caustics",
+ description="Use reflective caustics, resulting in a brighter image (more noise but added realism)",
+ default=True,
)
+
+ cls.caustics_refractive = BoolProperty(
+ name="Refractive Caustics",
+ description="Use refractive caustics, resulting in a brighter image (more noise but added realism)",
+ default=True,
+ )
+
cls.blur_glossy = FloatProperty(
name="Filter Glossy",
description="Adaptively blur glossy shaders after blurry bounces, "
@@ -315,7 +321,7 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
name="Volume Bounces",
description="Maximum number of volumetric scattering events",
min=0, max=1024,
- default=1,
+ default=0,
)
cls.transparent_min_bounces = IntProperty(
@@ -550,6 +556,13 @@ class CyclesCameraSettings(bpy.types.PropertyGroup):
subtype='ANGLE',
default=0,
)
+ cls.aperture_ratio = FloatProperty(
+ name="Aperture Ratio",
+ description="Distortion to simulate anamorphic lens bokeh",
+ min=0.01, soft_min=1.0, soft_max=2.0,
+ default=1.0,
+ precision=4,
+ )
cls.panorama_type = EnumProperty(
name="Panorama Type",
description="Distortion to use for the calculation",
@@ -602,6 +615,19 @@ class CyclesMaterialSettings(bpy.types.PropertyGroup):
"(not using any textures), for faster rendering",
default=False,
)
+ cls.volume_sampling = EnumProperty(
+ name="Volume Sampling",
+ description="Sampling method to use for volumes",
+ items=enum_volume_sampling,
+ default='DISTANCE',
+ )
+
+ cls.volume_interpolation = EnumProperty(
+ name="Volume Interpolation",
+ description="Interpolation method to use for volumes",
+ items=enum_volume_interpolation,
+ default='LINEAR',
+ )
@classmethod
def unregister(cls):
@@ -672,6 +698,19 @@ class CyclesWorldSettings(bpy.types.PropertyGroup):
"(not using any textures), for faster rendering",
default=False,
)
+ cls.volume_sampling = EnumProperty(
+ name="Volume Sampling",
+ description="Sampling method to use for volumes",
+ items=enum_volume_sampling,
+ default='EQUIANGULAR',
+ )
+
+ cls.volume_interpolation = EnumProperty(
+ name="Volume Interpolation",
+ description="Interpolation method to use for volumes",
+ items=enum_volume_interpolation,
+ default='LINEAR',
+ )
@classmethod
def unregister(cls):
@@ -718,6 +757,11 @@ class CyclesVisibilitySettings(bpy.types.PropertyGroup):
description="Object visibility for shadow rays",
default=True,
)
+ cls.scatter = BoolProperty(
+ name="Volume Scatter",
+ description="Object visibility for volume scatter rays",
+ default=True,
+ )
@classmethod
def unregister(cls):
diff --git a/intern/cycles/blender/addon/ui.py b/intern/cycles/blender/addon/ui.py
index 5c8115b6612..6a08b47b01f 100644
--- a/intern/cycles/blender/addon/ui.py
+++ b/intern/cycles/blender/addon/ui.py
@@ -154,7 +154,7 @@ class CyclesRender_PT_sampling(CyclesButtonsPanel, Panel):
sub.prop(cscene, "subsurface_samples", text="Subsurface")
sub.prop(cscene, "volume_samples", text="Volume")
- if cscene.feature_set == 'EXPERIMENTAL' and use_cpu(context):
+ if use_cpu(context) or cscene.feature_set == 'EXPERIMENTAL':
layout.row().prop(cscene, "sampling_pattern", text="Pattern")
for rl in scene.render.layers:
@@ -176,16 +176,11 @@ class CyclesRender_PT_volume_sampling(CyclesButtonsPanel, Panel):
scene = context.scene
cscene = scene.cycles
- split = layout.split(align=True)
-
- sub = split.column(align=True)
- sub.label("Heterogeneous:")
- sub.prop(cscene, "volume_step_size")
- sub.prop(cscene, "volume_max_steps")
-
- sub = split.column(align=True)
- sub.label("Homogeneous:")
- sub.prop(cscene, "volume_homogeneous_sampling", text="")
+ row = layout.row()
+ row.label("Heterogeneous:")
+ row = layout.row()
+ row.prop(cscene, "volume_step_size")
+ row.prop(cscene, "volume_max_steps")
class CyclesRender_PT_light_paths(CyclesButtonsPanel, Panel):
@@ -215,7 +210,8 @@ class CyclesRender_PT_light_paths(CyclesButtonsPanel, Panel):
col.separator()
- col.prop(cscene, "no_caustics")
+ col.prop(cscene, "caustics_reflective")
+ col.prop(cscene, "caustics_refractive")
col.prop(cscene, "blur_glossy")
col = split.column()
@@ -473,6 +469,7 @@ class CyclesCamera_PT_dof(CyclesButtonsPanel, Panel):
sub = col.column(align=True)
sub.prop(ccam, "aperture_blades", text="Blades")
sub.prop(ccam, "aperture_rotation", text="Rotation")
+ sub.prop(ccam, "aperture_ratio", text="Ratio")
class Cycles_PT_context_material(CyclesButtonsPanel, Panel):
@@ -570,8 +567,7 @@ class CyclesObject_PT_motion_blur(CyclesButtonsPanel, Panel):
layout = self.layout
rd = context.scene.render
- scene = context.scene
- # cscene = scene.cycles
+ # scene = context.scene
layout.active = rd.use_motion_blur
@@ -584,8 +580,7 @@ class CyclesObject_PT_motion_blur(CyclesButtonsPanel, Panel):
layout = self.layout
rd = context.scene.render
- scene = context.scene
- # cscene = scene.cycles
+ # scene = context.scene
ob = context.object
cob = ob.cycles
@@ -624,6 +619,7 @@ class CyclesObject_PT_ray_visibility(CyclesButtonsPanel, Panel):
flow.prop(visibility, "diffuse")
flow.prop(visibility, "glossy")
flow.prop(visibility, "transmission")
+ flow.prop(visibility, "scatter")
if ob.type != 'LAMP':
flow.prop(visibility, "shadow")
@@ -636,7 +632,8 @@ class CYCLES_OT_use_shading_nodes(Operator):
@classmethod
def poll(cls, context):
- return context.material or context.world or context.lamp
+ return (getattr(context, "material", False) or getattr(context, "world", False) or
+ getattr(context, "lamp", False))
def execute(self, context):
if context.material:
@@ -829,8 +826,6 @@ class CyclesWorld_PT_volume(CyclesButtonsPanel, Panel):
world = context.world
panel_node_draw(layout, world, 'OUTPUT_WORLD', 'Volume')
- layout.prop(world.cycles, "homogeneous_volume")
-
class CyclesWorld_PT_ambient_occlusion(CyclesButtonsPanel, Panel):
bl_label = "Ambient Occlusion"
@@ -904,6 +899,7 @@ class CyclesWorld_PT_ray_visibility(CyclesButtonsPanel, Panel):
flow.prop(visibility, "diffuse")
flow.prop(visibility, "glossy")
flow.prop(visibility, "transmission")
+ flow.prop(visibility, "scatter")
class CyclesWorld_PT_settings(CyclesButtonsPanel, Panel):
@@ -922,15 +918,27 @@ class CyclesWorld_PT_settings(CyclesButtonsPanel, Panel):
cworld = world.cycles
cscene = context.scene.cycles
- col = layout.column()
+ split = layout.split()
- col.prop(cworld, "sample_as_light")
- sub = col.row(align=True)
+ col = split.column()
+
+ col.label(text="Surface:")
+ col.prop(cworld, "sample_as_light", text="Multiple Importance")
+
+ sub = col.column(align=True)
sub.active = cworld.sample_as_light
sub.prop(cworld, "sample_map_resolution")
if cscene.progressive == 'BRANCHED_PATH':
sub.prop(cworld, "samples")
+ col = split.column()
+ col.label(text="Volume:")
+ sub = col.column()
+ sub.active = use_cpu(context)
+ sub.prop(cworld, "volume_sampling", text="")
+ sub.prop(cworld, "volume_interpolation", text="")
+ col.prop(cworld, "homogeneous_volume", text="Homogeneous")
+
class CyclesMaterial_PT_preview(CyclesButtonsPanel, Panel):
bl_label = "Preview"
@@ -975,12 +983,10 @@ class CyclesMaterial_PT_volume(CyclesButtonsPanel, Panel):
layout = self.layout
mat = context.material
- cmat = mat.cycles
+ # cmat = mat.cycles
panel_node_draw(layout, mat, 'OUTPUT_MATERIAL', 'Volume')
- layout.prop(cmat, "homogeneous_volume")
-
class CyclesMaterial_PT_displacement(CyclesButtonsPanel, Panel):
bl_label = "Displacement"
@@ -1023,10 +1029,21 @@ class CyclesMaterial_PT_settings(CyclesButtonsPanel, Panel):
col.label()
col.prop(mat, "pass_index")
- col = layout.column()
- col.prop(cmat, "sample_as_light")
+ split = layout.split()
+
+ col = split.column()
+ col.label(text="Surface:")
+ col.prop(cmat, "sample_as_light", text="Multiple Importance")
col.prop(cmat, "use_transparent_shadow")
+ col = split.column()
+ col.label(text="Volume:")
+ sub = col.column()
+ sub.active = use_cpu(context)
+ sub.prop(cmat, "volume_sampling", text="")
+ col.prop(cmat, "volume_interpolation", text="")
+ col.prop(cmat, "homogeneous_volume", text="Homogeneous")
+
class CyclesTexture_PT_context(CyclesButtonsPanel, Panel):
bl_label = ""
@@ -1194,8 +1211,6 @@ class CyclesRender_PT_CurveRendering(CyclesButtonsPanel, Panel):
@classmethod
def poll(cls, context):
- scene = context.scene
- # cscene = scene.cycles
psys = context.particle_system
return CyclesButtonsPanel.poll(context) and psys and psys.settings.type == 'HAIR'
@@ -1238,38 +1253,39 @@ class CyclesRender_PT_bake(CyclesButtonsPanel, Panel):
scene = context.scene
cscene = scene.cycles
-
cbk = scene.render.bake
- layout.operator("object.bake", icon='RENDER_STILL').type = \
- cscene.bake_type
+ layout.operator("object.bake", icon='RENDER_STILL').type = cscene.bake_type
col = layout.column()
col.prop(cscene, "bake_type")
-
col.separator()
- split = layout.split()
- sub = split.column()
- sub.prop(cbk, "use_clear")
- sub.prop(cbk, "margin")
+ split = layout.split()
- sub = split.column()
- sub.prop(cbk, "use_selected_to_active")
- sub = sub.column()
+ col = split.column()
+ col.prop(cbk, "margin")
+ col.prop(cbk, "use_clear")
+ col = split.column()
+ col.prop(cbk, "use_selected_to_active")
+ sub = col.column()
sub.active = cbk.use_selected_to_active
- sub.prop(cbk, "cage_extrusion", text="Distance")
- sub.prop_search(cbk, "cage", scene, "objects")
+ sub.prop(cbk, "use_cage", text="Cage")
+ if cbk.use_cage:
+ sub.prop(cbk, "cage_extrusion", text="Extrusion")
+ sub.prop_search(cbk, "cage_object", scene, "objects", text="")
+ else:
+ sub.prop(cbk, "cage_extrusion", text="Ray Distance")
if cscene.bake_type == 'NORMAL':
- col.separator()
- box = col.box()
+ layout.separator()
+ box = layout.box()
box.label(text="Normal Settings:")
box.prop(cbk, "normal_space", text="Space")
row = box.row(align=True)
- row.label(text = "Swizzle:")
+ row.label(text="Swizzle:")
row.prop(cbk, "normal_r", text="")
row.prop(cbk, "normal_g", text="")
row.prop(cbk, "normal_b", text="")
@@ -1282,7 +1298,6 @@ class CyclesParticle_PT_CurveSettings(CyclesButtonsPanel, Panel):
@classmethod
def poll(cls, context):
scene = context.scene
- # cscene = scene.cycles
ccscene = scene.cycles_curves
psys = context.particle_system
use_curves = ccscene.use_curves and psys
@@ -1368,7 +1383,11 @@ def get_panels():
"RENDER_PT_encoding",
"RENDER_PT_dimensions",
"RENDER_PT_stamp",
+ "RENDER_PT_freestyle",
"RENDERLAYER_PT_layers",
+ "RENDERLAYER_PT_freestyle",
+ "RENDERLAYER_PT_freestyle_lineset",
+ "RENDERLAYER_PT_freestyle_linestyle",
"SCENE_PT_scene",
"SCENE_PT_color_management",
"SCENE_PT_custom_props",
@@ -1406,6 +1425,7 @@ def get_panels():
"DATA_PT_custom_props_curve",
"DATA_PT_custom_props_lattice",
"DATA_PT_custom_props_metaball",
+ "TEXTURE_PT_preview",
"TEXTURE_PT_custom_props",
"TEXTURE_PT_clouds",
"TEXTURE_PT_wood",
@@ -1423,6 +1443,7 @@ def get_panels():
"TEXTURE_PT_pointdensity",
"TEXTURE_PT_pointdensity_turbulence",
"TEXTURE_PT_mapping",
+ "TEXTURE_PT_ocean",
"TEXTURE_PT_influence",
"TEXTURE_PT_colors",
"PARTICLE_PT_context_particles",
@@ -1444,6 +1465,7 @@ def get_panels():
"PARTICLE_PT_force_fields",
"PARTICLE_PT_vertexgroups",
"MATERIAL_PT_custom_props",
+ "MATERIAL_PT_freestyle_line",
"BONE_PT_custom_props",
"OBJECT_PT_custom_props",
]
diff --git a/intern/cycles/blender/addon/version_update.py b/intern/cycles/blender/addon/version_update.py
new file mode 100644
index 00000000000..eaeec703ff5
--- /dev/null
+++ b/intern/cycles/blender/addon/version_update.py
@@ -0,0 +1,59 @@
+#
+# Copyright 2011-2014 Blender Foundation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License
+#
+
+# <pep8 compliant>
+
+import bpy
+
+from bpy.app.handlers import persistent
+
+
+# Handler that upgrades Cycles scene settings stored in files written by
+# older versions. It walks every scene in bpy.data.scenes and fills in the
+# newer properties only where they have not been set explicitly.
+# The `self` argument is not used in the body.
+@persistent
+def do_versions(self):
+ # Don't modify the startup file (detected here as data that is not
+ # saved), because it is assumed to contain only default values.
+ if not bpy.data.is_saved:
+ return
+
+ # Clamp Direct/Indirect separation in 270
+ # A truthy legacy "sample_clamp" value is copied to both new clamp
+ # properties, unless either of them was already set.
+ if bpy.data.version <= (2, 70, 0):
+ for scene in bpy.data.scenes:
+ cscene = scene.cycles
+ sample_clamp = cscene.get("sample_clamp", False)
+ if (sample_clamp and
+ not cscene.is_property_set("sample_clamp_direct") and
+ not cscene.is_property_set("sample_clamp_indirect")):
+
+ cscene.sample_clamp_direct = sample_clamp
+ cscene.sample_clamp_indirect = sample_clamp
+
+ # Change of Volume Bounces in 271
+ # Scenes that never set volume_bounces explicitly are pinned to 1.
+ if bpy.data.version <= (2, 71, 0):
+ for scene in bpy.data.scenes:
+ cscene = scene.cycles
+ if not cscene.is_property_set("volume_bounces"):
+ cscene.volume_bounces = 1
+
+ # Caustics Reflective/Refractive separation in 272
+ # A truthy legacy "no_caustics" value turns both new caustics options
+ # off, unless either of them was already set.
+ if bpy.data.version <= (2, 72, 0):
+ for scene in bpy.data.scenes:
+ cscene = scene.cycles
+ if (cscene.get("no_caustics", False) and
+ not cscene.is_property_set("caustics_reflective") and
+ not cscene.is_property_set("caustics_refractive")):
+
+ cscene.caustics_reflective = False
+ cscene.caustics_refractive = False
diff --git a/intern/cycles/blender/blender_camera.cpp b/intern/cycles/blender/blender_camera.cpp
index 1a85561c6d5..ce8c64c4819 100644
--- a/intern/cycles/blender/blender_camera.cpp
+++ b/intern/cycles/blender/blender_camera.cpp
@@ -46,6 +46,8 @@ struct BlenderCamera {
float2 pixelaspect;
+ float aperture_ratio;
+
PanoramaType panorama_type;
float fisheye_fov;
float fisheye_lens;
@@ -167,6 +169,7 @@ static void blender_camera_from_object(BlenderCamera *bcam, BL::Object b_ob, boo
bcam->apertureblades = RNA_int_get(&ccamera, "aperture_blades");
bcam->aperturerotation = RNA_float_get(&ccamera, "aperture_rotation");
bcam->focaldistance = blender_camera_focal_distance(b_ob, b_camera);
+ bcam->aperture_ratio = RNA_float_get(&ccamera, "aperture_ratio");
bcam->shift.x = b_camera.shift_x();
bcam->shift.y = b_camera.shift_y();
@@ -328,6 +331,9 @@ static void blender_camera_sync(Camera *cam, BlenderCamera *bcam, int width, int
cam->fisheye_fov = bcam->fisheye_fov;
cam->fisheye_lens = bcam->fisheye_lens;
+ /* anamorphic lens bokeh */
+ cam->aperture_ratio = bcam->aperture_ratio;
+
/* perspective */
cam->fov = 2.0f * atanf((0.5f * sensor_size) / bcam->lens / aspectratio);
cam->focaldistance = bcam->focaldistance;
diff --git a/intern/cycles/blender/blender_curves.cpp b/intern/cycles/blender/blender_curves.cpp
index 22de7b64273..8cfaea59a06 100644
--- a/intern/cycles/blender/blender_curves.cpp
+++ b/intern/cycles/blender/blender_curves.cpp
@@ -15,10 +15,11 @@
*/
#include "attribute.h"
+#include "camera.h"
+#include "curves.h"
#include "mesh.h"
#include "object.h"
#include "scene.h"
-#include "curves.h"
#include "blender_sync.h"
#include "blender_util.h"
@@ -39,10 +40,11 @@ bool ObtainCacheParticleUV(Mesh *mesh, BL::Mesh *b_mesh, BL::Object *b_ob, Parti
bool ObtainCacheParticleVcol(Mesh *mesh, BL::Mesh *b_mesh, BL::Object *b_ob, ParticleCurveData *CData, bool background, int vcol_num);
bool ObtainCacheParticleData(Mesh *mesh, BL::Mesh *b_mesh, BL::Object *b_ob, ParticleCurveData *CData, bool background);
void ExportCurveSegments(Scene *scene, Mesh *mesh, ParticleCurveData *CData);
-void ExportCurveTrianglePlanes(Mesh *mesh, ParticleCurveData *CData, float3 RotCam);
+void ExportCurveTrianglePlanes(Mesh *mesh, ParticleCurveData *CData,
+ float3 RotCam, bool is_ortho);
void ExportCurveTriangleGeometry(Mesh *mesh, ParticleCurveData *CData, int resolution);
void ExportCurveTriangleUV(Mesh *mesh, ParticleCurveData *CData, int vert_offset, int resol, float3 *uvdata);
-void ExportCurveTriangleVcol(Mesh *mesh, ParticleCurveData *CData, int vert_offset, int resol, float3 *fdata);
+void ExportCurveTriangleVcol(Mesh *mesh, ParticleCurveData *CData, int vert_offset, int resol, uchar4 *cdata);
ParticleCurveData::ParticleCurveData()
{
@@ -328,7 +330,8 @@ static void set_resolution(Mesh *mesh, BL::Mesh *b_mesh, BL::Object *b_ob, BL::S
}
}
-void ExportCurveTrianglePlanes(Mesh *mesh, ParticleCurveData *CData, float3 RotCam)
+void ExportCurveTrianglePlanes(Mesh *mesh, ParticleCurveData *CData,
+ float3 RotCam, bool is_ortho)
{
int vertexno = mesh->verts.size();
int vertexindex = vertexno;
@@ -362,7 +365,10 @@ void ExportCurveTrianglePlanes(Mesh *mesh, ParticleCurveData *CData, float3 RotC
float3 ickey_loc = CData->curvekey_co[CData->curve_firstkey[curve]];
float radius = shaperadius(CData->psys_shape[sys], CData->psys_rootradius[sys], CData->psys_tipradius[sys], 0.0f);
v1 = CData->curvekey_co[CData->curve_firstkey[curve] + 1] - CData->curvekey_co[CData->curve_firstkey[curve]];
- xbasis = normalize(cross(RotCam - ickey_loc,v1));
+ if(is_ortho)
+ xbasis = normalize(cross(RotCam, v1));
+ else
+ xbasis = normalize(cross(RotCam - ickey_loc, v1));
float3 ickey_loc_shfl = ickey_loc - radius * xbasis;
float3 ickey_loc_shfr = ickey_loc + radius * xbasis;
mesh->verts.push_back(ickey_loc_shfl);
@@ -386,7 +392,10 @@ void ExportCurveTrianglePlanes(Mesh *mesh, ParticleCurveData *CData, float3 RotC
if(CData->psys_closetip[sys] && (curvekey == CData->curve_firstkey[curve] + CData->curve_keynum[curve] - 1))
radius = shaperadius(CData->psys_shape[sys], CData->psys_rootradius[sys], 0.0f, 0.95f);
- xbasis = normalize(cross(RotCam - ickey_loc,v1));
+ if(is_ortho)
+ xbasis = normalize(cross(RotCam, v1));
+ else
+ xbasis = normalize(cross(RotCam - ickey_loc, v1));
float3 ickey_loc_shfl = ickey_loc - radius * xbasis;
float3 ickey_loc_shfr = ickey_loc + radius * xbasis;
mesh->verts.push_back(ickey_loc_shfl);
@@ -726,9 +735,9 @@ void ExportCurveTriangleUV(Mesh *mesh, ParticleCurveData *CData, int vert_offset
}
}
-void ExportCurveTriangleVcol(Mesh *mesh, ParticleCurveData *CData, int vert_offset, int resol, float3 *fdata)
+void ExportCurveTriangleVcol(Mesh *mesh, ParticleCurveData *CData, int vert_offset, int resol, uchar4 *cdata)
{
- if(fdata == NULL)
+ if(cdata == NULL)
return;
int vertexindex = vert_offset;
@@ -740,17 +749,17 @@ void ExportCurveTriangleVcol(Mesh *mesh, ParticleCurveData *CData, int vert_offs
for(int curvekey = CData->curve_firstkey[curve]; curvekey < CData->curve_firstkey[curve] + CData->curve_keynum[curve] - 1; curvekey++) {
for(int section = 0; section < resol; section++) {
- fdata[vertexindex] = color_srgb_to_scene_linear(CData->curve_vcol[curve]);
+ cdata[vertexindex] = color_float_to_byte(color_srgb_to_scene_linear(CData->curve_vcol[curve]));
vertexindex++;
- fdata[vertexindex] = color_srgb_to_scene_linear(CData->curve_vcol[curve]);
+ cdata[vertexindex] = color_float_to_byte(color_srgb_to_scene_linear(CData->curve_vcol[curve]));
vertexindex++;
- fdata[vertexindex] = color_srgb_to_scene_linear(CData->curve_vcol[curve]);
+ cdata[vertexindex] = color_float_to_byte(color_srgb_to_scene_linear(CData->curve_vcol[curve]));
vertexindex++;
- fdata[vertexindex] = color_srgb_to_scene_linear(CData->curve_vcol[curve]);
+ cdata[vertexindex] = color_float_to_byte(color_srgb_to_scene_linear(CData->curve_vcol[curve]));
vertexindex++;
- fdata[vertexindex] = color_srgb_to_scene_linear(CData->curve_vcol[curve]);
+ cdata[vertexindex] = color_float_to_byte(color_srgb_to_scene_linear(CData->curve_vcol[curve]));
vertexindex++;
- fdata[vertexindex] = color_srgb_to_scene_linear(CData->curve_vcol[curve]);
+ cdata[vertexindex] = color_float_to_byte(color_srgb_to_scene_linear(CData->curve_vcol[curve]));
vertexindex++;
}
}
@@ -858,20 +867,26 @@ void BlenderSync::sync_curves(Mesh *mesh, BL::Mesh b_mesh, BL::Object b_ob, bool
ObtainCacheParticleData(mesh, &b_mesh, &b_ob, &CData, !preview);
- /* obtain camera parameters */
- BL::Object b_CamOb = b_scene.camera();
- float3 RotCam = make_float3(0.0f, 0.0f, 0.0f);
- if(b_CamOb) {
- Transform ctfm = get_transform(b_CamOb.matrix_world());
- Transform tfm = get_transform(b_ob.matrix_world());
- Transform itfm = transform_quick_inverse(tfm);
- RotCam = transform_point(&itfm, make_float3(ctfm.x.w, ctfm.y.w, ctfm.z.w));
- }
-
/* add hair geometry to mesh */
if(primitive == CURVE_TRIANGLES) {
- if(triangle_method == CURVE_CAMERA_TRIANGLES)
- ExportCurveTrianglePlanes(mesh, &CData, RotCam);
+ if(triangle_method == CURVE_CAMERA_TRIANGLES) {
+ /* obtain camera parameters */
+ float3 RotCam;
+ Camera *camera = scene->camera;
+ Transform &ctfm = camera->matrix;
+ if(camera->type == CAMERA_ORTHOGRAPHIC) {
+ RotCam = -make_float3(ctfm.x.z, ctfm.y.z, ctfm.z.z);
+ }
+ else {
+ Transform tfm = get_transform(b_ob.matrix_world());
+ Transform itfm = transform_quick_inverse(tfm);
+ RotCam = transform_point(&itfm, make_float3(ctfm.x.w,
+ ctfm.y.w,
+ ctfm.z.w));
+ }
+ bool is_ortho = camera->type == CAMERA_ORTHOGRAPHIC;
+ ExportCurveTrianglePlanes(mesh, &CData, RotCam, is_ortho);
+ }
else {
ExportCurveTriangleGeometry(mesh, &CData, resolution);
used_res = resolution;
@@ -923,13 +938,12 @@ void BlenderSync::sync_curves(Mesh *mesh, BL::Mesh b_mesh, BL::Object b_ob, bool
ObtainCacheParticleVcol(mesh, &b_mesh, &b_ob, &CData, !preview, vcol_num);
if(primitive == CURVE_TRIANGLES) {
-
Attribute *attr_vcol = mesh->attributes.add(
- ustring(l->name().c_str()), TypeDesc::TypeColor, ATTR_ELEMENT_CORNER);
+ ustring(l->name().c_str()), TypeDesc::TypeColor, ATTR_ELEMENT_CORNER_BYTE);
- float3 *fdata = attr_vcol->data_float3();
+ uchar4 *cdata = attr_vcol->data_uchar4();
- ExportCurveTriangleVcol(mesh, &CData, tri_num * 3, used_res, fdata);
+ ExportCurveTriangleVcol(mesh, &CData, tri_num * 3, used_res, cdata);
}
else {
Attribute *attr_vcol = mesh->curve_attributes.add(
diff --git a/intern/cycles/blender/blender_logging.cpp b/intern/cycles/blender/blender_logging.cpp
new file mode 100644
index 00000000000..d3f1accf099
--- /dev/null
+++ b/intern/cycles/blender/blender_logging.cpp
@@ -0,0 +1,65 @@
+/*
+ * Copyright 2011-2014 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License
+ */
+
+#include "CCL_api.h"
+
+#include <stdio.h>
+
+#include "util_logging.h"
+
+#ifdef _MSC_VER
+# define snprintf _snprintf
+#endif
+
+/* Initialize Google logging with argv0 and apply the default options:
+ * logtostderr=1, v=0, and both stderrthreshold and minloglevel set to the
+ * numeric value of google::GLOG_FATAL.
+ * Does nothing when WITH_CYCLES_LOGGING is not defined. */
+void CCL_init_logging(const char *argv0)
+{
+#ifdef WITH_CYCLES_LOGGING
+ /* Make it so that FATAL messages are always printed to the console. */
+ char severity_fatal[32];
+ snprintf(severity_fatal, sizeof(severity_fatal), "%d",
+ google::GLOG_FATAL);
+
+ google::InitGoogleLogging(argv0);
+ google::SetCommandLineOption("logtostderr", "1");
+ google::SetCommandLineOption("v", "0");
+ google::SetCommandLineOption("stderrthreshold", severity_fatal);
+ google::SetCommandLineOption("minloglevel", severity_fatal);
+#else
+ /* Reference the parameter so it is not reported as unused. */
+ (void) argv0;
+#endif
+}
+
+/* Switch logging to the debug configuration: logtostderr=1, v=2,
+ * stderrthreshold=1 and minloglevel=0.
+ * Does nothing when WITH_CYCLES_LOGGING is not defined. */
+void CCL_start_debug_logging(void)
+{
+#ifdef WITH_CYCLES_LOGGING
+ google::SetCommandLineOption("logtostderr", "1");
+ google::SetCommandLineOption("v", "2");
+ google::SetCommandLineOption("stderrthreshold", "1");
+ google::SetCommandLineOption("minloglevel", "0");
+#endif
+}
+
+/* Set the "v" logging option to the decimal form of the given verbosity.
+ * Does nothing when WITH_CYCLES_LOGGING is not defined. */
+void CCL_logging_verbosity_set(int verbosity)
+{
+#ifdef WITH_CYCLES_LOGGING
+ /* 32 bytes hold any int including sign and terminator, so the text is
+ * never truncated. This matters because snprintf is mapped to _snprintf
+ * on MSVC in this file, which leaves a truncated result unterminated. */
+ char val[32];
+ snprintf(val, sizeof(val), "%d", verbosity);
+
+ google::SetCommandLineOption("v", val);
+#else
+ /* Reference the parameter so it is not reported as unused. */
+ (void) verbosity;
+#endif
+}
diff --git a/intern/cycles/blender/blender_mesh.cpp b/intern/cycles/blender/blender_mesh.cpp
index 83514879477..a5e4b7bd2ae 100644
--- a/intern/cycles/blender/blender_mesh.cpp
+++ b/intern/cycles/blender/blender_mesh.cpp
@@ -35,14 +35,14 @@ CCL_NAMESPACE_BEGIN
/* Tangent Space */
struct MikkUserData {
- MikkUserData(const BL::Mesh mesh_, const BL::MeshTextureFaceLayer layer_, int num_faces_)
+ MikkUserData(const BL::Mesh mesh_, BL::MeshTextureFaceLayer *layer_, int num_faces_)
: mesh(mesh_), layer(layer_), num_faces(num_faces_)
{
tangent.resize(num_faces*4);
}
BL::Mesh mesh;
- BL::MeshTextureFaceLayer layer;
+ BL::MeshTextureFaceLayer *layer;
int num_faces;
vector<float4> tangent;
};
@@ -78,26 +78,34 @@ static void mikk_get_position(const SMikkTSpaceContext *context, float P[3], con
static void mikk_get_texture_coordinate(const SMikkTSpaceContext *context, float uv[2], const int face_num, const int vert_num)
{
MikkUserData *userdata = (MikkUserData*)context->m_pUserData;
- BL::MeshTextureFace tf = userdata->layer.data[face_num];
- float3 tfuv;
-
- switch (vert_num) {
- case 0:
- tfuv = get_float3(tf.uv1());
- break;
- case 1:
- tfuv = get_float3(tf.uv2());
- break;
- case 2:
- tfuv = get_float3(tf.uv3());
- break;
- default:
- tfuv = get_float3(tf.uv4());
- break;
+ if(userdata->layer != NULL) {
+ BL::MeshTextureFace tf = userdata->layer->data[face_num];
+ float3 tfuv;
+
+ switch (vert_num) {
+ case 0:
+ tfuv = get_float3(tf.uv1());
+ break;
+ case 1:
+ tfuv = get_float3(tf.uv2());
+ break;
+ case 2:
+ tfuv = get_float3(tf.uv3());
+ break;
+ default:
+ tfuv = get_float3(tf.uv4());
+ break;
+ }
+
+ uv[0] = tfuv.x;
+ uv[1] = tfuv.y;
+ }
+ else {
+ int vert_idx = userdata->mesh.tessfaces[face_num].vertices()[vert_num];
+ float3 orco =
+ get_float3(userdata->mesh.vertices[vert_idx].undeformed_co());
+ map_to_sphere(&uv[0], &uv[1], orco[0], orco[1], orco[2]);
}
-
- uv[0] = tfuv.x;
- uv[1] = tfuv.y;
}
static void mikk_get_normal(const SMikkTSpaceContext *context, float N[3], const int face_num, const int vert_num)
@@ -127,7 +135,7 @@ static void mikk_set_tangent_space(const SMikkTSpaceContext *context, const floa
userdata->tangent[face*4 + vert] = make_float4(T[0], T[1], T[2], sign);
}
-static void mikk_compute_tangents(BL::Mesh b_mesh, BL::MeshTextureFaceLayer b_layer, Mesh *mesh, vector<int>& nverts, bool need_sign, bool active_render)
+static void mikk_compute_tangents(BL::Mesh b_mesh, BL::MeshTextureFaceLayer *b_layer, Mesh *mesh, vector<int>& nverts, bool need_sign, bool active_render)
{
/* setup userdata */
MikkUserData userdata(b_mesh, b_layer, nverts.size());
@@ -153,7 +161,11 @@ static void mikk_compute_tangents(BL::Mesh b_mesh, BL::MeshTextureFaceLayer b_la
/* create tangent attributes */
Attribute *attr;
- ustring name = ustring((string(b_layer.name().c_str()) + ".tangent").c_str());
+ ustring name;
+ if(b_layer != NULL)
+ name = ustring((string(b_layer->name().c_str()) + ".tangent").c_str());
+ else
+ name = ustring("orco.tangent");
if(active_render)
attr = mesh->attributes.add(ATTR_STD_UV_TANGENT, name);
@@ -167,7 +179,11 @@ static void mikk_compute_tangents(BL::Mesh b_mesh, BL::MeshTextureFaceLayer b_la
if(need_sign) {
Attribute *attr_sign;
- ustring name_sign = ustring((string(b_layer.name().c_str()) + ".tangent_sign").c_str());
+ ustring name_sign;
+ if(b_layer != NULL)
+ name_sign = ustring((string(b_layer->name().c_str()) + ".tangent_sign").c_str());
+ else
+ name_sign = ustring("orco.tangent_sign");
if(active_render)
attr_sign = mesh->attributes.add(ATTR_STD_UV_TANGENT_SIGN, name_sign);
@@ -208,7 +224,7 @@ static void mikk_compute_tangents(BL::Mesh b_mesh, BL::MeshTextureFaceLayer b_la
/* Create Volume Attribute */
-static void create_mesh_volume_attribute(BL::Object b_ob, Mesh *mesh, ImageManager *image_manager, AttributeStandard std)
+static void create_mesh_volume_attribute(BL::Object b_ob, Mesh *mesh, ImageManager *image_manager, AttributeStandard std, float frame)
{
BL::SmokeDomainSettings b_domain = object_smoke_domain_find(b_ob);
@@ -222,22 +238,22 @@ static void create_mesh_volume_attribute(BL::Object b_ob, Mesh *mesh, ImageManag
volume_data->manager = image_manager;
volume_data->slot = image_manager->add_image(Attribute::standard_name(std),
- b_ob.ptr.data, animated, is_float, is_linear, INTERPOLATION_LINEAR, true);
+ b_ob.ptr.data, animated, frame, is_float, is_linear, INTERPOLATION_LINEAR, true);
}
-static void create_mesh_volume_attributes(Scene *scene, BL::Object b_ob, Mesh *mesh)
+static void create_mesh_volume_attributes(Scene *scene, BL::Object b_ob, Mesh *mesh, float frame)
{
/* for smoke volume rendering */
if(mesh->need_attribute(scene, ATTR_STD_VOLUME_DENSITY))
- create_mesh_volume_attribute(b_ob, mesh, scene->image_manager, ATTR_STD_VOLUME_DENSITY);
+ create_mesh_volume_attribute(b_ob, mesh, scene->image_manager, ATTR_STD_VOLUME_DENSITY, frame);
if(mesh->need_attribute(scene, ATTR_STD_VOLUME_COLOR))
- create_mesh_volume_attribute(b_ob, mesh, scene->image_manager, ATTR_STD_VOLUME_COLOR);
+ create_mesh_volume_attribute(b_ob, mesh, scene->image_manager, ATTR_STD_VOLUME_COLOR, frame);
if(mesh->need_attribute(scene, ATTR_STD_VOLUME_FLAME))
- create_mesh_volume_attribute(b_ob, mesh, scene->image_manager, ATTR_STD_VOLUME_FLAME);
+ create_mesh_volume_attribute(b_ob, mesh, scene->image_manager, ATTR_STD_VOLUME_FLAME, frame);
if(mesh->need_attribute(scene, ATTR_STD_VOLUME_HEAT))
- create_mesh_volume_attribute(b_ob, mesh, scene->image_manager, ATTR_STD_VOLUME_HEAT);
+ create_mesh_volume_attribute(b_ob, mesh, scene->image_manager, ATTR_STD_VOLUME_HEAT, frame);
if(mesh->need_attribute(scene, ATTR_STD_VOLUME_VELOCITY))
- create_mesh_volume_attribute(b_ob, mesh, scene->image_manager, ATTR_STD_VOLUME_VELOCITY);
+ create_mesh_volume_attribute(b_ob, mesh, scene->image_manager, ATTR_STD_VOLUME_VELOCITY, frame);
}
/* Create Mesh */
@@ -347,31 +363,31 @@ static void create_mesh(Scene *scene, Mesh *mesh, BL::Mesh b_mesh, const vector<
continue;
Attribute *attr = mesh->attributes.add(
- ustring(l->name().c_str()), TypeDesc::TypeColor, ATTR_ELEMENT_CORNER);
+ ustring(l->name().c_str()), TypeDesc::TypeColor, ATTR_ELEMENT_CORNER_BYTE);
BL::MeshColorLayer::data_iterator c;
- float3 *fdata = attr->data_float3();
+ uchar4 *cdata = attr->data_uchar4();
size_t i = 0;
for(l->data.begin(c); c != l->data.end(); ++c, ++i) {
- fdata[0] = color_srgb_to_scene_linear(get_float3(c->color1()));
- fdata[1] = color_srgb_to_scene_linear(get_float3(c->color2()));
- fdata[2] = color_srgb_to_scene_linear(get_float3(c->color3()));
+ cdata[0] = color_float_to_byte(color_srgb_to_scene_linear(get_float3(c->color1())));
+ cdata[1] = color_float_to_byte(color_srgb_to_scene_linear(get_float3(c->color2())));
+ cdata[2] = color_float_to_byte(color_srgb_to_scene_linear(get_float3(c->color3())));
if(nverts[i] == 4) {
- fdata[3] = fdata[0];
- fdata[4] = fdata[2];
- fdata[5] = color_srgb_to_scene_linear(get_float3(c->color4()));
- fdata += 6;
+ cdata[3] = cdata[0];
+ cdata[4] = cdata[2];
+ cdata[5] = color_float_to_byte(color_srgb_to_scene_linear(get_float3(c->color4())));
+ cdata += 6;
}
else
- fdata += 3;
+ cdata += 3;
}
}
}
/* create uv map attributes */
- {
+ if (b_mesh.tessface_uv_textures.length() != 0) {
BL::Mesh::tessface_uv_textures_iterator l;
for(b_mesh.tessface_uv_textures.begin(l); l != b_mesh.tessface_uv_textures.end(); ++l) {
@@ -416,10 +432,14 @@ static void create_mesh(Scene *scene, Mesh *mesh, BL::Mesh b_mesh, const vector<
name = ustring((string(l->name().c_str()) + ".tangent_sign").c_str());
bool need_sign = (mesh->need_attribute(scene, name) || mesh->need_attribute(scene, std));
- mikk_compute_tangents(b_mesh, *l, mesh, nverts, need_sign, active_render);
+ mikk_compute_tangents(b_mesh, &(*l), mesh, nverts, need_sign, active_render);
}
}
}
+ else if(mesh->need_attribute(scene, ATTR_STD_UV_TANGENT)) {
+ bool need_sign = mesh->need_attribute(scene, ATTR_STD_UV_TANGENT_SIGN);
+ mikk_compute_tangents(b_mesh, NULL, mesh, nverts, need_sign, true);
+ }
/* for volume objects, create a matrix to transform from object space to
* mesh texture space. this does not work with deformations but that can
@@ -505,15 +525,16 @@ Mesh *BlenderSync::sync_mesh(BL::Object b_ob, bool object_updated, bool hide_tri
}
/* test if we need to sync */
+ bool use_mesh_geometry = render_layer.use_surfaces || render_layer.use_hair;
Mesh *mesh;
if(!mesh_map.sync(&mesh, key)) {
-
/* if transform was applied to mesh, need full update */
if(object_updated && mesh->transform_applied);
/* test if shaders changed, these can be object level so mesh
* does not get tagged for recalc */
else if(mesh->used_shaders != used_shaders);
+ else if(use_mesh_geometry != mesh->geometry_synced);
else {
/* even if not tagged for recalc, we may need to sync anyway
* because the shader needs different mesh attributes */
@@ -540,15 +561,21 @@ Mesh *BlenderSync::sync_mesh(BL::Object b_ob, bool object_updated, bool hide_tri
vector<Mesh::Triangle> oldtriangle = mesh->triangles;
/* compares curve_keys rather than strands in order to handle quick hair
- * adjustsments in dynamic BVH - other methods could probably do this better*/
+ * adjustments in dynamic BVH - other methods could probably do this better*/
vector<float4> oldcurve_keys = mesh->curve_keys;
mesh->clear();
mesh->used_shaders = used_shaders;
mesh->name = ustring(b_ob_data.name().c_str());
- if(render_layer.use_surfaces || render_layer.use_hair) {
- if(preview)
+ if(use_mesh_geometry) {
+ /* mesh objects have special handling in the dependency graph,
+ * which ensures they are properly updated.
+ *
+ * updating meshes here would leave the derived mesh referencing
+ * data that was already freed on the blender side.
+ */
+ if(preview && b_ob.type() != BL::Object::type_MESH)
b_ob.update_from_editmode();
bool need_undeformed = mesh->need_attribute(scene, ATTR_STD_GENERATED);
@@ -561,7 +588,7 @@ Mesh *BlenderSync::sync_mesh(BL::Object b_ob, bool object_updated, bool hide_tri
else
create_mesh(scene, mesh, b_mesh, used_shaders);
- create_mesh_volume_attributes(scene, b_ob, mesh);
+ create_mesh_volume_attributes(scene, b_ob, mesh, b_scene.frame_current());
}
if(render_layer.use_hair)
@@ -570,6 +597,7 @@ Mesh *BlenderSync::sync_mesh(BL::Object b_ob, bool object_updated, bool hide_tri
/* free derived mesh */
b_data.meshes.remove(b_mesh);
}
+ mesh->geometry_synced = true;
}
/* displacement method */
@@ -616,6 +644,11 @@ void BlenderSync::sync_mesh_motion(BL::Object b_ob, Object *object, float motion
mesh_motion_synced.insert(mesh);
+ /* ensure we only motion sync meshes that also had mesh synced, to avoid
+ * unnecessary work and to ensure that their attributes were cleared */
+ if(mesh_synced.find(mesh) == mesh_synced.end())
+ return;
+
/* for motion pass always compute, for motion blur it can be disabled */
int time_index = 0;
diff --git a/intern/cycles/blender/blender_object.cpp b/intern/cycles/blender/blender_object.cpp
index 167647608a5..1e07c5f9c96 100644
--- a/intern/cycles/blender/blender_object.cpp
+++ b/intern/cycles/blender/blender_object.cpp
@@ -82,6 +82,7 @@ static uint object_ray_visibility(BL::Object b_ob)
flag |= get_boolean(cvisibility, "glossy")? PATH_RAY_GLOSSY: 0;
flag |= get_boolean(cvisibility, "transmission")? PATH_RAY_TRANSMIT: 0;
flag |= get_boolean(cvisibility, "shadow")? PATH_RAY_SHADOW: 0;
+ flag |= get_boolean(cvisibility, "scatter")? PATH_RAY_VOLUME_SCATTER: 0;
return flag;
}
@@ -172,6 +173,7 @@ void BlenderSync::sync_light(BL::Object b_parent, int persistent_id[OBJECT_PERSI
light->use_diffuse = (visibility & PATH_RAY_DIFFUSE) != 0;
light->use_glossy = (visibility & PATH_RAY_GLOSSY) != 0;
light->use_transmission = (visibility & PATH_RAY_TRANSMIT) != 0;
+ light->use_scatter = (visibility & PATH_RAY_VOLUME_SCATTER) != 0;
/* tag */
light->tag_update(scene);
@@ -289,7 +291,6 @@ Object *BlenderSync::sync_object(BL::Object b_parent, int persistent_id[OBJECT_P
uint visibility = object_ray_visibility(b_ob) & PATH_RAY_ALL_VISIBILITY;
if(b_parent.ptr.data != b_ob.ptr.data) {
visibility &= object_ray_visibility(b_parent);
- object->random_id ^= hash_int(hash_string(b_parent.name().c_str()));
}
/* make holdout objects on excluded layer invisible for non-camera rays */
@@ -446,7 +447,6 @@ void BlenderSync::sync_objects(BL::SpaceView3D b_v3d, float motion_time)
light_map.pre_sync();
mesh_map.pre_sync();
object_map.pre_sync();
- mesh_synced.clear();
particle_system_map.pre_sync();
motion_times.clear();
}
@@ -458,10 +458,10 @@ void BlenderSync::sync_objects(BL::SpaceView3D b_v3d, float motion_time)
BL::Scene::object_bases_iterator b_base;
BL::Scene b_sce = b_scene;
/* modifier result type (not exposed as enum in C++ API)
- * 1 : eModifierMode_Realtime
- * 2 : eModifierMode_Render
- */
- int dupli_settings = preview ? 1 : 2;
+ * 1 : DAG_EVAL_PREVIEW
+ * 2 : DAG_EVAL_RENDER
+ */
+ int dupli_settings = preview ? 1 : 2;
bool cancel = false;
@@ -536,7 +536,6 @@ void BlenderSync::sync_objects(BL::SpaceView3D b_v3d, float motion_time)
scene->object_manager->tag_update(scene);
if(particle_system_map.post_sync())
scene->particle_system_manager->tag_update(scene);
- mesh_synced.clear();
}
if(motion)
@@ -578,7 +577,7 @@ void BlenderSync::sync_motion(BL::SpaceView3D b_v3d, BL::Object b_override, void
/* change frame */
python_thread_state_restore(python_thread_state);
- b_scene.frame_set(frame, subframe);
+ b_engine.frame_set(frame, subframe);
python_thread_state_save(python_thread_state);
/* sync camera, only supports two times at the moment */
@@ -593,7 +592,7 @@ void BlenderSync::sync_motion(BL::SpaceView3D b_v3d, BL::Object b_override, void
* function assumes it is being executed from python and will
* try to save the thread state */
python_thread_state_restore(python_thread_state);
- b_scene.frame_set(frame_center, 0.0f);
+ b_engine.frame_set(frame_center, 0.0f);
python_thread_state_save(python_thread_state);
/* tag camera for motion update */
diff --git a/intern/cycles/blender/blender_python.cpp b/intern/cycles/blender/blender_python.cpp
index 872f891cc2a..8e5a6c13f44 100644
--- a/intern/cycles/blender/blender_python.cpp
+++ b/intern/cycles/blender/blender_python.cpp
@@ -35,6 +35,13 @@
CCL_NAMESPACE_BEGIN
+static void *pylong_as_voidptr_typesafe(PyObject *object)
+{
+ if(object == Py_None)
+ return NULL;
+ return PyLong_AsVoidPtr(object);
+}
+
void python_thread_state_save(void **python_thread_state)
{
*python_thread_state = (void*)PyEval_SaveThread();
@@ -46,14 +53,36 @@ void python_thread_state_restore(void **python_thread_state)
*python_thread_state = NULL;
}
+static const char *PyC_UnicodeAsByte(PyObject *py_str, PyObject **coerce)
+{
+#ifdef WIN32
+ /* bug [#31856] oddly enough, Python 3.2 --> 3.3 on Windows will throw an
+ * exception here; this needs to be fixed in Python:
+ * see: bugs.python.org/issue15859 */
+ if(!PyUnicode_Check(py_str)) {
+ PyErr_BadArgument();
+ return "";
+ }
+#endif
+ if((*coerce = PyUnicode_EncodeFSDefault(py_str))) {
+ return PyBytes_AS_STRING(*coerce);
+ }
+ return "";
+}
+
static PyObject *init_func(PyObject *self, PyObject *args)
{
- const char *path, *user_path;
+ PyObject *path, *user_path;
- if(!PyArg_ParseTuple(args, "ss", &path, &user_path))
+ if(!PyArg_ParseTuple(args, "OO", &path, &user_path)) {
return NULL;
-
- path_init(path, user_path);
+ }
+
+ PyObject *path_coerce = NULL, *user_path_coerce = NULL;
+ path_init(PyC_UnicodeAsByte(path, &path_coerce),
+ PyC_UnicodeAsByte(user_path, &user_path_coerce));
+ Py_XDECREF(path_coerce);
+ Py_XDECREF(user_path_coerce);
Py_RETURN_NONE;
}
@@ -84,15 +113,15 @@ static PyObject *create_func(PyObject *self, PyObject *args)
BL::Scene scene(sceneptr);
PointerRNA regionptr;
- RNA_id_pointer_create((ID*)PyLong_AsVoidPtr(pyregion), &regionptr);
+ RNA_id_pointer_create((ID*)pylong_as_voidptr_typesafe(pyregion), &regionptr);
BL::Region region(regionptr);
PointerRNA v3dptr;
- RNA_id_pointer_create((ID*)PyLong_AsVoidPtr(pyv3d), &v3dptr);
+ RNA_id_pointer_create((ID*)pylong_as_voidptr_typesafe(pyv3d), &v3dptr);
BL::SpaceView3D v3d(v3dptr);
PointerRNA rv3dptr;
- RNA_id_pointer_create((ID*)PyLong_AsVoidPtr(pyrv3d), &rv3dptr);
+ RNA_id_pointer_create((ID*)pylong_as_voidptr_typesafe(pyrv3d), &rv3dptr);
BL::RegionView3D rv3d(rv3dptr);
/* create session */
@@ -158,8 +187,6 @@ static PyObject *bake_func(PyObject *self, PyObject *args)
if(!PyArg_ParseTuple(args, "OOsOiiO", &pysession, &pyobject, &pass_type, &pypixel_array, &num_pixels, &depth, &pyresult))
return NULL;
- Py_BEGIN_ALLOW_THREADS
-
BlenderSession *session = (BlenderSession*)PyLong_AsVoidPtr(pysession);
PointerRNA objectptr;
@@ -172,9 +199,11 @@ static PyObject *bake_func(PyObject *self, PyObject *args)
RNA_id_pointer_create((ID*)PyLong_AsVoidPtr(pypixel_array), &bakepixelptr);
BL::BakePixel b_bake_pixel(bakepixelptr);
- session->bake(b_object, pass_type, b_bake_pixel, num_pixels, depth, (float *)b_result);
+ python_thread_state_save(&session->python_thread_state);
+
+ session->bake(b_object, pass_type, b_bake_pixel, (size_t)num_pixels, depth, (float *)b_result);
- Py_END_ALLOW_THREADS
+ python_thread_state_restore(&session->python_thread_state);
Py_RETURN_NONE;
}
@@ -356,7 +385,12 @@ static PyObject *osl_update_node_func(PyObject *self, PyObject *args)
/* find socket */
BL::NodeSocket b_sock(PointerRNA_NULL);
if (param->isoutput) {
+#if OSL_LIBRARY_VERSION_CODE < 10500
b_sock = b_node.outputs[param->name];
+#else
+ b_sock = b_node.outputs[param->name.string()];
+#endif
+
/* remove if type no longer matches */
if(b_sock && b_sock.bl_idname() != socket_type) {
@@ -365,7 +399,11 @@ static PyObject *osl_update_node_func(PyObject *self, PyObject *args)
}
}
else {
+#if OSL_LIBRARY_VERSION_CODE < 10500
b_sock = b_node.inputs[param->name];
+#else
+ b_sock = b_node.inputs[param->name.string()];
+#endif
/* remove if type no longer matches */
if(b_sock && b_sock.bl_idname() != socket_type) {
diff --git a/intern/cycles/blender/blender_session.cpp b/intern/cycles/blender/blender_session.cpp
index 01a5acd8982..57ffea4b1a9 100644
--- a/intern/cycles/blender/blender_session.cpp
+++ b/intern/cycles/blender/blender_session.cpp
@@ -88,6 +88,7 @@ void BlenderSession::create_session()
{
SceneParams scene_params = BlenderSync::get_scene_params(b_scene, background);
SessionParams session_params = BlenderSync::get_session_params(b_engine, b_userpref, b_scene, background);
+ bool session_pause = BlenderSync::get_session_pause(b_scene, background);
/* reset status/progress */
last_status = "";
@@ -107,15 +108,17 @@ void BlenderSession::create_session()
session->scene = scene;
session->progress.set_update_callback(function_bind(&BlenderSession::tag_redraw, this));
session->progress.set_cancel_callback(function_bind(&BlenderSession::test_cancel, this));
- session->set_pause(BlenderSync::get_session_pause(b_scene, background));
+ session->set_pause(session_pause);
/* create sync */
sync = new BlenderSync(b_engine, b_data, b_scene, scene, !background, session->progress, session_params.device.type == DEVICE_CPU);
if(b_v3d) {
- /* full data sync */
- sync->sync_data(b_v3d, b_engine.camera_override(), &python_thread_state);
- sync->sync_view(b_v3d, b_rv3d, width, height);
+ if(session_pause == false) {
+ /* full data sync */
+ sync->sync_view(b_v3d, b_rv3d, width, height);
+ sync->sync_data(b_v3d, b_engine.camera_override(), &python_thread_state);
+ }
}
else {
/* for final render we will do full data sync per render layer, only
@@ -258,6 +261,14 @@ static PassType get_pass_type(BL::RenderPass b_pass)
case BL::RenderPass::type_SPECULAR:
case BL::RenderPass::type_REFLECTION:
return PASS_NONE;
+#ifdef WITH_CYCLES_DEBUG
+ case BL::RenderPass::type_DEBUG:
+ {
+ if(b_pass.debug_type() == BL::RenderPass::debug_type_BVH_TRAVERSAL_STEPS)
+ return PASS_BVH_TRAVERSAL_STEPS;
+ break;
+ }
+#endif
}
return PASS_NONE;
@@ -420,6 +431,9 @@ void BlenderSession::render()
/* add passes */
vector<Pass> passes;
Pass::add(PASS_COMBINED, passes);
+#ifdef WITH_CYCLES_DEBUG
+ Pass::add(PASS_BVH_TRAVERSAL_STEPS, passes);
+#endif
if(session_params.device.advanced_shading) {
@@ -492,38 +506,24 @@ static void populate_bake_data(BakeData *data, BL::BakePixel pixel_array, const
}
}
-static bool is_light_pass(ShaderEvalType type)
-{
- switch (type) {
- case SHADER_EVAL_AO:
- case SHADER_EVAL_COMBINED:
- case SHADER_EVAL_SHADOW:
- case SHADER_EVAL_DIFFUSE_DIRECT:
- case SHADER_EVAL_GLOSSY_DIRECT:
- case SHADER_EVAL_TRANSMISSION_DIRECT:
- case SHADER_EVAL_SUBSURFACE_DIRECT:
- case SHADER_EVAL_DIFFUSE_INDIRECT:
- case SHADER_EVAL_GLOSSY_INDIRECT:
- case SHADER_EVAL_TRANSMISSION_INDIRECT:
- case SHADER_EVAL_SUBSURFACE_INDIRECT:
- return true;
- default:
- return false;
- }
-}
-
-void BlenderSession::bake(BL::Object b_object, const string& pass_type, BL::BakePixel pixel_array, int num_pixels, int depth, float result[])
+void BlenderSession::bake(BL::Object b_object, const string& pass_type, BL::BakePixel pixel_array, const size_t num_pixels, const int depth, float result[])
{
ShaderEvalType shader_type = get_shader_type(pass_type);
size_t object_index = OBJECT_NONE;
int tri_offset = 0;
+ /* ensure kernels are loaded before we do any scene updates */
+ session->load_kernels();
+
+ if(session->progress.get_cancel())
+ return;
+
if(shader_type == SHADER_EVAL_UV) {
/* force UV to be available */
Pass::add(PASS_UV, scene->film->passes);
}
- if(is_light_pass(shader_type)) {
+ if(BakeManager::is_light_pass(shader_type)) {
/* force use_light_pass to be true */
Pass::add(PASS_LIGHT, scene->film->passes);
}
@@ -540,6 +540,7 @@ void BlenderSession::bake(BL::Object b_object, const string& pass_type, BL::Bake
SessionParams session_params = BlenderSync::get_session_params(b_engine, b_userpref, b_scene, background);
BufferParams buffer_params = BlenderSync::get_buffer_params(b_render, b_scene, b_v3d, b_rv3d, scene->camera, width, height);
+ scene->bake_manager->set_shader_limit((size_t)b_engine.tile_x(), (size_t)b_engine.tile_y());
scene->bake_manager->set_baking(true);
/* set number of samples */
@@ -568,6 +569,8 @@ void BlenderSession::bake(BL::Object b_object, const string& pass_type, BL::Bake
session->reset(buffer_params, session_params.samples);
session->update_scene();
+ session->progress.set_update_callback(function_bind(&BlenderSession::update_bake_progress, this));
+
scene->bake_manager->bake(scene->device, &scene->dscene, scene, session->progress, shader_type, bake_data, result);
/* free all memory used (host and device), so we wouldn't leave render
@@ -639,6 +642,7 @@ void BlenderSession::synchronize()
/* on session/scene parameter changes, we recreate session entirely */
SceneParams scene_params = BlenderSync::get_scene_params(b_scene, background);
SessionParams session_params = BlenderSync::get_session_params(b_engine, b_userpref, b_scene, background);
+ bool session_pause = BlenderSync::get_session_pause(b_scene, background);
if(session->params.modified(session_params) ||
scene->params.modified(scene_params))
@@ -651,12 +655,18 @@ void BlenderSession::synchronize()
/* increase samples, but never decrease */
session->set_samples(session_params.samples);
- session->set_pause(BlenderSync::get_session_pause(b_scene, background));
+ session->set_pause(session_pause);
/* copy recalc flags, outside of mutex so we can decide to do the real
* synchronization at a later time to not block on running updates */
sync->sync_recalc();
+ /* don't do synchronization if on pause */
+ if(session_pause) {
+ tag_update();
+ return;
+ }
+
/* try to acquire mutex. if we don't want to or can't, come back later */
if(!session->ready_to_reset() || !session->scene->mutex.try_lock()) {
tag_update();
@@ -732,10 +742,12 @@ bool BlenderSession::draw(int w, int h)
if(reset) {
SessionParams session_params = BlenderSync::get_session_params(b_engine, b_userpref, b_scene, background);
BufferParams buffer_params = BlenderSync::get_buffer_params(b_render, b_scene, b_v3d, b_rv3d, scene->camera, width, height);
+ bool session_pause = BlenderSync::get_session_pause(b_scene, background);
- session->reset(buffer_params, session_params.samples);
-
- start_resize_time = 0.0;
+ if(session_pause == false) {
+ session->reset(buffer_params, session_params.samples);
+ start_resize_time = 0.0;
+ }
}
}
else {
@@ -779,6 +791,26 @@ void BlenderSession::get_progress(float& progress, double& total_time)
progress = 0.0;
}
+void BlenderSession::update_bake_progress()
+{
+ float progress;
+ int sample, samples_per_task, parts_total;
+
+ sample = session->progress.get_sample();
+ samples_per_task = scene->bake_manager->num_samples;
+ parts_total = scene->bake_manager->num_parts;
+
+ if(samples_per_task)
+ progress = ((float)sample / (float)(parts_total * samples_per_task));
+ else
+ progress = 0.0;
+
+ if(progress != last_progress) {
+ b_engine.update_progress(progress);
+ last_progress = progress;
+ }
+}
+
void BlenderSession::update_status_progress()
{
string timestatus, status, substatus;
@@ -798,7 +830,7 @@ void BlenderSession::update_status_progress()
if(background) {
if(progress>0)
- remaining_time = (1-progress) * (total_time / progress);
+ remaining_time = (1.0 - (double)progress) * (total_time / (double)progress);
scene += " | " + b_scene.name();
if(b_rlay_name != "")
@@ -817,7 +849,7 @@ void BlenderSession::update_status_progress()
timestatus += "Remaining:" + string(time_str) + " | ";
}
- timestatus += string_printf("Mem:%.2fM, Peak:%.2fM", mem_used, mem_peak);
+ timestatus += string_printf("Mem:%.2fM, Peak:%.2fM", (double)mem_used, (double)mem_peak);
if(status.size() > 0)
status = " | " + status;
diff --git a/intern/cycles/blender/blender_session.h b/intern/cycles/blender/blender_session.h
index 0e44493d674..ac685118b3d 100644
--- a/intern/cycles/blender/blender_session.h
+++ b/intern/cycles/blender/blender_session.h
@@ -52,7 +52,7 @@ public:
/* offline render */
void render();
- void bake(BL::Object b_object, const string& pass_type, BL::BakePixel pixel_array, int num_pixels, int depth, float pixels[]);
+ void bake(BL::Object b_object, const string& pass_type, BL::BakePixel pixel_array, const size_t num_pixels, const int depth, float pixels[]);
void write_render_result(BL::RenderResult b_rr, BL::RenderLayer b_rlay, RenderTile& rtile);
void write_render_tile(RenderTile& rtile);
@@ -73,6 +73,7 @@ public:
void get_progress(float& progress, double& total_time);
void test_cancel();
void update_status_progress();
+ void update_bake_progress();
bool background;
Session *session;
diff --git a/intern/cycles/blender/blender_shader.cpp b/intern/cycles/blender/blender_shader.cpp
index ddbb40da7db..27c2e9e9ae8 100644
--- a/intern/cycles/blender/blender_shader.cpp
+++ b/intern/cycles/blender/blender_shader.cpp
@@ -53,13 +53,13 @@ void BlenderSync::find_shader(BL::ID id, vector<uint>& used_shaders, int default
static BL::NodeSocket get_node_output(BL::Node b_node, const string& name)
{
BL::Node::outputs_iterator b_out;
-
+
for(b_node.outputs.begin(b_out); b_out != b_node.outputs.end(); ++b_out)
if(b_out->name() == name)
return *b_out;
-
+
assert(0);
-
+
return *b_out;
}
@@ -229,7 +229,11 @@ static ShaderNode *add_node(Scene *scene, BL::BlendData b_data, BL::Scene b_scen
BL::ShaderNodeMixRGB b_mix_node(b_node);
MixNode *mix = new MixNode();
mix->type = MixNode::type_enum[b_mix_node.blend_type()];
- mix->use_clamp = b_mix_node.use_clamp();
+ /* Tag if it's Mix */
+ if(b_mix_node.blend_type() == 0)
+ mix->special_type = SHADER_SPECIAL_TYPE_MIX_RGB;
+
+ mix->use_clamp = b_mix_node.use_clamp();
node = mix;
}
else if (b_node.is_a(&RNA_ShaderNodeSeparateRGB)) {
@@ -244,6 +248,12 @@ static ShaderNode *add_node(Scene *scene, BL::BlendData b_data, BL::Scene b_scen
else if (b_node.is_a(&RNA_ShaderNodeCombineHSV)) {
node = new CombineHSVNode();
}
+ else if (b_node.is_a(&RNA_ShaderNodeSeparateXYZ)) {
+ node = new SeparateXYZNode();
+ }
+ else if (b_node.is_a(&RNA_ShaderNodeCombineXYZ)) {
+ node = new CombineXYZNode();
+ }
else if (b_node.is_a(&RNA_ShaderNodeHueSaturation)) {
node = new HSVNode();
}
@@ -254,7 +264,7 @@ static ShaderNode *add_node(Scene *scene, BL::BlendData b_data, BL::Scene b_scen
BL::ShaderNodeMath b_math_node(b_node);
MathNode *math = new MathNode();
math->type = MathNode::type_enum[b_math_node.operation()];
- math->use_clamp = b_math_node.use_clamp();
+ math->use_clamp = b_math_node.use_clamp();
node = math;
}
else if (b_node.is_a(&RNA_ShaderNodeVectorMath)) {
@@ -274,7 +284,7 @@ static ShaderNode *add_node(Scene *scene, BL::BlendData b_data, BL::Scene b_scen
else if (b_node.is_a(&RNA_ShaderNodeNormal)) {
BL::Node::outputs_iterator out_it;
b_node.outputs.begin(out_it);
-
+
NormalNode *norm = new NormalNode();
norm->direction = get_node_output_vector(b_node, "Normal");
node = norm;
@@ -282,9 +292,9 @@ static ShaderNode *add_node(Scene *scene, BL::BlendData b_data, BL::Scene b_scen
else if (b_node.is_a(&RNA_ShaderNodeMapping)) {
BL::ShaderNodeMapping b_mapping_node(b_node);
MappingNode *mapping = new MappingNode();
-
+
get_tex_mapping(&mapping->tex_mapping, b_mapping_node);
-
+
node = mapping;
}
else if (b_node.is_a(&RNA_ShaderNodeFresnel)) {
@@ -312,7 +322,23 @@ static ShaderNode *add_node(Scene *scene, BL::BlendData b_data, BL::Scene b_scen
node = new HoldoutNode();
}
else if (b_node.is_a(&RNA_ShaderNodeBsdfAnisotropic)) {
- node = new WardBsdfNode();
+ BL::ShaderNodeBsdfAnisotropic b_aniso_node(b_node);
+ AnisotropicBsdfNode *aniso = new AnisotropicBsdfNode();
+
+ switch (b_aniso_node.distribution())
+ {
+ case BL::ShaderNodeBsdfAnisotropic::distribution_BECKMANN:
+ aniso->distribution = ustring("Beckmann");
+ break;
+ case BL::ShaderNodeBsdfAnisotropic::distribution_GGX:
+ aniso->distribution = ustring("GGX");
+ break;
+ case BL::ShaderNodeBsdfAnisotropic::distribution_ASHIKHMIN_SHIRLEY:
+ aniso->distribution = ustring("Ashikhmin-Shirley");
+ break;
+ }
+
+ node = aniso;
}
else if (b_node.is_a(&RNA_ShaderNodeBsdfDiffuse)) {
node = new DiffuseBsdfNode();
@@ -347,6 +373,9 @@ static ShaderNode *add_node(Scene *scene, BL::BlendData b_data, BL::Scene b_scen
case BL::ShaderNodeBsdfGlossy::distribution_GGX:
glossy->distribution = ustring("GGX");
break;
+ case BL::ShaderNodeBsdfGlossy::distribution_ASHIKHMIN_SHIRLEY:
+ glossy->distribution = ustring("Ashikhmin-Shirley");
+ break;
}
node = glossy;
}
@@ -471,7 +500,7 @@ static ShaderNode *add_node(Scene *scene, BL::BlendData b_data, BL::Scene b_scen
/* create script node */
BL::ShaderNodeScript b_script_node(b_node);
OSLScriptNode *script_node = new OSLScriptNode();
-
+
/* Generate inputs/outputs from node sockets
*
* Note: the node sockets are generated from OSL parameters,
@@ -480,38 +509,38 @@ static ShaderNode *add_node(Scene *scene, BL::BlendData b_data, BL::Scene b_scen
* Note 2: ShaderInput/ShaderOutput store shallow string copies only!
* Socket names must be stored in the extra lists instead. */
BL::Node::inputs_iterator b_input;
-
+
for (b_script_node.inputs.begin(b_input); b_input != b_script_node.inputs.end(); ++b_input) {
script_node->input_names.push_back(ustring(b_input->name()));
ShaderInput *input = script_node->add_input(script_node->input_names.back().c_str(),
convert_socket_type(*b_input));
set_default_value(input, b_node, *b_input, b_data, b_ntree);
}
-
+
BL::Node::outputs_iterator b_output;
-
+
for (b_script_node.outputs.begin(b_output); b_output != b_script_node.outputs.end(); ++b_output) {
script_node->output_names.push_back(ustring(b_output->name()));
script_node->add_output(script_node->output_names.back().c_str(),
convert_socket_type(*b_output));
}
-
+
/* load bytecode or filepath */
OSLShaderManager *manager = (OSLShaderManager*)scene->shader_manager;
string bytecode_hash = b_script_node.bytecode_hash();
-
+
if(!bytecode_hash.empty()) {
/* load bytecode if not already done */
if(!manager->shader_test_loaded(bytecode_hash))
manager->shader_load_bytecode(bytecode_hash, b_script_node.bytecode());
-
+
script_node->bytecode_hash = bytecode_hash;
}
else {
/* set filepath */
script_node->filepath = blender_absolute_path(b_data, b_ntree, b_script_node.filepath());
}
-
+
node = script_node;
}
#endif
@@ -547,6 +576,13 @@ static ShaderNode *add_node(Scene *scene, BL::BlendData b_data, BL::Scene b_scen
image->animated = b_image_node.image_user().use_auto_refresh();
image->use_alpha = b_image.use_alpha();
+
+ /* TODO(sergey): Does not work properly when we change builtin type. */
+ if (b_image.is_updated()) {
+ scene->image_manager->tag_reload_image(image->filename,
+ image->builtin_data,
+ (InterpolationType)b_image_node.interpolation());
+ }
}
image->color_space = ImageTextureNode::color_space_enum[(int)b_image_node.color_space()];
image->projection = ImageTextureNode::projection_enum[(int)b_image_node.projection()];
@@ -577,6 +613,13 @@ static ShaderNode *add_node(Scene *scene, BL::BlendData b_data, BL::Scene b_scen
}
env->use_alpha = b_image.use_alpha();
+
+ /* TODO(sergey): Does not work properly when we change builtin type. */
+ if (b_image.is_updated()) {
+ scene->image_manager->tag_reload_image(env->filename,
+ env->builtin_data,
+ INTERPOLATION_LINEAR);
+ }
}
env->color_space = EnvironmentTextureNode::color_space_enum[(int)b_env_node.color_space()];
env->projection = EnvironmentTextureNode::projection_enum[(int)b_env_node.projection()];
@@ -689,7 +732,7 @@ static bool node_use_modified_socket_name(ShaderNode *node)
{
if (node->special_type == SHADER_SPECIAL_TYPE_SCRIPT)
return false;
-
+
return true;
}
@@ -701,57 +744,57 @@ static ShaderInput *node_find_input_by_name(ShaderNode *node, BL::Node b_node, B
BL::Node::inputs_iterator b_input;
bool found = false;
int counter = 0, total = 0;
-
+
for (b_node.inputs.begin(b_input); b_input != b_node.inputs.end(); ++b_input) {
if (b_input->name() == name) {
if (!found)
counter++;
total++;
}
-
+
if(b_input->ptr.data == b_socket.ptr.data)
found = true;
}
-
+
/* rename if needed */
if (name == "Shader")
name = "Closure";
-
+
if (total > 1)
name = string_printf("%s%d", name.c_str(), counter);
}
-
+
return node->input(name.c_str());
}
static ShaderOutput *node_find_output_by_name(ShaderNode *node, BL::Node b_node, BL::NodeSocket b_socket)
{
string name = b_socket.name();
-
+
if (node_use_modified_socket_name(node)) {
BL::Node::outputs_iterator b_output;
bool found = false;
int counter = 0, total = 0;
-
+
for (b_node.outputs.begin(b_output); b_output != b_node.outputs.end(); ++b_output) {
if (b_output->name() == name) {
if (!found)
counter++;
total++;
}
-
+
if(b_output->ptr.data == b_socket.ptr.data)
found = true;
}
-
+
/* rename if needed */
if (name == "Shader")
name = "Closure";
-
+
if (total > 1)
name = string_printf("%s%d", name.c_str(), counter);
}
-
+
return node->output(name.c_str());
}
@@ -762,7 +805,7 @@ static void add_nodes(Scene *scene, BL::BlendData b_data, BL::Scene b_scene, Sha
BL::ShaderNodeTree::nodes_iterator b_node;
PtrInputMap input_map;
PtrOutputMap output_map;
-
+
BL::Node::inputs_iterator b_input;
BL::Node::outputs_iterator b_output;
@@ -792,10 +835,10 @@ static void add_nodes(Scene *scene, BL::BlendData b_data, BL::Scene b_scene, Sha
BL::Node::internal_links_iterator b_link;
for (b_node->internal_links.begin(b_link); b_link != b_node->internal_links.end(); ++b_link) {
ProxyNode *proxy = new ProxyNode(convert_socket_type(b_link->to_socket()));
-
+
input_map[b_link->from_socket().ptr.data] = proxy->inputs[0];
output_map[b_link->to_socket().ptr.data] = proxy->outputs[0];
-
+
graph->add(proxy);
}
}
@@ -807,7 +850,7 @@ static void add_nodes(Scene *scene, BL::BlendData b_data, BL::Scene b_scene, Sha
else
b_group_ntree = BL::ShaderNodeTree(((BL::NodeCustomGroup)(*b_node)).node_tree());
ProxyMap group_proxy_input_map, group_proxy_output_map;
-
+
/* Add a proxy node for each socket
* Do this even if the node group has no internal tree,
* so that links have something to connect to and assert won't fail.
@@ -815,21 +858,21 @@ static void add_nodes(Scene *scene, BL::BlendData b_data, BL::Scene b_scene, Sha
for(b_node->inputs.begin(b_input); b_input != b_node->inputs.end(); ++b_input) {
ProxyNode *proxy = new ProxyNode(convert_socket_type(*b_input));
graph->add(proxy);
-
+
/* register the proxy node for internal binding */
group_proxy_input_map[b_input->identifier()] = proxy;
-
+
input_map[b_input->ptr.data] = proxy->inputs[0];
-
+
set_default_value(proxy->inputs[0], *b_node, *b_input, b_data, b_ntree);
}
for(b_node->outputs.begin(b_output); b_output != b_node->outputs.end(); ++b_output) {
ProxyNode *proxy = new ProxyNode(convert_socket_type(*b_output));
graph->add(proxy);
-
+
/* register the proxy node for internal binding */
group_proxy_output_map[b_output->identifier()] = proxy;
-
+
output_map[b_output->ptr.data] = proxy->outputs[0];
}
@@ -842,7 +885,7 @@ static void add_nodes(Scene *scene, BL::BlendData b_data, BL::Scene b_scene, Sha
ProxyMap::const_iterator proxy_it = proxy_input_map.find(b_output->identifier());
if (proxy_it != proxy_input_map.end()) {
ProxyNode *proxy = proxy_it->second;
-
+
output_map[b_output->ptr.data] = proxy->outputs[0];
}
}
@@ -856,9 +899,9 @@ static void add_nodes(Scene *scene, BL::BlendData b_data, BL::Scene b_scene, Sha
ProxyMap::const_iterator proxy_it = proxy_output_map.find(b_input->identifier());
if (proxy_it != proxy_output_map.end()) {
ProxyNode *proxy = proxy_it->second;
-
+
input_map[b_input->ptr.data] = proxy->inputs[0];
-
+
set_default_value(proxy->inputs[0], *b_node, *b_input, b_data, b_ntree);
}
}
@@ -875,17 +918,25 @@ static void add_nodes(Scene *scene, BL::BlendData b_data, BL::Scene b_scene, Sha
else {
node = add_node(scene, b_data, b_scene, graph, b_ntree, BL::ShaderNode(*b_node));
}
-
+
if(node) {
/* map node sockets for linking */
for(b_node->inputs.begin(b_input); b_input != b_node->inputs.end(); ++b_input) {
ShaderInput *input = node_find_input_by_name(node, *b_node, *b_input);
+ if (!input) {
+ /* XXX should not happen, report error? */
+ continue;
+ }
input_map[b_input->ptr.data] = input;
-
+
set_default_value(input, *b_node, *b_input, b_data, b_ntree);
}
for(b_node->outputs.begin(b_output); b_output != b_node->outputs.end(); ++b_output) {
ShaderOutput *output = node_find_output_by_name(node, *b_node, *b_output);
+ if (!output) {
+ /* XXX should not happen, report error? */
+ continue;
+ }
output_map[b_output->ptr.data] = output;
}
}
@@ -902,7 +953,7 @@ static void add_nodes(Scene *scene, BL::BlendData b_data, BL::Scene b_scene, Sha
ShaderOutput *output = 0;
ShaderInput *input = 0;
-
+
PtrOutputMap::iterator output_it = output_map.find(b_from_sock.ptr.data);
if (output_it != output_map.end())
output = output_it->second;
@@ -934,7 +985,7 @@ void BlenderSync::sync_materials(bool update_all)
for(b_data.materials.begin(b_mat); b_mat != b_data.materials.end(); ++b_mat) {
Shader *shader;
-
+
/* test if we need to sync */
if(shader_map.sync(&shader, *b_mat) || update_all) {
ShaderGraph *graph = new ShaderGraph();
@@ -963,6 +1014,8 @@ void BlenderSync::sync_materials(bool update_all)
shader->use_mis = get_boolean(cmat, "sample_as_light");
shader->use_transparent_shadow = get_boolean(cmat, "use_transparent_shadow");
shader->heterogeneous_volume = !get_boolean(cmat, "homogeneous_volume");
+ shader->volume_sampling_method = (VolumeSampling)RNA_enum_get(&cmat, "volume_sampling");
+ shader->volume_interpolation_method = (VolumeInterpolation)RNA_enum_get(&cmat, "volume_interpolation");
shader->set_graph(graph);
shader->tag_update(scene);
@@ -988,10 +1041,12 @@ void BlenderSync::sync_world(bool update_all)
BL::ShaderNodeTree b_ntree(b_world.node_tree());
add_nodes(scene, b_data, b_scene, graph, b_ntree);
-
+
/* volume */
PointerRNA cworld = RNA_pointer_get(&b_world.ptr, "cycles");
shader->heterogeneous_volume = !get_boolean(cworld, "homogeneous_volume");
+ shader->volume_sampling_method = (VolumeSampling)RNA_enum_get(&cworld, "volume_sampling");
+ shader->volume_interpolation_method = (VolumeInterpolation)RNA_enum_get(&cworld, "volume_interpolation");
}
else if(b_world) {
ShaderNode *closure, *out;
@@ -1022,6 +1077,7 @@ void BlenderSync::sync_world(bool update_all)
visibility |= get_boolean(cvisibility, "diffuse")? PATH_RAY_DIFFUSE: 0;
visibility |= get_boolean(cvisibility, "glossy")? PATH_RAY_GLOSSY: 0;
visibility |= get_boolean(cvisibility, "transmission")? PATH_RAY_TRANSMIT: 0;
+ visibility |= get_boolean(cvisibility, "scatter")? PATH_RAY_VOLUME_SCATTER: 0;
background->visibility = visibility;
}
@@ -1059,7 +1115,7 @@ void BlenderSync::sync_lamps(bool update_all)
for(b_data.lamps.begin(b_lamp); b_lamp != b_data.lamps.end(); ++b_lamp) {
Shader *shader;
-
+
/* test if we need to sync */
if(shader_map.sync(&shader, *b_lamp) || update_all) {
ShaderGraph *graph = new ShaderGraph();
diff --git a/intern/cycles/blender/blender_sync.cpp b/intern/cycles/blender/blender_sync.cpp
index 1f5e32a1123..2ac90b34fd7 100644
--- a/intern/cycles/blender/blender_sync.cpp
+++ b/intern/cycles/blender/blender_sync.cpp
@@ -142,8 +142,13 @@ void BlenderSync::sync_data(BL::SpaceView3D b_v3d, BL::Object b_override, void *
sync_film();
sync_shaders();
sync_curve_settings();
+
+ mesh_synced.clear(); /* use for objects and motion sync */
+
sync_objects(b_v3d);
sync_motion(b_v3d, b_override, python_thread_state);
+
+ mesh_synced.clear();
}
/* Integrator */
@@ -172,14 +177,15 @@ void BlenderSync::sync_integrator()
integrator->transparent_min_bounce = get_int(cscene, "transparent_min_bounces");
integrator->transparent_shadows = get_boolean(cscene, "use_transparent_shadows");
- integrator->volume_homogeneous_sampling = RNA_enum_get(&cscene, "volume_homogeneous_sampling");
integrator->volume_max_steps = get_int(cscene, "volume_max_steps");
integrator->volume_step_size = get_float(cscene, "volume_step_size");
- integrator->no_caustics = get_boolean(cscene, "no_caustics");
+ integrator->caustics_reflective = get_boolean(cscene, "caustics_reflective");
+ integrator->caustics_refractive = get_boolean(cscene, "caustics_refractive");
integrator->filter_glossy = get_float(cscene, "blur_glossy");
integrator->seed = get_int(cscene, "seed");
+ integrator->sampling_pattern = (SamplingPattern)RNA_enum_get(&cscene, "sampling_pattern");
integrator->layer_flag = render_layer.layer;
@@ -227,10 +233,6 @@ void BlenderSync::sync_integrator()
integrator->subsurface_samples = subsurface_samples;
integrator->volume_samples = volume_samples;
}
-
-
- if(experimental)
- integrator->sampling_pattern = (SamplingPattern)RNA_enum_get(&cscene, "sampling_pattern");
if(integrator->modified(previntegrator))
integrator->tag_update(scene);
@@ -312,6 +314,8 @@ void BlenderSync::sync_render_layers(BL::SpaceView3D b_v3d, const char *layer)
BL::RenderSettings::layers_iterator b_rlay;
int use_layer_samples = RNA_enum_get(&cscene, "use_layer_samples");
bool first_layer = true;
+ uint layer_override = get_layer(b_engine.layer_override());
+ uint scene_layers = layer_override ? layer_override : get_layer(b_scene.layers());
for(r.layers.begin(b_rlay); b_rlay != r.layers.end(); ++b_rlay) {
if((!layer && first_layer) || (layer && b_rlay->name() == layer)) {
@@ -320,7 +324,7 @@ void BlenderSync::sync_render_layers(BL::SpaceView3D b_v3d, const char *layer)
render_layer.holdout_layer = get_layer(b_rlay->layers_zmask());
render_layer.exclude_layer = get_layer(b_rlay->layers_exclude());
- render_layer.scene_layer = get_layer(b_scene.layers()) & ~render_layer.exclude_layer;
+ render_layer.scene_layer = scene_layers & ~render_layer.exclude_layer;
render_layer.scene_layer |= render_layer.exclude_layer & render_layer.holdout_layer;
render_layer.layer = get_layer(b_rlay->layers());
@@ -357,9 +361,9 @@ SceneParams BlenderSync::get_scene_params(BL::Scene b_scene, bool background)
const bool shadingsystem = RNA_boolean_get(&cscene, "shading_system");
if(shadingsystem == 0)
- params.shadingsystem = SceneParams::SVM;
+ params.shadingsystem = SHADINGSYSTEM_SVM;
else if(shadingsystem == 1)
- params.shadingsystem = SceneParams::OSL;
+ params.shadingsystem = SHADINGSYSTEM_OSL;
if(background)
params.bvh_type = SceneParams::BVH_STATIC;
@@ -369,7 +373,7 @@ SceneParams BlenderSync::get_scene_params(BL::Scene b_scene, bool background)
params.use_bvh_spatial_split = RNA_boolean_get(&cscene, "debug_use_spatial_splits");
params.use_bvh_cache = (background)? RNA_boolean_get(&cscene, "use_cache"): false;
- if(background && params.shadingsystem != SceneParams::OSL)
+ if(background && params.shadingsystem != SHADINGSYSTEM_OSL)
params.persistent_data = r.use_persistent_data();
else
params.persistent_data = false;
@@ -506,9 +510,9 @@ SessionParams BlenderSync::get_session_params(BL::RenderEngine b_engine, BL::Use
const bool shadingsystem = RNA_boolean_get(&cscene, "shading_system");
if(shadingsystem == 0)
- params.shadingsystem = SessionParams::SVM;
+ params.shadingsystem = SHADINGSYSTEM_SVM;
else if(shadingsystem == 1)
- params.shadingsystem = SessionParams::OSL;
+ params.shadingsystem = SHADINGSYSTEM_OSL;
/* color management */
params.display_buffer_linear = GLEW_ARB_half_float_pixel && b_engine.support_display_space_shader(b_scene);
diff --git a/intern/cycles/bvh/bvh.cpp b/intern/cycles/bvh/bvh.cpp
index 3c0c5c021c8..15bd814b8d5 100644
--- a/intern/cycles/bvh/bvh.cpp
+++ b/intern/cycles/bvh/bvh.cpp
@@ -103,18 +103,30 @@ bool BVH::cache_read(CacheData& key)
if(Cache::global.lookup(key, value)) {
cache_filename = key.get_filename();
- value.read(pack.root_index);
- value.read(pack.SAH);
-
- value.read(pack.nodes);
- value.read(pack.object_node);
- value.read(pack.tri_woop);
- value.read(pack.prim_type);
- value.read(pack.prim_visibility);
- value.read(pack.prim_index);
- value.read(pack.prim_object);
- value.read(pack.is_leaf);
-
+ if(!(value.read(pack.root_index) &&
+ value.read(pack.SAH) &&
+ value.read(pack.nodes) &&
+ value.read(pack.object_node) &&
+ value.read(pack.tri_woop) &&
+ value.read(pack.prim_type) &&
+ value.read(pack.prim_visibility) &&
+ value.read(pack.prim_index) &&
+ value.read(pack.prim_object) &&
+ value.read(pack.is_leaf)))
+ {
+ /* Clear the pack if load failed. */
+ pack.root_index = 0;
+ pack.SAH = 0.0f;
+ pack.nodes.clear();
+ pack.object_node.clear();
+ pack.tri_woop.clear();
+ pack.prim_type.clear();
+ pack.prim_visibility.clear();
+ pack.prim_index.clear();
+ pack.prim_object.clear();
+ pack.is_leaf.clear();
+ return false;
+ }
return true;
}
diff --git a/intern/cycles/bvh/bvh_params.h b/intern/cycles/bvh/bvh_params.h
index ed67690a07f..e073b69472e 100644
--- a/intern/cycles/bvh/bvh_params.h
+++ b/intern/cycles/bvh/bvh_params.h
@@ -123,7 +123,7 @@ protected:
/* BVH Range
*
* Build range used during construction, to indicate the bounds and place in
- * the reference array of a subset of pirmitives Again uses trickery to pack
+ * the reference array of a subset of primitives. Again uses trickery to pack
* integers into BoundBox for alignment purposes. */
class BVHRange
diff --git a/intern/cycles/cmake/external_libs.cmake b/intern/cycles/cmake/external_libs.cmake
index 8753ff4bf84..4f02b93f04a 100644
--- a/intern/cycles/cmake/external_libs.cmake
+++ b/intern/cycles/cmake/external_libs.cmake
@@ -1,4 +1,3 @@
-
###########################################################################
# GLUT
@@ -8,13 +7,17 @@ if(WITH_CYCLES_STANDALONE AND WITH_CYCLES_STANDALONE_GUI)
find_package(GLUT)
message(STATUS "GLUT_FOUND=${GLUT_FOUND}")
- include_directories(${GLUT_INCLUDE_DIR})
+ include_directories(
+ SYSTEM
+ ${GLUT_INCLUDE_DIR}
+ )
endif()
-if(WITH_SYSTEM_GLEW)
- set(CYCLES_GLEW_LIBRARY ${GLEW_LIBRARY})
-else()
- set(CYCLES_GLEW_LIBRARY extern_glew)
+###########################################################################
+# GLEW
+
+if(WITH_CYCLES_STANDALONE AND WITH_CYCLES_STANDALONE_GUI)
+ set(CYCLES_APP_GLEW_LIBRARY ${BLENDER_GLEW_LIBRARIES})
endif()
###########################################################################
@@ -29,4 +32,3 @@ if(WITH_CYCLES_CUDA_BINARIES)
set(WITH_CYCLES_CUDA_BINARIES OFF)
endif()
endif()
-
diff --git a/intern/cycles/device/CMakeLists.txt b/intern/cycles/device/CMakeLists.txt
index ae3309df3d9..998b35351e3 100644
--- a/intern/cycles/device/CMakeLists.txt
+++ b/intern/cycles/device/CMakeLists.txt
@@ -6,11 +6,13 @@ set(INC
../kernel/osl
../util
../render
+ ../../glew-mx
)
set(INC_SYS
- ${OPENGL_INCLUDE_DIR}
${GLEW_INCLUDE_PATH}
+ ../../../extern/cuew/include
+ ../../../extern/clew/include
)
set(SRC
@@ -36,7 +38,7 @@ set(SRC_HEADERS
device_task.h
)
-add_definitions(-DGLEW_STATIC)
+add_definitions(${GL_DEFINITIONS})
include_directories(${INC})
include_directories(SYSTEM ${INC_SYS})
diff --git a/intern/cycles/device/device.cpp b/intern/cycles/device/device.cpp
index 7fd1b79f6bc..efdfa98cfb5 100644
--- a/intern/cycles/device/device.cpp
+++ b/intern/cycles/device/device.cpp
@@ -20,12 +20,13 @@
#include "device.h"
#include "device_intern.h"
-#include "util_cuda.h"
+#include "cuew.h"
+#include "clew.h"
+
#include "util_debug.h"
#include "util_foreach.h"
#include "util_half.h"
#include "util_math.h"
-#include "util_opencl.h"
#include "util_opengl.h"
#include "util_time.h"
#include "util_types.h"
@@ -66,7 +67,7 @@ void Device::draw_pixels(device_memory& rgba, int y, int w, int h, int dy, int w
glColor3f(1.0f, 1.0f, 1.0f);
if(rgba.data_type == TYPE_HALF) {
- /* for multi devices, this assumes the ineffecient method that we allocate
+ /* for multi devices, this assumes the inefficient method that we allocate
* all pixels on the device even though we only render to a subset */
GLhalf *data_pointer = (GLhalf*)rgba.data_pointer;
data_pointer += 4*y*w;
@@ -141,7 +142,7 @@ Device *Device::create(DeviceInfo& info, Stats &stats, bool background)
break;
#ifdef WITH_CUDA
case DEVICE_CUDA:
- if(cuLibraryInit())
+ if(device_cuda_init())
device = device_cuda_create(info, stats, background);
else
device = NULL;
@@ -159,7 +160,7 @@ Device *Device::create(DeviceInfo& info, Stats &stats, bool background)
#endif
#ifdef WITH_OPENCL
case DEVICE_OPENCL:
- if(clLibraryInit())
+ if(device_opencl_init())
device = device_opencl_create(info, stats, background);
else
device = NULL;
@@ -213,12 +214,12 @@ vector<DeviceType>& Device::available_types()
types.push_back(DEVICE_CPU);
#ifdef WITH_CUDA
- if(cuLibraryInit())
+ if(device_cuda_init())
types.push_back(DEVICE_CUDA);
#endif
#ifdef WITH_OPENCL
- if(clLibraryInit())
+ if(device_opencl_init())
types.push_back(DEVICE_OPENCL);
#endif
@@ -242,12 +243,12 @@ vector<DeviceInfo>& Device::available_devices()
if(!devices_init) {
#ifdef WITH_CUDA
- if(cuLibraryInit())
+ if(device_cuda_init())
device_cuda_info(devices);
#endif
#ifdef WITH_OPENCL
- if(clLibraryInit())
+ if(device_opencl_init())
device_opencl_info(devices);
#endif
diff --git a/intern/cycles/device/device.h b/intern/cycles/device/device.h
index bcddd4f73e2..20ebfd391d6 100644
--- a/intern/cycles/device/device.h
+++ b/intern/cycles/device/device.h
@@ -122,6 +122,7 @@ public:
virtual bool load_kernels(bool experimental) { return true; }
/* tasks */
+ virtual int get_split_task_count(DeviceTask& task) = 0;
virtual void task_add(DeviceTask& task) = 0;
virtual void task_wait() = 0;
virtual void task_cancel() = 0;
diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp
index c9cc7592028..c9b8a5b726b 100644
--- a/intern/cycles/device/device_cpu.cpp
+++ b/intern/cycles/device/device_cpu.cpp
@@ -17,6 +17,11 @@
#include <stdlib.h>
#include <string.h>
+/* So ImathMath is included before our kernel_cpu_compat. */
+#ifdef WITH_OSL
+# include <OSL/oslexec.h>
+#endif
+
#include "device.h"
#include "device_intern.h"
@@ -62,6 +67,7 @@ public:
system_cpu_support_sse3();
system_cpu_support_sse41();
system_cpu_support_avx();
+ system_cpu_support_avx2();
}
~CPUDevice()
@@ -72,8 +78,8 @@ public:
void mem_alloc(device_memory& mem, MemoryType type)
{
mem.device_pointer = mem.data_pointer;
-
- stats.mem_alloc(mem.memory_size());
+ mem.device_size = mem.memory_size();
+ stats.mem_alloc(mem.device_size);
}
void mem_copy_to(device_memory& mem)
@@ -93,9 +99,11 @@ public:
void mem_free(device_memory& mem)
{
- mem.device_pointer = 0;
-
- stats.mem_free(mem.memory_size());
+ if(mem.device_pointer) {
+ mem.device_pointer = 0;
+ stats.mem_free(mem.device_size);
+ mem.device_size = 0;
+ }
}
void const_copy_to(const char *name, void *host, size_t size)
@@ -107,15 +115,17 @@ public:
{
kernel_tex_copy(&kernel_globals, name, mem.data_pointer, mem.data_width, mem.data_height, mem.data_depth, interpolation);
mem.device_pointer = mem.data_pointer;
-
- stats.mem_alloc(mem.memory_size());
+ mem.device_size = mem.memory_size();
+ stats.mem_alloc(mem.device_size);
}
void tex_free(device_memory& mem)
{
- mem.device_pointer = 0;
-
- stats.mem_free(mem.memory_size());
+ if(mem.device_pointer) {
+ mem.device_pointer = 0;
+ stats.mem_free(mem.device_size);
+ mem.device_size = 0;
+ }
}
void *osl_memory()
@@ -167,6 +177,28 @@ public:
int start_sample = tile.start_sample;
int end_sample = tile.start_sample + tile.num_samples;
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
+ if(system_cpu_support_avx2()) {
+ for(int sample = start_sample; sample < end_sample; sample++) {
+ if (task.get_cancel() || task_pool.canceled()) {
+ if(task.need_finish_queue == false)
+ break;
+ }
+
+ for(int y = tile.y; y < tile.y + tile.h; y++) {
+ for(int x = tile.x; x < tile.x + tile.w; x++) {
+ kernel_cpu_avx2_path_trace(&kg, render_buffer, rng_state,
+ sample, x, y, tile.offset, tile.stride);
+ }
+ }
+
+ tile.sample = sample + 1;
+
+ task.update_progress(&tile);
+ }
+ }
+ else
+#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
if(system_cpu_support_avx()) {
for(int sample = start_sample; sample < end_sample; sample++) {
@@ -184,7 +216,7 @@ public:
tile.sample = sample + 1;
- task.update_progress(tile);
+ task.update_progress(&tile);
}
}
else
@@ -206,7 +238,7 @@ public:
tile.sample = sample + 1;
- task.update_progress(tile);
+ task.update_progress(&tile);
}
}
else
@@ -228,7 +260,7 @@ public:
tile.sample = sample + 1;
- task.update_progress(tile);
+ task.update_progress(&tile);
}
}
else
@@ -250,7 +282,7 @@ public:
tile.sample = sample + 1;
- task.update_progress(tile);
+ task.update_progress(&tile);
}
}
else
@@ -271,7 +303,7 @@ public:
tile.sample = sample + 1;
- task.update_progress(tile);
+ task.update_progress(&tile);
}
}
@@ -293,6 +325,15 @@ public:
float sample_scale = 1.0f/(task.sample + 1);
if(task.rgba_half) {
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
+ if(system_cpu_support_avx2()) {
+ for(int y = task.y; y < task.y + task.h; y++)
+ for(int x = task.x; x < task.x + task.w; x++)
+ kernel_cpu_avx2_convert_to_half_float(&kernel_globals, (uchar4*)task.rgba_half, (float*)task.buffer,
+ sample_scale, x, y, task.offset, task.stride);
+ }
+ else
+#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
if(system_cpu_support_avx()) {
for(int y = task.y; y < task.y + task.h; y++)
@@ -337,6 +378,15 @@ public:
}
}
else {
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
+ if(system_cpu_support_avx2()) {
+ for(int y = task.y; y < task.y + task.h; y++)
+ for(int x = task.x; x < task.x + task.w; x++)
+ kernel_cpu_avx2_convert_to_byte(&kernel_globals, (uchar4*)task.rgba_byte, (float*)task.buffer,
+ sample_scale, x, y, task.offset, task.stride);
+ }
+ else
+#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
if(system_cpu_support_avx()) {
for(int y = task.y; y < task.y + task.h; y++)
@@ -390,56 +440,91 @@ public:
OSLShader::thread_init(&kg, &kernel_globals, &osl_globals);
#endif
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
+ if(system_cpu_support_avx2()) {
+ for(int sample = 0; sample < task.num_samples; sample++) {
+ for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++)
+ kernel_cpu_avx2_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output,
+ task.shader_eval_type, x, task.offset, sample);
+
+ if(task.get_cancel() || task_pool.canceled())
+ break;
+
+ task.update_progress(NULL);
+ }
+ }
+ else
+#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
if(system_cpu_support_avx()) {
- for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) {
- kernel_cpu_avx_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x);
+ for(int sample = 0; sample < task.num_samples; sample++) {
+ for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++)
+ kernel_cpu_avx_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output,
+ task.shader_eval_type, x, task.offset, sample);
if(task.get_cancel() || task_pool.canceled())
break;
+
+ task.update_progress(NULL);
}
}
else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
if(system_cpu_support_sse41()) {
- for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) {
- kernel_cpu_sse41_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x);
+ for(int sample = 0; sample < task.num_samples; sample++) {
+ for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++)
+ kernel_cpu_sse41_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output,
+ task.shader_eval_type, x, task.offset, sample);
if(task.get_cancel() || task_pool.canceled())
break;
+
+ task.update_progress(NULL);
}
}
else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
if(system_cpu_support_sse3()) {
- for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) {
- kernel_cpu_sse3_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x);
+ for(int sample = 0; sample < task.num_samples; sample++) {
+ for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++)
+ kernel_cpu_sse3_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output,
+ task.shader_eval_type, x, task.offset, sample);
if(task.get_cancel() || task_pool.canceled())
break;
+
+ task.update_progress(NULL);
}
}
else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
if(system_cpu_support_sse2()) {
- for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) {
- kernel_cpu_sse2_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x);
+ for(int sample = 0; sample < task.num_samples; sample++) {
+ for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++)
+ kernel_cpu_sse2_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output,
+ task.shader_eval_type, x, task.offset, sample);
if(task.get_cancel() || task_pool.canceled())
break;
+
+ task.update_progress(NULL);
}
}
else
#endif
{
- for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) {
- kernel_cpu_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x);
+ for(int sample = 0; sample < task.num_samples; sample++) {
+ for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++)
+ kernel_cpu_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output,
+ task.shader_eval_type, x, task.offset, sample);
if(task.get_cancel() || task_pool.canceled())
break;
+
+ task.update_progress(NULL);
}
}
@@ -448,11 +533,23 @@ public:
#endif
}
+ int get_split_task_count(DeviceTask& task)
+ {
+ if (task.type == DeviceTask::SHADER)
+ return task.get_subtask_count(TaskScheduler::num_threads(), 256);
+ else
+ return task.get_subtask_count(TaskScheduler::num_threads());
+ }
+
void task_add(DeviceTask& task)
{
/* split task into smaller ones */
list<DeviceTask> tasks;
- task.split(tasks, TaskScheduler::num_threads());
+
+ if(task.type == DeviceTask::SHADER)
+ task.split(tasks, TaskScheduler::num_threads(), 256);
+ else
+ task.split(tasks, TaskScheduler::num_threads());
foreach(DeviceTask& task, tasks)
task_pool.push(new CPUDeviceTask(this, task));
diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp
index 68955211146..844fb3b8d50 100644
--- a/intern/cycles/device/device_cuda.cpp
+++ b/intern/cycles/device/device_cuda.cpp
@@ -23,7 +23,7 @@
#include "buffers.h"
-#include "util_cuda.h"
+#include "cuew.h"
#include "util_debug.h"
#include "util_map.h"
#include "util_opengl.h"
@@ -41,14 +41,11 @@ public:
CUdevice cuDevice;
CUcontext cuContext;
CUmodule cuModule;
- CUstream cuStream;
- CUevent tileDone;
map<device_ptr, bool> tex_interp_map;
int cuDevId;
int cuDevArchitecture;
bool first_error;
bool use_texture_storage;
- unsigned int target_update_frequency;
struct PixelMem {
GLuint cuPBO;
@@ -64,53 +61,10 @@ public:
return (CUdeviceptr)mem;
}
- static const char *cuda_error_string(CUresult result)
+ static bool have_precompiled_kernels()
{
- switch(result) {
- case CUDA_SUCCESS: return "No errors";
- case CUDA_ERROR_INVALID_VALUE: return "Invalid value";
- case CUDA_ERROR_OUT_OF_MEMORY: return "Out of memory";
- case CUDA_ERROR_NOT_INITIALIZED: return "Driver not initialized";
- case CUDA_ERROR_DEINITIALIZED: return "Driver deinitialized";
-
- case CUDA_ERROR_NO_DEVICE: return "No CUDA-capable device available";
- case CUDA_ERROR_INVALID_DEVICE: return "Invalid device";
-
- case CUDA_ERROR_INVALID_IMAGE: return "Invalid kernel image";
- case CUDA_ERROR_INVALID_CONTEXT: return "Invalid context";
- case CUDA_ERROR_CONTEXT_ALREADY_CURRENT: return "Context already current";
- case CUDA_ERROR_MAP_FAILED: return "Map failed";
- case CUDA_ERROR_UNMAP_FAILED: return "Unmap failed";
- case CUDA_ERROR_ARRAY_IS_MAPPED: return "Array is mapped";
- case CUDA_ERROR_ALREADY_MAPPED: return "Already mapped";
- case CUDA_ERROR_NO_BINARY_FOR_GPU: return "No binary for GPU";
- case CUDA_ERROR_ALREADY_ACQUIRED: return "Already acquired";
- case CUDA_ERROR_NOT_MAPPED: return "Not mapped";
- case CUDA_ERROR_NOT_MAPPED_AS_ARRAY: return "Mapped resource not available for access as an array";
- case CUDA_ERROR_NOT_MAPPED_AS_POINTER: return "Mapped resource not available for access as a pointer";
- case CUDA_ERROR_ECC_UNCORRECTABLE: return "Uncorrectable ECC error detected";
- case CUDA_ERROR_UNSUPPORTED_LIMIT: return "CUlimit not supported by device";
-
- case CUDA_ERROR_INVALID_SOURCE: return "Invalid source";
- case CUDA_ERROR_FILE_NOT_FOUND: return "File not found";
- case CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND: return "Link to a shared object failed to resolve";
- case CUDA_ERROR_SHARED_OBJECT_INIT_FAILED: return "Shared object initialization failed";
-
- case CUDA_ERROR_INVALID_HANDLE: return "Invalid handle";
-
- case CUDA_ERROR_NOT_FOUND: return "Not found";
-
- case CUDA_ERROR_NOT_READY: return "CUDA not ready";
-
- case CUDA_ERROR_LAUNCH_FAILED: return "Launch failed";
- case CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES: return "Launch exceeded resources";
- case CUDA_ERROR_LAUNCH_TIMEOUT: return "Launch exceeded timeout";
- case CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING: return "Launch with incompatible texturing";
-
- case CUDA_ERROR_UNKNOWN: return "Unknown error";
-
- default: return "Unknown CUDA error value";
- }
+ string cubins_path = path_get("lib");
+ return path_exists(cubins_path);
}
/*#ifdef NDEBUG
@@ -132,7 +86,7 @@ public:
CUresult result = stmt; \
\
if(result != CUDA_SUCCESS) { \
- string message = string_printf("CUDA error: %s in %s", cuda_error_string(result), #stmt); \
+ string message = string_printf("CUDA error: %s in %s", cuewErrorString(result), #stmt); \
if(error_msg == "") \
error_msg = message; \
fprintf(stderr, "%s\n", message.c_str()); \
@@ -146,7 +100,7 @@ public:
if(result == CUDA_SUCCESS)
return false;
- string message = string_printf("CUDA error at %s: %s", stmt.c_str(), cuda_error_string(result));
+ string message = string_printf("CUDA error at %s: %s", stmt.c_str(), cuewErrorString(result));
if(error_msg == "")
error_msg = message;
fprintf(stderr, "%s\n", message.c_str());
@@ -180,8 +134,6 @@ public:
first_error = true;
background = background_;
use_texture_storage = true;
- /* we try an update / sync every 1000 ms */
- target_update_frequency = 1000;
cuDevId = info.num;
cuDevice = 0;
@@ -212,9 +164,6 @@ public:
if(cuda_error_(result, "cuCtxCreate"))
return;
- cuda_assert(cuStreamCreate(&cuStream, 0));
- cuda_assert(cuEventCreate(&tileDone, 0x1));
-
int major, minor;
cuDeviceComputeCapability(&major, &minor, cuDevId);
cuDevArchitecture = major*100 + minor*10;
@@ -231,12 +180,10 @@ public:
{
task_pool.stop();
- cuda_assert(cuEventDestroy(tileDone));
- cuda_assert(cuStreamDestroy(cuStream));
cuda_assert(cuCtxDestroy(cuContext));
}
- bool support_device(bool experimental, bool branched)
+ bool support_device(bool experimental)
{
int major, minor;
cuDeviceComputeCapability(&major, &minor, cuDevId);
@@ -250,14 +197,22 @@ public:
return true;
}
- string compile_kernel()
+ string compile_kernel(bool experimental)
{
/* compute cubin name */
int major, minor;
cuDeviceComputeCapability(&major, &minor, cuDevId);
+
+ /* workaround to make sm_52 cards work, until we bundle kernel */
+ if(major == 5 && minor == 2)
+ minor = 0;
/* attempt to use kernel provided with blender */
- string cubin = path_get(string_printf("lib/kernel_sm_%d%d.cubin", major, minor));
+ string cubin;
+ if(experimental)
+ cubin = path_get(string_printf("lib/kernel_experimental_sm_%d%d.cubin", major, minor));
+ else
+ cubin = path_get(string_printf("lib/kernel_sm_%d%d.cubin", major, minor));
if(path_exists(cubin))
return cubin;
@@ -265,7 +220,10 @@ public:
string kernel_path = path_get("kernel");
string md5 = path_files_md5_hash(kernel_path);
- cubin = string_printf("cycles_kernel_sm%d%d_%s.cubin", major, minor, md5.c_str());
+ if(experimental)
+ cubin = string_printf("cycles_kernel_experimental_sm%d%d_%s.cubin", major, minor, md5.c_str());
+ else
+ cubin = string_printf("cycles_kernel_sm%d%d_%s.cubin", major, minor, md5.c_str());
cubin = path_user_get(path_join("cache", cubin));
/* if exists already, use it */
@@ -273,7 +231,7 @@ public:
return cubin;
#ifdef _WIN32
- if(cuHavePrecompiledKernels()) {
+ if(have_precompiled_kernels()) {
if(major < 2)
cuda_error_message(string_printf("CUDA device requires compute capability 2.0 or up, found %d.%d. Your GPU is not supported.", major, minor));
else
@@ -283,25 +241,25 @@ public:
#endif
/* if not, find CUDA compiler */
- string nvcc = cuCompilerPath();
+ const char *nvcc = cuewCompilerPath();
- if(nvcc == "") {
+ if(nvcc == NULL) {
cuda_error_message("CUDA nvcc compiler not found. Install CUDA toolkit in default location.");
return "";
}
- int cuda_version = cuCompilerVersion();
+ int cuda_version = cuewCompilerVersion();
if(cuda_version == 0) {
cuda_error_message("CUDA nvcc compiler version could not be parsed.");
return "";
}
- if(cuda_version < 50) {
- printf("Unsupported CUDA version %d.%d detected, you need CUDA 6.0.\n", cuda_version/10, cuda_version%10);
+ if(cuda_version < 60) {
+ printf("Unsupported CUDA version %d.%d detected, you need CUDA 6.5.\n", cuda_version/10, cuda_version%10);
return "";
}
- else if(cuda_version != 60)
- printf("CUDA version %d.%d detected, build may succeed but only CUDA 6.0 is officially supported.\n", cuda_version/10, cuda_version%10);
+ else if(cuda_version != 65)
+ printf("CUDA version %d.%d detected, build may succeed but only CUDA 6.5 is officially supported.\n", cuda_version/10, cuda_version%10);
/* compile */
string kernel = path_join(kernel_path, "kernel.cu");
@@ -315,7 +273,14 @@ public:
string command = string_printf("\"%s\" -arch=sm_%d%d -m%d --cubin \"%s\" "
"-o \"%s\" --ptxas-options=\"-v\" -I\"%s\" -DNVCC -D__KERNEL_CUDA_VERSION__=%d",
- nvcc.c_str(), major, minor, machine, kernel.c_str(), cubin.c_str(), include.c_str(), cuda_version);
+ nvcc, major, minor, machine, kernel.c_str(), cubin.c_str(), include.c_str(), cuda_version);
+
+ if(experimental)
+ command += " -D__KERNEL_CUDA_EXPERIMENTAL__";
+
+#ifdef WITH_CYCLES_DEBUG
+ command += " -D__KERNEL_DEBUG__";
+#endif
printf("%s\n", command.c_str());
@@ -342,11 +307,11 @@ public:
return false;
/* check if GPU is supported */
- if(!support_device(experimental, false))
+ if(!support_device(experimental))
return false;
/* get kernel */
- string cubin = compile_kernel();
+ string cubin = compile_kernel(experimental);
if(cubin == "")
return false;
@@ -377,6 +342,7 @@ public:
size_t size = mem.memory_size();
cuda_assert(cuMemAlloc(&device_pointer, size));
mem.device_pointer = (device_ptr)device_pointer;
+ mem.device_size = size;
stats.mem_alloc(size);
cuda_pop_context();
}
@@ -397,7 +363,7 @@ public:
cuda_push_context();
if(mem.device_pointer) {
cuda_assert(cuMemcpyDtoH((uchar*)mem.data_pointer + offset,
- (CUdeviceptr)((uchar*)mem.device_pointer + offset), size));
+ (CUdeviceptr)(mem.device_pointer + offset), size));
}
else {
memset((char*)mem.data_pointer + offset, 0, size);
@@ -424,7 +390,8 @@ public:
mem.device_pointer = 0;
- stats.mem_free(mem.memory_size());
+ stats.mem_free(mem.device_size);
+ mem.device_size = 0;
}
}
@@ -516,6 +483,7 @@ public:
cuda_assert(cuTexRefSetFlags(texref, CU_TRSF_NORMALIZED_COORDINATES));
mem.device_pointer = (device_ptr)handle;
+ mem.device_size = size;
stats.mem_alloc(size);
}
@@ -583,7 +551,8 @@ public:
tex_interp_map.erase(tex_interp_map.find(mem.device_pointer));
mem.device_pointer = 0;
- stats.mem_free(mem.memory_size());
+ stats.mem_free(mem.device_size);
+ mem.device_size = 0;
}
else {
tex_interp_map.erase(tex_interp_map.find(mem.device_pointer));
@@ -604,7 +573,7 @@ public:
CUdeviceptr d_rng_state = cuda_device_ptr(rtile.rng_state);
/* get kernel function */
- if(branched && support_device(true, branched)) {
+ if(branched) {
cuda_assert(cuModuleGetFunction(&cuPathTrace, cuModule, "kernel_cuda_branched_path_trace"));
}
else {
@@ -613,40 +582,17 @@ public:
if(have_error())
return;
-
- /* pass in parameters */
- int offset = 0;
-
- cuda_assert(cuParamSetv(cuPathTrace, offset, &d_buffer, sizeof(d_buffer)));
- offset += sizeof(d_buffer);
-
- cuda_assert(cuParamSetv(cuPathTrace, offset, &d_rng_state, sizeof(d_rng_state)));
- offset += sizeof(d_rng_state);
-
- offset = align_up(offset, __alignof(sample));
-
- cuda_assert(cuParamSeti(cuPathTrace, offset, sample));
- offset += sizeof(sample);
-
- cuda_assert(cuParamSeti(cuPathTrace, offset, rtile.x));
- offset += sizeof(rtile.x);
-
- cuda_assert(cuParamSeti(cuPathTrace, offset, rtile.y));
- offset += sizeof(rtile.y);
-
- cuda_assert(cuParamSeti(cuPathTrace, offset, rtile.w));
- offset += sizeof(rtile.w);
-
- cuda_assert(cuParamSeti(cuPathTrace, offset, rtile.h));
- offset += sizeof(rtile.h);
-
- cuda_assert(cuParamSeti(cuPathTrace, offset, rtile.offset));
- offset += sizeof(rtile.offset);
- cuda_assert(cuParamSeti(cuPathTrace, offset, rtile.stride));
- offset += sizeof(rtile.stride);
-
- cuda_assert(cuParamSetSize(cuPathTrace, offset));
+ /* pass in parameters */
+ void *args[] = {&d_buffer,
+ &d_rng_state,
+ &sample,
+ &rtile.x,
+ &rtile.y,
+ &rtile.w,
+ &rtile.h,
+ &rtile.offset,
+ &rtile.stride};
/* launch kernel */
int threads_per_block;
@@ -664,16 +610,13 @@ public:
int yblocks = (rtile.h + ythreads - 1)/ythreads;
cuda_assert(cuFuncSetCacheConfig(cuPathTrace, CU_FUNC_CACHE_PREFER_L1));
- cuda_assert(cuFuncSetBlockShape(cuPathTrace, xthreads, ythreads, 1));
- if(info.display_device) {
- /* don't use async for device used for display, locks up UI too much */
- cuda_assert(cuLaunchGrid(cuPathTrace, xblocks, yblocks));
- cuda_assert(cuCtxSynchronize());
- }
- else {
- cuda_assert(cuLaunchGridAsync(cuPathTrace, xblocks, yblocks, cuStream));
- }
+ cuda_assert(cuLaunchKernel(cuPathTrace,
+ xblocks , yblocks, 1, /* blocks */
+ xthreads, ythreads, 1, /* threads */
+ 0, 0, args, 0));
+
+ cuda_assert(cuCtxSynchronize());
cuda_pop_context();
}
@@ -697,40 +640,19 @@ public:
cuda_assert(cuModuleGetFunction(&cuFilmConvert, cuModule, "kernel_cuda_convert_to_byte"));
}
- /* pass in parameters */
- int offset = 0;
-
- cuda_assert(cuParamSetv(cuFilmConvert, offset, &d_rgba, sizeof(d_rgba)));
- offset += sizeof(d_rgba);
-
- cuda_assert(cuParamSetv(cuFilmConvert, offset, &d_buffer, sizeof(d_buffer)));
- offset += sizeof(d_buffer);
float sample_scale = 1.0f/(task.sample + 1);
- offset = align_up(offset, __alignof(sample_scale));
-
- cuda_assert(cuParamSetf(cuFilmConvert, offset, sample_scale));
- offset += sizeof(sample_scale);
-
- cuda_assert(cuParamSeti(cuFilmConvert, offset, task.x));
- offset += sizeof(task.x);
-
- cuda_assert(cuParamSeti(cuFilmConvert, offset, task.y));
- offset += sizeof(task.y);
-
- cuda_assert(cuParamSeti(cuFilmConvert, offset, task.w));
- offset += sizeof(task.w);
- cuda_assert(cuParamSeti(cuFilmConvert, offset, task.h));
- offset += sizeof(task.h);
-
- cuda_assert(cuParamSeti(cuFilmConvert, offset, task.offset));
- offset += sizeof(task.offset);
-
- cuda_assert(cuParamSeti(cuFilmConvert, offset, task.stride));
- offset += sizeof(task.stride);
-
- cuda_assert(cuParamSetSize(cuFilmConvert, offset));
+ /* pass in parameters */
+ void *args[] = {&d_rgba,
+ &d_buffer,
+ &sample_scale,
+ &task.x,
+ &task.y,
+ &task.w,
+ &task.h,
+ &task.offset,
+ &task.stride};
/* launch kernel */
int threads_per_block;
@@ -742,8 +664,11 @@ public:
int yblocks = (task.h + ythreads - 1)/ythreads;
cuda_assert(cuFuncSetCacheConfig(cuFilmConvert, CU_FUNC_CACHE_PREFER_L1));
- cuda_assert(cuFuncSetBlockShape(cuFilmConvert, xthreads, ythreads, 1));
- cuda_assert(cuLaunchGrid(cuFilmConvert, xblocks, yblocks));
+
+ cuda_assert(cuLaunchKernel(cuFilmConvert,
+ xblocks , yblocks, 1, /* blocks */
+ xthreads, ythreads, 1, /* threads */
+ 0, 0, args, 0));
unmap_pixels((rgba_byte)? rgba_byte: rgba_half);
@@ -762,49 +687,54 @@ public:
CUdeviceptr d_output = cuda_device_ptr(task.shader_output);
/* get kernel function */
- cuda_assert(cuModuleGetFunction(&cuShader, cuModule, "kernel_cuda_shader"));
+ if(task.shader_eval_type >= SHADER_EVAL_BAKE) {
+ cuda_assert(cuModuleGetFunction(&cuShader, cuModule, "kernel_cuda_bake"));
+ }
+ else {
+ cuda_assert(cuModuleGetFunction(&cuShader, cuModule, "kernel_cuda_shader"));
+ }
/* do tasks in smaller chunks, so we can cancel it */
const int shader_chunk_size = 65536;
const int start = task.shader_x;
const int end = task.shader_x + task.shader_w;
+ int offset = task.offset;
+
+ bool canceled = false;
+ for(int sample = 0; sample < task.num_samples && !canceled; sample++) {
+ for(int shader_x = start; shader_x < end; shader_x += shader_chunk_size) {
+ int shader_w = min(shader_chunk_size, end - shader_x);
+
+ /* pass in parameters */
+ void *args[] = {&d_input,
+ &d_output,
+ &task.shader_eval_type,
+ &shader_x,
+ &shader_w,
+ &offset,
+ &sample};
+
+ /* launch kernel */
+ int threads_per_block;
+ cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cuShader));
+
+ int xblocks = (shader_w + threads_per_block - 1)/threads_per_block;
+
+ cuda_assert(cuFuncSetCacheConfig(cuShader, CU_FUNC_CACHE_PREFER_L1));
+ cuda_assert(cuLaunchKernel(cuShader,
+ xblocks , 1, 1, /* blocks */
+ threads_per_block, 1, 1, /* threads */
+ 0, 0, args, 0));
+
+ cuda_assert(cuCtxSynchronize());
+
+ if(task.get_cancel()) {
+ canceled = true; /* the outer per-sample loop tests !canceled, so this must be set to stop it too */
+ break;
+ }
+ }
- for(int shader_x = start; shader_x < end; shader_x += shader_chunk_size) {
- if(task.get_cancel())
- break;
-
- /* pass in parameters */
- int offset = 0;
-
- cuda_assert(cuParamSetv(cuShader, offset, &d_input, sizeof(d_input)));
- offset += sizeof(d_input);
-
- cuda_assert(cuParamSetv(cuShader, offset, &d_output, sizeof(d_output)));
- offset += sizeof(d_output);
-
- int shader_eval_type = task.shader_eval_type;
- offset = align_up(offset, __alignof(shader_eval_type));
-
- cuda_assert(cuParamSeti(cuShader, offset, task.shader_eval_type));
- offset += sizeof(task.shader_eval_type);
-
- cuda_assert(cuParamSeti(cuShader, offset, shader_x));
- offset += sizeof(shader_x);
-
- cuda_assert(cuParamSetSize(cuShader, offset));
-
- /* launch kernel */
- int threads_per_block;
- cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cuShader));
-
- int shader_w = min(shader_chunk_size, end - shader_x);
- int xblocks = (shader_w + threads_per_block - 1)/threads_per_block;
-
- cuda_assert(cuFuncSetCacheConfig(cuShader, CU_FUNC_CACHE_PREFER_L1));
- cuda_assert(cuFuncSetBlockShape(cuShader, threads_per_block, 1, 1));
- cuda_assert(cuLaunchGrid(cuShader, xblocks, 1));
-
- cuda_assert(cuCtxSynchronize());
+ task.update_progress(NULL);
}
cuda_pop_context();
@@ -872,7 +802,8 @@ public:
mem.device_pointer = pmem.cuTexId;
pixel_mem_map[mem.device_pointer] = pmem;
- stats.mem_alloc(mem.memory_size());
+ mem.device_size = mem.memory_size();
+ stats.mem_alloc(mem.device_size);
return;
}
@@ -929,7 +860,8 @@ public:
pixel_mem_map.erase(pixel_mem_map.find(mem.device_pointer));
mem.device_pointer = 0;
- stats.mem_free(mem.memory_size());
+ stats.mem_free(mem.device_size);
+ mem.device_size = 0;
return;
}
@@ -946,7 +878,7 @@ public:
cuda_push_context();
- /* for multi devices, this assumes the ineffecient method that we allocate
+ /* for multi devices, this assumes the inefficient method that we allocate
* all pixels on the device even though we only render to a subset */
size_t offset = 4*y*w;
@@ -1024,10 +956,6 @@ public:
int start_sample = tile.start_sample;
int end_sample = tile.start_sample + tile.num_samples;
- boost::posix_time::ptime start_time(boost::posix_time::microsec_clock::local_time());
- boost::posix_time::ptime last_time = start_time;
- int sync_sample = 10;
-
for(int sample = start_sample; sample < end_sample; sample++) {
if (task->get_cancel()) {
if(task->need_finish_queue == false)
@@ -1037,28 +965,8 @@ public:
path_trace(tile, sample, branched);
tile.sample = sample + 1;
- task->update_progress(tile);
- if(!info.display_device && sample == sync_sample) {
- cuda_push_context();
- cuda_assert(cuEventRecord(tileDone, cuStream));
- cuda_assert(cuEventSynchronize(tileDone));
-
- /* Do some time keeping to find out if we need to sync less */
- boost::posix_time::ptime current_time(boost::posix_time::microsec_clock::local_time());
- boost::posix_time::time_duration sample_duration = current_time - last_time;
-
- long msec = sample_duration.total_milliseconds();
- float scaling_factor = (float)target_update_frequency / (float)msec;
-
- /* sync at earliest next sample and probably later */
- sync_sample = (sample + 1) + sync_sample * (int)ceil(scaling_factor);
-
- sync_sample = min(end_sample - 1, sync_sample); // make sure we sync the last sample always
-
- last_time = current_time;
- cuda_pop_context();
- }
+ task->update_progress(&tile);
}
task->release_tile(tile);
@@ -1082,6 +990,11 @@ public:
}
};
+ int get_split_task_count(DeviceTask& task)
+ {
+ return 1;
+ }
+
void task_add(DeviceTask& task)
{
if(task.type == DeviceTask::FILM_CONVERT) {
@@ -1108,6 +1021,28 @@ public:
}
};
+bool device_cuda_init(void)
+{
+ static bool initialized = false;
+ static bool result = false;
+
+ if (initialized)
+ return result;
+
+ initialized = true;
+
+ if (cuewInit() == CUEW_SUCCESS) {
+ if(CUDADevice::have_precompiled_kernels())
+ result = true;
+#ifndef _WIN32
+ else if(cuewCompilerPath() != NULL)
+ result = true;
+#endif
+ }
+
+ return result;
+}
+
Device *device_cuda_create(DeviceInfo& info, Stats &stats, bool background)
{
return new CUDADevice(info, stats, background);
@@ -1121,13 +1056,13 @@ void device_cuda_info(vector<DeviceInfo>& devices)
result = cuInit(0);
if(result != CUDA_SUCCESS) {
if(result != CUDA_ERROR_NO_DEVICE)
- fprintf(stderr, "CUDA cuInit: %s\n", CUDADevice::cuda_error_string(result));
+ fprintf(stderr, "CUDA cuInit: %s\n", cuewErrorString(result));
return;
}
result = cuDeviceGetCount(&count);
if(result != CUDA_SUCCESS) {
- fprintf(stderr, "CUDA cuDeviceGetCount: %s\n", CUDADevice::cuda_error_string(result));
+ fprintf(stderr, "CUDA cuDeviceGetCount: %s\n", cuewErrorString(result));
return;
}
diff --git a/intern/cycles/device/device_intern.h b/intern/cycles/device/device_intern.h
index 7eb66c25a81..80f1e2441a5 100644
--- a/intern/cycles/device/device_intern.h
+++ b/intern/cycles/device/device_intern.h
@@ -22,7 +22,9 @@ CCL_NAMESPACE_BEGIN
class Device;
Device *device_cpu_create(DeviceInfo& info, Stats &stats, bool background);
+bool device_opencl_init(void);
Device *device_opencl_create(DeviceInfo& info, Stats &stats, bool background);
+bool device_cuda_init(void);
Device *device_cuda_create(DeviceInfo& info, Stats &stats, bool background);
Device *device_network_create(DeviceInfo& info, Stats &stats, const char *address);
Device *device_multi_create(DeviceInfo& info, Stats &stats, bool background);
diff --git a/intern/cycles/device/device_memory.h b/intern/cycles/device/device_memory.h
index 8d6f4a49a9c..07a6eb36a3c 100644
--- a/intern/cycles/device/device_memory.h
+++ b/intern/cycles/device/device_memory.h
@@ -167,6 +167,7 @@ public:
int data_elements;
device_ptr data_pointer;
size_t data_size;
+ size_t device_size;
size_t data_width;
size_t data_height;
size_t data_depth;
@@ -194,6 +195,7 @@ public:
data_elements = device_type_traits<T>::num_elements;
data_pointer = 0;
data_size = 0;
+ device_size = 0;
data_width = 0;
data_height = 0;
data_depth = 0;
@@ -258,6 +260,11 @@ public:
return data.size();
}
+ T* get_data()
+ {
+ return &data[0];
+ }
+
private:
array<T> data;
};
diff --git a/intern/cycles/device/device_multi.cpp b/intern/cycles/device/device_multi.cpp
index c866ebaaea2..7f055c79491 100644
--- a/intern/cycles/device/device_multi.cpp
+++ b/intern/cycles/device/device_multi.cpp
@@ -278,6 +278,22 @@ public:
return -1;
}
+ int get_split_task_count(DeviceTask& task)
+ {
+ int total_tasks = 0;
+ list<DeviceTask> tasks;
+ task.split(tasks, devices.size());
+ foreach(SubDevice& sub, devices) {
+ if(!tasks.empty()) {
+ DeviceTask subtask = tasks.front();
+ tasks.pop_front();
+
+ total_tasks += sub.device->get_split_task_count(subtask);
+ }
+ }
+ return total_tasks;
+ }
+
void task_add(DeviceTask& task)
{
list<DeviceTask> tasks;
diff --git a/intern/cycles/device/device_network.cpp b/intern/cycles/device/device_network.cpp
index af051076009..dca9bf29e70 100644
--- a/intern/cycles/device/device_network.cpp
+++ b/intern/cycles/device/device_network.cpp
@@ -299,6 +299,11 @@ public:
snd.write();
}
+ int get_split_task_count(DeviceTask& task)
+ {
+ return 1;
+ }
+
private:
NetworkError error_func;
};
diff --git a/intern/cycles/device/device_opencl.cpp b/intern/cycles/device/device_opencl.cpp
index 694ec9db036..58b2bcafb82 100644
--- a/intern/cycles/device/device_opencl.cpp
+++ b/intern/cycles/device/device_opencl.cpp
@@ -25,11 +25,12 @@
#include "buffers.h"
+#include "clew.h"
+
#include "util_foreach.h"
#include "util_map.h"
#include "util_math.h"
#include "util_md5.h"
-#include "util_opencl.h"
#include "util_opengl.h"
#include "util_path.h"
#include "util_time.h"
@@ -101,7 +102,11 @@ static string opencl_kernel_build_options(const string& platform, const string *
if(opencl_kernel_use_debug())
build_options += "-D__KERNEL_OPENCL_DEBUG__ ";
-
+
+#ifdef WITH_CYCLES_DEBUG
+ build_options += "-D__KERNEL_DEBUG__ ";
+#endif
+
return build_options;
}
@@ -321,6 +326,7 @@ public:
cl_kernel ckFilmConvertByteKernel;
cl_kernel ckFilmConvertHalfFloatKernel;
cl_kernel ckShaderKernel;
+ cl_kernel ckBakeKernel;
cl_int ciErr;
typedef map<string, device_vector<uchar>*> ConstMemMap;
@@ -333,63 +339,10 @@ public:
bool device_initialized;
string platform_name;
- const char *opencl_error_string(cl_int err)
- {
- switch (err) {
- case CL_SUCCESS: return "Success!";
- case CL_DEVICE_NOT_FOUND: return "Device not found.";
- case CL_DEVICE_NOT_AVAILABLE: return "Device not available";
- case CL_COMPILER_NOT_AVAILABLE: return "Compiler not available";
- case CL_MEM_OBJECT_ALLOCATION_FAILURE: return "Memory object allocation failure";
- case CL_OUT_OF_RESOURCES: return "Out of resources";
- case CL_OUT_OF_HOST_MEMORY: return "Out of host memory";
- case CL_PROFILING_INFO_NOT_AVAILABLE: return "Profiling information not available";
- case CL_MEM_COPY_OVERLAP: return "Memory copy overlap";
- case CL_IMAGE_FORMAT_MISMATCH: return "Image format mismatch";
- case CL_IMAGE_FORMAT_NOT_SUPPORTED: return "Image format not supported";
- case CL_BUILD_PROGRAM_FAILURE: return "Program build failure";
- case CL_MAP_FAILURE: return "Map failure";
- case CL_INVALID_VALUE: return "Invalid value";
- case CL_INVALID_DEVICE_TYPE: return "Invalid device type";
- case CL_INVALID_PLATFORM: return "Invalid platform";
- case CL_INVALID_DEVICE: return "Invalid device";
- case CL_INVALID_CONTEXT: return "Invalid context";
- case CL_INVALID_QUEUE_PROPERTIES: return "Invalid queue properties";
- case CL_INVALID_COMMAND_QUEUE: return "Invalid command queue";
- case CL_INVALID_HOST_PTR: return "Invalid host pointer";
- case CL_INVALID_MEM_OBJECT: return "Invalid memory object";
- case CL_INVALID_IMAGE_FORMAT_DESCRIPTOR: return "Invalid image format descriptor";
- case CL_INVALID_IMAGE_SIZE: return "Invalid image size";
- case CL_INVALID_SAMPLER: return "Invalid sampler";
- case CL_INVALID_BINARY: return "Invalid binary";
- case CL_INVALID_BUILD_OPTIONS: return "Invalid build options";
- case CL_INVALID_PROGRAM: return "Invalid program";
- case CL_INVALID_PROGRAM_EXECUTABLE: return "Invalid program executable";
- case CL_INVALID_KERNEL_NAME: return "Invalid kernel name";
- case CL_INVALID_KERNEL_DEFINITION: return "Invalid kernel definition";
- case CL_INVALID_KERNEL: return "Invalid kernel";
- case CL_INVALID_ARG_INDEX: return "Invalid argument index";
- case CL_INVALID_ARG_VALUE: return "Invalid argument value";
- case CL_INVALID_ARG_SIZE: return "Invalid argument size";
- case CL_INVALID_KERNEL_ARGS: return "Invalid kernel arguments";
- case CL_INVALID_WORK_DIMENSION: return "Invalid work dimension";
- case CL_INVALID_WORK_GROUP_SIZE: return "Invalid work group size";
- case CL_INVALID_WORK_ITEM_SIZE: return "Invalid work item size";
- case CL_INVALID_GLOBAL_OFFSET: return "Invalid global offset";
- case CL_INVALID_EVENT_WAIT_LIST: return "Invalid event wait list";
- case CL_INVALID_EVENT: return "Invalid event";
- case CL_INVALID_OPERATION: return "Invalid operation";
- case CL_INVALID_GL_OBJECT: return "Invalid OpenGL object";
- case CL_INVALID_BUFFER_SIZE: return "Invalid buffer size";
- case CL_INVALID_MIP_LEVEL: return "Invalid mip-map level";
- default: return "Unknown";
- }
- }
-
bool opencl_error(cl_int err)
{
if(err != CL_SUCCESS) {
- string message = string_printf("OpenCL error (%d): %s", err, opencl_error_string(err));
+ string message = string_printf("OpenCL error (%d): %s", err, clewErrorString(err));
if(error_msg == "")
error_msg = message;
fprintf(stderr, "%s\n", message.c_str());
@@ -411,7 +364,7 @@ public:
cl_int err = stmt; \
\
if(err != CL_SUCCESS) { \
- string message = string_printf("OpenCL error: %s in %s", opencl_error_string(err), #stmt); \
+ string message = string_printf("OpenCL error: %s in %s", clewErrorString(err), #stmt); \
if(error_msg == "") \
error_msg = message; \
fprintf(stderr, "%s\n", message.c_str()); \
@@ -421,7 +374,7 @@ public:
void opencl_assert_err(cl_int err, const char* where)
{
if(err != CL_SUCCESS) {
- string message = string_printf("OpenCL error (%d): %s in %s", err, opencl_error_string(err), where);
+ string message = string_printf("OpenCL error (%d): %s in %s", err, clewErrorString(err), where);
if(error_msg == "")
error_msg = message;
fprintf(stderr, "%s\n", message.c_str());
@@ -443,6 +396,7 @@ public:
ckFilmConvertByteKernel = NULL;
ckFilmConvertHalfFloatKernel = NULL;
ckShaderKernel = NULL;
+ ckBakeKernel = NULL;
null_mem = 0;
device_initialized = false;
@@ -550,7 +504,7 @@ public:
device_initialized = true;
}
- static void context_notify_callback(const char *err_info,
+ static void CL_CALLBACK context_notify_callback(const char *err_info,
const void *private_info, size_t cb, void *user_data)
{
char name[256];
@@ -791,6 +745,10 @@ public:
if(opencl_error(ciErr))
return false;
+ ckBakeKernel = clCreateKernel(cpProgram, "kernel_ocl_bake", &ciErr);
+ if(opencl_error(ciErr))
+ return false;
+
return true;
}
@@ -840,6 +798,7 @@ public:
opencl_assert_err(ciErr, "clCreateBuffer");
stats.mem_alloc(size);
+ mem.device_size = size;
}
void mem_copy_to(device_memory& mem)
@@ -871,7 +830,8 @@ public:
opencl_assert(clReleaseMemObject(CL_MEM_PTR(mem.device_pointer)));
mem.device_pointer = 0;
- stats.mem_free(mem.memory_size());
+ stats.mem_free(mem.device_size);
+ mem.device_size = 0;
}
}
@@ -1050,23 +1010,43 @@ public:
cl_int d_shader_eval_type = task.shader_eval_type;
cl_int d_shader_x = task.shader_x;
cl_int d_shader_w = task.shader_w;
+ cl_int d_offset = task.offset;
/* sample arguments */
cl_uint narg = 0;
- opencl_assert(clSetKernelArg(ckShaderKernel, narg++, sizeof(d_data), (void*)&d_data));
- opencl_assert(clSetKernelArg(ckShaderKernel, narg++, sizeof(d_input), (void*)&d_input));
- opencl_assert(clSetKernelArg(ckShaderKernel, narg++, sizeof(d_output), (void*)&d_output));
+ cl_kernel kernel;
+
+ if(task.shader_eval_type >= SHADER_EVAL_BAKE)
+ kernel = ckBakeKernel;
+ else
+ kernel = ckShaderKernel;
+
+ for(int sample = 0; sample < task.num_samples; sample++) {
+ narg = 0; /* narg is declared before the loop; restart argument indexing for every sample's launch */
+ if(task.get_cancel())
+ break;
+
+ cl_int d_sample = sample;
+
+ opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_data), (void*)&d_data));
+ opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_input), (void*)&d_input));
+ opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_output), (void*)&d_output));
#define KERNEL_TEX(type, ttype, name) \
- set_kernel_arg_mem(ckShaderKernel, &narg, #name);
+ set_kernel_arg_mem(kernel, &narg, #name);
#include "kernel_textures.h"
- opencl_assert(clSetKernelArg(ckShaderKernel, narg++, sizeof(d_shader_eval_type), (void*)&d_shader_eval_type));
- opencl_assert(clSetKernelArg(ckShaderKernel, narg++, sizeof(d_shader_x), (void*)&d_shader_x));
- opencl_assert(clSetKernelArg(ckShaderKernel, narg++, sizeof(d_shader_w), (void*)&d_shader_w));
+ opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_shader_eval_type), (void*)&d_shader_eval_type));
+ opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_shader_x), (void*)&d_shader_x));
+ opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_shader_w), (void*)&d_shader_w));
+ opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_offset), (void*)&d_offset));
+ opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_sample), (void*)&d_sample));
+
+ enqueue_kernel(kernel, task.shader_w, 1);
- enqueue_kernel(ckShaderKernel, task.shader_w, 1);
+ task.update_progress(NULL);
+ }
}
void thread_run(DeviceTask *task)
@@ -1095,7 +1075,7 @@ public:
tile.sample = sample + 1;
- task->update_progress(tile);
+ task->update_progress(&tile);
}
task->release_tile(tile);
@@ -1112,6 +1092,11 @@ public:
}
};
+ int get_split_task_count(DeviceTask& task)
+ {
+ return 1;
+ }
+
void task_add(DeviceTask& task)
{
task_pool.push(new OpenCLDeviceTask(this, task));
@@ -1133,6 +1118,26 @@ Device *device_opencl_create(DeviceInfo& info, Stats &stats, bool background)
return new OpenCLDevice(info, stats, background);
}
+bool device_opencl_init(void) {
+ static bool initialized = false;
+ static bool result = false;
+
+ if (initialized)
+ return result;
+
+ initialized = true;
+
+ // OpenCL disabled for now, only works with this environment variable set
+ if(!getenv("CYCLES_OPENCL_TEST")) {
+ result = false;
+ }
+ else {
+ result = clewInit() == CLEW_SUCCESS;
+ }
+
+ return result;
+}
+
void device_opencl_info(vector<DeviceInfo>& devices)
{
vector<cl_device_id> device_ids;
diff --git a/intern/cycles/device/device_task.cpp b/intern/cycles/device/device_task.cpp
index 7d0eeab780d..dc124f8cf37 100644
--- a/intern/cycles/device/device_task.cpp
+++ b/intern/cycles/device/device_task.cpp
@@ -35,26 +35,39 @@ DeviceTask::DeviceTask(Type type_)
last_update_time = time_dt();
}
-void DeviceTask::split_max_size(list<DeviceTask>& tasks, int max_size)
+int DeviceTask::get_subtask_count(int num, int max_size)
{
- int num;
+ if(max_size != 0) {
+ int max_size_num;
+
+ if(type == SHADER) {
+ max_size_num = (shader_w + max_size - 1)/max_size;
+ }
+ else {
+ max_size = max(1, max_size/w);
+ max_size_num = (h + max_size - 1)/max_size;
+ }
+
+ num = max(max_size_num, num);
+ }
if(type == SHADER) {
- num = (shader_w + max_size - 1)/max_size;
+ num = min(shader_w, num);
+ }
+ else if(type == PATH_TRACE) {
}
else {
- max_size = max(1, max_size/w);
- num = (h + max_size - 1)/max_size;
+ num = min(h, num);
}
- split(tasks, num);
+ return num;
}
-void DeviceTask::split(list<DeviceTask>& tasks, int num)
+void DeviceTask::split(list<DeviceTask>& tasks, int num, int max_size)
{
- if(type == SHADER) {
- num = min(shader_w, num);
+ num = get_subtask_count(num, max_size);
+ if(type == SHADER) {
for(int i = 0; i < num; i++) {
int tx = shader_x + (shader_w/num)*i;
int tw = (i == num-1)? shader_w - i*(shader_w/num): shader_w/num;
@@ -72,8 +85,6 @@ void DeviceTask::split(list<DeviceTask>& tasks, int num)
tasks.push_back(*this);
}
else {
- num = min(h, num);
-
for(int i = 0; i < num; i++) {
int ty = y + (h/num)*i;
int th = (i == num-1)? h - i*(h/num): h/num;
@@ -88,9 +99,10 @@ void DeviceTask::split(list<DeviceTask>& tasks, int num)
}
}
-void DeviceTask::update_progress(RenderTile &rtile)
+void DeviceTask::update_progress(RenderTile *rtile)
{
- if (type != PATH_TRACE)
+ if((type != PATH_TRACE) &&
+ (type != SHADER))
return;
if(update_progress_sample)
@@ -100,7 +112,7 @@ void DeviceTask::update_progress(RenderTile &rtile)
double current_time = time_dt();
if (current_time - last_update_time >= 1.0) {
- update_tile_sample(rtile);
+ if(rtile != NULL) update_tile_sample(*rtile); /* shader tasks call update_progress(NULL); never dereference it */
last_update_time = current_time;
}
diff --git a/intern/cycles/device/device_task.h b/intern/cycles/device/device_task.h
index c1bd39b70ca..50216adefe2 100644
--- a/intern/cycles/device/device_task.h
+++ b/intern/cycles/device/device_task.h
@@ -52,10 +52,10 @@ public:
DeviceTask(Type type = PATH_TRACE);
- void split(list<DeviceTask>& tasks, int num);
- void split_max_size(list<DeviceTask>& tasks, int max_size);
+ int get_subtask_count(int num, int max_size = 0);
+ void split(list<DeviceTask>& tasks, int num, int max_size = 0);
- void update_progress(RenderTile &rtile);
+ void update_progress(RenderTile *rtile);
boost::function<bool(Device *device, RenderTile&)> acquire_tile;
boost::function<void(void)> update_progress_sample;
diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt
index d18f4fa2998..c521e1383a4 100644
--- a/intern/cycles/kernel/CMakeLists.txt
+++ b/intern/cycles/kernel/CMakeLists.txt
@@ -19,12 +19,13 @@ set(SRC
set(SRC_HEADERS
kernel.h
kernel_accumulate.h
+ kernel_bake.h
kernel_camera.h
kernel_compat_cpu.h
kernel_compat_cuda.h
kernel_compat_opencl.h
+ kernel_debug.h
kernel_differential.h
- kernel_displace.h
kernel_emission.h
kernel_film.h
kernel_globals.h
@@ -35,6 +36,8 @@ set(SRC_HEADERS
kernel_passes.h
kernel_path.h
kernel_path_state.h
+ kernel_path_surface.h
+ kernel_path_volume.h
kernel_projection.h
kernel_random.h
kernel_shader.h
@@ -58,8 +61,7 @@ set(SRC_CLOSURE_HEADERS
closure/bsdf_toon.h
closure/bsdf_transparent.h
closure/bsdf_util.h
- closure/bsdf_ward.h
- closure/bsdf_westin.h
+ closure/bsdf_ashikhmin_shirley.h
closure/bsdf_hair.h
closure/bssrdf.h
closure/emissive.h
@@ -95,8 +97,8 @@ set(SRC_SVM_HEADERS
svm/svm_noisetex.h
svm/svm_normal.h
svm/svm_ramp.h
- svm/svm_sepcomb_rgb.h
svm/svm_sepcomb_hsv.h
+ svm/svm_sepcomb_vector.h
svm/svm_sky.h
svm/svm_tex_coord.h
svm/svm_texture.h
@@ -111,8 +113,10 @@ set(SRC_GEOM_HEADERS
geom/geom.h
geom/geom_attribute.h
geom/geom_bvh.h
+ geom/geom_bvh_shadow.h
geom/geom_bvh_subsurface.h
geom/geom_bvh_traversal.h
+ geom/geom_bvh_volume.h
geom/geom_curve.h
geom/geom_motion_curve.h
geom/geom_motion_triangle.h
@@ -146,50 +150,69 @@ if(WITH_CYCLES_CUDA_BINARIES)
set(CUDA_VERSION "${CUDA_VERSION_MAJOR}${CUDA_VERSION_MINOR}")
# warn for other versions
- if(CUDA_VERSION MATCHES "60")
+ if(CUDA_VERSION MATCHES "65")
else()
message(WARNING
"CUDA version ${CUDA_VERSION_MAJOR}.${CUDA_VERSION_MINOR} detected, "
- "build may succeed but only CUDA 6.0 is officially supported")
+ "build may succeed but only CUDA 6.5 is officially supported")
endif()
# build for each arch
set(cuda_sources kernel.cu ${SRC_HEADERS} ${SRC_SVM_HEADERS} ${SRC_GEOM_HEADERS} ${SRC_CLOSURE_HEADERS} ${SRC_UTIL_HEADERS})
set(cuda_cubins)
- foreach(arch ${CYCLES_CUDA_BINARIES_ARCH})
- set(cuda_cubin kernel_${arch}.cubin)
+ macro(CYCLES_CUDA_KERNEL_ADD arch experimental)
+ if(${experimental})
+ set(cuda_extra_flags "-D__KERNEL_CUDA_EXPERIMENTAL__")
+ set(cuda_cubin kernel_experimental_${arch}.cubin)
+ else()
+ set(cuda_extra_flags "")
+ set(cuda_cubin kernel_${arch}.cubin)
+ endif()
+
+ if(WITH_CYCLES_DEBUG)
+ set(cuda_debug_flags "-D__KERNEL_DEBUG__")
+ else()
+ set(cuda_debug_flags "")
+ endif()
set(cuda_version_flags "-D__KERNEL_CUDA_VERSION__=${CUDA_VERSION}")
set(cuda_math_flags "--use_fast_math")
- if(CUDA_VERSION LESS 60 AND ${arch} MATCHES "sm_50")
- message(WARNING "Can't build kernel for CUDA sm_50 architecture, skipping")
- elseif(CUDA_VERSION LESS 50 AND ${arch} MATCHES "sm_35")
- message(WARNING "Can't build kernel for CUDA sm_35 architecture, skipping")
- else()
- add_custom_command(
- OUTPUT ${cuda_cubin}
- COMMAND ${CUDA_NVCC_EXECUTABLE}
- -arch=${arch}
- -m${CUDA_BITS}
- --cubin ${CMAKE_CURRENT_SOURCE_DIR}/kernel.cu
- -o ${CMAKE_CURRENT_BINARY_DIR}/${cuda_cubin}
- --ptxas-options="-v"
- ${cuda_arch_flags}
- ${cuda_version_flags}
- ${cuda_math_flags}
- -I${CMAKE_CURRENT_SOURCE_DIR}/../util
- -I${CMAKE_CURRENT_SOURCE_DIR}/svm
- -DCCL_NAMESPACE_BEGIN=
- -DCCL_NAMESPACE_END=
- -DNVCC
-
- DEPENDS ${cuda_sources})
-
- delayed_install("${CMAKE_CURRENT_BINARY_DIR}" "${cuda_cubin}" ${CYCLES_INSTALL_PATH}/lib)
- list(APPEND cuda_cubins ${cuda_cubin})
- endif()
+ add_custom_command(
+ OUTPUT ${cuda_cubin}
+ COMMAND ${CUDA_NVCC_EXECUTABLE}
+ -arch=${arch}
+ -m${CUDA_BITS}
+ --cubin ${CMAKE_CURRENT_SOURCE_DIR}/kernel.cu
+ -o ${CMAKE_CURRENT_BINARY_DIR}/${cuda_cubin}
+ --ptxas-options="-v"
+ ${cuda_arch_flags}
+ ${cuda_version_flags}
+ ${cuda_math_flags}
+ ${cuda_extra_flags}
+ ${cuda_debug_flags}
+ -I${CMAKE_CURRENT_SOURCE_DIR}/../util
+ -I${CMAKE_CURRENT_SOURCE_DIR}/svm
+ -DCCL_NAMESPACE_BEGIN=
+ -DCCL_NAMESPACE_END=
+ -DNVCC
+
+ DEPENDS ${cuda_sources})
+
+ delayed_install("${CMAKE_CURRENT_BINARY_DIR}" "${cuda_cubin}" ${CYCLES_INSTALL_PATH}/lib)
+ list(APPEND cuda_cubins ${cuda_cubin})
+
+ unset(cuda_extra_flags)
+ unset(cuda_debug_flags)
+ endmacro()
+
+ foreach(arch ${CYCLES_CUDA_BINARIES_ARCH})
+ # Compile regular kernel
+ CYCLES_CUDA_KERNEL_ADD(${arch} FALSE)
+
+ # Compile experimental kernel
+ CYCLES_CUDA_KERNEL_ADD(${arch} TRUE)
endforeach()
add_custom_target(cycles_kernel_cuda ALL DEPENDS ${cuda_cubins})
@@ -213,12 +236,14 @@ if(CXX_HAS_SSE)
kernel_sse3.cpp
kernel_sse41.cpp
kernel_avx.cpp
+ kernel_avx2.cpp
)
set_source_files_properties(kernel_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}")
set_source_files_properties(kernel_sse3.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS}")
set_source_files_properties(kernel_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}")
set_source_files_properties(kernel_avx.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX_KERNEL_FLAGS}")
+ set_source_files_properties(kernel_avx2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}")
endif()
diff --git a/intern/cycles/kernel/SConscript b/intern/cycles/kernel/SConscript
index 04e1bad7538..c0d969e24ae 100644
--- a/intern/cycles/kernel/SConscript
+++ b/intern/cycles/kernel/SConscript
@@ -30,6 +30,7 @@ import subprocess
import sys
import os
import Blender as B
+import btools
def normpath(path):
return os.path.abspath(os.path.normpath(path))
@@ -64,49 +65,56 @@ if env['WITH_BF_CYCLES_CUDA_BINARIES']:
closure_dir = os.path.join(source_dir, "../closure")
# get CUDA version
- nvcc_pipe = subprocess.Popen([nvcc, "--version"],stdout=subprocess.PIPE,stderr=subprocess.PIPE)
- output, erroroutput = nvcc_pipe.communicate()
+ output = btools.get_command_output([nvcc, "--version"])
cuda_major_minor = re.findall(r'release (\d+).(\d+)', output)[0]
cuda_version = int(cuda_major_minor[0])*10 + int(cuda_major_minor[1])
- if cuda_version != 60:
- print("CUDA version %d.%d detected, build may succeed but only CUDA 6.0 is officially supported." % (cuda_version/10, cuda_version%10))
+ if cuda_version != 65:
+ print("CUDA version %d.%d detected, build may succeed but only CUDA 6.5 is officially supported." % (cuda_version/10, cuda_version%10))
# nvcc flags
nvcc_flags = "-m%s" % (bits)
- nvcc_flags += " --cubin --ptxas-options=\"-v\""
+ nvcc_flags += " --cubin --ptxas-options=\"-v\" --use_fast_math"
nvcc_flags += " -D__KERNEL_CUDA_VERSION__=%d" % (cuda_version)
nvcc_flags += " -DCCL_NAMESPACE_BEGIN= -DCCL_NAMESPACE_END= -DNVCC"
nvcc_flags += " -I \"%s\" -I \"%s\" -I \"%s\" -I \"%s\"" % (util_dir, svm_dir, geom_dir, closure_dir)
+ if env['WITH_BF_CYCLES_DEBUG']:
+ nvcc_flags += " -D__KERNEL_DEBUG__"
+
# dependencies
dependencies = ['kernel.cu'] + kernel.Glob('*.h') + kernel.Glob('../util/*.h') + kernel.Glob('svm/*.h') + kernel.Glob('geom/*.h') + kernel.Glob('closure/*.h')
last_cubin_file = None
+ configs = (("kernel_%s.cubin", ''),
+ ("kernel_experimental_%s.cubin", ' -D__KERNEL_CUDA_EXPERIMENTAL__'))
+
# add command for each cuda architecture
for arch in cuda_archs:
- if cuda_version < 60 and arch == "sm_50":
- print("Can't build kernel for CUDA sm_50 architecture, skipping")
- continue
-
- cubin_file = os.path.join(build_dir, "kernel_%s.cubin" % arch)
-
- if env['BF_CYCLES_CUDA_ENV']:
- MS_SDK = "C:\\Program Files\\Microsoft SDKs\\Windows\\v7.1\\Bin\\SetEnv.cmd"
- command = "\"%s\" & \"%s\" -arch=%s %s \"%s\" -o \"%s\"" % (MS_SDK, nvcc, arch, nvcc_flags, kernel_file, cubin_file)
- else:
- command = "\"%s\" -arch=%s %s \"%s\" -o \"%s\"" % (nvcc, arch, nvcc_flags, kernel_file, cubin_file)
-
- kernel.Command(cubin_file, 'kernel.cu', command)
- kernel.Depends(cubin_file, dependencies)
-
- kernel_binaries.append(cubin_file)
-
- if not env['WITH_BF_CYCLES_CUDA_THREADED_COMPILE']:
- # trick to compile one kernel at a time to reduce memory usage
- if last_cubin_file:
- kernel.Depends(cubin_file, last_cubin_file)
- last_cubin_file = cubin_file
+ for config in configs:
+ # TODO(sergey): Use a dict instead of a tuple in order to increase readability?
+ name = config[0]
+ extra_flags = config[1]
+
+ cubin_file = os.path.join(build_dir, name % arch)
+ current_flags = nvcc_flags + extra_flags
+
+ if env['BF_CYCLES_CUDA_ENV']:
+ MS_SDK = "C:\\Program Files\\Microsoft SDKs\\Windows\\v7.1\\Bin\\SetEnv.cmd"
+ command = "\"%s\" & \"%s\" -arch=%s %s \"%s\" -o \"%s\"" % (MS_SDK, nvcc, arch, current_flags, kernel_file, cubin_file)
+ else:
+ command = "\"%s\" -arch=%s %s \"%s\" -o \"%s\"" % (nvcc, arch, current_flags, kernel_file, cubin_file)
+
+ kernel.Command(cubin_file, 'kernel.cu', command)
+ kernel.Depends(cubin_file, dependencies)
+
+ kernel_binaries.append(cubin_file)
+
+ if not env['WITH_BF_CYCLES_CUDA_THREADED_COMPILE']:
+ # trick to compile one kernel at a time to reduce memory usage
+ if last_cubin_file:
+ kernel.Depends(cubin_file, last_cubin_file)
+ last_cubin_file = cubin_file
Return('kernel_binaries')
diff --git a/intern/cycles/kernel/closure/bsdf.h b/intern/cycles/kernel/closure/bsdf.h
index 24b54cd9d9e..7d4783b0f3c 100644
--- a/intern/cycles/kernel/closure/bsdf.h
+++ b/intern/cycles/kernel/closure/bsdf.h
@@ -23,10 +23,7 @@
#include "../closure/bsdf_reflection.h"
#include "../closure/bsdf_refraction.h"
#include "../closure/bsdf_transparent.h"
-#ifdef __ANISOTROPIC__
-#include "../closure/bsdf_ward.h"
-#endif
-#include "../closure/bsdf_westin.h"
+#include "../closure/bsdf_ashikhmin_shirley.h"
#include "../closure/bsdf_toon.h"
#include "../closure/bsdf_hair.h"
#ifdef __SUBSURFACE__
@@ -83,21 +80,22 @@ ccl_device int bsdf_sample(KernelGlobals *kg, const ShaderData *sd, const Shader
eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
break;
case CLOSURE_BSDF_MICROFACET_GGX_ID:
+ case CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID:
case CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID:
- label = bsdf_microfacet_ggx_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
+ label = bsdf_microfacet_ggx_sample(kg, sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
break;
case CLOSURE_BSDF_MICROFACET_BECKMANN_ID:
+ case CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID:
case CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID:
- label = bsdf_microfacet_beckmann_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
+ label = bsdf_microfacet_beckmann_sample(kg, sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
break;
-#ifdef __ANISOTROPIC__
- case CLOSURE_BSDF_WARD_ID:
- label = bsdf_ward_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
+ case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ID:
+ case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID:
+ label = bsdf_ashikhmin_shirley_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
break;
-#endif
case CLOSURE_BSDF_ASHIKHMIN_VELVET_ID:
label = bsdf_ashikhmin_velvet_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
@@ -110,14 +108,6 @@ ccl_device int bsdf_sample(KernelGlobals *kg, const ShaderData *sd, const Shader
label = bsdf_glossy_toon_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
break;
- case CLOSURE_BSDF_WESTIN_BACKSCATTER_ID:
- label = bsdf_westin_backscatter_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
- eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
- break;
- case CLOSURE_BSDF_WESTIN_SHEEN_ID:
- label = bsdf_westin_sheen_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
- eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
- break;
case CLOSURE_BSDF_HAIR_REFLECTION_ID:
label = bsdf_hair_reflection_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
@@ -178,18 +168,19 @@ ccl_device float3 bsdf_eval(KernelGlobals *kg, const ShaderData *sd, const Shade
eval = bsdf_transparent_eval_reflect(sc, sd->I, omega_in, pdf);
break;
case CLOSURE_BSDF_MICROFACET_GGX_ID:
+ case CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID:
case CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID:
eval = bsdf_microfacet_ggx_eval_reflect(sc, sd->I, omega_in, pdf);
break;
case CLOSURE_BSDF_MICROFACET_BECKMANN_ID:
+ case CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID:
case CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID:
eval = bsdf_microfacet_beckmann_eval_reflect(sc, sd->I, omega_in, pdf);
break;
-#ifdef __ANISOTROPIC__
- case CLOSURE_BSDF_WARD_ID:
- eval = bsdf_ward_eval_reflect(sc, sd->I, omega_in, pdf);
+ case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ID:
+ case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID:
+ eval = bsdf_ashikhmin_shirley_eval_reflect(sc, sd->I, omega_in, pdf);
break;
-#endif
case CLOSURE_BSDF_ASHIKHMIN_VELVET_ID:
eval = bsdf_ashikhmin_velvet_eval_reflect(sc, sd->I, omega_in, pdf);
break;
@@ -199,12 +190,6 @@ ccl_device float3 bsdf_eval(KernelGlobals *kg, const ShaderData *sd, const Shade
case CLOSURE_BSDF_GLOSSY_TOON_ID:
eval = bsdf_glossy_toon_eval_reflect(sc, sd->I, omega_in, pdf);
break;
- case CLOSURE_BSDF_WESTIN_BACKSCATTER_ID:
- eval = bsdf_westin_backscatter_eval_reflect(sc, sd->I, omega_in, pdf);
- break;
- case CLOSURE_BSDF_WESTIN_SHEEN_ID:
- eval = bsdf_westin_sheen_eval_reflect(sc, sd->I, omega_in, pdf);
- break;
case CLOSURE_BSDF_HAIR_REFLECTION_ID:
eval = bsdf_hair_reflection_eval_reflect(sc, sd->I, omega_in, pdf);
break;
@@ -245,18 +230,19 @@ ccl_device float3 bsdf_eval(KernelGlobals *kg, const ShaderData *sd, const Shade
eval = bsdf_transparent_eval_transmit(sc, sd->I, omega_in, pdf);
break;
case CLOSURE_BSDF_MICROFACET_GGX_ID:
+ case CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID:
case CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID:
eval = bsdf_microfacet_ggx_eval_transmit(sc, sd->I, omega_in, pdf);
break;
case CLOSURE_BSDF_MICROFACET_BECKMANN_ID:
+ case CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID:
case CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID:
eval = bsdf_microfacet_beckmann_eval_transmit(sc, sd->I, omega_in, pdf);
break;
-#ifdef __ANISOTROPIC__
- case CLOSURE_BSDF_WARD_ID:
- eval = bsdf_ward_eval_transmit(sc, sd->I, omega_in, pdf);
+ case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ID:
+ case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID:
+ eval = bsdf_ashikhmin_shirley_eval_transmit(sc, sd->I, omega_in, pdf);
break;
-#endif
case CLOSURE_BSDF_ASHIKHMIN_VELVET_ID:
eval = bsdf_ashikhmin_velvet_eval_transmit(sc, sd->I, omega_in, pdf);
break;
@@ -266,12 +252,6 @@ ccl_device float3 bsdf_eval(KernelGlobals *kg, const ShaderData *sd, const Shade
case CLOSURE_BSDF_GLOSSY_TOON_ID:
eval = bsdf_glossy_toon_eval_transmit(sc, sd->I, omega_in, pdf);
break;
- case CLOSURE_BSDF_WESTIN_BACKSCATTER_ID:
- eval = bsdf_westin_backscatter_eval_transmit(sc, sd->I, omega_in, pdf);
- break;
- case CLOSURE_BSDF_WESTIN_SHEEN_ID:
- eval = bsdf_westin_sheen_eval_transmit(sc, sd->I, omega_in, pdf);
- break;
case CLOSURE_BSDF_HAIR_REFLECTION_ID:
eval = bsdf_hair_reflection_eval_transmit(sc, sd->I, omega_in, pdf);
break;
@@ -330,18 +310,19 @@ ccl_device void bsdf_blur(KernelGlobals *kg, ShaderClosure *sc, float roughness)
bsdf_transparent_blur(sc, roughness);
break;
case CLOSURE_BSDF_MICROFACET_GGX_ID:
+ case CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID:
case CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID:
bsdf_microfacet_ggx_blur(sc, roughness);
break;
case CLOSURE_BSDF_MICROFACET_BECKMANN_ID:
+ case CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID:
case CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID:
bsdf_microfacet_beckmann_blur(sc, roughness);
break;
-#ifdef __ANISOTROPIC__
- case CLOSURE_BSDF_WARD_ID:
- bsdf_ward_blur(sc, roughness);
+ case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ID:
+ case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID:
+ bsdf_ashikhmin_shirley_blur(sc, roughness);
break;
-#endif
case CLOSURE_BSDF_ASHIKHMIN_VELVET_ID:
bsdf_ashikhmin_velvet_blur(sc, roughness);
break;
@@ -351,12 +332,6 @@ ccl_device void bsdf_blur(KernelGlobals *kg, ShaderClosure *sc, float roughness)
case CLOSURE_BSDF_GLOSSY_TOON_ID:
bsdf_glossy_toon_blur(sc, roughness);
break;
- case CLOSURE_BSDF_WESTIN_BACKSCATTER_ID:
- bsdf_westin_backscatter_blur(sc, roughness);
- break;
- case CLOSURE_BSDF_WESTIN_SHEEN_ID:
- bsdf_westin_sheen_blur(sc, roughness);
- break;
case CLOSURE_BSDF_HAIR_REFLECTION_ID:
case CLOSURE_BSDF_HAIR_TRANSMISSION_ID:
bsdf_hair_reflection_blur(sc, roughness);
diff --git a/intern/cycles/kernel/closure/bsdf_ashikhmin_shirley.h b/intern/cycles/kernel/closure/bsdf_ashikhmin_shirley.h
new file mode 100644
index 00000000000..ad7864cb8ea
--- /dev/null
+++ b/intern/cycles/kernel/closure/bsdf_ashikhmin_shirley.h
@@ -0,0 +1,210 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License
+ */
+
+#ifndef __BSDF_ASHIKHMIN_SHIRLEY_H__
+#define __BSDF_ASHIKHMIN_SHIRLEY_H__
+
+/*
+ASHIKHMIN SHIRLEY BSDF
+
+Implementation of
+Michael Ashikhmin and Peter Shirley: "An Anisotropic Phong BRDF Model" (2000)
+
+The Fresnel factor is missing to get a separable bsdf (intensity*color), as is
+the case with all other microfacet-based BSDF implementations in Cycles.
+
+Other than that, the implementation directly follows the paper.
+*/
+
+CCL_NAMESPACE_BEGIN
+
+ccl_device int bsdf_ashikhmin_shirley_setup(ShaderClosure *sc)
+{
+ /* Isotropic setup: clamps the roughness in data0 to [1e-4, 1], copies it to
+ * data1, tags the closure as CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ID and returns the
+ * shader flags. The roughness could already be converted to an exponent here
+ * to save some cycles in eval, but storing it as-is is more consistent with
+ * the other bsdfs and shader_blur. */
+ sc->data0 = clamp(sc->data0, 1e-4f, 1.0f);
+ sc->data1 = sc->data0; /* equal roughness on both axes, so eval/sample take their n_x == n_y path */
+
+ sc->type = CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ID;
+ return SD_BSDF | SD_BSDF_HAS_EVAL | SD_BSDF_GLOSSY;
+}
+
+ccl_device int bsdf_ashikhmin_shirley_aniso_setup(ShaderClosure *sc)
+{
+ /* Anisotropic setup: clamps the two roughness values data0 and data1 to
+ * [1e-4, 1] independently, tags the closure as
+ * CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID and returns the shader flags.
+ * The roughness could already be converted to an exponent here to save some
+ * cycles in eval, but this is more consistent with other bsdfs and shader_blur. */
+ sc->data0 = clamp(sc->data0, 1e-4f, 1.0f); /* becomes n_x in eval/sample */
+ sc->data1 = clamp(sc->data1, 1e-4f, 1.0f); /* becomes n_y in eval/sample */
+
+ sc->type = CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID;
+ return SD_BSDF | SD_BSDF_HAS_EVAL | SD_BSDF_GLOSSY;
+}
+
+ccl_device void bsdf_ashikhmin_shirley_blur(ShaderClosure *sc, float roughness)
+{
+ sc->data0 = fmaxf(roughness, sc->data0); /* raise the stored roughness to at least `roughness`; larger values are kept */
+ sc->data1 = fmaxf(roughness, sc->data1); /* same for the second axis */
+}
+
+ccl_device_inline float bsdf_ashikhmin_shirley_roughness_to_exponent(float roughness)
+{
+ return 2.0f / (roughness*roughness) - 2.0f; /* lobe exponent n = 2/r^2 - 2: r = 1 gives 0, smaller r gives a larger exponent; r must be non-zero (the setup functions clamp it to >= 1e-4) */
+}
+
+/* Evaluate the Ashikhmin-Shirley reflection lobe for a pair of directions.
+ *
+ * sc:       closure; reads N, T and the roughness pair data0/data1.
+ * I:        outgoing (viewing) direction; in Cycles/OSL convention I is omega_out.
+ * omega_in: incoming (light) direction.
+ * pdf:      receives the density of sampling omega_in. It is only written when
+ *           both directions are strictly above the horizon of N; otherwise it
+ *           is left untouched.
+ *
+ * Returns the BSDF value already multiplied by dot(N, omega_in), replicated into
+ * all three channels, or black when either direction is at or below the horizon. */
+ccl_device float3 bsdf_ashikhmin_shirley_eval_reflect(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf)
+{
+	float3 N = sc->N;
+
+	float NdotI = dot(N, I); /* in Cycles/OSL convention I is omega_out */
+	float NdotO = dot(N, omega_in); /* and consequently we use omega_in for O ;) */
+
+	float out = 0.0f;
+
+	if (NdotI > 0.0f && NdotO > 0.0f) {
+		/* keep every cosine that ends up in a denominator away from zero */
+		NdotI = fmaxf(NdotI, 1e-6f);
+		NdotO = fmaxf(NdotO, 1e-6f);
+		float3 H = normalize(omega_in + I);
+		float HdotI = fmaxf(fabsf(dot(H, I)), 1e-6f);
+		float HdotN = fmaxf(dot(H, N), 1e-6f);
+
+		/* pump from original paper (first derivative disc., but cancels the HdotI in the pdf nicely).
+		 * The d-brdf paper uses 1 / ((NdotO + NdotI) * (NdotO*NdotI)) instead. */
+		float pump = 1.0f / fmaxf(1e-6f, (HdotI*fmaxf(NdotO, NdotI)));
+
+		float n_x = bsdf_ashikhmin_shirley_roughness_to_exponent(sc->data0);
+		float n_y = bsdf_ashikhmin_shirley_roughness_to_exponent(sc->data1);
+
+		float lobe, norm;
+
+		if (n_x == n_y) { /* => isotropic case */
+			float e = n_x;
+			lobe = powf(HdotN, e);
+			norm = (n_x + 1.0f) / (8.0f * M_PI_F);
+		}
+		else { /* => ANisotropic case */
+			float3 X, Y;
+			make_orthonormals_tangent(N, sc->T, &X, &Y);
+
+			float HdotX = dot(H, X);
+			float HdotY = dot(H, Y);
+
+			if (HdotN < 1.0f) {
+				float e = (n_x * HdotX*HdotX + n_y * HdotY*HdotY) / (1.0f - HdotN*HdotN);
+				lobe = powf(HdotN, e);
+			}
+			else {
+				/* H coincides with N (or rounding pushed HdotN to 1 or above): the
+				 * exponent would be 0/0 or divided by a non-positive number. The
+				 * lobe has its peak value of 1 in that direction. */
+				lobe = 1.0f;
+			}
+
+			norm = sqrtf((n_x + 1.0f)*(n_y + 1.0f)) / (8.0f * M_PI_F);
+		}
+
+		out = NdotO * norm * lobe * pump;
+		*pdf = norm * lobe / HdotI; /* this is p_h / 4(H.I) (conversion from 'wh measure' to 'wi measure', eq. 8 in paper) */
+	}
+
+	return make_float3(out, out, out);
+}
+
+ccl_device float3 bsdf_ashikhmin_shirley_eval_transmit(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf)
+{
+ return make_float3(0.0f, 0.0f, 0.0f); /* reflection-only closure: always black, no argument is read and *pdf is left untouched */
+}
+
+ccl_device_inline void bsdf_ashikhmin_shirley_sample_first_quadrant(float n_x, float n_y, float randu, float randv, float *phi, float *cos_theta)
+{ /* Samples the half vector's spherical angles for exponents n_x/n_y; the caller mirrors *phi into the other quadrants. */
+ *phi = atanf(sqrtf((n_x + 1.0f) / (n_y + 1.0f)) * tanf(M_PI_2_F * randu)); /* azimuth: randu stretched by the exponent ratio; non-negative for randu in [0, 1) (assumes M_PI_2_F is pi/2) */
+ float cos_phi = cosf(*phi);
+ float sin_phi = sinf(*phi);
+ *cos_theta = powf(randv, 1.0f / (n_x * cos_phi*cos_phi + n_y * sin_phi*sin_phi + 1.0f)); /* polar cosine: randv^(1/(n + 1)) with n blended between n_x and n_y by the azimuth */
+}
+
+/* Sample an incoming direction for the Ashikhmin-Shirley lobe and evaluate it.
+ *
+ * A half vector is drawn around N, I is mirrored about it to obtain omega_in,
+ * and bsdf_ashikhmin_shirley_eval_reflect() then fills in eval and pdf.
+ *
+ * sc:           closure; reads N, T and the roughness pair data0/data1.
+ * Ng:           geometric normal; not used by this closure.
+ * I:            outgoing (viewing) direction.
+ * dIdx, dIdy:   differentials of I, only read with __RAY_DIFFERENTIALS__.
+ * randu, randv: random numbers driving the half vector sample; presumably
+ *               uniform in [0, 1), which the quadrant split below relies on.
+ * eval:         receives the value returned by eval_reflect.
+ * omega_in:     receives the sampled direction.
+ * domega_in_dx, domega_in_dy: receive the differentials of omega_in (only with
+ *               __RAY_DIFFERENTIALS__).
+ * pdf:          written by eval_reflect, i.e. only when omega_in ends up above
+ *               the horizon of N; otherwise left untouched.
+ *
+ * When dot(N, I) <= 0 none of the outputs are written.
+ * Always returns LABEL_REFLECT | LABEL_GLOSSY. */
+ccl_device int bsdf_ashikhmin_shirley_sample(const ShaderClosure *sc, float3 Ng, float3 I, float3 dIdx, float3 dIdy, float randu, float randv, float3 *eval, float3 *omega_in, float3 *domega_in_dx, float3 *domega_in_dy, float *pdf)
+{
+	float3 N = sc->N;
+
+	float NdotI = dot(N, I);
+	if (NdotI > 0.0f) {
+
+		float n_x = bsdf_ashikhmin_shirley_roughness_to_exponent(sc->data0);
+		float n_y = bsdf_ashikhmin_shirley_roughness_to_exponent(sc->data1);
+
+		/* get x,y basis on the surface for anisotropy */
+		float3 X, Y;
+
+		if(n_x == n_y)
+			make_orthonormals(N, &X, &Y);
+		else
+			make_orthonormals_tangent(N, sc->T, &X, &Y);
+
+		/* sample spherical coords for h in tangent space */
+		float phi;
+		float cos_theta;
+		if (n_x == n_y) { /* => simple isotropic sampling */
+			phi = M_2PI_F * randu;
+			cos_theta = powf(randv, 1.0f / (n_x + 1.0f));
+		}
+		else { /* => more complex anisotropic sampling */
+			/* randu picks one of four quadrants and is remapped to [0, 1] inside it;
+			 * the first-quadrant angle is then mirrored into the chosen quadrant */
+			if (randu < 0.25f) { /* first quadrant */
+				float remapped_randu = 4.0f * randu;
+				bsdf_ashikhmin_shirley_sample_first_quadrant(n_x, n_y, remapped_randu, randv, &phi, &cos_theta);
+			}
+			else if (randu < 0.5f) { /* second quadrant */
+				float remapped_randu = 4.0f * (.5f - randu);
+				bsdf_ashikhmin_shirley_sample_first_quadrant(n_x, n_y, remapped_randu, randv, &phi, &cos_theta);
+				phi = M_PI_F - phi;
+			}
+			else if (randu < 0.75f) { /* third quadrant */
+				float remapped_randu = 4.0f * (randu - 0.5f);
+				bsdf_ashikhmin_shirley_sample_first_quadrant(n_x, n_y, remapped_randu, randv, &phi, &cos_theta);
+				phi = M_PI_F + phi;
+			}
+			else { /* fourth quadrant */
+				float remapped_randu = 4.0f * (1.0f - randu);
+				bsdf_ashikhmin_shirley_sample_first_quadrant(n_x, n_y, remapped_randu, randv, &phi, &cos_theta);
+				phi = 2.0f * M_PI_F - phi;
+			}
+		}
+
+		/* get half vector in tangent space */
+		float sin_theta = sqrtf(fmaxf(0.0f, 1.0f - cos_theta*cos_theta));
+		float cos_phi = cosf(phi);
+		float sin_phi = sinf(phi); /* no sqrt(1-cos^2) here b/c it causes artifacts */
+		float3 h = make_float3(
+			sin_theta * cos_phi,
+			sin_theta * sin_phi,
+			cos_theta
+		);
+
+		/* half vector to world space */
+		float3 H = h.x*X + h.y*Y + h.z*N;
+		float HdotI = dot(H, I);
+		if (HdotI < 0.0f) {
+			/* flip H onto the side of I and keep the cached cosine in sync with it;
+			 * otherwise the expression below is no longer a mirror reflection and
+			 * yields a direction that is not unit length */
+			H = -H;
+			HdotI = -HdotI;
+		}
+
+		/* reflect I on H to get omega_in. A half vector that faced away from I
+		 * mirrors I below the surface, which eval_reflect answers with black. */
+		*omega_in = -I + (2.0f * HdotI) * H;
+
+		/* leave the rest to eval_reflect */
+		/* (could maybe optimize a few things by manual inlining, but I doubt it would make much difference) */
+		*eval = bsdf_ashikhmin_shirley_eval_reflect(sc, I, *omega_in, pdf);
+
+#ifdef __RAY_DIFFERENTIALS__
+		/* just do the reflection thing for now */
+		*domega_in_dx = (2.0f * dot(N, dIdx)) * N - dIdx;
+		*domega_in_dy = (2.0f * dot(N, dIdy)) * N - dIdy;
+#endif
+	}
+
+	return LABEL_REFLECT | LABEL_GLOSSY;
+}
+
+
+CCL_NAMESPACE_END
+
+#endif /* __BSDF_ASHIKHMIN_SHIRLEY_H__ */
diff --git a/intern/cycles/kernel/closure/bsdf_hair.h b/intern/cycles/kernel/closure/bsdf_hair.h
index 19cdb773255..e0b5454592b 100644
--- a/intern/cycles/kernel/closure/bsdf_hair.h
+++ b/intern/cycles/kernel/closure/bsdf_hair.h
@@ -63,7 +63,7 @@ ccl_device int bsdf_hair_transmission_setup(ShaderClosure *sc)
ccl_device float3 bsdf_hair_reflection_eval_reflect(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf)
{
#ifdef __HAIR__
- float offset = sc->offset;
+ float offset = sc->data2;
float3 Tg = sc->T;
#else
float offset = 0.0f;
@@ -120,7 +120,7 @@ ccl_device float3 bsdf_hair_reflection_eval_transmit(const ShaderClosure *sc, co
ccl_device float3 bsdf_hair_transmission_eval_transmit(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf)
{
#ifdef __HAIR__
- float offset = sc->offset;
+ float offset = sc->data2;
float3 Tg = sc->T;
#else
float offset = 0.0f;
@@ -166,7 +166,7 @@ ccl_device float3 bsdf_hair_transmission_eval_transmit(const ShaderClosure *sc,
ccl_device int bsdf_hair_reflection_sample(const ShaderClosure *sc, float3 Ng, float3 I, float3 dIdx, float3 dIdy, float randu, float randv, float3 *eval, float3 *omega_in, float3 *domega_in_dx, float3 *domega_in_dy, float *pdf)
{
#ifdef __HAIR__
- float offset = sc->offset;
+ float offset = sc->data2;
float3 Tg = sc->T;
#else
float offset = 0.0f;
@@ -221,7 +221,7 @@ ccl_device int bsdf_hair_reflection_sample(const ShaderClosure *sc, float3 Ng, f
ccl_device int bsdf_hair_transmission_sample(const ShaderClosure *sc, float3 Ng, float3 I, float3 dIdx, float3 dIdy, float randu, float randv, float3 *eval, float3 *omega_in, float3 *domega_in_dx, float3 *domega_in_dy, float *pdf)
{
#ifdef __HAIR__
- float offset = sc->offset;
+ float offset = sc->data2;
float3 Tg = sc->T;
#else
float offset = 0.0f;
diff --git a/intern/cycles/kernel/closure/bsdf_microfacet.h b/intern/cycles/kernel/closure/bsdf_microfacet.h
index 1ec35e444fe..8737b0e2d94 100644
--- a/intern/cycles/kernel/closure/bsdf_microfacet.h
+++ b/intern/cycles/kernel/closure/bsdf_microfacet.h
@@ -35,20 +35,293 @@
CCL_NAMESPACE_BEGIN
-/* GGX */
+/* Approximate erf and erfinv implementations.
+ * Implementation comes straight from Wikipedia:
+ *
+ * http://en.wikipedia.org/wiki/Error_function
+ *
+ * Some constants are baked into the code.
+ */
+
+ccl_device_inline float approx_erff_do(float x)
+{
+ /* Approximates erf(x) for x >= 0 (approx_erff only passes non-negative values).
+ * Returning 1 for x > 3 barely distorts the output value and gives quite a
+ * bit of a speedup. */
+ if(x > 3.0f) {
+ return 1.0f;
+ }
+ float t = 1.0f / (1.0f + 0.47047f*x); /* t = 1/(1 + p*x) with p = 0.47047 */
+ return (1.0f -
+ t*(0.3480242f + t*(-0.0958798f + t*0.7478556f)) * expf(-x*x)); /* 1 - (a1*t + a2*t^2 + a3*t^3)*exp(-x^2), polynomial in Horner form */
+}
+
+ccl_device_inline float approx_erff(float x)
+{ /* Approximate erf(x) for any sign of x, built on the non-negative helper. */
+ if(x >= 0.0f) {
+ return approx_erff_do(x);
+ }
+ else {
+ return -approx_erff_do(-x); /* erf is odd: erf(-x) = -erf(x) */
+ }
+}
+
+ccl_device_inline float approx_erfinvf_do(float x)
+{ /* Approximate inverse error function for x >= 0 (approx_erfinvf only passes non-negative values). */
+ if(x <= 0.7f) { /* central range: x times a ratio of two polynomials in x^2 */
+ const float x2 = x * x;
+ const float a1 = 0.886226899f;
+ const float a2 = -1.645349621f;
+ const float a3 = 0.914624893f;
+ const float a4 = -0.140543331f;
+ const float b1 = -2.118377725f;
+ const float b2 = 1.442710462f;
+ const float b3 = -0.329097515f;
+ const float b4 = 0.012229801f;
+ return x * (((a4 * x2 + a3) * x2 + a2) * x2 + a1) /
+ ((((b4 * x2 + b3) * x2 + b2) * x2 + b1) * x2 + 1.0f);
+ }
+ else { /* tail: ratio of two polynomials in z */
+ const float c1 = -1.970840454f;
+ const float c2 = -1.624906493f;
+ const float c3 = 3.429567803f;
+ const float c4 = 1.641345311f;
+ const float d1 = 3.543889200f;
+ const float d2 = 1.637067800f;
+ const float z = sqrtf(-logf((1.0f - x) * 0.5f)); /* NOTE(review): x == 1 feeds 0 to logf; callers presumably keep x < 1 - confirm */
+ return (((c4 * z + c3) * z + c2) * z + c1) /
+ ((d2 * z + d1) * z + 1.0f);
+ }
+}
+
+ccl_device_inline float approx_erfinvf(float x)
+{ /* Approximate inverse error function for any sign of x, built on the non-negative helper. */
+ if(x >= 0.0f) {
+ return approx_erfinvf_do(x);
+ }
+ else {
+ return -approx_erfinvf_do(-x); /* odd symmetry: erfinv(-x) = -erfinv(x) */
+ }
+}
+
+/* Beckmann and GGX microfacet importance sampling from:
+ *
+ * Importance Sampling Microfacet-Based BSDFs using the Distribution of Visible Normals.
+ * E. Heitz and E. d'Eon, EGSR 2014 */
+
+ccl_device_inline void microfacet_beckmann_sample_slopes(
+ KernelGlobals *kg,
+ const float cos_theta_i, const float sin_theta_i,
+ float randu, float randv, float *slope_x, float *slope_y,
+ float *G1i)
+{
+ /* Samples a pair of Beckmann microfacet slopes for an incident direction
+ * given by cos_theta_i/sin_theta_i and writes the masking term to *G1i.
+ * kg is only used for the table lookup in the active #else branch below.
+ *
+ * special case (normal incidence): both slopes come from one polar sample and
+ * there is no masking. NOTE(review): randu == 0 feeds 0 to logf here. */
+ if(cos_theta_i >= 0.99999f) {
+ const float r = sqrtf(-logf(randu));
+ const float phi = M_2PI_F * randv;
+ *slope_x = r * cosf(phi);
+ *slope_y = r * sinf(phi);
+ *G1i = 1.0f;
+ return;
+ }
+
+ /* precomputations */
+ const float tan_theta_i = sin_theta_i/cos_theta_i;
+ const float inv_a = tan_theta_i;
+ const float a = 1.0f/inv_a; /* a = 1/tan(theta_i) */
+ const float erf_a = approx_erff(a);
+ const float exp_a2 = expf(-a*a);
+ const float SQRT_PI_INV = 0.56418958354f; /* 1/sqrt(pi) */
+ const float Lambda = 0.5f*(erf_a - 1.0f) + (0.5f*SQRT_PI_INV)*(exp_a2*inv_a);
+ const float G1 = 1.0f/(1.0f + Lambda); /* masking */
+
+ *G1i = G1;
+
+#if 0
+ const float C = 1.0f - G1 * erf_a;
+
+ /* sample slope X */
+ if(randu < C) {
+ /* rescale randu */
+ randu = randu / C;
+ const float w_1 = 0.5f * SQRT_PI_INV * sin_theta_i * exp_a2;
+ const float w_2 = cos_theta_i * (0.5f - 0.5f*erf_a);
+ const float p = w_1 / (w_1 + w_2);
+
+ if(randu < p) {
+ randu = randu / p;
+ *slope_x = -sqrtf(-logf(randu*exp_a2));
+ }
+ else {
+ randu = (randu - p) / (1.0f - p);
+ *slope_x = approx_erfinvf(randu - 1.0f - randu*erf_a);
+ }
+ }
+ else {
+ /* rescale randu */
+ randu = (randu - C) / (1.0f - C);
+ *slope_x = approx_erfinvf((-1.0f + 2.0f*randu)*erf_a);
+
+ const float p = (-(*slope_x)*sin_theta_i + cos_theta_i) / (2.0f*cos_theta_i);
+
+ if(randv > p) {
+ *slope_x = -(*slope_x);
+ randv = (randv - p) / (1.0f - p);
+ }
+ else
+ randv = randv / p;
+ }
+
+ /* sample slope Y */
+ *slope_y = approx_erfinvf(2.0f*randv - 1.0f);
+#else
+ /* use precomputed table, because it better preserves stratification
+ * of the random number pattern; the analytic sampling in the #if 0 branch
+ * above is compiled out */
+ int beckmann_table_offset = kernel_data.tables.beckmann_offset;
+
+ *slope_x = lookup_table_read_2D(kg, randu, cos_theta_i,
+ beckmann_table_offset, BECKMANN_TABLE_SIZE, BECKMANN_TABLE_SIZE); /* table indexed by (randu, cos_theta_i) */
+ *slope_y = approx_erfinvf(2.0f*randv - 1.0f); /* randv remapped from [0, 1] to [-1, 1] */
+#endif
+
+}
+
+ccl_device_inline void microfacet_ggx_sample_slopes(
+ const float cos_theta_i, const float sin_theta_i,
+ float randu, float randv, float *slope_x, float *slope_y,
+ float *G1i)
+{
+ /* Samples a pair of GGX microfacet slopes for an incident direction given
+ * by cos_theta_i/sin_theta_i and writes the masking term to *G1i.
+ *
+ * special case (normal incidence): both slopes come from one polar sample and
+ * there is no masking. NOTE(review): randu == 1 divides by zero here. */
+ if(cos_theta_i >= 0.99999f) {
+ const float r = sqrtf(randu/(1.0f - randu));
+ const float phi = M_2PI_F * randv;
+ *slope_x = r * cosf(phi);
+ *slope_y = r * sinf(phi);
+ *G1i = 1.0f;
+
+ return;
+ }
+
+ /* precomputations */
+ const float tan_theta_i = sin_theta_i/cos_theta_i;
+ const float G1_inv = 0.5f * (1.0f + safe_sqrtf(1.0f + tan_theta_i*tan_theta_i));
+
+ *G1i = 1.0f/G1_inv;
+
+ /* sample slope_x: two candidate roots B*tmp -/+ D, one of which is selected below */
+ const float A = 2.0f*randu*G1_inv - 1.0f;
+ const float AA = A*A;
+ const float tmp = 1.0f/(AA - 1.0f);
+ const float B = tan_theta_i;
+ const float BB = B*B;
+ const float D = safe_sqrtf(BB*(tmp*tmp) - (AA - BB)*tmp);
+ const float slope_x_1 = B*tmp - D;
+ const float slope_x_2 = B*tmp + D;
+ *slope_x = (A < 0.0f || slope_x_2*tan_theta_i > 1.0f)? slope_x_1: slope_x_2;
+
+ /* sample slope_y */
+ float S; /* sign of slope_y, chosen by which half of [0, 1] randv falls in */
+
+ if(randv > 0.5f) {
+ S = 1.0f;
+ randv = 2.0f*(randv - 0.5f); /* remap (0.5, 1] to (0, 1] */
+ }
+ else {
+ S = -1.0f;
+ randv = 2.0f*(0.5f - randv); /* remap [0, 0.5] to [1, 0] */
+ }
+
+ const float z = (randv*(randv*(randv*0.27385f - 0.73369f) + 0.46341f)) / (randv*(randv*(randv*0.093073f + 0.309420f) - 1.000000f) + 0.597999f); /* ratio of two cubics in randv; presumably the fit from the Heitz and d'Eon paper cited above this function - confirm */
+ *slope_y = S * z * safe_sqrtf(1.0f + (*slope_x)*(*slope_x));
+}
+
+ccl_device_inline float3 microfacet_sample_stretched(
+ KernelGlobals *kg, const float3 omega_i,
+ const float alpha_x, const float alpha_y,
+ const float randu, const float randv,
+ bool beckmann, float *G1i)
+{
+ /* Samples a microfacet normal for roughness alpha_x/alpha_y as seen from
+ * omega_i, and returns it normalized in the same local frame as omega_i
+ * (z is the axis the slopes are measured against). `beckmann` selects the
+ * Beckmann slope sampler, otherwise GGX is used; kg is only forwarded to
+ * the Beckmann sampler. *G1i receives the masking term from the sampler.
+ *
+ * 1. stretch omega_i */
+ float3 omega_i_ = make_float3(alpha_x * omega_i.x, alpha_y * omega_i.y, omega_i.z);
+ omega_i_ = normalize(omega_i_);
+
+ /* get polar coordinates of omega_i_; the defaults describe normal incidence
+ * and are kept when omega_i_.z is (almost) 1 */
+ float costheta_ = 1.0f;
+ float sintheta_ = 0.0f;
+ float cosphi_ = 1.0f;
+ float sinphi_ = 0.0f;
+
+ if(omega_i_.z < 0.99999f) {
+ costheta_ = omega_i_.z;
+ sintheta_ = safe_sqrtf(1.0f - costheta_*costheta_);
+
+ float invlen = 1.0f/sintheta_;
+ cosphi_ = omega_i_.x * invlen;
+ sinphi_ = omega_i_.y * invlen;
+ }
+
+ /* 2. sample P22_{omega_i}(x_slope, y_slope, 1, 1) */
+ float slope_x, slope_y;
+
+ if(beckmann) {
+ microfacet_beckmann_sample_slopes(kg, costheta_, sintheta_,
+ randu, randv, &slope_x, &slope_y, G1i);
+ }
+ else {
+ microfacet_ggx_sample_slopes(costheta_, sintheta_,
+ randu, randv, &slope_x, &slope_y, G1i);
+ }
+
+ /* 3. rotate the slopes by the azimuth of omega_i_ */
+ float tmp = cosphi_*slope_x - sinphi_*slope_y; /* temporary so slope_y below still reads the unrotated slope_x */
+ slope_y = sinphi_*slope_x + cosphi_*slope_y;
+ slope_x = tmp;
+
+ /* 4. unstretch */
+ slope_x = alpha_x * slope_x;
+ slope_y = alpha_y * slope_y;
+
+ /* 5. compute normal */
+ return normalize(make_float3(-slope_x, -slope_y, 1.0f));
+}
+
+/* GGX microfacet with Smith shadow-masking from:
+ *
+ * Microfacet Models for Refraction through Rough Surfaces
+ * B. Walter, S. R. Marschner, H. Li, K. E. Torrance, EGSR 2007
+ *
+ * Anisotropic from:
+ *
+ * Understanding the Masking-Shadowing Function in Microfacet-Based BRDFs
+ * E. Heitz, Research Report 2014
+ *
+ * Anisotropy is only supported for reflection currently, but adding it for
+ * transmission is just a matter of copying code from reflection if needed. */
ccl_device int bsdf_microfacet_ggx_setup(ShaderClosure *sc)
{
- sc->data0 = clamp(sc->data0, 0.0f, 1.0f); /* m_ag */
+ sc->data0 = clamp(sc->data0, 0.0f, 1.0f); /* alpha_x */
+ sc->data1 = sc->data0; /* alpha_y */
sc->type = CLOSURE_BSDF_MICROFACET_GGX_ID;
return SD_BSDF|SD_BSDF_HAS_EVAL|SD_BSDF_GLOSSY;
}
+ccl_device int bsdf_microfacet_ggx_aniso_setup(ShaderClosure *sc)
+{ /* Anisotropic GGX setup: clamps both roughness values, tags the closure type and returns the shader flags. */
+ sc->data0 = clamp(sc->data0, 0.0f, 1.0f); /* alpha_x, clamped to [0, 1] */
+ sc->data1 = clamp(sc->data1, 0.0f, 1.0f); /* alpha_y, clamped to [0, 1] independently of alpha_x */
+
+ sc->type = CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID;
+
+ return SD_BSDF|SD_BSDF_HAS_EVAL|SD_BSDF_GLOSSY;
+}
+
ccl_device int bsdf_microfacet_ggx_refraction_setup(ShaderClosure *sc)
{
- sc->data0 = clamp(sc->data0, 0.0f, 1.0f); /* m_ag */
+ sc->data0 = clamp(sc->data0, 0.0f, 1.0f); /* alpha_x */
+ sc->data1 = sc->data0; /* alpha_y */
sc->type = CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID;
@@ -57,136 +330,250 @@ ccl_device int bsdf_microfacet_ggx_refraction_setup(ShaderClosure *sc)
ccl_device void bsdf_microfacet_ggx_blur(ShaderClosure *sc, float roughness)
{
- sc->data0 = fmaxf(roughness, sc->data0); /* m_ag */
+ sc->data0 = fmaxf(roughness, sc->data0); /* alpha_x */
+ sc->data1 = fmaxf(roughness, sc->data1); /* alpha_y */
}
ccl_device float3 bsdf_microfacet_ggx_eval_reflect(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf)
{
- float m_ag = max(sc->data0, 1e-4f);
+ float alpha_x = sc->data0;
+ float alpha_y = sc->data1;
int m_refractive = sc->type == CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID;
float3 N = sc->N;
- if(m_refractive || m_ag <= 1e-4f)
- return make_float3 (0, 0, 0);
+ if(m_refractive || fmaxf(alpha_x, alpha_y) <= 1e-4f)
+ return make_float3(0, 0, 0);
+
float cosNO = dot(N, I);
float cosNI = dot(N, omega_in);
+
if(cosNI > 0 && cosNO > 0) {
- // get half vector
- float3 Hr = normalize(omega_in + I);
- // eq. 20: (F*G*D)/(4*in*on)
- // eq. 33: first we calculate D(m) with m=Hr:
- float alpha2 = m_ag * m_ag;
- float cosThetaM = dot(N, Hr);
- float cosThetaM2 = cosThetaM * cosThetaM;
- float tanThetaM2 = (1 - cosThetaM2) / cosThetaM2;
- float cosThetaM4 = cosThetaM2 * cosThetaM2;
- float D = alpha2 / (M_PI_F * cosThetaM4 * (alpha2 + tanThetaM2) * (alpha2 + tanThetaM2));
- // eq. 34: now calculate G1(i,m) and G1(o,m)
- float G1o = 2 / (1 + safe_sqrtf(1 + alpha2 * (1 - cosNO * cosNO) / (cosNO * cosNO)));
- float G1i = 2 / (1 + safe_sqrtf(1 + alpha2 * (1 - cosNI * cosNI) / (cosNI * cosNI)));
+ /* get half vector */
+ float3 m = normalize(omega_in + I);
+ float alpha2 = alpha_x * alpha_y;
+ float D, G1o, G1i;
+
+ if(alpha_x == alpha_y) {
+ /* isotropic
+ * eq. 20: (F*G*D)/(4*in*on)
+ * eq. 33: first we calculate D(m) */
+ float cosThetaM = dot(N, m);
+ float cosThetaM2 = cosThetaM * cosThetaM;
+ float cosThetaM4 = cosThetaM2 * cosThetaM2;
+ float tanThetaM2 = (1 - cosThetaM2) / cosThetaM2;
+ D = alpha2 / (M_PI_F * cosThetaM4 * (alpha2 + tanThetaM2) * (alpha2 + tanThetaM2));
+
+ /* eq. 34: now calculate G1(i,m) and G1(o,m) */
+ G1o = 2 / (1 + safe_sqrtf(1 + alpha2 * (1 - cosNO * cosNO) / (cosNO * cosNO)));
+ G1i = 2 / (1 + safe_sqrtf(1 + alpha2 * (1 - cosNI * cosNI) / (cosNI * cosNI)));
+ }
+ else {
+ /* anisotropic */
+ float3 X, Y, Z = N;
+ make_orthonormals_tangent(Z, sc->T, &X, &Y);
+
+ /* distribution */
+ float3 local_m = make_float3(dot(X, m), dot(Y, m), dot(Z, m));
+ float slope_x = -local_m.x/(local_m.z*alpha_x);
+ float slope_y = -local_m.y/(local_m.z*alpha_y);
+ float slope_len = 1 + slope_x*slope_x + slope_y*slope_y;
+
+ float cosThetaM = local_m.z;
+ float cosThetaM2 = cosThetaM * cosThetaM;
+ float cosThetaM4 = cosThetaM2 * cosThetaM2;
+
+ D = 1 / ((slope_len * slope_len) * M_PI_F * alpha2 * cosThetaM4);
+
+ /* G1(i,m) and G1(o,m) */
+ float tanThetaO2 = (1 - cosNO * cosNO) / (cosNO * cosNO);
+ float cosPhiO = dot(I, X);
+ float sinPhiO = dot(I, Y);
+
+ float alphaO2 = (cosPhiO*cosPhiO)*(alpha_x*alpha_x) + (sinPhiO*sinPhiO)*(alpha_y*alpha_y);
+ alphaO2 /= cosPhiO*cosPhiO + sinPhiO*sinPhiO;
+
+ G1o = 2 / (1 + safe_sqrtf(1 + alphaO2 * tanThetaO2));
+
+ float tanThetaI2 = (1 - cosNI * cosNI) / (cosNI * cosNI);
+ float cosPhiI = dot(omega_in, X);
+ float sinPhiI = dot(omega_in, Y);
+
+ float alphaI2 = (cosPhiI*cosPhiI)*(alpha_x*alpha_x) + (sinPhiI*sinPhiI)*(alpha_y*alpha_y);
+ alphaI2 /= cosPhiI*cosPhiI + sinPhiI*sinPhiI;
+
+ G1i = 2 / (1 + safe_sqrtf(1 + alphaI2 * tanThetaI2));
+ }
+
float G = G1o * G1i;
- float out = (G * D) * 0.25f / cosNO;
- // eq. 24
- float pm = D * cosThetaM;
- // convert into pdf of the sampled direction
- // eq. 38 - but see also:
- // eq. 17 in http://www.graphics.cornell.edu/~bjw/wardnotes.pdf
- *pdf = pm * 0.25f / dot(Hr, I);
- return make_float3 (out, out, out);
+
+ /* eq. 20 */
+ float common = D * 0.25f / cosNO;
+ float out = G * common;
+
+ /* eq. 2 in distribution of visible normals sampling
+ * pm = Dw = G1o * dot(m, I) * D / dot(N, I); */
+
+ /* eq. 38 - but see also:
+ * eq. 17 in http://www.graphics.cornell.edu/~bjw/wardnotes.pdf
+ * pdf = pm * 0.25 / dot(m, I); */
+ *pdf = G1o * common;
+
+ return make_float3(out, out, out);
}
- return make_float3 (0, 0, 0);
+
+ return make_float3(0, 0, 0);
}
ccl_device float3 bsdf_microfacet_ggx_eval_transmit(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf)
{
- float m_ag = max(sc->data0, 1e-4f);
- float m_eta = sc->data1;
+ float alpha_x = sc->data0;
+ float alpha_y = sc->data1;
+ float m_eta = sc->data2;
int m_refractive = sc->type == CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID;
float3 N = sc->N;
- if(!m_refractive || m_ag <= 1e-4f)
- return make_float3 (0, 0, 0);
+ if(!m_refractive || fmaxf(alpha_x, alpha_y) <= 1e-4f)
+ return make_float3(0, 0, 0);
+
float cosNO = dot(N, I);
float cosNI = dot(N, omega_in);
+
if(cosNO <= 0 || cosNI >= 0)
- return make_float3 (0, 0, 0); // vectors on same side -- not possible
- // compute half-vector of the refraction (eq. 16)
+ return make_float3(0, 0, 0); /* vectors on same side -- not possible */
+
+ /* compute half-vector of the refraction (eq. 16) */
float3 ht = -(m_eta * omega_in + I);
float3 Ht = normalize(ht);
float cosHO = dot(Ht, I);
-
float cosHI = dot(Ht, omega_in);
- // eq. 33: first we calculate D(m) with m=Ht:
- float alpha2 = m_ag * m_ag;
+
+ /* these situations make the chi+ terms in eq. 33, 34 zero */
+ if(dot(Ht, N) <= 0.0f || cosHO * cosNO <= 0.0f || cosHI * cosNI <= 0.0f)
+ return make_float3(0.0f, 0.0f, 0.0f);
+
+ float D, G1o, G1i;
+
+ /* eq. 33: first we calculate D(m) with m=Ht: */
+ float alpha2 = alpha_x * alpha_y;
float cosThetaM = dot(N, Ht);
float cosThetaM2 = cosThetaM * cosThetaM;
float tanThetaM2 = (1 - cosThetaM2) / cosThetaM2;
float cosThetaM4 = cosThetaM2 * cosThetaM2;
- float D = alpha2 / (M_PI_F * cosThetaM4 * (alpha2 + tanThetaM2) * (alpha2 + tanThetaM2));
- // eq. 34: now calculate G1(i,m) and G1(o,m)
- float G1o = 2 / (1 + safe_sqrtf(1 + alpha2 * (1 - cosNO * cosNO) / (cosNO * cosNO)));
- float G1i = 2 / (1 + safe_sqrtf(1 + alpha2 * (1 - cosNI * cosNI) / (cosNI * cosNI)));
+ D = alpha2 / (M_PI_F * cosThetaM4 * (alpha2 + tanThetaM2) * (alpha2 + tanThetaM2));
+
+ /* eq. 34: now calculate G1(i,m) and G1(o,m) */
+ G1o = 2 / (1 + safe_sqrtf(1 + alpha2 * (1 - cosNO * cosNO) / (cosNO * cosNO)));
+ G1i = 2 / (1 + safe_sqrtf(1 + alpha2 * (1 - cosNI * cosNI) / (cosNI * cosNI)));
+
float G = G1o * G1i;
- // probability
- float invHt2 = 1 / dot(ht, ht);
- *pdf = D * fabsf(cosThetaM) * (fabsf(cosHI) * (m_eta * m_eta)) * invHt2;
- float out = (fabsf(cosHI * cosHO) * (m_eta * m_eta) * (G * D) * invHt2) / cosNO;
- return make_float3 (out, out, out);
+
+ /* probability */
+ float Ht2 = dot(ht, ht);
+
+ /* eq. 2 in distribution of visible normals sampling
+ * pm = Dw = G1o * dot(m, I) * D / dot(N, I); */
+
+ /* out = fabsf(cosHI * cosHO) * (m_eta * m_eta) * G * D / (cosNO * Ht2)
+ * pdf = pm * (m_eta * m_eta) * fabsf(cosHI) / Ht2 */
+ float common = D * (m_eta * m_eta) / (cosNO * Ht2);
+ float out = G * fabsf(cosHI * cosHO) * common;
+ *pdf = G1o * cosHO * fabsf(cosHI) * common;
+
+ return make_float3(out, out, out);
}
-ccl_device int bsdf_microfacet_ggx_sample(const ShaderClosure *sc, float3 Ng, float3 I, float3 dIdx, float3 dIdy, float randu, float randv, float3 *eval, float3 *omega_in, float3 *domega_in_dx, float3 *domega_in_dy, float *pdf)
+ccl_device int bsdf_microfacet_ggx_sample(KernelGlobals *kg, const ShaderClosure *sc, float3 Ng, float3 I, float3 dIdx, float3 dIdy, float randu, float randv, float3 *eval, float3 *omega_in, float3 *domega_in_dx, float3 *domega_in_dy, float *pdf)
{
- float m_ag = sc->data0;
+ float alpha_x = sc->data0;
+ float alpha_y = sc->data1;
int m_refractive = sc->type == CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID;
float3 N = sc->N;
float cosNO = dot(N, I);
if(cosNO > 0) {
float3 X, Y, Z = N;
- make_orthonormals(Z, &X, &Y);
- // generate a random microfacet normal m
- // eq. 35,36:
- // we take advantage of cos(atan(x)) == 1/sqrt(1+x^2)
- //tttt and sin(atan(x)) == x/sqrt(1+x^2)
- float alpha2 = m_ag * m_ag;
- float tanThetaM2 = alpha2 * randu / (1 - randu);
- float cosThetaM = 1 / safe_sqrtf(1 + tanThetaM2);
- float sinThetaM = cosThetaM * safe_sqrtf(tanThetaM2);
- float phiM = M_2PI_F * randv;
- float3 m = (cosf(phiM) * sinThetaM) * X +
- (sinf(phiM) * sinThetaM) * Y +
- ( cosThetaM) * Z;
+
+ if(alpha_x == alpha_y)
+ make_orthonormals(Z, &X, &Y);
+ else
+ make_orthonormals_tangent(Z, sc->T, &X, &Y);
+
+ /* importance sampling with distribution of visible normals. vectors are
+ * transformed to local space before and after */
+ float3 local_I = make_float3(dot(X, I), dot(Y, I), cosNO);
+ float3 local_m;
+ float G1o;
+
+ local_m = microfacet_sample_stretched(kg, local_I, alpha_x, alpha_y,
+ randu, randv, false, &G1o);
+
+ float3 m = X*local_m.x + Y*local_m.y + Z*local_m.z;
+ float cosThetaM = local_m.z;
+
+ /* reflection or refraction? */
if(!m_refractive) {
float cosMO = dot(m, I);
+
if(cosMO > 0) {
- // eq. 39 - compute actual reflected direction
+ /* eq. 39 - compute actual reflected direction */
*omega_in = 2 * cosMO * m - I;
+
if(dot(Ng, *omega_in) > 0) {
- if (m_ag <= 1e-4f) {
- // some high number for MIS
+ if(fmaxf(alpha_x, alpha_y) <= 1e-4f) {
+ /* some high number for MIS */
*pdf = 1e6f;
*eval = make_float3(1e6f, 1e6f, 1e6f);
}
else {
- // microfacet normal is visible to this ray
- // eq. 33
- float cosThetaM2 = cosThetaM * cosThetaM;
- float cosThetaM4 = cosThetaM2 * cosThetaM2;
- float D = alpha2 / (M_PI_F * cosThetaM4 * (alpha2 + tanThetaM2) * (alpha2 + tanThetaM2));
- // eq. 24
- float pm = D * cosThetaM;
- // convert into pdf of the sampled direction
- // eq. 38 - but see also:
- // eq. 17 in http://www.graphics.cornell.edu/~bjw/wardnotes.pdf
- *pdf = pm * 0.25f / cosMO;
- // eval BRDF*cosNI
- float cosNI = dot(N, *omega_in);
- // eq. 34: now calculate G1(i,m) and G1(o,m)
- float G1o = 2 / (1 + safe_sqrtf(1 + alpha2 * (1 - cosNO * cosNO) / (cosNO * cosNO)));
- float G1i = 2 / (1 + safe_sqrtf(1 + alpha2 * (1 - cosNI * cosNI) / (cosNI * cosNI)));
- float G = G1o * G1i;
- // eq. 20: (F*G*D)/(4*in*on)
- float out = (G * D) * 0.25f / cosNO;
+ /* microfacet normal is visible to this ray */
+ /* eq. 33 */
+ float alpha2 = alpha_x * alpha_y;
+ float D, G1i;
+
+ if(alpha_x == alpha_y) {
+ /* isotropic */
+ float cosThetaM2 = cosThetaM * cosThetaM;
+ float cosThetaM4 = cosThetaM2 * cosThetaM2;
+ float tanThetaM2 = 1/(cosThetaM2) - 1;
+ D = alpha2 / (M_PI_F * cosThetaM4 * (alpha2 + tanThetaM2) * (alpha2 + tanThetaM2));
+
+ /* eval BRDF*cosNI */
+ float cosNI = dot(N, *omega_in);
+
+ /* eq. 34: now calculate G1(i,m) */
+ G1i = 2 / (1 + safe_sqrtf(1 + alpha2 * (1 - cosNI * cosNI) / (cosNI * cosNI)));
+ }
+ else {
+ /* anisotropic distribution */
+ float3 local_m = make_float3(dot(X, m), dot(Y, m), dot(Z, m));
+ float slope_x = -local_m.x/(local_m.z*alpha_x);
+ float slope_y = -local_m.y/(local_m.z*alpha_y);
+ float slope_len = 1 + slope_x*slope_x + slope_y*slope_y;
+
+ float cosThetaM = local_m.z;
+ float cosThetaM2 = cosThetaM * cosThetaM;
+ float cosThetaM4 = cosThetaM2 * cosThetaM2;
+
+ D = 1 / ((slope_len * slope_len) * M_PI_F * alpha2 * cosThetaM4);
+
+ /* calculate G1(i,m) */
+ float cosNI = dot(N, *omega_in);
+
+ float tanThetaI2 = (1 - cosNI * cosNI) / (cosNI * cosNI);
+ float cosPhiI = dot(*omega_in, X);
+ float sinPhiI = dot(*omega_in, Y);
+
+ float alphaI2 = (cosPhiI*cosPhiI)*(alpha_x*alpha_x) + (sinPhiI*sinPhiI)*(alpha_y*alpha_y);
+ alphaI2 /= cosPhiI*cosPhiI + sinPhiI*sinPhiI;
+
+ G1i = 2 / (1 + safe_sqrtf(1 + alphaI2 * tanThetaI2));
+ }
+
+ /* see eval function for derivation */
+ float common = (G1o * D) * 0.25f / cosNO;
+ float out = G1i * common;
+ *pdf = common;
+
*eval = make_float3(out, out, out);
}
@@ -198,14 +585,15 @@ ccl_device int bsdf_microfacet_ggx_sample(const ShaderClosure *sc, float3 Ng, fl
}
}
else {
- // CAUTION: the i and o variables are inverted relative to the paper
- // eq. 39 - compute actual refractive direction
+ /* CAUTION: the i and o variables are inverted relative to the paper
+ * eq. 39 - compute actual refractive direction */
float3 R, T;
#ifdef __RAY_DIFFERENTIALS__
float3 dRdx, dRdy, dTdx, dTdy;
#endif
- float m_eta = sc->data1;
+ float m_eta = sc->data2;
bool inside;
+
fresnel_dielectric(m_eta, m, I, &R, &T,
#ifdef __RAY_DIFFERENTIALS__
dIdx, dIdy, &dRdx, &dRdy, &dTdx, &dTdy,
@@ -213,38 +601,43 @@ ccl_device int bsdf_microfacet_ggx_sample(const ShaderClosure *sc, float3 Ng, fl
&inside);
if(!inside) {
+
*omega_in = T;
#ifdef __RAY_DIFFERENTIALS__
*domega_in_dx = dTdx;
*domega_in_dy = dTdy;
#endif
- if (m_ag <= 1e-4f || fabsf(m_eta - 1.0f) < 1e-4f) {
- // some high number for MIS
+ if(fmaxf(alpha_x, alpha_y) <= 1e-4f || fabsf(m_eta - 1.0f) < 1e-4f) {
+ /* some high number for MIS */
*pdf = 1e6f;
*eval = make_float3(1e6f, 1e6f, 1e6f);
}
else {
- // eq. 33
+ /* eq. 33 */
+ float alpha2 = alpha_x * alpha_y;
float cosThetaM2 = cosThetaM * cosThetaM;
float cosThetaM4 = cosThetaM2 * cosThetaM2;
+ float tanThetaM2 = 1/(cosThetaM2) - 1;
float D = alpha2 / (M_PI_F * cosThetaM4 * (alpha2 + tanThetaM2) * (alpha2 + tanThetaM2));
- // eq. 24
- float pm = D * cosThetaM;
- // eval BRDF*cosNI
+
+ /* eval BRDF*cosNI */
float cosNI = dot(N, *omega_in);
- // eq. 34: now calculate G1(i,m) and G1(o,m)
- float G1o = 2 / (1 + safe_sqrtf(1 + alpha2 * (1 - cosNO * cosNO) / (cosNO * cosNO)));
+
+ /* eq. 34: now calculate G1(i,m) */
float G1i = 2 / (1 + safe_sqrtf(1 + alpha2 * (1 - cosNI * cosNI) / (cosNI * cosNI)));
- float G = G1o * G1i;
- // eq. 21
+
+ /* eq. 21 */
float cosHI = dot(m, *omega_in);
float cosHO = dot(m, I);
float Ht2 = m_eta * cosHI + cosHO;
Ht2 *= Ht2;
- float out = (fabsf(cosHI * cosHO) * (m_eta * m_eta) * (G * D)) / (cosNO * Ht2);
- // eq. 38 and eq. 17
- *pdf = pm * (m_eta * m_eta) * fabsf(cosHI) / Ht2;
+
+ /* see eval function for derivation */
+ float common = (G1o * D) * (m_eta * m_eta) / (cosNO * Ht2);
+ float out = G1i * fabsf(cosHI * cosHO) * common;
+ *pdf = cosHO * fabsf(cosHI) * common;
+
*eval = make_float3(out, out, out);
}
}
@@ -253,19 +646,33 @@ ccl_device int bsdf_microfacet_ggx_sample(const ShaderClosure *sc, float3 Ng, fl
return (m_refractive) ? LABEL_TRANSMIT|LABEL_GLOSSY : LABEL_REFLECT|LABEL_GLOSSY;
}
-/* BECKMANN */
+/* Beckmann microfacet with Smith shadow-masking from:
+ *
+ * Microfacet Models for Refraction through Rough Surfaces
+ * B. Walter, S. R. Marschner, H. Li, K. E. Torrance, EGSR 2007 */
ccl_device int bsdf_microfacet_beckmann_setup(ShaderClosure *sc)
{
- sc->data0 = clamp(sc->data0, 0.0f, 1.0f); /* m_ab */
+ sc->data0 = clamp(sc->data0, 0.0f, 1.0f); /* alpha_x */
+ sc->data1 = sc->data0; /* alpha_y */
sc->type = CLOSURE_BSDF_MICROFACET_BECKMANN_ID;
return SD_BSDF|SD_BSDF_HAS_EVAL|SD_BSDF_GLOSSY;
}
+ccl_device int bsdf_microfacet_beckmann_aniso_setup(ShaderClosure *sc)
+{
+ sc->data0 = clamp(sc->data0, 0.0f, 1.0f); /* alpha_x */
+ sc->data1 = clamp(sc->data1, 0.0f, 1.0f); /* alpha_y */
+
+ sc->type = CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID;
+ return SD_BSDF|SD_BSDF_HAS_EVAL|SD_BSDF_GLOSSY;
+}
+
ccl_device int bsdf_microfacet_beckmann_refraction_setup(ShaderClosure *sc)
{
- sc->data0 = clamp(sc->data0, 0.0f, 1.0f); /* m_ab */
+ sc->data0 = clamp(sc->data0, 0.0f, 1.0f); /* alpha_x */
+ sc->data1 = sc->data0; /* alpha_y */
sc->type = CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID;
return SD_BSDF|SD_BSDF_HAS_EVAL|SD_BSDF_GLOSSY;
@@ -273,155 +680,257 @@ ccl_device int bsdf_microfacet_beckmann_refraction_setup(ShaderClosure *sc)
ccl_device void bsdf_microfacet_beckmann_blur(ShaderClosure *sc, float roughness)
{
- sc->data0 = fmaxf(roughness, sc->data0); /* m_ab */
+ sc->data0 = fmaxf(roughness, sc->data0); /* alpha_x */
+ sc->data1 = fmaxf(roughness, sc->data1); /* alpha_y */
}
ccl_device float3 bsdf_microfacet_beckmann_eval_reflect(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf)
{
- float m_ab = max(sc->data0, 1e-4f);
+ float alpha_x = sc->data0;
+ float alpha_y = sc->data1;
int m_refractive = sc->type == CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID;
float3 N = sc->N;
- if(m_refractive || m_ab <= 1e-4f)
- return make_float3 (0, 0, 0);
+ if(m_refractive || fmaxf(alpha_x, alpha_y) <= 1e-4f)
+ return make_float3(0, 0, 0);
+
float cosNO = dot(N, I);
float cosNI = dot(N, omega_in);
+
if(cosNO > 0 && cosNI > 0) {
- // get half vector
- float3 Hr = normalize(omega_in + I);
- // eq. 20: (F*G*D)/(4*in*on)
- // eq. 25: first we calculate D(m) with m=Hr:
- float alpha2 = m_ab * m_ab;
- float cosThetaM = dot(N, Hr);
- float cosThetaM2 = cosThetaM * cosThetaM;
- float tanThetaM2 = (1 - cosThetaM2) / cosThetaM2;
- float cosThetaM4 = cosThetaM2 * cosThetaM2;
- float D = expf(-tanThetaM2 / alpha2) / (M_PI_F * alpha2 * cosThetaM4);
- // eq. 26, 27: now calculate G1(i,m) and G1(o,m)
- float ao = 1 / (m_ab * safe_sqrtf((1 - cosNO * cosNO) / (cosNO * cosNO)));
- float ai = 1 / (m_ab * safe_sqrtf((1 - cosNI * cosNI) / (cosNI * cosNI)));
- float G1o = ao < 1.6f ? (3.535f * ao + 2.181f * ao * ao) / (1 + 2.276f * ao + 2.577f * ao * ao) : 1.0f;
- float G1i = ai < 1.6f ? (3.535f * ai + 2.181f * ai * ai) / (1 + 2.276f * ai + 2.577f * ai * ai) : 1.0f;
- float G = G1o * G1i;
- float out = (G * D) * 0.25f / cosNO;
- // eq. 24
- float pm = D * cosThetaM;
- // convert into pdf of the sampled direction
- // eq. 38 - but see also:
- // eq. 17 in http://www.graphics.cornell.edu/~bjw/wardnotes.pdf
- *pdf = pm * 0.25f / dot(Hr, I);
- return make_float3 (out, out, out);
+ /* get half vector */
+ float3 m = normalize(omega_in + I);
+
+ float alpha2 = alpha_x * alpha_y;
+ float D, G1o, G1i;
+
+ if(alpha_x == alpha_y) {
+ /* isotropic
+ * eq. 20: (F*G*D)/(4*in*on)
+ * eq. 25: first we calculate D(m) */
+ float cosThetaM = dot(N, m);
+ float cosThetaM2 = cosThetaM * cosThetaM;
+ float tanThetaM2 = (1 - cosThetaM2) / cosThetaM2;
+ float cosThetaM4 = cosThetaM2 * cosThetaM2;
+ D = expf(-tanThetaM2 / alpha2) / (M_PI_F * alpha2 * cosThetaM4);
+
+ /* eq. 26, 27: now calculate G1(i,m) and G1(o,m) */
+ float ao = 1 / (alpha_x * safe_sqrtf((1 - cosNO * cosNO) / (cosNO * cosNO)));
+ float ai = 1 / (alpha_x * safe_sqrtf((1 - cosNI * cosNI) / (cosNI * cosNI)));
+ G1o = ao < 1.6f ? (3.535f * ao + 2.181f * ao * ao) / (1 + 2.276f * ao + 2.577f * ao * ao) : 1.0f;
+ G1i = ai < 1.6f ? (3.535f * ai + 2.181f * ai * ai) / (1 + 2.276f * ai + 2.577f * ai * ai) : 1.0f;
+ }
+ else {
+ /* anisotropic */
+ float3 X, Y, Z = N;
+ make_orthonormals_tangent(Z, sc->T, &X, &Y);
+
+ /* distribution */
+ float3 local_m = make_float3(dot(X, m), dot(Y, m), dot(Z, m));
+ float slope_x = -local_m.x/(local_m.z*alpha_x);
+ float slope_y = -local_m.y/(local_m.z*alpha_y);
+
+ float cosThetaM = local_m.z;
+ float cosThetaM2 = cosThetaM * cosThetaM;
+ float cosThetaM4 = cosThetaM2 * cosThetaM2;
+
+ D = expf(-slope_x*slope_x - slope_y*slope_y) / (M_PI_F * alpha2 * cosThetaM4);
+
+ /* G1(i,m) and G1(o,m) */
+ float tanThetaO2 = (1 - cosNO * cosNO) / (cosNO * cosNO);
+ float cosPhiO = dot(I, X);
+ float sinPhiO = dot(I, Y);
+
+ float alphaO2 = (cosPhiO*cosPhiO)*(alpha_x*alpha_x) + (sinPhiO*sinPhiO)*(alpha_y*alpha_y);
+ alphaO2 /= cosPhiO*cosPhiO + sinPhiO*sinPhiO;
+
+ float tanThetaI2 = (1 - cosNI * cosNI) / (cosNI * cosNI);
+ float cosPhiI = dot(omega_in, X);
+ float sinPhiI = dot(omega_in, Y);
+
+ float alphaI2 = (cosPhiI*cosPhiI)*(alpha_x*alpha_x) + (sinPhiI*sinPhiI)*(alpha_y*alpha_y);
+ alphaI2 /= cosPhiI*cosPhiI + sinPhiI*sinPhiI;
+
+ float ao = 1 / (safe_sqrtf(alphaO2 * tanThetaO2));
+ float ai = 1 / (safe_sqrtf(alphaI2 * tanThetaI2));
+ G1o = ao < 1.6f ? (3.535f * ao + 2.181f * ao * ao) / (1 + 2.276f * ao + 2.577f * ao * ao) : 1.0f;
+ G1i = ai < 1.6f ? (3.535f * ai + 2.181f * ai * ai) / (1 + 2.276f * ai + 2.577f * ai * ai) : 1.0f;
+ }
+
+ float G = G1o * G1i;
+
+ /* eq. 20 */
+ float common = D * 0.25f / cosNO;
+ float out = G * common;
+
+ /* eq. 2 in distribution of visible normals sampling
+ * pm = Dw = G1o * dot(m, I) * D / dot(N, I); */
+
+ /* eq. 38 - but see also:
+ * eq. 17 in http://www.graphics.cornell.edu/~bjw/wardnotes.pdf
+ * pdf = pm * 0.25 / dot(m, I); */
+ *pdf = G1o * common;
+
+ return make_float3(out, out, out);
}
- return make_float3 (0, 0, 0);
+
+ return make_float3(0, 0, 0);
}
ccl_device float3 bsdf_microfacet_beckmann_eval_transmit(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf)
{
- float m_ab = max(sc->data0, 1e-4f);
- float m_eta = sc->data1;
+ float alpha_x = sc->data0;
+ float alpha_y = sc->data1;
+ float m_eta = sc->data2;
int m_refractive = sc->type == CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID;
float3 N = sc->N;
- if(!m_refractive || m_ab <= 1e-4f)
- return make_float3 (0, 0, 0);
+ if(!m_refractive || fmaxf(alpha_x, alpha_y) <= 1e-4f)
+ return make_float3(0, 0, 0);
+
float cosNO = dot(N, I);
float cosNI = dot(N, omega_in);
+
if(cosNO <= 0 || cosNI >= 0)
- return make_float3 (0, 0, 0);
- // compute half-vector of the refraction (eq. 16)
+ return make_float3(0, 0, 0);
+
+ /* compute half-vector of the refraction (eq. 16) */
float3 ht = -(m_eta * omega_in + I);
float3 Ht = normalize(ht);
float cosHO = dot(Ht, I);
-
float cosHI = dot(Ht, omega_in);
- // eq. 33: first we calculate D(m) with m=Ht:
- float alpha2 = m_ab * m_ab;
+
+ /* these situations make the chi+ terms in eq. 25, 27 zero */
+ if(dot(Ht, N) <= 0.0f || cosHO * cosNO <= 0.0f || cosHI * cosNI <= 0.0f)
+ return make_float3(0.0f, 0.0f, 0.0f);
+
+ /* eq. 25: first we calculate D(m) with m=Ht: */
+ float alpha2 = alpha_x * alpha_y;
float cosThetaM = min(dot(N, Ht), 1.0f);
float cosThetaM2 = cosThetaM * cosThetaM;
float tanThetaM2 = (1 - cosThetaM2) / cosThetaM2;
float cosThetaM4 = cosThetaM2 * cosThetaM2;
float D = expf(-tanThetaM2 / alpha2) / (M_PI_F * alpha2 * cosThetaM4);
- // eq. 26, 27: now calculate G1(i,m) and G1(o,m)
- float ao = 1 / (m_ab * safe_sqrtf((1 - cosNO * cosNO) / (cosNO * cosNO)));
- float ai = 1 / (m_ab * safe_sqrtf((1 - cosNI * cosNI) / (cosNI * cosNI)));
+
+ /* eq. 26, 27: now calculate G1(i,m) and G1(o,m) */
+ float ao = 1 / (alpha_x * safe_sqrtf((1 - cosNO * cosNO) / (cosNO * cosNO)));
+ float ai = 1 / (alpha_x * safe_sqrtf((1 - cosNI * cosNI) / (cosNI * cosNI)));
float G1o = ao < 1.6f ? (3.535f * ao + 2.181f * ao * ao) / (1 + 2.276f * ao + 2.577f * ao * ao) : 1.0f;
float G1i = ai < 1.6f ? (3.535f * ai + 2.181f * ai * ai) / (1 + 2.276f * ai + 2.577f * ai * ai) : 1.0f;
float G = G1o * G1i;
- // probability
- float invHt2 = 1 / dot(ht, ht);
- *pdf = D * fabsf(cosThetaM) * (fabsf(cosHI) * (m_eta * m_eta)) * invHt2;
- float out = (fabsf(cosHI * cosHO) * (m_eta * m_eta) * (G * D) * invHt2) / cosNO;
- return make_float3 (out, out, out);
+
+ /* probability */
+ float Ht2 = dot(ht, ht);
+
+ /* eq. 2 in distribution of visible normals sampling
+ * pm = Dw = G1o * dot(m, I) * D / dot(N, I); */
+
+ /* out = fabsf(cosHI * cosHO) * (m_eta * m_eta) * G * D / (cosNO * Ht2)
+ * pdf = pm * (m_eta * m_eta) * fabsf(cosHI) / Ht2 */
+ float common = D * (m_eta * m_eta) / (cosNO * Ht2);
+ float out = G * fabsf(cosHI * cosHO) * common;
+ *pdf = G1o * cosHO * fabsf(cosHI) * common;
+
+ return make_float3(out, out, out);
}
-ccl_device int bsdf_microfacet_beckmann_sample(const ShaderClosure *sc, float3 Ng, float3 I, float3 dIdx, float3 dIdy, float randu, float randv, float3 *eval, float3 *omega_in, float3 *domega_in_dx, float3 *domega_in_dy, float *pdf)
+ccl_device int bsdf_microfacet_beckmann_sample(KernelGlobals *kg, const ShaderClosure *sc, float3 Ng, float3 I, float3 dIdx, float3 dIdy, float randu, float randv, float3 *eval, float3 *omega_in, float3 *domega_in_dx, float3 *domega_in_dy, float *pdf)
{
- float m_ab = sc->data0;
+ float alpha_x = sc->data0;
+ float alpha_y = sc->data1;
int m_refractive = sc->type == CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID;
float3 N = sc->N;
float cosNO = dot(N, I);
if(cosNO > 0) {
float3 X, Y, Z = N;
- make_orthonormals(Z, &X, &Y);
- // generate a random microfacet normal m
- // eq. 35,36:
- // we take advantage of cos(atan(x)) == 1/sqrt(1+x^2)
- //tttt and sin(atan(x)) == x/sqrt(1+x^2)
- float alpha2 = m_ab * m_ab;
- float tanThetaM, cosThetaM;
-
- if(alpha2 == 0.0f) {
- tanThetaM = 0.0f;
- cosThetaM = 1.0f;
- }
- else {
- tanThetaM = safe_sqrtf(-alpha2 * logf(1 - randu));
- cosThetaM = 1 / safe_sqrtf(1 + tanThetaM * tanThetaM);
- }
- float sinThetaM = cosThetaM * tanThetaM;
- float phiM = M_2PI_F * randv;
- float3 m = (cosf(phiM) * sinThetaM) * X +
- (sinf(phiM) * sinThetaM) * Y +
- ( cosThetaM) * Z;
+ if(alpha_x == alpha_y)
+ make_orthonormals(Z, &X, &Y);
+ else
+ make_orthonormals_tangent(Z, sc->T, &X, &Y);
+
+ /* importance sampling with distribution of visible normals. vectors are
+ * transformed to local space before and after */
+ float3 local_I = make_float3(dot(X, I), dot(Y, I), cosNO);
+ float3 local_m;
+ float G1o;
+ local_m = microfacet_sample_stretched(kg, local_I, alpha_x, alpha_y,
+ randu, randv, true, &G1o);
+
+ float3 m = X*local_m.x + Y*local_m.y + Z*local_m.z;
+ float cosThetaM = local_m.z;
+
+ /* reflection or refraction? */
if(!m_refractive) {
float cosMO = dot(m, I);
+
if(cosMO > 0) {
- // eq. 39 - compute actual reflected direction
+ /* eq. 39 - compute actual reflected direction */
*omega_in = 2 * cosMO * m - I;
+
if(dot(Ng, *omega_in) > 0) {
- if (m_ab <= 1e-4f) {
- // some high number for MIS
+ if(fmaxf(alpha_x, alpha_y) <= 1e-4f) {
+ /* some high number for MIS */
*pdf = 1e6f;
*eval = make_float3(1e6f, 1e6f, 1e6f);
}
else {
- // microfacet normal is visible to this ray
- // eq. 25
- float cosThetaM2 = cosThetaM * cosThetaM;
- float tanThetaM2 = tanThetaM * tanThetaM;
- float cosThetaM4 = cosThetaM2 * cosThetaM2;
- float D = expf(-tanThetaM2 / alpha2) / (M_PI_F * alpha2 * cosThetaM4);
- // eq. 24
- float pm = D * cosThetaM;
- // convert into pdf of the sampled direction
- // eq. 38 - but see also:
- // eq. 17 in http://www.graphics.cornell.edu/~bjw/wardnotes.pdf
- *pdf = pm * 0.25f / cosMO;
- // Eval BRDF*cosNI
- float cosNI = dot(N, *omega_in);
- // eq. 26, 27: now calculate G1(i,m) and G1(o,m)
- float ao = 1 / (m_ab * safe_sqrtf((1 - cosNO * cosNO) / (cosNO * cosNO)));
- float ai = 1 / (m_ab * safe_sqrtf((1 - cosNI * cosNI) / (cosNI * cosNI)));
- float G1o = ao < 1.6f ? (3.535f * ao + 2.181f * ao * ao) / (1 + 2.276f * ao + 2.577f * ao * ao) : 1.0f;
- float G1i = ai < 1.6f ? (3.535f * ai + 2.181f * ai * ai) / (1 + 2.276f * ai + 2.577f * ai * ai) : 1.0f;
+ /* microfacet normal is visible to this ray
+ * eq. 25 */
+ float alpha2 = alpha_x * alpha_y;
+ float D, G1i;
+
+ if(alpha_x == alpha_y) {
+ /* isotropic distribution */
+ float cosThetaM2 = cosThetaM * cosThetaM;
+ float cosThetaM4 = cosThetaM2 * cosThetaM2;
+ float tanThetaM2 = 1/(cosThetaM2) - 1;
+ D = expf(-tanThetaM2 / alpha2) / (M_PI_F * alpha2 * cosThetaM4);
+
+ /* eval BRDF*cosNI */
+ float cosNI = dot(N, *omega_in);
+
+ /* eq. 26, 27: now calculate G1(i,m) */
+ float ai = 1 / (alpha_x * safe_sqrtf((1 - cosNI * cosNI) / (cosNI * cosNI)));
+ G1i = ai < 1.6f ? (3.535f * ai + 2.181f * ai * ai) / (1 + 2.276f * ai + 2.577f * ai * ai) : 1.0f;
+ }
+ else {
+ /* anisotropic distribution */
+ float3 local_m = make_float3(dot(X, m), dot(Y, m), dot(Z, m));
+ float slope_x = -local_m.x/(local_m.z*alpha_x);
+ float slope_y = -local_m.y/(local_m.z*alpha_y);
+
+ float cosThetaM = local_m.z;
+ float cosThetaM2 = cosThetaM * cosThetaM;
+ float cosThetaM4 = cosThetaM2 * cosThetaM2;
+
+ D = expf(-slope_x*slope_x - slope_y*slope_y) / (M_PI_F * alpha2 * cosThetaM4);
+
+ /* G1(i,m) */
+ float cosNI = dot(N, *omega_in);
+ float tanThetaI2 = (1 - cosNI * cosNI) / (cosNI * cosNI);
+ float cosPhiI = dot(*omega_in, X);
+ float sinPhiI = dot(*omega_in, Y);
+
+ float alphaI2 = (cosPhiI*cosPhiI)*(alpha_x*alpha_x) + (sinPhiI*sinPhiI)*(alpha_y*alpha_y);
+ alphaI2 /= cosPhiI*cosPhiI + sinPhiI*sinPhiI;
+
+ float ai = 1 / (safe_sqrtf(alphaI2 * tanThetaI2));
+ G1i = ai < 1.6f ? (3.535f * ai + 2.181f * ai * ai) / (1 + 2.276f * ai + 2.577f * ai * ai) : 1.0f;
+ }
+
float G = G1o * G1i;
- // eq. 20: (F*G*D)/(4*in*on)
- float out = (G * D) * 0.25f / cosNO;
+
+ /* see eval function for derivation */
+ float common = D * 0.25f / cosNO;
+ float out = G * common;
+ *pdf = G1o * common;
+
*eval = make_float3(out, out, out);
}
+
#ifdef __RAY_DIFFERENTIALS__
*domega_in_dx = (2 * dot(m, dIdx)) * m - dIdx;
*domega_in_dy = (2 * dot(m, dIdy)) * m - dIdy;
@@ -430,14 +939,15 @@ ccl_device int bsdf_microfacet_beckmann_sample(const ShaderClosure *sc, float3 N
}
}
else {
- // CAUTION: the i and o variables are inverted relative to the paper
- // eq. 39 - compute actual refractive direction
+ /* CAUTION: the i and o variables are inverted relative to the paper
+ * eq. 39 - compute actual refractive direction */
float3 R, T;
#ifdef __RAY_DIFFERENTIALS__
float3 dRdx, dRdy, dTdx, dTdy;
#endif
- float m_eta = sc->data1;
+ float m_eta = sc->data2;
bool inside;
+
fresnel_dielectric(m_eta, m, I, &R, &T,
#ifdef __RAY_DIFFERENTIALS__
dIdx, dIdy, &dRdx, &dRdy, &dTdx, &dTdy,
@@ -446,39 +956,44 @@ ccl_device int bsdf_microfacet_beckmann_sample(const ShaderClosure *sc, float3 N
if(!inside) {
*omega_in = T;
+
#ifdef __RAY_DIFFERENTIALS__
*domega_in_dx = dTdx;
*domega_in_dy = dTdy;
#endif
- if (m_ab <= 1e-4f || fabsf(m_eta - 1.0f) < 1e-4f) {
- // some high number for MIS
+
+ if(fmaxf(alpha_x, alpha_y) <= 1e-4f || fabsf(m_eta - 1.0f) < 1e-4f) {
+ /* some high number for MIS */
*pdf = 1e6f;
*eval = make_float3(1e6f, 1e6f, 1e6f);
}
else {
- // eq. 33
+ /* eq. 33 */
+ float alpha2 = alpha_x * alpha_y;
float cosThetaM2 = cosThetaM * cosThetaM;
- float tanThetaM2 = tanThetaM * tanThetaM;
float cosThetaM4 = cosThetaM2 * cosThetaM2;
+ float tanThetaM2 = 1/(cosThetaM2) - 1;
float D = expf(-tanThetaM2 / alpha2) / (M_PI_F * alpha2 * cosThetaM4);
- // eq. 24
- float pm = D * cosThetaM;
- // eval BRDF*cosNI
+
+ /* eval BRDF*cosNI */
float cosNI = dot(N, *omega_in);
- // eq. 26, 27: now calculate G1(i,m) and G1(o,m)
- float ao = 1 / (m_ab * safe_sqrtf((1 - cosNO * cosNO) / (cosNO * cosNO)));
- float ai = 1 / (m_ab * safe_sqrtf((1 - cosNI * cosNI) / (cosNI * cosNI)));
- float G1o = ao < 1.6f ? (3.535f * ao + 2.181f * ao * ao) / (1 + 2.276f * ao + 2.577f * ao * ao) : 1.0f;
+
+ /* eq. 26, 27: now calculate G1(i,m) */
+ float ai = 1 / (alpha_x * safe_sqrtf((1 - cosNI * cosNI) / (cosNI * cosNI)));
float G1i = ai < 1.6f ? (3.535f * ai + 2.181f * ai * ai) / (1 + 2.276f * ai + 2.577f * ai * ai) : 1.0f;
float G = G1o * G1i;
- // eq. 21
+
+ /* eq. 21 */
float cosHI = dot(m, *omega_in);
float cosHO = dot(m, I);
float Ht2 = m_eta * cosHI + cosHO;
Ht2 *= Ht2;
- float out = (fabsf(cosHI * cosHO) * (m_eta * m_eta) * (G * D)) / (cosNO * Ht2);
- // eq. 38 and eq. 17
- *pdf = pm * (m_eta * m_eta) * fabsf(cosHI) / Ht2;
+
+ /* see eval function for derivation */
+ float common = D * (m_eta * m_eta) / (cosNO * Ht2);
+ float out = G * fabsf(cosHI * cosHO) * common;
+ *pdf = G1o * cosHO * fabsf(cosHI) * common;
+
*eval = make_float3(out, out, out);
}
}
diff --git a/intern/cycles/kernel/closure/bsdf_util.h b/intern/cycles/kernel/closure/bsdf_util.h
index b3dcb9dcc38..05816bac2c1 100644
--- a/intern/cycles/kernel/closure/bsdf_util.h
+++ b/intern/cycles/kernel/closure/bsdf_util.h
@@ -111,16 +111,20 @@ ccl_device float fresnel_dielectric_cos(float cosi, float eta)
return 1.0f; // TIR(no refracted component)
}
-ccl_device float fresnel_conductor(float cosi, float eta, float k)
+#if 0
+ccl_device float3 fresnel_conductor(float cosi, const float3 eta, const float3 k)
{
- float tmp_f = eta * eta + k * k;
- float tmp = tmp_f * cosi * cosi;
- float Rparl2 = (tmp - (2.0f * eta * cosi) + 1)/
- (tmp + (2.0f * eta * cosi) + 1);
- float Rperp2 = (tmp_f - (2.0f * eta * cosi) + cosi * cosi)/
- (tmp_f + (2.0f * eta * cosi) + cosi * cosi);
+ float3 cosi2 = make_float3(cosi*cosi);
+ float3 one = make_float3(1.0f, 1.0f, 1.0f);
+ float3 tmp_f = eta * eta + k * k;
+ float3 tmp = tmp_f * cosi2;
+ float3 Rparl2 = (tmp - (2.0f * eta * cosi) + one) /
+ (tmp + (2.0f * eta * cosi) + one);
+ float3 Rperp2 = (tmp_f - (2.0f * eta * cosi) + cosi2) /
+ (tmp_f + (2.0f * eta * cosi) + cosi2);
return(Rparl2 + Rperp2) * 0.5f;
}
+#endif
ccl_device float smooth_step(float edge0, float edge1, float x)
{
diff --git a/intern/cycles/kernel/closure/bsdf_ward.h b/intern/cycles/kernel/closure/bsdf_ward.h
deleted file mode 100644
index c9de615a011..00000000000
--- a/intern/cycles/kernel/closure/bsdf_ward.h
+++ /dev/null
@@ -1,189 +0,0 @@
-/*
- * Adapted from Open Shading Language with this license:
- *
- * Copyright (c) 2009-2010 Sony Pictures Imageworks Inc., et al.
- * All Rights Reserved.
- *
- * Modifications Copyright 2011, Blender Foundation.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are
- * met:
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- * * Neither the name of Sony Pictures Imageworks nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#ifndef __BSDF_WARD_H__
-#define __BSDF_WARD_H__
-
-CCL_NAMESPACE_BEGIN
-
-/* WARD */
-
-ccl_device int bsdf_ward_setup(ShaderClosure *sc)
-{
- sc->data0 = clamp(sc->data0, 1e-4f, 1.0f); /* m_ax */
- sc->data1 = clamp(sc->data1, 1e-4f, 1.0f); /* m_ay */
-
- sc->type = CLOSURE_BSDF_WARD_ID;
- return SD_BSDF|SD_BSDF_HAS_EVAL|SD_BSDF_GLOSSY;
-}
-
-ccl_device void bsdf_ward_blur(ShaderClosure *sc, float roughness)
-{
- sc->data0 = fmaxf(roughness, sc->data0); /* m_ax */
- sc->data1 = fmaxf(roughness, sc->data1); /* m_ay */
-}
-
-ccl_device float3 bsdf_ward_eval_reflect(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf)
-{
- float m_ax = sc->data0;
- float m_ay = sc->data1;
- float3 N = sc->N;
- float3 T = sc->T;
-
- float cosNO = dot(N, I);
- float cosNI = dot(N, omega_in);
-
- if(cosNI > 0.0f && cosNO > 0.0f) {
- cosNO = max(cosNO, 1e-4f);
- cosNI = max(cosNI, 1e-4f);
-
- // get half vector and get x,y basis on the surface for anisotropy
- float3 H = normalize(omega_in + I); // normalize needed for pdf
- float3 X, Y;
- make_orthonormals_tangent(N, T, &X, &Y);
- // eq. 4
- float dotx = dot(H, X) / m_ax;
- float doty = dot(H, Y) / m_ay;
- float dotn = dot(H, N);
- float exp_arg = (dotx * dotx + doty * doty) / (dotn * dotn);
- float denom = (M_4PI_F * m_ax * m_ay * sqrtf(cosNO * cosNI));
- float exp_val = expf(-exp_arg);
- float out = cosNI * exp_val / denom;
- float oh = dot(H, I);
- denom = M_4PI_F * m_ax * m_ay * oh * dotn * dotn * dotn;
- *pdf = exp_val / denom;
- return make_float3 (out, out, out);
- }
-
- return make_float3 (0, 0, 0);
-}
-
-ccl_device float3 bsdf_ward_eval_transmit(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf)
-{
- return make_float3(0.0f, 0.0f, 0.0f);
-}
-
-ccl_device int bsdf_ward_sample(const ShaderClosure *sc, float3 Ng, float3 I, float3 dIdx, float3 dIdy, float randu, float randv, float3 *eval, float3 *omega_in, float3 *domega_in_dx, float3 *domega_in_dy, float *pdf)
-{
- float m_ax = sc->data0;
- float m_ay = sc->data1;
- float3 N = sc->N;
- float3 T = sc->T;
-
- float cosNO = dot(N, I);
- if(cosNO > 0.0f) {
- // get x,y basis on the surface for anisotropy
- float3 X, Y;
- make_orthonormals_tangent(N, T, &X, &Y);
- // generate random angles for the half vector
- // eq. 7 (taking care around discontinuities to keep
- //ttoutput angle in the right quadrant)
- // we take advantage of cos(atan(x)) == 1/sqrt(1+x^2)
- //tttt and sin(atan(x)) == x/sqrt(1+x^2)
- float alphaRatio = m_ay / m_ax;
- float cosPhi, sinPhi;
- if(randu < 0.25f) {
- float val = 4 * randu;
- float tanPhi = alphaRatio * tanf(M_PI_2_F * val);
- cosPhi = 1 / sqrtf(1 + tanPhi * tanPhi);
- sinPhi = tanPhi * cosPhi;
- }
- else if(randu < 0.5f) {
- float val = 1 - 4 * (0.5f - randu);
- float tanPhi = alphaRatio * tanf(M_PI_2_F * val);
- // phi = M_PI_F - phi;
- cosPhi = -1 / sqrtf(1 + tanPhi * tanPhi);
- sinPhi = -tanPhi * cosPhi;
- }
- else if(randu < 0.75f) {
- float val = 4 * (randu - 0.5f);
- float tanPhi = alphaRatio * tanf(M_PI_2_F * val);
- //phi = M_PI_F + phi;
- cosPhi = -1 / sqrtf(1 + tanPhi * tanPhi);
- sinPhi = tanPhi * cosPhi;
- }
- else {
- float val = 1 - 4 * (1 - randu);
- float tanPhi = alphaRatio * tanf(M_PI_2_F * val);
- // phi = M_2PI_F - phi;
- cosPhi = 1 / sqrtf(1 + tanPhi * tanPhi);
- sinPhi = -tanPhi * cosPhi;
- }
- // eq. 6
- // we take advantage of cos(atan(x)) == 1/sqrt(1+x^2)
- //tttt and sin(atan(x)) == x/sqrt(1+x^2)
- float thetaDenom = (cosPhi * cosPhi) / (m_ax * m_ax) + (sinPhi * sinPhi) / (m_ay * m_ay);
- float tanTheta2 = -logf(1 - randv) / thetaDenom;
- float cosTheta = 1 / sqrtf(1 + tanTheta2);
- float sinTheta = cosTheta * sqrtf(tanTheta2);
-
- float3 h; // already normalized becaused expressed from spherical coordinates
- h.x = sinTheta * cosPhi;
- h.y = sinTheta * sinPhi;
- h.z = cosTheta;
- // compute terms that are easier in local space
- float dotx = h.x / m_ax;
- float doty = h.y / m_ay;
- float dotn = h.z;
- // transform to world space
- h = h.x * X + h.y * Y + h.z * N;
- // generate the final sample
- float oh = dot(h, I);
- *omega_in = 2.0f * oh * h - I;
- if(dot(Ng, *omega_in) > 0) {
- float cosNI = dot(N, *omega_in);
- if(cosNI > 0) {
- cosNO = max(cosNO, 1e-4f);
- cosNI = max(cosNI, 1e-4f);
-
- // eq. 9
- float exp_arg = (dotx * dotx + doty * doty) / (dotn * dotn);
- float denom = M_4PI_F * m_ax * m_ay * oh * dotn * dotn * dotn;
- *pdf = expf(-exp_arg) / denom;
- // compiler will reuse expressions already computed
- denom = (M_4PI_F * m_ax * m_ay * sqrtf(cosNO * cosNI));
- float power = cosNI * expf(-exp_arg) / denom;
- *eval = make_float3(power, power, power);
-#ifdef __RAY_DIFFERENTIALS__
- *domega_in_dx = (2 * dot(N, dIdx)) * N - dIdx;
- *domega_in_dy = (2 * dot(N, dIdy)) * N - dIdy;
-#endif
- }
- }
- }
- return LABEL_REFLECT|LABEL_GLOSSY;
-}
-
-CCL_NAMESPACE_END
-
-#endif /* __BSDF_WARD_H__ */
-
diff --git a/intern/cycles/kernel/closure/bsdf_westin.h b/intern/cycles/kernel/closure/bsdf_westin.h
deleted file mode 100644
index 9dc1c00bb3d..00000000000
--- a/intern/cycles/kernel/closure/bsdf_westin.h
+++ /dev/null
@@ -1,180 +0,0 @@
-/*
- * Adapted from Open Shading Language with this license:
- *
- * Copyright (c) 2009-2010 Sony Pictures Imageworks Inc., et al.
- * All Rights Reserved.
- *
- * Modifications Copyright 2011, Blender Foundation.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are
- * met:
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- * * Neither the name of Sony Pictures Imageworks nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#ifndef __BSDF_WESTIN_H__
-#define __BSDF_WESTIN_H__
-
-CCL_NAMESPACE_BEGIN
-
-/* WESTIN BACKSCATTER */
-
-ccl_device int bsdf_westin_backscatter_setup(ShaderClosure *sc)
-{
- float roughness = sc->data0;
- roughness = clamp(roughness, 1e-5f, 1.0f);
- float m_invroughness = 1.0f/roughness;
-
- sc->type = CLOSURE_BSDF_WESTIN_BACKSCATTER_ID;
- sc->data0 = m_invroughness;
-
- return SD_BSDF|SD_BSDF_HAS_EVAL|SD_BSDF_GLOSSY;
-}
-
-ccl_device void bsdf_westin_backscatter_blur(ShaderClosure *sc, float roughness)
-{
- float m_invroughness = sc->data0;
- m_invroughness = min(1.0f/roughness, m_invroughness);
- sc->data0 = m_invroughness;
-}
-
-ccl_device float3 bsdf_westin_backscatter_eval_reflect(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf)
-{
- float m_invroughness = sc->data0;
- float3 N = sc->N;
-
- // pdf is implicitly 0 (no indirect sampling)
- float cosNO = dot(N, I);
- float cosNI = dot(N, omega_in);
- if(cosNO > 0 && cosNI > 0) {
- float cosine = dot(I, omega_in);
- *pdf = cosine > 0 ? (m_invroughness + 1) * powf(cosine, m_invroughness) : 0;
- *pdf *= 0.5f * M_1_PI_F;
- return make_float3 (*pdf, *pdf, *pdf);
- }
- return make_float3 (0, 0, 0);
-}
-
-ccl_device float3 bsdf_westin_backscatter_eval_transmit(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf)
-{
- return make_float3(0.0f, 0.0f, 0.0f);
-}
-
-ccl_device int bsdf_westin_backscatter_sample(const ShaderClosure *sc, float3 Ng, float3 I, float3 dIdx, float3 dIdy, float randu, float randv, float3 *eval, float3 *omega_in, float3 *domega_in_dx, float3 *domega_in_dy, float *pdf)
-{
- float m_invroughness = sc->data0;
- float3 N = sc->N;
-
- float cosNO = dot(N, I);
- if(cosNO > 0) {
-#ifdef __RAY_DIFFERENTIALS__
- *domega_in_dx = dIdx;
- *domega_in_dy = dIdy;
-#endif
- float3 T, B;
- make_orthonormals (I, &T, &B);
- float phi = M_2PI_F * randu;
- float cosTheta = powf(randv, 1 / (m_invroughness + 1));
- float sinTheta2 = 1 - cosTheta * cosTheta;
- float sinTheta = sinTheta2 > 0 ? sqrtf(sinTheta2) : 0;
- *omega_in = (cosf(phi) * sinTheta) * T +
- (sinf(phi) * sinTheta) * B +
- (cosTheta) * I;
- if(dot(Ng, *omega_in) > 0) {
- // common terms for pdf and eval
- float cosNI = dot(N, *omega_in);
- // make sure the direction we chose is still in the right hemisphere
- if(cosNI > 0)
- {
- *pdf = 0.5f * M_1_PI_F * powf(cosTheta, m_invroughness);
- *pdf = (m_invroughness + 1) * (*pdf);
- *eval = make_float3(*pdf, *pdf, *pdf);
- }
- }
- }
- return LABEL_REFLECT|LABEL_GLOSSY;
-}
-
-/* WESTIN SHEEN */
-
-ccl_device int bsdf_westin_sheen_setup(ShaderClosure *sc)
-{
- /* float edginess = sc->data0; */
- sc->type = CLOSURE_BSDF_WESTIN_SHEEN_ID;
- return SD_BSDF|SD_BSDF_HAS_EVAL|SD_BSDF_GLOSSY;
-}
-
-ccl_device void bsdf_westin_sheen_blur(ShaderClosure *sc, float roughness)
-{
-}
-
-ccl_device float3 bsdf_westin_sheen_eval_reflect(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf)
-{
- float m_edginess = sc->data0;
- float3 N = sc->N;
-
- // pdf is implicitly 0 (no indirect sampling)
- float cosNO = dot(N, I);
- float cosNI = dot(N, omega_in);
- if(cosNO > 0 && cosNI > 0) {
- float sinNO2 = 1 - cosNO * cosNO;
- *pdf = cosNI * M_1_PI_F;
- float westin = sinNO2 > 0 ? powf(sinNO2, 0.5f * m_edginess) * (*pdf) : 0;
- return make_float3 (westin, westin, westin);
- }
- return make_float3 (0, 0, 0);
-}
-
-ccl_device float3 bsdf_westin_sheen_eval_transmit(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf)
-{
- return make_float3(0.0f, 0.0f, 0.0f);
-}
-
-ccl_device int bsdf_westin_sheen_sample(const ShaderClosure *sc, float3 Ng, float3 I, float3 dIdx, float3 dIdy, float randu, float randv, float3 *eval, float3 *omega_in, float3 *domega_in_dx, float3 *domega_in_dy, float *pdf)
-{
- float m_edginess = sc->data0;
- float3 N = sc->N;
-
- // we are viewing the surface from the right side - send a ray out with cosine
- // distribution over the hemisphere
- sample_cos_hemisphere(N, randu, randv, omega_in, pdf);
- if(dot(Ng, *omega_in) > 0) {
- // TODO: account for sheen when sampling
- float cosNO = dot(N, I);
- float sinNO2 = 1 - cosNO * cosNO;
- float westin = sinNO2 > 0 ? powf(sinNO2, 0.5f * m_edginess) * (*pdf) : 0;
- *eval = make_float3(westin, westin, westin);
-#ifdef __RAY_DIFFERENTIALS__
- // TODO: find a better approximation for the diffuse bounce
- *domega_in_dx = (2 * dot(N, dIdx)) * N - dIdx;
- *domega_in_dy = (2 * dot(N, dIdy)) * N - dIdy;
-#endif
- }
- else {
- pdf = 0;
- }
- return LABEL_REFLECT|LABEL_DIFFUSE;
-}
-
-CCL_NAMESPACE_END
-
-#endif /* __BSDF_WESTIN_H__ */
-
diff --git a/intern/cycles/kernel/geom/geom_bvh.h b/intern/cycles/kernel/geom/geom_bvh.h
index dd7c25d581d..c5336e086b7 100644
--- a/intern/cycles/kernel/geom/geom_bvh.h
+++ b/intern/cycles/kernel/geom/geom_bvh.h
@@ -28,6 +28,13 @@
CCL_NAMESPACE_BEGIN
+/* Don't inline intersect functions on GPU, this is faster */
+#ifdef __KERNEL_GPU__
+#define ccl_device_intersect ccl_device_noinline
+#else
+#define ccl_device_intersect ccl_device_inline
+#endif
+
/* BVH intersection function variations */
#define BVH_INSTANCING 1
@@ -35,6 +42,8 @@ CCL_NAMESPACE_BEGIN
#define BVH_HAIR 4
#define BVH_HAIR_MINIMUM_WIDTH 8
+/* Regular BVH traversal */
+
#define BVH_FUNCTION_NAME bvh_intersect
#define BVH_FUNCTION_FEATURES 0
#include "geom_bvh_traversal.h"
@@ -63,6 +72,8 @@ CCL_NAMESPACE_BEGIN
#include "geom_bvh_traversal.h"
#endif
+/* Subsurface scattering BVH traversal */
+
#if defined(__SUBSURFACE__)
#define BVH_FUNCTION_NAME bvh_intersect_subsurface
#define BVH_FUNCTION_FEATURES 0
@@ -93,47 +104,72 @@ CCL_NAMESPACE_BEGIN
#include "geom_bvh_subsurface.h"
#endif
+/* Record all BVH intersection for shadows */
+
#if defined(__SHADOW_RECORD_ALL__)
#define BVH_FUNCTION_NAME bvh_intersect_shadow_all
#define BVH_FUNCTION_FEATURES 0
#include "geom_bvh_shadow.h"
#endif
-#if defined(__SUBSURFACE__) && defined(__INSTANCING__)
+#if defined(__SHADOW_RECORD_ALL__) && defined(__INSTANCING__)
#define BVH_FUNCTION_NAME bvh_intersect_shadow_all_instancing
#define BVH_FUNCTION_FEATURES BVH_INSTANCING
#include "geom_bvh_shadow.h"
#endif
-#if defined(__SUBSURFACE__) && defined(__HAIR__)
+#if defined(__SHADOW_RECORD_ALL__) && defined(__HAIR__)
#define BVH_FUNCTION_NAME bvh_intersect_shadow_all_hair
#define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR
#include "geom_bvh_shadow.h"
#endif
-#if defined(__SUBSURFACE__) && defined(__OBJECT_MOTION__)
+#if defined(__SHADOW_RECORD_ALL__) && defined(__OBJECT_MOTION__)
#define BVH_FUNCTION_NAME bvh_intersect_shadow_all_motion
#define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_MOTION
#include "geom_bvh_shadow.h"
#endif
-#if defined(__SUBSURFACE__) && defined(__HAIR__) && defined(__OBJECT_MOTION__)
+#if defined(__SHADOW_RECORD_ALL__) && defined(__HAIR__) && defined(__OBJECT_MOTION__)
#define BVH_FUNCTION_NAME bvh_intersect_shadow_all_hair_motion
#define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR|BVH_MOTION
#include "geom_bvh_shadow.h"
#endif
-/* to work around titan bug when using arrays instead of textures */
-#if !defined(__KERNEL_CUDA__) || defined(__KERNEL_CUDA_TEX_STORAGE__)
-ccl_device_inline
-#else
-ccl_device_noinline
+/* Camera inside Volume BVH intersection */
+
+#if defined(__VOLUME__)
+#define BVH_FUNCTION_NAME bvh_intersect_volume
+#define BVH_FUNCTION_FEATURES 0
+#include "geom_bvh_volume.h"
#endif
-#ifdef __HAIR__
-bool scene_intersect(KernelGlobals *kg, const Ray *ray, const uint visibility, Intersection *isect, uint *lcg_state, float difl, float extmax)
-#else
-bool scene_intersect(KernelGlobals *kg, const Ray *ray, const uint visibility, Intersection *isect)
+
+#if defined(__VOLUME__) && defined(__INSTANCING__)
+#define BVH_FUNCTION_NAME bvh_intersect_volume_instancing
+#define BVH_FUNCTION_FEATURES BVH_INSTANCING
+#include "geom_bvh_volume.h"
+#endif
+
+#if defined(__VOLUME__) && defined(__HAIR__)
+#define BVH_FUNCTION_NAME bvh_intersect_volume_hair
+#define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR|BVH_HAIR_MINIMUM_WIDTH
+#include "geom_bvh_volume.h"
+#endif
+
+#if defined(__VOLUME__) && defined(__OBJECT_MOTION__)
+#define BVH_FUNCTION_NAME bvh_intersect_volume_motion
+#define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_MOTION
+#include "geom_bvh_volume.h"
+#endif
+
+#if defined(__VOLUME__) && defined(__HAIR__) && defined(__OBJECT_MOTION__)
+#define BVH_FUNCTION_NAME bvh_intersect_volume_hair_motion
+#define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR|BVH_HAIR_MINIMUM_WIDTH|BVH_MOTION
+#include "geom_bvh_volume.h"
#endif
+
+ccl_device_intersect bool scene_intersect(KernelGlobals *kg, const Ray *ray, const uint visibility, Intersection *isect,
+ uint *lcg_state, float difl, float extmax)
{
#ifdef __OBJECT_MOTION__
if(kernel_data.bvh.have_motion) {
@@ -170,14 +206,8 @@ bool scene_intersect(KernelGlobals *kg, const Ray *ray, const uint visibility, I
#endif /* __KERNEL_CPU__ */
}
-/* to work around titan bug when using arrays instead of textures */
#ifdef __SUBSURFACE__
-#if !defined(__KERNEL_CUDA__) || defined(__KERNEL_CUDA_TEX_STORAGE__)
-ccl_device_inline
-#else
-ccl_device_noinline
-#endif
-uint scene_intersect_subsurface(KernelGlobals *kg, const Ray *ray, Intersection *isect, int subsurface_object, uint *lcg_state, int max_hits)
+ccl_device_intersect uint scene_intersect_subsurface(KernelGlobals *kg, const Ray *ray, Intersection *isect, int subsurface_object, uint *lcg_state, int max_hits)
{
#ifdef __OBJECT_MOTION__
if(kernel_data.bvh.have_motion) {
@@ -215,14 +245,8 @@ uint scene_intersect_subsurface(KernelGlobals *kg, const Ray *ray, Intersection
}
#endif
-/* to work around titan bug when using arrays instead of textures */
#ifdef __SHADOW_RECORD_ALL__
-#if !defined(__KERNEL_CUDA__) || defined(__KERNEL_CUDA_TEX_STORAGE__)
-ccl_device_inline
-#else
-ccl_device_noinline
-#endif
-uint scene_intersect_shadow_all(KernelGlobals *kg, const Ray *ray, Intersection *isect, uint max_hits, uint *num_hits)
+ccl_device_intersect bool scene_intersect_shadow_all(KernelGlobals *kg, const Ray *ray, Intersection *isect, uint max_hits, uint *num_hits)
{
#ifdef __OBJECT_MOTION__
if(kernel_data.bvh.have_motion) {
@@ -240,20 +264,50 @@ uint scene_intersect_shadow_all(KernelGlobals *kg, const Ray *ray, Intersection
return bvh_intersect_shadow_all_hair(kg, ray, isect, max_hits, num_hits);
#endif /* __HAIR__ */
-#ifdef __KERNEL_CPU__
-
#ifdef __INSTANCING__
if(kernel_data.bvh.have_instancing)
return bvh_intersect_shadow_all_instancing(kg, ray, isect, max_hits, num_hits);
#endif /* __INSTANCING__ */
return bvh_intersect_shadow_all(kg, ray, isect, max_hits, num_hits);
+}
+#endif
+
+#ifdef __VOLUME__
+ccl_device_intersect bool scene_intersect_volume(KernelGlobals *kg,
+ const Ray *ray,
+ Intersection *isect)
+{
+#ifdef __OBJECT_MOTION__
+ if(kernel_data.bvh.have_motion) {
+#ifdef __HAIR__
+ if(kernel_data.bvh.have_curves)
+ return bvh_intersect_volume_hair_motion(kg, ray, isect);
+#endif /* __HAIR__ */
+
+ return bvh_intersect_volume_motion(kg, ray, isect);
+ }
+#endif /* __OBJECT_MOTION__ */
+
+#ifdef __HAIR__
+ if(kernel_data.bvh.have_curves)
+ return bvh_intersect_volume_hair(kg, ray, isect);
+#endif /* __HAIR__ */
+
+#ifdef __KERNEL_CPU__
+
+#ifdef __INSTANCING__
+ if(kernel_data.bvh.have_instancing)
+ return bvh_intersect_volume_instancing(kg, ray, isect);
+#endif /* __INSTANCING__ */
+
+ return bvh_intersect_volume(kg, ray, isect);
#else /* __KERNEL_CPU__ */
#ifdef __INSTANCING__
- return bvh_intersect_shadow_all_instancing(kg, ray, isect, max_hits, num_hits);
+ return bvh_intersect_volume_instancing(kg, ray, isect);
#else
- return bvh_intersect_shadow_all(kg, ray, isect, max_hits, num_hits);
+ return bvh_intersect_volume(kg, ray, isect);
#endif /* __INSTANCING__ */
#endif /* __KERNEL_CPU__ */
diff --git a/intern/cycles/kernel/geom/geom_bvh_shadow.h b/intern/cycles/kernel/geom/geom_bvh_shadow.h
index 98bf82b3b2d..aee4097d77e 100644
--- a/intern/cycles/kernel/geom/geom_bvh_shadow.h
+++ b/intern/cycles/kernel/geom/geom_bvh_shadow.h
@@ -68,15 +68,15 @@ ccl_device bool BVH_FUNCTION_NAME
const shuffle_swap_t shuf_identity = shuffle_swap_identity();
const shuffle_swap_t shuf_swap = shuffle_swap_swap();
- const __m128 pn = _mm_castsi128_ps(_mm_set_epi32(0x80000000, 0x80000000, 0, 0));
- __m128 Psplat[3], idirsplat[3];
+ const ssef pn = cast(ssei(0, 0, 0x80000000, 0x80000000));
+ ssef Psplat[3], idirsplat[3];
shuffle_swap_t shufflexyz[3];
- Psplat[0] = _mm_set_ps1(P.x);
- Psplat[1] = _mm_set_ps1(P.y);
- Psplat[2] = _mm_set_ps1(P.z);
+ Psplat[0] = ssef(P.x);
+ Psplat[1] = ssef(P.y);
+ Psplat[2] = ssef(P.z);
- __m128 tsplat = _mm_set_ps(-isect_t, -isect_t, 0.0f, 0.0f);
+ ssef tsplat(0.0f, 0.0f, -isect_t, -isect_t);
gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
#endif
@@ -132,27 +132,27 @@ ccl_device bool BVH_FUNCTION_NAME
/* Intersect two child bounding boxes, SSE3 version adapted from Embree */
/* fetch node data */
- const __m128 *bvh_nodes = (__m128*)kg->__bvh_nodes.data + nodeAddr*BVH_NODE_SIZE;
+ const ssef *bvh_nodes = (ssef*)kg->__bvh_nodes.data + nodeAddr*BVH_NODE_SIZE;
const float4 cnodes = ((float4*)bvh_nodes)[3];
/* intersect ray against child nodes */
- const __m128 tminmaxx = _mm_mul_ps(_mm_sub_ps(shuffle_swap(bvh_nodes[0], shufflexyz[0]), Psplat[0]), idirsplat[0]);
- const __m128 tminmaxy = _mm_mul_ps(_mm_sub_ps(shuffle_swap(bvh_nodes[1], shufflexyz[1]), Psplat[1]), idirsplat[1]);
- const __m128 tminmaxz = _mm_mul_ps(_mm_sub_ps(shuffle_swap(bvh_nodes[2], shufflexyz[2]), Psplat[2]), idirsplat[2]);
+ const ssef tminmaxx = (shuffle_swap(bvh_nodes[0], shufflexyz[0]) - Psplat[0]) * idirsplat[0];
+ const ssef tminmaxy = (shuffle_swap(bvh_nodes[1], shufflexyz[1]) - Psplat[1]) * idirsplat[1];
+ const ssef tminmaxz = (shuffle_swap(bvh_nodes[2], shufflexyz[2]) - Psplat[2]) * idirsplat[2];
/* calculate { c0min, c1min, -c0max, -c1max} */
- __m128 minmax = _mm_max_ps(_mm_max_ps(tminmaxx, tminmaxy), _mm_max_ps(tminmaxz, tsplat));
- const __m128 tminmax = _mm_xor_ps(minmax, pn);
- const __m128 lrhit = _mm_cmple_ps(tminmax, shuffle<2, 3, 0, 1>(tminmax));
+ const ssef minmax = max(max(tminmaxx, tminmaxy), max(tminmaxz, tsplat));
+ const ssef tminmax = minmax ^ pn;
+ const sseb lrhit = tminmax <= shuffle<2, 3, 0, 1>(tminmax);
/* decide which nodes to traverse next */
#ifdef __VISIBILITY_FLAG__
/* this visibility test gives a 5% performance hit, how to solve? */
- traverseChild0 = (_mm_movemask_ps(lrhit) & 1) && (__float_as_uint(cnodes.z) & PATH_RAY_SHADOW);
- traverseChild1 = (_mm_movemask_ps(lrhit) & 2) && (__float_as_uint(cnodes.w) & PATH_RAY_SHADOW);
+ traverseChild0 = (movemask(lrhit) & 1) && (__float_as_uint(cnodes.z) & PATH_RAY_SHADOW);
+ traverseChild1 = (movemask(lrhit) & 2) && (__float_as_uint(cnodes.w) & PATH_RAY_SHADOW);
#else
- traverseChild0 = (_mm_movemask_ps(lrhit) & 1);
- traverseChild1 = (_mm_movemask_ps(lrhit) & 2);
+ traverseChild0 = (movemask(lrhit) & 1);
+ traverseChild1 = (movemask(lrhit) & 2);
#endif
#endif // __KERNEL_SSE2__
@@ -164,9 +164,7 @@ ccl_device bool BVH_FUNCTION_NAME
#if !defined(__KERNEL_SSE2__)
bool closestChild1 = (c1min < c0min);
#else
- union { __m128 m128; float v[4]; } uminmax;
- uminmax.m128 = tminmax;
- bool closestChild1 = uminmax.v[1] < uminmax.v[0];
+ bool closestChild1 = tminmax[1] < tminmax[0];
#endif
if(closestChild1) {
@@ -254,8 +252,7 @@ ccl_device bool BVH_FUNCTION_NAME
if(kernel_tex_fetch(__prim_type, isect_array->prim) & PRIMITIVE_ALL_TRIANGLE)
#endif
{
- float4 Ns = kernel_tex_fetch(__tri_normal, prim);
- shader = __float_as_int(Ns.w);
+ shader = kernel_tex_fetch(__tri_shader, prim);
}
#ifdef __HAIR__
else {
@@ -301,12 +298,12 @@ ccl_device bool BVH_FUNCTION_NAME
num_hits_in_instance = 0;
#if defined(__KERNEL_SSE2__)
- Psplat[0] = _mm_set_ps1(P.x);
- Psplat[1] = _mm_set_ps1(P.y);
- Psplat[2] = _mm_set_ps1(P.z);
+ Psplat[0] = ssef(P.x);
+ Psplat[1] = ssef(P.y);
+ Psplat[2] = ssef(P.z);
isect_array->t = isect_t;
- tsplat = _mm_set_ps(-isect_t, -isect_t, 0.0f, 0.0f);
+ tsplat = ssef(0.0f, 0.0f, -isect_t, -isect_t);
gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
#endif
@@ -348,13 +345,13 @@ ccl_device bool BVH_FUNCTION_NAME
}
#if defined(__KERNEL_SSE2__)
- Psplat[0] = _mm_set_ps1(P.x);
- Psplat[1] = _mm_set_ps1(P.y);
- Psplat[2] = _mm_set_ps1(P.z);
+ Psplat[0] = ssef(P.x);
+ Psplat[1] = ssef(P.y);
+ Psplat[2] = ssef(P.z);
isect_t = tmax;
isect_array->t = isect_t;
- tsplat = _mm_set_ps(-isect_t, -isect_t, 0.0f, 0.0f);
+ tsplat = ssef(0.0f, 0.0f, -isect_t, -isect_t);
gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
#endif
diff --git a/intern/cycles/kernel/geom/geom_bvh_subsurface.h b/intern/cycles/kernel/geom/geom_bvh_subsurface.h
index a19f05dd371..a8f57cffa78 100644
--- a/intern/cycles/kernel/geom/geom_bvh_subsurface.h
+++ b/intern/cycles/kernel/geom/geom_bvh_subsurface.h
@@ -65,15 +65,15 @@ ccl_device uint BVH_FUNCTION_NAME(KernelGlobals *kg, const Ray *ray, Intersectio
const shuffle_swap_t shuf_identity = shuffle_swap_identity();
const shuffle_swap_t shuf_swap = shuffle_swap_swap();
- const __m128 pn = _mm_castsi128_ps(_mm_set_epi32(0x80000000, 0x80000000, 0, 0));
- __m128 Psplat[3], idirsplat[3];
+ const ssef pn = cast(ssei(0, 0, 0x80000000, 0x80000000));
+ ssef Psplat[3], idirsplat[3];
shuffle_swap_t shufflexyz[3];
- Psplat[0] = _mm_set_ps1(P.x);
- Psplat[1] = _mm_set_ps1(P.y);
- Psplat[2] = _mm_set_ps1(P.z);
+ Psplat[0] = ssef(P.x);
+ Psplat[1] = ssef(P.y);
+ Psplat[2] = ssef(P.z);
- __m128 tsplat = _mm_set_ps(-isect_t, -isect_t, 0.0f, 0.0f);
+ ssef tsplat(0.0f, 0.0f, -isect_t, -isect_t);
gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
#endif
@@ -131,25 +131,27 @@ ccl_device uint BVH_FUNCTION_NAME(KernelGlobals *kg, const Ray *ray, Intersectio
/* Intersect two child bounding boxes, SSE3 version adapted from Embree */
/* fetch node data */
- const __m128 *bvh_nodes = (__m128*)kg->__bvh_nodes.data + nodeAddr*BVH_NODE_SIZE;
+ const ssef *bvh_nodes = (ssef*)kg->__bvh_nodes.data + nodeAddr*BVH_NODE_SIZE;
const float4 cnodes = ((float4*)bvh_nodes)[3];
/* intersect ray against child nodes */
- const __m128 tminmaxx = _mm_mul_ps(_mm_sub_ps(shuffle_swap(bvh_nodes[0], shufflexyz[0]), Psplat[0]), idirsplat[0]);
- const __m128 tminmaxy = _mm_mul_ps(_mm_sub_ps(shuffle_swap(bvh_nodes[1], shufflexyz[1]), Psplat[1]), idirsplat[1]);
- const __m128 tminmaxz = _mm_mul_ps(_mm_sub_ps(shuffle_swap(bvh_nodes[2], shufflexyz[2]), Psplat[2]), idirsplat[2]);
+ const ssef tminmaxx = (shuffle_swap(bvh_nodes[0], shufflexyz[0]) - Psplat[0]) * idirsplat[0];
+ const ssef tminmaxy = (shuffle_swap(bvh_nodes[1], shufflexyz[1]) - Psplat[1]) * idirsplat[1];
+ const ssef tminmaxz = (shuffle_swap(bvh_nodes[2], shufflexyz[2]) - Psplat[2]) * idirsplat[2];
- const __m128 tminmax = _mm_xor_ps(_mm_max_ps(_mm_max_ps(tminmaxx, tminmaxy), _mm_max_ps(tminmaxz, tsplat)), pn);
- const __m128 lrhit = _mm_cmple_ps(tminmax, shuffle<2, 3, 0, 1>(tminmax));
+ /* calculate { c0min, c1min, -c0max, -c1max} */
+ const ssef minmax = max(max(tminmaxx, tminmaxy), max(tminmaxz, tsplat));
+ const ssef tminmax = minmax ^ pn;
+ const sseb lrhit = tminmax <= shuffle<2, 3, 0, 1>(tminmax);
/* decide which nodes to traverse next */
#ifdef __VISIBILITY_FLAG__
/* this visibility test gives a 5% performance hit, how to solve? */
- traverseChild0 = (_mm_movemask_ps(lrhit) & 1) && (__float_as_uint(cnodes.z) & visibility);
- traverseChild1 = (_mm_movemask_ps(lrhit) & 2) && (__float_as_uint(cnodes.w) & visibility);
+ traverseChild0 = (movemask(lrhit) & 1) && (__float_as_uint(cnodes.z) & visibility);
+ traverseChild1 = (movemask(lrhit) & 2) && (__float_as_uint(cnodes.w) & visibility);
#else
- traverseChild0 = (_mm_movemask_ps(lrhit) & 1);
- traverseChild1 = (_mm_movemask_ps(lrhit) & 2);
+ traverseChild0 = (movemask(lrhit) & 1);
+ traverseChild1 = (movemask(lrhit) & 2);
#endif
#endif // __KERNEL_SSE2__
@@ -161,9 +163,7 @@ ccl_device uint BVH_FUNCTION_NAME(KernelGlobals *kg, const Ray *ray, Intersectio
#if !defined(__KERNEL_SSE2__)
bool closestChild1 = (c1min < c0min);
#else
- union { __m128 m128; float v[4]; } uminmax;
- uminmax.m128 = tminmax;
- bool closestChild1 = uminmax.v[1] < uminmax.v[0];
+ bool closestChild1 = tminmax[1] < tminmax[0];
#endif
if(closestChild1) {
@@ -243,11 +243,11 @@ ccl_device uint BVH_FUNCTION_NAME(KernelGlobals *kg, const Ray *ray, Intersectio
#endif
#if defined(__KERNEL_SSE2__)
- Psplat[0] = _mm_set_ps1(P.x);
- Psplat[1] = _mm_set_ps1(P.y);
- Psplat[2] = _mm_set_ps1(P.z);
+ Psplat[0] = ssef(P.x);
+ Psplat[1] = ssef(P.y);
+ Psplat[2] = ssef(P.z);
- tsplat = _mm_set_ps(-isect_t, -isect_t, 0.0f, 0.0f);
+ tsplat = ssef(0.0f, 0.0f, -isect_t, -isect_t);
gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
#endif
@@ -279,11 +279,11 @@ ccl_device uint BVH_FUNCTION_NAME(KernelGlobals *kg, const Ray *ray, Intersectio
#endif
#if defined(__KERNEL_SSE2__)
- Psplat[0] = _mm_set_ps1(P.x);
- Psplat[1] = _mm_set_ps1(P.y);
- Psplat[2] = _mm_set_ps1(P.z);
+ Psplat[0] = ssef(P.x);
+ Psplat[1] = ssef(P.y);
+ Psplat[2] = ssef(P.z);
- tsplat = _mm_set_ps(-isect_t, -isect_t, 0.0f, 0.0f);
+ tsplat = ssef(0.0f, 0.0f, -isect_t, -isect_t);
gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
#endif
diff --git a/intern/cycles/kernel/geom/geom_bvh_traversal.h b/intern/cycles/kernel/geom/geom_bvh_traversal.h
index 9fd40f91471..114d30a479d 100644
--- a/intern/cycles/kernel/geom/geom_bvh_traversal.h
+++ b/intern/cycles/kernel/geom/geom_bvh_traversal.h
@@ -63,24 +63,28 @@ ccl_device bool BVH_FUNCTION_NAME
#endif
isect->t = ray->t;
- isect->object = OBJECT_NONE;
- isect->prim = PRIM_NONE;
isect->u = 0.0f;
isect->v = 0.0f;
+ isect->prim = PRIM_NONE;
+ isect->object = OBJECT_NONE;
+
+#if defined(__KERNEL_DEBUG__)
+ isect->num_traversal_steps = 0;
+#endif
#if defined(__KERNEL_SSE2__)
const shuffle_swap_t shuf_identity = shuffle_swap_identity();
const shuffle_swap_t shuf_swap = shuffle_swap_swap();
- const __m128 pn = _mm_castsi128_ps(_mm_set_epi32(0x80000000, 0x80000000, 0, 0));
- __m128 Psplat[3], idirsplat[3];
+ const ssef pn = cast(ssei(0, 0, 0x80000000, 0x80000000));
+ ssef Psplat[3], idirsplat[3];
shuffle_swap_t shufflexyz[3];
- Psplat[0] = _mm_set_ps1(P.x);
- Psplat[1] = _mm_set_ps1(P.y);
- Psplat[2] = _mm_set_ps1(P.z);
+ Psplat[0] = ssef(P.x);
+ Psplat[1] = ssef(P.y);
+ Psplat[2] = ssef(P.z);
- __m128 tsplat = _mm_set_ps(-isect->t, -isect->t, 0.0f, 0.0f);
+ ssef tsplat(0.0f, 0.0f, -isect->t, -isect->t);
gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
#endif
@@ -151,17 +155,17 @@ ccl_device bool BVH_FUNCTION_NAME
/* Intersect two child bounding boxes, SSE3 version adapted from Embree */
/* fetch node data */
- const __m128 *bvh_nodes = (__m128*)kg->__bvh_nodes.data + nodeAddr*BVH_NODE_SIZE;
+ const ssef *bvh_nodes = (ssef*)kg->__bvh_nodes.data + nodeAddr*BVH_NODE_SIZE;
const float4 cnodes = ((float4*)bvh_nodes)[3];
/* intersect ray against child nodes */
- const __m128 tminmaxx = _mm_mul_ps(_mm_sub_ps(shuffle_swap(bvh_nodes[0], shufflexyz[0]), Psplat[0]), idirsplat[0]);
- const __m128 tminmaxy = _mm_mul_ps(_mm_sub_ps(shuffle_swap(bvh_nodes[1], shufflexyz[1]), Psplat[1]), idirsplat[1]);
- const __m128 tminmaxz = _mm_mul_ps(_mm_sub_ps(shuffle_swap(bvh_nodes[2], shufflexyz[2]), Psplat[2]), idirsplat[2]);
+ const ssef tminmaxx = (shuffle_swap(bvh_nodes[0], shufflexyz[0]) - Psplat[0]) * idirsplat[0];
+ const ssef tminmaxy = (shuffle_swap(bvh_nodes[1], shufflexyz[1]) - Psplat[1]) * idirsplat[1];
+ const ssef tminmaxz = (shuffle_swap(bvh_nodes[2], shufflexyz[2]) - Psplat[2]) * idirsplat[2];
/* calculate { c0min, c1min, -c0max, -c1max} */
- __m128 minmax = _mm_max_ps(_mm_max_ps(tminmaxx, tminmaxy), _mm_max_ps(tminmaxz, tsplat));
- const __m128 tminmax = _mm_xor_ps(minmax, pn);
+ ssef minmax = max(max(tminmaxx, tminmaxy), max(tminmaxz, tsplat));
+ const ssef tminmax = minmax ^ pn;
#if FEATURE(BVH_HAIR_MINIMUM_WIDTH)
if(difl != 0.0f) {
@@ -182,16 +186,16 @@ ccl_device bool BVH_FUNCTION_NAME
}
#endif
- const __m128 lrhit = _mm_cmple_ps(tminmax, shuffle<2, 3, 0, 1>(tminmax));
+ const sseb lrhit = tminmax <= shuffle<2, 3, 0, 1>(tminmax);
/* decide which nodes to traverse next */
#ifdef __VISIBILITY_FLAG__
/* this visibility test gives a 5% performance hit, how to solve? */
- traverseChild0 = (_mm_movemask_ps(lrhit) & 1) && (__float_as_uint(cnodes.z) & visibility);
- traverseChild1 = (_mm_movemask_ps(lrhit) & 2) && (__float_as_uint(cnodes.w) & visibility);
+ traverseChild0 = (movemask(lrhit) & 1) && (__float_as_uint(cnodes.z) & visibility);
+ traverseChild1 = (movemask(lrhit) & 2) && (__float_as_uint(cnodes.w) & visibility);
#else
- traverseChild0 = (_mm_movemask_ps(lrhit) & 1);
- traverseChild1 = (_mm_movemask_ps(lrhit) & 2);
+ traverseChild0 = (movemask(lrhit) & 1);
+ traverseChild1 = (movemask(lrhit) & 2);
#endif
#endif // __KERNEL_SSE2__
@@ -203,9 +207,7 @@ ccl_device bool BVH_FUNCTION_NAME
#if !defined(__KERNEL_SSE2__)
bool closestChild1 = (c1min < c0min);
#else
- union { __m128 m128; float v[4]; } uminmax;
- uminmax.m128 = tminmax;
- bool closestChild1 = uminmax.v[1] < uminmax.v[0];
+ bool closestChild1 = tminmax[1] < tminmax[0];
#endif
if(closestChild1) {
@@ -228,6 +230,10 @@ ccl_device bool BVH_FUNCTION_NAME
--stackPtr;
}
}
+
+#if defined(__KERNEL_DEBUG__)
+ isect->num_traversal_steps++;
+#endif
}
/* if node is leaf, fetch triangle list */
@@ -276,13 +282,17 @@ ccl_device bool BVH_FUNCTION_NAME
}
}
+#if defined(__KERNEL_DEBUG__)
+ isect->num_traversal_steps++;
+#endif
+
/* shadow ray early termination */
#if defined(__KERNEL_SSE2__)
if(hit) {
if(visibility == PATH_RAY_SHADOW_OPAQUE)
return true;
- tsplat = _mm_set_ps(-isect->t, -isect->t, 0.0f, 0.0f);
+ tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t);
}
#else
if(hit && visibility == PATH_RAY_SHADOW_OPAQUE)
@@ -304,11 +314,11 @@ ccl_device bool BVH_FUNCTION_NAME
#endif
#if defined(__KERNEL_SSE2__)
- Psplat[0] = _mm_set_ps1(P.x);
- Psplat[1] = _mm_set_ps1(P.y);
- Psplat[2] = _mm_set_ps1(P.z);
+ Psplat[0] = ssef(P.x);
+ Psplat[1] = ssef(P.y);
+ Psplat[2] = ssef(P.z);
- tsplat = _mm_set_ps(-isect->t, -isect->t, 0.0f, 0.0f);
+ tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t);
gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
#endif
@@ -334,11 +344,11 @@ ccl_device bool BVH_FUNCTION_NAME
#endif
#if defined(__KERNEL_SSE2__)
- Psplat[0] = _mm_set_ps1(P.x);
- Psplat[1] = _mm_set_ps1(P.y);
- Psplat[2] = _mm_set_ps1(P.z);
+ Psplat[0] = ssef(P.x);
+ Psplat[1] = ssef(P.y);
+ Psplat[2] = ssef(P.z);
- tsplat = _mm_set_ps(-isect->t, -isect->t, 0.0f, 0.0f);
+ tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t);
gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
#endif
diff --git a/intern/cycles/kernel/geom/geom_bvh_volume.h b/intern/cycles/kernel/geom/geom_bvh_volume.h
new file mode 100644
index 00000000000..9dd8d226f5b
--- /dev/null
+++ b/intern/cycles/kernel/geom/geom_bvh_volume.h
@@ -0,0 +1,322 @@
+/*
+ * Adapted from code Copyright 2009-2010 NVIDIA Corporation,
+ * and code copyright 2009-2012 Intel Corporation
+ *
+ * Modifications Copyright 2011-2014, Blender Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* This is a template BVH traversal function for volumes, where
+ * various features can be enabled/disabled. This way we can compile optimized
+ * versions for each case without new features slowing things down.
+ *
+ * BVH_INSTANCING: object instancing
+ * BVH_HAIR: hair curve rendering
+ * BVH_MOTION: motion blur rendering
+ *
+ */
+
+#define FEATURE(f) (((BVH_FUNCTION_FEATURES) & (f)) != 0)
+
+ccl_device bool BVH_FUNCTION_NAME(KernelGlobals *kg,
+ const Ray *ray,
+ Intersection *isect)
+{
+ /* Traverses the BVH testing the ray only against primitives of objects flagged
+ * SD_OBJECT_HAS_VOLUME. Returns true when an intersection routine set isect->prim.
+ *
+ * todo: test pushing distance on the stack (non shadow rays); separate shadow ray
+ * version; likely/unlikely for if() statements; test restrict for pointers.
+ */
+
+ /* traversal stack in CUDA thread-local memory */
+ int traversalStack[BVH_STACK_SIZE];
+ traversalStack[0] = ENTRYPOINT_SENTINEL;
+
+ /* traversal variables in registers */
+ int stackPtr = 0;
+ int nodeAddr = kernel_data.bvh.root;
+
+ /* ray parameters in registers */
+ float3 P = ray->P;
+ float3 dir = bvh_clamp_direction(ray->D);
+ float3 idir = bvh_inverse_direction(dir);
+ int object = OBJECT_NONE;
+
+ const uint visibility = PATH_RAY_ALL_VISIBILITY;
+
+#if FEATURE(BVH_MOTION)
+ Transform ob_tfm;
+#endif
+
+ isect->t = ray->t;
+ isect->u = 0.0f;
+ isect->v = 0.0f;
+ isect->prim = PRIM_NONE;
+ isect->object = OBJECT_NONE;
+
+#if defined(__KERNEL_SSE2__)
+ const shuffle_swap_t shuf_identity = shuffle_swap_identity();
+ const shuffle_swap_t shuf_swap = shuffle_swap_swap();
+
+ const ssef pn = cast(ssei(0, 0, 0x80000000, 0x80000000));
+ ssef Psplat[3], idirsplat[3];
+ shuffle_swap_t shufflexyz[3];
+
+ Psplat[0] = ssef(P.x);
+ Psplat[1] = ssef(P.y);
+ Psplat[2] = ssef(P.z);
+
+ ssef tsplat(0.0f, 0.0f, -isect->t, -isect->t);
+
+ gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
+#endif
+
+ /* traversal loop */
+ do {
+ do {
+ /* traverse internal nodes */
+ while(nodeAddr >= 0 && nodeAddr != ENTRYPOINT_SENTINEL) {
+ bool traverseChild0, traverseChild1;
+ int nodeAddrChild1;
+
+#if !defined(__KERNEL_SSE2__)
+ /* Intersect two child bounding boxes, non-SSE version */
+ float t = isect->t;
+
+ /* fetch node data */
+ float4 node0 = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+0);
+ float4 node1 = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+1);
+ float4 node2 = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+2);
+ float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+3);
+
+ /* intersect ray against child nodes */
+ NO_EXTENDED_PRECISION float c0lox = (node0.x - P.x) * idir.x;
+ NO_EXTENDED_PRECISION float c0hix = (node0.z - P.x) * idir.x;
+ NO_EXTENDED_PRECISION float c0loy = (node1.x - P.y) * idir.y;
+ NO_EXTENDED_PRECISION float c0hiy = (node1.z - P.y) * idir.y;
+ NO_EXTENDED_PRECISION float c0loz = (node2.x - P.z) * idir.z;
+ NO_EXTENDED_PRECISION float c0hiz = (node2.z - P.z) * idir.z;
+ NO_EXTENDED_PRECISION float c0min = max4(min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz), 0.0f);
+ NO_EXTENDED_PRECISION float c0max = min4(max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz), t);
+
+ NO_EXTENDED_PRECISION float c1lox = (node0.y - P.x) * idir.x;
+ NO_EXTENDED_PRECISION float c1hix = (node0.w - P.x) * idir.x;
+ NO_EXTENDED_PRECISION float c1loy = (node1.y - P.y) * idir.y;
+ NO_EXTENDED_PRECISION float c1hiy = (node1.w - P.y) * idir.y;
+ NO_EXTENDED_PRECISION float c1loz = (node2.y - P.z) * idir.z;
+ NO_EXTENDED_PRECISION float c1hiz = (node2.w - P.z) * idir.z;
+ NO_EXTENDED_PRECISION float c1min = max4(min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz), 0.0f);
+ NO_EXTENDED_PRECISION float c1max = min4(max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz), t);
+
+ /* decide which nodes to traverse next */
+#ifdef __VISIBILITY_FLAG__
+ /* this visibility test gives a 5% performance hit, how to solve? */
+ traverseChild0 = (c0max >= c0min) && (__float_as_uint(cnodes.z) & visibility);
+ traverseChild1 = (c1max >= c1min) && (__float_as_uint(cnodes.w) & visibility);
+#else
+ traverseChild0 = (c0max >= c0min);
+ traverseChild1 = (c1max >= c1min);
+#endif
+
+#else // __KERNEL_SSE2__
+ /* Intersect two child bounding boxes, SSE3 version adapted from Embree */
+
+ /* fetch node data */
+ const ssef *bvh_nodes = (ssef*)kg->__bvh_nodes.data + nodeAddr*BVH_NODE_SIZE;
+ const float4 cnodes = ((float4*)bvh_nodes)[3];
+
+ /* intersect ray against child nodes */
+ const ssef tminmaxx = (shuffle_swap(bvh_nodes[0], shufflexyz[0]) - Psplat[0]) * idirsplat[0];
+ const ssef tminmaxy = (shuffle_swap(bvh_nodes[1], shufflexyz[1]) - Psplat[1]) * idirsplat[1];
+ const ssef tminmaxz = (shuffle_swap(bvh_nodes[2], shufflexyz[2]) - Psplat[2]) * idirsplat[2];
+
+ /* calculate { c0min, c1min, -c0max, -c1max} */
+ ssef minmax = max(max(tminmaxx, tminmaxy), max(tminmaxz, tsplat));
+ const ssef tminmax = minmax ^ pn;
+
+ const sseb lrhit = tminmax <= shuffle<2, 3, 0, 1>(tminmax);
+
+ /* decide which nodes to traverse next */
+#ifdef __VISIBILITY_FLAG__
+ /* this visibility test gives a 5% performance hit, how to solve? */
+ traverseChild0 = (movemask(lrhit) & 1) && (__float_as_uint(cnodes.z) & visibility);
+ traverseChild1 = (movemask(lrhit) & 2) && (__float_as_uint(cnodes.w) & visibility);
+#else
+ traverseChild0 = (movemask(lrhit) & 1);
+ traverseChild1 = (movemask(lrhit) & 2);
+#endif
+#endif // __KERNEL_SSE2__
+
+ nodeAddr = __float_as_int(cnodes.x);
+ nodeAddrChild1 = __float_as_int(cnodes.y);
+
+ if(traverseChild0 && traverseChild1) {
+ /* both children were intersected, push the farther one */
+#if !defined(__KERNEL_SSE2__)
+ bool closestChild1 = (c1min < c0min);
+#else
+ bool closestChild1 = tminmax[1] < tminmax[0];
+#endif
+
+ if(closestChild1) {
+ int tmp = nodeAddr;
+ nodeAddr = nodeAddrChild1;
+ nodeAddrChild1 = tmp;
+ }
+
+ ++stackPtr;
+ traversalStack[stackPtr] = nodeAddrChild1;
+ }
+ else {
+ /* one child was intersected */
+ if(traverseChild1) {
+ nodeAddr = nodeAddrChild1;
+ }
+ else if(!traverseChild0) {
+ /* neither child was intersected */
+ nodeAddr = traversalStack[stackPtr];
+ --stackPtr;
+ }
+ }
+ }
+
+ /* if node is leaf, fetch triangle list */
+ if(nodeAddr < 0) {
+ float4 leaf = kernel_tex_fetch(__bvh_nodes, (-nodeAddr-1)*BVH_NODE_SIZE+(BVH_NODE_SIZE-1));
+ int primAddr = __float_as_int(leaf.x);
+
+#if FEATURE(BVH_INSTANCING)
+ if(primAddr >= 0) {
+#endif
+ int primAddr2 = __float_as_int(leaf.y);
+
+ /* pop */
+ nodeAddr = traversalStack[stackPtr];
+ --stackPtr;
+
+ /* primitive intersection */
+ for(; primAddr < primAddr2; primAddr++) {
+ /* only primitives from volume object */
+ uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, primAddr): object;
+ int object_flag = kernel_tex_fetch(__object_flag, tri_object);
+
+ if((object_flag & SD_OBJECT_HAS_VOLUME) == 0) {
+ continue;
+ }
+
+ /* intersect ray against primitive */
+ uint type = kernel_tex_fetch(__prim_type, primAddr);
+
+ switch(type & PRIMITIVE_ALL) {
+ case PRIMITIVE_TRIANGLE: {
+ triangle_intersect(kg, isect, P, dir, visibility, object, primAddr);
+ break;
+ }
+#if FEATURE(BVH_MOTION)
+ case PRIMITIVE_MOTION_TRIANGLE: {
+ motion_triangle_intersect(kg, isect, P, dir, ray->time, visibility, object, primAddr);
+ break;
+ }
+#endif
+#if FEATURE(BVH_HAIR)
+ case PRIMITIVE_CURVE:
+ case PRIMITIVE_MOTION_CURVE: {
+ if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE)
+ bvh_cardinal_curve_intersect(kg, isect, P, dir, visibility, object, primAddr, ray->time, type, NULL, 0, 0);
+ else
+ bvh_curve_intersect(kg, isect, P, dir, visibility, object, primAddr, ray->time, type, NULL, 0, 0);
+ break;
+ }
+#endif
+ default: {
+ break;
+ }
+ }
+ }
+ }
+#if FEATURE(BVH_INSTANCING)
+ else {
+ /* instance push */
+ object = kernel_tex_fetch(__prim_object, -primAddr-1);
+ int object_flag = kernel_tex_fetch(__object_flag, object);
+
+ if(object_flag & SD_OBJECT_HAS_VOLUME) {
+#if FEATURE(BVH_MOTION)
+ bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect->t, &ob_tfm);
+#else
+ bvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect->t);
+#endif
+
+#if defined(__KERNEL_SSE2__)
+ Psplat[0] = ssef(P.x);
+ Psplat[1] = ssef(P.y);
+ Psplat[2] = ssef(P.z);
+
+ tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t);
+
+ gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
+#endif
+
+ ++stackPtr;
+ traversalStack[stackPtr] = ENTRYPOINT_SENTINEL;
+
+ nodeAddr = kernel_tex_fetch(__object_node, object);
+ }
+ else {
+ /* pop; the instance is skipped, so it must not stay the current object */
+ object = OBJECT_NONE;
+ nodeAddr = traversalStack[stackPtr];
+ --stackPtr;
+ }
+ }
+ }
+#endif
+ } while(nodeAddr != ENTRYPOINT_SENTINEL);
+
+#if FEATURE(BVH_INSTANCING)
+ if(stackPtr >= 0) {
+ kernel_assert(object != OBJECT_NONE);
+
+ /* instance pop */
+#if FEATURE(BVH_MOTION)
+ bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, &isect->t, &ob_tfm);
+#else
+ bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &isect->t);
+#endif
+
+#if defined(__KERNEL_SSE2__)
+ Psplat[0] = ssef(P.x);
+ Psplat[1] = ssef(P.y);
+ Psplat[2] = ssef(P.z);
+
+ tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t);
+
+ gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
+#endif
+
+ object = OBJECT_NONE;
+ nodeAddr = traversalStack[stackPtr];
+ --stackPtr;
+ }
+#endif
+ } while(nodeAddr != ENTRYPOINT_SENTINEL);
+
+ return (isect->prim != PRIM_NONE);
+}
+
+#undef FEATURE
+#undef BVH_FUNCTION_NAME
+#undef BVH_FUNCTION_FEATURES
+
diff --git a/intern/cycles/kernel/geom/geom_curve.h b/intern/cycles/kernel/geom/geom_curve.h
index e1d225436a6..b6d21c91916 100644
--- a/intern/cycles/kernel/geom/geom_curve.h
+++ b/intern/cycles/kernel/geom/geom_curve.h
@@ -214,9 +214,9 @@ ccl_device_inline void curvebounds(float *lower, float *upper, float *extremta,
}
#ifdef __KERNEL_SSE2__
-ccl_device_inline __m128 transform_point_T3(const __m128 t[3], const __m128 &a)
+ccl_device_inline ssef transform_point_T3(const ssef t[3], const ssef &a)
{
- return fma(broadcast<0>(a), t[0], fma(broadcast<1>(a), t[1], _mm_mul_ps(broadcast<2>(a), t[2])));
+ return madd(shuffle<0>(a), t[0], madd(shuffle<1>(a), t[1], shuffle<2>(a) * t[2]));
}
#endif
@@ -238,16 +238,16 @@ ccl_device_inline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersect
int prim = kernel_tex_fetch(__prim_index, curveAddr);
#ifdef __KERNEL_SSE2__
- __m128 vdir = load_m128(dir);
- __m128 vcurve_coef[4];
+ ssef vdir = load4f(dir);
+ ssef vcurve_coef[4];
const float3 *curve_coef = (float3 *)vcurve_coef;
{
- __m128 dtmp = _mm_mul_ps(vdir, vdir);
- __m128 d_ss = _mm_sqrt_ss(_mm_add_ss(dtmp, broadcast<2>(dtmp)));
- __m128 rd_ss = _mm_div_ss(_mm_set_ss(1.0f), d_ss);
+ ssef dtmp = vdir * vdir;
+ ssef d_ss = mm_sqrt(dtmp + shuffle<2>(dtmp));
+ ssef rd_ss = load1f_first(1.0f) / d_ss;
- __m128i v00vec = _mm_load_si128((__m128i *)&kg->__curves.data[prim]);
+ ssei v00vec = load4i((ssei *)&kg->__curves.data[prim]);
int2 &v00 = (int2 &)v00vec;
int k0 = v00.x + segment;
@@ -255,44 +255,44 @@ ccl_device_inline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersect
int ka = max(k0 - 1, v00.x);
int kb = min(k1 + 1, v00.x + v00.y - 1);
- __m128 P_curve[4];
+ ssef P_curve[4];
if(type & PRIMITIVE_CURVE) {
- P_curve[0] = _mm_load_ps(&kg->__curve_keys.data[ka].x);
- P_curve[1] = _mm_load_ps(&kg->__curve_keys.data[k0].x);
- P_curve[2] = _mm_load_ps(&kg->__curve_keys.data[k1].x);
- P_curve[3] = _mm_load_ps(&kg->__curve_keys.data[kb].x);
+ P_curve[0] = load4f(&kg->__curve_keys.data[ka].x);
+ P_curve[1] = load4f(&kg->__curve_keys.data[k0].x);
+ P_curve[2] = load4f(&kg->__curve_keys.data[k1].x);
+ P_curve[3] = load4f(&kg->__curve_keys.data[kb].x);
}
else {
int fobject = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, curveAddr): object;
motion_cardinal_curve_keys(kg, fobject, prim, time, ka, k0, k1, kb, (float4*)&P_curve);
}
- __m128 rd_sgn = set_sign_bit<0, 1, 1, 1>(broadcast<0>(rd_ss));
- __m128 mul_zxxy = _mm_mul_ps(shuffle<2, 0, 0, 1>(vdir), rd_sgn);
- __m128 mul_yz = _mm_mul_ps(shuffle<1, 2, 1, 2>(vdir), mul_zxxy);
- __m128 mul_shuf = shuffle<0, 1, 2, 3>(mul_zxxy, mul_yz);
- __m128 vdir0 = _mm_and_ps(vdir, _mm_castsi128_ps(_mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0)));
+ ssef rd_sgn = set_sign_bit<0, 1, 1, 1>(shuffle<0>(rd_ss));
+ ssef mul_zxxy = shuffle<2, 0, 0, 1>(vdir) * rd_sgn;
+ ssef mul_yz = shuffle<1, 2, 1, 2>(vdir) * mul_zxxy;
+ ssef mul_shuf = shuffle<0, 1, 2, 3>(mul_zxxy, mul_yz);
+ ssef vdir0 = vdir & cast(ssei(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
- __m128 htfm0 = shuffle<0, 2, 0, 3>(mul_shuf, vdir0);
- __m128 htfm1 = shuffle<1, 0, 1, 3>(_mm_set_ss(_mm_cvtss_f32(d_ss)), vdir0);
- __m128 htfm2 = shuffle<1, 3, 2, 3>(mul_shuf, vdir0);
+ ssef htfm0 = shuffle<0, 2, 0, 3>(mul_shuf, vdir0);
+ ssef htfm1 = shuffle<1, 0, 1, 3>(load1f_first(extract<0>(d_ss)), vdir0);
+ ssef htfm2 = shuffle<1, 3, 2, 3>(mul_shuf, vdir0);
- __m128 htfm[] = { htfm0, htfm1, htfm2 };
- __m128 vP = load_m128(P);
- __m128 p0 = transform_point_T3(htfm, _mm_sub_ps(P_curve[0], vP));
- __m128 p1 = transform_point_T3(htfm, _mm_sub_ps(P_curve[1], vP));
- __m128 p2 = transform_point_T3(htfm, _mm_sub_ps(P_curve[2], vP));
- __m128 p3 = transform_point_T3(htfm, _mm_sub_ps(P_curve[3], vP));
+ ssef htfm[] = { htfm0, htfm1, htfm2 };
+ ssef vP = load4f(P);
+ ssef p0 = transform_point_T3(htfm, P_curve[0] - vP);
+ ssef p1 = transform_point_T3(htfm, P_curve[1] - vP);
+ ssef p2 = transform_point_T3(htfm, P_curve[2] - vP);
+ ssef p3 = transform_point_T3(htfm, P_curve[3] - vP);
float fc = 0.71f;
- __m128 vfc = _mm_set1_ps(fc);
- __m128 vfcxp3 = _mm_mul_ps(vfc, p3);
+ ssef vfc = ssef(fc);
+ ssef vfcxp3 = vfc * p3;
vcurve_coef[0] = p1;
- vcurve_coef[1] = _mm_mul_ps(vfc, _mm_sub_ps(p2, p0));
- vcurve_coef[2] = fma(_mm_set1_ps(fc * 2.0f), p0, fma(_mm_set1_ps(fc - 3.0f), p1, fms(_mm_set1_ps(3.0f - 2.0f * fc), p2, vfcxp3)));
- vcurve_coef[3] = fms(_mm_set1_ps(fc - 2.0f), _mm_sub_ps(p2, p1), fms(vfc, p0, vfcxp3));
+ vcurve_coef[1] = vfc * (p2 - p0);
+ vcurve_coef[2] = madd(ssef(fc * 2.0f), p0, madd(ssef(fc - 3.0f), p1, msub(ssef(3.0f - 2.0f * fc), p2, vfcxp3)));
+ vcurve_coef[3] = msub(ssef(fc - 2.0f), p2 - p1, msub(vfc, p0, vfcxp3));
r_st = ((float4 &)P_curve[1]).w;
r_en = ((float4 &)P_curve[2]).w;
@@ -386,12 +386,12 @@ ccl_device_inline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersect
float i_st = tree * resol;
float i_en = i_st + (level * resol);
#ifdef __KERNEL_SSE2__
- __m128 vi_st = _mm_set1_ps(i_st), vi_en = _mm_set1_ps(i_en);
- __m128 vp_st = fma(fma(fma(vcurve_coef[3], vi_st, vcurve_coef[2]), vi_st, vcurve_coef[1]), vi_st, vcurve_coef[0]);
- __m128 vp_en = fma(fma(fma(vcurve_coef[3], vi_en, vcurve_coef[2]), vi_en, vcurve_coef[1]), vi_en, vcurve_coef[0]);
+ ssef vi_st = ssef(i_st), vi_en = ssef(i_en);
+ ssef vp_st = madd(madd(madd(vcurve_coef[3], vi_st, vcurve_coef[2]), vi_st, vcurve_coef[1]), vi_st, vcurve_coef[0]);
+ ssef vp_en = madd(madd(madd(vcurve_coef[3], vi_en, vcurve_coef[2]), vi_en, vcurve_coef[1]), vi_en, vcurve_coef[0]);
- __m128 vbmin = _mm_min_ps(vp_st, vp_en);
- __m128 vbmax = _mm_max_ps(vp_st, vp_en);
+ ssef vbmin = min(vp_st, vp_en);
+ ssef vbmax = max(vp_st, vp_en);
float3 &bmin = (float3 &)vbmin, &bmax = (float3 &)vbmax;
float &bminx = bmin.x, &bminy = bmin.y, &bminz = bmin.z;
@@ -600,13 +600,12 @@ ccl_device_inline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersect
#endif
{
/* record intersection */
+ isect->t = t;
+ isect->u = u;
+ isect->v = gd;
isect->prim = curveAddr;
isect->object = object;
isect->type = type;
- isect->u = u;
- isect->v = gd;
- /*isect->transparency = 1.0f - coverage; */
- isect->t = t;
hit = true;
}
@@ -679,38 +678,38 @@ ccl_device_inline bool bvh_curve_intersect(KernelGlobals *kg, Intersection *isec
float sphere_b_tmp = dot3(dir, sphere_dif1);
float3 sphere_dif2 = sphere_dif1 - sphere_b_tmp * dir;
#else
- __m128 P_curve[2];
+ ssef P_curve[2];
if(type & PRIMITIVE_CURVE) {
- P_curve[0] = _mm_load_ps(&kg->__curve_keys.data[k0].x);
- P_curve[1] = _mm_load_ps(&kg->__curve_keys.data[k1].x);
+ P_curve[0] = load4f(&kg->__curve_keys.data[k0].x);
+ P_curve[1] = load4f(&kg->__curve_keys.data[k1].x);
}
else {
int fobject = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, curveAddr): object;
motion_curve_keys(kg, fobject, prim, time, k0, k1, (float4*)&P_curve);
}
- const __m128 or12 = shuffle<3, 3, 3, 3>(P_curve[0], P_curve[1]);
+ const ssef or12 = shuffle<3, 3, 3, 3>(P_curve[0], P_curve[1]);
- __m128 r12 = or12;
- const __m128 vP = load_m128(P);
- const __m128 dif = _mm_sub_ps(vP, P_curve[0]);
- const __m128 dif_second = _mm_sub_ps(vP, P_curve[1]);
+ ssef r12 = or12;
+ const ssef vP = load4f(P);
+ const ssef dif = vP - P_curve[0];
+ const ssef dif_second = vP - P_curve[1];
if(difl != 0.0f) {
- const __m128 len1_sq = len3_squared_splat(dif);
- const __m128 len2_sq = len3_squared_splat(dif_second);
- const __m128 len12 = _mm_sqrt_ps(shuffle<0, 0, 0, 0>(len1_sq, len2_sq));
- const __m128 pixelsize12 = _mm_min_ps(_mm_mul_ps(len12, _mm_set1_ps(difl)), _mm_set1_ps(extmax));
- r12 = _mm_max_ps(or12, pixelsize12);
+ const ssef len1_sq = len3_squared_splat(dif);
+ const ssef len2_sq = len3_squared_splat(dif_second);
+ const ssef len12 = mm_sqrt(shuffle<0, 0, 0, 0>(len1_sq, len2_sq));
+ const ssef pixelsize12 = min(len12 * difl, ssef(extmax));
+ r12 = max(or12, pixelsize12);
}
- float or1 = _mm_cvtss_f32(or12), or2 = _mm_cvtss_f32(broadcast<2>(or12));
- float r1 = _mm_cvtss_f32(r12), r2 = _mm_cvtss_f32(broadcast<2>(r12));
-
- const __m128 p21_diff = _mm_sub_ps(P_curve[1], P_curve[0]);
- const __m128 sphere_dif1 = _mm_mul_ps(_mm_add_ps(dif, dif_second), _mm_set1_ps(0.5f));
- const __m128 dir = load_m128(direction);
- const __m128 sphere_b_tmp = dot3_splat(dir, sphere_dif1);
- const __m128 sphere_dif2 = fnma(sphere_b_tmp, dir, sphere_dif1);
+ float or1 = extract<0>(or12), or2 = extract<0>(shuffle<2>(or12));
+ float r1 = extract<0>(r12), r2 = extract<0>(shuffle<2>(r12));
+
+ const ssef p21_diff = P_curve[1] - P_curve[0];
+ const ssef sphere_dif1 = (dif + dif_second) * 0.5f;
+ const ssef dir = load4f(direction);
+ const ssef sphere_b_tmp = dot3_splat(dir, sphere_dif1);
+ const ssef sphere_dif2 = nmsub(sphere_b_tmp, dir, sphere_dif1);
#endif
float mr = max(r1, r2);
@@ -728,7 +727,7 @@ ccl_device_inline bool bvh_curve_intersect(KernelGlobals *kg, Intersection *isec
#ifndef __KERNEL_SSE2__
float3 tg = p21_diff * invl;
#else
- const __m128 tg = _mm_mul_ps(p21_diff, _mm_set1_ps(invl));
+ const ssef tg = p21_diff * invl;
#endif
float gd = (r2 - r1) * invl;
@@ -752,7 +751,7 @@ ccl_device_inline bool bvh_curve_intersect(KernelGlobals *kg, Intersection *isec
float3 cprod = cross(tg, dir);
float cprod2sq = len3_squared(cross(tg, dif));
#else
- const __m128 cprod = cross(tg, dir);
+ const ssef cprod = cross(tg, dir);
float cprod2sq = len3_squared(cross_zxy(tg, dif));
#endif
float cprodsq = len3_squared(cprod);
@@ -770,7 +769,7 @@ ccl_device_inline bool bvh_curve_intersect(KernelGlobals *kg, Intersection *isec
#ifndef __KERNEL_SSE2__
float3 tdif = dif + tcentre * dir;
#else
- const __m128 tdif = fma(_mm_set1_ps(tcentre), dir, dif);
+ const ssef tdif = madd(ssef(tcentre), dir, dif);
#endif
float tdifz = dot3(tdif, tg);
float tdifma = tdifz*gd + r1;
@@ -836,13 +835,12 @@ ccl_device_inline bool bvh_curve_intersect(KernelGlobals *kg, Intersection *isec
#endif
{
/* record intersection */
+ isect->t = t;
+ isect->u = z*invl;
+ isect->v = gd;
isect->prim = curveAddr;
isect->object = object;
isect->type = type;
- isect->u = z*invl;
- isect->v = gd;
- /*isect->transparency = 1.0f - adjradius;*/
- isect->t = t;
return true;
}
@@ -938,9 +936,10 @@ ccl_device_inline float3 bvh_curve_refine(KernelGlobals *kg, ShaderData *sd, con
sd->u = isect->u;
sd->v = 0.0f;
#endif
-
+
+ tg = normalize(curvetangent(isect->u, p[0], p[1], p[2], p[3]));
+
if(kernel_data.curve.curveflags & CURVE_KN_RIBBONS) {
- tg = normalize(curvetangent(isect->u, p[0], p[1], p[2], p[3]));
sd->Ng = normalize(-(D - tg * (dot(tg, D))));
}
else {
@@ -952,7 +951,6 @@ ccl_device_inline float3 bvh_curve_refine(KernelGlobals *kg, ShaderData *sd, con
float gd = isect->v;
if(gd != 0.0f) {
- tg = normalize(curvetangent(isect->u, p[0], p[1], p[2], p[3]));
sd->Ng = sd->Ng - gd * tg;
sd->Ng = normalize(sd->Ng);
}
@@ -1012,10 +1010,6 @@ ccl_device_inline float3 bvh_curve_refine(KernelGlobals *kg, ShaderData *sd, con
sd->dPdv = cross(tg, sd->Ng);
#endif
- /*add fading parameter for minimum pixel width with transparency bsdf*/
- /*sd->curve_transparency = isect->transparency;*/
- /*sd->curve_radius = sd->u * gd * l + r1;*/
-
if(isect->object != OBJECT_NONE) {
#ifdef __OBJECT_MOTION__
Transform tfm = sd->ob_tfm;
diff --git a/intern/cycles/kernel/geom/geom_motion_triangle.h b/intern/cycles/kernel/geom/geom_motion_triangle.h
index 73338bb6b3b..3a4b20e61aa 100644
--- a/intern/cycles/kernel/geom/geom_motion_triangle.h
+++ b/intern/cycles/kernel/geom/geom_motion_triangle.h
@@ -233,8 +233,7 @@ ccl_device_inline float3 motion_triangle_refine_subsurface(KernelGlobals *kg, Sh
ccl_device_noinline void motion_triangle_shader_setup(KernelGlobals *kg, ShaderData *sd, const Intersection *isect, const Ray *ray, bool subsurface)
{
/* get shader */
- float4 Ns = kernel_tex_fetch(__tri_normal, sd->prim);
- sd->shader = __float_as_int(Ns.w);
+ sd->shader = kernel_tex_fetch(__tri_shader, sd->prim);
/* get motion info */
int numsteps, numverts;
@@ -273,7 +272,11 @@ ccl_device_noinline void motion_triangle_shader_setup(KernelGlobals *kg, ShaderD
#endif
/* compute face normal */
- float3 Ng = normalize(cross(verts[1] - verts[0], verts[2] - verts[0]));
+ float3 Ng;
+ if(sd->flag & SD_NEGATIVE_SCALE_APPLIED)
+ Ng = normalize(cross(verts[2] - verts[0], verts[1] - verts[0]));
+ else
+ Ng = normalize(cross(verts[1] - verts[0], verts[2] - verts[0]));
sd->Ng = Ng;
sd->N = Ng;
@@ -327,14 +330,21 @@ ccl_device_inline bool motion_triangle_intersect(KernelGlobals *kg, Intersection
float t, u, v;
if(ray_triangle_intersect_uv(P, dir, isect->t, verts[2], verts[0], verts[1], &u, &v, &t)) {
- isect->prim = triAddr;
- isect->object = object;
- isect->type = PRIMITIVE_MOTION_TRIANGLE;
- isect->u = u;
- isect->v = v;
- isect->t = t;
+#ifdef __VISIBILITY_FLAG__
+ /* visibility flag test. we do it here under the assumption
+ * that most triangles are culled by node flags */
+ if(kernel_tex_fetch(__prim_visibility, triAddr) & visibility)
+#endif
+ {
+ isect->t = t;
+ isect->u = u;
+ isect->v = v;
+ isect->prim = triAddr;
+ isect->object = object;
+ isect->type = PRIMITIVE_MOTION_TRIANGLE;
- return true;
+ return true;
+ }
}
return false;
@@ -378,12 +388,12 @@ ccl_device_inline void motion_triangle_intersect_subsurface(KernelGlobals *kg, I
/* record intersection */
Intersection *isect = &isect_array[hit];
+ isect->t = t;
+ isect->u = u;
+ isect->v = v;
isect->prim = triAddr;
isect->object = object;
isect->type = PRIMITIVE_MOTION_TRIANGLE;
- isect->u = u;
- isect->v = v;
- isect->t = t;
}
}
#endif
diff --git a/intern/cycles/kernel/geom/geom_primitive.h b/intern/cycles/kernel/geom/geom_primitive.h
index 533973621d7..5df6c75df86 100644
--- a/intern/cycles/kernel/geom/geom_primitive.h
+++ b/intern/cycles/kernel/geom/geom_primitive.h
@@ -143,6 +143,7 @@ ccl_device float4 primitive_motion_vector(KernelGlobals *kg, ShaderData *sd)
/* center position */
float3 center;
+#ifdef __HAIR__
if(sd->type & PRIMITIVE_ALL_CURVE) {
center = curve_motion_center_location(kg, sd);
@@ -150,6 +151,7 @@ ccl_device float4 primitive_motion_vector(KernelGlobals *kg, ShaderData *sd)
object_position_transform(kg, sd, &center);
}
else
+#endif
center = sd->P;
float3 motion_pre = center, motion_post = center;
diff --git a/intern/cycles/kernel/geom/geom_triangle.h b/intern/cycles/kernel/geom/geom_triangle.h
index 355e36fef0c..c08a82ee038 100644
--- a/intern/cycles/kernel/geom/geom_triangle.h
+++ b/intern/cycles/kernel/geom/geom_triangle.h
@@ -18,7 +18,7 @@
/* Triangle Primitive
*
* Basic triangle with 3 vertices is used to represent mesh surfaces. For BVH
- * ray intersection we use a precomputed triangle storage to accelarate
+ * ray intersection we use a precomputed triangle storage to accelerate
* intersection at the cost of more memory usage */
CCL_NAMESPACE_BEGIN
@@ -116,11 +116,28 @@ ccl_device_inline float3 triangle_refine_subsurface(KernelGlobals *kg, ShaderDat
#endif
}
+/* geometric (face) normal of the triangle sd->prim, computed from its vertices */
+ccl_device_inline float3 triangle_normal(KernelGlobals *kg, ShaderData *sd)
+{
+ /* the vertex indices are stored bit-cast inside the float4 components */
+ const float4 vindex = kernel_tex_fetch(__tri_vindex, sd->prim);
+
+ const float3 origin = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(vindex.x)));
+ const float3 edge1 = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(vindex.y))) - origin;
+ const float3 edge2 = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(vindex.z))) - origin;
+
+ /* the cross product operands are swapped when the negative scale flag is set */
+ if(sd->flag & SD_NEGATIVE_SCALE_APPLIED)
+ return normalize(cross(edge2, edge1));
+
+ return normalize(cross(edge1, edge2));
+}
+
/* point and normal on triangle */
-ccl_device_inline void triangle_point_normal(KernelGlobals *kg, int prim, float u, float v, float3 *P, float3 *Ng, int *shader)
+ccl_device_inline void triangle_point_normal(KernelGlobals *kg, int object, int prim, float u, float v, float3 *P, float3 *Ng, int *shader)
{
/* load triangle vertices */
- float3 tri_vindex = float4_to_float3(kernel_tex_fetch(__tri_vindex, prim));
+ float4 tri_vindex = kernel_tex_fetch(__tri_vindex, prim);
float3 v0 = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.x)));
float3 v1 = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.y)));
@@ -130,16 +147,24 @@ ccl_device_inline void triangle_point_normal(KernelGlobals *kg, int prim, float
float t = 1.0f - u - v;
*P = (u*v0 + v*v1 + t*v2);
- float4 Nm = kernel_tex_fetch(__tri_normal, prim);
- *Ng = make_float3(Nm.x, Nm.y, Nm.z);
- *shader = __float_as_int(Nm.w);
+ /* get object flags, instance-aware */
+ int object_flag = kernel_tex_fetch(__object_flag, object >= 0 ? object : ~object);
+
+ /* compute normal */
+ if(object_flag & SD_NEGATIVE_SCALE_APPLIED)
+ *Ng = normalize(cross(v2 - v0, v1 - v0));
+ else
+ *Ng = normalize(cross(v1 - v0, v2 - v0));
+
+ /* shader */
+ *shader = kernel_tex_fetch(__tri_shader, prim);
}
/* Triangle vertex locations */
ccl_device_inline void triangle_vertices(KernelGlobals *kg, int prim, float3 P[3])
{
- float3 tri_vindex = float4_to_float3(kernel_tex_fetch(__tri_vindex, prim));
+ float4 tri_vindex = kernel_tex_fetch(__tri_vindex, prim);
P[0] = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.x)));
P[1] = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.y)));
@@ -151,7 +176,7 @@ ccl_device_inline void triangle_vertices(KernelGlobals *kg, int prim, float3 P[3
ccl_device_inline float3 triangle_smooth_normal(KernelGlobals *kg, int prim, float u, float v)
{
/* load triangle vertices */
- float3 tri_vindex = float4_to_float3(kernel_tex_fetch(__tri_vindex, prim));
+ float4 tri_vindex = kernel_tex_fetch(__tri_vindex, prim);
float3 n0 = float4_to_float3(kernel_tex_fetch(__tri_vnormal, __float_as_int(tri_vindex.x)));
float3 n1 = float4_to_float3(kernel_tex_fetch(__tri_vnormal, __float_as_int(tri_vindex.y)));
@@ -165,7 +190,7 @@ ccl_device_inline float3 triangle_smooth_normal(KernelGlobals *kg, int prim, flo
ccl_device_inline void triangle_dPdudv(KernelGlobals *kg, int prim, float3 *dPdu, float3 *dPdv)
{
/* fetch triangle vertex coordinates */
- float3 tri_vindex = float4_to_float3(kernel_tex_fetch(__tri_vindex, prim));
+ float4 tri_vindex = kernel_tex_fetch(__tri_vindex, prim);
float3 p0 = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.x)));
float3 p1 = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.y)));
@@ -187,7 +212,7 @@ ccl_device float triangle_attribute_float(KernelGlobals *kg, const ShaderData *s
return kernel_tex_fetch(__attributes_float, offset + sd->prim);
}
else if(elem == ATTR_ELEMENT_VERTEX || elem == ATTR_ELEMENT_VERTEX_MOTION) {
- float3 tri_vindex = float4_to_float3(kernel_tex_fetch(__tri_vindex, sd->prim));
+ float4 tri_vindex = kernel_tex_fetch(__tri_vindex, sd->prim);
float f0 = kernel_tex_fetch(__attributes_float, offset + __float_as_int(tri_vindex.x));
float f1 = kernel_tex_fetch(__attributes_float, offset + __float_as_int(tri_vindex.y));
@@ -230,7 +255,7 @@ ccl_device float3 triangle_attribute_float3(KernelGlobals *kg, const ShaderData
return float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + sd->prim));
}
else if(elem == ATTR_ELEMENT_VERTEX || elem == ATTR_ELEMENT_VERTEX_MOTION) {
- float3 tri_vindex = float4_to_float3(kernel_tex_fetch(__tri_vindex, sd->prim));
+ float4 tri_vindex = kernel_tex_fetch(__tri_vindex, sd->prim);
float3 f0 = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + __float_as_int(tri_vindex.x)));
float3 f1 = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + __float_as_int(tri_vindex.y)));
@@ -243,11 +268,20 @@ ccl_device float3 triangle_attribute_float3(KernelGlobals *kg, const ShaderData
return sd->u*f0 + sd->v*f1 + (1.0f - sd->u - sd->v)*f2;
}
- else if(elem == ATTR_ELEMENT_CORNER) {
+ else if(elem == ATTR_ELEMENT_CORNER || elem == ATTR_ELEMENT_CORNER_BYTE) {
int tri = offset + sd->prim*3;
- float3 f0 = float4_to_float3(kernel_tex_fetch(__attributes_float3, tri + 0));
- float3 f1 = float4_to_float3(kernel_tex_fetch(__attributes_float3, tri + 1));
- float3 f2 = float4_to_float3(kernel_tex_fetch(__attributes_float3, tri + 2));
+ float3 f0, f1, f2;
+
+ if(elem == ATTR_ELEMENT_CORNER) {
+ f0 = float4_to_float3(kernel_tex_fetch(__attributes_float3, tri + 0));
+ f1 = float4_to_float3(kernel_tex_fetch(__attributes_float3, tri + 1));
+ f2 = float4_to_float3(kernel_tex_fetch(__attributes_float3, tri + 2));
+ }
+ else {
+ f0 = color_byte_to_float(kernel_tex_fetch(__attributes_uchar4, tri + 0));
+ f1 = color_byte_to_float(kernel_tex_fetch(__attributes_uchar4, tri + 1));
+ f2 = color_byte_to_float(kernel_tex_fetch(__attributes_uchar4, tri + 2));
+ }
#ifdef __RAY_DIFFERENTIALS__
if(dx) *dx = sd->du.dx*f0 + sd->dv.dx*f1 - (sd->du.dx + sd->dv.dx)*f2;
@@ -300,12 +334,12 @@ ccl_device_inline bool triangle_intersect(KernelGlobals *kg, Intersection *isect
#endif
{
/* record intersection */
+ isect->t = t;
+ isect->u = u;
+ isect->v = v;
isect->prim = triAddr;
isect->object = object;
isect->type = PRIMITIVE_TRIANGLE;
- isect->u = u;
- isect->v = v;
- isect->t = t;
return true;
}
}
@@ -363,12 +397,12 @@ ccl_device_inline void triangle_intersect_subsurface(KernelGlobals *kg, Intersec
/* record intersection */
Intersection *isect = &isect_array[hit];
+ isect->t = t;
+ isect->u = u;
+ isect->v = v;
isect->prim = triAddr;
isect->object = object;
isect->type = PRIMITIVE_TRIANGLE;
- isect->u = u;
- isect->v = v;
- isect->t = t;
}
}
}
diff --git a/intern/cycles/kernel/geom/geom_volume.h b/intern/cycles/kernel/geom/geom_volume.h
index 963d6cbee9c..3cb6d168f80 100644
--- a/intern/cycles/kernel/geom/geom_volume.h
+++ b/intern/cycles/kernel/geom/geom_volume.h
@@ -49,7 +49,15 @@ ccl_device float3 volume_normalized_position(KernelGlobals *kg, const ShaderData
ccl_device float volume_attribute_float(KernelGlobals *kg, const ShaderData *sd, AttributeElement elem, int id, float *dx, float *dy)
{
float3 P = volume_normalized_position(kg, sd, sd->P);
- float4 r = kernel_tex_image_interp_3d(id, P.x, P.y, P.z);
+#ifdef __KERNEL_GPU__
+ float4 r = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+#else
+ float4 r;
+ if(sd->flag & SD_VOLUME_CUBIC)
+ r = kernel_tex_image_interp_3d_ex(id, P.x, P.y, P.z, INTERPOLATION_CUBIC);
+ else
+ r = kernel_tex_image_interp_3d(id, P.x, P.y, P.z);
+#endif
if(dx) *dx = 0.0f;
if(dx) *dy = 0.0f;
@@ -61,7 +69,15 @@ ccl_device float volume_attribute_float(KernelGlobals *kg, const ShaderData *sd,
ccl_device float3 volume_attribute_float3(KernelGlobals *kg, const ShaderData *sd, AttributeElement elem, int id, float3 *dx, float3 *dy)
{
float3 P = volume_normalized_position(kg, sd, sd->P);
- float4 r = kernel_tex_image_interp_3d(id, P.x, P.y, P.z);
+#ifdef __KERNEL_GPU__
+ float4 r = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+#else
+ float4 r;
+ if(sd->flag & SD_VOLUME_CUBIC)
+ r = kernel_tex_image_interp_3d_ex(id, P.x, P.y, P.z, INTERPOLATION_CUBIC);
+ else
+ r = kernel_tex_image_interp_3d(id, P.x, P.y, P.z);
+#endif
if(dx) *dx = make_float3(0.0f, 0.0f, 0.0f);
if(dy) *dy = make_float3(0.0f, 0.0f, 0.0f);
diff --git a/intern/cycles/kernel/kernel.cl b/intern/cycles/kernel/kernel.cl
index 6988ad6027f..4f20ef9ca15 100644
--- a/intern/cycles/kernel/kernel.cl
+++ b/intern/cycles/kernel/kernel.cl
@@ -23,7 +23,7 @@
#include "kernel_film.h"
#include "kernel_path.h"
-#include "kernel_displace.h"
+#include "kernel_bake.h"
__kernel void kernel_ocl_path_trace(
ccl_constant KernelData *data,
@@ -115,7 +115,7 @@ __kernel void kernel_ocl_shader(
ccl_global type *name,
#include "kernel_textures.h"
- int type, int sx, int sw)
+ int type, int sx, int sw, int offset, int sample)
{
KernelGlobals kglobals, *kg = &kglobals;
@@ -128,6 +128,31 @@ __kernel void kernel_ocl_shader(
int x = sx + get_global_id(0);
if(x < sx + sw)
- kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, x);
+ kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, x, sample);
+}
+
+__kernel void kernel_ocl_bake(
+ ccl_constant KernelData *data,
+ ccl_global uint4 *input,
+ ccl_global float4 *output,
+
+#define KERNEL_TEX(type, ttype, name) \
+ ccl_global type *name,
+#include "kernel_textures.h"
+
+ int type, int sx, int sw, int offset, int sample)
+{
+ KernelGlobals kglobals, *kg = &kglobals;
+
+ kg->data = data;
+
+#define KERNEL_TEX(type, ttype, name) \
+ kg->name = name;
+#include "kernel_textures.h"
+
+ int x = sx + get_global_id(0);
+
+ if(x < sx + sw)
+ kernel_bake_evaluate(kg, input, output, (ShaderEvalType)type, x, offset, sample);
}
diff --git a/intern/cycles/kernel/kernel.cpp b/intern/cycles/kernel/kernel.cpp
index 173028d50c8..fa2113fbb46 100644
--- a/intern/cycles/kernel/kernel.cpp
+++ b/intern/cycles/kernel/kernel.cpp
@@ -23,7 +23,7 @@
#include "kernel_globals.h"
#include "kernel_film.h"
#include "kernel_path.h"
-#include "kernel_displace.h"
+#include "kernel_bake.h"
CCL_NAMESPACE_BEGIN
@@ -120,9 +120,12 @@ void kernel_cpu_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, float *bu
/* Shader Evaluation */
-void kernel_cpu_shader(KernelGlobals *kg, uint4 *input, float4 *output, int type, int i)
+void kernel_cpu_shader(KernelGlobals *kg, uint4 *input, float4 *output, int type, int i, int offset, int sample)
{
- kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, i);
+ if(type >= SHADER_EVAL_BAKE)
+ kernel_bake_evaluate(kg, input, output, (ShaderEvalType)type, i, offset, sample);
+ else
+ kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, i, sample);
}
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel.cu b/intern/cycles/kernel/kernel.cu
index 636e48b5456..489daacddde 100644
--- a/intern/cycles/kernel/kernel.cu
+++ b/intern/cycles/kernel/kernel.cu
@@ -22,7 +22,7 @@
#include "kernel_globals.h"
#include "kernel_film.h"
#include "kernel_path.h"
-#include "kernel_displace.h"
+#include "kernel_bake.h"
/* device data taken from CUDA occupancy calculator */
@@ -52,8 +52,20 @@
#define CUDA_KERNEL_MAX_REGISTERS 63
#define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63
-/* 5.0 */
-#elif __CUDA_ARCH__ == 500
+/* 3.2 */
+#elif __CUDA_ARCH__ == 320
+#define CUDA_MULTIPRESSOR_MAX_REGISTERS 32768
+#define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16
+#define CUDA_BLOCK_MAX_THREADS 1024
+#define CUDA_THREAD_MAX_REGISTERS 63
+
+/* tunable parameters */
+#define CUDA_THREADS_BLOCK_WIDTH 16
+#define CUDA_KERNEL_MAX_REGISTERS 63
+#define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63
+
+/* 5.0 and 5.2 */
+#elif __CUDA_ARCH__ == 500 || __CUDA_ARCH__ == 520
#define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536
#define CUDA_MULTIPROCESSOR_MAX_BLOCKS 32
#define CUDA_BLOCK_MAX_THREADS 1024
@@ -61,12 +73,12 @@
/* tunable parameters */
#define CUDA_THREADS_BLOCK_WIDTH 16
-#define CUDA_KERNEL_MAX_REGISTERS 63
+#define CUDA_KERNEL_MAX_REGISTERS 40
#define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63
/* unknown architecture */
#else
-#error "Unknown or unuspported CUDA architecture, can't determine launch bounds"
+#error "Unknown or unsupported CUDA architecture, can't determine launch bounds"
#endif
/* compute number of threads per block and minimum blocks per multiprocessor
@@ -146,11 +158,22 @@ kernel_cuda_convert_to_half_float(uchar4 *rgba, float *buffer, float sample_scal
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_shader(uint4 *input, float4 *output, int type, int sx)
+kernel_cuda_shader(uint4 *input, float4 *output, int type, int sx, int sw, int offset, int sample)
+{
+ int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
+
+ if(x < sx + sw)
+ kernel_shader_evaluate(NULL, input, output, (ShaderEvalType)type, x, sample);
+}
+
+extern "C" __global__ void
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_bake(uint4 *input, float4 *output, int type, int sx, int sw, int offset, int sample)
{
int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
- kernel_shader_evaluate(NULL, input, output, (ShaderEvalType)type, x);
+ if(x < sx + sw)
+ kernel_bake_evaluate(NULL, input, output, (ShaderEvalType)type, x, offset, sample);
}
#endif
diff --git a/intern/cycles/kernel/kernel.h b/intern/cycles/kernel/kernel.h
index c4a08646bab..19e06b88797 100644
--- a/intern/cycles/kernel/kernel.h
+++ b/intern/cycles/kernel/kernel.h
@@ -41,7 +41,7 @@ void kernel_cpu_convert_to_byte(KernelGlobals *kg, uchar4 *rgba, float *buffer,
void kernel_cpu_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, float *buffer,
float sample_scale, int x, int y, int offset, int stride);
void kernel_cpu_shader(KernelGlobals *kg, uint4 *input, float4 *output,
- int type, int i);
+ int type, int i, int offset, int sample);
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
void kernel_cpu_sse2_path_trace(KernelGlobals *kg, float *buffer, unsigned int *rng_state,
@@ -51,7 +51,7 @@ void kernel_cpu_sse2_convert_to_byte(KernelGlobals *kg, uchar4 *rgba, float *buf
void kernel_cpu_sse2_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, float *buffer,
float sample_scale, int x, int y, int offset, int stride);
void kernel_cpu_sse2_shader(KernelGlobals *kg, uint4 *input, float4 *output,
- int type, int i);
+ int type, int i, int offset, int sample);
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
@@ -62,7 +62,7 @@ void kernel_cpu_sse3_convert_to_byte(KernelGlobals *kg, uchar4 *rgba, float *buf
void kernel_cpu_sse3_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, float *buffer,
float sample_scale, int x, int y, int offset, int stride);
void kernel_cpu_sse3_shader(KernelGlobals *kg, uint4 *input, float4 *output,
- int type, int i);
+ int type, int i, int offset, int sample);
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
@@ -73,7 +73,7 @@ void kernel_cpu_sse41_convert_to_byte(KernelGlobals *kg, uchar4 *rgba, float *bu
void kernel_cpu_sse41_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, float *buffer,
float sample_scale, int x, int y, int offset, int stride);
void kernel_cpu_sse41_shader(KernelGlobals *kg, uint4 *input, float4 *output,
- int type, int i);
+ int type, int i, int offset, int sample);
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
@@ -84,7 +84,18 @@ void kernel_cpu_avx_convert_to_byte(KernelGlobals *kg, uchar4 *rgba, float *buff
void kernel_cpu_avx_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, float *buffer,
float sample_scale, int x, int y, int offset, int stride);
void kernel_cpu_avx_shader(KernelGlobals *kg, uint4 *input, float4 *output,
- int type, int i);
+ int type, int i, int offset, int sample);
+#endif
+
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
+void kernel_cpu_avx2_path_trace(KernelGlobals *kg, float *buffer, unsigned int *rng_state,
+ int sample, int x, int y, int offset, int stride);
+void kernel_cpu_avx2_convert_to_byte(KernelGlobals *kg, uchar4 *rgba, float *buffer,
+ float sample_scale, int x, int y, int offset, int stride);
+void kernel_cpu_avx2_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, float *buffer,
+ float sample_scale, int x, int y, int offset, int stride);
+void kernel_cpu_avx2_shader(KernelGlobals *kg, uint4 *input, float4 *output,
+ int type, int i, int offset, int sample);
#endif
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_accumulate.h b/intern/cycles/kernel/kernel_accumulate.h
index b4f6dcdace9..b0efcdc66a7 100644
--- a/intern/cycles/kernel/kernel_accumulate.h
+++ b/intern/cycles/kernel/kernel_accumulate.h
@@ -32,10 +32,11 @@ ccl_device_inline void bsdf_eval_init(BsdfEval *eval, ClosureType type, float3 v
eval->transmission = make_float3(0.0f, 0.0f, 0.0f);
eval->transparent = make_float3(0.0f, 0.0f, 0.0f);
eval->subsurface = make_float3(0.0f, 0.0f, 0.0f);
+ eval->scatter = make_float3(0.0f, 0.0f, 0.0f);
if(type == CLOSURE_BSDF_TRANSPARENT_ID)
eval->transparent = value;
- else if(CLOSURE_IS_BSDF_DIFFUSE(type) || CLOSURE_IS_PHASE(type))
+ else if(CLOSURE_IS_BSDF_DIFFUSE(type))
eval->diffuse = value;
else if(CLOSURE_IS_BSDF_GLOSSY(type))
eval->glossy = value;
@@ -43,6 +44,8 @@ ccl_device_inline void bsdf_eval_init(BsdfEval *eval, ClosureType type, float3 v
eval->transmission = value;
else if(CLOSURE_IS_BSDF_BSSRDF(type))
eval->subsurface = value;
+ else if(CLOSURE_IS_PHASE(type))
+ eval->scatter = value;
}
else
eval->diffuse = value;
@@ -51,11 +54,17 @@ ccl_device_inline void bsdf_eval_init(BsdfEval *eval, ClosureType type, float3 v
#endif
}
-ccl_device_inline void bsdf_eval_accum(BsdfEval *eval, ClosureType type, float3 value)
+/* TODO(sergey): This is just a workaround for annoying 6.5 compiler bug. */
+#if !defined(__KERNEL_CUDA__) || __CUDA_ARCH__ < 500
+ccl_device_inline
+#else
+ccl_device_noinline
+#endif
+void bsdf_eval_accum(BsdfEval *eval, ClosureType type, float3 value)
{
#ifdef __PASSES__
if(eval->use_light_pass) {
- if(CLOSURE_IS_BSDF_DIFFUSE(type) || CLOSURE_IS_PHASE(type))
+ if(CLOSURE_IS_BSDF_DIFFUSE(type))
eval->diffuse += value;
else if(CLOSURE_IS_BSDF_GLOSSY(type))
eval->glossy += value;
@@ -63,6 +72,8 @@ ccl_device_inline void bsdf_eval_accum(BsdfEval *eval, ClosureType type, float3
eval->transmission += value;
else if(CLOSURE_IS_BSDF_BSSRDF(type))
eval->subsurface += value;
+ else if(CLOSURE_IS_PHASE(type))
+ eval->scatter += value;
/* skipping transparent, this function is used for eval(), will be zero then */
}
@@ -81,7 +92,8 @@ ccl_device_inline bool bsdf_eval_is_zero(BsdfEval *eval)
&& is_zero(eval->glossy)
&& is_zero(eval->transmission)
&& is_zero(eval->transparent)
- && is_zero(eval->subsurface);
+ && is_zero(eval->subsurface)
+ && is_zero(eval->scatter);
}
else
return is_zero(eval->diffuse);
@@ -98,6 +110,7 @@ ccl_device_inline void bsdf_eval_mul(BsdfEval *eval, float3 value)
eval->glossy *= value;
eval->transmission *= value;
eval->subsurface *= value;
+ eval->scatter *= value;
/* skipping transparent, this function is used for eval(), will be zero then */
}
@@ -111,7 +124,7 @@ ccl_device_inline void bsdf_eval_mul(BsdfEval *eval, float3 value)
/* Path Radiance
*
* We accumulate different render passes separately. After summing at the end
- * to get the combined result, it should be identical. We definte directly
+ * to get the combined result, it should be identical. We define directly
* visible as the first non-transparent hit, while indirectly visible are the
* bounces after that. */
@@ -130,21 +143,25 @@ ccl_device_inline void path_radiance_init(PathRadiance *L, int use_light_pass)
L->color_glossy = make_float3(0.0f, 0.0f, 0.0f);
L->color_transmission = make_float3(0.0f, 0.0f, 0.0f);
L->color_subsurface = make_float3(0.0f, 0.0f, 0.0f);
+ L->color_scatter = make_float3(0.0f, 0.0f, 0.0f);
L->direct_diffuse = make_float3(0.0f, 0.0f, 0.0f);
L->direct_glossy = make_float3(0.0f, 0.0f, 0.0f);
L->direct_transmission = make_float3(0.0f, 0.0f, 0.0f);
L->direct_subsurface = make_float3(0.0f, 0.0f, 0.0f);
+ L->direct_scatter = make_float3(0.0f, 0.0f, 0.0f);
L->indirect_diffuse = make_float3(0.0f, 0.0f, 0.0f);
L->indirect_glossy = make_float3(0.0f, 0.0f, 0.0f);
L->indirect_transmission = make_float3(0.0f, 0.0f, 0.0f);
L->indirect_subsurface = make_float3(0.0f, 0.0f, 0.0f);
+ L->indirect_scatter = make_float3(0.0f, 0.0f, 0.0f);
L->path_diffuse = make_float3(0.0f, 0.0f, 0.0f);
L->path_glossy = make_float3(0.0f, 0.0f, 0.0f);
L->path_transmission = make_float3(0.0f, 0.0f, 0.0f);
L->path_subsurface = make_float3(0.0f, 0.0f, 0.0f);
+ L->path_scatter = make_float3(0.0f, 0.0f, 0.0f);
L->emission = make_float3(0.0f, 0.0f, 0.0f);
L->background = make_float3(0.0f, 0.0f, 0.0f);
@@ -174,14 +191,16 @@ ccl_device_inline void path_radiance_bsdf_bounce(PathRadiance *L, float3 *throug
L->path_glossy = bsdf_eval->glossy*value;
L->path_transmission = bsdf_eval->transmission*value;
L->path_subsurface = bsdf_eval->subsurface*value;
+ L->path_scatter = bsdf_eval->scatter*value;
- *throughput = L->path_diffuse + L->path_glossy + L->path_transmission + L->path_subsurface;
+ *throughput = L->path_diffuse + L->path_glossy + L->path_transmission + L->path_subsurface + L->path_scatter;
L->direct_throughput = *throughput;
}
else {
/* transparent bounce before first hit, or indirectly visible through BSDF */
- float3 sum = (bsdf_eval->diffuse + bsdf_eval->glossy + bsdf_eval->transmission + bsdf_eval->transparent + bsdf_eval->subsurface)*inverse_pdf;
+ float3 sum = (bsdf_eval->diffuse + bsdf_eval->glossy + bsdf_eval->transmission + bsdf_eval->transparent +
+ bsdf_eval->subsurface + bsdf_eval->scatter) * inverse_pdf;
*throughput *= sum;
}
}
@@ -241,6 +260,7 @@ ccl_device_inline void path_radiance_accum_light(PathRadiance *L, float3 through
L->direct_glossy += throughput*bsdf_eval->glossy*shadow;
L->direct_transmission += throughput*bsdf_eval->transmission*shadow;
L->direct_subsurface += throughput*bsdf_eval->subsurface*shadow;
+ L->direct_scatter += throughput*bsdf_eval->scatter*shadow;
if(is_lamp) {
L->shadow.x += shadow.x*shadow_fac;
@@ -250,7 +270,7 @@ ccl_device_inline void path_radiance_accum_light(PathRadiance *L, float3 through
}
else {
/* indirectly visible lighting after BSDF bounce */
- float3 sum = bsdf_eval->diffuse + bsdf_eval->glossy + bsdf_eval->transmission + bsdf_eval->subsurface;
+ float3 sum = bsdf_eval->diffuse + bsdf_eval->glossy + bsdf_eval->transmission + bsdf_eval->subsurface + bsdf_eval->scatter;
L->indirect += throughput*sum*shadow;
}
}
@@ -291,12 +311,14 @@ ccl_device_inline void path_radiance_sum_indirect(PathRadiance *L)
L->direct_glossy += L->path_glossy*L->direct_emission;
L->direct_transmission += L->path_transmission*L->direct_emission;
L->direct_subsurface += L->path_subsurface*L->direct_emission;
+ L->direct_scatter += L->path_scatter*L->direct_emission;
L->indirect = safe_divide_color(L->indirect, L->direct_throughput);
L->indirect_diffuse += L->path_diffuse*L->indirect;
L->indirect_glossy += L->path_glossy*L->indirect;
L->indirect_transmission += L->path_transmission*L->indirect;
L->indirect_subsurface += L->path_subsurface*L->indirect;
+ L->indirect_scatter += L->path_scatter*L->indirect;
}
#endif
}
@@ -309,6 +331,7 @@ ccl_device_inline void path_radiance_reset_indirect(PathRadiance *L)
L->path_glossy = make_float3(0.0f, 0.0f, 0.0f);
L->path_transmission = make_float3(0.0f, 0.0f, 0.0f);
L->path_subsurface = make_float3(0.0f, 0.0f, 0.0f);
+ L->path_scatter = make_float3(0.0f, 0.0f, 0.0f);
L->direct_emission = make_float3(0.0f, 0.0f, 0.0f);
L->indirect = make_float3(0.0f, 0.0f, 0.0f);
@@ -327,8 +350,8 @@ ccl_device_inline float3 path_radiance_clamp_and_sum(KernelGlobals *kg, PathRadi
if(L->use_light_pass) {
path_radiance_sum_indirect(L);
- L_direct = L->direct_diffuse + L->direct_glossy + L->direct_transmission + L->direct_subsurface + L->emission;
- L_indirect = L->indirect_diffuse + L->indirect_glossy + L->indirect_transmission + L->indirect_subsurface;
+ L_direct = L->direct_diffuse + L->direct_glossy + L->direct_transmission + L->direct_subsurface + L->direct_scatter + L->emission;
+ L_indirect = L->indirect_diffuse + L->indirect_glossy + L->indirect_transmission + L->indirect_subsurface + L->indirect_scatter;
if(!kernel_data.background.transparent)
L_direct += L->background;
@@ -344,11 +367,13 @@ ccl_device_inline float3 path_radiance_clamp_and_sum(KernelGlobals *kg, PathRadi
L->direct_glossy = make_float3(0.0f, 0.0f, 0.0f);
L->direct_transmission = make_float3(0.0f, 0.0f, 0.0f);
L->direct_subsurface = make_float3(0.0f, 0.0f, 0.0f);
+ L->direct_scatter = make_float3(0.0f, 0.0f, 0.0f);
L->indirect_diffuse = make_float3(0.0f, 0.0f, 0.0f);
L->indirect_glossy = make_float3(0.0f, 0.0f, 0.0f);
L->indirect_transmission = make_float3(0.0f, 0.0f, 0.0f);
L->indirect_subsurface = make_float3(0.0f, 0.0f, 0.0f);
+ L->indirect_scatter = make_float3(0.0f, 0.0f, 0.0f);
L->emission = make_float3(0.0f, 0.0f, 0.0f);
}
@@ -368,6 +393,7 @@ ccl_device_inline float3 path_radiance_clamp_and_sum(KernelGlobals *kg, PathRadi
L->direct_glossy *= scale;
L->direct_transmission *= scale;
L->direct_subsurface *= scale;
+ L->direct_scatter *= scale;
L->emission *= scale;
L->background *= scale;
}
@@ -382,6 +408,7 @@ ccl_device_inline float3 path_radiance_clamp_and_sum(KernelGlobals *kg, PathRadi
L->indirect_glossy *= scale;
L->indirect_transmission *= scale;
L->indirect_subsurface *= scale;
+ L->indirect_scatter *= scale;
}
/* Sum again, after clamping */
@@ -416,11 +443,13 @@ ccl_device_inline void path_radiance_accum_sample(PathRadiance *L, PathRadiance
L->direct_glossy += L_sample->direct_glossy*fac;
L->direct_transmission += L_sample->direct_transmission*fac;
L->direct_subsurface += L_sample->direct_subsurface*fac;
+ L->direct_scatter += L_sample->direct_scatter*fac;
L->indirect_diffuse += L_sample->indirect_diffuse*fac;
L->indirect_glossy += L_sample->indirect_glossy*fac;
L->indirect_transmission += L_sample->indirect_transmission*fac;
L->indirect_subsurface += L_sample->indirect_subsurface*fac;
+ L->indirect_scatter += L_sample->indirect_scatter*fac;
L->emission += L_sample->emission*fac;
L->background += L_sample->background*fac;
diff --git a/intern/cycles/kernel/kernel_avx.cpp b/intern/cycles/kernel/kernel_avx.cpp
index 354214c406e..e7ff21a6f09 100644
--- a/intern/cycles/kernel/kernel_avx.cpp
+++ b/intern/cycles/kernel/kernel_avx.cpp
@@ -24,6 +24,7 @@
#define __KERNEL_SSE3__
#define __KERNEL_SSSE3__
#define __KERNEL_SSE41__
+#define __KERNEL_AVX__
#endif
#include "util_optimization.h"
@@ -37,7 +38,7 @@
#include "kernel_globals.h"
#include "kernel_film.h"
#include "kernel_path.h"
-#include "kernel_displace.h"
+#include "kernel_bake.h"
CCL_NAMESPACE_BEGIN
@@ -67,9 +68,12 @@ void kernel_cpu_avx_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, float
/* Shader Evaluate */
-void kernel_cpu_avx_shader(KernelGlobals *kg, uint4 *input, float4 *output, int type, int i)
+void kernel_cpu_avx_shader(KernelGlobals *kg, uint4 *input, float4 *output, int type, int i, int offset, int sample)
{
- kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, i);
+ if(type >= SHADER_EVAL_BAKE)
+ kernel_bake_evaluate(kg, input, output, (ShaderEvalType)type, i, offset, sample);
+ else
+ kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, i, sample);
}
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_avx2.cpp b/intern/cycles/kernel/kernel_avx2.cpp
new file mode 100644
index 00000000000..cb1662bbfbe
--- /dev/null
+++ b/intern/cycles/kernel/kernel_avx2.cpp
@@ -0,0 +1,87 @@
+/*
+ * Copyright 2011-2014 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License
+ */
+
+/* Optimized CPU kernel entry points. This file is compiled with AVX2
+ * optimization flags and nearly all functions inlined, while kernel.cpp
+ * is compiled without them for other CPUs. */
+
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+#define __KERNEL_SSE2__
+#define __KERNEL_SSE3__
+#define __KERNEL_SSSE3__
+#define __KERNEL_SSE41__
+#define __KERNEL_AVX__
+#define __KERNEL_AVX2__
+#endif
+
+#include "util_optimization.h"
+
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
+
+#include "kernel.h"
+#include "kernel_compat_cpu.h"
+#include "kernel_math.h"
+#include "kernel_types.h"
+#include "kernel_globals.h"
+#include "kernel_film.h"
+#include "kernel_path.h"
+#include "kernel_bake.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* Path Tracing */
+
+void kernel_cpu_avx2_path_trace(KernelGlobals *kg, float *buffer, unsigned int *rng_state, int sample, int x, int y, int offset, int stride)
+{
+#ifdef __BRANCHED_PATH__
+ if(kernel_data.integrator.branched)
+ kernel_branched_path_trace(kg, buffer, rng_state, sample, x, y, offset, stride);
+ else
+#endif
+ kernel_path_trace(kg, buffer, rng_state, sample, x, y, offset, stride);
+}
+
+/* Film */
+
+void kernel_cpu_avx2_convert_to_byte(KernelGlobals *kg, uchar4 *rgba, float *buffer, float sample_scale, int x, int y, int offset, int stride)
+{
+ kernel_film_convert_to_byte(kg, rgba, buffer, sample_scale, x, y, offset, stride);
+}
+
+void kernel_cpu_avx2_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, float *buffer, float sample_scale, int x, int y, int offset, int stride)
+{
+ kernel_film_convert_to_half_float(kg, rgba, buffer, sample_scale, x, y, offset, stride);
+}
+
+/* Shader Evaluate */
+
+void kernel_cpu_avx2_shader(KernelGlobals *kg, uint4 *input, float4 *output, int type, int i, int offset, int sample)
+{
+ if(type >= SHADER_EVAL_BAKE)
+ kernel_bake_evaluate(kg, input, output, (ShaderEvalType)type, i, offset, sample);
+ else
+ kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, i, sample);
+}
+
+CCL_NAMESPACE_END
+#else
+
+/* needed for some linkers in combination with scons making empty compilation unit in a library */
+void __dummy_function_cycles_avx2(void);
+void __dummy_function_cycles_avx2(void) {}
+
+#endif
diff --git a/intern/cycles/kernel/kernel_displace.h b/intern/cycles/kernel/kernel_bake.h
index b8c64af658f..a1ec080e3d3 100644
--- a/intern/cycles/kernel/kernel_displace.h
+++ b/intern/cycles/kernel/kernel_bake.h
@@ -17,65 +17,125 @@
CCL_NAMESPACE_BEGIN
ccl_device void compute_light_pass(KernelGlobals *kg, ShaderData *sd, PathRadiance *L, RNG rng,
- bool is_combined, bool is_ao, bool is_sss)
+ const bool is_combined, const bool is_ao, const bool is_sss, int sample)
{
- int samples = kernel_data.integrator.aa_samples;
-
/* initialize master radiance accumulator */
kernel_assert(kernel_data.film.use_light_pass);
path_radiance_init(L, kernel_data.film.use_light_pass);
- /* take multiple samples */
- for(int sample = 0; sample < samples; sample++) {
- PathRadiance L_sample;
- PathState state;
- Ray ray;
- float3 throughput = make_float3(1.0f, 1.0f, 1.0f);
+ PathRadiance L_sample;
+ PathState state;
+ Ray ray;
+ float3 throughput = make_float3(1.0f, 1.0f, 1.0f);
+ bool is_sss_sample = is_sss;
- /* init radiance */
- path_radiance_init(&L_sample, kernel_data.film.use_light_pass);
+ /* init radiance */
+ path_radiance_init(&L_sample, kernel_data.film.use_light_pass);
- /* init path state */
- path_state_init(kg, &state, &rng, sample);
- state.num_samples = samples;
+ /* init path state */
+ path_state_init(kg, &state, &rng, sample, NULL);
- /* evaluate surface shader */
- float rbsdf = path_state_rng_1D(kg, &rng, &state, PRNG_BSDF);
- shader_eval_surface(kg, sd, rbsdf, state.flag, SHADER_CONTEXT_MAIN);
+ /* evaluate surface shader */
+ float rbsdf = path_state_rng_1D(kg, &rng, &state, PRNG_BSDF);
+ shader_eval_surface(kg, sd, rbsdf, state.flag, SHADER_CONTEXT_MAIN);
- /* TODO, disable the closures we won't need */
+ /* TODO, disable the closures we won't need */
+
+#ifdef __BRANCHED_PATH__
+ if(!kernel_data.integrator.branched) {
+ /* regular path tracer */
+#endif
/* sample ambient occlusion */
if(is_combined || is_ao) {
kernel_path_ao(kg, sd, &L_sample, &state, &rng, throughput);
}
- /* sample subsurface scattering */
- if((is_combined || is_sss) && (sd->flag & SD_BSSRDF)) {
#ifdef __SUBSURFACE__
+ /* sample subsurface scattering */
+ if((is_combined || is_sss_sample) && (sd->flag & SD_BSSRDF)) {
/* when mixing BSSRDF and BSDF closures we should skip BSDF lighting if scattering was successful */
if (kernel_path_subsurface_scatter(kg, sd, &L_sample, &state, &rng, &ray, &throughput))
- is_sss = true;
-#endif
+ is_sss_sample = true;
}
+#endif
/* sample light and BSDF */
- if((!is_sss) && (!is_ao)) {
- if(kernel_path_integrate_lighting(kg, &rng, sd, &throughput, &state, &L_sample, &ray)) {
+ if((!is_sss_sample) && (!is_ao)) {
+
+ if(sd->flag & SD_EMISSION) {
+ float3 emission = indirect_primitive_emission(kg, sd, 0.0f, state.flag, state.ray_pdf);
+ path_radiance_accum_emission(&L_sample, throughput, emission, state.bounce);
+ }
+
+ kernel_path_surface_connect_light(kg, &rng, sd, throughput, &state, &L_sample);
+
+ if(kernel_path_surface_bounce(kg, &rng, sd, &throughput, &state, &L_sample, &ray)) {
#ifdef __LAMP_MIS__
state.ray_t = 0.0f;
#endif
/* compute indirect light */
- kernel_path_indirect(kg, &rng, ray, throughput, state.num_samples, state, &L_sample);
+ kernel_path_indirect(kg, &rng, ray, throughput, 1, state, &L_sample);
/* sum and reset indirect light pass variables for the next samples */
path_radiance_sum_indirect(&L_sample);
path_radiance_reset_indirect(&L_sample);
}
}
+#ifdef __BRANCHED_PATH__
+ }
+ else {
+ /* branched path tracer */
+
+ /* sample ambient occlusion */
+ if(is_combined || is_ao) {
+ kernel_branched_path_ao(kg, sd, &L_sample, &state, &rng, throughput);
+ }
+
+#ifdef __SUBSURFACE__
+ /* sample subsurface scattering */
+ if((is_combined || is_sss_sample) && (sd->flag & SD_BSSRDF)) {
+ /* when mixing BSSRDF and BSDF closures we should skip BSDF lighting if scattering was successful */
+ kernel_branched_path_subsurface_scatter(kg, sd, &L_sample, &state, &rng, &ray, throughput);
+ }
+#endif
+
+ /* sample light and BSDF */
+ if((!is_sss_sample) && (!is_ao)) {
+
+ if(sd->flag & SD_EMISSION) {
+ float3 emission = indirect_primitive_emission(kg, sd, 0.0f, state.flag, state.ray_pdf);
+ path_radiance_accum_emission(&L_sample, throughput, emission, state.bounce);
+ }
+
+#if defined(__EMISSION__)
+ /* direct light */
+ if(kernel_data.integrator.use_direct_light) {
+ bool all = kernel_data.integrator.sample_all_lights_direct;
+ kernel_branched_path_surface_connect_light(kg, &rng,
+ sd, &state, throughput, 1.0f, &L_sample, all);
+ }
+#endif
+
+ /* indirect light */
+ kernel_branched_path_surface_indirect_light(kg, &rng,
+ sd, throughput, 1.0f, &state, &L_sample);
+ }
+ }
+#endif
+
+ /* accumulate into master L */
+ path_radiance_accum_sample(L, &L_sample, 1);
+}
- /* accumulate into master L */
- path_radiance_accum_sample(L, &L_sample, samples);
+ccl_device bool is_aa_pass(ShaderEvalType type)
+{
+ switch(type) {
+ case SHADER_EVAL_UV:
+ case SHADER_EVAL_NORMAL:
+ return false;
+ default:
+ return true;
}
}
@@ -99,7 +159,21 @@ ccl_device bool is_light_pass(ShaderEvalType type)
}
}
-ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input, ccl_global float4 *output, ShaderEvalType type, int i)
+#if 0
+ccl_device_inline float bake_clamp_mirror_repeat(float u)
+{
+ /* use mirror repeat (like opengl texture) so that if the barycentric
+ * coordinate goes past the end of the triangle it is not always clamped
+ * to the same value, gives ugly patterns */
+ float fu = floorf(u);
+ u = u - fu;
+
+ return (((int)fu) & 1)? 1.0f - u: u;
+}
+#endif
+
+ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input, ccl_global float4 *output,
+ ShaderEvalType type, int i, int offset, int sample)
{
ShaderData sd;
uint4 in = input[i * 2];
@@ -121,10 +195,28 @@ ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input,
float dvdx = __uint_as_float(diff.z);
float dvdy = __uint_as_float(diff.w);
+ int num_samples = kernel_data.integrator.aa_samples;
+
+ /* random number generator */
+ RNG rng = cmj_hash(offset + i, 0);
+
+#if 0
+ uint rng_state = cmj_hash(i, 0);
+ float filter_x, filter_y;
+ path_rng_init(kg, &rng_state, sample, num_samples, &rng, 0, 0, &filter_x, &filter_y);
+
+ /* subpixel u/v offset */
+ if(sample > 0) {
+ u = bake_clamp_mirror_repeat(u + dudx*(filter_x - 0.5f) + dudy*(filter_y - 0.5f));
+ v = bake_clamp_mirror_repeat(v + dvdx*(filter_x - 0.5f) + dvdy*(filter_y - 0.5f));
+ }
+#endif
+
+ /* triangle */
int shader;
float3 P, Ng;
- triangle_point_normal(kg, prim, u, v, &P, &Ng, &shader);
+ triangle_point_normal(kg, object, prim, u, v, &P, &Ng, &shader);
/* dummy initializations copied from SHADER_EVAL_DISPLACE */
float3 I = Ng;
@@ -147,12 +239,14 @@ ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input,
sd.dv.dx = dvdx;
sd.dv.dy = dvdy;
+ /* light passes */
if(is_light_pass(type)) {
- RNG rng = cmj_hash(i, 0);
- compute_light_pass(kg, &sd, &L, rng, (type == SHADER_EVAL_COMBINED),
- (type == SHADER_EVAL_AO),
- (type == SHADER_EVAL_SUBSURFACE_DIRECT ||
- type == SHADER_EVAL_SUBSURFACE_INDIRECT));
+ compute_light_pass(kg, &sd, &L, rng,
+ (type == SHADER_EVAL_COMBINED),
+ (type == SHADER_EVAL_AO),
+ (type == SHADER_EVAL_SUBSURFACE_DIRECT ||
+ type == SHADER_EVAL_SUBSURFACE_INDIRECT),
+ sample);
}
switch (type) {
@@ -307,17 +401,16 @@ ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input,
}
/* write output */
- output[i] = make_float4(out.x, out.y, out.z, 1.0f);
- return;
+ float output_fac = is_aa_pass(type)? 1.0f/num_samples: 1.0f;
+
+ if(sample == 0)
+ output[i] = make_float4(out.x, out.y, out.z, 1.0f) * output_fac;
+ else
+ output[i] += make_float4(out.x, out.y, out.z, 1.0f) * output_fac;
}
-ccl_device void kernel_shader_evaluate(KernelGlobals *kg, ccl_global uint4 *input, ccl_global float4 *output, ShaderEvalType type, int i)
+ccl_device void kernel_shader_evaluate(KernelGlobals *kg, ccl_global uint4 *input, ccl_global float4 *output, ShaderEvalType type, int i, int sample)
{
- if(type >= SHADER_EVAL_BAKE) {
- kernel_bake_evaluate(kg, input, output, type, i);
- return;
- }
-
ShaderData sd;
uint4 in = input[i];
float3 out;
@@ -363,7 +456,10 @@ ccl_device void kernel_shader_evaluate(KernelGlobals *kg, ccl_global uint4 *inpu
}
/* write output */
- output[i] = make_float4(out.x, out.y, out.z, 0.0f);
+ if(sample == 0)
+ output[i] = make_float4(out.x, out.y, out.z, 0.0f);
+ else
+ output[i] += make_float4(out.x, out.y, out.z, 0.0f);
}
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_camera.h b/intern/cycles/kernel/kernel_camera.h
index 7fc66a9fdee..5c83358a56d 100644
--- a/intern/cycles/kernel/kernel_camera.h
+++ b/intern/cycles/kernel/kernel_camera.h
@@ -21,16 +21,22 @@ CCL_NAMESPACE_BEGIN
ccl_device float2 camera_sample_aperture(KernelGlobals *kg, float u, float v)
{
float blades = kernel_data.cam.blades;
+ float2 bokeh;
if(blades == 0.0f) {
/* sample disk */
- return concentric_sample_disk(u, v);
+ bokeh = concentric_sample_disk(u, v);
}
else {
/* sample polygon */
float rotation = kernel_data.cam.bladesrotation;
- return regular_polygon_sample(blades, rotation, u, v);
+ bokeh = regular_polygon_sample(blades, rotation, u, v);
}
+
+ /* anamorphic lens bokeh */
+ bokeh.x *= kernel_data.cam.inv_aperture_ratio;
+
+ return bokeh;
}
ccl_device void camera_sample_perspective(KernelGlobals *kg, float raster_x, float raster_y, float lens_u, float lens_v, Ray *ray)
@@ -183,7 +189,8 @@ ccl_device void camera_sample_panorama(KernelGlobals *kg, float raster_x, float
/* calculate orthonormal coordinates perpendicular to D */
float3 U, V;
- make_orthonormals(D, &U, &V);
+ U = normalize(make_float3(1.0f, 0.0f, 0.0f) - D.x * D);
+ V = normalize(cross(D, U));
/* update ray for effect of lens */
ray->P = U * lensuv.x + V * lensuv.y;
@@ -262,6 +269,20 @@ ccl_device_inline float camera_distance(KernelGlobals *kg, float3 P)
return len(P - camP);
}
+ccl_device_inline float3 camera_direction_from_point(KernelGlobals *kg, float3 P)
+{
+ Transform cameratoworld = kernel_data.cam.cameratoworld;
+
+ if(kernel_data.cam.type == CAMERA_ORTHOGRAPHIC) {
+ float3 camD = make_float3(cameratoworld.x.z, cameratoworld.y.z, cameratoworld.z.z);
+ return -camD;
+ }
+ else {
+ float3 camP = make_float3(cameratoworld.x.w, cameratoworld.y.w, cameratoworld.z.w);
+ return normalize(camP - P);
+ }
+}
+
ccl_device_inline float3 camera_world_to_ndc(KernelGlobals *kg, ShaderData *sd, float3 P)
{
if(kernel_data.cam.type != CAMERA_PANORAMA) {
diff --git a/intern/cycles/kernel/kernel_compat_cpu.h b/intern/cycles/kernel/kernel_compat_cpu.h
index d027bb62ebe..37cba03ff97 100644
--- a/intern/cycles/kernel/kernel_compat_cpu.h
+++ b/intern/cycles/kernel/kernel_compat_cpu.h
@@ -25,6 +25,13 @@
#include "util_half.h"
#include "util_types.h"
+/* On 64-bit Linux the single-precision expf() is really slow compared to the
+ * double-precision exp(), even with the float<->double conversions involved.
+ */
+#if !defined(__KERNEL_GPU__) && defined(__linux__) && defined(__x86_64__)
+# define expf(x) ((float)exp((double)(x)))
+#endif
+
CCL_NAMESPACE_BEGIN
/* Assertions inside the kernel only work for the CPU device, so we wrap it in
@@ -44,16 +51,16 @@ template<typename T> struct texture {
}
#if 0
- ccl_always_inline __m128 fetch_m128(int index)
+ ccl_always_inline ssef fetch_ssef(int index)
{
kernel_assert(index >= 0 && index < width);
- return ((__m128*)data)[index];
+ return ((ssef*)data)[index];
}
- ccl_always_inline __m128i fetch_m128i(int index)
+ ccl_always_inline ssei fetch_ssei(int index)
{
kernel_assert(index >= 0 && index < width);
- return ((__m128i*)data)[index];
+ return ((ssei*)data)[index];
}
#endif
@@ -144,6 +151,13 @@ template<typename T> struct texture_image {
ccl_always_inline float4 interp_3d(float x, float y, float z, bool periodic = false)
{
+ return interp_3d_ex(x, y, z, interpolation, periodic);
+ }
+
+ ccl_always_inline float4 interp_3d_ex(float x, float y, float z,
+ int interpolation = INTERPOLATION_LINEAR,
+ bool periodic = false)
+ {
if(UNLIKELY(!data))
return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
@@ -167,7 +181,7 @@ template<typename T> struct texture_image {
return read(data[ix + iy*width + iz*width*height]);
}
- else {
+ else if(interpolation == INTERPOLATION_LINEAR) {
float tx = frac(x*(float)width - 0.5f, &ix);
float ty = frac(y*(float)height - 0.5f, &iy);
float tz = frac(z*(float)depth - 0.5f, &iz);
@@ -205,6 +219,93 @@ template<typename T> struct texture_image {
return r;
}
+ else {
+ /* Tricubic b-spline interpolation. */
+ const float tx = frac(x*(float)width - 0.5f, &ix);
+ const float ty = frac(y*(float)height - 0.5f, &iy);
+ const float tz = frac(z*(float)depth - 0.5f, &iz);
+ int pix, piy, piz, nnix, nniy, nniz;
+
+ if(periodic) {
+ ix = wrap_periodic(ix, width);
+ iy = wrap_periodic(iy, height);
+ iz = wrap_periodic(iz, depth);
+
+ pix = wrap_periodic(ix-1, width);
+ piy = wrap_periodic(iy-1, height);
+ piz = wrap_periodic(iz-1, depth);
+
+ nix = wrap_periodic(ix+1, width);
+ niy = wrap_periodic(iy+1, height);
+ niz = wrap_periodic(iz+1, depth);
+
+ nnix = wrap_periodic(ix+2, width);
+ nniy = wrap_periodic(iy+2, height);
+ nniz = wrap_periodic(iz+2, depth);
+ }
+ else {
+ ix = wrap_clamp(ix, width);
+ iy = wrap_clamp(iy, height);
+ iz = wrap_clamp(iz, depth);
+
+ pix = wrap_clamp(ix-1, width);
+ piy = wrap_clamp(iy-1, height);
+ piz = wrap_clamp(iz-1, depth);
+
+ nix = wrap_clamp(ix+1, width);
+ niy = wrap_clamp(iy+1, height);
+ niz = wrap_clamp(iz+1, depth);
+
+ nnix = wrap_clamp(ix+2, width);
+ nniy = wrap_clamp(iy+2, height);
+ nniz = wrap_clamp(iz+2, depth);
+ }
+
+ const int xc[4] = {pix, ix, nix, nnix};
+ const int yc[4] = {width * piy,
+ width * iy,
+ width * niy,
+ width * nniy};
+ const int zc[4] = {width * height * piz,
+ width * height * iz,
+ width * height * niz,
+ width * height * nniz};
+ float u[4], v[4], w[4];
+
+ /* Helper macros to keep the code a reasonable size;
+ * the compiler is left to inline all the matrix multiplications.
+ */
+#define SET_SPLINE_WEIGHTS(u, t) \
+ { \
+ u[0] = (((-1.0f/6.0f)* t + 0.5f) * t - 0.5f) * t + (1.0f/6.0f); \
+ u[1] = (( 0.5f * t - 1.0f) * t ) * t + (2.0f/3.0f); \
+ u[2] = (( -0.5f * t + 0.5f) * t + 0.5f) * t + (1.0f/6.0f); \
+ u[3] = (1.0f / 6.0f) * t * t * t; \
+ } (void)0
+#define DATA(x, y, z) (read(data[xc[x] + yc[y] + zc[z]]))
+#define COL_TERM(col, row) \
+ (v[col] * (u[0] * DATA(0, col, row) + \
+ u[1] * DATA(1, col, row) + \
+ u[2] * DATA(2, col, row) + \
+ u[3] * DATA(3, col, row)))
+#define ROW_TERM(row) \
+ (w[row] * (COL_TERM(0, row) + \
+ COL_TERM(1, row) + \
+ COL_TERM(2, row) + \
+ COL_TERM(3, row)))
+
+ SET_SPLINE_WEIGHTS(u, tx);
+ SET_SPLINE_WEIGHTS(v, ty);
+ SET_SPLINE_WEIGHTS(w, tz);
+
+ /* Actual interpolation. */
+ return ROW_TERM(0) + ROW_TERM(1) + ROW_TERM(2) + ROW_TERM(3);
+
+#undef COL_TERM
+#undef ROW_TERM
+#undef DATA
+#undef SET_SPLINE_WEIGHTS
+ }
}
ccl_always_inline void dimensions_set(int width_, int height_, int depth_)
@@ -232,11 +333,12 @@ typedef texture_image<uchar4> texture_image_uchar4;
/* Macros to handle different memory storage on different devices */
#define kernel_tex_fetch(tex, index) (kg->tex.fetch(index))
-#define kernel_tex_fetch_m128(tex, index) (kg->tex.fetch_m128(index))
-#define kernel_tex_fetch_m128i(tex, index) (kg->tex.fetch_m128i(index))
+#define kernel_tex_fetch_ssef(tex, index) (kg->tex.fetch_ssef(index))
+#define kernel_tex_fetch_ssei(tex, index) (kg->tex.fetch_ssei(index))
#define kernel_tex_lookup(tex, t, offset, size) (kg->tex.lookup(t, offset, size))
#define kernel_tex_image_interp(tex, x, y) ((tex < MAX_FLOAT_IMAGES) ? kg->texture_float_images[tex].interp(x, y) : kg->texture_byte_images[tex - MAX_FLOAT_IMAGES].interp(x, y))
#define kernel_tex_image_interp_3d(tex, x, y, z) ((tex < MAX_FLOAT_IMAGES) ? kg->texture_float_images[tex].interp_3d(x, y, z) : kg->texture_byte_images[tex - MAX_FLOAT_IMAGES].interp_3d(x, y, z))
+#define kernel_tex_image_interp_3d_ex(tex, x, y, z, interpolation) ((tex < MAX_FLOAT_IMAGES) ? kg->texture_float_images[tex].interp_3d_ex(x, y, z, interpolation) : kg->texture_byte_images[tex - MAX_FLOAT_IMAGES].interp_3d_ex(x, y, z, interpolation))
#define kernel_data (kg->__data)
diff --git a/intern/cycles/kernel/kernel_compat_cuda.h b/intern/cycles/kernel/kernel_compat_cuda.h
index e4c20d26ff1..f14f3262274 100644
--- a/intern/cycles/kernel/kernel_compat_cuda.h
+++ b/intern/cycles/kernel/kernel_compat_cuda.h
@@ -75,12 +75,11 @@ typedef texture<uchar4, 2, cudaReadModeNormalizedFloat> texture_image_uchar4;
/* Use fast math functions */
-#define cosf(x) __cosf(((float)x))
-#define sinf(x) __sinf(((float)x))
-#define powf(x, y) __powf(((float)x), ((float)y))
-#define tanf(x) __tanf(((float)x))
-#define logf(x) __logf(((float)x))
-#define expf(x) __expf(((float)x))
+#define cosf(x) __cosf(((float)(x)))
+#define sinf(x) __sinf(((float)(x)))
+#define powf(x, y) __powf(((float)(x)), ((float)(y)))
+#define tanf(x) __tanf(((float)(x)))
+#define logf(x) __logf(((float)(x)))
+#define expf(x) __expf(((float)(x)))
#endif /* __KERNEL_COMPAT_CUDA_H__ */
-
diff --git a/intern/cycles/kernel/kernel_compat_opencl.h b/intern/cycles/kernel/kernel_compat_opencl.h
index 8346b09619e..58031a41b78 100644
--- a/intern/cycles/kernel/kernel_compat_opencl.h
+++ b/intern/cycles/kernel/kernel_compat_opencl.h
@@ -24,14 +24,6 @@
#define CCL_NAMESPACE_BEGIN
#define CCL_NAMESPACE_END
-#ifdef __KERNEL_OPENCL_AMD__
-#define __CL_NO_FLOAT3__
-#endif
-
-#ifdef __CL_NO_FLOAT3__
-#define float3 float4
-#endif
-
#ifdef __CL_NOINLINE__
#define ccl_noinline __attribute__((noinline))
#else
@@ -68,51 +60,51 @@
#ifdef make_int4
#undef make_int4
#endif
+#ifdef make_uchar4
+#undef make_uchar4
+#endif
#define make_float2(x, y) ((float2)(x, y))
-#ifdef __CL_NO_FLOAT3__
-#define make_float3(x, y, z) ((float4)(x, y, z, 0.0f))
-#else
#define make_float3(x, y, z) ((float3)(x, y, z))
-#endif
#define make_float4(x, y, z, w) ((float4)(x, y, z, w))
#define make_int2(x, y) ((int2)(x, y))
#define make_int3(x, y, z) ((int3)(x, y, z))
#define make_int4(x, y, z, w) ((int4)(x, y, z, w))
+#define make_uchar4(x, y, z, w) ((uchar4)(x, y, z, w))
/* math functions */
#define __uint_as_float(x) as_float(x)
#define __float_as_uint(x) as_uint(x)
#define __int_as_float(x) as_float(x)
#define __float_as_int(x) as_int(x)
-#define powf(x, y) pow(((float)x), ((float)y))
-#define fabsf(x) fabs(((float)x))
-#define copysignf(x, y) copysign(((float)x), ((float)y))
-#define asinf(x) asin(((float)x))
-#define acosf(x) acos(((float)x))
-#define atanf(x) atan(((float)x))
-#define floorf(x) floor(((float)x))
-#define ceilf(x) ceil(((float)x))
-#define hypotf(x, y) hypot(((float)x), ((float)y))
-#define atan2f(x, y) atan2(((float)x), ((float)y))
-#define fmaxf(x, y) fmax(((float)x), ((float)y))
-#define fminf(x, y) fmin(((float)x), ((float)y))
-#define fmodf(x, y) fmod((float)x, (float)y)
+#define powf(x, y) pow(((float)(x)), ((float)(y)))
+#define fabsf(x) fabs(((float)(x)))
+#define copysignf(x, y) copysign(((float)(x)), ((float)(y)))
+#define asinf(x) asin(((float)(x)))
+#define acosf(x) acos(((float)(x)))
+#define atanf(x) atan(((float)(x)))
+#define floorf(x) floor(((float)(x)))
+#define ceilf(x) ceil(((float)(x)))
+#define hypotf(x, y) hypot(((float)(x)), ((float)(y)))
+#define atan2f(x, y) atan2(((float)(x)), ((float)(y)))
+#define fmaxf(x, y) fmax(((float)(x)), ((float)(y)))
+#define fminf(x, y) fmin(((float)(x)), ((float)(y)))
+#define fmodf(x, y) fmod((float)(x), (float)(y))
#ifndef __CL_USE_NATIVE__
-#define sinf(x) native_sin(((float)x))
-#define cosf(x) native_cos(((float)x))
-#define tanf(x) native_tan(((float)x))
-#define expf(x) native_exp(((float)x))
-#define sqrtf(x) native_sqrt(((float)x))
-#define logf(x) native_log(((float)x))
+#define sinf(x) native_sin(((float)(x)))
+#define cosf(x) native_cos(((float)(x)))
+#define tanf(x) native_tan(((float)(x)))
+#define expf(x) native_exp(((float)(x)))
+#define sqrtf(x) native_sqrt(((float)(x)))
+#define logf(x) native_log(((float)(x)))
#else
-#define sinf(x) sin(((float)x))
-#define cosf(x) cos(((float)x))
-#define tanf(x) tan(((float)x))
-#define expf(x) exp(((float)x))
-#define sqrtf(x) sqrt(((float)x))
-#define logf(x) log(((float)x))
+#define sinf(x) sin(((float)(x)))
+#define cosf(x) cos(((float)(x)))
+#define tanf(x) tan(((float)(x)))
+#define expf(x) exp(((float)(x)))
+#define sqrtf(x) sqrt(((float)(x)))
+#define logf(x) log(((float)(x)))
#endif
/* data lookup defines */
diff --git a/intern/cycles/kernel/kernel_debug.h b/intern/cycles/kernel/kernel_debug.h
new file mode 100644
index 00000000000..bf1bc0e9db8
--- /dev/null
+++ b/intern/cycles/kernel/kernel_debug.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright 2011-2014 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License
+ */
+
+CCL_NAMESPACE_BEGIN
+
+ccl_device_inline void debug_data_init(DebugData *debug_data)
+{
+ debug_data->num_bvh_traversal_steps = 0;
+}
+
+ccl_device_inline void kernel_write_debug_passes(KernelGlobals *kg,
+ ccl_global float *buffer,
+ PathState *state,
+ DebugData *debug_data,
+ int sample)
+{
+ int flag = kernel_data.film.pass_flag;
+ if(flag & PASS_BVH_TRAVERSAL_STEPS) {
+ kernel_write_pass_float(buffer + kernel_data.film.pass_bvh_traversal_steps,
+ sample,
+ debug_data->num_bvh_traversal_steps);
+ }
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_emission.h b/intern/cycles/kernel/kernel_emission.h
index deffa7f2ba2..4b2bb723ab6 100644
--- a/intern/cycles/kernel/kernel_emission.h
+++ b/intern/cycles/kernel/kernel_emission.h
@@ -63,32 +63,18 @@ ccl_device_noinline float3 direct_emissive_eval(KernelGlobals *kg,
return eval;
}
-ccl_device_noinline bool direct_emission(KernelGlobals *kg, ShaderData *sd, int lindex,
- float randt, float randu, float randv, Ray *ray, BsdfEval *eval,
- bool *is_lamp, int bounce, int transparent_bounce)
+ccl_device_noinline bool direct_emission(KernelGlobals *kg, ShaderData *sd,
+ LightSample *ls, Ray *ray, BsdfEval *eval, bool *is_lamp,
+ int bounce, int transparent_bounce)
{
- LightSample ls;
-
-#ifdef __BRANCHED_PATH__
- if(lindex != LAMP_NONE) {
- /* sample position on a specified light */
- light_select(kg, lindex, randu, randv, sd->P, &ls);
- }
- else
-#endif
- {
- /* sample a light and position on int */
- light_sample(kg, randt, randu, randv, sd->time, sd->P, &ls);
- }
-
- if(ls.pdf == 0.0f)
+ if(ls->pdf == 0.0f)
return false;
/* todo: implement */
differential3 dD = differential3_zero();
/* evaluate closure */
- float3 light_eval = direct_emissive_eval(kg, &ls, -ls.D, dD, ls.t, sd->time, bounce, transparent_bounce);
+ float3 light_eval = direct_emissive_eval(kg, ls, -ls->D, dD, ls->t, sd->time, bounce, transparent_bounce);
if(is_zero(light_eval))
return false;
@@ -98,49 +84,51 @@ ccl_device_noinline bool direct_emission(KernelGlobals *kg, ShaderData *sd, int
#ifdef __VOLUME__
if(sd->prim != PRIM_NONE)
- shader_bsdf_eval(kg, sd, ls.D, eval, &bsdf_pdf);
+ shader_bsdf_eval(kg, sd, ls->D, eval, &bsdf_pdf);
else
- shader_volume_phase_eval(kg, sd, ls.D, eval, &bsdf_pdf);
+ shader_volume_phase_eval(kg, sd, ls->D, eval, &bsdf_pdf);
#else
- shader_bsdf_eval(kg, sd, ls.D, eval, &bsdf_pdf);
+ shader_bsdf_eval(kg, sd, ls->D, eval, &bsdf_pdf);
#endif
- if(ls.shader & SHADER_USE_MIS) {
+ if(ls->shader & SHADER_USE_MIS) {
/* multiple importance sampling */
- float mis_weight = power_heuristic(ls.pdf, bsdf_pdf);
+ float mis_weight = power_heuristic(ls->pdf, bsdf_pdf);
light_eval *= mis_weight;
}
- bsdf_eval_mul(eval, light_eval/ls.pdf);
+ bsdf_eval_mul(eval, light_eval/ls->pdf);
#ifdef __PASSES__
/* use visibility flag to skip lights */
- if(ls.shader & SHADER_EXCLUDE_ANY) {
- if(ls.shader & SHADER_EXCLUDE_DIFFUSE)
+ if(ls->shader & SHADER_EXCLUDE_ANY) {
+ if(ls->shader & SHADER_EXCLUDE_DIFFUSE)
eval->diffuse = make_float3(0.0f, 0.0f, 0.0f);
- if(ls.shader & SHADER_EXCLUDE_GLOSSY)
+ if(ls->shader & SHADER_EXCLUDE_GLOSSY)
eval->glossy = make_float3(0.0f, 0.0f, 0.0f);
- if(ls.shader & SHADER_EXCLUDE_TRANSMIT)
+ if(ls->shader & SHADER_EXCLUDE_TRANSMIT)
eval->transmission = make_float3(0.0f, 0.0f, 0.0f);
+ if(ls->shader & SHADER_EXCLUDE_SCATTER)
+ eval->scatter = make_float3(0.0f, 0.0f, 0.0f);
}
#endif
if(bsdf_eval_is_zero(eval))
return false;
- if(ls.shader & SHADER_CAST_SHADOW) {
+ if(ls->shader & SHADER_CAST_SHADOW) {
/* setup ray */
- bool transmit = (dot(sd->Ng, ls.D) < 0.0f);
+ bool transmit = (dot(sd->Ng, ls->D) < 0.0f);
ray->P = ray_offset(sd->P, (transmit)? -sd->Ng: sd->Ng);
- if(ls.t == FLT_MAX) {
+ if(ls->t == FLT_MAX) {
/* distant light */
- ray->D = ls.D;
- ray->t = ls.t;
+ ray->D = ls->D;
+ ray->t = ls->t;
}
else {
/* other lights, avoid self-intersection */
- ray->D = ray_offset(ls.P, ls.Ng) - ray->P;
+ ray->D = ray_offset(ls->P, ls->Ng) - ray->P;
ray->D = normalize_len(ray->D, &ray->t);
}
@@ -153,7 +141,7 @@ ccl_device_noinline bool direct_emission(KernelGlobals *kg, ShaderData *sd, int
}
/* return if it's a lamp for shadow pass */
- *is_lamp = (ls.prim == PRIM_NONE && ls.type != LIGHT_BACKGROUND);
+ *is_lamp = (ls->prim == PRIM_NONE && ls->type != LIGHT_BACKGROUND);
return true;
}
@@ -201,13 +189,25 @@ ccl_device_noinline bool indirect_lamp_emission(KernelGlobals *kg, PathState *st
if(ls.shader & SHADER_EXCLUDE_ANY) {
if(((ls.shader & SHADER_EXCLUDE_DIFFUSE) && (state->flag & PATH_RAY_DIFFUSE)) ||
((ls.shader & SHADER_EXCLUDE_GLOSSY) && (state->flag & PATH_RAY_GLOSSY)) ||
- ((ls.shader & SHADER_EXCLUDE_TRANSMIT) && (state->flag & PATH_RAY_TRANSMIT)))
+ ((ls.shader & SHADER_EXCLUDE_TRANSMIT) && (state->flag & PATH_RAY_TRANSMIT)) ||
+ ((ls.shader & SHADER_EXCLUDE_SCATTER) && (state->flag & PATH_RAY_VOLUME_SCATTER)))
continue;
}
#endif
float3 L = direct_emissive_eval(kg, &ls, -ray->D, ray->dD, ls.t, ray->time, state->bounce, state->transparent_bounce);
+#ifdef __VOLUME__
+ if(state->volume_stack[0].shader != SHADER_NONE) {
+ /* shadow attenuation */
+ Ray volume_ray = *ray;
+ volume_ray.t = ls.t;
+ float3 volume_tp = make_float3(1.0f, 1.0f, 1.0f);
+ kernel_volume_shadow(kg, state, &volume_ray, &volume_tp);
+ L *= volume_tp;
+ }
+#endif
+
if(!(state->flag & PATH_RAY_MIS_SKIP)) {
/* multiple importance sampling, get regular light pdf,
* and compute weight with respect to BSDF pdf */
@@ -234,7 +234,8 @@ ccl_device_noinline float3 indirect_background(KernelGlobals *kg, PathState *sta
if(((shader & SHADER_EXCLUDE_DIFFUSE) && (state->flag & PATH_RAY_DIFFUSE)) ||
((shader & SHADER_EXCLUDE_GLOSSY) && (state->flag & PATH_RAY_GLOSSY)) ||
((shader & SHADER_EXCLUDE_TRANSMIT) && (state->flag & PATH_RAY_TRANSMIT)) ||
- ((shader & SHADER_EXCLUDE_CAMERA) && (state->flag & PATH_RAY_CAMERA)))
+ ((shader & SHADER_EXCLUDE_CAMERA) && (state->flag & PATH_RAY_CAMERA)) ||
+ ((shader & SHADER_EXCLUDE_SCATTER) && (state->flag & PATH_RAY_VOLUME_SCATTER)))
return make_float3(0.0f, 0.0f, 0.0f);
}
diff --git a/intern/cycles/kernel/kernel_jitter.h b/intern/cycles/kernel/kernel_jitter.h
index 7a850844bf2..2a5b7689e57 100644
--- a/intern/cycles/kernel/kernel_jitter.h
+++ b/intern/cycles/kernel/kernel_jitter.h
@@ -14,6 +14,8 @@
* limitations under the License
*/
+/* TODO(sergey): Consider moving portable ctz/clz stuff to util. */
+
CCL_NAMESPACE_BEGIN
/* "Correlated Multi-Jittered Sampling"
@@ -35,8 +37,16 @@ ccl_device_inline int cmj_fast_mod_pow2(int a, int b)
/* a must be > 0 and b must be > 1 */
ccl_device_inline int cmj_fast_div_pow2(int a, int b)
{
-#if defined(__KERNEL_SSE2__) && !defined(_MSC_VER)
+ kernel_assert(a > 0);
+ kernel_assert(b > 1);
+#if defined(__KERNEL_SSE2__)
+# ifdef _MSC_VER
+ unsigned long ctz;
+ _BitScanForward(&ctz, b);
+ return a >> ctz;
+# else
return a >> __builtin_ctz(b);
+# endif
#else
return a/b;
#endif
@@ -44,8 +54,15 @@ ccl_device_inline int cmj_fast_div_pow2(int a, int b)
ccl_device_inline uint cmj_w_mask(uint w)
{
-#if defined(__KERNEL_SSE2__) && !defined(_MSC_VER)
+ kernel_assert(w > 1);
+#if defined(__KERNEL_SSE2__)
+# ifdef _MSC_VER
+ unsigned long leading_zero;
+ _BitScanReverse(&leading_zero, w);
+ return ((1 << (1 + leading_zero)) - 1);
+# else
return ((1 << (32 - __builtin_clz(w))) - 1);
+# endif
#else
w |= w >> 1;
w |= w >> 2;
@@ -165,7 +182,8 @@ ccl_device void cmj_sample_2D(int s, int N, int p, float *fx, float *fy)
smodm = cmj_fast_mod_pow2(s, m);
}
else {
- sdivm = float_to_int(s * invm);
+ /* Doing s*invm gives precision issues here. */
+ sdivm = s / m;
smodm = s - sdivm*m;
}
diff --git a/intern/cycles/kernel/kernel_light.h b/intern/cycles/kernel/kernel_light.h
index ac432d3fe04..b18f67ad524 100644
--- a/intern/cycles/kernel/kernel_light.h
+++ b/intern/cycles/kernel/kernel_light.h
@@ -27,7 +27,7 @@ typedef struct LightSample {
float pdf; /* light sampling probability density function */
float eval_fac; /* intensity multiplier */
int object; /* object id for triangle/curve lights */
- int prim; /* primitive id for triangle/curve ligths */
+ int prim; /* primitive id for triangle/curve lights */
int shader; /* shader id */
int lamp; /* lamp id */
LightType type; /* type of light */
@@ -167,12 +167,137 @@ ccl_device float3 sphere_light_sample(float3 P, float3 center, float radius, flo
return disk_light_sample(normalize(P - center), randu, randv)*radius;
}
-ccl_device float3 area_light_sample(float3 axisu, float3 axisv, float randu, float randv)
+/* Uses the following paper:
+ *
+ * Carlos Urena et al.
+ * An Area-Preserving Parametrization for Spherical Rectangles.
+ *
+ * https://www.solidangle.com/research/egsr2013_spherical_rectangle.pdf
+ */
+ccl_device float3 area_light_sample(float3 P,
+ float3 light_p,
+ float3 axisu, float3 axisv,
+ float randu, float randv,
+ float *pdf)
{
- randu = randu - 0.5f;
- randv = randv - 0.5f;
+ /* In our naming scheme P is the center,
+ * which is called o in the paper.
+ */
+
+ float3 corner = light_p - axisu * 0.5f - axisv * 0.5f;
+ float axisu_len, axisv_len;
+ /* Compute local reference system R. */
+ float3 x = normalize_len(axisu, &axisu_len);
+ float3 y = normalize_len(axisv, &axisv_len);
+ float3 z = cross(x, y);
+ /* Compute rectangle coords in local reference system. */
+ float3 dir = corner - P;
+ float z0 = dot(dir, z);
+ /* Flip 'z' to make it point against Q. */
+ if(z0 > 0.0f) {
+ z *= -1.0f;
+ z0 *= -1.0f;
+ }
+ float z0sq = z0 * z0;
+ float x0 = dot(dir, x);
+ float y0 = dot(dir, y);
+ float x1 = x0 + axisu_len;
+ float y1 = y0 + axisv_len;
+ float y0sq = y0 * y0;
+ float y1sq = y1 * y1;
+ /* Create vectors to four vertices. */
+ float3 v00 = make_float3(x0, y0, z0);
+ float3 v01 = make_float3(x0, y1, z0);
+ float3 v10 = make_float3(x1, y0, z0);
+ float3 v11 = make_float3(x1, y1, z0);
+ /* Compute normals to edges. */
+ float3 n0 = normalize(cross(v00, v10));
+ float3 n1 = normalize(cross(v10, v11));
+ float3 n2 = normalize(cross(v11, v01));
+ float3 n3 = normalize(cross(v01, v00));
+ /* Compute internal angles (gamma_i). */
+ float g0 = acosf(-dot(n0, n1));
+ float g1 = acosf(-dot(n1, n2));
+ float g2 = acosf(-dot(n2, n3));
+ float g3 = acosf(-dot(n3, n0));
+ /* Compute predefined constants. */
+ float b0 = n0.z;
+ float b1 = n2.z;
+ float b0sq = b0 * b0;
+ float k = M_2PI_F - g2 - g3;
+ /* Compute solid angle from internal angles. */
+ float S = g0 + g1 - k;
+
+ /* Compute cu. */
+ float au = randu * S + k;
+ float fu = (cosf(au) * b0 - b1) / sinf(au);
+ float cu = 1.0f / sqrtf(fu * fu + b0sq) * (fu > 0.0f ? 1.0f : -1.0f);
+ cu = clamp(cu, -1.0f, 1.0f);
+ /* Compute xu. */
+ float xu = -(cu * z0) / sqrtf(1.0f - cu * cu);
+ xu = clamp(xu, x0, x1);
+ /* Compute yv. */
+ float d = sqrtf(xu * xu + z0sq);
+ float h0 = y0 / sqrtf(d * d + y0sq);
+ float h1 = y1 / sqrtf(d * d + y1sq);
+ float hv = h0 + randv * (h1 - h0), hv2 = hv * hv;
+ float yv = (hv2 < 1.0f - 1e-6f) ? (hv * d) / sqrtf(1.0f - hv2) : y1;
+
+ *pdf = 1.0f / S;
+
+ /* Transform (xu, yv, z0) to world coords. */
+ return P + xu * x + yv * y + z0 * z;
+}
- return axisu*randu + axisv*randv;
+/* TODO(sergey): This duplicates code from area_light_sample() above, but how
+ * can that be avoided without a nasty function with loads of parameters?
+ */
+ccl_device float area_light_pdf(float3 P,
+ float3 light_p,
+ float3 axisu, float3 axisv)
+{
+ /* In our naming scheme P is the center,
+ * which is called o in the paper.
+ */
+
+ float3 corner = light_p - axisu * 0.5f - axisv * 0.5f;
+ float axisu_len, axisv_len;
+ /* Compute local reference system R. */
+ float3 x = normalize_len(axisu, &axisu_len);
+ float3 y = normalize_len(axisv, &axisv_len);
+ float3 z = cross(x, y);
+ /* Compute rectangle coords in local reference system. */
+ float3 dir = corner - P;
+ float z0 = dot(dir, z);
+ /* Flip 'z' to make it point against Q. */
+ if(z0 > 0.0f) {
+ z *= -1.0f;
+ z0 *= -1.0f;
+ }
+ float x0 = dot(dir, x);
+ float y0 = dot(dir, y);
+ float x1 = x0 + axisu_len;
+ float y1 = y0 + axisv_len;
+ /* Create vectors to four vertices. */
+ float3 v00 = make_float3(x0, y0, z0);
+ float3 v01 = make_float3(x0, y1, z0);
+ float3 v10 = make_float3(x1, y0, z0);
+ float3 v11 = make_float3(x1, y1, z0);
+ /* Compute normals to edges. */
+ float3 n0 = normalize(cross(v00, v10));
+ float3 n1 = normalize(cross(v10, v11));
+ float3 n2 = normalize(cross(v11, v01));
+ float3 n3 = normalize(cross(v01, v00));
+ /* Compute internal angles (gamma_i). */
+ float g0 = acosf(-dot(n0, n1));
+ float g1 = acosf(-dot(n1, n2));
+ float g2 = acosf(-dot(n2, n3));
+ float g3 = acosf(-dot(n3, n0));
+ /* Compute predefined constants. */
+ float k = M_2PI_F - g2 - g3;
+ /* Compute solid angle from internal angles. */
+ float S = g0 + g1 - k;
+ return 1.0f / S;
}
ccl_device float spot_light_attenuation(float4 data1, float4 data2, LightSample *ls)
@@ -276,6 +401,7 @@ ccl_device void lamp_light_sample(KernelGlobals *kg, int lamp,
float4 data2 = kernel_tex_fetch(__light_data, lamp*LIGHT_SIZE + 2);
ls->eval_fac *= spot_light_attenuation(data1, data2, ls);
}
+ ls->pdf *= lamp_light_pdf(kg, ls->Ng, -ls->D, ls->t);
}
else {
/* area light */
@@ -286,18 +412,22 @@ ccl_device void lamp_light_sample(KernelGlobals *kg, int lamp,
float3 axisv = make_float3(data2.y, data2.z, data2.w);
float3 D = make_float3(data3.y, data3.z, data3.w);
- ls->P += area_light_sample(axisu, axisv, randu, randv);
+ ls->P = area_light_sample(P, ls->P,
+ axisu, axisv,
+ randu, randv,
+ &ls->pdf);
+
ls->Ng = D;
ls->D = normalize_len(ls->P - P, &ls->t);
float invarea = data2.x;
-
ls->eval_fac = 0.25f*invarea;
- ls->pdf = invarea;
+
+ if(dot(ls->D, D) > 0.0f)
+ ls->pdf = 0.0f;
}
ls->eval_fac *= kernel_data.integrator.inv_pdf_lights;
- ls->pdf *= lamp_light_pdf(kg, ls->Ng, -ls->D, ls->t);
}
}
@@ -355,8 +485,12 @@ ccl_device bool lamp_light_eval(KernelGlobals *kg, int lamp, float3 P, float3 D,
ls->D = D;
ls->t = FLT_MAX;
+ /* compute pdf */
float invarea = data1.w;
ls->pdf = invarea/(costheta*costheta*costheta);
+ if(ls->t != FLT_MAX)
+ ls->pdf *= lamp_light_pdf(kg, ls->Ng, -ls->D, ls->t);
+
ls->eval_fac = ls->pdf;
}
else if(type == LIGHT_POINT || type == LIGHT_SPOT) {
@@ -386,6 +520,10 @@ ccl_device bool lamp_light_eval(KernelGlobals *kg, int lamp, float3 P, float3 D,
if(ls->eval_fac == 0.0f)
return false;
}
+
+ /* compute pdf */
+ if(ls->t != FLT_MAX)
+ ls->pdf *= lamp_light_pdf(kg, ls->Ng, -ls->D, ls->t);
}
else if(type == LIGHT_AREA) {
/* area light */
@@ -412,16 +550,12 @@ ccl_device bool lamp_light_eval(KernelGlobals *kg, int lamp, float3 P, float3 D,
ls->D = D;
ls->Ng = Ng;
- ls->pdf = invarea;
- ls->eval_fac = 0.25f*ls->pdf;
+ ls->pdf = area_light_pdf(P, ls->P, axisu, axisv);
+ ls->eval_fac = 0.25f*invarea;
}
else
return false;
- /* compute pdf */
- if(ls->t != FLT_MAX)
- ls->pdf *= lamp_light_pdf(kg, ls->Ng, -ls->D, ls->t);
-
return true;
}
@@ -457,7 +591,7 @@ ccl_device void triangle_light_sample(KernelGlobals *kg, int prim, int object,
v = randv*randu;
/* triangle, so get position, normal, shader */
- triangle_point_normal(kg, prim, u, v, &ls->P, &ls->Ng, &ls->shader);
+ triangle_point_normal(kg, object, prim, u, v, &ls->P, &ls->Ng, &ls->shader);
ls->object = object;
ls->prim = prim;
ls->lamp = LAMP_NONE;
@@ -546,11 +680,6 @@ ccl_device int light_select_num_samples(KernelGlobals *kg, int index)
return __float_as_int(data3.x);
}
-ccl_device void light_select(KernelGlobals *kg, int index, float randu, float randv, float3 P, LightSample *ls)
-{
- lamp_light_sample(kg, index, randu, randv, P, ls);
-}
-
ccl_device int lamp_light_eval_sample(KernelGlobals *kg, float randt)
{
/* sample index */
diff --git a/intern/cycles/kernel/kernel_path.h b/intern/cycles/kernel/kernel_path.h
index a80a0033712..c03229f0a3a 100644
--- a/intern/cycles/kernel/kernel_path.h
+++ b/intern/cycles/kernel/kernel_path.h
@@ -29,7 +29,6 @@
#include "kernel_accumulate.h"
#include "kernel_shader.h"
#include "kernel_light.h"
-#include "kernel_emission.h"
#include "kernel_passes.h"
#ifdef __SUBSURFACE__
@@ -42,177 +41,15 @@
#include "kernel_path_state.h"
#include "kernel_shadow.h"
+#include "kernel_emission.h"
+#include "kernel_path_surface.h"
+#include "kernel_path_volume.h"
-CCL_NAMESPACE_BEGIN
-
-#ifdef __VOLUME__
-
-ccl_device_inline bool kernel_path_integrate_scatter_lighting(KernelGlobals *kg, RNG *rng,
- ShaderData *sd, float3 *throughput, PathState *state, PathRadiance *L, Ray *ray,
- float num_samples_adjust)
-{
-#ifdef __EMISSION__
- if(kernel_data.integrator.use_direct_light) {
- /* sample illumination from lights to find path contribution */
- if(sd->flag & SD_BSDF_HAS_EVAL) {
- float light_t = path_state_rng_1D(kg, rng, state, PRNG_LIGHT);
- float light_u, light_v;
- path_state_rng_2D(kg, rng, state, PRNG_LIGHT_U, &light_u, &light_v);
-
- Ray light_ray;
- BsdfEval L_light;
- bool is_lamp;
-
-#ifdef __OBJECT_MOTION__
- light_ray.time = sd->time;
-#endif
-
- if(direct_emission(kg, sd, LAMP_NONE, light_t, light_u, light_v, &light_ray, &L_light, &is_lamp, state->bounce, state->transparent_bounce)) {
- /* trace shadow ray */
- float3 shadow;
-
- if(!shadow_blocked(kg, state, &light_ray, &shadow)) {
- /* accumulate */
- path_radiance_accum_light(L, *throughput * num_samples_adjust, &L_light, shadow, 1.0f, state->bounce, is_lamp);
- }
- }
- }
- }
-#endif
-
- /* sample phase function */
- float phase_pdf;
- BsdfEval phase_eval;
- float3 phase_omega_in;
- differential3 phase_domega_in;
- float phase_u, phase_v;
- path_state_rng_2D(kg, rng, state, PRNG_PHASE_U, &phase_u, &phase_v);
- int label;
-
- label = shader_volume_phase_sample(kg, sd, phase_u, phase_v, &phase_eval,
- &phase_omega_in, &phase_domega_in, &phase_pdf);
-
- if(phase_pdf == 0.0f || bsdf_eval_is_zero(&phase_eval))
- return false;
-
- /* modify throughput */
- path_radiance_bsdf_bounce(L, throughput, &phase_eval, phase_pdf, state->bounce, label);
-
- /* set labels */
- state->ray_pdf = phase_pdf;
-#ifdef __LAMP_MIS__
- state->ray_t = 0.0f;
-#endif
- state->min_ray_pdf = fminf(phase_pdf, state->min_ray_pdf);
-
- /* update path state */
- path_state_next(kg, state, label);
-
- /* setup ray */
- ray->P = sd->P;
- ray->D = phase_omega_in;
- ray->t = FLT_MAX;
-
-#ifdef __RAY_DIFFERENTIALS__
- ray->dP = sd->dP;
- ray->dD = phase_domega_in;
-#endif
-
- return true;
-}
-
+#ifdef __KERNEL_DEBUG__
+#include "kernel_debug.h"
#endif
-#if defined(__BRANCHED_PATH__) || defined(__SUBSURFACE__)
-
-ccl_device void kernel_branched_path_integrate_direct_lighting(KernelGlobals *kg, RNG *rng,
- ShaderData *sd, PathState *state, float3 throughput, float num_samples_adjust, PathRadiance *L, bool sample_all_lights)
-{
- /* sample illumination from lights to find path contribution */
- if(sd->flag & SD_BSDF_HAS_EVAL) {
- Ray light_ray;
- BsdfEval L_light;
- bool is_lamp;
-
-#ifdef __OBJECT_MOTION__
- light_ray.time = sd->time;
-#endif
-
- if(sample_all_lights) {
- /* lamp sampling */
- for(int i = 0; i < kernel_data.integrator.num_all_lights; i++) {
- int num_samples = ceil_to_int(num_samples_adjust*light_select_num_samples(kg, i));
- float num_samples_inv = num_samples_adjust/(num_samples*kernel_data.integrator.num_all_lights);
- RNG lamp_rng = cmj_hash(*rng, i);
-
- if(kernel_data.integrator.pdf_triangles != 0.0f)
- num_samples_inv *= 0.5f;
-
- for(int j = 0; j < num_samples; j++) {
- float light_u, light_v;
- path_branched_rng_2D(kg, &lamp_rng, state, j, num_samples, PRNG_LIGHT_U, &light_u, &light_v);
-
- if(direct_emission(kg, sd, i, 0.0f, light_u, light_v, &light_ray, &L_light, &is_lamp, state->bounce, state->transparent_bounce)) {
- /* trace shadow ray */
- float3 shadow;
-
- if(!shadow_blocked(kg, state, &light_ray, &shadow)) {
- /* accumulate */
- path_radiance_accum_light(L, throughput*num_samples_inv, &L_light, shadow, num_samples_inv, state->bounce, is_lamp);
- }
- }
- }
- }
-
- /* mesh light sampling */
- if(kernel_data.integrator.pdf_triangles != 0.0f) {
- int num_samples = ceil_to_int(num_samples_adjust*kernel_data.integrator.mesh_light_samples);
- float num_samples_inv = num_samples_adjust/num_samples;
-
- if(kernel_data.integrator.num_all_lights)
- num_samples_inv *= 0.5f;
-
- for(int j = 0; j < num_samples; j++) {
- float light_t = path_branched_rng_1D(kg, rng, state, j, num_samples, PRNG_LIGHT);
- float light_u, light_v;
- path_branched_rng_2D(kg, rng, state, j, num_samples, PRNG_LIGHT_U, &light_u, &light_v);
-
- /* only sample triangle lights */
- if(kernel_data.integrator.num_all_lights)
- light_t = 0.5f*light_t;
-
- if(direct_emission(kg, sd, LAMP_NONE, light_t, light_u, light_v, &light_ray, &L_light, &is_lamp, state->bounce, state->transparent_bounce)) {
- /* trace shadow ray */
- float3 shadow;
-
- if(!shadow_blocked(kg, state, &light_ray, &shadow)) {
- /* accumulate */
- path_radiance_accum_light(L, throughput*num_samples_inv, &L_light, shadow, num_samples_inv, state->bounce, is_lamp);
- }
- }
- }
- }
- }
- else {
- float light_t = path_state_rng_1D(kg, rng, state, PRNG_LIGHT);
- float light_u, light_v;
- path_state_rng_2D(kg, rng, state, PRNG_LIGHT_U, &light_u, &light_v);
-
- /* sample random light */
- if(direct_emission(kg, sd, LAMP_NONE, light_t, light_u, light_v, &light_ray, &L_light, &is_lamp, state->bounce, state->transparent_bounce)) {
- /* trace shadow ray */
- float3 shadow;
-
- if(!shadow_blocked(kg, state, &light_ray, &shadow)) {
- /* accumulate */
- path_radiance_accum_light(L, throughput, &L_light, shadow, 1.0f, state->bounce, is_lamp);
- }
- }
- }
- }
-}
-
-#endif
+CCL_NAMESPACE_BEGIN
ccl_device void kernel_path_indirect(KernelGlobals *kg, RNG *rng, Ray ray,
float3 throughput, int num_samples, PathState state, PathRadiance *L)
@@ -222,11 +59,7 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, RNG *rng, Ray ray,
/* intersect scene */
Intersection isect;
uint visibility = path_state_ray_visibility(kg, &state);
-#ifdef __HAIR__
bool hit = scene_intersect(kg, &ray, visibility, &isect, NULL, 0.0f, 0.0f);
-#else
- bool hit = scene_intersect(kg, &ray, visibility, &isect);
-#endif
#ifdef __LAMP_MIS__
if(kernel_data.integrator.use_lamp_mis && !(state.flag & PATH_RAY_CAMERA)) {
@@ -255,15 +88,81 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, RNG *rng, Ray ray,
Ray volume_ray = ray;
volume_ray.t = (hit)? isect.t: FLT_MAX;
- ShaderData volume_sd;
- VolumeIntegrateResult result = kernel_volume_integrate(kg, &state,
- &volume_sd, &volume_ray, L, &throughput, rng);
+ bool heterogeneous = volume_stack_is_heterogeneous(kg, state.volume_stack);
- if(result == VOLUME_PATH_SCATTERED) {
- if(kernel_path_integrate_scatter_lighting(kg, rng, &volume_sd, &throughput, &state, L, &ray, 1.0f))
- continue;
- else
- break;
+#ifdef __VOLUME_DECOUPLED__
+ int sampling_method = volume_stack_sampling_method(kg, state.volume_stack);
+ bool decoupled = kernel_volume_use_decoupled(kg, heterogeneous, false, sampling_method);
+
+ if(decoupled) {
+ /* cache steps along volume for repeated sampling */
+ VolumeSegment volume_segment;
+ ShaderData volume_sd;
+
+ shader_setup_from_volume(kg, &volume_sd, &volume_ray, state.bounce, state.transparent_bounce);
+ kernel_volume_decoupled_record(kg, &state,
+ &volume_ray, &volume_sd, &volume_segment, heterogeneous);
+
+ volume_segment.sampling_method = sampling_method;
+
+ /* emission */
+ if(volume_segment.closure_flag & SD_EMISSION)
+ path_radiance_accum_emission(L, throughput, volume_segment.accum_emission, state.bounce);
+
+ /* scattering */
+ VolumeIntegrateResult result = VOLUME_PATH_ATTENUATED;
+
+ if(volume_segment.closure_flag & SD_SCATTER) {
+ bool all = kernel_data.integrator.sample_all_lights_indirect;
+
+ /* direct light sampling */
+ kernel_branched_path_volume_connect_light(kg, rng, &volume_sd,
+ throughput, &state, L, 1.0f, all, &volume_ray, &volume_segment);
+
+ /* indirect sample. if we use distance sampling and take just
+ * one sample for direct and indirect light, we could share
+ * this computation, but makes code a bit complex */
+ float rphase = path_state_rng_1D_for_decision(kg, rng, &state, PRNG_PHASE);
+ float rscatter = path_state_rng_1D_for_decision(kg, rng, &state, PRNG_SCATTER_DISTANCE);
+
+ result = kernel_volume_decoupled_scatter(kg,
+ &state, &volume_ray, &volume_sd, &throughput,
+ rphase, rscatter, &volume_segment, NULL, true);
+ }
+
+ if(result != VOLUME_PATH_SCATTERED)
+ throughput *= volume_segment.accum_transmittance;
+
+ /* free cached steps */
+ kernel_volume_decoupled_free(kg, &volume_segment);
+
+ if(result == VOLUME_PATH_SCATTERED) {
+ if(kernel_path_volume_bounce(kg, rng, &volume_sd, &throughput, &state, L, &ray))
+ continue;
+ else
+ break;
+ }
+ }
+ else
+#endif
+ {
+ /* integrate along volume segment with distance sampling */
+ ShaderData volume_sd;
+ VolumeIntegrateResult result = kernel_volume_integrate(
+ kg, &state, &volume_sd, &volume_ray, L, &throughput, rng, heterogeneous);
+
+#ifdef __VOLUME_SCATTER__
+ if(result == VOLUME_PATH_SCATTERED) {
+ /* direct lighting */
+ kernel_path_volume_connect_light(kg, rng, &volume_sd, throughput, &state, L);
+
+ /* indirect light bounce */
+ if(kernel_path_volume_bounce(kg, rng, &volume_sd, &throughput, &state, L, &ray))
+ continue;
+ else
+ break;
+ }
+#endif
}
}
#endif
@@ -281,7 +180,7 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, RNG *rng, Ray ray,
/* setup shading */
ShaderData sd;
shader_setup_from_ray(kg, &sd, &isect, &ray, state.bounce, state.transparent_bounce);
- float rbsdf = path_state_rng_1D(kg, rng, &state, PRNG_BSDF);
+ float rbsdf = path_state_rng_1D_for_decision(kg, rng, &state, PRNG_BSDF);
shader_eval_surface(kg, &sd, rbsdf, state.flag, SHADER_CONTEXT_INDIRECT);
#ifdef __BRANCHED_PATH__
shader_merge_closures(&sd);
@@ -315,7 +214,7 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, RNG *rng, Ray ray,
break;
}
else if(probability != 1.0f) {
- float terminate = path_state_rng_1D(kg, rng, &state, PRNG_TERMINATE);
+ float terminate = path_state_rng_1D_for_decision(kg, rng, &state, PRNG_TERMINATE);
if(terminate >= probability)
break;
@@ -383,187 +282,12 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, RNG *rng, Ray ray,
#if defined(__EMISSION__) && defined(__BRANCHED_PATH__)
if(kernel_data.integrator.use_direct_light) {
bool all = kernel_data.integrator.sample_all_lights_indirect;
- kernel_branched_path_integrate_direct_lighting(kg, rng, &sd, &state, throughput, 1.0f, L, all);
- }
-#endif
-
- /* no BSDF? we can stop here */
- if(sd.flag & SD_BSDF) {
- /* sample BSDF */
- float bsdf_pdf;
- BsdfEval bsdf_eval;
- float3 bsdf_omega_in;
- differential3 bsdf_domega_in;
- float bsdf_u, bsdf_v;
- path_state_rng_2D(kg, rng, &state, PRNG_BSDF_U, &bsdf_u, &bsdf_v);
- int label;
-
- label = shader_bsdf_sample(kg, &sd, bsdf_u, bsdf_v, &bsdf_eval,
- &bsdf_omega_in, &bsdf_domega_in, &bsdf_pdf);
-
- if(bsdf_pdf == 0.0f || bsdf_eval_is_zero(&bsdf_eval))
- break;
-
- /* modify throughput */
- path_radiance_bsdf_bounce(L, &throughput, &bsdf_eval, bsdf_pdf, state.bounce, label);
-
- /* set labels */
- if(!(label & LABEL_TRANSPARENT)) {
- state.ray_pdf = bsdf_pdf;
-#ifdef __LAMP_MIS__
- state.ray_t = 0.0f;
-#endif
- state.min_ray_pdf = fminf(bsdf_pdf, state.min_ray_pdf);
- }
-
- /* update path state */
- path_state_next(kg, &state, label);
-
- /* setup ray */
- ray.P = ray_offset(sd.P, (label & LABEL_TRANSMIT)? -sd.Ng: sd.Ng);
- ray.D = bsdf_omega_in;
- ray.t = FLT_MAX;
-#ifdef __RAY_DIFFERENTIALS__
- ray.dP = sd.dP;
- ray.dD = bsdf_domega_in;
-#endif
-
-#ifdef __VOLUME__
- /* enter/exit volume */
- if(label & LABEL_TRANSMIT)
- kernel_volume_stack_enter_exit(kg, &sd, state.volume_stack);
-#endif
+ kernel_branched_path_surface_connect_light(kg, rng, &sd, &state, throughput, 1.0f, L, all);
}
-#ifdef __VOLUME__
- else if(sd.flag & SD_HAS_ONLY_VOLUME) {
- /* no surface shader but have a volume shader? act transparent */
-
- /* update path state, count as transparent */
- path_state_next(kg, &state, LABEL_TRANSPARENT);
-
- /* setup ray position, direction stays unchanged */
- ray.P = ray_offset(sd.P, -sd.Ng);
-#ifdef __RAY_DIFFERENTIALS__
- ray.dP = sd.dP;
#endif
- /* enter/exit volume */
- kernel_volume_stack_enter_exit(kg, &sd, state.volume_stack);
- }
-#endif
- else {
- /* no bsdf or volume? we're done */
+ if(!kernel_path_surface_bounce(kg, rng, &sd, &throughput, &state, L, &ray))
break;
- }
- }
-}
-
-ccl_device_inline bool kernel_path_integrate_lighting(KernelGlobals *kg, RNG *rng,
- ShaderData *sd, float3 *throughput, PathState *state, PathRadiance *L, Ray *ray)
-{
-#ifdef __EMISSION__
- if(kernel_data.integrator.use_direct_light) {
- /* sample illumination from lights to find path contribution */
- if(sd->flag & SD_BSDF_HAS_EVAL) {
- float light_t = path_state_rng_1D(kg, rng, state, PRNG_LIGHT);
- float light_u, light_v;
- path_state_rng_2D(kg, rng, state, PRNG_LIGHT_U, &light_u, &light_v);
-
- Ray light_ray;
- BsdfEval L_light;
- bool is_lamp;
-
-#ifdef __OBJECT_MOTION__
- light_ray.time = sd->time;
-#endif
-
- if(direct_emission(kg, sd, LAMP_NONE, light_t, light_u, light_v, &light_ray, &L_light, &is_lamp, state->bounce, state->transparent_bounce)) {
- /* trace shadow ray */
- float3 shadow;
-
- if(!shadow_blocked(kg, state, &light_ray, &shadow)) {
- /* accumulate */
- path_radiance_accum_light(L, *throughput, &L_light, shadow, 1.0f, state->bounce, is_lamp);
- }
- }
- }
- }
-#endif
-
- /* no BSDF? we can stop here */
- if(sd->flag & SD_BSDF) {
- /* sample BSDF */
- float bsdf_pdf;
- BsdfEval bsdf_eval;
- float3 bsdf_omega_in;
- differential3 bsdf_domega_in;
- float bsdf_u, bsdf_v;
- path_state_rng_2D(kg, rng, state, PRNG_BSDF_U, &bsdf_u, &bsdf_v);
- int label;
-
- label = shader_bsdf_sample(kg, sd, bsdf_u, bsdf_v, &bsdf_eval,
- &bsdf_omega_in, &bsdf_domega_in, &bsdf_pdf);
-
- if(bsdf_pdf == 0.0f || bsdf_eval_is_zero(&bsdf_eval))
- return false;
-
- /* modify throughput */
- path_radiance_bsdf_bounce(L, throughput, &bsdf_eval, bsdf_pdf, state->bounce, label);
-
- /* set labels */
- if(!(label & LABEL_TRANSPARENT)) {
- state->ray_pdf = bsdf_pdf;
-#ifdef __LAMP_MIS__
- state->ray_t = 0.0f;
-#endif
- state->min_ray_pdf = fminf(bsdf_pdf, state->min_ray_pdf);
- }
-
- /* update path state */
- path_state_next(kg, state, label);
-
- /* setup ray */
- ray->P = ray_offset(sd->P, (label & LABEL_TRANSMIT)? -sd->Ng: sd->Ng);
- ray->D = bsdf_omega_in;
-
- if(state->bounce == 0)
- ray->t -= sd->ray_length; /* clipping works through transparent */
- else
- ray->t = FLT_MAX;
-
-#ifdef __RAY_DIFFERENTIALS__
- ray->dP = sd->dP;
- ray->dD = bsdf_domega_in;
-#endif
-
-#ifdef __VOLUME__
- /* enter/exit volume */
- if(label & LABEL_TRANSMIT)
- kernel_volume_stack_enter_exit(kg, sd, state->volume_stack);
-#endif
- return true;
- }
-#ifdef __VOLUME__
- else if(sd->flag & SD_HAS_ONLY_VOLUME) {
- /* no surface shader but have a volume shader? act transparent */
-
- /* update path state, count as transparent */
- path_state_next(kg, state, LABEL_TRANSPARENT);
-
- /* setup ray position, direction stays unchanged */
- ray->P = ray_offset(sd->P, -sd->Ng);
-#ifdef __RAY_DIFFERENTIALS__
- ray->dP = sd->dP;
-#endif
-
- /* enter/exit volume */
- kernel_volume_stack_enter_exit(kg, sd, state->volume_stack);
- return true;
- }
-#endif
- else {
- /* no bsdf or volume? */
- return false;
}
}
@@ -601,7 +325,68 @@ ccl_device void kernel_path_ao(KernelGlobals *kg, ShaderData *sd, PathRadiance *
}
}
+ccl_device void kernel_branched_path_ao(KernelGlobals *kg, ShaderData *sd, PathRadiance *L, PathState *state, RNG *rng, float3 throughput)
+{ /* ambient occlusion for branched path tracing: traces up to ao_samples shadow rays from sd->P and accumulates the unblocked ones into L */
+ int num_samples = kernel_data.integrator.ao_samples;
+ float num_samples_inv = 1.0f/num_samples; /* weight applied to the throughput of each accumulated sample */
+ float ao_factor = kernel_data.background.ao_factor;
+ float3 ao_N; /* output of shader_bsdf_ao, used below as the axis for direction sampling */
+ float3 ao_bsdf = shader_bsdf_ao(kg, sd, ao_factor, &ao_N);
+ float3 ao_alpha = shader_bsdf_alpha(kg, sd);
+
+ for(int j = 0; j < num_samples; j++) {
+ float bsdf_u, bsdf_v;
+ path_branched_rng_2D(kg, rng, state, j, num_samples, PRNG_BSDF_U, &bsdf_u, &bsdf_v); /* random pair for sample j of num_samples */
+
+ float3 ao_D;
+ float ao_pdf;
+
+ sample_cos_hemisphere(ao_N, bsdf_u, bsdf_v, &ao_D, &ao_pdf); /* writes the sampled direction ao_D and its pdf */
+
+ if(dot(sd->Ng, ao_D) > 0.0f && ao_pdf != 0.0f) { /* skip directions not on the sd->Ng side and zero-pdf samples */
+ Ray light_ray;
+ float3 ao_shadow;
+
+ light_ray.P = ray_offset(sd->P, sd->Ng);
+ light_ray.D = ao_D;
+ light_ray.t = kernel_data.background.ao_distance; /* ray length is the configured AO distance */
+#ifdef __OBJECT_MOTION__
+ light_ray.time = sd->time;
+#endif
+ light_ray.dP = sd->dP;
+ light_ray.dD = differential3_zero();
+
+ if(!shadow_blocked(kg, state, &light_ray, &ao_shadow)) /* accumulate only when the ray is not blocked */
+ path_radiance_accum_ao(L, throughput*num_samples_inv, ao_alpha, ao_bsdf, ao_shadow, state->bounce);
+ }
+ }
+}
+
#ifdef __SUBSURFACE__
+
+#ifdef __VOLUME__
+ccl_device void kernel_path_subsurface_update_volume_stack(KernelGlobals *kg,
+ Ray *ray,
+ VolumeStack *stack)
+{ /* calls kernel_volume_stack_enter_exit on stack for every hit that scene_intersect_volume reports along ray */
+ kernel_assert(kernel_data.integrator.use_volumes); /* callers are expected to check use_volumes before calling */
+
+ Ray volume_ray = *ray; /* work on a copy, the caller's ray is not modified */
+ Intersection isect;
+
+ while(scene_intersect_volume(kg, &volume_ray, &isect))
+ {
+ ShaderData sd;
+ shader_setup_from_ray(kg, &sd, &isect, &volume_ray, 0, 0);
+ kernel_volume_stack_enter_exit(kg, &sd, stack);
+
+ /* Move ray forward: restart from the hit point offset along -Ng and shorten the remaining length by sd.ray_length. */
+ volume_ray.P = ray_offset(sd.P, -sd.Ng);
+ volume_ray.t -= sd.ray_length;
+ }
+}
+#endif
+
ccl_device bool kernel_path_subsurface_scatter(KernelGlobals *kg, ShaderData *sd, PathRadiance *L, PathState *state, RNG *rng, Ray *ray, float3 *throughput)
{
float bssrdf_probability;
@@ -618,6 +403,11 @@ ccl_device bool kernel_path_subsurface_scatter(KernelGlobals *kg, ShaderData *sd
float bssrdf_u, bssrdf_v;
path_state_rng_2D(kg, rng, state, PRNG_BSDF_U, &bssrdf_u, &bssrdf_v);
int num_hits = subsurface_scatter_multi_step(kg, sd, bssrdf_sd, state->flag, sc, &lcg_state, bssrdf_u, bssrdf_v, false);
+#ifdef __VOLUME__
+ Ray volume_ray = *ray;
+ bool need_update_volume_stack = kernel_data.integrator.use_volumes &&
+ sd->flag & SD_OBJECT_INTERSECTS_VOLUME;
+#endif
/* compute lighting with the BSDF closure */
for(int hit = 0; hit < num_hits; hit++) {
@@ -627,12 +417,30 @@ ccl_device bool kernel_path_subsurface_scatter(KernelGlobals *kg, ShaderData *sd
hit_state.flag |= PATH_RAY_BSSRDF_ANCESTOR;
hit_state.rng_offset += PRNG_BOUNCE_NUM;
+
+ kernel_path_surface_connect_light(kg, rng, &bssrdf_sd[hit], tp, state, L);
- if(kernel_path_integrate_lighting(kg, rng, &bssrdf_sd[hit], &tp, &hit_state, L, &hit_ray)) {
+ if(kernel_path_surface_bounce(kg, rng, &bssrdf_sd[hit], &tp, &hit_state, L, &hit_ray)) {
#ifdef __LAMP_MIS__
hit_state.ray_t = 0.0f;
#endif
+#ifdef __VOLUME__
+ if(need_update_volume_stack) {
+ /* Setup ray from previous surface point to the new one. */
+ volume_ray.D = normalize_len(hit_ray.P - volume_ray.P,
+ &volume_ray.t);
+
+ kernel_path_subsurface_update_volume_stack(
+ kg,
+ &volume_ray,
+ hit_state.volume_stack);
+
+ /* Move volume ray forward. */
+ volume_ray.P = hit_ray.P;
+ }
+#endif
+
kernel_path_indirect(kg, rng, hit_ray, tp, state->num_samples, hit_state, L);
/* for render passes, sum and reset indirect light pass variables
@@ -657,7 +465,12 @@ ccl_device float4 kernel_path_integrate(KernelGlobals *kg, RNG *rng, int sample,
path_radiance_init(&L, kernel_data.film.use_light_pass);
PathState state;
- path_state_init(kg, &state, rng, sample);
+ path_state_init(kg, &state, rng, sample, &ray);
+
+#ifdef __KERNEL_DEBUG__
+ DebugData debug_data;
+ debug_data_init(&debug_data);
+#endif
/* path iteration */
for(;;) {
@@ -682,7 +495,13 @@ ccl_device float4 kernel_path_integrate(KernelGlobals *kg, RNG *rng, int sample,
bool hit = scene_intersect(kg, &ray, visibility, &isect, &lcg_state, difl, extmax);
#else
- bool hit = scene_intersect(kg, &ray, visibility, &isect);
+ bool hit = scene_intersect(kg, &ray, visibility, &isect, NULL, 0.0f, 0.0f);
+#endif
+
+#ifdef __KERNEL_DEBUG__
+ if(state.flag & PATH_RAY_CAMERA) {
+ debug_data.num_bvh_traversal_steps += isect.num_traversal_steps;
+ }
#endif
#ifdef __LAMP_MIS__
@@ -712,15 +531,81 @@ ccl_device float4 kernel_path_integrate(KernelGlobals *kg, RNG *rng, int sample,
Ray volume_ray = ray;
volume_ray.t = (hit)? isect.t: FLT_MAX;
- ShaderData volume_sd;
- VolumeIntegrateResult result = kernel_volume_integrate(kg, &state,
- &volume_sd, &volume_ray, &L, &throughput, rng);
+ bool heterogeneous = volume_stack_is_heterogeneous(kg, state.volume_stack);
- if(result == VOLUME_PATH_SCATTERED) {
- if(kernel_path_integrate_scatter_lighting(kg, rng, &volume_sd, &throughput, &state, &L, &ray, 1.0f))
- continue;
- else
- break;
+#ifdef __VOLUME_DECOUPLED__
+ int sampling_method = volume_stack_sampling_method(kg, state.volume_stack);
+ bool decoupled = kernel_volume_use_decoupled(kg, heterogeneous, true, sampling_method);
+
+ if(decoupled) {
+ /* cache steps along volume for repeated sampling */
+ VolumeSegment volume_segment;
+ ShaderData volume_sd;
+
+ shader_setup_from_volume(kg, &volume_sd, &volume_ray, state.bounce, state.transparent_bounce);
+ kernel_volume_decoupled_record(kg, &state,
+ &volume_ray, &volume_sd, &volume_segment, heterogeneous);
+
+ volume_segment.sampling_method = sampling_method;
+
+ /* emission */
+ if(volume_segment.closure_flag & SD_EMISSION)
+ path_radiance_accum_emission(&L, throughput, volume_segment.accum_emission, state.bounce);
+
+ /* scattering */
+ VolumeIntegrateResult result = VOLUME_PATH_ATTENUATED;
+
+ if(volume_segment.closure_flag & SD_SCATTER) {
+ bool all = false;
+
+ /* direct light sampling */
+ kernel_branched_path_volume_connect_light(kg, rng, &volume_sd,
+ throughput, &state, &L, 1.0f, all, &volume_ray, &volume_segment);
+
+ /* indirect sample. if we use distance sampling and take just
+ * one sample for direct and indirect light, we could share
+ * this computation, but makes code a bit complex */
+ float rphase = path_state_rng_1D_for_decision(kg, rng, &state, PRNG_PHASE);
+ float rscatter = path_state_rng_1D_for_decision(kg, rng, &state, PRNG_SCATTER_DISTANCE);
+
+ result = kernel_volume_decoupled_scatter(kg,
+ &state, &volume_ray, &volume_sd, &throughput,
+ rphase, rscatter, &volume_segment, NULL, true);
+ }
+
+ if(result != VOLUME_PATH_SCATTERED)
+ throughput *= volume_segment.accum_transmittance;
+
+ /* free cached steps */
+ kernel_volume_decoupled_free(kg, &volume_segment);
+
+ if(result == VOLUME_PATH_SCATTERED) {
+ if(kernel_path_volume_bounce(kg, rng, &volume_sd, &throughput, &state, &L, &ray))
+ continue;
+ else
+ break;
+ }
+ }
+ else
+#endif
+ {
+ /* integrate along volume segment with distance sampling */
+ ShaderData volume_sd;
+ VolumeIntegrateResult result = kernel_volume_integrate(
+ kg, &state, &volume_sd, &volume_ray, &L, &throughput, rng, heterogeneous);
+
+#ifdef __VOLUME_SCATTER__
+ if(result == VOLUME_PATH_SCATTERED) {
+ /* direct lighting */
+ kernel_path_volume_connect_light(kg, rng, &volume_sd, throughput, &state, &L);
+
+ /* indirect light bounce */
+ if(kernel_path_volume_bounce(kg, rng, &volume_sd, &throughput, &state, &L, &ray))
+ continue;
+ else
+ break;
+ }
+#endif
}
}
#endif
@@ -748,7 +633,7 @@ ccl_device float4 kernel_path_integrate(KernelGlobals *kg, RNG *rng, int sample,
/* setup shading */
ShaderData sd;
shader_setup_from_ray(kg, &sd, &isect, &ray, state.bounce, state.transparent_bounce);
- float rbsdf = path_state_rng_1D(kg, rng, &state, PRNG_BSDF);
+ float rbsdf = path_state_rng_1D_for_decision(kg, rng, &state, PRNG_BSDF);
shader_eval_surface(kg, &sd, rbsdf, state.flag, SHADER_CONTEXT_MAIN);
/* holdout */
@@ -803,7 +688,7 @@ ccl_device float4 kernel_path_integrate(KernelGlobals *kg, RNG *rng, int sample,
break;
}
else if(probability != 1.0f) {
- float terminate = path_state_rng_1D(kg, rng, &state, PRNG_TERMINATE);
+ float terminate = path_state_rng_1D_for_decision(kg, rng, &state, PRNG_TERMINATE);
if(terminate >= probability)
break;
@@ -826,134 +711,33 @@ ccl_device float4 kernel_path_integrate(KernelGlobals *kg, RNG *rng, int sample,
break;
}
#endif
-
- /* Same as kernel_path_integrate_lighting(kg, rng, &sd, &throughput, &state, &L, &ray),
- but for CUDA the function call is slower. */
-#ifdef __EMISSION__
- if(kernel_data.integrator.use_direct_light) {
- /* sample illumination from lights to find path contribution */
- if(sd.flag & SD_BSDF_HAS_EVAL) {
- float light_t = path_state_rng_1D(kg, rng, &state, PRNG_LIGHT);
- float light_u, light_v;
- path_state_rng_2D(kg, rng, &state, PRNG_LIGHT_U, &light_u, &light_v);
-
- Ray light_ray;
- BsdfEval L_light;
- bool is_lamp;
-
-#ifdef __OBJECT_MOTION__
- light_ray.time = sd.time;
-#endif
-
- if(direct_emission(kg, &sd, LAMP_NONE, light_t, light_u, light_v, &light_ray, &L_light, &is_lamp, state.bounce, state.transparent_bounce)) {
- /* trace shadow ray */
- float3 shadow;
-
- if(!shadow_blocked(kg, &state, &light_ray, &shadow)) {
- /* accumulate */
- path_radiance_accum_light(&L, throughput, &L_light, shadow, 1.0f, state.bounce, is_lamp);
- }
- }
- }
- }
-#endif
-
- if(sd.flag & SD_BSDF) {
- /* sample BSDF */
- float bsdf_pdf;
- BsdfEval bsdf_eval;
- float3 bsdf_omega_in;
- differential3 bsdf_domega_in;
- float bsdf_u, bsdf_v;
- path_state_rng_2D(kg, rng, &state, PRNG_BSDF_U, &bsdf_u, &bsdf_v);
- int label;
-
- label = shader_bsdf_sample(kg, &sd, bsdf_u, bsdf_v, &bsdf_eval,
- &bsdf_omega_in, &bsdf_domega_in, &bsdf_pdf);
-
- if(bsdf_pdf == 0.0f || bsdf_eval_is_zero(&bsdf_eval))
- break;
-
- /* modify throughput */
- path_radiance_bsdf_bounce(&L, &throughput, &bsdf_eval, bsdf_pdf, state.bounce, label);
-
- /* set labels */
- if(!(label & LABEL_TRANSPARENT)) {
- state.ray_pdf = bsdf_pdf;
-#ifdef __LAMP_MIS__
- state.ray_t = 0.0f;
-#endif
- state.min_ray_pdf = fminf(bsdf_pdf, state.min_ray_pdf);
- }
-
- /* update path state */
- path_state_next(kg, &state, label);
-
- /* setup ray */
- ray.P = ray_offset(sd.P, (label & LABEL_TRANSMIT)? -sd.Ng: sd.Ng);
- ray.D = bsdf_omega_in;
-
-#ifdef __RAY_DIFFERENTIALS__
- ray.dP = sd.dP;
- ray.dD = bsdf_domega_in;
-#endif
-
-#ifdef __VOLUME__
- /* enter/exit volume */
- if(label & LABEL_TRANSMIT)
- kernel_volume_stack_enter_exit(kg, &sd, state.volume_stack);
-#endif
- }
-#ifdef __VOLUME__
- else if(sd.flag & SD_HAS_ONLY_VOLUME) {
- /* no surface shader but have a volume shader? act transparent */
+ /* direct lighting */
+ kernel_path_surface_connect_light(kg, rng, &sd, throughput, &state, &L);
- /* update path state, count as transparent */
- path_state_next(kg, &state, LABEL_TRANSPARENT);
-
- /* setup ray position, direction stays unchanged */
- ray.P = ray_offset(sd.P, -sd.Ng);
-#ifdef __RAY_DIFFERENTIALS__
- ray.dP = sd.dP;
-#endif
-
- /* enter/exit volume */
- kernel_volume_stack_enter_exit(kg, &sd, state.volume_stack);
- }
-#endif
- else {
- /* no bsdf or volume? we're done */
+ /* compute next bounce (direct lighting is done by the connect_light call above) */
+ if(!kernel_path_surface_bounce(kg, rng, &sd, &throughput, &state, &L, &ray))
break;
- }
-
- /* adjust ray distance for clipping */
- if(state.bounce == 0)
- ray.t -= sd.ray_length; /* clipping works through transparent */
- else
- ray.t = FLT_MAX;
}
float3 L_sum = path_radiance_clamp_and_sum(kg, &L);
kernel_write_light_passes(kg, buffer, &L, sample);
+#ifdef __KERNEL_DEBUG__
+ kernel_write_debug_passes(kg, buffer, &state, &debug_data, sample);
+#endif
+
return make_float4(L_sum.x, L_sum.y, L_sum.z, 1.0f - L_transparent);
}
#ifdef __BRANCHED_PATH__
-ccl_device_noinline void kernel_branched_path_integrate_lighting(KernelGlobals *kg,
+/* branched path tracing: bounce off surface and integrate indirect light */
+ccl_device_noinline void kernel_branched_path_surface_indirect_light(KernelGlobals *kg,
RNG *rng, ShaderData *sd, float3 throughput, float num_samples_adjust,
- PathState *state, PathRadiance *L, ccl_global float *buffer)
+ PathState *state, PathRadiance *L)
{
-#ifdef __EMISSION__
- if(kernel_data.integrator.use_direct_light) {
- bool all = kernel_data.integrator.sample_all_lights_direct;
- kernel_branched_path_integrate_direct_lighting(kg, rng, sd, state, throughput, num_samples_adjust, L, all);
- }
-#endif
-
for(int i = 0; i< sd->num_closure; i++) {
const ShaderClosure *sc = &sd->closure[i];
@@ -980,68 +764,102 @@ ccl_device_noinline void kernel_branched_path_integrate_lighting(KernelGlobals *
RNG bsdf_rng = cmj_hash(*rng, i);
for(int j = 0; j < num_samples; j++) {
- /* sample BSDF */
- float bsdf_pdf;
- BsdfEval bsdf_eval;
- float3 bsdf_omega_in;
- differential3 bsdf_domega_in;
- float bsdf_u, bsdf_v;
- path_branched_rng_2D(kg, &bsdf_rng, state, j, num_samples, PRNG_BSDF_U, &bsdf_u, &bsdf_v);
- int label;
-
- label = shader_bsdf_sample_closure(kg, sd, sc, bsdf_u, bsdf_v, &bsdf_eval,
- &bsdf_omega_in, &bsdf_domega_in, &bsdf_pdf);
+ PathState ps = *state;
+ float3 tp = throughput;
+ Ray bsdf_ray;
- if(bsdf_pdf == 0.0f || bsdf_eval_is_zero(&bsdf_eval))
+ if(!kernel_branched_path_surface_bounce(kg, &bsdf_rng, sd, sc, j, num_samples, &tp, &ps, L, &bsdf_ray))
continue;
- /* modify throughput */
- float3 tp = throughput;
- path_radiance_bsdf_bounce(L, &tp, &bsdf_eval, bsdf_pdf, state->bounce, label);
+ kernel_path_indirect(kg, rng, bsdf_ray, tp*num_samples_inv, num_samples, ps, L);
- /* modify path state */
- PathState ps = *state;
- path_state_next(kg, &ps, label);
+ /* for render passes, sum and reset indirect light pass variables
+ * for the next samples */
+ path_radiance_sum_indirect(L);
+ path_radiance_reset_indirect(L);
+ }
+ }
+}
- /* setup ray */
- Ray bsdf_ray;
+#ifdef __SUBSURFACE__
+ccl_device void kernel_branched_path_subsurface_scatter(KernelGlobals *kg,
+ ShaderData *sd,
+ PathRadiance *L,
+ PathState *state,
+ RNG *rng,
+ Ray *ray,
+ float3 throughput)
+{ /* for every BSSRDF closure of sd: take subsurface_samples scatter samples and, for each resulting hit, add direct and indirect light to L */
+ for(int i = 0; i< sd->num_closure; i++) {
+ ShaderClosure *sc = &sd->closure[i];
- bsdf_ray.P = ray_offset(sd->P, (label & LABEL_TRANSMIT)? -sd->Ng: sd->Ng);
- bsdf_ray.D = bsdf_omega_in;
- bsdf_ray.t = FLT_MAX;
-#ifdef __RAY_DIFFERENTIALS__
- bsdf_ray.dP = sd->dP;
- bsdf_ray.dD = bsdf_domega_in;
-#endif
-#ifdef __OBJECT_MOTION__
- bsdf_ray.time = sd->time;
-#endif
+ if(!CLOSURE_IS_BSSRDF(sc->type))
+ continue;
+
+ /* set up random number generator */
+ uint lcg_state = lcg_state_init(rng, state, 0x68bc21eb);
+ int num_samples = kernel_data.integrator.subsurface_samples;
+ float num_samples_inv = 1.0f/num_samples; /* per-sample weight handed to the direct and indirect light calls below */
+ RNG bssrdf_rng = cmj_hash(*rng, i); /* separate RNG per closure, derived from *rng and the closure index */
+
+ state->flag |= PATH_RAY_BSSRDF_ANCESTOR; /* set on the caller's state while sampling; cleared again after the sample loop */
+ /* do subsurface scatter step with copy of shader data, this will
+ * replace the BSSRDF with a diffuse BSDF closure */
+ for(int j = 0; j < num_samples; j++) {
+ ShaderData bssrdf_sd[BSSRDF_MAX_HITS];
+ float bssrdf_u, bssrdf_v;
+ path_branched_rng_2D(kg, &bssrdf_rng, state, j, num_samples, PRNG_BSDF_U, &bssrdf_u, &bssrdf_v);
+ int num_hits = subsurface_scatter_multi_step(kg, sd, bssrdf_sd, state->flag, sc, &lcg_state, bssrdf_u, bssrdf_v, true);
#ifdef __VOLUME__
- /* enter/exit volume */
- if(label & LABEL_TRANSMIT)
- kernel_volume_stack_enter_exit(kg, sd, ps.volume_stack);
+ Ray volume_ray = *ray; /* restarted from the incoming ray for every sample j */
+ bool need_update_volume_stack = kernel_data.integrator.use_volumes &&
+ sd->flag & SD_OBJECT_INTERSECTS_VOLUME; /* & binds tighter than &&: use_volumes && (flag & bit) */
#endif
- /* branch RNG state */
- path_state_branch(&ps, j, num_samples);
+ /* compute lighting with the BSDF closure */
+ for(int hit = 0; hit < num_hits; hit++) {
+ PathState hit_state = *state; /* per-hit copy, so changes below do not leak into the caller's state */
- /* set MIS state */
- ps.min_ray_pdf = fminf(bsdf_pdf, FLT_MAX);
- ps.ray_pdf = bsdf_pdf;
-#ifdef __LAMP_MIS__
- ps.ray_t = 0.0f;
+ path_state_branch(&hit_state, j, num_samples);
+
+#ifdef __VOLUME__
+ if(need_update_volume_stack) {
+ /* Setup ray from previous surface point to the new one. */
+ float3 P = ray_offset(bssrdf_sd[hit].P, -bssrdf_sd[hit].Ng);
+ volume_ray.D = normalize_len(P - volume_ray.P,
+ &volume_ray.t); /* presumably also stores the segment length in volume_ray.t; confirm against normalize_len */
+
+ kernel_path_subsurface_update_volume_stack(
+ kg,
+ &volume_ray,
+ hit_state.volume_stack);
+
+ /* Move volume ray forward. */
+ volume_ray.P = P;
+ }
#endif
- kernel_path_indirect(kg, rng, bsdf_ray, tp*num_samples_inv, num_samples, ps, L);
+#if defined(__EMISSION__) && defined(__BRANCHED_PATH__)
+ /* direct light */
+ if(kernel_data.integrator.use_direct_light) {
+ bool all = kernel_data.integrator.sample_all_lights_direct;
+ kernel_branched_path_surface_connect_light(kg, rng,
+ &bssrdf_sd[hit], &hit_state, throughput, num_samples_inv, L, all); /* throughput is passed unscaled, the sample weight goes in as a separate argument */
+ }
+#endif
- /* for render passes, sum and reset indirect light pass variables
- * for the next samples */
- path_radiance_sum_indirect(L);
- path_radiance_reset_indirect(L);
+ /* indirect light */
+ kernel_branched_path_surface_indirect_light(kg, rng,
+ &bssrdf_sd[hit], throughput, num_samples_inv,
+ &hit_state, L);
+ }
}
+
+ state->flag &= ~PATH_RAY_BSSRDF_ANCESTOR; /* NOTE(review): clears the flag unconditionally, even if it was already set on entry; confirm this is intended */
}
}
+#endif
ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, int sample, Ray ray, ccl_global float *buffer)
{
@@ -1053,7 +871,12 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
path_radiance_init(&L, kernel_data.film.use_light_pass);
PathState state;
- path_state_init(kg, &state, rng, sample);
+ path_state_init(kg, &state, rng, sample, &ray);
+
+#ifdef __KERNEL_DEBUG__
+ DebugData debug_data;
+ debug_data_init(&debug_data);
+#endif
for(;;) {
/* intersect scene */
@@ -1077,7 +900,13 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
bool hit = scene_intersect(kg, &ray, visibility, &isect, &lcg_state, difl, extmax);
#else
- bool hit = scene_intersect(kg, &ray, visibility, &isect);
+ bool hit = scene_intersect(kg, &ray, visibility, &isect, NULL, 0.0f, 0.0f);
+#endif
+
+#ifdef __KERNEL_DEBUG__
+ if(state.flag & PATH_RAY_CAMERA) {
+ debug_data.num_bvh_traversal_steps += isect.num_traversal_steps;
+ }
#endif
#ifdef __VOLUME__
@@ -1085,10 +914,11 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
if(state.volume_stack[0].shader != SHADER_NONE) {
Ray volume_ray = ray;
volume_ray.t = (hit)? isect.t: FLT_MAX;
+
+ bool heterogeneous = volume_stack_is_heterogeneous(kg, state.volume_stack);
-#ifdef __KERNEL_CPU__
+#ifdef __VOLUME_DECOUPLED__
/* decoupled ray marching only supported on CPU */
- bool heterogeneous = volume_stack_is_heterogeneous(kg, state.volume_stack);
/* cache steps along volume for repeated sampling */
VolumeSegment volume_segment;
@@ -1098,29 +928,45 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
kernel_volume_decoupled_record(kg, &state,
&volume_ray, &volume_sd, &volume_segment, heterogeneous);
- /* sample scattering */
- int num_samples = kernel_data.integrator.volume_samples;
- float num_samples_inv = 1.0f/num_samples;
+ /* direct light sampling */
+ if(volume_segment.closure_flag & SD_SCATTER) {
+ volume_segment.sampling_method = volume_stack_sampling_method(kg, state.volume_stack);
- for(int j = 0; j < num_samples; j++) {
- /* workaround to fix correlation bug in T38710, can find better solution
- * in random number generator later, for now this is done here to not impact
- * performance of rendering without volumes */
- RNG tmp_rng = cmj_hash(*rng, state.rng_offset);
+ bool all = kernel_data.integrator.sample_all_lights_direct;
- PathState ps = state;
- Ray pray = ray;
- float3 tp = throughput;
+ kernel_branched_path_volume_connect_light(kg, rng, &volume_sd,
+ throughput, &state, &L, 1.0f, all, &volume_ray, &volume_segment);
- /* branch RNG state */
- path_state_branch(&ps, j, num_samples);
+ /* indirect light sampling */
+ int num_samples = kernel_data.integrator.volume_samples;
+ float num_samples_inv = 1.0f/num_samples;
- VolumeIntegrateResult result = kernel_volume_decoupled_scatter(kg,
- &ps, &volume_ray, &volume_sd, &tp, &tmp_rng, &volume_segment);
-
- if(result == VOLUME_PATH_SCATTERED) {
- /* todo: use all-light sampling */
- if(kernel_path_integrate_scatter_lighting(kg, rng, &volume_sd, &tp, &ps, &L, &pray, num_samples_inv)) {
+ for(int j = 0; j < num_samples; j++) {
+ /* workaround to fix correlation bug in T38710, can find better solution
+ * in random number generator later, for now this is done here to not impact
+ * performance of rendering without volumes */
+ RNG tmp_rng = cmj_hash(*rng, state.rng_offset);
+
+ PathState ps = state;
+ Ray pray = ray;
+ float3 tp = throughput;
+
+ /* branch RNG state */
+ path_state_branch(&ps, j, num_samples);
+
+ /* scatter sample. if we use distance sampling and take just one
+ * sample for direct and indirect light, we could share this
+ * computation, but that makes the code a bit complex */
+ float rphase = path_state_rng_1D_for_decision(kg, &tmp_rng, &ps, PRNG_PHASE);
+ float rscatter = path_state_rng_1D_for_decision(kg, &tmp_rng, &ps, PRNG_SCATTER_DISTANCE);
+
+ VolumeIntegrateResult result = kernel_volume_decoupled_scatter(kg,
+ &ps, &pray, &volume_sd, &tp, rphase, rscatter, &volume_segment, NULL, false);
+
+ (void)result;
+ kernel_assert(result == VOLUME_PATH_SCATTERED);
+
+ if(kernel_path_volume_bounce(kg, rng, &volume_sd, &tp, &ps, &L, &pray)) {
kernel_path_indirect(kg, rng, pray, tp*num_samples_inv, num_samples, ps, &L);
/* for render passes, sum and reset indirect light pass variables
@@ -1150,18 +996,22 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
PathState ps = state;
Ray pray = ray;
ShaderData volume_sd;
- float3 tp = throughput;
+ float3 tp = throughput * num_samples_inv;
/* branch RNG state */
path_state_branch(&ps, j, num_samples);
- VolumeIntegrateResult result = kernel_volume_integrate(kg, &ps,
- &volume_sd, &volume_ray, &L, &tp, rng);
+ VolumeIntegrateResult result = kernel_volume_integrate(
+ kg, &ps, &volume_sd, &volume_ray, &L, &tp, rng, heterogeneous);
+#ifdef __VOLUME_SCATTER__
if(result == VOLUME_PATH_SCATTERED) {
- /* todo: use all-light sampling */
- if(kernel_path_integrate_scatter_lighting(kg, rng, &volume_sd, &tp, &ps, &L, &pray, num_samples_inv)) {
- kernel_path_indirect(kg, rng, pray, tp*num_samples_inv, num_samples, ps, &L);
+ /* todo: support equiangular, MIS and all light sampling.
+ * alternatively get decoupled ray marching working on the GPU */
+ kernel_path_volume_connect_light(kg, rng, &volume_sd, tp, &state, &L);
+
+ if(kernel_path_volume_bounce(kg, rng, &volume_sd, &tp, &ps, &L, &pray)) {
+ kernel_path_indirect(kg, rng, pray, tp, num_samples, ps, &L);
/* for render passes, sum and reset indirect light pass variables
* for the next samples */
@@ -1169,6 +1019,7 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
path_radiance_reset_indirect(&L);
}
}
+#endif
}
/* todo: avoid this calculation using decoupled ray marching */
@@ -1205,7 +1056,7 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
/* holdout */
#ifdef __HOLDOUT__
- if((sd.flag & (SD_HOLDOUT|SD_HOLDOUT_MASK))) {
+ if(sd.flag & (SD_HOLDOUT|SD_HOLDOUT_MASK)) {
if(kernel_data.background.transparent) {
float3 holdout_weight;
@@ -1245,7 +1096,7 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
break;
}
else if(probability != 1.0f) {
- float terminate = path_state_rng_1D(kg, rng, &state, PRNG_TERMINATE);
+ float terminate = path_state_rng_1D_for_decision(kg, rng, &state, PRNG_TERMINATE);
if(terminate >= probability)
break;
@@ -1257,90 +1108,33 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
#ifdef __AO__
/* ambient occlusion */
if(kernel_data.integrator.use_ambient_occlusion || (sd.flag & SD_AO)) {
- int num_samples = kernel_data.integrator.ao_samples;
- float num_samples_inv = 1.0f/num_samples;
- float ao_factor = kernel_data.background.ao_factor;
- float3 ao_N;
- float3 ao_bsdf = shader_bsdf_ao(kg, &sd, ao_factor, &ao_N);
- float3 ao_alpha = shader_bsdf_alpha(kg, &sd);
-
- for(int j = 0; j < num_samples; j++) {
- float bsdf_u, bsdf_v;
- path_branched_rng_2D(kg, rng, &state, j, num_samples, PRNG_BSDF_U, &bsdf_u, &bsdf_v);
-
- float3 ao_D;
- float ao_pdf;
-
- sample_cos_hemisphere(ao_N, bsdf_u, bsdf_v, &ao_D, &ao_pdf);
-
- if(dot(sd.Ng, ao_D) > 0.0f && ao_pdf != 0.0f) {
- Ray light_ray;
- float3 ao_shadow;
-
- light_ray.P = ray_offset(sd.P, sd.Ng);
- light_ray.D = ao_D;
- light_ray.t = kernel_data.background.ao_distance;
-#ifdef __OBJECT_MOTION__
- light_ray.time = sd.time;
-#endif
- light_ray.dP = sd.dP;
- light_ray.dD = differential3_zero();
-
- if(!shadow_blocked(kg, &state, &light_ray, &ao_shadow))
- path_radiance_accum_ao(&L, throughput*num_samples_inv, ao_alpha, ao_bsdf, ao_shadow, state.bounce);
- }
- }
+ kernel_branched_path_ao(kg, &sd, &L, &state, rng, throughput);
}
#endif
#ifdef __SUBSURFACE__
/* bssrdf scatter to a different location on the same object */
if(sd.flag & SD_BSSRDF) {
- for(int i = 0; i< sd.num_closure; i++) {
- ShaderClosure *sc = &sd.closure[i];
-
- if(!CLOSURE_IS_BSSRDF(sc->type))
- continue;
-
- /* set up random number generator */
- uint lcg_state = lcg_state_init(rng, &state, 0x68bc21eb);
- int num_samples = kernel_data.integrator.subsurface_samples;
- float num_samples_inv = 1.0f/num_samples;
- RNG bssrdf_rng = cmj_hash(*rng, i);
-
- state.flag |= PATH_RAY_BSSRDF_ANCESTOR;
-
- /* do subsurface scatter step with copy of shader data, this will
- * replace the BSSRDF with a diffuse BSDF closure */
- for(int j = 0; j < num_samples; j++) {
- ShaderData bssrdf_sd[BSSRDF_MAX_HITS];
- float bssrdf_u, bssrdf_v;
- path_branched_rng_2D(kg, &bssrdf_rng, &state, j, num_samples, PRNG_BSDF_U, &bssrdf_u, &bssrdf_v);
- int num_hits = subsurface_scatter_multi_step(kg, &sd, bssrdf_sd, state.flag, sc, &lcg_state, bssrdf_u, bssrdf_v, true);
-
- /* compute lighting with the BSDF closure */
- for(int hit = 0; hit < num_hits; hit++) {
- PathState hit_state = state;
-
- path_state_branch(&hit_state, j, num_samples);
-
- kernel_branched_path_integrate_lighting(kg, rng,
- &bssrdf_sd[hit], throughput, num_samples_inv,
- &hit_state, &L, buffer);
- }
- }
-
- state.flag &= ~PATH_RAY_BSSRDF_ANCESTOR;
- }
+ kernel_branched_path_subsurface_scatter(kg, &sd, &L, &state,
+ rng, &ray, throughput);
}
#endif
if(!(sd.flag & SD_HAS_ONLY_VOLUME)) {
PathState hit_state = state;
- /* lighting */
- kernel_branched_path_integrate_lighting(kg, rng,
- &sd, throughput, 1.0f, &hit_state, &L, buffer);
+#ifdef __EMISSION__
+ /* direct light */
+ if(kernel_data.integrator.use_direct_light) {
+ bool all = kernel_data.integrator.sample_all_lights_direct;
+ kernel_branched_path_surface_connect_light(kg, rng,
+ &sd, &hit_state, throughput, 1.0f, &L, all);
+ }
+#endif
+
+ /* indirect light */
+ kernel_branched_path_surface_indirect_light(kg, rng,
+ &sd, throughput, 1.0f, &hit_state, &L);
/* continue in case of transparency */
throughput *= shader_bsdf_transparency(kg, &sd);
@@ -1353,6 +1147,13 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
ray.P = ray_offset(sd.P, -sd.Ng);
ray.t -= sd.ray_length; /* clipping works through transparent */
+
+#ifdef __RAY_DIFFERENTIALS__
+ ray.dP = sd.dP;
+ ray.dD.dx = -sd.dI.dx;
+ ray.dD.dy = -sd.dI.dy;
+#endif
+
#ifdef __VOLUME__
/* enter/exit volume */
kernel_volume_stack_enter_exit(kg, &sd, state.volume_stack);
@@ -1363,6 +1164,10 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
kernel_write_light_passes(kg, buffer, &L, sample);
+#ifdef __KERNEL_DEBUG__
+ kernel_write_debug_passes(kg, buffer, &state, &debug_data, sample);
+#endif
+
return make_float4(L_sum.x, L_sum.y, L_sum.z, 1.0f - L_transparent);
}
@@ -1372,11 +1177,8 @@ ccl_device_inline void kernel_path_trace_setup(KernelGlobals *kg, ccl_global uin
{
float filter_u;
float filter_v;
-#ifdef __CMJ__
+
int num_samples = kernel_data.integrator.aa_samples;
-#else
- int num_samples = 0;
-#endif
path_rng_init(kg, rng_state, sample, num_samples, rng, x, y, &filter_u, &filter_v);
diff --git a/intern/cycles/kernel/kernel_path_state.h b/intern/cycles/kernel/kernel_path_state.h
index 406654c1741..f29168642a4 100644
--- a/intern/cycles/kernel/kernel_path_state.h
+++ b/intern/cycles/kernel/kernel_path_state.h
@@ -16,17 +16,13 @@
CCL_NAMESPACE_BEGIN
-ccl_device_inline void path_state_init(KernelGlobals *kg, PathState *state, RNG *rng, int sample)
+ccl_device_inline void path_state_init(KernelGlobals *kg, PathState *state, RNG *rng, int sample, Ray *ray)
{
- state->flag = PATH_RAY_CAMERA|PATH_RAY_SINGULAR|PATH_RAY_MIS_SKIP;
+ state->flag = PATH_RAY_CAMERA|PATH_RAY_MIS_SKIP;
state->rng_offset = PRNG_BASE_NUM;
state->sample = sample;
-#ifdef __CMJ__
state->num_samples = kernel_data.integrator.aa_samples;
-#else
- state->num_samples = 0;
-#endif
state->bounce = 0;
state->diffuse_bounce = 0;
@@ -45,7 +41,7 @@ ccl_device_inline void path_state_init(KernelGlobals *kg, PathState *state, RNG
if(kernel_data.integrator.use_volumes) {
/* initialize volume stack with volume we are inside of */
- kernel_volume_stack_init(kg, state->volume_stack);
+ kernel_volume_stack_init(kg, ray, state->volume_stack);
/* seed RNG for cases where we can't use stratified samples */
state->rng_congruential = lcg_init(*rng + sample*0x51633e2d);
}
@@ -63,8 +59,8 @@ ccl_device_inline void path_state_next(KernelGlobals *kg, PathState *state, int
state->flag |= PATH_RAY_TRANSPARENT;
state->transparent_bounce++;
- /* random number generator next bounce */
- state->rng_offset += PRNG_BOUNCE_NUM;
+ /* don't increase random number generator offset here, to avoid some
+ * unwanted patterns, see path_state_rng_1D_for_decision */
if(!kernel_data.integrator.transparent_shadows)
state->flag |= PATH_RAY_MIS_SKIP;
diff --git a/intern/cycles/kernel/kernel_path_surface.h b/intern/cycles/kernel/kernel_path_surface.h
new file mode 100644
index 00000000000..9553c2da0df
--- /dev/null
+++ b/intern/cycles/kernel/kernel_path_surface.h
@@ -0,0 +1,299 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+#if defined(__BRANCHED_PATH__) || defined(__SUBSURFACE__)
+
+/* branched path tracing: connect path directly to position on one or more lights and add it to L */
+ccl_device void kernel_branched_path_surface_connect_light(KernelGlobals *kg, RNG *rng,
+ ShaderData *sd, PathState *state, float3 throughput, float num_samples_adjust, PathRadiance *L, bool sample_all_lights)
+{
+#ifdef __EMISSION__
+ /* sample illumination from lights to find path contribution */
+ if(!(sd->flag & SD_BSDF_HAS_EVAL))
+ return;
+
+ Ray light_ray;
+ BsdfEval L_light;
+ bool is_lamp;
+
+#ifdef __OBJECT_MOTION__
+ light_ray.time = sd->time;
+#endif
+
+ if(sample_all_lights) {
+ /* lamp sampling */
+ for(int i = 0; i < kernel_data.integrator.num_all_lights; i++) {
+ int num_samples = ceil_to_int(num_samples_adjust*light_select_num_samples(kg, i));
+ float num_samples_inv = num_samples_adjust/(num_samples*kernel_data.integrator.num_all_lights);
+ RNG lamp_rng = cmj_hash(*rng, i);
+
+ if(kernel_data.integrator.pdf_triangles != 0.0f)
+ num_samples_inv *= 0.5f;
+
+ for(int j = 0; j < num_samples; j++) {
+ float light_u, light_v;
+ path_branched_rng_2D(kg, &lamp_rng, state, j, num_samples, PRNG_LIGHT_U, &light_u, &light_v);
+
+ LightSample ls;
+ lamp_light_sample(kg, i, light_u, light_v, sd->P, &ls);
+
+ if(direct_emission(kg, sd, &ls, &light_ray, &L_light, &is_lamp, state->bounce, state->transparent_bounce)) {
+ /* trace shadow ray */
+ float3 shadow;
+
+ if(!shadow_blocked(kg, state, &light_ray, &shadow)) {
+ /* accumulate */
+ path_radiance_accum_light(L, throughput*num_samples_inv, &L_light, shadow, num_samples_inv, state->bounce, is_lamp);
+ }
+ }
+ }
+ }
+
+ /* mesh light sampling */
+ if(kernel_data.integrator.pdf_triangles != 0.0f) {
+ int num_samples = ceil_to_int(num_samples_adjust*kernel_data.integrator.mesh_light_samples);
+ float num_samples_inv = num_samples_adjust/num_samples;
+
+ if(kernel_data.integrator.num_all_lights)
+ num_samples_inv *= 0.5f;
+
+ for(int j = 0; j < num_samples; j++) {
+ float light_t = path_branched_rng_1D(kg, rng, state, j, num_samples, PRNG_LIGHT);
+ float light_u, light_v;
+ path_branched_rng_2D(kg, rng, state, j, num_samples, PRNG_LIGHT_U, &light_u, &light_v);
+
+ /* only sample triangle lights */
+ if(kernel_data.integrator.num_all_lights)
+ light_t = 0.5f*light_t;
+
+ LightSample ls;
+ light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, &ls);
+
+ if(direct_emission(kg, sd, &ls, &light_ray, &L_light, &is_lamp, state->bounce, state->transparent_bounce)) {
+ /* trace shadow ray */
+ float3 shadow;
+
+ if(!shadow_blocked(kg, state, &light_ray, &shadow)) {
+ /* accumulate */
+ path_radiance_accum_light(L, throughput*num_samples_inv, &L_light, shadow, num_samples_inv, state->bounce, is_lamp);
+ }
+ }
+ }
+ }
+ }
+ else {
+ /* sample one light at random */
+ float light_t = path_state_rng_1D(kg, rng, state, PRNG_LIGHT);
+ float light_u, light_v;
+ path_state_rng_2D(kg, rng, state, PRNG_LIGHT_U, &light_u, &light_v);
+
+ LightSample ls;
+ light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, &ls);
+
+ /* sample random light */
+ if(direct_emission(kg, sd, &ls, &light_ray, &L_light, &is_lamp, state->bounce, state->transparent_bounce)) {
+ /* trace shadow ray */
+ float3 shadow;
+
+ if(!shadow_blocked(kg, state, &light_ray, &shadow)) {
+ /* accumulate */
+ path_radiance_accum_light(L, throughput*num_samples_adjust, &L_light, shadow, num_samples_adjust, state->bounce, is_lamp);
+ }
+ }
+ }
+#endif
+}
+
+/* branched path tracing: bounce off or through surface, with the new direction stored in ray */
+ccl_device bool kernel_branched_path_surface_bounce(KernelGlobals *kg, RNG *rng,
+ ShaderData *sd, const ShaderClosure *sc, int sample, int num_samples,
+ float3 *throughput, PathState *state, PathRadiance *L, Ray *ray)
+{
+ /* sample BSDF */
+ float bsdf_pdf;
+ BsdfEval bsdf_eval;
+ float3 bsdf_omega_in;
+ differential3 bsdf_domega_in;
+ float bsdf_u, bsdf_v;
+ path_branched_rng_2D(kg, rng, state, sample, num_samples, PRNG_BSDF_U, &bsdf_u, &bsdf_v);
+ int label;
+
+ label = shader_bsdf_sample_closure(kg, sd, sc, bsdf_u, bsdf_v, &bsdf_eval,
+ &bsdf_omega_in, &bsdf_domega_in, &bsdf_pdf);
+
+ if(bsdf_pdf == 0.0f || bsdf_eval_is_zero(&bsdf_eval))
+ return false;
+
+ /* modify throughput */
+ path_radiance_bsdf_bounce(L, throughput, &bsdf_eval, bsdf_pdf, state->bounce, label);
+
+ /* modify path state */
+ path_state_next(kg, state, label);
+
+ /* setup ray */
+ ray->P = ray_offset(sd->P, (label & LABEL_TRANSMIT)? -sd->Ng: sd->Ng);
+ ray->D = bsdf_omega_in;
+ ray->t = FLT_MAX;
+#ifdef __RAY_DIFFERENTIALS__
+ ray->dP = sd->dP;
+ ray->dD = bsdf_domega_in;
+#endif
+#ifdef __OBJECT_MOTION__
+ ray->time = sd->time;
+#endif
+
+#ifdef __VOLUME__
+ /* enter/exit volume */
+ if(label & LABEL_TRANSMIT)
+ kernel_volume_stack_enter_exit(kg, sd, state->volume_stack);
+#endif
+
+ /* branch RNG state */
+ path_state_branch(state, sample, num_samples);
+
+ /* set MIS state */
+ state->min_ray_pdf = fminf(bsdf_pdf, FLT_MAX);
+ state->ray_pdf = bsdf_pdf;
+#ifdef __LAMP_MIS__
+ state->ray_t = 0.0f;
+#endif
+
+ return true;
+}
+
+#endif
+
+/* path tracing: connect path directly to position on a light and add it to L */
+ccl_device_inline void kernel_path_surface_connect_light(KernelGlobals *kg, RNG *rng,
+ ShaderData *sd, float3 throughput, PathState *state, PathRadiance *L)
+{
+#ifdef __EMISSION__
+ if(!(kernel_data.integrator.use_direct_light && (sd->flag & SD_BSDF_HAS_EVAL)))
+ return;
+
+ /* sample illumination from lights to find path contribution */
+ float light_t = path_state_rng_1D(kg, rng, state, PRNG_LIGHT);
+ float light_u, light_v;
+ path_state_rng_2D(kg, rng, state, PRNG_LIGHT_U, &light_u, &light_v);
+
+ Ray light_ray;
+ BsdfEval L_light;
+ bool is_lamp;
+
+#ifdef __OBJECT_MOTION__
+ light_ray.time = sd->time;
+#endif
+
+ LightSample ls;
+ light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, &ls);
+
+ if(direct_emission(kg, sd, &ls, &light_ray, &L_light, &is_lamp, state->bounce, state->transparent_bounce)) {
+ /* trace shadow ray */
+ float3 shadow;
+
+ if(!shadow_blocked(kg, state, &light_ray, &shadow)) {
+ /* accumulate */
+ path_radiance_accum_light(L, throughput, &L_light, shadow, 1.0f, state->bounce, is_lamp);
+ }
+ }
+#endif
+}
+
+/* path tracing: bounce off or through surface, with the new direction stored in ray */
+ccl_device_inline bool kernel_path_surface_bounce(KernelGlobals *kg, RNG *rng,
+ ShaderData *sd, float3 *throughput, PathState *state, PathRadiance *L, Ray *ray)
+{
+ /* no BSDF? we can stop here */
+ if(sd->flag & SD_BSDF) {
+ /* sample BSDF */
+ float bsdf_pdf;
+ BsdfEval bsdf_eval;
+ float3 bsdf_omega_in;
+ differential3 bsdf_domega_in;
+ float bsdf_u, bsdf_v;
+ path_state_rng_2D(kg, rng, state, PRNG_BSDF_U, &bsdf_u, &bsdf_v);
+ int label;
+
+ label = shader_bsdf_sample(kg, sd, bsdf_u, bsdf_v, &bsdf_eval,
+ &bsdf_omega_in, &bsdf_domega_in, &bsdf_pdf);
+
+ if(bsdf_pdf == 0.0f || bsdf_eval_is_zero(&bsdf_eval))
+ return false;
+
+ /* modify throughput */
+ path_radiance_bsdf_bounce(L, throughput, &bsdf_eval, bsdf_pdf, state->bounce, label);
+
+ /* set labels */
+ if(!(label & LABEL_TRANSPARENT)) {
+ state->ray_pdf = bsdf_pdf;
+#ifdef __LAMP_MIS__
+ state->ray_t = 0.0f;
+#endif
+ state->min_ray_pdf = fminf(bsdf_pdf, state->min_ray_pdf);
+ }
+
+ /* update path state */
+ path_state_next(kg, state, label);
+
+ /* setup ray */
+ ray->P = ray_offset(sd->P, (label & LABEL_TRANSMIT)? -sd->Ng: sd->Ng);
+ ray->D = bsdf_omega_in;
+
+ if(state->bounce == 0)
+ ray->t -= sd->ray_length; /* clipping works through transparent */
+ else
+ ray->t = FLT_MAX;
+
+#ifdef __RAY_DIFFERENTIALS__
+ ray->dP = sd->dP;
+ ray->dD = bsdf_domega_in;
+#endif
+
+#ifdef __VOLUME__
+ /* enter/exit volume */
+ if(label & LABEL_TRANSMIT)
+ kernel_volume_stack_enter_exit(kg, sd, state->volume_stack);
+#endif
+ return true;
+ }
+#ifdef __VOLUME__
+ else if(sd->flag & SD_HAS_ONLY_VOLUME) {
+ /* no surface shader but have a volume shader? act transparent */
+
+ /* update path state, count as transparent */
+ path_state_next(kg, state, LABEL_TRANSPARENT);
+
+ /* setup ray position, direction stays unchanged */
+ ray->P = ray_offset(sd->P, -sd->Ng);
+#ifdef __RAY_DIFFERENTIALS__
+ ray->dP = sd->dP;
+#endif
+
+ /* enter/exit volume */
+ kernel_volume_stack_enter_exit(kg, sd, state->volume_stack);
+ return true;
+ }
+#endif
+ else {
+ /* no bsdf or volume? */
+ return false;
+ }
+}
+
+CCL_NAMESPACE_END
+
diff --git a/intern/cycles/kernel/kernel_path_volume.h b/intern/cycles/kernel/kernel_path_volume.h
new file mode 100644
index 00000000000..d8143832294
--- /dev/null
+++ b/intern/cycles/kernel/kernel_path_volume.h
@@ -0,0 +1,267 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+#ifdef __VOLUME_SCATTER__
+
+ccl_device void kernel_path_volume_connect_light(KernelGlobals *kg, RNG *rng,
+ ShaderData *sd, float3 throughput, PathState *state, PathRadiance *L)
+{
+#ifdef __EMISSION__
+ if(!kernel_data.integrator.use_direct_light)
+ return;
+
+ /* sample illumination from lights to find path contribution */
+ float light_t = path_state_rng_1D(kg, rng, state, PRNG_LIGHT);
+ float light_u, light_v;
+ path_state_rng_2D(kg, rng, state, PRNG_LIGHT_U, &light_u, &light_v);
+
+ Ray light_ray;
+ BsdfEval L_light;
+ LightSample ls;
+ bool is_lamp;
+
+ /* connect to light from given point where shader has been evaluated */
+#ifdef __OBJECT_MOTION__
+ light_ray.time = sd->time;
+#endif
+
+ light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, &ls);
+ if(ls.pdf == 0.0f)
+ return;
+
+ if(direct_emission(kg, sd, &ls, &light_ray, &L_light, &is_lamp, state->bounce, state->transparent_bounce)) {
+ /* trace shadow ray */
+ float3 shadow;
+
+ if(!shadow_blocked(kg, state, &light_ray, &shadow)) {
+ /* accumulate */
+ path_radiance_accum_light(L, throughput, &L_light, shadow, 1.0f, state->bounce, is_lamp);
+ }
+ }
+#endif
+}
+
+#ifdef __KERNEL_GPU__
+ccl_device_noinline
+#else
+ccl_device
+#endif
+bool kernel_path_volume_bounce(KernelGlobals *kg, RNG *rng,
+ ShaderData *sd, float3 *throughput, PathState *state, PathRadiance *L, Ray *ray)
+{
+ /* sample phase function */
+ float phase_pdf;
+ BsdfEval phase_eval;
+ float3 phase_omega_in;
+ differential3 phase_domega_in;
+ float phase_u, phase_v;
+ path_state_rng_2D(kg, rng, state, PRNG_PHASE_U, &phase_u, &phase_v);
+ int label;
+
+ label = shader_volume_phase_sample(kg, sd, phase_u, phase_v, &phase_eval,
+ &phase_omega_in, &phase_domega_in, &phase_pdf);
+
+ if(phase_pdf == 0.0f || bsdf_eval_is_zero(&phase_eval))
+ return false;
+
+ /* modify throughput */
+ path_radiance_bsdf_bounce(L, throughput, &phase_eval, phase_pdf, state->bounce, label);
+
+ /* set labels */
+ state->ray_pdf = phase_pdf;
+#ifdef __LAMP_MIS__
+ state->ray_t = 0.0f;
+#endif
+ state->min_ray_pdf = fminf(phase_pdf, state->min_ray_pdf);
+
+ /* update path state */
+ path_state_next(kg, state, label);
+
+ /* setup ray */
+ ray->P = sd->P;
+ ray->D = phase_omega_in;
+ ray->t = FLT_MAX;
+
+#ifdef __RAY_DIFFERENTIALS__
+ ray->dP = sd->dP;
+ ray->dD = phase_domega_in;
+#endif
+
+ return true;
+}
+
+ccl_device void kernel_branched_path_volume_connect_light(KernelGlobals *kg, RNG *rng,
+ ShaderData *sd, float3 throughput, PathState *state, PathRadiance *L,
+ float num_samples_adjust, bool sample_all_lights, Ray *ray, const VolumeSegment *segment)
+{
+#ifdef __EMISSION__
+ if(!kernel_data.integrator.use_direct_light)
+ return;
+
+ Ray light_ray;
+ BsdfEval L_light;
+ bool is_lamp;
+
+#ifdef __OBJECT_MOTION__
+ light_ray.time = sd->time;
+#endif
+
+ if(sample_all_lights) {
+ /* lamp sampling */
+ for(int i = 0; i < kernel_data.integrator.num_all_lights; i++) {
+ int num_samples = ceil_to_int(num_samples_adjust*light_select_num_samples(kg, i));
+ float num_samples_inv = num_samples_adjust/(num_samples*kernel_data.integrator.num_all_lights);
+ RNG lamp_rng = cmj_hash(*rng, i);
+
+ if(kernel_data.integrator.pdf_triangles != 0.0f)
+ num_samples_inv *= 0.5f;
+
+ for(int j = 0; j < num_samples; j++) {
+ /* sample random position on given light */
+ float light_u, light_v;
+ path_branched_rng_2D(kg, &lamp_rng, state, j, num_samples, PRNG_LIGHT_U, &light_u, &light_v);
+
+ LightSample ls;
+ lamp_light_sample(kg, i, light_u, light_v, ray->P, &ls);
+
+ float3 tp = throughput;
+
+ /* sample position on volume segment */
+ float rphase = path_branched_rng_1D_for_decision(kg, rng, state, j, num_samples, PRNG_PHASE);
+ float rscatter = path_branched_rng_1D_for_decision(kg, rng, state, j, num_samples, PRNG_SCATTER_DISTANCE);
+
+ VolumeIntegrateResult result = kernel_volume_decoupled_scatter(kg,
+ state, ray, sd, &tp, rphase, rscatter, segment, (ls.t != FLT_MAX)? &ls.P: NULL, false);
+
+ (void)result;
+ kernel_assert(result == VOLUME_PATH_SCATTERED);
+
+ /* todo: split up light_sample so we don't have to call it again with new position */
+ lamp_light_sample(kg, i, light_u, light_v, sd->P, &ls);
+
+ if(ls.pdf == 0.0f)
+ continue;
+
+ if(direct_emission(kg, sd, &ls, &light_ray, &L_light, &is_lamp, state->bounce, state->transparent_bounce)) {
+ /* trace shadow ray */
+ float3 shadow;
+
+ if(!shadow_blocked(kg, state, &light_ray, &shadow)) {
+ /* accumulate */
+ path_radiance_accum_light(L, tp*num_samples_inv, &L_light, shadow, num_samples_inv, state->bounce, is_lamp);
+ }
+ }
+ }
+ }
+
+ /* mesh light sampling */
+ if(kernel_data.integrator.pdf_triangles != 0.0f) {
+ int num_samples = ceil_to_int(num_samples_adjust*kernel_data.integrator.mesh_light_samples);
+ float num_samples_inv = num_samples_adjust/num_samples;
+
+ if(kernel_data.integrator.num_all_lights)
+ num_samples_inv *= 0.5f;
+
+ for(int j = 0; j < num_samples; j++) {
+ /* sample random position on random triangle */
+ float light_t = path_branched_rng_1D_for_decision(kg, rng, state, j, num_samples, PRNG_LIGHT);
+ float light_u, light_v;
+ path_branched_rng_2D(kg, rng, state, j, num_samples, PRNG_LIGHT_U, &light_u, &light_v);
+
+ /* only sample triangle lights */
+ if(kernel_data.integrator.num_all_lights)
+ light_t = 0.5f*light_t;
+
+ LightSample ls;
+ light_sample(kg, light_t, light_u, light_v, sd->time, ray->P, &ls);
+
+ float3 tp = throughput;
+
+ /* sample position on volume segment */
+ float rphase = path_branched_rng_1D_for_decision(kg, rng, state, j, num_samples, PRNG_PHASE);
+ float rscatter = path_branched_rng_1D_for_decision(kg, rng, state, j, num_samples, PRNG_SCATTER_DISTANCE);
+
+ VolumeIntegrateResult result = kernel_volume_decoupled_scatter(kg,
+ state, ray, sd, &tp, rphase, rscatter, segment, (ls.t != FLT_MAX)? &ls.P: NULL, false);
+
+ (void)result;
+ kernel_assert(result == VOLUME_PATH_SCATTERED);
+
+ /* todo: split up light_sample so we don't have to call it again with new position */
+ light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, &ls);
+
+ if(ls.pdf == 0.0f)
+ continue;
+
+ if(direct_emission(kg, sd, &ls, &light_ray, &L_light, &is_lamp, state->bounce, state->transparent_bounce)) {
+ /* trace shadow ray */
+ float3 shadow;
+
+ if(!shadow_blocked(kg, state, &light_ray, &shadow)) {
+ /* accumulate */
+ path_radiance_accum_light(L, tp*num_samples_inv, &L_light, shadow, num_samples_inv, state->bounce, is_lamp);
+ }
+ }
+ }
+ }
+ }
+ else {
+ /* sample random position on random light */
+ float light_t = path_state_rng_1D(kg, rng, state, PRNG_LIGHT);
+ float light_u, light_v;
+ path_state_rng_2D(kg, rng, state, PRNG_LIGHT_U, &light_u, &light_v);
+
+ LightSample ls;
+ light_sample(kg, light_t, light_u, light_v, sd->time, ray->P, &ls);
+
+ float3 tp = throughput;
+
+ /* sample position on volume segment */
+ float rphase = path_state_rng_1D_for_decision(kg, rng, state, PRNG_PHASE);
+ float rscatter = path_state_rng_1D_for_decision(kg, rng, state, PRNG_SCATTER_DISTANCE);
+
+ VolumeIntegrateResult result = kernel_volume_decoupled_scatter(kg,
+ state, ray, sd, &tp, rphase, rscatter, segment, (ls.t != FLT_MAX)? &ls.P: NULL, false);
+
+ (void)result;
+ kernel_assert(result == VOLUME_PATH_SCATTERED);
+
+ /* todo: split up light_sample so we don't have to call it again with new position */
+ light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, &ls);
+
+ if(ls.pdf == 0.0f)
+ return;
+
+ /* sample random light */
+ if(direct_emission(kg, sd, &ls, &light_ray, &L_light, &is_lamp, state->bounce, state->transparent_bounce)) {
+ /* trace shadow ray */
+ float3 shadow;
+
+ if(!shadow_blocked(kg, state, &light_ray, &shadow)) {
+ /* accumulate */
+ path_radiance_accum_light(L, tp, &L_light, shadow, 1.0f, state->bounce, is_lamp);
+ }
+ }
+ }
+#endif
+}
+
+#endif
+
+CCL_NAMESPACE_END
+
diff --git a/intern/cycles/kernel/kernel_random.h b/intern/cycles/kernel/kernel_random.h
index 31cb6ff6abd..236f74c0a82 100644
--- a/intern/cycles/kernel/kernel_random.h
+++ b/intern/cycles/kernel/kernel_random.h
@@ -261,22 +261,41 @@ ccl_device uint lcg_init(uint seed)
* For branches in the path we must be careful not to reuse the same number
* in a sequence and offset accordingly. */
-ccl_device_inline float path_state_rng_1D(KernelGlobals *kg, RNG *rng, PathState *state, int dimension)
+ccl_device_inline float path_state_rng_1D(KernelGlobals *kg, RNG *rng, const PathState *state, int dimension)
{
return path_rng_1D(kg, rng, state->sample, state->num_samples, state->rng_offset + dimension);
}
-ccl_device_inline void path_state_rng_2D(KernelGlobals *kg, RNG *rng, PathState *state, int dimension, float *fx, float *fy)
+ccl_device_inline float path_state_rng_1D_for_decision(KernelGlobals *kg, RNG *rng, const PathState *state, int dimension)
+{
+ /* the rng_offset is not increased for transparent bounces. if we do then
+ * fully transparent objects can become subtly visible by the different
+ * sampling patterns used where the transparent object is.
+ *
+ * however for some random numbers that will determine if the next bounce
+ * is transparent we do need to increase the offset to avoid always making
+ * the same decision */
+ int rng_offset = state->rng_offset + state->transparent_bounce*PRNG_BOUNCE_NUM;
+ return path_rng_1D(kg, rng, state->sample, state->num_samples, rng_offset + dimension);
+}
+
+ccl_device_inline void path_state_rng_2D(KernelGlobals *kg, RNG *rng, const PathState *state, int dimension, float *fx, float *fy)
{
path_rng_2D(kg, rng, state->sample, state->num_samples, state->rng_offset + dimension, fx, fy);
}
-ccl_device_inline float path_branched_rng_1D(KernelGlobals *kg, RNG *rng, PathState *state, int branch, int num_branches, int dimension)
+ccl_device_inline float path_branched_rng_1D(KernelGlobals *kg, RNG *rng, const PathState *state, int branch, int num_branches, int dimension)
{
return path_rng_1D(kg, rng, state->sample*num_branches + branch, state->num_samples*num_branches, state->rng_offset + dimension);
}
-ccl_device_inline void path_branched_rng_2D(KernelGlobals *kg, RNG *rng, PathState *state, int branch, int num_branches, int dimension, float *fx, float *fy)
+ccl_device_inline float path_branched_rng_1D_for_decision(KernelGlobals *kg, RNG *rng, const PathState *state, int branch, int num_branches, int dimension)
+{
+ int rng_offset = state->rng_offset + state->transparent_bounce*PRNG_BOUNCE_NUM;
+ return path_rng_1D(kg, rng, state->sample*num_branches + branch, state->num_samples*num_branches, rng_offset + dimension);
+}
+
+ccl_device_inline void path_branched_rng_2D(KernelGlobals *kg, RNG *rng, const PathState *state, int branch, int num_branches, int dimension, float *fx, float *fy)
{
path_rng_2D(kg, rng, state->sample*num_branches + branch, state->num_samples*num_branches, state->rng_offset + dimension, fx, fy);
}
@@ -290,7 +309,7 @@ ccl_device_inline void path_state_branch(PathState *state, int branch, int num_b
state->num_samples = state->num_samples*num_branches;
}
-ccl_device_inline uint lcg_state_init(RNG *rng, PathState *state, uint scramble)
+ccl_device_inline uint lcg_state_init(RNG *rng, const PathState *state, uint scramble)
{
return lcg_init(*rng + state->rng_offset + state->sample*scramble);
}
diff --git a/intern/cycles/kernel/kernel_shader.h b/intern/cycles/kernel/kernel_shader.h
index 58cec090410..db08c328d7e 100644
--- a/intern/cycles/kernel/kernel_shader.h
+++ b/intern/cycles/kernel/kernel_shader.h
@@ -86,9 +86,8 @@ ccl_device void shader_setup_from_ray(KernelGlobals *kg, ShaderData *sd,
#endif
if(sd->type & PRIMITIVE_TRIANGLE) {
/* static triangle */
- float4 Ns = kernel_tex_fetch(__tri_normal, sd->prim);
- float3 Ng = make_float3(Ns.x, Ns.y, Ns.z);
- sd->shader = __float_as_int(Ns.w);
+ float3 Ng = triangle_normal(kg, sd);
+ sd->shader = kernel_tex_fetch(__tri_shader, sd->prim);
/* vectors */
sd->P = triangle_refine(kg, sd, isect, ray);
@@ -166,9 +165,8 @@ ccl_device_inline void shader_setup_from_subsurface(KernelGlobals *kg, ShaderDat
/* fetch triangle data */
if(sd->type == PRIMITIVE_TRIANGLE) {
- float4 Ns = kernel_tex_fetch(__tri_normal, sd->prim);
- float3 Ng = make_float3(Ns.x, Ns.y, Ns.z);
- sd->shader = __float_as_int(Ns.w);
+ float3 Ng = triangle_normal(kg, sd);
+ sd->shader = kernel_tex_fetch(__tri_shader, sd->prim);
/* static triangle */
sd->P = triangle_refine_subsurface(kg, sd, isect, ray);
@@ -342,7 +340,7 @@ ccl_device void shader_setup_from_displace(KernelGlobals *kg, ShaderData *sd,
float3 P, Ng, I = make_float3(0.0f, 0.0f, 0.0f);
int shader;
- triangle_point_normal(kg, prim, u, v, &P, &Ng, &shader);
+ triangle_point_normal(kg, object, prim, u, v, &P, &Ng, &shader);
/* force smooth shading for displacement */
shader |= SHADER_SMOOTH_NORMAL;
@@ -609,6 +607,9 @@ ccl_device void shader_bsdf_blur(KernelGlobals *kg, ShaderData *sd, float roughn
ccl_device float3 shader_bsdf_transparency(KernelGlobals *kg, ShaderData *sd)
{
+ if(sd->flag & SD_HAS_ONLY_VOLUME)
+ return make_float3(1.0f, 1.0f, 1.0f);
+
float3 eval = make_float3(0.0f, 0.0f, 0.0f);
for(int i = 0; i< sd->num_closure; i++) {
@@ -797,8 +798,8 @@ ccl_device void shader_eval_surface(KernelGlobals *kg, ShaderData *sd,
#ifdef __SVM__
svm_eval_nodes(kg, sd, SHADER_TYPE_SURFACE, path_flag);
#else
- sd->closure.weight = make_float3(0.8f, 0.8f, 0.8f);
- sd->closure.N = sd->N;
+ sd->closure->weight = make_float3(0.8f, 0.8f, 0.8f);
+ sd->closure->N = sd->N;
sd->flag |= bsdf_diffuse_setup(&sd->closure);
#endif
}
@@ -857,7 +858,7 @@ ccl_device_inline void _shader_volume_phase_multi_eval(const ShaderData *sd, con
if(phase_pdf != 0.0f) {
bsdf_eval_accum(result_eval, sc->type, eval);
- sum_pdf += phase_pdf;
+ sum_pdf += phase_pdf*sc->sample_weight;
}
sum_sample_weight += sc->sample_weight;
@@ -1025,8 +1026,7 @@ ccl_device bool shader_transparent_shadow(KernelGlobals *kg, Intersection *isect
#ifdef __HAIR__
if(kernel_tex_fetch(__prim_type, isect->prim) & PRIMITIVE_ALL_TRIANGLE) {
#endif
- float4 Ns = kernel_tex_fetch(__tri_normal, prim);
- shader = __float_as_int(Ns.w);
+ shader = kernel_tex_fetch(__tri_shader, prim);
#ifdef __HAIR__
}
else {
diff --git a/intern/cycles/kernel/kernel_shadow.h b/intern/cycles/kernel/kernel_shadow.h
index ab7524c411a..61954282c28 100644
--- a/intern/cycles/kernel/kernel_shadow.h
+++ b/intern/cycles/kernel/kernel_shadow.h
@@ -64,18 +64,21 @@ ccl_device_inline bool shadow_blocked(KernelGlobals *kg, PathState *state, Ray *
bool blocked;
if(kernel_data.integrator.transparent_shadows) {
+ /* check transparent bounces here, for volume scatter which can do
+ * lighting before surface path termination is checked */
+ if(state->transparent_bounce >= kernel_data.integrator.transparent_max_bounce)
+ return true;
+
/* intersect to find an opaque surface, or record all transparent surface hits */
Intersection hits_stack[STACK_MAX_HITS];
- Intersection *hits;
+ Intersection *hits = hits_stack;
uint max_hits = kernel_data.integrator.transparent_max_bounce - state->transparent_bounce - 1;
/* prefer to use stack but use dynamic allocation if too deep max hits
* we need max_hits + 1 storage space due to the logic in
* scene_intersect_shadow_all which will first store and then check if
* the limit is exceeded */
- if(max_hits + 1 <= STACK_MAX_HITS)
- hits = hits_stack;
- else
+ if(max_hits + 1 > STACK_MAX_HITS)
hits = (Intersection*)malloc(sizeof(Intersection)*(max_hits + 1));
uint num_hits;
@@ -152,7 +155,11 @@ ccl_device_inline bool shadow_blocked(KernelGlobals *kg, PathState *state, Ray *
kernel_volume_shadow(kg, &ps, ray, &throughput);
#endif
- *shadow *= throughput;
+ *shadow = throughput;
+
+ if(hits != hits_stack)
+ free(hits);
+ return is_zero(throughput);
}
/* free dynamic storage */
@@ -161,11 +168,7 @@ ccl_device_inline bool shadow_blocked(KernelGlobals *kg, PathState *state, Ray *
}
else {
Intersection isect;
-#ifdef __HAIR__
blocked = scene_intersect(kg, ray, PATH_RAY_SHADOW_OPAQUE, &isect, NULL, 0.0f, 0.0f);
-#else
- blocked = scene_intersect(kg, ray, PATH_RAY_SHADOW_OPAQUE, &isect);
-#endif
}
#ifdef __VOLUME__
@@ -178,6 +181,8 @@ ccl_device_inline bool shadow_blocked(KernelGlobals *kg, PathState *state, Ray *
return blocked;
}
+#undef STACK_MAX_HITS
+
#else
/* Shadow function to compute how much light is blocked, GPU variation.
@@ -196,11 +201,7 @@ ccl_device_inline bool shadow_blocked(KernelGlobals *kg, PathState *state, Ray *
return false;
Intersection isect;
-#ifdef __HAIR__
bool blocked = scene_intersect(kg, ray, PATH_RAY_SHADOW_OPAQUE, &isect, NULL, 0.0f, 0.0f);
-#else
- bool blocked = scene_intersect(kg, ray, PATH_RAY_SHADOW_OPAQUE, &isect);
-#endif
#ifdef __TRANSPARENT_SHADOWS__
if(blocked && kernel_data.integrator.transparent_shadows) {
@@ -216,11 +217,7 @@ ccl_device_inline bool shadow_blocked(KernelGlobals *kg, PathState *state, Ray *
if(bounce >= kernel_data.integrator.transparent_max_bounce)
return true;
-#ifdef __HAIR__
if(!scene_intersect(kg, ray, PATH_RAY_SHADOW_TRANSPARENT, &isect, NULL, 0.0f, 0.0f))
-#else
- if(!scene_intersect(kg, ray, PATH_RAY_SHADOW_TRANSPARENT, &isect))
-#endif
{
#ifdef __VOLUME__
diff --git a/intern/cycles/kernel/kernel_sse2.cpp b/intern/cycles/kernel/kernel_sse2.cpp
index 2d5f6091908..740998e8c92 100644
--- a/intern/cycles/kernel/kernel_sse2.cpp
+++ b/intern/cycles/kernel/kernel_sse2.cpp
@@ -34,7 +34,7 @@
#include "kernel_globals.h"
#include "kernel_film.h"
#include "kernel_path.h"
-#include "kernel_displace.h"
+#include "kernel_bake.h"
CCL_NAMESPACE_BEGIN
@@ -64,9 +64,12 @@ void kernel_cpu_sse2_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, floa
/* Shader Evaluate */
-void kernel_cpu_sse2_shader(KernelGlobals *kg, uint4 *input, float4 *output, int type, int i)
+void kernel_cpu_sse2_shader(KernelGlobals *kg, uint4 *input, float4 *output, int type, int i, int offset, int sample)
{
- kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, i);
+ if(type >= SHADER_EVAL_BAKE)
+ kernel_bake_evaluate(kg, input, output, (ShaderEvalType)type, i, offset, sample);
+ else
+ kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, i, sample);
}
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_sse3.cpp b/intern/cycles/kernel/kernel_sse3.cpp
index 1062fd0c990..da73a3a1c97 100644
--- a/intern/cycles/kernel/kernel_sse3.cpp
+++ b/intern/cycles/kernel/kernel_sse3.cpp
@@ -36,7 +36,7 @@
#include "kernel_globals.h"
#include "kernel_film.h"
#include "kernel_path.h"
-#include "kernel_displace.h"
+#include "kernel_bake.h"
CCL_NAMESPACE_BEGIN
@@ -66,9 +66,12 @@ void kernel_cpu_sse3_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, floa
/* Shader Evaluate */
-void kernel_cpu_sse3_shader(KernelGlobals *kg, uint4 *input, float4 *output, int type, int i)
+void kernel_cpu_sse3_shader(KernelGlobals *kg, uint4 *input, float4 *output, int type, int i, int offset, int sample)
{
- kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, i);
+ if(type >= SHADER_EVAL_BAKE)
+ kernel_bake_evaluate(kg, input, output, (ShaderEvalType)type, i, offset, sample);
+ else
+ kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, i, sample);
}
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_sse41.cpp b/intern/cycles/kernel/kernel_sse41.cpp
index ba3b4887650..5704f60e138 100644
--- a/intern/cycles/kernel/kernel_sse41.cpp
+++ b/intern/cycles/kernel/kernel_sse41.cpp
@@ -37,7 +37,7 @@
#include "kernel_globals.h"
#include "kernel_film.h"
#include "kernel_path.h"
-#include "kernel_displace.h"
+#include "kernel_bake.h"
CCL_NAMESPACE_BEGIN
@@ -67,9 +67,12 @@ void kernel_cpu_sse41_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, flo
/* Shader Evaluate */
-void kernel_cpu_sse41_shader(KernelGlobals *kg, uint4 *input, float4 *output, int type, int i)
+void kernel_cpu_sse41_shader(KernelGlobals *kg, uint4 *input, float4 *output, int type, int i, int offset, int sample)
{
- kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, i);
+ if(type >= SHADER_EVAL_BAKE)
+ kernel_bake_evaluate(kg, input, output, (ShaderEvalType)type, i, offset, sample);
+ else
+ kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, i, sample);
}
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_textures.h b/intern/cycles/kernel/kernel_textures.h
index b07075c6c95..ef46b2f707f 100644
--- a/intern/cycles/kernel/kernel_textures.h
+++ b/intern/cycles/kernel/kernel_textures.h
@@ -36,7 +36,7 @@ KERNEL_TEX(float4, texture_float4, __objects)
KERNEL_TEX(float4, texture_float4, __objects_vector)
/* triangles */
-KERNEL_TEX(float4, texture_float4, __tri_normal)
+KERNEL_TEX(uint, texture_uint, __tri_shader)
KERNEL_TEX(float4, texture_float4, __tri_vnormal)
KERNEL_TEX(float4, texture_float4, __tri_vindex)
KERNEL_TEX(float4, texture_float4, __tri_verts)
@@ -49,6 +49,7 @@ KERNEL_TEX(float4, texture_float4, __curve_keys)
KERNEL_TEX(uint4, texture_uint4, __attributes_map)
KERNEL_TEX(float, texture_float, __attributes_float)
KERNEL_TEX(float4, texture_float4, __attributes_float3)
+KERNEL_TEX(uchar4, texture_uchar4, __attributes_uchar4)
/* lights */
KERNEL_TEX(float4, texture_float4, __light_distribution)
@@ -172,10 +173,9 @@ KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_095)
KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_096)
KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_097)
KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_098)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_099)
/* Kepler and above */
-#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 300)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_099)
KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_100)
KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_101)
KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_102)
@@ -227,7 +227,6 @@ KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_147)
KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_148)
KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_149)
KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_150)
-#endif
/* packed image (opencl) */
KERNEL_TEX(uchar4, texture_uchar4, __tex_image_packed)
diff --git a/intern/cycles/kernel/kernel_types.h b/intern/cycles/kernel/kernel_types.h
index 11445aa1c93..cfac8d1e905 100644
--- a/intern/cycles/kernel/kernel_types.h
+++ b/intern/cycles/kernel/kernel_types.h
@@ -38,12 +38,14 @@ CCL_NAMESPACE_BEGIN
#define BSSRDF_MIN_RADIUS 1e-8f
#define BSSRDF_MAX_HITS 4
-#define BB_DRAPPER 800.0f
+#define BB_DRAPER 800.0f
#define BB_MAX_TABLE_RANGE 12000.0f
#define BB_TABLE_XPOWER 1.5f
#define BB_TABLE_YPOWER 5.0f
#define BB_TABLE_SPACING 2.0f
+#define BECKMANN_TABLE_SIZE 256
+
#define TEX_NUM_FLOAT_IMAGES 5
#define SHADER_NONE (~0)
@@ -64,6 +66,8 @@ CCL_NAMESPACE_BEGIN
#define __SUBSURFACE__
#define __CMJ__
#define __VOLUME__
+#define __VOLUME_DECOUPLED__
+#define __VOLUME_SCATTER__
#define __SHADOW_RECORD_ALL__
#endif
@@ -71,10 +75,15 @@ CCL_NAMESPACE_BEGIN
#define __KERNEL_SHADING__
#define __KERNEL_ADV_SHADING__
#define __BRANCHED_PATH__
+#define __VOLUME__
+#define __VOLUME_SCATTER__
/* Experimental on GPU */
-//#define __VOLUME__
-//#define __SUBSURFACE__
+#ifdef __KERNEL_CUDA_EXPERIMENTAL__
+#define __SUBSURFACE__
+#define __CMJ__
+#endif
+
#endif
#ifdef __KERNEL_OPENCL__
@@ -101,7 +110,6 @@ CCL_NAMESPACE_BEGIN
#define __BACKGROUND_MIS__
#define __LAMP_MIS__
#define __AO__
-#define __ANISOTROPIC__
//#define __CAMERA_MOTION__
//#define __OBJECT_MOTION__
//#define __HAIR__
@@ -132,11 +140,9 @@ CCL_NAMESPACE_BEGIN
#ifdef __KERNEL_SHADING__
#define __SVM__
#define __EMISSION__
-#define __PROCEDURAL_TEXTURES__
-#define __IMAGE_TEXTURES__
+#define __TEXTURES__
#define __EXTRA_NODES__
#define __HOLDOUT__
-#define __NORMAL_MAP__
#endif
#ifdef __KERNEL_ADV_SHADING__
@@ -146,12 +152,15 @@ CCL_NAMESPACE_BEGIN
#define __BACKGROUND_MIS__
#define __LAMP_MIS__
#define __AO__
-#define __ANISOTROPIC__
#define __CAMERA_MOTION__
#define __OBJECT_MOTION__
#define __HAIR__
#endif
+#ifdef WITH_CYCLES_DEBUG
+# define __KERNEL_DEBUG__
+#endif
+
/* Random Numbers */
typedef uint RNG;
@@ -221,10 +230,9 @@ enum PathTraceDimension {
PRNG_PHASE_V = 9,
PRNG_PHASE = 10,
PRNG_SCATTER_DISTANCE = 11,
- PRNG_BOUNCE_NUM = 12,
-#else
- PRNG_BOUNCE_NUM = 8,
#endif
+
+ PRNG_BOUNCE_NUM = 12,
};
enum SamplingPattern {
@@ -250,17 +258,17 @@ enum PathRayFlag {
PATH_RAY_SHADOW_TRANSPARENT = 256,
PATH_RAY_SHADOW = (PATH_RAY_SHADOW_OPAQUE|PATH_RAY_SHADOW_TRANSPARENT),
- PATH_RAY_CURVE = 512, /* visibility flag to define curve segments*/
+ PATH_RAY_CURVE = 512, /* visibility flag to define curve segments */
+ PATH_RAY_VOLUME_SCATTER = 1024, /* volume scattering */
/* note that these can use maximum 12 bits, the other are for layers */
- PATH_RAY_ALL_VISIBILITY = (1|2|4|8|16|32|64|128|256|512),
+ PATH_RAY_ALL_VISIBILITY = (1|2|4|8|16|32|64|128|256|512|1024),
- PATH_RAY_MIS_SKIP = 1024,
- PATH_RAY_DIFFUSE_ANCESTOR = 2048,
- PATH_RAY_GLOSSY_ANCESTOR = 4096,
- PATH_RAY_BSSRDF_ANCESTOR = 8192,
- PATH_RAY_SINGLE_PASS_DONE = 16384,
- PATH_RAY_VOLUME_SCATTER = 32768,
+ PATH_RAY_MIS_SKIP = 2048,
+ PATH_RAY_DIFFUSE_ANCESTOR = 4096,
+ PATH_RAY_GLOSSY_ANCESTOR = 8192,
+ PATH_RAY_BSSRDF_ANCESTOR = 16384,
+ PATH_RAY_SINGLE_PASS_DONE = 32768,
/* we need layer member flags to be the 20 upper bits */
PATH_RAY_LAYER_SHIFT = (32-20)
@@ -283,32 +291,35 @@ typedef enum ClosureLabel {
typedef enum PassType {
PASS_NONE = 0,
- PASS_COMBINED = 1,
- PASS_DEPTH = 2,
- PASS_NORMAL = 4,
- PASS_UV = 8,
- PASS_OBJECT_ID = 16,
- PASS_MATERIAL_ID = 32,
- PASS_DIFFUSE_COLOR = 64,
- PASS_GLOSSY_COLOR = 128,
- PASS_TRANSMISSION_COLOR = 256,
- PASS_DIFFUSE_INDIRECT = 512,
- PASS_GLOSSY_INDIRECT = 1024,
- PASS_TRANSMISSION_INDIRECT = 2048,
- PASS_DIFFUSE_DIRECT = 4096,
- PASS_GLOSSY_DIRECT = 8192,
- PASS_TRANSMISSION_DIRECT = 16384,
- PASS_EMISSION = 32768,
- PASS_BACKGROUND = 65536,
- PASS_AO = 131072,
- PASS_SHADOW = 262144,
- PASS_MOTION = 524288,
- PASS_MOTION_WEIGHT = 1048576,
- PASS_MIST = 2097152,
- PASS_SUBSURFACE_DIRECT = 4194304,
- PASS_SUBSURFACE_INDIRECT = 8388608,
- PASS_SUBSURFACE_COLOR = 16777216,
- PASS_LIGHT = 33554432, /* no real pass, used to force use_light_pass */
+ PASS_COMBINED = (1 << 0),
+ PASS_DEPTH = (1 << 1),
+ PASS_NORMAL = (1 << 2),
+ PASS_UV = (1 << 3),
+ PASS_OBJECT_ID = (1 << 4),
+ PASS_MATERIAL_ID = (1 << 5),
+ PASS_DIFFUSE_COLOR = (1 << 6),
+ PASS_GLOSSY_COLOR = (1 << 7),
+ PASS_TRANSMISSION_COLOR = (1 << 8),
+ PASS_DIFFUSE_INDIRECT = (1 << 9),
+ PASS_GLOSSY_INDIRECT = (1 << 10),
+ PASS_TRANSMISSION_INDIRECT = (1 << 11),
+ PASS_DIFFUSE_DIRECT = (1 << 12),
+ PASS_GLOSSY_DIRECT = (1 << 13),
+ PASS_TRANSMISSION_DIRECT = (1 << 14),
+ PASS_EMISSION = (1 << 15),
+ PASS_BACKGROUND = (1 << 16),
+ PASS_AO = (1 << 17),
+ PASS_SHADOW = (1 << 18),
+ PASS_MOTION = (1 << 19),
+ PASS_MOTION_WEIGHT = (1 << 20),
+ PASS_MIST = (1 << 21),
+ PASS_SUBSURFACE_DIRECT = (1 << 22),
+ PASS_SUBSURFACE_INDIRECT = (1 << 23),
+ PASS_SUBSURFACE_COLOR = (1 << 24),
+ PASS_LIGHT = (1 << 25), /* no real pass, used to force use_light_pass */
+#ifdef __KERNEL_DEBUG__
+ PASS_BVH_TRAVERSAL_STEPS = (1 << 26),
+#endif
} PassType;
#define PASS_ALL (~0)
@@ -330,21 +341,25 @@ typedef struct PathRadiance {
float3 color_glossy;
float3 color_transmission;
float3 color_subsurface;
+ float3 color_scatter;
float3 direct_diffuse;
float3 direct_glossy;
float3 direct_transmission;
float3 direct_subsurface;
+ float3 direct_scatter;
float3 indirect_diffuse;
float3 indirect_glossy;
float3 indirect_transmission;
float3 indirect_subsurface;
+ float3 indirect_scatter;
float3 path_diffuse;
float3 path_glossy;
float3 path_transmission;
float3 path_subsurface;
+ float3 path_scatter;
float4 shadow;
float mist;
@@ -358,6 +373,7 @@ typedef struct BsdfEval {
float3 transmission;
float3 transparent;
float3 subsurface;
+ float3 scatter;
} BsdfEval;
#else
@@ -378,7 +394,8 @@ typedef enum ShaderFlag {
SHADER_EXCLUDE_GLOSSY = (1 << 26),
SHADER_EXCLUDE_TRANSMIT = (1 << 25),
SHADER_EXCLUDE_CAMERA = (1 << 24),
- SHADER_EXCLUDE_ANY = (SHADER_EXCLUDE_DIFFUSE|SHADER_EXCLUDE_GLOSSY|SHADER_EXCLUDE_TRANSMIT|SHADER_EXCLUDE_CAMERA),
+ SHADER_EXCLUDE_SCATTER = (1 << 23),
+ SHADER_EXCLUDE_ANY = (SHADER_EXCLUDE_DIFFUSE|SHADER_EXCLUDE_GLOSSY|SHADER_EXCLUDE_TRANSMIT|SHADER_EXCLUDE_CAMERA|SHADER_EXCLUDE_SCATTER),
SHADER_MASK = ~(SHADER_SMOOTH_NORMAL|SHADER_CAST_SHADOW|SHADER_AREA_LIGHT|SHADER_USE_MIS|SHADER_EXCLUDE_ANY)
} ShaderFlag;
@@ -390,10 +407,8 @@ typedef enum LightType {
LIGHT_DISTANT,
LIGHT_BACKGROUND,
LIGHT_AREA,
- LIGHT_AO,
LIGHT_SPOT,
- LIGHT_TRIANGLE,
- LIGHT_STRAND
+ LIGHT_TRIANGLE
} LightType;
/* Camera Type */
@@ -445,6 +460,10 @@ typedef struct Intersection {
int prim;
int object;
int type;
+
+#ifdef __KERNEL_DEBUG__
+ int num_traversal_steps;
+#endif
} Intersection;
/* Primitives */
@@ -478,6 +497,7 @@ typedef enum AttributeElement {
ATTR_ELEMENT_VERTEX,
ATTR_ELEMENT_VERTEX_MOTION,
ATTR_ELEMENT_CORNER,
+ ATTR_ELEMENT_CORNER_BYTE,
ATTR_ELEMENT_CURVE,
ATTR_ELEMENT_CURVE_KEY,
ATTR_ELEMENT_CURVE_KEY_MOTION,
@@ -519,24 +539,32 @@ typedef enum AttributeStandard {
#define MAX_CLOSURE 1
#endif
+/* TODO(sergey): There is a rather nasty bug happening in here, which
+ * could simply be a compiler bug for which we can't find a generic,
+ * platform-independent workaround. Also, even if it's a compiler
+ * issue, it's not so simple to upgrade the compiler in the release
+ * environment for Linux, and doing so this close to the release is
+ * rather risky business.
+ *
+ * For this release it's probably safer to stick with such a rather
+ * dirty solution, and look for a cleaner fix during the next release
+ * cycle.
+ */
typedef struct ShaderClosure {
ClosureType type;
float3 weight;
-
+#ifndef __APPLE__
float sample_weight;
-
+#endif
float data0;
float data1;
+ float data2;
float3 N;
-#if defined(__ANISOTROPIC__) || defined(__SUBSURFACE__) || defined(__HAIR__)
float3 T;
+#ifdef __APPLE__
+ float sample_weight;
#endif
-
-#ifdef __HAIR__
- float offset;
-#endif
-
#ifdef __OSL__
void *prim;
#endif
@@ -563,37 +591,49 @@ typedef enum ShaderContext {
enum ShaderDataFlag {
/* runtime flags */
- SD_BACKFACING = 1, /* backside of surface? */
- SD_EMISSION = 2, /* have emissive closure? */
- SD_BSDF = 4, /* have bsdf closure? */
- SD_BSDF_HAS_EVAL = 8, /* have non-singular bsdf closure? */
- SD_PHASE_HAS_EVAL = 8, /* have non-singular phase closure? */
- SD_BSDF_GLOSSY = 16, /* have glossy bsdf */
- SD_BSSRDF = 32, /* have bssrdf */
- SD_HOLDOUT = 64, /* have holdout closure? */
- SD_ABSORPTION = 128, /* have volume absorption closure? */
- SD_SCATTER = 256, /* have volume phase closure? */
- SD_AO = 512, /* have ao closure? */
- SD_TRANSPARENT = 1024, /* have transparent closure? */
-
- SD_CLOSURE_FLAGS = (SD_EMISSION|SD_BSDF|SD_BSDF_HAS_EVAL|SD_BSDF_GLOSSY|SD_BSSRDF|SD_HOLDOUT|SD_ABSORPTION|SD_SCATTER|SD_AO),
+ SD_BACKFACING = (1 << 0), /* backside of surface? */
+ SD_EMISSION = (1 << 1), /* have emissive closure? */
+ SD_BSDF = (1 << 2), /* have bsdf closure? */
+ SD_BSDF_HAS_EVAL = (1 << 3), /* have non-singular bsdf closure? */
+ SD_PHASE_HAS_EVAL = (1 << 3), /* have non-singular phase closure? */
+ SD_BSDF_GLOSSY = (1 << 4), /* have glossy bsdf */
+ SD_BSSRDF = (1 << 5), /* have bssrdf */
+ SD_HOLDOUT = (1 << 6), /* have holdout closure? */
+ SD_ABSORPTION = (1 << 7), /* have volume absorption closure? */
+ SD_SCATTER = (1 << 8), /* have volume phase closure? */
+ SD_AO = (1 << 9), /* have ao closure? */
+ SD_TRANSPARENT = (1 << 10), /* have transparent closure? */
+
+ SD_CLOSURE_FLAGS = (SD_EMISSION|SD_BSDF|SD_BSDF_HAS_EVAL|SD_BSDF_GLOSSY|
+ SD_BSSRDF|SD_HOLDOUT|SD_ABSORPTION|SD_SCATTER|SD_AO),
/* shader flags */
- SD_USE_MIS = 2048, /* direct light sample */
- SD_HAS_TRANSPARENT_SHADOW = 4096, /* has transparent shadow */
- SD_HAS_VOLUME = 8192, /* has volume shader */
- SD_HAS_ONLY_VOLUME = 16384, /* has only volume shader, no surface */
- SD_HETEROGENEOUS_VOLUME = 32768, /* has heterogeneous volume */
- SD_HAS_BSSRDF_BUMP = 65536, /* bssrdf normal uses bump */
-
- SD_SHADER_FLAGS = (SD_USE_MIS|SD_HAS_TRANSPARENT_SHADOW|SD_HAS_VOLUME|SD_HAS_ONLY_VOLUME|SD_HETEROGENEOUS_VOLUME|SD_HAS_BSSRDF_BUMP),
+ SD_USE_MIS = (1 << 11), /* direct light sample */
+ SD_HAS_TRANSPARENT_SHADOW = (1 << 12), /* has transparent shadow */
+ SD_HAS_VOLUME = (1 << 13), /* has volume shader */
+ SD_HAS_ONLY_VOLUME = (1 << 14), /* has only volume shader, no surface */
+ SD_HETEROGENEOUS_VOLUME = (1 << 15), /* has heterogeneous volume */
+ SD_HAS_BSSRDF_BUMP = (1 << 16), /* bssrdf normal uses bump */
+ SD_VOLUME_EQUIANGULAR = (1 << 17), /* use equiangular sampling */
+ SD_VOLUME_MIS = (1 << 18), /* use multiple importance sampling */
+ SD_VOLUME_CUBIC = (1 << 19), /* use cubic interpolation for voxels */
+
+ SD_SHADER_FLAGS = (SD_USE_MIS|SD_HAS_TRANSPARENT_SHADOW|SD_HAS_VOLUME|
+ SD_HAS_ONLY_VOLUME|SD_HETEROGENEOUS_VOLUME|
+ SD_HAS_BSSRDF_BUMP|SD_VOLUME_EQUIANGULAR|SD_VOLUME_MIS|
+ SD_VOLUME_CUBIC),
/* object flags */
- SD_HOLDOUT_MASK = 131072, /* holdout for camera rays */
- SD_OBJECT_MOTION = 262144, /* has object motion blur */
- SD_TRANSFORM_APPLIED = 524288, /* vertices have transform applied */
-
- SD_OBJECT_FLAGS = (SD_HOLDOUT_MASK|SD_OBJECT_MOTION|SD_TRANSFORM_APPLIED)
+ SD_HOLDOUT_MASK = (1 << 20), /* holdout for camera rays */
+ SD_OBJECT_MOTION = (1 << 21), /* has object motion blur */
+ SD_TRANSFORM_APPLIED = (1 << 22), /* vertices have transform applied */
+ SD_NEGATIVE_SCALE_APPLIED = (1 << 23), /* vertices have negative scale applied */
+ SD_OBJECT_HAS_VOLUME = (1 << 24), /* object has a volume shader */
+ SD_OBJECT_INTERSECTS_VOLUME = (1 << 25), /* object intersects AABB of an object with volume shader */
+
+ SD_OBJECT_FLAGS = (SD_HOLDOUT_MASK|SD_OBJECT_MOTION|SD_TRANSFORM_APPLIED|
+ SD_NEGATIVE_SCALE_APPLIED|SD_OBJECT_HAS_VOLUME|
+ SD_OBJECT_INTERSECTS_VOLUME)
};
struct KernelGlobals;
@@ -686,9 +726,10 @@ typedef struct PathState {
int flag;
/* random number generator state */
- int rng_offset; /* dimension offset */
- int sample; /* path sample number */
- int num_samples; /* total number of times this path will be sampled */
+ int rng_offset; /* dimension offset */
+ int rng_offset_bsdf; /* dimension offset for picking bsdf */
+ int sample; /* path sample number */
+ int num_samples; /* total number of times this path will be sampled */
/* bounce counting */
int bounce;
@@ -756,9 +797,12 @@ typedef struct KernelCamera {
/* render size */
float width, height;
int resolution;
- int pad1;
+
+ /* anamorphic lens bokeh */
+ float inv_aperture_ratio;
+
+ int is_inside_volume;
int pad2;
- int pad3;
/* more matrices */
Transform screentoworld;
@@ -819,6 +863,11 @@ typedef struct KernelFilm {
float mist_start;
float mist_inv_depth;
float mist_falloff;
+
+#ifdef __KERNEL_DEBUG__
+ int pass_bvh_traversal_steps;
+ int pass_pad3, pass_pad4, pass_pad5;
+#endif
} KernelFilm;
typedef struct KernelBackground {
@@ -860,7 +909,8 @@ typedef struct KernelIntegrator {
int transparent_shadows;
/* caustics */
- int no_caustics;
+ int caustics_reflective;
+ int caustics_refractive;
float filter_glossy;
/* seed */
@@ -892,7 +942,6 @@ typedef struct KernelIntegrator {
int aa_samples;
/* volume render */
- int volume_homogeneous_sampling;
int use_volumes;
int volume_max_steps;
float volume_step_size;
@@ -922,7 +971,6 @@ typedef enum CurveFlag {
} CurveFlag;
typedef struct KernelCurves {
- /* strand intersect and normal parameters - many can be changed to flags */
int curveflags;
int subdivisions;
@@ -930,11 +978,11 @@ typedef struct KernelCurves {
float maximum_width;
} KernelCurves;
-typedef struct KernelBlackbody {
- int table_offset;
- int pad1, pad2, pad3;
-} KernelBlackbody;
-
+typedef struct KernelTables {
+ int blackbody_offset;
+ int beckmann_offset;
+ int pad1, pad2;
+} KernelTables;
typedef struct KernelData {
KernelCamera cam;
@@ -943,9 +991,17 @@ typedef struct KernelData {
KernelIntegrator integrator;
KernelBVH bvh;
KernelCurves curve;
- KernelBlackbody blackbody;
+ KernelTables tables;
} KernelData;
+#ifdef __KERNEL_DEBUG__
+typedef struct DebugData {
+ // Total number of BVH node traversal steps and primitive intersections
+ // for the camera rays.
+ int num_bvh_traversal_steps;
+} DebugData;
+#endif
+
CCL_NAMESPACE_END
#endif /* __KERNEL_TYPES_H__ */
diff --git a/intern/cycles/kernel/kernel_volume.h b/intern/cycles/kernel/kernel_volume.h
index faaa68e3309..ce20f20e75a 100644
--- a/intern/cycles/kernel/kernel_volume.h
+++ b/intern/cycles/kernel/kernel_volume.h
@@ -116,6 +116,36 @@ ccl_device bool volume_stack_is_heterogeneous(KernelGlobals *kg, VolumeStack *st
return false;
}
+ccl_device int volume_stack_sampling_method(KernelGlobals *kg, VolumeStack *stack)
+{
+ if(kernel_data.integrator.num_all_lights == 0)
+ return 0;
+
+ int method = -1;
+
+ for(int i = 0; stack[i].shader != SHADER_NONE; i++) {
+ int shader_flag = kernel_tex_fetch(__shader_flag, (stack[i].shader & SHADER_MASK)*2);
+
+ if(shader_flag & SD_VOLUME_MIS) {
+ return SD_VOLUME_MIS;
+ }
+ else if(shader_flag & SD_VOLUME_EQUIANGULAR) {
+ if(method == 0)
+ return SD_VOLUME_MIS;
+
+ method = SD_VOLUME_EQUIANGULAR;
+ }
+ else {
+ if(method == SD_VOLUME_EQUIANGULAR)
+ return SD_VOLUME_MIS;
+
+ method = 0;
+ }
+ }
+
+ return method;
+}
+
/* Volume Shadows
*
* These functions are used to attenuate shadow rays to lights. Both absorption
@@ -136,7 +166,7 @@ ccl_device void kernel_volume_shadow_homogeneous(KernelGlobals *kg, PathState *s
ccl_device void kernel_volume_shadow_heterogeneous(KernelGlobals *kg, PathState *state, Ray *ray, ShaderData *sd, float3 *throughput)
{
float3 tp = *throughput;
- const float tp_eps = 1e-10f; /* todo: this is likely not the right value */
+ const float tp_eps = 1e-6f; /* todo: this is likely not the right value */
/* prepare for stepping */
int max_steps = kernel_data.integrator.volume_max_steps;
@@ -146,6 +176,8 @@ ccl_device void kernel_volume_shadow_heterogeneous(KernelGlobals *kg, PathState
/* compute extinction at the start */
float t = 0.0f;
+ float3 sum = make_float3(0.0f, 0.0f, 0.0f);
+
for(int i = 0; i < max_steps; i++) {
/* advance to new position */
float new_t = min(ray->t, (i+1) * step);
@@ -160,20 +192,26 @@ ccl_device void kernel_volume_shadow_heterogeneous(KernelGlobals *kg, PathState
/* compute attenuation over segment */
if(volume_shader_extinction_sample(kg, sd, state, new_P, &sigma_t)) {
- /* todo: we could avoid computing expf() for each step by summing,
- * because exp(a)*exp(b) = exp(a+b), but we still want a quick
- * tp_eps check too */
- tp *= volume_color_transmittance(sigma_t, new_t - t);
-
- /* stop if nearly all light blocked */
- if(tp.x < tp_eps && tp.y < tp_eps && tp.z < tp_eps)
- break;
+ /* Compute expf() only for every Nth step to save some calculations,
+ * since exp(a)*exp(b) = exp(a+b); also do a quick tp_eps check then. */
+
+ sum += (-sigma_t * (new_t - t));
+ if((i & 0x07) == 0) { /* ToDo: Other interval? */
+ tp = *throughput * make_float3(expf(sum.x), expf(sum.y), expf(sum.z));
+
+ /* stop if nearly all light is blocked */
+ if(tp.x < tp_eps && tp.y < tp_eps && tp.z < tp_eps)
+ break;
+ }
}
/* stop if at the end of the volume */
t = new_t;
- if(t == ray->t)
+ if(t == ray->t) {
+ /* Update throughput in case we haven't done it above */
+ tp = *throughput * make_float3(expf(sum.x), expf(sum.y), expf(sum.z));
break;
+ }
}
*throughput = tp;
@@ -226,33 +264,6 @@ ccl_device float kernel_volume_equiangular_pdf(Ray *ray, float3 light_P, float s
return pdf;
}
-ccl_device bool kernel_volume_equiangular_light_position(KernelGlobals *kg, PathState *state, Ray *ray, RNG *rng, float3 *light_P)
-{
- /* light RNGs */
- float light_t = path_state_rng_1D(kg, rng, state, PRNG_LIGHT);
- float light_u, light_v;
- path_state_rng_2D(kg, rng, state, PRNG_LIGHT_U, &light_u, &light_v);
-
- /* light sample */
- LightSample ls;
- light_sample(kg, light_t, light_u, light_v, ray->time, ray->P, &ls);
- if(ls.pdf == 0.0f)
- return false;
-
- *light_P = ls.P;
- return true;
-}
-
-ccl_device float kernel_volume_decoupled_equiangular_pdf(KernelGlobals *kg, PathState *state, Ray *ray, RNG *rng, float sample_t)
-{
- float3 light_P;
-
- if(!kernel_volume_equiangular_light_position(kg, state, ray, rng, &light_P))
- return 0.0f;
-
- return kernel_volume_equiangular_pdf(ray, light_P, sample_t);
-}
-
/* Distance sampling */
ccl_device float kernel_volume_distance_sample(float max_t, float3 sigma_t, int channel, float xi, float3 *transmittance, float3 *pdf)
@@ -312,7 +323,7 @@ ccl_device float3 kernel_volume_emission_integrate(VolumeShaderCoefficients *coe
* the volume shading coefficient for the entire line segment */
ccl_device VolumeIntegrateResult kernel_volume_integrate_homogeneous(KernelGlobals *kg,
PathState *state, Ray *ray, ShaderData *sd, PathRadiance *L, float3 *throughput,
- RNG *rng)
+ RNG *rng, bool probalistic_scatter)
{
VolumeShaderCoefficients coeff;
@@ -323,6 +334,7 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_homogeneous(KernelGloba
float t = ray->t;
float3 new_tp;
+#ifdef __VOLUME_SCATTER__
/* randomly scatter, and if we do t is shortened */
if(closure_flag & SD_SCATTER) {
/* extinction coefficient */
@@ -330,43 +342,41 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_homogeneous(KernelGloba
/* pick random color channel, we use the Veach one-sample
* model with balance heuristic for the channels */
- float rphase = path_state_rng_1D(kg, rng, state, PRNG_PHASE);
+ float rphase = path_state_rng_1D_for_decision(kg, rng, state, PRNG_PHASE);
int channel = (int)(rphase*3.0f);
sd->randb_closure = rphase*3.0f - channel;
- float xi = path_state_rng_1D(kg, rng, state, PRNG_SCATTER_DISTANCE);
-
/* decide if we will hit or miss */
- float sample_sigma_t = kernel_volume_channel_get(sigma_t, channel);
- float sample_transmittance = expf(-sample_sigma_t * t);
+ bool scatter = true;
+ float xi = path_state_rng_1D_for_decision(kg, rng, state, PRNG_SCATTER_DISTANCE);
+
+ if(probalistic_scatter) {
+ float sample_sigma_t = kernel_volume_channel_get(sigma_t, channel);
+ float sample_transmittance = expf(-sample_sigma_t * t);
+
+ if(1.0f - xi >= sample_transmittance) {
+ scatter = true;
+
+ /* rescale random number so we can reuse it */
+ xi = 1.0f - (1.0f - xi - sample_transmittance)/(1.0f - sample_transmittance);
- if(xi >= sample_transmittance) {
+ }
+ else
+ scatter = false;
+ }
+
+ if(scatter) {
/* scattering */
float3 pdf;
float3 transmittance;
float sample_t;
- /* rescale random number so we can reuse it */
- xi = (xi - sample_transmittance)/(1.0f - sample_transmittance);
-
- if(kernel_data.integrator.volume_homogeneous_sampling == 0 || !kernel_data.integrator.num_all_lights) {
- /* distance sampling */
- sample_t = kernel_volume_distance_sample(ray->t, sigma_t, channel, xi, &transmittance, &pdf);
- }
- else {
- /* equiangular sampling */
- float3 light_P;
- float equi_pdf;
- if(!kernel_volume_equiangular_light_position(kg, state, ray, rng, &light_P))
- return VOLUME_PATH_MISSED;
-
- sample_t = kernel_volume_equiangular_sample(ray, light_P, xi, &equi_pdf);
- transmittance = volume_color_transmittance(sigma_t, sample_t);
- pdf = make_float3(equi_pdf, equi_pdf, equi_pdf);
- }
+ /* distance sampling */
+ sample_t = kernel_volume_distance_sample(ray->t, sigma_t, channel, xi, &transmittance, &pdf);
/* modifiy pdf for hit/miss decision */
- pdf *= make_float3(1.0f, 1.0f, 1.0f) - volume_color_transmittance(sigma_t, t);
+ if(probalistic_scatter)
+ pdf *= make_float3(1.0f, 1.0f, 1.0f) - volume_color_transmittance(sigma_t, t);
new_tp = *throughput * coeff.sigma_s * transmittance / average(pdf);
t = sample_t;
@@ -378,14 +388,16 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_homogeneous(KernelGloba
new_tp = *throughput * transmittance / pdf;
}
}
- else if(closure_flag & SD_ABSORPTION) {
+ else
+#endif
+ if(closure_flag & SD_ABSORPTION) {
/* absorption only, no sampling needed */
float3 transmittance = volume_color_transmittance(coeff.sigma_a, t);
new_tp = *throughput * transmittance;
}
/* integrate emission attenuated by extinction */
- if(closure_flag & SD_EMISSION) {
+ if(L && (closure_flag & SD_EMISSION)) {
float3 sigma_t = coeff.sigma_a + coeff.sigma_s;
float3 transmittance = volume_color_transmittance(sigma_t, ray->t);
float3 emission = kernel_volume_emission_integrate(&coeff, closure_flag, transmittance, ray->t);
@@ -408,13 +420,15 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_homogeneous(KernelGloba
return VOLUME_PATH_ATTENUATED;
}
-/* heterogeneous volume: integrate stepping through the volume until we
- * reach the end, get absorbed entirely, or run out of iterations */
-ccl_device VolumeIntegrateResult kernel_volume_integrate_heterogeneous(KernelGlobals *kg,
+/* heterogeneous volume distance sampling: integrate stepping through the
+ * volume until we reach the end, get absorbed entirely, or run out of
+ * iterations. this does probalistically scatter or get transmitted through
+ * for path tracing where we don't want to branch. */
+ccl_device VolumeIntegrateResult kernel_volume_integrate_heterogeneous_distance(KernelGlobals *kg,
PathState *state, Ray *ray, ShaderData *sd, PathRadiance *L, float3 *throughput, RNG *rng)
{
float3 tp = *throughput;
- const float tp_eps = 1e-10f; /* todo: this is likely not the right value */
+ const float tp_eps = 1e-6f; /* todo: this is likely not the right value */
/* prepare for stepping */
int max_steps = kernel_data.integrator.volume_max_steps;
@@ -425,9 +439,12 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_heterogeneous(KernelGlo
float t = 0.0f;
float3 accum_transmittance = make_float3(1.0f, 1.0f, 1.0f);
- /* cache some constant variables */
- float xi;
- int channel = -1;
+ /* pick random color channel, we use the Veach one-sample
+ * model with balance heuristic for the channels */
+ float xi = path_state_rng_1D_for_decision(kg, rng, state, PRNG_SCATTER_DISTANCE);
+ float rphase = path_state_rng_1D_for_decision(kg, rng, state, PRNG_PHASE);
+ int channel = (int)(rphase*3.0f);
+ sd->randb_closure = rphase*3.0f - channel;
bool has_scatter = false;
for(int i = 0; i < max_steps; i++) {
@@ -449,25 +466,14 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_heterogeneous(KernelGlo
float3 transmittance;
bool scatter = false;
- /* randomly scatter, and if we do dt and new_t are shortened */
+ /* distance sampling */
+#ifdef __VOLUME_SCATTER__
if((closure_flag & SD_SCATTER) || (has_scatter && (closure_flag & SD_ABSORPTION))) {
has_scatter = true;
- /* average sigma_t and sigma_s over segment */
float3 sigma_t = coeff.sigma_a + coeff.sigma_s;
float3 sigma_s = coeff.sigma_s;
- /* lazily set up variables for sampling */
- if(channel == -1) {
- /* pick random color channel, we use the Veach one-sample
- * model with balance heuristic for the channels */
- xi = path_state_rng_1D(kg, rng, state, PRNG_SCATTER_DISTANCE);
-
- float rphase = path_state_rng_1D(kg, rng, state, PRNG_PHASE);
- channel = (int)(rphase*3.0f);
- sd->randb_closure = rphase*3.0f - channel;
- }
-
/* compute transmittance over full step */
transmittance = volume_color_transmittance(sigma_t, dt);
@@ -480,10 +486,12 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_heterogeneous(KernelGlo
float new_dt = -logf(1.0f - xi)/sample_sigma_t;
new_t = t + new_dt;
- /* transmittance, throughput */
+ /* transmittance and pdf */
float3 new_transmittance = volume_color_transmittance(sigma_t, new_dt);
- float pdf = average(sigma_t * new_transmittance);
- new_tp = tp * sigma_s * new_transmittance / pdf;
+ float3 pdf = sigma_t * new_transmittance;
+
+ /* throughput */
+ new_tp = tp * sigma_s * new_transmittance / average(pdf);
scatter = true;
}
else {
@@ -495,7 +503,9 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_heterogeneous(KernelGlo
xi = 1.0f - (1.0f - xi)/sample_transmittance;
}
}
- else if(closure_flag & SD_ABSORPTION) {
+ else
+#endif
+ if(closure_flag & SD_ABSORPTION) {
/* absorption only, no sampling needed */
float3 sigma_a = coeff.sigma_a;
@@ -504,7 +514,7 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_heterogeneous(KernelGlo
}
/* integrate emission attenuated by absorption */
- if(closure_flag & SD_EMISSION) {
+ if(L && (closure_flag & SD_EMISSION)) {
float3 emission = kernel_volume_emission_integrate(&coeff, closure_flag, transmittance, dt);
path_radiance_accum_emission(L, tp, emission, state->bounce);
}
@@ -518,19 +528,19 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_heterogeneous(KernelGlo
tp = make_float3(0.0f, 0.0f, 0.0f);
break;
}
+ }
- /* prepare to scatter to new direction */
- if(scatter) {
- /* adjust throughput and move to new location */
- sd->P = ray->P + new_t*ray->D;
- *throughput = tp;
+ /* prepare to scatter to new direction */
+ if(scatter) {
+ /* adjust throughput and move to new location */
+ sd->P = ray->P + new_t*ray->D;
+ *throughput = tp;
- return VOLUME_PATH_SCATTERED;
- }
- else {
- /* accumulate transmittance */
- accum_transmittance *= transmittance;
- }
+ return VOLUME_PATH_SCATTERED;
+ }
+ else {
+ /* accumulate transmittance */
+ accum_transmittance *= transmittance;
}
}
@@ -545,14 +555,34 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_heterogeneous(KernelGlo
return VOLUME_PATH_ATTENUATED;
}
+/* get the volume attenuation and emission over line segment defined by
+ * ray, with the assumption that there are no surfaces blocking light
+ * between the endpoints. distance sampling is used to decide if we will
+ * scatter or not. */
+ccl_device_noinline VolumeIntegrateResult kernel_volume_integrate(KernelGlobals *kg,
+ PathState *state, ShaderData *sd, Ray *ray, PathRadiance *L, float3 *throughput, RNG *rng, bool heterogeneous)
+{
+ /* workaround to fix correlation bug in T38710, can find better solution
+ * in random number generator later, for now this is done here to not impact
+ * performance of rendering without volumes */
+ RNG tmp_rng = cmj_hash(*rng, state->rng_offset);
+
+ shader_setup_from_volume(kg, sd, ray, state->bounce, state->transparent_bounce);
+
+ if(heterogeneous)
+ return kernel_volume_integrate_heterogeneous_distance(kg, state, ray, sd, L, throughput, &tmp_rng);
+ else
+ return kernel_volume_integrate_homogeneous(kg, state, ray, sd, L, throughput, &tmp_rng, true);
+}
+
/* Decoupled Volume Sampling
*
* VolumeSegment is list of coefficients and transmittance stored at all steps
* through a volume. This can then latter be used for decoupled sampling as in:
- * "Importance Sampling Techniques for Path Tracing in Participating Media" */
-
-/* CPU only because of malloc/free */
-#ifdef __KERNEL_CPU__
+ * "Importance Sampling Techniques for Path Tracing in Participating Media"
+ *
+ * On the GPU this is only supported for homogeneous volumes (1 step), due to
+ * no support for malloc/free and too much stack usage with a fix size array. */
typedef struct VolumeStep {
float3 sigma_s; /* scatter coefficient */
@@ -571,6 +601,8 @@ typedef struct VolumeSegment {
float3 accum_emission; /* accumulated emission at end of segment */
float3 accum_transmittance; /* accumulated transmittance at end of segment */
+
+ int sampling_method; /* volume sampling method */
} VolumeSegment;
/* record volume steps to the end of the volume.
@@ -578,10 +610,12 @@ typedef struct VolumeSegment {
* it would be nice if we could only record up to the point that we need to scatter,
* but the entire segment is needed to do always scattering, rather than probalistically
* hitting or missing the volume. if we don't know the transmittance at the end of the
- * volume we can't generate stratitied distance samples up to that transmittance */
+ * volume we can't generate stratified distance samples up to that transmittance */
ccl_device void kernel_volume_decoupled_record(KernelGlobals *kg, PathState *state,
Ray *ray, ShaderData *sd, VolumeSegment *segment, bool heterogeneous)
{
+ const float tp_eps = 1e-6f; /* todo: this is likely not the right value */
+
/* prepare for volume stepping */
int max_steps;
float step_size, random_jitter_offset;
@@ -608,6 +642,7 @@ ccl_device void kernel_volume_decoupled_record(KernelGlobals *kg, PathState *sta
segment->closure_flag = 0;
segment->numsteps = 0;
+
segment->steps = (VolumeStep*)malloc(sizeof(VolumeStep)*max_steps);
VolumeStep *step = segment->steps;
@@ -669,6 +704,10 @@ ccl_device void kernel_volume_decoupled_record(KernelGlobals *kg, PathState *sta
t = new_t;
if(t == ray->t)
break;
+
+ /* stop if nearly all light blocked */
+ if(accum_transmittance.x < tp_eps && accum_transmittance.y < tp_eps && accum_transmittance.z < tp_eps)
+ break;
}
/* store total emission and transmittance */
@@ -698,35 +737,70 @@ ccl_device void kernel_volume_decoupled_free(KernelGlobals *kg, VolumeSegment *s
* scattering, they always scatter if there is any non-zero scattering
* coefficient.
*
- * these also do not do emission or modify throughput. */
+ * these also do not do emission or modify throughput.
+ *
+ * function is expected to return VOLUME_PATH_SCATTERED when probalistic_scatter is false */
ccl_device VolumeIntegrateResult kernel_volume_decoupled_scatter(
KernelGlobals *kg, PathState *state, Ray *ray, ShaderData *sd,
- float3 *throughput, RNG *rng, VolumeSegment *segment)
+ float3 *throughput, float rphase, float rscatter,
+ const VolumeSegment *segment, const float3 *light_P, bool probalistic_scatter)
{
- int closure_flag = segment->closure_flag;
-
- if(!(closure_flag & SD_SCATTER))
- return VOLUME_PATH_MISSED;
+ kernel_assert(segment->closure_flag & SD_SCATTER);
/* pick random color channel, we use the Veach one-sample
* model with balance heuristic for the channels */
- float rphase = path_state_rng_1D(kg, rng, state, PRNG_PHASE);
int channel = (int)(rphase*3.0f);
sd->randb_closure = rphase*3.0f - channel;
+ float xi = rscatter;
- float xi = path_state_rng_1D(kg, rng, state, PRNG_SCATTER_DISTANCE);
+ /* probalistic scattering decision based on transmittance */
+ if(probalistic_scatter) {
+ float sample_transmittance = kernel_volume_channel_get(segment->accum_transmittance, channel);
+
+ if(1.0f - xi >= sample_transmittance) {
+ /* rescale random number so we can reuse it */
+ xi = 1.0f - (1.0f - xi - sample_transmittance)/(1.0f - sample_transmittance);
+ }
+ else {
+ *throughput /= sample_transmittance;
+ return VOLUME_PATH_MISSED;
+ }
+ }
VolumeStep *step;
float3 transmittance;
float pdf, sample_t;
+ float mis_weight = 1.0f;
+ bool distance_sample = true;
+ bool use_mis = false;
+
+ if(segment->sampling_method && light_P) {
+ if(segment->sampling_method == SD_VOLUME_MIS) {
+ /* multiple importance sample: randomly pick between
+ * equiangular and distance sampling strategy */
+ if(xi < 0.5f) {
+ xi *= 2.0f;
+ }
+ else {
+ xi = (xi - 0.5f)*2.0f;
+ distance_sample = false;
+ }
+
+ use_mis = true;
+ }
+ else {
+ /* only equiangular sampling */
+ distance_sample = false;
+ }
+ }
/* distance sampling */
- if(kernel_data.integrator.volume_homogeneous_sampling == 0 || !kernel_data.integrator.num_all_lights) {
+ if(distance_sample) {
/* find step in cdf */
step = segment->steps;
float prev_t = 0.0f;
- float3 step_pdf = make_float3(1.0f, 1.0f, 1.0f);
+ float3 step_pdf_distance = make_float3(1.0f, 1.0f, 1.0f);
if(segment->numsteps > 1) {
float prev_cdf = 0.0f;
@@ -749,7 +823,7 @@ ccl_device VolumeIntegrateResult kernel_volume_decoupled_scatter(
xi = (xi - prev_cdf)/(step_cdf - prev_cdf);
/* pdf for picking step */
- step_pdf = step->cdf_distance - prev_cdf_distance;
+ step_pdf_distance = step->cdf_distance - prev_cdf_distance;
}
/* determine range in which we will sample */
@@ -758,35 +832,77 @@ ccl_device VolumeIntegrateResult kernel_volume_decoupled_scatter(
/* sample distance and compute transmittance */
float3 distance_pdf;
sample_t = prev_t + kernel_volume_distance_sample(step_t, step->sigma_t, channel, xi, &transmittance, &distance_pdf);
- pdf = average(distance_pdf * step_pdf);
+
+ /* modifiy pdf for hit/miss decision */
+ if(probalistic_scatter)
+ distance_pdf *= make_float3(1.0f, 1.0f, 1.0f) - segment->accum_transmittance;
+
+ pdf = average(distance_pdf * step_pdf_distance);
+
+ /* multiple importance sampling */
+ if(use_mis) {
+ float equi_pdf = kernel_volume_equiangular_pdf(ray, *light_P, sample_t);
+ mis_weight = 2.0f*power_heuristic(pdf, equi_pdf);
+ }
}
/* equi-angular sampling */
else {
- /* pick position on light */
- float3 light_P;
- if(!kernel_volume_equiangular_light_position(kg, state, ray, rng, &light_P))
- return VOLUME_PATH_MISSED;
-
/* sample distance */
- sample_t = kernel_volume_equiangular_sample(ray, light_P, xi, &pdf);
+ sample_t = kernel_volume_equiangular_sample(ray, *light_P, xi, &pdf);
/* find step in which sampled distance is located */
step = segment->steps;
float prev_t = 0.0f;
+ float3 step_pdf_distance = make_float3(1.0f, 1.0f, 1.0f);
if(segment->numsteps > 1) {
- /* todo: optimize using binary search */
- for(int i = 0; i < segment->numsteps-1; i++, step++) {
- if(sample_t < step->t)
+ float3 prev_cdf_distance = make_float3(0.0f, 0.0f, 0.0f);
+
+ int numsteps = segment->numsteps;
+ int high = numsteps - 1;
+ int low = 0;
+ int mid;
+
+ while(low < high) {
+ mid = (low + high) >> 1;
+
+ if(sample_t < step[mid].t)
+ high = mid;
+ else if(sample_t >= step[mid + 1].t)
+ low = mid + 1;
+ else {
+ /* found our interval in step[mid] .. step[mid+1] */
+ prev_t = step[mid].t;
+ prev_cdf_distance = step[mid].cdf_distance;
+ step += mid+1;
break;
+ }
+ }
- prev_t = step->t;
+ if(low >= numsteps - 1) {
+ prev_t = step[numsteps - 1].t;
+ prev_cdf_distance = step[numsteps-1].cdf_distance;
+ step += numsteps - 1;
}
+
+ /* pdf for picking step with distance sampling */
+ step_pdf_distance = step->cdf_distance - prev_cdf_distance;
}
-
+
+ /* determine range in which we will sample */
+ float step_t = step->t - prev_t;
+ float step_sample_t = sample_t - prev_t;
+
/* compute transmittance */
- transmittance = volume_color_transmittance(step->sigma_t, sample_t - prev_t);
+ transmittance = volume_color_transmittance(step->sigma_t, step_sample_t);
+
+ /* multiple importance sampling */
+ if(use_mis) {
+ float3 distance_pdf3 = kernel_volume_distance_pdf(step_t, step->sigma_t, step_sample_t);
+ float distance_pdf = average(distance_pdf3 * step_pdf_distance);
+ mis_weight = 2.0f*power_heuristic(pdf, distance_pdf);
+ }
}
/* compute transmittance up to this step */
@@ -794,7 +910,7 @@ ccl_device VolumeIntegrateResult kernel_volume_decoupled_scatter(
transmittance *= (step-1)->accum_transmittance;
/* modify throughput */
- *throughput *= step->sigma_s * transmittance / pdf;
+ *throughput *= step->sigma_s * transmittance * (mis_weight / pdf);
/* evaluate shader to create closures at shading point */
if(segment->numsteps > 1) {
@@ -810,40 +926,27 @@ ccl_device VolumeIntegrateResult kernel_volume_decoupled_scatter(
return VOLUME_PATH_SCATTERED;
}
-#endif
-
-/* get the volume attenuation and emission over line segment defined by
- * ray, with the assumption that there are no surfaces blocking light
- * between the endpoints */
-ccl_device_noinline VolumeIntegrateResult kernel_volume_integrate(KernelGlobals *kg,
- PathState *state, ShaderData *sd, Ray *ray, PathRadiance *L, float3 *throughput, RNG *rng)
+/* decide if we need to use decoupled or not */
+ccl_device bool kernel_volume_use_decoupled(KernelGlobals *kg, bool heterogeneous, bool direct, int sampling_method)
{
- /* workaround to fix correlation bug in T38710, can find better solution
- * in random number generator later, for now this is done here to not impact
- * performance of rendering without volumes */
- RNG tmp_rng = cmj_hash(*rng, state->rng_offset);
- bool heterogeneous = volume_stack_is_heterogeneous(kg, state->volume_stack);
-
-#if 0
- /* debugging code to compare decoupled ray marching */
- VolumeSegment segment;
-
- shader_setup_from_volume(kg, sd, ray, state->bounce, state->transparent_bounce);
- kernel_volume_decoupled_record(kg, state, ray, sd, &segment, heterogeneous);
-
- VolumeIntegrateResult result = kernel_volume_decoupled_scatter(kg, state, ray, sd, throughput, &tmp_rng, &segment);
-
- kernel_volume_decoupled_free(kg, &segment);
+ /* decoupled ray marching for heterogenous volumes not supported on the GPU,
+ * which also means equiangular and multiple importance sampling is not
+ * support for that case */
+#ifdef __KERNEL_GPU__
+ if(heterogeneous)
+ return false;
+#endif
- return result;
-#else
- shader_setup_from_volume(kg, sd, ray, state->bounce, state->transparent_bounce);
+ /* equiangular and multiple importance sampling only implemented for decoupled */
+ if(sampling_method != 0)
+ return true;
- if(heterogeneous)
- return kernel_volume_integrate_heterogeneous(kg, state, ray, sd, L, throughput, &tmp_rng);
+ /* for all light sampling use decoupled, reusing shader evaluations is
+ * typically faster in that case */
+ if(direct)
+ return kernel_data.integrator.sample_all_lights_direct;
else
- return kernel_volume_integrate_homogeneous(kg, state, ray, sd, L, throughput, &tmp_rng);
-#endif
+ return kernel_data.integrator.sample_all_lights_indirect;
}
/* Volume Stack
@@ -851,17 +954,88 @@ ccl_device_noinline VolumeIntegrateResult kernel_volume_integrate(KernelGlobals
* This is an array of object/shared ID's that the current segment of the path
* is inside of. */
-ccl_device void kernel_volume_stack_init(KernelGlobals *kg, VolumeStack *stack)
+ccl_device void kernel_volume_stack_init(KernelGlobals *kg,
+ Ray *ray,
+ VolumeStack *stack)
{
- /* todo: this assumes camera is always in air, need to detect when it isn't */
- if(kernel_data.background.volume_shader == SHADER_NONE) {
- stack[0].shader = SHADER_NONE;
+ /* NULL ray happens in the baker, does it need proper initialization of
+ * camera in volume?
+ */
+ if(!kernel_data.cam.is_inside_volume || ray == NULL) {
+ /* Camera is guaranteed to be in the air, only take background volume
+ * into account in this case.
+ */
+ if(kernel_data.background.volume_shader != SHADER_NONE) {
+ stack[0].shader = kernel_data.background.volume_shader;
+ stack[0].object = PRIM_NONE;
+ stack[1].shader = SHADER_NONE;
+ }
+ else {
+ stack[0].shader = SHADER_NONE;
+ }
+ return;
}
- else {
+
+ Ray volume_ray = *ray;
+ volume_ray.t = FLT_MAX;
+
+ int stack_index = 0, enclosed_index = 0;
+ int enclosed_volumes[VOLUME_STACK_SIZE];
+
+ while(stack_index < VOLUME_STACK_SIZE - 1 &&
+ enclosed_index < VOLUME_STACK_SIZE - 1)
+ {
+ Intersection isect;
+ if(!scene_intersect_volume(kg, &volume_ray, &isect)) {
+ break;
+ }
+
+ ShaderData sd;
+ shader_setup_from_ray(kg, &sd, &isect, &volume_ray, 0, 0);
+ if(sd.flag & SD_HAS_VOLUME) {
+ if(sd.flag & SD_BACKFACING) {
+ /* If ray exited the volume and never entered to that volume
+ * it means that camera is inside such a volume.
+ */
+ bool is_enclosed = false;
+ for(int i = 0; i < enclosed_index; ++i) {
+ if(enclosed_volumes[i] == sd.object) {
+ is_enclosed = true;
+ break;
+ }
+ }
+ if(is_enclosed == false) {
+ stack[stack_index].object = sd.object;
+ stack[stack_index].shader = sd.shader;
+ ++stack_index;
+ }
+ }
+ else {
+ /* If ray from camera enters the volume, this volume shouldn't
+ * be added to the stak on exit.
+ */
+ enclosed_volumes[enclosed_index++] = sd.object;
+ }
+ }
+
+ /* Move ray forward. */
+ volume_ray.P = ray_offset(sd.P, -sd.Ng);
+ }
+ /* stack_index of 0 means quick checks outside of the kernel gave false
+ * positive, nothing to worry about, just we've wasted quite a few of
+ * ticks just to come into conclusion that camera is in the air.
+ *
+ * In this case we're doing the same above -- check whether background has
+ * volume.
+ */
+ if(stack_index == 0 && kernel_data.background.volume_shader == SHADER_NONE) {
stack[0].shader = kernel_data.background.volume_shader;
stack[0].object = PRIM_NONE;
stack[1].shader = SHADER_NONE;
}
+ else {
+ stack[stack_index].shader = SHADER_NONE;
+ }
}
ccl_device void kernel_volume_stack_enter_exit(KernelGlobals *kg, ShaderData *sd, VolumeStack *stack)
@@ -910,4 +1084,3 @@ ccl_device void kernel_volume_stack_enter_exit(KernelGlobals *kg, ShaderData *sd
}
CCL_NAMESPACE_END
-
diff --git a/intern/cycles/kernel/osl/SConscript b/intern/cycles/kernel/osl/SConscript
index 4685bb7753e..d721edbaf6e 100644
--- a/intern/cycles/kernel/osl/SConscript
+++ b/intern/cycles/kernel/osl/SConscript
@@ -43,6 +43,9 @@ defs.append('CCL_NAMESPACE_BEGIN=namespace ccl {')
defs.append('CCL_NAMESPACE_END=}')
defs.append('WITH_OSL')
+if env['WITH_BF_CYCLES_DEBUG']:
+ defs.append('WITH_CYCLES_DEBUG')
+
if env['OURPLATFORM'] in ('win32-vc', 'win64-vc'):
cxxflags.append('-DBOOST_NO_RTTI -DBOOST_NO_TYPEID /fp:fast'.split())
incs.append(env['BF_PTHREADS_INC'])
diff --git a/intern/cycles/kernel/osl/osl_bssrdf.cpp b/intern/cycles/kernel/osl/osl_bssrdf.cpp
index 94337290d20..84ef85e089d 100644
--- a/intern/cycles/kernel/osl/osl_bssrdf.cpp
+++ b/intern/cycles/kernel/osl/osl_bssrdf.cpp
@@ -66,18 +66,6 @@ ClosureParam *closure_bssrdf_cubic_params()
static ClosureParam params[] = {
CLOSURE_FLOAT3_PARAM(CubicBSSRDFClosure, sc.N),
CLOSURE_FLOAT3_PARAM(CubicBSSRDFClosure, radius),
- //CLOSURE_FLOAT_PARAM(CubicBSSRDFClosure, sc.data1),
- CLOSURE_STRING_KEYPARAM("label"),
- CLOSURE_FINISH_PARAM(CubicBSSRDFClosure)
- };
- return params;
-}
-
-ClosureParam *closure_bssrdf_cubic_extended_params()
-{
- static ClosureParam params[] = {
- CLOSURE_FLOAT3_PARAM(CubicBSSRDFClosure, sc.N),
- CLOSURE_FLOAT3_PARAM(CubicBSSRDFClosure, radius),
CLOSURE_FLOAT_PARAM(CubicBSSRDFClosure, sc.data1),
CLOSURE_FLOAT_PARAM(CubicBSSRDFClosure, sc.T.x),
CLOSURE_STRING_KEYPARAM("label"),
@@ -107,18 +95,6 @@ ClosureParam *closure_bssrdf_gaussian_params()
static ClosureParam params[] = {
CLOSURE_FLOAT3_PARAM(GaussianBSSRDFClosure, sc.N),
CLOSURE_FLOAT3_PARAM(GaussianBSSRDFClosure, radius),
- //CLOSURE_FLOAT_PARAM(GaussianBSSRDFClosure, sc.data1),
- CLOSURE_STRING_KEYPARAM("label"),
- CLOSURE_FINISH_PARAM(GaussianBSSRDFClosure)
- };
- return params;
-}
-
-ClosureParam *closure_bssrdf_gaussian_extended_params()
-{
- static ClosureParam params[] = {
- CLOSURE_FLOAT3_PARAM(GaussianBSSRDFClosure, sc.N),
- CLOSURE_FLOAT3_PARAM(GaussianBSSRDFClosure, radius),
CLOSURE_FLOAT_PARAM(GaussianBSSRDFClosure, sc.data1),
CLOSURE_STRING_KEYPARAM("label"),
CLOSURE_FINISH_PARAM(GaussianBSSRDFClosure)
diff --git a/intern/cycles/kernel/osl/osl_closures.cpp b/intern/cycles/kernel/osl/osl_closures.cpp
index a96c0e2b1fb..cc9942b024e 100644
--- a/intern/cycles/kernel/osl/osl_closures.cpp
+++ b/intern/cycles/kernel/osl/osl_closures.cpp
@@ -41,6 +41,8 @@
#include "util_param.h"
#include "kernel_types.h"
+#include "kernel_compat_cpu.h"
+#include "kernel_globals.h"
#include "kernel_montecarlo.h"
#include "closure/bsdf_util.h"
@@ -51,8 +53,7 @@
#include "closure/bsdf_reflection.h"
#include "closure/bsdf_refraction.h"
#include "closure/bsdf_transparent.h"
-#include "closure/bsdf_ward.h"
-#include "closure/bsdf_westin.h"
+#include "closure/bsdf_ashikhmin_shirley.h"
#include "closure/bsdf_toon.h"
#include "closure/bsdf_hair.h"
#include "closure/volume.h"
@@ -85,16 +86,6 @@ BSDF_CLOSURE_CLASS_BEGIN(Refraction, refraction, refraction, LABEL_SINGULAR)
CLOSURE_FLOAT_PARAM(RefractionClosure, sc.data0),
BSDF_CLOSURE_CLASS_END(Refraction, refraction)
-BSDF_CLOSURE_CLASS_BEGIN(WestinBackscatter, westin_backscatter, westin_backscatter, LABEL_GLOSSY)
- CLOSURE_FLOAT3_PARAM(WestinBackscatterClosure, sc.N),
- CLOSURE_FLOAT_PARAM(WestinBackscatterClosure, sc.data0),
-BSDF_CLOSURE_CLASS_END(WestinBackscatter, westin_backscatter)
-
-BSDF_CLOSURE_CLASS_BEGIN(WestinSheen, westin_sheen, westin_sheen, LABEL_DIFFUSE)
- CLOSURE_FLOAT3_PARAM(WestinSheenClosure, sc.N),
- CLOSURE_FLOAT_PARAM(WestinSheenClosure, sc.data0),
-BSDF_CLOSURE_CLASS_END(WestinSheen, westin_sheen)
-
BSDF_CLOSURE_CLASS_BEGIN(Transparent, transparent, transparent, LABEL_SINGULAR)
BSDF_CLOSURE_CLASS_END(Transparent, transparent)
@@ -103,12 +94,12 @@ BSDF_CLOSURE_CLASS_BEGIN(AshikhminVelvet, ashikhmin_velvet, ashikhmin_velvet, LA
CLOSURE_FLOAT_PARAM(AshikhminVelvetClosure, sc.data0),
BSDF_CLOSURE_CLASS_END(AshikhminVelvet, ashikhmin_velvet)
-BSDF_CLOSURE_CLASS_BEGIN(Ward, ward, ward, LABEL_GLOSSY)
- CLOSURE_FLOAT3_PARAM(WardClosure, sc.N),
- CLOSURE_FLOAT3_PARAM(WardClosure, sc.T),
- CLOSURE_FLOAT_PARAM(WardClosure, sc.data0),
- CLOSURE_FLOAT_PARAM(WardClosure, sc.data1),
-BSDF_CLOSURE_CLASS_END(Ward, ward)
+BSDF_CLOSURE_CLASS_BEGIN(AshikhminShirley, ashikhmin_shirley_aniso, ashikhmin_shirley, LABEL_GLOSSY|LABEL_REFLECT)
+ CLOSURE_FLOAT3_PARAM(AshikhminShirleyClosure, sc.N),
+ CLOSURE_FLOAT3_PARAM(AshikhminShirleyClosure, sc.T),
+ CLOSURE_FLOAT_PARAM(AshikhminShirleyClosure, sc.data0),
+ CLOSURE_FLOAT_PARAM(AshikhminShirleyClosure, sc.data1),
+BSDF_CLOSURE_CLASS_END(AshikhminShirley, ashikhmin_shirley_aniso)
BSDF_CLOSURE_CLASS_BEGIN(DiffuseToon, diffuse_toon, diffuse_toon, LABEL_DIFFUSE)
CLOSURE_FLOAT3_PARAM(DiffuseToonClosure, sc.N),
@@ -122,26 +113,40 @@ BSDF_CLOSURE_CLASS_BEGIN(GlossyToon, glossy_toon, glossy_toon, LABEL_GLOSSY)
CLOSURE_FLOAT_PARAM(GlossyToonClosure, sc.data1),
BSDF_CLOSURE_CLASS_END(GlossyToon, glossy_toon)
-BSDF_CLOSURE_CLASS_BEGIN(MicrofacetGGX, microfacet_ggx, microfacet_ggx, LABEL_GLOSSY)
+BSDF_CLOSURE_CLASS_BEGIN(MicrofacetGGX, microfacet_ggx, microfacet_ggx, LABEL_GLOSSY|LABEL_REFLECT)
CLOSURE_FLOAT3_PARAM(MicrofacetGGXClosure, sc.N),
CLOSURE_FLOAT_PARAM(MicrofacetGGXClosure, sc.data0),
BSDF_CLOSURE_CLASS_END(MicrofacetGGX, microfacet_ggx)
-BSDF_CLOSURE_CLASS_BEGIN(MicrofacetBeckmann, microfacet_beckmann, microfacet_beckmann, LABEL_GLOSSY)
+BSDF_CLOSURE_CLASS_BEGIN(MicrofacetGGXAniso, microfacet_ggx_aniso, microfacet_ggx, LABEL_GLOSSY|LABEL_REFLECT)
+ CLOSURE_FLOAT3_PARAM(MicrofacetGGXAnisoClosure, sc.N),
+ CLOSURE_FLOAT3_PARAM(MicrofacetGGXAnisoClosure, sc.T),
+ CLOSURE_FLOAT_PARAM(MicrofacetGGXAnisoClosure, sc.data0),
+ CLOSURE_FLOAT_PARAM(MicrofacetGGXAnisoClosure, sc.data1),
+BSDF_CLOSURE_CLASS_END(MicrofacetGGXAniso, microfacet_ggx_aniso)
+
+BSDF_CLOSURE_CLASS_BEGIN(MicrofacetBeckmann, microfacet_beckmann, microfacet_beckmann, LABEL_GLOSSY|LABEL_REFLECT)
CLOSURE_FLOAT3_PARAM(MicrofacetBeckmannClosure, sc.N),
CLOSURE_FLOAT_PARAM(MicrofacetBeckmannClosure, sc.data0),
BSDF_CLOSURE_CLASS_END(MicrofacetBeckmann, microfacet_beckmann)
-BSDF_CLOSURE_CLASS_BEGIN(MicrofacetGGXRefraction, microfacet_ggx_refraction, microfacet_ggx, LABEL_GLOSSY)
+BSDF_CLOSURE_CLASS_BEGIN(MicrofacetBeckmannAniso, microfacet_beckmann_aniso, microfacet_beckmann, LABEL_GLOSSY|LABEL_REFLECT)
+ CLOSURE_FLOAT3_PARAM(MicrofacetBeckmannAnisoClosure, sc.N),
+ CLOSURE_FLOAT3_PARAM(MicrofacetBeckmannAnisoClosure, sc.T),
+ CLOSURE_FLOAT_PARAM(MicrofacetBeckmannAnisoClosure, sc.data0),
+ CLOSURE_FLOAT_PARAM(MicrofacetBeckmannAnisoClosure, sc.data1),
+BSDF_CLOSURE_CLASS_END(MicrofacetBeckmannAniso, microfacet_beckmann_aniso)
+
+BSDF_CLOSURE_CLASS_BEGIN(MicrofacetGGXRefraction, microfacet_ggx_refraction, microfacet_ggx, LABEL_GLOSSY|LABEL_TRANSMIT)
CLOSURE_FLOAT3_PARAM(MicrofacetGGXRefractionClosure, sc.N),
CLOSURE_FLOAT_PARAM(MicrofacetGGXRefractionClosure, sc.data0),
- CLOSURE_FLOAT_PARAM(MicrofacetGGXRefractionClosure, sc.data1),
+ CLOSURE_FLOAT_PARAM(MicrofacetGGXRefractionClosure, sc.data2),
BSDF_CLOSURE_CLASS_END(MicrofacetGGXRefraction, microfacet_ggx_refraction)
-BSDF_CLOSURE_CLASS_BEGIN(MicrofacetBeckmannRefraction, microfacet_beckmann_refraction, microfacet_beckmann, LABEL_GLOSSY)
+BSDF_CLOSURE_CLASS_BEGIN(MicrofacetBeckmannRefraction, microfacet_beckmann_refraction, microfacet_beckmann, LABEL_GLOSSY|LABEL_TRANSMIT)
CLOSURE_FLOAT3_PARAM(MicrofacetBeckmannRefractionClosure, sc.N),
CLOSURE_FLOAT_PARAM(MicrofacetBeckmannRefractionClosure, sc.data0),
- CLOSURE_FLOAT_PARAM(MicrofacetBeckmannRefractionClosure, sc.data1),
+ CLOSURE_FLOAT_PARAM(MicrofacetBeckmannRefractionClosure, sc.data2),
BSDF_CLOSURE_CLASS_END(MicrofacetBeckmannRefraction, microfacet_beckmann_refraction)
BSDF_CLOSURE_CLASS_BEGIN(HairReflection, hair_reflection, hair_reflection, LABEL_GLOSSY)
@@ -150,7 +155,7 @@ BSDF_CLOSURE_CLASS_BEGIN(HairReflection, hair_reflection, hair_reflection, LABEL
CLOSURE_FLOAT_PARAM(HairReflectionClosure, sc.data1),
#ifdef __HAIR__
CLOSURE_FLOAT3_PARAM(HairReflectionClosure, sc.T),
- CLOSURE_FLOAT_PARAM(HairReflectionClosure, sc.offset),
+ CLOSURE_FLOAT_PARAM(HairReflectionClosure, sc.data2),
#else
CLOSURE_FLOAT3_PARAM(HairReflectionClosure, sc.N),
CLOSURE_FLOAT_PARAM(HairReflectionClosure, sc.data1),
@@ -163,7 +168,7 @@ BSDF_CLOSURE_CLASS_BEGIN(HairTransmission, hair_transmission, hair_transmission,
CLOSURE_FLOAT_PARAM(HairTransmissionClosure, sc.data1),
#ifdef __HAIR__
CLOSURE_FLOAT3_PARAM(HairReflectionClosure, sc.T),
- CLOSURE_FLOAT_PARAM(HairReflectionClosure, sc.offset),
+ CLOSURE_FLOAT_PARAM(HairReflectionClosure, sc.data2),
#else
CLOSURE_FLOAT3_PARAM(HairReflectionClosure, sc.N),
CLOSURE_FLOAT_PARAM(HairReflectionClosure, sc.data1),
@@ -210,26 +215,24 @@ void OSLShader::register_closures(OSLShadingSystem *ss_)
bsdf_transparent_params(), bsdf_transparent_prepare);
register_closure(ss, "microfacet_ggx", id++,
bsdf_microfacet_ggx_params(), bsdf_microfacet_ggx_prepare);
+ register_closure(ss, "microfacet_ggx_aniso", id++,
+ bsdf_microfacet_ggx_aniso_params(), bsdf_microfacet_ggx_aniso_prepare);
register_closure(ss, "microfacet_ggx_refraction", id++,
bsdf_microfacet_ggx_refraction_params(), bsdf_microfacet_ggx_refraction_prepare);
register_closure(ss, "microfacet_beckmann", id++,
bsdf_microfacet_beckmann_params(), bsdf_microfacet_beckmann_prepare);
+ register_closure(ss, "microfacet_beckmann_aniso", id++,
+ bsdf_microfacet_beckmann_aniso_params(), bsdf_microfacet_beckmann_aniso_prepare);
register_closure(ss, "microfacet_beckmann_refraction", id++,
bsdf_microfacet_beckmann_refraction_params(), bsdf_microfacet_beckmann_refraction_prepare);
- register_closure(ss, "ward", id++,
- bsdf_ward_params(), bsdf_ward_prepare);
+ register_closure(ss, "ashikhmin_shirley", id++,
+ bsdf_ashikhmin_shirley_aniso_params(), bsdf_ashikhmin_shirley_aniso_prepare);
register_closure(ss, "ashikhmin_velvet", id++,
bsdf_ashikhmin_velvet_params(), bsdf_ashikhmin_velvet_prepare);
register_closure(ss, "diffuse_toon", id++,
bsdf_diffuse_toon_params(), bsdf_diffuse_toon_prepare);
register_closure(ss, "glossy_toon", id++,
bsdf_glossy_toon_params(), bsdf_glossy_toon_prepare);
- register_closure(ss, "specular_toon", id++,
- bsdf_glossy_toon_params(), bsdf_glossy_toon_prepare);
- register_closure(ss, "westin_backscatter", id++,
- bsdf_westin_backscatter_params(), bsdf_westin_backscatter_prepare);
- register_closure(ss, "westin_sheen", id++,
- bsdf_westin_sheen_params(), bsdf_westin_sheen_prepare);
register_closure(ss, "emission", id++,
closure_emission_params(), closure_emission_prepare);
@@ -247,10 +250,6 @@ void OSLShader::register_closures(OSLShadingSystem *ss_)
closure_bssrdf_cubic_params(), closure_bssrdf_cubic_prepare);
register_closure(ss, "bssrdf_gaussian", id++,
closure_bssrdf_gaussian_params(), closure_bssrdf_gaussian_prepare);
- register_closure(ss, "bssrdf_cubic", id++,
- closure_bssrdf_cubic_extended_params(), closure_bssrdf_cubic_prepare);
- register_closure(ss, "bssrdf_gaussian", id++,
- closure_bssrdf_gaussian_extended_params(), closure_bssrdf_gaussian_prepare);
register_closure(ss, "hair_reflection", id++,
bsdf_hair_reflection_params(), bsdf_hair_reflection_prepare);
diff --git a/intern/cycles/kernel/osl/osl_closures.h b/intern/cycles/kernel/osl/osl_closures.h
index 218cf1c19cc..5e833d738d8 100644
--- a/intern/cycles/kernel/osl/osl_closures.h
+++ b/intern/cycles/kernel/osl/osl_closures.h
@@ -48,12 +48,8 @@ OSL::ClosureParam *closure_holdout_params();
OSL::ClosureParam *closure_ambient_occlusion_params();
OSL::ClosureParam *closure_bsdf_diffuse_ramp_params();
OSL::ClosureParam *closure_bsdf_phong_ramp_params();
-OSL::ClosureParam *closure_westin_backscatter_params();
-OSL::ClosureParam *closure_westin_sheen_params();
OSL::ClosureParam *closure_bssrdf_cubic_params();
OSL::ClosureParam *closure_bssrdf_gaussian_params();
-OSL::ClosureParam *closure_bssrdf_cubic_extended_params();
-OSL::ClosureParam *closure_bssrdf_gaussian_extended_params();
OSL::ClosureParam *closure_henyey_greenstein_volume_params();
void closure_emission_prepare(OSL::RendererServices *, int id, void *data);
@@ -62,8 +58,6 @@ void closure_holdout_prepare(OSL::RendererServices *, int id, void *data);
void closure_ambient_occlusion_prepare(OSL::RendererServices *, int id, void *data);
void closure_bsdf_diffuse_ramp_prepare(OSL::RendererServices *, int id, void *data);
void closure_bsdf_phong_ramp_prepare(OSL::RendererServices *, int id, void *data);
-void closure_westin_backscatter_prepare(OSL::RendererServices *, int id, void *data);
-void closure_westin_sheen_prepare(OSL::RendererServices *, int id, void *data);
void closure_bssrdf_cubic_prepare(OSL::RendererServices *, int id, void *data);
void closure_bssrdf_gaussian_prepare(OSL::RendererServices *, int id, void *data);
void closure_henyey_greenstein_volume_prepare(OSL::RendererServices *, int id, void *data);
@@ -149,17 +143,18 @@ public: \
\
void blur(float roughness) \
{ \
- bsdf_##svmlower##_blur(&sc, roughness); \
} \
\
float3 eval_reflect(const float3 &omega_out, const float3 &omega_in, float& pdf) const \
{ \
- return bsdf_##svmlower##_eval_reflect(&sc, omega_out, omega_in, &pdf); \
+ pdf = 0; \
+ return make_float3(0, 0, 0); \
} \
\
float3 eval_transmit(const float3 &omega_out, const float3 &omega_in, float& pdf) const \
{ \
- return bsdf_##svmlower##_eval_transmit(&sc, omega_out, omega_in, &pdf); \
+ pdf = 0; \
+ return make_float3(0, 0, 0); \
} \
\
int sample(const float3 &Ng, \
@@ -168,8 +163,8 @@ public: \
float3 &omega_in, float3 &domega_in_dx, float3 &domega_in_dy, \
float &pdf, float3 &eval) const \
{ \
- return bsdf_##svmlower##_sample(&sc, Ng, omega_out, domega_out_dx, domega_out_dy, \
- randu, randv, &eval, &omega_in, &domega_in_dx, &domega_in_dy, &pdf); \
+ pdf = 0; \
+ return LABEL_NONE; \
} \
}; \
\
diff --git a/intern/cycles/kernel/osl/osl_globals.h b/intern/cycles/kernel/osl/osl_globals.h
index 5a658d8244a..9c3134e41c9 100644
--- a/intern/cycles/kernel/osl/osl_globals.h
+++ b/intern/cycles/kernel/osl/osl_globals.h
@@ -20,7 +20,6 @@
#ifdef WITH_OSL
#include <OSL/oslexec.h>
-#include <cmath>
#include "util_map.h"
#include "util_param.h"
diff --git a/intern/cycles/kernel/osl/osl_services.cpp b/intern/cycles/kernel/osl/osl_services.cpp
index 54894ea19eb..a9694651e14 100644
--- a/intern/cycles/kernel/osl/osl_services.cpp
+++ b/intern/cycles/kernel/osl/osl_services.cpp
@@ -126,7 +126,7 @@ void OSLRenderServices::thread_init(KernelGlobals *kernel_globals_, OSL::Texture
osl_ts = osl_ts_;
}
-bool OSLRenderServices::get_matrix(OSL::Matrix44 &result, OSL::TransformationPtr xform, float time)
+bool OSLRenderServices::get_matrix(OSL::ShaderGlobals *sg, OSL::Matrix44 &result, OSL::TransformationPtr xform, float time)
{
/* this is only used for shader and object space, we don't really have
* a concept of shader space, so we just use object space for both. */
@@ -156,7 +156,7 @@ bool OSLRenderServices::get_matrix(OSL::Matrix44 &result, OSL::TransformationPtr
return false;
}
-bool OSLRenderServices::get_inverse_matrix(OSL::Matrix44 &result, OSL::TransformationPtr xform, float time)
+bool OSLRenderServices::get_inverse_matrix(OSL::ShaderGlobals *sg, OSL::Matrix44 &result, OSL::TransformationPtr xform, float time)
{
/* this is only used for shader and object space, we don't really have
* a concept of shader space, so we just use object space for both. */
@@ -186,7 +186,7 @@ bool OSLRenderServices::get_inverse_matrix(OSL::Matrix44 &result, OSL::Transform
return false;
}
-bool OSLRenderServices::get_matrix(OSL::Matrix44 &result, ustring from, float time)
+bool OSLRenderServices::get_matrix(OSL::ShaderGlobals *sg, OSL::Matrix44 &result, ustring from, float time)
{
KernelGlobals *kg = kernel_globals;
@@ -218,7 +218,7 @@ bool OSLRenderServices::get_matrix(OSL::Matrix44 &result, ustring from, float ti
return false;
}
-bool OSLRenderServices::get_inverse_matrix(OSL::Matrix44 &result, ustring to, float time)
+bool OSLRenderServices::get_inverse_matrix(OSL::ShaderGlobals *sg, OSL::Matrix44 &result, ustring to, float time)
{
KernelGlobals *kg = kernel_globals;
@@ -250,7 +250,7 @@ bool OSLRenderServices::get_inverse_matrix(OSL::Matrix44 &result, ustring to, fl
return false;
}
-bool OSLRenderServices::get_matrix(OSL::Matrix44 &result, OSL::TransformationPtr xform)
+bool OSLRenderServices::get_matrix(OSL::ShaderGlobals *sg, OSL::Matrix44 &result, OSL::TransformationPtr xform)
{
/* this is only used for shader and object space, we don't really have
* a concept of shader space, so we just use object space for both. */
@@ -275,7 +275,7 @@ bool OSLRenderServices::get_matrix(OSL::Matrix44 &result, OSL::TransformationPtr
return false;
}
-bool OSLRenderServices::get_inverse_matrix(OSL::Matrix44 &result, OSL::TransformationPtr xform)
+bool OSLRenderServices::get_inverse_matrix(OSL::ShaderGlobals *sg, OSL::Matrix44 &result, OSL::TransformationPtr xform)
{
/* this is only used for shader and object space, we don't really have
* a concept of shader space, so we just use object space for both. */
@@ -300,7 +300,7 @@ bool OSLRenderServices::get_inverse_matrix(OSL::Matrix44 &result, OSL::Transform
return false;
}
-bool OSLRenderServices::get_matrix(OSL::Matrix44 &result, ustring from)
+bool OSLRenderServices::get_matrix(OSL::ShaderGlobals *sg, OSL::Matrix44 &result, ustring from)
{
KernelGlobals *kg = kernel_globals;
@@ -328,7 +328,7 @@ bool OSLRenderServices::get_matrix(OSL::Matrix44 &result, ustring from)
return false;
}
-bool OSLRenderServices::get_inverse_matrix(OSL::Matrix44 &result, ustring to)
+bool OSLRenderServices::get_inverse_matrix(OSL::ShaderGlobals *sg, OSL::Matrix44 &result, ustring to)
{
KernelGlobals *kg = kernel_globals;
@@ -356,7 +356,7 @@ bool OSLRenderServices::get_inverse_matrix(OSL::Matrix44 &result, ustring to)
return false;
}
-bool OSLRenderServices::get_array_attribute(void *renderstate, bool derivatives,
+bool OSLRenderServices::get_array_attribute(OSL::ShaderGlobals *sg, bool derivatives,
ustring object, TypeDesc type, ustring name,
int index, void *val)
{
@@ -479,7 +479,7 @@ static bool set_attribute_int(int i, TypeDesc type, bool derivatives, void *val)
static bool set_attribute_string(ustring str, TypeDesc type, bool derivatives, void *val)
{
- if(type.basetype == TypeDesc::INT && type.aggregate == TypeDesc::SCALAR && type.arraylen == 0) {
+ if(type.basetype == TypeDesc::STRING && type.aggregate == TypeDesc::SCALAR && type.arraylen == 0) {
ustring *sval = (ustring *)val;
sval[0] = str;
@@ -718,7 +718,7 @@ bool OSLRenderServices::get_background_attribute(KernelGlobals *kg, ShaderData *
return set_attribute_int(f, type, derivatives, val);
}
else if (name == u_path_transparent_depth) {
- /* Ray Depth */
+ /* Transparent Ray Depth */
int f = sd->transparent_depth;
return set_attribute_int(f, type, derivatives, val);
}
@@ -751,14 +751,22 @@ bool OSLRenderServices::get_background_attribute(KernelGlobals *kg, ShaderData *
return false;
}
-bool OSLRenderServices::get_attribute(void *renderstate, bool derivatives, ustring object_name,
+bool OSLRenderServices::get_attribute(OSL::ShaderGlobals *sg, bool derivatives, ustring object_name, /* overload taking OSL shader globals; unpacks them and forwards to the ShaderData overload */
+ TypeDesc type, ustring name, void *val)
+{
+ if (sg->renderstate == NULL) /* no render state attached to the globals: report the lookup as failed */
+ return false;
+
+ ShaderData *sd = (ShaderData *)(sg->renderstate); /* renderstate is interpreted as a ShaderData pointer */
+ return get_attribute(sd, derivatives, object_name, type, name, val); /* the actual lookup lives in the ShaderData overload */
+}
+
+bool OSLRenderServices::get_attribute(ShaderData *sd, bool derivatives, ustring object_name,
TypeDesc type, ustring name, void *val)
{
- ShaderData *sd = (ShaderData *)renderstate;
KernelGlobals *kg = sd->osl_globals;
bool is_curve;
int object;
- // int prim;
/* lookup of attribute on another object */
if (object_name != u_empty) {
@@ -768,12 +776,10 @@ bool OSLRenderServices::get_attribute(void *renderstate, bool derivatives, ustri
return false;
object = it->second;
- // prim = PRIM_NONE;
is_curve = false;
}
else {
object = sd->object;
- // prim = sd->prim;
is_curve = (sd->type & PRIMITIVE_ALL_CURVE) != 0;
if (object == OBJECT_NONE)
@@ -815,12 +821,12 @@ bool OSLRenderServices::get_attribute(void *renderstate, bool derivatives, ustri
}
bool OSLRenderServices::get_userdata(bool derivatives, ustring name, TypeDesc type,
- void *renderstate, void *val)
+ OSL::ShaderGlobals *sg, void *val)
{
return false; /* disabled by lockgeom */
}
-bool OSLRenderServices::has_userdata(ustring name, TypeDesc type, void *renderstate)
+bool OSLRenderServices::has_userdata(ustring name, TypeDesc type, OSL::ShaderGlobals *sg)
{
return false; /* never called by OSL */
}
@@ -871,14 +877,30 @@ bool OSLRenderServices::texture(ustring filename, TextureOpt &options,
return true;
}
#endif
+ bool status;
- OSLThreadData *tdata = kg->osl_tdata;
- OIIO::TextureSystem::Perthread *thread_info = tdata->oiio_thread_info;
+ if(filename[0] == '@' && filename.find('.') == -1) {
+ int slot = atoi(filename.c_str() + 1);
+ float4 rgba = kernel_tex_image_interp(slot, s, 1.0f - t);
- OIIO::TextureSystem::TextureHandle *th = ts->get_texture_handle(filename, thread_info);
+ result[0] = rgba[0];
+ if(options.nchannels > 1)
+ result[1] = rgba[1];
+ if(options.nchannels > 2)
+ result[2] = rgba[2];
+ if(options.nchannels > 3)
+ result[3] = rgba[3];
+ status = true;
+ }
+ else {
+ OSLThreadData *tdata = kg->osl_tdata;
+ OIIO::TextureSystem::Perthread *thread_info = tdata->oiio_thread_info;
- bool status = ts->texture(th, thread_info,
- options, s, t, dsdx, dtdx, dsdy, dtdy, result);
+ OIIO::TextureSystem::TextureHandle *th = ts->get_texture_handle(filename, thread_info);
+
+ status = ts->texture(th, thread_info,
+ options, s, t, dsdx, dtdx, dsdy, dtdy, result);
+ }
if(!status) {
if(options.nchannels == 3 || options.nchannels == 4) {
@@ -953,7 +975,7 @@ bool OSLRenderServices::environment(ustring filename, TextureOpt &options,
return status;
}
-bool OSLRenderServices::get_texture_info(ustring filename, int subimage,
+bool OSLRenderServices::get_texture_info(OSL::ShaderGlobals *sg, ustring filename, int subimage,
ustring dataname,
TypeDesc datatype, void *data)
{
@@ -996,7 +1018,7 @@ bool OSLRenderServices::trace(TraceOpt &options, OSL::ShaderGlobals *sg,
ray.P = TO_FLOAT3(P);
ray.D = TO_FLOAT3(R);
- ray.t = (options.maxdist == 1.0e30)? FLT_MAX: options.maxdist - options.mindist;
+ ray.t = (options.maxdist == 1.0e30f)? FLT_MAX: options.maxdist - options.mindist;
ray.time = sd->time;
if(options.mindist == 0.0f) {
@@ -1025,11 +1047,7 @@ bool OSLRenderServices::trace(TraceOpt &options, OSL::ShaderGlobals *sg,
tracedata->sd.osl_globals = sd->osl_globals;
/* raytrace */
-#ifdef __HAIR__
return scene_intersect(sd->osl_globals, &ray, PATH_RAY_ALL_VISIBILITY, &tracedata->isect, NULL, 0.0f, 0.0f);
-#else
- return scene_intersect(sd->osl_globals, &ray, PATH_RAY_ALL_VISIBILITY, &tracedata->isect);
-#endif
}
diff --git a/intern/cycles/kernel/osl/osl_services.h b/intern/cycles/kernel/osl/osl_services.h
index 069722d81b6..6f928a0d103 100644
--- a/intern/cycles/kernel/osl/osl_services.h
+++ b/intern/cycles/kernel/osl/osl_services.h
@@ -49,27 +49,29 @@ public:
void thread_init(KernelGlobals *kernel_globals, OSL::TextureSystem *ts);
- bool get_matrix(OSL::Matrix44 &result, OSL::TransformationPtr xform, float time);
- bool get_inverse_matrix(OSL::Matrix44 &result, OSL::TransformationPtr xform, float time);
+ bool get_matrix(OSL::ShaderGlobals *sg, OSL::Matrix44 &result, OSL::TransformationPtr xform, float time);
+ bool get_inverse_matrix(OSL::ShaderGlobals *sg, OSL::Matrix44 &result, OSL::TransformationPtr xform, float time);
- bool get_matrix(OSL::Matrix44 &result, ustring from, float time);
- bool get_inverse_matrix(OSL::Matrix44 &result, ustring to, float time);
+ bool get_matrix(OSL::ShaderGlobals *sg, OSL::Matrix44 &result, ustring from, float time);
+ bool get_inverse_matrix(OSL::ShaderGlobals *sg, OSL::Matrix44 &result, ustring to, float time);
- bool get_matrix(OSL::Matrix44 &result, OSL::TransformationPtr xform);
- bool get_inverse_matrix(OSL::Matrix44 &result, OSL::TransformationPtr xform);
+ bool get_matrix(OSL::ShaderGlobals *sg, OSL::Matrix44 &result, OSL::TransformationPtr xform);
+ bool get_inverse_matrix(OSL::ShaderGlobals *sg, OSL::Matrix44 &result, OSL::TransformationPtr xform);
- bool get_matrix(OSL::Matrix44 &result, ustring from);
- bool get_inverse_matrix(OSL::Matrix44 &result, ustring from);
+ bool get_matrix(OSL::ShaderGlobals *sg, OSL::Matrix44 &result, ustring from);
+ bool get_inverse_matrix(OSL::ShaderGlobals *sg, OSL::Matrix44 &result, ustring from);
- bool get_array_attribute(void *renderstate, bool derivatives,
+ bool get_array_attribute(OSL::ShaderGlobals *sg, bool derivatives,
ustring object, TypeDesc type, ustring name,
int index, void *val);
- bool get_attribute(void *renderstate, bool derivatives, ustring object,
+ bool get_attribute(OSL::ShaderGlobals *sg, bool derivatives, ustring object,
+ TypeDesc type, ustring name, void *val);
+ bool get_attribute(ShaderData *sd, bool derivatives, ustring object_name,
TypeDesc type, ustring name, void *val);
bool get_userdata(bool derivatives, ustring name, TypeDesc type,
- void *renderstate, void *val);
- bool has_userdata(ustring name, TypeDesc type, void *renderstate);
+ OSL::ShaderGlobals *sg, void *val);
+ bool has_userdata(ustring name, TypeDesc type, OSL::ShaderGlobals *sg);
int pointcloud_search(OSL::ShaderGlobals *sg, ustring filename, const OSL::Vec3 &center,
float radius, int max_points, bool sort, size_t *out_indices,
@@ -106,7 +108,7 @@ public:
OSL::ShaderGlobals *sg, const OSL::Vec3 &R,
const OSL::Vec3 &dRdx, const OSL::Vec3 &dRdy, float *result);
- bool get_texture_info(ustring filename, int subimage,
+ bool get_texture_info(OSL::ShaderGlobals *sg, ustring filename, int subimage,
ustring dataname, TypeDesc datatype, void *data);
static bool get_background_attribute(KernelGlobals *kg, ShaderData *sd, ustring name,
@@ -157,6 +159,70 @@ public:
static ustring u_v;
static ustring u_empty;
+#if OSL_LIBRARY_VERSION_CODE < 10500 /* overloads without the OSL::ShaderGlobals argument; each forwards to the ShaderGlobals-taking method declared above */
+ bool get_matrix(OSL::Matrix44 &result, OSL::TransformationPtr xform, float time) {
+ return get_matrix(NULL, result, xform, time); /* no shader globals available here, pass NULL */
+ }
+
+ bool get_inverse_matrix(OSL::Matrix44 &result, OSL::TransformationPtr xform, float time) {
+ return get_inverse_matrix(NULL, result, xform, time); /* no shader globals available here, pass NULL */
+ }
+
+ bool get_matrix(OSL::Matrix44 &result, ustring from, float time) {
+ return get_matrix(NULL, result, from, time); /* no shader globals available here, pass NULL */
+ }
+
+ bool get_inverse_matrix(OSL::Matrix44 &result, ustring to, float time) {
+ return get_inverse_matrix(NULL, result, to, time); /* no shader globals available here, pass NULL */
+ }
+
+ bool get_matrix(OSL::Matrix44 &result, OSL::TransformationPtr xform) {
+ return get_matrix(NULL, result, xform); /* no shader globals available here, pass NULL */
+ }
+
+ bool get_inverse_matrix(OSL::Matrix44 &result, OSL::TransformationPtr xform) {
+ return get_inverse_matrix(NULL, result, xform); /* no shader globals available here, pass NULL */
+ }
+
+ bool get_matrix(OSL::Matrix44 &result, ustring from) {
+ return get_matrix(NULL, result, from); /* no shader globals available here, pass NULL */
+ }
+
+ bool get_inverse_matrix(OSL::Matrix44 &result, ustring to) {
+ return get_inverse_matrix(NULL, result, to); /* no shader globals available here, pass NULL */
+ }
+
+ bool get_array_attribute(void *renderstate, bool derivatives,
+ ustring object, TypeDesc type, ustring name,
+ int index, void *val) {
+ OSL::ShaderGlobals sg; /* temporary globals; only renderstate is assigned below */
+ sg.renderstate = renderstate;
+ return get_array_attribute(&sg, derivatives,
+ object, type, name,
+ index, val);
+ }
+
+ bool get_attribute(void *renderstate, bool derivatives, ustring object_name,
+ TypeDesc type, ustring name, void *val) {
+ OSL::ShaderGlobals sg; /* temporary globals; only renderstate is assigned below */
+ sg.renderstate = renderstate;
+ return get_attribute(&sg, derivatives, object_name, type, name, val);
+ }
+
+ bool has_userdata(ustring name, TypeDesc type, void *renderstate) {
+ return has_userdata(name, type, (OSL::ShaderGlobals *) renderstate); /* NOTE(review): renderstate is cast directly to ShaderGlobals* instead of being wrapped as in get_attribute above; confirm the callee never dereferences it */
+ }
+
+ bool get_userdata(bool derivatives, ustring name, TypeDesc type,
+ void *renderstate, void *val) {
+ return get_userdata(derivatives, name, type, (OSL::ShaderGlobals *) renderstate, val); /* NOTE(review): same direct cast as has_userdata; confirm the callee never dereferences it */
+ }
+
+ bool get_texture_info(ustring filename, int subimage,
+ ustring dataname, TypeDesc datatype, void *data) {
+ return get_texture_info(NULL, filename, subimage, dataname, datatype, data); /* no shader globals available here, pass NULL */
+ }
+#endif /* OSL_LIBRARY_VERSION_CODE < 10500 */
private:
KernelGlobals *kernel_globals;
OSL::TextureSystem *osl_ts;
diff --git a/intern/cycles/kernel/osl/osl_shader.cpp b/intern/cycles/kernel/osl/osl_shader.cpp
index 843dcdd0985..ca0c2cc4415 100644
--- a/intern/cycles/kernel/osl/osl_shader.cpp
+++ b/intern/cycles/kernel/osl/osl_shader.cpp
@@ -14,6 +14,8 @@
* limitations under the License
*/
+#include <OSL/oslexec.h>
+
#include "kernel_compat_cpu.h"
#include "kernel_montecarlo.h"
#include "kernel_types.h"
@@ -34,7 +36,6 @@
#include "attribute.h"
-#include <OSL/oslexec.h>
CCL_NAMESPACE_BEGIN
@@ -164,11 +165,14 @@ static void flatten_surface_closure_tree(ShaderData *sd, int path_flag,
CBSDFClosure *bsdf = (CBSDFClosure *)prim;
int scattering = bsdf->scattering();
- /* no caustics option */
- if(scattering == LABEL_GLOSSY && (path_flag & PATH_RAY_DIFFUSE)) {
+ /* caustic options */
+ if((scattering & LABEL_GLOSSY) && (path_flag & PATH_RAY_DIFFUSE)) {
KernelGlobals *kg = sd->osl_globals;
- if(kernel_data.integrator.no_caustics)
+
+ if((!kernel_data.integrator.caustics_reflective && (scattering & LABEL_REFLECT)) ||
+ (!kernel_data.integrator.caustics_refractive && (scattering & LABEL_TRANSMIT))) {
return;
+ }
}
/* sample weight */
@@ -181,12 +185,9 @@ static void flatten_surface_closure_tree(ShaderData *sd, int path_flag,
sc.T = bsdf->sc.T;
sc.data0 = bsdf->sc.data0;
sc.data1 = bsdf->sc.data1;
+ sc.data2 = bsdf->sc.data2;
sc.prim = bsdf->sc.prim;
-#ifdef __HAIR__
- sc.offset = bsdf->sc.offset;
-#endif
-
/* add */
if(sc.sample_weight > CLOSURE_WEIGHT_CUTOFF && sd->num_closure < MAX_CLOSURE) {
sd->closure[sd->num_closure++] = sc;
@@ -202,6 +203,7 @@ static void flatten_surface_closure_tree(ShaderData *sd, int path_flag,
sc.type = CLOSURE_EMISSION_ID;
sc.data0 = 0.0f;
sc.data1 = 0.0f;
+ sc.data2 = 0.0f;
sc.prim = NULL;
/* flag */
@@ -219,6 +221,7 @@ static void flatten_surface_closure_tree(ShaderData *sd, int path_flag,
sc.type = CLOSURE_AMBIENT_OCCLUSION_ID;
sc.data0 = 0.0f;
sc.data1 = 0.0f;
+ sc.data2 = 0.0f;
sc.prim = NULL;
if(sd->num_closure < MAX_CLOSURE) {
@@ -232,6 +235,7 @@ static void flatten_surface_closure_tree(ShaderData *sd, int path_flag,
sc.type = CLOSURE_HOLDOUT_ID;
sc.data0 = 0.0f;
sc.data1 = 0.0f;
+ sc.data2 = 0.0f;
sc.prim = NULL;
if(sd->num_closure < MAX_CLOSURE) {
diff --git a/intern/cycles/kernel/shaders/CMakeLists.txt b/intern/cycles/kernel/shaders/CMakeLists.txt
index 5518d652bf9..0b735ede701 100644
--- a/intern/cycles/kernel/shaders/CMakeLists.txt
+++ b/intern/cycles/kernel/shaders/CMakeLists.txt
@@ -4,6 +4,7 @@
set(SRC_OSL
node_add_closure.osl
node_ambient_occlusion.osl
+ node_anisotropic_bsdf.osl
node_attribute.osl
node_background.osl
node_brick_texture.osl
@@ -13,6 +14,7 @@ set(SRC_OSL
node_checker_texture.osl
node_combine_rgb.osl
node_combine_hsv.osl
+ node_combine_xyz.osl
node_convert_from_color.osl
node_convert_from_float.osl
node_convert_from_int.osl
@@ -57,6 +59,7 @@ set(SRC_OSL
node_rgb_ramp.osl
node_separate_rgb.osl
node_separate_hsv.osl
+ node_separate_xyz.osl
node_set_normal.osl
node_sky_texture.osl
node_subsurface_scattering.osl
@@ -71,7 +74,6 @@ set(SRC_OSL
node_vector_transform.osl
node_velvet_bsdf.osl
node_voronoi_texture.osl
- node_ward_bsdf.osl
node_wavelength.osl
node_blackbody.osl
node_wave_texture.osl
diff --git a/intern/cycles/kernel/shaders/node_ward_bsdf.osl b/intern/cycles/kernel/shaders/node_anisotropic_bsdf.osl
index 2d360d594f2..da1e4f77107 100644
--- a/intern/cycles/kernel/shaders/node_ward_bsdf.osl
+++ b/intern/cycles/kernel/shaders/node_anisotropic_bsdf.osl
@@ -16,8 +16,9 @@
#include "stdosl.h"
-shader node_ward_bsdf(
+shader node_anisotropic_bsdf(
color Color = 0.0,
+ string distribution = "GGX",
float Roughness = 0.0,
float Anisotropy = 0.0,
float Rotation = 0.0,
@@ -44,6 +45,13 @@ shader node_ward_bsdf(
RoughnessV = Roughness / (1.0 - aniso);
}
- BSDF = Color * ward(Normal, T, RoughnessU, RoughnessV);
+ if (distribution == "Sharp")
+ BSDF = Color * reflection(Normal);
+ else if (distribution == "Beckmann")
+ BSDF = Color * microfacet_beckmann_aniso(Normal, T, RoughnessU, RoughnessV);
+ else if (distribution == "GGX")
+ BSDF = Color * microfacet_ggx_aniso(Normal, T, RoughnessU, RoughnessV);
+ else
+ BSDF = Color * ashikhmin_shirley(Normal, T, RoughnessU, RoughnessV);
}
diff --git a/intern/cycles/kernel/shaders/node_brick_texture.osl b/intern/cycles/kernel/shaders/node_brick_texture.osl
index 70a6a6ea7ce..c9fb3542aef 100644
--- a/intern/cycles/kernel/shaders/node_brick_texture.osl
+++ b/intern/cycles/kernel/shaders/node_brick_texture.osl
@@ -93,6 +93,6 @@ shader node_brick_texture(
Col[2] = facm * (Color1[2]) + tint * Color2[2];
}
- Color = (Fac == 1.0) ? Mortar: Col;
+ Color = (Fac == 1.0) ? Mortar : Col;
}
diff --git a/intern/cycles/kernel/shaders/node_checker_texture.osl b/intern/cycles/kernel/shaders/node_checker_texture.osl
index 6723076723c..a6d21fd36f3 100644
--- a/intern/cycles/kernel/shaders/node_checker_texture.osl
+++ b/intern/cycles/kernel/shaders/node_checker_texture.osl
@@ -21,9 +21,9 @@
float checker(point p)
{
- p[0] = (p[0] + 0.00001) * 0.9999;
- p[1] = (p[1] + 0.00001) * 0.9999;
- p[2] = (p[2] + 0.00001) * 0.9999;
+ p[0] = (p[0] + 0.000001) * 0.999999;
+ p[1] = (p[1] + 0.000001) * 0.999999;
+ p[2] = (p[2] + 0.000001) * 0.999999;
int xi = (int)fabs(floor(p[0]));
int yi = (int)fabs(floor(p[1]));
diff --git a/intern/cycles/kernel/shaders/node_combine_xyz.osl b/intern/cycles/kernel/shaders/node_combine_xyz.osl
new file mode 100644
index 00000000000..933dee5bd78
--- /dev/null
+++ b/intern/cycles/kernel/shaders/node_combine_xyz.osl
@@ -0,0 +1,27 @@
+/*
+ * Copyright 2011-2014 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License
+ */
+
+#include "stdosl.h"
+
+shader node_combine_xyz( /* assembles a vector from three scalar inputs */
+ float X = 0.0,
+ float Y = 0.0,
+ float Z = 0.0,
+ output vector Vector = 0.8) /* the default is always overwritten by the body */
+{
+ Vector = vector(X, Y, Z); /* X, Y, Z become components 0, 1, 2 */
+}
+
diff --git a/intern/cycles/kernel/shaders/node_emission.osl b/intern/cycles/kernel/shaders/node_emission.osl
index 2428da5ef4e..b28d731c19f 100644
--- a/intern/cycles/kernel/shaders/node_emission.osl
+++ b/intern/cycles/kernel/shaders/node_emission.osl
@@ -17,14 +17,10 @@
#include "stdosl.h"
shader node_emission(
- int TotalPower = 0,
color Color = 0.8,
float Strength = 1.0,
output closure color Emission = 0)
{
- if (TotalPower)
- Emission = ((Strength / surfacearea()) * Color) * emission();
- else
- Emission = (Strength * Color) * emission();
+ Emission = (Strength * Color) * emission();
}
diff --git a/intern/cycles/kernel/shaders/node_fresnel.h b/intern/cycles/kernel/shaders/node_fresnel.h
index 447a84255ef..d192c5d02de 100644
--- a/intern/cycles/kernel/shaders/node_fresnel.h
+++ b/intern/cycles/kernel/shaders/node_fresnel.h
@@ -34,3 +34,16 @@ float fresnel_dielectric_cos(float cosi, float eta)
return result;
}
+color fresnel_conductor(float cosi, color eta, color k) /* returns a color from cosi and the per-channel eta/k inputs; presumably cosi is the cosine of the incidence angle and eta/k the complex index of refraction (confirm with callers) */
+{
+ color cosi2 = color(cosi * cosi); /* cosi squared, replicated to all channels */
+ color one = color(1, 1, 1);
+ color tmp_f = eta * eta + k * k; /* eta^2 + k^2 per channel */
+ color tmp = tmp_f * cosi2; /* (eta^2 + k^2) * cosi^2 */
+ color Rparl2 = (tmp - (2.0 * eta * cosi) + one) /
+ (tmp + (2.0 * eta * cosi) + one); /* presumably the parallel-polarized term, going by the name */
+ color Rperp2 = (tmp_f - (2.0 * eta * cosi) + cosi2) /
+ (tmp_f + (2.0 * eta * cosi) + cosi2); /* presumably the perpendicular-polarized term, going by the name */
+ return (Rparl2 + Rperp2) * 0.5; /* mean of the two terms */
+}
+
diff --git a/intern/cycles/kernel/shaders/node_geometry.osl b/intern/cycles/kernel/shaders/node_geometry.osl
index dbdf55802ae..cd68f07b21e 100644
--- a/intern/cycles/kernel/shaders/node_geometry.osl
+++ b/intern/cycles/kernel/shaders/node_geometry.osl
@@ -49,12 +49,8 @@ shader node_geometry(
/* try to create spherical tangent from generated coordinates */
if (getattribute("geom:generated", generated)) {
- matrix project = matrix(0.0, 1.0, 0.0, 0.0,
- -1.0, 0.0, 0.0, 0.0,
- 0.0, 0.0, 0.0, 0.0,
- 0.5, -0.5, 0.0, 1.0);
-
- vector T = transform("object", "world", transform(project, generated));
+ normal data = normal(-(generated[1] - 0.5), (generated[0] - 0.5), 0.0);
+ vector T = transform("object", "world", data);
Tangent = cross(Normal, normalize(cross(T, Normal)));
}
else {
diff --git a/intern/cycles/kernel/shaders/node_glossy_bsdf.osl b/intern/cycles/kernel/shaders/node_glossy_bsdf.osl
index b4e0fe62223..5c727ca6917 100644
--- a/intern/cycles/kernel/shaders/node_glossy_bsdf.osl
+++ b/intern/cycles/kernel/shaders/node_glossy_bsdf.osl
@@ -19,7 +19,7 @@
shader node_glossy_bsdf(
color Color = 0.8,
- string distribution = "Beckmann",
+ string distribution = "GGX",
float Roughness = 0.2,
normal Normal = N,
output closure color BSDF = 0)
@@ -30,6 +30,8 @@ shader node_glossy_bsdf(
BSDF = Color * microfacet_beckmann(Normal, Roughness);
else if (distribution == "GGX")
BSDF = Color * microfacet_ggx(Normal, Roughness);
+ else
+ BSDF = Color * ashikhmin_shirley(Normal, vector(0, 0, 0), Roughness, Roughness);
}
diff --git a/intern/cycles/kernel/shaders/node_image_texture.osl b/intern/cycles/kernel/shaders/node_image_texture.osl
index 7238a1e8862..18b5fb4b31f 100644
--- a/intern/cycles/kernel/shaders/node_image_texture.osl
+++ b/intern/cycles/kernel/shaders/node_image_texture.osl
@@ -113,6 +113,10 @@ shader node_image_texture(
weight[2] = ((2.0 - limit) * Nob[2] + (limit - 1.0)) / (2.0 * limit - 1.0);
}
}
+ else {
+ /* Desperate mode, no valid choice anyway, fallback to one side.*/
+ weight[0] = 1.0;
+ }
Color = color(0.0, 0.0, 0.0);
Alpha = 0.0;
diff --git a/intern/cycles/kernel/shaders/node_musgrave_texture.osl b/intern/cycles/kernel/shaders/node_musgrave_texture.osl
index 60762539002..a32c3d4b1b8 100644
--- a/intern/cycles/kernel/shaders/node_musgrave_texture.osl
+++ b/intern/cycles/kernel/shaders/node_musgrave_texture.osl
@@ -35,14 +35,14 @@ float noise_musgrave_fBm(point p, string basis, float H, float lacunarity, float
int i;
for (i = 0; i < (int)octaves; i++) {
- value += safe_noise(p, 0) * pwr;
+ value += safe_noise(p, "signed") * pwr;
pwr *= pwHL;
p *= lacunarity;
}
rmd = octaves - floor(octaves);
if (rmd != 0.0)
- value += rmd * safe_noise(p, 0) * pwr;
+ value += rmd * safe_noise(p, "signed") * pwr;
return value;
}
@@ -63,14 +63,14 @@ float noise_musgrave_multi_fractal(point p, string basis, float H, float lacunar
int i;
for (i = 0; i < (int)octaves; i++) {
- value *= (pwr * safe_noise(p, 0) + 1.0);
+ value *= (pwr * safe_noise(p, "signed") + 1.0);
pwr *= pwHL;
p *= lacunarity;
}
rmd = octaves - floor(octaves);
if (rmd != 0.0)
- value *= (rmd * pwr * safe_noise(p, 0) + 1.0); /* correct? */
+ value *= (rmd * pwr * safe_noise(p, "signed") + 1.0); /* correct? */
return value;
}
@@ -91,11 +91,11 @@ float noise_musgrave_hetero_terrain(point p, string basis, float H, float lacuna
int i;
/* first unscaled octave of function; later octaves are scaled */
- value = offset + safe_noise(p, 0);
+ value = offset + safe_noise(p, "signed");
p *= lacunarity;
for (i = 1; i < (int)octaves; i++) {
- increment = (safe_noise(p, 0) + offset) * pwr * value;
+ increment = (safe_noise(p, "signed") + offset) * pwr * value;
value += increment;
pwr *= pwHL;
p *= lacunarity;
@@ -103,7 +103,7 @@ float noise_musgrave_hetero_terrain(point p, string basis, float H, float lacuna
rmd = octaves - floor(octaves);
if (rmd != 0.0) {
- increment = (safe_noise(p, 0) + offset) * pwr * value;
+ increment = (safe_noise(p, "signed") + offset) * pwr * value;
value += rmd * increment;
}
@@ -126,7 +126,7 @@ float noise_musgrave_hybrid_multi_fractal(point p, string basis, float H,
float pwr = pwHL;
int i;
- result = safe_noise(p, 0) + offset;
+ result = safe_noise(p, "signed") + offset;
weight = gain * result;
p *= lacunarity;
@@ -134,7 +134,7 @@ float noise_musgrave_hybrid_multi_fractal(point p, string basis, float H,
if (weight > 1.0)
weight = 1.0;
- signal = (safe_noise(p, 0) + offset) * pwr;
+ signal = (safe_noise(p, "signed") + offset) * pwr;
pwr *= pwHL;
result += weight * signal;
weight *= gain * signal;
@@ -143,7 +143,7 @@ float noise_musgrave_hybrid_multi_fractal(point p, string basis, float H,
rmd = octaves - floor(octaves);
if (rmd != 0.0)
- result += rmd * ((safe_noise(p, 0) + offset) * pwr);
+ result += rmd * ((safe_noise(p, "signed") + offset) * pwr);
return result;
}
@@ -164,7 +164,7 @@ float noise_musgrave_ridged_multi_fractal(point p, string basis, float H,
float pwr = pwHL;
int i;
- signal = offset - fabs(safe_noise(p, 0));
+ signal = offset - fabs(safe_noise(p, "signed"));
signal *= signal;
result = signal;
weight = 1.0;
@@ -172,7 +172,7 @@ float noise_musgrave_ridged_multi_fractal(point p, string basis, float H,
for (i = 1; i < (int)octaves; i++) {
p *= lacunarity;
weight = clamp(signal * gain, 0.0, 1.0);
- signal = offset - fabs(safe_noise(p, 0));
+ signal = offset - fabs(safe_noise(p, "signed"));
signal *= signal;
signal *= weight;
result += signal * pwr;
diff --git a/intern/cycles/kernel/shaders/node_separate_xyz.osl b/intern/cycles/kernel/shaders/node_separate_xyz.osl
new file mode 100644
index 00000000000..63725cb9995
--- /dev/null
+++ b/intern/cycles/kernel/shaders/node_separate_xyz.osl
@@ -0,0 +1,28 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License
+ */
+
+#include "stdosl.h"
+
+shader node_separate_xyz( /* splits a vector into its three scalar components */
+ vector Vector = 0.8,
+ output float X = 0.0,
+ output float Y = 0.0,
+ output float Z = 0.0)
+{
+ X = Vector[0]; /* component 0 */
+ Y = Vector[1]; /* component 1 */
+ Z = Vector[2]; /* component 2 */
+}
diff --git a/intern/cycles/kernel/shaders/node_texture.h b/intern/cycles/kernel/shaders/node_texture.h
index de51559f297..2710eed414a 100644
--- a/intern/cycles/kernel/shaders/node_texture.h
+++ b/intern/cycles/kernel/shaders/node_texture.h
@@ -153,12 +153,12 @@ float voronoi_CrS(point p) { return 2.0 * voronoi_Cr(p) - 1.0; }
/* Noise Bases */
-float safe_noise(point p, int type)
+float safe_noise(point p, string type)
{
float f = 0.0;
/* Perlin noise in range -1..1 */
- if (type == 0)
+ if (type == "signed")
f = noise("perlin", p);
/* Perlin noise in range 0..1 */
@@ -175,7 +175,7 @@ float safe_noise(point p, int type)
float noise_basis(point p, string basis)
{
if (basis == "Perlin")
- return safe_noise(p, 1);
+ return safe_noise(p, "unsigned");
if (basis == "Voronoi F1")
return voronoi_F1S(p);
if (basis == "Voronoi F2")
diff --git a/intern/cycles/kernel/shaders/stdosl.h b/intern/cycles/kernel/shaders/stdosl.h
index 6f824ea8ebd..1ff8f363b49 100644
--- a/intern/cycles/kernel/shaders/stdosl.h
+++ b/intern/cycles/kernel/shaders/stdosl.h
@@ -476,17 +476,17 @@ closure color diffuse_ramp(normal N, color colors[8]) BUILTIN;
closure color phong_ramp(normal N, float exponent, color colors[8]) BUILTIN;
closure color diffuse_toon(normal N, float size, float smooth) BUILTIN;
closure color glossy_toon(normal N, float size, float smooth) BUILTIN;
-closure color westin_backscatter(normal N, float roughness) BUILTIN;
-closure color westin_sheen(normal N, float edginess) BUILTIN;
closure color translucent(normal N) BUILTIN;
closure color reflection(normal N) BUILTIN;
closure color refraction(normal N, float eta) BUILTIN;
closure color transparent() BUILTIN;
closure color microfacet_ggx(normal N, float ag) BUILTIN;
+closure color microfacet_ggx_aniso(normal N, vector T, float ax, float ay) BUILTIN;
closure color microfacet_ggx_refraction(normal N, float ag, float eta) BUILTIN;
closure color microfacet_beckmann(normal N, float ab) BUILTIN;
+closure color microfacet_beckmann_aniso(normal N, vector T, float ax, float ay) BUILTIN;
closure color microfacet_beckmann_refraction(normal N, float ab, float eta) BUILTIN;
-closure color ward(normal N, vector T,float ax, float ay) BUILTIN;
+closure color ashikhmin_shirley(normal N, vector T,float ax, float ay) BUILTIN;
closure color ashikhmin_velvet(normal N, float sigma) BUILTIN;
closure color emission() BUILTIN;
closure color background() BUILTIN;
@@ -505,12 +505,8 @@ closure color hair_transmission(normal N, float roughnessu, float roughnessv, ve
closure color henyey_greenstein(float g) BUILTIN;
closure color absorption() BUILTIN;
-// Backwards compatibility
-closure color bssrdf_cubic(normal N, vector radius) BUILTIN;
-closure color bssrdf_gaussian(normal N, vector radius) BUILTIN;
-closure color specular_toon(normal N, float size, float smooth) BUILTIN;
-
// Renderer state
+int backfacing () BUILTIN;
int raytype (string typename) BUILTIN;
// the individual 'isFOOray' functions are deprecated
int iscameraray () { return raytype("camera"); }
diff --git a/intern/cycles/kernel/svm/svm.h b/intern/cycles/kernel/svm/svm.h
index dbf59c60cb0..c13eae813d6 100644
--- a/intern/cycles/kernel/svm/svm.h
+++ b/intern/cycles/kernel/svm/svm.h
@@ -167,8 +167,8 @@ CCL_NAMESPACE_END
#include "svm_math.h"
#include "svm_mix.h"
#include "svm_ramp.h"
-#include "svm_sepcomb_rgb.h"
#include "svm_sepcomb_hsv.h"
+#include "svm_sepcomb_vector.h"
#include "svm_musgrave.h"
#include "svm_sky.h"
#include "svm_tex_coord.h"
@@ -236,7 +236,7 @@ ccl_device_noinline void svm_eval_nodes(KernelGlobals *kg, ShaderData *sd, Shade
if(stack_load_float(stack, node.z) == 1.0f)
offset += node.y;
break;
-#ifdef __IMAGE_TEXTURES__
+#ifdef __TEXTURES__
case NODE_TEX_IMAGE:
svm_node_tex_image(kg, sd, stack, node);
break;
@@ -246,8 +246,6 @@ ccl_device_noinline void svm_eval_nodes(KernelGlobals *kg, ShaderData *sd, Shade
case NODE_TEX_ENVIRONMENT:
svm_node_tex_environment(kg, sd, stack, node);
break;
-#endif
-#ifdef __PROCEDURAL_TEXTURES__
case NODE_TEX_SKY:
svm_node_tex_sky(kg, sd, stack, node, &offset);
break;
@@ -327,11 +325,11 @@ ccl_device_noinline void svm_eval_nodes(KernelGlobals *kg, ShaderData *sd, Shade
case NODE_MIX:
svm_node_mix(kg, sd, stack, node.y, node.z, node.w, &offset);
break;
- case NODE_SEPARATE_RGB:
- svm_node_separate_rgb(sd, stack, node.y, node.z, node.w);
+ case NODE_SEPARATE_VECTOR:
+ svm_node_separate_vector(sd, stack, node.y, node.z, node.w);
break;
- case NODE_COMBINE_RGB:
- svm_node_combine_rgb(sd, stack, node.y, node.z, node.w);
+ case NODE_COMBINE_VECTOR:
+ svm_node_combine_vector(sd, stack, node.y, node.z, node.w);
break;
case NODE_SEPARATE_HSV:
svm_node_separate_hsv(kg, sd, stack, node.y, node.z, node.w, &offset);
@@ -407,12 +405,7 @@ ccl_device_noinline void svm_eval_nodes(KernelGlobals *kg, ShaderData *sd, Shade
break;
case NODE_CLOSURE_SET_NORMAL:
svm_node_set_normal(kg, sd, stack, node.y, node.z );
- break;
-#endif
- case NODE_EMISSION_SET_WEIGHT_TOTAL:
- svm_node_emission_set_weight_total(kg, sd, node.y, node.z, node.w);
break;
-#ifdef __EXTRA_NODES__
case NODE_RGB_RAMP:
svm_node_rgb_ramp(kg, sd, stack, node, &offset);
break;
@@ -425,17 +418,13 @@ ccl_device_noinline void svm_eval_nodes(KernelGlobals *kg, ShaderData *sd, Shade
case NODE_LIGHT_FALLOFF:
svm_node_light_falloff(sd, stack, node);
break;
-#endif
-#ifdef __ANISOTROPIC__
+#endif
case NODE_TANGENT:
svm_node_tangent(kg, sd, stack, node);
break;
-#endif
-#ifdef __NORMAL_MAP__
case NODE_NORMAL_MAP:
svm_node_normal_map(kg, sd, stack, node);
- break;
-#endif
+ break;
case NODE_END:
default:
return;
diff --git a/intern/cycles/kernel/svm/svm_blackbody.h b/intern/cycles/kernel/svm/svm_blackbody.h
index 63dbf27d35e..1e40e868e14 100644
--- a/intern/cycles/kernel/svm/svm_blackbody.h
+++ b/intern/cycles/kernel/svm/svm_blackbody.h
@@ -42,7 +42,7 @@ ccl_device void svm_node_blackbody(KernelGlobals *kg, ShaderData *sd, float *sta
/* Input */
float temperature = stack_load_float(stack, temperature_offset);
- if (temperature < BB_DRAPPER) {
+ if (temperature < BB_DRAPER) {
/* just return very very dim red */
color_rgb = make_float3(1.0e-6f,0.0f,0.0f);
}
@@ -53,9 +53,9 @@ ccl_device void svm_node_blackbody(KernelGlobals *kg, ShaderData *sd, float *sta
/* reconstruct a proper index for the table lookup, compared to OSL we don't look up two colors
just one (the OSL-lerp is also automatically done for us by "lookup_table_read") */
- float t = powf((temperature - BB_DRAPPER) * (1.0f / BB_TABLE_SPACING), (1.0f / BB_TABLE_XPOWER));
+ float t = powf((temperature - BB_DRAPER) * (1.0f / BB_TABLE_SPACING), (1.0f / BB_TABLE_XPOWER));
- int blackbody_table_offset = kernel_data.blackbody.table_offset;
+ int blackbody_table_offset = kernel_data.tables.blackbody_offset;
/* Retrieve colors from the lookup table */
float lutval = t*lookuptablenormalize;
diff --git a/intern/cycles/kernel/svm/svm_checker.h b/intern/cycles/kernel/svm/svm_checker.h
index 8d1a1a40449..e0408ad334a 100644
--- a/intern/cycles/kernel/svm/svm_checker.h
+++ b/intern/cycles/kernel/svm/svm_checker.h
@@ -21,9 +21,9 @@ CCL_NAMESPACE_BEGIN
ccl_device_noinline float svm_checker(float3 p)
{
/* avoid precision issues on unit coordinates */
- p.x = (p.x + 0.00001f)*0.9999f;
- p.y = (p.y + 0.00001f)*0.9999f;
- p.z = (p.z + 0.00001f)*0.9999f;
+ p.x = (p.x + 0.000001f)*0.999999f;
+ p.y = (p.y + 0.000001f)*0.999999f;
+ p.z = (p.z + 0.000001f)*0.999999f;
int xi = float_to_int(fabsf(floorf(p.x)));
int yi = float_to_int(fabsf(floorf(p.y)));
diff --git a/intern/cycles/kernel/svm/svm_closure.h b/intern/cycles/kernel/svm/svm_closure.h
index a3770877544..30110db3ef9 100644
--- a/intern/cycles/kernel/svm/svm_closure.h
+++ b/intern/cycles/kernel/svm/svm_closure.h
@@ -24,6 +24,7 @@ ccl_device void svm_node_glass_setup(ShaderData *sd, ShaderClosure *sc, int type
if(refract) {
sc->data0 = eta;
sc->data1 = 0.0f;
+ sc->data2 = 0.0f;
sd->flag |= bsdf_refraction_setup(sc);
}
else
@@ -31,7 +32,8 @@ ccl_device void svm_node_glass_setup(ShaderData *sd, ShaderClosure *sc, int type
}
else if(type == CLOSURE_BSDF_MICROFACET_BECKMANN_GLASS_ID) {
sc->data0 = roughness;
- sc->data1 = eta;
+ sc->data1 = roughness;
+ sc->data2 = eta;
if(refract)
sd->flag |= bsdf_microfacet_beckmann_refraction_setup(sc);
@@ -40,7 +42,8 @@ ccl_device void svm_node_glass_setup(ShaderData *sd, ShaderClosure *sc, int type
}
else {
sc->data0 = roughness;
- sc->data1 = eta;
+ sc->data1 = roughness;
+ sc->data2 = eta;
if(refract)
sd->flag |= bsdf_microfacet_ggx_refraction_setup(sc);
@@ -135,11 +138,13 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
if(roughness == 0.0f) {
sc->data0 = 0.0f;
sc->data1 = 0.0f;
+ sc->data2 = 0.0f;
sd->flag |= bsdf_diffuse_setup(sc);
}
else {
sc->data0 = roughness;
sc->data1 = 0.0f;
+ sc->data2 = 0.0f;
sd->flag |= bsdf_oren_nayar_setup(sc);
}
}
@@ -151,6 +156,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
if(sc) {
sc->data0 = 0.0f;
sc->data1 = 0.0f;
+ sc->data2 = 0.0f;
sc->N = N;
sd->flag |= bsdf_translucent_setup(sc);
}
@@ -162,6 +168,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
if(sc) {
sc->data0 = 0.0f;
sc->data1 = 0.0f;
+ sc->data2 = 0.0f;
sc->N = N;
sd->flag |= bsdf_transparent_setup(sc);
}
@@ -169,9 +176,10 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
}
case CLOSURE_BSDF_REFLECTION_ID:
case CLOSURE_BSDF_MICROFACET_GGX_ID:
- case CLOSURE_BSDF_MICROFACET_BECKMANN_ID: {
+ case CLOSURE_BSDF_MICROFACET_BECKMANN_ID:
+ case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ID: {
#ifdef __CAUSTICS_TRICKS__
- if(kernel_data.integrator.no_caustics && (path_flag & PATH_RAY_DIFFUSE))
+ if(!kernel_data.integrator.caustics_reflective && (path_flag & PATH_RAY_DIFFUSE))
break;
#endif
ShaderClosure *sc = svm_node_closure_get_bsdf(sd, mix_weight);
@@ -179,15 +187,18 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
if(sc) {
sc->N = N;
sc->data0 = param1;
- sc->data1 = 0.0f;
+ sc->data1 = param1;
+ sc->data2 = 0.0f;
/* setup bsdf */
if(type == CLOSURE_BSDF_REFLECTION_ID)
sd->flag |= bsdf_reflection_setup(sc);
else if(type == CLOSURE_BSDF_MICROFACET_BECKMANN_ID)
sd->flag |= bsdf_microfacet_beckmann_setup(sc);
- else
+ else if(type == CLOSURE_BSDF_MICROFACET_GGX_ID)
sd->flag |= bsdf_microfacet_ggx_setup(sc);
+ else
+ sd->flag |= bsdf_ashikhmin_shirley_setup(sc);
}
break;
@@ -196,25 +207,35 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
case CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID:
case CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID: {
#ifdef __CAUSTICS_TRICKS__
- if(kernel_data.integrator.no_caustics && (path_flag & PATH_RAY_DIFFUSE))
+ if(!kernel_data.integrator.caustics_refractive && (path_flag & PATH_RAY_DIFFUSE))
break;
#endif
ShaderClosure *sc = svm_node_closure_get_bsdf(sd, mix_weight);
if(sc) {
sc->N = N;
- sc->data0 = param1;
float eta = fmaxf(param2, 1e-5f);
- sc->data1 = (sd->flag & SD_BACKFACING)? 1.0f/eta: eta;
+ eta = (sd->flag & SD_BACKFACING)? 1.0f/eta: eta;
/* setup bsdf */
- if(type == CLOSURE_BSDF_REFRACTION_ID)
+ if(type == CLOSURE_BSDF_REFRACTION_ID) {
+ sc->data0 = eta;
+ sc->data1 = 0.0f;
+ sc->data2 = 0.0f;
+
sd->flag |= bsdf_refraction_setup(sc);
- else if(type == CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID)
- sd->flag |= bsdf_microfacet_beckmann_refraction_setup(sc);
- else
- sd->flag |= bsdf_microfacet_ggx_refraction_setup(sc);
+ }
+ else {
+ sc->data0 = param1;
+ sc->data1 = param1;
+ sc->data2 = eta;
+
+ if(type == CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID)
+ sd->flag |= bsdf_microfacet_beckmann_refraction_setup(sc);
+ else
+ sd->flag |= bsdf_microfacet_ggx_refraction_setup(sc);
+ }
}
break;
@@ -223,8 +244,10 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
case CLOSURE_BSDF_MICROFACET_GGX_GLASS_ID:
case CLOSURE_BSDF_MICROFACET_BECKMANN_GLASS_ID: {
#ifdef __CAUSTICS_TRICKS__
- if(kernel_data.integrator.no_caustics && (path_flag & PATH_RAY_DIFFUSE))
+ if(!kernel_data.integrator.caustics_reflective &&
+ !kernel_data.integrator.caustics_refractive && (path_flag & PATH_RAY_DIFFUSE)) {
break;
+ }
#endif
/* index of refraction */
float eta = fmaxf(param2, 1e-5f);
@@ -241,12 +264,21 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
float sample_weight = sc->sample_weight;
sc = svm_node_closure_get_bsdf(sd, mix_weight*fresnel);
-
- if(sc) {
- sc->N = N;
- svm_node_glass_setup(sd, sc, type, eta, roughness, false);
+#ifdef __CAUSTICS_TRICKS__
+ if(kernel_data.integrator.caustics_reflective || (path_flag & PATH_RAY_DIFFUSE) == 0)
+#endif
+ {
+ if(sc) {
+ sc->N = N;
+ svm_node_glass_setup(sd, sc, type, eta, roughness, false);
+ }
}
+#ifdef __CAUSTICS_TRICKS__
+ if(!kernel_data.integrator.caustics_refractive && (path_flag & PATH_RAY_DIFFUSE))
+ break;
+#endif
+
/* refraction */
sc = &sd->closure[sd->num_closure];
sc->weight = weight;
@@ -261,9 +293,11 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
break;
}
- case CLOSURE_BSDF_WARD_ID: {
+ case CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID:
+ case CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID:
+ case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID: {
#ifdef __CAUSTICS_TRICKS__
- if(kernel_data.integrator.no_caustics && (path_flag & PATH_RAY_DIFFUSE))
+ if(!kernel_data.integrator.caustics_reflective && (path_flag & PATH_RAY_DIFFUSE))
break;
#endif
ShaderClosure *sc = svm_node_closure_get_bsdf(sd, mix_weight);
@@ -271,7 +305,6 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
if(sc) {
sc->N = N;
-#ifdef __ANISOTROPIC__
sc->T = stack_load_float3(stack, data_node.y);
/* rotate tangent */
@@ -293,10 +326,14 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
sc->data1 = roughness/(1.0f - anisotropy);
}
- sd->flag |= bsdf_ward_setup(sc);
-#else
- sd->flag |= bsdf_diffuse_setup(sc);
-#endif
+ sc->data2 = 0.0f;
+
+ if (type == CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID)
+ sd->flag |= bsdf_microfacet_beckmann_aniso_setup(sc);
+ else if (type == CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID)
+ sd->flag |= bsdf_microfacet_ggx_aniso_setup(sc);
+ else
+ sd->flag |= bsdf_ashikhmin_shirley_aniso_setup(sc);
}
break;
}
@@ -309,6 +346,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
/* sigma */
sc->data0 = clamp(param1, 0.0f, 1.0f);
sc->data1 = 0.0f;
+ sc->data2 = 0.0f;
sd->flag |= bsdf_ashikhmin_velvet_setup(sc);
}
break;
@@ -322,6 +360,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
sc->N = N;
sc->data0 = param1;
sc->data1 = param2;
+ sc->data2 = 0.0f;
if (type == CLOSURE_BSDF_DIFFUSE_TOON_ID)
sd->flag |= bsdf_diffuse_toon_setup(sc);
@@ -339,7 +378,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
if(sc) {
/* todo: giving a fixed weight here will cause issues when
- * mixing multiple BSDFS. energey will not be conserved and
+ * mixing multiple BSDFS. energy will not be conserved and
* the throughput can blow up after multiple bounces. we
* better figure out a way to skip backfaces from rays
* spawned by transmission from the front */
@@ -356,11 +395,11 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
sc->N = N;
sc->data0 = param1;
sc->data1 = param2;
- sc->offset = -stack_load_float(stack, data_node.z);
+ sc->data2 = -stack_load_float(stack, data_node.z);
if(!(sd->type & PRIMITIVE_ALL_CURVE)) {
sc->T = normalize(sd->dPdv);
- sc->offset = 0.0f;
+ sc->data2 = 0.0f;
}
else
sc->T = sd->dPdu;
@@ -405,6 +444,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
sc->sample_weight = sample_weight;
sc->data0 = radius.x;
sc->data1 = texture_blur;
+ sc->data2 = 0.0f;
sc->T.x = sharpness;
#ifdef __OSL__
sc->prim = NULL;
@@ -421,6 +461,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
sc->sample_weight = sample_weight;
sc->data0 = radius.y;
sc->data1 = texture_blur;
+ sc->data2 = 0.0f;
sc->T.x = sharpness;
#ifdef __OSL__
sc->prim = NULL;
@@ -437,6 +478,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
sc->sample_weight = sample_weight;
sc->data0 = radius.z;
sc->data1 = texture_blur;
+ sc->data2 = 0.0f;
sc->T.x = sharpness;
#ifdef __OSL__
sc->prim = NULL;
@@ -582,16 +624,6 @@ ccl_device void svm_node_closure_set_weight(ShaderData *sd, uint r, uint g, uint
svm_node_closure_store_weight(sd, weight);
}
-ccl_device void svm_node_emission_set_weight_total(KernelGlobals *kg, ShaderData *sd, uint r, uint g, uint b)
-{
- float3 weight = make_float3(__uint_as_float(r), __uint_as_float(g), __uint_as_float(b));
-
- if(sd->object != OBJECT_NONE)
- weight /= object_surface_area(kg, sd->object);
-
- svm_node_closure_store_weight(sd, weight);
-}
-
ccl_device void svm_node_closure_weight(ShaderData *sd, float *stack, uint weight_offset)
{
float3 weight = stack_load_float3(stack, weight_offset);
@@ -603,14 +635,10 @@ ccl_device void svm_node_emission_weight(KernelGlobals *kg, ShaderData *sd, floa
{
uint color_offset = node.y;
uint strength_offset = node.z;
- uint total_power = node.w;
float strength = stack_load_float(stack, strength_offset);
float3 weight = stack_load_float3(stack, color_offset)*strength;
- if(total_power && sd->object != OBJECT_NONE)
- weight /= object_surface_area(kg, sd->object);
-
svm_node_closure_store_weight(sd, weight);
}
diff --git a/intern/cycles/kernel/svm/svm_convert.h b/intern/cycles/kernel/svm/svm_convert.h
index 2503912c5c6..b221e0728ec 100644
--- a/intern/cycles/kernel/svm/svm_convert.h
+++ b/intern/cycles/kernel/svm/svm_convert.h
@@ -45,13 +45,13 @@ ccl_device void svm_node_convert(ShaderData *sd, float *stack, uint type, uint f
}
case NODE_CONVERT_VF: {
float3 f = stack_load_float3(stack, from);
- float g = (f.x + f.y + f.z)*(1.0f/3.0f);
+ float g = average(f);
stack_store_float(stack, to, g);
break;
}
case NODE_CONVERT_VI: {
float3 f = stack_load_float3(stack, from);
- int i = (int)((f.x + f.y + f.z)*(1.0f/3.0f));
+ int i = (int)average(f);
stack_store_int(stack, to, i);
break;
}
diff --git a/intern/cycles/kernel/svm/svm_image.h b/intern/cycles/kernel/svm/svm_image.h
index daf7c6652d2..8a256c9bda5 100644
--- a/intern/cycles/kernel/svm/svm_image.h
+++ b/intern/cycles/kernel/svm/svm_image.h
@@ -134,8 +134,8 @@ ccl_device float4 svm_image_texture(KernelGlobals *kg, int id, float x, float y,
{
#ifdef __KERNEL_CPU__
#ifdef __KERNEL_SSE2__
- __m128 r_m128;
- float4 &r = (float4 &)r_m128;
+ ssef r_ssef;
+ float4 &r = (float4 &)r_ssef;
r = kernel_tex_image_interp(id, x, y);
#else
float4 r = kernel_tex_image_interp(id, x, y);
@@ -252,9 +252,9 @@ ccl_device float4 svm_image_texture(KernelGlobals *kg, int id, float x, float y,
case 96: r = kernel_tex_image_interp(__tex_image_096, x, y); break;
case 97: r = kernel_tex_image_interp(__tex_image_097, x, y); break;
case 98: r = kernel_tex_image_interp(__tex_image_098, x, y); break;
- case 99: r = kernel_tex_image_interp(__tex_image_099, x, y); break;
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 300)
+ case 99: r = kernel_tex_image_interp(__tex_image_099, x, y); break;
case 100: r = kernel_tex_image_interp(__tex_image_100, x, y); break;
case 101: r = kernel_tex_image_interp(__tex_image_101, x, y); break;
case 102: r = kernel_tex_image_interp(__tex_image_102, x, y); break;
@@ -318,14 +318,14 @@ ccl_device float4 svm_image_texture(KernelGlobals *kg, int id, float x, float y,
float alpha = r.w;
if(use_alpha && alpha != 1.0f && alpha != 0.0f) {
- r_m128 = _mm_div_ps(r_m128, _mm_set1_ps(alpha));
+ r_ssef = r_ssef / ssef(alpha);
if(id >= TEX_NUM_FLOAT_IMAGES)
- r_m128 = _mm_min_ps(r_m128, _mm_set1_ps(1.0f));
+ r_ssef = min(r_ssef, ssef(1.0f));
r.w = alpha;
}
if(srgb) {
- r_m128 = color_srgb_to_scene_linear(r_m128);
+ r_ssef = color_srgb_to_scene_linear(r_ssef);
r.w = alpha;
}
#else
@@ -435,6 +435,10 @@ ccl_device void svm_node_tex_image_box(KernelGlobals *kg, ShaderData *sd, float
weight.z = ((2.0f - limit)*N.z + (limit - 1.0f))/(2.0f*limit - 1.0f);
}
}
+ else {
+ /* Desperate mode, no valid choice anyway, fallback to one side.*/
+ weight.x = 1.0f;
+ }
/* now fetch textures */
uint co_offset, out_offset, alpha_offset, srgb;
diff --git a/intern/cycles/kernel/svm/svm_noise.h b/intern/cycles/kernel/svm/svm_noise.h
index 91dda8972f9..c77c2a1c482 100644
--- a/intern/cycles/kernel/svm/svm_noise.h
+++ b/intern/cycles/kernel/svm/svm_noise.h
@@ -38,11 +38,11 @@ ccl_device int quick_floor(float x)
return float_to_int(x) - ((x < 0) ? 1 : 0);
}
#else
-ccl_device_inline __m128i quick_floor_sse(const __m128& x)
+ccl_device_inline ssei quick_floor_sse(const ssef& x)
{
- __m128i b = _mm_cvttps_epi32(x);
- __m128i isneg = _mm_castps_si128(_mm_cmplt_ps(x, _mm_set1_ps(0.0f)));
- return _mm_add_epi32(b, isneg); // unsaturated add 0xffffffff is the same as subtract -1
+ ssei b = truncatei(x);
+ ssei isneg = cast((x < ssef(0.0f)).m128);
+ return b + isneg; // unsaturated add 0xffffffff is the same as subtract -1
}
#endif
@@ -52,9 +52,9 @@ ccl_device float bits_to_01(uint bits)
return bits * (1.0f/(float)0xFFFFFFFF);
}
#else
-ccl_device_inline __m128 bits_to_01_sse(const __m128i& bits)
+ccl_device_inline ssef bits_to_01_sse(const ssei& bits)
{
- return _mm_mul_ps(uint32_to_float(bits), _mm_set1_ps(1.0f/(float)0xFFFFFFFF));
+ return uint32_to_float(bits) * ssef(1.0f/(float)0xFFFFFFFF);
}
#endif
@@ -88,16 +88,16 @@ ccl_device uint hash(uint kx, uint ky, uint kz)
}
#ifdef __KERNEL_SSE2__
-ccl_device_inline __m128i hash_sse(const __m128i& kx, const __m128i& ky, const __m128i& kz)
+ccl_device_inline ssei hash_sse(const ssei& kx, const ssei& ky, const ssei& kz)
{
-#define rot(x,k) _mm_or_si128(_mm_slli_epi32((x), (k)), _mm_srli_epi32((x), 32-(k)))
-#define xor_rot(a, b, c) do {a = _mm_xor_si128(a, b); a = _mm_sub_epi32(a, rot(b, c));} while(0)
+#define rot(x,k) (((x)<<(k)) | (srl(x, 32-(k))))
+#define xor_rot(a, b, c) do {a = a^b; a = a - rot(b, c);} while(0)
uint len = 3;
- __m128i magic = _mm_set1_epi32(0xdeadbeef + (len << 2) + 13);
- __m128i a = _mm_add_epi32(magic, kx);
- __m128i b = _mm_add_epi32(magic, ky);
- __m128i c = _mm_add_epi32(magic, kz);
+ ssei magic = ssei(0xdeadbeef + (len << 2) + 13);
+ ssei a = magic + kx;
+ ssei b = magic + ky;
+ ssei c = magic + kz;
xor_rot(c, b, 14);
xor_rot(a, c, 11);
@@ -133,10 +133,10 @@ ccl_device float floorfrac(float x, int* i)
return x - *i;
}
#else
-ccl_device_inline __m128 floorfrac_sse(const __m128& x, __m128i *i)
+ccl_device_inline ssef floorfrac_sse(const ssef& x, ssei *i)
{
*i = quick_floor_sse(x);
- return _mm_sub_ps(x, _mm_cvtepi32_ps(*i));
+ return x - ssef(*i);
}
#endif
@@ -146,11 +146,11 @@ ccl_device float fade(float t)
return t * t * t * (t * (t * 6.0f - 15.0f) + 10.0f);
}
#else
-ccl_device_inline __m128 fade_sse(const __m128 *t)
+ccl_device_inline ssef fade_sse(const ssef *t)
{
- __m128 a = fma(*t, _mm_set1_ps(6.0f), _mm_set1_ps(-15.0f));
- __m128 b = fma(*t, a, _mm_set1_ps(10.0f));
- return _mm_mul_ps(_mm_mul_ps(*t, *t), _mm_mul_ps(*t, b));
+ ssef a = madd(*t, ssef(6.0f), ssef(-15.0f));
+ ssef b = madd(*t, a, ssef(10.0f));
+ return ((*t) * (*t)) * ((*t) * b);
}
#endif
@@ -160,10 +160,10 @@ ccl_device float nerp(float t, float a, float b)
return (1.0f - t) * a + t * b;
}
#else
-ccl_device_inline __m128 nerp_sse(const __m128& t, const __m128& a, const __m128& b)
+ccl_device_inline ssef nerp_sse(const ssef& t, const ssef& a, const ssef& b)
{
- __m128 x1 = _mm_mul_ps(_mm_sub_ps(_mm_set1_ps(1.0f), t), a);
- return fma(t, b, x1);
+ ssef x1 = (ssef(1.0f) - t) * a;
+ return madd(t, b, x1);
}
#endif
@@ -178,35 +178,35 @@ ccl_device float grad(int hash, float x, float y, float z)
return ((h&1) ? -u : u) + ((h&2) ? -v : v);
}
#else
-ccl_device_inline __m128 grad_sse(const __m128i& hash, const __m128& x, const __m128& y, const __m128& z)
+ccl_device_inline ssef grad_sse(const ssei& hash, const ssef& x, const ssef& y, const ssef& z)
{
- __m128i c1 = _mm_set1_epi32(1);
- __m128i c2 = _mm_set1_epi32(2);
+ ssei c1 = ssei(1);
+ ssei c2 = ssei(2);
- __m128i h = _mm_and_si128(hash, _mm_set1_epi32(15)); // h = hash & 15
+ ssei h = hash & ssei(15); // h = hash & 15
- __m128i case_ux = _mm_cmplt_epi32(h, _mm_set1_epi32(8)); // 0xffffffff if h < 8 else 0
+ sseb case_ux = h < ssei(8); // 0xffffffff if h < 8 else 0
- __m128 u = blend(_mm_castsi128_ps(case_ux), x, y); // u = h<8 ? x : y
+ ssef u = select(case_ux, x, y); // u = h<8 ? x : y
- __m128i case_vy = _mm_cmplt_epi32(h, _mm_set1_epi32(4)); // 0xffffffff if h < 4 else 0
+ sseb case_vy = h < ssei(4); // 0xffffffff if h < 4 else 0
- __m128i case_h12 = _mm_cmpeq_epi32(h, _mm_set1_epi32(12)); // 0xffffffff if h == 12 else 0
- __m128i case_h14 = _mm_cmpeq_epi32(h, _mm_set1_epi32(14)); // 0xffffffff if h == 14 else 0
+ sseb case_h12 = h == ssei(12); // 0xffffffff if h == 12 else 0
+ sseb case_h14 = h == ssei(14); // 0xffffffff if h == 14 else 0
- __m128i case_vx = _mm_or_si128(case_h12, case_h14); // 0xffffffff if h == 12 or h == 14 else 0
+ sseb case_vx = case_h12 | case_h14; // 0xffffffff if h == 12 or h == 14 else 0
- __m128 v = blend(_mm_castsi128_ps(case_vy), y, blend(_mm_castsi128_ps(case_vx), x, z)); // v = h<4 ? y : h == 12 || h == 14 ? x : z
+ ssef v = select(case_vy, y, select(case_vx, x, z)); // v = h<4 ? y : h == 12 || h == 14 ? x : z
- __m128i case_uneg = _mm_slli_epi32(_mm_and_si128(h, c1), 31); // 1<<31 if h&1 else 0
- __m128 case_uneg_mask = _mm_castsi128_ps(case_uneg); // -0.0 if h&1 else +0.0
- __m128 ru = _mm_xor_ps(u, case_uneg_mask); // -u if h&1 else u (copy float sign)
+ ssei case_uneg = (h & c1) << 31; // 1<<31 if h&1 else 0
+ ssef case_uneg_mask = cast(case_uneg); // -0.0 if h&1 else +0.0
+ ssef ru = u ^ case_uneg_mask; // -u if h&1 else u (copy float sign)
- __m128i case_vneg = _mm_slli_epi32(_mm_and_si128(h, c2), 30); // 2<<30 if h&2 else 0
- __m128 case_vneg_mask = _mm_castsi128_ps(case_vneg); // -0.0 if h&2 else +0.0
- __m128 rv = _mm_xor_ps(v, case_vneg_mask); // -v if h&2 else v (copy float sign)
+ ssei case_vneg = (h & c2) << 30; // 2<<30 if h&2 else 0
+ ssef case_vneg_mask = cast(case_vneg); // -0.0 if h&2 else +0.0
+ ssef rv = v ^ case_vneg_mask; // -v if h&2 else v (copy float sign)
- __m128 r = _mm_add_ps(ru, rv); // ((h&1) ? -u : u) + ((h&2) ? -v : v)
+ ssef r = ru + rv; // ((h&1) ? -u : u) + ((h&2) ? -v : v)
return r;
}
#endif
@@ -217,9 +217,9 @@ ccl_device float scale3(float result)
return 0.9820f * result;
}
#else
-ccl_device_inline __m128 scale3_sse(const __m128& result)
+ccl_device_inline ssef scale3_sse(const ssef& result)
{
- return _mm_mul_ps(_mm_set1_ps(0.9820f), result);
+ return ssef(0.9820f) * result;
}
#endif
@@ -252,75 +252,41 @@ ccl_device_noinline float perlin(float x, float y, float z)
#else
ccl_device_noinline float perlin(float x, float y, float z)
{
- __m128 xyz = _mm_setr_ps(x, y, z, 0.0f);
- __m128i XYZ;
+ ssef xyz = ssef(x, y, z, 0.0f);
+ ssei XYZ;
- __m128 fxyz = floorfrac_sse(xyz, &XYZ);
+ ssef fxyz = floorfrac_sse(xyz, &XYZ);
- __m128 uvw = fade_sse(&fxyz);
- __m128 u = broadcast<0>(uvw), v = broadcast<1>(uvw), w = broadcast<2>(uvw);
+ ssef uvw = fade_sse(&fxyz);
+ ssef u = shuffle<0>(uvw), v = shuffle<1>(uvw), w = shuffle<2>(uvw);
- __m128i XYZ_ofc = _mm_add_epi32(XYZ, _mm_set1_epi32(1));
- __m128i vdy = shuffle<1, 1, 1, 1>(XYZ, XYZ_ofc); // +0, +0, +1, +1
- __m128i vdz = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(XYZ, XYZ_ofc)); // +0, +1, +0, +1
+ ssei XYZ_ofc = XYZ + ssei(1);
+ ssei vdy = shuffle<1, 1, 1, 1>(XYZ, XYZ_ofc); // +0, +0, +1, +1
+ ssei vdz = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(XYZ, XYZ_ofc)); // +0, +1, +0, +1
- __m128i h1 = hash_sse(broadcast<0>(XYZ), vdy, vdz); // hash directions 000, 001, 010, 011
- __m128i h2 = hash_sse(broadcast<0>(XYZ_ofc), vdy, vdz); // hash directions 100, 101, 110, 111
+ ssei h1 = hash_sse(shuffle<0>(XYZ), vdy, vdz); // hash directions 000, 001, 010, 011
+ ssei h2 = hash_sse(shuffle<0>(XYZ_ofc), vdy, vdz); // hash directions 100, 101, 110, 111
- __m128 fxyz_ofc = _mm_sub_ps(fxyz, _mm_set1_ps(1.0f));
- __m128 vfy = shuffle<1, 1, 1, 1>(fxyz, fxyz_ofc);
- __m128 vfz = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(fxyz, fxyz_ofc));
+ ssef fxyz_ofc = fxyz - ssef(1.0f);
+ ssef vfy = shuffle<1, 1, 1, 1>(fxyz, fxyz_ofc);
+ ssef vfz = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(fxyz, fxyz_ofc));
- __m128 g1 = grad_sse(h1, broadcast<0>(fxyz), vfy, vfz);
- __m128 g2 = grad_sse(h2, broadcast<0>(fxyz_ofc), vfy, vfz);
- __m128 n1 = nerp_sse(u, g1, g2);
+ ssef g1 = grad_sse(h1, shuffle<0>(fxyz), vfy, vfz);
+ ssef g2 = grad_sse(h2, shuffle<0>(fxyz_ofc), vfy, vfz);
+ ssef n1 = nerp_sse(u, g1, g2);
- __m128 n1_half = shuffle<2, 3, 2, 3>(n1); // extract 2 floats to a separate vector
- __m128 n2 = nerp_sse(v, n1, n1_half); // process nerp([a, b, _, _], [c, d, _, _]) -> [a', b', _, _]
+ ssef n1_half = shuffle<2, 3, 2, 3>(n1); // extract 2 floats to a separate vector
+ ssef n2 = nerp_sse(v, n1, n1_half); // process nerp([a, b, _, _], [c, d, _, _]) -> [a', b', _, _]
- __m128 n2_second = broadcast<1>(n2); // extract b to a separate vector
- __m128 result = nerp_sse(w, n2, n2_second); // process nerp([a', _, _, _], [b', _, _, _]) -> [a'', _, _, _]
+ ssef n2_second = shuffle<1>(n2); // extract b to a separate vector
+ ssef result = nerp_sse(w, n2, n2_second); // process nerp([a', _, _, _], [b', _, _, _]) -> [a'', _, _, _]
- __m128 r = scale3_sse(result);
+ ssef r = scale3_sse(result);
- __m128 infmask = _mm_castsi128_ps(_mm_set1_epi32(0x7f800000));
- __m128 rinfmask = _mm_cmpeq_ps(_mm_and_ps(r, infmask), infmask); // 0xffffffff if r is inf/-inf/nan else 0
- __m128 rfinite = _mm_andnot_ps(rinfmask, r); // 0 if r is inf/-inf/nan else r
- return _mm_cvtss_f32(rfinite);
-}
-#endif
-
-#if 0 // unused
-ccl_device_noinline float perlin_periodic(float x, float y, float z, float3 pperiod)
-{
- int X; float fx = floorfrac(x, &X);
- int Y; float fy = floorfrac(y, &Y);
- int Z; float fz = floorfrac(z, &Z);
-
- int3 p;
-
- p.x = max(quick_floor(pperiod.x), 1);
- p.y = max(quick_floor(pperiod.y), 1);
- p.z = max(quick_floor(pperiod.z), 1);
-
- float u = fade(fx);
- float v = fade(fy);
- float w = fade(fz);
-
- float result;
-
- result = nerp (w, nerp (v, nerp (u, grad (phash (X , Y , Z , p), fx , fy , fz ),
- grad (phash (X+1, Y , Z , p), fx-1.0f, fy , fz )),
- nerp (u, grad (phash (X , Y+1, Z , p), fx , fy-1.0f, fz ),
- grad (phash (X+1, Y+1, Z , p), fx-1.0f, fy-1.0f, fz ))),
- nerp (v, nerp (u, grad (phash (X , Y , Z+1, p), fx , fy , fz-1.0f ),
- grad (phash (X+1, Y , Z+1, p), fx-1.0f, fy , fz-1.0f )),
- nerp (u, grad (phash (X , Y+1, Z+1, p), fx , fy-1.0f, fz-1.0f ),
- grad (phash (X+1, Y+1, Z+1, p), fx-1.0f, fy-1.0f, fz-1.0f ))));
- float r = scale3(result);
-
- /* can happen for big coordinates, things even out to 0.0 then anyway */
- return (isfinite(r))? r: 0.0f;
+ ssef infmask = cast(ssei(0x7f800000));
+ ssef rinfmask = ((r & infmask) == infmask).m128; // 0xffffffff if r is inf/-inf/nan else 0
+ ssef rfinite = andnot(rinfmask, r); // 0 if r is inf/-inf/nan else r
+ return extract<0>(rfinite);
}
#endif
@@ -357,30 +323,15 @@ ccl_device float3 cellnoise_color(float3 p)
return make_float3(r, g, b);
}
#else
-ccl_device __m128 cellnoise_color(const __m128& p)
+ccl_device ssef cellnoise_color(const ssef& p)
{
- __m128i ip = quick_floor_sse(p);
- __m128i ip_yxz = shuffle<1, 0, 2, 3>(ip);
- __m128i ip_xyy = shuffle<0, 1, 1, 3>(ip);
- __m128i ip_zzx = shuffle<2, 2, 0, 3>(ip);
+ ssei ip = quick_floor_sse(p);
+ ssei ip_yxz = shuffle<1, 0, 2, 3>(ip);
+ ssei ip_xyy = shuffle<0, 1, 1, 3>(ip);
+ ssei ip_zzx = shuffle<2, 2, 0, 3>(ip);
return bits_to_01_sse(hash_sse(ip_xyy, ip_yxz, ip_zzx));
}
#endif
-#if 0 // unused
-/* periodic perlin noise in range 0..1 */
-ccl_device float pnoise(float3 p, float3 pperiod)
-{
- float r = perlin_periodic(p.x, p.y, p.z, pperiod);
- return 0.5f*r + 0.5f;
-}
-
-/* periodic perlin noise in range -1..1 */
-ccl_device float psnoise(float3 p, float3 pperiod)
-{
- return perlin_periodic(p.x, p.y, p.z, pperiod);
-}
-#endif
-
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/svm/svm_sepcomb_rgb.h b/intern/cycles/kernel/svm/svm_sepcomb_rgb.h
deleted file mode 100644
index 34c4449ecdb..00000000000
--- a/intern/cycles/kernel/svm/svm_sepcomb_rgb.h
+++ /dev/null
@@ -1,42 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License
- */
-
-CCL_NAMESPACE_BEGIN
-
-ccl_device void svm_node_combine_rgb(ShaderData *sd, float *stack, uint in_offset, uint color_index, uint out_offset)
-{
- float color = stack_load_float(stack, in_offset);
-
- if (stack_valid(out_offset))
- stack_store_float(stack, out_offset+color_index, color);
-}
-
-ccl_device void svm_node_separate_rgb(ShaderData *sd, float *stack, uint icolor_offset, uint color_index, uint out_offset)
-{
- float3 color = stack_load_float3(stack, icolor_offset);
-
- if (stack_valid(out_offset)) {
- if (color_index == 0)
- stack_store_float(stack, out_offset, color.x);
- else if (color_index == 1)
- stack_store_float(stack, out_offset, color.y);
- else
- stack_store_float(stack, out_offset, color.z);
- }
-}
-
-CCL_NAMESPACE_END
-
diff --git a/intern/cycles/kernel/svm/svm_sepcomb_vector.h b/intern/cycles/kernel/svm/svm_sepcomb_vector.h
new file mode 100644
index 00000000000..c8e7e34f87d
--- /dev/null
+++ b/intern/cycles/kernel/svm/svm_sepcomb_vector.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright 2011-2014 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License
+ */
+
+CCL_NAMESPACE_BEGIN
+
+/* Vector combine / separate, used for the RGB and XYZ nodes */
+
+/* Load one float from the stack at in_offset and, when out_offset is a valid
+ * stack location, store it at out_offset + vector_index.
+ */
+ccl_device void svm_node_combine_vector(ShaderData *sd, float *stack, uint in_offset, uint vector_index, uint out_offset)
+{
+	/* The input is read before the output offset is checked. */
+	const float component = stack_load_float(stack, in_offset);
+
+	if(!stack_valid(out_offset))
+		return;
+
+	stack_store_float(stack, out_offset + vector_index, component);
+}
+
+/* Load a float3 from the stack at ivector_offset and, when out_offset is a
+ * valid stack location, store one of its components there: x for index 0,
+ * y for index 1 and z for any other index.
+ */
+ccl_device void svm_node_separate_vector(ShaderData *sd, float *stack, uint ivector_offset, uint vector_index, uint out_offset)
+{
+	const float3 source = stack_load_float3(stack, ivector_offset);
+
+	if(!stack_valid(out_offset))
+		return;
+
+	float component;
+
+	if(vector_index == 0)
+		component = source.x;
+	else if(vector_index == 1)
+		component = source.y;
+	else
+		component = source.z;
+
+	stack_store_float(stack, out_offset, component);
+}
+
+CCL_NAMESPACE_END
+
diff --git a/intern/cycles/kernel/svm/svm_texture.h b/intern/cycles/kernel/svm/svm_texture.h
index 5fd9204cbf6..d97c85db36a 100644
--- a/intern/cycles/kernel/svm/svm_texture.h
+++ b/intern/cycles/kernel/svm/svm_texture.h
@@ -140,15 +140,15 @@ ccl_device float voronoi_F1_distance(float3 p)
}
}
#else
- __m128 vec_p = load_m128(p);
- __m128i xyzi = quick_floor_sse(vec_p);
+ ssef vec_p = load4f(p);
+ ssei xyzi = quick_floor_sse(vec_p);
for (int xx = -1; xx <= 1; xx++) {
for (int yy = -1; yy <= 1; yy++) {
for (int zz = -1; zz <= 1; zz++) {
- __m128 ip = _mm_cvtepi32_ps(_mm_add_epi32(xyzi, _mm_setr_epi32(xx, yy, zz, 0)));
- __m128 vp = _mm_add_ps(ip, cellnoise_color(ip));
- float d = len_squared<1, 1, 1, 0>(_mm_sub_ps(vec_p, vp));
+ ssef ip = ssef(xyzi + ssei(xx, yy, zz, 0));
+ ssef vp = ip + cellnoise_color(ip);
+ float d = len_squared<1, 1, 1, 0>(vec_p - vp);
da = min(d, da);
}
}
@@ -184,15 +184,15 @@ ccl_device float3 voronoi_F1_color(float3 p)
return cellnoise_color(pa);
#else
- __m128 pa, vec_p = load_m128(p);
- __m128i xyzi = quick_floor_sse(vec_p);
+ ssef pa, vec_p = load4f(p);
+ ssei xyzi = quick_floor_sse(vec_p);
for (int xx = -1; xx <= 1; xx++) {
for (int yy = -1; yy <= 1; yy++) {
for (int zz = -1; zz <= 1; zz++) {
- __m128 ip = _mm_cvtepi32_ps(_mm_add_epi32(xyzi, _mm_setr_epi32(xx, yy, zz, 0)));
- __m128 vp = _mm_add_ps(ip, cellnoise_color(ip));
- float d = len_squared<1, 1, 1, 0>(_mm_sub_ps(vec_p, vp));
+ ssef ip = ssef(xyzi + ssei(xx, yy, zz, 0));
+ ssef vp = ip + cellnoise_color(ip);
+ float d = len_squared<1, 1, 1, 0>(vec_p - vp);
if(d < da) {
da = d;
@@ -202,7 +202,7 @@ ccl_device float3 voronoi_F1_color(float3 p)
}
}
- __m128 color = cellnoise_color(pa);
+ ssef color = cellnoise_color(pa);
return (float3 &)color;
#endif
}
diff --git a/intern/cycles/kernel/svm/svm_types.h b/intern/cycles/kernel/svm/svm_types.h
index 80972ec82bc..fbe669c1fab 100644
--- a/intern/cycles/kernel/svm/svm_types.h
+++ b/intern/cycles/kernel/svm/svm_types.h
@@ -72,15 +72,14 @@ typedef enum NodeType {
NODE_TEX_COORD,
NODE_TEX_COORD_BUMP_DX,
NODE_TEX_COORD_BUMP_DY,
- NODE_EMISSION_SET_WEIGHT_TOTAL,
NODE_ATTR_BUMP_DX,
NODE_ATTR_BUMP_DY,
NODE_TEX_ENVIRONMENT,
NODE_CLOSURE_HOLDOUT,
NODE_LAYER_WEIGHT,
NODE_CLOSURE_VOLUME,
- NODE_SEPARATE_RGB,
- NODE_COMBINE_RGB,
+ NODE_SEPARATE_VECTOR,
+ NODE_COMBINE_VECTOR,
NODE_SEPARATE_HSV,
NODE_COMBINE_HSV,
NODE_HSV,
@@ -349,7 +348,6 @@ typedef enum ClosureType {
/* Diffuse */
CLOSURE_BSDF_DIFFUSE_ID,
CLOSURE_BSDF_OREN_NAYAR_ID,
- CLOSURE_BSDF_WESTIN_SHEEN_ID,
CLOSURE_BSDF_DIFFUSE_RAMP_ID,
CLOSURE_BSDF_DIFFUSE_TOON_ID,
@@ -358,9 +356,11 @@ typedef enum ClosureType {
CLOSURE_BSDF_REFLECTION_ID,
CLOSURE_BSDF_MICROFACET_GGX_ID,
CLOSURE_BSDF_MICROFACET_BECKMANN_ID,
- CLOSURE_BSDF_WARD_ID,
+ CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ID,
+ CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID,
+ CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID,
+ CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID,
CLOSURE_BSDF_ASHIKHMIN_VELVET_ID,
- CLOSURE_BSDF_WESTIN_BACKSCATTER_ID,
CLOSURE_BSDF_PHONG_RAMP_ID,
CLOSURE_BSDF_GLOSSY_TOON_ID,
CLOSURE_BSDF_HAIR_REFLECTION_ID,
@@ -404,7 +404,7 @@ typedef enum ClosureType {
#define CLOSURE_IS_BSDF_GLOSSY(type) (type >= CLOSURE_BSDF_GLOSSY_ID && type <= CLOSURE_BSDF_HAIR_REFLECTION_ID)
#define CLOSURE_IS_BSDF_TRANSMISSION(type) (type >= CLOSURE_BSDF_TRANSMISSION_ID && type <= CLOSURE_BSDF_HAIR_TRANSMISSION_ID)
#define CLOSURE_IS_BSDF_BSSRDF(type) (type == CLOSURE_BSDF_BSSRDF_ID)
-#define CLOSURE_IS_BSDF_ANISOTROPIC(type) (type == CLOSURE_BSDF_WARD_ID)
+#define CLOSURE_IS_BSDF_ANISOTROPIC(type) (type >= CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID && type <= CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID)
#define CLOSURE_IS_BSDF_OR_BSSRDF(type) (type <= CLOSURE_BSSRDF_GAUSSIAN_ID)
#define CLOSURE_IS_BSSRDF(type) (type >= CLOSURE_BSSRDF_CUBIC_ID && type <= CLOSURE_BSSRDF_GAUSSIAN_ID)
#define CLOSURE_IS_VOLUME(type) (type >= CLOSURE_VOLUME_ID && type <= CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID)
diff --git a/intern/cycles/render/CMakeLists.txt b/intern/cycles/render/CMakeLists.txt
index 449c1391980..c3907da39d0 100644
--- a/intern/cycles/render/CMakeLists.txt
+++ b/intern/cycles/render/CMakeLists.txt
@@ -7,6 +7,7 @@ set(INC
../kernel/osl
../bvh
../util
+ ../../glew-mx
)
set(INC_SYS
@@ -76,5 +77,6 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${RTTI_DISABLE_FLAGS}")
include_directories(${INC})
include_directories(SYSTEM ${INC_SYS})
-add_library(cycles_render ${SRC} ${SRC_HEADERS})
+add_definitions(${GL_DEFINITIONS})
+add_library(cycles_render ${SRC} ${SRC_HEADERS})
diff --git a/intern/cycles/render/attribute.cpp b/intern/cycles/render/attribute.cpp
index 14805b6f11a..8abf869a775 100644
--- a/intern/cycles/render/attribute.cpp
+++ b/intern/cycles/render/attribute.cpp
@@ -69,6 +69,15 @@ void Attribute::add(const float& f)
buffer.push_back(data[i]);
}
+/* Append the raw bytes of one uchar4 value to the end of `buffer`. */
+void Attribute::add(const uchar4& f)
+{
+	const char *bytes = (const char*)&f;
+
+	for(size_t offset = 0; offset < sizeof(f); offset++)
+		buffer.push_back(bytes[offset]);
+}
+
void Attribute::add(const float3& f)
{
char *data = (char*)&f;
@@ -136,6 +145,7 @@ size_t Attribute::element_size(int numverts, int numtris, int numsteps, int numc
size = numtris;
break;
case ATTR_ELEMENT_CORNER:
+ case ATTR_ELEMENT_CORNER_BYTE:
size = numtris*3;
break;
case ATTR_ELEMENT_CURVE:
@@ -263,11 +273,19 @@ Attribute *AttributeSet::add(ustring name, TypeDesc type, AttributeElement eleme
remove(name);
}
- attributes.push_back(Attribute());
+#if __cplusplus >= 201103L
+ attributes.emplace_back();
attr = &attributes.back();
-
attr->set(name, type, element);
-
+#else
+ {
+ Attribute attr_temp;
+ attr_temp.set(name, type, element);
+ attributes.push_back(attr_temp);
+ attr = &attributes.back();
+ }
+#endif
+
/* this is weak .. */
if(triangle_mesh)
attr->reserve(triangle_mesh->verts.size(), triangle_mesh->triangles.size(), triangle_mesh->motion_steps, 0, 0, resize);
diff --git a/intern/cycles/render/attribute.h b/intern/cycles/render/attribute.h
index 9fc32db8444..f5227ebde52 100644
--- a/intern/cycles/render/attribute.h
+++ b/intern/cycles/render/attribute.h
@@ -68,6 +68,7 @@ public:
float3 *data_float3() { return (float3*)data(); }
float4 *data_float4() { return (float4*)data(); }
float *data_float() { return (float*)data(); }
+ uchar4 *data_uchar4() { return (uchar4*)data(); }
Transform *data_transform() { return (Transform*)data(); }
VoxelAttribute *data_voxel() { return ( VoxelAttribute*)data(); }
@@ -80,6 +81,7 @@ public:
void add(const float& f);
void add(const float3& f);
+ void add(const uchar4& f);
void add(const Transform& f);
void add(const VoxelAttribute& f);
void add(const char *data);
diff --git a/intern/cycles/render/background.cpp b/intern/cycles/render/background.cpp
index a877c52fbed..3926ecb99d6 100644
--- a/intern/cycles/render/background.cpp
+++ b/intern/cycles/render/background.cpp
@@ -78,6 +78,8 @@ void Background::device_update(Device *device, DeviceScene *dscene, Scene *scene
kbackground->surface_shader |= SHADER_EXCLUDE_GLOSSY;
if(!(visibility & PATH_RAY_TRANSMIT))
kbackground->surface_shader |= SHADER_EXCLUDE_TRANSMIT;
+ if(!(visibility & PATH_RAY_VOLUME_SCATTER))
+ kbackground->surface_shader |= SHADER_EXCLUDE_SCATTER;
if(!(visibility & PATH_RAY_CAMERA))
kbackground->surface_shader |= SHADER_EXCLUDE_CAMERA;
diff --git a/intern/cycles/render/bake.cpp b/intern/cycles/render/bake.cpp
index aa317ab672f..5723a22dd84 100644
--- a/intern/cycles/render/bake.cpp
+++ b/intern/cycles/render/bake.cpp
@@ -15,10 +15,11 @@
*/
#include "bake.h"
+#include "integrator.h"
CCL_NAMESPACE_BEGIN
-BakeData::BakeData(const int object, const int tri_offset, const int num_pixels):
+BakeData::BakeData(const int object, const size_t tri_offset, const size_t num_pixels):
m_object(object),
m_tri_offset(tri_offset),
m_num_pixels(num_pixels)
@@ -59,7 +60,7 @@ int BakeData::object()
return m_object;
}
-int BakeData::size()
+size_t BakeData::size()
{
return m_num_pixels;
}
@@ -94,6 +95,7 @@ BakeManager::BakeManager()
m_bake_data = NULL;
m_is_baking = false;
need_update = true;
+ m_shader_limit = 512 * 512;
}
BakeManager::~BakeManager()
@@ -112,75 +114,105 @@ void BakeManager::set_baking(const bool value)
m_is_baking = value;
}
-BakeData *BakeManager::init(const int object, const int tri_offset, const int num_pixels)
+BakeData *BakeManager::init(const int object, const size_t tri_offset, const size_t num_pixels)
{
m_bake_data = new BakeData(object, tri_offset, num_pixels);
return m_bake_data;
}
+/* Set the maximum number of pixels evaluated per shader task to x * y,
+ * rounded up to the next power of two.
+ *
+ * The rounding is done with integers: going through log()/pow() loses
+ * precision for large values and turns a zero area into a limit of zero.
+ * bake() advances through the pixels in steps of m_shader_limit, so the
+ * limit is never allowed to drop below 1.
+ */
+void BakeManager::set_shader_limit(const size_t x, const size_t y)
+{
+	const size_t area = x * y;
+	const size_t max_doubling = ((size_t)-1) / 2;
+	size_t limit = 1;
+
+	/* Stop doubling before the multiplication could wrap around. */
+	while(limit < area && limit <= max_doubling)
+		limit *= 2;
+
+	m_shader_limit = limit;
+}
+
bool BakeManager::bake(Device *device, DeviceScene *dscene, Scene *scene, Progress& progress, ShaderEvalType shader_type, BakeData *bake_data, float result[])
{
- size_t limit = bake_data->size();
+ size_t num_pixels = bake_data->size();
+
+ progress.reset_sample();
+ this->num_parts = 0;
+
+ /* calculate the total parts for the progress bar */
+ for(size_t shader_offset = 0; shader_offset < num_pixels; shader_offset += m_shader_limit) {
+ size_t shader_size = (size_t)fminf(num_pixels - shader_offset, m_shader_limit);
- /* setup input for device task */
- device_vector<uint4> d_input;
- uint4 *d_input_data = d_input.resize(limit * 2);
- size_t d_input_size = 0;
+ DeviceTask task(DeviceTask::SHADER);
+ task.shader_w = shader_size;
- for(size_t i = 0; i < limit; i++) {
- d_input_data[d_input_size++] = bake_data->data(i);
- d_input_data[d_input_size++] = bake_data->differentials(i);
+ this->num_parts += device->get_split_task_count(task);
}
- if(d_input_size == 0)
- return false;
+ this->num_samples = is_aa_pass(shader_type)? scene->integrator->aa_samples : 1;
- /* run device task */
- device_vector<float4> d_output;
- d_output.resize(limit);
+ for(size_t shader_offset = 0; shader_offset < num_pixels; shader_offset += m_shader_limit) {
+ size_t shader_size = (size_t)fminf(num_pixels - shader_offset, m_shader_limit);
- /* needs to be up to data for attribute access */
- device->const_copy_to("__data", &dscene->data, sizeof(dscene->data));
+ /* setup input for device task */
+ device_vector<uint4> d_input;
+ uint4 *d_input_data = d_input.resize(shader_size * 2);
+ size_t d_input_size = 0;
- device->mem_alloc(d_input, MEM_READ_ONLY);
- device->mem_copy_to(d_input);
- device->mem_alloc(d_output, MEM_WRITE_ONLY);
+ for(size_t i = shader_offset; i < (shader_offset + shader_size); i++) {
+ d_input_data[d_input_size++] = bake_data->data(i);
+ d_input_data[d_input_size++] = bake_data->differentials(i);
+ }
- DeviceTask task(DeviceTask::SHADER);
- task.shader_input = d_input.device_pointer;
- task.shader_output = d_output.device_pointer;
- task.shader_eval_type = shader_type;
- task.shader_x = 0;
- task.shader_w = d_output.size();
- task.get_cancel = function_bind(&Progress::get_cancel, &progress);
+ if(d_input_size == 0) {
+ m_is_baking = false;
+ return false;
+ }
- device->task_add(task);
- device->task_wait();
+ /* run device task */
+ device_vector<float4> d_output;
+ d_output.resize(shader_size);
+
+ /* needs to be up to date for attribute access */
+ device->const_copy_to("__data", &dscene->data, sizeof(dscene->data));
+
+ device->mem_alloc(d_input, MEM_READ_ONLY);
+ device->mem_copy_to(d_input);
+ device->mem_alloc(d_output, MEM_WRITE_ONLY);
+
+ DeviceTask task(DeviceTask::SHADER);
+ task.shader_input = d_input.device_pointer;
+ task.shader_output = d_output.device_pointer;
+ task.shader_eval_type = shader_type;
+ task.shader_x = 0;
+ task.offset = shader_offset;
+ task.shader_w = d_output.size();
+ task.num_samples = this->num_samples;
+ task.get_cancel = function_bind(&Progress::get_cancel, &progress);
+ task.update_progress_sample = function_bind(&Progress::increment_sample_update, &progress);
+
+ device->task_add(task);
+ device->task_wait();
+
+ if(progress.get_cancel()) {
+ device->mem_free(d_input);
+ device->mem_free(d_output);
+ m_is_baking = false;
+ return false;
+ }
- if(progress.get_cancel()) {
+ device->mem_copy_from(d_output, 0, 1, d_output.size(), sizeof(float4));
device->mem_free(d_input);
device->mem_free(d_output);
- m_is_baking = false;
- return false;
- }
- device->mem_copy_from(d_output, 0, 1, d_output.size(), sizeof(float4));
- device->mem_free(d_input);
- device->mem_free(d_output);
+ /* read result */
+ int k = 0;
- /* read result */
- int k = 0;
+ float4 *offset = (float4*)d_output.data_pointer;
- float4 *offset = (float4*)d_output.data_pointer;
+ size_t depth = 4;
+ for(size_t i=shader_offset; i < (shader_offset + shader_size); i++) {
+ size_t index = i * depth;
+ float4 out = offset[k++];
- size_t depth = 4;
- for(size_t i = 0; i < limit; i++) {
- size_t index = i * depth;
- float4 out = offset[k++];
-
- if(bake_data->is_valid(i)) {
- for(size_t j=0; j < 4; j++) {
- result[index + j] = out[j];
+ if(bake_data->is_valid(i)) {
+ for(size_t j=0; j < 4; j++) {
+ result[index + j] = out[j];
+ }
}
}
}
@@ -203,4 +235,35 @@ void BakeManager::device_free(Device *device, DeviceScene *dscene)
{
}
+/* Returns false for the UV and normal pass types and true for every other
+ * pass type.
+ */
+bool BakeManager::is_aa_pass(ShaderEvalType type)
+{
+	const bool skips_aa = (type == SHADER_EVAL_UV || type == SHADER_EVAL_NORMAL);
+
+	return !skips_aa;
+}
+
+/* Returns true for the AO, combined and shadow pass types and for the direct
+ * and indirect diffuse, glossy, transmission and subsurface pass types;
+ * false for every other pass type.
+ */
+bool BakeManager::is_light_pass(ShaderEvalType type)
+{
+	bool uses_light = false;
+
+	switch(type) {
+		case SHADER_EVAL_AO:
+		case SHADER_EVAL_COMBINED:
+		case SHADER_EVAL_SHADOW:
+		case SHADER_EVAL_DIFFUSE_DIRECT:
+		case SHADER_EVAL_GLOSSY_DIRECT:
+		case SHADER_EVAL_TRANSMISSION_DIRECT:
+		case SHADER_EVAL_SUBSURFACE_DIRECT:
+		case SHADER_EVAL_DIFFUSE_INDIRECT:
+		case SHADER_EVAL_GLOSSY_INDIRECT:
+		case SHADER_EVAL_TRANSMISSION_INDIRECT:
+		case SHADER_EVAL_SUBSURFACE_INDIRECT:
+			uses_light = true;
+			break;
+		default:
+			break;
+	}
+
+	return uses_light;
+}
+
CCL_NAMESPACE_END
diff --git a/intern/cycles/render/bake.h b/intern/cycles/render/bake.h
index ea403f7d39a..186fbbeea4d 100644
--- a/intern/cycles/render/bake.h
+++ b/intern/cycles/render/bake.h
@@ -17,29 +17,30 @@
#ifndef __BAKE_H__
#define __BAKE_H__
-#include "util_vector.h"
#include "device.h"
#include "scene.h"
-#include "session.h"
+
+#include "util_progress.h"
+#include "util_vector.h"
CCL_NAMESPACE_BEGIN
class BakeData {
public:
- BakeData(const int object, const int tri_offset, const int num_pixels);
+ BakeData(const int object, const size_t tri_offset, const size_t num_pixels);
~BakeData();
void set(int i, int prim, float uv[2], float dudx, float dudy, float dvdx, float dvdy);
int object();
- int size();
+ size_t size();
uint4 data(int i);
uint4 differentials(int i);
bool is_valid(int i);
private:
int m_object;
- int m_tri_offset;
- int m_num_pixels;
+ size_t m_tri_offset;
+ size_t m_num_pixels;
vector<int>m_primitive;
vector<float>m_u;
vector<float>m_v;
@@ -57,18 +58,27 @@ public:
bool get_baking();
void set_baking(const bool value);
- BakeData *init(const int object, const int tri_offset, const int num_pixels);
+ BakeData *init(const int object, const size_t tri_offset, const size_t num_pixels);
+
+ void set_shader_limit(const size_t x, const size_t y);
bool bake(Device *device, DeviceScene *dscene, Scene *scene, Progress& progress, ShaderEvalType shader_type, BakeData *bake_data, float result[]);
void device_update(Device *device, DeviceScene *dscene, Scene *scene, Progress& progress);
void device_free(Device *device, DeviceScene *dscene);
+ static bool is_light_pass(ShaderEvalType type);
+ static bool is_aa_pass(ShaderEvalType type);
+
bool need_update;
+ int num_samples;
+ int num_parts;
+
private:
BakeData *m_bake_data;
bool m_is_baking;
+ size_t m_shader_limit;
};
CCL_NAMESPACE_END
diff --git a/intern/cycles/render/blackbody.cpp b/intern/cycles/render/blackbody.cpp
index 89af714e8ec..6e2cb7c62b6 100644
--- a/intern/cycles/render/blackbody.cpp
+++ b/intern/cycles/render/blackbody.cpp
@@ -100,7 +100,7 @@ vector<float> blackbody_table()
/* ToDo: bring this back to what OSL does with the lastTemperature limit ? */
for (int i = 0; i <= 317; ++i) {
- double Temperature = pow((double)i, (double)BB_TABLE_XPOWER) * (double)BB_TABLE_SPACING + (double)BB_DRAPPER;
+ double Temperature = pow((double)i, (double)BB_TABLE_XPOWER) * (double)BB_TABLE_SPACING + (double)BB_DRAPER;
X = 0;
Y = 0;
Z = 0;
diff --git a/intern/cycles/render/buffers.cpp b/intern/cycles/render/buffers.cpp
index fc65922fc87..756e16b38b5 100644
--- a/intern/cycles/render/buffers.cpp
+++ b/intern/cycles/render/buffers.cpp
@@ -190,6 +190,14 @@ bool RenderBuffers::get_pass_rect(PassType type, float exposure, int sample, int
pixels[0] = clamp(f*scale_exposure, 0.0f, 1.0f);
}
}
+#ifdef WITH_CYCLES_DEBUG
+ else if(type == PASS_BVH_TRAVERSAL_STEPS) {
+ for(int i = 0; i < size; i++, in += pass_stride, pixels++) {
+ float f = *in;
+ pixels[0] = f;
+ }
+ }
+#endif
else {
for(int i = 0; i < size; i++, in += pass_stride, pixels++) {
float f = *in;
diff --git a/intern/cycles/render/camera.cpp b/intern/cycles/render/camera.cpp
index 8659fe4f7a3..110adb4d036 100644
--- a/intern/cycles/render/camera.cpp
+++ b/intern/cycles/render/camera.cpp
@@ -15,10 +15,13 @@
*/
#include "camera.h"
+#include "mesh.h"
+#include "object.h"
#include "scene.h"
#include "device.h"
+#include "util_foreach.h"
#include "util_vector.h"
CCL_NAMESPACE_BEGIN
@@ -38,6 +41,8 @@ Camera::Camera()
motion.post = transform_identity();
use_motion = false;
+ aperture_ratio = 1.0f;
+
type = CAMERA_PERSPECTIVE;
panorama_type = PANORAMA_EQUIRECTANGULAR;
fisheye_fov = M_PI_F;
@@ -241,6 +246,9 @@ void Camera::device_update(Device *device, DeviceScene *dscene, Scene *scene)
/* type */
kcam->type = type;
+ /* anamorphic lens bokeh */
+ kcam->inv_aperture_ratio = 1.0f / aperture_ratio;
+
/* panorama */
kcam->panorama_type = panorama_type;
kcam->fisheye_fov = fisheye_fov;
@@ -265,6 +273,20 @@ void Camera::device_update(Device *device, DeviceScene *dscene, Scene *scene)
need_device_update = false;
previous_need_motion = need_motion;
+
+ /* Camera in volume. */
+ kcam->is_inside_volume = 0;
+ BoundBox viewplane_boundbox = viewplane_bounds_get();
+ for(size_t i = 0; i < scene->objects.size(); ++i) {
+ Object *object = scene->objects[i];
+ if(object->mesh->has_volume &&
+ viewplane_boundbox.intersects(object->bounds))
+ {
+ /* TODO(sergey): Consider adding a finer-grained check. */
+ kcam->is_inside_volume = 1;
+ break;
+ }
+ }
}
void Camera::device_free(Device *device, DeviceScene *dscene)
@@ -291,6 +313,7 @@ bool Camera::modified(const Camera& cam)
(viewplane == cam.viewplane) &&
(border == cam.border) &&
(matrix == cam.matrix) &&
+ (aperture_ratio == cam.aperture_ratio) &&
(panorama_type == cam.panorama_type) &&
(fisheye_fov == cam.fisheye_fov) &&
(fisheye_lens == cam.fisheye_lens));
@@ -307,5 +330,62 @@ void Camera::tag_update()
need_update = true;
}
+/* Map a raster-space coordinate to a world-space point.
+ *
+ * Perspective cameras: the camera-space origin moved to world space and then
+ * pushed along the normalized world-space ray direction by nearclip.
+ * Orthographic cameras: the raster point itself, taken through
+ * rastertocamera and cameratoworld.
+ * Any other camera type asserts; when assertions are compiled out the origin
+ * is returned instead of an uninitialized value.
+ */
+float3 Camera::transform_raster_to_world(float raster_x, float raster_y)
+{
+	/* Start from a defined value so the unsupported-type path below can
+	 * never hand uninitialized memory back to the caller.
+	 */
+	float3 P = make_float3(0.0f, 0.0f, 0.0f);
+
+	if(type == CAMERA_PERSPECTIVE) {
+		float3 D = transform_perspective(&rastertocamera,
+		                                 make_float3(raster_x, raster_y, 0.0f));
+		/* TODO(sergey): Aperture support? */
+		P = transform_point(&cameratoworld, P);
+		D = normalize(transform_direction(&cameratoworld, D));
+		/* TODO(sergey): Clipping is conditional in the kernel, so there could
+		 * be mistakes here, currently leading to wrong camera-in-volume
+		 * detection.
+		 */
+		P += nearclip * D;
+	}
+	else if(type == CAMERA_ORTHOGRAPHIC) {
+		/* TODO(sergey): Aperture support? */
+		/* Only the position is returned, so no ray direction is computed. */
+		P = transform_perspective(&rastertocamera,
+		                          make_float3(raster_x, raster_y, 0.0f));
+		P = transform_point(&cameratoworld, P);
+	}
+	else {
+		assert(!"unsupported camera type");
+	}
+
+	return P;
+}
+
+/* Build a world-space bounding box for the camera.
+ *
+ * Panorama cameras contribute a single point, the camera position. Other
+ * cameras contribute the world-space points of the four raster corners, plus
+ * the raster center for perspective cameras.
+ */
+BoundBox Camera::viewplane_bounds_get()
+{
+	/* TODO(sergey): This is all rather stupid, but is there a way to perform
+	 * the checks we need in a clearer and smarter fashion?
+	 */
+	BoundBox bounds = BoundBox::empty;
+
+	if(type == CAMERA_PANORAMA) {
+		/* Transform stores rows, so the translation (camera position) lives
+		 * in the fourth component of the x, y and z rows; the w row is the
+		 * constant bottom row of the matrix.
+		 */
+		bounds.grow(make_float3(cameratoworld.x.w,
+		                        cameratoworld.y.w,
+		                        cameratoworld.z.w));
+	}
+	else {
+		bounds.grow(transform_raster_to_world(0.0f, 0.0f));
+		bounds.grow(transform_raster_to_world(0.0f, (float)height));
+		bounds.grow(transform_raster_to_world((float)width, (float)height));
+		bounds.grow(transform_raster_to_world((float)width, 0.0f));
+		if(type == CAMERA_PERSPECTIVE) {
+			/* The center point has the largest distance along the local Z
+			 * axis, so include it when constructing the bounding box.
+			 */
+			bounds.grow(transform_raster_to_world(0.5f*width, 0.5f*height));
+		}
+	}
+
+	return bounds;
+}
+
CCL_NAMESPACE_END
diff --git a/intern/cycles/render/camera.h b/intern/cycles/render/camera.h
index c28670bc55f..788ae7b9bb6 100644
--- a/intern/cycles/render/camera.h
+++ b/intern/cycles/render/camera.h
@@ -54,6 +54,9 @@ public:
float fisheye_fov;
float fisheye_lens;
+ /* anamorphic lens bokeh */
+ float aperture_ratio;
+
/* sensor */
float sensorwidth;
float sensorheight;
@@ -113,6 +116,9 @@ public:
bool modified(const Camera& cam);
bool motion_modified(const Camera& cam);
void tag_update();
+
+ BoundBox viewplane_bounds_get();
+ float3 transform_raster_to_world(float raster_x, float raster_y);
};
CCL_NAMESPACE_END
diff --git a/intern/cycles/render/curves.cpp b/intern/cycles/render/curves.cpp
index 2c96ffa655e..dc7665fe144 100644
--- a/intern/cycles/render/curves.cpp
+++ b/intern/cycles/render/curves.cpp
@@ -46,8 +46,9 @@ void curvebounds(float *lower, float *upper, float3 *p, int dim)
float discroot = curve_coef[2] * curve_coef[2] - 3 * curve_coef[3] * curve_coef[1];
float ta = -1.0f;
float tb = -1.0f;
+
if(discroot >= 0) {
- discroot = sqrt(discroot);
+ discroot = sqrtf(discroot);
ta = (-curve_coef[2] - discroot) / (3 * curve_coef[3]);
tb = (-curve_coef[2] + discroot) / (3 * curve_coef[3]);
ta = (ta > 1.0f || ta < 0.0f) ? -1.0f : ta;
@@ -56,20 +57,21 @@ void curvebounds(float *lower, float *upper, float3 *p, int dim)
*upper = max(p1[dim],p2[dim]);
*lower = min(p1[dim],p2[dim]);
+
float exa = p1[dim];
float exb = p2[dim];
- float t2;
- float t3;
+
if(ta >= 0.0f) {
- t2 = ta * ta;
- t3 = t2 * ta;
+ float t2 = ta * ta;
+ float t3 = t2 * ta;
exa = curve_coef[3] * t3 + curve_coef[2] * t2 + curve_coef[1] * ta + curve_coef[0];
}
if(tb >= 0.0f) {
- t2 = tb * tb;
- t3 = t2 * tb;
+ float t2 = tb * tb;
+ float t3 = t2 * tb;
exb = curve_coef[3] * t3 + curve_coef[2] * t2 + curve_coef[1] * tb + curve_coef[0];
}
+
*upper = max(*upper, max(exa,exb));
*lower = min(*lower, min(exa,exb));
}
diff --git a/intern/cycles/render/film.cpp b/intern/cycles/render/film.cpp
index c1aefbcfbbc..19f959d4ea1 100644
--- a/intern/cycles/render/film.cpp
+++ b/intern/cycles/render/film.cpp
@@ -80,22 +80,13 @@ void Pass::add(PassType type, vector<Pass>& passes)
pass.components = 1;
break;
case PASS_OBJECT_ID:
- pass.components = 1;
- pass.filter = false;
- break;
case PASS_MATERIAL_ID:
pass.components = 1;
pass.filter = false;
break;
case PASS_DIFFUSE_COLOR:
- pass.components = 4;
- break;
case PASS_GLOSSY_COLOR:
- pass.components = 4;
- break;
case PASS_TRANSMISSION_COLOR:
- pass.components = 4;
- break;
case PASS_SUBSURFACE_COLOR:
pass.components = 4;
break;
@@ -141,9 +132,6 @@ void Pass::add(PassType type, vector<Pass>& passes)
break;
case PASS_EMISSION:
- pass.components = 4;
- pass.exposure = true;
- break;
case PASS_BACKGROUND:
pass.components = 4;
pass.exposure = true;
@@ -158,6 +146,12 @@ void Pass::add(PassType type, vector<Pass>& passes)
case PASS_LIGHT:
/* ignores */
break;
+#ifdef WITH_CYCLES_DEBUG
+ case PASS_BVH_TRAVERSAL_STEPS:
+ pass.components = 1;
+ pass.exposure = false;
+ break;
+#endif
}
passes.push_back(pass);
@@ -400,6 +394,13 @@ void Film::device_update(Device *device, DeviceScene *dscene, Scene *scene)
case PASS_LIGHT:
kfilm->use_light_pass = 1;
break;
+
+#ifdef WITH_CYCLES_DEBUG
+ case PASS_BVH_TRAVERSAL_STEPS:
+ kfilm->pass_bvh_traversal_steps = kfilm->pass_stride;
+ break;
+#endif
+
case PASS_NONE:
break;
}
diff --git a/intern/cycles/render/graph.cpp b/intern/cycles/render/graph.cpp
index 0ff904d06e7..45b08832fea 100644
--- a/intern/cycles/render/graph.cpp
+++ b/intern/cycles/render/graph.cpp
@@ -320,20 +320,20 @@ void ShaderGraph::remove_unneeded_nodes()
{
vector<bool> removed(num_node_ids, false);
bool any_node_removed = false;
-
+
/* find and unlink proxy nodes */
foreach(ShaderNode *node, nodes) {
if(node->special_type == SHADER_SPECIAL_TYPE_PROXY) {
ProxyNode *proxy = static_cast<ProxyNode*>(node);
ShaderInput *input = proxy->inputs[0];
ShaderOutput *output = proxy->outputs[0];
-
+
/* temp. copy of the output links list.
* output->links is modified when we disconnect!
*/
vector<ShaderInput*> links(output->links);
ShaderOutput *from = input->link;
-
+
/* bypass the proxy node */
if(from) {
disconnect(input);
@@ -391,6 +391,8 @@ void ShaderGraph::remove_unneeded_nodes()
if(output)
connect(output, input);
}
+ removed[mix->id] = true;
+ any_node_removed = true;
}
/* remove unused mix closure input when factor is 0.0 or 1.0 */
@@ -400,7 +402,7 @@ void ShaderGraph::remove_unneeded_nodes()
if(mix->inputs[0]->value.x == 0.0f) {
ShaderOutput *output = mix->inputs[1]->link;
vector<ShaderInput*> inputs = mix->outputs[0]->links;
-
+
foreach(ShaderInput *sock, mix->inputs)
if(sock->link)
disconnect(sock);
@@ -410,6 +412,8 @@ void ShaderGraph::remove_unneeded_nodes()
if(output)
connect(output, input);
}
+ removed[mix->id] = true;
+ any_node_removed = true;
}
/* factor 1.0 */
else if(mix->inputs[0]->value.x == 1.0f) {
@@ -425,13 +429,57 @@ void ShaderGraph::remove_unneeded_nodes()
if(output)
connect(output, input);
}
+ removed[mix->id] = true;
+ any_node_removed = true;
+ }
+ }
+ }
+ else if(node->special_type == SHADER_SPECIAL_TYPE_MIX_RGB) {
+ MixNode *mix = static_cast<MixNode*>(node);
+
+ /* remove unused Mix RGB inputs when factor is 0.0 or 1.0 */
+ /* check for color links and make sure factor link is disconnected */
+ if(mix->outputs[0]->links.size() && mix->inputs[1]->link && mix->inputs[2]->link && !mix->inputs[0]->link) {
+ /* factor 0.0 */
+ if(mix->inputs[0]->value.x == 0.0f) {
+ ShaderOutput *output = mix->inputs[1]->link;
+ vector<ShaderInput*> inputs = mix->outputs[0]->links;
+
+ foreach(ShaderInput *sock, mix->inputs)
+ if(sock->link)
+ disconnect(sock);
+
+ foreach(ShaderInput *input, inputs) {
+ disconnect(input);
+ if(output)
+ connect(output, input);
+ }
+ removed[mix->id] = true;
+ any_node_removed = true;
+ }
+ /* factor 1.0 */
+ else if(mix->inputs[0]->value.x == 1.0f) {
+ ShaderOutput *output = mix->inputs[2]->link;
+ vector<ShaderInput*> inputs = mix->outputs[0]->links;
+
+ foreach(ShaderInput *sock, mix->inputs)
+ if(sock->link)
+ disconnect(sock);
+
+ foreach(ShaderInput *input, inputs) {
+ disconnect(input);
+ if(output)
+ connect(output, input);
+ }
+ removed[mix->id] = true;
+ any_node_removed = true;
}
}
}
}
/* remove nodes */
- if (any_node_removed) {
+ if(any_node_removed) {
list<ShaderNode*> newnodes;
foreach(ShaderNode *node, nodes) {
@@ -787,5 +835,47 @@ void ShaderGraph::transform_multi_closure(ShaderNode *node, ShaderOutput *weight
}
}
+/* Write the graph to `filename` as a Graphviz "digraph" description.
+ *
+ * Every node becomes a record-shaped vertex identified by the node's address
+ * and labelled with its name; every output-to-input link becomes an edge
+ * between the two parent nodes, labelled "output:input". If the file cannot
+ * be opened, a message is printed to stdout and nothing is written.
+ */
+void ShaderGraph::dump_graph(const char *filename)
+{
+	FILE *fd = fopen(filename, "w");
+
+	if(fd == NULL) {
+		printf("Error opening file for dumping the graph: %s\n", filename);
+		return;
+	}
+
+	fprintf(fd, "digraph dependencygraph {\n");
+	fprintf(fd, "ranksep=1.5\n");
+	fprintf(fd, "splines=false\n");
+
+	/* Vertices. %p requires a void pointer argument, hence the casts. */
+	foreach(ShaderNode *node, nodes) {
+		fprintf(fd, "// NODE: %p\n", (void*)node);
+		fprintf(fd,
+		        "\"%p\" [shape=record,label=\"%s\"]\n",
+		        (void*)node,
+		        node->name.c_str());
+	}
+
+	/* Edges: one per link, from the output's parent to the input's parent. */
+	foreach(ShaderNode *node, nodes) {
+		foreach(ShaderOutput *output, node->outputs) {
+			foreach(ShaderInput *input, output->links) {
+				fprintf(fd,
+				        "// CONNECTION: %p->%p (%s:%s)\n",
+				        (void*)output,
+				        (void*)input,
+				        output->name, input->name);
+				fprintf(fd,
+				        "\"%p\":s -> \"%p\":n [label=\"%s:%s\"]\n",
+				        (void*)output->parent,
+				        (void*)input->parent,
+				        output->name, input->name);
+			}
+		}
+	}
+
+	fprintf(fd, "}\n");
+	fclose(fd);
+}
+
CCL_NAMESPACE_END
diff --git a/intern/cycles/render/graph.h b/intern/cycles/render/graph.h
index 89a066195d6..7b95703d3aa 100644
--- a/intern/cycles/render/graph.h
+++ b/intern/cycles/render/graph.h
@@ -76,6 +76,7 @@ enum ShaderNodeSpecialType {
SHADER_SPECIAL_TYPE_NONE,
SHADER_SPECIAL_TYPE_PROXY,
SHADER_SPECIAL_TYPE_MIX_CLOSURE,
+ SHADER_SPECIAL_TYPE_MIX_RGB, /* Only Mix subtype */
SHADER_SPECIAL_TYPE_AUTOCONVERT,
SHADER_SPECIAL_TYPE_GEOMETRY,
SHADER_SPECIAL_TYPE_SCRIPT
@@ -249,6 +250,8 @@ public:
void remove_unneeded_nodes();
void finalize(bool do_bump = false, bool do_osl = false);
+ void dump_graph(const char *filename);
+
protected:
typedef pair<ShaderNode* const, ShaderNode*> NodePair;
diff --git a/intern/cycles/render/image.cpp b/intern/cycles/render/image.cpp
index 86755badc42..eb2c3333c44 100644
--- a/intern/cycles/render/image.cpp
+++ b/intern/cycles/render/image.cpp
@@ -135,6 +135,7 @@ bool ImageManager::is_float_image(const string& filename, void *builtin_data, bo
(colorspace == "" &&
(strcmp(in->format_name(), "png") == 0 ||
strcmp(in->format_name(), "tiff") == 0 ||
+ strcmp(in->format_name(), "dpx") == 0 ||
strcmp(in->format_name(), "jpeg2000") == 0)));
}
else {
@@ -157,7 +158,8 @@ static bool image_equals(ImageManager::Image *image, const string& filename, voi
image->interpolation == interpolation;
}
-int ImageManager::add_image(const string& filename, void *builtin_data, bool animated, bool& is_float, bool& is_linear, InterpolationType interpolation, bool use_alpha)
+int ImageManager::add_image(const string& filename, void *builtin_data, bool animated, float frame,
+ bool& is_float, bool& is_linear, InterpolationType interpolation, bool use_alpha)
{
Image *img;
size_t slot;
@@ -168,8 +170,17 @@ int ImageManager::add_image(const string& filename, void *builtin_data, bool ani
if(is_float) {
/* find existing image */
for(slot = 0; slot < float_images.size(); slot++) {
- if(float_images[slot] && image_equals(float_images[slot], filename, builtin_data, interpolation)) {
- float_images[slot]->users++;
+ img = float_images[slot];
+ if(img && image_equals(img, filename, builtin_data, interpolation)) {
+ if(img->frame != frame) {
+ img->frame = frame;
+ img->need_load = true;
+ }
+ if(img->use_alpha != use_alpha) {
+ img->use_alpha = use_alpha;
+ img->need_load = true;
+ }
+ img->users++;
return slot;
}
}
@@ -197,6 +208,7 @@ int ImageManager::add_image(const string& filename, void *builtin_data, bool ani
img->builtin_data = builtin_data;
img->need_load = true;
img->animated = animated;
+ img->frame = frame;
img->interpolation = interpolation;
img->users = 1;
img->use_alpha = use_alpha;
@@ -205,8 +217,17 @@ int ImageManager::add_image(const string& filename, void *builtin_data, bool ani
}
else {
for(slot = 0; slot < images.size(); slot++) {
- if(images[slot] && image_equals(images[slot], filename, builtin_data, interpolation)) {
- images[slot]->users++;
+ img = images[slot];
+ if(img && image_equals(img, filename, builtin_data, interpolation)) {
+ if(img->frame != frame) {
+ img->frame = frame;
+ img->need_load = true;
+ }
+ if(img->use_alpha != use_alpha) {
+ img->use_alpha = use_alpha;
+ img->need_load = true;
+ }
+ img->users++;
return slot+tex_image_byte_start;
}
}
@@ -234,6 +255,7 @@ int ImageManager::add_image(const string& filename, void *builtin_data, bool ani
img->builtin_data = builtin_data;
img->need_load = true;
img->animated = animated;
+ img->frame = frame;
img->interpolation = interpolation;
img->users = 1;
img->use_alpha = use_alpha;
@@ -242,6 +264,7 @@ int ImageManager::add_image(const string& filename, void *builtin_data, bool ani
slot += tex_image_byte_start;
}
+
need_update = true;
return slot;
@@ -299,6 +322,32 @@ void ImageManager::remove_image(const string& filename, void *builtin_data, Inte
}
}
+/* TODO(sergey): Deduplicate with the iteration above, but make it pretty,
+ * without a bunch of arguments being passed around, which would make the
+ * code even less readable.
+ */
+void ImageManager::tag_reload_image(const string& filename, void *builtin_data, InterpolationType interpolation)
+{
+    /* Flag the first matching image for reloading. Byte images are
+     * searched first; float images only when no byte image matched. */
+    for(size_t i = 0; i < images.size(); i++) {
+        Image *img = images[i];
+        if(img && image_equals(img, filename, builtin_data, interpolation)) {
+            img->need_load = true;
+            return;
+        }
+    }
+
+    /* no byte image matched, see if it's in a float texture slot */
+    for(size_t i = 0; i < float_images.size(); i++) {
+        Image *img = float_images[i];
+        if(img && image_equals(img, filename, builtin_data, interpolation)) {
+            img->need_load = true;
+            return;
+        }
+    }
+}
+
bool ImageManager::file_load_image(Image *img, device_vector<uchar4>& tex_img)
{
if(img->filename == "")
@@ -351,6 +400,7 @@ bool ImageManager::file_load_image(Image *img, device_vector<uchar4>& tex_img)
/* read RGBA pixels */
uchar *pixels = (uchar*)tex_img.resize(width, height, depth);
+ bool cmyk = false;
if(in) {
if(depth <= 1) {
@@ -366,6 +416,8 @@ bool ImageManager::file_load_image(Image *img, device_vector<uchar4>& tex_img)
in->read_image(TypeDesc::UINT8, (uchar*)pixels);
}
+ cmyk = strcmp(in->format_name(), "jpeg") == 0 && components == 4;
+
in->close();
delete in;
}
@@ -373,7 +425,17 @@ bool ImageManager::file_load_image(Image *img, device_vector<uchar4>& tex_img)
builtin_image_pixels_cb(img->filename, img->builtin_data, pixels);
}
- if(components == 2) {
+ if(cmyk) {
+ /* CMYK */
+ for(int i = width*height*depth-1; i >= 0; i--) {
+ pixels[i*4+2] = (pixels[i*4+2]*pixels[i*4+3])/255;
+ pixels[i*4+1] = (pixels[i*4+1]*pixels[i*4+3])/255;
+ pixels[i*4+0] = (pixels[i*4+0]*pixels[i*4+3])/255;
+ pixels[i*4+3] = 255;
+ }
+ }
+ else if(components == 2) {
+ /* grayscale + alpha */
for(int i = width*height*depth-1; i >= 0; i--) {
pixels[i*4+3] = pixels[i*2+1];
pixels[i*4+2] = pixels[i*2+0];
@@ -382,6 +444,7 @@ bool ImageManager::file_load_image(Image *img, device_vector<uchar4>& tex_img)
}
}
else if(components == 3) {
+ /* RGB */
for(int i = width*height*depth-1; i >= 0; i--) {
pixels[i*4+3] = 255;
pixels[i*4+2] = pixels[i*3+2];
@@ -390,6 +453,7 @@ bool ImageManager::file_load_image(Image *img, device_vector<uchar4>& tex_img)
}
}
else if(components == 1) {
+ /* grayscale */
for(int i = width*height*depth-1; i >= 0; i--) {
pixels[i*4+3] = 255;
pixels[i*4+2] = pixels[i];
@@ -448,7 +512,7 @@ bool ImageManager::file_load_float_image(Image *img, device_vector<float4>& tex_
builtin_image_info_cb(img->filename, img->builtin_data, is_float, width, height, depth, components);
}
- if(!(components >= 1 && components <= 4)) {
+ if(components < 1 || width == 0 || height == 0) {
if(in) {
in->close();
delete in;
@@ -458,21 +522,43 @@ bool ImageManager::file_load_float_image(Image *img, device_vector<float4>& tex_
/* read RGBA pixels */
float *pixels = (float*)tex_img.resize(width, height, depth);
+ bool cmyk = false;
if(in) {
+ float *readpixels = pixels;
+ vector<float> tmppixels;
+
+ if(components > 4) {
+ tmppixels.resize(width*height*components);
+ readpixels = &tmppixels[0];
+ }
+
if(depth <= 1) {
int scanlinesize = width*components*sizeof(float);
in->read_image(TypeDesc::FLOAT,
- (uchar*)pixels + (height-1)*scanlinesize,
+ (uchar*)readpixels + (height-1)*scanlinesize,
AutoStride,
-scanlinesize,
AutoStride);
}
else {
- in->read_image(TypeDesc::FLOAT, (uchar*)pixels);
+ in->read_image(TypeDesc::FLOAT, (uchar*)readpixels);
+ }
+
+ if(components > 4) {
+ for(int i = width*height-1; i >= 0; i--) {
+ pixels[i*4+3] = tmppixels[i*components+3];
+ pixels[i*4+2] = tmppixels[i*components+2];
+ pixels[i*4+1] = tmppixels[i*components+1];
+ pixels[i*4+0] = tmppixels[i*components+0];
+ }
+
+ tmppixels.clear();
}
+ cmyk = strcmp(in->format_name(), "jpeg") == 0 && components == 4;
+
in->close();
delete in;
}
@@ -480,7 +566,17 @@ bool ImageManager::file_load_float_image(Image *img, device_vector<float4>& tex_
builtin_image_float_pixels_cb(img->filename, img->builtin_data, pixels);
}
- if(components == 2) {
+    if(cmyk) {
+        /* CMYK: multiply by K (4th component) first, that slot becomes alpha afterwards */
+        for(int i = width*height*depth-1; i >= 0; i--) {
+            pixels[i*4+2] = (pixels[i*4+2]*pixels[i*4+3])/255; /* NOTE(review): /255 mirrors the byte loader, confirm value range for float data */
+            pixels[i*4+1] = (pixels[i*4+1]*pixels[i*4+3])/255;
+            pixels[i*4+0] = (pixels[i*4+0]*pixels[i*4+3])/255;
+            pixels[i*4+3] = 1.0f; /* opaque, same as the RGB and grayscale branches */
+        }
+    }
+ else if(components == 2) {
+ /* grayscale + alpha */
for(int i = width*height*depth-1; i >= 0; i--) {
pixels[i*4+3] = pixels[i*2+1];
pixels[i*4+2] = pixels[i*2+0];
@@ -489,6 +585,7 @@ bool ImageManager::file_load_float_image(Image *img, device_vector<float4>& tex_
}
}
else if(components == 3) {
+ /* RGB */
for(int i = width*height*depth-1; i >= 0; i--) {
pixels[i*4+3] = 1.0f;
pixels[i*4+2] = pixels[i*3+2];
@@ -497,6 +594,7 @@ bool ImageManager::file_load_float_image(Image *img, device_vector<float4>& tex_
}
}
else if(components == 1) {
+ /* grayscale */
for(int i = width*height*depth-1; i >= 0; i--) {
pixels[i*4+3] = 1.0f;
pixels[i*4+2] = pixels[i];
@@ -557,7 +655,8 @@ void ImageManager::device_load_image(Device *device, DeviceScene *dscene, int sl
string name;
- if(slot >= 10) name = string_printf("__tex_image_float_0%d", slot);
+ if(slot >= 100) name = string_printf("__tex_image_float_%d", slot);
+ else if(slot >= 10) name = string_printf("__tex_image_float_0%d", slot);
else name = string_printf("__tex_image_float_00%d", slot);
if(!pack_images) {
@@ -588,7 +687,8 @@ void ImageManager::device_load_image(Device *device, DeviceScene *dscene, int sl
string name;
- if(slot >= 10) name = string_printf("__tex_image_0%d", slot);
+ if(slot >= 100) name = string_printf("__tex_image_%d", slot);
+ else if(slot >= 10) name = string_printf("__tex_image_0%d", slot);
else name = string_printf("__tex_image_00%d", slot);
if(!pack_images) {
@@ -744,6 +844,17 @@ void ImageManager::device_pack_images(Device *device, DeviceScene *dscene, Progr
}
}
+void ImageManager::device_free_builtin(Device *device, DeviceScene *dscene)
+{
+    /* Free only images that carry builtin data; byte slots are offset by tex_image_byte_start. */
+    for(size_t i = 0; i < images.size(); i++)
+        if(images[i] != NULL && images[i]->builtin_data != NULL)
+            device_free_image(device, dscene, i + tex_image_byte_start);
+    for(size_t i = 0; i < float_images.size(); i++)
+        if(float_images[i] != NULL && float_images[i]->builtin_data != NULL)
+            device_free_image(device, dscene, i);
+}
+
void ImageManager::device_free(Device *device, DeviceScene *dscene)
{
for(size_t slot = 0; slot < images.size(); slot++)
diff --git a/intern/cycles/render/image.h b/intern/cycles/render/image.h
index 561550fe0d2..535f0ff156d 100644
--- a/intern/cycles/render/image.h
+++ b/intern/cycles/render/image.h
@@ -29,7 +29,7 @@
CCL_NAMESPACE_BEGIN
/* generic */
-#define TEX_NUM_IMAGES 95
+#define TEX_NUM_IMAGES 94
#define TEX_IMAGE_BYTE_START TEX_NUM_FLOAT_IMAGES
/* extended gpu */
@@ -55,13 +55,16 @@ public:
ImageManager();
~ImageManager();
- int add_image(const string& filename, void *builtin_data, bool animated, bool& is_float, bool& is_linear, InterpolationType interpolation, bool use_alpha);
+ int add_image(const string& filename, void *builtin_data, bool animated, float frame,
+ bool& is_float, bool& is_linear, InterpolationType interpolation, bool use_alpha);
void remove_image(int slot);
void remove_image(const string& filename, void *builtin_data, InterpolationType interpolation);
+ void tag_reload_image(const string& filename, void *builtin_data, InterpolationType interpolation);
bool is_float_image(const string& filename, void *builtin_data, bool& is_linear);
void device_update(Device *device, DeviceScene *dscene, Progress& progress);
void device_free(Device *device, DeviceScene *dscene);
+ void device_free_builtin(Device *device, DeviceScene *dscene);
void set_osl_texture_system(void *texture_system);
void set_pack_images(bool pack_images_);
@@ -81,6 +84,7 @@ public:
bool use_alpha;
bool need_load;
bool animated;
+ float frame;
InterpolationType interpolation;
int users;
diff --git a/intern/cycles/render/integrator.cpp b/intern/cycles/render/integrator.cpp
index 59a0de07e5a..03a8cd5d2d3 100644
--- a/intern/cycles/render/integrator.cpp
+++ b/intern/cycles/render/integrator.cpp
@@ -43,7 +43,8 @@ Integrator::Integrator()
volume_max_steps = 1024;
volume_step_size = 0.1f;
- no_caustics = false;
+ caustics_reflective = true;
+ caustics_refractive = true;
filter_glossy = 0.0f;
seed = 0;
layer_flag = ~0;
@@ -86,22 +87,33 @@ void Integrator::device_update(Device *device, DeviceScene *dscene, Scene *scene
kintegrator->max_diffuse_bounce = max_diffuse_bounce + 1;
kintegrator->max_glossy_bounce = max_glossy_bounce + 1;
kintegrator->max_transmission_bounce = max_transmission_bounce + 1;
-
- if(kintegrator->use_volumes)
- kintegrator->max_volume_bounce = max_volume_bounce + 1;
- else
- kintegrator->max_volume_bounce = 1;
+ kintegrator->max_volume_bounce = max_volume_bounce + 1;
kintegrator->transparent_max_bounce = transparent_max_bounce + 1;
kintegrator->transparent_min_bounce = transparent_min_bounce + 1;
- kintegrator->transparent_shadows = transparent_shadows;
+    /* Transparent Shadows
+     * We only need to enable transparent shadows, if we actually have
+     * transparent shaders in the scene. Otherwise we can disable it
+     * to improve performance a bit.
+     * The flag is reset first so that it is always written, also when
+     * the setting is on but no shader in the scene matches. */
+    kintegrator->transparent_shadows = false;
+    if(transparent_shadows) {
+        foreach(Shader *shader, scene->shaders) {
+            /* keep this in sync with SD_HAS_TRANSPARENT_SHADOW in shader.cpp */
+            if((shader->has_surface_transparent && shader->use_transparent_shadow) || shader->has_volume) {
+                kintegrator->transparent_shadows = true;
+                break;
+            }
+        }
+    }
- kintegrator->volume_homogeneous_sampling = volume_homogeneous_sampling;
kintegrator->volume_max_steps = volume_max_steps;
kintegrator->volume_step_size = volume_step_size;
- kintegrator->no_caustics = no_caustics;
+ kintegrator->caustics_reflective = caustics_reflective;
+ kintegrator->caustics_refractive = caustics_refractive;
kintegrator->filter_glossy = (filter_glossy == 0.0f)? FLT_MAX: 1.0f/filter_glossy;
kintegrator->seed = hash_int(seed);
@@ -121,8 +133,15 @@ void Integrator::device_update(Device *device, DeviceScene *dscene, Scene *scene
kintegrator->mesh_light_samples = mesh_light_samples;
kintegrator->subsurface_samples = subsurface_samples;
kintegrator->volume_samples = volume_samples;
- kintegrator->sample_all_lights_direct = sample_all_lights_direct;
- kintegrator->sample_all_lights_indirect = sample_all_lights_indirect;
+
+ if(method == BRANCHED_PATH) {
+ kintegrator->sample_all_lights_direct = sample_all_lights_direct;
+ kintegrator->sample_all_lights_indirect = sample_all_lights_indirect;
+ }
+ else {
+ kintegrator->sample_all_lights_direct = false;
+ kintegrator->sample_all_lights_indirect = false;
+ }
kintegrator->sampling_pattern = sampling_pattern;
kintegrator->aa_samples = aa_samples;
@@ -173,7 +192,8 @@ bool Integrator::modified(const Integrator& integrator)
volume_homogeneous_sampling == integrator.volume_homogeneous_sampling &&
volume_max_steps == integrator.volume_max_steps &&
volume_step_size == integrator.volume_step_size &&
- no_caustics == integrator.no_caustics &&
+ caustics_reflective == integrator.caustics_reflective &&
+ caustics_refractive == integrator.caustics_refractive &&
filter_glossy == integrator.filter_glossy &&
layer_flag == integrator.layer_flag &&
seed == integrator.seed &&
diff --git a/intern/cycles/render/integrator.h b/intern/cycles/render/integrator.h
index 380c1a65722..13c10e8ca94 100644
--- a/intern/cycles/render/integrator.h
+++ b/intern/cycles/render/integrator.h
@@ -43,7 +43,8 @@ public:
int volume_max_steps;
float volume_step_size;
- bool no_caustics;
+ bool caustics_reflective;
+ bool caustics_refractive;
float filter_glossy;
int seed;
diff --git a/intern/cycles/render/light.cpp b/intern/cycles/render/light.cpp
index 7bdb1fbf8af..1f006637e67 100644
--- a/intern/cycles/render/light.cpp
+++ b/intern/cycles/render/light.cpp
@@ -66,11 +66,12 @@ static void shade_background_pixels(Device *device, DeviceScene *dscene, int res
main_task.shader_eval_type = SHADER_EVAL_BACKGROUND;
main_task.shader_x = 0;
main_task.shader_w = width*height;
+ main_task.num_samples = 1;
main_task.get_cancel = function_bind(&Progress::get_cancel, &progress);
/* disabled splitting for now, there's an issue with multi-GPU mem_copy_from */
list<DeviceTask> split_tasks;
- main_task.split_max_size(split_tasks, 128*128);
+ main_task.split(split_tasks, 1, 128*128);
foreach(DeviceTask& task, split_tasks) {
device->task_add(task);
@@ -120,6 +121,7 @@ Light::Light()
use_diffuse = true;
use_glossy = true;
use_transmission = true;
+ use_scatter = true;
shader = 0;
samples = 1;
@@ -205,8 +207,10 @@ void LightManager::device_update_distribution(Device *device, DeviceScene *dscen
}
/* skip motion blurred deforming meshes, not supported yet */
- if(mesh->has_motion_blur())
+ if(mesh->has_motion_blur()) {
+ j++;
continue;
+ }
/* skip if we have no emission shaders */
foreach(uint sindex, mesh->used_shaders) {
@@ -240,6 +244,10 @@ void LightManager::device_update_distribution(Device *device, DeviceScene *dscen
shader_flag |= SHADER_EXCLUDE_TRANSMIT;
use_light_visibility = true;
}
+ if(!(object->visibility & PATH_RAY_VOLUME_SCATTER)) {
+ shader_flag |= SHADER_EXCLUDE_SCATTER;
+ use_light_visibility = true;
+ }
for(size_t i = 0; i < mesh->triangles.size(); i++) {
Shader *shader = scene->shaders[mesh->shader[i]];
@@ -497,6 +505,10 @@ void LightManager::device_update_points(Device *device, DeviceScene *dscene, Sce
shader_id |= SHADER_EXCLUDE_TRANSMIT;
use_light_visibility = true;
}
+ if(!light->use_scatter) {
+ shader_id |= SHADER_EXCLUDE_SCATTER;
+ use_light_visibility = true;
+ }
if(light->type == LIGHT_POINT) {
shader_id &= ~SHADER_AREA_LIGHT;
@@ -551,6 +563,10 @@ void LightManager::device_update_points(Device *device, DeviceScene *dscene, Sce
shader_id |= SHADER_EXCLUDE_TRANSMIT;
use_light_visibility = true;
}
+ if(!(visibility & PATH_RAY_VOLUME_SCATTER)) {
+ shader_id |= SHADER_EXCLUDE_SCATTER;
+ use_light_visibility = true;
+ }
light_data[i*LIGHT_SIZE + 0] = make_float4(__int_as_float(light->type), 0.0f, 0.0f, 0.0f);
light_data[i*LIGHT_SIZE + 1] = make_float4(__int_as_float(shader_id), 0.0f, 0.0f, 0.0f);
diff --git a/intern/cycles/render/light.h b/intern/cycles/render/light.h
index 82308cf3e88..89091bb5f9e 100644
--- a/intern/cycles/render/light.h
+++ b/intern/cycles/render/light.h
@@ -54,6 +54,7 @@ public:
bool use_diffuse;
bool use_glossy;
bool use_transmission;
+ bool use_scatter;
int shader;
int samples;
diff --git a/intern/cycles/render/mesh.cpp b/intern/cycles/render/mesh.cpp
index 9c5ddd55010..6137f7d4fdc 100644
--- a/intern/cycles/render/mesh.cpp
+++ b/intern/cycles/render/mesh.cpp
@@ -93,6 +93,8 @@ Mesh::Mesh()
attributes.triangle_mesh = this;
curve_attributes.curve_mesh = this;
+
+ has_volume = false;
}
Mesh::~Mesh()
@@ -132,6 +134,7 @@ void Mesh::clear()
transform_applied = false;
transform_negative_scaled = false;
transform_normal = transform_identity();
+ geometry_synced = false;
}
int Mesh::split_vertex(int vertex)
@@ -377,14 +380,12 @@ void Mesh::add_vertex_normals()
}
}
-void Mesh::pack_normals(Scene *scene, float4 *normal, float4 *vnormal)
+void Mesh::pack_normals(Scene *scene, uint *tri_shader, float4 *vnormal)
{
- Attribute *attr_fN = attributes.find(ATTR_STD_FACE_NORMAL);
Attribute *attr_vN = attributes.find(ATTR_STD_VERTEX_NORMAL);
- float3 *fN = attr_fN->data_float3();
float3 *vN = attr_vN->data_float3();
- int shader_id = 0;
+ uint shader_id = 0;
uint last_shader = -1;
bool last_smooth = false;
@@ -394,24 +395,15 @@ void Mesh::pack_normals(Scene *scene, float4 *normal, float4 *vnormal)
bool do_transform = transform_applied;
Transform ntfm = transform_normal;
+ /* save shader */
for(size_t i = 0; i < triangles_size; i++) {
- float3 fNi = fN[i];
-
- if(do_transform)
- fNi = normalize(transform_direction(&ntfm, fNi));
-
- normal[i].x = fNi.x;
- normal[i].y = fNi.y;
- normal[i].z = fNi.z;
-
- /* stuff shader id in here too */
if(shader_ptr[i] != last_shader || last_smooth != smooth[i]) {
last_shader = shader_ptr[i];
last_smooth = smooth[i];
shader_id = scene->shader_manager->get_shader_id(last_shader, this, last_smooth);
}
- normal[i].w = __int_as_float(shader_id);
+ tri_shader[i] = shader_id;
}
size_t verts_size = verts.size();
@@ -756,7 +748,7 @@ void MeshManager::update_svm_attributes(Device *device, DeviceScene *dscene, Sce
device->tex_alloc("__attributes_map", dscene->attributes_map);
}
-static void update_attribute_element_offset(Mesh *mesh, vector<float>& attr_float, vector<float4>& attr_float3,
+static void update_attribute_element_offset(Mesh *mesh, vector<float>& attr_float, vector<float4>& attr_float3, vector<uchar4>& attr_uchar4,
Attribute *mattr, TypeDesc& type, int& offset, AttributeElement& element)
{
if(mattr) {
@@ -777,6 +769,15 @@ static void update_attribute_element_offset(Mesh *mesh, vector<float>& attr_floa
VoxelAttribute *voxel_data = mattr->data_voxel();
offset = voxel_data->slot;
}
+ else if(mattr->element == ATTR_ELEMENT_CORNER_BYTE) {
+ uchar4 *data = mattr->data_uchar4();
+ offset = attr_uchar4.size();
+
+ attr_uchar4.resize(attr_uchar4.size() + size);
+
+ for(size_t k = 0; k < size; k++)
+ attr_uchar4[offset+k] = data[k];
+ }
else if(mattr->type == TypeDesc::TypeFloat) {
float *data = mattr->data_float();
offset = attr_float.size();
@@ -813,7 +814,7 @@ static void update_attribute_element_offset(Mesh *mesh, vector<float>& attr_floa
offset -= mesh->vert_offset;
else if(element == ATTR_ELEMENT_FACE)
offset -= mesh->tri_offset;
- else if(element == ATTR_ELEMENT_CORNER)
+ else if(element == ATTR_ELEMENT_CORNER || element == ATTR_ELEMENT_CORNER_BYTE)
offset -= 3*mesh->tri_offset;
else if(element == ATTR_ELEMENT_CURVE)
offset -= mesh->curve_offset;
@@ -854,6 +855,7 @@ void MeshManager::device_update_attributes(Device *device, DeviceScene *dscene,
* maps next */
vector<float> attr_float;
vector<float4> attr_float3;
+ vector<uchar4> attr_uchar4;
for(size_t i = 0; i < scene->meshes.size(); i++) {
Mesh *mesh = scene->meshes[i];
@@ -874,10 +876,10 @@ void MeshManager::device_update_attributes(Device *device, DeviceScene *dscene,
memcpy(triangle_mattr->data_float3(), &mesh->verts[0], sizeof(float3)*mesh->verts.size());
}
- update_attribute_element_offset(mesh, attr_float, attr_float3, triangle_mattr,
+ update_attribute_element_offset(mesh, attr_float, attr_float3, attr_uchar4, triangle_mattr,
req.triangle_type, req.triangle_offset, req.triangle_element);
- update_attribute_element_offset(mesh, attr_float, attr_float3, curve_mattr,
+ update_attribute_element_offset(mesh, attr_float, attr_float3, attr_uchar4, curve_mattr,
req.curve_type, req.curve_offset, req.curve_element);
if(progress.get_cancel()) return;
@@ -903,6 +905,10 @@ void MeshManager::device_update_attributes(Device *device, DeviceScene *dscene,
dscene->attributes_float3.copy(&attr_float3[0], attr_float3.size());
device->tex_alloc("__attributes_float3", dscene->attributes_float3);
}
+ if(attr_uchar4.size()) {
+ dscene->attributes_uchar4.copy(&attr_uchar4[0], attr_uchar4.size());
+ device->tex_alloc("__attributes_uchar4", dscene->attributes_uchar4);
+ }
}
void MeshManager::device_update_mesh(Device *device, DeviceScene *dscene, Scene *scene, Progress& progress)
@@ -932,13 +938,13 @@ void MeshManager::device_update_mesh(Device *device, DeviceScene *dscene, Scene
/* normals */
progress.set_status("Updating Mesh", "Computing normals");
- float4 *normal = dscene->tri_normal.resize(tri_size);
+ uint *tri_shader = dscene->tri_shader.resize(tri_size);
float4 *vnormal = dscene->tri_vnormal.resize(vert_size);
float4 *tri_verts = dscene->tri_verts.resize(vert_size);
float4 *tri_vindex = dscene->tri_vindex.resize(tri_size);
foreach(Mesh *mesh, scene->meshes) {
- mesh->pack_normals(scene, &normal[mesh->tri_offset], &vnormal[mesh->vert_offset]);
+ mesh->pack_normals(scene, &tri_shader[mesh->tri_offset], &vnormal[mesh->vert_offset]);
mesh->pack_verts(&tri_verts[mesh->vert_offset], &tri_vindex[mesh->tri_offset], mesh->vert_offset);
if(progress.get_cancel()) return;
@@ -947,7 +953,7 @@ void MeshManager::device_update_mesh(Device *device, DeviceScene *dscene, Scene
/* vertex coordinates */
progress.set_status("Updating Mesh", "Copying Mesh to device");
- device->tex_alloc("__tri_normal", dscene->tri_normal);
+ device->tex_alloc("__tri_shader", dscene->tri_shader);
device->tex_alloc("__tri_vnormal", dscene->tri_vnormal);
device->tex_alloc("__tri_verts", dscene->tri_verts);
device->tex_alloc("__tri_vindex", dscene->tri_vindex);
@@ -1028,11 +1034,16 @@ void MeshManager::device_update(Device *device, DeviceScene *dscene, Scene *scen
if(!need_update)
return;
- /* update normals */
+ /* update normals and flags */
foreach(Mesh *mesh, scene->meshes) {
- foreach(uint shader, mesh->used_shaders)
+ mesh->has_volume = false;
+ foreach(uint shader, mesh->used_shaders) {
if(scene->shaders[shader]->need_update_attributes)
mesh->need_update = true;
+ if(scene->shaders[shader]->has_volume) {
+ mesh->has_volume = true;
+ }
+ }
if(mesh->need_update) {
mesh->add_face_normals();
@@ -1100,6 +1111,8 @@ void MeshManager::device_update(Device *device, DeviceScene *dscene, Scene *scen
bool motion_blur = false;
#endif
+ /* update objects */
+ vector<Object *> volume_objects;
foreach(Object *object, scene->objects)
object->compute_bounds(motion_blur);
@@ -1119,7 +1132,7 @@ void MeshManager::device_free(Device *device, DeviceScene *dscene)
device->tex_free(dscene->prim_visibility);
device->tex_free(dscene->prim_index);
device->tex_free(dscene->prim_object);
- device->tex_free(dscene->tri_normal);
+ device->tex_free(dscene->tri_shader);
device->tex_free(dscene->tri_vnormal);
device->tex_free(dscene->tri_vindex);
device->tex_free(dscene->tri_verts);
@@ -1128,6 +1141,7 @@ void MeshManager::device_free(Device *device, DeviceScene *dscene)
device->tex_free(dscene->attributes_map);
device->tex_free(dscene->attributes_float);
device->tex_free(dscene->attributes_float3);
+ device->tex_free(dscene->attributes_uchar4);
dscene->bvh_nodes.clear();
dscene->object_node.clear();
@@ -1136,7 +1150,7 @@ void MeshManager::device_free(Device *device, DeviceScene *dscene)
dscene->prim_visibility.clear();
dscene->prim_index.clear();
dscene->prim_object.clear();
- dscene->tri_normal.clear();
+ dscene->tri_shader.clear();
dscene->tri_vnormal.clear();
dscene->tri_vindex.clear();
dscene->tri_verts.clear();
@@ -1145,6 +1159,7 @@ void MeshManager::device_free(Device *device, DeviceScene *dscene)
dscene->attributes_map.clear();
dscene->attributes_float.clear();
dscene->attributes_float3.clear();
+ dscene->attributes_uchar4.clear();
#ifdef WITH_OSL
OSLGlobals *og = (OSLGlobals*)device->osl_memory();
diff --git a/intern/cycles/render/mesh.h b/intern/cycles/render/mesh.h
index 247e3dd555e..7e34b761faf 100644
--- a/intern/cycles/render/mesh.h
+++ b/intern/cycles/render/mesh.h
@@ -71,11 +71,16 @@ public:
ustring name;
/* Mesh Data */
+ bool geometry_synced; /* used to distinguish meshes with no verts
+ and meshes for which geometry is not created */
+
vector<float3> verts;
vector<Triangle> triangles;
vector<uint> shader;
vector<bool> smooth;
+ bool has_volume; /* Set in the device_update(). */
+
vector<float4> curve_keys; /* co + radius */
vector<Curve> curves;
@@ -120,7 +125,7 @@ public:
void add_face_normals();
void add_vertex_normals();
- void pack_normals(Scene *scene, float4 *normal, float4 *vnormal);
+ void pack_normals(Scene *scene, uint *shader, float4 *vnormal);
void pack_verts(float4 *tri_verts, float4 *tri_vindex, size_t vert_offset);
void pack_curves(Scene *scene, float4 *curve_key_co, float4 *curve_data, size_t curvekey_offset);
void compute_bvh(SceneParams *params, Progress *progress, int n, int total);
diff --git a/intern/cycles/render/mesh_displace.cpp b/intern/cycles/render/mesh_displace.cpp
index 661fd9c66c1..4c0ee76299c 100644
--- a/intern/cycles/render/mesh_displace.cpp
+++ b/intern/cycles/render/mesh_displace.cpp
@@ -119,6 +119,7 @@ bool MeshManager::displace(Device *device, DeviceScene *dscene, Scene *scene, Me
task.shader_eval_type = SHADER_EVAL_DISPLACE;
task.shader_x = 0;
task.shader_w = d_output.size();
+ task.num_samples = 1;
task.get_cancel = function_bind(&Progress::get_cancel, &progress);
device->task_add(task);
diff --git a/intern/cycles/render/nodes.cpp b/intern/cycles/render/nodes.cpp
index a53e0b39435..e8476bfac4c 100644
--- a/intern/cycles/render/nodes.cpp
+++ b/intern/cycles/render/nodes.cpp
@@ -243,7 +243,9 @@ void ImageTextureNode::compile(SVMCompiler& compiler)
image_manager = compiler.image_manager;
if(is_float == -1) {
bool is_float_bool;
- slot = image_manager->add_image(filename, builtin_data, animated, is_float_bool, is_linear, interpolation, use_alpha);
+ slot = image_manager->add_image(filename, builtin_data,
+ animated, 0, is_float_bool, is_linear,
+ interpolation, use_alpha);
is_float = (int)is_float_bool;
}
@@ -305,10 +307,32 @@ void ImageTextureNode::compile(OSLCompiler& compiler)
tex_mapping.compile(compiler);
- if(is_float == -1)
- is_float = (int)image_manager->is_float_image(filename, NULL, is_linear);
+ image_manager = compiler.image_manager;
+ if(is_float == -1) {
+ if(builtin_data == NULL) {
+ is_float = (int)image_manager->is_float_image(filename, NULL, is_linear);
+ }
+ else {
+ bool is_float_bool;
+ slot = image_manager->add_image(filename, builtin_data,
+ animated, 0, is_float_bool, is_linear,
+ interpolation, use_alpha);
+ is_float = (int)is_float_bool;
+ }
+ }
- compiler.parameter("filename", filename.c_str());
+ if(slot == -1) {
+ compiler.parameter("filename", filename.c_str());
+ }
+ else {
+ /* TODO(sergey): It's not so simple to pass custom attribute
+ * to the texture() function in order to make builtin images
+ * support more clear. So we use special file name which is
+ * "@<slot_number>" and check whether file name matches this
+ * mask in the OSLRenderServices::texture().
+ */
+ compiler.parameter("filename", string_printf("@%d", slot).c_str());
+ }
if(is_linear || color_space != "Color")
compiler.parameter("color_space", "Linear");
else
@@ -408,7 +432,9 @@ void EnvironmentTextureNode::compile(SVMCompiler& compiler)
image_manager = compiler.image_manager;
if(slot == -1) {
bool is_float_bool;
- slot = image_manager->add_image(filename, builtin_data, animated, is_float_bool, is_linear, INTERPOLATION_LINEAR, use_alpha);
+ slot = image_manager->add_image(filename, builtin_data,
+ animated, 0, is_float_bool, is_linear,
+ INTERPOLATION_LINEAR, use_alpha);
is_float = (int)is_float_bool;
}
@@ -459,10 +485,29 @@ void EnvironmentTextureNode::compile(OSLCompiler& compiler)
tex_mapping.compile(compiler);
- if(is_float == -1)
- is_float = (int)image_manager->is_float_image(filename, NULL, is_linear);
+ /* See comments in ImageTextureNode::compile about support
+ * of builtin images.
+ */
+ image_manager = compiler.image_manager;
+ if(is_float == -1) {
+ if(builtin_data == NULL) {
+ is_float = (int)image_manager->is_float_image(filename, NULL, is_linear);
+ }
+ else {
+ bool is_float_bool;
+ slot = image_manager->add_image(filename, builtin_data,
+ animated, 0, is_float_bool, is_linear,
+ INTERPOLATION_LINEAR, use_alpha);
+ is_float = (int)is_float_bool;
+ }
+ }
- compiler.parameter("filename", filename.c_str());
+ if(slot == -1) {
+ compiler.parameter("filename", filename.c_str());
+ }
+ else {
+ compiler.parameter("filename", string_printf("@%d", slot).c_str());
+ }
compiler.parameter("projection", projection);
if(is_linear || color_space != "Color")
compiler.parameter("color_space", "Linear");
@@ -1543,11 +1588,24 @@ void BsdfNode::compile(OSLCompiler& compiler)
assert(0);
}
-/* Ward BSDF Closure */
+/* Anisotropic BSDF Closure */
-WardBsdfNode::WardBsdfNode()
+static ShaderEnum aniso_distribution_init()
{
- closure = CLOSURE_BSDF_WARD_ID;
+ ShaderEnum enm;
+
+ enm.insert("Beckmann", CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID);
+ enm.insert("GGX", CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID);
+ enm.insert("Ashikhmin-Shirley", CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID);
+
+ return enm;
+}
+
+ShaderEnum AnisotropicBsdfNode::distribution_enum = aniso_distribution_init();
+
+AnisotropicBsdfNode::AnisotropicBsdfNode()
+{
+ distribution = ustring("GGX");
add_input("Tangent", SHADER_SOCKET_VECTOR, ShaderInput::TANGENT);
@@ -1556,7 +1614,7 @@ WardBsdfNode::WardBsdfNode()
add_input("Rotation", SHADER_SOCKET_FLOAT, 0.0f);
}
-void WardBsdfNode::attributes(Shader *shader, AttributeRequestSet *attributes)
+void AnisotropicBsdfNode::attributes(Shader *shader, AttributeRequestSet *attributes)
{
if(shader->has_surface) {
ShaderInput *tangent_in = input("Tangent");
@@ -1568,14 +1626,17 @@ void WardBsdfNode::attributes(Shader *shader, AttributeRequestSet *attributes)
ShaderNode::attributes(shader, attributes);
}
-void WardBsdfNode::compile(SVMCompiler& compiler)
+void AnisotropicBsdfNode::compile(SVMCompiler& compiler)
{
+ closure = (ClosureType)distribution_enum[distribution];
+
BsdfNode::compile(compiler, input("Roughness"), input("Anisotropy"), input("Rotation"));
}
-void WardBsdfNode::compile(OSLCompiler& compiler)
+void AnisotropicBsdfNode::compile(OSLCompiler& compiler)
{
- compiler.add(this, "node_ward_bsdf");
+ compiler.parameter("distribution", distribution);
+ compiler.add(this, "node_anisotropic_bsdf");
}
/* Glossy BSDF Closure */
@@ -1587,6 +1648,7 @@ static ShaderEnum glossy_distribution_init()
enm.insert("Sharp", CLOSURE_BSDF_REFLECTION_ID);
enm.insert("Beckmann", CLOSURE_BSDF_MICROFACET_BECKMANN_ID);
enm.insert("GGX", CLOSURE_BSDF_MICROFACET_GGX_ID);
+ enm.insert("Ashikhmin-Shirley", CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ID);
return enm;
}
@@ -1595,7 +1657,7 @@ ShaderEnum GlossyBsdfNode::distribution_enum = glossy_distribution_init();
GlossyBsdfNode::GlossyBsdfNode()
{
- distribution = ustring("Beckmann");
+ distribution = ustring("GGX");
add_input("Roughness", SHADER_SOCKET_FLOAT, 0.2f);
}
@@ -1850,8 +1912,6 @@ bool SubsurfaceScatteringNode::has_bssrdf_bump()
EmissionNode::EmissionNode()
: ShaderNode("emission")
{
- total_power = false;
-
add_input("Color", SHADER_SOCKET_COLOR, make_float3(0.8f, 0.8f, 0.8f));
add_input("Strength", SHADER_SOCKET_FLOAT, 10.0f);
add_input("SurfaceMixWeight", SHADER_SOCKET_FLOAT, 0.0f, ShaderInput::USE_SVM);
@@ -1867,10 +1927,8 @@ void EmissionNode::compile(SVMCompiler& compiler)
if(color_in->link || strength_in->link) {
compiler.stack_assign(color_in);
compiler.stack_assign(strength_in);
- compiler.add_node(NODE_EMISSION_WEIGHT, color_in->stack_offset, strength_in->stack_offset, total_power? 1: 0);
+ compiler.add_node(NODE_EMISSION_WEIGHT, color_in->stack_offset, strength_in->stack_offset);
}
- else if(total_power)
- compiler.add_node(NODE_EMISSION_SET_WEIGHT_TOTAL, color_in->value * strength_in->value.x);
else
compiler.add_node(NODE_CLOSURE_SET_WEIGHT, color_in->value * strength_in->value.x);
@@ -1879,7 +1937,6 @@ void EmissionNode::compile(SVMCompiler& compiler)
void EmissionNode::compile(OSLCompiler& compiler)
{
- compiler.parameter("TotalPower", (total_power)? 1: 0);
compiler.add(this, "node_emission");
}
@@ -3007,13 +3064,13 @@ void CombineRGBNode::compile(SVMCompiler& compiler)
compiler.stack_assign(color_out);
compiler.stack_assign(red_in);
- compiler.add_node(NODE_COMBINE_RGB, red_in->stack_offset, 0, color_out->stack_offset);
+ compiler.add_node(NODE_COMBINE_VECTOR, red_in->stack_offset, 0, color_out->stack_offset);
compiler.stack_assign(green_in);
- compiler.add_node(NODE_COMBINE_RGB, green_in->stack_offset, 1, color_out->stack_offset);
+ compiler.add_node(NODE_COMBINE_VECTOR, green_in->stack_offset, 1, color_out->stack_offset);
compiler.stack_assign(blue_in);
- compiler.add_node(NODE_COMBINE_RGB, blue_in->stack_offset, 2, color_out->stack_offset);
+ compiler.add_node(NODE_COMBINE_VECTOR, blue_in->stack_offset, 2, color_out->stack_offset);
}
void CombineRGBNode::compile(OSLCompiler& compiler)
@@ -3021,6 +3078,40 @@ void CombineRGBNode::compile(OSLCompiler& compiler)
compiler.add(this, "node_combine_rgb");
}
+/* Combine XYZ
+ *
+ * Constructs the node under the name "combine_xyz" with three float input
+ * sockets ("X", "Y", "Z") and a single vector output socket ("Vector"). */
+CombineXYZNode::CombineXYZNode()
+: ShaderNode("combine_xyz")
+{
+ add_input("X", SHADER_SOCKET_FLOAT);
+ add_input("Y", SHADER_SOCKET_FLOAT);
+ add_input("Z", SHADER_SOCKET_FLOAT);
+ add_output("Vector", SHADER_SOCKET_VECTOR);
+}
+/* SVM backend: emits three NODE_COMBINE_VECTOR instructions, one each for
+ * the "X", "Y" and "Z" inputs, all targeting the "Vector" output. Each
+ * instruction carries the input's stack offset, a constant 0/1/2
+ * (presumably the destination component index - confirm against the SVM
+ * kernel) and the output's stack offset. */
+void CombineXYZNode::compile(SVMCompiler& compiler)
+{
+ ShaderInput *x_in = input("X");
+ ShaderInput *y_in = input("Y");
+ ShaderInput *z_in = input("Z");
+ ShaderOutput *vector_out = output("Vector");
+ /* the output is stack-assigned once, up front; each input is assigned
+ * right before the instruction that reads its stack_offset */
+ compiler.stack_assign(vector_out);
+
+ compiler.stack_assign(x_in);
+ compiler.add_node(NODE_COMBINE_VECTOR, x_in->stack_offset, 0, vector_out->stack_offset);
+
+ compiler.stack_assign(y_in);
+ compiler.add_node(NODE_COMBINE_VECTOR, y_in->stack_offset, 1, vector_out->stack_offset);
+
+ compiler.stack_assign(z_in);
+ compiler.add_node(NODE_COMBINE_VECTOR, z_in->stack_offset, 2, vector_out->stack_offset);
+}
+/* OSL backend: passes this node to the OSL compiler's add() together with
+ * the name "node_combine_xyz" (presumably the OSL shader to instantiate -
+ * confirm against OSLCompiler::add). */
+void CombineXYZNode::compile(OSLCompiler& compiler)
+{
+ compiler.add(this, "node_combine_xyz");
+}
+
/* Combine HSV */
CombineHSVNode::CombineHSVNode()
: ShaderNode("combine_hsv")
@@ -3131,13 +3222,13 @@ void SeparateRGBNode::compile(SVMCompiler& compiler)
compiler.stack_assign(color_in);
compiler.stack_assign(red_out);
- compiler.add_node(NODE_SEPARATE_RGB, color_in->stack_offset, 0, red_out->stack_offset);
+ compiler.add_node(NODE_SEPARATE_VECTOR, color_in->stack_offset, 0, red_out->stack_offset);
compiler.stack_assign(green_out);
- compiler.add_node(NODE_SEPARATE_RGB, color_in->stack_offset, 1, green_out->stack_offset);
+ compiler.add_node(NODE_SEPARATE_VECTOR, color_in->stack_offset, 1, green_out->stack_offset);
compiler.stack_assign(blue_out);
- compiler.add_node(NODE_SEPARATE_RGB, color_in->stack_offset, 2, blue_out->stack_offset);
+ compiler.add_node(NODE_SEPARATE_VECTOR, color_in->stack_offset, 2, blue_out->stack_offset);
}
void SeparateRGBNode::compile(OSLCompiler& compiler)
@@ -3145,6 +3236,40 @@ void SeparateRGBNode::compile(OSLCompiler& compiler)
compiler.add(this, "node_separate_rgb");
}
+/* Separate XYZ
+ *
+ * Constructs the node under the name "separate_xyz" with a single vector
+ * input socket ("Vector") and three float output sockets ("X", "Y", "Z"). */
+SeparateXYZNode::SeparateXYZNode()
+: ShaderNode("separate_xyz")
+{
+ add_input("Vector", SHADER_SOCKET_VECTOR);
+ add_output("X", SHADER_SOCKET_FLOAT);
+ add_output("Y", SHADER_SOCKET_FLOAT);
+ add_output("Z", SHADER_SOCKET_FLOAT);
+}
+/* SVM backend: emits three NODE_SEPARATE_VECTOR instructions, one each for
+ * the "X", "Y" and "Z" outputs, all reading the "Vector" input. Each
+ * instruction carries the input's stack offset, a constant 0/1/2
+ * (presumably the source component index - confirm against the SVM kernel)
+ * and the stack offset of the output being written. */
+void SeparateXYZNode::compile(SVMCompiler& compiler)
+{
+ ShaderInput *vector_in = input("Vector");
+ ShaderOutput *x_out = output("X");
+ ShaderOutput *y_out = output("Y");
+ ShaderOutput *z_out = output("Z");
+ /* the input is stack-assigned once, up front; each output is assigned
+ * right before the instruction that refers to its stack_offset */
+ compiler.stack_assign(vector_in);
+
+ compiler.stack_assign(x_out);
+ compiler.add_node(NODE_SEPARATE_VECTOR, vector_in->stack_offset, 0, x_out->stack_offset);
+
+ compiler.stack_assign(y_out);
+ compiler.add_node(NODE_SEPARATE_VECTOR, vector_in->stack_offset, 1, y_out->stack_offset);
+
+ compiler.stack_assign(z_out);
+ compiler.add_node(NODE_SEPARATE_VECTOR, vector_in->stack_offset, 2, z_out->stack_offset);
+}
+/* OSL backend: passes this node to the OSL compiler's add() together with
+ * the name "node_separate_xyz" (presumably the OSL shader to instantiate -
+ * confirm against OSLCompiler::add). */
+void SeparateXYZNode::compile(OSLCompiler& compiler)
+{
+ compiler.add(this, "node_separate_xyz");
+}
+
/* Separate HSV */
SeparateHSVNode::SeparateHSVNode()
: ShaderNode("separate_hsv")
@@ -4126,4 +4251,3 @@ void TangentNode::compile(OSLCompiler& compiler)
}
CCL_NAMESPACE_END
-
diff --git a/intern/cycles/render/nodes.h b/intern/cycles/render/nodes.h
index d94d8ce6033..31b6f4e50c4 100644
--- a/intern/cycles/render/nodes.h
+++ b/intern/cycles/render/nodes.h
@@ -218,9 +218,13 @@ public:
bool scattering;
};
-class WardBsdfNode : public BsdfNode {
+class AnisotropicBsdfNode : public BsdfNode {
public:
- SHADER_NODE_CLASS(WardBsdfNode)
+ SHADER_NODE_CLASS(AnisotropicBsdfNode)
+
+ ustring distribution;
+ static ShaderEnum distribution_enum;
+
void attributes(Shader *shader, AttributeRequestSet *attributes);
};
@@ -294,8 +298,6 @@ public:
bool has_surface_emission() { return true; }
bool has_spatial_varying() { return true; }
-
- bool total_power;
};
class BackgroundNode : public ShaderNode {
@@ -453,6 +455,11 @@ public:
SHADER_NODE_CLASS(CombineHSVNode)
};
+class CombineXYZNode : public ShaderNode { /* members are whatever SHADER_NODE_CLASS expands to (macro not shown in this diff) */
+public:
+ SHADER_NODE_CLASS(CombineXYZNode)
+};
+
class GammaNode : public ShaderNode {
public:
SHADER_NODE_CLASS(GammaNode)
@@ -473,6 +480,11 @@ public:
SHADER_NODE_CLASS(SeparateHSVNode)
};
+class SeparateXYZNode : public ShaderNode { /* members are whatever SHADER_NODE_CLASS expands to (macro not shown in this diff) */
+public:
+ SHADER_NODE_CLASS(SeparateXYZNode)
+};
+
class HSVNode : public ShaderNode {
public:
SHADER_NODE_CLASS(HSVNode)
diff --git a/intern/cycles/render/object.cpp b/intern/cycles/render/object.cpp
index 027bfd71931..46ddab235d9 100644
--- a/intern/cycles/render/object.cpp
+++ b/intern/cycles/render/object.cpp
@@ -75,8 +75,14 @@ void Object::compute_bounds(bool motion_blur)
bounds.grow(mbounds.transformed(&ttfm));
}
}
- else
- bounds = mbounds.transformed(&tfm);
+ else {
+ if(mesh->transform_applied) {
+ bounds = mbounds;
+ }
+ else {
+ bounds = mbounds.transformed(&tfm);
+ }
+ }
}
void Object::apply_transform(bool apply_to_motion)
@@ -372,8 +378,6 @@ void ObjectManager::device_update(Device *device, DeviceScene *dscene, Scene *sc
device_free(device, dscene);
- need_update = false;
-
if(scene->objects.size() == 0)
return;
@@ -392,6 +396,46 @@ void ObjectManager::device_update(Device *device, DeviceScene *dscene, Scene *sc
progress.set_status("Updating Objects", "Applying Static Transformations");
apply_static_transforms(dscene, scene, object_flag, progress);
}
+}
+
+void ObjectManager::device_update_flags(Device *device, DeviceScene *dscene,
+ Scene *scene, Progress& progress)
+{
+ if(!need_update)
+ return;
+
+ need_update = false;
+
+ if(scene->objects.size() == 0)
+ return;
+
+ /* object info flag */
+ uint *object_flag = dscene->object_flag.get_data();
+
+ vector<Object *> volume_objects;
+ foreach(Object *object, scene->objects) {
+ if(object->mesh->has_volume) {
+ volume_objects.push_back(object);
+ }
+ }
+
+ int object_index = 0;
+ foreach(Object *object, scene->objects) {
+ if(object->mesh->has_volume) {
+ object_flag[object_index] |= SD_OBJECT_HAS_VOLUME;
+ }
+
+ foreach(Object *volume_object, volume_objects) {
+ if(object == volume_object) {
+ continue;
+ }
+ if(object->bounds.intersects(volume_object->bounds)) {
+ object_flag[object_index] |= SD_OBJECT_INTERSECTS_VOLUME;
+ break;
+ }
+ }
+ ++object_index;
+ }
/* allocate object flag */
device->tex_alloc("__object_flag", dscene->object_flag);
@@ -449,6 +493,8 @@ void ObjectManager::apply_static_transforms(DeviceScene *dscene, Scene *scene, u
}
object_flag[i] |= SD_TRANSFORM_APPLIED;
+ if(object->mesh->transform_negative_scaled)
+ object_flag[i] |= SD_NEGATIVE_SCALE_APPLIED;
}
else
have_instancing = true;
diff --git a/intern/cycles/render/object.h b/intern/cycles/render/object.h
index 677526b715f..2c69b83a2e9 100644
--- a/intern/cycles/render/object.h
+++ b/intern/cycles/render/object.h
@@ -76,6 +76,7 @@ public:
void device_update(Device *device, DeviceScene *dscene, Scene *scene, Progress& progress);
void device_update_transforms(Device *device, DeviceScene *dscene, Scene *scene, uint *object_flag, Progress& progress);
+ void device_update_flags(Device *device, DeviceScene *dscene, Scene *scene, Progress& progress);
void device_free(Device *device, DeviceScene *dscene);
void tag_update(Scene *scene);
diff --git a/intern/cycles/render/osl.cpp b/intern/cycles/render/osl.cpp
index 94866102f60..f57e16471a1 100644
--- a/intern/cycles/render/osl.cpp
+++ b/intern/cycles/render/osl.cpp
@@ -248,20 +248,27 @@ void OSLShaderManager::shading_system_free()
bool OSLShaderManager::osl_compile(const string& inputfile, const string& outputfile)
{
- vector<string> options;
+#if OSL_LIBRARY_VERSION_CODE < 10500
+ typedef string string_view;
+#endif
+
+ vector<string_view> options;
string stdosl_path;
+ string shader_path = path_get("shader");
/* specify output file name */
options.push_back("-o");
options.push_back(outputfile);
/* specify standard include path */
- options.push_back("-I" + path_get("shader"));
+ options.push_back("-I");
+ options.push_back(shader_path);
+
stdosl_path = path_get("shader/stdosl.h");
/* compile */
OSL::OSLCompiler *compiler = OSL::OSLCompiler::create();
- bool ok = compiler->compile(inputfile, options, stdosl_path);
+ bool ok = compiler->compile(string_view(inputfile), options, string_view(stdosl_path));
delete compiler;
return ok;
diff --git a/intern/cycles/render/scene.cpp b/intern/cycles/render/scene.cpp
index 4f5ad439520..3662c29587e 100644
--- a/intern/cycles/render/scene.cpp
+++ b/intern/cycles/render/scene.cpp
@@ -61,7 +61,7 @@ Scene::Scene(const SceneParams& params_, const DeviceInfo& device_info_)
if(device_info_.type == DEVICE_CPU)
shader_manager = ShaderManager::create(this, params.shadingsystem);
else
- shader_manager = ShaderManager::create(this, SceneParams::SVM);
+ shader_manager = ShaderManager::create(this, SHADINGSYSTEM_SVM);
/* Extended image limits for CPU and GPUs */
image_manager->set_extended_image_limits(device_info_);
@@ -109,6 +109,8 @@ void Scene::free_memory(bool final)
if(!params.persistent_data || final)
image_manager->device_free(device, &dscene);
+ else
+ image_manager->device_free_builtin(device, &dscene);
lookup_tables->device_free(device, &dscene);
}
@@ -139,7 +141,7 @@ void Scene::device_update(Device *device_, Progress& progress)
* the different managers, using data computed by previous managers.
*
* - Image manager uploads images used by shaders.
- * - Camera may be used for adapative subdivison.
+ * - Camera may be used for adaptive subdivision.
* - Displacement shader must have all shader data available.
* - Light manager needs lookup tables and final mesh data to compute emission CDF.
* - Film needs light manager to run for use_light_visibility
@@ -163,13 +165,18 @@ void Scene::device_update(Device *device_, Progress& progress)
if(progress.get_cancel()) return;
- progress.set_status("Updating Camera");
- camera->device_update(device, &dscene, this);
+ progress.set_status("Updating Objects");
+ object_manager->device_update(device, &dscene, this, progress);
if(progress.get_cancel()) return;
- progress.set_status("Updating Objects");
- object_manager->device_update(device, &dscene, this, progress);
+ progress.set_status("Updating Meshes");
+ mesh_manager->device_update(device, &dscene, this, progress);
+
+ if(progress.get_cancel()) return;
+
+ progress.set_status("Updating Objects Flags");
+ object_manager->device_update_flags(device, &dscene, this, progress);
if(progress.get_cancel()) return;
@@ -183,8 +190,9 @@ void Scene::device_update(Device *device_, Progress& progress)
if(progress.get_cancel()) return;
- progress.set_status("Updating Meshes");
- mesh_manager->device_update(device, &dscene, this, progress);
+ /* TODO(sergey): Make sure camera is not needed above. */
+ progress.set_status("Updating Camera");
+ camera->device_update(device, &dscene, this);
if(progress.get_cancel()) return;
@@ -269,7 +277,8 @@ bool Scene::need_reset()
|| shader_manager->need_update
|| particle_system_manager->need_update
|| curve_system_manager->need_update
- || bake_manager->need_update);
+ || bake_manager->need_update
+ || film->need_update);
}
void Scene::reset()
@@ -282,6 +291,11 @@ void Scene::reset()
film->tag_update(this);
background->tag_update(this);
integrator->tag_update(this);
+ object_manager->tag_update(this);
+ mesh_manager->tag_update(this);
+ light_manager->tag_update(this);
+ particle_system_manager->tag_update(this);
+ curve_system_manager->tag_update(this);
}
void Scene::device_free()
diff --git a/intern/cycles/render/scene.h b/intern/cycles/render/scene.h
index 0f0bb725823..5d205225d97 100644
--- a/intern/cycles/render/scene.h
+++ b/intern/cycles/render/scene.h
@@ -18,6 +18,7 @@
#define __SCENE_H__
#include "image.h"
+#include "shader.h"
#include "device_memory.h"
@@ -68,7 +69,7 @@ public:
device_vector<uint> prim_object;
/* mesh */
- device_vector<float4> tri_normal;
+ device_vector<uint> tri_shader;
device_vector<float4> tri_vnormal;
device_vector<float4> tri_vindex;
device_vector<float4> tri_verts;
@@ -84,6 +85,7 @@ public:
device_vector<uint4> attributes_map;
device_vector<float> attributes_float;
device_vector<float4> attributes_float3;
+ device_vector<uchar4> attributes_uchar4;
/* lights */
device_vector<float4> light_distribution;
@@ -120,7 +122,7 @@ public:
class SceneParams {
public:
- enum { OSL, SVM } shadingsystem;
+ ShadingSystem shadingsystem;
enum BVHType { BVH_DYNAMIC, BVH_STATIC } bvh_type;
bool use_bvh_cache;
bool use_bvh_spatial_split;
@@ -129,7 +131,7 @@ public:
SceneParams()
{
- shadingsystem = SVM;
+ shadingsystem = SHADINGSYSTEM_SVM;
bvh_type = BVH_DYNAMIC;
use_bvh_cache = false;
use_bvh_spatial_split = false;
diff --git a/intern/cycles/render/session.cpp b/intern/cycles/render/session.cpp
index 28b44df6b36..9fcd9fa85f5 100644
--- a/intern/cycles/render/session.cpp
+++ b/intern/cycles/render/session.cpp
@@ -592,9 +592,10 @@ void Session::run_cpu()
update_progressive_refine(true);
}
-void Session::run()
+void Session::load_kernels()
{
- /* load kernels */
+ thread_scoped_lock scene_lock(scene->mutex);
+
if(!kernels_loaded) {
progress.set_status("Loading render kernels (may take a few minutes the first time)");
@@ -603,6 +604,7 @@ void Session::run()
if(message.empty())
message = "Failed loading render kernel, see console for errors";
+ progress.set_cancel(message);
progress.set_status("Error", message);
progress.set_update();
return;
@@ -610,6 +612,12 @@ void Session::run()
kernels_loaded = true;
}
+}
+
+void Session::run()
+{
+ /* load kernels */
+ load_kernels();
/* session thread loop */
progress.set_status("Waiting for render to start");
diff --git a/intern/cycles/render/session.h b/intern/cycles/render/session.h
index 1e625158652..9da7a0aafa3 100644
--- a/intern/cycles/render/session.h
+++ b/intern/cycles/render/session.h
@@ -19,6 +19,7 @@
#include "buffers.h"
#include "device.h"
+#include "shader.h"
#include "tile.h"
#include "util_progress.h"
@@ -59,7 +60,7 @@ public:
double reset_timeout;
double text_timeout;
- enum { OSL, SVM } shadingsystem;
+ ShadingSystem shadingsystem;
SessionParams()
{
@@ -80,7 +81,7 @@ public:
reset_timeout = 0.1;
text_timeout = 1.0;
- shadingsystem = SVM;
+ shadingsystem = SHADINGSYSTEM_SVM;
tile_order = TILE_CENTER;
}
@@ -137,7 +138,10 @@ public:
void set_pause(bool pause);
void update_scene();
+ void load_kernels();
+
void device_free();
+
protected:
struct DelayedReset {
thread_mutex mutex;
diff --git a/intern/cycles/render/shader.cpp b/intern/cycles/render/shader.cpp
index b25673b36c3..2a3969b6188 100644
--- a/intern/cycles/render/shader.cpp
+++ b/intern/cycles/render/shader.cpp
@@ -31,6 +31,100 @@
CCL_NAMESPACE_BEGIN
+/* Beckmann sampling precomputed table, see bsdf_microfacet.h */
+
+/* 2D slope distribution (alpha = 1.0)
+ *
+ * Returns exp(-(slope_x^2 + slope_y^2)), evaluated in single precision.
+ * No normalization factor is applied here; the caller in this file
+ * (beckmann_table_rows) renormalizes the CDF it accumulates from these
+ * values. */
+static float beckmann_table_P22(const float slope_x, const float slope_y)
+{
+ return expf(-(slope_x*slope_x + slope_y*slope_y));
+}
+
+/* maximal slope amplitude (range that contains 99.99% of the distribution)
+ *
+ * beckmann_table_rows samples both slope_x and slope_y over
+ * [-slope_max, +slope_max]. The literal below is a double that is
+ * converted to float on return. */
+static float beckmann_table_slope_max()
+{
+ return 6.0;
+}
+
+/* Paper used: Importance Sampling Microfacet-Based BSDFs with the
+ * Distribution of Visible Normals. Supplemental Material 2/2.
+ *
+ * http://hal.inria.fr/docs/01/00/66/20/ANNEX/supplemental2.pdf
+ *
+ * Fills rows [row_from, row_to) of the lookup table. Row index_theta
+ * corresponds to cos_theta = index_theta / (BECKMANN_TABLE_SIZE - 1) and
+ * column index_U to a random number U; the stored entry is the slope_x at
+ * which the tabulated CDF reaches U. Entries are written to
+ * table[index_U + index_theta*BECKMANN_TABLE_SIZE], so the caller must
+ * provide a buffer that covers every row below row_to.
+ */
+static void beckmann_table_rows(float *table, int row_from, int row_to)
+{
+ /* allocate temporary data: the slope_x sample positions and the running
+ * CDF at those positions; both buffers are reused for every row */
+ const int DATA_TMP_SIZE = 512;
+ vector<double> slope_x(DATA_TMP_SIZE);
+ vector<double> CDF_P22_omega_i(DATA_TMP_SIZE);
+
+ /* loop over incident directions */
+ for(int index_theta = row_from; index_theta < row_to; index_theta++) {
+ /* incident vector */
+ const float cos_theta = index_theta / (BECKMANN_TABLE_SIZE - 1.0f);
+ const float sin_theta = safe_sqrtf(1.0f - cos_theta*cos_theta);
+
+ /* for a given incident vector
+ * integrate P22_{omega_i}(x_slope, 1, 1), Eq. (10) */
+ slope_x[0] = -beckmann_table_slope_max();
+ CDF_P22_omega_i[0] = 0;
+
+ for(int index_slope_x = 1; index_slope_x < DATA_TMP_SIZE; ++index_slope_x) {
+ /* slope_x */
+ slope_x[index_slope_x] = -beckmann_table_slope_max() + 2.0f * beckmann_table_slope_max() * index_slope_x/(DATA_TMP_SIZE - 1.0f);
+
+ /* dot product with incident vector */
+ float dot_product = fmaxf(0.0f, -(float)slope_x[index_slope_x]*sin_theta + cos_theta);
+ /* marginalize P22_{omega_i}(x_slope, 1, 1), Eq. (10)
+ * by summing 100 evenly spaced slope_y samples spanning
+ * [-slope_max, +slope_max] */
+ float P22_omega_i = 0.0f;
+
+ for(int j = 0; j < 100; ++j) {
+ float slope_y = -beckmann_table_slope_max() + 2.0f * beckmann_table_slope_max() * j * (1.0f/99.0f);
+ P22_omega_i += dot_product * beckmann_table_P22((float)slope_x[index_slope_x], slope_y);
+ }
+
+ /* CDF of P22_{omega_i}(x_slope, 1, 1), Eq. (10) */
+ CDF_P22_omega_i[index_slope_x] = CDF_P22_omega_i[index_slope_x - 1] + (double)P22_omega_i;
+ }
+
+ /* renormalize CDF_P22_omega_i
+ * (ascending order matters: the last entry is still the unnormalized
+ * total while the earlier entries are divided by it, and is divided
+ * by itself only in the final iteration; entry 0 stays at 0) */
+ for(int index_slope_x = 1; index_slope_x < DATA_TMP_SIZE; ++index_slope_x)
+ CDF_P22_omega_i[index_slope_x] /= CDF_P22_omega_i[DATA_TMP_SIZE - 1];
+
+ /* loop over random number U1
+ * (the search index is not reset between iterations: U grows with
+ * index_U, so each search resumes where the previous one stopped) */
+ int index_slope_x = 0;
+
+ for(int index_U = 0; index_U < BECKMANN_TABLE_SIZE; ++index_U) {
+ const double U = 0.0000001 + 0.9999998 * index_U / (double)(BECKMANN_TABLE_SIZE - 1);
+
+ /* inverse CDF_P22_omega_i, solve Eq.(11)
+ * U lies in [1e-7, 0.9999999]: since entry 0 of the CDF is 0 the
+ * search always advances to an index >= 1 (so index - 1 below is
+ * valid), and it stops in bounds as long as the normalized CDF
+ * ends at 1, i.e. the accumulated total was finite and nonzero */
+ while(CDF_P22_omega_i[index_slope_x] <= U)
+ ++index_slope_x;
+
+ const double interp =
+ (CDF_P22_omega_i[index_slope_x] - U) /
+ (CDF_P22_omega_i[index_slope_x] - CDF_P22_omega_i[index_slope_x - 1]);
+
+ /* store value: linear interpolation between the two slope_x
+ * samples that bracket U in the CDF */
+ table[index_U + index_theta*BECKMANN_TABLE_SIZE] = (float)(
+ interp * slope_x[index_slope_x - 1] +
+ (1.0 - interp) * slope_x[index_slope_x]);
+ }
+ }
+}
+/* Resizes `table` to BECKMANN_TABLE_SIZE*BECKMANN_TABLE_SIZE floats and
+ * fills it by running beckmann_table_rows() from TaskPool tasks. */
+static void beckmann_table_build(vector<float>& table)
+{
+ table.resize(BECKMANN_TABLE_SIZE*BECKMANN_TABLE_SIZE);
+
+ /* multithreaded build: one task per group of 8 consecutive rows.
+ * NOTE(review): row_to is passed as i+8 without clamping, so this relies
+ * on BECKMANN_TABLE_SIZE being a multiple of 8; otherwise the last task
+ * would write past the end of the table - confirm against the definition
+ * of BECKMANN_TABLE_SIZE. */
+ TaskPool pool;
+
+ for(int i = 0; i < BECKMANN_TABLE_SIZE; i+=8)
+ pool.push(function_bind(&beckmann_table_rows, &table[0], i, i+8));
+
+ pool.wait_work();
+}
+
/* Shader */
Shader::Shader()
@@ -44,6 +138,8 @@ Shader::Shader()
use_mis = true;
use_transparent_shadow = true;
heterogeneous_volume = true;
+ volume_sampling_method = VOLUME_SAMPLING_DISTANCE;
+ volume_interpolation_method = VOLUME_INTERPOLATION_LINEAR;
has_surface = false;
has_surface_transparent = false;
@@ -137,6 +233,7 @@ ShaderManager::ShaderManager()
{
need_update = true;
blackbody_table_offset = TABLE_OFFSET_INVALID;
+ beckmann_table_offset = TABLE_OFFSET_INVALID;
}
ShaderManager::~ShaderManager()
@@ -148,7 +245,7 @@ ShaderManager *ShaderManager::create(Scene *scene, int shadingsystem)
ShaderManager *manager;
#ifdef WITH_OSL
- if(shadingsystem == SceneParams::OSL)
+ if(shadingsystem == SHADINGSYSTEM_OSL)
manager = new OSLShaderManager();
else
#endif
@@ -256,6 +353,12 @@ void ShaderManager::device_update_common(Device *device, DeviceScene *dscene, Sc
flag |= SD_HAS_BSSRDF_BUMP;
if(shader->has_converter_blackbody)
has_converter_blackbody = true;
+ if(shader->volume_sampling_method == VOLUME_SAMPLING_EQUIANGULAR)
+ flag |= SD_VOLUME_EQUIANGULAR;
+ if(shader->volume_sampling_method == VOLUME_SAMPLING_MULTIPLE_IMPORTANCE)
+ flag |= SD_VOLUME_MIS;
+ if(shader->volume_interpolation_method == VOLUME_INTERPOLATION_CUBIC)
+ flag |= SD_VOLUME_CUBIC;
/* regular shader */
shader_flag[i++] = flag;
@@ -272,20 +375,29 @@ void ShaderManager::device_update_common(Device *device, DeviceScene *dscene, Sc
device->tex_alloc("__shader_flag", dscene->shader_flag);
/* blackbody lookup table */
- KernelBlackbody *kblackbody = &dscene->data.blackbody;
+ KernelTables *ktables = &dscene->data.tables;
if(has_converter_blackbody && blackbody_table_offset == TABLE_OFFSET_INVALID) {
vector<float> table = blackbody_table();
blackbody_table_offset = scene->lookup_tables->add_table(dscene, table);
- kblackbody->table_offset = (int)blackbody_table_offset;
+ ktables->blackbody_offset = (int)blackbody_table_offset;
}
else if(!has_converter_blackbody && blackbody_table_offset != TABLE_OFFSET_INVALID) {
scene->lookup_tables->remove_table(blackbody_table_offset);
blackbody_table_offset = TABLE_OFFSET_INVALID;
}
- /* volumes */
+ /* beckmann lookup table */
+ if(beckmann_table_offset == TABLE_OFFSET_INVALID) {
+ vector<float> table;
+ beckmann_table_build(table);
+ beckmann_table_offset = scene->lookup_tables->add_table(dscene, table);
+
+ ktables->beckmann_offset = (int)beckmann_table_offset;
+ }
+
+ /* integrator */
KernelIntegrator *kintegrator = &dscene->data.integrator;
kintegrator->use_volumes = has_volumes;
}
@@ -297,6 +409,11 @@ void ShaderManager::device_free_common(Device *device, DeviceScene *dscene, Scen
blackbody_table_offset = TABLE_OFFSET_INVALID;
}
+ if(beckmann_table_offset != TABLE_OFFSET_INVALID) {
+ scene->lookup_tables->remove_table(beckmann_table_offset);
+ beckmann_table_offset = TABLE_OFFSET_INVALID;
+ }
+
device->tex_free(dscene->shader_flag);
dscene->shader_flag.clear();
}
diff --git a/intern/cycles/render/shader.h b/intern/cycles/render/shader.h
index 874e8face7a..b267731abe5 100644
--- a/intern/cycles/render/shader.h
+++ b/intern/cycles/render/shader.h
@@ -17,6 +17,10 @@
#ifndef __SHADER_H__
#define __SHADER_H__
+#ifdef WITH_OSL
+# include <OSL/oslexec.h>
+#endif
+
#include "attribute.h"
#include "kernel_types.h"
@@ -25,10 +29,6 @@
#include "util_string.h"
#include "util_types.h"
-#ifdef WITH_OSL
-#include <OSL/oslexec.h>
-#endif
-
CCL_NAMESPACE_BEGIN
class Device;
@@ -39,6 +39,23 @@ class Scene;
class ShaderGraph;
struct float3;
+enum ShadingSystem { /* selects the shader backend in ShaderManager::create() */
+ SHADINGSYSTEM_OSL,
+ SHADINGSYSTEM_SVM
+};
+
+/* Keep these in sync with the Python-defined enum.
+ *
+ * Per-shader volume sampling method. ShaderManager::device_update_common()
+ * sets SD_VOLUME_EQUIANGULAR for VOLUME_SAMPLING_EQUIANGULAR and
+ * SD_VOLUME_MIS for VOLUME_SAMPLING_MULTIPLE_IMPORTANCE; no flag is set for
+ * VOLUME_SAMPLING_DISTANCE, which is the value assigned in Shader::Shader(). */
+enum VolumeSampling {
+ VOLUME_SAMPLING_DISTANCE = 0,
+ VOLUME_SAMPLING_EQUIANGULAR = 1,
+ VOLUME_SAMPLING_MULTIPLE_IMPORTANCE = 2,
+};
+/* Per-shader volume interpolation method. Shader::Shader() assigns
+ * VOLUME_INTERPOLATION_LINEAR; ShaderManager::device_update_common() sets
+ * SD_VOLUME_CUBIC for VOLUME_INTERPOLATION_CUBIC.
+ * NOTE(review): Shader::volume_interpolation_method is declared as int
+ * rather than as this enum type - confirm whether that is intentional. */
+enum VolumeInterpolation {
+ VOLUME_INTERPOLATION_LINEAR = 0,
+ VOLUME_INTERPOLATION_CUBIC = 1,
+};
+
/* Shader describing the appearance of a Mesh, Light or Background.
*
* While there is only a single shader graph, it has three outputs: surface,
@@ -63,6 +80,8 @@ public:
bool use_mis;
bool use_transparent_shadow;
bool heterogeneous_volume;
+ VolumeSampling volume_sampling_method;
+ int volume_interpolation_method;
/* synchronization */
bool need_update;
@@ -143,6 +162,7 @@ protected:
AttributeIDMap unique_attribute_id;
size_t blackbody_table_offset;
+ size_t beckmann_table_offset;
};
CCL_NAMESPACE_END
diff --git a/intern/cycles/render/svm.cpp b/intern/cycles/render/svm.cpp
index 576c176759c..13c63d9420c 100644
--- a/intern/cycles/render/svm.cpp
+++ b/intern/cycles/render/svm.cpp
@@ -363,14 +363,17 @@ bool SVMCompiler::node_skip_input(ShaderNode *node, ShaderInput *input)
return false;
}
-void SVMCompiler::find_dependencies(set<ShaderNode*>& dependencies, const set<ShaderNode*>& done, ShaderInput *input)
+void SVMCompiler::find_dependencies(set<ShaderNode*>& dependencies,
+ const set<ShaderNode*>& done,
+ ShaderInput *input,
+ ShaderNode *skip_node)
{
ShaderNode *node = (input->link)? input->link->parent: NULL;
- if(node && done.find(node) == done.end()) {
+ if(node && done.find(node) == done.end() && node != skip_node) {
foreach(ShaderInput *in, node->inputs)
if(!node_skip_input(node, in))
- find_dependencies(dependencies, done, in);
+ find_dependencies(dependencies, done, in, skip_node);
dependencies.insert(node);
}
@@ -459,20 +462,28 @@ void SVMCompiler::generate_closure_node(ShaderNode *node, set<ShaderNode*>& done
}
}
-void SVMCompiler::generated_shared_closure_nodes(ShaderNode *node, set<ShaderNode*>& done, set<ShaderNode*>& closure_done, const set<ShaderNode*>& shared)
+void SVMCompiler::generated_shared_closure_nodes(ShaderNode *root_node,
+ ShaderNode *node,
+ set<ShaderNode*>& done,
+ set<ShaderNode*>& closure_done,
+ const set<ShaderNode*>& shared)
{
if(shared.find(node) != shared.end()) {
- generate_multi_closure(node, done, closure_done);
+ generate_multi_closure(root_node, node, done, closure_done);
}
else {
foreach(ShaderInput *in, node->inputs) {
if(in->type == SHADER_SOCKET_CLOSURE && in->link)
- generated_shared_closure_nodes(in->link->parent, done, closure_done, shared);
+ generated_shared_closure_nodes(root_node, in->link->parent,
+ done, closure_done, shared);
}
}
}
-void SVMCompiler::generate_multi_closure(ShaderNode *node, set<ShaderNode*>& done, set<ShaderNode*>& closure_done)
+void SVMCompiler::generate_multi_closure(ShaderNode *root_node,
+ ShaderNode *node,
+ set<ShaderNode*>& done,
+ set<ShaderNode*>& closure_done)
{
/* only generate once */
if(closure_done.find(node) != closure_done.end())
@@ -509,12 +520,33 @@ void SVMCompiler::generate_multi_closure(ShaderNode *node, set<ShaderNode*>& don
set_intersection(cl1deps.begin(), cl1deps.end(),
cl2deps.begin(), cl2deps.end(),
std::inserter(shareddeps, shareddeps.begin()));
-
+
+ /* Some nodes may not be shared between the inputs of this mix node
+ * yet still need to be executed unconditionally. This mainly happens
+ * when a node in the current sub-branch is also used by a parent
+ * node (or similar). */
+ if(root_node != node) {
+ foreach(ShaderInput *in, root_node->inputs) {
+ set<ShaderNode*> rootdeps;
+ find_dependencies(rootdeps, done, in, node);
+ set_intersection(rootdeps.begin(), rootdeps.end(),
+ cl1deps.begin(), cl1deps.end(),
+ std::inserter(shareddeps, shareddeps.begin()));
+ set_intersection(rootdeps.begin(), rootdeps.end(),
+ cl2deps.begin(), cl2deps.end(),
+ std::inserter(shareddeps, shareddeps.begin()));
+ }
+ }
+
if(!shareddeps.empty()) {
- if(cl1in->link)
- generated_shared_closure_nodes(cl1in->link->parent, done, closure_done, shareddeps);
- if(cl2in->link)
- generated_shared_closure_nodes(cl2in->link->parent, done, closure_done, shareddeps);
+ if(cl1in->link) {
+ generated_shared_closure_nodes(root_node, cl1in->link->parent,
+ done, closure_done, shareddeps);
+ }
+ if(cl2in->link) {
+ generated_shared_closure_nodes(root_node, cl2in->link->parent,
+ done, closure_done, shareddeps);
+ }
generate_svm_nodes(shareddeps, done);
}
@@ -525,7 +557,7 @@ void SVMCompiler::generate_multi_closure(ShaderNode *node, set<ShaderNode*>& don
svm_nodes.push_back(make_int4(NODE_JUMP_IF_ONE, 0, facin->stack_offset, 0));
int node_jump_skip_index = svm_nodes.size() - 1;
- generate_multi_closure(cl1in->link->parent, done, closure_done);
+ generate_multi_closure(root_node, cl1in->link->parent, done, closure_done);
/* fill in jump instruction location to be after closure */
svm_nodes[node_jump_skip_index].y = svm_nodes.size() - node_jump_skip_index - 1;
@@ -537,7 +569,7 @@ void SVMCompiler::generate_multi_closure(ShaderNode *node, set<ShaderNode*>& don
svm_nodes.push_back(make_int4(NODE_JUMP_IF_ZERO, 0, facin->stack_offset, 0));
int node_jump_skip_index = svm_nodes.size() - 1;
- generate_multi_closure(cl2in->link->parent, done, closure_done);
+ generate_multi_closure(root_node, cl2in->link->parent, done, closure_done);
/* fill in jump instruction location to be after closure */
svm_nodes[node_jump_skip_index].y = svm_nodes.size() - node_jump_skip_index - 1;
@@ -551,9 +583,9 @@ void SVMCompiler::generate_multi_closure(ShaderNode *node, set<ShaderNode*>& don
* to skip closures here because was already optimized due to
* fixed weight or add closure that always needs both */
if(cl1in->link)
- generate_multi_closure(cl1in->link->parent, done, closure_done);
+ generate_multi_closure(root_node, cl1in->link->parent, done, closure_done);
if(cl2in->link)
- generate_multi_closure(cl2in->link->parent, done, closure_done);
+ generate_multi_closure(root_node, cl2in->link->parent, done, closure_done);
}
}
else {
@@ -638,7 +670,8 @@ void SVMCompiler::compile_type(Shader *shader, ShaderGraph *graph, ShaderType ty
if(generate) {
set<ShaderNode*> done, closure_done;
- generate_multi_closure(clin->link->parent, done, closure_done);
+ generate_multi_closure(clin->link->parent, clin->link->parent,
+ done, closure_done);
}
}
diff --git a/intern/cycles/render/svm.h b/intern/cycles/render/svm.h
index 45aa4d26926..c1dd96e4d80 100644
--- a/intern/cycles/render/svm.h
+++ b/intern/cycles/render/svm.h
@@ -123,15 +123,21 @@ protected:
/* single closure */
void find_dependencies(set<ShaderNode*>& dependencies,
- const set<ShaderNode*>& done, ShaderInput *input);
+ const set<ShaderNode*>& done,
+ ShaderInput *input,
+ ShaderNode *skip_node = NULL);
void generate_node(ShaderNode *node, set<ShaderNode*>& done);
void generate_closure_node(ShaderNode *node, set<ShaderNode*>& done);
- void generated_shared_closure_nodes(ShaderNode *node, set<ShaderNode*>& done,
+ void generated_shared_closure_nodes(ShaderNode *root_node, ShaderNode *node,
+ set<ShaderNode*>& done,
set<ShaderNode*>& closure_done, const set<ShaderNode*>& shared);
void generate_svm_nodes(const set<ShaderNode*>& nodes, set<ShaderNode*>& done);
/* multi closure */
- void generate_multi_closure(ShaderNode *node, set<ShaderNode*>& done, set<ShaderNode*>& closure_done);
+ void generate_multi_closure(ShaderNode *root_node,
+ ShaderNode *node,
+ set<ShaderNode*>& done,
+ set<ShaderNode*>& closure_done);
/* compile */
void compile_type(Shader *shader, ShaderGraph *graph, ShaderType type);
diff --git a/intern/cycles/render/tile.cpp b/intern/cycles/render/tile.cpp
index 72bcdf966b5..e37d8e5f8a1 100644
--- a/intern/cycles/render/tile.cpp
+++ b/intern/cycles/render/tile.cpp
@@ -200,9 +200,9 @@ list<Tile>::iterator TileManager::next_background_tile(int device, TileOrder til
switch (tile_order) {
case TILE_CENTER:
- distx = centx - (cur_tile.x + cur_tile.w);
- disty = centy - (cur_tile.y + cur_tile.h);
- distx = (int64_t) sqrt((double)distx * distx + disty * disty);
+ distx = centx - (cur_tile.x + (cur_tile.w / 2));
+ disty = centy - (cur_tile.y + (cur_tile.h / 2));
+ distx = (int64_t)sqrt((double)(distx * distx + disty * disty));
break;
case TILE_RIGHT_TO_LEFT:
distx = cordx - cur_tile.x;
diff --git a/intern/cycles/util/CMakeLists.txt b/intern/cycles/util/CMakeLists.txt
index c1150d226ae..842d5efac79 100644
--- a/intern/cycles/util/CMakeLists.txt
+++ b/intern/cycles/util/CMakeLists.txt
@@ -1,21 +1,21 @@
set(INC
.
+ ../../glew-mx
)
set(INC_SYS
${GLEW_INCLUDE_PATH}
- ${OPENGL_INCLUDE_DIR}
)
set(SRC
util_cache.cpp
- util_cuda.cpp
util_dynlib.cpp
+ util_logging.cpp
util_md5.cpp
- util_opencl.cpp
util_path.cpp
util_string.cpp
+ util_simd.cpp
util_system.cpp
util_task.cpp
util_time.cpp
@@ -33,7 +33,6 @@ set(SRC_HEADERS
util_args.h
util_boundbox.h
util_cache.h
- util_cuda.h
util_debug.h
util_dynlib.h
util_foreach.h
@@ -42,10 +41,10 @@ set(SRC_HEADERS
util_hash.h
util_image.h
util_list.h
+ util_logging.h
util_map.h
util_math.h
util_md5.h
- util_opencl.h
util_opengl.h
util_optimization.h
util_param.h
@@ -53,6 +52,9 @@ set(SRC_HEADERS
util_progress.h
util_set.h
util_simd.h
+ util_sseb.h
+ util_ssef.h
+ util_ssei.h
util_stats.h
util_string.h
util_system.h
@@ -69,4 +71,6 @@ set(SRC_HEADERS
include_directories(${INC})
include_directories(SYSTEM ${INC_SYS})
+add_definitions(${GL_DEFINITIONS})
+
add_library(cycles_util ${SRC} ${SRC_HEADERS})
diff --git a/intern/cycles/util/util_boundbox.h b/intern/cycles/util/util_boundbox.h
index 369082af60a..a71e0399619 100644
--- a/intern/cycles/util/util_boundbox.h
+++ b/intern/cycles/util/util_boundbox.h
@@ -167,6 +167,15 @@ public:
return result;
}
+
+ __forceinline bool intersects(const BoundBox& other)
+ {
+ float3 center_diff = center() - other.center(),
+ total_size = (size() + other.size()) * 0.5f;
+ return fabsf(center_diff.x) <= total_size.x &&
+ fabsf(center_diff.y) <= total_size.y &&
+ fabsf(center_diff.z) <= total_size.z;
+ }
};
__forceinline BoundBox merge(const BoundBox& bbox, const float3& pt)
diff --git a/intern/cycles/util/util_cache.h b/intern/cycles/util/util_cache.h
index 417f4a869b6..bfb2877a22b 100644
--- a/intern/cycles/util/util_cache.h
+++ b/intern/cycles/util/util_cache.h
@@ -25,7 +25,7 @@
* again into the appropriate data structures.
*
* This way we do not need to accurately track changes, compare dates and
- * invalidate cache entries, at the cost of exta computation. If everything
+ * invalidate cache entries, at the cost of extra computation. If everything
* is stored in a global cache, computations can perhaps even be shared between
* different scenes where it may be hard to detect duplicate work.
*/
@@ -96,54 +96,70 @@ public:
buffers.push_back(buffer);
}
- template<typename T> void read(array<T>& data)
+ template<typename T> bool read(array<T>& data)
{
size_t size;
if(!fread(&size, sizeof(size), 1, f)) {
fprintf(stderr, "Failed to read vector size from cache.\n");
- return;
+ return false;
}
if(!size)
- return;
+ return false;
data.resize(size/sizeof(T));
if(!fread(&data[0], size, 1, f)) {
fprintf(stderr, "Failed to read vector data from cache (%lu).\n", (unsigned long)size);
- return;
+ return false;
}
+ return true;
}
- void read(int& data)
+ bool read(int& data)
{
size_t size;
- if(!fread(&size, sizeof(size), 1, f))
+ if(!fread(&size, sizeof(size), 1, f)) {
fprintf(stderr, "Failed to read int size from cache.\n");
- if(!fread(&data, sizeof(data), 1, f))
+ return false;
+ }
+ if(!fread(&data, sizeof(data), 1, f)) {
fprintf(stderr, "Failed to read int from cache.\n");
+ return false;
+ }
+ return true;
}
- void read(float& data)
+ bool read(float& data)
{
size_t size;
- if(!fread(&size, sizeof(size), 1, f))
+ if(!fread(&size, sizeof(size), 1, f)) {
fprintf(stderr, "Failed to read float size from cache.\n");
- if(!fread(&data, sizeof(data), 1, f))
+ return false;
+ }
+ if(!fread(&data, sizeof(data), 1, f)) {
fprintf(stderr, "Failed to read float from cache.\n");
+ return false;
+ }
+ return true;
}
- void read(size_t& data)
+ bool read(size_t& data)
{
size_t size;
- if(!fread(&size, sizeof(size), 1, f))
+ if(!fread(&size, sizeof(size), 1, f)) {
fprintf(stderr, "Failed to read size_t size from cache.\n");
- if(!fread(&data, sizeof(data), 1, f))
+ return false;
+ }
+ if(!fread(&data, sizeof(data), 1, f)) {
fprintf(stderr, "Failed to read size_t from cache.\n");
+ return false;
+ }
+ return true;
}
};
diff --git a/intern/cycles/util/util_color.h b/intern/cycles/util/util_color.h
index b72cc6bc873..53b3d72de67 100644
--- a/intern/cycles/util/util_color.h
+++ b/intern/cycles/util/util_color.h
@@ -26,6 +26,27 @@
CCL_NAMESPACE_BEGIN
+ccl_device uchar float_to_byte(float val)
+{
+ return ((val <= 0.0f) ? 0 : ((val > (1.0f - 0.5f / 255.0f)) ? 255 : (uchar)((255.0f * val) + 0.5f)));
+}
+
+ccl_device uchar4 color_float_to_byte(float3 c)
+{
+ uchar r, g, b;
+
+ r = float_to_byte(c.x);
+ g = float_to_byte(c.y);
+ b = float_to_byte(c.z);
+
+ return make_uchar4(r, g, b, 0);
+}
+
+ccl_device_inline float3 color_byte_to_float(uchar4 c)
+{
+ return make_float3(c.x*(1.0f/255.0f), c.y*(1.0f/255.0f), c.z*(1.0f/255.0f));
+}
+
ccl_device float color_srgb_to_scene_linear(float c)
{
if(c < 0.04045f)
@@ -149,34 +170,34 @@ ccl_device float3 color_srgb_to_scene_linear(float3 c)
#ifdef __KERNEL_SSE2__
/*
* Calculate initial guess for arg^exp based on float representation
- * This method gives a constant bias, which can be easily compensated by multiplicating with bias_coeff.
+ * This method gives a constant bias, which can be easily compensated by multiplication with bias_coeff.
* Gives better results for exponents near 1 (e. g. 4/5).
* exp = exponent, encoded as uint32_t
* e2coeff = 2^(127/exponent - 127) * bias_coeff^(1/exponent), encoded as uint32_t
*/
template<unsigned exp, unsigned e2coeff>
-ccl_device_inline __m128 fastpow(const __m128 &arg)
+ccl_device_inline ssef fastpow(const ssef &arg)
{
- __m128 ret;
- ret = _mm_mul_ps(arg, _mm_castsi128_ps(_mm_set1_epi32(e2coeff)));
- ret = _mm_cvtepi32_ps(_mm_castps_si128(ret));
- ret = _mm_mul_ps(ret, _mm_castsi128_ps(_mm_set1_epi32(exp)));
- ret = _mm_castsi128_ps(_mm_cvtps_epi32(ret));
+ ssef ret;
+ ret = arg * cast(ssei(e2coeff));
+ ret = ssef(cast(ret));
+ ret = ret * cast(ssei(exp));
+ ret = cast(ssei(ret));
return ret;
}
/* Improve x ^ 1.0f/5.0f solution with Newton-Raphson method */
-ccl_device_inline __m128 improve_5throot_solution(const __m128 &old_result, const __m128 &x)
+ccl_device_inline ssef improve_5throot_solution(const ssef &old_result, const ssef &x)
{
- __m128 approx2 = _mm_mul_ps(old_result, old_result);
- __m128 approx4 = _mm_mul_ps(approx2, approx2);
- __m128 t = _mm_div_ps(x, approx4);
- __m128 summ = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(4.0f), old_result), t); /* fma */
- return _mm_mul_ps(summ, _mm_set1_ps(1.0f/5.0f));
+ ssef approx2 = old_result * old_result;
+ ssef approx4 = approx2 * approx2;
+ ssef t = x / approx4;
+ ssef summ = madd(ssef(4.0f), old_result, t);
+ return summ * ssef(1.0f/5.0f);
}
/* Calculate powf(x, 2.4). Working domain: 1e-10 < x < 1e+10 */
-ccl_device_inline __m128 fastpow24(const __m128 &arg)
+ccl_device_inline ssef fastpow24(const ssef &arg)
{
/* max, avg and |avg| errors were calculated in gcc without FMA instructions
* The final precision should be better than powf in glibc */
@@ -184,22 +205,22 @@ ccl_device_inline __m128 fastpow24(const __m128 &arg)
/* Calculate x^4/5, coefficient 0.994 was constructed manually to minimize avg error */
/* 0x3F4CCCCD = 4/5 */
/* 0x4F55A7FB = 2^(127/(4/5) - 127) * 0.994^(1/(4/5)) */
- __m128 x = fastpow<0x3F4CCCCD, 0x4F55A7FB>(arg); // error max = 0.17 avg = 0.0018 |avg| = 0.05
- __m128 arg2 = _mm_mul_ps(arg, arg);
- __m128 arg4 = _mm_mul_ps(arg2, arg2);
+ ssef x = fastpow<0x3F4CCCCD, 0x4F55A7FB>(arg); // error max = 0.17 avg = 0.0018 |avg| = 0.05
+ ssef arg2 = arg * arg;
+ ssef arg4 = arg2 * arg2;
x = improve_5throot_solution(x, arg4); /* error max = 0.018 avg = 0.0031 |avg| = 0.0031 */
x = improve_5throot_solution(x, arg4); /* error max = 0.00021 avg = 1.6e-05 |avg| = 1.6e-05 */
x = improve_5throot_solution(x, arg4); /* error max = 6.1e-07 avg = 5.2e-08 |avg| = 1.1e-07 */
- return _mm_mul_ps(x, _mm_mul_ps(x, x));
+ return x * (x * x);
}
-ccl_device __m128 color_srgb_to_scene_linear(const __m128 &c)
+ccl_device ssef color_srgb_to_scene_linear(const ssef &c)
{
- __m128 cmp = _mm_cmplt_ps(c, _mm_set1_ps(0.04045f));
- __m128 lt = _mm_max_ps(_mm_mul_ps(c, _mm_set1_ps(1.0f/12.92f)), _mm_set1_ps(0.0f));
- __m128 gtebase = _mm_mul_ps(_mm_add_ps(c, _mm_set1_ps(0.055f)), _mm_set1_ps(1.0f/1.055f)); /* fma */
- __m128 gte = fastpow24(gtebase);
- return blend(cmp, lt, gte);
+ sseb cmp = c < ssef(0.04045f);
+ ssef lt = max(c * ssef(1.0f/12.92f), ssef(0.0f));
+ ssef gtebase = (c + ssef(0.055f)) * ssef(1.0f/1.055f); /* fma */
+ ssef gte = fastpow24(gtebase);
+ return select(cmp, lt, gte);
}
#endif
diff --git a/intern/cycles/util/util_cuda.cpp b/intern/cycles/util/util_cuda.cpp
deleted file mode 100644
index e9140633e4a..00000000000
--- a/intern/cycles/util/util_cuda.cpp
+++ /dev/null
@@ -1,495 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License
- */
-
-#include <iostream>
-
-#include <stdlib.h>
-#include <stdio.h>
-
-#include "util_cuda.h"
-#include "util_debug.h"
-#include "util_dynlib.h"
-#include "util_path.h"
-#include "util_string.h"
-
-#ifdef _WIN32
-#define popen _popen
-#define pclose _pclose
-#endif
-
-/* function defininitions */
-
-tcuInit *cuInit;
-tcuDriverGetVersion *cuDriverGetVersion;
-tcuDeviceGet *cuDeviceGet;
-tcuDeviceGetCount *cuDeviceGetCount;
-tcuDeviceGetName *cuDeviceGetName;
-tcuDeviceComputeCapability *cuDeviceComputeCapability;
-tcuDeviceTotalMem *cuDeviceTotalMem;
-tcuDeviceGetProperties *cuDeviceGetProperties;
-tcuDeviceGetAttribute *cuDeviceGetAttribute;
-tcuCtxCreate *cuCtxCreate;
-tcuCtxDestroy *cuCtxDestroy;
-tcuCtxAttach *cuCtxAttach;
-tcuCtxDetach *cuCtxDetach;
-tcuCtxPushCurrent *cuCtxPushCurrent;
-tcuCtxPopCurrent *cuCtxPopCurrent;
-tcuCtxGetDevice *cuCtxGetDevice;
-tcuCtxSynchronize *cuCtxSynchronize;
-tcuModuleLoad *cuModuleLoad;
-tcuModuleLoadData *cuModuleLoadData;
-tcuModuleLoadDataEx *cuModuleLoadDataEx;
-tcuModuleLoadFatBinary *cuModuleLoadFatBinary;
-tcuModuleUnload *cuModuleUnload;
-tcuModuleGetFunction *cuModuleGetFunction;
-tcuModuleGetGlobal *cuModuleGetGlobal;
-tcuModuleGetTexRef *cuModuleGetTexRef;
-tcuModuleGetSurfRef *cuModuleGetSurfRef;
-tcuMemGetInfo *cuMemGetInfo;
-tcuMemAlloc *cuMemAlloc;
-tcuMemAllocPitch *cuMemAllocPitch;
-tcuMemFree *cuMemFree;
-tcuMemGetAddressRange *cuMemGetAddressRange;
-tcuMemAllocHost *cuMemAllocHost;
-tcuMemFreeHost *cuMemFreeHost;
-tcuMemHostAlloc *cuMemHostAlloc;
-tcuMemHostGetDevicePointer *cuMemHostGetDevicePointer;
-tcuMemHostGetFlags *cuMemHostGetFlags;
-tcuMemcpyHtoD *cuMemcpyHtoD;
-tcuMemcpyDtoH *cuMemcpyDtoH;
-tcuMemcpyDtoD *cuMemcpyDtoD;
-tcuMemcpyDtoA *cuMemcpyDtoA;
-tcuMemcpyAtoD *cuMemcpyAtoD;
-tcuMemcpyHtoA *cuMemcpyHtoA;
-tcuMemcpyAtoH *cuMemcpyAtoH;
-tcuMemcpyAtoA *cuMemcpyAtoA;
-tcuMemcpy2D *cuMemcpy2D;
-tcuMemcpy2DUnaligned *cuMemcpy2DUnaligned;
-tcuMemcpy3D *cuMemcpy3D;
-tcuMemcpyHtoDAsync *cuMemcpyHtoDAsync;
-tcuMemcpyDtoHAsync *cuMemcpyDtoHAsync;
-tcuMemcpyDtoDAsync *cuMemcpyDtoDAsync;
-tcuMemcpyHtoAAsync *cuMemcpyHtoAAsync;
-tcuMemcpyAtoHAsync *cuMemcpyAtoHAsync;
-tcuMemcpy2DAsync *cuMemcpy2DAsync;
-tcuMemcpy3DAsync *cuMemcpy3DAsync;
-tcuMemsetD8 *cuMemsetD8;
-tcuMemsetD16 *cuMemsetD16;
-tcuMemsetD32 *cuMemsetD32;
-tcuMemsetD2D8 *cuMemsetD2D8;
-tcuMemsetD2D16 *cuMemsetD2D16;
-tcuMemsetD2D32 *cuMemsetD2D32;
-tcuFuncSetBlockShape *cuFuncSetBlockShape;
-tcuFuncSetSharedSize *cuFuncSetSharedSize;
-tcuFuncGetAttribute *cuFuncGetAttribute;
-tcuFuncSetCacheConfig *cuFuncSetCacheConfig;
-tcuArrayCreate *cuArrayCreate;
-tcuArrayGetDescriptor *cuArrayGetDescriptor;
-tcuArrayDestroy *cuArrayDestroy;
-tcuArray3DCreate *cuArray3DCreate;
-tcuArray3DGetDescriptor *cuArray3DGetDescriptor;
-tcuTexRefCreate *cuTexRefCreate;
-tcuTexRefDestroy *cuTexRefDestroy;
-tcuTexRefSetArray *cuTexRefSetArray;
-tcuTexRefSetAddress *cuTexRefSetAddress;
-tcuTexRefSetAddress2D *cuTexRefSetAddress2D;
-tcuTexRefSetFormat *cuTexRefSetFormat;
-tcuTexRefSetAddressMode *cuTexRefSetAddressMode;
-tcuTexRefSetFilterMode *cuTexRefSetFilterMode;
-tcuTexRefSetFlags *cuTexRefSetFlags;
-tcuTexRefGetAddress *cuTexRefGetAddress;
-tcuTexRefGetArray *cuTexRefGetArray;
-tcuTexRefGetAddressMode *cuTexRefGetAddressMode;
-tcuTexRefGetFilterMode *cuTexRefGetFilterMode;
-tcuTexRefGetFormat *cuTexRefGetFormat;
-tcuTexRefGetFlags *cuTexRefGetFlags;
-tcuSurfRefSetArray *cuSurfRefSetArray;
-tcuSurfRefGetArray *cuSurfRefGetArray;
-tcuParamSetSize *cuParamSetSize;
-tcuParamSeti *cuParamSeti;
-tcuParamSetf *cuParamSetf;
-tcuParamSetv *cuParamSetv;
-tcuParamSetTexRef *cuParamSetTexRef;
-tcuLaunch *cuLaunch;
-tcuLaunchGrid *cuLaunchGrid;
-tcuLaunchGridAsync *cuLaunchGridAsync;
-tcuEventCreate *cuEventCreate;
-tcuEventRecord *cuEventRecord;
-tcuEventQuery *cuEventQuery;
-tcuEventSynchronize *cuEventSynchronize;
-tcuEventDestroy *cuEventDestroy;
-tcuEventElapsedTime *cuEventElapsedTime;
-tcuStreamCreate *cuStreamCreate;
-tcuStreamQuery *cuStreamQuery;
-tcuStreamSynchronize *cuStreamSynchronize;
-tcuStreamDestroy *cuStreamDestroy;
-tcuGraphicsUnregisterResource *cuGraphicsUnregisterResource;
-tcuGraphicsSubResourceGetMappedArray *cuGraphicsSubResourceGetMappedArray;
-tcuGraphicsResourceGetMappedPointer *cuGraphicsResourceGetMappedPointer;
-tcuGraphicsResourceSetMapFlags *cuGraphicsResourceSetMapFlags;
-tcuGraphicsMapResources *cuGraphicsMapResources;
-tcuGraphicsUnmapResources *cuGraphicsUnmapResources;
-tcuGetExportTable *cuGetExportTable;
-tcuCtxSetLimit *cuCtxSetLimit;
-tcuCtxGetLimit *cuCtxGetLimit;
-tcuGLCtxCreate *cuGLCtxCreate;
-tcuGraphicsGLRegisterBuffer *cuGraphicsGLRegisterBuffer;
-tcuGraphicsGLRegisterImage *cuGraphicsGLRegisterImage;
-tcuCtxSetCurrent *cuCtxSetCurrent;
-
-CCL_NAMESPACE_BEGIN
-
-/* utility macros */
-#define CUDA_LIBRARY_FIND_CHECKED(name) \
- name = (t##name*)dynamic_library_find(lib, #name);
-
-#define CUDA_LIBRARY_FIND(name) \
- name = (t##name*)dynamic_library_find(lib, #name); \
- assert(name);
-
-#define CUDA_LIBRARY_FIND_V2(name) \
- name = (t##name*)dynamic_library_find(lib, #name "_v2"); \
- assert(name);
-
-/* initialization function */
-
-bool cuLibraryInit()
-{
- static bool initialized = false;
- static bool result = false;
-
- if(initialized)
- return result;
-
- initialized = true;
-
- /* library paths */
-#ifdef _WIN32
- /* expected in c:/windows/system or similar, no path needed */
- const char *path = "nvcuda.dll";
-#elif defined(__APPLE__)
- /* default installation path */
- const char *path = "/usr/local/cuda/lib/libcuda.dylib";
-#else
- const char *path = "libcuda.so";
-#endif
-
- /* load library */
- DynamicLibrary *lib = dynamic_library_open(path);
-
- if(lib == NULL)
- return false;
-
- /* detect driver version */
- int driver_version = 1000;
-
- CUDA_LIBRARY_FIND_CHECKED(cuDriverGetVersion);
- if(cuDriverGetVersion)
- cuDriverGetVersion(&driver_version);
-
- /* we require version 4.0 */
- if(driver_version < 4000)
- return false;
-
- /* fetch all function pointers */
- CUDA_LIBRARY_FIND(cuInit);
- CUDA_LIBRARY_FIND(cuDeviceGet);
- CUDA_LIBRARY_FIND(cuDeviceGetCount);
- CUDA_LIBRARY_FIND(cuDeviceGetName);
- CUDA_LIBRARY_FIND(cuDeviceComputeCapability);
- CUDA_LIBRARY_FIND(cuDeviceTotalMem);
- CUDA_LIBRARY_FIND(cuDeviceGetProperties);
- CUDA_LIBRARY_FIND(cuDeviceGetAttribute);
- CUDA_LIBRARY_FIND(cuCtxCreate);
- CUDA_LIBRARY_FIND(cuCtxDestroy);
- CUDA_LIBRARY_FIND(cuCtxAttach);
- CUDA_LIBRARY_FIND(cuCtxDetach);
- CUDA_LIBRARY_FIND(cuCtxPushCurrent);
- CUDA_LIBRARY_FIND(cuCtxPopCurrent);
- CUDA_LIBRARY_FIND(cuCtxGetDevice);
- CUDA_LIBRARY_FIND(cuCtxSynchronize);
- CUDA_LIBRARY_FIND(cuModuleLoad);
- CUDA_LIBRARY_FIND(cuModuleLoadData);
- CUDA_LIBRARY_FIND(cuModuleUnload);
- CUDA_LIBRARY_FIND(cuModuleGetFunction);
- CUDA_LIBRARY_FIND(cuModuleGetGlobal);
- CUDA_LIBRARY_FIND(cuModuleGetTexRef);
- CUDA_LIBRARY_FIND(cuMemGetInfo);
- CUDA_LIBRARY_FIND(cuMemAlloc);
- CUDA_LIBRARY_FIND(cuMemAllocPitch);
- CUDA_LIBRARY_FIND(cuMemFree);
- CUDA_LIBRARY_FIND(cuMemGetAddressRange);
- CUDA_LIBRARY_FIND(cuMemAllocHost);
- CUDA_LIBRARY_FIND(cuMemFreeHost);
- CUDA_LIBRARY_FIND(cuMemHostAlloc);
- CUDA_LIBRARY_FIND(cuMemHostGetDevicePointer);
- CUDA_LIBRARY_FIND(cuMemcpyHtoD);
- CUDA_LIBRARY_FIND(cuMemcpyDtoH);
- CUDA_LIBRARY_FIND(cuMemcpyDtoD);
- CUDA_LIBRARY_FIND(cuMemcpyDtoA);
- CUDA_LIBRARY_FIND(cuMemcpyAtoD);
- CUDA_LIBRARY_FIND(cuMemcpyHtoA);
- CUDA_LIBRARY_FIND(cuMemcpyAtoH);
- CUDA_LIBRARY_FIND(cuMemcpyAtoA);
- CUDA_LIBRARY_FIND(cuMemcpy2D);
- CUDA_LIBRARY_FIND(cuMemcpy2DUnaligned);
- CUDA_LIBRARY_FIND(cuMemcpy3D);
- CUDA_LIBRARY_FIND(cuMemcpyHtoDAsync);
- CUDA_LIBRARY_FIND(cuMemcpyDtoHAsync);
- CUDA_LIBRARY_FIND(cuMemcpyHtoAAsync);
- CUDA_LIBRARY_FIND(cuMemcpyAtoHAsync);
- CUDA_LIBRARY_FIND(cuMemcpy2DAsync);
- CUDA_LIBRARY_FIND(cuMemcpy3DAsync);
- CUDA_LIBRARY_FIND(cuMemsetD8);
- CUDA_LIBRARY_FIND(cuMemsetD16);
- CUDA_LIBRARY_FIND(cuMemsetD32);
- CUDA_LIBRARY_FIND(cuMemsetD2D8);
- CUDA_LIBRARY_FIND(cuMemsetD2D16);
- CUDA_LIBRARY_FIND(cuMemsetD2D32);
- CUDA_LIBRARY_FIND(cuFuncSetBlockShape);
- CUDA_LIBRARY_FIND(cuFuncSetSharedSize);
- CUDA_LIBRARY_FIND(cuFuncGetAttribute);
- CUDA_LIBRARY_FIND(cuArrayCreate);
- CUDA_LIBRARY_FIND(cuArrayGetDescriptor);
- CUDA_LIBRARY_FIND(cuArrayDestroy);
- CUDA_LIBRARY_FIND(cuArray3DCreate);
- CUDA_LIBRARY_FIND(cuArray3DGetDescriptor);
- CUDA_LIBRARY_FIND(cuTexRefCreate);
- CUDA_LIBRARY_FIND(cuTexRefDestroy);
- CUDA_LIBRARY_FIND(cuTexRefSetArray);
- CUDA_LIBRARY_FIND(cuTexRefSetAddress);
- CUDA_LIBRARY_FIND(cuTexRefSetAddress2D);
- CUDA_LIBRARY_FIND(cuTexRefSetFormat);
- CUDA_LIBRARY_FIND(cuTexRefSetAddressMode);
- CUDA_LIBRARY_FIND(cuTexRefSetFilterMode);
- CUDA_LIBRARY_FIND(cuTexRefSetFlags);
- CUDA_LIBRARY_FIND(cuTexRefGetAddress);
- CUDA_LIBRARY_FIND(cuTexRefGetArray);
- CUDA_LIBRARY_FIND(cuTexRefGetAddressMode);
- CUDA_LIBRARY_FIND(cuTexRefGetFilterMode);
- CUDA_LIBRARY_FIND(cuTexRefGetFormat);
- CUDA_LIBRARY_FIND(cuTexRefGetFlags);
- CUDA_LIBRARY_FIND(cuParamSetSize);
- CUDA_LIBRARY_FIND(cuParamSeti);
- CUDA_LIBRARY_FIND(cuParamSetf);
- CUDA_LIBRARY_FIND(cuParamSetv);
- CUDA_LIBRARY_FIND(cuParamSetTexRef);
- CUDA_LIBRARY_FIND(cuLaunch);
- CUDA_LIBRARY_FIND(cuLaunchGrid);
- CUDA_LIBRARY_FIND(cuLaunchGridAsync);
- CUDA_LIBRARY_FIND(cuEventCreate);
- CUDA_LIBRARY_FIND(cuEventRecord);
- CUDA_LIBRARY_FIND(cuEventQuery);
- CUDA_LIBRARY_FIND(cuEventSynchronize);
- CUDA_LIBRARY_FIND(cuEventDestroy);
- CUDA_LIBRARY_FIND(cuEventElapsedTime);
- CUDA_LIBRARY_FIND(cuStreamCreate);
- CUDA_LIBRARY_FIND(cuStreamQuery);
- CUDA_LIBRARY_FIND(cuStreamSynchronize);
- CUDA_LIBRARY_FIND(cuStreamDestroy);
-
- /* cuda 2.1 */
- CUDA_LIBRARY_FIND(cuModuleLoadDataEx);
- CUDA_LIBRARY_FIND(cuModuleLoadFatBinary);
- CUDA_LIBRARY_FIND(cuGLCtxCreate);
- CUDA_LIBRARY_FIND(cuGraphicsGLRegisterBuffer);
- CUDA_LIBRARY_FIND(cuGraphicsGLRegisterImage);
-
- /* cuda 2.3 */
- CUDA_LIBRARY_FIND(cuMemHostGetFlags);
- CUDA_LIBRARY_FIND(cuGraphicsGLRegisterBuffer);
- CUDA_LIBRARY_FIND(cuGraphicsGLRegisterImage);
-
- /* cuda 3.0 */
- CUDA_LIBRARY_FIND(cuMemcpyDtoDAsync);
- CUDA_LIBRARY_FIND(cuFuncSetCacheConfig);
- CUDA_LIBRARY_FIND(cuGraphicsUnregisterResource);
- CUDA_LIBRARY_FIND(cuGraphicsSubResourceGetMappedArray);
- CUDA_LIBRARY_FIND(cuGraphicsResourceGetMappedPointer);
- CUDA_LIBRARY_FIND(cuGraphicsResourceSetMapFlags);
- CUDA_LIBRARY_FIND(cuGraphicsMapResources);
- CUDA_LIBRARY_FIND(cuGraphicsUnmapResources);
- CUDA_LIBRARY_FIND(cuGetExportTable);
-
- /* cuda 3.1 */
- CUDA_LIBRARY_FIND(cuModuleGetSurfRef);
- CUDA_LIBRARY_FIND(cuSurfRefSetArray);
- CUDA_LIBRARY_FIND(cuSurfRefGetArray);
- CUDA_LIBRARY_FIND(cuCtxSetLimit);
- CUDA_LIBRARY_FIND(cuCtxGetLimit);
-
- /* functions which changed 3.1 -> 3.2 for 64 bit stuff, the cuda library
- * has both the old ones for compatibility and new ones with _v2 postfix,
- * we load the _v2 ones here. */
- CUDA_LIBRARY_FIND_V2(cuDeviceTotalMem);
- CUDA_LIBRARY_FIND_V2(cuCtxCreate);
- CUDA_LIBRARY_FIND_V2(cuModuleGetGlobal);
- CUDA_LIBRARY_FIND_V2(cuMemGetInfo);
- CUDA_LIBRARY_FIND_V2(cuMemAlloc);
- CUDA_LIBRARY_FIND_V2(cuMemAllocPitch);
- CUDA_LIBRARY_FIND_V2(cuMemFree);
- CUDA_LIBRARY_FIND_V2(cuMemGetAddressRange);
- CUDA_LIBRARY_FIND_V2(cuMemAllocHost);
- CUDA_LIBRARY_FIND_V2(cuMemHostGetDevicePointer);
- CUDA_LIBRARY_FIND_V2(cuMemcpyHtoD);
- CUDA_LIBRARY_FIND_V2(cuMemcpyDtoH);
- CUDA_LIBRARY_FIND_V2(cuMemcpyDtoD);
- CUDA_LIBRARY_FIND_V2(cuMemcpyDtoA);
- CUDA_LIBRARY_FIND_V2(cuMemcpyAtoD);
- CUDA_LIBRARY_FIND_V2(cuMemcpyHtoA);
- CUDA_LIBRARY_FIND_V2(cuMemcpyAtoH);
- CUDA_LIBRARY_FIND_V2(cuMemcpyAtoA);
- CUDA_LIBRARY_FIND_V2(cuMemcpyHtoAAsync);
- CUDA_LIBRARY_FIND_V2(cuMemcpyAtoHAsync);
- CUDA_LIBRARY_FIND_V2(cuMemcpy2D);
- CUDA_LIBRARY_FIND_V2(cuMemcpy2DUnaligned);
- CUDA_LIBRARY_FIND_V2(cuMemcpy3D);
- CUDA_LIBRARY_FIND_V2(cuMemcpyHtoDAsync);
- CUDA_LIBRARY_FIND_V2(cuMemcpyDtoHAsync);
- CUDA_LIBRARY_FIND_V2(cuMemcpyDtoDAsync);
- CUDA_LIBRARY_FIND_V2(cuMemcpy2DAsync);
- CUDA_LIBRARY_FIND_V2(cuMemcpy3DAsync);
- CUDA_LIBRARY_FIND_V2(cuMemsetD8);
- CUDA_LIBRARY_FIND_V2(cuMemsetD16);
- CUDA_LIBRARY_FIND_V2(cuMemsetD32);
- CUDA_LIBRARY_FIND_V2(cuMemsetD2D8);
- CUDA_LIBRARY_FIND_V2(cuMemsetD2D16);
- CUDA_LIBRARY_FIND_V2(cuMemsetD2D32);
- CUDA_LIBRARY_FIND_V2(cuArrayCreate);
- CUDA_LIBRARY_FIND_V2(cuArrayGetDescriptor);
- CUDA_LIBRARY_FIND_V2(cuArray3DCreate);
- CUDA_LIBRARY_FIND_V2(cuArray3DGetDescriptor);
- CUDA_LIBRARY_FIND_V2(cuTexRefSetAddress);
- CUDA_LIBRARY_FIND_V2(cuTexRefSetAddress2D);
- CUDA_LIBRARY_FIND_V2(cuTexRefGetAddress);
- CUDA_LIBRARY_FIND_V2(cuGraphicsResourceGetMappedPointer);
- CUDA_LIBRARY_FIND_V2(cuGLCtxCreate);
-
- /* cuda 4.0 */
- CUDA_LIBRARY_FIND(cuCtxSetCurrent);
-
- if(cuHavePrecompiledKernels())
- result = true;
-#ifndef _WIN32
- else if(cuCompilerPath() != "")
- result = true;
-#endif
-
- return result;
-}
-
-bool cuHavePrecompiledKernels()
-{
- string cubins_path = path_get("lib");
-
- return path_exists(cubins_path);
-}
-
-string cuCompilerPath()
-{
-#ifdef _WIN32
- const char *defaultpaths[] = {"C:/CUDA/bin", NULL};
- const char *executable = "nvcc.exe";
-#else
- const char *defaultpaths[] = {
- "/Developer/NVIDIA/CUDA-5.0/bin",
- "/usr/local/cuda-5.0/bin",
- "/usr/local/cuda/bin",
- "/Developer/NVIDIA/CUDA-6.0/bin",
- "/usr/local/cuda-6.0/bin",
- "/Developer/NVIDIA/CUDA-5.5/bin",
- "/usr/local/cuda-5.5/bin",
- NULL};
- const char *executable = "nvcc";
-#endif
-
- const char *binpath = getenv("CUDA_BIN_PATH");
-
- string nvcc;
-
- if(binpath) {
- nvcc = path_join(binpath, executable);
- if(path_exists(nvcc))
- return nvcc;
- }
-
- for(int i = 0; defaultpaths[i]; i++) {
- nvcc = path_join(defaultpaths[i], executable);
- if(path_exists(nvcc))
- return nvcc;
- }
-
-#ifndef _WIN32
- {
- FILE *handle = popen("which nvcc", "r");
- if(handle) {
- char buffer[4096] = {0};
- int len = fread(buffer, 1, sizeof(buffer) - 1, handle);
- buffer[len] = '\0';
- pclose(handle);
-
- if(buffer[0])
- return "nvcc";
- }
- }
-#endif
-
- return "";
-}
-
-int cuCompilerVersion()
-{
- string path = cuCompilerPath();
- if(path == "")
- return 0;
-
- /* get --version output */
- FILE *pipe = popen((path + " --version").c_str(), "r");
- if(!pipe) {
- fprintf(stderr, "CUDA: failed to run compiler to retrieve version");
- return 0;
- }
-
- char buf[128];
- string output = "";
-
- while(!feof(pipe))
- if(fgets(buf, 128, pipe) != NULL)
- output += buf;
-
- pclose(pipe);
-
- /* parse version number */
- string marker = "Cuda compilation tools, release ";
- size_t offset = output.find(marker);
- if(offset == string::npos) {
- fprintf(stderr, "CUDA: failed to find version number in:\n\n%s\n", output.c_str());
- return 0;
- }
-
- string versionstr = output.substr(offset + marker.size(), string::npos);
- int major, minor;
-
- if(sscanf(versionstr.c_str(), "%d.%d", &major, &minor) < 2) {
- fprintf(stderr, "CUDA: failed to parse version number from:\n\n%s\n", output.c_str());
- return 0;
- }
-
- return 10*major + minor;
-}
-
-CCL_NAMESPACE_END
-
diff --git a/intern/cycles/util/util_cuda.h b/intern/cycles/util/util_cuda.h
deleted file mode 100644
index 0c80303df9b..00000000000
--- a/intern/cycles/util/util_cuda.h
+++ /dev/null
@@ -1,624 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License
- */
-
-#ifndef __UTIL_CUDA_H__
-#define __UTIL_CUDA_H__
-
-#include <stdlib.h>
-#include "util_opengl.h"
-#include "util_string.h"
-
-CCL_NAMESPACE_BEGIN
-
-/* CUDA is linked in dynamically at runtime, so we can start the application
- * without requiring a CUDA installation. Code adapted from the example
- * matrixMulDynlinkJIT in the CUDA SDK. */
-
-bool cuLibraryInit();
-bool cuHavePrecompiledKernels();
-string cuCompilerPath();
-int cuCompilerVersion();
-
-CCL_NAMESPACE_END
-
-/* defines, structs, enums */
-
-#define CUDA_VERSION 3020
-
-#if defined(__x86_64) || defined(AMD64) || defined(_M_AMD64) || defined(__LP64__)
-typedef unsigned long long CUdeviceptr;
-#else
-typedef unsigned int CUdeviceptr;
-#endif
-
-typedef int CUdevice;
-typedef struct CUctx_st *CUcontext;
-typedef struct CUmod_st *CUmodule;
-typedef struct CUfunc_st *CUfunction;
-typedef struct CUarray_st *CUarray;
-typedef struct CUtexref_st *CUtexref;
-typedef struct CUsurfref_st *CUsurfref;
-typedef struct CUevent_st *CUevent;
-typedef struct CUstream_st *CUstream;
-typedef struct CUgraphicsResource_st *CUgraphicsResource;
-
-typedef struct CUuuid_st {
- char bytes[16];
-} CUuuid;
-
-typedef enum CUctx_flags_enum {
- CU_CTX_SCHED_AUTO = 0,
- CU_CTX_SCHED_SPIN = 1,
- CU_CTX_SCHED_YIELD = 2,
- CU_CTX_SCHED_MASK = 0x3,
- CU_CTX_BLOCKING_SYNC = 4,
- CU_CTX_MAP_HOST = 8,
- CU_CTX_LMEM_RESIZE_TO_MAX = 16,
- CU_CTX_FLAGS_MASK = 0x1f
-} CUctx_flags;
-
-typedef enum CUevent_flags_enum {
- CU_EVENT_DEFAULT = 0,
- CU_EVENT_BLOCKING_SYNC = 1,
- CU_EVENT_DISABLE_TIMING = 2
-} CUevent_flags;
-
-typedef enum CUarray_format_enum {
- CU_AD_FORMAT_UNSIGNED_INT8 = 0x01,
- CU_AD_FORMAT_UNSIGNED_INT16 = 0x02,
- CU_AD_FORMAT_UNSIGNED_INT32 = 0x03,
- CU_AD_FORMAT_SIGNED_INT8 = 0x08,
- CU_AD_FORMAT_SIGNED_INT16 = 0x09,
- CU_AD_FORMAT_SIGNED_INT32 = 0x0a,
- CU_AD_FORMAT_HALF = 0x10,
- CU_AD_FORMAT_FLOAT = 0x20
-} CUarray_format;
-
-typedef enum CUaddress_mode_enum {
- CU_TR_ADDRESS_MODE_WRAP = 0,
- CU_TR_ADDRESS_MODE_CLAMP = 1,
- CU_TR_ADDRESS_MODE_MIRROR = 2,
- CU_TR_ADDRESS_MODE_BORDER = 3
-} CUaddress_mode;
-
-typedef enum CUfilter_mode_enum {
- CU_TR_FILTER_MODE_POINT = 0,
- CU_TR_FILTER_MODE_LINEAR = 1
-} CUfilter_mode;
-
-typedef enum CUdevice_attribute_enum {
- CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 1,
- CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X = 2,
- CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y = 3,
- CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z = 4,
- CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X = 5,
- CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y = 6,
- CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z = 7,
- CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK = 8,
- CU_DEVICE_ATTRIBUTE_SHARED_MEMORY_PER_BLOCK = 8,
- CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY = 9,
- CU_DEVICE_ATTRIBUTE_WARP_SIZE = 10,
- CU_DEVICE_ATTRIBUTE_MAX_PITCH = 11,
- CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK = 12,
- CU_DEVICE_ATTRIBUTE_REGISTERS_PER_BLOCK = 12,
- CU_DEVICE_ATTRIBUTE_CLOCK_RATE = 13,
- CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT = 14,
- CU_DEVICE_ATTRIBUTE_GPU_OVERLAP = 15,
- CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT = 16,
- CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT = 17,
- CU_DEVICE_ATTRIBUTE_INTEGRATED = 18,
- CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY = 19,
- CU_DEVICE_ATTRIBUTE_COMPUTE_MODE = 20,
- CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH = 21,
- CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH = 22,
- CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT = 23,
- CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH = 24,
- CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT = 25,
- CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH = 26,
- CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_WIDTH = 27,
- CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_HEIGHT = 28,
- CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_NUMSLICES = 29,
- CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT = 30,
- CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS = 31,
- CU_DEVICE_ATTRIBUTE_ECC_ENABLED = 32,
- CU_DEVICE_ATTRIBUTE_PCI_BUS_ID = 33,
- CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID = 34,
- CU_DEVICE_ATTRIBUTE_TCC_DRIVER = 35
-} CUdevice_attribute;
-
-typedef struct CUdevprop_st {
- int maxThreadsPerBlock;
- int maxThreadsDim[3];
- int maxGridSize[3];
- int sharedMemPerBlock;
- int totalConstantMemory;
- int SIMDWidth;
- int memPitch;
- int regsPerBlock;
- int clockRate;
- int textureAlign;
-} CUdevprop;
-
-typedef enum CUfunction_attribute_enum {
- CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 0,
- CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES = 1,
- CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES = 2,
- CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES = 3,
- CU_FUNC_ATTRIBUTE_NUM_REGS = 4,
- CU_FUNC_ATTRIBUTE_PTX_VERSION = 5,
- CU_FUNC_ATTRIBUTE_BINARY_VERSION = 6,
- CU_FUNC_ATTRIBUTE_MAX
-} CUfunction_attribute;
-
-typedef enum CUfunc_cache_enum {
- CU_FUNC_CACHE_PREFER_NONE = 0x00,
- CU_FUNC_CACHE_PREFER_SHARED = 0x01,
- CU_FUNC_CACHE_PREFER_L1 = 0x02
-} CUfunc_cache;
-
-typedef enum CUmemorytype_enum {
- CU_MEMORYTYPE_HOST = 0x01,
- CU_MEMORYTYPE_DEVICE = 0x02,
- CU_MEMORYTYPE_ARRAY = 0x03
-} CUmemorytype;
-
-typedef enum CUcomputemode_enum {
- CU_COMPUTEMODE_DEFAULT = 0,
- CU_COMPUTEMODE_EXCLUSIVE = 1,
- CU_COMPUTEMODE_PROHIBITED = 2
-} CUcomputemode;
-
-typedef enum CUjit_option_enum
-{
- CU_JIT_MAX_REGISTERS = 0,
- CU_JIT_THREADS_PER_BLOCK,
- CU_JIT_WALL_TIME,
- CU_JIT_INFO_LOG_BUFFER,
- CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES,
- CU_JIT_ERROR_LOG_BUFFER,
- CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES,
- CU_JIT_OPTIMIZATION_LEVEL,
- CU_JIT_TARGET_FROM_CUCONTEXT,
- CU_JIT_TARGET,
- CU_JIT_FALLBACK_STRATEGY
-
-} CUjit_option;
-
-typedef enum CUjit_target_enum
-{
- CU_TARGET_COMPUTE_10 = 0,
- CU_TARGET_COMPUTE_11,
- CU_TARGET_COMPUTE_12,
- CU_TARGET_COMPUTE_13,
- CU_TARGET_COMPUTE_20,
- CU_TARGET_COMPUTE_21,
- CU_TARGET_COMPUTE_30,
- CU_TARGET_COMPUTE_35,
- CU_TARGET_COMPUTE_50
-} CUjit_target;
-
-typedef enum CUjit_fallback_enum
-{
- CU_PREFER_PTX = 0,
- CU_PREFER_BINARY
-
-} CUjit_fallback;
-
-typedef enum CUgraphicsRegisterFlags_enum {
- CU_GRAPHICS_REGISTER_FLAGS_NONE = 0x00
-} CUgraphicsRegisterFlags;
-
-typedef enum CUgraphicsMapResourceFlags_enum {
- CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE = 0x00,
- CU_GRAPHICS_MAP_RESOURCE_FLAGS_READ_ONLY = 0x01,
- CU_GRAPHICS_MAP_RESOURCE_FLAGS_WRITE_DISCARD = 0x02
-} CUgraphicsMapResourceFlags;
-
-typedef enum CUarray_cubemap_face_enum {
- CU_CUBEMAP_FACE_POSITIVE_X = 0x00,
- CU_CUBEMAP_FACE_NEGATIVE_X = 0x01,
- CU_CUBEMAP_FACE_POSITIVE_Y = 0x02,
- CU_CUBEMAP_FACE_NEGATIVE_Y = 0x03,
- CU_CUBEMAP_FACE_POSITIVE_Z = 0x04,
- CU_CUBEMAP_FACE_NEGATIVE_Z = 0x05
-} CUarray_cubemap_face;
-
-typedef enum CUlimit_enum {
- CU_LIMIT_STACK_SIZE = 0x00,
- CU_LIMIT_PRINTF_FIFO_SIZE = 0x01,
- CU_LIMIT_MALLOC_HEAP_SIZE = 0x02
-} CUlimit;
-
-typedef enum cudaError_enum {
- CUDA_SUCCESS = 0,
- CUDA_ERROR_INVALID_VALUE = 1,
- CUDA_ERROR_OUT_OF_MEMORY = 2,
- CUDA_ERROR_NOT_INITIALIZED = 3,
- CUDA_ERROR_DEINITIALIZED = 4,
- CUDA_ERROR_NO_DEVICE = 100,
- CUDA_ERROR_INVALID_DEVICE = 101,
- CUDA_ERROR_INVALID_IMAGE = 200,
- CUDA_ERROR_INVALID_CONTEXT = 201,
- CUDA_ERROR_CONTEXT_ALREADY_CURRENT = 202,
- CUDA_ERROR_MAP_FAILED = 205,
- CUDA_ERROR_UNMAP_FAILED = 206,
- CUDA_ERROR_ARRAY_IS_MAPPED = 207,
- CUDA_ERROR_ALREADY_MAPPED = 208,
- CUDA_ERROR_NO_BINARY_FOR_GPU = 209,
- CUDA_ERROR_ALREADY_ACQUIRED = 210,
- CUDA_ERROR_NOT_MAPPED = 211,
- CUDA_ERROR_NOT_MAPPED_AS_ARRAY = 212,
- CUDA_ERROR_NOT_MAPPED_AS_POINTER = 213,
- CUDA_ERROR_ECC_UNCORRECTABLE = 214,
- CUDA_ERROR_UNSUPPORTED_LIMIT = 215,
- CUDA_ERROR_INVALID_SOURCE = 300,
- CUDA_ERROR_FILE_NOT_FOUND = 301,
- CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND = 302,
- CUDA_ERROR_SHARED_OBJECT_INIT_FAILED = 303,
- CUDA_ERROR_OPERATING_SYSTEM = 304,
- CUDA_ERROR_INVALID_HANDLE = 400,
- CUDA_ERROR_NOT_FOUND = 500,
- CUDA_ERROR_NOT_READY = 600,
- CUDA_ERROR_LAUNCH_FAILED = 700,
- CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES = 701,
- CUDA_ERROR_LAUNCH_TIMEOUT = 702,
- CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING = 703,
- CUDA_ERROR_UNKNOWN = 999
-} CUresult;
-
-#define CU_MEMHOSTALLOC_PORTABLE 0x01
-#define CU_MEMHOSTALLOC_DEVICEMAP 0x02
-#define CU_MEMHOSTALLOC_WRITECOMBINED 0x04
-
-typedef struct CUDA_MEMCPY2D_st {
- size_t srcXInBytes;
- size_t srcY;
-
- CUmemorytype srcMemoryType;
- const void *srcHost;
- CUdeviceptr srcDevice;
- CUarray srcArray;
- size_t srcPitch;
-
- size_t dstXInBytes;
- size_t dstY;
-
- CUmemorytype dstMemoryType;
- void *dstHost;
- CUdeviceptr dstDevice;
- CUarray dstArray;
- size_t dstPitch;
-
- size_t WidthInBytes;
- size_t Height;
-} CUDA_MEMCPY2D;
-
-typedef struct CUDA_MEMCPY3D_st {
- size_t srcXInBytes;
- size_t srcY;
- size_t srcZ;
- size_t srcLOD;
- CUmemorytype srcMemoryType;
- const void *srcHost;
- CUdeviceptr srcDevice;
- CUarray srcArray;
- void *reserved0;
- size_t srcPitch;
- size_t srcHeight;
-
- size_t dstXInBytes;
- size_t dstY;
- size_t dstZ;
- size_t dstLOD;
- CUmemorytype dstMemoryType;
- void *dstHost;
- CUdeviceptr dstDevice;
- CUarray dstArray;
- void *reserved1;
- size_t dstPitch;
- size_t dstHeight;
-
- size_t WidthInBytes;
- size_t Height;
- size_t Depth;
-} CUDA_MEMCPY3D;
-
-typedef struct CUDA_ARRAY_DESCRIPTOR_st
-{
- size_t Width;
- size_t Height;
-
- CUarray_format Format;
- unsigned int NumChannels;
-} CUDA_ARRAY_DESCRIPTOR;
-
-typedef struct CUDA_ARRAY3D_DESCRIPTOR_st
-{
- size_t Width;
- size_t Height;
- size_t Depth;
-
- CUarray_format Format;
- unsigned int NumChannels;
- unsigned int Flags;
-} CUDA_ARRAY3D_DESCRIPTOR;
-
-#define CUDA_ARRAY3D_2DARRAY 0x01
-#define CUDA_ARRAY3D_SURFACE_LDST 0x02
-#define CU_TRSA_OVERRIDE_FORMAT 0x01
-#define CU_TRSF_READ_AS_INTEGER 0x01
-#define CU_TRSF_NORMALIZED_COORDINATES 0x02
-#define CU_TRSF_SRGB 0x10
-#define CU_PARAM_TR_DEFAULT -1
-
-#ifdef _WIN32
-#define CUDAAPI __stdcall
-#else
-#define CUDAAPI
-#endif
-
-/* function types */
-
-typedef CUresult CUDAAPI tcuInit(unsigned int Flags);
-typedef CUresult CUDAAPI tcuDriverGetVersion(int *driverVersion);
-typedef CUresult CUDAAPI tcuDeviceGet(CUdevice *device, int ordinal);
-typedef CUresult CUDAAPI tcuDeviceGetCount(int *count);
-typedef CUresult CUDAAPI tcuDeviceGetName(char *name, int len, CUdevice dev);
-typedef CUresult CUDAAPI tcuDeviceComputeCapability(int *major, int *minor, CUdevice dev);
-typedef CUresult CUDAAPI tcuDeviceTotalMem(size_t *bytes, CUdevice dev);
-typedef CUresult CUDAAPI tcuDeviceGetProperties(CUdevprop *prop, CUdevice dev);
-typedef CUresult CUDAAPI tcuDeviceGetAttribute(int *pi, CUdevice_attribute attrib, CUdevice dev);
-typedef CUresult CUDAAPI tcuCtxCreate(CUcontext *pctx, unsigned int flags, CUdevice dev);
-typedef CUresult CUDAAPI tcuCtxDestroy(CUcontext ctx);
-typedef CUresult CUDAAPI tcuCtxAttach(CUcontext *pctx, unsigned int flags);
-typedef CUresult CUDAAPI tcuCtxDetach(CUcontext ctx);
-typedef CUresult CUDAAPI tcuCtxPushCurrent(CUcontext ctx );
-typedef CUresult CUDAAPI tcuCtxPopCurrent(CUcontext *pctx);
-typedef CUresult CUDAAPI tcuCtxGetDevice(CUdevice *device);
-typedef CUresult CUDAAPI tcuCtxSynchronize(void);
-typedef CUresult CUDAAPI tcuCtxSetLimit(CUlimit limit, size_t value);
-typedef CUresult CUDAAPI tcuCtxGetLimit(size_t *pvalue, CUlimit limit);
-typedef CUresult CUDAAPI tcuCtxGetCacheConfig(CUfunc_cache *pconfig);
-typedef CUresult CUDAAPI tcuCtxSetCacheConfig(CUfunc_cache config);
-typedef CUresult CUDAAPI tcuCtxGetApiVersion(CUcontext ctx, unsigned int *version);
-typedef CUresult CUDAAPI tcuModuleLoad(CUmodule *module, const char *fname);
-typedef CUresult CUDAAPI tcuModuleLoadData(CUmodule *module, const void *image);
-typedef CUresult CUDAAPI tcuModuleLoadDataEx(CUmodule *module, const void *image, unsigned int numOptions, CUjit_option *options, void **optionValues);
-typedef CUresult CUDAAPI tcuModuleLoadFatBinary(CUmodule *module, const void *fatCubin);
-typedef CUresult CUDAAPI tcuModuleUnload(CUmodule hmod);
-typedef CUresult CUDAAPI tcuModuleGetFunction(CUfunction *hfunc, CUmodule hmod, const char *name);
-typedef CUresult CUDAAPI tcuModuleGetGlobal(CUdeviceptr *dptr, size_t *bytes, CUmodule hmod, const char *name);
-typedef CUresult CUDAAPI tcuModuleGetTexRef(CUtexref *pTexRef, CUmodule hmod, const char *name);
-typedef CUresult CUDAAPI tcuModuleGetSurfRef(CUsurfref *pSurfRef, CUmodule hmod, const char *name);
-typedef CUresult CUDAAPI tcuMemGetInfo(size_t *free, size_t *total);
-typedef CUresult CUDAAPI tcuMemAlloc(CUdeviceptr *dptr, size_t bytesize);
-typedef CUresult CUDAAPI tcuMemAllocPitch(CUdeviceptr *dptr, size_t *pPitch, size_t WidthInBytes, size_t Height, unsigned int ElementSizeBytes);
-typedef CUresult CUDAAPI tcuMemFree(CUdeviceptr dptr);
-typedef CUresult CUDAAPI tcuMemGetAddressRange(CUdeviceptr *pbase, size_t *psize, CUdeviceptr dptr);
-typedef CUresult CUDAAPI tcuMemAllocHost(void **pp, size_t bytesize);
-typedef CUresult CUDAAPI tcuMemFreeHost(void *p);
-typedef CUresult CUDAAPI tcuMemHostAlloc(void **pp, size_t bytesize, unsigned int Flags);
-typedef CUresult CUDAAPI tcuMemHostGetDevicePointer(CUdeviceptr *pdptr, void *p, unsigned int Flags);
-typedef CUresult CUDAAPI tcuMemHostGetFlags(unsigned int *pFlags, void *p);
-typedef CUresult CUDAAPI tcuMemcpyHtoD(CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount);
-typedef CUresult CUDAAPI tcuMemcpyDtoH(void *dstHost, CUdeviceptr srcDevice, size_t ByteCount);
-typedef CUresult CUDAAPI tcuMemcpyDtoD(CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount);
-typedef CUresult CUDAAPI tcuMemcpyDtoA(CUarray dstArray, size_t dstOffset, CUdeviceptr srcDevice, size_t ByteCount);
-typedef CUresult CUDAAPI tcuMemcpyAtoD(CUdeviceptr dstDevice, CUarray srcArray, size_t srcOffset, size_t ByteCount);
-typedef CUresult CUDAAPI tcuMemcpyHtoA(CUarray dstArray, size_t dstOffset, const void *srcHost, size_t ByteCount);
-typedef CUresult CUDAAPI tcuMemcpyAtoH(void *dstHost, CUarray srcArray, size_t srcOffset, size_t ByteCount);
-typedef CUresult CUDAAPI tcuMemcpyAtoA(CUarray dstArray, size_t dstOffset, CUarray srcArray, size_t srcOffset, size_t ByteCount);
-typedef CUresult CUDAAPI tcuMemcpy2D(const CUDA_MEMCPY2D *pCopy);
-typedef CUresult CUDAAPI tcuMemcpy2DUnaligned(const CUDA_MEMCPY2D *pCopy);
-typedef CUresult CUDAAPI tcuMemcpy3D(const CUDA_MEMCPY3D *pCopy);
-typedef CUresult CUDAAPI tcuMemcpyHtoDAsync(CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount, CUstream hStream);
-typedef CUresult CUDAAPI tcuMemcpyDtoHAsync(void *dstHost, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream);
-typedef CUresult CUDAAPI tcuMemcpyDtoDAsync(CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream);
-typedef CUresult CUDAAPI tcuMemcpyHtoAAsync(CUarray dstArray, size_t dstOffset, const void *srcHost, size_t ByteCount, CUstream hStream);
-typedef CUresult CUDAAPI tcuMemcpyAtoHAsync(void *dstHost, CUarray srcArray, size_t srcOffset, size_t ByteCount, CUstream hStream);
-typedef CUresult CUDAAPI tcuMemcpy2DAsync(const CUDA_MEMCPY2D *pCopy, CUstream hStream);
-typedef CUresult CUDAAPI tcuMemcpy3DAsync(const CUDA_MEMCPY3D *pCopy, CUstream hStream);
-typedef CUresult CUDAAPI tcuMemsetD8(CUdeviceptr dstDevice, unsigned char uc, size_t N);
-typedef CUresult CUDAAPI tcuMemsetD16(CUdeviceptr dstDevice, unsigned short us, size_t N);
-typedef CUresult CUDAAPI tcuMemsetD32(CUdeviceptr dstDevice, unsigned int ui, size_t N);
-typedef CUresult CUDAAPI tcuMemsetD2D8(CUdeviceptr dstDevice, size_t dstPitch, unsigned char uc, size_t Width, size_t Height);
-typedef CUresult CUDAAPI tcuMemsetD2D16(CUdeviceptr dstDevice, size_t dstPitch, unsigned short us, size_t Width, size_t Height);
-typedef CUresult CUDAAPI tcuMemsetD2D32(CUdeviceptr dstDevice, size_t dstPitch, unsigned int ui, size_t Width, size_t Height);
-typedef CUresult CUDAAPI tcuMemsetD8Async(CUdeviceptr dstDevice, unsigned char uc, size_t N, CUstream hStream);
-typedef CUresult CUDAAPI tcuMemsetD16Async(CUdeviceptr dstDevice, unsigned short us, size_t N, CUstream hStream);
-typedef CUresult CUDAAPI tcuMemsetD32Async(CUdeviceptr dstDevice, unsigned int ui, size_t N, CUstream hStream);
-typedef CUresult CUDAAPI tcuMemsetD2D8Async(CUdeviceptr dstDevice, size_t dstPitch, unsigned char uc, size_t Width, size_t Height, CUstream hStream);
-typedef CUresult CUDAAPI tcuMemsetD2D16Async(CUdeviceptr dstDevice, size_t dstPitch, unsigned short us, size_t Width, size_t Height, CUstream hStream);
-typedef CUresult CUDAAPI tcuMemsetD2D32Async(CUdeviceptr dstDevice, size_t dstPitch, unsigned int ui, size_t Width, size_t Height, CUstream hStream);
-typedef CUresult CUDAAPI tcuArrayCreate(CUarray *pHandle, const CUDA_ARRAY_DESCRIPTOR *pAllocateArray);
-typedef CUresult CUDAAPI tcuArrayGetDescriptor(CUDA_ARRAY_DESCRIPTOR *pArrayDescriptor, CUarray hArray);
-typedef CUresult CUDAAPI tcuArrayDestroy(CUarray hArray);
-typedef CUresult CUDAAPI tcuArray3DCreate(CUarray *pHandle, const CUDA_ARRAY3D_DESCRIPTOR *pAllocateArray);
-typedef CUresult CUDAAPI tcuArray3DGetDescriptor(CUDA_ARRAY3D_DESCRIPTOR *pArrayDescriptor, CUarray hArray);
-typedef CUresult CUDAAPI tcuStreamCreate(CUstream *phStream, unsigned int Flags);
-typedef CUresult CUDAAPI tcuStreamWaitEvent(CUstream hStream, CUevent hEvent, unsigned int Flags);
-typedef CUresult CUDAAPI tcuStreamQuery(CUstream hStream);
-typedef CUresult CUDAAPI tcuStreamSynchronize(CUstream hStream);
-typedef CUresult CUDAAPI tcuStreamDestroy(CUstream hStream);
-typedef CUresult CUDAAPI tcuEventCreate(CUevent *phEvent, unsigned int Flags);
-typedef CUresult CUDAAPI tcuEventRecord(CUevent hEvent, CUstream hStream);
-typedef CUresult CUDAAPI tcuEventQuery(CUevent hEvent);
-typedef CUresult CUDAAPI tcuEventSynchronize(CUevent hEvent);
-typedef CUresult CUDAAPI tcuEventDestroy(CUevent hEvent);
-typedef CUresult CUDAAPI tcuEventElapsedTime(float *pMilliseconds, CUevent hStart, CUevent hEnd);
-typedef CUresult CUDAAPI tcuFuncSetBlockShape(CUfunction hfunc, int x, int y, int z);
-typedef CUresult CUDAAPI tcuFuncSetSharedSize(CUfunction hfunc, unsigned int bytes);
-typedef CUresult CUDAAPI tcuFuncGetAttribute(int *pi, CUfunction_attribute attrib, CUfunction hfunc);
-typedef CUresult CUDAAPI tcuFuncSetCacheConfig(CUfunction hfunc, CUfunc_cache config);
-typedef CUresult CUDAAPI tcuParamSetSize(CUfunction hfunc, unsigned int numbytes);
-typedef CUresult CUDAAPI tcuParamSeti(CUfunction hfunc, int offset, unsigned int value);
-typedef CUresult CUDAAPI tcuParamSetf(CUfunction hfunc, int offset, float value);
-typedef CUresult CUDAAPI tcuParamSetv(CUfunction hfunc, int offset, void *ptr, unsigned int numbytes);
-typedef CUresult CUDAAPI tcuLaunch(CUfunction f);
-typedef CUresult CUDAAPI tcuLaunchGrid(CUfunction f, int grid_width, int grid_height);
-typedef CUresult CUDAAPI tcuLaunchGridAsync(CUfunction f, int grid_width, int grid_height, CUstream hStream);
-typedef CUresult CUDAAPI tcuParamSetTexRef(CUfunction hfunc, int texunit, CUtexref hTexRef);
-typedef CUresult CUDAAPI tcuTexRefSetArray(CUtexref hTexRef, CUarray hArray, unsigned int Flags);
-typedef CUresult CUDAAPI tcuTexRefSetAddress(size_t *ByteOffset, CUtexref hTexRef, CUdeviceptr dptr, size_t bytes);
-typedef CUresult CUDAAPI tcuTexRefSetAddress2D(CUtexref hTexRef, const CUDA_ARRAY_DESCRIPTOR *desc, CUdeviceptr dptr, size_t Pitch);
-typedef CUresult CUDAAPI tcuTexRefSetFormat(CUtexref hTexRef, CUarray_format fmt, int NumPackedComponents);
-typedef CUresult CUDAAPI tcuTexRefSetAddressMode(CUtexref hTexRef, int dim, CUaddress_mode am);
-typedef CUresult CUDAAPI tcuTexRefSetFilterMode(CUtexref hTexRef, CUfilter_mode fm);
-typedef CUresult CUDAAPI tcuTexRefSetFlags(CUtexref hTexRef, unsigned int Flags);
-typedef CUresult CUDAAPI tcuTexRefGetAddress(CUdeviceptr *pdptr, CUtexref hTexRef);
-typedef CUresult CUDAAPI tcuTexRefGetArray(CUarray *phArray, CUtexref hTexRef);
-typedef CUresult CUDAAPI tcuTexRefGetAddressMode(CUaddress_mode *pam, CUtexref hTexRef, int dim);
-typedef CUresult CUDAAPI tcuTexRefGetFilterMode(CUfilter_mode *pfm, CUtexref hTexRef);
-typedef CUresult CUDAAPI tcuTexRefGetFormat(CUarray_format *pFormat, int *pNumChannels, CUtexref hTexRef);
-typedef CUresult CUDAAPI tcuTexRefGetFlags(unsigned int *pFlags, CUtexref hTexRef);
-typedef CUresult CUDAAPI tcuTexRefCreate(CUtexref *pTexRef);
-typedef CUresult CUDAAPI tcuTexRefDestroy(CUtexref hTexRef);
-typedef CUresult CUDAAPI tcuSurfRefSetArray(CUsurfref hSurfRef, CUarray hArray, unsigned int Flags);
-typedef CUresult CUDAAPI tcuSurfRefGetArray(CUarray *phArray, CUsurfref hSurfRef);
-typedef CUresult CUDAAPI tcuGraphicsUnregisterResource(CUgraphicsResource resource);
-typedef CUresult CUDAAPI tcuGraphicsSubResourceGetMappedArray(CUarray *pArray, CUgraphicsResource resource, unsigned int arrayIndex, unsigned int mipLevel);
-typedef CUresult CUDAAPI tcuGraphicsResourceGetMappedPointer(CUdeviceptr *pDevPtr, size_t *pSize, CUgraphicsResource resource);
-typedef CUresult CUDAAPI tcuGraphicsResourceSetMapFlags(CUgraphicsResource resource, unsigned int flags);
-typedef CUresult CUDAAPI tcuGraphicsMapResources(unsigned int count, CUgraphicsResource *resources, CUstream hStream);
-typedef CUresult CUDAAPI tcuGraphicsUnmapResources(unsigned int count, CUgraphicsResource *resources, CUstream hStream);
-typedef CUresult CUDAAPI tcuGetExportTable(const void **ppExportTable, const CUuuid *pExportTableId);
-typedef CUresult CUDAAPI tcuGLCtxCreate(CUcontext *pCtx, unsigned int Flags, CUdevice device );
-typedef CUresult CUDAAPI tcuGraphicsGLRegisterBuffer(CUgraphicsResource *pCudaResource, GLuint buffer, unsigned int Flags);
-typedef CUresult CUDAAPI tcuGraphicsGLRegisterImage(CUgraphicsResource *pCudaResource, GLuint image, GLenum target, unsigned int Flags);
-typedef CUresult CUDAAPI tcuCtxSetCurrent(CUcontext ctx);
-
-/* function declarations */
-
-extern tcuInit *cuInit;
-extern tcuDriverGetVersion *cuDriverGetVersion;
-extern tcuDeviceGet *cuDeviceGet;
-extern tcuDeviceGetCount *cuDeviceGetCount;
-extern tcuDeviceGetName *cuDeviceGetName;
-extern tcuDeviceComputeCapability *cuDeviceComputeCapability;
-extern tcuDeviceTotalMem *cuDeviceTotalMem;
-extern tcuDeviceGetProperties *cuDeviceGetProperties;
-extern tcuDeviceGetAttribute *cuDeviceGetAttribute;
-extern tcuCtxCreate *cuCtxCreate;
-extern tcuCtxDestroy *cuCtxDestroy;
-extern tcuCtxAttach *cuCtxAttach;
-extern tcuCtxDetach *cuCtxDetach;
-extern tcuCtxPushCurrent *cuCtxPushCurrent;
-extern tcuCtxPopCurrent *cuCtxPopCurrent;
-extern tcuCtxGetDevice *cuCtxGetDevice;
-extern tcuCtxSynchronize *cuCtxSynchronize;
-extern tcuModuleLoad *cuModuleLoad;
-extern tcuModuleLoadData *cuModuleLoadData;
-extern tcuModuleLoadDataEx *cuModuleLoadDataEx;
-extern tcuModuleLoadFatBinary *cuModuleLoadFatBinary;
-extern tcuModuleUnload *cuModuleUnload;
-extern tcuModuleGetFunction *cuModuleGetFunction;
-extern tcuModuleGetGlobal *cuModuleGetGlobal;
-extern tcuModuleGetTexRef *cuModuleGetTexRef;
-extern tcuModuleGetSurfRef *cuModuleGetSurfRef;
-extern tcuMemGetInfo *cuMemGetInfo;
-extern tcuMemAlloc *cuMemAlloc;
-extern tcuMemAllocPitch *cuMemAllocPitch;
-extern tcuMemFree *cuMemFree;
-extern tcuMemGetAddressRange *cuMemGetAddressRange;
-extern tcuMemAllocHost *cuMemAllocHost;
-extern tcuMemFreeHost *cuMemFreeHost;
-extern tcuMemHostAlloc *cuMemHostAlloc;
-extern tcuMemHostGetDevicePointer *cuMemHostGetDevicePointer;
-extern tcuMemHostGetFlags *cuMemHostGetFlags;
-extern tcuMemcpyHtoD *cuMemcpyHtoD;
-extern tcuMemcpyDtoH *cuMemcpyDtoH;
-extern tcuMemcpyDtoD *cuMemcpyDtoD;
-extern tcuMemcpyDtoA *cuMemcpyDtoA;
-extern tcuMemcpyAtoD *cuMemcpyAtoD;
-extern tcuMemcpyHtoA *cuMemcpyHtoA;
-extern tcuMemcpyAtoH *cuMemcpyAtoH;
-extern tcuMemcpyAtoA *cuMemcpyAtoA;
-extern tcuMemcpy2D *cuMemcpy2D;
-extern tcuMemcpy2DUnaligned *cuMemcpy2DUnaligned;
-extern tcuMemcpy3D *cuMemcpy3D;
-extern tcuMemcpyHtoDAsync *cuMemcpyHtoDAsync;
-extern tcuMemcpyDtoHAsync *cuMemcpyDtoHAsync;
-extern tcuMemcpyDtoDAsync *cuMemcpyDtoDAsync;
-extern tcuMemcpyHtoAAsync *cuMemcpyHtoAAsync;
-extern tcuMemcpyAtoHAsync *cuMemcpyAtoHAsync;
-extern tcuMemcpy2DAsync *cuMemcpy2DAsync;
-extern tcuMemcpy3DAsync *cuMemcpy3DAsync;
-extern tcuMemsetD8 *cuMemsetD8;
-extern tcuMemsetD16 *cuMemsetD16;
-extern tcuMemsetD32 *cuMemsetD32;
-extern tcuMemsetD2D8 *cuMemsetD2D8;
-extern tcuMemsetD2D16 *cuMemsetD2D16;
-extern tcuMemsetD2D32 *cuMemsetD2D32;
-extern tcuFuncSetBlockShape *cuFuncSetBlockShape;
-extern tcuFuncSetSharedSize *cuFuncSetSharedSize;
-extern tcuFuncGetAttribute *cuFuncGetAttribute;
-extern tcuFuncSetCacheConfig *cuFuncSetCacheConfig;
-extern tcuArrayCreate *cuArrayCreate;
-extern tcuArrayGetDescriptor *cuArrayGetDescriptor;
-extern tcuArrayDestroy *cuArrayDestroy;
-extern tcuArray3DCreate *cuArray3DCreate;
-extern tcuArray3DGetDescriptor *cuArray3DGetDescriptor;
-extern tcuTexRefCreate *cuTexRefCreate;
-extern tcuTexRefDestroy *cuTexRefDestroy;
-extern tcuTexRefSetArray *cuTexRefSetArray;
-extern tcuTexRefSetAddress *cuTexRefSetAddress;
-extern tcuTexRefSetAddress2D *cuTexRefSetAddress2D;
-extern tcuTexRefSetFormat *cuTexRefSetFormat;
-extern tcuTexRefSetAddressMode *cuTexRefSetAddressMode;
-extern tcuTexRefSetFilterMode *cuTexRefSetFilterMode;
-extern tcuTexRefSetFlags *cuTexRefSetFlags;
-extern tcuTexRefGetAddress *cuTexRefGetAddress;
-extern tcuTexRefGetArray *cuTexRefGetArray;
-extern tcuTexRefGetAddressMode *cuTexRefGetAddressMode;
-extern tcuTexRefGetFilterMode *cuTexRefGetFilterMode;
-extern tcuTexRefGetFormat *cuTexRefGetFormat;
-extern tcuTexRefGetFlags *cuTexRefGetFlags;
-extern tcuSurfRefSetArray *cuSurfRefSetArray;
-extern tcuSurfRefGetArray *cuSurfRefGetArray;
-extern tcuParamSetSize *cuParamSetSize;
-extern tcuParamSeti *cuParamSeti;
-extern tcuParamSetf *cuParamSetf;
-extern tcuParamSetv *cuParamSetv;
-extern tcuParamSetTexRef *cuParamSetTexRef;
-extern tcuLaunch *cuLaunch;
-extern tcuLaunchGrid *cuLaunchGrid;
-extern tcuLaunchGridAsync *cuLaunchGridAsync;
-extern tcuEventCreate *cuEventCreate;
-extern tcuEventRecord *cuEventRecord;
-extern tcuEventQuery *cuEventQuery;
-extern tcuEventSynchronize *cuEventSynchronize;
-extern tcuEventDestroy *cuEventDestroy;
-extern tcuEventElapsedTime *cuEventElapsedTime;
-extern tcuStreamCreate *cuStreamCreate;
-extern tcuStreamQuery *cuStreamQuery;
-extern tcuStreamSynchronize *cuStreamSynchronize;
-extern tcuStreamDestroy *cuStreamDestroy;
-extern tcuGraphicsUnregisterResource *cuGraphicsUnregisterResource;
-extern tcuGraphicsSubResourceGetMappedArray *cuGraphicsSubResourceGetMappedArray;
-extern tcuGraphicsResourceGetMappedPointer *cuGraphicsResourceGetMappedPointer;
-extern tcuGraphicsResourceSetMapFlags *cuGraphicsResourceSetMapFlags;
-extern tcuGraphicsMapResources *cuGraphicsMapResources;
-extern tcuGraphicsUnmapResources *cuGraphicsUnmapResources;
-extern tcuGetExportTable *cuGetExportTable;
-extern tcuCtxSetLimit *cuCtxSetLimit;
-extern tcuCtxGetLimit *cuCtxGetLimit;
-extern tcuGLCtxCreate *cuGLCtxCreate;
-extern tcuGraphicsGLRegisterBuffer *cuGraphicsGLRegisterBuffer;
-extern tcuGraphicsGLRegisterImage *cuGraphicsGLRegisterImage;
-extern tcuCtxSetCurrent *cuCtxSetCurrent;
-
-#endif /* __UTIL_CUDA_H__ */
-
diff --git a/intern/cycles/util/util_half.h b/intern/cycles/util/util_half.h
index da6fae79bb9..397133618be 100644
--- a/intern/cycles/util/util_half.h
+++ b/intern/cycles/util/util_half.h
@@ -68,18 +68,18 @@ ccl_device_inline void float4_store_half(half *h, float4 f, float scale)
}
#else
/* same as above with SSE */
- const __m128 mm_scale = _mm_set_ps1(scale);
- const __m128i mm_38800000 = _mm_set1_epi32(0x38800000);
- const __m128i mm_7FFF = _mm_set1_epi32(0x7FFF);
- const __m128i mm_7FFFFFFF = _mm_set1_epi32(0x7FFFFFFF);
- const __m128i mm_C8000000 = _mm_set1_epi32(0xC8000000);
-
- __m128 mm_fscale = _mm_mul_ps(load_m128(f), mm_scale);
- __m128i x = _mm_castps_si128(_mm_min_ps(_mm_max_ps(mm_fscale, _mm_set_ps1(0.0f)), _mm_set_ps1(65500.0f)));
- __m128i absolute = _mm_and_si128(x, mm_7FFFFFFF);
- __m128i Z = _mm_add_epi32(absolute, mm_C8000000);
- __m128i result = _mm_andnot_si128(_mm_cmplt_epi32(absolute, mm_38800000), Z);
- __m128i rh = _mm_and_si128(_mm_srai_epi32(result, 13), mm_7FFF);
+ const ssef mm_scale = ssef(scale);
+ const ssei mm_38800000 = ssei(0x38800000);
+ const ssei mm_7FFF = ssei(0x7FFF);
+ const ssei mm_7FFFFFFF = ssei(0x7FFFFFFF);
+ const ssei mm_C8000000 = ssei(0xC8000000);
+
+ ssef mm_fscale = load4f(f) * mm_scale;
+ ssei x = cast(min(max(mm_fscale, ssef(0.0f)), ssef(65500.0f)));
+ ssei absolute = x & mm_7FFFFFFF;
+ ssei Z = absolute + mm_C8000000;
+ ssei result = andnot(absolute < mm_38800000, Z);
+ ssei rh = (result >> 13) & mm_7FFF;
_mm_storel_pi((__m64*)h, _mm_castsi128_ps(_mm_packs_epi32(rh, rh)));
#endif
diff --git a/intern/cycles/util/util_logging.cpp b/intern/cycles/util/util_logging.cpp
new file mode 100644
index 00000000000..0722f16cf45
--- /dev/null
+++ b/intern/cycles/util/util_logging.cpp
@@ -0,0 +1,33 @@
+/*
+ * Copyright 2011-2014 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License
+ */
+
+#include <util_logging.h>
+
+#include "util_math.h"
+
+CCL_NAMESPACE_BEGIN
+/* Write `value` to `os` formatted as "(x, y, z)" and return `os` so that calls can be chained. */
+std::ostream& operator <<(std::ostream &os,
+ const float3 &value)
+{
+ os << "(" << value.x
+ << ", " << value.y
+ << ", " << value.z
+ << ")";
+ return os;
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/util/util_logging.h b/intern/cycles/util/util_logging.h
new file mode 100644
index 00000000000..991789e7460
--- /dev/null
+++ b/intern/cycles/util/util_logging.h
@@ -0,0 +1,53 @@
+/*
+ * Copyright 2011-2014 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License
+ */
+
+#ifndef __UTIL_LOGGING_H__
+#define __UTIL_LOGGING_H__
+
+#if defined(WITH_CYCLES_LOGGING) && !defined(__KERNEL_GPU__)
+# include <glog/logging.h>
+#else
+# include <iostream>
+#endif
+
+CCL_NAMESPACE_BEGIN
+
+#if !defined(WITH_CYCLES_LOGGING) || defined(__KERNEL_GPU__) /* <glog/logging.h> was not included above: provide stub LOG()/VLOG() */
+class StubStream : public std::ostream { /* std::ostream constructed with a NULL stream buffer */
+ public:
+ StubStream() : std::ostream(NULL) { }
+};
+/* operator& takes a stream and returns void, matching the type of the `(void) 0` operand of the ?: in LOG_SUPPRESS(). */
+class LogMessageVoidify {
+public:
+ LogMessageVoidify() { }
+ void operator&(::std::ostream&) { }
+};
+/* The condition is always true, so the operand after ':' (including anything streamed into it with <<) is never evaluated. */
+# define LOG_SUPPRESS() (true) ? (void) 0 : LogMessageVoidify() & StubStream()
+# define LOG(severity) LOG_SUPPRESS()
+# define VLOG(severity) LOG_SUPPRESS()
+
+#endif
+
+struct float3; /* forward declaration; class-key must match the struct definition (avoids MSVC C4099 / -Wmismatched-tags) */
+/* Stream a float3 as "(x, y, z)" and return `os`; implemented in util_logging.cpp. */
+std::ostream& operator <<(std::ostream &os,
+ const float3 &value);
+
+CCL_NAMESPACE_END
+
+#endif /* __UTIL_LOGGING_H__ */
diff --git a/intern/cycles/util/util_math.h b/intern/cycles/util/util_math.h
index ded75762cd2..c332e1709db 100644
--- a/intern/cycles/util/util_math.h
+++ b/intern/cycles/util/util_math.h
@@ -76,17 +76,6 @@ CCL_NAMESPACE_BEGIN
#ifdef _WIN32
-#ifndef __KERNEL_GPU__
-
-#if defined(_MSC_VER) && (_MSC_VER < 1800)
-# define copysignf(x, y) ((float)_copysign(x, y))
-# define hypotf(x, y) _hypotf(x, y)
-# define isnan(x) _isnan(x)
-# define isfinite(x) _finite(x)
-#endif
-
-#endif
-
#ifndef __KERNEL_OPENCL__
ccl_device_inline float fmaxf(float a, float b)
@@ -622,11 +611,7 @@ ccl_device_inline bool is_zero(const float3 a)
ccl_device_inline float reduce_add(const float3 a)
{
-#ifdef __KERNEL_SSE__
return (a.x + a.y + a.z);
-#else
- return (a.x + a.y + a.z);
-#endif
}
ccl_device_inline float average(const float3 a)
@@ -857,7 +842,6 @@ ccl_device_inline float4 max(float4 a, float4 b)
ccl_device_inline float4 select(const int4& mask, const float4& a, const float4& b)
{
#ifdef __KERNEL_SSE__
- /* blendv is sse4, and apparently broken on vs2008 */
return _mm_or_ps(_mm_and_ps(_mm_cvtepi32_ps(mask), a), _mm_andnot_ps(_mm_cvtepi32_ps(mask), b)); /* todo: avoid cvt */
#else
return make_float4((mask.x)? a.x: b.x, (mask.y)? a.y: b.y, (mask.z)? a.z: b.z, (mask.w)? a.w: b.w);
@@ -1429,6 +1413,27 @@ ccl_device bool ray_quad_intersect(
return false;
}
+/* projections: map_to_sphere() converts the vector (x, y, z) to (u, v), written through r_u / r_v; returns false and writes zeros when the vector length is not positive */
+ccl_device bool map_to_sphere(float *r_u, float *r_v,
+ const float x, const float y, const float z)
+{
+ float len = sqrtf(x * x + y * y + z * z);
+ if(len > 0.0f) {
+ if(UNLIKELY(x == 0.0f && y == 0.0f)) {
+ *r_u = 0.0f; /* atan2f(0, 0) would otherwise be a domain error */
+ }
+ else {
+ *r_u = (1.0f - atan2f(x, y) / M_PI_F) / 2.0f;
+ }
+ *r_v = 1.0f - safe_acosf(z / len) / M_PI_F;
+ return true;
+ }
+ else {
+ *r_v = *r_u = 0.0f; /* to avoid un-initialized variables */
+ return false;
+ }
+}
+
CCL_NAMESPACE_END
#endif /* __UTIL_MATH_H__ */
diff --git a/intern/cycles/util/util_opencl.cpp b/intern/cycles/util/util_opencl.cpp
deleted file mode 100644
index c2d6bc66dc1..00000000000
--- a/intern/cycles/util/util_opencl.cpp
+++ /dev/null
@@ -1,337 +0,0 @@
-//////////////////////////////////////////////////////////////////////////
-// Copyright (c) 2009 Organic Vectory B.V.
-// Written by George van Venrooij
-//
-// Distributed under the Boost Software License, Version 1.0.
-// (See accompanying file doc/license/Boost.txt)
-// Extracted from the CLCC project - http://clcc.sourceforge.net/
-//////////////////////////////////////////////////////////////////////////
-
-#include <stdlib.h>
-
-#include "util_opencl.h"
-
-#ifndef CLCC_GENERATE_DOCUMENTATION
-#ifdef _WIN32
-# define WIN32_LEAN_AND_MEAN
-# define VC_EXTRALEAN
-# include <windows.h>
-
- typedef HMODULE CLCC_DYNLIB_HANDLE;
-
-# define CLCC_DYNLIB_OPEN LoadLibrary
-# define CLCC_DYNLIB_CLOSE FreeLibrary
-# define CLCC_DYNLIB_IMPORT GetProcAddress
-#else
-# include <dlfcn.h>
-
- typedef void* CLCC_DYNLIB_HANDLE;
-
-# define CLCC_DYNLIB_OPEN(path) dlopen(path, RTLD_NOW | RTLD_GLOBAL)
-# define CLCC_DYNLIB_CLOSE dlclose
-# define CLCC_DYNLIB_IMPORT dlsym
-#endif
-#else
-// typedef implementation_defined CLCC_DYNLIB_HANDLE;
-//# define CLCC_DYNLIB_OPEN(path) implementation_defined
-//# define CLCC_DYNLIB_CLOSE implementation_defined
-//# define CLCC_DYNLIB_IMPORT implementation_defined
-#endif
-
-CCL_NAMESPACE_BEGIN
-
-//! \brief module handle
-static CLCC_DYNLIB_HANDLE module = NULL;
-
-// Variables holding function entry points
-#ifndef CLCC_GENERATE_DOCUMENTATION
-PFNCLGETPLATFORMIDS __clewGetPlatformIDs = NULL;
-PFNCLGETPLATFORMINFO __clewGetPlatformInfo = NULL;
-PFNCLGETDEVICEIDS __clewGetDeviceIDs = NULL;
-PFNCLGETDEVICEINFO __clewGetDeviceInfo = NULL;
-PFNCLCREATECONTEXT __clewCreateContext = NULL;
-PFNCLCREATECONTEXTFROMTYPE __clewCreateContextFromType = NULL;
-PFNCLRETAINCONTEXT __clewRetainContext = NULL;
-PFNCLRELEASECONTEXT __clewReleaseContext = NULL;
-PFNCLGETCONTEXTINFO __clewGetContextInfo = NULL;
-PFNCLCREATECOMMANDQUEUE __clewCreateCommandQueue = NULL;
-PFNCLRETAINCOMMANDQUEUE __clewRetainCommandQueue = NULL;
-PFNCLRELEASECOMMANDQUEUE __clewReleaseCommandQueue = NULL;
-PFNCLGETCOMMANDQUEUEINFO __clewGetCommandQueueInfo = NULL;
-PFNCLSETCOMMANDQUEUEPROPERTY __clewSetCommandQueueProperty = NULL;
-PFNCLCREATEBUFFER __clewCreateBuffer = NULL;
-PFNCLCREATEIMAGE2D __clewCreateImage2D = NULL;
-PFNCLCREATEIMAGE3D __clewCreateImage3D = NULL;
-PFNCLRETAINMEMOBJECT __clewRetainMemObject = NULL;
-PFNCLRELEASEMEMOBJECT __clewReleaseMemObject = NULL;
-PFNCLGETSUPPORTEDIMAGEFORMATS __clewGetSupportedImageFormats = NULL;
-PFNCLGETMEMOBJECTINFO __clewGetMemObjectInfo = NULL;
-PFNCLGETIMAGEINFO __clewGetImageInfo = NULL;
-PFNCLCREATESAMPLER __clewCreateSampler = NULL;
-PFNCLRETAINSAMPLER __clewRetainSampler = NULL;
-PFNCLRELEASESAMPLER __clewReleaseSampler = NULL;
-PFNCLGETSAMPLERINFO __clewGetSamplerInfo = NULL;
-PFNCLCREATEPROGRAMWITHSOURCE __clewCreateProgramWithSource = NULL;
-PFNCLCREATEPROGRAMWITHBINARY __clewCreateProgramWithBinary = NULL;
-PFNCLRETAINPROGRAM __clewRetainProgram = NULL;
-PFNCLRELEASEPROGRAM __clewReleaseProgram = NULL;
-PFNCLBUILDPROGRAM __clewBuildProgram = NULL;
-PFNCLUNLOADCOMPILER __clewUnloadCompiler = NULL;
-PFNCLGETPROGRAMINFO __clewGetProgramInfo = NULL;
-PFNCLGETPROGRAMBUILDINFO __clewGetProgramBuildInfo = NULL;
-PFNCLCREATEKERNEL __clewCreateKernel = NULL;
-PFNCLCREATEKERNELSINPROGRAM __clewCreateKernelsInProgram = NULL;
-PFNCLRETAINKERNEL __clewRetainKernel = NULL;
-PFNCLRELEASEKERNEL __clewReleaseKernel = NULL;
-PFNCLSETKERNELARG __clewSetKernelArg = NULL;
-PFNCLGETKERNELINFO __clewGetKernelInfo = NULL;
-PFNCLGETKERNELWORKGROUPINFO __clewGetKernelWorkGroupInfo = NULL;
-PFNCLWAITFOREVENTS __clewWaitForEvents = NULL;
-PFNCLGETEVENTINFO __clewGetEventInfo = NULL;
-PFNCLRETAINEVENT __clewRetainEvent = NULL;
-PFNCLRELEASEEVENT __clewReleaseEvent = NULL;
-PFNCLGETEVENTPROFILINGINFO __clewGetEventProfilingInfo = NULL;
-PFNCLFLUSH __clewFlush = NULL;
-PFNCLFINISH __clewFinish = NULL;
-PFNCLENQUEUEREADBUFFER __clewEnqueueReadBuffer = NULL;
-PFNCLENQUEUEWRITEBUFFER __clewEnqueueWriteBuffer = NULL;
-PFNCLENQUEUECOPYBUFFER __clewEnqueueCopyBuffer = NULL;
-PFNCLENQUEUEREADIMAGE __clewEnqueueReadImage = NULL;
-PFNCLENQUEUEWRITEIMAGE __clewEnqueueWriteImage = NULL;
-PFNCLENQUEUECOPYIMAGE __clewEnqueueCopyImage = NULL;
-PFNCLENQUEUECOPYIMAGETOBUFFER __clewEnqueueCopyImageToBuffer = NULL;
-PFNCLENQUEUECOPYBUFFERTOIMAGE __clewEnqueueCopyBufferToImage = NULL;
-PFNCLENQUEUEMAPBUFFER __clewEnqueueMapBuffer = NULL;
-PFNCLENQUEUEMAPIMAGE __clewEnqueueMapImage = NULL;
-PFNCLENQUEUEUNMAPMEMOBJECT __clewEnqueueUnmapMemObject = NULL;
-PFNCLENQUEUENDRANGEKERNEL __clewEnqueueNDRangeKernel = NULL;
-PFNCLENQUEUETASK __clewEnqueueTask = NULL;
-PFNCLENQUEUENATIVEKERNEL __clewEnqueueNativeKernel = NULL;
-PFNCLENQUEUEMARKER __clewEnqueueMarker = NULL;
-PFNCLENQUEUEWAITFOREVENTS __clewEnqueueWaitForEvents = NULL;
-PFNCLENQUEUEBARRIER __clewEnqueueBarrier = NULL;
-PFNCLGETEXTENSIONFUNCTIONADDRESS __clewGetExtensionFunctionAddress = NULL;
-#endif // CLCC_GENERATE_DOCUMENTATION
-
-
-#if 0
-//! \brief Unloads OpenCL dynamic library, should not be called directly
-static void clewExit(void)
-{
- if (module != NULL)
- {
- // Ignore errors
- CLCC_DYNLIB_CLOSE(module);
- module = NULL;
- }
-}
-#endif
-
-//! \param path path to dynamic library to load
-//! \return CLEW_ERROR_OPEN_FAILED if the library could not be opened
-//! CLEW_ERROR_ATEXIT_FAILED if atexit(clewExit) failed
-//! CLEW_SUCCESS when the library was succesfully loaded
-int clLibraryInit()
-{
-#ifdef _WIN32
- const char *path = "OpenCL.dll";
-#elif defined(__APPLE__)
- const char *path = "/Library/Frameworks/OpenCL.framework/OpenCL";
-#else
- const char *path = "libOpenCL.so";
-#endif
-
- // OpenCL disabled for now, only works with this environment variable set
- if(!getenv("CYCLES_OPENCL_TEST"))
- return 0;
-
- // Check if already initialized
- if (module != NULL)
- {
- return 1;
- }
-
- // Load library
- module = CLCC_DYNLIB_OPEN(path);
-
- // Check for errors
- if (module == NULL)
- {
- return 0;
- }
-
- // Disabled because we retain OpenCL context and it's difficult to ensure
- // this will exit after releasing the context
-#if 0
- // Set unloading
- int error = atexit(clewExit);
-
- if (error)
- {
- // Failure queing atexit, shutdown with error
- CLCC_DYNLIB_CLOSE(module);
- module = NULL;
-
- return 0;
- }
-#endif
-
- // Determine function entry-points
- __clewGetPlatformIDs = (PFNCLGETPLATFORMIDS )CLCC_DYNLIB_IMPORT(module, "clGetPlatformIDs");
- __clewGetPlatformInfo = (PFNCLGETPLATFORMINFO )CLCC_DYNLIB_IMPORT(module, "clGetPlatformInfo");
- __clewGetDeviceIDs = (PFNCLGETDEVICEIDS )CLCC_DYNLIB_IMPORT(module, "clGetDeviceIDs");
- __clewGetDeviceInfo = (PFNCLGETDEVICEINFO )CLCC_DYNLIB_IMPORT(module, "clGetDeviceInfo");
- __clewCreateContext = (PFNCLCREATECONTEXT )CLCC_DYNLIB_IMPORT(module, "clCreateContext");
- __clewCreateContextFromType = (PFNCLCREATECONTEXTFROMTYPE )CLCC_DYNLIB_IMPORT(module, "clCreateContextFromType");
- __clewRetainContext = (PFNCLRETAINCONTEXT )CLCC_DYNLIB_IMPORT(module, "clRetainContext");
- __clewReleaseContext = (PFNCLRELEASECONTEXT )CLCC_DYNLIB_IMPORT(module, "clReleaseContext");
- __clewGetContextInfo = (PFNCLGETCONTEXTINFO )CLCC_DYNLIB_IMPORT(module, "clGetContextInfo");
- __clewCreateCommandQueue = (PFNCLCREATECOMMANDQUEUE )CLCC_DYNLIB_IMPORT(module, "clCreateCommandQueue");
- __clewRetainCommandQueue = (PFNCLRETAINCOMMANDQUEUE )CLCC_DYNLIB_IMPORT(module, "clRetainCommandQueue");
- __clewReleaseCommandQueue = (PFNCLRELEASECOMMANDQUEUE )CLCC_DYNLIB_IMPORT(module, "clReleaseCommandQueue");
- __clewGetCommandQueueInfo = (PFNCLGETCOMMANDQUEUEINFO )CLCC_DYNLIB_IMPORT(module, "clGetCommandQueueInfo");
- __clewSetCommandQueueProperty = (PFNCLSETCOMMANDQUEUEPROPERTY )CLCC_DYNLIB_IMPORT(module, "clSetCommandQueueProperty");
- __clewCreateBuffer = (PFNCLCREATEBUFFER )CLCC_DYNLIB_IMPORT(module, "clCreateBuffer");
- __clewCreateImage2D = (PFNCLCREATEIMAGE2D )CLCC_DYNLIB_IMPORT(module, "clCreateImage2D");
- __clewCreateImage3D = (PFNCLCREATEIMAGE3D )CLCC_DYNLIB_IMPORT(module, "clCreateImage3D");
- __clewRetainMemObject = (PFNCLRETAINMEMOBJECT )CLCC_DYNLIB_IMPORT(module, "clRetainMemObject");
- __clewReleaseMemObject = (PFNCLRELEASEMEMOBJECT )CLCC_DYNLIB_IMPORT(module, "clReleaseMemObject");
- __clewGetSupportedImageFormats = (PFNCLGETSUPPORTEDIMAGEFORMATS )CLCC_DYNLIB_IMPORT(module, "clGetSupportedImageFormats");
- __clewGetMemObjectInfo = (PFNCLGETMEMOBJECTINFO )CLCC_DYNLIB_IMPORT(module, "clGetMemObjectInfo");
- __clewGetImageInfo = (PFNCLGETIMAGEINFO )CLCC_DYNLIB_IMPORT(module, "clGetImageInfo");
- __clewCreateSampler = (PFNCLCREATESAMPLER )CLCC_DYNLIB_IMPORT(module, "clCreateSampler");
- __clewRetainSampler = (PFNCLRETAINSAMPLER )CLCC_DYNLIB_IMPORT(module, "clRetainSampler");
- __clewReleaseSampler = (PFNCLRELEASESAMPLER )CLCC_DYNLIB_IMPORT(module, "clReleaseSampler");
- __clewGetSamplerInfo = (PFNCLGETSAMPLERINFO )CLCC_DYNLIB_IMPORT(module, "clGetSamplerInfo");
- __clewCreateProgramWithSource = (PFNCLCREATEPROGRAMWITHSOURCE )CLCC_DYNLIB_IMPORT(module, "clCreateProgramWithSource");
- __clewCreateProgramWithBinary = (PFNCLCREATEPROGRAMWITHBINARY )CLCC_DYNLIB_IMPORT(module, "clCreateProgramWithBinary");
- __clewRetainProgram = (PFNCLRETAINPROGRAM )CLCC_DYNLIB_IMPORT(module, "clRetainProgram");
- __clewReleaseProgram = (PFNCLRELEASEPROGRAM )CLCC_DYNLIB_IMPORT(module, "clReleaseProgram");
- __clewBuildProgram = (PFNCLBUILDPROGRAM )CLCC_DYNLIB_IMPORT(module, "clBuildProgram");
- __clewUnloadCompiler = (PFNCLUNLOADCOMPILER )CLCC_DYNLIB_IMPORT(module, "clUnloadCompiler");
- __clewGetProgramInfo = (PFNCLGETPROGRAMINFO )CLCC_DYNLIB_IMPORT(module, "clGetProgramInfo");
- __clewGetProgramBuildInfo = (PFNCLGETPROGRAMBUILDINFO )CLCC_DYNLIB_IMPORT(module, "clGetProgramBuildInfo");
- __clewCreateKernel = (PFNCLCREATEKERNEL )CLCC_DYNLIB_IMPORT(module, "clCreateKernel");
- __clewCreateKernelsInProgram = (PFNCLCREATEKERNELSINPROGRAM )CLCC_DYNLIB_IMPORT(module, "clCreateKernelsInProgram");
- __clewRetainKernel = (PFNCLRETAINKERNEL )CLCC_DYNLIB_IMPORT(module, "clRetainKernel");
- __clewReleaseKernel = (PFNCLRELEASEKERNEL )CLCC_DYNLIB_IMPORT(module, "clReleaseKernel");
- __clewSetKernelArg = (PFNCLSETKERNELARG )CLCC_DYNLIB_IMPORT(module, "clSetKernelArg");
- __clewGetKernelInfo = (PFNCLGETKERNELINFO )CLCC_DYNLIB_IMPORT(module, "clGetKernelInfo");
- __clewGetKernelWorkGroupInfo = (PFNCLGETKERNELWORKGROUPINFO )CLCC_DYNLIB_IMPORT(module, "clGetKernelWorkGroupInfo");
- __clewWaitForEvents = (PFNCLWAITFOREVENTS )CLCC_DYNLIB_IMPORT(module, "clWaitForEvents");
- __clewGetEventInfo = (PFNCLGETEVENTINFO )CLCC_DYNLIB_IMPORT(module, "clGetEventInfo");
- __clewRetainEvent = (PFNCLRETAINEVENT )CLCC_DYNLIB_IMPORT(module, "clRetainEvent");
- __clewReleaseEvent = (PFNCLRELEASEEVENT )CLCC_DYNLIB_IMPORT(module, "clReleaseEvent");
- __clewGetEventProfilingInfo = (PFNCLGETEVENTPROFILINGINFO )CLCC_DYNLIB_IMPORT(module, "clGetEventProfilingInfo");
- __clewFlush = (PFNCLFLUSH )CLCC_DYNLIB_IMPORT(module, "clFlush");
- __clewFinish = (PFNCLFINISH )CLCC_DYNLIB_IMPORT(module, "clFinish");
- __clewEnqueueReadBuffer = (PFNCLENQUEUEREADBUFFER )CLCC_DYNLIB_IMPORT(module, "clEnqueueReadBuffer");
- __clewEnqueueWriteBuffer = (PFNCLENQUEUEWRITEBUFFER )CLCC_DYNLIB_IMPORT(module, "clEnqueueWriteBuffer");
- __clewEnqueueCopyBuffer = (PFNCLENQUEUECOPYBUFFER )CLCC_DYNLIB_IMPORT(module, "clEnqueueCopyBuffer");
- __clewEnqueueReadImage = (PFNCLENQUEUEREADIMAGE )CLCC_DYNLIB_IMPORT(module, "clEnqueueReadImage");
- __clewEnqueueWriteImage = (PFNCLENQUEUEWRITEIMAGE )CLCC_DYNLIB_IMPORT(module, "clEnqueueWriteImage");
- __clewEnqueueCopyImage = (PFNCLENQUEUECOPYIMAGE )CLCC_DYNLIB_IMPORT(module, "clEnqueueCopyImage");
- __clewEnqueueCopyImageToBuffer = (PFNCLENQUEUECOPYIMAGETOBUFFER )CLCC_DYNLIB_IMPORT(module, "clEnqueueCopyImageToBuffer");
- __clewEnqueueCopyBufferToImage = (PFNCLENQUEUECOPYBUFFERTOIMAGE )CLCC_DYNLIB_IMPORT(module, "clEnqueueCopyBufferToImage");
- __clewEnqueueMapBuffer = (PFNCLENQUEUEMAPBUFFER )CLCC_DYNLIB_IMPORT(module, "clEnqueueMapBuffer");
- __clewEnqueueMapImage = (PFNCLENQUEUEMAPIMAGE )CLCC_DYNLIB_IMPORT(module, "clEnqueueMapImage");
- __clewEnqueueUnmapMemObject = (PFNCLENQUEUEUNMAPMEMOBJECT )CLCC_DYNLIB_IMPORT(module, "clEnqueueUnmapMemObject");
- __clewEnqueueNDRangeKernel = (PFNCLENQUEUENDRANGEKERNEL )CLCC_DYNLIB_IMPORT(module, "clEnqueueNDRangeKernel");
- __clewEnqueueTask = (PFNCLENQUEUETASK )CLCC_DYNLIB_IMPORT(module, "clEnqueueTask");
- __clewEnqueueNativeKernel = (PFNCLENQUEUENATIVEKERNEL )CLCC_DYNLIB_IMPORT(module, "clEnqueueNativeKernel");
- __clewEnqueueMarker = (PFNCLENQUEUEMARKER )CLCC_DYNLIB_IMPORT(module, "clEnqueueMarker");
- __clewEnqueueWaitForEvents = (PFNCLENQUEUEWAITFOREVENTS )CLCC_DYNLIB_IMPORT(module, "clEnqueueWaitForEvents");
- __clewEnqueueBarrier = (PFNCLENQUEUEBARRIER )CLCC_DYNLIB_IMPORT(module, "clEnqueueBarrier");
- __clewGetExtensionFunctionAddress = (PFNCLGETEXTENSIONFUNCTIONADDRESS )CLCC_DYNLIB_IMPORT(module, "clGetExtensionFunctionAddress");
-
- if(__clewGetPlatformIDs == NULL) return 0;
- if(__clewGetPlatformInfo == NULL) return 0;
- if(__clewGetDeviceIDs == NULL) return 0;
- if(__clewGetDeviceInfo == NULL) return 0;
-
- return 1;
-}
-
-//! \param error CL error code
-//! \return a string representation of the error code
-const char *clErrorString(cl_int error)
-{
- static const char* strings[] =
- {
- // Error Codes
- "CL_SUCCESS" // 0
- , "CL_DEVICE_NOT_FOUND" // -1
- , "CL_DEVICE_NOT_AVAILABLE" // -2
- , "CL_COMPILER_NOT_AVAILABLE" // -3
- , "CL_MEM_OBJECT_ALLOCATION_FAILURE" // -4
- , "CL_OUT_OF_RESOURCES" // -5
- , "CL_OUT_OF_HOST_MEMORY" // -6
- , "CL_PROFILING_INFO_NOT_AVAILABLE" // -7
- , "CL_MEM_COPY_OVERLAP" // -8
- , "CL_IMAGE_FORMAT_MISMATCH" // -9
- , "CL_IMAGE_FORMAT_NOT_SUPPORTED" // -10
- , "CL_BUILD_PROGRAM_FAILURE" // -11
- , "CL_MAP_FAILURE" // -12
-
- , "" // -13
- , "" // -14
- , "" // -15
- , "" // -16
- , "" // -17
- , "" // -18
- , "" // -19
-
- , "" // -20
- , "" // -21
- , "" // -22
- , "" // -23
- , "" // -24
- , "" // -25
- , "" // -26
- , "" // -27
- , "" // -28
- , "" // -29
-
- , "CL_INVALID_VALUE" // -30
- , "CL_INVALID_DEVICE_TYPE" // -31
- , "CL_INVALID_PLATFORM" // -32
- , "CL_INVALID_DEVICE" // -33
- , "CL_INVALID_CONTEXT" // -34
- , "CL_INVALID_QUEUE_PROPERTIES" // -35
- , "CL_INVALID_COMMAND_QUEUE" // -36
- , "CL_INVALID_HOST_PTR" // -37
- , "CL_INVALID_MEM_OBJECT" // -38
- , "CL_INVALID_IMAGE_FORMAT_DESCRIPTOR" // -39
- , "CL_INVALID_IMAGE_SIZE" // -40
- , "CL_INVALID_SAMPLER" // -41
- , "CL_INVALID_BINARY" // -42
- , "CL_INVALID_BUILD_OPTIONS" // -43
- , "CL_INVALID_PROGRAM" // -44
- , "CL_INVALID_PROGRAM_EXECUTABLE" // -45
- , "CL_INVALID_KERNEL_NAME" // -46
- , "CL_INVALID_KERNEL_DEFINITION" // -47
- , "CL_INVALID_KERNEL" // -48
- , "CL_INVALID_ARG_INDEX" // -49
- , "CL_INVALID_ARG_VALUE" // -50
- , "CL_INVALID_ARG_SIZE" // -51
- , "CL_INVALID_KERNEL_ARGS" // -52
- , "CL_INVALID_WORK_DIMENSION" // -53
- , "CL_INVALID_WORK_GROUP_SIZE" // -54
- , "CL_INVALID_WORK_ITEM_SIZE" // -55
- , "CL_INVALID_GLOBAL_OFFSET" // -56
- , "CL_INVALID_EVENT_WAIT_LIST" // -57
- , "CL_INVALID_EVENT" // -58
- , "CL_INVALID_OPERATION" // -59
- , "CL_INVALID_GL_OBJECT" // -60
- , "CL_INVALID_BUFFER_SIZE" // -61
- , "CL_INVALID_MIP_LEVEL" // -62
- , "CL_INVALID_GLOBAL_WORK_SIZE" // -63
- };
-
- return strings[-error];
-}
-
-CCL_NAMESPACE_END
-
-#ifdef CLCC_DYNLIB_CLOSE
-#endif
diff --git a/intern/cycles/util/util_opencl.h b/intern/cycles/util/util_opencl.h
deleted file mode 100644
index 141c5e38273..00000000000
--- a/intern/cycles/util/util_opencl.h
+++ /dev/null
@@ -1,1313 +0,0 @@
-//////////////////////////////////////////////////////////////////////////
-// Copyright (c) 2009 Organic Vectory B.V.
-// Written by George van Venrooij
-//
-// Distributed under the Boost Software License, Version 1.0.
-// (See accompanying file doc/license/Boost.txt)
-// Extracted from the CLCC project - http://clcc.sourceforge.net/
-//////////////////////////////////////////////////////////////////////////
-
-#ifndef __UTIL_OPENCL_H__
-#define __UTIL_OPENCL_H__
-
-CCL_NAMESPACE_BEGIN
-
-//! This file contains a copy of the contents of CL.H and CL_PLATFORM.H from the
-//! official OpenCL spec. The purpose of this code is to load the OpenCL dynamic
-//! library at run-time and thus allow the executable to function on many
-//! platforms regardless of the vendor of the OpenCL driver actually installed.
-//! Some of the techniques used here were inspired by work done in the GLEW
-//! library (http://glew.sourceforge.net/)
-
-// Run-time dynamic linking functionality based on concepts used in GLEW
-#ifdef __OPENCL_CL_H
-#error cl.h included before clew.h
-#endif
-
-#ifdef __OPENCL_CL_PLATFORM_H
-#error cl_platform.h included before clew.h
-#endif
-
-#ifndef CLCC_GENERATE_DOCUMENTATION
-// Prevent cl.h inclusion
-#define __OPENCL_CL_H
-// Prevent cl_platform.h inclusion
-#define __CL_PLATFORM_H
-#endif // CLCC_GENERATE_DOCUMENTATION
-
-/*******************************************************************************
- * Copyright (c) 2008-2009 The Khronos Group Inc.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and/or associated documentation files (the
- * "Materials"), to deal in the Materials without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sublicense, and/or sell copies of the Materials, and to
- * permit persons to whom the Materials are furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice shall be included
- * in all copies or substantial portions of the Materials.
- *
- * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
- * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
- ******************************************************************************/
-
-#ifndef CLCC_GENERATE_DOCUMENTATION
-
-#if defined(_WIN32)
-#define CL_API_ENTRY
-#define CL_API_CALL __stdcall
-#else
-#define CL_API_ENTRY
-#define CL_API_CALL
-#endif
-
-#define CL_API_SUFFIX__VERSION_1_0
-
-#if defined(_WIN32) && defined(_MSC_VER)
-
-/* scalar types */
-typedef signed __int8 cl_char;
-typedef unsigned __int8 cl_uchar;
-typedef signed __int16 cl_short;
-typedef unsigned __int16 cl_ushort;
-typedef signed __int32 cl_int;
-typedef unsigned __int32 cl_uint;
-typedef signed __int64 cl_long;
-typedef unsigned __int64 cl_ulong;
-
-typedef unsigned __int16 cl_half;
-typedef float cl_float;
-typedef double cl_double;
-
-
-/*
- * Vector types
- *
- * Note: OpenCL requires that all types be naturally aligned.
- * This means that vector types must be naturally aligned.
- * For example, a vector of four floats must be aligned to
- * a 16 byte boundary (calculated as 4 * the natural 4-byte
- * alignment of the float). The alignment qualifiers here
- * will only function properly if your compiler supports them
- * and if you don't actively work to defeat them. For example,
- * in order for a cl_float4 to be 16 byte aligned in a struct,
- * the start of the struct must itself be 16-byte aligned.
- *
- * Maintaining proper alignment is the user's responsibility.
- */
-typedef signed __int8 cl_char2[2];
-typedef signed __int8 cl_char4[4];
-typedef signed __int8 cl_char8[8];
-typedef signed __int8 cl_char16[16];
-typedef unsigned __int8 cl_uchar2[2];
-typedef unsigned __int8 cl_uchar4[4];
-typedef unsigned __int8 cl_uchar8[8];
-typedef unsigned __int8 cl_uchar16[16];
-
-typedef signed __int16 cl_short2[2];
-typedef signed __int16 cl_short4[4];
-typedef signed __int16 cl_short8[8];
-typedef signed __int16 cl_short16[16];
-typedef unsigned __int16 cl_ushort2[2];
-typedef unsigned __int16 cl_ushort4[4];
-typedef unsigned __int16 cl_ushort8[8];
-typedef unsigned __int16 cl_ushort16[16];
-
-typedef signed __int32 cl_int2[2];
-typedef signed __int32 cl_int4[4];
-typedef signed __int32 cl_int8[8];
-typedef signed __int32 cl_int16[16];
-typedef unsigned __int32 cl_uint2[2];
-typedef unsigned __int32 cl_uint4[4];
-typedef unsigned __int32 cl_uint8[8];
-typedef unsigned __int32 cl_uint16[16];
-
-typedef signed __int64 cl_long2[2];
-typedef signed __int64 cl_long4[4];
-typedef signed __int64 cl_long8[8];
-typedef signed __int64 cl_long16[16];
-typedef unsigned __int64 cl_ulong2[2];
-typedef unsigned __int64 cl_ulong4[4];
-typedef unsigned __int64 cl_ulong8[8];
-typedef unsigned __int64 cl_ulong16[16];
-
-typedef float cl_float2[2];
-typedef float cl_float4[4];
-typedef float cl_float8[8];
-typedef float cl_float16[16];
-
-typedef double cl_double2[2];
-typedef double cl_double4[4];
-typedef double cl_double8[8];
-typedef double cl_double16[16];
-/* There are no vector types for half */
-
-#else
-
-#include <stdint.h>
-
-/* scalar types */
-typedef int8_t cl_char;
-typedef uint8_t cl_uchar;
-typedef int16_t cl_short __attribute__((aligned(2)));
-typedef uint16_t cl_ushort __attribute__((aligned(2)));
-typedef int32_t cl_int __attribute__((aligned(4)));
-typedef uint32_t cl_uint __attribute__((aligned(4)));
-typedef int64_t cl_long __attribute__((aligned(8)));
-typedef uint64_t cl_ulong __attribute__((aligned(8)));
-
-typedef uint16_t cl_half __attribute__((aligned(2)));
-typedef float cl_float __attribute__((aligned(4)));
-typedef double cl_double __attribute__((aligned(8)));
-
-/*
- * Vector types
- *
- * Note: OpenCL requires that all types be naturally aligned.
- * This means that vector types must be naturally aligned.
- * For example, a vector of four floats must be aligned to
- * a 16 byte boundary (calculated as 4 * the natural 4-byte
- * alignment of the float). The alignment qualifiers here
- * will only function properly if your compiler supports them
- * and if you don't actively work to defeat them. For example,
- * in order for a cl_float4 to be 16 byte aligned in a struct,
- * the start of the struct must itself be 16-byte aligned.
- *
- * Maintaining proper alignment is the user's responsibility.
- */
-typedef int8_t cl_char2[2] __attribute__((aligned(2)));
-typedef int8_t cl_char4[4] __attribute__((aligned(4)));
-typedef int8_t cl_char8[8] __attribute__((aligned(8)));
-typedef int8_t cl_char16[16] __attribute__((aligned(16)));
-typedef uint8_t cl_uchar2[2] __attribute__((aligned(2)));
-typedef uint8_t cl_uchar4[4] __attribute__((aligned(4)));
-typedef uint8_t cl_uchar8[8] __attribute__((aligned(8)));
-typedef uint8_t cl_uchar16[16] __attribute__((aligned(16)));
-
-typedef int16_t cl_short2[2] __attribute__((aligned(4)));
-typedef int16_t cl_short4[4] __attribute__((aligned(8)));
-typedef int16_t cl_short8[8] __attribute__((aligned(16)));
-typedef int16_t cl_short16[16] __attribute__((aligned(32)));
-typedef uint16_t cl_ushort2[2] __attribute__((aligned(4)));
-typedef uint16_t cl_ushort4[4] __attribute__((aligned(8)));
-typedef uint16_t cl_ushort8[8] __attribute__((aligned(16)));
-typedef uint16_t cl_ushort16[16] __attribute__((aligned(32)));
-
-typedef int32_t cl_int2[2] __attribute__((aligned(8)));
-typedef int32_t cl_int4[4] __attribute__((aligned(16)));
-typedef int32_t cl_int8[8] __attribute__((aligned(32)));
-typedef int32_t cl_int16[16] __attribute__((aligned(64)));
-typedef uint32_t cl_uint2[2] __attribute__((aligned(8)));
-typedef uint32_t cl_uint4[4] __attribute__((aligned(16)));
-typedef uint32_t cl_uint8[8] __attribute__((aligned(32)));
-typedef uint32_t cl_uint16[16] __attribute__((aligned(64)));
-
-typedef int64_t cl_long2[2] __attribute__((aligned(16)));
-typedef int64_t cl_long4[4] __attribute__((aligned(32)));
-typedef int64_t cl_long8[8] __attribute__((aligned(64)));
-typedef int64_t cl_long16[16] __attribute__((aligned(128)));
-typedef uint64_t cl_ulong2[2] __attribute__((aligned(16)));
-typedef uint64_t cl_ulong4[4] __attribute__((aligned(32)));
-typedef uint64_t cl_ulong8[8] __attribute__((aligned(64)));
-typedef uint64_t cl_ulong16[16] __attribute__((aligned(128)));
-
-typedef float cl_float2[2] __attribute__((aligned(8)));
-typedef float cl_float4[4] __attribute__((aligned(16)));
-typedef float cl_float8[8] __attribute__((aligned(32)));
-typedef float cl_float16[16] __attribute__((aligned(64)));
-
-typedef double cl_double2[2] __attribute__((aligned(16)));
-typedef double cl_double4[4] __attribute__((aligned(32)));
-typedef double cl_double8[8] __attribute__((aligned(64)));
-typedef double cl_double16[16] __attribute__((aligned(128)));
-
-/* There are no vector types for half */
-
-#endif
-
-/******************************************************************************/
-
-// Macro names and corresponding values defined by OpenCL
-
-#define CL_CHAR_BIT 8
-#define CL_SCHAR_MAX 127
-#define CL_SCHAR_MIN (-127-1)
-#define CL_CHAR_MAX CL_SCHAR_MAX
-#define CL_CHAR_MIN CL_SCHAR_MIN
-#define CL_UCHAR_MAX 255
-#define CL_SHRT_MAX 32767
-#define CL_SHRT_MIN (-32767-1)
-#define CL_USHRT_MAX 65535
-#define CL_INT_MAX 2147483647
-#define CL_INT_MIN (-2147483647-1)
-#define CL_UINT_MAX 0xffffffffU
-#define CL_LONG_MAX ((cl_long) 0x7FFFFFFFFFFFFFFFLL)
-#define CL_LONG_MIN ((cl_long) -0x7FFFFFFFFFFFFFFFLL - 1LL)
-#define CL_ULONG_MAX ((cl_ulong) 0xFFFFFFFFFFFFFFFFULL)
-
-#define CL_FLT_DIG 6
-#define CL_FLT_MANT_DIG 24
-#define CL_FLT_MAX_10_EXP +38
-#define CL_FLT_MAX_EXP +128
-#define CL_FLT_MIN_10_EXP -37
-#define CL_FLT_MIN_EXP -125
-#define CL_FLT_RADIX 2
-#if defined(_MSC_VER)
-// MSVC doesn't understand hex floats
-#define CL_FLT_MAX 3.402823466e+38F
-#define CL_FLT_MIN 1.175494351e-38F
-#define CL_FLT_EPSILON 1.192092896e-07F
-#else
-#define CL_FLT_MAX 0x1.fffffep127f
-#define CL_FLT_MIN 0x1.0p-126f
-#define CL_FLT_EPSILON 0x1.0p-23f
-#endif
-
-#define CL_DBL_DIG 15
-#define CL_DBL_MANT_DIG 53
-#define CL_DBL_MAX_10_EXP +308
-#define CL_DBL_MAX_EXP +1024
-#define CL_DBL_MIN_10_EXP -307
-#define CL_DBL_MIN_EXP -1021
-#define CL_DBL_RADIX 2
-#if defined(_MSC_VER)
-// MSVC doesn't understand hex floats
-#define CL_DBL_MAX 1.7976931348623158e+308
-#define CL_DBL_MIN 2.2250738585072014e-308
-#define CL_DBL_EPSILON 2.2204460492503131e-016
-#else
-#define CL_DBL_MAX 0x1.fffffffffffffp1023
-#define CL_DBL_MIN 0x1.0p-1022
-#define CL_DBL_EPSILON 0x1.0p-52
-#endif
-
-#include <stddef.h>
-
-
-// CL.h contents
-/******************************************************************************/
-
-typedef struct _cl_platform_id * cl_platform_id;
-typedef struct _cl_device_id * cl_device_id;
-typedef struct _cl_context * cl_context;
-typedef struct _cl_command_queue * cl_command_queue;
-typedef struct _cl_mem * cl_mem;
-typedef struct _cl_program * cl_program;
-typedef struct _cl_kernel * cl_kernel;
-typedef struct _cl_event * cl_event;
-typedef struct _cl_sampler * cl_sampler;
-
-/* WARNING! Unlike cl_ types in cl_platform.h,
- * cl_bool is not guaranteed to be the same size as the bool in kernels. */
-typedef cl_uint cl_bool;
-typedef cl_ulong cl_bitfield;
-typedef cl_bitfield cl_device_type;
-typedef cl_uint cl_platform_info;
-typedef cl_uint cl_device_info;
-typedef cl_bitfield cl_device_address_info;
-typedef cl_bitfield cl_device_fp_config;
-typedef cl_uint cl_device_mem_cache_type;
-typedef cl_uint cl_device_local_mem_type;
-typedef cl_bitfield cl_device_exec_capabilities;
-typedef cl_bitfield cl_command_queue_properties;
-
-typedef intptr_t cl_context_properties;
-typedef cl_uint cl_context_info;
-typedef cl_uint cl_command_queue_info;
-typedef cl_uint cl_channel_order;
-typedef cl_uint cl_channel_type;
-typedef cl_bitfield cl_mem_flags;
-typedef cl_uint cl_mem_object_type;
-typedef cl_uint cl_mem_info;
-typedef cl_uint cl_image_info;
-typedef cl_uint cl_addressing_mode;
-typedef cl_uint cl_filter_mode;
-typedef cl_uint cl_sampler_info;
-typedef cl_bitfield cl_map_flags;
-typedef cl_uint cl_program_info;
-typedef cl_uint cl_program_build_info;
-typedef cl_int cl_build_status;
-typedef cl_uint cl_kernel_info;
-typedef cl_uint cl_kernel_work_group_info;
-typedef cl_uint cl_event_info;
-typedef cl_uint cl_command_type;
-typedef cl_uint cl_profiling_info;
-
-typedef struct _cl_image_format {
- cl_channel_order image_channel_order;
- cl_channel_type image_channel_data_type;
-} cl_image_format;
-
-
-
-/******************************************************************************/
-
-// Error Codes
-#define CL_SUCCESS 0
-#define CL_DEVICE_NOT_FOUND -1
-#define CL_DEVICE_NOT_AVAILABLE -2
-#define CL_COMPILER_NOT_AVAILABLE -3
-#define CL_MEM_OBJECT_ALLOCATION_FAILURE -4
-#define CL_OUT_OF_RESOURCES -5
-#define CL_OUT_OF_HOST_MEMORY -6
-#define CL_PROFILING_INFO_NOT_AVAILABLE -7
-#define CL_MEM_COPY_OVERLAP -8
-#define CL_IMAGE_FORMAT_MISMATCH -9
-#define CL_IMAGE_FORMAT_NOT_SUPPORTED -10
-#define CL_BUILD_PROGRAM_FAILURE -11
-#define CL_MAP_FAILURE -12
-
-#define CL_INVALID_VALUE -30
-#define CL_INVALID_DEVICE_TYPE -31
-#define CL_INVALID_PLATFORM -32
-#define CL_INVALID_DEVICE -33
-#define CL_INVALID_CONTEXT -34
-#define CL_INVALID_QUEUE_PROPERTIES -35
-#define CL_INVALID_COMMAND_QUEUE -36
-#define CL_INVALID_HOST_PTR -37
-#define CL_INVALID_MEM_OBJECT -38
-#define CL_INVALID_IMAGE_FORMAT_DESCRIPTOR -39
-#define CL_INVALID_IMAGE_SIZE -40
-#define CL_INVALID_SAMPLER -41
-#define CL_INVALID_BINARY -42
-#define CL_INVALID_BUILD_OPTIONS -43
-#define CL_INVALID_PROGRAM -44
-#define CL_INVALID_PROGRAM_EXECUTABLE -45
-#define CL_INVALID_KERNEL_NAME -46
-#define CL_INVALID_KERNEL_DEFINITION -47
-#define CL_INVALID_KERNEL -48
-#define CL_INVALID_ARG_INDEX -49
-#define CL_INVALID_ARG_VALUE -50
-#define CL_INVALID_ARG_SIZE -51
-#define CL_INVALID_KERNEL_ARGS -52
-#define CL_INVALID_WORK_DIMENSION -53
-#define CL_INVALID_WORK_GROUP_SIZE -54
-#define CL_INVALID_WORK_ITEM_SIZE -55
-#define CL_INVALID_GLOBAL_OFFSET -56
-#define CL_INVALID_EVENT_WAIT_LIST -57
-#define CL_INVALID_EVENT -58
-#define CL_INVALID_OPERATION -59
-#define CL_INVALID_GL_OBJECT -60
-#define CL_INVALID_BUFFER_SIZE -61
-#define CL_INVALID_MIP_LEVEL -62
-#define CL_INVALID_GLOBAL_WORK_SIZE -63
-
-// OpenCL Version
-#define CL_VERSION_1_0 1
-
-// cl_bool
-#define CL_FALSE 0
-#define CL_TRUE 1
-
-// cl_platform_info
-#define CL_PLATFORM_PROFILE 0x0900
-#define CL_PLATFORM_VERSION 0x0901
-#define CL_PLATFORM_NAME 0x0902
-#define CL_PLATFORM_VENDOR 0x0903
-#define CL_PLATFORM_EXTENSIONS 0x0904
-
-// cl_device_type - bitfield
-#define CL_DEVICE_TYPE_DEFAULT (1 << 0)
-#define CL_DEVICE_TYPE_CPU (1 << 1)
-#define CL_DEVICE_TYPE_GPU (1 << 2)
-#define CL_DEVICE_TYPE_ACCELERATOR (1 << 3)
-#define CL_DEVICE_TYPE_ALL 0xFFFFFFFF
-
-// cl_device_info
-#define CL_DEVICE_TYPE 0x1000
-#define CL_DEVICE_VENDOR_ID 0x1001
-#define CL_DEVICE_MAX_COMPUTE_UNITS 0x1002
-#define CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS 0x1003
-#define CL_DEVICE_MAX_WORK_GROUP_SIZE 0x1004
-#define CL_DEVICE_MAX_WORK_ITEM_SIZES 0x1005
-#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR 0x1006
-#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT 0x1007
-#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT 0x1008
-#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG 0x1009
-#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT 0x100A
-#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE 0x100B
-#define CL_DEVICE_MAX_CLOCK_FREQUENCY 0x100C
-#define CL_DEVICE_ADDRESS_BITS 0x100D
-#define CL_DEVICE_MAX_READ_IMAGE_ARGS 0x100E
-#define CL_DEVICE_MAX_WRITE_IMAGE_ARGS 0x100F
-#define CL_DEVICE_MAX_MEM_ALLOC_SIZE 0x1010
-#define CL_DEVICE_IMAGE2D_MAX_WIDTH 0x1011
-#define CL_DEVICE_IMAGE2D_MAX_HEIGHT 0x1012
-#define CL_DEVICE_IMAGE3D_MAX_WIDTH 0x1013
-#define CL_DEVICE_IMAGE3D_MAX_HEIGHT 0x1014
-#define CL_DEVICE_IMAGE3D_MAX_DEPTH 0x1015
-#define CL_DEVICE_IMAGE_SUPPORT 0x1016
-#define CL_DEVICE_MAX_PARAMETER_SIZE 0x1017
-#define CL_DEVICE_MAX_SAMPLERS 0x1018
-#define CL_DEVICE_MEM_BASE_ADDR_ALIGN 0x1019
-#define CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE 0x101A
-#define CL_DEVICE_SINGLE_FP_CONFIG 0x101B
-#define CL_DEVICE_GLOBAL_MEM_CACHE_TYPE 0x101C
-#define CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE 0x101D
-#define CL_DEVICE_GLOBAL_MEM_CACHE_SIZE 0x101E
-#define CL_DEVICE_GLOBAL_MEM_SIZE 0x101F
-#define CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE 0x1020
-#define CL_DEVICE_MAX_CONSTANT_ARGS 0x1021
-#define CL_DEVICE_LOCAL_MEM_TYPE 0x1022
-#define CL_DEVICE_LOCAL_MEM_SIZE 0x1023
-#define CL_DEVICE_ERROR_CORRECTION_SUPPORT 0x1024
-#define CL_DEVICE_PROFILING_TIMER_RESOLUTION 0x1025
-#define CL_DEVICE_ENDIAN_LITTLE 0x1026
-#define CL_DEVICE_AVAILABLE 0x1027
-#define CL_DEVICE_COMPILER_AVAILABLE 0x1028
-#define CL_DEVICE_EXECUTION_CAPABILITIES 0x1029
-#define CL_DEVICE_QUEUE_PROPERTIES 0x102A
-#define CL_DEVICE_NAME 0x102B
-#define CL_DEVICE_VENDOR 0x102C
-#define CL_DRIVER_VERSION 0x102D
-#define CL_DEVICE_PROFILE 0x102E
-#define CL_DEVICE_VERSION 0x102F
-#define CL_DEVICE_EXTENSIONS 0x1030
-#define CL_DEVICE_PLATFORM 0x1031
-/* 0x1032 reserved for CL_DEVICE_DOUBLE_FP_CONFIG */
-/* 0x1033 reserved for CL_DEVICE_HALF_FP_CONFIG */
-#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_HALF 0x1034
-#define CL_DEVICE_HOST_UNIFIED_MEMORY 0x1035
-#define CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR 0x1036
-#define CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT 0x1037
-#define CL_DEVICE_NATIVE_VECTOR_WIDTH_INT 0x1038
-#define CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG 0x1039
-#define CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT 0x103A
-#define CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE 0x103B
-#define CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF 0x103C
-#define CL_DEVICE_OPENCL_C_VERSION 0x103D
-
-// cl_device_fp_config - bitfield
-#define CL_FP_DENORM (1 << 0)
-#define CL_FP_INF_NAN (1 << 1)
-#define CL_FP_ROUND_TO_NEAREST (1 << 2)
-#define CL_FP_ROUND_TO_ZERO (1 << 3)
-#define CL_FP_ROUND_TO_INF (1 << 4)
-#define CL_FP_FMA (1 << 5)
-
-// cl_device_mem_cache_type
-#define CL_NONE 0x0
-#define CL_READ_ONLY_CACHE 0x1
-#define CL_READ_WRITE_CACHE 0x2
-
-// cl_device_local_mem_type
-#define CL_LOCAL 0x1
-#define CL_GLOBAL 0x2
-
-// cl_device_exec_capabilities - bitfield
-#define CL_EXEC_KERNEL (1 << 0)
-#define CL_EXEC_NATIVE_KERNEL (1 << 1)
-
-// cl_command_queue_properties - bitfield
-#define CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE (1 << 0)
-#define CL_QUEUE_PROFILING_ENABLE (1 << 1)
-
-// cl_context_info
-#define CL_CONTEXT_REFERENCE_COUNT 0x1080
-#define CL_CONTEXT_DEVICES 0x1081
-#define CL_CONTEXT_PROPERTIES 0x1082
-
-// cl_context_properties
-#define CL_CONTEXT_PLATFORM 0x1084
-
-// cl_command_queue_info
-#define CL_QUEUE_CONTEXT 0x1090
-#define CL_QUEUE_DEVICE 0x1091
-#define CL_QUEUE_REFERENCE_COUNT 0x1092
-#define CL_QUEUE_PROPERTIES 0x1093
-
-// cl_mem_flags - bitfield
-#define CL_MEM_READ_WRITE (1 << 0)
-#define CL_MEM_WRITE_ONLY (1 << 1)
-#define CL_MEM_READ_ONLY (1 << 2)
-#define CL_MEM_USE_HOST_PTR (1 << 3)
-#define CL_MEM_ALLOC_HOST_PTR (1 << 4)
-#define CL_MEM_COPY_HOST_PTR (1 << 5)
-
-// cl_channel_order
-#define CL_R 0x10B0
-#define CL_A 0x10B1
-#define CL_RG 0x10B2
-#define CL_RA 0x10B3
-#define CL_RGB 0x10B4
-#define CL_RGBA 0x10B5
-#define CL_BGRA 0x10B6
-#define CL_ARGB 0x10B7
-#define CL_INTENSITY 0x10B8
-#define CL_LUMINANCE 0x10B9
-
-// cl_channel_type
-#define CL_SNORM_INT8 0x10D0
-#define CL_SNORM_INT16 0x10D1
-#define CL_UNORM_INT8 0x10D2
-#define CL_UNORM_INT16 0x10D3
-#define CL_UNORM_SHORT_565 0x10D4
-#define CL_UNORM_SHORT_555 0x10D5
-#define CL_UNORM_INT_101010 0x10D6
-#define CL_SIGNED_INT8 0x10D7
-#define CL_SIGNED_INT16 0x10D8
-#define CL_SIGNED_INT32 0x10D9
-#define CL_UNSIGNED_INT8 0x10DA
-#define CL_UNSIGNED_INT16 0x10DB
-#define CL_UNSIGNED_INT32 0x10DC
-#define CL_HALF_FLOAT 0x10DD
-#define CL_FLOAT 0x10DE
-
-// cl_mem_object_type
-#define CL_MEM_OBJECT_BUFFER 0x10F0
-#define CL_MEM_OBJECT_IMAGE2D 0x10F1
-#define CL_MEM_OBJECT_IMAGE3D 0x10F2
-
-// cl_mem_info
-#define CL_MEM_TYPE 0x1100
-#define CL_MEM_FLAGS 0x1101
-#define CL_MEM_SIZE 0x1102
-#define CL_MEM_HOST_PTR 0x1103
-#define CL_MEM_MAP_COUNT 0x1104
-#define CL_MEM_REFERENCE_COUNT 0x1105
-#define CL_MEM_CONTEXT 0x1106
-
-// cl_image_info
-#define CL_IMAGE_FORMAT 0x1110
-#define CL_IMAGE_ELEMENT_SIZE 0x1111
-#define CL_IMAGE_ROW_PITCH 0x1112
-#define CL_IMAGE_SLICE_PITCH 0x1113
-#define CL_IMAGE_WIDTH 0x1114
-#define CL_IMAGE_HEIGHT 0x1115
-#define CL_IMAGE_DEPTH 0x1116
-
-// cl_addressing_mode
-#define CL_ADDRESS_NONE 0x1130
-#define CL_ADDRESS_CLAMP_TO_EDGE 0x1131
-#define CL_ADDRESS_CLAMP 0x1132
-#define CL_ADDRESS_REPEAT 0x1133
-
-// cl_filter_mode
-#define CL_FILTER_NEAREST 0x1140
-#define CL_FILTER_LINEAR 0x1141
-
-// cl_sampler_info
-#define CL_SAMPLER_REFERENCE_COUNT 0x1150
-#define CL_SAMPLER_CONTEXT 0x1151
-#define CL_SAMPLER_NORMALIZED_COORDS 0x1152
-#define CL_SAMPLER_ADDRESSING_MODE 0x1153
-#define CL_SAMPLER_FILTER_MODE 0x1154
-
-// cl_map_flags - bitfield
-#define CL_MAP_READ (1 << 0)
-#define CL_MAP_WRITE (1 << 1)
-
-// cl_program_info
-#define CL_PROGRAM_REFERENCE_COUNT 0x1160
-#define CL_PROGRAM_CONTEXT 0x1161
-#define CL_PROGRAM_NUM_DEVICES 0x1162
-#define CL_PROGRAM_DEVICES 0x1163
-#define CL_PROGRAM_SOURCE 0x1164
-#define CL_PROGRAM_BINARY_SIZES 0x1165
-#define CL_PROGRAM_BINARIES 0x1166
-
-// cl_program_build_info
-#define CL_PROGRAM_BUILD_STATUS 0x1181
-#define CL_PROGRAM_BUILD_OPTIONS 0x1182
-#define CL_PROGRAM_BUILD_LOG 0x1183
-
-// cl_build_status
-#define CL_BUILD_SUCCESS 0
-#define CL_BUILD_NONE -1
-#define CL_BUILD_ERROR -2
-#define CL_BUILD_IN_PROGRESS -3
-
-// cl_kernel_info
-#define CL_KERNEL_FUNCTION_NAME 0x1190
-#define CL_KERNEL_NUM_ARGS 0x1191
-#define CL_KERNEL_REFERENCE_COUNT 0x1192
-#define CL_KERNEL_CONTEXT 0x1193
-#define CL_KERNEL_PROGRAM 0x1194
-
-// cl_kernel_work_group_info
-#define CL_KERNEL_WORK_GROUP_SIZE 0x11B0
-#define CL_KERNEL_COMPILE_WORK_GROUP_SIZE 0x11B1
-#define CL_KERNEL_LOCAL_MEM_SIZE 0x11B2
-
-// cl_event_info
-#define CL_EVENT_COMMAND_QUEUE 0x11D0
-#define CL_EVENT_COMMAND_TYPE 0x11D1
-#define CL_EVENT_REFERENCE_COUNT 0x11D2
-#define CL_EVENT_COMMAND_EXECUTION_STATUS 0x11D3
-
-// cl_command_type
-#define CL_COMMAND_NDRANGE_KERNEL 0x11F0
-#define CL_COMMAND_TASK 0x11F1
-#define CL_COMMAND_NATIVE_KERNEL 0x11F2
-#define CL_COMMAND_READ_BUFFER 0x11F3
-#define CL_COMMAND_WRITE_BUFFER 0x11F4
-#define CL_COMMAND_COPY_BUFFER 0x11F5
-#define CL_COMMAND_READ_IMAGE 0x11F6
-#define CL_COMMAND_WRITE_IMAGE 0x11F7
-#define CL_COMMAND_COPY_IMAGE 0x11F8
-#define CL_COMMAND_COPY_IMAGE_TO_BUFFER 0x11F9
-#define CL_COMMAND_COPY_BUFFER_TO_IMAGE 0x11FA
-#define CL_COMMAND_MAP_BUFFER 0x11FB
-#define CL_COMMAND_MAP_IMAGE 0x11FC
-#define CL_COMMAND_UNMAP_MEM_OBJECT 0x11FD
-#define CL_COMMAND_MARKER 0x11FE
-#define CL_COMMAND_ACQUIRE_GL_OBJECTS 0x11FF
-#define CL_COMMAND_RELEASE_GL_OBJECTS 0x1200
-
-// command execution status
-#define CL_COMPLETE 0x0
-#define CL_RUNNING 0x1
-#define CL_SUBMITTED 0x2
-#define CL_QUEUED 0x3
-
-// cl_profiling_info
-#define CL_PROFILING_COMMAND_QUEUED 0x1280
-#define CL_PROFILING_COMMAND_SUBMIT 0x1281
-#define CL_PROFILING_COMMAND_START 0x1282
-#define CL_PROFILING_COMMAND_END 0x1283
-
-/********************************************************************************************************/
-
-/********************************************************************************************************/
-
-// Function signature typedef's
-
-// Platform API
-typedef CL_API_ENTRY cl_int (CL_API_CALL *
-PFNCLGETPLATFORMIDS)(cl_uint /* num_entries */,
- cl_platform_id * /* platforms */,
- cl_uint * /* num_platforms */) CL_API_SUFFIX__VERSION_1_0;
-
-typedef CL_API_ENTRY cl_int (CL_API_CALL *
-PFNCLGETPLATFORMINFO)(cl_platform_id /* platform */,
- cl_platform_info /* param_name */,
- size_t /* param_value_size */,
- void * /* param_value */,
- size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
-
-// Device APIs
-typedef CL_API_ENTRY cl_int (CL_API_CALL *
-PFNCLGETDEVICEIDS)(cl_platform_id /* platform */,
- cl_device_type /* device_type */,
- cl_uint /* num_entries */,
- cl_device_id * /* devices */,
- cl_uint * /* num_devices */) CL_API_SUFFIX__VERSION_1_0;
-
-typedef CL_API_ENTRY cl_int (CL_API_CALL *
-PFNCLGETDEVICEINFO)(cl_device_id /* device */,
- cl_device_info /* param_name */,
- size_t /* param_value_size */,
- void * /* param_value */,
- size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
-
-// Context APIs
-typedef CL_API_ENTRY cl_context (CL_API_CALL *
-PFNCLCREATECONTEXT)(const cl_context_properties * /* properties */,
- cl_uint /* num_devices */,
- const cl_device_id * /* devices */,
- void (*pfn_notify)(const char *, const void *, size_t, void *) /* pfn_notify */,
- void * /* user_data */,
- cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
-
-typedef CL_API_ENTRY cl_context (CL_API_CALL *
-PFNCLCREATECONTEXTFROMTYPE)(const cl_context_properties * /* properties */,
- cl_device_type /* device_type */,
- void (*pfn_notify)(const char *, const void *, size_t, void *) /* pfn_notify */,
- void * /* user_data */,
- cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
-
-typedef CL_API_ENTRY cl_int (CL_API_CALL *
-PFNCLRETAINCONTEXT)(cl_context /* context */) CL_API_SUFFIX__VERSION_1_0;
-
-typedef CL_API_ENTRY cl_int (CL_API_CALL *
-PFNCLRELEASECONTEXT)(cl_context /* context */) CL_API_SUFFIX__VERSION_1_0;
-
-typedef CL_API_ENTRY cl_int (CL_API_CALL *
-PFNCLGETCONTEXTINFO)(cl_context /* context */,
- cl_context_info /* param_name */,
- size_t /* param_value_size */,
- void * /* param_value */,
- size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
-
-// Command Queue APIs
-typedef CL_API_ENTRY cl_command_queue (CL_API_CALL *
-PFNCLCREATECOMMANDQUEUE)(cl_context /* context */,
- cl_device_id /* device */,
- cl_command_queue_properties /* properties */,
- cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
-
-typedef CL_API_ENTRY cl_int (CL_API_CALL *
-PFNCLRETAINCOMMANDQUEUE)(cl_command_queue /* command_queue */) CL_API_SUFFIX__VERSION_1_0;
-
-typedef CL_API_ENTRY cl_int (CL_API_CALL *
-PFNCLRELEASECOMMANDQUEUE)(cl_command_queue /* command_queue */) CL_API_SUFFIX__VERSION_1_0;
-
-typedef CL_API_ENTRY cl_int (CL_API_CALL *
-PFNCLGETCOMMANDQUEUEINFO)(cl_command_queue /* command_queue */,
- cl_command_queue_info /* param_name */,
- size_t /* param_value_size */,
- void * /* param_value */,
- size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
-
-typedef CL_API_ENTRY cl_int (CL_API_CALL *
-PFNCLSETCOMMANDQUEUEPROPERTY)(cl_command_queue /* command_queue */,
- cl_command_queue_properties /* properties */,
- cl_bool /* enable */,
- cl_command_queue_properties * /* old_properties */) CL_API_SUFFIX__VERSION_1_0;
-
-// Memory Object APIs
-typedef CL_API_ENTRY cl_mem (CL_API_CALL *
-PFNCLCREATEBUFFER)(cl_context /* context */,
- cl_mem_flags /* flags */,
- size_t /* size */,
- void * /* host_ptr */,
- cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
-
-typedef CL_API_ENTRY cl_mem (CL_API_CALL *
-PFNCLCREATEIMAGE2D)(cl_context /* context */,
- cl_mem_flags /* flags */,
- const cl_image_format * /* image_format */,
- size_t /* image_width */,
- size_t /* image_height */,
- size_t /* image_row_pitch */,
- void * /* host_ptr */,
- cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
-
-typedef CL_API_ENTRY cl_mem (CL_API_CALL *
-PFNCLCREATEIMAGE3D)(cl_context /* context */,
- cl_mem_flags /* flags */,
- const cl_image_format * /* image_format */,
- size_t /* image_width */,
- size_t /* image_height */,
- size_t /* image_depth */,
- size_t /* image_row_pitch */,
- size_t /* image_slice_pitch */,
- void * /* host_ptr */,
- cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
-
-typedef CL_API_ENTRY cl_int (CL_API_CALL *
-PFNCLRETAINMEMOBJECT)(cl_mem /* memobj */) CL_API_SUFFIX__VERSION_1_0;
-
-typedef CL_API_ENTRY cl_int (CL_API_CALL *
-PFNCLRELEASEMEMOBJECT)(cl_mem /* memobj */) CL_API_SUFFIX__VERSION_1_0;
-
-typedef CL_API_ENTRY cl_int (CL_API_CALL *
-PFNCLGETSUPPORTEDIMAGEFORMATS)(cl_context /* context */,
- cl_mem_flags /* flags */,
- cl_mem_object_type /* image_type */,
- cl_uint /* num_entries */,
- cl_image_format * /* image_formats */,
- cl_uint * /* num_image_formats */) CL_API_SUFFIX__VERSION_1_0;
-
-typedef CL_API_ENTRY cl_int (CL_API_CALL *
-PFNCLGETMEMOBJECTINFO)(cl_mem /* memobj */,
- cl_mem_info /* param_name */,
- size_t /* param_value_size */,
- void * /* param_value */,
- size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
-
-typedef CL_API_ENTRY cl_int (CL_API_CALL *
-PFNCLGETIMAGEINFO)(cl_mem /* image */,
- cl_image_info /* param_name */,
- size_t /* param_value_size */,
- void * /* param_value */,
- size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
-
-// Sampler APIs
-typedef CL_API_ENTRY cl_sampler (CL_API_CALL *
-PFNCLCREATESAMPLER)(cl_context /* context */,
- cl_bool /* normalized_coords */,
- cl_addressing_mode /* addressing_mode */,
- cl_filter_mode /* filter_mode */,
- cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
-
-typedef CL_API_ENTRY cl_int (CL_API_CALL *
-PFNCLRETAINSAMPLER)(cl_sampler /* sampler */) CL_API_SUFFIX__VERSION_1_0;
-
-typedef CL_API_ENTRY cl_int (CL_API_CALL *
-PFNCLRELEASESAMPLER)(cl_sampler /* sampler */) CL_API_SUFFIX__VERSION_1_0;
-
-typedef CL_API_ENTRY cl_int (CL_API_CALL *
-PFNCLGETSAMPLERINFO)(cl_sampler /* sampler */,
- cl_sampler_info /* param_name */,
- size_t /* param_value_size */,
- void * /* param_value */,
- size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
-
-// Program Object APIs
-typedef CL_API_ENTRY cl_program (CL_API_CALL *
-PFNCLCREATEPROGRAMWITHSOURCE)(cl_context /* context */,
- cl_uint /* count */,
- const char ** /* strings */,
- const size_t * /* lengths */,
- cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
-
-typedef CL_API_ENTRY cl_program (CL_API_CALL *
-PFNCLCREATEPROGRAMWITHBINARY)(cl_context /* context */,
- cl_uint /* num_devices */,
- const cl_device_id * /* device_list */,
- const size_t * /* lengths */,
- const unsigned char ** /* binaries */,
- cl_int * /* binary_status */,
- cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
-
-typedef CL_API_ENTRY cl_int (CL_API_CALL *
-PFNCLRETAINPROGRAM)(cl_program /* program */) CL_API_SUFFIX__VERSION_1_0;
-
-typedef CL_API_ENTRY cl_int (CL_API_CALL *
-PFNCLRELEASEPROGRAM)(cl_program /* program */) CL_API_SUFFIX__VERSION_1_0;
-
-typedef CL_API_ENTRY cl_int (CL_API_CALL *
-PFNCLBUILDPROGRAM)(cl_program /* program */,
- cl_uint /* num_devices */,
- const cl_device_id * /* device_list */,
- const char * /* options */,
- void (*pfn_notify)(cl_program /* program */, void * /* user_data */),
- void * /* user_data */) CL_API_SUFFIX__VERSION_1_0;
-
-typedef CL_API_ENTRY cl_int (CL_API_CALL *
-PFNCLUNLOADCOMPILER)(void) CL_API_SUFFIX__VERSION_1_0;
-
-typedef CL_API_ENTRY cl_int (CL_API_CALL *
-PFNCLGETPROGRAMINFO)(cl_program /* program */,
- cl_program_info /* param_name */,
- size_t /* param_value_size */,
- void * /* param_value */,
- size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
-
-typedef CL_API_ENTRY cl_int (CL_API_CALL *
-PFNCLGETPROGRAMBUILDINFO)(cl_program /* program */,
- cl_device_id /* device */,
- cl_program_build_info /* param_name */,
- size_t /* param_value_size */,
- void * /* param_value */,
- size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
-
-// Kernel Object APIs
-typedef CL_API_ENTRY cl_kernel (CL_API_CALL *
-PFNCLCREATEKERNEL)(cl_program /* program */,
- const char * /* kernel_name */,
- cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
-
-typedef CL_API_ENTRY cl_int (CL_API_CALL *
-PFNCLCREATEKERNELSINPROGRAM)(cl_program /* program */,
- cl_uint /* num_kernels */,
- cl_kernel * /* kernels */,
- cl_uint * /* num_kernels_ret */) CL_API_SUFFIX__VERSION_1_0;
-
-typedef CL_API_ENTRY cl_int (CL_API_CALL *
-PFNCLRETAINKERNEL)(cl_kernel /* kernel */) CL_API_SUFFIX__VERSION_1_0;
-
-typedef CL_API_ENTRY cl_int (CL_API_CALL *
-PFNCLRELEASEKERNEL)(cl_kernel /* kernel */) CL_API_SUFFIX__VERSION_1_0;
-
-typedef CL_API_ENTRY cl_int (CL_API_CALL *
-PFNCLSETKERNELARG)(cl_kernel /* kernel */,
- cl_uint /* arg_index */,
- size_t /* arg_size */,
- const void * /* arg_value */) CL_API_SUFFIX__VERSION_1_0;
-
-typedef CL_API_ENTRY cl_int (CL_API_CALL *
-PFNCLGETKERNELINFO)(cl_kernel /* kernel */,
- cl_kernel_info /* param_name */,
- size_t /* param_value_size */,
- void * /* param_value */,
- size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
-
-typedef CL_API_ENTRY cl_int (CL_API_CALL *
-PFNCLGETKERNELWORKGROUPINFO)(cl_kernel /* kernel */,
- cl_device_id /* device */,
- cl_kernel_work_group_info /* param_name */,
- size_t /* param_value_size */,
- void * /* param_value */,
- size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
-
-// Event Object APIs
-typedef CL_API_ENTRY cl_int (CL_API_CALL *
-PFNCLWAITFOREVENTS)(cl_uint /* num_events */,
- const cl_event * /* event_list */) CL_API_SUFFIX__VERSION_1_0;
-
-typedef CL_API_ENTRY cl_int (CL_API_CALL *
-PFNCLGETEVENTINFO)(cl_event /* event */,
- cl_event_info /* param_name */,
- size_t /* param_value_size */,
- void * /* param_value */,
- size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
-
-typedef CL_API_ENTRY cl_int (CL_API_CALL *
-PFNCLRETAINEVENT)(cl_event /* event */) CL_API_SUFFIX__VERSION_1_0;
-
-typedef CL_API_ENTRY cl_int (CL_API_CALL *
-PFNCLRELEASEEVENT)(cl_event /* event */) CL_API_SUFFIX__VERSION_1_0;
-
-// Profiling APIs
-typedef CL_API_ENTRY cl_int (CL_API_CALL *
-PFNCLGETEVENTPROFILINGINFO)(cl_event /* event */,
- cl_profiling_info /* param_name */,
- size_t /* param_value_size */,
- void * /* param_value */,
- size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
-
-// Flush and Finish APIs
-typedef CL_API_ENTRY cl_int (CL_API_CALL *
-PFNCLFLUSH)(cl_command_queue /* command_queue */) CL_API_SUFFIX__VERSION_1_0;
-
-typedef CL_API_ENTRY cl_int (CL_API_CALL *
-PFNCLFINISH)(cl_command_queue /* command_queue */) CL_API_SUFFIX__VERSION_1_0;
-
-// Enqueued Commands APIs
-typedef CL_API_ENTRY cl_int (CL_API_CALL *
-PFNCLENQUEUEREADBUFFER)(cl_command_queue /* command_queue */,
- cl_mem /* buffer */,
- cl_bool /* blocking_read */,
- size_t /* offset */,
- size_t /* cb */,
- void * /* ptr */,
- cl_uint /* num_events_in_wait_list */,
- const cl_event * /* event_wait_list */,
- cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
-
-typedef CL_API_ENTRY cl_int (CL_API_CALL *
-PFNCLENQUEUEWRITEBUFFER)(cl_command_queue /* command_queue */,
- cl_mem /* buffer */,
- cl_bool /* blocking_write */,
- size_t /* offset */,
- size_t /* cb */,
- const void * /* ptr */,
- cl_uint /* num_events_in_wait_list */,
- const cl_event * /* event_wait_list */,
- cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
-
-typedef CL_API_ENTRY cl_int (CL_API_CALL *
-PFNCLENQUEUECOPYBUFFER)(cl_command_queue /* command_queue */,
- cl_mem /* src_buffer */,
- cl_mem /* dst_buffer */,
- size_t /* src_offset */,
- size_t /* dst_offset */,
- size_t /* cb */,
- cl_uint /* num_events_in_wait_list */,
- const cl_event * /* event_wait_list */,
- cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
-
-typedef CL_API_ENTRY cl_int (CL_API_CALL *
-PFNCLENQUEUEREADIMAGE)(cl_command_queue /* command_queue */,
- cl_mem /* image */,
- cl_bool /* blocking_read */,
- const size_t * /* origin[3] */,
- const size_t * /* region[3] */,
- size_t /* row_pitch */,
- size_t /* slice_pitch */,
- void * /* ptr */,
- cl_uint /* num_events_in_wait_list */,
- const cl_event * /* event_wait_list */,
- cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
-
-typedef CL_API_ENTRY cl_int (CL_API_CALL *
-PFNCLENQUEUEWRITEIMAGE)(cl_command_queue /* command_queue */,
- cl_mem /* image */,
- cl_bool /* blocking_write */,
- const size_t * /* origin[3] */,
- const size_t * /* region[3] */,
- size_t /* input_row_pitch */,
- size_t /* input_slice_pitch */,
- const void * /* ptr */,
- cl_uint /* num_events_in_wait_list */,
- const cl_event * /* event_wait_list */,
- cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
-
-typedef CL_API_ENTRY cl_int (CL_API_CALL *
-PFNCLENQUEUECOPYIMAGE)(cl_command_queue /* command_queue */,
- cl_mem /* src_image */,
- cl_mem /* dst_image */,
- const size_t * /* src_origin[3] */,
- const size_t * /* dst_origin[3] */,
- const size_t * /* region[3] */,
- cl_uint /* num_events_in_wait_list */,
- const cl_event * /* event_wait_list */,
- cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
-
-typedef CL_API_ENTRY cl_int (CL_API_CALL *
-PFNCLENQUEUECOPYIMAGETOBUFFER)(cl_command_queue /* command_queue */,
- cl_mem /* src_image */,
- cl_mem /* dst_buffer */,
- const size_t * /* src_origin[3] */,
- const size_t * /* region[3] */,
- size_t /* dst_offset */,
- cl_uint /* num_events_in_wait_list */,
- const cl_event * /* event_wait_list */,
- cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
-
-typedef CL_API_ENTRY cl_int (CL_API_CALL *
-PFNCLENQUEUECOPYBUFFERTOIMAGE)(cl_command_queue /* command_queue */,
- cl_mem /* src_buffer */,
- cl_mem /* dst_image */,
- size_t /* src_offset */,
- const size_t * /* dst_origin[3] */,
- const size_t * /* region[3] */,
- cl_uint /* num_events_in_wait_list */,
- const cl_event * /* event_wait_list */,
- cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
-
-typedef CL_API_ENTRY void * (CL_API_CALL *
-PFNCLENQUEUEMAPBUFFER)(cl_command_queue /* command_queue */,
- cl_mem /* buffer */,
- cl_bool /* blocking_map */,
- cl_map_flags /* map_flags */,
- size_t /* offset */,
- size_t /* cb */,
- cl_uint /* num_events_in_wait_list */,
- const cl_event * /* event_wait_list */,
- cl_event * /* event */,
- cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
-
-typedef CL_API_ENTRY void * (CL_API_CALL *
-PFNCLENQUEUEMAPIMAGE)(cl_command_queue /* command_queue */,
- cl_mem /* image */,
- cl_bool /* blocking_map */,
- cl_map_flags /* map_flags */,
- const size_t * /* origin[3] */,
- const size_t * /* region[3] */,
- size_t * /* image_row_pitch */,
- size_t * /* image_slice_pitch */,
- cl_uint /* num_events_in_wait_list */,
- const cl_event * /* event_wait_list */,
- cl_event * /* event */,
- cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
-
-typedef CL_API_ENTRY cl_int (CL_API_CALL *
-PFNCLENQUEUEUNMAPMEMOBJECT)(cl_command_queue /* command_queue */,
- cl_mem /* memobj */,
- void * /* mapped_ptr */,
- cl_uint /* num_events_in_wait_list */,
- const cl_event * /* event_wait_list */,
- cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
-
-typedef CL_API_ENTRY cl_int (CL_API_CALL *
-PFNCLENQUEUENDRANGEKERNEL)(cl_command_queue /* command_queue */,
- cl_kernel /* kernel */,
- cl_uint /* work_dim */,
- const size_t * /* global_work_offset */,
- const size_t * /* global_work_size */,
- const size_t * /* local_work_size */,
- cl_uint /* num_events_in_wait_list */,
- const cl_event * /* event_wait_list */,
- cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
-
-typedef CL_API_ENTRY cl_int (CL_API_CALL *
-PFNCLENQUEUETASK)(cl_command_queue /* command_queue */,
- cl_kernel /* kernel */,
- cl_uint /* num_events_in_wait_list */,
- const cl_event * /* event_wait_list */,
- cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
-
-typedef CL_API_ENTRY cl_int (CL_API_CALL *
-PFNCLENQUEUENATIVEKERNEL)(cl_command_queue /* command_queue */,
- void (*user_func)(void *),
- void * /* args */,
- size_t /* cb_args */,
- cl_uint /* num_mem_objects */,
- const cl_mem * /* mem_list */,
- const void ** /* args_mem_loc */,
- cl_uint /* num_events_in_wait_list */,
- const cl_event * /* event_wait_list */,
- cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
-
-typedef CL_API_ENTRY cl_int (CL_API_CALL *
-PFNCLENQUEUEMARKER)(cl_command_queue /* command_queue */,
- cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
-
-typedef CL_API_ENTRY cl_int (CL_API_CALL *
-PFNCLENQUEUEWAITFOREVENTS)(cl_command_queue /* command_queue */,
- cl_uint /* num_events */,
- const cl_event * /* event_list */) CL_API_SUFFIX__VERSION_1_0;
-
-typedef CL_API_ENTRY cl_int (CL_API_CALL *
-PFNCLENQUEUEBARRIER)(cl_command_queue /* command_queue */) CL_API_SUFFIX__VERSION_1_0;
-
-// Extension function access
-//
-// Returns the extension function address for the given function name,
-// or NULL if a valid function can not be found. The client must
-// check to make sure the address is not NULL, before using or
-// calling the returned function address.
-//
-typedef CL_API_ENTRY void * (CL_API_CALL * PFNCLGETEXTENSIONFUNCTIONADDRESS)(const char * /* func_name */) CL_API_SUFFIX__VERSION_1_0;
-
-
-#define CLEW_STATIC
-
-#ifdef CLEW_STATIC
-# define CLEWAPI extern
-#else
-# ifdef CLEW_BUILD
-# define CLEWAPI extern __declspec(dllexport)
-# else
-# define CLEWAPI extern __declspec(dllimport)
-# endif
-#endif
-
-#if defined(_WIN32)
-#define CLEW_FUN_EXPORT extern
-#else
-#define CLEW_FUN_EXPORT CLEWAPI
-#endif
-
-#define CLEW_GET_FUN(x) x
-
-
-// Variables holding function entry points
-CLEW_FUN_EXPORT PFNCLGETPLATFORMIDS __clewGetPlatformIDs ;
-CLEW_FUN_EXPORT PFNCLGETPLATFORMINFO __clewGetPlatformInfo ;
-CLEW_FUN_EXPORT PFNCLGETDEVICEIDS __clewGetDeviceIDs ;
-CLEW_FUN_EXPORT PFNCLGETDEVICEINFO __clewGetDeviceInfo ;
-CLEW_FUN_EXPORT PFNCLCREATECONTEXT __clewCreateContext ;
-CLEW_FUN_EXPORT PFNCLCREATECONTEXTFROMTYPE __clewCreateContextFromType ;
-CLEW_FUN_EXPORT PFNCLRETAINCONTEXT __clewRetainContext ;
-CLEW_FUN_EXPORT PFNCLRELEASECONTEXT __clewReleaseContext ;
-CLEW_FUN_EXPORT PFNCLGETCONTEXTINFO __clewGetContextInfo ;
-CLEW_FUN_EXPORT PFNCLCREATECOMMANDQUEUE __clewCreateCommandQueue ;
-CLEW_FUN_EXPORT PFNCLRETAINCOMMANDQUEUE __clewRetainCommandQueue ;
-CLEW_FUN_EXPORT PFNCLRELEASECOMMANDQUEUE __clewReleaseCommandQueue ;
-CLEW_FUN_EXPORT PFNCLGETCOMMANDQUEUEINFO __clewGetCommandQueueInfo ;
-CLEW_FUN_EXPORT PFNCLSETCOMMANDQUEUEPROPERTY __clewSetCommandQueueProperty ;
-CLEW_FUN_EXPORT PFNCLCREATEBUFFER __clewCreateBuffer ;
-CLEW_FUN_EXPORT PFNCLCREATEIMAGE2D __clewCreateImage2D ;
-CLEW_FUN_EXPORT PFNCLCREATEIMAGE3D __clewCreateImage3D ;
-CLEW_FUN_EXPORT PFNCLRETAINMEMOBJECT __clewRetainMemObject ;
-CLEW_FUN_EXPORT PFNCLRELEASEMEMOBJECT __clewReleaseMemObject ;
-CLEW_FUN_EXPORT PFNCLGETSUPPORTEDIMAGEFORMATS __clewGetSupportedImageFormats ;
-CLEW_FUN_EXPORT PFNCLGETMEMOBJECTINFO __clewGetMemObjectInfo ;
-CLEW_FUN_EXPORT PFNCLGETIMAGEINFO __clewGetImageInfo ;
-CLEW_FUN_EXPORT PFNCLCREATESAMPLER __clewCreateSampler ;
-CLEW_FUN_EXPORT PFNCLRETAINSAMPLER __clewRetainSampler ;
-CLEW_FUN_EXPORT PFNCLRELEASESAMPLER __clewReleaseSampler ;
-CLEW_FUN_EXPORT PFNCLGETSAMPLERINFO __clewGetSamplerInfo ;
-CLEW_FUN_EXPORT PFNCLCREATEPROGRAMWITHSOURCE __clewCreateProgramWithSource ;
-CLEW_FUN_EXPORT PFNCLCREATEPROGRAMWITHBINARY __clewCreateProgramWithBinary ;
-CLEW_FUN_EXPORT PFNCLRETAINPROGRAM __clewRetainProgram ;
-CLEW_FUN_EXPORT PFNCLRELEASEPROGRAM __clewReleaseProgram ;
-CLEW_FUN_EXPORT PFNCLBUILDPROGRAM __clewBuildProgram ;
-CLEW_FUN_EXPORT PFNCLUNLOADCOMPILER __clewUnloadCompiler ;
-CLEW_FUN_EXPORT PFNCLGETPROGRAMINFO __clewGetProgramInfo ;
-CLEW_FUN_EXPORT PFNCLGETPROGRAMBUILDINFO __clewGetProgramBuildInfo ;
-CLEW_FUN_EXPORT PFNCLCREATEKERNEL __clewCreateKernel ;
-CLEW_FUN_EXPORT PFNCLCREATEKERNELSINPROGRAM __clewCreateKernelsInProgram ;
-CLEW_FUN_EXPORT PFNCLRETAINKERNEL __clewRetainKernel ;
-CLEW_FUN_EXPORT PFNCLRELEASEKERNEL __clewReleaseKernel ;
-CLEW_FUN_EXPORT PFNCLSETKERNELARG __clewSetKernelArg ;
-CLEW_FUN_EXPORT PFNCLGETKERNELINFO __clewGetKernelInfo ;
-CLEW_FUN_EXPORT PFNCLGETKERNELWORKGROUPINFO __clewGetKernelWorkGroupInfo ;
-CLEW_FUN_EXPORT PFNCLWAITFOREVENTS __clewWaitForEvents ;
-CLEW_FUN_EXPORT PFNCLGETEVENTINFO __clewGetEventInfo ;
-CLEW_FUN_EXPORT PFNCLRETAINEVENT __clewRetainEvent ;
-CLEW_FUN_EXPORT PFNCLRELEASEEVENT __clewReleaseEvent ;
-CLEW_FUN_EXPORT PFNCLGETEVENTPROFILINGINFO __clewGetEventProfilingInfo ;
-CLEW_FUN_EXPORT PFNCLFLUSH __clewFlush ;
-CLEW_FUN_EXPORT PFNCLFINISH __clewFinish ;
-CLEW_FUN_EXPORT PFNCLENQUEUEREADBUFFER __clewEnqueueReadBuffer ;
-CLEW_FUN_EXPORT PFNCLENQUEUEWRITEBUFFER __clewEnqueueWriteBuffer ;
-CLEW_FUN_EXPORT PFNCLENQUEUECOPYBUFFER __clewEnqueueCopyBuffer ;
-CLEW_FUN_EXPORT PFNCLENQUEUEREADIMAGE __clewEnqueueReadImage ;
-CLEW_FUN_EXPORT PFNCLENQUEUEWRITEIMAGE __clewEnqueueWriteImage ;
-CLEW_FUN_EXPORT PFNCLENQUEUECOPYIMAGE __clewEnqueueCopyImage ;
-CLEW_FUN_EXPORT PFNCLENQUEUECOPYIMAGETOBUFFER __clewEnqueueCopyImageToBuffer ;
-CLEW_FUN_EXPORT PFNCLENQUEUECOPYBUFFERTOIMAGE __clewEnqueueCopyBufferToImage ;
-CLEW_FUN_EXPORT PFNCLENQUEUEMAPBUFFER __clewEnqueueMapBuffer ;
-CLEW_FUN_EXPORT PFNCLENQUEUEMAPIMAGE __clewEnqueueMapImage ;
-CLEW_FUN_EXPORT PFNCLENQUEUEUNMAPMEMOBJECT __clewEnqueueUnmapMemObject ;
-CLEW_FUN_EXPORT PFNCLENQUEUENDRANGEKERNEL __clewEnqueueNDRangeKernel ;
-CLEW_FUN_EXPORT PFNCLENQUEUETASK __clewEnqueueTask ;
-CLEW_FUN_EXPORT PFNCLENQUEUENATIVEKERNEL __clewEnqueueNativeKernel ;
-CLEW_FUN_EXPORT PFNCLENQUEUEMARKER __clewEnqueueMarker ;
-CLEW_FUN_EXPORT PFNCLENQUEUEWAITFOREVENTS __clewEnqueueWaitForEvents ;
-CLEW_FUN_EXPORT PFNCLENQUEUEBARRIER __clewEnqueueBarrier ;
-CLEW_FUN_EXPORT PFNCLGETEXTENSIONFUNCTIONADDRESS __clewGetExtensionFunctionAddress ;
-
-
-#define clGetPlatformIDs CLEW_GET_FUN(__clewGetPlatformIDs )
-#define clGetPlatformInfo CLEW_GET_FUN(__clewGetPlatformInfo )
-#define clGetDeviceIDs CLEW_GET_FUN(__clewGetDeviceIDs )
-#define clGetDeviceInfo CLEW_GET_FUN(__clewGetDeviceInfo )
-#define clCreateContext CLEW_GET_FUN(__clewCreateContext )
-#define clCreateContextFromType CLEW_GET_FUN(__clewCreateContextFromType )
-#define clRetainContext CLEW_GET_FUN(__clewRetainContext )
-#define clReleaseContext CLEW_GET_FUN(__clewReleaseContext )
-#define clGetContextInfo CLEW_GET_FUN(__clewGetContextInfo )
-#define clCreateCommandQueue CLEW_GET_FUN(__clewCreateCommandQueue )
-#define clRetainCommandQueue CLEW_GET_FUN(__clewRetainCommandQueue )
-#define clReleaseCommandQueue CLEW_GET_FUN(__clewReleaseCommandQueue )
-#define clGetCommandQueueInfo CLEW_GET_FUN(__clewGetCommandQueueInfo )
-#define clSetCommandQueueProperty CLEW_GET_FUN(__clewSetCommandQueueProperty )
-#define clCreateBuffer CLEW_GET_FUN(__clewCreateBuffer )
-#define clCreateImage2D CLEW_GET_FUN(__clewCreateImage2D )
-#define clCreateImage3D CLEW_GET_FUN(__clewCreateImage3D )
-#define clRetainMemObject CLEW_GET_FUN(__clewRetainMemObject )
-#define clReleaseMemObject CLEW_GET_FUN(__clewReleaseMemObject )
-#define clGetSupportedImageFormats CLEW_GET_FUN(__clewGetSupportedImageFormats )
-#define clGetMemObjectInfo CLEW_GET_FUN(__clewGetMemObjectInfo )
-#define clGetImageInfo CLEW_GET_FUN(__clewGetImageInfo )
-#define clCreateSampler CLEW_GET_FUN(__clewCreateSampler )
-#define clRetainSampler CLEW_GET_FUN(__clewRetainSampler )
-#define clReleaseSampler CLEW_GET_FUN(__clewReleaseSampler )
-#define clGetSamplerInfo CLEW_GET_FUN(__clewGetSamplerInfo )
-#define clCreateProgramWithSource CLEW_GET_FUN(__clewCreateProgramWithSource )
-#define clCreateProgramWithBinary CLEW_GET_FUN(__clewCreateProgramWithBinary )
-#define clRetainProgram CLEW_GET_FUN(__clewRetainProgram )
-#define clReleaseProgram CLEW_GET_FUN(__clewReleaseProgram )
-#define clBuildProgram CLEW_GET_FUN(__clewBuildProgram )
-#define clUnloadCompiler CLEW_GET_FUN(__clewUnloadCompiler )
-#define clGetProgramInfo CLEW_GET_FUN(__clewGetProgramInfo )
-#define clGetProgramBuildInfo CLEW_GET_FUN(__clewGetProgramBuildInfo )
-#define clCreateKernel CLEW_GET_FUN(__clewCreateKernel )
-#define clCreateKernelsInProgram CLEW_GET_FUN(__clewCreateKernelsInProgram )
-#define clRetainKernel CLEW_GET_FUN(__clewRetainKernel )
-#define clReleaseKernel CLEW_GET_FUN(__clewReleaseKernel )
-#define clSetKernelArg CLEW_GET_FUN(__clewSetKernelArg )
-#define clGetKernelInfo CLEW_GET_FUN(__clewGetKernelInfo )
-#define clGetKernelWorkGroupInfo CLEW_GET_FUN(__clewGetKernelWorkGroupInfo )
-#define clWaitForEvents CLEW_GET_FUN(__clewWaitForEvents )
-#define clGetEventInfo CLEW_GET_FUN(__clewGetEventInfo )
-#define clRetainEvent CLEW_GET_FUN(__clewRetainEvent )
-#define clReleaseEvent CLEW_GET_FUN(__clewReleaseEvent )
-#define clGetEventProfilingInfo CLEW_GET_FUN(__clewGetEventProfilingInfo )
-#define clFlush CLEW_GET_FUN(__clewFlush )
-#define clFinish CLEW_GET_FUN(__clewFinish )
-#define clEnqueueReadBuffer CLEW_GET_FUN(__clewEnqueueReadBuffer )
-#define clEnqueueWriteBuffer CLEW_GET_FUN(__clewEnqueueWriteBuffer )
-#define clEnqueueCopyBuffer CLEW_GET_FUN(__clewEnqueueCopyBuffer )
-#define clEnqueueReadImage CLEW_GET_FUN(__clewEnqueueReadImage )
-#define clEnqueueWriteImage CLEW_GET_FUN(__clewEnqueueWriteImage )
-#define clEnqueueCopyImage CLEW_GET_FUN(__clewEnqueueCopyImage )
-#define clEnqueueCopyImageToBuffer CLEW_GET_FUN(__clewEnqueueCopyImageToBuffer )
-#define clEnqueueCopyBufferToImage CLEW_GET_FUN(__clewEnqueueCopyBufferToImage )
-#define clEnqueueMapBuffer CLEW_GET_FUN(__clewEnqueueMapBuffer )
-#define clEnqueueMapImage CLEW_GET_FUN(__clewEnqueueMapImage )
-#define clEnqueueUnmapMemObject CLEW_GET_FUN(__clewEnqueueUnmapMemObject )
-#define clEnqueueNDRangeKernel CLEW_GET_FUN(__clewEnqueueNDRangeKernel )
-#define clEnqueueTask CLEW_GET_FUN(__clewEnqueueTask )
-#define clEnqueueNativeKernel CLEW_GET_FUN(__clewEnqueueNativeKernel )
-#define clEnqueueMarker CLEW_GET_FUN(__clewEnqueueMarker )
-#define clEnqueueWaitForEvents CLEW_GET_FUN(__clewEnqueueWaitForEvents )
-#define clEnqueueBarrier CLEW_GET_FUN(__clewEnqueueBarrier )
-#define clGetExtensionFunctionAddress CLEW_GET_FUN(__clewGetExtensionFunctionAddress )
-
-#endif // CLCC_GENERATE_DOCUMENTATION
-
-#define CLEW_SUCCESS 0 //!< Success error code
-#define CLEW_ERROR_OPEN_FAILED -1 //!< Error code for failing to open the dynamic library
-#define CLEW_ERROR_ATEXIT_FAILED -2 //!< Error code for failing to queue the closing of the dynamic library to atexit()
-
-int clLibraryInit(void);
-const char *clErrorString(cl_int error);
-
-CCL_NAMESPACE_END
-
-#endif /* __UTIL_OPENCL_H__ */
-
diff --git a/intern/cycles/util/util_opengl.h b/intern/cycles/util/util_opengl.h
index 04a3e039c9d..667a5db653d 100644
--- a/intern/cycles/util/util_opengl.h
+++ b/intern/cycles/util/util_opengl.h
@@ -20,7 +20,6 @@
/* OpenGL header includes, used everywhere we use OpenGL, to deal with
* platform differences in one central place. */
-#include <GL/glew.h>
+#include "glew-mx.h"
#endif /* __UTIL_OPENGL_H__ */
-
diff --git a/intern/cycles/util/util_optimization.h b/intern/cycles/util/util_optimization.h
index f901513ec4b..2feb3d6ab7e 100644
--- a/intern/cycles/util/util_optimization.h
+++ b/intern/cycles/util/util_optimization.h
@@ -65,10 +65,8 @@
#define WITH_CYCLES_OPTIMIZED_KERNEL_AVX
#endif
-/* MSVC 2008, no SSE41 (broken blendv intrinsic) and no AVX support */
-#if defined(_MSC_VER) && (_MSC_VER < 1700)
-#undef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
-#undef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
+#ifdef WITH_KERNEL_AVX2
+#define WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
#endif
#endif
@@ -101,6 +99,10 @@
/* SSE intrinsics headers */
#ifndef FREE_WINDOWS64
+#ifdef _MSC_VER
+#include <intrin.h>
+#else
+
#ifdef __KERNEL_SSE2__
#include <xmmintrin.h> /* SSE 1 */
#include <emmintrin.h> /* SSE 2 */
@@ -118,6 +120,12 @@
#include <smmintrin.h> /* SSE 4.1 */
#endif
+#ifdef __KERNEL_AVX__
+#include <immintrin.h> /* AVX */
+#endif
+
+#endif
+
#else
/* MinGW64 has conflicting declarations for these SSE headers in <windows.h>.
diff --git a/intern/cycles/util/util_path.cpp b/intern/cycles/util/util_path.cpp
index 85d19b6a325..aa424045ece 100644
--- a/intern/cycles/util/util_path.cpp
+++ b/intern/cycles/util/util_path.cpp
@@ -41,21 +41,12 @@ static string cached_user_path = "";
static boost::filesystem::path to_boost(const string& path)
{
-#ifdef _MSC_VER
- std::wstring path_utf16 = Strutil::utf8_to_utf16(path.c_str());
- return boost::filesystem::path(path_utf16.c_str());
-#else
return boost::filesystem::path(path.c_str());
-#endif
}
static string from_boost(const boost::filesystem::path& path)
{
-#ifdef _MSC_VER
- return Strutil::utf16_to_utf8(path.wstring().c_str());
-#else
return path.string().c_str();
-#endif
}
void path_init(const string& path, const string& user_path)
@@ -259,14 +250,7 @@ string path_source_replace_includes(const string& source_, const string& path)
FILE *path_fopen(const string& path, const string& mode)
{
-#ifdef _WIN32
- std::wstring path_utf16 = Strutil::utf8_to_utf16(path);
- std::wstring mode_utf16 = Strutil::utf8_to_utf16(mode);
-
- return _wfopen(path_utf16.c_str(), mode_utf16.c_str());
-#else
return fopen(path.c_str(), mode.c_str());
-#endif
}
void path_cache_clear_except(const string& name, const set<string>& except)
diff --git a/intern/cycles/util/util_progress.h b/intern/cycles/util/util_progress.h
index 5d1219bfef3..e721a3f5047 100644
--- a/intern/cycles/util/util_progress.h
+++ b/intern/cycles/util/util_progress.h
@@ -149,6 +149,12 @@ public:
sample++;
}
+ /* Convenience wrapper: calls increment_sample() and then set_update(). */
+ void increment_sample_update()
+ {
+ increment_sample();
+ set_update();
+ }
+
int get_sample()
{
return sample;
diff --git a/intern/cycles/util/util_simd.cpp b/intern/cycles/util/util_simd.cpp
new file mode 100644
index 00000000000..0436823e62a
--- /dev/null
+++ b/intern/cycles/util/util_simd.cpp
@@ -0,0 +1,47 @@
+/*
+ * Copyright 2011-2013 Intel Corporation
+ * Modifications Copyright 2014, Blender Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License
+ */
+
+#ifdef WITH_KERNEL_SSE2
+
+#define __KERNEL_SSE2__
+#include "util_simd.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* Lane-mask lookup table indexed by a 4-bit mask: bit k of the index selects
+ * lane k, which is all ones (-1) when the bit is set and zero otherwise.
+ * _mm_set_epi32() takes its arguments from the highest lane down to the
+ * lowest, so the rightmost argument corresponds to bit 0 of the index. */
+const __m128 _mm_lookupmask_ps[16] = {
+ _mm_castsi128_ps(_mm_set_epi32( 0, 0, 0, 0)),
+ _mm_castsi128_ps(_mm_set_epi32( 0, 0, 0,-1)),
+ _mm_castsi128_ps(_mm_set_epi32( 0, 0,-1, 0)),
+ _mm_castsi128_ps(_mm_set_epi32( 0, 0,-1,-1)),
+ _mm_castsi128_ps(_mm_set_epi32( 0,-1, 0, 0)),
+ _mm_castsi128_ps(_mm_set_epi32( 0,-1, 0,-1)),
+ _mm_castsi128_ps(_mm_set_epi32( 0,-1,-1, 0)),
+ _mm_castsi128_ps(_mm_set_epi32( 0,-1,-1,-1)),
+ _mm_castsi128_ps(_mm_set_epi32(-1, 0, 0, 0)),
+ _mm_castsi128_ps(_mm_set_epi32(-1, 0, 0,-1)),
+ _mm_castsi128_ps(_mm_set_epi32(-1, 0,-1, 0)),
+ _mm_castsi128_ps(_mm_set_epi32(-1, 0,-1,-1)),
+ _mm_castsi128_ps(_mm_set_epi32(-1,-1, 0, 0)),
+ _mm_castsi128_ps(_mm_set_epi32(-1,-1, 0,-1)),
+ _mm_castsi128_ps(_mm_set_epi32(-1,-1,-1, 0)),
+ _mm_castsi128_ps(_mm_set_epi32(-1,-1,-1,-1))
+};
+
+
+CCL_NAMESPACE_END
+
+#endif // WITH_KERNEL_SSE2
diff --git a/intern/cycles/util/util_simd.h b/intern/cycles/util/util_simd.h
index f0f37fa57aa..39506a6359b 100644
--- a/intern/cycles/util/util_simd.h
+++ b/intern/cycles/util/util_simd.h
@@ -1,7 +1,8 @@
/*
- * Copyright 2011-2013 Blender Foundation
+ * Copyright 2011-2013 Intel Corporation
+ * Modifications Copyright 2014, Blender Foundation.
*
- * Licensed under the Apache License, Version 2.0 (the "License");
+ * Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
@@ -14,263 +15,425 @@
* limitations under the License
*/
-#ifndef __UTIL_SIMD_H__
-#define __UTIL_SIMD_H__
+#ifndef __UTIL_SIMD_TYPES_H__
+#define __UTIL_SIMD_TYPES_H__
+
+#include <limits>
+
+#include "util_debug.h"
+#include "util_types.h"
CCL_NAMESPACE_BEGIN
#ifdef __KERNEL_SSE2__
-/* SSE shuffle utility functions */
+struct sseb;
+struct ssei;
+struct ssef;
+
+extern const __m128 _mm_lookupmask_ps[16];
+
+/* Special Types */
-#ifdef __KERNEL_SSSE3__
+static struct TrueTy {
+__forceinline operator bool( ) const { return true; }
+} True ccl_maybe_unused;
-/* faster version for SSSE3 */
-typedef __m128i shuffle_swap_t;
+static struct FalseTy {
+__forceinline operator bool( ) const { return false; }
+} False ccl_maybe_unused;
-ccl_device_inline const shuffle_swap_t shuffle_swap_identity(void)
+static struct NegInfTy
{
- return _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-}
+__forceinline operator float ( ) const { return -std::numeric_limits<float>::infinity(); }
+__forceinline operator int ( ) const { return std::numeric_limits<int>::min(); }
+} neg_inf ccl_maybe_unused;
-ccl_device_inline const shuffle_swap_t shuffle_swap_swap(void)
+static struct PosInfTy
{
- return _mm_set_epi8(7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8);
+__forceinline operator float ( ) const { return std::numeric_limits<float>::infinity(); }
+__forceinline operator int ( ) const { return std::numeric_limits<int>::max(); }
+} inf ccl_maybe_unused, pos_inf ccl_maybe_unused;
+
+/* Intrinsics Functions */
+
+#if defined(__BMI__) && defined(__GNUC__)
+#define _tzcnt_u32 __tzcnt_u32
+#define _tzcnt_u64 __tzcnt_u64
+#endif
+
+#if defined(__LZCNT__)
+#define _lzcnt_u32 __lzcnt32
+#define _lzcnt_u64 __lzcnt64
+#endif
+
+#if defined(_WIN32) && !defined(__MINGW32__)
+
+__forceinline int __popcnt(int in) {
+ return _mm_popcnt_u32(in);
}
-ccl_device_inline const __m128 shuffle_swap(const __m128& a, const shuffle_swap_t& shuf)
-{
- return _mm_castsi128_ps(_mm_shuffle_epi8(_mm_castps_si128(a), shuf));
+#if !defined(_MSC_VER)
+__forceinline unsigned int __popcnt(unsigned int in) {
+ return _mm_popcnt_u32(in);
+}
+#endif
+
+#if defined(__KERNEL_64_BIT__)
+__forceinline long long __popcnt(long long in) {
+ return _mm_popcnt_u64(in);
+}
+__forceinline size_t __popcnt(size_t in) {
+ return _mm_popcnt_u64(in);
+}
+#endif
+
+__forceinline int __bsf(int v) {
+#if defined(__KERNEL_AVX2__)
+ return _tzcnt_u32(v);
+#else
+ unsigned long r = 0; _BitScanForward(&r,v); return r;
+#endif
}
+__forceinline unsigned int __bsf(unsigned int v) {
+#if defined(__KERNEL_AVX2__)
+ return _tzcnt_u32(v);
#else
+ unsigned long r = 0; _BitScanForward(&r,v); return r;
+#endif
+}
-/* somewhat slower version for SSE2 */
-typedef int shuffle_swap_t;
+__forceinline int __bsr(int v) {
+ unsigned long r = 0; _BitScanReverse(&r,v); return r;
+}
-ccl_device_inline const shuffle_swap_t shuffle_swap_identity(void)
-{
- return 0;
+__forceinline int __btc(int v, int i) {
+ long r = v; _bittestandcomplement(&r,i); return r;
}
-ccl_device_inline const shuffle_swap_t shuffle_swap_swap(void)
-{
- return 1;
+__forceinline int __bts(int v, int i) {
+ long r = v; _bittestandset(&r,i); return r;
}
-ccl_device_inline const __m128 shuffle_swap(const __m128& a, shuffle_swap_t shuf)
-{
- /* shuffle value must be a constant, so we need to branch */
- if(shuf)
- return _mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 0, 3, 2));
- else
- return _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 2, 1, 0));
+__forceinline int __btr(int v, int i) {
+ long r = v; _bittestandreset(&r,i); return r;
}
+__forceinline int bitscan(int v) {
+#if defined(__KERNEL_AVX2__)
+ return _tzcnt_u32(v);
+#else
+ return __bsf(v);
#endif
+}
-#ifdef __KERNEL_SSE41__
-ccl_device_inline void gen_idirsplat_swap(const __m128 &pn, const shuffle_swap_t &shuf_identity, const shuffle_swap_t &shuf_swap,
- const float3& idir, __m128 idirsplat[3], shuffle_swap_t shufflexyz[3])
+__forceinline int clz(const int x)
{
- const __m128 idirsplat_raw[] = { _mm_set_ps1(idir.x), _mm_set_ps1(idir.y), _mm_set_ps1(idir.z) };
- idirsplat[0] = _mm_xor_ps(idirsplat_raw[0], pn);
- idirsplat[1] = _mm_xor_ps(idirsplat_raw[1], pn);
- idirsplat[2] = _mm_xor_ps(idirsplat_raw[2], pn);
-
- const __m128 signmask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000));
- const __m128 shuf_identity_f = _mm_castsi128_ps(shuf_identity);
- const __m128 shuf_swap_f = _mm_castsi128_ps(shuf_swap);
- shufflexyz[0] = _mm_castps_si128(_mm_blendv_ps(shuf_identity_f, shuf_swap_f, _mm_and_ps(idirsplat_raw[0], signmask)));
- shufflexyz[1] = _mm_castps_si128(_mm_blendv_ps(shuf_identity_f, shuf_swap_f, _mm_and_ps(idirsplat_raw[1], signmask)));
- shufflexyz[2] = _mm_castps_si128(_mm_blendv_ps(shuf_identity_f, shuf_swap_f, _mm_and_ps(idirsplat_raw[2], signmask)));
-}
+#if defined(__KERNEL_AVX2__)
+ return _lzcnt_u32(x);
#else
-ccl_device_inline void gen_idirsplat_swap(const __m128 &pn, const shuffle_swap_t &shuf_identity, const shuffle_swap_t &shuf_swap,
- const float3& idir, __m128 idirsplat[3], shuffle_swap_t shufflexyz[3])
-{
- idirsplat[0] = _mm_xor_ps(_mm_set_ps1(idir.x), pn);
- idirsplat[1] = _mm_xor_ps(_mm_set_ps1(idir.y), pn);
- idirsplat[2] = _mm_xor_ps(_mm_set_ps1(idir.z), pn);
-
- shufflexyz[0] = (idir.x >= 0)? shuf_identity: shuf_swap;
- shufflexyz[1] = (idir.y >= 0)? shuf_identity: shuf_swap;
- shufflexyz[2] = (idir.z >= 0)? shuf_identity: shuf_swap;
-}
+ if (UNLIKELY(x == 0)) return 32;
+ return 31 - __bsr(x);
#endif
+}
-template<size_t i0, size_t i1, size_t i2, size_t i3> ccl_device_inline const __m128 shuffle(const __m128& a, const __m128& b)
+__forceinline int __bscf(int& v)
{
- return _mm_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0));
+ int i = __bsf(v);
+ v &= v-1;
+ return i;
}
-template<size_t i0, size_t i1, size_t i2, size_t i3> ccl_device_inline const __m128 shuffle(const __m128& a)
+__forceinline unsigned int __bscf(unsigned int& v)
{
- return _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(a), _MM_SHUFFLE(i3, i2, i1, i0)));
+ unsigned int i = __bsf(v);
+ v &= v-1;
+ return i;
}
-template<> __forceinline const __m128 shuffle<0, 1, 0, 1>(const __m128& a)
-{
- return _mm_movelh_ps(a, a);
+#if defined(__KERNEL_64_BIT__)
+
+__forceinline size_t __bsf(size_t v) {
+#if defined(__KERNEL_AVX2__)
+ return _tzcnt_u64(v);
+#else
+ unsigned long r = 0; _BitScanForward64(&r,v); return r;
+#endif
}
-template<> __forceinline const __m128 shuffle<2, 3, 2, 3>(const __m128& a)
-{
- return _mm_movehl_ps(a, a);
+__forceinline size_t __bsr(size_t v) {
+ unsigned long r = 0; _BitScanReverse64(&r,v); return r;
}
-template<size_t i0, size_t i1, size_t i2, size_t i3> ccl_device_inline const __m128i shuffle(const __m128i& a)
-{
- return _mm_shuffle_epi32(a, _MM_SHUFFLE(i3, i2, i1, i0));
+__forceinline size_t __btc(size_t v, size_t i) {
+ size_t r = v; _bittestandcomplement64((__int64*)&r,i); return r;
}
-template<size_t i0, size_t i1, size_t i2, size_t i3> ccl_device_inline const __m128i shuffle(const __m128i& a, const __m128i& b)
-{
- return _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), _MM_SHUFFLE(i3, i2, i1, i0)));
+__forceinline size_t __bts(size_t v, size_t i) {
+ __int64 r = v; _bittestandset64(&r,i); return r;
}
-/* Blend 2 vectors based on mask: (a[i] & mask[i]) | (b[i] & ~mask[i]) */
-#ifdef __KERNEL_SSE41__
-ccl_device_inline const __m128 blend(const __m128& mask, const __m128& a, const __m128& b)
-{
- return _mm_blendv_ps(b, a, mask);
+__forceinline size_t __btr(size_t v, size_t i) {
+ __int64 r = v; _bittestandreset64(&r,i); return r;
}
+
+__forceinline size_t bitscan(size_t v) {
+#if defined(__KERNEL_AVX2__)
+#if defined(__KERNEL_64_BIT__)
+ return _tzcnt_u64(v);
#else
-ccl_device_inline const __m128 blend(const __m128& mask, const __m128& a, const __m128& b)
-{
- return _mm_or_ps(_mm_and_ps(mask, a), _mm_andnot_ps(mask, b));
-}
+ return _tzcnt_u32(v);
#endif
+#else
+ return __bsf(v);
+#endif
+}
-/* calculate a*b+c (replacement for fused multiply-add on SSE CPUs) */
-ccl_device_inline const __m128 fma(const __m128& a, const __m128& b, const __m128& c)
+__forceinline size_t __bscf(size_t& v)
{
- return _mm_add_ps(_mm_mul_ps(a, b), c);
+ size_t i = __bsf(v);
+ v &= v-1;
+ return i;
}
-/* calculate a*b-c (replacement for fused multiply-subtract on SSE CPUs) */
-ccl_device_inline const __m128 fms(const __m128& a, const __m128& b, const __m128& c)
-{
- return _mm_sub_ps(_mm_mul_ps(a, b), c);
+#endif /* __KERNEL_64_BIT__ */
+
+#else /* _WIN32 */
+
+__forceinline unsigned int __popcnt(unsigned int in) {
+ int r = 0; asm ("popcnt %1,%0" : "=r"(r) : "r"(in)); return r;
}
-/* calculate -a*b+c (replacement for fused negated-multiply-subtract on SSE CPUs) */
-ccl_device_inline const __m128 fnma(const __m128& a, const __m128& b, const __m128& c)
-{
- return _mm_sub_ps(c, _mm_mul_ps(a, b));
+__forceinline int __bsf(int v) {
+ int r = 0; asm ("bsf %1,%0" : "=r"(r) : "r"(v)); return r;
}
-template<size_t N> ccl_device_inline const __m128 broadcast(const __m128& a)
-{
- return _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(a), _MM_SHUFFLE(N, N, N, N)));
+__forceinline int __bsr(int v) {
+ int r = 0; asm ("bsr %1,%0" : "=r"(r) : "r"(v)); return r;
}
-template<size_t N> ccl_device_inline const __m128i broadcast(const __m128i& a)
-{
- return _mm_shuffle_epi32(a, _MM_SHUFFLE(N, N, N, N));
+__forceinline int __btc(int v, int i) {
+ int r = 0; asm ("btc %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags" ); return r;
}
-ccl_device_inline const __m128 uint32_to_float(const __m128i &in)
-{
- __m128i a = _mm_srli_epi32(in, 16);
- __m128i b = _mm_and_si128(in, _mm_set1_epi32(0x0000ffff));
- __m128i c = _mm_or_si128(a, _mm_set1_epi32(0x53000000));
- __m128 d = _mm_cvtepi32_ps(b);
- __m128 e = _mm_sub_ps(_mm_castsi128_ps(c), _mm_castsi128_ps(_mm_set1_epi32(0x53000000)));
- return _mm_add_ps(e, d);
+__forceinline int __bts(int v, int i) {
+ int r = 0; asm ("bts %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r;
}
-template<size_t S1, size_t S2, size_t S3, size_t S4>
-ccl_device_inline const __m128 set_sign_bit(const __m128 &a)
-{
- return _mm_xor_ps(a, _mm_castsi128_ps(_mm_setr_epi32(S1 << 31, S2 << 31, S3 << 31, S4 << 31)));
+__forceinline int __btr(int v, int i) {
+ int r = 0; asm ("btr %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r;
}
-#ifdef __KERNEL_WITH_SSE_ALIGN__
-ccl_device_inline const __m128 load_m128(const float4 &vec)
-{
- return _mm_load_ps(&vec.x);
+#if defined(__KERNEL_64_BIT__) || defined(__APPLE__)
+__forceinline size_t __bsf(size_t v) {
+ size_t r = 0; asm ("bsf %1,%0" : "=r"(r) : "r"(v)); return r;
}
+#endif
-ccl_device_inline const __m128 load_m128(const float3 &vec)
-{
- return _mm_load_ps(&vec.x);
+__forceinline unsigned int __bsf(unsigned int v) {
+ unsigned int r = 0; asm ("bsf %1,%0" : "=r"(r) : "r"(v)); return r;
}
-#else
+__forceinline size_t __bsr(size_t v) {
+ size_t r = 0; asm ("bsr %1,%0" : "=r"(r) : "r"(v)); return r;
+}
-ccl_device_inline const __m128 load_m128(const float4 &vec)
-{
- return _mm_loadu_ps(&vec.x);
+__forceinline size_t __btc(size_t v, size_t i) {
+ size_t r = 0; asm ("btc %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags" ); return r;
}
-ccl_device_inline const __m128 load_m128(const float3 &vec)
-{
- return _mm_loadu_ps(&vec.x);
+__forceinline size_t __bts(size_t v, size_t i) {
+ size_t r = 0; asm ("bts %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r;
}
-#endif /* __KERNEL_WITH_SSE_ALIGN__ */
-ccl_device_inline const __m128 dot3_splat(const __m128& a, const __m128& b)
-{
-#ifdef __KERNEL_SSE41__
- return _mm_dp_ps(a, b, 0x7f);
+__forceinline size_t __btr(size_t v, size_t i) {
+ size_t r = 0; asm ("btr %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r;
+}
+
+__forceinline int bitscan(int v) {
+#if defined(__KERNEL_AVX2__)
+ return _tzcnt_u32(v);
#else
- __m128 t = _mm_mul_ps(a, b);
- return _mm_set1_ps(((float*)&t)[0] + ((float*)&t)[1] + ((float*)&t)[2]);
+ return __bsf(v);
#endif
}
-/* squared length taking only specified axes into account */
-template<size_t X, size_t Y, size_t Z, size_t W>
-ccl_device_inline float len_squared(const __m128& a)
-{
-#ifndef __KERNEL_SSE41__
- float4& t = (float4 &)a;
- return (X ? t.x * t.x : 0.0f) + (Y ? t.y * t.y : 0.0f) + (Z ? t.z * t.z : 0.0f) + (W ? t.w * t.w : 0.0f);
+__forceinline unsigned int bitscan(unsigned int v) {
+#if defined(__KERNEL_AVX2__)
+ return _tzcnt_u32(v);
#else
- return _mm_cvtss_f32(_mm_dp_ps(a, a, (X << 4) | (Y << 5) | (Z << 6) | (W << 7) | 0xf));
+ return __bsf(v);
#endif
}
-ccl_device_inline float dot3(const __m128& a, const __m128& b)
-{
-#ifdef __KERNEL_SSE41__
- return _mm_cvtss_f32(_mm_dp_ps(a, b, 0x7f));
+#if defined(__KERNEL_64_BIT__) || defined(__APPLE__)
+__forceinline size_t bitscan(size_t v) {
+#if defined(__KERNEL_AVX2__)
+#if defined(__KERNEL_64_BIT__)
+ return _tzcnt_u64(v);
+#else
+ return _tzcnt_u32(v);
+#endif
#else
- __m128 t = _mm_mul_ps(a, b);
- return ((float*)&t)[0] + ((float*)&t)[1] + ((float*)&t)[2];
+ return __bsf(v);
#endif
}
+#endif
-ccl_device_inline const __m128 len3_squared_splat(const __m128& a)
+__forceinline int clz(const int x)
{
- return dot3_splat(a, a);
+#if defined(__KERNEL_AVX2__)
+ return _lzcnt_u32(x);
+#else
+ if (UNLIKELY(x == 0)) return 32;
+ return 31 - __bsr(x);
+#endif
}
-ccl_device_inline float len3_squared(const __m128& a)
+__forceinline int __bscf(int& v)
{
- return dot3(a, a);
+ int i = bitscan(v);
+#if defined(__KERNEL_AVX2__)
+ v &= v-1;
+#else
+ v = __btc(v,i);
+#endif
+ return i;
}
-ccl_device_inline float len3(const __m128& a)
+__forceinline unsigned int __bscf(unsigned int& v)
{
- return _mm_cvtss_f32(_mm_sqrt_ss(dot3_splat(a, a)));
+ unsigned int i = bitscan(v);
+ v &= v-1;
+ return i;
}
-/* calculate shuffled cross product, useful when order of components does not matter */
-ccl_device_inline const __m128 cross_zxy(const __m128& a, const __m128& b)
+#if defined(__KERNEL_64_BIT__) || defined(__APPLE__)
+__forceinline size_t __bscf(size_t& v)
{
- return fms(a, shuffle<1, 2, 0, 3>(b), _mm_mul_ps(b, shuffle<1, 2, 0, 3>(a)));
+ size_t i = bitscan(v);
+#if defined(__KERNEL_AVX2__)
+ v &= v-1;
+#else
+ v = __btc(v,i);
+#endif
+ return i;
+}
+#endif
+
+#endif /* _WIN32 */
+
+static const unsigned int BITSCAN_NO_BIT_SET_32 = 32;
+static const size_t BITSCAN_NO_BIT_SET_64 = 64;
+
+/* Emulation of SSE4 functions with SSE3 */
+
+/* NOTE(review): `__KERNEL_SSE3` has no trailing underscores and
+ * `__KERNEL_SSE4__` is not spelled like the `__KERNEL_SSE3__` /
+ * `__KERNEL_SSE41__` macros tested elsewhere in these headers, so this
+ * emulation section looks like it is never compiled - confirm the intended
+ * macro names before relying on it. */
+#if defined(__KERNEL_SSE3) && !defined(__KERNEL_SSE4__)
+
+/* Rounding-mode selectors; the first four are the cases handled by the
+ * emulated _mm_round_ps() further down in this section. */
+#define _MM_FROUND_TO_NEAREST_INT 0x00
+#define _MM_FROUND_TO_NEG_INF 0x01
+#define _MM_FROUND_TO_POS_INF 0x02
+#define _MM_FROUND_TO_ZERO 0x03
+#define _MM_FROUND_CUR_DIRECTION 0x04
+
+#define _mm_blendv_ps __emu_mm_blendv_ps
+__forceinline __m128 _mm_blendv_ps( __m128 value, __m128 input, __m128 mask ) {
+ return _mm_or_ps(_mm_and_ps(mask, input), _mm_andnot_ps(mask, value));
+}
+
+#define _mm_blend_ps __emu_mm_blend_ps
+__forceinline __m128 _mm_blend_ps( __m128 value, __m128 input, const int mask ) {
+ assert(mask < 0x10); return _mm_blendv_ps(value, input, _mm_lookupmask_ps[mask]);
+}
+
+#define _mm_blendv_epi8 __emu_mm_blendv_epi8
+__forceinline __m128i _mm_blendv_epi8( __m128i value, __m128i input, __m128i mask ) {
+ return _mm_or_si128(_mm_and_si128(mask, input), _mm_andnot_si128(mask, value));
+}
+
+#define _mm_mullo_epi32 __emu_mm_mullo_epi32
+__forceinline __m128i _mm_mullo_epi32( __m128i value, __m128i input ) {
+ __m128i rvalue;
+ char* _r = (char*)(&rvalue + 1);
+ char* _v = (char*)(& value + 1);
+ char* _i = (char*)(& input + 1);
+ for ( ssize_t i = -16 ; i != 0 ; i += 4 ) *((int32*)(_r + i)) = *((int32*)(_v + i))* *((int32*)(_i + i));
+ return rvalue;
+}
+
+
+#define _mm_min_epi32 __emu_mm_min_epi32
+__forceinline __m128i _mm_min_epi32( __m128i value, __m128i input ) {
+ return _mm_blendv_epi8(input, value, _mm_cmplt_epi32(value, input));
+}
+
+#define _mm_max_epi32 __emu_mm_max_epi32
+__forceinline __m128i _mm_max_epi32( __m128i value, __m128i input ) {
+ return _mm_blendv_epi8(value, input, _mm_cmplt_epi32(value, input));
+}
+
+#define _mm_extract_epi32 __emu_mm_extract_epi32
+__forceinline int _mm_extract_epi32( __m128i input, const int index ) {
+ switch ( index ) {
+ case 0: return _mm_cvtsi128_si32(input);
+ case 1: return _mm_cvtsi128_si32(_mm_shuffle_epi32(input, _MM_SHUFFLE(1, 1, 1, 1)));
+ case 2: return _mm_cvtsi128_si32(_mm_shuffle_epi32(input, _MM_SHUFFLE(2, 2, 2, 2)));
+ case 3: return _mm_cvtsi128_si32(_mm_shuffle_epi32(input, _MM_SHUFFLE(3, 3, 3, 3)));
+ default: assert(false); return 0;
+ }
+}
+
+#define _mm_insert_epi32 __emu_mm_insert_epi32
+__forceinline __m128i _mm_insert_epi32( __m128i value, int input, const int index ) {
+ assert(index >= 0 && index < 4); ((int*)&value)[index] = input; return value;
}
-ccl_device_inline const __m128 cross(const __m128& a, const __m128& b)
+#define _mm_extract_ps __emu_mm_extract_ps
+__forceinline int _mm_extract_ps( __m128 input, const int index ) {
+ int32* ptr = (int32*)&input; return ptr[index];
+}
+
+#define _mm_insert_ps __emu_mm_insert_ps
+__forceinline __m128 _mm_insert_ps( __m128 value, __m128 input, const int index )
+{ assert(index < 0x100); ((float*)&value)[(index >> 4)&0x3] = ((float*)&input)[index >> 6]; return _mm_andnot_ps(_mm_lookupmask_ps[index&0xf], value); }
+
+#define _mm_round_ps __emu_mm_round_ps
+__forceinline __m128 _mm_round_ps( __m128 value, const int flags )
{
- return shuffle<1, 2, 0, 3>(cross_zxy(a, b));
+ switch ( flags )
+ {
+ case _MM_FROUND_TO_NEAREST_INT: return _mm_cvtepi32_ps(_mm_cvtps_epi32(value));
+ case _MM_FROUND_TO_NEG_INF : return _mm_cvtepi32_ps(_mm_cvtps_epi32(_mm_add_ps(value, _mm_set1_ps(-0.5f))));
+ case _MM_FROUND_TO_POS_INF : return _mm_cvtepi32_ps(_mm_cvtps_epi32(_mm_add_ps(value, _mm_set1_ps( 0.5f))));
+ case _MM_FROUND_TO_ZERO : return _mm_cvtepi32_ps(_mm_cvttps_epi32(value));
+ }
+ return value;
+}
+
+#ifdef _M_X64
+#define _mm_insert_epi64 __emu_mm_insert_epi64
+/* Emulated _mm_insert_epi64: returns `value` with 64-bit lane `index` replaced
+ * by `input`. An __m128i only has two 64-bit lanes, so `index` must be 0 or 1;
+ * anything larger would write past the end of `value`. */
+__forceinline __m128i _mm_insert_epi64( __m128i value, __int64 input, const int index ) {
+ assert(size_t(index) < 2); ((__int64*)&value)[index] = input; return value;
+}
+
+#define _mm_extract_epi64 __emu_mm_extract_epi64
+__forceinline __int64 _mm_extract_epi64( __m128i input, const int index ) {
+ assert(size_t(index) < 2);
+ return index == 0 ? _mm_cvtsi128_si64x(input) : _mm_cvtsi128_si64x(_mm_unpackhi_epi64(input, input));
}
+#endif
+
+#endif
#endif /* __KERNEL_SSE2__ */
CCL_NAMESPACE_END
-#endif /* __UTIL_SIMD_H__ */
+#include "util_math.h"
+#include "util_sseb.h"
+#include "util_ssei.h"
+#include "util_ssef.h"
+
+#endif /* __UTIL_SIMD_TYPES_H__ */
diff --git a/intern/cycles/util/util_sseb.h b/intern/cycles/util/util_sseb.h
new file mode 100644
index 00000000000..be510256dd3
--- /dev/null
+++ b/intern/cycles/util/util_sseb.h
@@ -0,0 +1,161 @@
+/*
+ * Copyright 2011-2013 Intel Corporation
+ * Modifications Copyright 2014, Blender Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License
+ */
+
+#ifndef __UTIL_SSEB_H__
+#define __UTIL_SSEB_H__
+
+CCL_NAMESPACE_BEGIN
+
+#ifdef __KERNEL_SSE2__
+
+/*! 4-wide SSE bool type. */
+struct sseb
+{
+ typedef sseb Mask; // mask type
+ typedef ssei Int; // int type
+ typedef ssef Float; // float type
+
+ enum { size = 4 }; // number of SIMD elements
+ union { __m128 m128; int32_t v[4]; }; // data
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Constructors, Assignment & Cast Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline sseb ( ) {}
+ __forceinline sseb ( const sseb& other ) { m128 = other.m128; }
+ __forceinline sseb& operator=( const sseb& other ) { m128 = other.m128; return *this; }
+
+ __forceinline sseb( const __m128 input ) : m128(input) {}
+ __forceinline operator const __m128&( void ) const { return m128; }
+ __forceinline operator const __m128i( void ) const { return _mm_castps_si128(m128); }
+ __forceinline operator const __m128d( void ) const { return _mm_castps_pd(m128); }
+
+ __forceinline sseb ( bool a )
+ : m128(_mm_lookupmask_ps[(size_t(a) << 3) | (size_t(a) << 2) | (size_t(a) << 1) | size_t(a)]) {}
+ __forceinline sseb ( bool a, bool b)
+ : m128(_mm_lookupmask_ps[(size_t(b) << 3) | (size_t(a) << 2) | (size_t(b) << 1) | size_t(a)]) {}
+ __forceinline sseb ( bool a, bool b, bool c, bool d)
+ : m128(_mm_lookupmask_ps[(size_t(d) << 3) | (size_t(c) << 2) | (size_t(b) << 1) | size_t(a)]) {}
+ __forceinline sseb(int mask) {
+ assert(mask >= 0 && mask < 16);
+ m128 = _mm_lookupmask_ps[mask];
+ }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Constants
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline sseb( FalseTy ) : m128(_mm_setzero_ps()) {}
+ __forceinline sseb( TrueTy ) : m128(_mm_castsi128_ps(_mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128()))) {}
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Array Access
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline bool operator []( const size_t i ) const { assert(i < 4); return (_mm_movemask_ps(m128) >> i) & 1; }
+ __forceinline int32_t& operator []( const size_t i ) { assert(i < 4); return v[i]; }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+/// Unary Operators
+////////////////////////////////////////////////////////////////////////////////
+
+__forceinline const sseb operator !( const sseb& a ) { return _mm_xor_ps(a, sseb(True)); }
+
+////////////////////////////////////////////////////////////////////////////////
+/// Binary Operators
+////////////////////////////////////////////////////////////////////////////////
+
+__forceinline const sseb operator &( const sseb& a, const sseb& b ) { return _mm_and_ps(a, b); }
+__forceinline const sseb operator |( const sseb& a, const sseb& b ) { return _mm_or_ps (a, b); }
+__forceinline const sseb operator ^( const sseb& a, const sseb& b ) { return _mm_xor_ps(a, b); }
+
+////////////////////////////////////////////////////////////////////////////////
+/// Assignment Operators
+////////////////////////////////////////////////////////////////////////////////
+
+__forceinline const sseb operator &=( sseb& a, const sseb& b ) { return a = a & b; }
+__forceinline const sseb operator |=( sseb& a, const sseb& b ) { return a = a | b; }
+__forceinline const sseb operator ^=( sseb& a, const sseb& b ) { return a = a ^ b; }
+
+////////////////////////////////////////////////////////////////////////////////
+/// Comparison Operators + Select
+////////////////////////////////////////////////////////////////////////////////
+
+__forceinline const sseb operator !=( const sseb& a, const sseb& b ) { return _mm_xor_ps(a, b); }
+__forceinline const sseb operator ==( const sseb& a, const sseb& b ) { return _mm_castsi128_ps(_mm_cmpeq_epi32(a, b)); }
+
+__forceinline const sseb select( const sseb& m, const sseb& t, const sseb& f ) {
+#if defined(__KERNEL_SSE41__)
+ return _mm_blendv_ps(f, t, m);
+#else
+ return _mm_or_ps(_mm_and_ps(m, t), _mm_andnot_ps(m, f));
+#endif
+}
+
+////////////////////////////////////////////////////////////////////////////////
+/// Movement/Shifting/Shuffling Functions
+////////////////////////////////////////////////////////////////////////////////
+
+__forceinline const sseb unpacklo( const sseb& a, const sseb& b ) { return _mm_unpacklo_ps(a, b); }
+__forceinline const sseb unpackhi( const sseb& a, const sseb& b ) { return _mm_unpackhi_ps(a, b); }
+
+/* Permute the lanes of `a`: result lane k is taken from lane i<k> of `a`.
+ * The integer shuffle yields an __m128i, which sseb has no constructor for,
+ * so the result is bit-cast back to __m128 (as operator== does). */
+template<size_t i0, size_t i1, size_t i2, size_t i3> __forceinline const sseb shuffle( const sseb& a ) {
+ return _mm_castsi128_ps(_mm_shuffle_epi32(a, _MM_SHUFFLE(i3, i2, i1, i0)));
+}
+
+template<size_t i0, size_t i1, size_t i2, size_t i3> __forceinline const sseb shuffle( const sseb& a, const sseb& b ) {
+ return _mm_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0));
+}
+
+#if defined(__KERNEL_SSE3__)
+template<> __forceinline const sseb shuffle<0, 0, 2, 2>( const sseb& a ) { return _mm_moveldup_ps(a); }
+template<> __forceinline const sseb shuffle<1, 1, 3, 3>( const sseb& a ) { return _mm_movehdup_ps(a); }
+template<> __forceinline const sseb shuffle<0, 1, 0, 1>( const sseb& a ) { return _mm_castpd_ps(_mm_movedup_pd (a)); }
+#endif
+
+#if defined(__KERNEL_SSE41__)
+template<size_t dst, size_t src, size_t clr> __forceinline const sseb insert( const sseb& a, const sseb& b ) { return _mm_insert_ps(a, b, (dst << 4) | (src << 6) | clr); }
+template<size_t dst, size_t src> __forceinline const sseb insert( const sseb& a, const sseb& b ) { return insert<dst, src, 0>(a, b); }
+template<size_t dst> __forceinline const sseb insert( const sseb& a, const bool b ) { return insert<dst,0>(a, sseb(b)); }
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+/// Reduction Operations
+////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__KERNEL_SSE41__)
+__forceinline size_t popcnt( const sseb& a ) { return __popcnt(_mm_movemask_ps(a)); }
+#else
+__forceinline size_t popcnt( const sseb& a ) { return bool(a[0])+bool(a[1])+bool(a[2])+bool(a[3]); }
+#endif
+
+__forceinline bool reduce_and( const sseb& a ) { return _mm_movemask_ps(a) == 0xf; }
+__forceinline bool reduce_or ( const sseb& a ) { return _mm_movemask_ps(a) != 0x0; }
+__forceinline bool all ( const sseb& b ) { return _mm_movemask_ps(b) == 0xf; }
+__forceinline bool any ( const sseb& b ) { return _mm_movemask_ps(b) != 0x0; }
+__forceinline bool none ( const sseb& b ) { return _mm_movemask_ps(b) == 0x0; }
+
+__forceinline size_t movemask( const sseb& a ) { return _mm_movemask_ps(a); }
+
+#endif
+
+CCL_NAMESPACE_END
+
+#endif
+
diff --git a/intern/cycles/util/util_ssef.h b/intern/cycles/util/util_ssef.h
new file mode 100644
index 00000000000..f4236cc616e
--- /dev/null
+++ b/intern/cycles/util/util_ssef.h
@@ -0,0 +1,588 @@
+/*
+ * Copyright 2011-2013 Intel Corporation
+ * Modifications Copyright 2014, Blender Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0(the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License
+ */
+
+#ifndef __UTIL_SSEF_H__
+#define __UTIL_SSEF_H__
+
+CCL_NAMESPACE_BEGIN
+
+#ifdef __KERNEL_SSE2__
+
+/*! 4-wide SSE float type.
+ *
+ * Thin wrapper around __m128 that converts implicitly to and from the raw
+ * register type, so it can be passed straight to _mm_* intrinsics. */
+struct ssef
+{
+ typedef sseb Mask; // mask type
+ typedef ssei Int; // int type
+ typedef ssef Float; // float type
+
+ enum { size = 4 }; // number of SIMD elements
+ /* the same 16 bytes viewed as an SSE register, as four floats or as four ints */
+ union { __m128 m128; float f[4]; int i[4]; }; // data
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Constructors, Assignment & Cast Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ /* default construction leaves the lanes uninitialized */
+ __forceinline ssef () {}
+ __forceinline ssef (const ssef& other) { m128 = other.m128; }
+ __forceinline ssef& operator=(const ssef& other) { m128 = other.m128; return *this; }
+
+ /* implicit conversions from and to the raw register type */
+ __forceinline ssef(const __m128 a) : m128(a) {}
+ __forceinline operator const __m128&(void) const { return m128; }
+ __forceinline operator __m128&(void) { return m128; }
+
+ /* replicate a into all four lanes */
+ __forceinline ssef (float a) : m128(_mm_set1_ps(a)) {}
+ /* a goes to lane 0, b to lane 1, c to lane 2, d to lane 3 (_mm_setr_ps order) */
+ __forceinline ssef (float a, float b, float c, float d) : m128(_mm_setr_ps(a, b, c, d)) {}
+
+ /* numeric int -> float conversion of each lane (_mm_cvtepi32_ps), not a bit
+  * reinterpretation; explicit so it is never picked up silently */
+ __forceinline explicit ssef(const __m128i a) : m128(_mm_cvtepi32_ps(a)) {}
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Loads and Stores
+ ////////////////////////////////////////////////////////////////////////////////
+
+ /* read one float from address a and replicate it into all four lanes */
+#if defined(__KERNEL_AVX__)
+ static __forceinline ssef broadcast(const void* const a) { return _mm_broadcast_ss((float*)a); }
+#else
+ static __forceinline ssef broadcast(const void* const a) { return _mm_set1_ps(*(float*)a); }
+#endif
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Array Access
+ ////////////////////////////////////////////////////////////////////////////////
+
+ /* per-lane access through the float view of the union; i must be < 4 (asserted) */
+ __forceinline const float& operator [](const size_t i) const { assert(i < 4); return f[i]; }
+ __forceinline float& operator [](const size_t i) { assert(i < 4); return f[i]; }
+};
+
+
+////////////////////////////////////////////////////////////////////////////////
+/// Unary Operators
+////////////////////////////////////////////////////////////////////////////////
+
+/* reinterpret the 128 bits of an integer register as four floats (no conversion) */
+__forceinline const ssef cast (const __m128i& a) { return _mm_castsi128_ps(a); }
+__forceinline const ssef operator +(const ssef& a) { return a; }
+/* negate by flipping the sign bit of every lane */
+__forceinline const ssef operator -(const ssef& a) { return _mm_xor_ps(a.m128, _mm_castsi128_ps(_mm_set1_epi32(0x80000000))); }
+/* absolute value by clearing the sign bit of every lane */
+__forceinline const ssef abs (const ssef& a) { return _mm_and_ps(a.m128, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff))); }
+#if defined(__KERNEL_SSE41__)
+/* -1.0 in lanes where a < 0, +1.0 in every other lane (including zero, and NaN since the compare is false) */
+__forceinline const ssef sign (const ssef& a) { return _mm_blendv_ps(ssef(1.0f), -ssef(1.0f), _mm_cmplt_ps(a,ssef(0.0f))); }
+#endif
+/* keep only the sign bit of every lane, all other bits zero */
+__forceinline const ssef signmsk (const ssef& a) { return _mm_and_ps(a.m128,_mm_castsi128_ps(_mm_set1_epi32(0x80000000))); }
+
+/* Reciprocal: hardware estimate r refined by one Newton-Raphson step, 2*r - r*r*a. */
+__forceinline const ssef rcp(const ssef& a)
+{
+ const __m128 est = _mm_rcp_ps(a.m128);
+ const __m128 twice_est = _mm_add_ps(est, est);
+ const __m128 est_sq = _mm_mul_ps(est, est);
+ return _mm_sub_ps(twice_est, _mm_mul_ps(est_sq, a.m128));
+}
+
+/* Lane-wise square. */
+__forceinline const ssef sqr(const ssef& a)
+{
+ return _mm_mul_ps(a.m128, a.m128);
+}
+
+/* Lane-wise square root. */
+__forceinline const ssef mm_sqrt(const ssef& a)
+{
+ return _mm_sqrt_ps(a.m128);
+}
+
+/* Reciprocal square root: hardware estimate r refined by one Newton-Raphson
+ * step, 1.5*r + (-0.5*a*r)*(r*r). */
+__forceinline const ssef rsqrt(const ssef& a)
+{
+ const __m128 est = _mm_rsqrt_ps(a.m128);
+ const __m128 three_halves = _mm_set1_ps(1.5f);
+ const __m128 minus_half = _mm_set1_ps(-0.5f);
+ const __m128 linear_term = _mm_mul_ps(three_halves, est);
+ const __m128 scaled = _mm_mul_ps(_mm_mul_ps(a.m128, minus_half), est);
+ return _mm_add_ps(linear_term, _mm_mul_ps(scaled, _mm_mul_ps(est, est)));
+}
+
+////////////////////////////////////////////////////////////////////////////////
+/// Binary Operators
+////////////////////////////////////////////////////////////////////////////////
+
+/* Arithmetic operators. Each comes in three forms: vector/vector, vector/scalar
+ * and scalar/vector; the scalar is replicated to all lanes before the operation. */
+__forceinline const ssef operator +(const ssef& a, const ssef& b) { return _mm_add_ps(a.m128, b.m128); }
+__forceinline const ssef operator +(const ssef& a, const float& b) { return a + ssef(b); }
+__forceinline const ssef operator +(const float& a, const ssef& b) { return ssef(a) + b; }
+
+__forceinline const ssef operator -(const ssef& a, const ssef& b) { return _mm_sub_ps(a.m128, b.m128); }
+__forceinline const ssef operator -(const ssef& a, const float& b) { return a - ssef(b); }
+__forceinline const ssef operator -(const float& a, const ssef& b) { return ssef(a) - b; }
+
+__forceinline const ssef operator *(const ssef& a, const ssef& b) { return _mm_mul_ps(a.m128, b.m128); }
+__forceinline const ssef operator *(const ssef& a, const float& b) { return a * ssef(b); }
+__forceinline const ssef operator *(const float& a, const ssef& b) { return ssef(a) * b; }
+
+__forceinline const ssef operator /(const ssef& a, const ssef& b) { return _mm_div_ps(a.m128,b.m128); }
+__forceinline const ssef operator /(const ssef& a, const float& b) { return a/ssef(b); }
+__forceinline const ssef operator /(const float& a, const ssef& b) { return ssef(a)/b; }
+
+/* Bitwise operators on the raw lane bits; the ssei overloads reinterpret the
+ * integer register as floats without converting its values. */
+__forceinline const ssef operator^(const ssef& a, const ssef& b) { return _mm_xor_ps(a.m128,b.m128); }
+__forceinline const ssef operator^(const ssef& a, const ssei& b) { return _mm_xor_ps(a.m128,_mm_castsi128_ps(b.m128)); }
+
+__forceinline const ssef operator&(const ssef& a, const ssef& b) { return _mm_and_ps(a.m128,b.m128); }
+__forceinline const ssef operator&(const ssef& a, const ssei& b) { return _mm_and_ps(a.m128,_mm_castsi128_ps(b.m128)); }
+
+/* (~a) & b: it is the FIRST operand that gets inverted */
+__forceinline const ssef andnot(const ssef& a, const ssef& b) { return _mm_andnot_ps(a.m128,b.m128); }
+
+/* Lane-wise minimum / maximum, with the same three operand forms as the arithmetic operators. */
+__forceinline const ssef min(const ssef& a, const ssef& b) { return _mm_min_ps(a.m128,b.m128); }
+__forceinline const ssef min(const ssef& a, const float& b) { return _mm_min_ps(a.m128,ssef(b)); }
+__forceinline const ssef min(const float& a, const ssef& b) { return _mm_min_ps(ssef(a),b.m128); }
+
+__forceinline const ssef max(const ssef& a, const ssef& b) { return _mm_max_ps(a.m128,b.m128); }
+__forceinline const ssef max(const ssef& a, const float& b) { return _mm_max_ps(a.m128,ssef(b)); }
+__forceinline const ssef max(const float& a, const ssef& b) { return _mm_max_ps(ssef(a),b.m128); }
+
+#if defined(__KERNEL_SSE41__)
+/* Lane-wise minimum of the raw bit patterns, compared as signed 32-bit integers. */
+__forceinline ssef mini(const ssef& a, const ssef& b)
+{
+ const __m128i lhs_bits = _mm_castps_si128(a.m128);
+ const __m128i rhs_bits = _mm_castps_si128(b.m128);
+ return _mm_castsi128_ps(_mm_min_epi32(lhs_bits, rhs_bits));
+}
+#endif
+
+#if defined(__KERNEL_SSE41__)
+/* Lane-wise maximum of the raw bit patterns, compared as signed 32-bit integers. */
+__forceinline ssef maxi(const ssef& a, const ssef& b)
+{
+ const __m128i lhs_bits = _mm_castps_si128(a.m128);
+ const __m128i rhs_bits = _mm_castps_si128(b.m128);
+ return _mm_castsi128_ps(_mm_max_epi32(lhs_bits, rhs_bits));
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+/// Ternary Operators
+////////////////////////////////////////////////////////////////////////////////
+
+/* Multiply-add helpers.
+ *
+ *   madd (a,b,c) =   a*b  + c
+ *   msub (a,b,c) =   a*b  - c
+ *   nmadd(a,b,c) = -(a*b) + c
+ *   nmsub(a,b,c) = -(a*b) - c
+ *
+ * With __KERNEL_AVX2__ these map onto the FMA intrinsics (single rounding);
+ * otherwise they are evaluated with a separate multiply and add/subtract, so
+ * the last bit may differ between the two builds. Both branches must implement
+ * the same formulas: the fallback used to have nmadd and nmsub swapped
+ * relative to _mm_fnmadd_ps / _mm_fnmsub_ps. */
+#if defined(__KERNEL_AVX2__)
+__forceinline const ssef madd (const ssef& a, const ssef& b, const ssef& c) { return _mm_fmadd_ps(a,b,c); }
+__forceinline const ssef msub (const ssef& a, const ssef& b, const ssef& c) { return _mm_fmsub_ps(a,b,c); }
+__forceinline const ssef nmadd(const ssef& a, const ssef& b, const ssef& c) { return _mm_fnmadd_ps(a,b,c); }
+__forceinline const ssef nmsub(const ssef& a, const ssef& b, const ssef& c) { return _mm_fnmsub_ps(a,b,c); }
+#else
+__forceinline const ssef madd (const ssef& a, const ssef& b, const ssef& c) { return a*b+c; }
+__forceinline const ssef msub (const ssef& a, const ssef& b, const ssef& c) { return a*b-c; }
+__forceinline const ssef nmadd(const ssef& a, const ssef& b, const ssef& c) { return c-a*b; }
+__forceinline const ssef nmsub(const ssef& a, const ssef& b, const ssef& c) { return -a*b-c;}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+/// Assignment Operators
+////////////////////////////////////////////////////////////////////////////////
+
+/* Compound assignment, expressed through the matching binary operator.
+ * Each one updates a in place and returns a reference to it. */
+__forceinline ssef& operator +=(ssef& a, const ssef& b) { a = a + b; return a; }
+__forceinline ssef& operator +=(ssef& a, const float& b) { a = a + b; return a; }
+
+__forceinline ssef& operator -=(ssef& a, const ssef& b) { a = a - b; return a; }
+__forceinline ssef& operator -=(ssef& a, const float& b) { a = a - b; return a; }
+
+__forceinline ssef& operator *=(ssef& a, const ssef& b) { a = a * b; return a; }
+__forceinline ssef& operator *=(ssef& a, const float& b) { a = a * b; return a; }
+
+__forceinline ssef& operator /=(ssef& a, const ssef& b) { a = a / b; return a; }
+__forceinline ssef& operator /=(ssef& a, const float& b) { a = a / b; return a; }
+
+////////////////////////////////////////////////////////////////////////////////
+/// Comparison Operators + Select
+////////////////////////////////////////////////////////////////////////////////
+
+/* Lane-wise comparisons producing an sseb mask. Every operator has vector/vector,
+ * vector/scalar and scalar/vector forms; scalars are replicated to all lanes. */
+__forceinline const sseb operator ==(const ssef& a, const ssef& b) { return _mm_cmpeq_ps(a.m128, b.m128); }
+__forceinline const sseb operator ==(const ssef& a, const float& b) { return a == ssef(b); }
+__forceinline const sseb operator ==(const float& a, const ssef& b) { return ssef(a) == b; }
+
+__forceinline const sseb operator !=(const ssef& a, const ssef& b) { return _mm_cmpneq_ps(a.m128, b.m128); }
+__forceinline const sseb operator !=(const ssef& a, const float& b) { return a != ssef(b); }
+__forceinline const sseb operator !=(const float& a, const ssef& b) { return ssef(a) != b; }
+
+__forceinline const sseb operator <(const ssef& a, const ssef& b) { return _mm_cmplt_ps(a.m128, b.m128); }
+__forceinline const sseb operator <(const ssef& a, const float& b) { return a < ssef(b); }
+__forceinline const sseb operator <(const float& a, const ssef& b) { return ssef(a) < b; }
+
+/* >= is implemented as "not less than" (_mm_cmpnlt_ps), so a lane compares true
+ * when either operand is NaN, unlike the scalar >= operator */
+__forceinline const sseb operator >=(const ssef& a, const ssef& b) { return _mm_cmpnlt_ps(a.m128, b.m128); }
+__forceinline const sseb operator >=(const ssef& a, const float& b) { return a >= ssef(b); }
+__forceinline const sseb operator >=(const float& a, const ssef& b) { return ssef(a) >= b; }
+
+/* > is implemented as "not less-or-equal" (_mm_cmpnle_ps), with the same NaN behaviour as >= above */
+__forceinline const sseb operator >(const ssef& a, const ssef& b) { return _mm_cmpnle_ps(a.m128, b.m128); }
+__forceinline const sseb operator >(const ssef& a, const float& b) { return a > ssef(b); }
+__forceinline const sseb operator >(const float& a, const ssef& b) { return ssef(a) > b; }
+
+__forceinline const sseb operator <=(const ssef& a, const ssef& b) { return _mm_cmple_ps(a.m128, b.m128); }
+__forceinline const sseb operator <=(const ssef& a, const float& b) { return a <= ssef(b); }
+__forceinline const sseb operator <=(const float& a, const ssef& b) { return ssef(a) <= b; }
+
+/* Per-lane choice driven by a boolean mask: lanes where m is set take t, the others take f. */
+__forceinline const ssef select(const sseb& m, const ssef& t, const ssef& f)
+{
+#ifndef __KERNEL_SSE41__
+ /* (m & t) | (~m & f) */
+ return _mm_or_ps(_mm_and_ps(m, t), _mm_andnot_ps(m, f));
+#else
+ return _mm_blendv_ps(f, t, m);
+#endif
+}
+
+/* Per-lane choice driven by a float-typed mask. The SSE4.1 path looks only at
+ * the sign bit of each mask lane, while the fallback combines all mask bits, so
+ * m should hold all-ones or all-zeros per lane for both paths to agree. */
+__forceinline const ssef select(const ssef& m, const ssef& t, const ssef& f)
+{
+#ifndef __KERNEL_SSE41__
+ /* (m & t) | (~m & f) */
+ return _mm_or_ps(_mm_and_ps(m, t), _mm_andnot_ps(m, f));
+#else
+ return _mm_blendv_ps(f, t, m);
+#endif
+}
+
+/* Per-lane choice driven by an integer mask. The immediate blend is used only
+ * with SSE4.1 on compilers other than clang and MSVC (or on the Intel compiler);
+ * everywhere else the mask is converted with sseb(mask) and the mask-based
+ * select is used. */
+__forceinline const ssef select(const int mask, const ssef& t, const ssef& f)
+{
+#if !defined(__KERNEL_SSE41__) || ((defined(__clang__) || defined(_MSC_VER)) && !defined(__INTEL_COMPILER))
+ return select(sseb(mask),t,f);
+#else
+ return _mm_blend_ps(f, t, mask);
+#endif
+}
+
+////////////////////////////////////////////////////////////////////////////////
+/// Rounding Functions
+////////////////////////////////////////////////////////////////////////////////
+
+/* Lane-wise rounding to an integral float value; only available with SSE4.1
+ * because all of them rely on _mm_round_ps. */
+#if defined(__KERNEL_SSE41__)
+__forceinline const ssef round_even(const ssef& a) { return _mm_round_ps(a, _MM_FROUND_TO_NEAREST_INT); }
+__forceinline const ssef round_down(const ssef& a) { return _mm_round_ps(a, _MM_FROUND_TO_NEG_INF ); }
+__forceinline const ssef round_up (const ssef& a) { return _mm_round_ps(a, _MM_FROUND_TO_POS_INF ); }
+__forceinline const ssef round_zero(const ssef& a) { return _mm_round_ps(a, _MM_FROUND_TO_ZERO ); }
+/* floor and ceil are the same operations as round_down and round_up */
+__forceinline const ssef floor (const ssef& a) { return _mm_round_ps(a, _MM_FROUND_TO_NEG_INF ); }
+__forceinline const ssef ceil (const ssef& a) { return _mm_round_ps(a, _MM_FROUND_TO_POS_INF ); }
+#endif
+
+/* Convert each lane to a 32-bit integer, truncating toward zero. */
+__forceinline ssei truncatei(const ssef& a)
+{
+ const __m128i truncated = _mm_cvttps_epi32(a.m128);
+ return truncated;
+}
+
+/* Convert each lane to the largest 32-bit integer not greater than it.
+ *
+ * The SSE2 fallback truncates toward zero and then subtracts one in the lanes
+ * where truncation rounded up (negative non-integers). The previous fallback,
+ * ssei(a - 0.5f), went through the round-to-nearest-even conversion and so
+ * returned the wrong value for exact odd integers (1.0 -> 0, -1.0 -> -2).
+ * Inputs outside the int32 range do not produce a meaningful result. */
+__forceinline ssei floori(const ssef& a) {
+#if defined(__KERNEL_SSE41__)
+ return ssei(floor(a));
+#else
+ const __m128i truncated = _mm_cvttps_epi32(a.m128);
+ /* all-ones (integer -1) in lanes where a < truncated, zero elsewhere */
+ const __m128 rounded_up = _mm_cmplt_ps(a.m128, _mm_cvtepi32_ps(truncated));
+ return _mm_add_epi32(truncated, _mm_castps_si128(rounded_up));
+#endif
+}
+
+////////////////////////////////////////////////////////////////////////////////
+/// Movement/Shifting/Shuffling Functions
+////////////////////////////////////////////////////////////////////////////////
+
+/* interleave the two low lanes of a and b: (a0, b0, a1, b1) */
+__forceinline ssef unpacklo(const ssef& a, const ssef& b) { return _mm_unpacklo_ps(a.m128, b.m128); }
+/* interleave the two high lanes of a and b: (a2, b2, a3, b3) */
+__forceinline ssef unpackhi(const ssef& a, const ssef& b) { return _mm_unpackhi_ps(a.m128, b.m128); }
+
+/* One-operand shuffle: result lane k is b[ik]. Implemented with the integer
+ * shuffle on the reinterpreted register. */
+template<size_t i0, size_t i1, size_t i2, size_t i3> __forceinline const ssef shuffle(const ssef& b) {
+ return _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(b), _MM_SHUFFLE(i3, i2, i1, i0)));
+}
+
+/* Two-operand shuffle: result is (a[i0], a[i1], b[i2], b[i3]). */
+template<size_t i0, size_t i1, size_t i2, size_t i3> __forceinline const ssef shuffle(const ssef& a, const ssef& b) {
+ return _mm_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0));
+}
+
+#if defined(__KERNEL_SSSE3__)
+/* Byte-granular shuffle of a, with the byte selectors taken from shuf at run time. */
+__forceinline const ssef shuffle8(const ssef& a, const ssei& shuf) {
+ return _mm_castsi128_ps(_mm_shuffle_epi8(_mm_castps_si128(a), shuf));
+}
+#endif
+
+/* SSE3 specialisations of the one-operand shuffle for patterns that have a
+ * dedicated duplicate-move instruction. */
+#if defined(__KERNEL_SSE3__)
+template<> __forceinline const ssef shuffle<0, 0, 2, 2>(const ssef& b) { return _mm_moveldup_ps(b); }
+template<> __forceinline const ssef shuffle<1, 1, 3, 3>(const ssef& b) { return _mm_movehdup_ps(b); }
+template<> __forceinline const ssef shuffle<0, 1, 0, 1>(const ssef& b) { return _mm_castpd_ps(_mm_movedup_pd(_mm_castps_pd(b))); }
+#endif
+
+/* Broadcast lane i0 of b to all four lanes. */
+template<size_t i0> __forceinline const ssef shuffle(const ssef& b) {
+ return shuffle<i0,i0,i0,i0>(b);
+}
+
+/* Read lane i of a as a scalar float. */
+#if defined(__KERNEL_SSE41__) && !defined(__GNUC__)
+/* NOTE(review): _mm_extract_ps is documented as returning the lane's bit pattern
+ * as an int, whereas _mm_cvtss_f32 takes a __m128 -- confirm that this branch
+ * actually compiles when instantiated on the non-GNU compilers it targets. */
+template<size_t i> __forceinline float extract (const ssef& a) { return _mm_cvtss_f32(_mm_extract_ps(a,i)); }
+#else
+/* broadcast lane i to every lane, then read lane 0 */
+template<size_t i> __forceinline float extract (const ssef& a) { return _mm_cvtss_f32(shuffle<i,i,i,i>(a)); }
+#endif
+/* lane 0 needs no shuffle */
+template<> __forceinline float extract<0>(const ssef& a) { return _mm_cvtss_f32(a); }
+
+#if defined(__KERNEL_SSE41__)
+/* Lane insertion built on _mm_insert_ps. The immediate is assembled as
+ * (dst << 4) | (src << 6) | clr: destination lane index in bits 4-5, source lane
+ * index in bits 6-7 and the per-lane zeroing mask in bits 0-3. */
+template<size_t dst, size_t src, size_t clr> __forceinline const ssef insert(const ssef& a, const ssef& b) { return _mm_insert_ps(a, b,(dst << 4) |(src << 6) | clr); }
+/* Same as the three-parameter form with no lanes zeroed (clr = 0). */
+template<size_t dst, size_t src> __forceinline const ssef insert(const ssef& a, const ssef& b) { return insert<dst, src, 0>(a, b); }
+/* Copy of a with lane dst replaced by the scalar b. */
+template<size_t dst> __forceinline const ssef insert(const ssef& a, const float b) { return insert<dst, 0>(a, _mm_set_ss(b)); }
+#else
+/* Scalar fallback: only the float-insert form exists; it copies a and overwrites lane dst. */
+template<size_t dst> __forceinline const ssef insert(const ssef& a, const float b) { ssef c = a; c[dst] = b; return c; }
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+/// Transpose
+////////////////////////////////////////////////////////////////////////////////
+
+/* Transpose a 4x4 block: rows r0..r3 in, columns c0..c3 out. All inputs are
+ * read before any output is written. */
+__forceinline void transpose(const ssef& r0, const ssef& r1, const ssef& r2, const ssef& r3, ssef& c0, ssef& c1, ssef& c2, ssef& c3)
+{
+ /* first pass: interleave rows 0/2 and rows 1/3 */
+ const ssef rows02_lo = unpacklo(r0,r2);
+ const ssef rows02_hi = unpackhi(r0,r2);
+ const ssef rows13_lo = unpacklo(r1,r3);
+ const ssef rows13_hi = unpackhi(r1,r3);
+
+ /* second pass: interleave the partial results into full columns */
+ c0 = unpacklo(rows02_lo, rows13_lo);
+ c1 = unpackhi(rows02_lo, rows13_lo);
+ c2 = unpacklo(rows02_hi, rows13_hi);
+ c3 = unpackhi(rows02_hi, rows13_hi);
+}
+
+/* Transpose a 4x4 block but produce only the first three columns (c0..c2). */
+__forceinline void transpose(const ssef& r0, const ssef& r1, const ssef& r2, const ssef& r3, ssef& c0, ssef& c1, ssef& c2)
+{
+ /* first pass: interleave rows 0/2 and rows 1/3 */
+ const ssef rows02_lo = unpacklo(r0,r2);
+ const ssef rows02_hi = unpackhi(r0,r2);
+ const ssef rows13_lo = unpacklo(r1,r3);
+ const ssef rows13_hi = unpackhi(r1,r3);
+
+ /* second pass: interleave the partial results into the three columns */
+ c0 = unpacklo(rows02_lo, rows13_lo);
+ c1 = unpackhi(rows02_lo, rows13_lo);
+ c2 = unpacklo(rows02_hi, rows13_hi);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+/// Reductions
+////////////////////////////////////////////////////////////////////////////////
+
+/* Horizontal reductions that leave the result in every lane: first combine
+ * neighbouring lanes, then combine the two halves. */
+__forceinline const ssef vreduce_min(const ssef& v)
+{
+ const ssef pairwise = min(shuffle<1,0,3,2>(v),v);
+ return min(shuffle<2,3,0,1>(pairwise),pairwise);
+}
+
+__forceinline const ssef vreduce_max(const ssef& v)
+{
+ const ssef pairwise = max(shuffle<1,0,3,2>(v),v);
+ return max(shuffle<2,3,0,1>(pairwise),pairwise);
+}
+
+__forceinline const ssef vreduce_add(const ssef& v)
+{
+ const ssef pairwise = shuffle<1,0,3,2>(v) + v;
+ return shuffle<2,3,0,1>(pairwise) + pairwise;
+}
+
+/* Scalar versions: lane 0 of the corresponding vector reduction. */
+__forceinline float reduce_min(const ssef& v)
+{
+ return _mm_cvtss_f32(vreduce_min(v));
+}
+
+__forceinline float reduce_max(const ssef& v)
+{
+ return _mm_cvtss_f32(vreduce_max(v));
+}
+
+__forceinline float reduce_add(const ssef& v)
+{
+ return _mm_cvtss_f32(vreduce_add(v));
+}
+
+/* Index of the lowest lane that holds the minimum / maximum. */
+__forceinline size_t select_min(const ssef& v)
+{
+ const sseb at_min = (v == vreduce_min(v));
+ return __bsf(movemask(at_min));
+}
+
+__forceinline size_t select_max(const ssef& v)
+{
+ const sseb at_max = (v == vreduce_max(v));
+ return __bsf(movemask(at_max));
+}
+
+/* Same, restricted to the lanes enabled in valid: disabled lanes are replaced
+ * by +inf / -inf before reducing and masked out of the final match. */
+__forceinline size_t select_min(const sseb& valid, const ssef& v)
+{
+ const ssef candidates = select(valid,v,ssef(pos_inf));
+ return __bsf(movemask(valid &(candidates == vreduce_min(candidates))));
+}
+
+__forceinline size_t select_max(const sseb& valid, const ssef& v)
+{
+ const ssef candidates = select(valid,v,ssef(neg_inf));
+ return __bsf(movemask(valid &(candidates == vreduce_max(candidates))));
+}
+
+////////////////////////////////////////////////////////////////////////////////
+/// Memory load and store operations
+////////////////////////////////////////////////////////////////////////////////
+
+/* Load four floats starting at a.x. The aligned load is used only when
+ * __KERNEL_WITH_SSE_ALIGN__ is defined. */
+__forceinline ssef load4f(const float4& a)
+{
+#ifndef __KERNEL_WITH_SSE_ALIGN__
+ return _mm_loadu_ps(&a.x);
+#else
+ return _mm_load_ps(&a.x);
+#endif
+}
+
+/* Load four floats starting at a.x of a float3, so 16 bytes are read from the
+ * address of a.x. The aligned load is used only when __KERNEL_WITH_SSE_ALIGN__
+ * is defined. */
+__forceinline ssef load4f(const float3& a)
+{
+#ifndef __KERNEL_WITH_SSE_ALIGN__
+ return _mm_loadu_ps(&a.x);
+#else
+ return _mm_load_ps(&a.x);
+#endif
+}
+
+/* Load four floats from a raw pointer with the aligned load instruction; this
+ * overload always uses _mm_load_ps, so a must be 16-byte aligned. */
+__forceinline ssef load4f(const void* const a) {
+ return _mm_load_ps((float*)a);
+}
+
+/* Put a into lane 0 and zero the other three lanes. */
+__forceinline ssef load1f_first(const float a) {
+ return _mm_set_ss(a);
+}
+
+/* Store four floats with the aligned store instruction; ptr must be 16-byte aligned. */
+__forceinline void store4f(void* ptr, const ssef& v) {
+ _mm_store_ps((float*)ptr,v);
+}
+
+/* Load four floats from a pointer with no alignment requirement. */
+__forceinline ssef loadu4f(const void* const a) {
+ return _mm_loadu_ps((float*)a);
+}
+
+/* Store four floats to a pointer with no alignment requirement. */
+__forceinline void storeu4f(void* ptr, const ssef& v) {
+ _mm_storeu_ps((float*)ptr,v);
+}
+
+/* Masked store: only the lanes enabled in mask are written to ptr. Without
+ * AVX this is done as a read-modify-write of the whole 16 bytes. */
+__forceinline void store4f(const sseb& mask, void* ptr, const ssef& f)
+{
+#if !defined(__KERNEL_AVX__)
+ ssef* const dst = (ssef*)ptr;
+ *dst = select(mask,f,*dst);
+#else
+ _mm_maskstore_ps((float*)ptr,(__m128i)mask,f);
+#endif
+}
+
+/* Load four floats, using the streaming (non-temporal) load when SSE4.1 is
+ * available and a regular aligned load otherwise. */
+__forceinline ssef load4f_nt(void* ptr)
+{
+#if !defined(__KERNEL_SSE41__)
+ return _mm_load_ps((float*)ptr);
+#else
+ return _mm_castsi128_ps(_mm_stream_load_si128((__m128i*)ptr));
+#endif
+}
+
+/* Store four floats, using the streaming (non-temporal) store when SSE4.1 is
+ * available and a regular aligned store otherwise. */
+__forceinline void store4f_nt(void* ptr, const ssef& v)
+{
+#if !defined(__KERNEL_SSE41__)
+ _mm_store_ps((float*)ptr,v);
+#else
+ _mm_stream_ps((float*)ptr,v);
+#endif
+}
+
+////////////////////////////////////////////////////////////////////////////////
+/// Euclidean Space Operators
+////////////////////////////////////////////////////////////////////////////////
+
+/* Four-component dot product: horizontal sum of the lane-wise products. */
+__forceinline float dot(const ssef& a, const ssef& b)
+{
+ const ssef products = a*b;
+ return reduce_add(products);
+}
+
+/* Cross product with rotated lanes: the result holds (cross.z, cross.x, cross.y)
+ * in lanes 0..2. Useful when the order of the components does not matter, as it
+ * saves the final shuffle. */
+__forceinline ssef cross_zxy(const ssef& a, const ssef& b)
+{
+ const ssef b_yzx = shuffle<1,2,0,3>(b);
+ const ssef a_yzx = shuffle<1,2,0,3>(a);
+ /* a * b_yzx - a_yzx * b */
+ return msub(a, b_yzx, a_yzx*b);
+}
+
+/* Cross product of the xyz lanes, with components in the usual x, y, z order. */
+__forceinline ssef cross(const ssef& a, const ssef& b)
+{
+ const ssef rotated = cross_zxy(a, b);
+ return shuffle<1,2,0,3>(rotated);
+}
+
+/* Dot product of the first three lanes, replicated into all four lanes of the result. */
+ccl_device_inline const ssef dot3_splat(const ssef& a, const ssef& b)
+{
+#ifndef __KERNEL_SSE41__
+ const ssef products = a * b;
+ return ssef(products.f[0] + products.f[1] + products.f[2]);
+#else
+ /* 0x7f: multiply lanes 0..2, write the sum to all four lanes */
+ return _mm_dp_ps(a.m128, b.m128, 0x7f);
+#endif
+}
+
+/* Squared length taking only the specified axes into account.
+ *
+ * X, Y, Z and W are flags: any non-zero value includes the corresponding lane.
+ * Both code paths treat the flags as booleans; the SSE4.1 path used to shift the
+ * raw values into the dot-product mask, which only matched the scalar path for
+ * flags that were exactly 0 or 1. The scalar path reads the lanes through the
+ * ssef float view instead of casting the const reference to float4. */
+template<size_t X, size_t Y, size_t Z, size_t W>
+ccl_device_inline float len_squared(const ssef& a)
+{
+#ifndef __KERNEL_SSE41__
+ return (X ? a.f[0] * a.f[0] : 0.0f) + (Y ? a.f[1] * a.f[1] : 0.0f) + (Z ? a.f[2] * a.f[2] : 0.0f) + (W ? a.f[3] * a.f[3] : 0.0f);
+#else
+ /* bits 4-7 select the lanes that are multiplied, bits 0-3 the lanes that receive the sum */
+ return extract<0>(ssef(_mm_dp_ps(a.m128, a.m128, (X ? 0x10 : 0) | (Y ? 0x20 : 0) | (Z ? 0x40 : 0) | (W ? 0x80 : 0) | 0xf)));
+#endif
+}
+
+/* Dot product of the first three lanes, returned as a scalar. */
+ccl_device_inline float dot3(const ssef& a, const ssef& b)
+{
+#ifndef __KERNEL_SSE41__
+ const ssef products = a * b;
+ return products.f[0] + products.f[1] + products.f[2];
+#else
+ /* 0x7f: multiply lanes 0..2, write the sum to all four lanes */
+ return extract<0>(ssef(_mm_dp_ps(a.m128, b.m128, 0x7f)));
+#endif
+}
+
+/* Squared length of the first three lanes, replicated into all four lanes. */
+ccl_device_inline const ssef len3_squared_splat(const ssef& a)
+{
+ const ssef squared = dot3_splat(a, a);
+ return squared;
+}
+
+/* Squared length of the first three lanes, as a scalar. */
+ccl_device_inline float len3_squared(const ssef& a)
+{
+ const float squared = dot3(a, a);
+ return squared;
+}
+
+/* Length of the first three lanes, as a scalar. */
+ccl_device_inline float len3(const ssef& a)
+{
+ const ssef squared = dot3_splat(a, a);
+ return extract<0>(mm_sqrt(squared));
+}
+
+/* SSE shuffle utility functions
+ *
+ * shuffle_swap() either returns its input unchanged or with the low and high
+ * 64-bit halves exchanged, depending on a run-time selector of type
+ * shuffle_swap_t obtained from shuffle_swap_identity() / shuffle_swap_swap(). */
+
+#ifdef __KERNEL_SSSE3__
+
+/* SSSE3: the selector is a byte-shuffle control vector */
+typedef ssei shuffle_swap_t;
+
+ccl_device_inline const shuffle_swap_t shuffle_swap_identity(void)
+{
+ /* destination byte k takes source byte k */
+ return _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8,
+                     7, 6, 5, 4, 3, 2, 1, 0);
+}
+
+ccl_device_inline const shuffle_swap_t shuffle_swap_swap(void)
+{
+ /* low eight destination bytes take source bytes 8..15 and vice versa */
+ return _mm_set_epi8(7, 6, 5, 4, 3, 2, 1, 0,
+                     15, 14, 13, 12, 11, 10, 9, 8);
+}
+
+ccl_device_inline const ssef shuffle_swap(const ssef& a, const shuffle_swap_t& shuf)
+{
+ const __m128i permuted = _mm_shuffle_epi8(cast(a), shuf);
+ return cast(permuted);
+}
+
+#else
+
+/* SSE2: the selector is a plain flag, 0 for identity and 1 for swap */
+typedef int shuffle_swap_t;
+
+ccl_device_inline const shuffle_swap_t shuffle_swap_identity(void)
+{
+ return 0;
+}
+
+ccl_device_inline const shuffle_swap_t shuffle_swap_swap(void)
+{
+ return 1;
+}
+
+ccl_device_inline const ssef shuffle_swap(const ssef& a, shuffle_swap_t shuf)
+{
+ /* the shuffle pattern must be a compile-time constant, hence the branch */
+ if(!shuf) {
+  return ssef(_mm_shuffle_ps(a.m128, a.m128, _MM_SHUFFLE(3, 2, 1, 0)));
+ }
+ return ssef(_mm_shuffle_ps(a.m128, a.m128, _MM_SHUFFLE(1, 0, 3, 2)));
+}
+
+#endif
+
+/* gen_idirsplat_swap: for each axis of idir, produce
+ *   - idirsplat[k]:  the axis value replicated to all lanes and XOR-ed with pn
+ *                    (presumably a per-lane sign pattern -- confirm against callers);
+ *   - shufflexyz[k]: shuf_swap when the axis value is negative, shuf_identity otherwise.
+ *
+ * The two variants decide "negative" differently: the SSE4.1 one tests the sign
+ * bit, the fallback tests idir >= 0. They therefore disagree for negative zero
+ * (sign bit set, but >= 0 is true) and may disagree for NaN. */
+#ifdef __KERNEL_SSE41__
+
+ccl_device_inline void gen_idirsplat_swap(const ssef &pn, const shuffle_swap_t &shuf_identity, const shuffle_swap_t &shuf_swap,
+ const float3& idir, ssef idirsplat[3], shuffle_swap_t shufflexyz[3])
+{
+ const __m128 idirsplat_raw[] = { _mm_set_ps1(idir.x), _mm_set_ps1(idir.y), _mm_set_ps1(idir.z) };
+ idirsplat[0] = _mm_xor_ps(idirsplat_raw[0], pn);
+ idirsplat[1] = _mm_xor_ps(idirsplat_raw[1], pn);
+ idirsplat[2] = _mm_xor_ps(idirsplat_raw[2], pn);
+
+ const ssef signmask = cast(ssei(0x80000000));
+ const ssef shuf_identity_f = cast(shuf_identity);
+ const ssef shuf_swap_f = cast(shuf_swap);
+
+ /* branch-free selection: the blend picks shuf_swap in lanes whose sign bit is set */
+ shufflexyz[0] = _mm_castps_si128(_mm_blendv_ps(shuf_identity_f, shuf_swap_f, _mm_and_ps(idirsplat_raw[0], signmask)));
+ shufflexyz[1] = _mm_castps_si128(_mm_blendv_ps(shuf_identity_f, shuf_swap_f, _mm_and_ps(idirsplat_raw[1], signmask)));
+ shufflexyz[2] = _mm_castps_si128(_mm_blendv_ps(shuf_identity_f, shuf_swap_f, _mm_and_ps(idirsplat_raw[2], signmask)));
+}
+
+#else
+
+ccl_device_inline void gen_idirsplat_swap(const ssef &pn, const shuffle_swap_t &shuf_identity, const shuffle_swap_t &shuf_swap,
+ const float3& idir, ssef idirsplat[3], shuffle_swap_t shufflexyz[3])
+{
+ idirsplat[0] = ssef(idir.x) ^ pn;
+ idirsplat[1] = ssef(idir.y) ^ pn;
+ idirsplat[2] = ssef(idir.z) ^ pn;
+
+ /* scalar selection per axis */
+ shufflexyz[0] = (idir.x >= 0)? shuf_identity: shuf_swap;
+ shufflexyz[1] = (idir.y >= 0)? shuf_identity: shuf_swap;
+ shufflexyz[2] = (idir.z >= 0)? shuf_identity: shuf_swap;
+}
+
+#endif
+
+/* Convert four unsigned 32-bit integers to float. SSE2 only has a signed
+ * conversion, so the value is split in two 16-bit halves:
+ *   - the low half is small enough for the signed conversion;
+ *   - the high half is OR-ed into the mantissa of 0x53000000 (the float 2^39),
+ *     giving 2^39 + hi * 2^16, and 2^39 is subtracted again.
+ * The two partial results are then added. */
+ccl_device_inline const ssef uint32_to_float(const ssei &in)
+{
+ const __m128i magic = _mm_set1_epi32(0x53000000);
+ const __m128i high_half = _mm_srli_epi32(in, 16);
+ const __m128i low_half = _mm_and_si128(in, _mm_set1_epi32(0x0000ffff));
+
+ const __m128 low_float = _mm_cvtepi32_ps(low_half);
+ const __m128 high_biased = _mm_castsi128_ps(_mm_or_si128(high_half, magic));
+ const __m128 high_float = _mm_sub_ps(high_biased, _mm_castsi128_ps(magic));
+
+ return _mm_add_ps(high_float, low_float);
+}
+
+/* XOR bit 31 of lane k with Sk. Despite the name, a flag of 1 toggles the sign
+ * bit of that lane rather than forcing it on; a flag of 0 leaves the lane alone. */
+template<size_t S1, size_t S2, size_t S3, size_t S4>
+ccl_device_inline const ssef set_sign_bit(const ssef &a)
+{
+ ssei sign_flips(S1 << 31, S2 << 31, S3 << 31, S4 << 31);
+ return a ^ cast(sign_flips);
+}
+
+#endif
+
+CCL_NAMESPACE_END
+
+#endif
+
diff --git a/intern/cycles/util/util_ssei.h b/intern/cycles/util/util_ssei.h
new file mode 100644
index 00000000000..5f5a8686e35
--- /dev/null
+++ b/intern/cycles/util/util_ssei.h
@@ -0,0 +1,294 @@
+/*
+ * Copyright 2011-2013 Intel Corporation
+ * Modifications Copyright 2014, Blender Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __UTIL_SSEI_H__
+#define __UTIL_SSEI_H__
+
+CCL_NAMESPACE_BEGIN
+
+#ifdef __KERNEL_SSE2__
+
+/*! 4-wide SSE integer type.
+ *
+ * Wraps one __m128i holding four 32 bit signed integer lanes. The same storage
+ * is reachable as the raw register (m128) or per lane (i[0..3]) through the
+ * anonymous union. Converts implicitly to and from __m128i, so instances can
+ * be passed straight to integer SSE intrinsics. */
+struct ssei
+{
+ typedef sseb Mask; // mask type
+ typedef ssei Int; // int type
+ typedef ssef Float; // float type
+
+ enum { size = 4 }; // number of SIMD elements
+ union { __m128i m128; int32_t i[4]; }; // data: register view and per-lane view of the same 16 bytes
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Constructors, Assignment & Cast Operators
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline ssei ( ) {} // default construction leaves all lanes uninitialized
+ __forceinline ssei ( const ssei& a ) { m128 = a.m128; }
+ __forceinline ssei& operator=( const ssei& a ) { m128 = a.m128; return *this; }
+
+ __forceinline ssei( const __m128i a ) : m128(a) {} // implicit wrap of a raw register
+ __forceinline operator const __m128i&( void ) const { return m128; } // implicit unwrap (read-only)
+ __forceinline operator __m128i&( void ) { return m128; } // implicit unwrap (mutable)
+
+ __forceinline ssei ( const int a ) : m128(_mm_set1_epi32(a)) {} // broadcast a to all four lanes
+ __forceinline ssei ( int a, int b, int c, int d ) : m128(_mm_setr_epi32(a, b, c, d)) {} // a lands in lane 0, d in lane 3
+
+ __forceinline explicit ssei( const __m128 a ) : m128(_mm_cvtps_epi32(a)) {} // numeric float -> int conversion per lane, not a bit reinterpretation
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Array Access
+ ////////////////////////////////////////////////////////////////////////////////
+
+ __forceinline const int32_t& operator []( const size_t index ) const { assert(index < 4); return i[index]; } // lane read; index is only checked by assert
+ __forceinline int32_t& operator []( const size_t index ) { assert(index < 4); return i[index]; } // lane write; index is only checked by assert
+};
+
+////////////////////////////////////////////////////////////////////////////////
+/// Unary Operators: bit cast from float lanes, identity, negation, absolute value
+////////////////////////////////////////////////////////////////////////////////
+
+__forceinline const ssei cast ( const __m128& a ) { return _mm_castps_si128(a); } // reinterprets the 128 bits as integers; no value conversion
+__forceinline const ssei operator +( const ssei& a ) { return a; }
+__forceinline const ssei operator -( const ssei& a ) { return _mm_sub_epi32(_mm_setzero_si128(), a.m128); } // computed as 0 - a per lane
+#if defined(__KERNEL_SSSE3__)
+__forceinline const ssei abs ( const ssei& a ) { return _mm_abs_epi32(a.m128); } // only available when __KERNEL_SSSE3__ is defined
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+/// Binary Operators: lane-wise arithmetic, bitwise logic, shifts and min/max.
+/// Each operator has a vector/vector form; the vector/scalar forms broadcast
+/// the int32_t operand with ssei(b) and forward to the vector/vector form.
+////////////////////////////////////////////////////////////////////////////////
+
+__forceinline const ssei operator +( const ssei& a, const ssei& b ) { return _mm_add_epi32(a.m128, b.m128); }
+__forceinline const ssei operator +( const ssei& a, const int32_t& b ) { return a + ssei(b); }
+__forceinline const ssei operator +( const int32_t& a, const ssei& b ) { return ssei(a) + b; }
+
+__forceinline const ssei operator -( const ssei& a, const ssei& b ) { return _mm_sub_epi32(a.m128, b.m128); }
+__forceinline const ssei operator -( const ssei& a, const int32_t& b ) { return a - ssei(b); }
+__forceinline const ssei operator -( const int32_t& a, const ssei& b ) { return ssei(a) - b; }
+
+#if defined(__KERNEL_SSE41__) // multiplication is only declared when __KERNEL_SSE41__ is defined
+__forceinline const ssei operator *( const ssei& a, const ssei& b ) { return _mm_mullo_epi32(a.m128, b.m128); } // keeps the low 32 bits of each product
+__forceinline const ssei operator *( const ssei& a, const int32_t& b ) { return a * ssei(b); }
+__forceinline const ssei operator *( const int32_t& a, const ssei& b ) { return ssei(a) * b; }
+#endif
+
+__forceinline const ssei operator &( const ssei& a, const ssei& b ) { return _mm_and_si128(a.m128, b.m128); }
+__forceinline const ssei operator &( const ssei& a, const int32_t& b ) { return a & ssei(b); }
+__forceinline const ssei operator &( const int32_t& a, const ssei& b ) { return ssei(a) & b; }
+
+__forceinline const ssei operator |( const ssei& a, const ssei& b ) { return _mm_or_si128(a.m128, b.m128); }
+__forceinline const ssei operator |( const ssei& a, const int32_t& b ) { return a | ssei(b); }
+__forceinline const ssei operator |( const int32_t& a, const ssei& b ) { return ssei(a) | b; }
+
+__forceinline const ssei operator ^( const ssei& a, const ssei& b ) { return _mm_xor_si128(a.m128, b.m128); }
+__forceinline const ssei operator ^( const ssei& a, const int32_t& b ) { return a ^ ssei(b); }
+__forceinline const ssei operator ^( const int32_t& a, const ssei& b ) { return ssei(a) ^ b; }
+
+__forceinline const ssei operator <<( const ssei& a, const int32_t& n ) { return _mm_slli_epi32(a.m128, n); } // same shift count for all lanes
+__forceinline const ssei operator >>( const ssei& a, const int32_t& n ) { return _mm_srai_epi32(a.m128, n); } // arithmetic shift: the sign bit is replicated (same as sra)
+
+__forceinline const ssei andnot(const ssei& a, const ssei& b) { return _mm_andnot_si128(a.m128,b.m128); } // (~a) & b: the FIRST operand is the one inverted
+__forceinline const ssei andnot(const sseb& a, const ssei& b) { return _mm_andnot_si128(cast(a.m128),b.m128); } // mask bits are reinterpreted as integers first
+__forceinline const ssei andnot(const ssei& a, const sseb& b) { return _mm_andnot_si128(a.m128,cast(b.m128)); }
+
+__forceinline const ssei sra ( const ssei& a, const int32_t& b ) { return _mm_srai_epi32(a.m128, b); } // shift right, sign-filling
+__forceinline const ssei srl ( const ssei& a, const int32_t& b ) { return _mm_srli_epi32(a.m128, b); } // shift right, zero-filling
+
+#if defined(__KERNEL_SSE41__) // vector min/max are only declared when __KERNEL_SSE41__ is defined
+__forceinline const ssei min( const ssei& a, const ssei& b ) { return _mm_min_epi32(a.m128, b.m128); } // signed comparison
+__forceinline const ssei min( const ssei& a, const int32_t& b ) { return min(a,ssei(b)); }
+__forceinline const ssei min( const int32_t& a, const ssei& b ) { return min(ssei(a),b); }
+
+__forceinline const ssei max( const ssei& a, const ssei& b ) { return _mm_max_epi32(a.m128, b.m128); } // signed comparison
+__forceinline const ssei max( const ssei& a, const int32_t& b ) { return max(a,ssei(b)); }
+__forceinline const ssei max( const int32_t& a, const ssei& b ) { return max(ssei(a),b); }
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+/// Assignment Operators
+////////////////////////////////////////////////////////////////////////////////
+
+/* Compound assignments: each one evaluates the matching binary operator,
+ * stores the result back into the left operand and returns that operand. */
+__forceinline ssei& operator +=( ssei& a, const ssei& b ) { a = a + b; return a; }
+__forceinline ssei& operator +=( ssei& a, const int32_t& b ) { a = a + b; return a; }
+
+__forceinline ssei& operator -=( ssei& a, const ssei& b ) { a = a - b; return a; }
+__forceinline ssei& operator -=( ssei& a, const int32_t& b ) { a = a - b; return a; }
+
+#if defined(__KERNEL_SSE41__)
+/* Only declared together with operator *, which needs __KERNEL_SSE41__. */
+__forceinline ssei& operator *=( ssei& a, const ssei& b ) { a = a * b; return a; }
+__forceinline ssei& operator *=( ssei& a, const int32_t& b ) { a = a * b; return a; }
+#endif
+
+__forceinline ssei& operator &=( ssei& a, const ssei& b ) { a = a & b; return a; }
+__forceinline ssei& operator &=( ssei& a, const int32_t& b ) { a = a & b; return a; }
+
+__forceinline ssei& operator |=( ssei& a, const ssei& b ) { a = a | b; return a; }
+__forceinline ssei& operator |=( ssei& a, const int32_t& b ) { a = a | b; return a; }
+
+__forceinline ssei& operator <<=( ssei& a, const int32_t& b ) { a = a << b; return a; }
+__forceinline ssei& operator >>=( ssei& a, const int32_t& b ) { a = a >> b; return a; }
+
+////////////////////////////////////////////////////////////////////////////////
+/// Comparison Operators + Select: lane-wise comparisons returning an sseb mask.
+/// ==, < and > map to one intrinsic each; !=, >= and <= negate the opposite
+/// comparison. Scalar operands are broadcast with ssei(b) first.
+////////////////////////////////////////////////////////////////////////////////
+
+__forceinline const sseb operator ==( const ssei& a, const ssei& b ) { return _mm_castsi128_ps(_mm_cmpeq_epi32 (a.m128, b.m128)); } // integer compare result reinterpreted as the float-typed mask
+__forceinline const sseb operator ==( const ssei& a, const int32_t& b ) { return a == ssei(b); }
+__forceinline const sseb operator ==( const int32_t& a, const ssei& b ) { return ssei(a) == b; }
+
+__forceinline const sseb operator !=( const ssei& a, const ssei& b ) { return !(a == b); } // relies on operator! of sseb, defined outside this file
+__forceinline const sseb operator !=( const ssei& a, const int32_t& b ) { return a != ssei(b); }
+__forceinline const sseb operator !=( const int32_t& a, const ssei& b ) { return ssei(a) != b; }
+
+__forceinline const sseb operator < ( const ssei& a, const ssei& b ) { return _mm_castsi128_ps(_mm_cmplt_epi32 (a.m128, b.m128)); } // signed comparison
+__forceinline const sseb operator < ( const ssei& a, const int32_t& b ) { return a < ssei(b); }
+__forceinline const sseb operator < ( const int32_t& a, const ssei& b ) { return ssei(a) < b; }
+
+__forceinline const sseb operator >=( const ssei& a, const ssei& b ) { return !(a < b); }
+__forceinline const sseb operator >=( const ssei& a, const int32_t& b ) { return a >= ssei(b); }
+__forceinline const sseb operator >=( const int32_t& a, const ssei& b ) { return ssei(a) >= b; }
+
+__forceinline const sseb operator > ( const ssei& a, const ssei& b ) { return _mm_castsi128_ps(_mm_cmpgt_epi32 (a.m128, b.m128)); } // signed comparison
+__forceinline const sseb operator > ( const ssei& a, const int32_t& b ) { return a > ssei(b); }
+__forceinline const sseb operator > ( const int32_t& a, const ssei& b ) { return ssei(a) > b; }
+
+__forceinline const sseb operator <=( const ssei& a, const ssei& b ) { return !(a > b); }
+__forceinline const sseb operator <=( const ssei& a, const int32_t& b ) { return a <= ssei(b); }
+__forceinline const sseb operator <=( const int32_t& a, const ssei& b ) { return ssei(a) <= b; }
+
+/* Lane-wise choice between t and f driven by mask m: lanes selected by m come
+ * from t, the others from f. The SSE4.1 path blends on the mask lanes; the
+ * fallback combines (m & t) | (~m & f), so it presumably expects every mask
+ * lane to be all ones or all zeros - confirm against sseb. */
+__forceinline const ssei select( const sseb& m, const ssei& t, const ssei& f ) {
+#ifdef __KERNEL_SSE41__
+ const __m128 when_clear = _mm_castsi128_ps(f);
+ const __m128 when_set = _mm_castsi128_ps(t);
+ return _mm_castps_si128(_mm_blendv_ps(when_clear, when_set, m));
+#else
+ const __m128i from_t = _mm_and_si128(m, t);
+ const __m128i from_f = _mm_andnot_si128(m, f);
+ return _mm_or_si128(from_t, from_f);
+#endif
+}
+
+/* Same choice as the sseb overload, but driven by an integer mask. Where the
+ * preprocessor condition holds, the mask is handed directly to _mm_blend_ps;
+ * otherwise it is first turned into an sseb and the sseb overload is used. */
+__forceinline const ssei select( const int mask, const ssei& t, const ssei& f ) {
+#if defined(__KERNEL_SSE41__) && ((!defined(__clang__) && !defined(_MSC_VER)) || defined(__INTEL_COMPILER))
+ const __m128 when_clear = _mm_castsi128_ps(f);
+ const __m128 when_set = _mm_castsi128_ps(t);
+ return _mm_castps_si128(_mm_blend_ps(when_clear, when_set, mask));
+#else
+ const sseb lanes(mask);
+ return select(lanes, t, f);
+#endif
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Movement/Shifting/Shuffling Functions: lane interleaving, compile-time lane
+// permutations, and single-lane extract/insert.
+////////////////////////////////////////////////////////////////////////////////
+
+__forceinline ssei unpacklo( const ssei& a, const ssei& b ) { return _mm_castps_si128(_mm_unpacklo_ps(_mm_castsi128_ps(a.m128), _mm_castsi128_ps(b.m128))); } // result lanes: a0, b0, a1, b1
+__forceinline ssei unpackhi( const ssei& a, const ssei& b ) { return _mm_castps_si128(_mm_unpackhi_ps(_mm_castsi128_ps(a.m128), _mm_castsi128_ps(b.m128))); } // result lanes: a2, b2, a3, b3
+
+template<size_t i0, size_t i1, size_t i2, size_t i3> __forceinline const ssei shuffle( const ssei& a ) { // result lane k holds a[ik]
+ return _mm_shuffle_epi32(a, _MM_SHUFFLE(i3, i2, i1, i0)); // _MM_SHUFFLE takes its arguments highest lane first, hence the reversed order
+}
+
+template<size_t i0, size_t i1, size_t i2, size_t i3> __forceinline const ssei shuffle( const ssei& a, const ssei& b ) { // result lanes: a[i0], a[i1], b[i2], b[i3]
+ return _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), _MM_SHUFFLE(i3, i2, i1, i0)));
+}
+
+#if defined(__KERNEL_SSE3__) // specializations that use dedicated duplicate instructions for three common patterns
+template<> __forceinline const ssei shuffle<0, 0, 2, 2>( const ssei& a ) { return _mm_castps_si128(_mm_moveldup_ps(_mm_castsi128_ps(a))); }
+template<> __forceinline const ssei shuffle<1, 1, 3, 3>( const ssei& a ) { return _mm_castps_si128(_mm_movehdup_ps(_mm_castsi128_ps(a))); }
+template<> __forceinline const ssei shuffle<0, 1, 0, 1>( const ssei& a ) { return _mm_castpd_si128(_mm_movedup_pd (_mm_castsi128_pd(a))); } // duplicates the low 64 bits, i.e. lanes 0 and 1
+#endif
+
+template<size_t i0> __forceinline const ssei shuffle( const ssei& b ) { // broadcast lane i0 of b to all four lanes
+ return shuffle<i0,i0,i0,i0>(b);
+}
+
+#if defined(__KERNEL_SSE41__) // single-lane access through intrinsics
+template<size_t src> __forceinline int extract( const ssei& b ) { return _mm_extract_epi32(b, src); }
+template<size_t dst> __forceinline const ssei insert( const ssei& a, const int32_t b ) { return _mm_insert_epi32(a, b, dst); }
+#else // without __KERNEL_SSE41__: go through the per-lane array view of ssei
+template<size_t src> __forceinline int extract( const ssei& b ) { return b[src]; }
+template<size_t dst> __forceinline const ssei insert( const ssei& a, const int32_t b ) { ssei c = a; c[dst] = b; return c; } // returns a modified copy; a is untouched
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+/// Reductions: horizontal min/max/sum across the four lanes. The vreduce_* and
+/// select_* helpers exist only when __KERNEL_SSE41__ is defined.
+////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__KERNEL_SSE41__)
+__forceinline const ssei vreduce_min(const ssei& v) { ssei h = min(shuffle<1,0,3,2>(v),v); return min(shuffle<2,3,0,1>(h),h); } // combine lane pairs (0,1) and (2,3), then the two halves; every result lane holds the minimum
+__forceinline const ssei vreduce_max(const ssei& v) { ssei h = max(shuffle<1,0,3,2>(v),v); return max(shuffle<2,3,0,1>(h),h); } // same two-step scheme; every result lane holds the maximum
+__forceinline const ssei vreduce_add(const ssei& v) { ssei h = shuffle<1,0,3,2>(v) + v ; return shuffle<2,3,0,1>(h) + h ; } // same two-step scheme; every result lane holds the sum
+
+__forceinline int reduce_min(const ssei& v) { return extract<0>(vreduce_min(v)); } // scalar result taken from lane 0
+__forceinline int reduce_max(const ssei& v) { return extract<0>(vreduce_max(v)); }
+__forceinline int reduce_add(const ssei& v) { return extract<0>(vreduce_add(v)); }
+
+__forceinline size_t select_min(const ssei& v) { return __bsf(movemask(v == vreduce_min(v))); } // lane index derived from the equality mask; __bsf/movemask are defined elsewhere (presumably lowest set bit - confirm)
+__forceinline size_t select_max(const ssei& v) { return __bsf(movemask(v == vreduce_max(v))); }
+
+__forceinline size_t select_min(const sseb& valid, const ssei& v) { const ssei a = select(valid,v,ssei((int)pos_inf)); return __bsf(movemask(valid & (a == vreduce_min(a)))); } // lanes not in valid are replaced by (int)pos_inf before reducing and masked out of the result
+__forceinline size_t select_max(const sseb& valid, const ssei& v) { const ssei a = select(valid,v,ssei((int)neg_inf)); return __bsf(movemask(valid & (a == vreduce_max(a)))); } // lanes not in valid are replaced by (int)neg_inf before reducing and masked out of the result
+
+#else // without __KERNEL_SSE41__: scalar reductions over the four lanes
+
+__forceinline int reduce_min(const ssei& v) { return min(min(v[0],v[1]),min(v[2],v[3])); }
+__forceinline int reduce_max(const ssei& v) { return max(max(v[0],v[1]),max(v[2],v[3])); }
+__forceinline int reduce_add(const ssei& v) { return v[0]+v[1]+v[2]+v[3]; }
+
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+/// Memory load and store operations
+////////////////////////////////////////////////////////////////////////////////
+
+/* Loads four 32 bit integers from a with _mm_load_si128, which requires a to
+ * be 16 byte aligned. The source is only read, so the cast keeps the const
+ * qualifier of the parameter instead of discarding it. */
+__forceinline ssei load4i( const void* const a ) {
+ return _mm_load_si128((const __m128i*)a);
+}
+
+/* Writes the four lanes of v to ptr using the aligned store intrinsic
+ * (_mm_store_si128). */
+__forceinline void store4i(void* ptr, const ssei& v) {
+ __m128i* const dst = (__m128i*)ptr;
+ _mm_store_si128(dst, v);
+}
+
+/* Writes the four lanes of v to ptr using the unaligned store intrinsic
+ * (_mm_storeu_si128). */
+__forceinline void storeu4i(void* ptr, const ssei& v) {
+ __m128i* const dst = (__m128i*)ptr;
+ _mm_storeu_si128(dst, v);
+}
+
+/* Masked store: only the lanes selected by mask are replaced with the lanes
+ * of i. With __KERNEL_AVX__ this is one masked store instruction; otherwise
+ * the current contents of ptr are read, blended with i and written back. */
+__forceinline void store4i( const sseb& mask, void* ptr, const ssei& i ) {
+#if defined (__KERNEL_AVX__)
+ float* const dst = (float*)ptr;
+ _mm_maskstore_ps(dst, (__m128i)mask, _mm_castsi128_ps(i));
+#else
+ ssei* const dst = (ssei*)ptr;
+ *dst = select(mask, i, *dst);
+#endif
+}
+
+/* Load of four lanes from ptr: uses the streaming load intrinsic
+ * (_mm_stream_load_si128) when __KERNEL_SSE41__ is defined, and an ordinary
+ * aligned load otherwise. */
+__forceinline ssei load4i_nt (void* ptr) {
+ __m128i* const src = (__m128i*)ptr;
+#if defined(__KERNEL_SSE41__)
+ return _mm_stream_load_si128(src);
+#else
+ return _mm_load_si128(src);
+#endif
+}
+
+/* Store of four lanes to ptr: uses the streaming store intrinsic
+ * (_mm_stream_ps on the bit-cast value) when __KERNEL_SSE41__ is defined, and
+ * an ordinary aligned store otherwise. */
+__forceinline void store4i_nt(void* ptr, const ssei& v) {
+#if defined(__KERNEL_SSE41__)
+ float* const dst = (float*)ptr;
+ _mm_stream_ps(dst, _mm_castsi128_ps(v));
+#else
+ __m128i* const dst = (__m128i*)ptr;
+ _mm_store_si128(dst, v);
+#endif
+}
+
+#endif
+
+CCL_NAMESPACE_END
+
+#endif
+
diff --git a/intern/cycles/util/util_stats.h b/intern/cycles/util/util_stats.h
index 62b1f1760d7..8758b823084 100644
--- a/intern/cycles/util/util_stats.h
+++ b/intern/cycles/util/util_stats.h
@@ -30,6 +30,7 @@ public:
}
void mem_free(size_t size) { /* Record that `size` bytes were released by subtracting them from mem_used. */
+ assert(mem_used >= size); /* catches a caller releasing more than is currently recorded; compiled out when NDEBUG is defined */
mem_used -= size;
}
diff --git a/intern/cycles/util/util_system.cpp b/intern/cycles/util/util_system.cpp
index 0764f7d9345..7c0445577e2 100644
--- a/intern/cycles/util/util_system.cpp
+++ b/intern/cycles/util/util_system.cpp
@@ -127,9 +127,12 @@ struct CPUCapabilities {
bool sse42;
bool sse4a;
bool avx;
+ bool avx2;
bool xop;
bool fma3;
bool fma4;
+ bool bmi1;
+ bool bmi2;
};
static CPUCapabilities& system_cpu_capabilities()
@@ -180,6 +183,11 @@ static CPUCapabilities& system_cpu_capabilities()
#endif
caps.avx = (xcr_feature_mask & 0x6) == 0x6;
}
+
+ __cpuid(result, 0x00000007);
+ caps.bmi1 = (result[1] & ((int)1 << 3)) != 0;
+ caps.bmi2 = (result[1] & ((int)1 << 8)) != 0;
+ caps.avx2 = (result[1] & ((int)1 << 5)) != 0;
}
#if 0
@@ -221,6 +229,11 @@ bool system_cpu_support_avx()
CPUCapabilities& caps = system_cpu_capabilities();
return caps.sse && caps.sse2 && caps.sse3 && caps.ssse3 && caps.sse41 && caps.avx;
}
+bool system_cpu_support_avx2()
+{
+ /* AVX2 kernels are reported as usable only when the whole feature chain is
+  * present: every SSE level up to 4.1, AVX, and AVX2 together with FMA3,
+  * BMI1 and BMI2. */
+ CPUCapabilities& caps = system_cpu_capabilities();
+
+ const bool has_sse_levels = caps.sse && caps.sse2 && caps.sse3 && caps.ssse3 && caps.sse41;
+ const bool has_avx2_levels = caps.avx && caps.avx2 && caps.fma3 && caps.bmi1 && caps.bmi2;
+
+ return has_sse_levels && has_avx2_levels;
+}
#else
bool system_cpu_support_sse2()
@@ -242,6 +255,10 @@ bool system_cpu_support_avx()
{
return false;
}
+bool system_cpu_support_avx2() /* variant on the #else side of the preprocessor conditional: no feature detection is performed */
+{
+ return false; /* AVX2 is always reported as unavailable in this configuration */
+}
#endif
diff --git a/intern/cycles/util/util_system.h b/intern/cycles/util/util_system.h
index 4409ea752cd..0e8868c7dfc 100644
--- a/intern/cycles/util/util_system.h
+++ b/intern/cycles/util/util_system.h
@@ -28,6 +28,7 @@ bool system_cpu_support_sse2();
bool system_cpu_support_sse3();
bool system_cpu_support_sse41();
bool system_cpu_support_avx();
+bool system_cpu_support_avx2();
CCL_NAMESPACE_END
diff --git a/intern/cycles/util/util_types.h b/intern/cycles/util/util_types.h
index bfaab3dba3b..2a199e591bf 100644
--- a/intern/cycles/util/util_types.h
+++ b/intern/cycles/util/util_types.h
@@ -33,14 +33,17 @@
#ifndef __KERNEL_GPU__
-#define ccl_device static inline
+# ifdef NDEBUG
+# define ccl_device static inline
+# else
+# define ccl_device static
+# endif
#define ccl_device_noinline static
#define ccl_global
#define ccl_constant
#define __KERNEL_WITH_SSE_ALIGN__
#if defined(_WIN32) && !defined(FREE_WINDOWS)
-
#define ccl_device_inline static __forceinline
#define ccl_align(...) __declspec(align(__VA_ARGS__))
#ifdef __KERNEL_64_BIT__
@@ -50,7 +53,12 @@
#define ccl_try_align(...) /* not support for function arguments (error C2719) */
#endif
#define ccl_may_alias
-#define ccl_always_inline __forceinline
+# ifdef NDEBUG
+# define ccl_always_inline __forceinline
+# else
+# define ccl_always_inline
+# endif
+#define ccl_maybe_unused
#else
@@ -62,6 +70,7 @@
#define ccl_try_align(...) __attribute__((aligned(__VA_ARGS__)))
#define ccl_may_alias __attribute__((__may_alias__))
#define ccl_always_inline __attribute__((always_inline))
+#define ccl_maybe_unused __attribute__((used))
#endif
@@ -456,7 +465,6 @@ enum InterpolationType {
INTERPOLATION_SMART = 3,
};
-
/* macros */
/* hints for branch prediction, only use in code that runs a _lot_ */
@@ -473,14 +481,14 @@ enum InterpolationType {
* ... the compiler optimizes away the temp var */
#ifdef __GNUC__
#define CHECK_TYPE(var, type) { \
- __typeof(var) *__tmp; \
+ typeof(var) *__tmp; \
__tmp = (type *)NULL; \
(void)__tmp; \
} (void)0
#define CHECK_TYPE_PAIR(var_a, var_b) { \
- __typeof(var_a) *__tmp; \
- __tmp = (__typeof(var_b) *)NULL; \
+ typeof(var_a) *__tmp; \
+ __tmp = (typeof(var_b) *)NULL; \
(void)__tmp; \
} (void)0
#else
diff --git a/intern/cycles/util/util_vector.h b/intern/cycles/util/util_vector.h
index 2085177eefa..cc6e8a371ed 100644
--- a/intern/cycles/util/util_vector.h
+++ b/intern/cycles/util/util_vector.h
@@ -127,8 +127,10 @@ public:
}
else if(newsize != datasize) {
T *newdata = (T*)malloc_aligned(sizeof(T)*newsize, alignment);
- memcpy(newdata, data, ((datasize < newsize)? datasize: newsize)*sizeof(T));
- free_aligned(data);
+ if(data) {
+ memcpy(newdata, data, ((datasize < newsize)? datasize: newsize)*sizeof(T));
+ free_aligned(data);
+ }
data = newdata;
datasize = newsize;
diff --git a/intern/cycles/util/util_view.cpp b/intern/cycles/util/util_view.cpp
index 6bf9c9ed8c0..fe08389fe3f 100644
--- a/intern/cycles/util/util_view.cpp
+++ b/intern/cycles/util/util_view.cpp
@@ -248,7 +248,7 @@ void view_main_loop(const char *title, int width, int height,
glutInitDisplayMode(GLUT_RGB|GLUT_DOUBLE|GLUT_DEPTH);
glutCreateWindow(title);
- glewInit();
+ mxMakeCurrentContext(mxCreateContext());
view_reshape(width, height);